# pokemon-disco/debug_page_loading.py

#!/usr/bin/env python3
"""
Debug Pokemon page loading to understand the dynamic content issue
"""

import requests
from bs4 import BeautifulSoup
import json
import time

def test_pokemon_page():
    """Fetch each Pokemon category URL variant and report what the static HTML holds.

    For every URL this prints the HTTP status, the body length, a set of
    indicator counts/flags (product links, keyword mentions, known IDs) and up
    to three category/product container elements found in the markup.

    Nothing is returned; all results go to stdout. Any exception raised while
    fetching or inspecting a URL is printed and the loop continues with the
    next URL.
    """

    print("Pokemon Page Loading Debug")
    print("=" * 60)

    urls_to_test = [
        "https://www.dollargeneral.com/c/toys/pokemon?q=",
        "https://www.dollargeneral.com/c/toys/pokemon?q=&soldAtStore=true",
        "https://www.dollargeneral.com/c/toys/pokemon"
    ]

    # Identical for every request, so build it once instead of once per URL.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    for url in urls_to_test:
        print(f"\n=== Testing: {url} ===")

        try:
            response = requests.get(url, headers=headers, timeout=30)
            print(f"Status: {response.status_code}")

            # Read the body a single time: `response.text` decodes the payload
            # on each access, and the keyword counters all need the same
            # lower-cased copy.
            html = response.text
            html_lower = html.lower()
            print(f"Content Length: {len(html)} characters")

            # Parse HTML
            soup = BeautifulSoup(html, 'html.parser')

            # Inline <script> tags whose text mentions "product" hint at
            # embedded product data; tags without a single text child have
            # `.string` set to None and are skipped.
            scripts_with_product = sum(
                1
                for script in soup.find_all('script')
                if script.string and 'product' in script.string.lower()
            )

            # Look for specific indicators
            indicators = {
                "Product links (/p/)": len(soup.select('a[href*="/p/"]')),
                "Pokemon mentions": html_lower.count('pokemon'),
                "Trading card mentions": html_lower.count('trading card'),
                "Pack mentions": html_lower.count('pack'),
                "Scripts with 'product'": scripts_with_product,
                "Category ID 723960": '723960' in html,
                "Store number 17506": '17506' in html,
                "Test SKU 41936301": '41936301' in html
            }

            for indicator, value in indicators.items():
                print(f"  {indicator}: {value}")

            # Look for category information or product containers
            category_info = soup.select('[data-category-id], [data-category], .category-info, .product-grid, .product-list')
            if category_info:
                print(f"  Category/product containers found: {len(category_info)}")
                # Only the first three matches are shown to keep the output short.
                for container in category_info[:3]:
                    print(f"    -> {container.name} {container.get('class', [])} {container.get('data-category-id', '')}")

        except Exception as e:
            # Deliberately broad: this is a diagnostic run, so a failure on one
            # URL is reported and the remaining URLs are still tested.
            print(f"  Error: {e}")

def demonstrate_dynamic_loading_issue():
    """Print a canned explanation of why static scraping finds no products.

    No network access happens here; the function only writes a banner and a
    fixed multi-line summary of earlier findings to stdout. Returns None.
    """
    divider = "=" * 60

    # Banner: blank line, rule, title, rule.
    print("\n" + divider, "DYNAMIC LOADING ANALYSIS", divider, sep="\n")

    explanation = """
🔍 THE ISSUE EXPLAINED:

1. ✅ STATIC HTML LOADS: The Pokemon category page loads successfully
   - Page title: "Pokemon"
   - Content length: 139,146 characters
   - Contains Pokemon references and basic page structure

2. ❌ NO PRODUCTS IN HTML: Zero product links found in static content
   - No <a href="/p/..."> links
   - No product tiles, cards, or grids
   - Products are NOT in the initial HTML

3. 🔬 WHAT REALLY HAPPENS (discovered via HAR):
   - Page loads basic structure
   - JavaScript executes and makes API calls
   - API endpoint: https://dggo.dollargeneral.com/omni/api/v2/category/search/provider
   - API returns 4-12 Pokemon products as JSON
   - JavaScript renders products into the page DOM
   - Browser shows the products, but static scraping misses them

4. ✅ HAR ANALYSIS CONFIRMED:
   - Category ID: 723960 (Pokemon)
   - Store number: 17506
   - Found your test product: SKU 41936301
   - Found multiple Pokemon packs and tins

🎯 CONCLUSION:
The Pokemon page IS being scraped, but it's just the empty shell.
The actual products load via JavaScript API calls after page load.
"""
    print(explanation)

def show_comparison():
    """Print a fixed side-by-side of static HTML results vs. expected dynamic content.

    Writes a banner followed by a canned summary (including a sample of the
    API response shape) to stdout. Performs no I/O besides printing and
    returns None.
    """
    divider = "=" * 60

    # Banner: blank line, rule, title, rule.
    print("\n" + divider, "COMPARISON: STATIC HTML vs DYNAMIC CONTENT", divider, sep="\n")

    print("""
WHAT WE GET (Static HTML):
━━━━━━━━━━━━━━━━━━━━━━
• Page structure: ✅
• Category title: ✅
• Navigation: ✅
• Product links: ❌ (0 found)
• Product data: ❌ (none)

WHAT SHOULD BE THERE (Dynamic Content):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
• Pokemon Trading Card Game packs
• Pokemon tins and collections
• Product images and prices
• Stock availability
• Your test product (SKU 41936301)
• 4-12 total Pokemon TCG products

THE API RESPONSE WE DISCOVERED:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{
  "ItemList": {
    "Items": [
      {
        "Title": "Pokémon Trading Card Game, 15 Card Pack, 1 ct",
        "ItemNbr": "41936301",
        "UPC": "728192558375",
        "ProductUrl": "/p/pok-mon-trading-card-game-card-pack-ct/728192558375",
        "Inventory": {"InStock": false}
      },
      // ... more Pokemon products
    ]
  }
}
""")

def main():
    """Run the live page check, then print the canned analysis and next steps.

    Calls the three report functions in order, then writes the list of
    candidate solutions and a closing summary to stdout. Returns None.
    """
    for step in (test_pokemon_page, demonstrate_dynamic_loading_issue, show_comparison):
        step()

    divider = "=" * 60

    # Banner: blank line, rule, title, rule.
    print("\n" + divider, "💡 SOLUTIONS TO GET ALL PRODUCTS:", divider, sep="\n")

    solutions = """
OPTION 1 - API Authentication (Best Long-term):
• Solve the Bearer token authentication
• Use the discovered API endpoint directly
• Get all 24+ products per request automatically

OPTION 2 - Browser Automation (Works but Complex):
• Fix ChromeDriver compatibility with Brave
• Let JavaScript load the products completely
• Scrape the dynamically-loaded content

OPTION 3 - Manual Product URL Collection (Works Now):
• Find Pokemon product URLs from other sources
• Add them to the manual list in working_product_finder.py
• Process each product individually (current working method)

OPTION 4 - Hybrid Approach:
• Use individual product extraction for reliability
• Enhance discovery via multiple methods
• Build up a comprehensive product database over time
"""
    print(solutions)

    closing_lines = (
        "\n🎯 BOTTOM LINE:",
        "The Pokemon page IS being scraped successfully!",
        "But it's just an empty shell - the products load via JavaScript.",
        "This is why we found the API endpoint - that's where the real data is!",
    )
    for line in closing_lines:
        print(line)

# Run the full debug walkthrough only when executed as a script, not on import.
if __name__ == "__main__":
    main()