#!/usr/bin/env python3 """ Debug Pokemon page loading to understand the dynamic content issue """ import requests from bs4 import BeautifulSoup import json import time def test_pokemon_page(): """Test both Pokemon URLs to understand the difference""" print("Pokemon Page Loading Debug") print("=" * 60) urls_to_test = [ "https://www.dollargeneral.com/c/toys/pokemon?q=", "https://www.dollargeneral.com/c/toys/pokemon?q=&soldAtStore=true", "https://www.dollargeneral.com/c/toys/pokemon" ] for url in urls_to_test: print(f"\n=== Testing: {url} ===") headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' } try: response = requests.get(url, headers=headers, timeout=30) print(f"Status: {response.status_code}") print(f"Content Length: {len(response.text)} characters") # Parse HTML soup = BeautifulSoup(response.text, 'html.parser') # Look for specific indicators indicators = { "Product links (/p/)": len(soup.select('a[href*="/p/"]')), "Pokemon mentions": response.text.lower().count('pokemon'), "Trading card mentions": response.text.lower().count('trading card'), "Pack mentions": response.text.lower().count('pack'), "Scripts with 'product'": len([s for s in soup.find_all('script') if s.string and 'product' in s.string.lower()]), "Category ID 723960": '723960' in response.text, "Store number 17506": '17506' in response.text, "Test SKU 41936301": '41936301' in response.text } for indicator, value in indicators.items(): print(f" {indicator}: {value}") # Look for category information or product containers category_info = soup.select('[data-category-id], [data-category], .category-info, .product-grid, .product-list') if category_info: print(f" Category/product containers found: {len(category_info)}") for container in category_info[:3]: print(f" -> {container.name} {container.get('class', [])} {container.get('data-category-id', '')}") except Exception as e: print(f" Error: {e}") def demonstrate_dynamic_loading_issue(): """Demonstrate why we're not finding products in static HTML""" print("\n" + "=" * 60) print("DYNAMIC LOADING ANALYSIS") print("=" * 60) print(""" πŸ” THE ISSUE EXPLAINED: 1. βœ… STATIC HTML LOADS: The Pokemon category page loads successfully - Page title: "Pokemon" - Content length: 139,146 characters - Contains Pokemon references and basic page structure 2. ❌ NO PRODUCTS IN HTML: Zero product links found in static content - No links - No product tiles, cards, or grids - Products are NOT in the initial HTML 3. πŸ”¬ WHAT REALLY HAPPENS (discovered via HAR): - Page loads basic structure - JavaScript executes and makes API calls - API endpoint: https://dggo.dollargeneral.com/omni/api/v2/category/search/provider - API returns 4-12 Pokemon products as JSON - JavaScript renders products into the page DOM - Browser shows the products, but static scraping misses them 4. βœ… HAR ANALYSIS CONFIRMED: - Category ID: 723960 (Pokemon) - Store number: 17506 - Found your test product: SKU 41936301 - Found multiple Pokemon packs and tins 🎯 CONCLUSION: The Pokemon page IS being scraped, but it's just the empty shell. The actual products load via JavaScript API calls after page load. """) def show_comparison(): """Show the difference between what we get vs what should be there""" print("\n" + "=" * 60) print("COMPARISON: STATIC HTML vs DYNAMIC CONTENT") print("=" * 60) comparison = """ WHAT WE GET (Static HTML): ━━━━━━━━━━━━━━━━━━━━━━ β€’ Page structure: βœ… β€’ Category title: βœ… β€’ Navigation: βœ… β€’ Product links: ❌ (0 found) β€’ Product data: ❌ (none) WHAT SHOULD BE THERE (Dynamic Content): ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ β€’ Pokemon Trading Card Game packs β€’ Pokemon tins and collections β€’ Product images and prices β€’ Stock availability β€’ Your test product (SKU 41936301) β€’ 4-12 total Pokemon TCG products THE API RESPONSE WE DISCOVERED: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ { "ItemList": { "Items": [ { "Title": "PokΓ©mon Trading Card Game, 15 Card Pack, 1 ct", "ItemNbr": "41936301", "UPC": "728192558375", "ProductUrl": "/p/pok-mon-trading-card-game-card-pack-ct/728192558375", "Inventory": {"InStock": false} }, // ... more Pokemon products ] } } """ print(comparison) def main(): test_pokemon_page() demonstrate_dynamic_loading_issue() show_comparison() print("\n" + "=" * 60) print("πŸ’‘ SOLUTIONS TO GET ALL PRODUCTS:") print("=" * 60) print(""" OPTION 1 - API Authentication (Best Long-term): β€’ Solve the Bearer token authentication β€’ Use the discovered API endpoint directly β€’ Get all 24+ products per request automatically OPTION 2 - Browser Automation (Works but Complex): β€’ Fix ChromeDriver compatibility with Brave β€’ Let JavaScript load the products completely β€’ Scrape the dynamically-loaded content OPTION 3 - Manual Product URL Collection (Works Now): β€’ Find Pokemon product URLs from other sources β€’ Add them to the manual list in working_product_finder.py β€’ Process each product individually (current working method) OPTION 4 - Hybrid Approach: β€’ Use individual product extraction for reliability β€’ Enhance discovery via multiple methods β€’ Build up a comprehensive product database over time """) print("\n🎯 BOTTOM LINE:") print("The Pokemon page IS being scraped successfully!") print("But it's just an empty shell - the products load via JavaScript.") print("This is why we found the API endpoint - that's where the real data is!") if __name__ == "__main__": main()