# pokemon-disco/test_dynamic_scraping.py

#!/usr/bin/env python3
"""
Test dynamic content loading for Pokemon Discovery
"""

import requests
import json
from bs4 import BeautifulSoup
import time

def test_api_endpoints():
    """Probe a list of guessed API URLs and print what each one returns.

    For every candidate URL the HTTP status is printed. For a 200 response
    the body is parsed as JSON; on success the length of its string form is
    printed along with whether that string mentions 'products' or 'pokemon'.
    A 200 response that is not valid JSON is reported by text length instead.

    Nothing is returned; all findings go to stdout. Any exception raised
    while handling one URL is printed and the loop moves on to the next URL.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://www.dollargeneral.com/c/toys/pokemon'
    }

    # Test potential API endpoints
    api_tests = [
        'https://www.dollargeneral.com/api/products/search?q=pokemon',
        'https://www.dollargeneral.com/api/v1/products?category=toys&query=pokemon',
        'https://www.dollargeneral.com/dg/search?q=pokemon&category=toys',
        'https://www.dollargeneral.com/api/search?term=pokemon+trading+card',
    ]

    print("=== Testing API Endpoints ===")
    for url in api_tests:
        try:
            print(f"Testing: {url}")
            response = requests.get(url, headers=headers, timeout=10)
            print(f"  Status: {response.status_code}")

            if response.status_code == 200:
                try:
                    data = response.json()
                except ValueError:
                    # A body that is not valid JSON surfaces as a ValueError
                    # subclass; catching only that avoids swallowing
                    # KeyboardInterrupt/SystemExit like a bare except would.
                    print(f"  Text Response: {len(response.text)} characters")
                else:
                    # Stringify once and reuse for the length and both checks.
                    rendered = str(data)
                    print(f"  JSON Response: {len(rendered)} characters")
                    lowered = rendered.lower()
                    if 'products' in lowered:
                        print("  ✓ Contains 'products'")
                    if 'pokemon' in lowered:
                        print("  ✓ Contains 'pokemon'")
            print()
        except Exception as e:
            # Best-effort probe: report the failure and continue with the
            # remaining URLs rather than aborting the whole run.
            print(f"  Error: {e}")
            print()

def test_network_requests():
    """Fetch the search page and scan its inline scripts for endpoint-like strings.

    Every <script> element with a single string body is matched against a
    set of regular expressions (API/endpoint assignments, fetch() and
    xhr.open() calls, and bare '/api/...' or '/search...' paths). A match is
    kept only if it contains 'dollargeneral' or starts with '/'.

    The number of unique candidates and up to ten of them are printed.

    Returns:
        A sorted list of the unique candidate strings, or an empty list if
        fetching or parsing the page raised an exception.
    """
    # Function-scope import: `re` is not among the module-level imports.
    import re

    url = 'https://www.dollargeneral.com/c/toys/pokemon?q=&soldAtStore=true'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # Compiled once up front instead of being rebuilt for every script tag.
    # Each pattern has at most one capture group, so findall() yields strings.
    patterns = [
        re.compile(source, re.IGNORECASE)
        for source in (
            r'(?:api|Api|API)["\'\s]*[:=]["\'\s]*([^"\']+)',
            r'(?:endpoint|url|baseURL)["\'\s]*[:=]["\'\s]*([^"\']+)',
            r'fetch\s*\(\s*["\']([^"\']+)["\']',
            r'xhr\.open\s*\(\s*["\'][^"\']*["\'],\s*["\']([^"\']+)["\']',
            # Stop at quotes, backslashes and whitespace. The previous class
            # `[^"\'\\s]` excluded the letter "s" rather than whitespace and
            # so cut paths such as "/api/products" short.
            r'/api/[^"\'\\\s]+',
            r'/search[^"\'\\\s]*',
        )
    ]

    print("=== Analyzing Search Page for API Calls ===")

    try:
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for API endpoints in JavaScript; a set de-duplicates as we go.
        api_patterns = set()

        for script in soup.find_all('script'):
            content = script.string
            if not content:
                # No single string body to scan for this tag.
                continue

            for pattern in patterns:
                for match in pattern.findall(content):
                    if 'dollargeneral' in match or match.startswith('/'):
                        api_patterns.add(match)

        # Sorted so the printed "first 10" and the return value are stable
        # across runs (plain set ordering is not).
        unique_apis = sorted(api_patterns)

        print(f"Found {len(unique_apis)} potential API endpoints:")
        for api in unique_apis[:10]:  # Show first 10
            print(f"  -> {api}")

        return unique_apis

    except Exception as e:
        # Best-effort probe: report the failure and hand back an empty result.
        print(f"Error analyzing page: {e}")
        return []

def test_sitemap_approach():
    """Fetch the site's sitemap.xml and robots.txt and report product hints.

    For each URL the HTTP status is printed. On a 200 response the body is
    checked for the word 'pokemon' (case-insensitive) and for the substring
    '/p/', and its length is printed. Any exception raised while handling a
    URL is printed and the next URL is tried. Nothing is returned.
    """

    print("=== Testing Sitemap Approach ===")

    candidates = (
        'https://www.dollargeneral.com/sitemap.xml',
        'https://www.dollargeneral.com/robots.txt',
    )

    for target in candidates:
        try:
            print(f"Testing: {target}")
            reply = requests.get(target, timeout=10)
            print(f"  Status: {reply.status_code}")

            if reply.status_code == 200:
                body = reply.text
                # (condition, message) pairs, reported in this order.
                markers = (
                    ('pokemon' in body.lower(), "  ✓ Contains Pokemon references"),
                    ('/p/' in body, "  ✓ Contains product URLs (/p/)"),
                )
                for found, message in markers:
                    if found:
                        print(message)
                print(f"  Content length: {len(body)} characters")
            print()
        except Exception as err:
            print(f"  Error: {err}")
            print()

if __name__ == "__main__":
    # Script entry point: run each probe in turn, then print a fixed summary.
    rule = "=" * 60

    print("Pokemon Discovery - Dynamic Content Testing")
    print(rule)
    print()

    # Probe 1: guessed API URLs.
    test_api_endpoints()
    print()

    # Probe 2: endpoint-like strings found in the search page's scripts.
    apis = test_network_requests()
    print()

    # Probe 3: sitemap.xml / robots.txt.
    test_sitemap_approach()
    print()

    print(rule)
    # The summary text is static; it is not derived from the probe results.
    summary_lines = (
        "Summary:",
        "- Individual product extraction: ✅ WORKING",
        "- Product URLs can be processed if found",
        "- Main challenge: Finding product URLs from search page",
        "- Dynamic content requires browser automation or API discovery",
    )
    print(*summary_lines, sep="\n")