diff --git a/README.md b/README.md index 42fa4d6..3069f2e 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,21 @@ For each Pokemon TCG product: - Network connectivity issues - Placeholder images will be used automatically -4. **Chrome/Selenium issues** - - Ensure Chrome or Chromium is installed - - webdriver-manager will automatically download ChromeDriver +4. **Browser/Selenium issues** + - **Brave browser supported**: Configured to use Brave at `/usr/bin/brave` + - **ChromeDriver compatibility**: May require version matching (Brave 146 vs ChromeDriver 114) + - **Alternative browsers**: Chrome, Chromium, or Firefox with geckodriver - Script falls back to requests-only mode if Selenium fails + + **For Brave users**: If you see ChromeDriver version mismatch: + ```bash + # Test browser integration + python test_brave.py + + # Solutions for version mismatch: + pip install --upgrade webdriver-manager + # or manually install compatible ChromeDriver + ``` ### Debug Mode diff --git a/TEST_RESULTS.md b/TEST_RESULTS.md index aa0bc1e..bb9e5af 100644 --- a/TEST_RESULTS.md +++ b/TEST_RESULTS.md @@ -30,6 +30,13 @@ System: CachyOS (Arch Linux) - ✅ Image placeholder generation - ✅ Error handling and graceful fallbacks +### 5. Brave Browser Integration +- ✅ Brave browser detected and configured +- ✅ Selenium WebDriver setup for Brave +- ⚠️ ChromeDriver version compatibility issue (expected) +- ✅ Graceful fallback when browser automation fails +- ✅ Test script provided (`test_brave.py`) for troubleshooting + ## ⚠️ Current Limitations ### 1. Web Scraping @@ -38,9 +45,12 @@ System: CachyOS (Arch Linux) - **Solution**: Selenium fallback is implemented but requires Chrome/Chromium browser - **Workaround**: Test data demonstrates full pipeline functionality -### 2. External Dependencies -- **LaTeX**: Requires texlive packages for PDF generation (now installed) -- **Chrome**: Needed for Selenium fallback (not installed in test environment) +### 2. 
External Dependencies & Browser Integration +- **LaTeX**: Requires texlive packages for PDF generation (✅ installed) +- **Brave Browser**: Configured and detected (✅ available at /usr/bin/brave) +- **ChromeDriver Compatibility**: Version mismatch (Brave 146 vs ChromeDriver 114) + - ⚠️ Requires compatible ChromeDriver version for web scraping + - 💡 Main functionality (PDF generation) works without browser - **Network**: External image downloads require internet connectivity ## 📋 Test Results Summary diff --git a/scraper.py b/scraper.py index 422b72d..734206a 100755 --- a/scraper.py +++ b/scraper.py @@ -25,7 +25,7 @@ try: SELENIUM_AVAILABLE = True except ImportError: SELENIUM_AVAILABLE = False - print("Selenium not available, using requests only") + print("Selenium not available, using requests only (install selenium for Brave browser support)") class PokemonTCGScraper: def __init__(self): @@ -58,7 +58,7 @@ class PokemonTCGScraper: return None def get_page_with_selenium(self, url): - """Fallback to selenium for dynamic content""" + """Fallback to selenium for dynamic content using Brave browser""" if not SELENIUM_AVAILABLE: return None @@ -67,26 +67,59 @@ class PokemonTCGScraper: options.add_argument('--no-sandbox') options.add_argument('--disable-dev-shm-usage') options.add_argument('--disable-gpu') + options.add_argument('--disable-web-security') + options.add_argument('--disable-features=VizDisplayCompositor') options.add_argument(f'--user-agent={self.headers["User-Agent"]}') + # Use Brave browser + options.binary_location = '/usr/bin/brave' + try: - driver = webdriver.Chrome(ChromeDriverManager().install(), options=options) + print("Starting Brave browser with Selenium...") + from selenium.webdriver.chrome.service import Service + + # Try to get compatible ChromeDriver + try: + # Try with webdriver manager (auto-detects version) + service = Service(ChromeDriverManager().install()) + except Exception as e: + print(f"ChromeDriver auto-install failed: {e}") + 
print("This usually means ChromeDriver version doesn't match Brave version.") + print("For best results, ensure ChromeDriver and Brave versions are compatible.") + print("You can manually install a compatible ChromeDriver or use a different browser.") + return None + + driver = webdriver.Chrome(service=service, options=options) + + print(f"Navigating to: {url}") driver.get(url) # Wait for content to load - WebDriverWait(driver, 10).until( + print("Waiting for page content to load...") + WebDriverWait(driver, 15).until( EC.presence_of_element_located((By.TAG_NAME, "body")) ) - # Additional wait for dynamic content - time.sleep(3) + # Additional wait for dynamic content and JavaScript execution + print("Waiting for dynamic content...") + time.sleep(5) + + # Try to find product-related elements + print("Looking for product elements...") + try: + # Check if we have product elements loaded + product_elements = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/p/"], .product-item, .product-card') + print(f"Found {len(product_elements)} potential product elements") + except: + print("No specific product elements found, proceeding with full page content") html = driver.page_source + print(f"Retrieved {len(html)} characters of HTML content") driver.quit() return html except Exception as e: - print(f"Selenium failed for {url}: {e}") + print(f"Brave/Selenium failed for {url}: {e}") if 'driver' in locals(): driver.quit() return None @@ -271,8 +304,23 @@ class PokemonTCGScraper: print(f"Found {len(product_links)} potential product links") if not product_links: - print("No product links found. The page structure may have changed.") - print("First 1000 chars of page:") + print("No product links found with requests. 
Trying Brave browser for dynamic content...") + # Try Selenium with Brave as fallback + selenium_html = self.get_page_with_selenium(self.search_url) + if selenium_html and len(selenium_html) > len(html): + print("Got enhanced content from Brave, re-extracting product links...") + html = selenium_html + product_links = self.extract_product_links(html) + print(f"Found {len(product_links)} product links with Brave browser") + + if not product_links: + print("No product links found even with Brave browser.") + print("This could be due to:") + print("1. No Pokemon TCG products currently in stock") + print("2. Website structure changes") + print("3. Enhanced anti-bot protection") + print("4. Geographic restrictions") + print("\nFirst 1000 chars of final page content:") print(html[:1000]) return [] diff --git a/test_brave.py b/test_brave.py new file mode 100644 index 0000000..afa840f --- /dev/null +++ b/test_brave.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +""" +Test Brave browser integration with Pokemon Discovery +""" + +import sys +import os + +try: + from selenium import webdriver + from selenium.webdriver.chrome.options import Options + from selenium.webdriver.chrome.service import Service + from webdriver_manager.chrome import ChromeDriverManager + + print("✓ Selenium and webdriver-manager are available") + + # Check if Brave is available + if not os.path.exists('/usr/bin/brave'): + print("✗ Brave browser not found at /usr/bin/brave") + sys.exit(1) + + print("✓ Brave browser found at /usr/bin/brave") + + # Get Brave version + import subprocess + try: + result = subprocess.run(['/usr/bin/brave', '--version'], + capture_output=True, text=True, timeout=5) + brave_version = result.stdout.strip() + print(f"✓ {brave_version}") + except: + print("⚠ Could not get Brave version") + + # Test ChromeDriver compatibility + print("\nTesting ChromeDriver compatibility...") + options = Options() + options.add_argument('--headless') + options.add_argument('--no-sandbox') + 
options.add_argument('--disable-dev-shm-usage') + options.binary_location = '/usr/bin/brave' + + try: + service = Service(ChromeDriverManager().install()) + driver = webdriver.Chrome(service=service, options=options) + + # Simple test page + driver.get("data:text/html,<html><body><h1>Test</h1></body></html>") + title = driver.title + driver.quit() + + print("✓ Brave + ChromeDriver test successful!") + print("✓ Pokemon Discovery is ready to use Brave for dynamic content") + + except Exception as e: + print(f"✗ ChromeDriver compatibility issue: {e}") + print("\n💡 Solutions:") + print("1. Update ChromeDriver: pip install --upgrade webdriver-manager") + print("2. Install matching ChromeDriver version manually") + print("3. Use Firefox with geckodriver as alternative") + print("\nNote: The main PDF generation functionality works without browser automation") + +except ImportError as e: + print(f"✗ Missing dependency: {e}") + print("Run: pip install selenium webdriver-manager") + sys.exit(1) + +print("\n🎯 Test completed!") \ No newline at end of file