Clean up: remove obsolete files, update docs and docstrings

Removed 20 files: old test scripts, debug tools, duplicate docs, generated JSON, old PDF generator, launcher scripts. Kept: disco.py — main tool (scrape HAR + generate PDF); scraper.py — reference site scraper (HTML + Selenium/Brave); requirements.txt; *.har — browser capture with API data. Updated: README.md — rewritten to reflect current tool and usage; .gitignore — simplified; scraper.py — module/class/method docstrings updated to clarify that this is a reference implementation and disco.py is the primary tool.
2026-03-21 23:28:52 -07:00
parent 90661e1957
commit 0c7e139245
24 changed files with 115 additions and 3380 deletions
--- a/scraper.py
+++ b/scraper.py
@@ -1,7 +1,20 @@
 #!/usr/bin/env python3
 """
-Pokemon Discovery - TCG Product Scraper for Dollar General
-Scrapes product information and saves to JSON for PDF generation
+Pokemon Discovery — Site Scraper (Reference)
+
+HTML + Selenium/Brave scraper for Dollar General product pages.
+Kept as a reference implementation. The primary tool is disco.py,
+which reads product data from a HAR capture instead of scraping live.
+
+This scraper can:
+  - Fetch individual product pages and extract title, SKU, price, stock
+  - Attempt to find product links from the category page (limited by
+    dynamic JS loading — products are injected via API after page load)
+  - Fall back to Brave browser via Selenium for JS-rendered content
+
+Usage:
+    python scraper.py                  # Attempt full category scrape
+    # Or import and use PokemonTCGScraper class directly for individual pages
 """

 import json
@@ -28,6 +41,14 @@ except ImportError:
    print("Selenium not available, using requests only (install selenium for Brave browser support)")

 class PokemonTCGScraper:
+    """HTML/Selenium scraper for Dollar General Pokemon product pages.
+
+    Can extract product details (title, SKU, price, stock) from individual
+    product page URLs. Category-level scraping is limited because Dollar
+    General loads products dynamically via a JS API call after page load.
+    See disco.py for the HAR-based approach that bypasses this limitation.
+    """
+
    def __init__(self):
        self.base_url = "https://www.dollargeneral.com"
        self.search_url = "https://www.dollargeneral.com/c/toys/pokemon?q=&soldAtStore=true"
@@ -300,9 +321,10 @@ class PokemonTCGScraper:
        return has_pokemon and has_tcg
    
    def try_api_scraping(self):
-        """
-        Try to scrape products using the discovered API endpoint
-        This method contains the exact API call found via HAR analysis
+        """Stub for API-based scraping (requires auth token).
+
+        Documents the discovered API endpoint and request format.
+        Not functional — use disco.py with a HAR file instead.
        """
        print("🔬 Attempting API-based scraping...")
        print("   Endpoint: https://dggo.dollargeneral.com/omni/api/v2/category/search/provider")