From e9efcf14600c5da72864e25ad2fa1e7c36133111 Mon Sep 17 00:00:00 2001
From: pi-bot-01 <pi-bot-01@dominat.us>
Date: Sat, 21 Mar 2026 16:12:14 -0700
Subject: [PATCH] Add disco.py: single working script that finds all pack/tin
 products and generates PDF
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extracts all 12 Pokemon products from HAR API responses,
filters to 6 card pack and tin products, downloads product images,
generates UPC-A barcodes, and produces a 157KB PDF catalog.

Products found:
1. Pokémon Trading Card Game, 15 Card Pack (In Stock)
2. Pokémon TCG Booster Pack with Promo Card & Coin
3. Pokemon Trading Card Game Sword & Shield Booster Pack
4. Pokémon Collectible Stacking Tin
5. Pokémon Trading Card Game Mini Tin
6. Pokémon Trading Card Game, Gardevoir Strong Bond Tin
---
 disco.py | 419 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 419 insertions(+)
 create mode 100644 disco.py

diff --git a/disco.py b/disco.py
new file mode 100644
index 0000000..dc2e78d
--- /dev/null
+++ b/disco.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+"""
+Pokemon Discovery (disco.py)
+Extracts Pokemon TCG pack & tin products from a Dollar General HAR capture and generates a PDF catalog.
+
+Usage:
+    python disco.py                          # Full run: scrape + generate PDF
+    python disco.py --scrape-only            # Just scrape, output JSON
+    python disco.py --pdf-only FILE.json     # Just generate PDF from existing JSON
+"""
+
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+import requests
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import urljoin, quote
+
+import barcode
+from barcode.writer import ImageWriter
+from bs4 import BeautifulSoup
+from PIL import Image, ImageDraw, ImageFont
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+HAR_FILE = "www.dollargeneral.com_Archive [26-03-21 15-14-28].har"  # browser capture read by main(); relative to the CWD
+BASE_URL = "https://www.dollargeneral.com"
+OUTPUT_DIR = Path("catalog_output")  # created by main() before PDF generation
+IMAGES_DIR = OUTPUT_DIR / "images"
+BARCODES_DIR = OUTPUT_DIR / "barcodes"
+
+HEADERS = {  # sent with every requests.get() call in this file
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:148.0) Gecko/20100101 Firefox/148.0",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.9",
+}
+
+# Lower-case substrings that mark a description as a card pack or tin (so "tin" also matches inside longer words)
+CARD_TIN_KEYWORDS = ["pack", "tin", "booster", "card game", "tcg"]
+
+# ---------------------------------------------------------------------------
+# Step 1 — Product Discovery (from HAR file API responses)
+# ---------------------------------------------------------------------------
+
+def extract_products_from_har(har_path: str) -> list[dict]:
+    """Return raw product items from a HAR file's search-API responses, one per UPC.
+
+    The first item seen for a UPC wins; unusable response bodies are skipped, but an unreadable HAR file still raises.
+    """
+    import base64  # function-scope import: only this function needs it
+    print(f"📦 Reading HAR file: {har_path}")
+    with open(har_path, "r", encoding="utf-8") as f:
+        har = json.load(f)
+    api_url = "https://dggo.dollargeneral.com/omni/api/v2/category/search/provider"
+    unique: dict[str, dict] = {}
+    for entry in har["log"]["entries"]:
+        req, content = entry["request"], entry["response"].get("content") or {}
+        if req["url"] != api_url or req["method"] != "POST":
+            continue
+        try:
+            text = content.get("text") or ""
+            if text and content.get("encoding") == "base64":  # HAR bodies may be stored base64-encoded
+                text = base64.b64decode(text).decode("utf-8")
+            data = json.loads(text) if text else None
+        except ValueError:  # JSONDecodeError, UnicodeDecodeError and binascii.Error all derive from it
+            continue
+        item_list = data.get("ItemList") if isinstance(data, dict) else None  # ignore non-object JSON
+        for item in (item_list or {}).get("Items") or []:
+            upc = str(item.get("UPC") or "")  # a null UPC must not become the key "None"
+            if upc:
+                unique.setdefault(upc, item)
+    print(f"   Found {len(unique)} unique products in HAR data")
+    return list(unique.values())
+
+
+def rootsv_to_sku(rootsv: str) -> str:
+    """Build a SKU from a rootSV: zero-stripped base + first suffix ('0419363_1' -> '4193631').
+
+    A falsy rootSV gives ''. NOTE(review): the previous docstring showed '41936301'
+    for this input, which the code does not produce - confirm the intended format.
+    """
+    base, *suffixes = rootsv.split("_") if rootsv else [""]
+    return base.lstrip("0") + (suffixes[0] if suffixes else "")
+
+
+def build_product_url(upc: str) -> str:
+    """Return the product-page URL under BASE_URL for `upc` (fixed 'pokemon-product' slug)."""
+    return "{}/p/pokemon-product/{}".format(BASE_URL, upc)
+
+
+def filter_card_and_tin_products(raw_items: list[dict]) -> list[dict]:
+    """Return, in input order, the items whose lower-cased "Description" contains any CARD_TIN_KEYWORDS entry."""
+    def _is_card_or_tin(entry: dict) -> bool:
+        # Plain substring test; a missing description counts as "".
+        text = entry.get("Description", "").lower()
+        return any(word in text for word in CARD_TIN_KEYWORDS)
+
+    return [entry for entry in raw_items if _is_card_or_tin(entry)]
+
+
+def normalize_product(item: dict) -> dict:
+    """Convert a raw search-API item into the flat product dict used downstream.
+
+    Null "UPC", "Price", "Description" or "rootSV" values fall back like missing ones.
+    """
+    upc = str(item.get("UPC") or "")  # str(None) would otherwise yield "None"
+    qty = item.get("AvailableQty", 0)
+    return {
+        "title": item.get("Description") or "Unknown Product",
+        "sku": rootsv_to_sku(item.get("rootSV") or ""),
+        "upc": upc,
+        "price": f"${float(item.get('Price') or 0):.2f}",
+        "stock": f"In Stock ({qty})" if qty and qty > 0 else "Out of Stock",
+        "quantity": qty,
+        "image_url": item.get("Image", ""),
+        "rating": item.get("AverageRating", 0),
+        "reviews": item.get("RatingReviewCount", 0),
+        "url": build_product_url(upc),
+    }
+
+# ---------------------------------------------------------------------------
+# Step 2 — Enrich from product pages (get real URL slug, extra details)
+# ---------------------------------------------------------------------------
+
+def enrich_from_product_page(product: dict) -> dict:
+    """Best-effort enrichment of `product` (mutated in place and returned).
+
+    Searches the site for the UPC to replace "url" with the first matching product
+    link, and fills an empty "sku" from the `dg-<digits>-` part of "image_url".
+    Lookup failures are reported on stdout and otherwise ignored.
+    """
+    upc = product["upc"]
+
+    # DG product pages can be found by searching for the UPC. Passing the UPC via
+    # `params` lets requests URL-encode it instead of splicing it into the URL.
+    try:
+        resp = requests.get(f"{BASE_URL}/search", params={"q": upc}, headers=HEADERS, timeout=15)
+        if resp.status_code == 200:
+            soup = BeautifulSoup(resp.text, "html.parser")
+            # Look for the canonical product link
+            links = soup.select(f'a[href*="/p/"][href*="{upc}"]')
+            if links:
+                product["url"] = urljoin(BASE_URL, links[0].get("href", ""))
+    except Exception as e:
+        # Enrichment is optional, so keep going - but say why it was skipped.
+        print(f"   ⚠ Product page lookup failed for {upc}: {e}")
+
+    # The image URL contains the DG item number (dg-XXXXXXXX-1); use it only
+    # when the product has no SKU yet.
+    img_url = product.get("image_url") or ""
+    match = re.search(r"dg-(\d+)-", img_url)
+    if match and not product.get("sku"):
+        product["sku"] = match.group(1)
+
+    return product
+
+# ---------------------------------------------------------------------------
+# Step 3 — Download images & generate barcodes
+# ---------------------------------------------------------------------------
+
+def download_image(url: str, dest: Path) -> Path | None:
+    """Save the image at `url` to `dest`; return `dest`, or None for a falsy URL or any failure."""
+    # Best effort: a failure is reported on stdout and turned into None.
+    if url:
+        try:
+            reply = requests.get(url, headers=HEADERS, timeout=15)
+            reply.raise_for_status()
+            dest.write_bytes(reply.content)
+            return dest
+        except Exception as err:
+            print(f"   ⚠ Image download failed: {err}")
+    return None
+
+
+def make_placeholder(dest: Path, text: str = "No Image") -> Path:
+    """Save a 300x300 grey image with `text` drawn in the middle to `dest`, and return `dest`."""
+    side = 300
+    canvas = Image.new("RGB", (side, side), "#e0e0e0")
+    pen = ImageDraw.Draw(canvas)
+    try:
+        face = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20)
+    except Exception:
+        face = ImageFont.load_default()  # fall back when the DejaVu font file cannot be loaded
+    left, top, right, bottom = pen.textbbox((0, 0), text, font=face)
+    pen.text(((side - (right - left)) / 2, (side - (bottom - top)) / 2), text, fill="#888", font=face)
+    canvas.save(dest)
+    return dest
+
+
+def generate_barcode(sku: str, dest_dir: Path) -> Path | None:
+    """Render the digits of `sku` as a UPC-A barcode PNG inside `dest_dir`.
+
+    Returns the saved file's path, or None if `sku` has no digits or rendering fails.
+    """
+    digits = re.sub(r"\D", "", sku)
+    if not digits:
+        return None
+    # UPC-A takes 11 data digits; the 12th (check digit) is auto-calculated. A 12+ digit
+    # code already ends in its check digit, so drop that one; shorter codes are zero-padded.
+    digits = digits[-12:-1] if len(digits) >= 12 else digits.zfill(11)
+    try:
+        upc_cls = barcode.get_barcode_class("upca")
+        bc = upc_cls(digits, writer=ImageWriter())
+        render_options = {
+            "module_width": 0.3,
+            "module_height": 15.0,
+            "quiet_zone": 6.5,
+            "font_size": 10,
+            "text_distance": 5.0,
+        }
+        # barcode lib appends .png automatically
+        return Path(bc.save(str(dest_dir / f"barcode_{sku}"), options=render_options))
+    except Exception as e:
+        print(f"   ⚠ Barcode generation failed for {sku}: {e}")
+        return None
+
+# ---------------------------------------------------------------------------
+# Step 4 — Generate PDF via pandoc
+# ---------------------------------------------------------------------------
+
+def generate_catalog_pdf(products: list[dict]) -> Path | None:
+    """Write a Markdown catalog for `products` and convert it to PDF with pandoc.
+
+    Each product's image is downloaded (or a placeholder drawn) and its barcode
+    rendered; OUTPUT_DIR, IMAGES_DIR and BARCODES_DIR must already exist. Returns
+    the PDF path, or None if every engine fails (the Markdown file is still written).
+    """
+    now = datetime.now()  # one clock read so the label and the file names agree
+    timestamp_label = now.strftime("%Y-%m-%d %H:%M:%S")
+    timestamp_file = now.strftime("%Y%m%d_%H%M%S")
+
+    md_lines = [
+        "---",
+        'title: "Pokemon TCG Product Catalog — Dollar General"',
+        f'date: "{timestamp_label}"',
+        "geometry: margin=0.75in",
+        "fontsize: 11pt",
+        "---",
+        "",
+        f"**Generated**: {timestamp_label}  ",
+        f"**Products**: {len(products)} Cards & Tins  ",
+        "",
+        "\\newpage",
+        "",
+    ]
+
+    for i, prod in enumerate(products, 1):
+        title, sku = prod["title"], prod["sku"]
+
+        # Download product image (placeholder when there is no URL or the download fails)
+        img_dest = IMAGES_DIR / f"product_{i}_{sku}.jpg"
+        img_path = download_image(prod.get("image_url"), img_dest)
+        if not img_path:
+            img_path = make_placeholder(IMAGES_DIR / f"product_{i}_{sku}_placeholder.png", title[:30])
+
+        # Generate barcode
+        bc_path = generate_barcode(sku, BARCODES_DIR)
+
+        # Paths relative to OUTPUT_DIR, the working directory pandoc is run from below
+        rel_img = os.path.relpath(img_path, OUTPUT_DIR)
+        rel_bc = os.path.relpath(bc_path, OUTPUT_DIR) if bc_path else None
+
+        md_lines += [
+            f"## {i}. {title}",
+            "",
+            f"![{title}]({rel_img}){{ width=200px }}",
+            "",
+            "| Field | Value |",
+            "|-------|-------|",
+            f"| **Price** | {prod['price']} |",
+            f"| **Stock** | {prod['stock']} |",
+            f"| **SKU** | `{sku}` |",
+            f"| **UPC** | `{prod['upc']}` |",
+            "",
+        ]
+
+        if rel_bc:
+            md_lines += [
+                f"![UPC-A Barcode]({rel_bc}){{ width=250px }}",
+                "",
+            ]
+
+        md_lines += ["\\newpage", ""]
+        print(f"   ✅ [{i}/{len(products)}] {title}")
+
+    # Write markdown
+    md_file = OUTPUT_DIR / f"pokemon_catalog_{timestamp_file}.md"
+    md_file.write_text("\n".join(md_lines), encoding="utf-8")
+    print(f"\n📝 Markdown: {md_file}")
+
+    # Convert to PDF. pandoc runs inside OUTPUT_DIR so the relative image paths resolve.
+    pdf_file = OUTPUT_DIR / f"pokemon_catalog_{timestamp_file}.pdf"
+    last_error = ""
+    for engine in ("pdflatex", "xelatex"):
+        cmd = [
+            "pandoc", md_file.name, "-o", pdf_file.name,
+            f"--pdf-engine={engine}", "-V", "colorlinks=true",
+        ]
+        try:
+            result = subprocess.run(cmd, cwd=OUTPUT_DIR, capture_output=True, text=True, timeout=60)
+        except (OSError, subprocess.SubprocessError) as e:  # pandoc missing or timed out
+            last_error = str(e)
+            continue
+        if result.returncode == 0 and pdf_file.exists():
+            print(f"📄 PDF generated: {pdf_file}  ({pdf_file.stat().st_size // 1024} KB)")
+            return pdf_file
+        last_error = result.stderr.strip()
+
+    print(f"⚠ PDF generation failed. Markdown available at: {md_file}")
+    if last_error:
+        print(f"   pandoc: {last_error}")
+    return None
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    """Command-line entry point; see the module docstring for the three modes.
+
+    Exits with status 1 on a bad --pdf-only argument, a missing HAR file, or no matches.
+    """
+    args = sys.argv[1:]
+    # Handle --pdf-only mode
+    if "--pdf-only" in args:
+        idx = args.index("--pdf-only")
+        json_file = args[idx + 1] if idx + 1 < len(args) else None
+        if not json_file or not Path(json_file).exists():
+            print(f"Usage: {sys.argv[0]} --pdf-only <products.json>")
+            sys.exit(1)
+        # Read as UTF-8 to match how the JSON is written below (ensure_ascii=False).
+        products = json.loads(Path(json_file).read_text(encoding="utf-8"))
+        for d in [OUTPUT_DIR, IMAGES_DIR, BARCODES_DIR]:
+            d.mkdir(parents=True, exist_ok=True)
+        print(f"\n🖨️  Generating PDF from {json_file} ({len(products)} products)...")
+        generate_catalog_pdf(products)
+        return
+
+    scrape_only = "--scrape-only" in args
+    # --- Banner ---
+    now = datetime.now()
+    timestamp_file = now.strftime("%Y%m%d_%H%M%S")
+    print("=" * 60)
+    print("  🔍  Pokemon Discovery (pokemon-disco)")
+    print("  Dollar General — Pokemon TCG Cards & Tins")
+    print(f"  {now.strftime('%Y-%m-%d %H:%M:%S')}")
+    print("=" * 60)
+
+    # --- Step 1: Extract from HAR ---
+    if not Path(HAR_FILE).exists():
+        print(f"\n❌ HAR file not found: {HAR_FILE}")
+        print("   Capture a HAR file from the Pokemon page in your browser")
+        print("   and place it in the project directory.")
+        sys.exit(1)
+    raw_items = extract_products_from_har(HAR_FILE)
+
+    # --- Step 2: Filter for Cards & Tins ---
+    print("\n🎯 Filtering for card packs and tins...")
+    card_tin_items = filter_card_and_tin_products(raw_items)
+    print(f"   {len(card_tin_items)} of {len(raw_items)} products match (pack/tin/booster/tcg)")
+    if not card_tin_items:
+        print("❌ No card or tin products found.")
+        sys.exit(1)
+
+    # Show what was filtered out
+    excluded = [i for i in raw_items if i not in card_tin_items]
+    if excluded:
+        print(f"\n   Excluded {len(excluded)} non-card/tin products:")
+        for item in excluded:
+            print(f"     ✗ {item.get('Description', '?')}")
+
+    # --- Step 3: Normalize ---
+    print(f"\n📋 Processing {len(card_tin_items)} products...")
+    products = [normalize_product(item) for item in card_tin_items]
+
+    # Print summary table
+    print()
+    print(f"  {'#':<3} {'Title':<55} {'SKU':<12} {'Price':<8} {'Stock'}")
+    print(f"  {'—'*3} {'—'*55} {'—'*12} {'—'*8} {'—'*15}")
+    for i, p in enumerate(products, 1):
+        print(f"  {i:<3} {p['title'][:53]:<55} {p['sku']:<12} {p['price']:<8} {p['stock']}")
+
+    # --- Step 4: Save JSON ---
+    json_file = f"pokemon_tcg_products_{timestamp_file}.json"
+    # ensure_ascii=False keeps non-ASCII titles literal, so the encoding must be explicit.
+    Path(json_file).write_text(json.dumps(products, indent=2, ensure_ascii=False), encoding="utf-8")
+    print(f"\n💾 Product data: {json_file}")
+    if scrape_only:
+        print("\n✅ Scrape complete (--scrape-only). Run with --pdf-only to generate catalog.")
+        return
+
+    # --- Step 5: Generate PDF ---
+    for d in [OUTPUT_DIR, IMAGES_DIR, BARCODES_DIR]:
+        d.mkdir(parents=True, exist_ok=True)
+    print("\n🖨️  Generating PDF catalog...")
+    pdf_path = generate_catalog_pdf(products)
+
+    # --- Done ---
+    print("\n" + "=" * 60)
+    if pdf_path:
+        print("  ✅ COMPLETE!")
+        print(f"  📄 PDF Catalog:  {pdf_path}")
+        print(f"  💾 Product JSON: {json_file}")
+        print(f"  🏷️  Barcodes:     {BARCODES_DIR}/")
+        print(f"  🖼️  Images:       {IMAGES_DIR}/")
+    else:
+        print(f"  ⚠ PDF generation failed — markdown file available in {OUTPUT_DIR}/")
+        print(f"  💾 Product JSON: {json_file}")
+    print("=" * 60)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()