From e9efcf14600c5da72864e25ad2fa1e7c36133111 Mon Sep 17 00:00:00 2001 From: pi-bot-01 Date: Sat, 21 Mar 2026 16:12:14 -0700 Subject: [PATCH] Add disco.py: single working script that finds all pack/tin products and generates PDF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extracts all 12 Pokemon products from HAR API responses, filters to 6 card pack and tin products, downloads product images, generates UPC-A barcodes, and produces a 157KB PDF catalog. Products found: 1. Pokémon Trading Card Game, 15 Card Pack (In Stock) 2. Pokémon TCG Booster Pack with Promo Card & Coin 3. Pokemon Trading Card Game Sword & Shield Booster Pack 4. Pokémon Collectible Stacking Tin 5. Pokémon Trading Card Game Mini Tin 6. Pokémon Trading Card Game, Gardevoir Strong Bond Tin --- disco.py | 419 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 419 insertions(+) create mode 100644 disco.py diff --git a/disco.py b/disco.py new file mode 100644 index 0000000..dc2e78d --- /dev/null +++ b/disco.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +""" +Pokemon Discovery (disco.py) +Scrapes Pokemon TCG pack & tin products from Dollar General and generates a PDF catalog. + +Usage: + python disco.py # Full run: scrape + generate PDF + python disco.py --scrape-only # Just scrape, output JSON + python disco.py --pdf-only FILE.json # Just generate PDF from existing JSON +""" + +import json +import os +import re +import subprocess +import sys +import time +import requests +from datetime import datetime +from pathlib import Path +from urllib.parse import urljoin, quote + +import barcode +from barcode.writer import ImageWriter +from bs4 import BeautifulSoup +from PIL import Image, ImageDraw, ImageFont + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +HAR_FILE = "www.dollargeneral.com_Archive [26-03-21 15-14-28].har" +BASE_URL = "https://www.dollargeneral.com" +OUTPUT_DIR = Path("catalog_output") +IMAGES_DIR = OUTPUT_DIR / "images" +BARCODES_DIR = OUTPUT_DIR / "barcodes" + +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:148.0) Gecko/20100101 Firefox/148.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", +} + +# Keywords that identify card packs and tins (case-insensitive) +CARD_TIN_KEYWORDS = ["pack", "tin", "booster", "card game", "tcg"] + +# --------------------------------------------------------------------------- +# Step 1 — Product Discovery (from HAR file API responses) +# --------------------------------------------------------------------------- + +def extract_products_from_har(har_path: str) -> list[dict]: + """Parse HAR file and extract all Pokemon products from API responses.""" + print(f"📦 Reading HAR file: {har_path}") + + with open(har_path, "r", encoding="utf-8") as f: + har = json.load(f) + + api_url = "https://dggo.dollargeneral.com/omni/api/v2/category/search/provider" + unique: dict[str, dict] = {} + + for entry in har["log"]["entries"]: + req = entry["request"] + resp = entry["response"] + if req["url"] != api_url or req["method"] != "POST": + continue + text = resp.get("content", {}).get("text", "") + if not text: + continue + try: + data = json.loads(text) + except json.JSONDecodeError: + continue + for item in data.get("ItemList", {}).get("Items", []): + upc = str(item.get("UPC", "")) + if upc and upc not in unique: + unique[upc] = item + + print(f" Found {len(unique)} unique products in HAR data") + return list(unique.values()) + + +def rootsv_to_sku(rootsv: str) -> str: + """Convert rootSV like '0419363_1' to SKU like '41936301'.""" + if not rootsv: + return "" + parts = rootsv.split("_") + base = parts[0].lstrip("0") + suffix = parts[1] if len(parts) > 1 else "" + return base + suffix + + +def build_product_url(upc: str) -> str: + """Construct a Dollar General product page URL from a UPC.""" + return f"{BASE_URL}/p/pokemon-product/{upc}" + + +def filter_card_and_tin_products(raw_items: list[dict]) -> list[dict]: + """Keep only products whose description contains card/pack/tin keywords.""" + filtered = [] + for item in raw_items: + desc = item.get("Description", "").lower() + if any(kw in desc for kw in CARD_TIN_KEYWORDS): + filtered.append(item) + return filtered + + +def normalize_product(item: dict) -> dict: + """Convert raw API item into a clean product dict.""" + upc = str(item.get("UPC", "")) + rootsv = item.get("rootSV", "") + sku = rootsv_to_sku(rootsv) + qty = item.get("AvailableQty", 0) + + return { + "title": item.get("Description", "Unknown Product"), + "sku": sku, + "upc": upc, + "price": f"${item.get('Price', 0):.2f}", + "stock": f"In Stock ({qty})" if qty and qty > 0 else "Out of Stock", + "quantity": qty, + "image_url": item.get("Image", ""), + "rating": item.get("AverageRating", 0), + "reviews": item.get("RatingReviewCount", 0), + "url": build_product_url(upc), + } + +# --------------------------------------------------------------------------- +# Step 2 — Enrich from product pages (get real URL slug, extra details) +# --------------------------------------------------------------------------- + +def enrich_from_product_page(product: dict) -> dict: + """Visit the actual product page to get the real URL and any missing data.""" + upc = product["upc"] + sku = product["sku"] + + # Try to get the real product page + # DG product pages can be accessed by UPC at search + search_url = f"{BASE_URL}/search?q={upc}" + try: + resp = requests.get(search_url, headers=HEADERS, timeout=15) + if resp.status_code == 200: + soup = BeautifulSoup(resp.text, "html.parser") + # Look for the canonical product link + links = soup.select(f'a[href*="/p/"][href*="{upc}"]') + if links: + href = links[0].get("href", "") + product["url"] = urljoin(BASE_URL, href) + except Exception: + pass + + # Also try visiting the product page directly by known pattern + # The image URL contains the DG item number: dg-XXXXXXXX-1 + img_url = product.get("image_url", "") + match = re.search(r"dg-(\d+)-", img_url) + if match: + dg_item = match.group(1) + # This is the item number used in the SKU + if not product.get("sku"): + product["sku"] = dg_item + + return product + +# --------------------------------------------------------------------------- +# Step 3 — Download images & generate barcodes +# --------------------------------------------------------------------------- + +def download_image(url: str, dest: Path) -> Path | None: + """Download image from URL, return local path or None.""" + if not url: + return None + try: + resp = requests.get(url, headers=HEADERS, timeout=15) + resp.raise_for_status() + dest.write_bytes(resp.content) + return dest + except Exception as e: + print(f" ⚠ Image download failed: {e}") + return None + + +def make_placeholder(dest: Path, text: str = "No Image") -> Path: + """Create a simple placeholder image.""" + img = Image.new("RGB", (300, 300), "#e0e0e0") + draw = ImageDraw.Draw(img) + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20) + except Exception: + font = ImageFont.load_default() + bbox = draw.textbbox((0, 0), text, font=font) + tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1] + draw.text(((300 - tw) / 2, (300 - th) / 2), text, fill="#888", font=font) + img.save(dest) + return dest + + +def generate_barcode(sku: str, dest_dir: Path) -> Path | None: + """Generate a UPC-A barcode PNG from a SKU. Returns path to the .png file.""" + digits = re.sub(r"\D", "", sku) + if not digits: + return None + # UPC-A needs exactly 11 digits (12th is check digit, auto-calculated) + digits = digits[-11:].zfill(11) + try: + upc_cls = barcode.get_barcode_class("upca") + bc = upc_cls(digits, writer=ImageWriter()) + # barcode lib appends .png automatically + out = dest_dir / f"barcode_{sku}" + saved = bc.save( + str(out), + options={ + "module_width": 0.3, + "module_height": 15.0, + "quiet_zone": 6.5, + "font_size": 10, + "text_distance": 5.0, + }, + ) + return Path(saved) + except Exception as e: + print(f" ⚠ Barcode generation failed for {sku}: {e}") + return None + +# --------------------------------------------------------------------------- +# Step 4 — Generate PDF via pandoc +# --------------------------------------------------------------------------- + +def generate_catalog_pdf(products: list[dict]) -> Path | None: + """Build a Markdown file and convert to PDF with pandoc.""" + timestamp_label = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + timestamp_file = datetime.now().strftime("%Y%m%d_%H%M%S") + + md_lines = [ + "---", + 'title: "Pokemon TCG Product Catalog — Dollar General"', + f'date: "{timestamp_label}"', + "geometry: margin=0.75in", + "fontsize: 11pt", + "---", + "", + f"**Generated**: {timestamp_label} ", + f"**Products**: {len(products)} Cards & Tins ", + "", + "\\newpage", + "", + ] + + for i, prod in enumerate(products, 1): + title = prod["title"] + sku = prod["sku"] + upc = prod["upc"] + price = prod["price"] + stock = prod["stock"] + + # Download product image + img_dest = IMAGES_DIR / f"product_{i}_{sku}.jpg" + img_path = download_image(prod.get("image_url"), img_dest) + if not img_path: + img_path = make_placeholder(IMAGES_DIR / f"product_{i}_{sku}_placeholder.png", title[:30]) + + # Generate barcode + bc_path = generate_barcode(sku, BARCODES_DIR) + + # Relative paths for pandoc (run from OUTPUT_DIR) + rel_img = os.path.relpath(img_path, OUTPUT_DIR) + rel_bc = os.path.relpath(bc_path, OUTPUT_DIR) if bc_path else None + + md_lines += [ + f"## {i}. {title}", + "", + f"![{title}]({rel_img}){{ width=200px }}", + "", + "| Field | Value |", + "|-------|-------|", + f"| **Price** | {price} |", + f"| **Stock** | {stock} |", + f"| **SKU** | `{sku}` |", + f"| **UPC** | `{upc}` |", + "", + ] + + if rel_bc: + md_lines += [ + f"![UPC-A Barcode]({rel_bc}){{ width=250px }}", + "", + ] + + md_lines += ["\\newpage", ""] + + print(f" ✅ [{i}/{len(products)}] {title}") + + # Write markdown + md_file = OUTPUT_DIR / f"pokemon_catalog_{timestamp_file}.md" + md_file.write_text("\n".join(md_lines), encoding="utf-8") + print(f"\n📝 Markdown: {md_file}") + + # Convert to PDF + pdf_file = OUTPUT_DIR / f"pokemon_catalog_{timestamp_file}.pdf" + engines = ["pdflatex", "xelatex"] + + for engine in engines: + try: + cmd = [ + "pandoc", str(md_file), + "-o", str(pdf_file), + f"--pdf-engine={engine}", + "-V", "colorlinks=true", + ] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) + if result.returncode == 0: + print(f"📄 PDF generated: {pdf_file} ({pdf_file.stat().st_size // 1024} KB)") + return pdf_file + else: + continue + except Exception: + continue + + print(f"⚠ PDF generation failed. Markdown available at: {md_file}") + return None + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + args = sys.argv[1:] + + # Handle --pdf-only mode + if "--pdf-only" in args: + idx = args.index("--pdf-only") + json_file = args[idx + 1] if idx + 1 < len(args) else None + if not json_file or not Path(json_file).exists(): + print(f"Usage: {sys.argv[0]} --pdf-only ") + sys.exit(1) + products = json.loads(Path(json_file).read_text()) + for d in [OUTPUT_DIR, IMAGES_DIR, BARCODES_DIR]: + d.mkdir(parents=True, exist_ok=True) + print(f"\n🖨️ Generating PDF from {json_file} ({len(products)} products)...") + generate_catalog_pdf(products) + return + + scrape_only = "--scrape-only" in args + + # --- Banner --- + timestamp_file = datetime.now().strftime("%Y%m%d_%H%M%S") + print("=" * 60) + print(" 🔍 Pokemon Discovery (pokemon-disco)") + print(" Dollar General — Pokemon TCG Cards & Tins") + print(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print("=" * 60) + + # --- Step 1: Extract from HAR --- + if not Path(HAR_FILE).exists(): + print(f"\n❌ HAR file not found: {HAR_FILE}") + print(" Capture a HAR file from the Pokemon page in your browser") + print(" and place it in the project directory.") + sys.exit(1) + + raw_items = extract_products_from_har(HAR_FILE) + + # --- Step 2: Filter for Cards & Tins --- + print(f"\n🎯 Filtering for card packs and tins...") + card_tin_items = filter_card_and_tin_products(raw_items) + print(f" {len(card_tin_items)} of {len(raw_items)} products match (pack/tin/booster/tcg)") + + if not card_tin_items: + print("❌ No card or tin products found.") + sys.exit(1) + + # Show what was filtered out + excluded = [i for i in raw_items if i not in card_tin_items] + if excluded: + print(f"\n Excluded {len(excluded)} non-card/tin products:") + for item in excluded: + print(f" ✗ {item.get('Description', '?')}") + + # --- Step 3: Normalize --- + print(f"\n📋 Processing {len(card_tin_items)} products...") + products = [normalize_product(item) for item in card_tin_items] + + # Print summary table + print() + print(f" {'#':<3} {'Title':<55} {'SKU':<12} {'Price':<8} {'Stock'}") + print(f" {'—'*3} {'—'*55} {'—'*12} {'—'*8} {'—'*15}") + for i, p in enumerate(products, 1): + title = p['title'][:53] + print(f" {i:<3} {title:<55} {p['sku']:<12} {p['price']:<8} {p['stock']}") + + # --- Step 4: Save JSON --- + json_file = f"pokemon_tcg_products_{timestamp_file}.json" + Path(json_file).write_text(json.dumps(products, indent=2, ensure_ascii=False)) + print(f"\n💾 Product data: {json_file}") + + if scrape_only: + print("\n✅ Scrape complete (--scrape-only). Run with --pdf-only to generate catalog.") + return + + # --- Step 5: Generate PDF --- + for d in [OUTPUT_DIR, IMAGES_DIR, BARCODES_DIR]: + d.mkdir(parents=True, exist_ok=True) + + print(f"\n🖨️ Generating PDF catalog...") + pdf_path = generate_catalog_pdf(products) + + # --- Done --- + print("\n" + "=" * 60) + if pdf_path: + print(f" ✅ COMPLETE!") + print(f" 📄 PDF Catalog: {pdf_path}") + print(f" 💾 Product JSON: {json_file}") + print(f" 🏷️ Barcodes: {BARCODES_DIR}/") + print(f" 🖼️ Images: {IMAGES_DIR}/") + else: + print(f" ⚠ PDF generation failed — markdown file available in {OUTPUT_DIR}/") + print(f" 💾 Product JSON: {json_file}") + print("=" * 60) + + +if __name__ == "__main__": + main()