Add disco.py: single working script that finds all pack/tin products and generates PDF
Extracts all 12 Pokemon products from HAR API responses, filters to 6 card pack and tin products, downloads product images, generates UPC-A barcodes, and produces a 157KB PDF catalog. Products found: 1. Pokémon Trading Card Game, 15 Card Pack (In Stock) 2. Pokémon TCG Booster Pack with Promo Card & Coin 3. Pokemon Trading Card Game Sword & Shield Booster Pack 4. Pokémon Collectible Stacking Tin 5. Pokémon Trading Card Game Mini Tin 6. Pokémon Trading Card Game, Gardevoir Strong Bond Tin
This commit is contained in:
419
disco.py
Normal file
419
disco.py
Normal file
@@ -0,0 +1,419 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Pokemon Discovery (disco.py)
|
||||||
|
Scrapes Pokemon TCG pack & tin products from Dollar General and generates a PDF catalog.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python disco.py # Full run: scrape + generate PDF
|
||||||
|
python disco.py --scrape-only # Just scrape, output JSON
|
||||||
|
python disco.py --pdf-only FILE.json # Just generate PDF from existing JSON
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import requests
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urljoin, quote
|
||||||
|
|
||||||
|
import barcode
|
||||||
|
from barcode.writer import ImageWriter
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Configuration
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
HAR_FILE = "www.dollargeneral.com_Archive [26-03-21 15-14-28].har"
|
||||||
|
BASE_URL = "https://www.dollargeneral.com"
|
||||||
|
OUTPUT_DIR = Path("catalog_output")
|
||||||
|
IMAGES_DIR = OUTPUT_DIR / "images"
|
||||||
|
BARCODES_DIR = OUTPUT_DIR / "barcodes"
|
||||||
|
|
||||||
|
HEADERS = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:148.0) Gecko/20100101 Firefox/148.0",
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Keywords that identify card packs and tins (case-insensitive)
|
||||||
|
CARD_TIN_KEYWORDS = ["pack", "tin", "booster", "card game", "tcg"]
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Step 1 — Product Discovery (from HAR file API responses)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def extract_products_from_har(har_path: str) -> list[dict]:
    """Parse a HAR capture and return the unique products found in it.

    Only responses to ``POST`` requests whose URL is exactly the
    category-search API endpoint are inspected. Response bodies that are
    empty, not valid JSON, or not shaped like
    ``{"ItemList": {"Items": [...]}}`` are skipped rather than treated as
    errors, because a capture normally contains many unrelated entries.

    Args:
        har_path: Path to the HAR (JSON) file exported from the browser.

    Returns:
        The raw API item dicts, de-duplicated by UPC, in first-seen order.

    Raises:
        OSError: If the HAR file cannot be opened.
        json.JSONDecodeError: If the HAR file itself is not valid JSON.
        KeyError: If the HAR lacks the ``log`` / ``entries`` / ``request``
            structure.
    """
    # Local import: only this function needs it, and it keeps the block
    # self-contained with respect to the file's import header.
    import base64

    print(f"📦 Reading HAR file: {har_path}")

    with open(har_path, "r", encoding="utf-8") as f:
        har = json.load(f)

    api_url = "https://dggo.dollargeneral.com/omni/api/v2/category/search/provider"
    unique: dict[str, dict] = {}

    for entry in har["log"]["entries"]:
        req = entry["request"]
        if req["url"] != api_url or req["method"] != "POST":
            continue

        content = entry["response"].get("content") or {}
        text = content.get("text") or ""
        if not text:
            continue

        try:
            # HAR stores some bodies base64-encoded and flags them via
            # content.encoding; decode those before parsing.
            if content.get("encoding") == "base64":
                text = base64.b64decode(text).decode("utf-8")
            data = json.loads(text)
        except ValueError:
            # Covers bad base64, undecodable bytes and malformed JSON
            # (binascii.Error, UnicodeDecodeError and JSONDecodeError are
            # all ValueError subclasses).
            continue

        if not isinstance(data, dict):
            continue
        item_list = data.get("ItemList")
        items = item_list.get("Items") if isinstance(item_list, dict) else None

        for item in items or []:
            if not isinstance(item, dict):
                continue
            raw_upc = item.get("UPC")
            if raw_upc is None:
                # str(None) would create a bogus "None" UPC key.
                continue
            upc = str(raw_upc)
            if upc and upc not in unique:
                unique[upc] = item

    print(f" Found {len(unique)} unique products in HAR data")
    return list(unique.values())
|
||||||
|
|
||||||
|
|
||||||
|
def rootsv_to_sku(rootsv: str) -> str:
    """Collapse a rootSV identifier into a SKU string.

    The value is split on ``_``; leading zeros are stripped from the first
    piece and the second piece (if any) is appended unchanged. Any further
    pieces are ignored. For example ``'0419363_1'`` becomes ``'4193631'``.
    An empty or missing value yields ``''``.

    NOTE(review): the previous docstring gave ``'41936301'`` as the result
    for ``'0419363_1'`` (a zero-padded suffix), but the code has never
    padded the suffix — confirm which SKU format is actually wanted.
    """
    if not rootsv:
        return ""
    head, *tail = rootsv.split("_")
    trailing = tail[0] if tail else ""
    return head.lstrip("0") + trailing
|
||||||
|
|
||||||
|
|
||||||
|
def build_product_url(upc: str) -> str:
    """Return the product-page URL for *upc*.

    The URL is BASE_URL followed by the fixed ``/p/pokemon-product/`` path
    segment and the UPC; no validation or escaping is applied to the UPC.
    """
    return "{}/p/pokemon-product/{}".format(BASE_URL, upc)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_card_and_tin_products(raw_items: list[dict]) -> list[dict]:
    """Keep only products whose description mentions a card/pack/tin keyword.

    Matching is a case-insensitive *substring* test against every entry of
    CARD_TIN_KEYWORDS, so a keyword also matches inside longer words.

    Args:
        raw_items: Raw API item dicts; each may carry a ``Description``.

    Returns:
        The matching items (the same dict objects), in their input order.
        Items with a missing or null description never match.
    """
    filtered = []
    for item in raw_items:
        # A JSON null "Description" arrives as None, which has no .lower();
        # treat it the same as a missing description.
        desc = str(item.get("Description") or "").lower()
        if any(kw in desc for kw in CARD_TIN_KEYWORDS):
            filtered.append(item)
    return filtered
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_product(item: dict) -> dict:
    """Convert a raw API item into the flat product dict used by the catalog.

    Args:
        item: One raw item dict from the category-search API response.

    Returns:
        A dict with the keys ``title``, ``sku``, ``upc``, ``price``,
        ``stock``, ``quantity``, ``image_url``, ``rating``, ``reviews`` and
        ``url``. Missing or null fields fall back to neutral defaults:
        ``"Unknown Product"`` for the title, ``""`` for the UPC and
        ``"$0.00"`` for a price that is absent or not numeric.
    """
    raw_upc = item.get("UPC")
    # str(None) would yield the literal text "None" as a UPC.
    upc = "" if raw_upc is None else str(raw_upc)
    sku = rootsv_to_sku(item.get("rootSV", ""))
    qty = item.get("AvailableQty", 0)

    try:
        # float() accepts ints, floats and numeric strings; a null price is
        # treated as 0 instead of crashing the ":.2f" format.
        price = f"${float(item.get('Price') or 0):.2f}"
    except (TypeError, ValueError):
        price = "$0.00"

    return {
        "title": item.get("Description") or "Unknown Product",
        "sku": sku,
        "upc": upc,
        "price": price,
        "stock": f"In Stock ({qty})" if qty and qty > 0 else "Out of Stock",
        "quantity": qty,
        "image_url": item.get("Image", ""),
        "rating": item.get("AverageRating", 0),
        "reviews": item.get("RatingReviewCount", 0),
        "url": build_product_url(upc),
    }
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Step 2 — Enrich from product pages (get real URL slug, extra details)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def enrich_from_product_page(product: dict) -> dict:
    """Best-effort enrichment of a product dict with its real URL and SKU.

    Two things are attempted, neither of which is allowed to fail the run:

    1. Search the site for the product's UPC and, if the result page
       contains a ``/p/`` link that includes the UPC, store it (made
       absolute) as ``product["url"]``.
    2. If the product has no SKU yet, take the digits from a
       ``dg-<digits>-`` fragment of the image URL as the SKU.

    Args:
        product: A normalized product dict; must contain ``"upc"``.

    Returns:
        The same dict object, updated in place.
    """
    upc = product["upc"]

    if upc:
        # Percent-encode the UPC so unexpected characters cannot alter the
        # query string.
        search_url = f"{BASE_URL}/search?q={quote(str(upc), safe='')}"
        try:
            resp = requests.get(search_url, headers=HEADERS, timeout=15)
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.text, "html.parser")
                # Look for the canonical product link.
                links = soup.select(f'a[href*="/p/"][href*="{upc}"]')
                if links:
                    href = links[0].get("href", "")
                    product["url"] = urljoin(BASE_URL, href)
        except Exception as e:
            # Enrichment is optional: report the problem and keep the
            # constructed fallback URL instead of aborting.
            print(f"  ⚠ Product page lookup failed for {upc}: {e}")

    # The image URL contains the DG item number: dg-XXXXXXXX-1
    if not product.get("sku"):
        img_url = product.get("image_url") or ""
        match = re.search(r"dg-(\d+)-", img_url)
        if match:
            product["sku"] = match.group(1)

    return product
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Step 3 — Download images & generate barcodes
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def download_image(url: str, dest: Path) -> Path | None:
|
||||||
|
"""Download image from URL, return local path or None."""
|
||||||
|
if not url:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
resp = requests.get(url, headers=HEADERS, timeout=15)
|
||||||
|
resp.raise_for_status()
|
||||||
|
dest.write_bytes(resp.content)
|
||||||
|
return dest
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ⚠ Image download failed: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def make_placeholder(dest: Path, text: str = "No Image") -> Path:
    """Write a 300x300 grey image with *text* centred on it to *dest*.

    The DejaVu Sans TrueType font is used when it can be loaded from its
    usual Linux location; otherwise PIL's default font is used. Returns
    *dest*.
    """
    side = 300
    canvas = Image.new("RGB", (side, side), "#e0e0e0")
    pen = ImageDraw.Draw(canvas)

    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20)
    except Exception:
        font = ImageFont.load_default()

    # Measure the rendered text so it can be centred on the canvas.
    left, top, right, bottom = pen.textbbox((0, 0), text, font=font)
    text_w = right - left
    text_h = bottom - top
    origin = ((side - text_w) / 2, (side - text_h) / 2)

    pen.text(origin, text, fill="#888", font=font)
    canvas.save(dest)
    return dest
|
||||||
|
|
||||||
|
|
||||||
|
def generate_barcode(sku: str, dest_dir: Path) -> Path | None:
|
||||||
|
"""Generate a UPC-A barcode PNG from a SKU. Returns path to the .png file."""
|
||||||
|
digits = re.sub(r"\D", "", sku)
|
||||||
|
if not digits:
|
||||||
|
return None
|
||||||
|
# UPC-A needs exactly 11 digits (12th is check digit, auto-calculated)
|
||||||
|
digits = digits[-11:].zfill(11)
|
||||||
|
try:
|
||||||
|
upc_cls = barcode.get_barcode_class("upca")
|
||||||
|
bc = upc_cls(digits, writer=ImageWriter())
|
||||||
|
# barcode lib appends .png automatically
|
||||||
|
out = dest_dir / f"barcode_{sku}"
|
||||||
|
saved = bc.save(
|
||||||
|
str(out),
|
||||||
|
options={
|
||||||
|
"module_width": 0.3,
|
||||||
|
"module_height": 15.0,
|
||||||
|
"quiet_zone": 6.5,
|
||||||
|
"font_size": 10,
|
||||||
|
"text_distance": 5.0,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return Path(saved)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ⚠ Barcode generation failed for {sku}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Step 4 — Generate PDF via pandoc
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def generate_catalog_pdf(products: list[dict]) -> Path | None:
    """Build a Markdown catalog for *products* and convert it to PDF.

    For every product the image is downloaded (a placeholder is drawn when
    that fails) and a barcode is generated from the SKU. One Markdown
    section per product is written to a timestamped ``.md`` file in
    OUTPUT_DIR, then pandoc is invoked with each PDF engine in turn until
    one succeeds.

    Args:
        products: Normalized product dicts. Each must provide ``title``,
            ``sku``, ``upc``, ``price`` and ``stock``; ``image_url`` is
            optional.

    Returns:
        Path of the generated PDF, or None if every pandoc attempt failed
        (the Markdown file is still left on disk).
    """
    # One clock reading so the label and the file name can never disagree.
    now = datetime.now()
    timestamp_label = now.strftime("%Y-%m-%d %H:%M:%S")
    timestamp_file = now.strftime("%Y%m%d_%H%M%S")

    md_lines = [
        "---",
        'title: "Pokemon TCG Product Catalog — Dollar General"',
        f'date: "{timestamp_label}"',
        "geometry: margin=0.75in",
        "fontsize: 11pt",
        "---",
        "",
        # Two trailing spaces = Markdown hard line break; without them the
        # two lines collapse into a single paragraph.
        f"**Generated**: {timestamp_label}  ",
        f"**Products**: {len(products)} Cards & Tins  ",
        "",
        "\\newpage",
        "",
    ]

    for i, prod in enumerate(products, 1):
        title = prod["title"]
        sku = prod["sku"]
        upc = prod["upc"]
        price = prod["price"]
        stock = prod["stock"]

        # Download product image, falling back to a drawn placeholder.
        img_dest = IMAGES_DIR / f"product_{i}_{sku}.jpg"
        img_path = download_image(prod.get("image_url"), img_dest)
        if not img_path:
            img_path = make_placeholder(
                IMAGES_DIR / f"product_{i}_{sku}_placeholder.png", title[:30]
            )

        # Generate barcode
        bc_path = generate_barcode(sku, BARCODES_DIR)

        # Paths relative to OUTPUT_DIR (pandoc is run from there), with
        # forward slashes because a backslash is a Markdown escape.
        rel_img = Path(os.path.relpath(img_path, OUTPUT_DIR)).as_posix()
        rel_bc = (
            Path(os.path.relpath(bc_path, OUTPUT_DIR)).as_posix() if bc_path else None
        )

        md_lines += [
            f"## {i}. {title}",
            "",
            # Empty alt text keeps pandoc from turning the image into a
            # captioned figure.
            f"![]({rel_img}){{ width=200px }}",
            "",
            "| Field | Value |",
            "|-------|-------|",
            f"| **Price** | {price} |",
            f"| **Stock** | {stock} |",
            f"| **SKU** | `{sku}` |",
            f"| **UPC** | `{upc}` |",
            "",
        ]

        if rel_bc:
            md_lines += [
                f"![]({rel_bc}){{ width=250px }}",
                "",
            ]

        md_lines += ["\\newpage", ""]

        print(f" ✅ [{i}/{len(products)}] {title}")

    # Write markdown
    md_file = OUTPUT_DIR / f"pokemon_catalog_{timestamp_file}.md"
    md_file.write_text("\n".join(md_lines), encoding="utf-8")
    print(f"\n📝 Markdown: {md_file}")

    # Convert to PDF, trying each engine until one works.
    pdf_file = OUTPUT_DIR / f"pokemon_catalog_{timestamp_file}.pdf"
    last_error = ""

    for engine in ("pdflatex", "xelatex"):
        cmd = [
            "pandoc", md_file.name,
            "-o", pdf_file.name,
            f"--pdf-engine={engine}",
            "-V", "colorlinks=true",
        ]
        try:
            # Run inside OUTPUT_DIR so the relative image paths written
            # into the Markdown resolve.
            result = subprocess.run(
                cmd,
                cwd=str(OUTPUT_DIR),
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                timeout=60,
            )
        except (OSError, subprocess.SubprocessError) as e:
            # pandoc missing, not executable, or timed out.
            last_error = f"{engine}: {e}"
            continue

        if result.returncode == 0:
            print(f"📄 PDF generated: {pdf_file} ({pdf_file.stat().st_size // 1024} KB)")
            return pdf_file
        last_error = f"{engine}: {result.stderr.strip()}"

    print(f"⚠ PDF generation failed. Markdown available at: {md_file}")
    if last_error:
        print(f"  Last pandoc error ({last_error})")
    return None
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Modes, selected from ``sys.argv``:

    * ``--pdf-only FILE.json``: load previously saved products and only
      generate the PDF catalog.
    * ``--scrape-only``: extract, filter and normalize products from the
      HAR file, save them as JSON, and stop.
    * no flag: do both — save the JSON, then generate the PDF.

    Exits with status 1 when the ``--pdf-only`` file is missing, when the
    HAR file is missing, or when no card/tin product is found.
    """
    args = sys.argv[1:]

    # Handle --pdf-only mode
    if "--pdf-only" in args:
        idx = args.index("--pdf-only")
        json_file = args[idx + 1] if idx + 1 < len(args) else None
        if not json_file or not Path(json_file).exists():
            print(f"Usage: {sys.argv[0]} --pdf-only <products.json>")
            sys.exit(1)
        # The JSON is written with ensure_ascii=False, so read it back as
        # UTF-8 explicitly instead of relying on the platform default.
        products = json.loads(Path(json_file).read_text(encoding="utf-8"))
        for d in (OUTPUT_DIR, IMAGES_DIR, BARCODES_DIR):
            d.mkdir(parents=True, exist_ok=True)
        print(f"\n🖨️ Generating PDF from {json_file} ({len(products)} products)...")
        generate_catalog_pdf(products)
        return

    scrape_only = "--scrape-only" in args

    # --- Banner ---
    timestamp_file = datetime.now().strftime("%Y%m%d_%H%M%S")
    print("=" * 60)
    print(" 🔍 Pokemon Discovery (pokemon-disco)")
    print(" Dollar General — Pokemon TCG Cards & Tins")
    print(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    # --- Step 1: Extract from HAR ---
    if not Path(HAR_FILE).exists():
        print(f"\n❌ HAR file not found: {HAR_FILE}")
        print(" Capture a HAR file from the Pokemon page in your browser")
        print(" and place it in the project directory.")
        sys.exit(1)

    raw_items = extract_products_from_har(HAR_FILE)

    # --- Step 2: Filter for Cards & Tins ---
    print("\n🎯 Filtering for card packs and tins...")
    card_tin_items = filter_card_and_tin_products(raw_items)
    print(f" {len(card_tin_items)} of {len(raw_items)} products match (pack/tin/booster/tcg)")

    if not card_tin_items:
        print("❌ No card or tin products found.")
        sys.exit(1)

    # Show what was filtered out. The filter returns the same dict objects,
    # so identity is enough to tell kept items from excluded ones.
    kept_ids = {id(item) for item in card_tin_items}
    excluded = [item for item in raw_items if id(item) not in kept_ids]
    if excluded:
        print(f"\n Excluded {len(excluded)} non-card/tin products:")
        for item in excluded:
            print(f" ✗ {item.get('Description', '?')}")

    # --- Step 3: Normalize ---
    print(f"\n📋 Processing {len(card_tin_items)} products...")
    products = [normalize_product(item) for item in card_tin_items]

    # Print summary table
    print()
    print(f" {'#':<3} {'Title':<55} {'SKU':<12} {'Price':<8} {'Stock'}")
    print(f" {'—'*3} {'—'*55} {'—'*12} {'—'*8} {'—'*15}")
    for i, p in enumerate(products, 1):
        title = p['title'][:53]
        print(f" {i:<3} {title:<55} {p['sku']:<12} {p['price']:<8} {p['stock']}")

    # --- Step 4: Save JSON ---
    json_file = f"pokemon_tcg_products_{timestamp_file}.json"
    # Product titles contain non-ASCII characters and are written raw
    # (ensure_ascii=False), so the encoding must not depend on the locale.
    Path(json_file).write_text(
        json.dumps(products, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    print(f"\n💾 Product data: {json_file}")

    if scrape_only:
        print("\n✅ Scrape complete (--scrape-only). Run with --pdf-only to generate catalog.")
        return

    # --- Step 5: Generate PDF ---
    for d in (OUTPUT_DIR, IMAGES_DIR, BARCODES_DIR):
        d.mkdir(parents=True, exist_ok=True)

    print("\n🖨️ Generating PDF catalog...")
    pdf_path = generate_catalog_pdf(products)

    # --- Done ---
    print("\n" + "=" * 60)
    if pdf_path:
        print(" ✅ COMPLETE!")
        print(f" 📄 PDF Catalog: {pdf_path}")
        print(f" 💾 Product JSON: {json_file}")
        print(f" 🏷️ Barcodes: {BARCODES_DIR}/")
        print(f" 🖼️ Images: {IMAGES_DIR}/")
    else:
        print(f" ⚠ PDF generation failed — markdown file available in {OUTPUT_DIR}/")
        print(f" 💾 Product JSON: {json_file}")
    print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user