- Comprehensive scraper for Dollar General Pokemon TCG products
- Professional PDF catalog generator with UPC-A barcodes
- Robust anti-bot handling with requests + Selenium fallback
- Automatic image downloading and barcode generation
- Unix-friendly timestamped filenames
- Virtual environment support and dependency management
- Complete documentation and usage guides
278 lines
9.7 KiB
Python
Executable File
278 lines
9.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Pokemon Discovery - TCG Product Catalog PDF Generator
|
|
Generates PDF catalog with product images, details, and UPC-A barcodes
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import requests
|
|
import subprocess
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import barcode
|
|
from barcode.writer import ImageWriter
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
import tempfile
|
|
import shutil
|
|
|
|
class PokemonTCGCatalogGenerator:
    """Build a Pokemon TCG product catalog as Markdown and convert it to PDF.

    Product records are loaded from a JSON file. From each record the code
    reads the keys ``title``, ``price``, ``stock``, ``sku``, ``url`` and
    ``image_url``; every one of them is optional.

    All artifacts are written below ``catalog_output/``: downloaded product
    images in ``images/``, generated barcodes in ``barcodes/`` and the
    timestamped ``.md`` / ``.pdf`` catalog files at the top level.
    """

    def __init__(self, json_file):
        """Create the output directories and load the product records.

        Args:
            json_file: Path of the JSON file with the product records. The
                rest of the class iterates the loaded value and calls
                ``.get`` on each item, so a list of objects is expected.

        Raises:
            OSError: If a directory cannot be created or the file cannot be
                read.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        self.json_file = json_file
        self.output_dir = Path("catalog_output")
        self.images_dir = self.output_dir / "images"
        self.barcodes_dir = self.output_dir / "barcodes"

        # output_dir comes first so the two sub-directories have a parent.
        for directory in (self.output_dir, self.images_dir, self.barcodes_dir):
            directory.mkdir(exist_ok=True)

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_file, 'r', encoding='utf-8') as f:
            self.products = json.load(f)

    @staticmethod
    def _safe_name(value):
        """Return *value* as text with '/' and ' ' replaced by '_'.

        Accepts any type (JSON SKUs may be numbers) by converting to ``str``
        first.
        """
        return str(value).replace('/', '_').replace(' ', '_')

    @staticmethod
    def _table_cell(value):
        """Return *value* as text that cannot break a Markdown pipe-table row."""
        return str(value).replace('|', '\\|').replace('\n', ' ')

    @staticmethod
    def _upc_base(sku):
        """Return the 11 data digits of a UPC-A code derived from *sku*.

        Non-digit characters are dropped; shorter results are left-padded
        with zeros and longer ones keep their last 11 digits. Only ASCII
        digits count, because ``str.isdigit`` would also accept characters
        such as superscripts that are not valid in a barcode.

        Returns:
            An 11-character digit string, or ``None`` when *sku* contains no
            digit at all.
        """
        digits = ''.join(ch for ch in str(sku) if '0' <= ch <= '9')
        if not digits:
            return None
        return digits[-11:].zfill(11)

    def _markdown_path(self, path):
        """Return *path* relative to the output directory, with '/' separators."""
        return Path(os.path.relpath(path, self.output_dir)).as_posix()

    def download_image(self, url, filename):
        """Download a product image into the images directory.

        Args:
            url: Image URL; a falsy value means "no image".
            filename: File name to store the image under.

        Returns:
            The ``Path`` of the stored file, or ``None`` when there is no URL
            or the download/write failed (the failure is printed).
        """
        if not url:
            return None

        filepath = self.images_dir / filename
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            # Deliberately broad: one bad image must never abort the whole
            # catalog; the caller falls back to a placeholder.
            print(f"Failed to download image {url}: {e}")
            return None
        return filepath

    def generate_upc_barcode(self, sku):
        """Render a UPC-A barcode PNG for *sku* into the barcodes directory.

        Args:
            sku: Product SKU (any type; it is converted to text).

        Returns:
            The path of the written PNG file as reported by the barcode
            library, or ``None`` when the SKU has no digits or rendering
            failed (the failure is printed).
        """
        upc_base = self._upc_base(sku)
        if upc_base is None:
            print(f"Failed to generate barcode for SKU {sku}: SKU contains no digits")
            return None

        # No extension here: save() appends the writer's own extension.
        target = self.barcodes_dir / f"barcode_{self._safe_name(sku)}"

        try:
            upc_class = barcode.get_barcode_class('upca')
            upc = upc_class(upc_base, writer=ImageWriter())

            # save() returns the full file name including the extension, so
            # the returned path is exactly the file that was written.
            return upc.save(str(target), options={
                'module_width': 0.2,
                'module_height': 15.0,
                'quiet_zone': 6.5,
                'font_size': 10,
                'text_distance': 5.0,
                'background': 'white',
                'foreground': 'black',
            })
        except Exception as e:
            # Deliberately broad: a missing barcode is preferable to an
            # aborted catalog run.
            print(f"Failed to generate barcode for SKU {sku}: {e}")
            return None

    def create_placeholder_image(self, width=300, height=200):
        """Create the "No Image Available" placeholder picture.

        Args:
            width: Image width in pixels.
            height: Image height in pixels.

        Returns:
            The ``Path`` of the written ``placeholder.png``.
        """
        line_height = 30
        font_size = 24

        img = Image.new('RGB', (width, height), color='lightgray')
        draw = ImageDraw.Draw(img)

        # Try known TrueType fonts first, then Pillow's built-in bitmap font.
        font = None
        for candidate in ('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
                          'arial.ttf'):
            try:
                font = ImageFont.truetype(candidate, font_size)
                break
            except (OSError, ImportError):
                # Font file not readable, or FreeType support unavailable.
                continue
        if font is None:
            font = ImageFont.load_default()

        lines = "No Image\nAvailable".split('\n')

        # Centre the text block vertically and each line horizontally.
        y_offset = height // 2 - (len(lines) * line_height) // 2
        for line in lines:
            bbox = draw.textbbox((0, 0), line, font=font)
            text_width = bbox[2] - bbox[0]
            x_offset = (width - text_width) // 2
            draw.text((x_offset, y_offset), line, fill='darkgray', font=font)
            y_offset += line_height

        placeholder_path = self.images_dir / "placeholder.png"
        img.save(placeholder_path)
        return placeholder_path

    def _product_section(self, index, product, image_path, barcode_path):
        """Return the Markdown section for one product.

        Args:
            index: 1-based position of the product in the catalog.
            product: The product record (mapping).
            image_path: Path of the product (or placeholder) image, or a
                falsy value to omit the image.
            barcode_path: Path of the barcode image, or a falsy value to
                omit the barcode.
        """
        cell = self._table_cell
        parts = [f"## {index}. {product.get('title', 'Unknown Product')}\n\n"]

        if image_path:
            # The {width=...} attribute only has an effect when attached to
            # an image. The alt text is left empty so pandoc keeps the
            # picture inline under its heading instead of turning it into a
            # captioned figure.
            parts.append(
                f"![]({self._markdown_path(image_path)}){{width=300px}}\n\n"
            )

        parts.extend([
            "| Field | Value |\n",
            "|-------|-------|\n",
            f"| **Title** | {cell(product.get('title', 'N/A'))} |\n",
            f"| **Price** | {cell(product.get('price', 'N/A'))} |\n",
            f"| **Stock** | {cell(product.get('stock', 'N/A'))} |\n",
            f"| **SKU** | `{product.get('sku', 'N/A')}` |\n",
            f"| **URL** | {cell(product.get('url', 'N/A'))} |\n",
            "\n",
        ])

        if barcode_path:
            parts.append("**UPC-A Barcode:**\n\n")
            parts.append(
                f"![]({self._markdown_path(barcode_path)}){{width=200px}}\n\n"
            )

        parts.append("---\n\n")
        return ''.join(parts)

    def generate_markdown(self):
        """Generate the Markdown source of the catalog.

        For every product this downloads its image (falling back to a
        placeholder), renders a barcode when a SKU is present, and appends
        one section. Progress is printed per product.

        Returns:
            The complete Markdown document as a string.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        total = len(self.products)

        header = (
            "---\n"
            'title: "Pokemon TCG Product Catalog"\n'
            f'subtitle: "Dollar General - Generated {timestamp}"\n'
            'author: "Automated Scraper"\n'
            f'date: "{timestamp}"\n'
            "geometry: margin=1in\n"
            "fontsize: 11pt\n"
            "documentclass: article\n"
            "---\n"
            "\n"
            "# Pokemon TCG Product Catalog\n"
            "\n"
            f"Generated on: {timestamp}\n"
            "Source: Dollar General\n"
            f"Total Products: {total}\n"
            "\n"
            "---\n"
            "\n"
        )

        sections = [header]
        placeholder = None  # drawn lazily, at most once per catalog

        for i, product in enumerate(self.products, 1):
            print(f"Processing product {i}/{total}: {product.get('title', 'Unknown')}")

            sku = product.get('sku')

            image_path = None
            image_url = product.get('image_url')
            if image_url:
                filename = f"product_{i}_{self._safe_name(sku if sku else 'unknown')}.jpg"
                image_path = self.download_image(image_url, filename)

            if not image_path:
                if placeholder is None:
                    placeholder = self.create_placeholder_image()
                image_path = placeholder

            barcode_path = self.generate_upc_barcode(sku) if sku else None

            sections.append(
                self._product_section(i, product, image_path, barcode_path)
            )

        return ''.join(sections)

    def _run_pandoc(self, markdown_file, pdf_file, engine, extra_args):
        """Run pandoc once with the given PDF engine.

        Raises:
            subprocess.CalledProcessError: If pandoc exits with an error.
            FileNotFoundError: If the pandoc executable is not installed.
        """
        subprocess.run([
            'pandoc',
            str(markdown_file),
            '-o', str(pdf_file),
            # Image links are relative to output_dir, but pandoc resolves
            # resources against its working directory unless told otherwise.
            f'--resource-path={self.output_dir}',
            f'--pdf-engine={engine}',
            *extra_args,
        ], check=True)

    def generate_pdf(self):
        """Generate the PDF catalog using pandoc.

        Writes the Markdown file first, then converts it with xelatex and,
        if that fails, retries with pdflatex.

        Returns:
            The ``Path`` of the PDF, or ``None`` when pandoc is missing or
            both engines failed (the Markdown file is kept in either case).
        """
        print("Generating markdown content...")
        markdown_content = self.generate_markdown()

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        markdown_file = self.output_dir / f"pokemon_tcg_catalog_{timestamp}.md"
        pdf_file = self.output_dir / f"pokemon_tcg_catalog_{timestamp}.pdf"

        with open(markdown_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        print(f"Markdown saved to: {markdown_file}")
        print("Converting to PDF using pandoc...")

        try:
            try:
                self._run_pandoc(markdown_file, pdf_file, 'xelatex', [
                    '-V', 'colorlinks=true',
                    '-V', 'linkcolor=blue',
                    '-V', 'filecolor=magenta',
                    '-V', 'urlcolor=cyan',
                    '--toc',
                    '--toc-depth=2',
                ])
            except subprocess.CalledProcessError as e:
                print(f"Pandoc conversion failed: {e}")
                print("Trying with pdflatex instead...")
                # A failure here propagates to the outer handlers below.
                self._run_pandoc(markdown_file, pdf_file, 'pdflatex', ['--toc'])
        except subprocess.CalledProcessError as e2:
            print(f"PDF generation failed with both engines: {e2}")
            print(f"Markdown file available at: {markdown_file}")
            return None
        except FileNotFoundError:
            print("Error: pandoc not found. Please install pandoc to generate PDF.")
            print(f"Markdown file available at: {markdown_file}")
            return None

        print(f"PDF generated successfully: {pdf_file}")
        return pdf_file
|
|
|
|
def main():
    """Command-line entry point: build the catalog for the JSON file in argv[1].

    Exits with status 1 on a usage error, when the JSON file is missing,
    unreadable or malformed, and when no PDF could be produced, so that
    calling scripts can detect the failure.
    """
    if len(sys.argv) != 2:
        print("Usage: python3 pdf_generator.py <json_file>")
        print("Example: python3 pdf_generator.py pokemon_tcg_products_20241221_143025.json")
        sys.exit(1)

    json_file = sys.argv[1]

    if not os.path.exists(json_file):
        print(f"Error: JSON file '{json_file}' not found")
        sys.exit(1)

    try:
        generator = PokemonTCGCatalogGenerator(json_file)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error: could not load JSON file '{json_file}': {e}")
        sys.exit(1)

    pdf_file = generator.generate_pdf()

    if not pdf_file:
        print(f"\nPDF generation failed, but markdown file is available in: {generator.output_dir}")
        sys.exit(1)

    print("\nCatalog generation completed!")
    print(f"PDF file: {pdf_file}")
    print(f"Output directory: {generator.output_dir}")
|
|
|
|
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()