Initial commit: Pokemon Discovery - TCG product scraper and PDF catalog generator
- Comprehensive scraper for Dollar General Pokemon TCG products - Professional PDF catalog generator with UPC-A barcodes - Robust anti-bot handling with requests + Selenium fallback - Automatic image downloading and barcode generation - Unix-friendly timestamped filenames - Virtual environment support and dependency management - Complete documentation and usage guides
This commit is contained in:
278
pdf_generator.py
Executable file
278
pdf_generator.py
Executable file
@@ -0,0 +1,278 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Pokemon Discovery - TCG Product Catalog PDF Generator
|
||||
Generates PDF catalog with product images, details, and UPC-A barcodes
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import requests
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
import barcode
|
||||
from barcode.writer import ImageWriter
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
class PokemonTCGCatalogGenerator:
    """Generate a Markdown/PDF catalog from a JSON file of products.

    Each product is read with ``.get`` for the keys ``title``, ``price``,
    ``stock``, ``sku``, ``url`` and ``image_url``. All output is written
    beneath ``catalog_output/`` in the current working directory, with
    downloaded images in ``images/`` and generated barcodes in ``barcodes/``.
    """

    def __init__(self, json_file):
        """Create the output directories and load the products from *json_file*.

        Raises:
            OSError: if the file cannot be opened or a directory cannot be
                created.
            ValueError: if the file is not valid UTF-8 encoded JSON.
        """
        self.json_file = json_file
        self.output_dir = Path("catalog_output")
        self.images_dir = self.output_dir / "images"
        self.barcodes_dir = self.output_dir / "barcodes"

        # Create output directories
        for directory in (self.output_dir, self.images_dir, self.barcodes_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Load product data. JSON is UTF-8 by specification, so do not rely
        # on the platform's default encoding.
        with open(json_file, 'r', encoding='utf-8') as f:
            self.products = json.load(f)

    @staticmethod
    def _safe_filename_part(value):
        """Return *value* as text with '/' and ' ' replaced by '_'."""
        return str(value).replace('/', '_').replace(' ', '_')

    @staticmethod
    def _table_cell(value):
        """Return *value* as single-line text with '|' escaped for a pipe table."""
        text = ' '.join(str(value).splitlines())
        return text.replace('|', '\\|')

    @staticmethod
    def _upc_check_digit(payload):
        """Return the UPC-A check digit for an 11-digit string."""
        odd_sum = sum(int(ch) for ch in payload[0::2])
        even_sum = sum(int(ch) for ch in payload[1::2])
        return (10 - (odd_sum * 3 + even_sum) % 10) % 10

    @classmethod
    def _upc_payload(cls, sku):
        """Return the 11 data digits to encode for *sku*, or None if it has no digits.

        A 12-digit value whose last digit is a correct UPC-A check digit is
        treated as a complete UPC and keeps its first 11 digits. Anything
        else is zero-padded on the left, or cut to its last 11 digits.
        """
        digits_only = ''.join(ch for ch in str(sku) if '0' <= ch <= '9')
        if not digits_only:
            return None

        if (len(digits_only) == 12
                and cls._upc_check_digit(digits_only[:11]) == int(digits_only[11])):
            return digits_only[:11]

        if len(digits_only) < 11:
            # Pad with zeros at the start
            return digits_only.zfill(11)

        # Take the last 11 digits
        return digits_only[-11:]

    def download_image(self, url, filename):
        """Download *url* into the images directory as *filename*.

        Returns:
            The path of the saved file, or None when *url* is empty or the
            download/write fails (the failure is printed, not raised).
        """
        if not url:
            return None

        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            filepath = self.images_dir / filename
            with open(filepath, 'wb') as f:
                f.write(response.content)

            return filepath
        except Exception as e:
            # Deliberately best-effort: one bad image must not abort the run.
            print(f"Failed to download image {url}: {e}")
            return None

    def generate_upc_barcode(self, sku):
        """Generate a UPC-A barcode PNG for *sku* in the barcodes directory.

        Returns:
            The path of the written PNG as a string, or None when the SKU
            contains no digits or generation fails (the failure is printed,
            not raised).
        """
        try:
            # The barcode class receives 11 digits and appends the check digit.
            upc_base = self._upc_payload(sku)
            if upc_base is None:
                print(f"Failed to generate barcode for SKU {sku}: SKU contains no digits")
                return None

            # Generate UPC-A barcode
            upc_generator = barcode.get_barcode_class('upca')
            upc = upc_generator(upc_base, writer=ImageWriter())

            # save() appends the ".png" extension itself, so pass the path
            # without one and report the name that is actually written.
            barcode_stem = self.barcodes_dir / f"barcode_{self._safe_filename_part(sku)}"

            # Save with specific options for better appearance
            upc.save(str(barcode_stem), options={
                'module_width': 0.2,
                'module_height': 15.0,
                'quiet_zone': 6.5,
                'font_size': 10,
                'text_distance': 5.0,
                'background': 'white',
                'foreground': 'black'
            })

            return f"{barcode_stem}.png"

        except Exception as e:
            # Deliberately best-effort: a product without a barcode is still listed.
            print(f"Failed to generate barcode for SKU {sku}: {e}")
            return None

    def create_placeholder_image(self, width=300, height=200):
        """Create a "No Image Available" placeholder and return its path.

        The image is always written to ``placeholder.png`` in the images
        directory, replacing any previous placeholder.
        """
        img = Image.new('RGB', (width, height), color='lightgray')
        draw = ImageDraw.Draw(img)

        try:
            # Try to use a system font
            font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 24)
        except OSError:
            try:
                font = ImageFont.truetype('arial.ttf', 24)
            except OSError:
                font = ImageFont.load_default()

        text = "No Image\nAvailable"

        # Centre the lines vertically as a group, using a fixed 30px line height
        lines = text.split('\n')
        y_offset = height // 2 - (len(lines) * 30) // 2

        for line in lines:
            # Centre each line horizontally using its rendered width
            bbox = draw.textbbox((0, 0), line, font=font)
            text_width = bbox[2] - bbox[0]
            x_offset = (width - text_width) // 2
            draw.text((x_offset, y_offset), line, fill='darkgray', font=font)
            y_offset += 30

        placeholder_path = self.images_dir / "placeholder.png"
        img.save(placeholder_path)
        return placeholder_path

    def _product_section(self, index, product, image_path, barcode_path):
        """Return the markdown section for one product.

        Image and barcode paths are written relative to the output directory,
        with forward slashes.
        """
        cell = self._table_cell
        parts = [f"## {index}. {product.get('title', 'Unknown Product')}\n\n"]

        # Product image
        if image_path:
            rel_image_path = Path(os.path.relpath(image_path, self.output_dir)).as_posix()
            parts.append(f"![Product image]({rel_image_path}){{width=300px}}\n\n")

        # Product details in a table
        parts.append("| Field | Value |\n")
        parts.append("|-------|-------|\n")
        parts.append(f"| **Title** | {cell(product.get('title', 'N/A'))} |\n")
        parts.append(f"| **Price** | {cell(product.get('price', 'N/A'))} |\n")
        parts.append(f"| **Stock** | {cell(product.get('stock', 'N/A'))} |\n")
        parts.append(f"| **SKU** | `{product.get('sku', 'N/A')}` |\n")
        parts.append(f"| **URL** | {cell(product.get('url', 'N/A'))} |\n")
        parts.append("\n")

        # Barcode
        if barcode_path:
            rel_barcode_path = Path(os.path.relpath(barcode_path, self.output_dir)).as_posix()
            parts.append("**UPC-A Barcode:**\n\n")
            parts.append(f"![UPC-A barcode]({rel_barcode_path}){{width=200px}}\n\n")

        parts.append("---\n\n")
        return ''.join(parts)

    def generate_markdown(self):
        """Generate the markdown for the catalog and return it as a string.

        For every product this downloads its image (falling back to a shared
        placeholder) and generates a barcode when a SKU is present.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        header = f"""---
title: "Pokemon TCG Product Catalog"
subtitle: "Dollar General - Generated {timestamp}"
author: "Automated Scraper"
date: "{timestamp}"
geometry: margin=1in
fontsize: 11pt
documentclass: article
---

# Pokemon TCG Product Catalog

- Generated on: {timestamp}
- Source: Dollar General
- Total Products: {len(self.products)}

---

"""

        sections = [header]
        total = len(self.products)
        placeholder_path = None  # created once, on first use

        for i, product in enumerate(self.products, 1):
            print(f"Processing product {i}/{total}: {product.get('title', 'Unknown')}")

            # Download product image
            image_path = None
            if product.get('image_url'):
                sku_part = self._safe_filename_part(product.get('sku') or 'unknown')
                filename = f"product_{i}_{sku_part}.jpg"
                image_path = self.download_image(product.get('image_url'), filename)

            if not image_path:
                # Use placeholder
                if placeholder_path is None:
                    placeholder_path = self.create_placeholder_image()
                image_path = placeholder_path

            # Generate barcode
            barcode_path = None
            if product.get('sku'):
                barcode_path = self.generate_upc_barcode(product.get('sku'))

            # Add product section to markdown
            sections.append(self._product_section(i, product, image_path, barcode_path))

        return ''.join(sections)

    def generate_pdf(self):
        """Write the catalog markdown and convert it to PDF using pandoc.

        xelatex is tried first and pdflatex second.

        Returns:
            The path of the generated PDF, or None when pandoc is missing or
            both engines fail (the markdown file is kept in either case).
        """
        print("Generating markdown content...")
        markdown_content = self.generate_markdown()

        # Save markdown file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        markdown_file = self.output_dir / f"pokemon_tcg_catalog_{timestamp}.md"

        with open(markdown_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        print(f"Markdown saved to: {markdown_file}")

        # Generate PDF using pandoc
        pdf_file = self.output_dir / f"pokemon_tcg_catalog_{timestamp}.pdf"

        print("Converting to PDF using pandoc...")

        # Image paths in the markdown are relative to the output directory,
        # so pandoc must search there rather than in the working directory.
        base_command = [
            'pandoc',
            str(markdown_file),
            '-o', str(pdf_file),
            f'--resource-path={self.output_dir}',
        ]
        xelatex_args = [
            '--pdf-engine=xelatex',
            '-V', 'colorlinks=true',
            '-V', 'linkcolor=blue',
            '-V', 'filecolor=magenta',
            '-V', 'urlcolor=cyan',
            '--toc',
            '--toc-depth=2',
        ]
        pdflatex_args = [
            '--pdf-engine=pdflatex',
            '--toc',
        ]

        try:
            try:
                subprocess.run(base_command + xelatex_args, check=True)
            except subprocess.CalledProcessError as e:
                print(f"Pandoc conversion failed: {e}")
                print("Trying with pdflatex instead...")
                # A failure here propagates to the outer handler below.
                subprocess.run(base_command + pdflatex_args, check=True)
        except subprocess.CalledProcessError as e2:
            print(f"PDF generation failed with both engines: {e2}")
            print(f"Markdown file available at: {markdown_file}")
            return None
        except FileNotFoundError:
            print("Error: pandoc not found. Please install pandoc to generate PDF.")
            print(f"Markdown file available at: {markdown_file}")
            return None

        print(f"PDF generated successfully: {pdf_file}")
        return pdf_file
|
||||
|
||||
def main():
    """Command-line entry point: build the catalog for the JSON file in argv[1].

    Exits with status 1 on wrong usage, when the JSON file is missing or
    cannot be loaded, or when no PDF could be produced.
    """
    if len(sys.argv) != 2:
        print("Usage: python3 pdf_generator.py <json_file>")
        print("Example: python3 pdf_generator.py pokemon_tcg_products_20241221_143025.json")
        sys.exit(1)

    json_file = sys.argv[1]

    # isfile() rather than exists(): a directory cannot be loaded as JSON.
    if not os.path.isfile(json_file):
        print(f"Error: JSON file '{json_file}' not found")
        sys.exit(1)

    try:
        generator = PokemonTCGCatalogGenerator(json_file)
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"Error: could not load '{json_file}': {e}")
        sys.exit(1)

    pdf_file = generator.generate_pdf()

    if not pdf_file:
        print(f"\nPDF generation failed, but markdown file is available in: {generator.output_dir}")
        # Report the failure to the calling shell or script.
        sys.exit(1)

    print("\nCatalog generation completed!")
    print(f"PDF file: {pdf_file}")
    print(f"Output directory: {generator.output_dir}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user