- Comprehensive scraper for Dollar General Pokemon TCG products
- Professional PDF catalog generator with UPC-A barcodes
- Robust anti-bot handling with requests + Selenium fallback
- Automatic image downloading and barcode generation
- Unix-friendly timestamped filenames
- Virtual environment support and dependency management
- Complete documentation and usage guides
139 lines
4.5 KiB
Python
Executable File
139 lines
4.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Pokemon Discovery - Scraper and Catalog Generator
|
|
Main script that runs both scraping and PDF generation
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import subprocess
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
def install_requirements():
    """Install the packages listed in ``requirements.txt`` using pip.

    pip is invoked through the interpreter running this script
    (``sys.executable -m pip``). ``requirements.txt`` is given as a relative
    path, so it is resolved against the current working directory.

    Returns:
        True if pip exited with status 0, False if it exited non-zero.
    """
    print("Installing Python requirements...")
    pip_command = [sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt']

    try:
        # check=True turns a non-zero pip exit status into CalledProcessError.
        subprocess.run(pip_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Failed to install requirements: {e}")
        return False

    print("Requirements installed successfully!")
    return True
|
|
|
|
def run_scraper():
    """Run ``scraper.py`` and return the JSON file produced by this run.

    The scraper is started with the current interpreter (``sys.executable``)
    from the current working directory, and its stdout/stderr are captured.
    On success the captured stdout is printed; on failure the captured
    stderr is printed.

    Only files matching ``pokemon_tcg_products_*.json`` that are new, or whose
    modification time or size changed during the run, count as output. A
    stale file left over from an earlier run is therefore never returned as
    if it were fresh data.

    Returns:
        The path (as ``str``) of the most recently modified JSON file written
        by this run, or ``None`` if the scraper exited non-zero, produced no
        JSON file, or an error occurred while running it.
    """
    print("=" * 60)
    print("STEP 1: SCRAPING POKEMON TCG PRODUCTS")
    print("=" * 60)

    pattern = 'pokemon_tcg_products_*.json'

    def _signature(path):
        # (mtime, size) identifies a file's content state; size is included so
        # a rewrite within one coarse mtime tick is still usually detected.
        info = path.stat()
        return (info.st_mtime_ns, info.st_size)

    try:
        # Snapshot pre-existing outputs so leftovers can be told apart from
        # files written by this run.
        before = {path: _signature(path) for path in Path('.').glob(pattern)}

        result = subprocess.run([sys.executable, 'scraper.py'],
                                capture_output=True, text=True)

        if result.returncode != 0:
            print("Scraping failed:")
            print(result.stderr)
            return None

        print("Scraping completed successfully!")
        print(result.stdout)

        # Keep only files that are new or were rewritten during this run.
        produced = []
        for path in Path('.').glob(pattern):
            signature = _signature(path)
            if before.get(path) != signature:
                produced.append((signature[0], path))

        if not produced:
            print("No JSON file was generated")
            return None

        # Rank by modification time: unlike ctime, it reflects when the
        # content was last written on every platform.
        newest = max(produced, key=lambda item: item[0])[1]
        return str(newest)

    except Exception as e:
        # Best-effort boundary: report the problem and signal failure to the
        # caller rather than crashing the whole pipeline.
        print(f"Error running scraper: {e}")
        return None
|
|
|
|
def run_pdf_generator(json_file):
    """Run ``pdf_generator.py`` on a scraped JSON file.

    The generator is started with the current interpreter
    (``sys.executable``) from the current working directory, and its
    stdout/stderr are captured. Captured stdout is printed on success,
    captured stderr on failure.

    Args:
        json_file: Path of the JSON data file, passed to the generator as its
            single command-line argument.

    Returns:
        True if the generator exited with status 0; False if it exited
        non-zero or an error occurred while running it.
    """
    banner = "=" * 60
    print(banner)
    print("STEP 2: GENERATING PDF CATALOG")
    print(banner)

    try:
        completed = subprocess.run(
            [sys.executable, 'pdf_generator.py', json_file],
            capture_output=True,
            text=True,
        )

        # Handle the failure case first so the success path reads straight.
        if completed.returncode != 0:
            print("PDF generation failed:")
            print(completed.stderr)
            return False

        print("PDF generation completed successfully!")
        print(completed.stdout)
        return True

    except Exception as e:
        print(f"Error running PDF generator: {e}")
        return False
|
|
|
|
def main():
    """Run the full pipeline: dependency checks, scraping, PDF generation.

    Steps, in order:
      1. Try to import the required third-party packages; if any import
         fails, call ``install_requirements()`` and exit with status 1 if
         that fails.
      2. Probe for ``pandoc`` and print install hints if it is missing
         (a warning only; the run continues).
      3. Run the scraper; exit with status 1 if it yields no JSON file.
      4. Run the PDF generator and print either a success summary listing
         the newest generated files, or a partial-success notice with the
         command to re-run the generator manually.
    """
    divider = "=" * 60

    print("Pokemon Discovery - Product Scraper & Catalog Generator")
    print(divider)
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"Started at: {started_at}")
    print()

    # Check if requirements are installed. The imports are only an
    # availability probe; the imported names are not used here.
    try:
        import requests
        import bs4
        import barcode
        import PIL
        print("✓ Required packages are available")
    except ImportError as e:
        print(f"✗ Missing required package: {e}")
        print("Installing requirements...")
        if not install_requirements():
            sys.exit(1)

    # Check if pandoc is available
    try:
        subprocess.run(['pandoc', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("⚠ Pandoc not found. PDF generation may fail.")
        print(" Install pandoc with: sudo apt install pandoc (Ubuntu/Debian)")
        print(" or: brew install pandoc (macOS)")
        print(" or: pacman -S pandoc (Arch Linux)")
    else:
        print("✓ Pandoc is available for PDF generation")

    print()

    # Run scraper
    json_file = run_scraper()
    if not json_file:
        print("Scraping failed. Exiting.")
        sys.exit(1)

    # Run PDF generator; deal with the failure outcome first.
    if not run_pdf_generator(json_file):
        print(divider)
        print("PARTIAL SUCCESS: Scraping completed, but PDF generation failed.")
        print(divider)
        print(f"JSON data: {json_file}")
        print("You can manually run the PDF generator with:")
        print(f" python3 pdf_generator.py {json_file}")
        return

    print(divider)
    print("SUCCESS! Both scraping and PDF generation completed.")
    print(divider)
    print(f"JSON data: {json_file}")
    print("PDF catalog: Check the catalog_output/ directory")
    print()
    print("Files generated:")

    # List the newest file (by os.path.getctime) for each output pattern.
    output_patterns = (
        'pokemon_tcg_products_*.json',
        'catalog_output/pokemon_tcg_catalog_*.pdf',
    )
    for file_pattern in output_patterns:
        newest = max(Path('.').glob(file_pattern), key=os.path.getctime, default=None)
        if newest is not None:
            print(f" - {newest}")
|
|
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()