From e6dd999aeb91a980bd73b7bc0dfeaaa64d4dabf1 Mon Sep 17 00:00:00 2001 From: pi-bot-01 Date: Sat, 21 Mar 2026 14:41:17 -0700 Subject: [PATCH] Initial commit: Pokemon Discovery - TCG product scraper and PDF catalog generator - Comprehensive scraper for Dollar General Pokemon TCG products - Professional PDF catalog generator with UPC-A barcodes - Robust anti-bot handling with requests + Selenium fallback - Automatic image downloading and barcode generation - Unix-friendly timestamped filenames - Virtual environment support and dependency management - Complete documentation and usage guides --- .gitignore | 37 ++++++ README.md | 208 ++++++++++++++++++++++++++++++ USAGE.md | 115 +++++++++++++++++ pdf_generator.py | 278 +++++++++++++++++++++++++++++++++++++++ requirements.txt | 8 ++ run.sh | 31 +++++ run_scraper.py | 139 ++++++++++++++++++++ scraper.py | 329 +++++++++++++++++++++++++++++++++++++++++++++++ test_barcode.py | 55 ++++++++ 9 files changed, 1200 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 USAGE.md create mode 100755 pdf_generator.py create mode 100644 requirements.txt create mode 100755 run.sh create mode 100755 run_scraper.py create mode 100755 scraper.py create mode 100644 test_barcode.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9721fbc --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# Virtual environment +venv/ +env/ +.env + +# Python cache +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.so +.pytest_cache/ + +# Output files +*.json +catalog_output/ +test_output/ + +# Logs +*.log + +# OS files +.DS_Store +Thumbs.db +.directory + +# IDE files +.vscode/ +.idea/ +*.swp +*.swo + +# Temporary files +*.tmp +*.temp +.cache/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..42fa4d6 --- /dev/null +++ b/README.md @@ -0,0 +1,208 @@ +# Pokemon Discovery (pokemon-disco) + +A comprehensive tool for discovering Pokemon Trading Card Game products 
from Dollar General's website and generating a professional PDF catalog with product images, details, and UPC-A barcodes. + +## Features + +- **Web Scraping**: Automatically scrapes Pokemon TCG products from Dollar General +- **Robust Data Extraction**: Extracts product name, price, stock status, SKU, and images +- **Anti-Bot Handling**: Fetches pages with requests first and falls back to Selenium (headless Chrome) for dynamic content +- **Barcode Generation**: Creates UPC-A barcodes for each product SKU +- **PDF Catalog**: Professional PDF with images, details, and barcodes +- **Unix-Friendly Naming**: Timestamped filenames for easy sorting + +## Requirements + +### System Requirements +- Python 3.7+ +- pandoc plus a LaTeX engine (`xelatex` or `pdflatex`) for PDF generation +- Chrome/Chromium browser (for Selenium fallback) + +### Python Dependencies +All dependencies are listed in `requirements.txt` and are installed automatically by `run.sh` and `run_scraper.py` when they are missing: +- requests +- beautifulsoup4 +- selenium +- webdriver-manager +- python-barcode +- Pillow +- pandas +- lxml + +## Installation + +1. **Clone/Download** this directory to your system + +2. **Install pandoc** (required for PDF generation; pandoc also needs `xelatex` or `pdflatex` from a TeX distribution): + ```bash + # Ubuntu/Debian + sudo apt install pandoc + + # macOS + brew install pandoc + + # Arch Linux + sudo pacman -S pandoc + ``` + +3. **Install Python dependencies** (`run_scraper.py` does this automatically when packages are missing): + ```bash + cd pokemon-disco + pip3 install -r requirements.txt + ``` + +## Usage + +### Quick Start (Recommended) + +Run the complete pipeline with one command: + +```bash +cd pokemon-disco +python3 run_scraper.py +``` + +This will: +1. Check and install Python requirements +2. Scrape Pokemon TCG products from Dollar General +3. Generate a PDF catalog with images and barcodes +4. Create timestamped files for easy organization + +### Manual Usage + +If you prefer to run components separately: + +#### 1. Scrape Products +```bash +python3 scraper.py +``` +This creates a JSON file like `pokemon_tcg_products_20241221_143025.json` + +#### 2. 
Generate PDF Catalog +```bash +python3 pdf_generator.py pokemon_tcg_products_20241221_143025.json +``` + +## Output Files + +### Generated Files +- **JSON Data**: `pokemon_tcg_products_YYYYMMDD_HHMMSS.json` + - Raw scraped data in JSON format + - Contains all product information + +- **PDF Catalog**: `catalog_output/pokemon_tcg_catalog_YYYYMMDD_HHMMSS.pdf` + - Professional PDF catalog + - Includes product images, details, and UPC-A barcodes + +### Output Directory Structure +``` +pokemon-disco/ +├── pokemon_tcg_products_YYYYMMDD_HHMMSS.json +├── catalog_output/ +│ ├── pokemon_tcg_catalog_YYYYMMDD_HHMMSS.pdf +│ ├── pokemon_tcg_catalog_YYYYMMDD_HHMMSS.md +│ ├── images/ +│ │ ├── product_1_SKU123.jpg +│ │ ├── product_2_SKU456.jpg +│ │ └── placeholder.png +│ └── barcodes/ +│ ├── barcode_SKU123.png +│ ├── barcode_SKU456.png +│ └── ... +``` + +## PDF Catalog Features + +Each product in the PDF includes: +- **Product Image**: Downloaded from Dollar General or placeholder +- **Product Details Table**: + - Title + - Price + - Stock Status + - SKU (formatted as code) + - Product URL +- **UPC-A Barcode**: Generated from SKU for inventory management + +## Data Fields Extracted + +For each Pokemon TCG product: +- `title`: Product name +- `price`: Current price +- `stock`: Availability status +- `sku`: Product SKU/item number +- `image_url`: Direct link to product image +- `url`: Link to product page + +## Troubleshooting + +### Common Issues + +1. **No products found** + - Dollar General may have anti-bot protection + - The script will automatically retry with Selenium + - Website structure may have changed + +2. **PDF generation fails** + - Ensure pandoc is installed: `pandoc --version` + - Try alternative LaTeX engines if available + - Markdown file is still generated for manual conversion + +3. **Image download failures** + - Network connectivity issues + - Placeholder images will be used automatically + +4. 
**Chrome/Selenium issues** + - Ensure Chrome or Chromium is installed + - webdriver-manager will automatically download ChromeDriver + - If the Selenium packages cannot be imported, the scraper runs in requests-only mode + +### Debug Mode + +There is no separate debug flag; the scripts always print detailed progress to the console while they run. The output covers: +- Which products are found and filtered +- Network request status +- File generation progress + +## Technical Details + +### Scraping Strategy +1. **Primary Method**: Uses requests with browser-like headers +2. **Fallback Method**: Selenium with headless Chrome for dynamic content +3. **Product Filtering**: Only includes products matching Pokemon TCG keywords +4. **Rate Limiting**: 1-second delay between product page requests to be respectful to the server + +### Barcode Generation +- Reduces each SKU to 11 digits: non-digit characters are removed, short values are left-padded with zeros, and long values keep their last 11 digits +- Generates UPC-A barcodes; the 12th digit is the check digit +- High-quality PNG images suitable for printing + +### PDF Generation +- Uses pandoc with LaTeX (`xelatex`, falling back to `pdflatex`) for professional formatting +- Includes table of contents +- Optimized for printing and digital viewing +- Images scaled appropriately for page layout + +## Customization + +### Modifying Product Filters +Edit the `is_pokemon_tcg_product()` method in `scraper.py` to change which products are included. + +### Changing PDF Layout +Modify the markdown generation in `pdf_generator.py` or add custom pandoc templates. + +### Adding New Data Fields +Extend the `extract_product_info()` method in `scraper.py` to capture additional product information. + +## License + +This tool is for educational and personal use. Please respect Dollar General's terms of service and robots.txt when using this scraper. + +## Support + +If you encounter issues: +1. Check the console output for error messages +2. Ensure all system requirements are installed +3. Verify internet connectivity +4. Check if the Dollar General website structure has changed + +Generated files include timestamps for easy organization and version tracking. 
\ No newline at end of file diff --git a/USAGE.md b/USAGE.md new file mode 100644 index 0000000..52ee7a3 --- /dev/null +++ b/USAGE.md @@ -0,0 +1,115 @@ +# Quick Start Guide + +## Simple Usage (Recommended) + +1. **Make sure you're in the project directory:** + ```bash + cd pokemon-disco + ``` + +2. **Run the complete scraper and PDF generator:** + ```bash + ./run.sh + ``` + + This single command will: + - Set up the Python virtual environment + - Install all required packages + - Scrape Pokemon TCG products from Dollar General + - Generate a professional PDF catalog with barcodes + - Create timestamped files for easy organization + +## What You'll Get + +### Generated Files: +- **`pokemon_tcg_products_YYYYMMDD_HHMMSS.json`** - Raw data in JSON format +- **`catalog_output/pokemon_tcg_catalog_YYYYMMDD_HHMMSS.pdf`** - Professional PDF catalog + +### PDF Catalog Contents: +- Product images (downloaded automatically) +- Product details (title, price, stock, SKU, URL) +- UPC-A barcodes for each product (generated from SKU) +- Table of contents for easy navigation +- Professional formatting suitable for printing + +## Alternative Commands + +If you prefer more control: + +```bash +# Activate virtual environment first +source venv/bin/activate + +# Run only the scraper +python scraper.py + +# Run only the PDF generator (after scraping) +python pdf_generator.py pokemon_tcg_products_YYYYMMDD_HHMMSS.json + +# Run everything (installs requirements automatically) +python run_scraper.py +``` + +## Output Location + +All generated files will be in: +- JSON data: Current directory +- PDF catalog: `catalog_output/` directory +- Product images: `catalog_output/images/` +- Barcode images: `catalog_output/barcodes/` + +## Requirements + +- Python 3.7+ +- pandoc plus a LaTeX engine (`xelatex` or `pdflatex`) for PDF generation +- Internet connection (for scraping and image downloads) + +`run.sh` creates the `venv/` virtual environment and installs the Python dependencies into it automatically. + +## Troubleshooting + +If you encounter issues: + +1. 
**Permission denied:** Make sure the script is executable: + ```bash + chmod +x run.sh + ``` + +2. **Pandoc not found:** Install pandoc for your system: + ```bash + # Ubuntu/Debian + sudo apt install pandoc + + # Arch Linux + sudo pacman -S pandoc + + # macOS + brew install pandoc + ``` + +3. **No products found:** The website may have anti-bot protection or changed structure. The script includes fallback mechanisms. + +4. **PDF generation fails:** The markdown file will still be generated, which you can manually convert or view. + +## File Naming Convention + +All output files include Unix-friendly timestamps: +- Format: `YYYYMMDD_HHMMSS` (e.g., `20241221_143025`) +- This ensures chronological sorting with `ls` command +- No spaces or special characters for script-friendly handling + +## Example Output + +``` +pokemon-disco/ +├── pokemon_tcg_products_20241221_143025.json # Scraped data +├── catalog_output/ +│ ├── pokemon_tcg_catalog_20241221_143025.pdf # Final catalog +│ ├── pokemon_tcg_catalog_20241221_143025.md # Markdown source +│ ├── images/ +│ │ ├── product_1_SKU123456.jpg # Product images +│ │ └── product_2_SKU789012.jpg +│ └── barcodes/ +│ ├── barcode_SKU123456.png # UPC-A barcodes +│ └── barcode_SKU789012.png +``` \ No newline at end of file diff --git a/pdf_generator.py b/pdf_generator.py new file mode 100755 index 0000000..c79065c --- /dev/null +++ b/pdf_generator.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Pokemon Discovery - TCG Product Catalog PDF Generator +Generates PDF catalog with product images, details, and UPC-A barcodes +""" + +import json +import os +import sys +import requests +import subprocess +from datetime import datetime +from pathlib import Path +import barcode +from barcode.writer import ImageWriter +from PIL import Image, ImageDraw, ImageFont +import tempfile +import shutil + +class PokemonTCGCatalogGenerator: + def __init__(self, json_file): + self.json_file = json_file + self.output_dir = Path("catalog_output") + 
self.images_dir = self.output_dir / "images" + self.barcodes_dir = self.output_dir / "barcodes" + + # Create output directories + self.output_dir.mkdir(exist_ok=True) + self.images_dir.mkdir(exist_ok=True) + self.barcodes_dir.mkdir(exist_ok=True) + + # Load product data + with open(json_file, 'r') as f: + self.products = json.load(f) + + def download_image(self, url, filename): + """Download product image""" + if not url: + return None + + try: + response = requests.get(url, timeout=30) + response.raise_for_status() + + filepath = self.images_dir / filename + with open(filepath, 'wb') as f: + f.write(response.content) + + return filepath + except Exception as e: + print(f"Failed to download image {url}: {e}") + return None + + def generate_upc_barcode(self, sku): + """Generate UPC-A barcode from SKU""" + try: + # Convert SKU to 12-digit UPC-A format + # Remove non-digits and pad/truncate to 11 digits (12th is check digit) + digits_only = ''.join(filter(str.isdigit, str(sku))) + + if len(digits_only) < 11: + # Pad with zeros at the start + upc_base = digits_only.zfill(11) + else: + # Take the last 11 digits + upc_base = digits_only[-11:] + + # Generate UPC-A barcode + upc_generator = barcode.get_barcode_class('upca') + upc = upc_generator(upc_base, writer=ImageWriter()) + + # Save barcode image + barcode_filename = f"barcode_{sku.replace('/', '_').replace(' ', '_')}.png" + barcode_path = self.barcodes_dir / barcode_filename + + # Save with specific options for better appearance + upc.save(str(barcode_path).replace('.png', ''), options={ + 'module_width': 0.2, + 'module_height': 15.0, + 'quiet_zone': 6.5, + 'font_size': 10, + 'text_distance': 5.0, + 'background': 'white', + 'foreground': 'black' + }) + + return f"{barcode_path}.png" + + except Exception as e: + print(f"Failed to generate barcode for SKU {sku}: {e}") + return None + + def create_placeholder_image(self, width=300, height=200): + """Create a placeholder image when product image is not available""" + img 
= Image.new('RGB', (width, height), color='lightgray') + draw = ImageDraw.Draw(img) + + try: + # Try to use a system font + font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', 24) + except: + try: + font = ImageFont.truetype('arial.ttf', 24) + except: + font = ImageFont.load_default() + + text = "No Image\nAvailable" + + # Get text bounding box for centering + lines = text.split('\n') + y_offset = height // 2 - (len(lines) * 30) // 2 + + for line in lines: + bbox = draw.textbbox((0, 0), line, font=font) + text_width = bbox[2] - bbox[0] + x_offset = (width - text_width) // 2 + draw.text((x_offset, y_offset), line, fill='darkgray', font=font) + y_offset += 30 + + placeholder_path = self.images_dir / "placeholder.png" + img.save(placeholder_path) + return placeholder_path + + def generate_markdown(self): + """Generate markdown content for the catalog""" + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + markdown = f"""--- +title: "Pokemon TCG Product Catalog" +subtitle: "Dollar General - Generated {timestamp}" +author: "Automated Scraper" +date: "{timestamp}" +geometry: margin=1in +fontsize: 11pt +documentclass: article +--- + +# Pokemon TCG Product Catalog + +Generated on: {timestamp} +Source: Dollar General +Total Products: {len(self.products)} + +--- + +""" + + for i, product in enumerate(self.products, 1): + print(f"Processing product {i}/{len(self.products)}: {product.get('title', 'Unknown')}") + + # Download product image + image_path = None + if product.get('image_url'): + filename = f"product_{i}_{product.get('sku', 'unknown').replace('/', '_').replace(' ', '_')}.jpg" + image_path = self.download_image(product.get('image_url'), filename) + + if not image_path: + # Use placeholder + image_path = self.create_placeholder_image() + + # Generate barcode + barcode_path = None + if product.get('sku'): + barcode_path = self.generate_upc_barcode(product.get('sku')) + + # Add product section to markdown + markdown += f"## {i}. 
{product.get('title', 'Unknown Product')}\n\n" + + # Product image + if image_path: + rel_image_path = os.path.relpath(image_path, self.output_dir) + markdown += f"![Product Image]({rel_image_path}){{width=300px}}\n\n" + + # Product details in a table + markdown += "| Field | Value |\n" + markdown += "|-------|-------|\n" + markdown += f"| **Title** | {product.get('title', 'N/A')} |\n" + markdown += f"| **Price** | {product.get('price', 'N/A')} |\n" + markdown += f"| **Stock** | {product.get('stock', 'N/A')} |\n" + markdown += f"| **SKU** | `{product.get('sku', 'N/A')}` |\n" + markdown += f"| **URL** | {product.get('url', 'N/A')} |\n" + markdown += "\n" + + # Barcode + if barcode_path: + rel_barcode_path = os.path.relpath(barcode_path, self.output_dir) + markdown += f"**UPC-A Barcode:**\n\n" + markdown += f"![UPC-A Barcode]({rel_barcode_path}){{width=200px}}\n\n" + + markdown += "---\n\n" + + return markdown + + def generate_pdf(self): + """Generate PDF catalog using pandoc""" + print("Generating markdown content...") + markdown_content = self.generate_markdown() + + # Save markdown file + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + markdown_file = self.output_dir / f"pokemon_tcg_catalog_{timestamp}.md" + + with open(markdown_file, 'w', encoding='utf-8') as f: + f.write(markdown_content) + + print(f"Markdown saved to: {markdown_file}") + + # Generate PDF using pandoc + pdf_file = self.output_dir / f"pokemon_tcg_catalog_{timestamp}.pdf" + + print("Converting to PDF using pandoc...") + + try: + subprocess.run([ + 'pandoc', + str(markdown_file), + '-o', str(pdf_file), + '--pdf-engine=xelatex', + '-V', 'colorlinks=true', + '-V', 'linkcolor=blue', + '-V', 'filecolor=magenta', + '-V', 'urlcolor=cyan', + '--toc', + '--toc-depth=2' + ], check=True) + + print(f"PDF generated successfully: {pdf_file}") + return pdf_file + + except subprocess.CalledProcessError as e: + print(f"Pandoc conversion failed: {e}") + print("Trying with pdflatex instead...") + + try: + 
subprocess.run([ + 'pandoc', + str(markdown_file), + '-o', str(pdf_file), + '--pdf-engine=pdflatex', + '--toc' + ], check=True) + + print(f"PDF generated successfully: {pdf_file}") + return pdf_file + + except subprocess.CalledProcessError as e2: + print(f"PDF generation failed with both engines: {e2}") + print(f"Markdown file available at: {markdown_file}") + return None + + except FileNotFoundError: + print("Error: pandoc not found. Please install pandoc to generate PDF.") + print(f"Markdown file available at: {markdown_file}") + return None + +def main(): + if len(sys.argv) != 2: + print("Usage: python3 pdf_generator.py ") + print("Example: python3 pdf_generator.py pokemon_tcg_products_20241221_143025.json") + sys.exit(1) + + json_file = sys.argv[1] + + if not os.path.exists(json_file): + print(f"Error: JSON file '{json_file}' not found") + sys.exit(1) + + generator = PokemonTCGCatalogGenerator(json_file) + pdf_file = generator.generate_pdf() + + if pdf_file: + print(f"\nCatalog generation completed!") + print(f"PDF file: {pdf_file}") + print(f"Output directory: {generator.output_dir}") + else: + print(f"\nPDF generation failed, but markdown file is available in: {generator.output_dir}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0192fb4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +requests +beautifulsoup4 +selenium +webdriver-manager +python-barcode +Pillow +pandas +lxml \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..d8abd5a --- /dev/null +++ b/run.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Pokemon Discovery - Scraper & Catalog Generator Launcher +# Automatically activates virtual environment and runs the scraper + +set -e + +cd "$(dirname "$0")" + +echo "Pokemon Discovery - Product Scraper & Catalog Generator" +echo "================================================" + +# Check if virtual environment exists 
+if [[ ! -d "venv" ]]; then + echo "Creating virtual environment..." + python3 -m venv venv +fi + +# Activate virtual environment +source venv/bin/activate + +# Check if requirements are installed +if ! python -c "import requests, bs4, barcode, selenium" 2>/dev/null; then + echo "Installing Python requirements..." + pip install -r requirements.txt +fi + +# Run the main script +python run_scraper.py + +echo "" +echo "Script completed. Check the output above for results." \ No newline at end of file diff --git a/run_scraper.py b/run_scraper.py new file mode 100755 index 0000000..6269523 --- /dev/null +++ b/run_scraper.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +""" +Pokemon Discovery - Scraper and Catalog Generator +Main script that runs both scraping and PDF generation +""" + +import os +import sys +import subprocess +from datetime import datetime +from pathlib import Path + +def install_requirements(): + """Install Python requirements""" + print("Installing Python requirements...") + try: + subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'], + check=True) + print("Requirements installed successfully!") + except subprocess.CalledProcessError as e: + print(f"Failed to install requirements: {e}") + return False + return True + +def run_scraper(): + """Run the scraper to collect product data""" + print("=" * 60) + print("STEP 1: SCRAPING POKEMON TCG PRODUCTS") + print("=" * 60) + + try: + result = subprocess.run([sys.executable, 'scraper.py'], + capture_output=True, text=True) + + if result.returncode == 0: + print("Scraping completed successfully!") + print(result.stdout) + + # Find the generated JSON file + json_files = list(Path('.').glob('pokemon_tcg_products_*.json')) + if json_files: + latest_file = max(json_files, key=os.path.getctime) + return str(latest_file) + else: + print("No JSON file was generated") + return None + else: + print("Scraping failed:") + print(result.stderr) + return None + + except Exception as e: + 
print(f"Error running scraper: {e}") + return None + +def run_pdf_generator(json_file): + """Run the PDF generator with the scraped data""" + print("=" * 60) + print("STEP 2: GENERATING PDF CATALOG") + print("=" * 60) + + try: + result = subprocess.run([sys.executable, 'pdf_generator.py', json_file], + capture_output=True, text=True) + + if result.returncode == 0: + print("PDF generation completed successfully!") + print(result.stdout) + return True + else: + print("PDF generation failed:") + print(result.stderr) + return False + + except Exception as e: + print(f"Error running PDF generator: {e}") + return False + +def main(): + print("Pokemon Discovery - Product Scraper & Catalog Generator") + print("=" * 60) + print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print() + + # Check if requirements are installed + try: + import requests, bs4, barcode, PIL + print("✓ Required packages are available") + except ImportError as e: + print(f"✗ Missing required package: {e}") + print("Installing requirements...") + if not install_requirements(): + sys.exit(1) + + # Check if pandoc is available + try: + subprocess.run(['pandoc', '--version'], + capture_output=True, check=True) + print("✓ Pandoc is available for PDF generation") + except (subprocess.CalledProcessError, FileNotFoundError): + print("⚠ Pandoc not found. PDF generation may fail.") + print(" Install pandoc with: sudo apt install pandoc (Ubuntu/Debian)") + print(" or: brew install pandoc (macOS)") + print(" or: pacman -S pandoc (Arch Linux)") + + print() + + # Run scraper + json_file = run_scraper() + if not json_file: + print("Scraping failed. Exiting.") + sys.exit(1) + + # Run PDF generator + if run_pdf_generator(json_file): + print("=" * 60) + print("SUCCESS! 
Both scraping and PDF generation completed.") + print("=" * 60) + print(f"JSON data: {json_file}") + print("PDF catalog: Check the catalog_output/ directory") + print() + print("Files generated:") + + # List generated files + for file_pattern in ['pokemon_tcg_products_*.json', 'catalog_output/pokemon_tcg_catalog_*.pdf']: + files = list(Path('.').glob(file_pattern)) + if files: + latest = max(files, key=os.path.getctime) + print(f" - {latest}") + else: + print("=" * 60) + print("PARTIAL SUCCESS: Scraping completed, but PDF generation failed.") + print("=" * 60) + print(f"JSON data: {json_file}") + print("You can manually run the PDF generator with:") + print(f" python3 pdf_generator.py {json_file}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scraper.py b/scraper.py new file mode 100755 index 0000000..422b72d --- /dev/null +++ b/scraper.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3 +""" +Pokemon Discovery - TCG Product Scraper for Dollar General +Scrapes product information and saves to JSON for PDF generation +""" + +import json +import os +import time +import requests +from datetime import datetime +from urllib.parse import urljoin, urlparse +import pandas as pd +from bs4 import BeautifulSoup + +# Try selenium imports (fallback for dynamic content) +try: + from selenium import webdriver + from selenium.webdriver.chrome.options import Options + from selenium.webdriver.common.by import By + from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.support import expected_conditions as EC + from selenium.common.exceptions import TimeoutException + from webdriver_manager.chrome import ChromeDriverManager + SELENIUM_AVAILABLE = True +except ImportError: + SELENIUM_AVAILABLE = False + print("Selenium not available, using requests only") + +class PokemonTCGScraper: + def __init__(self): + self.base_url = "https://www.dollargeneral.com" + self.search_url = 
"https://www.dollargeneral.com/c/toys/pokemon?q=&soldAtStore=true" + self.session = requests.Session() + + # Headers to appear more like a real browser + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + } + self.session.headers.update(self.headers) + + self.products = [] + + def get_page_with_requests(self, url): + """Try to get page content using requests""" + try: + response = self.session.get(url, timeout=30) + response.raise_for_status() + return response.text + except requests.RequestException as e: + print(f"Requests failed for {url}: {e}") + return None + + def get_page_with_selenium(self, url): + """Fallback to selenium for dynamic content""" + if not SELENIUM_AVAILABLE: + return None + + options = Options() + options.add_argument('--headless') + options.add_argument('--no-sandbox') + options.add_argument('--disable-dev-shm-usage') + options.add_argument('--disable-gpu') + options.add_argument(f'--user-agent={self.headers["User-Agent"]}') + + try: + driver = webdriver.Chrome(ChromeDriverManager().install(), options=options) + driver.get(url) + + # Wait for content to load + WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.TAG_NAME, "body")) + ) + + # Additional wait for dynamic content + time.sleep(3) + + html = driver.page_source + driver.quit() + return html + + except Exception as e: + print(f"Selenium failed for {url}: {e}") + if 'driver' in locals(): + driver.quit() + return None + + def get_page_content(self, url): + """Get page content, trying requests first, then selenium""" + print(f"Fetching: {url}") + + # Try requests first + content = self.get_page_with_requests(url) + if content and 
len(content) > 1000: # Basic content check + return content + + # Fallback to selenium + print("Falling back to Selenium...") + return self.get_page_with_selenium(url) + + def extract_product_links(self, html): + """Extract product page links from search results""" + soup = BeautifulSoup(html, 'html.parser') + links = [] + + # Common selectors for product links + selectors = [ + 'a[href*="/p/"]', + '.product-item a', + '.product-card a', + '.product-link', + '[data-testid*="product"] a' + ] + + for selector in selectors: + elements = soup.select(selector) + for element in elements: + href = element.get('href') + if href and '/p/' in href: + full_url = urljoin(self.base_url, href) + if full_url not in links: + links.append(full_url) + + return links + + def extract_product_info(self, url, html): + """Extract product information from product page""" + soup = BeautifulSoup(html, 'html.parser') + product = {'url': url} + + # Extract title + title_selectors = [ + 'h1', + '.product-title', + '.product-name', + '[data-testid="product-title"]', + '.pdp-product-name' + ] + + for selector in title_selectors: + title_elem = soup.select_one(selector) + if title_elem: + product['title'] = title_elem.get_text().strip() + break + + # Extract price + price_selectors = [ + '.price', + '.product-price', + '[data-testid="price"]', + '.price-current', + '.current-price' + ] + + for selector in price_selectors: + price_elem = soup.select_one(selector) + if price_elem: + price_text = price_elem.get_text().strip() + product['price'] = price_text + break + + # Extract SKU + sku_selectors = [ + '[data-sku]', + '.sku', + '.product-sku', + '*[text()*="SKU"]', + 'script[type="application/ld+json"]' + ] + + # Try data attributes first + for selector in sku_selectors[:-1]: + elem = soup.select_one(selector) + if elem: + sku = elem.get('data-sku') or elem.get_text().strip() + if sku and sku.lower() != 'sku': + product['sku'] = sku + break + + # Try JSON-LD structured data + if 'sku' not in 
product: + scripts = soup.find_all('script', type='application/ld+json') + for script in scripts: + try: + data = json.loads(script.string) + if isinstance(data, dict) and 'sku' in data: + product['sku'] = data['sku'] + break + elif isinstance(data, list): + for item in data: + if isinstance(item, dict) and 'sku' in item: + product['sku'] = item['sku'] + break + except: + continue + + # Extract stock information + stock_selectors = [ + '.stock', + '.inventory', + '.availability', + '[data-testid="stock"]', + '.in-stock', + '.out-of-stock' + ] + + for selector in stock_selectors: + stock_elem = soup.select_one(selector) + if stock_elem: + stock_text = stock_elem.get_text().strip().lower() + if 'in stock' in stock_text: + product['stock'] = 'In Stock' + elif 'out of stock' in stock_text: + product['stock'] = 'Out of Stock' + else: + product['stock'] = stock_text + break + + # Extract image URL + img_selectors = [ + '.product-image img', + '.product-photo img', + '.pdp-image img', + '[data-testid="product-image"] img', + 'img[alt*="Pokemon"]', + 'img[alt*="TCG"]' + ] + + for selector in img_selectors: + img_elem = soup.select_one(selector) + if img_elem: + src = img_elem.get('src') or img_elem.get('data-src') + if src: + product['image_url'] = urljoin(self.base_url, src) + break + + return product + + def is_pokemon_tcg_product(self, product): + """Check if product is a Pokemon TCG card pack or tin""" + if not product.get('title'): + return False + + title = product['title'].lower() + pokemon_keywords = ['pokemon', 'tcg', 'trading card', 'cards'] + tcg_keywords = ['pack', 'tin', 'box', 'booster', 'collection'] + + has_pokemon = any(keyword in title for keyword in pokemon_keywords) + has_tcg = any(keyword in title for keyword in tcg_keywords) + + return has_pokemon and has_tcg + + def scrape_products(self): + """Main scraping method""" + print(f"Starting scrape of: {self.search_url}") + + # Get search results page + html = self.get_page_content(self.search_url) + if 
not html: + print("Failed to get search results page") + return [] + + # Extract product links + product_links = self.extract_product_links(html) + print(f"Found {len(product_links)} potential product links") + + if not product_links: + print("No product links found. The page structure may have changed.") + print("First 1000 chars of page:") + print(html[:1000]) + return [] + + # Scrape each product page + for i, link in enumerate(product_links): + print(f"Scraping product {i+1}/{len(product_links)}: {link}") + + product_html = self.get_page_content(link) + if not product_html: + continue + + product = self.extract_product_info(link, product_html) + + # Filter for Pokemon TCG products + if self.is_pokemon_tcg_product(product): + print(f"Found Pokemon TCG product: {product.get('title', 'Unknown')}") + self.products.append(product) + else: + print(f"Skipping non-TCG product: {product.get('title', 'Unknown')}") + + # Be respectful to the server + time.sleep(1) + + return self.products + + def save_to_json(self, filename=None): + """Save scraped products to JSON file""" + if not filename: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"pokemon_tcg_products_{timestamp}.json" + + with open(filename, 'w') as f: + json.dump(self.products, f, indent=2) + + print(f"Saved {len(self.products)} products to {filename}") + return filename + +def main(): + scraper = PokemonTCGScraper() + products = scraper.scrape_products() + + if products: + filename = scraper.save_to_json() + print(f"\nScraping completed successfully!") + print(f"Found {len(products)} Pokemon TCG products") + print(f"Data saved to: {filename}") + else: + print("\nNo products found. This could be due to:") + print("1. No Pokemon TCG products in stock") + print("2. Website structure changes") + print("3. 
Anti-bot protection") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test_barcode.py b/test_barcode.py new file mode 100644 index 0000000..885a8ca --- /dev/null +++ b/test_barcode.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +Test script to verify barcode generation functionality +""" + +import sys +import os +from pathlib import Path + +# Add current directory to path if running in venv +sys.path.insert(0, '.') + +try: + import barcode + from barcode.writer import ImageWriter + print("✓ Barcode generation libraries are available") + + # Test barcode generation + test_sku = "123456789012" + + upc_generator = barcode.get_barcode_class('upca') + test_barcode = upc_generator("12345678901", writer=ImageWriter()) + + # Create test output directory + test_dir = Path("test_output") + test_dir.mkdir(exist_ok=True) + + # Generate test barcode + barcode_path = test_dir / "test_barcode" + test_barcode.save(str(barcode_path), options={ + 'module_width': 0.2, + 'module_height': 15.0, + 'quiet_zone': 6.5, + 'font_size': 10, + 'text_distance': 5.0, + 'background': 'white', + 'foreground': 'black' + }) + + final_path = f"{barcode_path}.png" + if os.path.exists(final_path): + print(f"✓ Test barcode generated successfully: {final_path}") + print(f" File size: {os.path.getsize(final_path)} bytes") + else: + print(f"✗ Failed to generate test barcode") + sys.exit(1) + +except ImportError as e: + print(f"✗ Missing barcode library: {e}") + sys.exit(1) +except Exception as e: + print(f"✗ Barcode generation failed: {e}") + sys.exit(1) + +print("✓ All barcode generation tests passed!") \ No newline at end of file