Removed 20 files: old test scripts, debug tools, duplicate docs,
generated JSON, old PDF generator, launcher scripts.
Kept:
disco.py — main tool (scrape HAR + generate PDF)
scraper.py — reference site scraper (HTML + Selenium/Brave)
requirements.txt
*.har — browser capture with API data
Updated:
README.md — rewritten to reflect current tool and usage
.gitignore — simplified
scraper.py — module/class/method docstrings updated to clarify
this is a reference implementation, disco.py is primary
475 lines
18 KiB
Python
Executable File
475 lines
18 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Pokemon Discovery — Site Scraper (Reference)

HTML + Selenium/Brave scraper for Dollar General product pages.
Kept as a reference implementation. The primary tool is disco.py,
which reads product data from a HAR capture instead of scraping live.

This scraper can:
  - Fetch individual product pages and extract title, SKU, price, stock
  - Attempt to find product links from the category page (limited by
    dynamic JS loading — products are injected via API after page load)
  - Fall back to Brave browser via Selenium for JS-rendered content

Usage:
    python scraper.py    # Attempt full category scrape
    # Or import and use PokemonTCGScraper class directly for individual pages
"""

import json
import os
import re
import time
import unicodedata
from datetime import datetime
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Try selenium imports (fallback for dynamic content)
# Selenium and webdriver-manager are optional. If any of these imports fails,
# SELENIUM_AVAILABLE is set to False and a notice is printed at import time;
# get_page_with_selenium() checks this flag before using the browser.
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
    from webdriver_manager.chrome import ChromeDriverManager
    SELENIUM_AVAILABLE = True
except ImportError:
    SELENIUM_AVAILABLE = False
    print("Selenium not available, using requests only (install selenium for Brave browser support)")


class PokemonTCGScraper:
    """HTML/Selenium scraper for Dollar General Pokemon product pages.

    Can extract product details (title, SKU, price, stock) from individual
    product page URLs. Category-level scraping is limited because Dollar
    General loads products dynamically via a JS API call after page load.
    See disco.py for the HAR-based approach that bypasses this limitation.

    Scraped products accumulate in ``self.products`` as plain dicts with the
    keys ``url`` and, when found, ``title``, ``price``, ``sku``, ``stock``
    and ``image_url``.
    """

    # CSS selectors are tried in the listed order; the first usable hit wins.
    _LINK_SELECTORS = (
        'a[href*="/p/"]',
        '.product-item a',
        '.product-card a',
        '.product-link',
        '[data-testid*="product"] a',
    )
    _TITLE_SELECTORS = (
        'h1',
        '.product-title',
        '.product-name',
        '[data-testid="product-title"]',
        '.pdp-product-name',
    )
    _PRICE_SELECTORS = (
        '.price',
        '.product-price',
        '[data-testid="price"]',
        '.price-current',
        '.current-price',
    )
    _SKU_SELECTORS = (
        '[data-sku]',
        '.sku',
        '.product-sku',
        '.item-number',
    )
    _STOCK_SELECTORS = (
        '.stock',
        '.inventory',
        '.availability',
        '[data-testid="stock"]',
        '.in-stock',
        '.out-of-stock',
    )
    _IMAGE_SELECTORS = (
        '.product-image img',
        '.product-photo img',
        '.pdp-image img',
        '[data-testid="product-image"] img',
        'img[alt*="Pokemon"]',
        'img[alt*="TCG"]',
    )

    # Last-resort SKU pattern for raw page text such as "SKU: 41936301".
    _SKU_TEXT_RE = re.compile(
        r'(?:sku|item\s+number|product\s+id)[:>\s]+([a-zA-Z0-9]+)',
        re.IGNORECASE,
    )

    # A title must contain one keyword from each group to count as TCG.
    _POKEMON_KEYWORDS = ('pokemon', 'tcg', 'trading card', 'cards')
    _TCG_KEYWORDS = ('pack', 'tin', 'box', 'booster', 'collection')

    # Command-line flags passed to the headless browser.
    _BROWSER_FLAGS = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-web-security',
        '--disable-features=VizDisplayCompositor',
    )

    def __init__(self, brave_binary='/usr/bin/brave'):
        """Set up URLs, a shared requests session and browser-like headers.

        Args:
            brave_binary: Path of the Brave executable used by the Selenium
                fallback. Defaults to the previously hard-coded location.
        """
        self.base_url = "https://www.dollargeneral.com"
        self.search_url = "https://www.dollargeneral.com/c/toys/pokemon?q=&soldAtStore=true"
        self.api_base = "https://dggo.dollargeneral.com"
        self.api_endpoint = "https://dggo.dollargeneral.com/omni/api/v2/category/search/provider"
        self.brave_binary = brave_binary
        self.session = requests.Session()

        # Headers to appear more like a real browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.session.headers.update(self.headers)

        self.products = []

    def get_page_with_requests(self, url):
        """Fetch ``url`` with the shared requests session.

        Returns:
            The response body as text, or None if the request failed or the
            server answered with an HTTP error status.
        """
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Requests failed for {url}: {e}")
            return None

    def get_page_with_selenium(self, url):
        """Fetch ``url`` through headless Brave driven by Selenium.

        Used as a fallback for pages whose content is rendered by JavaScript.

        Returns:
            The rendered page source, or None when Selenium is not installed,
            no ChromeDriver could be obtained, or the browser session failed.
        """
        if not SELENIUM_AVAILABLE:
            return None

        options = Options()
        for flag in self._BROWSER_FLAGS:
            options.add_argument(flag)
        options.add_argument(f'--user-agent={self.headers["User-Agent"]}')

        # Use Brave browser
        options.binary_location = self.brave_binary

        driver = None
        try:
            print("Starting Brave browser with Selenium...")
            from selenium.webdriver.chrome.service import Service

            # Try to get compatible ChromeDriver
            try:
                # Try with webdriver manager (auto-detects version)
                service = Service(ChromeDriverManager().install())
            except Exception as e:
                print(f"ChromeDriver auto-install failed: {e}")
                print("This usually means ChromeDriver version doesn't match Brave version.")
                print("For best results, ensure ChromeDriver and Brave versions are compatible.")
                print("You can manually install a compatible ChromeDriver or use a different browser.")
                return None

            driver = webdriver.Chrome(service=service, options=options)

            print(f"Navigating to: {url}")
            driver.get(url)

            # Wait for content to load
            print("Waiting for page content to load...")
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Additional wait for dynamic content and JavaScript execution
            print("Waiting for dynamic content...")
            time.sleep(5)

            # Purely diagnostic: report how many product-like nodes rendered.
            print("Looking for product elements...")
            try:
                product_elements = driver.find_elements(
                    By.CSS_SELECTOR, 'a[href*="/p/"], .product-item, .product-card'
                )
                print(f"Found {len(product_elements)} potential product elements")
            except Exception:
                print("No specific product elements found, proceeding with full page content")

            html = driver.page_source
            print(f"Retrieved {len(html)} characters of HTML content")
            return html

        except Exception as e:
            print(f"Brave/Selenium failed for {url}: {e}")
            return None

        finally:
            # Always release the browser process, on success and on failure.
            # A failing quit() must not mask the result or the original error.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    print(f"Failed to shut down browser cleanly: {e}")

    def get_page_content(self, url):
        """Get page content, trying requests first, then Selenium.

        A requests response of 1000 characters or fewer is treated as not
        usable and triggers the Selenium fallback.

        Returns:
            Page HTML, or None if both strategies failed.
        """
        print(f"Fetching: {url}")

        # Try requests first
        content = self.get_page_with_requests(url)
        if content and len(content) > 1000:  # Basic content check
            return content

        # Fallback to selenium
        print("Falling back to Selenium...")
        return self.get_page_with_selenium(url)

    def extract_product_links(self, html):
        """Extract product page links from search results.

        Returns:
            Absolute product URLs (those containing ``/p/``) in first-seen
            order, without duplicates.
        """
        soup = BeautifulSoup(html, 'html.parser')
        links = []
        seen = set()  # O(1) duplicate check; `links` keeps discovery order

        for selector in self._LINK_SELECTORS:
            for element in soup.select(selector):
                href = element.get('href')
                if not href or '/p/' not in href:
                    continue
                full_url = urljoin(self.base_url, href)
                if full_url not in seen:
                    seen.add(full_url)
                    links.append(full_url)

        return links

    @staticmethod
    def _select_first(soup, selectors):
        """Return the element matched by the earliest selector, or None."""
        for selector in selectors:
            elem = soup.select_one(selector)
            if elem is not None:
                return elem
        return None

    def _sku_from_markup(self, soup):
        """Return a SKU from a ``data-sku`` attribute or SKU-like element text."""
        for selector in self._SKU_SELECTORS:
            elem = soup.select_one(selector)
            if elem is None:
                continue
            sku = elem.get('data-sku') or elem.get_text().strip()
            # Skip elements that only carry the bare label "SKU".
            if sku and sku.lower() != 'sku':
                return sku
        return None

    @staticmethod
    def _sku_from_json_ld(soup):
        """Return the first non-empty ``sku`` found in JSON-LD script blocks."""
        for script in soup.find_all('script', type='application/ld+json'):
            if not script.string:
                continue
            try:
                data = json.loads(script.string)
            except ValueError:
                # Malformed JSON-LD is ignored; other script blocks may be fine.
                continue
            candidates = data if isinstance(data, list) else [data]
            for item in candidates:
                if isinstance(item, dict) and item.get('sku'):
                    # Stop at the first hit so later blocks cannot overwrite it.
                    return item['sku']
        return None

    def _sku_from_text(self, html):
        """Return a SKU found in raw page text (e.g. "SKU: 41936301")."""
        match = self._SKU_TEXT_RE.search(html)
        return match.group(1) if match else None

    def extract_product_info(self, url, html):
        """Extract product information from a product page.

        Args:
            url: URL the page was fetched from; stored under ``'url'``.
            html: Page HTML.

        Returns:
            A dict that always has ``'url'`` and additionally ``'title'``,
            ``'price'``, ``'sku'``, ``'stock'`` and ``'image_url'`` for each
            field that could be located.
        """
        soup = BeautifulSoup(html, 'html.parser')
        product = {'url': url}

        # Extract title
        title_elem = self._select_first(soup, self._TITLE_SELECTORS)
        if title_elem is not None:
            product['title'] = title_elem.get_text().strip()

        # Extract price
        price_elem = self._select_first(soup, self._PRICE_SELECTORS)
        if price_elem is not None:
            product['price'] = price_elem.get_text().strip()

        # Extract SKU: markup first, then structured data, then raw text.
        sku = (
            self._sku_from_markup(soup)
            or self._sku_from_json_ld(soup)
            or self._sku_from_text(html)
        )
        if sku:
            product['sku'] = sku

        # Extract stock information
        stock_elem = self._select_first(soup, self._STOCK_SELECTORS)
        if stock_elem is not None:
            stock_text = stock_elem.get_text().strip().lower()
            if 'in stock' in stock_text:
                product['stock'] = 'In Stock'
            elif 'out of stock' in stock_text:
                product['stock'] = 'Out of Stock'
            else:
                product['stock'] = stock_text

        # Extract image URL (lazy-loaded images keep the URL in data-src)
        for selector in self._IMAGE_SELECTORS:
            img_elem = soup.select_one(selector)
            if img_elem is None:
                continue
            src = img_elem.get('src') or img_elem.get('data-src')
            if src:
                product['image_url'] = urljoin(self.base_url, src)
                break

        return product

    def is_pokemon_tcg_product(self, product):
        """Check if product is a Pokemon TCG card pack or tin.

        The title must contain one Pokemon/card keyword and one packaging
        keyword. Matching is case-insensitive and ignores accents, so the
        spelling "Pokémon" is recognised as well.

        Returns:
            True if the title matches, False otherwise (including when the
            product has no title).
        """
        title = product.get('title')
        if not title:
            return False

        # Decompose accented characters and drop the combining marks.
        decomposed = unicodedata.normalize('NFKD', title)
        folded = ''.join(
            ch for ch in decomposed if not unicodedata.combining(ch)
        ).lower()

        has_pokemon = any(keyword in folded for keyword in self._POKEMON_KEYWORDS)
        has_tcg = any(keyword in folded for keyword in self._TCG_KEYWORDS)

        return has_pokemon and has_tcg

    def try_api_scraping(self):
        """Stub for API-based scraping (requires auth token).

        Documents the discovered API endpoint and request format.
        Not functional — use disco.py with a HAR file instead.

        Returns:
            Always an empty list.
        """
        print("🔬 Attempting API-based scraping...")
        print(f" Endpoint: {self.api_endpoint}")
        print(" Method: POST with JSON payload")
        print(" Status: Requires authentication token (Bearer)")
        print()

        # Note: This is the exact API endpoint discovered via HAR analysis
        # It requires a Bearer token that's generated during a proper browser session

        # Sample request format (for documentation and future implementation):
        sample_request = {
            "endpoint": self.api_endpoint,
            "method": "POST",
            "headers": {
                "Authorization": "Bearer [TOKEN_REQUIRED]",
                "Content-Type": "application/json",
                "Accept": "application/json, text/plain, */*",
                "Referer": "https://www.dollargeneral.com/"
            },
            "payload": {
                "StoreNbr": 17506,  # Store location
                "SearchTerm": None,
                "PageSize": 24,
                "PageStartRecordIndex": 0,
                "Filters": {
                    "category": [],
                    "brand": [],
                    "soldAtStore": True,
                    "inStock": False,  # False includes out of stock items
                },
                "Id": 723960,  # Pokemon category ID
                "SearchType": 1
            }
        }

        print("📋 API Request Format Documented:")
        print(f" Store Number: {sample_request['payload']['StoreNbr']}")
        print(f" Category ID: {sample_request['payload']['Id']} (Pokemon)")
        print(f" Page Size: {sample_request['payload']['PageSize']}")
        print(" Authentication: Bearer token required")
        print()

        # TODO: Implement proper authentication flow
        # This would require either:
        # 1. Browser automation to get a valid session token
        # 2. Reverse engineering the authentication flow
        # 3. Using a headless browser with proper session management

        print("⚠️ API authentication not yet implemented")
        print(" Individual product extraction works perfectly as fallback")
        return []

    def scrape_products(self):
        """Main scraping method.

        Tries the API stub first, then scrapes the category page for product
        links and visits each one, keeping products that pass
        ``is_pokemon_tcg_product``.

        Returns:
            The list of matching product dicts (``self.products``), or an
            empty list when the category page or its links were unavailable.
        """
        print(f"Starting scrape of: {self.search_url}")

        # Try API-based scraping first (discovered via HAR analysis)
        api_products = self.try_api_scraping()
        if api_products:
            print(f"✅ API scraping successful! Found {len(api_products)} products")
            return api_products

        print("🔄 Falling back to HTML scraping...")
        print()

        # Get search results page
        html = self.get_page_content(self.search_url)
        if not html:
            print("Failed to get search results page")
            return []

        # Extract product links
        product_links = self.extract_product_links(html)
        print(f"Found {len(product_links)} potential product links")

        if not product_links:
            print("No product links found with requests. Trying Brave browser for dynamic content...")
            # Try Selenium with Brave as fallback
            selenium_html = self.get_page_with_selenium(self.search_url)
            # Only switch if the rendered page actually carries more content.
            if selenium_html and len(selenium_html) > len(html):
                print("Got enhanced content from Brave, re-extracting product links...")
                html = selenium_html
                product_links = self.extract_product_links(html)
                print(f"Found {len(product_links)} product links with Brave browser")

        if not product_links:
            print("No product links found even with Brave browser.")
            print("This could be due to:")
            print("1. No Pokemon TCG products currently in stock")
            print("2. Website structure changes")
            print("3. Enhanced anti-bot protection")
            print("4. Geographic restrictions")
            print("\nFirst 1000 chars of final page content:")
            print(html[:1000])
            return []

        # Scrape each product page
        for i, link in enumerate(product_links):
            # Be respectful to the server: pause between requests, including
            # after a failed fetch, but not before the very first one.
            if i:
                time.sleep(1)

            print(f"Scraping product {i+1}/{len(product_links)}: {link}")

            product_html = self.get_page_content(link)
            if not product_html:
                continue

            product = self.extract_product_info(link, product_html)

            # Filter for Pokemon TCG products
            if self.is_pokemon_tcg_product(product):
                print(f"Found Pokemon TCG product: {product.get('title', 'Unknown')}")
                self.products.append(product)
            else:
                print(f"Skipping non-TCG product: {product.get('title', 'Unknown')}")

        return self.products

    def save_to_json(self, filename=None):
        """Save scraped products to a JSON file.

        Args:
            filename: Output path. When omitted, a timestamped name of the
                form ``pokemon_tcg_products_YYYYMMDD_HHMMSS.json`` is used.

        Returns:
            The path that was written.
        """
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"pokemon_tcg_products_{timestamp}.json"

        # Explicit encoding keeps the output independent of the locale.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.products, f, indent=2)

        print(f"Saved {len(self.products)} products to {filename}")
        return filename


def main():
    """Run a full category scrape and report the outcome on stdout.

    Saves the results to a timestamped JSON file when any products were
    found; otherwise prints the likely reasons for an empty result.
    """
    tcg_scraper = PokemonTCGScraper()
    found = tcg_scraper.scrape_products()

    if not found:
        for line in (
            "\nNo products found. This could be due to:",
            "1. No Pokemon TCG products in stock",
            "2. Website structure changes",
            "3. Anti-bot protection",
        ):
            print(line)
        return

    output_path = tcg_scraper.save_to_json()
    print("\nScraping completed successfully!")
    print(f"Found {len(found)} Pokemon TCG products")
    print(f"Data saved to: {output_path}")


# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()