Initial commit.
This commit is contained in:
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
env_api/
|
||||||
|
pkm_data/
|
||||||
|
|
||||||
|
# too big
|
||||||
|
mtg_cards_20251019_095943.csv
|
||||||
|
mtg_cards_20251019_101118.xlsx
|
||||||
|
mtg-default-cards-20251018212333.json
|
||||||
293
mtg_card_fetcher copy.py
Normal file
293
mtg_card_fetcher copy.py
Normal file
@@ -0,0 +1,293 @@
|
|||||||
|
import json
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import openpyxl
|
||||||
|
from openpyxl import Workbook
|
||||||
|
|
||||||
|
def flatten_card_data(card):
    """
    Flatten a single card's data structure into a dictionary suitable for CSV.

    Scalar fields are copied through unchanged, list-valued fields become
    comma-separated strings, and nested dicts (image URIs, legalities,
    prices, related/purchase URIs) are expanded into prefixed flat keys.
    """
    flat_card = {}

    # Scalar fields copied through verbatim when present, including the
    # per-platform identifiers (MTGO / Arena / TCGPlayer / Cardmarket).
    simple_fields = (
        'id', 'oracle_id', 'name', 'lang', 'released_at', 'uri', 'scryfall_uri',
        'layout', 'highres_image', 'image_status', 'mana_cost', 'cmc', 'type_line',
        'oracle_text', 'power', 'toughness', 'loyalty', 'life_modifier', 'hand_modifier',
        'reserved', 'foil', 'nonfoil', 'oversized', 'promo', 'reprint', 'variation',
        'set_id', 'set', 'set_name', 'set_type', 'set_uri', 'set_search_uri',
        'scryfall_set_uri', 'rulings_uri', 'prints_search_uri', 'collector_number',
        'digital', 'rarity', 'card_back_id', 'artist', 'border_color', 'frame',
        'full_art', 'textless', 'booster', 'story_spotlight', 'edhrec_rank',
        'penny_rank', 'flavor_text', 'watermark', 'printed_name', 'printed_type_line',
        'printed_text', 'security_stamp', 'preview_text', 'content_warning',
        'flavor_name', 'game_changer',
        'mtgo_id', 'arena_id', 'tcgplayer_id', 'cardmarket_id',
    )
    for field in simple_fields:
        if field in card:
            flat_card[field] = card[field]

    # List-valued fields become comma-separated strings; a non-list value
    # under one of these keys is passed through unchanged.
    array_fields = (
        'multiverse_ids', 'colors', 'color_identity', 'keywords', 'produced_mana',
        'games', 'finishes', 'artist_ids', 'all_parts', 'card_faces', 'related_cards',
    )
    for field in array_fields:
        if field in card:
            value = card[field]
            if isinstance(value, list):
                flat_card[field] = ', '.join(str(item) for item in value)
            else:
                flat_card[field] = value

    # Nested dicts expand into prefixed flat columns,
    # e.g. image_uris['small'] -> image_uri_small.
    nested_prefixes = {
        'image_uris': 'image_uri_',
        'legalities': 'legal_',
        'prices': 'price_',
        'related_uris': 'uri_',
        'purchase_uris': 'purchase_',
    }
    for field, prefix in nested_prefixes.items():
        nested = card.get(field)
        if isinstance(nested, dict):
            for key, value in nested.items():
                flat_card[prefix + key] = value

    # Selected preview metadata, renamed to flat column names.
    preview = card.get('preview')
    if isinstance(preview, dict):
        preview_map = {
            'source': 'preview_source',
            'source_uri': 'preview_source_uri',
            'previewed_at': 'preview_date',
        }
        for src_key, dest_key in preview_map.items():
            if src_key in preview:
                flat_card[dest_key] = preview[src_key]

    return flat_card
|
||||||
|
|
||||||
|
def detect_json_format(input_file):
    """
    Detect whether *input_file* is a JSON array or NDJSON file.

    Returns 'array' when the content starts with '[' and 'ndjson'
    otherwise; a single object or anything unrecognised is treated as
    newline-delimited JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Peek at the start of the file; 100 chars is normally plenty to
        # skip leading whitespace before the first structural character.
        first_chars = f.read(100).strip()
        if first_chars.startswith('['):
            return 'array'
        if first_chars.startswith('{'):
            return 'ndjson'
        # Ambiguous prefix (e.g. a very long run of leading whitespace):
        # fall back to inspecting the complete first line. The previous
        # `first_line.startswith('[') or first_line == '['` was redundant —
        # startswith already covers the exact-match case.
        f.seek(0)
        first_line = f.readline().strip()
        return 'array' if first_line.startswith('[') else 'ndjson'
|
||||||
|
|
||||||
|
def process_scryfall_array(input_file, output_file):
    """
    Convert a Scryfall JSON array file to CSV.

    Loads the whole array into memory, makes one pass to collect the union
    of flattened field names for the header, then a second pass to write
    the rows.
    """
    print(f"Processing {input_file} (JSON array format)...")
    print("Loading and parsing JSON data (this may take a minute for large files)...")

    # NOTE: the previous revision did an unconditional `import ijson` here,
    # which crashed with ImportError when ijson was absent even though the
    # stdlib json path below would have worked, and ijson was never actually
    # used. Streaming is now only *suggested* in the MemoryError handler.
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        print(f"Loaded {len(data):,} cards")

        # First pass: collect every column any card produces so the CSV
        # header covers the union of all fields.
        print("Analyzing card structure...")
        all_fields = set()
        for i, card in enumerate(data):
            all_fields.update(flatten_card_data(card).keys())
            if (i + 1) % 10000 == 0:
                print(f" Analyzed {i + 1:,} cards...")

        print(f"Found {len(all_fields)} unique fields")
        fieldnames = sorted(all_fields)

        # Second pass: write the rows.
        print("Writing CSV file...")
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for i, card in enumerate(data):
                writer.writerow(flatten_card_data(card))
                if (i + 1) % 10000 == 0:
                    print(f" Written {i + 1:,} cards...")

        print(f"\nComplete! Written {len(data):,} cards to {output_file}")

    except MemoryError:
        print("File too large for memory. Please install ijson: pip install ijson")
        print("Then run the script again.")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        print("The file might be corrupted or not in valid JSON format.")
        sys.exit(1)
|
||||||
|
|
||||||
|
def process_scryfall_ndjson(input_file, output_file):
    """
    Convert a Scryfall NDJSON (newline-delimited JSON) file to CSV.

    Two passes over the file: the first collects the union of flattened
    field names for the header, the second writes the rows. Malformed
    lines are counted and skipped rather than aborting the run.
    """
    print(f"Processing {input_file} (NDJSON format)...")

    # First pass: collect all possible fields.
    print("First pass: Analyzing card structure...")
    all_fields = set()
    cards_processed = 0
    errors = 0

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                card = json.loads(line)
            except json.JSONDecodeError:
                errors += 1
                continue

            all_fields.update(flatten_card_data(card).keys())
            cards_processed += 1
            if cards_processed % 10000 == 0:
                print(f" Analyzed {cards_processed:,} cards...")

    print(f"Found {len(all_fields)} unique fields across {cards_processed:,} cards")
    if errors > 0:
        print(f" (Skipped {errors} malformed lines)")

    # Sort fields for a stable, reproducible column order.
    fieldnames = sorted(all_fields)

    # Second pass: write CSV.
    print("\nSecond pass: Writing CSV...")
    cards_written = 0
    errors = 0

    with open(input_file, 'r', encoding='utf-8') as f:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for line in f:
                line = line.strip()
                if not line:
                    continue

                try:
                    card = json.loads(line)
                    writer.writerow(flatten_card_data(card))
                    cards_written += 1

                    if cards_written % 10000 == 0:
                        print(f" Written {cards_written:,} cards...")

                # The original `except (json.JSONDecodeError, Exception)`
                # was redundant — Exception already covers JSONDecodeError.
                except Exception:
                    errors += 1
                    continue

    print(f"\nComplete! Written {cards_written:,} cards to {output_file}")
    if errors > 0:
        print(f"Skipped {errors} problematic lines")
|
||||||
|
|
||||||
|
def main():
    """
    Command-line entry point: convert a Scryfall JSON dump to CSV.

    Usage: python3 mtg_card_fetcher.py <input_json_file>
    The output file name is derived from the current timestamp.
    """
    # The previous revision carried a dead triple-quoted string holding the
    # obsolete two-argument usage text (a no-op expression); it is removed.
    if len(sys.argv) != 2:
        print("Usage: python3 mtg_card_fetcher.py <input_json_file>")
        print("Example: python3 mtg_card_fetcher.py mtg-default-cards-20251018212333.json")
        sys.exit(1)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    input_file = sys.argv[1]
    # Output name is generated rather than taken from argv so repeated runs
    # never clobber each other.
    output_file = f'mtg_cards_{timestamp}.csv'  # sys.argv[2]

    # Validate input file exists
    if not Path(input_file).exists():
        print(f"Error: Input file '{input_file}' not found!")
        sys.exit(1)

    # Report file size up front so the user can anticipate a long run.
    file_size = Path(input_file).stat().st_size / (1024 * 1024)  # Size in MB
    print(f"Input file size: {file_size:.1f} MB")

    # Detect format
    print("Detecting file format...")
    format_type = detect_json_format(input_file)
    print(f"Detected format: {format_type.upper()}")

    # Confirm before overwriting an existing output file.
    if Path(output_file).exists():
        response = input(f"Warning: Output file '{output_file}' already exists. Overwrite? (y/n): ")
        if response.lower() != 'y':
            print("Cancelled.")
            sys.exit(0)

    try:
        if format_type == 'array':
            process_scryfall_array(input_file, output_file)
        else:
            process_scryfall_ndjson(input_file, output_file)
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"Error processing file: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
351
mtg_card_fetcher.py
Normal file
351
mtg_card_fetcher.py
Normal file
@@ -0,0 +1,351 @@
|
|||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from openpyxl import Workbook
|
||||||
|
from openpyxl.utils import get_column_letter
|
||||||
|
from openpyxl.styles import Font
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
def flatten_card_data(card):
    """
    Flatten a single card's data structure into a dictionary suitable for Excel.

    Scalar fields are copied through unchanged, list-valued fields become
    comma-separated strings, and nested dicts (image URIs, legalities,
    prices, related/purchase URIs) are expanded into prefixed flat keys.
    """
    flat_card = {}

    # Scalar fields copied through verbatim when present, including the
    # per-platform identifiers (MTGO / Arena / TCGPlayer / Cardmarket).
    simple_fields = (
        'id', 'oracle_id', 'name', 'lang', 'released_at', 'uri', 'scryfall_uri',
        'layout', 'highres_image', 'image_status', 'mana_cost', 'cmc', 'type_line',
        'oracle_text', 'power', 'toughness', 'loyalty', 'life_modifier', 'hand_modifier',
        'reserved', 'foil', 'nonfoil', 'oversized', 'promo', 'reprint', 'variation',
        'set_id', 'set', 'set_name', 'set_type', 'set_uri', 'set_search_uri',
        'scryfall_set_uri', 'rulings_uri', 'prints_search_uri', 'collector_number',
        'digital', 'rarity', 'card_back_id', 'artist', 'border_color', 'frame',
        'full_art', 'textless', 'booster', 'story_spotlight', 'edhrec_rank',
        'penny_rank', 'flavor_text', 'watermark', 'printed_name', 'printed_type_line',
        'printed_text', 'security_stamp', 'preview_text', 'content_warning',
        'flavor_name', 'game_changer',
        'mtgo_id', 'arena_id', 'tcgplayer_id', 'cardmarket_id',
    )
    for field in simple_fields:
        if field in card:
            flat_card[field] = card[field]

    # List-valued fields become comma-separated strings; a non-list value
    # under one of these keys is passed through unchanged.
    array_fields = (
        'multiverse_ids', 'colors', 'color_identity', 'keywords', 'produced_mana',
        'games', 'finishes', 'artist_ids', 'all_parts', 'card_faces', 'related_cards',
    )
    for field in array_fields:
        if field in card:
            value = card[field]
            if isinstance(value, list):
                flat_card[field] = ', '.join(str(item) for item in value)
            else:
                flat_card[field] = value

    # Nested dicts expand into prefixed flat columns,
    # e.g. image_uris['small'] -> image_uri_small.
    nested_prefixes = {
        'image_uris': 'image_uri_',
        'legalities': 'legal_',
        'prices': 'price_',
        'related_uris': 'uri_',
        'purchase_uris': 'purchase_',
    }
    for field, prefix in nested_prefixes.items():
        nested = card.get(field)
        if isinstance(nested, dict):
            for key, value in nested.items():
                flat_card[prefix + key] = value

    # Selected preview metadata, renamed to flat column names.
    preview = card.get('preview')
    if isinstance(preview, dict):
        preview_map = {
            'source': 'preview_source',
            'source_uri': 'preview_source_uri',
            'previewed_at': 'preview_date',
        }
        for src_key, dest_key in preview_map.items():
            if src_key in preview:
                flat_card[dest_key] = preview[src_key]

    return flat_card
|
||||||
|
|
||||||
|
def detect_json_format(input_file):
    """
    Detect whether *input_file* is a JSON array or NDJSON file.

    Returns 'array' when the content starts with '[' and 'ndjson'
    otherwise; a single object or anything unrecognised is treated as
    newline-delimited JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Peek at the start of the file; 100 chars is normally plenty to
        # skip leading whitespace before the first structural character.
        first_chars = f.read(100).strip()
        if first_chars.startswith('['):
            return 'array'
        if first_chars.startswith('{'):
            return 'ndjson'
        # Ambiguous prefix (e.g. a very long run of leading whitespace):
        # fall back to inspecting the complete first line. The previous
        # `first_line.startswith('[') or first_line == '['` was redundant —
        # startswith already covers the exact-match case.
        f.seek(0)
        first_line = f.readline().strip()
        return 'array' if first_line.startswith('[') else 'ndjson'
|
||||||
|
|
||||||
|
def write_to_excel(cards_data, fieldnames, output_file):
    """
    Write flattened card dicts to an .xlsx workbook with basic formatting.

    Adds bold headers, truncates over-long cells, sizes columns from a
    sample of rows, freezes the header row and enables auto-filters.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Magic Cards"

    # Write headers with bold formatting
    header_font = Font(bold=True)
    for col, field in enumerate(fieldnames, 1):
        cell = ws.cell(row=1, column=col, value=field)
        cell.font = header_font

    # Write data
    print("Writing to Excel file...")
    for row_num, card_data in enumerate(cards_data, 2):
        for col, field in enumerate(fieldnames, 1):
            value = card_data.get(field, '')
            # Excel has a hard cell character limit of 32,767.
            if isinstance(value, str) and len(value) > 32767:
                value = value[:32764] + "..."
            ws.cell(row=row_num, column=col, value=value)

        if row_num % 10000 == 0:
            print(f" Written {row_num - 1:,} cards...")

    # Auto-adjust column widths (limited to prevent excessive widths)
    print("Adjusting column widths...")
    for column in ws.columns:
        max_length = 0
        column_letter = get_column_letter(column[0].column)

        for cell in column[:100]:  # Check first 100 rows for performance
            try:
                if cell.value:
                    max_length = max(max_length, len(str(cell.value)))
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed while measuring cells.
            except Exception:
                pass

        adjusted_width = min(max_length + 2, 50)  # Cap at 50 characters
        ws.column_dimensions[column_letter].width = adjusted_width

    # Freeze the header row so it stays visible while scrolling.
    ws.freeze_panes = 'A2'

    # Enable filters across the used range.
    ws.auto_filter.ref = ws.dimensions

    print("Saving Excel file...")
    wb.save(output_file)
    print(f"Saved {len(cards_data):,} cards to {output_file}")
|
||||||
|
|
||||||
|
def process_scryfall_array(input_file, output_file):
    """
    Convert a Scryfall JSON array file to an Excel workbook.

    Loads the whole array into memory, flattens every card while collecting
    the union of field names, then delegates the writing to write_to_excel.
    """
    print(f"Processing {input_file} (JSON array format)...")
    print("Loading and parsing JSON data (this may take a minute for large files)...")

    try:
        with open(input_file, 'r', encoding='utf-8') as handle:
            data = json.load(handle)

        print(f"Loaded {len(data):,} cards")

        # Flatten each card once, accumulating the header union as we go.
        print("Analyzing card structure...")
        all_fields = set()
        processed_cards = []

        for index, raw_card in enumerate(data, 1):
            flattened = flatten_card_data(raw_card)
            all_fields.update(flattened.keys())
            processed_cards.append(flattened)
            if index % 10000 == 0:
                print(f" Analyzed {index:,} cards...")

        print(f"Found {len(all_fields)} unique fields")
        fieldnames = sorted(all_fields)

        # Write to Excel
        write_to_excel(processed_cards, fieldnames, output_file)
        print(f"\nComplete! You can now open {output_file} in LibreOffice Calc or Excel")

    except MemoryError:
        print("File too large for memory. The file might be too big to process at once.")
        print("Consider using a streaming JSON parser or processing in chunks.")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        print("The file might be corrupted or not in valid JSON format.")
        sys.exit(1)
|
||||||
|
|
||||||
|
def process_scryfall_ndjson(input_file, output_file):
    """
    Convert a Scryfall NDJSON file to an Excel workbook.

    First pass collects the union of flattened field names; second pass
    reads every card into memory and hands the list to write_to_excel.
    Malformed lines are counted and skipped rather than aborting the run.
    """
    print(f"Processing {input_file} (NDJSON format)...")

    # First pass: collect all possible fields.
    print("First pass: Analyzing card structure...")
    all_fields = set()
    cards_processed = 0
    errors = 0

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                card = json.loads(line)
            except json.JSONDecodeError:
                errors += 1
                continue

            all_fields.update(flatten_card_data(card).keys())
            cards_processed += 1
            if cards_processed % 10000 == 0:
                print(f" Analyzed {cards_processed:,} cards...")

    print(f"Found {len(all_fields)} unique fields across {cards_processed:,} cards")
    if errors > 0:
        print(f" (Skipped {errors} malformed lines)")

    fieldnames = sorted(all_fields)

    # Second pass: materialise every card — Excel writing needs the full
    # data set in memory anyway.
    print("\nSecond pass: Reading card data...")
    processed_cards = []
    cards_read = 0
    errors = 0

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                card = json.loads(line)
                processed_cards.append(flatten_card_data(card))
                cards_read += 1

                if cards_read % 10000 == 0:
                    print(f" Read {cards_read:,} cards...")

            # The original `except (json.JSONDecodeError, Exception)` was
            # redundant — Exception already covers JSONDecodeError.
            except Exception:
                errors += 1
                continue

    print(f"Read {cards_read:,} cards successfully")
    if errors > 0:
        print(f" (Skipped {errors} problematic lines)")

    # Write to Excel
    write_to_excel(processed_cards, fieldnames, output_file)
    print(f"\nComplete! You can now open {output_file} in LibreOffice Calc or Excel")
|
||||||
|
|
||||||
|
def main():
    """
    Command-line entry point: convert a Scryfall JSON dump to an .xlsx file.

    Usage: python3 mtg_card_fetcher.py <input_json_file>
    The output file name is derived from the current timestamp.
    """
    # The previous revision carried a dead triple-quoted string holding the
    # obsolete two-argument usage text (a no-op expression); it is removed.
    if len(sys.argv) != 2:
        print("Usage: python3 mtg_card_fetcher.py <input_json_file>")
        print("Example: python3 mtg_card_fetcher.py mtg-default-cards-20251018212333.json")
        sys.exit(1)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    input_file = sys.argv[1]
    # Output name is generated rather than taken from argv so repeated runs
    # never clobber each other.
    output_file = f'mtg_cards_{timestamp}.xlsx'  # sys.argv[2]

    # Validate input file exists
    if not Path(input_file).exists():
        print(f"Error: Input file '{input_file}' not found!")
        sys.exit(1)

    # Check output file has xlsx extension
    if not output_file.endswith('.xlsx'):
        print("Warning: Output file should have .xlsx extension")
        response = input("Continue anyway? (y/n): ")
        if response.lower() != 'y':
            sys.exit(0)

    # Report file size up front so the user can anticipate a long run.
    file_size = Path(input_file).stat().st_size / (1024 * 1024)  # Size in MB
    print(f"Input file size: {file_size:.1f} MB")

    # Detect format
    print("Detecting file format...")
    format_type = detect_json_format(input_file)
    print(f"Detected format: {format_type.upper()}")

    # Confirm before overwriting an existing output file.
    if Path(output_file).exists():
        response = input(f"Warning: Output file '{output_file}' already exists. Overwrite? (y/n): ")
        if response.lower() != 'y':
            print("Cancelled.")
            sys.exit(0)

    # NOTE: the previous revision re-imported openpyxl here inside a
    # try/except ImportError, but the module-level `from openpyxl import
    # Workbook` already fails at startup when openpyxl is missing, so the
    # check could never trigger and has been removed.

    try:
        if format_type == 'array':
            process_scryfall_array(input_file, output_file)
        else:
            process_scryfall_ndjson(input_file, output_file)
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"Error processing file: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
277
pkm_card_fetcher.py
Normal file
277
pkm_card_fetcher.py
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
from datetime import datetime
|
||||||
|
import glob
|
||||||
|
|
||||||
|
def flatten_list(items):
    """Render *items* as a comma-separated string ('' for None)."""
    if items is None:
        return ""
    if not isinstance(items, list):
        return str(items)
    return ", ".join(map(str, items))
|
||||||
|
|
||||||
|
def extract_ability_info(abilities):
    """Return (names, texts, types) for *abilities*, each comma-separated."""
    if not abilities:
        return "", "", ""

    name_col = [ability.get('name', '') for ability in abilities]
    text_col = [ability.get('text', '') for ability in abilities]
    type_col = [ability.get('type', '') for ability in abilities]

    return flatten_list(name_col), flatten_list(text_col), flatten_list(type_col)
|
||||||
|
|
||||||
|
def extract_attack_info(attacks):
    """
    Return (names, costs, damages, texts, converted costs) for *attacks*.

    Each element is comma-separated across attacks, except costs, which are
    joined with ' | ' because one attack's cost is itself a comma-separated
    list of energy symbols.
    """
    if not attacks:
        return "", "", "", "", ""

    # One tuple per attack, then transpose into parallel columns.
    rows = [
        (
            attack.get('name', ''),
            flatten_list(attack.get('cost', [])),
            attack.get('damage', ''),
            attack.get('text', ''),
            str(attack.get('convertedEnergyCost', '')),
        )
        for attack in attacks
    ]
    names, costs, damages, texts, converted_costs = (list(col) for col in zip(*rows))

    return (flatten_list(names),
            " | ".join(costs),  # Use | to separate different attacks' costs
            flatten_list(damages),
            flatten_list(texts),
            flatten_list(converted_costs))
|
||||||
|
|
||||||
|
def extract_weakness_resistance(items):
    """Return (types, values) for a weaknesses/resistances list, comma-separated."""
    if not items:
        return "", ""

    type_col = [entry.get('type', '') for entry in items]
    value_col = [entry.get('value', '') for entry in items]

    return flatten_list(type_col), flatten_list(value_col)
|
||||||
|
|
||||||
|
def extract_prices(price_dict, prefix):
    """
    Flatten a nested price mapping into '{prefix}_{type}[_{metric}]' keys.

    A dict value (e.g. {'holofoil': {'low': 1.0}}) expands one level deeper;
    a scalar value is attached directly under its price type.
    """
    flattened = {}
    if not price_dict:
        return flattened

    for price_type, prices in price_dict.items():
        if isinstance(prices, dict):
            for metric, value in prices.items():
                flattened[f"{prefix}_{price_type}_{metric}"] = value
        else:
            # Scalar price attached directly to the price type.
            flattened[f"{prefix}_{price_type}"] = prices

    return flattened
|
||||||
|
|
||||||
|
def process_card(card):
    """
    Flatten a single Pokémon TCG card dict into one spreadsheet row.

    Scalar attributes are copied via .get() with '' defaults; list
    attributes go through flatten_list; nested structures (abilities,
    attacks, weaknesses, resistances, set info, legalities, images and
    price blocks) are expanded into prefixed columns by the extract_*
    helpers.
    """
    # Core scalar/list attributes — missing keys become empty strings so
    # every row has the same baseline columns.
    row = {
        'id': card.get('id', ''),
        'name': card.get('name', ''),
        'supertype': card.get('supertype', ''),
        'subtypes': flatten_list(card.get('subtypes', [])),
        'level': card.get('level', ''),
        'hp': card.get('hp', ''),
        'types': flatten_list(card.get('types', [])),
        'evolvesFrom': card.get('evolvesFrom', ''),
        'evolvesTo': flatten_list(card.get('evolvesTo', [])),
        'rules': flatten_list(card.get('rules', [])),
        'number': card.get('number', ''),
        'artist': card.get('artist', ''),
        'rarity': card.get('rarity', ''),
        'flavorText': card.get('flavorText', ''),
        'nationalPokedexNumbers': flatten_list(card.get('nationalPokedexNumbers', [])),
        'regulationMark': card.get('regulationMark', ''),
        'retreatCost': flatten_list(card.get('retreatCost', [])),
        'convertedRetreatCost': card.get('convertedRetreatCost', ''),
    }

    # Ancient Trait — both columns are always emitted so rows stay uniform.
    ancient_trait = card.get('ancientTrait', {})
    if ancient_trait:
        row['ancientTrait_name'] = ancient_trait.get('name', '')
        row['ancientTrait_text'] = ancient_trait.get('text', '')
    else:
        row['ancientTrait_name'] = ''
        row['ancientTrait_text'] = ''

    # Abilities -> three parallel comma-separated columns.
    abilities = card.get('abilities', [])
    row['ability_names'], row['ability_texts'], row['ability_types'] = extract_ability_info(abilities)

    # Attacks -> five parallel columns (attack_costs is '|'-separated per attack).
    attacks = card.get('attacks', [])
    row['attack_names'], row['attack_costs'], row['attack_damages'], row['attack_texts'], row['attack_convertedCosts'] = extract_attack_info(attacks)

    # Weaknesses -> type/value column pair.
    weaknesses = card.get('weaknesses', [])
    row['weakness_types'], row['weakness_values'] = extract_weakness_resistance(weaknesses)

    # Resistances -> type/value column pair.
    resistances = card.get('resistances', [])
    row['resistance_types'], row['resistance_values'] = extract_weakness_resistance(resistances)

    # Set information — NOTE(review): unlike ancientTrait above, these
    # columns are only added when set info is present, so cards without it
    # lack set_* keys entirely (pandas will fill them with NaN) — confirm
    # whether that asymmetry is intended.
    set_info = card.get('set', {})
    if set_info:
        row['set_id'] = set_info.get('id', '')
        row['set_name'] = set_info.get('name', '')
        row['set_series'] = set_info.get('series', '')
        row['set_printedTotal'] = set_info.get('printedTotal', '')
        row['set_total'] = set_info.get('total', '')
        row['set_ptcgoCode'] = set_info.get('ptcgoCode', '')
        row['set_releaseDate'] = set_info.get('releaseDate', '')

    # Legalities — only the three known format keys are extracted.
    legalities = card.get('legalities', {})
    row['legal_standard'] = legalities.get('standard', '')
    row['legal_expanded'] = legalities.get('expanded', '')
    row['legal_unlimited'] = legalities.get('unlimited', '')

    # Images — small/large URL columns.
    images = card.get('images', {})
    row['image_small'] = images.get('small', '')
    row['image_large'] = images.get('large', '')

    # TCGPlayer prices — url/updatedAt plus one column per price metric,
    # produced by extract_prices with the 'tcgplayer' prefix.
    tcgplayer = card.get('tcgplayer', {})
    if tcgplayer:
        row['tcgplayer_url'] = tcgplayer.get('url', '')
        row['tcgplayer_updatedAt'] = tcgplayer.get('updatedAt', '')

        # Extract all price types
        prices = tcgplayer.get('prices', {})
        tcg_prices = extract_prices(prices, 'tcgplayer')
        row.update(tcg_prices)

    # Cardmarket prices — same pattern with the 'cardmarket' prefix.
    cardmarket = card.get('cardmarket', {})
    if cardmarket:
        row['cardmarket_url'] = cardmarket.get('url', '')
        row['cardmarket_updatedAt'] = cardmarket.get('updatedAt', '')

        # Extract all price types
        prices = cardmarket.get('prices', {})
        cm_prices = extract_prices(prices, 'cardmarket')
        row.update(cm_prices)

    return row
|
||||||
|
|
||||||
|
def main():
    """Convert Pokémon card JSON dumps under ``./pkm_data/cards/en`` into a
    formatted Excel workbook named ``pkm_cards_<timestamp>.xlsx``.

    Each input file may be newline-delimited JSON (one card object, or a list
    of card objects, per line) or a single JSON document (a list of cards or
    one card object). Files that cannot be read or parsed are skipped with a
    warning instead of aborting the run.
    """
    data_dir = './pkm_data/cards/en'

    json_files = glob.glob(os.path.join(data_dir, '*.json'))
    if not json_files:
        print(f"No JSON files found in {data_dir}")
        return

    all_cards = []
    for json_file in json_files:
        print(f"Processing {os.path.basename(json_file)}...")
        try:
            all_cards.extend(_load_cards_from_file(json_file))
        except Exception as e:
            # Best-effort: a broken file should not abort the whole export.
            print(f"Error processing {json_file}: {str(e)}")
            continue

    # Column order follows first-seen card keys; an empty card list simply
    # produces an empty sheet.
    df = pd.DataFrame(all_cards)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f'pkm_cards_{timestamp}.xlsx'

    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Pokemon Cards', index=False)

        workbook = writer.book
        worksheet = writer.sheets['Pokemon Cards']

        # Re-write the header row with bold/bordered formatting on top of the
        # plain headers that to_excel already emitted.
        header_format = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'fg_color': '#D7E4BD',
            'border': 1
        })
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header_format)

        # Auto-size each column to its longest cell (plus padding), capped at
        # 50 characters so huge text fields don't blow up the layout.
        for i, col in enumerate(df.columns):
            max_len = df[col].astype(str).str.len().max()
            max_len = max(max_len, len(col)) + 2
            max_len = min(max_len, 50)
            worksheet.set_column(i, i, max_len)

    print(f"\nSuccessfully created {output_file}")
    print(f"Total cards processed: {len(df)}")
    print(f"\nColumns in the spreadsheet:")
    for col in df.columns:
        print(f" - {col}")


def _load_cards_from_file(json_file):
    """Parse one JSON file and return a list of processed card rows.

    Tries newline-delimited JSON first; if any line fails to decode, the
    whole file is re-parsed as a single JSON document instead. Cards
    collected from earlier lines are discarded before the re-parse — the
    original code kept them AND re-parsed the full file, emitting duplicates
    whenever a decode failure occurred mid-file. A top-level dict document
    (a single card) is now also handled instead of being silently dropped.
    """
    cards = []
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                card = json.loads(line)
            except json.JSONDecodeError:
                # Not line-delimited after all: restart and parse the file as
                # one document, dropping any partial per-line results.
                f.seek(0)
                data = json.load(f)
                if isinstance(data, list):
                    return [process_card(card) for card in data]
                if isinstance(data, dict):
                    return [process_card(data)]
                return []
            if isinstance(card, dict):
                cards.append(process_card(card))
            elif isinstance(card, list):
                cards.extend(process_card(c) for c in card)
    return cards


if __name__ == "__main__":
    main()
|
||||||
BIN
pkm_cards_20251019_101651.xlsx
Normal file
BIN
pkm_cards_20251019_101651.xlsx
Normal file
Binary file not shown.
1
product_scraping/.~lock.TCG Sole Trader Copy.xlsx#
Normal file
1
product_scraping/.~lock.TCG Sole Trader Copy.xlsx#
Normal file
@@ -0,0 +1 @@
|
|||||||
|
,teddy,lord-T-1024,09.01.2026 15:38,file:///home/teddy/.config/libreoffice/4;
|
||||||
BIN
product_scraping/TCG Sole Trader Copy - dead.xlsx
Normal file
BIN
product_scraping/TCG Sole Trader Copy - dead.xlsx
Normal file
Binary file not shown.
BIN
product_scraping/TCG Sole Trader Copy.xlsx
Normal file
BIN
product_scraping/TCG Sole Trader Copy.xlsx
Normal file
Binary file not shown.
593
product_scraping/cost_fetcher_base.py
Normal file
593
product_scraping/cost_fetcher_base.py
Normal file
@@ -0,0 +1,593 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from openpyxl import load_workbook, Workbook
|
||||||
|
from openpyxl.worksheet.worksheet import Worksheet
|
||||||
|
import requests
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.common.exceptions import StaleElementReferenceException
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
# import undetected_chromedriver as uc
|
||||||
|
from undetected_chromedriver import Chrome
|
||||||
|
|
||||||
|
|
||||||
|
class Cost_Fetcher_Base:
|
||||||
|
PRODUCT_WORKSHEET_NAME = 'Product'
|
||||||
|
SOURCING_WORKSHEET_NAME = 'Sourcing'
|
||||||
|
WORKBOOK_NAME = 'TCG Sole Trader Copy.xlsx'
|
||||||
|
|
||||||
|
driver: Chrome # webdriver.Chrome
|
||||||
|
eur_to_gbp_rate: float
|
||||||
|
index_column_active_sourcing: int
|
||||||
|
index_column_is_booster_product: int
|
||||||
|
index_column_is_booster_box_product: int
|
||||||
|
index_column_is_precon_product: int
|
||||||
|
index_column_link_sourcing: int
|
||||||
|
index_column_name_sourcing: int
|
||||||
|
index_column_product_id_product: int
|
||||||
|
index_column_product_id_sourcing: int
|
||||||
|
index_column_unit_cost_sourcing: int
|
||||||
|
index_row_header_product: int
|
||||||
|
index_row_header_sourcing: int
|
||||||
|
product_sheet: Worksheet
|
||||||
|
sourcing_sheet: Worksheet
|
||||||
|
wait: WebDriverWait
|
||||||
|
workbook: Workbook
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def parse_cost(cost_text):
|
||||||
|
if not cost_text:
|
||||||
|
return None
|
||||||
|
cost_clean = re.sub(r'[^\d,]', '', cost_text)
|
||||||
|
try:
|
||||||
|
return float(cost_clean) / 100
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
@classmethod
|
||||||
|
def parse_cost_from_pennies(cls, cost_text):
|
||||||
|
if not cost_text:
|
||||||
|
return None
|
||||||
|
cost_clean = cls.parse_cost(cost_text = cost_text)
|
||||||
|
if cost_clean is not None:
|
||||||
|
cost_clean = cost_clean / 100
|
||||||
|
return cost_clean
|
||||||
|
@classmethod
|
||||||
|
def parse_cost_chaoscards(cls, cost_text):
|
||||||
|
return cls.parse_cost(cost_text = cost_text)
|
||||||
|
@classmethod
|
||||||
|
def parse_cost_cardmarket(cls, cost_text):
|
||||||
|
# return cls.parse_cost(cost_text = cost_text)
|
||||||
|
"""Convert '141,30 €' format to float in EUR"""
|
||||||
|
if not cost_text:
|
||||||
|
return None
|
||||||
|
cost_clean = re.sub(r'[^\d,]', '', cost_text)
|
||||||
|
cost_clean = cost_clean.replace(',', '.')
|
||||||
|
try:
|
||||||
|
return float(cost_clean)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
@classmethod
|
||||||
|
def parse_cost_gameslore(cls, cost_text):
|
||||||
|
return cls.parse_cost(cost_text = cost_text)
|
||||||
|
@classmethod
|
||||||
|
def parse_cost_magicmadhouse(cls, cost_text):
|
||||||
|
return cls.parse_cost(cost_text = cost_text)
|
||||||
|
|
||||||
|
def get_eur_to_gbp_rate(self):
|
||||||
|
try:
|
||||||
|
response = requests.get('https://api.exchangerate-api.com/v4/latest/EUR', timeout=10)
|
||||||
|
data = response.json()
|
||||||
|
self.eur_to_gbp_rate = data['rates']['GBP']
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error fetching exchange rate: {e}")
|
||||||
|
print("Using fallback rate: 0.85")
|
||||||
|
self.eur_to_gbp_rate = 0.85
|
||||||
|
|
||||||
|
def setup_driver(self):
|
||||||
|
print("Starting driver")
|
||||||
|
"""
|
||||||
|
chrome_options = Options()
|
||||||
|
# Remove headless mode to see the browser
|
||||||
|
# chrome_options.add_argument('--headless')
|
||||||
|
chrome_options.add_argument('--no-sandbox')
|
||||||
|
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||||
|
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
|
||||||
|
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
|
||||||
|
chrome_options.add_argument('--window-size=1920,1080')
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
self.driver = Chrome(version_main=133) # webdriver.Chrome(options=chrome_options)
|
||||||
|
# return driver
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error setting up Chrome driver: {e}")
|
||||||
|
print("Make sure Chrome and chromedriver are installed")
|
||||||
|
# return None
|
||||||
|
self.wait = WebDriverWait(self.driver, 15)
|
||||||
|
|
||||||
|
def scrape_cost_and_active_selenium(self, url, page_load_element_selector, cost_selector, active_selector, invalid_active_statuses):
|
||||||
|
try:
|
||||||
|
print(f" Loading page...")
|
||||||
|
# time.sleep(random.uniform(6, 10))
|
||||||
|
try:
|
||||||
|
self.driver.get(url)
|
||||||
|
element = self.wait.until(
|
||||||
|
EC.presence_of_element_located((By.CSS_SELECTOR, page_load_element_selector))
|
||||||
|
)
|
||||||
|
element = self.wait.until(
|
||||||
|
EC.element_to_be_clickable((By.CSS_SELECTOR, page_load_element_selector))
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
self.driver.get(url)
|
||||||
|
element = self.wait.until(
|
||||||
|
EC.presence_of_element_located((By.CSS_SELECTOR, page_load_element_selector))
|
||||||
|
)
|
||||||
|
element = self.wait.until(
|
||||||
|
EC.element_to_be_clickable((By.CSS_SELECTOR, page_load_element_selector))
|
||||||
|
)
|
||||||
|
|
||||||
|
max_attempts = 10
|
||||||
|
for attempt in range(max_attempts):
|
||||||
|
try:
|
||||||
|
element = None
|
||||||
|
element = self.driver.find_element(By.CSS_SELECTOR, page_load_element_selector)
|
||||||
|
text = element.text
|
||||||
|
print(f"✓ Element loaded successfully on attempt {attempt + 1}")
|
||||||
|
# return True
|
||||||
|
break
|
||||||
|
except StaleElementReferenceException:
|
||||||
|
print(f"Stale element on attempt {attempt + 1}, retrying...")
|
||||||
|
if attempt < max_attempts - 1:
|
||||||
|
time.sleep(1)
|
||||||
|
else:
|
||||||
|
raise ValueError("StaleElementReferenceException")
|
||||||
|
|
||||||
|
print(f" Page title: {self.driver.title}")
|
||||||
|
|
||||||
|
cost = None
|
||||||
|
element = None
|
||||||
|
counter = 0
|
||||||
|
while cost is None:
|
||||||
|
counter += 1
|
||||||
|
try:
|
||||||
|
element = self.driver.find_element(By.CSS_SELECTOR, cost_selector)
|
||||||
|
text = element.text
|
||||||
|
print(f" Text: '{text}'")
|
||||||
|
cost = text
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Selector failed: {e}")
|
||||||
|
cost = None
|
||||||
|
time.sleep(random.uniform(2, 4))
|
||||||
|
if counter > 10:
|
||||||
|
print("10 cost selector fails")
|
||||||
|
break
|
||||||
|
|
||||||
|
active = None
|
||||||
|
if active_selector is None: # or invalid_active_statuses is None or invalid_active_statuses == []:
|
||||||
|
active = (cost is not None)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
elements = None
|
||||||
|
elements = self.driver.find_elements(By.CSS_SELECTOR, active_selector)
|
||||||
|
if len(elements) == 0:
|
||||||
|
active = True
|
||||||
|
else:
|
||||||
|
text = elements[0].text
|
||||||
|
print(f" Text: '{text}'")
|
||||||
|
active = (invalid_active_statuses is None or text not in invalid_active_statuses)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Selector failed: {e}")
|
||||||
|
|
||||||
|
if cost is None or active is None:
|
||||||
|
print(f" ✗ No cost found")
|
||||||
|
print(f"Cost: {cost}, Active: {active}")
|
||||||
|
input("Press Enter to continue to next URL...")
|
||||||
|
return cost, active
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Error: {e}")
|
||||||
|
input("Press Enter to continue to next URL...")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
def scrape_cost_and_active_selenium_cardmarket(self, url):
|
||||||
|
page_load_element_selector = "body > main.container > div.page-title-container"
|
||||||
|
cost_selector = '#table > div:nth-child(1) > div.table-body > .row.article-row.g-0:nth-child(1) > div.col-offer > div.price-container > div > div:nth-child(1) > span:nth-child(1)'
|
||||||
|
cost_text, active = self.scrape_cost_and_active_selenium(
|
||||||
|
url = url
|
||||||
|
, page_load_element_selector = page_load_element_selector
|
||||||
|
, cost_selector = cost_selector
|
||||||
|
, active_selector = None
|
||||||
|
, invalid_active_statuses = []
|
||||||
|
)
|
||||||
|
cost = Cost_Fetcher_Base.parse_cost_cardmarket(cost_text)
|
||||||
|
if cost is not None:
|
||||||
|
item_shipping_cost_in = 0
|
||||||
|
if cost < 10:
|
||||||
|
item_shipping_cost_in = 2
|
||||||
|
elif cost < 100:
|
||||||
|
item_shipping_cost_in = 8
|
||||||
|
else:
|
||||||
|
item_shipping_cost_in = 20
|
||||||
|
cost = cost * self.eur_to_gbp_rate + item_shipping_cost_in
|
||||||
|
active = (cost is not None)
|
||||||
|
return cost, active
|
||||||
|
|
||||||
|
def scrape_cost_and_active_selenium_chaoscards(self, url):
|
||||||
|
# page_load_element_selector = '#prod_title'
|
||||||
|
cost_selector = '.price_inc > span:nth-child(2)'
|
||||||
|
active_selector = '.product__right > form > ul.prod_det_fields.left.product-section.product-section--stock > li > div:nth-child(1) > div:nth-child(2)'
|
||||||
|
cost_text, active = self.scrape_cost_and_active_selenium(
|
||||||
|
url = url
|
||||||
|
, page_load_element_selector = cost_selector # page_load_element_selector
|
||||||
|
, cost_selector = cost_selector
|
||||||
|
, active_selector = active_selector
|
||||||
|
, invalid_active_statuses = ["Out of stock", "Coming soon"]
|
||||||
|
)
|
||||||
|
cost = Cost_Fetcher_Base.parse_cost_chaoscards(cost_text)
|
||||||
|
return cost, active
|
||||||
|
|
||||||
|
def scrape_cost_and_active_selenium_gameslore(self, url):
|
||||||
|
# page_load_element_selector = '.page-title'
|
||||||
|
cost_selector = 'div.columns > div.column.main > div.product-info-main > div.product-info-price > div.price-box > span.special-price > span.price-container > span.price-wrapper > span.price'
|
||||||
|
active_selector = '.stock > span:nth-child(1)'
|
||||||
|
cost_text, active = self.scrape_cost_and_active_selenium(
|
||||||
|
url = url
|
||||||
|
, page_load_element_selector = cost_selector # page_load_element_selector
|
||||||
|
, cost_selector = cost_selector
|
||||||
|
, active_selector = active_selector
|
||||||
|
, invalid_active_statuses = ["OUT OF STOCK"]
|
||||||
|
)
|
||||||
|
cost = Cost_Fetcher_Base.parse_cost_gameslore(cost_text)
|
||||||
|
return cost, active
|
||||||
|
|
||||||
|
def scrape_cost_and_active_selenium_magicmadhouse(self, url):
|
||||||
|
page_load_element_selector = '.productView-title'
|
||||||
|
cost_selector = 'div.body > div.container > div > div.productView > section.productView-details > div.productView-options > form > div.productView-options-selections > div.productView-product > div.productView-info > div.price-rating > div.productView-price > div.price-section.actual-price > span.price'
|
||||||
|
active_selector = '.alertBox.alertBox--error'
|
||||||
|
cost_text, active = self.scrape_cost_and_active_selenium(
|
||||||
|
url = url
|
||||||
|
, page_load_element_selector = page_load_element_selector
|
||||||
|
, cost_selector = cost_selector
|
||||||
|
, active_selector = active_selector
|
||||||
|
, invalid_active_statuses = []
|
||||||
|
)
|
||||||
|
cost = Cost_Fetcher_Base.parse_cost_magicmadhouse(cost_text)
|
||||||
|
return cost, active
|
||||||
|
|
||||||
|
def scrape_prices_and_quantities_selenium_cardmarket(self, url):
|
||||||
|
offer_container_selector = '#table > div:nth-child(1) > div.table-body > .row.article-row.g-0:nth-child(1) > div.col-offer'
|
||||||
|
price_selector = 'div.price-container > div > div:nth-child(1) > span:nth-child(1)'
|
||||||
|
quantity_selector = 'div.amount-container > span:nth-child(1)'
|
||||||
|
|
||||||
|
try:
|
||||||
|
print(f" Loading page...")
|
||||||
|
# time.sleep(random.uniform(6, 10))
|
||||||
|
try:
|
||||||
|
self.driver.get(url)
|
||||||
|
element = self.wait.until(
|
||||||
|
EC.presence_of_element_located((By.CSS_SELECTOR, offer_container_selector))
|
||||||
|
)
|
||||||
|
element = self.wait.until(
|
||||||
|
EC.element_to_be_clickable((By.CSS_SELECTOR, offer_container_selector))
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
self.driver.get(url)
|
||||||
|
element = self.wait.until(
|
||||||
|
EC.presence_of_element_located((By.CSS_SELECTOR, offer_container_selector))
|
||||||
|
)
|
||||||
|
element = self.wait.until(
|
||||||
|
EC.element_to_be_clickable((By.CSS_SELECTOR, offer_container_selector))
|
||||||
|
)
|
||||||
|
|
||||||
|
max_attempts = 10
|
||||||
|
for attempt in range(max_attempts):
|
||||||
|
try:
|
||||||
|
element = None
|
||||||
|
element = self.driver.find_element(By.CSS_SELECTOR, offer_container_selector)
|
||||||
|
text = element.text
|
||||||
|
print(f"✓ Element loaded successfully on attempt {attempt + 1}")
|
||||||
|
# return True
|
||||||
|
break
|
||||||
|
except StaleElementReferenceException:
|
||||||
|
print(f"Stale element on attempt {attempt + 1}, retrying...")
|
||||||
|
if attempt < max_attempts - 1:
|
||||||
|
time.sleep(1)
|
||||||
|
else:
|
||||||
|
raise ValueError("StaleElementReferenceException")
|
||||||
|
|
||||||
|
print(f" Page title: {self.driver.title}")
|
||||||
|
|
||||||
|
price_quantity_pairs = []
|
||||||
|
try:
|
||||||
|
offer_containers = self.driver.find_elements(By.CSS_SELECTOR, offer_container_selector)
|
||||||
|
print(f" Offer container selector: Found {len(offer_containers)} elements")
|
||||||
|
for offer_container in offer_containers:
|
||||||
|
price_element = offer_container.find_element(By.CSS_SELECTOR, price_selector)
|
||||||
|
price_text = price_element.text
|
||||||
|
if '€' in price_text and re.search(r'\d', price_text):
|
||||||
|
print(f" ✓ Found price: {price_text}")
|
||||||
|
else:
|
||||||
|
price_text = None
|
||||||
|
|
||||||
|
quantity_element = offer_container.find_element(By.CSS_SELECTOR, quantity_selector)
|
||||||
|
quantity_text = quantity_element.text
|
||||||
|
|
||||||
|
if price_text is None or quantity_text is None:
|
||||||
|
continue
|
||||||
|
price_quantity_pairs.append({
|
||||||
|
'price': Cost_Fetcher_Base.parse_cost_cardmarket(price_text = price_text)
|
||||||
|
, 'quantity': Cost_Fetcher_Base.parse_cost_cardmarket(quantity_text = quantity_text)
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Price selector failed: {e}")
|
||||||
|
return []
|
||||||
|
finally:
|
||||||
|
return price_quantity_pairs
|
||||||
|
|
||||||
|
def load_tcg_sole_trader_workbook(self):
|
||||||
|
print("Loading workbook...")
|
||||||
|
self.workbook = load_workbook(Cost_Fetcher_Base.WORKBOOK_NAME)
|
||||||
|
|
||||||
|
if Cost_Fetcher_Base.SOURCING_WORKSHEET_NAME not in self.workbook.sheetnames:
|
||||||
|
print(f"Error: Sheet '{Cost_Fetcher_Base.SOURCING_WORKSHEET_NAME}' not found")
|
||||||
|
return
|
||||||
|
if Cost_Fetcher_Base.PRODUCT_WORKSHEET_NAME not in self.workbook.sheetnames:
|
||||||
|
print(f"Error: Sheet '{Cost_Fetcher_Base.PRODUCT_WORKSHEET_NAME}' not found")
|
||||||
|
return
|
||||||
|
|
||||||
|
self.sourcing_sheet = self.workbook[Cost_Fetcher_Base.SOURCING_WORKSHEET_NAME]
|
||||||
|
self.product_sheet = self.workbook[Cost_Fetcher_Base.PRODUCT_WORKSHEET_NAME]
|
||||||
|
|
||||||
|
sourcing_table_found = False
|
||||||
|
for row in range(1, self.sourcing_sheet.max_row + 1):
|
||||||
|
if self.sourcing_sheet.cell(row, 1).value == 'tbl_Sourcing' or 'Source Name' in str(self.sourcing_sheet.cell(row, 3).value):
|
||||||
|
self.index_row_header_sourcing = row
|
||||||
|
sourcing_table_found = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not sourcing_table_found or not self.index_row_header_sourcing:
|
||||||
|
for row in range(1, min(20, self.sourcing_sheet.max_row + 1)):
|
||||||
|
if 'Source Name' in str(self.sourcing_sheet.cell(row, 3).value):
|
||||||
|
self.index_row_header_sourcing = row
|
||||||
|
sourcing_table_found = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not sourcing_table_found:
|
||||||
|
print("Error: Could not find table 'tbl_Sourcing'")
|
||||||
|
return
|
||||||
|
|
||||||
|
product_table_found = False
|
||||||
|
for row in range(1, self.product_sheet.max_row + 1):
|
||||||
|
if self.product_sheet.cell(row, 1).value == 'tbl_Product' or 'Product Id' in str(self.product_sheet.cell(row, 1).value):
|
||||||
|
self.index_row_header_product = row
|
||||||
|
product_table_found = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not product_table_found:
|
||||||
|
print("Error: Could not find table 'tbl_Product'")
|
||||||
|
return
|
||||||
|
|
||||||
|
for index_column in range(1, self.sourcing_sheet.max_column + 1):
|
||||||
|
header = str(self.sourcing_sheet.cell(self.index_row_header_sourcing, index_column).value).strip()
|
||||||
|
if 'Source Name' == header:
|
||||||
|
self.index_column_name_sourcing = index_column
|
||||||
|
elif 'Source Link' == header:
|
||||||
|
self.index_column_link_sourcing = index_column
|
||||||
|
elif 'Source Unit Cost' == header:
|
||||||
|
self.index_column_unit_cost_sourcing = index_column
|
||||||
|
elif 'Active' == header:
|
||||||
|
self.index_column_active_sourcing = index_column
|
||||||
|
elif 'Product Id' == header:
|
||||||
|
self.index_column_product_id_sourcing = index_column
|
||||||
|
|
||||||
|
for index_column in range(1, self.product_sheet.max_column + 1):
|
||||||
|
header = str(self.product_sheet.cell(self.index_row_header_product, index_column).value).strip()
|
||||||
|
if 'Is Booster Box' == header:
|
||||||
|
self.index_column_is_booster_box_product = index_column
|
||||||
|
elif 'Is Booster' == header:
|
||||||
|
self.index_column_is_booster_product = index_column
|
||||||
|
elif 'Is Precon' == header:
|
||||||
|
self.index_column_is_precon_product = index_column
|
||||||
|
elif 'Product Id' == header:
|
||||||
|
self.index_column_product_id_product = index_column
|
||||||
|
|
||||||
|
print(f"Sourcing max row: {self.sourcing_sheet.max_row}")
|
||||||
|
print(f"Sourcing header row: {self.index_row_header_sourcing}")
|
||||||
|
print(f"Sourcing header 1: {self.sourcing_sheet.cell(self.index_row_header_sourcing, 1).value}")
|
||||||
|
print(f"Sourcing Columns - Name: {self.index_column_name_sourcing}, Link: {self.index_column_link_sourcing}, Unit Cost: {self.index_column_unit_cost_sourcing}, Active: {self.index_column_active_sourcing}, Product Id: {self.index_column_product_id_sourcing}")
|
||||||
|
print(f"Product max row: {self.product_sheet.max_row}")
|
||||||
|
print(f"Product header row: {self.index_row_header_product}")
|
||||||
|
print(f"Sourcing header 1: {self.product_sheet.cell(self.index_row_header_product, 1).value}")
|
||||||
|
print(f"Product Columns - Id: {self.index_column_product_id_product}, Is Booster: {self.index_column_is_booster_product}, Is Booster Box: {self.index_column_is_booster_box_product}, Is Precon: {self.index_column_is_precon_product}")
|
||||||
|
|
||||||
|
if not all([
|
||||||
|
self.index_column_name_sourcing
|
||||||
|
, self.index_column_link_sourcing
|
||||||
|
, self.index_column_unit_cost_sourcing
|
||||||
|
, self.index_column_product_id_sourcing
|
||||||
|
, self.index_column_active_sourcing
|
||||||
|
, self.index_column_product_id_product
|
||||||
|
, self.index_column_is_booster_product
|
||||||
|
, self.index_column_is_booster_box_product
|
||||||
|
, self.index_column_is_precon_product
|
||||||
|
]):
|
||||||
|
print("Error: Could not find required columns")
|
||||||
|
return
|
||||||
|
|
||||||
|
def scrape_all_costs(self):
|
||||||
|
try:
|
||||||
|
processed_count = 0
|
||||||
|
updated_count = 0
|
||||||
|
cardmarket_accessed_last_on = 0
|
||||||
|
chaoscards_accessed_last_on = 0
|
||||||
|
gameslore_accessed_last_on = 0
|
||||||
|
magicmadhouse_accessed_last_on = 0
|
||||||
|
did_restart_since_last_chaos_cards_visit = True
|
||||||
|
did_restart_since_last_games_lore_visit = True
|
||||||
|
for index_row in range(self.index_row_header_sourcing + 1, self.sourcing_sheet.max_row + 1):
|
||||||
|
# print(f"index_row: {index_row}")
|
||||||
|
# print(f"{self.sourcing_sheet.cell(index_row, 1).value}, {self.sourcing_sheet.cell(index_row, 2).value}, {self.sourcing_sheet.cell(index_row, 3).value}, {self.sourcing_sheet.cell(index_row, 4).value}, {self.sourcing_sheet.cell(index_row, 5).value}, {self.sourcing_sheet.cell(index_row, 6).value}, {self.sourcing_sheet.cell(index_row, 7).value}, {self.sourcing_sheet.cell(index_row, 8).value}, {self.sourcing_sheet.cell(index_row, 9).value}, {self.sourcing_sheet.cell(index_row, 10).value}, {self.sourcing_sheet.cell(index_row, 11).value}, {self.sourcing_sheet.cell(index_row, 12).value}, {self.sourcing_sheet.cell(index_row, 13).value}, {self.sourcing_sheet.cell(index_row, 14).value}, {self.sourcing_sheet.cell(index_row, 15).value}, {self.sourcing_sheet.cell(index_row, 16).value}, {self.sourcing_sheet.cell(index_row, 17).value}, {self.sourcing_sheet.cell(index_row, 18).value}, {self.sourcing_sheet.cell(index_row, 19).value}")
|
||||||
|
source_name = self.sourcing_sheet.cell(index_row, self.index_column_name_sourcing).value
|
||||||
|
source_link = self.sourcing_sheet.cell(index_row, self.index_column_link_sourcing).value
|
||||||
|
source_product_id = self.sourcing_sheet.cell(index_row, self.index_column_product_id_sourcing).value
|
||||||
|
|
||||||
|
if not source_name or not source_link: # or not str(source_link).strip():
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")
|
||||||
|
|
||||||
|
product_is_booster = False
|
||||||
|
for product_row in range(self.index_row_header_product + 1, self.product_sheet.max_row + 1):
|
||||||
|
product_id = self.product_sheet.cell(product_row, self.index_column_product_id_product).value
|
||||||
|
# print(f"found product: id {product_id}")
|
||||||
|
if product_id == source_product_id:
|
||||||
|
product_is_booster_text = str(self.product_sheet.cell(product_row, self.index_column_is_booster_product).value).upper()
|
||||||
|
# print(f"product is booster: {product_is_booster_text}, type: {str(type(product_is_booster_text))}")
|
||||||
|
product_is_booster = (product_is_booster_text == "TRUE")
|
||||||
|
break
|
||||||
|
print(f"product is booster: {product_is_booster}")
|
||||||
|
|
||||||
|
if (
|
||||||
|
(
|
||||||
|
source_name == "Chaos Cards"
|
||||||
|
and not did_restart_since_last_chaos_cards_visit
|
||||||
|
)
|
||||||
|
or (
|
||||||
|
source_name == "Games Lore"
|
||||||
|
and not did_restart_since_last_games_lore_visit
|
||||||
|
)
|
||||||
|
):
|
||||||
|
self.stop_driver()
|
||||||
|
self.setup_driver()
|
||||||
|
if not self.driver:
|
||||||
|
return
|
||||||
|
did_restart_since_last_chaos_cards_visit = True
|
||||||
|
did_restart_since_last_games_lore_visit = True
|
||||||
|
|
||||||
|
if source_name in ["Card Market", "Chaos Cards", "Games Lore", "Magic Madhouse"]:
|
||||||
|
self.clear_row_sourcing_sheet(index_row = index_row)
|
||||||
|
processed_count += 1
|
||||||
|
Cost_Fetcher_Base.log_processing_new_row(
|
||||||
|
index_row = index_row
|
||||||
|
, source_link = source_link
|
||||||
|
)
|
||||||
|
|
||||||
|
cost = None
|
||||||
|
active = None
|
||||||
|
if source_name == "Card Market":
|
||||||
|
while (time.time() - cardmarket_accessed_last_on < random.uniform(10, 20)):
|
||||||
|
time.sleep(random.uniform(3, 5))
|
||||||
|
if product_is_booster:
|
||||||
|
price_quantity_pairs = self.scrape_prices_and_quantities_selenium_cardmarket(url = source_link)
|
||||||
|
if price_quantity_pairs:
|
||||||
|
self.sourcing_sheet.cell(index_row, self.index_column_active_sourcing).value = "TRUE"
|
||||||
|
max_quantity = 0
|
||||||
|
updated_row_price = False
|
||||||
|
for price_quantity_pair in price_quantity_pairs:
|
||||||
|
eur_price = price_quantity_pair['price']
|
||||||
|
quantity = price_quantity_pair['quantity']
|
||||||
|
print(f" Found price: €{eur_price}")
|
||||||
|
print(f" Found quantity: {quantity}")
|
||||||
|
max_quantity = max(max_quantity, quantity)
|
||||||
|
if quantity >= 8:
|
||||||
|
if eur_price:
|
||||||
|
gbp_price = eur_price * self.eur_to_gbp_rate
|
||||||
|
print(f" Converted: €{eur_price:.2f} → £{gbp_price:.2f}")
|
||||||
|
self.sourcing_sheet.cell(index_row, self.index_column_unit_cost_sourcing).value = gbp_price
|
||||||
|
updated_count += 1
|
||||||
|
updated_row_price = True
|
||||||
|
print(f"output row: {index_row}, value: {self.sourcing_sheet.cell(index_row, self.index_column_unit_cost_sourcing).value}")
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
print(f" Error: Could not parse price")
|
||||||
|
if not updated_row_price:
|
||||||
|
print("Offer with quantity >= 8 not found")
|
||||||
|
for price_quantity_pair in price_quantity_pairs:
|
||||||
|
eur_price = price_quantity_pair['price']
|
||||||
|
quantity = price_quantity_pair['quantity']
|
||||||
|
print(f" Found price: €{eur_price}")
|
||||||
|
print(f" Found quantity: {quantity}")
|
||||||
|
if max_quantity <= 2 or quantity == max_quantity:
|
||||||
|
if eur_price:
|
||||||
|
gbp_price = eur_price * self.eur_to_gbp_rate
|
||||||
|
print(f" Converted: €{eur_price:.2f} → £{gbp_price:.2f}")
|
||||||
|
self.sourcing_sheet.cell(index_row, self.index_column_unit_cost_sourcing).value = gbp_price
|
||||||
|
updated_count += 1
|
||||||
|
updated_row_price = True
|
||||||
|
print(f"output row: {index_row}, value: {self.sourcing_sheet.cell(index_row, self.index_column_unit_cost_sourcing).value}")
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
print(f" Error: Could not parse price")
|
||||||
|
else:
|
||||||
|
cost, active = self.scrape_cost_and_active_selenium_cardmarket(url = source_link)
|
||||||
|
cardmarket_accessed_last_on = time.time()
|
||||||
|
elif source_name == "Chaos Cards":
|
||||||
|
while (time.time() - chaoscards_accessed_last_on < random.uniform(20, 30)):
|
||||||
|
time.sleep(random.uniform(3, 5))
|
||||||
|
cost, active = self.scrape_cost_and_active_selenium_chaoscards(url = source_link)
|
||||||
|
chaoscards_accessed_last_on = time.time()
|
||||||
|
did_restart_since_last_chaos_cards_visit = False
|
||||||
|
elif source_name == "Games Lore":
|
||||||
|
while (time.time() - gameslore_accessed_last_on < random.uniform(10, 20)):
|
||||||
|
time.sleep(random.uniform(3, 5))
|
||||||
|
cost, active = self.scrape_cost_and_active_selenium_gameslore(url = source_link)
|
||||||
|
gameslore_accessed_last_on = time.time()
|
||||||
|
did_restart_since_last_games_lore_visit = False
|
||||||
|
elif source_name == "Magic Madhouse":
|
||||||
|
while (time.time() - magicmadhouse_accessed_last_on < random.uniform(10, 20)):
|
||||||
|
time.sleep(random.uniform(3, 5))
|
||||||
|
cost, active = self.scrape_cost_and_active_selenium_magicmadhouse(url = source_link)
|
||||||
|
magicmadhouse_accessed_last_on = time.time()
|
||||||
|
|
||||||
|
if (cost is not None and active is not None):
|
||||||
|
print(f" Found cost: {cost}, active: {active}")
|
||||||
|
|
||||||
|
self.sourcing_sheet.cell(index_row, self.index_column_unit_cost_sourcing).value = cost
|
||||||
|
self.sourcing_sheet.cell(index_row, self.index_column_active_sourcing).value = "TRUE" if active else "FALSE"
|
||||||
|
updated_count += 1
|
||||||
|
else:
|
||||||
|
print(f" Error: Could not find cost on page")
|
||||||
|
# Save workbook
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"Saving workbook...")
|
||||||
|
self.workbook.save(Cost_Fetcher_Base.WORKBOOK_NAME)
|
||||||
|
|
||||||
|
print(f"\nComplete!")
|
||||||
|
print(f"Processed: {processed_count} entries")
|
||||||
|
print(f"Updated: {updated_count} costs")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error: {e}")
|
||||||
|
|
||||||
|
def clear_row_sourcing_sheet(self, index_row):
    """Reset a sourcing row: blank its unit-cost cell and mark it inactive."""
    sheet = self.sourcing_sheet
    sheet.cell(index_row, self.index_column_unit_cost_sourcing).value = None
    sheet.cell(index_row, self.index_column_active_sourcing).value = "FALSE"
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def log_processing_new_row(index_row, source_link):
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"Processing row {index_row}: {source_link}")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
|
||||||
|
def __init__(self):
    """Start the (hidden) browser, load the workbook and fetch the FX rate."""
    print("Setting up browser automation (browser will not be visible)...")
    self.setup_driver()
    # Bail out early when the driver could not be created; the instance is
    # then only partially initialised and callers must check self.driver.
    if not self.driver:
        return
    self.load_tcg_sole_trader_workbook()
    # NOTE(review): return value is discarded here — presumably the method
    # caches the rate on self; confirm.
    self.get_eur_to_gbp_rate()
|
||||||
|
|
||||||
|
def stop_driver(self):
    """Shut down the Selenium browser session."""
    browser = self.driver
    browser.quit()
|
||||||
|
|
||||||
|
def main():
    """Run one full cost-scraping pass, then shut the browser down."""
    fetcher = Cost_Fetcher_Base()
    fetcher.scrape_all_costs()
    fetcher.stop_driver()


if __name__ == "__main__":
    main()
|
||||||
267
product_scraping/mtg_booster_box_price_cardmarket_fetcher.py
Normal file
267
product_scraping/mtg_booster_box_price_cardmarket_fetcher.py
Normal file
@@ -0,0 +1,267 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from openpyxl import load_workbook
|
||||||
|
import requests
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
# Flat shipping amount added to every scraped unit cost — presumably GBP,
# matching the converted prices; TODO confirm currency/units.
ITEM_SHIPPING_COST_IN = 8
|
||||||
|
|
||||||
|
def get_eur_to_gbp_rate():
    """Return the current EUR-to-GBP rate, or a static fallback on failure."""
    try:
        response = requests.get('https://api.exchangerate-api.com/v4/latest/EUR', timeout=10)
        payload = response.json()
        return payload['rates']['GBP']
    except Exception as e:
        # Any network or parsing problem falls back to a rough fixed rate.
        print(f"Error fetching exchange rate: {e}")
        print("Using fallback rate: 0.85")
        return 0.85
|
||||||
|
|
||||||
|
def parse_cardmarket_price(price_text):
    """Convert a Cardmarket price string such as '141,30 €' into a EUR float.

    Returns None for empty input or anything that does not parse as a number.
    """
    if not price_text:
        return None
    # Keep digits and the decimal comma (dots/thousands separators and the
    # currency symbol are dropped), then switch to a dot decimal point.
    digits_and_commas = re.sub(r'[^\d,]', '', price_text)
    normalised = digits_and_commas.replace(',', '.')
    try:
        return float(normalised)
    except ValueError:
        return None
|
||||||
|
|
||||||
|
def setup_driver():
    """Create a non-headless Chrome WebDriver; return None if creation fails."""
    chrome_options = Options()
    # Headless mode is intentionally left off so the browser stays visible.
    for flag in (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        '--window-size=1920,1080',
    ):
        chrome_options.add_argument(flag)

    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as e:
        print(f"Error setting up Chrome driver: {e}")
        print("Make sure Chrome and chromedriver are installed")
        return None
|
||||||
|
|
||||||
|
def scrape_cardmarket_price_selenium(driver, url):
    """Load a Card Market page and return the first euro price text, or None."""
    try:
        print(f" Loading page...")
        driver.get(url)

        # Give the page time to render (and look less like a bot).
        time.sleep(random.uniform(10, 20))

        print(f" Page title: {driver.title}")

        # CSS path to the first offer's price cell.
        price_selector = '#table > div:nth-child(1) > div.table-body > .row.article-row.g-0:nth-child(1) > div.col-offer > div.price-container > div > div:nth-child(1) > span:nth-child(1)'

        try:
            candidates = driver.find_elements(By.CSS_SELECTOR, price_selector)
            print(f" Selector: Found {len(candidates)} elements")
            for candidate in candidates[:3]:  # Check first 3
                text = candidate.text
                print(f" Text: '{text}'")
                # Accept the first element that looks like a euro amount.
                if '€' in text and re.search(r'\d', text):
                    print(f" ✓ Found price with selector: {text}")
                    return text
        except Exception as e:
            print(f" Selector failed: {e}")

        print(f" ✗ No price found")
        return None

    except Exception as e:
        print(f" Error: {e}")
        return None
|
||||||
|
|
||||||
|
def main():
    """Refresh Card Market unit costs for booster-box/precon products.

    Loads the 'Sourcing' and 'Product' sheets, scrapes the Card Market price
    for every sourcing row whose product is a booster box or precon, converts
    EUR to GBP, adds the flat shipping cost and writes the result back.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'
    product_sheet_name = 'Product'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    if sourcing_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sourcing_sheet_name}' not found")
        return
    if product_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{product_sheet_name}' not found")
        return

    sourcing_sheet = wb[sourcing_sheet_name]
    product_sheet = wb[product_sheet_name]

    # Locate the first data row of the sourcing table (row after its header).
    sourcing_table_found = False
    sourcing_start_row = None
    for row in range(1, sourcing_sheet.max_row + 1):
        if sourcing_sheet.cell(row, 1).value == 'tbl_Sourcing' or 'Source Name' in str(sourcing_sheet.cell(row, 3).value):
            sourcing_start_row = row + 1
            sourcing_table_found = True
            break

    if not sourcing_table_found or not sourcing_start_row:
        # Fallback: scan the first rows for the header text only.
        for row in range(1, min(20, sourcing_sheet.max_row + 1)):
            if 'Source Name' in str(sourcing_sheet.cell(row, 3).value):
                sourcing_start_row = row + 1
                sourcing_table_found = True
                break

    # Locate the first data row of the product table.
    # BUGFIX: the original reused one `start_row` variable for both tables
    # (resetting it to None between searches), so the sourcing header row was
    # derived from the PRODUCT table's position and the data loop over the
    # sourcing sheet started at the product table's offset. The two tables
    # are now tracked independently.
    product_start_row = None
    product_table_found = False
    for row in range(1, product_sheet.max_row + 1):
        if product_sheet.cell(row, 1).value == 'tbl_Product' or 'Product Id' in str(product_sheet.cell(row, 1).value):
            product_start_row = row + 1
            product_table_found = True
            break

    if not sourcing_table_found:
        print("Error: Could not find table 'tbl_Sourcing'")
        return
    if not product_table_found:
        print("Error: Could not find table 'tbl_Product'")
        return

    # Find column indices, each sheet from its own header row.
    sourcing_header_row = sourcing_start_row - 1
    product_header_row = product_start_row - 1
    source_name_col = None
    source_link_col = None
    source_unit_price_col = None
    source_is_available_col = None
    source_product_id_col = None

    product_id_col = None
    product_is_booster_box_col = None
    product_is_precon_col = None

    for col in range(1, sourcing_sheet.max_column + 1):
        header = str(sourcing_sheet.cell(sourcing_header_row, col).value).strip()
        if 'Source Name' in header:
            source_name_col = col
        elif 'Source Link' in header:
            source_link_col = col
        elif 'Source Unit Cost' in header:
            source_unit_price_col = col
        elif 'Active' in header:
            source_is_available_col = col
        elif 'Product Id' in header:
            source_product_id_col = col

    for col in range(1, product_sheet.max_column + 1):
        header = str(product_sheet.cell(product_header_row, col).value).strip()
        if 'Is Booster Box' in header:
            product_is_booster_box_col = col
        elif 'Is Precon' in header:
            product_is_precon_col = col
        elif 'Product Id' in header:
            product_id_col = col

    print(f"Starting from row {sourcing_start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")
    print(f"Product Columns - Id: {product_id_col}, Is Booster: {product_is_booster_box_col}, Is Precon: {product_is_precon_col}")

    if not all([source_name_col, source_link_col, source_unit_price_col, source_is_available_col, source_product_id_col, product_id_col, product_is_booster_box_col, product_is_precon_col]):
        print("Error: Could not find required columns")
        return

    # Get EUR to GBP rate once for the whole run.
    eur_to_gbp = get_eur_to_gbp_rate()
    print(f"Using EUR to GBP rate: {eur_to_gbp}")

    # Setup Selenium driver
    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    try:
        processed_count = 0
        updated_count = 0

        for row in range(sourcing_start_row, sourcing_sheet.max_row + 1):
            source_name = sourcing_sheet.cell(row, source_name_col).value
            source_link = sourcing_sheet.cell(row, source_link_col).value
            source_product_id = sourcing_sheet.cell(row, source_product_id_col).value

            # A row with neither name nor link marks the end of the table.
            if not source_name and not source_link:
                break

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            # Look up the product's flags in the Product sheet.
            product_is_booster_box = False
            product_is_precon = False
            for product_row in range(product_start_row, product_sheet.max_row + 1):
                product_id = product_sheet.cell(product_row, product_id_col).value
                if product_id == source_product_id:
                    product_is_booster_box = str(product_sheet.cell(product_row, product_is_booster_box_col).value).upper() == "TRUE"
                    product_is_precon = str(product_sheet.cell(product_row, product_is_precon_col).value).upper() == "TRUE"
                    break
            print(f"product is booster box: {product_is_booster_box}")

            # Only scrape Card Market rows for booster boxes / precons.
            if (
                (product_is_booster_box or product_is_precon)
                and source_name == "Card Market"
                and source_link
                and str(source_link).strip()
            ):
                # Reset the row first so a failed scrape leaves it inactive.
                sourcing_sheet.cell(row, source_unit_price_col).value = None
                sourcing_sheet.cell(row, source_is_available_col).value = "FALSE"

                processed_count += 1
                print(f"\n{'='*60}")
                print(f"Processing row {row}: {source_link}")
                print(f"{'='*60}")

                # Scrape price
                price_text = scrape_cardmarket_price_selenium(driver, source_link)

                if price_text:
                    print(f" Found price: {price_text}")

                    # Parse and convert
                    eur_price = parse_cardmarket_price(price_text)
                    if eur_price:
                        gbp_price = eur_price * eur_to_gbp
                        print(f" Converted: €{eur_price:.2f} → £{gbp_price:.2f}")

                        # Update cell (scraped cost plus flat shipping).
                        sourcing_sheet.cell(row, source_unit_price_col).value = gbp_price + ITEM_SHIPPING_COST_IN
                        sourcing_sheet.cell(row, source_is_available_col).value = "TRUE"
                        updated_count += 1
                    else:
                        print(f" Error: Could not parse price")
                else:
                    print(f" Error: Could not find price on page")

        # Save workbook
        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Card Market entries")
        print(f"Updated: {updated_count} prices")

    finally:
        # Always release the browser, even on error.
        driver.quit()


if __name__ == "__main__":
    main()
|
||||||
312
product_scraping/mtg_booster_expected_value_fetcher.py
Normal file
312
product_scraping/mtg_booster_expected_value_fetcher.py
Normal file
@@ -0,0 +1,312 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from openpyxl import load_workbook
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
def setup_driver(headless=True):
    """Create a Chrome WebDriver (headless by default); return None on failure."""
    chrome_options = Options()
    if headless:
        chrome_options.add_argument('--headless')
    for flag in (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        '--window-size=1920,1080',
    ):
        chrome_options.add_argument(flag)

    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as e:
        print(f"Error setting up Chrome driver: {e}")
        print("Make sure Chrome and chromedriver are installed")
        return None
|
||||||
|
|
||||||
|
def parse_price_value(text):
    """Extract the numeric amount from a price string like '$5.50' or '€5,50'.

    Returns None for empty input or anything that does not parse as a float.
    """
    if not text:
        return None
    # Keep digits, separators and a sign; normalise the comma decimal to a dot.
    numeric = re.sub(r'[^\d,.\-]', '', text).replace(',', '.')
    try:
        return float(numeric)
    except ValueError:
        return None
|
||||||
|
|
||||||
|
def scrape_mtg_stocks_values(driver, url):
    """Scrape expected value and market value from an MTG Stocks set page.

    Returns a dict with keys 'play_expected_value', 'play_market_value',
    'collector_expected_value', 'collector_market_value', 'found_play' and
    'found_collector'. Values stay None / False for any booster type the page
    does not list, or when an error occurs.
    """
    # BUGFIX: these were previously first assigned inside the `try`, after the
    # page load. Any early failure (e.g. driver.get raising) reached the
    # `except` handlers, whose return dict referenced the still-unbound names
    # and raised NameError instead of returning the documented structure.
    found_play = False
    found_collector = False
    play_expected_value = None
    play_market_value = None
    collector_expected_value = None
    collector_market_value = None

    def _result():
        # Snapshot of whatever has been scraped so far.
        return {
            'play_expected_value': play_expected_value,
            'play_market_value': play_market_value,
            'collector_expected_value': collector_expected_value,
            'collector_market_value': collector_market_value,
            'found_play': found_play,
            'found_collector': found_collector
        }

    def _row_values(table_row):
        # Expected value is in the 3rd column, market value in the 5th.
        expected_value_text = table_row.find_element(By.CSS_SELECTOR, 'td:nth-child(3)').text.strip()
        market_value_text = table_row.find_element(By.CSS_SELECTOR, 'td:nth-child(5)').text.strip()
        print(f" Expected Value: '{expected_value_text}'")
        print(f" Market Value: '{market_value_text}'")
        return expected_value_text, market_value_text

    try:
        print(f" Loading page...")
        driver.get(url)

        # Wait for table to load
        time.sleep(random.uniform(10, 20))

        # Booster type labels accepted for each pair of columns.
        valid_play_booster_types = [
            'Play Booster Pack',
            'Set Booster Pack',
            'Booster Pack',
            'Play Booster',
            'Set Booster',
            'Booster'
        ]
        valid_collector_booster_types = [
            'Collector Booster Pack',
            'Collector Booster'
        ]

        # Find all rows in the table
        row_selector = 'mtg-sets-expected-value > mtg-product-tree > .table-responsive > table > tbody:nth-child(2) > tr'
        rows = driver.find_elements(By.CSS_SELECTOR, row_selector)

        print(f" Found {len(rows)} rows in table")

        for row in rows:
            try:
                # Get the booster type from the first column.
                booster_type_elem = row.find_element(By.CSS_SELECTOR, 'td:nth-child(1) > div.d-flex.align-items-center:nth-child(1) > a:nth-child(2)')
                booster_type = booster_type_elem.text.strip()

                print(f" Checking row: '{booster_type}'")

                if booster_type in valid_play_booster_types and not found_play:
                    print(f" ✓ Match found: '{booster_type}'")
                    found_play = True
                    expected_value_text, market_value_text = _row_values(row)
                    play_expected_value = parse_price_value(expected_value_text)
                    play_market_value = parse_price_value(market_value_text)

                if booster_type in valid_collector_booster_types and not found_collector:
                    print(f" ✓ Match found: '{booster_type}'")
                    found_collector = True
                    expected_value_text, market_value_text = _row_values(row)
                    collector_expected_value = parse_price_value(expected_value_text)
                    collector_market_value = parse_price_value(market_value_text)

                # Stop early once both booster types have been captured.
                if found_play and found_collector:
                    return _result()

            except Exception:
                # Row doesn't match structure, continue to next
                continue

        print(f" ✗ No matching booster type found")
        return _result()

    except Exception as e:
        print(f" Error: {e}")
        return _result()
|
||||||
|
|
||||||
|
def main():
    """Fill the 'MTG Set' sheet's booster value columns from MTG Stocks links."""
    # Workbook / sheet holding one row per MTG set, with an MTG Stocks
    # expected-value link and four value columns to populate.
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sheet_name = 'MTG Set'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    if sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sheet_name}' not found")
        return

    sheet = wb[sheet_name]

    # Find table boundaries and columns
    table_found = False
    start_row = None   # first data row (header row + 1)
    header_row = None  # row containing the column titles

    # Search for table header
    print("max sheet column: ", str(sheet.max_column))

    # NOTE(review): max(50, ...) scans up to the LAST row (at least 50 rows);
    # if the intent was "only the first 50 rows", this should be min(...).
    for row in range(2, max(50, sheet.max_row + 1)):
        cell_value = str(sheet.cell(row, 1).value)  # NOTE(review): immediately overwritten below
        # Check multiple columns for table indicators
        for col in range(1, max(10, sheet.max_column + 1)):
            cell_value = str(sheet.cell(row, col).value)
            if 'EV MTG Stocks Link' in cell_value:
                header_row = row
                start_row = row + 1
                table_found = True
                break
        if table_found:
            break

    if not table_found:
        print("Error: Could not find 'EV MTG Stocks Link' column")
        return

    print(f"Found table header at row {header_row}")
    print(f"Starting from row {start_row}")

    # Find column indices
    ev_link_col = None
    play_expected_value_col = None
    play_market_value_col = None
    collector_expected_value_col = None
    collector_market_value_col = None

    # Map each required header title (substring match) to its column index.
    for col in range(1, sheet.max_column + 1):
        header = str(sheet.cell(header_row, col).value).strip()
        if 'EV MTG Stocks Link' in header:
            ev_link_col = col
        elif 'Play Booster Expected Market Value' in header:
            play_expected_value_col = col
        elif 'Play Boost Sealed Market Value' in header:
            play_market_value_col = col
        elif 'Collector Booster Expected Market Value' in header:
            collector_expected_value_col = col
        elif 'Collector Boost Sealed Market Value' in header:
            collector_market_value_col = col

    print(f"Columns - EV Link: {ev_link_col}, Play Expected Value: {play_expected_value_col}, Play Market Value: {play_market_value_col}, Collector Expected Value: {collector_expected_value_col}, Collector Market Value: {collector_market_value_col}")

    if not all([ev_link_col, play_expected_value_col, play_market_value_col, collector_expected_value_col, collector_market_value_col]):
        print("Error: Could not find all required columns")
        print(f" EV MTG Stocks Link: {'Found' if ev_link_col else 'NOT FOUND'}")
        print(f" Play Booster Expected Market Value: {'Found' if play_expected_value_col else 'NOT FOUND'}")
        print(f" Play Boost Sealed Market Value: {'Found' if play_market_value_col else 'NOT FOUND'}")
        print(f" Collector Booster Expected Market Value: {'Found' if collector_expected_value_col else 'NOT FOUND'}")
        print(f" Collector Boost Sealed Market Value: {'Found' if collector_market_value_col else 'NOT FOUND'}")
        return

    # Setup Selenium driver
    print("Setting up browser automation...")
    driver = setup_driver(headless=False)  # Set to False to see browser
    if not driver:
        return

    try:
        # Process rows
        processed_count = 0
        updated_count = 0
        play_cleared_count = 0
        collector_cleared_count = 0

        for row in range(start_row, sheet.max_row + 1):
            ev_link = sheet.cell(row, ev_link_col).value

            # Check if row is empty
            if not ev_link:
                # Check if we've passed the end of the table
                empty_count = 0
                for check_col in range(1, min(10, sheet.max_column + 1)):
                    if not sheet.cell(row, check_col).value:
                        empty_count += 1
                if empty_count >= 5:  # If most columns are empty, assume end of table
                    break
                continue

            processed_count += 1
            print(f"\n{'='*80}")
            print(f"Processing row {row}: {ev_link}")
            print(f"{'='*80}")

            # Scrape values
            result = scrape_mtg_stocks_values(driver, ev_link)

            if result['found_play']:
                # Update cells with found values
                sheet.cell(row, play_expected_value_col).value = result['play_expected_value']
                sheet.cell(row, play_market_value_col).value = result['play_market_value']
                updated_count += 1
                print(f" ✓ Updated - Expected: {result['play_expected_value']}, Market: {result['play_market_value']}")
            else:
                # Clear cells - no matching booster type found
                # NOTE(review): clears with '' rather than None; openpyxl keeps
                # an empty-string cell as a non-empty cell — confirm intended.
                sheet.cell(row, play_expected_value_col).value = ''
                sheet.cell(row, play_market_value_col).value = ''
                play_cleared_count += 1
                print(f" ✗ Cleared values - no matching booster type found")

            if result['found_collector']:
                # Update cells with found values
                sheet.cell(row, collector_expected_value_col).value = result['collector_expected_value']
                sheet.cell(row, collector_market_value_col).value = result['collector_market_value']
                updated_count += 1
                print(f" ✓ Updated - Expected: {result['collector_expected_value']}, Market: {result['collector_market_value']}")
            else:
                # Clear cells - no matching booster type found
                sheet.cell(row, collector_expected_value_col).value = ''
                sheet.cell(row, collector_market_value_col).value = ''
                collector_cleared_count += 1
                print(f" ✗ Cleared values - no matching booster type found")

            # Small delay between requests
            time.sleep(random.uniform(10, 20))

        # Save workbook
        print(f"\n{'='*80}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} entries")
        print(f"Updated: {updated_count} entries")
        print(f"Play fields cleared: {play_cleared_count} entries (no matching data)")
        print(f"Collector fields cleared: {collector_cleared_count} entries (no matching data)")

    finally:
        # Always release the browser, even on error.
        driver.quit()


if __name__ == "__main__":
    main()
|
||||||
328
product_scraping/mtg_booster_sale_price_cardmarket_fetcher.py
Normal file
328
product_scraping/mtg_booster_sale_price_cardmarket_fetcher.py
Normal file
@@ -0,0 +1,328 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from openpyxl import load_workbook
|
||||||
|
import requests
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
def get_eur_to_gbp_rate():
    """Fetch the live EUR→GBP conversion rate; fall back to 0.85 on failure."""
    try:
        api_response = requests.get('https://api.exchangerate-api.com/v4/latest/EUR', timeout=10)
        rates = api_response.json()['rates']
        return rates['GBP']
    except Exception as e:
        print(f"Error fetching exchange rate: {e}")
        print("Using fallback rate: 0.85")
        return 0.85
|
||||||
|
|
||||||
|
def parse_cardmarket_price(price_text):
    """Convert a '141,30 €' style Cardmarket string to a float in EUR, or None."""
    if not price_text:
        return None
    # Strip everything except digits and the decimal comma, then use a dot.
    cleaned = re.sub(r'[^\d,]', '', price_text).replace(',', '.')
    try:
        return float(cleaned)
    except ValueError:
        return None
|
||||||
|
|
||||||
|
def parse_cardmarket_quantity(quantity_text):
    """Parse a Cardmarket offer quantity string into a float, or None."""
    if not quantity_text:
        return None
    try:
        value = float(quantity_text)
    except ValueError:
        return None
    return value
|
||||||
|
|
||||||
|
def setup_driver():
    """Create a visible (non-headless) Chrome WebDriver; None on failure."""
    chrome_options = Options()
    # Headless mode deliberately omitted so the window stays visible.
    arguments = [
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        '--window-size=1920,1080',
    ]
    for argument in arguments:
        chrome_options.add_argument(argument)

    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as e:
        print(f"Error setting up Chrome driver: {e}")
        print("Make sure Chrome and chromedriver are installed")
        return None
|
||||||
|
|
||||||
|
def scrape_cardmarket_prices_and_quantities_selenium(driver, url):
    """Scrape Cardmarket offers from *url*.

    Returns a list of {'price': float|None, 'quantity': float|None} dicts,
    one per offer whose price text looks like a euro amount; [] on any error.
    """
    try:
        print(f" Loading page...")
        driver.get(url)

        # Wait for page to load + human reading time
        time.sleep(random.uniform(20, 30))

        print(f" Page title: {driver.title}")

        # NOTE(review): the :nth-child(1) on the article row limits the match
        # to a single offer container — confirm whether ALL offers were meant.
        offer_container_selector = '#table > div:nth-child(1) > div.table-body > .row.article-row.g-0:nth-child(1) > div.col-offer'
        price_selector = 'div.price-container > div > div:nth-child(1) > span:nth-child(1)'
        quantity_selector = 'div.amount-container > span:nth-child(1)'
        price_quantity_pairs = []
        try:
            offer_containers = driver.find_elements(By.CSS_SELECTOR, offer_container_selector)
            print(f" Offer container selector: Found {len(offer_containers)} elements")
            for offer_container in offer_containers:
                price_text = offer_container.find_element(By.CSS_SELECTOR, price_selector).text
                if '€' in price_text and re.search(r'\d', price_text):
                    print(f" ✓ Found price: {price_text}")
                else:
                    # Not a euro amount — skip this offer below.
                    price_text = None

                quantity_text = offer_container.find_element(By.CSS_SELECTOR, quantity_selector).text

                if price_text is None or quantity_text is None:
                    continue
                price_quantity_pairs.append({
                    'price': parse_cardmarket_price(price_text),
                    'quantity': parse_cardmarket_quantity(quantity_text)
                })
        except Exception as e:
            print(f" Price selector failed: {e}")
            return []

        # (A stale triple-quoted debug block that referenced out-of-scope
        # variables was removed here — it was a no-op expression statement.)
        return price_quantity_pairs

    except Exception as e:
        print(f" Error: {e}")
        return []
|
||||||
|
|
||||||
|
def main():
    """Update 'Sale Price' cells in the Sourcing sheet for booster products
    sourced from Card Market.

    Flow: load the workbook, locate the Sourcing and Product tables, resolve
    the needed column indices from the header row, then for each Card Market
    booster row scrape offer (price, quantity) pairs with Selenium, convert
    EUR to GBP, and write the chosen price back before saving the workbook.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'
    # mtg_set_sheet_name = 'MTG Set'
    product_sheet_name = 'Product'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    # Validate that the expected sheets exist before touching them.
    if sourcing_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sourcing_sheet_name}' not found")
        return
    """
    if mtg_set_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{mtg_set_sheet_name}' not found")
        return
    """
    if product_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{product_sheet_name}' not found")
        return

    sourcing_sheet = wb[sourcing_sheet_name]
    # mtg_set_sheet = wb[mtg_set_sheet_name]
    product_sheet = wb[product_sheet_name]

    # Locate the Sourcing table: match the table marker in column 1 or a
    # 'Source Name' header in column 3; the data starts on the next row.
    sourcing_table_found = False
    start_row = None
    for row in range(1, sourcing_sheet.max_row + 1):
        if sourcing_sheet.cell(row, 1).value == 'tbl_Sourcing' or 'Source Name' in str(sourcing_sheet.cell(row, 3).value):
            start_row = row + 1
            sourcing_table_found = True
            break

    # Fallback scan of the first ~20 rows.
    # NOTE(review): this fallback checks column 2, but the primary scan above
    # checks column 3 (and the sibling fetcher scripts also use column 3) —
    # confirm which column actually holds 'Source Name'.
    if not sourcing_table_found or not start_row:
        for row in range(1, min(20, sourcing_sheet.max_row + 1)):
            if 'Source Name' in str(sourcing_sheet.cell(row, 2).value):
                start_row = row + 1
                sourcing_table_found = True
                break
    """
    start_row = None
    # mtg_set_table_found = False
    for row in range(1, mtg_set_sheet.max_row + 1):
        if mtg_set_sheet.cell(row, 1).value == 'tbl_MTG_Set' or 'Set Name' in str(mtg_set_sheet.cell(row, 2).value):
            start_row = row + 1
            mtg_set_table_found = True
            break
    """

    # Locate the Product table.
    # NOTE(review): this overwrites the Sourcing table's start_row; from here
    # on, start_row / header_row are derived from the Product table but are
    # used for BOTH sheets — this only works if both tables begin on the same
    # row. Confirm against the workbook layout.
    start_row = None
    product_table_found = False
    for row in range(1, product_sheet.max_row + 1):
        if product_sheet.cell(row, 1).value == 'tbl_Product' or 'Product Id' in str(product_sheet.cell(row, 1).value):
            start_row = row + 1
            product_table_found = True
            break

    if not sourcing_table_found:
        print("Error: Could not find table 'tbl_Sourcing' or 'Source Name' column")
        return
    """
    if not mtg_set_table_found:
        print("Error: Could not find table 'tbl_MTG_Set' or 'Set Name' column")
        return
    """
    if not product_table_found:
        print("Error: Could not find table 'tbl_Product' or 'Product Id' column")
        return

    # Header row sits immediately above the first data row.
    header_row = start_row - 1
    source_name_col = None
    source_link_col = None
    source_unit_price_col = None
    source_is_available_col = None
    source_product_id_col = None

    product_id_col = None
    product_is_booster_col = None

    # Resolve Sourcing-sheet column indices by substring match on headers.
    for col in range(1, sourcing_sheet.max_column + 1):
        header = str(sourcing_sheet.cell(header_row, col).value).strip()
        if 'Source Name' in header:
            source_name_col = col
        elif 'Source Link' in header:
            source_link_col = col
        elif 'Sale Price' in header:
            source_unit_price_col = col
        elif 'Active' in header:
            source_is_available_col = col
        elif 'Product Id' in header:
            source_product_id_col = col

    # Resolve Product-sheet column indices.
    for col in range(1, product_sheet.max_column + 1):
        header = str(product_sheet.cell(header_row, col).value).strip()
        if 'Is Booster' in header:
            product_is_booster_col = col
        elif 'Product Id' in header:
            product_id_col = col

    print(f"Starting from row {start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")
    print(f"Product Columns - Id: {product_id_col}, Is Booster: {product_is_booster_col}")

    if not all([source_name_col, source_link_col, source_unit_price_col, source_is_available_col, source_product_id_col, product_id_col, product_is_booster_col]):
        print("Error: Could not find required columns")
        return

    # Get EUR to GBP rate
    eur_to_gbp = get_eur_to_gbp_rate()
    print(f"Using EUR to GBP rate: {eur_to_gbp}")

    # Setup Selenium driver
    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    try:
        processed_count = 0
        updated_count = 0

        for row in range(start_row, sourcing_sheet.max_row + 1):
            source_name = sourcing_sheet.cell(row, source_name_col).value
            source_link = sourcing_sheet.cell(row, source_link_col).value
            source_product_id = sourcing_sheet.cell(row, source_product_id_col).value
            # Check if row is empty
            if not source_name or not source_link or not source_product_id:
                continue

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            # Look up the matching Product row to learn whether this product
            # is a booster; only booster rows are scraped below.
            product_is_booster = False
            for product_row in range(start_row, product_sheet.max_row + 1):
                product_id = product_sheet.cell(product_row, product_id_col).value
                # print(f"found product: id {product_id}")
                if product_id == source_product_id:
                    product_is_booster_text = str(product_sheet.cell(product_row, product_is_booster_col).value).upper()
                    # print(f"product is booster: {product_is_booster_text}, type: {str(type(product_is_booster_text))}")
                    product_is_booster = (product_is_booster_text == "TRUE")
                    break
            print(f"product is booster: {product_is_booster}")
            # Check conditions
            if product_is_booster and source_name == "Card Market" and source_link and str(source_link).strip():
                # Clear the old price first so a failed scrape leaves no
                # stale value behind.
                sourcing_sheet.cell(row, source_unit_price_col).value = None
                # sourcing_sheet.cell(row, source_is_available_col).value = "FALSE"

                processed_count += 1
                print(f"\n{'='*60}")
                print(f"Processing row {row}: {source_link}")
                print(f"{'='*60}")

                # Scrape price
                price_quantity_pairs = scrape_cardmarket_prices_and_quantities_selenium(driver, source_link)

                if price_quantity_pairs:
                    # sourcing_sheet.cell(row, source_is_available_col).value = "TRUE"
                    max_quantity = 0
                    updated_row_price = False
                    # First pass: prefer the first offer with quantity >= 8.
                    for price_quantity_pair in price_quantity_pairs:
                        eur_price = price_quantity_pair['price']
                        quantity = price_quantity_pair['quantity']
                        print(f" Found price: €{eur_price}")
                        print(f" Found quantity: {quantity}")
                        max_quantity = max(max_quantity, quantity)
                        if quantity >= 8:
                            if eur_price:
                                gbp_price = eur_price * eur_to_gbp
                                print(f" Converted: €{eur_price:.2f} → £{gbp_price:.2f}")
                                sourcing_sheet.cell(row, source_unit_price_col).value = gbp_price
                                updated_count += 1
                                updated_row_price = True
                                print(f"output row: {row}, value: {sourcing_sheet.cell(row, source_unit_price_col).value}")
                                break
                            else:
                                print(f" Error: Could not parse price")
                    # Second pass: no large offer found — fall back to the
                    # largest-quantity offer (or any offer when all are tiny).
                    if not updated_row_price:
                        print("Offer with quantity >= 8 not found")
                        for price_quantity_pair in price_quantity_pairs:
                            eur_price = price_quantity_pair['price']
                            quantity = price_quantity_pair['quantity']
                            print(f" Found price: €{eur_price}")
                            print(f" Found quantity: {quantity}")
                            if max_quantity <= 2 or quantity == max_quantity:
                                if eur_price:
                                    gbp_price = eur_price * eur_to_gbp
                                    print(f" Converted: €{eur_price:.2f} → £{gbp_price:.2f}")
                                    sourcing_sheet.cell(row, source_unit_price_col).value = gbp_price
                                    updated_count += 1
                                    updated_row_price = True
                                    print(f"output row: {row}, value: {sourcing_sheet.cell(row, source_unit_price_col).value}")
                                    break
                                else:
                                    print(f" Error: Could not parse price")
                else:
                    print(f" Error: Could not find price on page")

        # Save workbook
        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Card Market entries")
        print(f"Updated: {updated_count} prices")
        print(datetime.now())

    finally:
        driver.quit()
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
241
product_scraping/mtg_product_price_chaos_cards_fetcher.py
Normal file
241
product_scraping/mtg_product_price_chaos_cards_fetcher.py
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from openpyxl import load_workbook
|
||||||
|
import requests
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
# Flat per-item shipping cost added on top of every scraped price in main()
# (same currency as the scraped price; the receiving variable is gbp_price).
ITEM_SHIPPING_COST_IN = 8
|
||||||
|
|
||||||
|
def parse_chaoscards_price(price_text):
    """Convert a Chaos Cards price string such as '£141.30' to a float.

    Keeps only digits (and commas), drops thousands separators, and treats
    the remaining digits as pence (hence the division by 100).

    Args:
        price_text: Raw text scraped from the price element; may be None.

    Returns:
        The price as a float, or None for empty or unparseable input.
    """
    if not price_text:
        return None
    price_clean = re.sub(r'[^\d,]', '', price_text)
    # Drop thousands separators so e.g. '£1,141.30' -> '114130' instead of
    # making float() raise (which silently returned None before). This
    # matches the behaviour of parse_magicmadhouse_price.
    price_clean = price_clean.replace(',', '')
    try:
        return float(price_clean) / 100
    except ValueError:
        return None
|
||||||
|
|
||||||
|
def setup_driver():
    """Create a visible (non-headless) Chrome WebDriver that presents a
    regular desktop browser fingerprint.

    Returns the driver instance, or None when Chrome/chromedriver is not
    available.
    """
    opts = Options()
    # Headless mode is deliberately left off so the browser is visible.
    # opts.add_argument('--headless')
    for flag in (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        '--window-size=1920,1080',
    ):
        opts.add_argument(flag)

    try:
        return webdriver.Chrome(options=opts)
    except Exception as e:
        print(f"Error setting up Chrome driver: {e}")
        print("Make sure Chrome and chromedriver are installed")
        return None
|
||||||
|
|
||||||
|
def scrape_chaoscards_price_selenium(driver, url):
    """Scrape price and stock availability from a Chaos Cards product page.

    Args:
        driver: A live Selenium WebDriver.
        url: Product page URL to load.

    Returns:
        (price, active) — price is a float or None, active is a bool or
        None; (None, None) on an unexpected error. When either value is
        missing the function blocks on input() so a human can inspect the
        open browser.
    """
    try:
        print(f" Loading page...")
        driver.get(url)

        # Randomised delay before reading the page (presumably to avoid
        # bot detection / let content render — confirm if adjustable).
        time.sleep(random.uniform(5, 10))

        print(f" Page title: {driver.title}")

        # Price element on the product page.
        price_selector = '.price_inc > span:nth-child(2)'
        price = None
        try:
            element = driver.find_element(By.CSS_SELECTOR, price_selector)
            text = element.text
            print(f" Text: '{text}'")
            price = parse_chaoscards_price(text)
        except Exception as e:
            print(f" Selector failed: {e}")

        # Stock-status element; anything other than "Out of stock" counts
        # as available.
        active_selector = '.product__right > form > ul.prod_det_fields.left.product-section.product-section--stock > li > div:nth-child(1) > div:nth-child(2)'
        active = None
        try:
            element = driver.find_element(By.CSS_SELECTOR, active_selector)
            text = element.text
            print(f" Text: '{text}'")
            active = (text != "Out of stock")
        except Exception as e:
            print(f" Selector failed: {e}")

        # Pause for manual inspection when either selector failed.
        if price is None or active is None:
            print(f" ✗ No price found")
            input("Press Enter to continue to next URL...")
        return price, active

    except Exception as e:
        print(f" Error: {e}")
        return None, None
|
||||||
|
|
||||||
|
def main():
    """Update 'Source Unit Cost' and 'Active' in the Sourcing sheet for rows
    sourced from Chaos Cards.

    Flow: load the workbook, locate the Sourcing table and its columns,
    then scrape each Chaos Cards link with Selenium and write the price
    (plus flat shipping) and availability back before saving.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'
    # product_sheet_name = 'Product'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    if sourcing_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sourcing_sheet_name}' not found")
        return
    """
    if product_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{product_sheet_name}' not found")
        return
    """
    sourcing_sheet = wb[sourcing_sheet_name]
    # product_sheet = wb[product_sheet_name]

    # Locate the Sourcing table: match the table marker in column 1 or a
    # 'Source Name' header in column 3; data starts on the next row.
    sourcing_table_found = False
    start_row = None
    for row in range(1, sourcing_sheet.max_row + 1):
        if sourcing_sheet.cell(row, 1).value == 'tbl_Sourcing' or 'Source Name' in str(sourcing_sheet.cell(row, 3).value):
            start_row = row + 1
            sourcing_table_found = True
            break

    # Fallback: scan the first ~20 rows for the header.
    if not sourcing_table_found or not start_row:
        for row in range(1, min(20, sourcing_sheet.max_row + 1)):
            if 'Source Name' in str(sourcing_sheet.cell(row, 3).value):
                start_row = row + 1
                sourcing_table_found = True
                break
    """
    start_row = None
    product_table_found = False
    for row in range(1, product_sheet.max_row + 1):
        if product_sheet.cell(row, 1).value == 'tbl_Product' or 'Product Id' in str(product_sheet.cell(row, 1).value):
            start_row = row + 1
            product_table_found = True
            break
    """
    if not sourcing_table_found:
        print("Error: Could not find table 'tbl_Sourcing'")
        return
    """
    if not product_table_found:
        print("Error: Could not find table 'tbl_Product'")
        return
    """

    # Find column indices
    header_row = start_row - 1
    source_name_col = None
    source_link_col = None
    source_unit_price_col = None
    source_is_available_col = None
    source_product_id_col = None
    """
    product_id_col = None
    product_is_booster_box_col = None
    product_is_precon_col = None
    """
    # Resolve column indices by substring match on the header row.
    for col in range(1, sourcing_sheet.max_column + 1):
        header = str(sourcing_sheet.cell(header_row, col).value).strip()
        if 'Source Name' in header:
            source_name_col = col
        elif 'Source Link' in header:
            source_link_col = col
        elif 'Source Unit Cost' in header:
            source_unit_price_col = col
        elif 'Active' in header:
            source_is_available_col = col
        elif 'Product Id' in header:
            source_product_id_col = col
    """
    for col in range(1, product_sheet.max_column + 1):
        header = str(product_sheet.cell(header_row, col).value).strip()
        if 'Is Booster Box' in header:
            product_is_booster_box_col = col
        elif 'Is Precon' in header:
            product_is_precon_col = col
        elif 'Product Id' in header:
            product_id_col = col
    """
    print(f"Starting from row {start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")
    # print(f"Product Columns - Id: {product_id_col}, Is Booster: {product_is_booster_box_col}, Is Precon: {product_is_precon_col}")

    if not all([source_name_col, source_link_col, source_unit_price_col, source_is_available_col, source_product_id_col]): # , product_id_col, product_is_booster_box_col, product_is_precon_col]):
        print("Error: Could not find required columns")
        return

    # Setup Selenium driver
    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    try:
        processed_count = 0
        updated_count = 0

        for row in range(start_row, sourcing_sheet.max_row + 1):
            # Restart the browser for every row (fresh session; presumably
            # an anti-bot-detection measure — TODO confirm intentional, it
            # is slow).
            driver.quit()
            driver = setup_driver()
            if not driver:
                return

            source_name = sourcing_sheet.cell(row, source_name_col).value
            source_link = sourcing_sheet.cell(row, source_link_col).value
            source_product_id = sourcing_sheet.cell(row, source_product_id_col).value

            # A fully blank row marks the end of the table.
            if not source_name and not source_link:
                break

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            if (
                source_name == "Chaos Cards"
                and source_link
                and str(source_link).strip()
            ):
                # Clear old values first so a failed scrape leaves no stale
                # price behind.
                sourcing_sheet.cell(row, source_unit_price_col).value = None
                sourcing_sheet.cell(row, source_is_available_col).value = "FALSE"

                processed_count += 1
                print(f"\n{'='*60}")
                print(f"Processing row {row}: {source_link}")
                print(f"{'='*60}")

                # Scrape price
                gbp_price, active = scrape_chaoscards_price_selenium(driver, source_link)

                if (gbp_price is not None and active is not None):
                    print(f" Found price: {gbp_price}, active: {active}")

                    if gbp_price:
                        # Add flat shipping and record availability.
                        sourcing_sheet.cell(row, source_unit_price_col).value = gbp_price + ITEM_SHIPPING_COST_IN
                        sourcing_sheet.cell(row, source_is_available_col).value = "TRUE" if active else "FALSE"
                        updated_count += 1
                    else:
                        print(f" Error: Could not parse price")
                else:
                    print(f" Error: Could not find price on page")

        # Save workbook
        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Chaos Cards entries")
        print(f"Updated: {updated_count} prices")

    finally:
        # BUGFIX: the mid-loop restart can leave driver as None when
        # setup_driver() fails; calling quit() on None raised
        # AttributeError and masked the real failure.
        if driver:
            driver.quit()
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
235
product_scraping/mtg_product_price_games_lore_fetcher.py
Normal file
235
product_scraping/mtg_product_price_games_lore_fetcher.py
Normal file
@@ -0,0 +1,235 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from openpyxl import load_workbook
|
||||||
|
import requests
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
# Flat per-item shipping cost added on top of every scraped price in main()
# (same currency as the scraped price; the receiving variable is gbp_price).
ITEM_SHIPPING_COST_IN = 8
|
||||||
|
|
||||||
|
def parse_gameslore_price(price_text):
    """Convert a Games Lore price string such as '£141.30' to a float.

    Keeps only digits (and commas), drops thousands separators, and treats
    the remaining digits as pence (hence the division by 100).

    Args:
        price_text: Raw text scraped from the price element; may be None.

    Returns:
        The price as a float, or None for empty or unparseable input.
    """
    if not price_text:
        return None
    price_clean = re.sub(r'[^\d,]', '', price_text)
    # Drop thousands separators so e.g. '£1,141.30' -> '114130' instead of
    # making float() raise (which silently returned None before). This
    # matches the behaviour of parse_magicmadhouse_price.
    price_clean = price_clean.replace(',', '')
    try:
        return float(price_clean) / 100
    except ValueError:
        return None
|
||||||
|
|
||||||
|
def setup_driver():
    """Build and return a visible (non-headless) Chrome WebDriver with a
    desktop-browser user agent; return None if Chrome/chromedriver is
    unavailable."""
    options = Options()
    # Kept non-headless on purpose — the operator watches the browser.
    # options.add_argument('--headless')
    arguments = [
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        '--window-size=1920,1080',
    ]
    for argument in arguments:
        options.add_argument(argument)

    try:
        browser = webdriver.Chrome(options=options)
    except Exception as e:
        print(f"Error setting up Chrome driver: {e}")
        print("Make sure Chrome and chromedriver are installed")
        return None
    return browser
|
||||||
|
|
||||||
|
def scrape_gameslore_price_selenium(driver, url):
    """Scrape price and stock availability from a Games Lore product page.

    Args:
        driver: A live Selenium WebDriver.
        url: Product page URL to load.

    Returns:
        (price, active) — price is a float or None, active is a bool or
        None; (None, None) on an unexpected error. When either value is
        missing the function blocks on input() so a human can inspect the
        open browser.
    """
    try:
        print(f" Loading page...")
        driver.get(url)

        # Randomised delay before reading the page (presumably to avoid
        # bot detection / let content render — confirm if adjustable).
        time.sleep(random.uniform(10, 20))

        print(f" Page title: {driver.title}")

        # Discounted ("special") price element on the product page.
        price_selector = 'div.columns > div.column.main > div.product-info-main > div.product-info-price > div.price-box > span.special-price > span.price-container > span.price-wrapper > span.price'
        price = None
        try:
            element = driver.find_element(By.CSS_SELECTOR, price_selector)
            text = element.text
            print(f" Text: '{text}'")
            price = parse_gameslore_price(text)
        except Exception as e:
            print(f" Selector failed: {e}")

        # Stock-status element; anything other than "OUT OF STOCK" counts
        # as available.
        active_selector = '.stock > span:nth-child(1)'
        active = None
        try:
            element = driver.find_element(By.CSS_SELECTOR, active_selector)
            text = element.text
            print(f" ✓ Found stock availability with selector: {text}")
            active = (text != "OUT OF STOCK")
        except Exception as e:
            print(f" Selector failed: {e}")

        # Pause for manual inspection when either selector failed.
        if price is None or active is None:
            print(f" ✗ No price found")
            input("Press Enter to continue to next URL...")
        return price, active

    except Exception as e:
        print(f" Error: {e}")
        return None, None
|
||||||
|
|
||||||
|
def main():
    """Update 'Source Unit Cost' and 'Active' in the Sourcing sheet for rows
    sourced from Games Lore.

    Flow: load the workbook, locate the Sourcing table and its columns,
    then scrape each Games Lore link with Selenium and write the price
    (plus flat shipping) and availability back before saving.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'
    # product_sheet_name = 'Product'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    if sourcing_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sourcing_sheet_name}' not found")
        return
    """
    if product_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{product_sheet_name}' not found")
        return
    """
    sourcing_sheet = wb[sourcing_sheet_name]
    # product_sheet = wb[product_sheet_name]

    # Locate the Sourcing table: match the table marker in column 1 or a
    # 'Source Name' header in column 3; data starts on the next row.
    sourcing_table_found = False
    start_row = None
    for row in range(1, sourcing_sheet.max_row + 1):
        if sourcing_sheet.cell(row, 1).value == 'tbl_Sourcing' or 'Source Name' in str(sourcing_sheet.cell(row, 3).value):
            start_row = row + 1
            sourcing_table_found = True
            break

    # Fallback: scan the first ~20 rows for the header.
    if not sourcing_table_found or not start_row:
        for row in range(1, min(20, sourcing_sheet.max_row + 1)):
            if 'Source Name' in str(sourcing_sheet.cell(row, 3).value):
                start_row = row + 1
                sourcing_table_found = True
                break
    """
    start_row = None
    product_table_found = False
    for row in range(1, product_sheet.max_row + 1):
        if product_sheet.cell(row, 1).value == 'tbl_Product' or 'Product Id' in str(product_sheet.cell(row, 1).value):
            start_row = row + 1
            product_table_found = True
            break
    """
    if not sourcing_table_found:
        print("Error: Could not find table 'tbl_Sourcing'")
        return
    """
    if not product_table_found:
        print("Error: Could not find table 'tbl_Product'")
        return
    """

    # Find column indices
    header_row = start_row - 1
    source_name_col = None
    source_link_col = None
    source_unit_price_col = None
    source_is_available_col = None
    source_product_id_col = None
    """
    product_id_col = None
    product_is_booster_box_col = None
    product_is_precon_col = None
    """
    # Resolve column indices by substring match on the header row.
    for col in range(1, sourcing_sheet.max_column + 1):
        header = str(sourcing_sheet.cell(header_row, col).value).strip()
        if 'Source Name' in header:
            source_name_col = col
        elif 'Source Link' in header:
            source_link_col = col
        elif 'Source Unit Cost' in header:
            source_unit_price_col = col
        elif 'Active' in header:
            source_is_available_col = col
        elif 'Product Id' in header:
            source_product_id_col = col
    """
    for col in range(1, product_sheet.max_column + 1):
        header = str(product_sheet.cell(header_row, col).value).strip()
        if 'Is Booster Box' in header:
            product_is_booster_box_col = col
        elif 'Is Precon' in header:
            product_is_precon_col = col
        elif 'Product Id' in header:
            product_id_col = col
    """
    print(f"Starting from row {start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")
    # print(f"Product Columns - Id: {product_id_col}, Is Booster: {product_is_booster_box_col}, Is Precon: {product_is_precon_col}")

    if not all([source_name_col, source_link_col, source_unit_price_col, source_is_available_col, source_product_id_col]): # , product_id_col, product_is_booster_box_col, product_is_precon_col]):
        print("Error: Could not find required columns")
        return

    # Setup Selenium driver
    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    try:
        processed_count = 0
        updated_count = 0

        for row in range(start_row, sourcing_sheet.max_row + 1):
            source_name = sourcing_sheet.cell(row, source_name_col).value
            source_link = sourcing_sheet.cell(row, source_link_col).value
            source_product_id = sourcing_sheet.cell(row, source_product_id_col).value

            # A fully blank row marks the end of the table.
            if not source_name and not source_link:
                break

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            if (
                source_name == "Games Lore"
                and source_link
                and str(source_link).strip()
            ):
                # Clear old values first so a failed scrape leaves no stale
                # price behind.
                sourcing_sheet.cell(row, source_unit_price_col).value = None
                sourcing_sheet.cell(row, source_is_available_col).value = "FALSE"

                processed_count += 1
                print(f"\n{'='*60}")
                print(f"Processing row {row}: {source_link}")
                print(f"{'='*60}")

                # Scrape price
                gbp_price, active = scrape_gameslore_price_selenium(driver, source_link)

                if (gbp_price is not None and active is not None):
                    print(f" Found price: {gbp_price}, active: {active}")

                    if gbp_price:
                        # Add flat shipping and record availability.
                        sourcing_sheet.cell(row, source_unit_price_col).value = gbp_price + ITEM_SHIPPING_COST_IN
                        sourcing_sheet.cell(row, source_is_available_col).value = "TRUE" if active else "FALSE"
                        updated_count += 1
                    else:
                        print(f" Error: Could not parse price")
                else:
                    print(f" Error: Could not find price on page")

        # Save workbook
        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Games Lore entries")
        print(f"Updated: {updated_count} prices")

    finally:
        driver.quit()
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
235
product_scraping/mtg_product_price_magic_madhouse_fetcher.py
Normal file
235
product_scraping/mtg_product_price_magic_madhouse_fetcher.py
Normal file
@@ -0,0 +1,235 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from openpyxl import load_workbook
|
||||||
|
import requests
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
# Flat per-item shipping cost added on top of every scraped price in main()
# (same currency as the scraped price; the receiving variable is gbp_price).
ITEM_SHIPPING_COST_IN = 8
|
||||||
|
|
||||||
|
def parse_magicmadhouse_price(price_text):
    """Convert a Magic Madhouse price string (e.g. '£141.30' or '£1,141.30')
    to a float.

    Keeps only digits and commas, strips the commas (thousands separators),
    and treats the remaining digits as pence (hence the division by 100).
    Returns None for empty or unparseable input.
    """
    if not price_text:
        return None
    # Remove currency symbol, decimal point and any other non-digit chars
    # (commas are kept here, then dropped below).
    price_clean = re.sub(r'[^\d,]', '', price_text)
    price_clean = price_clean.replace(',', '')
    try:
        return float(price_clean) / 100
    except ValueError:
        return None
|
||||||
|
|
||||||
|
def setup_driver():
    """Return a visible (non-headless) Chrome WebDriver mimicking a normal
    desktop browser, or None when Chrome/chromedriver cannot be started."""
    cfg = Options()
    # Intentionally not headless — the operator watches the scrape.
    # cfg.add_argument('--headless')
    cfg.add_argument('--no-sandbox')
    cfg.add_argument('--disable-dev-shm-usage')
    cfg.add_argument('--disable-blink-features=AutomationControlled')
    cfg.add_argument(
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    )
    cfg.add_argument('--window-size=1920,1080')

    try:
        return webdriver.Chrome(options=cfg)
    except Exception as e:
        print(f"Error setting up Chrome driver: {e}")
        print("Make sure Chrome and chromedriver are installed")
        return None
|
||||||
|
|
||||||
|
def scrape_magicmadhouse_price_selenium(driver, url):
    """Load a Magic Madhouse product page and scrape price/availability.

    Parameters:
        driver: a selenium WebDriver created by setup_driver().
        url: the product page URL to load.

    Returns:
        (price, active): price is a float (value as displayed on the
        page) or None when it could not be scraped; active is True
        when no out-of-stock error box is shown, False when it is.
        Returns (None, None) on unexpected errors.
    """
    try:
        print(f" Loading page...")
        driver.get(url)

        # Long randomised delay between page loads to reduce the chance
        # of being flagged as a bot.
        time.sleep(random.uniform(10, 20))

        print(f" Page title: {driver.title}")

        # Deep CSS path down to the actual-price <span> on the page.
        price_selector = 'div.body > div.container > div > div.productView > section.productView-details > div.productView-options > form > div.productView-options-selections > div.productView-product > div.productView-info > div.price-rating > div.productView-price > div.price-section.actual-price > span.price'
        price = None
        try:
            element = driver.find_element(By.CSS_SELECTOR, price_selector)
            text = element.text
            print(f" Text: '{text}'")
            price = parse_magicmadhouse_price(text)
        except Exception as e:
            # Broad catch kept on purpose: any selenium failure here
            # just means "no price found" for this page.
            print(f" Selector failed: {e}")

        # The error alert box only appears when the item is out of
        # stock, so FINDING it means the item is NOT active.
        active_selector = '.alertBox.alertBox--error'
        active = False
        try:
            element = driver.find_element(By.CSS_SELECTOR, active_selector)
            active = False
        except Exception as e:
            active = True

        # FIX: the previous check was `price is None or active is None`,
        # but `active` is always True/False by this point, so the second
        # clause was dead code. Pause for manual inspection whenever no
        # price could be scraped.
        if price is None:
            print(f" ✗ No out of stock item found")
            input("Press Enter to continue to next URL...")
        return price, active

    except Exception as e:
        print(f" Error: {e}")
        return None, None
|
||||||
|
|
||||||
|
def main():
    """Update Magic Madhouse prices/availability in the sourcing workbook.

    Loads the 'Sourcing' sheet, locates the tbl_Sourcing header row and
    its columns, then for every 'Magic Madhouse' row scrapes the linked
    product page with Selenium and writes the unit cost (plus flat
    shipping) and an Active flag back into the sheet before saving.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'
    # product_sheet_name = 'Product'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    if sourcing_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sourcing_sheet_name}' not found")
        return
    # NOTE: the triple-quoted blocks throughout this function are
    # disabled 'Product'-sheet handling kept as dead code (bare string
    # literals), not documentation.
    """
    if product_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{product_sheet_name}' not found")
        return
    """
    sourcing_sheet = wb[sourcing_sheet_name]
    # product_sheet = wb[product_sheet_name]

    # Locate the sourcing table: either a 'tbl_Sourcing' marker in
    # column A or a 'Source Name' header in column C. Data starts on
    # the row after the header.
    sourcing_table_found = False
    start_row = None
    for row in range(1, sourcing_sheet.max_row + 1):
        if sourcing_sheet.cell(row, 1).value == 'tbl_Sourcing' or 'Source Name' in str(sourcing_sheet.cell(row, 3).value):
            start_row = row + 1
            sourcing_table_found = True
            break

    # Fallback: scan just the first 20 rows for the header text.
    if not sourcing_table_found or not start_row:
        for row in range(1, min(20, sourcing_sheet.max_row + 1)):
            if 'Source Name' in str(sourcing_sheet.cell(row, 3).value):
                start_row = row + 1
                sourcing_table_found = True
                break
    """
    start_row = None
    product_table_found = False
    for row in range(1, product_sheet.max_row + 1):
        if product_sheet.cell(row, 1).value == 'tbl_Product' or 'Product Id' in str(product_sheet.cell(row, 1).value):
            start_row = row + 1
            product_table_found = True
            break
    """
    if not sourcing_table_found:
        print("Error: Could not find table 'tbl_Sourcing'")
        return
    """
    if not product_table_found:
        print("Error: Could not find table 'tbl_Product'")
        return
    """

    # Find column indices
    header_row = start_row - 1
    source_name_col = None
    source_link_col = None
    source_unit_price_col = None
    source_is_available_col = None
    source_product_id_col = None
    """
    product_id_col = None
    product_is_booster_box_col = None
    product_is_precon_col = None
    """
    # Columns are matched by substring of the header cell text.
    for col in range(1, sourcing_sheet.max_column + 1):
        header = str(sourcing_sheet.cell(header_row, col).value).strip()
        if 'Source Name' in header:
            source_name_col = col
        elif 'Source Link' in header:
            source_link_col = col
        elif 'Source Unit Cost' in header:
            source_unit_price_col = col
        elif 'Active' in header:
            source_is_available_col = col
        elif 'Product Id' in header:
            source_product_id_col = col
    """
    for col in range(1, product_sheet.max_column + 1):
        header = str(product_sheet.cell(header_row, col).value).strip()
        if 'Is Booster Box' in header:
            product_is_booster_box_col = col
        elif 'Is Precon' in header:
            product_is_precon_col = col
        elif 'Product Id' in header:
            product_id_col = col
    """
    print(f"Starting from row {start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")
    # print(f"Product Columns - Id: {product_id_col}, Is Booster: {product_is_booster_box_col}, Is Precon: {product_is_precon_col}")

    if not all([source_name_col, source_link_col, source_unit_price_col, source_is_available_col, source_product_id_col]): # , product_id_col, product_is_booster_box_col, product_is_precon_col]):
        print("Error: Could not find required columns")
        return

    # Setup Selenium driver
    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    try:
        processed_count = 0
        updated_count = 0

        for row in range(start_row, sourcing_sheet.max_row + 1):
            source_name = sourcing_sheet.cell(row, source_name_col).value
            source_link = sourcing_sheet.cell(row, source_link_col).value
            source_product_id = sourcing_sheet.cell(row, source_product_id_col).value

            # A row with neither name nor link marks the end of the table.
            if not source_name and not source_link:
                break

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            if (
                source_name == "Magic Madhouse"
                and source_link
                and str(source_link).strip()
            ):
                # Reset the row up front so stale values never survive
                # a failed scrape.
                sourcing_sheet.cell(row, source_unit_price_col).value = None
                sourcing_sheet.cell(row, source_is_available_col).value = "FALSE"

                processed_count += 1
                print(f"\n{'='*60}")
                print(f"Processing row {row}: {source_link}")
                print(f"{'='*60}")

                # Scrape price
                gbp_price, active = scrape_magicmadhouse_price_selenium(driver, source_link)

                if (gbp_price is not None and active is not None):
                    print(f" Found price: {gbp_price}, active: {active}")

                    if gbp_price:
                        # Unit cost = scraped price + flat shipping.
                        sourcing_sheet.cell(row, source_unit_price_col).value = gbp_price + ITEM_SHIPPING_COST_IN
                        sourcing_sheet.cell(row, source_is_available_col).value = "TRUE" if active else "FALSE"
                        updated_count += 1
                    else:
                        print(f" Error: Could not parse price")
                else:
                    print(f" Error: Could not find price on page")
        # Save workbook
        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Magic Madhouse entries")
        print(f"Updated: {updated_count} prices")

    finally:
        # Always close the browser, even if the loop or save raised.
        driver.quit()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the Magic Madhouse price sync when executed as a script.
    main()
|
||||||
14
requirements.txt
Normal file
14
requirements.txt
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# YGO
|
||||||
|
pandas
|
||||||
|
|
||||||
|
# MTG
|
||||||
|
ijson
|
||||||
|
|
||||||
|
# PKM
|
||||||
|
openpyxl
|
||||||
|
xlsxwriter
|
||||||
|
|
||||||
|
|
||||||
|
# Product Scraping
|
||||||
|
selenium
|
||||||
|
undetected_chromedriver
|
||||||
154
yugioh_card_fetcher.py
Normal file
154
yugioh_card_fetcher.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Yu-Gi-Oh Card Data Importer
|
||||||
|
Fetches all TCG cards from YGOPRODeck API and exports to Excel
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import pandas as pd
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
def fetch_yugioh_cards():
    """Download every TCG card from the YGOPRODeck public API.

    Returns:
        The list of card dicts from the API response's 'data' field,
        or None when the HTTP request fails.
    """
    print("Fetching card data from API...")

    url = "https://db.ygoprodeck.com/api/v7/cardinfo.php"

    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None
    return payload['data']
|
||||||
|
|
||||||
|
def parse_card_data(cards):
    """Parse card data into a flat structure for Excel.

    Parameters:
        cards: list of card dicts as returned by the YGOPRODeck API.

    Returns:
        A list of flat dicts, one per card, with set and price data
        collapsed into comma-separated string columns.
    """
    print(f"Processing {len(cards)} cards...")

    parsed_cards = []

    for card in cards:
        # Basic card info
        record = {
            'ID': card.get('id'),
            'Name': card.get('name'),
            'Type': card.get('type'),
            'Human Readable Type': card.get('humanReadableCardType', ''),
            'Frame Type': card.get('frameType', ''),
            'Description': card.get('desc', ''),
            'Race': card.get('race', ''),
            'Archetype': card.get('archetype', ''),
            'ATK': card.get('atk', ''),
            'DEF': card.get('def', ''),
            'Level': card.get('level', ''),
            'Attribute': card.get('attribute', ''),
            'Scale': card.get('scale', ''),
            'Linkval': card.get('linkval', ''),
            'YGOPRODeck URL': card.get('ygoprodeck_url', '')
        }

        # Set info: one comma-separated column per field, covering every
        # printing of the card (empty strings when there are none).
        printings = card.get('card_sets') or []
        record['Set Name'] = ', '.join(p.get('set_name', '') for p in printings)
        record['Set Code'] = ', '.join(p.get('set_code', '') for p in printings)
        record['Set Rarity'] = ', '.join(p.get('set_rarity', '') for p in printings)
        record['Set Price'] = ', '.join(p.get('set_price', '') for p in printings)

        # Price info: only the first entry of card_prices is used
        # (the API returns a single-element list).
        quotes = (card.get('card_prices') or [{}])[0]
        record['TCGPlayer Price'] = quotes.get('tcgplayer_price', '')
        record['Cardmarket Price'] = quotes.get('cardmarket_price', '')
        record['eBay Price'] = quotes.get('ebay_price', '')
        record['Amazon Price'] = quotes.get('amazon_price', '')

        parsed_cards.append(record)

    return parsed_cards
|
||||||
|
|
||||||
|
def export_to_excel(cards, filename='yugioh_cards.xlsx'):
    """Export cards to an Excel file.

    Parameters:
        cards: list of flat card dicts (see parse_card_data).
        filename: output .xlsx path.

    Writes the workbook with auto-sized columns, bold headers and an
    autofilter on the data range.
    """
    # FIX: this message previously printed a literal corrupted
    # placeholder instead of the actual target filename.
    print(f"Creating Excel file: {filename}")

    # Create DataFrame
    df = pd.DataFrame(cards)

    # Create Excel writer
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='YuGiOh Cards', index=False)

        # Get the worksheet
        worksheet = writer.sheets['YuGiOh Cards']

        # Auto-adjust column widths to the longest cell value.
        for column in worksheet.columns:
            max_length = 0
            column_letter = column[0].column_letter

            for cell in column:
                try:
                    if cell.value:
                        max_length = max(max_length, len(str(cell.value)))
                except Exception:
                    # Best-effort sizing: skip cells whose value cannot
                    # be stringified/measured.
                    pass

            # Set width (max 50 for description column)
            adjusted_width = min(max_length + 2, 50)
            worksheet.column_dimensions[column_letter].width = adjusted_width

        # Apply header formatting
        for cell in worksheet[1]:
            cell.font = cell.font.copy(bold=True)

        # Add autofilter
        worksheet.auto_filter.ref = worksheet.dimensions

    # FIX: same corrupted-placeholder issue as above.
    print(f"Successfully exported {len(cards)} cards to {filename}")
|
||||||
|
|
||||||
|
def main():
    """Fetch, parse and export the full Yu-Gi-Oh card list."""
    print("Yu-Gi-Oh Card Data Importer")
    print("=" * 50)

    # Fetch cards; bail out early if the API call failed.
    all_cards = fetch_yugioh_cards()
    if not all_cards:
        print("Failed to fetch card data. Exiting.")
        return

    # Parse cards
    flat_cards = parse_card_data(all_cards)

    # Export to a timestamped Excel file.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    export_to_excel(flat_cards, f"yugioh_cards_{stamp}.xlsx")

    print("\nDone! You can now open the file in LibreOffice Calc or Excel.")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the full fetch/parse/export pipeline when executed as a script.
    main()
|
||||||
BIN
yugioh_cards_20251010_154113.xlsx
Normal file
BIN
yugioh_cards_20251010_154113.xlsx
Normal file
Binary file not shown.
BIN
yugioh_cards_20251011_110545.xlsx
Normal file
BIN
yugioh_cards_20251011_110545.xlsx
Normal file
Binary file not shown.
Reference in New Issue
Block a user