def flatten_card_data(card):
    """
    Flatten a single card's data structure into a dictionary suitable for CSV.

    Scalar fields are copied unchanged, list fields become one comma-separated
    string, and nested dicts (image URIs, legalities, prices, related and
    purchase URIs, preview info) are expanded into prefixed keys.

    Args:
        card: One card object (a dict) as decoded from the JSON dump.

    Returns:
        A new single-level dict. Fields missing from ``card`` are omitted.
    """
    # Scalar fields copied through under the same key.
    simple_fields = (
        'id', 'oracle_id', 'name', 'lang', 'released_at', 'uri', 'scryfall_uri',
        'layout', 'highres_image', 'image_status', 'mana_cost', 'cmc', 'type_line',
        'oracle_text', 'power', 'toughness', 'loyalty', 'life_modifier', 'hand_modifier',
        'reserved', 'foil', 'nonfoil', 'oversized', 'promo', 'reprint', 'variation',
        'set_id', 'set', 'set_name', 'set_type', 'set_uri', 'set_search_uri',
        'scryfall_set_uri', 'rulings_uri', 'prints_search_uri', 'collector_number',
        'digital', 'rarity', 'card_back_id', 'artist', 'border_color', 'frame',
        'full_art', 'textless', 'booster', 'story_spotlight', 'edhrec_rank',
        'penny_rank', 'flavor_text', 'watermark', 'printed_name', 'printed_type_line',
        'printed_text', 'security_stamp', 'preview_text', 'content_warning',
        'flavor_name', 'game_changer',
    )
    # List-valued fields collapsed to a single comma-separated cell.
    array_fields = (
        'multiverse_ids', 'colors', 'color_identity', 'keywords', 'produced_mana',
        'games', 'finishes', 'artist_ids', 'all_parts', 'card_faces', 'related_cards',
    )
    # Marketplace / client identifiers, copied through like the simple fields.
    id_fields = ('mtgo_id', 'arena_id', 'tcgplayer_id', 'cardmarket_id')
    # Nested dicts expanded to "<prefix><inner key>" columns.
    nested_prefixes = (
        ('image_uris', 'image_uri_'),
        ('legalities', 'legal_'),
        ('prices', 'price_'),
        ('related_uris', 'uri_'),
        ('purchase_uris', 'purchase_'),
    )
    # Keys of the "preview" dict mapped to their output column names.
    preview_columns = (
        ('source', 'preview_source'),
        ('source_uri', 'preview_source_uri'),
        ('previewed_at', 'preview_date'),
    )

    def render_item(item):
        # Nested objects (entries of card_faces, all_parts, ...) are written as
        # JSON so the cell can be parsed back later; str() would emit a Python
        # repr (single quotes, None/True) that no JSON reader accepts.
        if isinstance(item, (dict, list)):
            return json.dumps(item, ensure_ascii=False)
        return str(item)

    flat_card = {}

    for field in simple_fields:
        if field in card:
            flat_card[field] = card[field]

    for field in array_fields:
        if field in card:
            value = card[field]
            if isinstance(value, list):
                flat_card[field] = ', '.join(render_item(item) for item in value)
            else:
                # Unexpected non-list value: keep it rather than lose data.
                flat_card[field] = value

    for field in id_fields:
        if field in card:
            flat_card[field] = card[field]

    for field, prefix in nested_prefixes:
        nested = card.get(field)
        if isinstance(nested, dict):
            for key, value in nested.items():
                flat_card[f'{prefix}{key}'] = value

    preview = card.get('preview')
    if isinstance(preview, dict):
        for key, column in preview_columns:
            if key in preview:
                flat_card[column] = preview[key]

    return flat_card
def process_scryfall_array(input_file, output_file):
    """
    Convert a Scryfall JSON-array dump into a CSV file.

    The whole array is loaded with ``json.load``. Every card is flattened once
    to discover the union of column names and once more while writing, so only
    the raw data (not a second flattened copy) is held in memory.

    Args:
        input_file: Path of the JSON file whose top-level value is an array of cards.
        output_file: Path of the CSV file to create (overwritten if present).

    Exits the process with status 1 if the file does not fit in memory or is
    not valid JSON.
    """
    print(f"Processing {input_file} (JSON array format)...")
    print("Loading and parsing JSON data (this may take a minute for large files)...")

    # NOTE: the previous unconditional `import ijson` was removed: nothing in
    # this function used it, yet its absence aborted the conversion.
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        print(f"Loaded {len(data):,} cards")

        # First sweep: collect every column that appears on any card.
        print("Analyzing card structure...")
        all_fields = set()
        for i, card in enumerate(data, 1):
            all_fields.update(flatten_card_data(card))
            if i % 10000 == 0:
                print(f"  Analyzed {i:,} cards...")

        print(f"Found {len(all_fields)} unique fields")
        # Sorted so the column order is stable between runs.
        fieldnames = sorted(all_fields)

        # Second sweep: write the rows; DictWriter leaves missing columns blank.
        print("Writing CSV file...")
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for i, card in enumerate(data, 1):
                writer.writerow(flatten_card_data(card))
                if i % 10000 == 0:
                    print(f"  Written {i:,} cards...")

        print(f"\nComplete! Written {len(data):,} cards to {output_file}")

    except MemoryError:
        # The old hint ("install ijson") could not help: this code path never
        # streams. Point at the path that actually does.
        print("File too large to load into memory in one piece.")
        print("Convert it to NDJSON (one card per line) so it can be streamed, then run the script again.")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        print("The file might be corrupted or not in valid JSON format.")
        sys.exit(1)
Written {cards_written:,} cards to {output_file}") + if errors > 0: + print(f"Skipped {errors} problematic lines") + +def main(): + """ + Main function to handle command line arguments and run the conversion. + """ + """ + if len(sys.argv) != 3: + print("Usage: python3 mtg_card_fetcher.py ") + print("Example: python3 mtg_card_fetcher.py mtg-default-cards-20251018212333.json cards_collection.csv") + sys.exit(1) + """ + if len(sys.argv) != 2: + print("Usage: python3 mtg_card_fetcher.py ") + print("Example: python3 mtg_card_fetcher.py mtg-default-cards-20251018212333.json") + sys.exit(1) + + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + + input_file = sys.argv[1] + output_file = f'mtg_cards_{timestamp}.csv' # sys.argv[2] + + # Validate input file exists + if not Path(input_file).exists(): + print(f"Error: Input file '{input_file}' not found!") + sys.exit(1) + + # Check file size + file_size = Path(input_file).stat().st_size / (1024 * 1024) # Size in MB + print(f"Input file size: {file_size:.1f} MB") + + # Detect format + print("Detecting file format...") + format_type = detect_json_format(input_file) + print(f"Detected format: {format_type.upper()}") + + # Warn if output file exists + if Path(output_file).exists(): + response = input(f"Warning: Output file '{output_file}' already exists. Overwrite? 
(y/n): ") + if response.lower() != 'y': + print("Cancelled.") + sys.exit(0) + + try: + if format_type == 'array': + process_scryfall_array(input_file, output_file) + else: + process_scryfall_ndjson(input_file, output_file) + except KeyboardInterrupt: + print("\n\nProcess interrupted by user.") + sys.exit(1) + except Exception as e: + print(f"Error processing file: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/mtg_card_fetcher.py b/mtg_card_fetcher.py new file mode 100644 index 0000000..2ef153f --- /dev/null +++ b/mtg_card_fetcher.py @@ -0,0 +1,351 @@ +import json +import sys +from pathlib import Path +from openpyxl import Workbook +from openpyxl.utils import get_column_letter +from openpyxl.styles import Font +from datetime import datetime + +def flatten_card_data(card): + """ + Flatten a single card's data structure into a dictionary suitable for Excel. + Handles nested fields and converts lists to comma-separated strings. 
+ """ + flat_card = {} + + # Basic fields + simple_fields = [ + 'id', 'oracle_id', 'name', 'lang', 'released_at', 'uri', 'scryfall_uri', + 'layout', 'highres_image', 'image_status', 'mana_cost', 'cmc', 'type_line', + 'oracle_text', 'power', 'toughness', 'loyalty', 'life_modifier', 'hand_modifier', + 'reserved', 'foil', 'nonfoil', 'oversized', 'promo', 'reprint', 'variation', + 'set_id', 'set', 'set_name', 'set_type', 'set_uri', 'set_search_uri', + 'scryfall_set_uri', 'rulings_uri', 'prints_search_uri', 'collector_number', + 'digital', 'rarity', 'card_back_id', 'artist', 'border_color', 'frame', + 'full_art', 'textless', 'booster', 'story_spotlight', 'edhrec_rank', + 'penny_rank', 'flavor_text', 'watermark', 'printed_name', 'printed_type_line', + 'printed_text', 'security_stamp', 'preview_text', 'content_warning', + 'flavor_name', 'game_changer' + ] + + # Copy simple fields + for field in simple_fields: + if field in card: + flat_card[field] = card[field] + + # Handle array fields - convert to comma-separated strings + array_fields = [ + 'multiverse_ids', 'colors', 'color_identity', 'keywords', 'produced_mana', + 'games', 'finishes', 'artist_ids', 'all_parts', 'card_faces', 'related_cards' + ] + + for field in array_fields: + if field in card: + if isinstance(card[field], list): + # Convert list items to strings and join + flat_card[field] = ', '.join(str(item) for item in card[field]) + else: + flat_card[field] = card[field] + + # Handle MTGO and Arena IDs + if 'mtgo_id' in card: + flat_card['mtgo_id'] = card['mtgo_id'] + if 'arena_id' in card: + flat_card['arena_id'] = card['arena_id'] + if 'tcgplayer_id' in card: + flat_card['tcgplayer_id'] = card['tcgplayer_id'] + if 'cardmarket_id' in card: + flat_card['cardmarket_id'] = card['cardmarket_id'] + + # Handle image_uris (nested dict) + if 'image_uris' in card and isinstance(card['image_uris'], dict): + for key, value in card['image_uris'].items(): + flat_card[f'image_uri_{key}'] = value + + # Handle legalities 
def detect_json_format(input_file):
    """
    Detect whether a file is a JSON array or NDJSON.

    The decision is made from the first non-whitespace character: ``[`` means
    a JSON array, anything else (including an empty file) is treated as
    newline-delimited JSON.

    Args:
        input_file: Path of the file to inspect.

    Returns:
        ``'array'`` or ``'ndjson'``.
    """
    # utf-8-sig transparently drops a leading byte-order mark, which
    # str.strip() does not treat as whitespace and which previously made
    # BOM-prefixed array files look like NDJSON.
    with open(input_file, 'r', encoding='utf-8-sig') as f:
        # Read in chunks so any amount of leading whitespace is skipped
        # without loading a multi-gigabyte single-line file.
        for chunk in iter(lambda: f.read(4096), ''):
            stripped = chunk.lstrip()
            if stripped:
                return 'array' if stripped[0] == '[' else 'ndjson'
    return 'ndjson'
enumerate(fieldnames, 1): + cell = ws.cell(row=1, column=col, value=field) + cell.font = header_font + + # Write data + print("Writing to Excel file...") + for row_num, card_data in enumerate(cards_data, 2): + for col, field in enumerate(fieldnames, 1): + value = card_data.get(field, '') + # Excel has a cell character limit of 32,767 + if isinstance(value, str) and len(value) > 32767: + value = value[:32764] + "..." + ws.cell(row=row_num, column=col, value=value) + + if row_num % 10000 == 0: + print(f" Written {row_num - 1:,} cards...") + + # Auto-adjust column widths (limited to prevent excessive widths) + print("Adjusting column widths...") + for column in ws.columns: + max_length = 0 + column_letter = get_column_letter(column[0].column) + + for cell in column[:100]: # Check first 100 rows for performance + try: + if cell.value: + max_length = max(max_length, len(str(cell.value))) + except: + pass + + adjusted_width = min(max_length + 2, 50) # Cap at 50 characters + ws.column_dimensions[column_letter].width = adjusted_width + + # Freeze the header row + ws.freeze_panes = 'A2' + + # Enable filters + ws.auto_filter.ref = ws.dimensions + + print("Saving Excel file...") + wb.save(output_file) + print(f"Saved {len(cards_data):,} cards to {output_file}") + +def process_scryfall_array(input_file, output_file): + """ + Process a Scryfall JSON array file and convert to Excel. 
def process_scryfall_ndjson(input_file, output_file):
    """
    Process a Scryfall NDJSON file and convert to Excel.

    The file is read once: each non-blank line is decoded, flattened and kept
    in memory while the union of column names is collected, then everything is
    handed to ``write_to_excel``.

    Lines that are not valid JSON, or that decode to something other than an
    object, are counted and skipped. Any other error is allowed to propagate
    so real bugs are not silently hidden.

    Args:
        input_file: Path of the NDJSON file (one card object per line).
        output_file: Path of the .xlsx file to create.
    """
    print(f"Processing {input_file} (NDJSON format)...")
    print("Reading card data...")

    all_fields = set()
    processed_cards = []
    errors = 0

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                card = json.loads(line)
            except json.JSONDecodeError:
                errors += 1
                continue

            # A bare number/string/array on a line is not a card.
            if not isinstance(card, dict):
                errors += 1
                continue

            flat_card = flatten_card_data(card)
            all_fields.update(flat_card)
            processed_cards.append(flat_card)

            if len(processed_cards) % 10000 == 0:
                print(f"  Read {len(processed_cards):,} cards...")

    print(f"Found {len(all_fields)} unique fields across {len(processed_cards):,} cards")
    if errors > 0:
        print(f"  (Skipped {errors} malformed lines)")

    # Sorted so the column order is stable between runs.
    fieldnames = sorted(all_fields)

    # Write to Excel
    write_to_excel(processed_cards, fieldnames, output_file)
    print(f"\nComplete! You can now open {output_file} in LibreOffice Calc or Excel")
+ """ + """ + if len(sys.argv) != 3: + print("Usage: python scryfall_to_xlsx.py ") + print("Example: python scryfall_to_xlsx.py all-cards-20241019.json cards_collection.xlsx") + sys.exit(1) + """ + if len(sys.argv) != 2: + print("Usage: python3 mtg_card_fetcher.py ") + print("Example: python3 mtg_card_fetcher.py mtg-default-cards-20251018212333.json") + sys.exit(1) + + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + + input_file = sys.argv[1] + output_file = f'mtg_cards_{timestamp}.xlsx' # sys.argv[2] + + # Validate input file exists + if not Path(input_file).exists(): + print(f"Error: Input file '{input_file}' not found!") + sys.exit(1) + + # Check output file has xlsx extension + if not output_file.endswith('.xlsx'): + print("Warning: Output file should have .xlsx extension") + response = input("Continue anyway? (y/n): ") + if response.lower() != 'y': + sys.exit(0) + + # Check file size + file_size = Path(input_file).stat().st_size / (1024 * 1024) # Size in MB + print(f"Input file size: {file_size:.1f} MB") + + # Detect format + print("Detecting file format...") + format_type = detect_json_format(input_file) + print(f"Detected format: {format_type.upper()}") + + # Warn if output file exists + if Path(output_file).exists(): + response = input(f"Warning: Output file '{output_file}' already exists. Overwrite? 
def flatten_list(items):
    """Render *items* as text for a spreadsheet cell.

    A list is joined with ", " (each element passed through ``str``), ``None``
    becomes the empty string, and any other value is returned as ``str(value)``.
    """
    if isinstance(items, list):
        return ", ".join(map(str, items))
    return "" if items is None else str(items)
def extract_prices(price_dict, prefix):
    """Flatten a (possibly nested) price mapping into prefixed column names.

    Each entry whose value is itself a dict contributes one
    ``"<prefix>_<type>_<metric>"`` key per inner item; any other value is
    stored directly under ``"<prefix>_<type>"``. A missing or empty mapping
    yields an empty dict.
    """
    flattened = {}
    for kind, entry in (price_dict or {}).items():
        column = f"{prefix}_{kind}"
        if not isinstance(entry, dict):
            # Plain value: one column for this price type.
            flattened[column] = entry
            continue
        flattened.update(
            {f"{column}_{metric}": amount for metric, amount in entry.items()}
        )
    return flattened
{}) + if ancient_trait: + row['ancientTrait_name'] = ancient_trait.get('name', '') + row['ancientTrait_text'] = ancient_trait.get('text', '') + else: + row['ancientTrait_name'] = '' + row['ancientTrait_text'] = '' + + # Abilities + abilities = card.get('abilities', []) + row['ability_names'], row['ability_texts'], row['ability_types'] = extract_ability_info(abilities) + + # Attacks + attacks = card.get('attacks', []) + row['attack_names'], row['attack_costs'], row['attack_damages'], row['attack_texts'], row['attack_convertedCosts'] = extract_attack_info(attacks) + + # Weaknesses + weaknesses = card.get('weaknesses', []) + row['weakness_types'], row['weakness_values'] = extract_weakness_resistance(weaknesses) + + # Resistances + resistances = card.get('resistances', []) + row['resistance_types'], row['resistance_values'] = extract_weakness_resistance(resistances) + + # Set information + set_info = card.get('set', {}) + if set_info: + row['set_id'] = set_info.get('id', '') + row['set_name'] = set_info.get('name', '') + row['set_series'] = set_info.get('series', '') + row['set_printedTotal'] = set_info.get('printedTotal', '') + row['set_total'] = set_info.get('total', '') + row['set_ptcgoCode'] = set_info.get('ptcgoCode', '') + row['set_releaseDate'] = set_info.get('releaseDate', '') + + # Legalities + legalities = card.get('legalities', {}) + row['legal_standard'] = legalities.get('standard', '') + row['legal_expanded'] = legalities.get('expanded', '') + row['legal_unlimited'] = legalities.get('unlimited', '') + + # Images + images = card.get('images', {}) + row['image_small'] = images.get('small', '') + row['image_large'] = images.get('large', '') + + # TCGPlayer prices + tcgplayer = card.get('tcgplayer', {}) + if tcgplayer: + row['tcgplayer_url'] = tcgplayer.get('url', '') + row['tcgplayer_updatedAt'] = tcgplayer.get('updatedAt', '') + + # Extract all price types + prices = tcgplayer.get('prices', {}) + tcg_prices = extract_prices(prices, 'tcgplayer') + 
def _iter_raw_cards(json_file):
    """Yield raw card dicts from *json_file*, which may be one JSON document or NDJSON."""
    with open(json_file, 'r', encoding='utf-8') as f:
        text = f.read()

    try:
        documents = [json.loads(text)]
    except json.JSONDecodeError:
        # Not a single JSON value: treat every non-blank line as its own
        # document. split('\n') rather than splitlines(), because JSON strings
        # may legally contain U+2028/U+2029, which splitlines() would break on.
        documents = (json.loads(line) for line in text.split('\n') if line.strip())

    for document in documents:
        if isinstance(document, dict):
            yield document
        elif isinstance(document, list):
            yield from document


def main():
    """
    Convert every card JSON file under ./pkm_data/cards/en into one .xlsx file.

    Each file may be a JSON array, a single JSON object, or newline-delimited
    JSON. Files that fail to parse are reported and skipped; cards read before
    the failure are kept. The workbook is written to
    ``pkm_cards_<timestamp>.xlsx`` in the current directory with a formatted
    header row and column widths capped at 50.
    """
    # Directory containing the JSON files
    data_dir = './pkm_data/cards/en'

    # Sorted so the row order of the spreadsheet is reproducible.
    json_files = sorted(glob.glob(os.path.join(data_dir, '*.json')))

    if not json_files:
        print(f"No JSON files found in {data_dir}")
        return

    all_cards = []

    for json_file in json_files:
        print(f"Processing {os.path.basename(json_file)}...")

        try:
            for card in _iter_raw_cards(json_file):
                all_cards.append(process_card(card))
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole export.
            print(f"Error processing {json_file}: {str(e)}")
            continue

    if not all_cards:
        # Avoid writing an empty workbook and reporting it as a success.
        print("No cards could be read; nothing to write.")
        return

    # Create DataFrame
    df = pd.DataFrame(all_cards)

    # NOTE: rows stay in file order; sorting by set name / card number is
    # intentionally not applied.

    # Save to Excel
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f'pkm_cards_{timestamp}.xlsx'

    # Create Excel writer with xlsxwriter engine
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Pokemon Cards', index=False)

        # Get the workbook and worksheet
        workbook = writer.book
        worksheet = writer.sheets['Pokemon Cards']

        # Add some formatting
        header_format = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'fg_color': '#D7E4BD',
            'border': 1,
        })

        # Re-write the header row so it carries the custom format
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header_format)

        # Auto-adjust column widths
        for i, col in enumerate(df.columns):
            # Longest rendered value in the column, or the header if longer
            max_len = df[col].astype(str).str.len().max()
            max_len = max(max_len, len(col)) + 2
            # Cap column width at 50
            max_len = min(max_len, 50)
            worksheet.set_column(i, i, max_len)

    print(f"\nSuccessfully created {output_file}")
    print(f"Total cards processed: {len(df)}")
    print(f"\nColumns in the spreadsheet:")
    for col in df.columns:
        print(f"  - {col}")


if __name__ == "__main__":
    main()
class Cost_Fetcher_Base:
    """Scrape supplier prices and stock status into the 'Sourcing' worksheet.

    The fetcher opens the workbook named by ``WORKBOOK_NAME``, locates the
    header rows and columns of the Sourcing and Product sheets, visits every
    source link with a Selenium-driven Chrome instance and writes the unit
    cost and an "Active" flag ("TRUE"/"FALSE") back into the Sourcing sheet.
    """

    PRODUCT_WORKSHEET_NAME = 'Product'
    SOURCING_WORKSHEET_NAME = 'Sourcing'
    WORKBOOK_NAME = 'TCG Sole Trader Copy.xlsx'

    # (min, max) seconds that must have elapsed since the previous visit to a
    # source before it is requested again.
    _COOLDOWN_SECONDS = {
        "Card Market": (10, 20),
        "Chaos Cards": (20, 30),
        "Games Lore": (10, 20),
        "Magic Madhouse": (10, 20),
    }
    # Sources for which the browser is restarted between consecutive visits.
    _SOURCES_NEEDING_FRESH_DRIVER = frozenset({"Chaos Cards", "Games Lore"})

    # Header text -> attribute that stores the 1-based column index.
    _SOURCING_COLUMN_ATTRIBUTES = {
        'Source Name': 'index_column_name_sourcing',
        'Source Link': 'index_column_link_sourcing',
        'Source Unit Cost': 'index_column_unit_cost_sourcing',
        'Active': 'index_column_active_sourcing',
        'Product Id': 'index_column_product_id_sourcing',
    }
    _PRODUCT_COLUMN_ATTRIBUTES = {
        'Is Booster Box': 'index_column_is_booster_box_product',
        'Is Booster': 'index_column_is_booster_product',
        'Is Precon': 'index_column_is_precon_product',
        'Product Id': 'index_column_product_id_product',
    }

    # Third-party types are quoted so that declaring the class does not
    # require those names to be resolvable at class-creation time.
    driver: "Chrome"  # webdriver.Chrome
    eur_to_gbp_rate: float
    index_column_active_sourcing: int
    index_column_is_booster_product: int
    index_column_is_booster_box_product: int
    index_column_is_precon_product: int
    index_column_link_sourcing: int
    index_column_name_sourcing: int
    index_column_product_id_product: int
    index_column_product_id_sourcing: int
    index_column_unit_cost_sourcing: int
    index_row_header_product: int
    index_row_header_sourcing: int
    product_sheet: "Worksheet"
    sourcing_sheet: "Worksheet"
    wait: "WebDriverWait"
    workbook: "Workbook"

    # ------------------------------------------------------------------
    # Price parsing
    # ------------------------------------------------------------------
    @staticmethod
    def parse_cost(cost_text):
        """Convert a price such as '£12.99' or '£1,299.00' to a float.

        Every non-digit character is discarded and the remaining digits are
        read as a whole number of pence, so the text is expected to carry
        exactly two decimal places.
        NOTE(review): a price shown without pence ('£12') parses as 0.12 -
        confirm the scraped sites always render two decimals.

        Returns:
            The price as a float, or ``None`` when the text is empty or
            contains no digits.
        """
        if not cost_text:
            return None
        # Commas are stripped too: keeping them made float() fail on any
        # price that uses a thousands separator.
        digits = re.sub(r'\D', '', cost_text)
        try:
            return float(digits) / 100
        except ValueError:
            return None

    @classmethod
    def parse_cost_from_pennies(cls, cost_text):
        """Convert a text holding a number of pence ('1299p') to pounds.

        ``parse_cost`` already performs the pence-to-pounds division, so the
        result is returned as is; dividing a second time (as this method used
        to) produced values one hundred times too small.
        """
        if not cost_text:
            return None
        return cls.parse_cost(cost_text=cost_text)

    @classmethod
    def parse_cost_chaoscards(cls, cost_text):
        """Parse a Chaos Cards price text (see ``parse_cost``)."""
        return cls.parse_cost(cost_text=cost_text)

    @classmethod
    def parse_cost_cardmarket(cls, cost_text):
        """Convert '141,30 €' format to float in EUR.

        Only digits and commas are kept and the comma is read as the decimal
        separator. Returns ``None`` for empty or unparseable text.
        """
        if not cost_text:
            return None
        cost_clean = re.sub(r'[^\d,]', '', cost_text)
        cost_clean = cost_clean.replace(',', '.')
        try:
            return float(cost_clean)
        except ValueError:
            return None

    @classmethod
    def parse_cost_gameslore(cls, cost_text):
        """Parse a Games Lore price text (see ``parse_cost``)."""
        return cls.parse_cost(cost_text=cost_text)

    @classmethod
    def parse_cost_magicmadhouse(cls, cost_text):
        """Parse a Magic Madhouse price text (see ``parse_cost``)."""
        return cls.parse_cost(cost_text=cost_text)

    @staticmethod
    def _parse_quantity(quantity_text):
        """Return the digits of ``quantity_text`` as an int, or ``None``."""
        digits = re.sub(r'\D', '', quantity_text or '')
        return int(digits) if digits else None

    # ------------------------------------------------------------------
    # Environment setup
    # ------------------------------------------------------------------
    def get_eur_to_gbp_rate(self):
        """Store the EUR->GBP rate in ``self.eur_to_gbp_rate``.

        Falls back to 0.85 when the request fails, returns an HTTP error
        status, or the payload does not contain a numeric GBP rate.
        """
        try:
            response = requests.get('https://api.exchangerate-api.com/v4/latest/EUR', timeout=10)
            response.raise_for_status()
            self.eur_to_gbp_rate = float(response.json()['rates']['GBP'])
        except Exception as e:  # best-effort: any failure uses the fallback
            print(f"Error fetching exchange rate: {e}")
            print("Using fallback rate: 0.85")
            self.eur_to_gbp_rate = 0.85

    def setup_driver(self, version_main=133):
        """Start Chrome and create the shared 15-second ``WebDriverWait``.

        Args:
            version_main: value forwarded to ``Chrome(version_main=...)``.

        On failure ``self.driver`` and ``self.wait`` are set to ``None`` so
        callers can test ``if not self.driver`` without an AttributeError.
        """
        print("Starting driver")
        self.driver = None
        self.wait = None
        try:
            self.driver = Chrome(version_main=version_main)  # webdriver.Chrome(options=chrome_options)
        except Exception as e:
            print(f"Error setting up Chrome driver: {e}")
            print("Make sure Chrome and chromedriver are installed")
            return
        self.wait = WebDriverWait(self.driver, 15)

    # ------------------------------------------------------------------
    # Page helpers
    # ------------------------------------------------------------------
    def _load_page(self, url, ready_selector):
        """Open ``url`` and wait until ``ready_selector`` is present and clickable.

        The whole sequence is retried once; the second failure propagates.
        """
        locator = (By.CSS_SELECTOR, ready_selector)
        for attempt in range(2):
            try:
                self.driver.get(url)
                self.wait.until(EC.presence_of_element_located(locator))
                self.wait.until(EC.element_to_be_clickable(locator))
                return
            except Exception:
                if attempt == 1:
                    raise

    def _confirm_element_readable(self, selector, max_attempts=10):
        """Read the element's text, retrying while the reference is stale.

        Raises:
            ValueError: the element was still stale after ``max_attempts``.
        """
        for attempt in range(max_attempts):
            try:
                self.driver.find_element(By.CSS_SELECTOR, selector).text
                print(f"✓ Element loaded successfully on attempt {attempt + 1}")
                return
            except StaleElementReferenceException:
                print(f"Stale element on attempt {attempt + 1}, retrying...")
                if attempt < max_attempts - 1:
                    time.sleep(1)
        raise ValueError("StaleElementReferenceException")

    # ------------------------------------------------------------------
    # Scrapers
    # ------------------------------------------------------------------
    def scrape_cost_and_active_selenium(self, url, page_load_element_selector, cost_selector, active_selector, invalid_active_statuses):
        """Load ``url`` and return ``(cost_text, active)``.

        Args:
            url: page to open.
            page_load_element_selector: CSS selector that signals the page is ready.
            cost_selector: CSS selector of the element holding the price text.
            active_selector: CSS selector of the stock-status element, or
                ``None`` to derive ``active`` from whether a cost was found.
            invalid_active_statuses: status texts that mean "not active";
                ``None`` treats every status as active.

        Returns:
            ``(cost_text, active)``; ``(None, None)`` when loading fails.
            Either item may be ``None`` when its element could not be read.
            The method pauses on ``input()`` whenever a value is missing.
        """
        try:
            print(f" Loading page...")
            self._load_page(url, page_load_element_selector)
            self._confirm_element_readable(page_load_element_selector)
            print(f" Page title: {self.driver.title}")

            cost = None
            for _ in range(11):
                try:
                    cost = self.driver.find_element(By.CSS_SELECTOR, cost_selector).text
                    print(f" Text: '{cost}'")
                    break
                except Exception as e:
                    print(f" Selector failed: {e}")
                    time.sleep(random.uniform(2, 4))
            else:
                print("10 cost selector fails")

            active = None
            if active_selector is None:
                active = (cost is not None)
            else:
                try:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, active_selector)
                    if not elements:
                        # No status element at all counts as "in stock".
                        active = True
                    else:
                        text = elements[0].text
                        print(f" Text: '{text}'")
                        # NOTE(review): with an empty status list every text
                        # counts as active - confirm that is intended for
                        # callers passing [].
                        active = (invalid_active_statuses is None or text not in invalid_active_statuses)
                except Exception as e:
                    print(f" Selector failed: {e}")

            if cost is None or active is None:
                print(f" ✗ No cost found")
                print(f"Cost: {cost}, Active: {active}")
                input("Press Enter to continue to next URL...")
            return cost, active

        except Exception as e:
            print(f" Error: {e}")
            input("Press Enter to continue to next URL...")
            return None, None

    def scrape_cost_and_active_selenium_cardmarket(self, url):
        """Return ``(gbp_cost_including_shipping, active)`` for a Card Market page.

        The first offer's EUR price is converted with ``self.eur_to_gbp_rate``
        and a tiered inbound shipping amount (2 / 8 / 20, chosen from the EUR
        price) is added. ``active`` is True exactly when a cost was parsed.
        """
        page_load_element_selector = "body > main.container > div.page-title-container"
        cost_selector = '#table > div:nth-child(1) > div.table-body > .row.article-row.g-0:nth-child(1) > div.col-offer > div.price-container > div > div:nth-child(1) > span:nth-child(1)'
        cost_text, active = self.scrape_cost_and_active_selenium(
            url=url,
            page_load_element_selector=page_load_element_selector,
            cost_selector=cost_selector,
            active_selector=None,
            invalid_active_statuses=[],
        )
        cost = Cost_Fetcher_Base.parse_cost_cardmarket(cost_text)
        if cost is not None:
            if cost < 10:
                item_shipping_cost_in = 2
            elif cost < 100:
                item_shipping_cost_in = 8
            else:
                item_shipping_cost_in = 20
            cost = cost * self.eur_to_gbp_rate + item_shipping_cost_in
        active = (cost is not None)
        return cost, active

    def scrape_cost_and_active_selenium_chaoscards(self, url):
        """Return ``(cost, active)`` for a Chaos Cards product page."""
        cost_selector = '.price_inc > span:nth-child(2)'
        active_selector = '.product__right > form > ul.prod_det_fields.left.product-section.product-section--stock > li > div:nth-child(1) > div:nth-child(2)'
        cost_text, active = self.scrape_cost_and_active_selenium(
            url=url,
            page_load_element_selector=cost_selector,  # the price doubles as the "page ready" marker
            cost_selector=cost_selector,
            active_selector=active_selector,
            invalid_active_statuses=["Out of stock", "Coming soon"],
        )
        cost = Cost_Fetcher_Base.parse_cost_chaoscards(cost_text)
        return cost, active

    def scrape_cost_and_active_selenium_gameslore(self, url):
        """Return ``(cost, active)`` for a Games Lore product page."""
        cost_selector = 'div.columns > div.column.main > div.product-info-main > div.product-info-price > div.price-box > span.special-price > span.price-container > span.price-wrapper > span.price'
        active_selector = '.stock > span:nth-child(1)'
        cost_text, active = self.scrape_cost_and_active_selenium(
            url=url,
            page_load_element_selector=cost_selector,  # the price doubles as the "page ready" marker
            cost_selector=cost_selector,
            active_selector=active_selector,
            invalid_active_statuses=["OUT OF STOCK"],
        )
        cost = Cost_Fetcher_Base.parse_cost_gameslore(cost_text)
        return cost, active

    def scrape_cost_and_active_selenium_magicmadhouse(self, url):
        """Return ``(cost, active)`` for a Magic Madhouse product page."""
        page_load_element_selector = '.productView-title'
        cost_selector = 'div.body > div.container > div > div.productView > section.productView-details > div.productView-options > form > div.productView-options-selections > div.productView-product > div.productView-info > div.price-rating > div.productView-price > div.price-section.actual-price > span.price'
        active_selector = '.alertBox.alertBox--error'
        cost_text, active = self.scrape_cost_and_active_selenium(
            url=url,
            page_load_element_selector=page_load_element_selector,
            cost_selector=cost_selector,
            active_selector=active_selector,
            invalid_active_statuses=[],
        )
        cost = Cost_Fetcher_Base.parse_cost_magicmadhouse(cost_text)
        return cost, active

    def scrape_prices_and_quantities_selenium_cardmarket(self, url):
        """Return the Card Market offers on ``url`` as price/quantity dicts.

        Returns:
            A list of ``{'price': float_eur, 'quantity': int}``. Offers whose
            price or quantity cannot be parsed are skipped. Any failure while
            loading or reading the page yields ``[]``.
        """
        offer_container_selector = '#table > div:nth-child(1) > div.table-body > .row.article-row.g-0:nth-child(1) > div.col-offer'
        price_selector = 'div.price-container > div > div:nth-child(1) > span:nth-child(1)'
        quantity_selector = 'div.amount-container > span:nth-child(1)'

        price_quantity_pairs = []
        try:
            print(f" Loading page...")
            self._load_page(url, offer_container_selector)
            self._confirm_element_readable(offer_container_selector)
            print(f" Page title: {self.driver.title}")

            offer_containers = self.driver.find_elements(By.CSS_SELECTOR, offer_container_selector)
            print(f" Offer container selector: Found {len(offer_containers)} elements")
            for offer_container in offer_containers:
                price_text = offer_container.find_element(By.CSS_SELECTOR, price_selector).text
                if not ('€' in price_text and re.search(r'\d', price_text)):
                    continue
                print(f" ✓ Found price: {price_text}")
                quantity_text = offer_container.find_element(By.CSS_SELECTOR, quantity_selector).text

                # parse_cost_cardmarket takes `cost_text`; passing it under any
                # other keyword raises TypeError and discards every offer.
                price = Cost_Fetcher_Base.parse_cost_cardmarket(price_text)
                quantity = Cost_Fetcher_Base._parse_quantity(quantity_text)
                if price is None or quantity is None:
                    continue
                price_quantity_pairs.append({'price': price, 'quantity': quantity})
        except Exception as e:
            print(f" Price selector failed: {e}")
            return []
        return price_quantity_pairs

    # ------------------------------------------------------------------
    # Workbook handling
    # ------------------------------------------------------------------
    @staticmethod
    def _find_header_row(sheet, table_name, header_text, header_column):
        """Return the first row whose column 1 equals ``table_name`` or whose
        ``header_column`` cell contains ``header_text``; ``None`` if absent."""
        for row in range(1, sheet.max_row + 1):
            if (
                sheet.cell(row, 1).value == table_name
                or header_text in str(sheet.cell(row, header_column).value)
            ):
                return row
        return None

    def _assign_column_indices(self, sheet, header_row, attribute_by_header):
        """Store the column index of every known header on ``self``."""
        for index_column in range(1, sheet.max_column + 1):
            header = str(sheet.cell(header_row, index_column).value).strip()
            attribute = attribute_by_header.get(header)
            if attribute is not None:
                setattr(self, attribute, index_column)

    def load_tcg_sole_trader_workbook(self):
        """Open the workbook and locate the header rows and required columns.

        Returns:
            True when both sheets, both header rows and every required column
            were found; False (after printing the reason) otherwise.
        """
        print("Loading workbook...")
        # Start from a known state so that a missing header is reported by the
        # checks below instead of surfacing as an AttributeError.
        self.index_row_header_sourcing = None
        self.index_row_header_product = None
        for attribute in (
            *Cost_Fetcher_Base._SOURCING_COLUMN_ATTRIBUTES.values(),
            *Cost_Fetcher_Base._PRODUCT_COLUMN_ATTRIBUTES.values(),
        ):
            setattr(self, attribute, None)

        self.workbook = load_workbook(Cost_Fetcher_Base.WORKBOOK_NAME)

        for sheet_name in (Cost_Fetcher_Base.SOURCING_WORKSHEET_NAME, Cost_Fetcher_Base.PRODUCT_WORKSHEET_NAME):
            if sheet_name not in self.workbook.sheetnames:
                print(f"Error: Sheet '{sheet_name}' not found")
                return False

        self.sourcing_sheet = self.workbook[Cost_Fetcher_Base.SOURCING_WORKSHEET_NAME]
        self.product_sheet = self.workbook[Cost_Fetcher_Base.PRODUCT_WORKSHEET_NAME]

        self.index_row_header_sourcing = Cost_Fetcher_Base._find_header_row(
            self.sourcing_sheet, 'tbl_Sourcing', 'Source Name', 3
        )
        if self.index_row_header_sourcing is None:
            print("Error: Could not find table 'tbl_Sourcing'")
            return False

        self.index_row_header_product = Cost_Fetcher_Base._find_header_row(
            self.product_sheet, 'tbl_Product', 'Product Id', 1
        )
        if self.index_row_header_product is None:
            print("Error: Could not find table 'tbl_Product'")
            return False

        self._assign_column_indices(
            self.sourcing_sheet, self.index_row_header_sourcing, Cost_Fetcher_Base._SOURCING_COLUMN_ATTRIBUTES
        )
        self._assign_column_indices(
            self.product_sheet, self.index_row_header_product, Cost_Fetcher_Base._PRODUCT_COLUMN_ATTRIBUTES
        )

        print(f"Sourcing max row: {self.sourcing_sheet.max_row}")
        print(f"Sourcing header row: {self.index_row_header_sourcing}")
        print(f"Sourcing header 1: {self.sourcing_sheet.cell(self.index_row_header_sourcing, 1).value}")
        print(f"Sourcing Columns - Name: {self.index_column_name_sourcing}, Link: {self.index_column_link_sourcing}, Unit Cost: {self.index_column_unit_cost_sourcing}, Active: {self.index_column_active_sourcing}, Product Id: {self.index_column_product_id_sourcing}")
        print(f"Product max row: {self.product_sheet.max_row}")
        print(f"Product header row: {self.index_row_header_product}")
        print(f"Product header 1: {self.product_sheet.cell(self.index_row_header_product, 1).value}")
        print(f"Product Columns - Id: {self.index_column_product_id_product}, Is Booster: {self.index_column_is_booster_product}, Is Booster Box: {self.index_column_is_booster_box_product}, Is Precon: {self.index_column_is_precon_product}")

        if not all([
            self.index_column_name_sourcing,
            self.index_column_link_sourcing,
            self.index_column_unit_cost_sourcing,
            self.index_column_product_id_sourcing,
            self.index_column_active_sourcing,
            self.index_column_product_id_product,
            self.index_column_is_booster_product,
            self.index_column_is_booster_box_product,
            self.index_column_is_precon_product,
        ]):
            print("Error: Could not find required columns")
            return False
        return True

    # ------------------------------------------------------------------
    # Main scraping loop
    # ------------------------------------------------------------------
    def _is_booster_product(self, source_product_id):
        """Return True when the first Product row with this id has 'Is Booster' == TRUE."""
        for product_row in range(self.index_row_header_product + 1, self.product_sheet.max_row + 1):
            product_id = self.product_sheet.cell(product_row, self.index_column_product_id_product).value
            if product_id == source_product_id:
                flag_text = str(self.product_sheet.cell(product_row, self.index_column_is_booster_product).value).upper()
                return flag_text == "TRUE"
        return False

    @staticmethod
    def _wait_for_cooldown(accessed_last_on, min_seconds, max_seconds):
        """Sleep in 3-5 s steps until a random ``min..max`` second gap has passed."""
        while time.time() - accessed_last_on < random.uniform(min_seconds, max_seconds):
            time.sleep(random.uniform(3, 5))

    @staticmethod
    def _choose_booster_offer_price(price_quantity_pairs):
        """Pick the EUR price to record for a booster product.

        Preference order: the first priced offer with at least 8 units;
        otherwise the first priced offer holding the largest quantity on the
        page (or simply the first priced offer when nobody has more than 2).
        Returns ``None`` when no offer has a usable price.
        """
        for pair in price_quantity_pairs:
            if (pair['quantity'] or 0) >= 8 and pair['price']:
                return pair['price']
        print("Offer with quantity >= 8 not found")
        max_quantity = max((pair['quantity'] or 0 for pair in price_quantity_pairs), default=0)
        for pair in price_quantity_pairs:
            if (max_quantity <= 2 or (pair['quantity'] or 0) == max_quantity) and pair['price']:
                return pair['price']
        return None

    def _write_booster_offer_price(self, index_row, price_quantity_pairs):
        """Write the chosen booster offer price (in GBP) to ``index_row``.

        The row is flagged active as soon as at least one offer exists.
        Returns True when a unit cost was written.
        """
        if not price_quantity_pairs:
            return False
        self.sourcing_sheet.cell(index_row, self.index_column_active_sourcing).value = "TRUE"
        for pair in price_quantity_pairs:
            print(f" Found price: €{pair['price']}")
            print(f" Found quantity: {pair['quantity']}")
        eur_price = Cost_Fetcher_Base._choose_booster_offer_price(price_quantity_pairs)
        if not eur_price:
            print(f" Error: Could not parse price")
            return False
        gbp_price = eur_price * self.eur_to_gbp_rate
        print(f" Converted: €{eur_price:.2f} → £{gbp_price:.2f}")
        self.sourcing_sheet.cell(index_row, self.index_column_unit_cost_sourcing).value = gbp_price
        print(f"output row: {index_row}, value: {self.sourcing_sheet.cell(index_row, self.index_column_unit_cost_sourcing).value}")
        return True

    def scrape_all_costs(self):
        """Scrape every supported Sourcing row and save the workbook.

        Rows without a source name or link are skipped. For supported sources
        the row is cleared first, then refilled when scraping succeeds. Any
        exception aborts the run with a printed error and nothing is saved.
        """
        try:
            processed_count = 0
            updated_count = 0
            scrapers = {
                "Card Market": self.scrape_cost_and_active_selenium_cardmarket,
                "Chaos Cards": self.scrape_cost_and_active_selenium_chaoscards,
                "Games Lore": self.scrape_cost_and_active_selenium_gameslore,
                "Magic Madhouse": self.scrape_cost_and_active_selenium_magicmadhouse,
            }
            accessed_last_on = dict.fromkeys(scrapers, 0)
            # Restart-sensitive sources visited since the browser was last started.
            visited_since_restart = set()

            for index_row in range(self.index_row_header_sourcing + 1, self.sourcing_sheet.max_row + 1):
                source_name = self.sourcing_sheet.cell(index_row, self.index_column_name_sourcing).value
                source_link = self.sourcing_sheet.cell(index_row, self.index_column_link_sourcing).value
                source_product_id = self.sourcing_sheet.cell(index_row, self.index_column_product_id_sourcing).value

                if not source_name or not source_link:
                    continue

                print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

                product_is_booster = self._is_booster_product(source_product_id)
                print(f"product is booster: {product_is_booster}")

                if source_name in visited_since_restart:
                    self.stop_driver()
                    self.setup_driver()
                    if not self.driver:
                        print("Error: Could not restart driver, aborting")
                        return
                    visited_since_restart.clear()

                if source_name not in scrapers:
                    continue

                self.clear_row_sourcing_sheet(index_row=index_row)
                processed_count += 1
                Cost_Fetcher_Base.log_processing_new_row(
                    index_row=index_row,
                    source_link=source_link,
                )

                Cost_Fetcher_Base._wait_for_cooldown(
                    accessed_last_on[source_name], *Cost_Fetcher_Base._COOLDOWN_SECONDS[source_name]
                )

                cost = None
                active = None
                wrote_booster_price = False
                if source_name == "Card Market" and product_is_booster:
                    price_quantity_pairs = self.scrape_prices_and_quantities_selenium_cardmarket(url=source_link)
                    wrote_booster_price = self._write_booster_offer_price(index_row, price_quantity_pairs)
                else:
                    cost, active = scrapers[source_name](url=source_link)
                accessed_last_on[source_name] = time.time()
                if source_name in Cost_Fetcher_Base._SOURCES_NEEDING_FRESH_DRIVER:
                    visited_since_restart.add(source_name)

                if wrote_booster_price:
                    updated_count += 1
                elif cost is not None and active is not None:
                    print(f" Found cost: {cost}, active: {active}")
                    self.sourcing_sheet.cell(index_row, self.index_column_unit_cost_sourcing).value = cost
                    self.sourcing_sheet.cell(index_row, self.index_column_active_sourcing).value = "TRUE" if active else "FALSE"
                    updated_count += 1
                else:
                    print(f" Error: Could not find cost on page")

            # Save workbook
            print(f"\n{'='*60}")
            print(f"Saving workbook...")
            self.workbook.save(Cost_Fetcher_Base.WORKBOOK_NAME)

            print(f"\nComplete!")
            print(f"Processed: {processed_count} entries")
            print(f"Updated: {updated_count} costs")
        except Exception as e:
            print(f"Error: {e}")

    def clear_row_sourcing_sheet(self, index_row):
        """Blank the unit cost and mark the row inactive before re-scraping."""
        self.sourcing_sheet.cell(index_row, self.index_column_unit_cost_sourcing).value = None
        self.sourcing_sheet.cell(index_row, self.index_column_active_sourcing).value = "FALSE"

    @staticmethod
    def log_processing_new_row(index_row, source_link):
        """Print a banner announcing the row about to be scraped."""
        print(f"\n{'='*60}")
        print(f"Processing row {index_row}: {source_link}")
        print(f"{'='*60}")

    def __init__(self):
        """Start the browser, then load the workbook and the exchange rate.

        When the browser cannot be started the remaining setup is skipped.
        """
        print("Setting up browser automation (browser will not be visible)...")
        self.setup_driver()
        if not self.driver:
            return
        self.load_tcg_sole_trader_workbook()
        self.get_eur_to_gbp_rate()

    def stop_driver(self):
        """Quit the browser if one was started."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()


def main():
    """Run a full scrape and always close the browser afterwards."""
    cost_fetcher = Cost_Fetcher_Base()
    try:
        cost_fetcher.scrape_all_costs()
    finally:
        cost_fetcher.stop_driver()


if __name__ == "__main__":
    main()
ITEM_SHIPPING_COST_IN = 8


def get_eur_to_gbp_rate():
    """Return the current EUR->GBP rate, or 0.85 when it cannot be fetched."""
    fallback_rate = 0.85
    try:
        payload = requests.get('https://api.exchangerate-api.com/v4/latest/EUR', timeout=10).json()
        return payload['rates']['GBP']
    except Exception as error:
        print(f"Error fetching exchange rate: {error}")
        print("Using fallback rate: 0.85")
        return fallback_rate


def parse_cardmarket_price(price_text):
    """Turn a Card Market price such as '141,30 €' into a float (EUR).

    Everything except digits and commas is dropped and the comma is read as
    the decimal separator. Empty or unparseable text gives ``None``.
    """
    if not price_text:
        return None
    normalised = re.sub(r'[^\d,]', '', price_text).replace(',', '.')
    try:
        value = float(normalised)
    except ValueError:
        value = None
    return value


def setup_driver():
    """Create a visible (non-headless) Chrome driver, or return ``None`` on failure."""
    arguments = (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        '--window-size=1920,1080',
    )
    chrome_options = Options()
    # '--headless' is deliberately left out so the browser window stays visible.
    for argument in arguments:
        chrome_options.add_argument(argument)

    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as error:
        print(f"Error setting up Chrome driver: {error}")
        print("Make sure Chrome and chromedriver are installed")
        return None
def scrape_cardmarket_price_selenium(driver, url):
    """Open a Card Market page and return the first offer's price text.

    Only the first three matching elements are inspected; a text qualifies
    when it contains '€' and at least one digit. Returns ``None`` when no
    price is found or when loading the page fails.
    """
    price_selector = '#table > div:nth-child(1) > div.table-body > .row.article-row.g-0:nth-child(1) > div.col-offer > div.price-container > div > div:nth-child(1) > span:nth-child(1)'
    try:
        print(f" Loading page...")
        driver.get(url)

        # Fixed random pause instead of an explicit wait for the table.
        time.sleep(random.uniform(10, 20))

        print(f" Page title: {driver.title}")

        price_text = None
        try:
            candidates = driver.find_elements(By.CSS_SELECTOR, price_selector)
            print(f" Selector: Found {len(candidates)} elements")
            for position, candidate in enumerate(candidates):
                if position == 3:
                    break
                candidate_text = candidate.text
                print(f" Text: '{candidate_text}'")
                looks_like_price = '€' in candidate_text and re.search(r'\d', candidate_text)
                if looks_like_price:
                    print(f" ✓ Found price with selector: {candidate_text}")
                    price_text = candidate_text
                    break
        except Exception as error:
            print(f" Selector failed: {error}")

        if price_text is None:
            print(f" ✗ No price found")
        return price_text

    except Exception as error:
        print(f" Error: {error}")
        return None
def _find_header_row(sheet, table_name, header_text, header_column):
    """Return the first row whose column 1 equals ``table_name`` or whose
    ``header_column`` cell contains ``header_text``; ``None`` if absent."""
    for row in range(1, sheet.max_row + 1):
        if (
            sheet.cell(row, 1).value == table_name
            or header_text in str(sheet.cell(row, header_column).value)
        ):
            return row
    return None


def _map_header_columns(sheet, header_row, labels):
    """Map each label to the last column whose header contains it.

    ``labels`` is checked in order and the first label contained in a header
    claims that column, mirroring an if/elif chain.
    """
    columns = dict.fromkeys(labels)
    for col in range(1, sheet.max_column + 1):
        header = str(sheet.cell(header_row, col).value).strip()
        for label in labels:
            if label in header:
                columns[label] = col
                break
    return columns


def main():
    """Refresh Card Market prices for booster boxes and precons.

    For every Sourcing row whose source is "Card Market" and whose product is
    flagged as a booster box or precon, the first offer price is scraped,
    converted to GBP, increased by ``ITEM_SHIPPING_COST_IN`` and written to
    the unit-cost column; the row is marked active only when a price was
    written. The workbook is saved at the end and the browser always closed.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'
    product_sheet_name = 'Product'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    if sourcing_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sourcing_sheet_name}' not found")
        return
    if product_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{product_sheet_name}' not found")
        return

    sourcing_sheet = wb[sourcing_sheet_name]
    product_sheet = wb[product_sheet_name]

    # Each sheet keeps its OWN header row. Sharing one `start_row` variable
    # let the Product search overwrite the Sourcing result, so the Sourcing
    # headers and data were read from the wrong rows.
    sourcing_header_row = _find_header_row(sourcing_sheet, 'tbl_Sourcing', 'Source Name', 3)
    product_header_row = _find_header_row(product_sheet, 'tbl_Product', 'Product Id', 1)

    if sourcing_header_row is None:
        print("Error: Could not find table 'tbl_Sourcing'")
        return
    if product_header_row is None:
        print("Error: Could not find table 'tbl_Product'")
        return

    sourcing_start_row = sourcing_header_row + 1
    product_start_row = product_header_row + 1

    # Find column indices
    sourcing_columns = _map_header_columns(
        sourcing_sheet,
        sourcing_header_row,
        ('Source Name', 'Source Link', 'Source Unit Cost', 'Active', 'Product Id'),
    )
    source_name_col = sourcing_columns['Source Name']
    source_link_col = sourcing_columns['Source Link']
    source_unit_price_col = sourcing_columns['Source Unit Cost']
    source_is_available_col = sourcing_columns['Active']
    source_product_id_col = sourcing_columns['Product Id']

    product_columns = _map_header_columns(
        product_sheet,
        product_header_row,
        ('Is Booster Box', 'Is Precon', 'Product Id'),
    )
    product_is_booster_box_col = product_columns['Is Booster Box']
    product_is_precon_col = product_columns['Is Precon']
    product_id_col = product_columns['Product Id']

    print(f"Starting from row {sourcing_start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")
    print(f"Product Columns - Id: {product_id_col}, Is Booster: {product_is_booster_box_col}, Is Precon: {product_is_precon_col}")

    if not all([source_name_col, source_link_col, source_unit_price_col, source_is_available_col, source_product_id_col, product_id_col, product_is_booster_box_col, product_is_precon_col]):
        print("Error: Could not find required columns")
        return

    # Read the Product sheet once: id -> (is_booster_box, is_precon).
    # The first row for an id wins, matching a top-down search that stops at
    # the first hit.
    product_flags = {}
    for product_row in range(product_start_row, product_sheet.max_row + 1):
        product_id = product_sheet.cell(product_row, product_id_col).value
        if product_id in product_flags:
            continue
        is_booster_box = str(product_sheet.cell(product_row, product_is_booster_box_col).value).upper() == "TRUE"
        is_precon = str(product_sheet.cell(product_row, product_is_precon_col).value).upper() == "TRUE"
        product_flags[product_id] = (is_booster_box, is_precon)

    # Get EUR to GBP rate
    eur_to_gbp = get_eur_to_gbp_rate()
    print(f"Using EUR to GBP rate: {eur_to_gbp}")

    # Setup Selenium driver
    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    try:
        processed_count = 0
        updated_count = 0

        for row in range(sourcing_start_row, sourcing_sheet.max_row + 1):
            source_name = sourcing_sheet.cell(row, source_name_col).value
            source_link = sourcing_sheet.cell(row, source_link_col).value
            source_product_id = sourcing_sheet.cell(row, source_product_id_col).value

            # A row with neither a name nor a link marks the end of the table.
            if not source_name and not source_link:
                break

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            product_is_booster_box, product_is_precon = product_flags.get(source_product_id, (False, False))
            print(f"product is booster box: {product_is_booster_box}")

            if not (
                (product_is_booster_box or product_is_precon)
                and source_name == "Card Market"
                and source_link
                and str(source_link).strip()
            ):
                continue

            # Clear the row first so a failed scrape leaves it inactive.
            sourcing_sheet.cell(row, source_unit_price_col).value = None
            sourcing_sheet.cell(row, source_is_available_col).value = "FALSE"

            processed_count += 1
            print(f"\n{'='*60}")
            print(f"Processing row {row}: {source_link}")
            print(f"{'='*60}")

            # Scrape price
            price_text = scrape_cardmarket_price_selenium(driver, source_link)
            if not price_text:
                print(f" Error: Could not find price on page")
                continue
            print(f" Found price: {price_text}")

            # Parse and convert
            eur_price = parse_cardmarket_price(price_text)
            if not eur_price:
                print(f" Error: Could not parse price")
                continue
            gbp_price = eur_price * eur_to_gbp
            print(f" Converted: €{eur_price:.2f} → £{gbp_price:.2f}")

            # Update cell
            sourcing_sheet.cell(row, source_unit_price_col).value = gbp_price + ITEM_SHIPPING_COST_IN
            sourcing_sheet.cell(row, source_is_available_col).value = "TRUE"
            updated_count += 1

        # Save workbook
        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Card Market entries")
        print(f"Updated: {updated_count} prices")

    finally:
        driver.quit()


if __name__ == "__main__":
    main()
def setup_driver(headless=True):
    """Create a Chrome driver, headless by default.

    Args:
        headless: when true, Chrome is started with ``--headless``.

    Returns:
        The driver, or ``None`` when Chrome could not be started.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
    chrome_options.add_argument('--window-size=1920,1080')

    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as e:
        print(f"Error setting up Chrome driver: {e}")
        print("Make sure Chrome and chromedriver are installed")
        return None


def parse_price_value(text):
    """Extract numeric value from price string like '$5.50' or '€5,50'.

    Separator handling:
      * both ',' and '.' present - the one that appears last is the decimal
        separator, the other is a grouping separator ('$1,234.56',
        '€1.234,56');
      * a single separator occurring once - read as the decimal separator
        ('5,50' -> 5.5);
      * a single separator occurring several times - grouping only
        ('1,234,567' -> 1234567.0).

    Returns:
        The value as a float, or ``None`` when ``text`` is empty or holds no
        parseable number.
    """
    if not text:
        return None
    # Remove currency symbols and anything else that is not part of a number
    cleaned = re.sub(r'[^\d,.\-]', '', text)

    last_comma = cleaned.rfind(',')
    last_dot = cleaned.rfind('.')
    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            cleaned = cleaned.replace('.', '').replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    else:
        cleaned = cleaned.replace(',', '.')
        if cleaned.count('.') > 1:
            # Repeated separators can only be digit grouping.
            cleaned = cleaned.replace('.', '')

    try:
        return float(cleaned)
    except ValueError:
        return None
def _read_expected_and_market_text(row):
    """Return the stripped texts of a table row's 3rd and 5th cells."""
    expected_value_text = row.find_element(By.CSS_SELECTOR, 'td:nth-child(3)').text.strip()
    market_value_text = row.find_element(By.CSS_SELECTOR, 'td:nth-child(5)').text.strip()
    print(f" Expected Value: '{expected_value_text}'")
    print(f" Market Value: '{market_value_text}'")
    return expected_value_text, market_value_text


def scrape_mtg_stocks_values(driver, url):
    """Scrape expected value and market value from MTG Stocks.

    The first table row whose name is a play/set/draft booster type and the
    first row whose name is a collector booster type are read.

    Returns:
        A dict with the keys ``play_expected_value``, ``play_market_value``,
        ``collector_expected_value``, ``collector_market_value`` (floats or
        ``None``) and ``found_play`` / ``found_collector`` (bools). The same
        shape is returned when the page cannot be loaded, with whatever was
        collected up to that point.
    """
    valid_play_booster_types = (
        'Play Booster Pack',
        'Set Booster Pack',
        'Booster Pack',
        'Play Booster',
        'Set Booster',
        'Booster',
    )
    valid_collector_booster_types = (
        'Collector Booster Pack',
        'Collector Booster',
    )
    row_selector = 'mtg-sets-expected-value > mtg-product-tree > .table-responsive > table > tbody:nth-child(2) > tr'
    name_selector = 'td:nth-child(1) > div.d-flex.align-items-center:nth-child(1) > a:nth-child(2)'

    # Built before the try block so the error path below can always return it;
    # previously a failure in driver.get() hit unassigned local variables.
    values = {
        'play_expected_value': None,
        'play_market_value': None,
        'collector_expected_value': None,
        'collector_market_value': None,
        'found_play': False,
        'found_collector': False,
    }

    try:
        print(f" Loading page...")
        driver.get(url)

        # Wait for table to load
        time.sleep(random.uniform(10, 20))

        rows = driver.find_elements(By.CSS_SELECTOR, row_selector)
        print(f" Found {len(rows)} rows in table")

        for row in rows:
            try:
                booster_type = row.find_element(By.CSS_SELECTOR, name_selector).text.strip()
                print(f" Checking row: '{booster_type}'")

                if booster_type in valid_play_booster_types and not values['found_play']:
                    print(f" ✓ Match found: '{booster_type}'")
                    values['found_play'] = True
                    expected_text, market_text = _read_expected_and_market_text(row)
                    values['play_expected_value'] = parse_price_value(expected_text)
                    values['play_market_value'] = parse_price_value(market_text)

                if booster_type in valid_collector_booster_types and not values['found_collector']:
                    print(f" ✓ Match found: '{booster_type}'")
                    values['found_collector'] = True
                    expected_text, market_text = _read_expected_and_market_text(row)
                    values['collector_expected_value'] = parse_price_value(expected_text)
                    values['collector_market_value'] = parse_price_value(market_text)

                if values['found_play'] and values['found_collector']:
                    return values

            except Exception:
                # Row doesn't match structure, continue to next
                continue

        if not values['found_play'] and not values['found_collector']:
            print(f" ✗ No matching booster type found")
        return values

    except Exception as e:
        print(f" Error: {e}")
        return values
col in range(1, sheet.max_column + 1): + header = str(sheet.cell(header_row, col).value).strip() + if 'EV MTG Stocks Link' in header: + ev_link_col = col + elif 'Play Booster Expected Market Value' in header: + play_expected_value_col = col + elif 'Play Boost Sealed Market Value' in header: + play_market_value_col = col + elif 'Collector Booster Expected Market Value' in header: + collector_expected_value_col = col + elif 'Collector Boost Sealed Market Value' in header: + collector_market_value_col = col + + print(f"Columns - EV Link: {ev_link_col}, Play Expected Value: {play_expected_value_col}, Play Market Value: {play_market_value_col}, Collector Expected Value: {collector_expected_value_col}, Collector Market Value: {collector_market_value_col}") + + if not all([ev_link_col, play_expected_value_col, play_market_value_col, collector_expected_value_col, collector_market_value_col]): + print("Error: Could not find all required columns") + print(f" EV MTG Stocks Link: {'Found' if ev_link_col else 'NOT FOUND'}") + print(f" Play Booster Expected Market Value: {'Found' if play_expected_value_col else 'NOT FOUND'}") + print(f" Play Boost Sealed Market Value: {'Found' if play_market_value_col else 'NOT FOUND'}") + print(f" Collector Booster Expected Market Value: {'Found' if collector_expected_value_col else 'NOT FOUND'}") + print(f" Collector Boost Sealed Market Value: {'Found' if collector_market_value_col else 'NOT FOUND'}") + return + + # Setup Selenium driver + print("Setting up browser automation...") + driver = setup_driver(headless=False) # Set to False to see browser + if not driver: + return + + try: + # Process rows + processed_count = 0 + updated_count = 0 + play_cleared_count = 0 + collector_cleared_count = 0 + + for row in range(start_row, sheet.max_row + 1): + ev_link = sheet.cell(row, ev_link_col).value + + # Check if row is empty + if not ev_link: + # Check if we've passed the end of the table + empty_count = 0 + for check_col in range(1, min(10, 
sheet.max_column + 1)): + if not sheet.cell(row, check_col).value: + empty_count += 1 + if empty_count >= 5: # If most columns are empty, assume end of table + break + continue + + processed_count += 1 + print(f"\n{'='*80}") + print(f"Processing row {row}: {ev_link}") + print(f"{'='*80}") + + # Scrape values + result = scrape_mtg_stocks_values(driver, ev_link) + + if result['found_play']: + # Update cells with found values + sheet.cell(row, play_expected_value_col).value = result['play_expected_value'] + sheet.cell(row, play_market_value_col).value = result['play_market_value'] + updated_count += 1 + print(f" ✓ Updated - Expected: {result['play_expected_value']}, Market: {result['play_market_value']}") + else: + # Clear cells - no matching booster type found + sheet.cell(row, play_expected_value_col).value = '' + sheet.cell(row, play_market_value_col).value = '' + play_cleared_count += 1 + print(f" ✗ Cleared values - no matching booster type found") + + if result['found_collector']: + # Update cells with found values + sheet.cell(row, collector_expected_value_col).value = result['collector_expected_value'] + sheet.cell(row, collector_market_value_col).value = result['collector_market_value'] + updated_count += 1 + print(f" ✓ Updated - Expected: {result['collector_expected_value']}, Market: {result['collector_market_value']}") + else: + # Clear cells - no matching booster type found + sheet.cell(row, collector_expected_value_col).value = '' + sheet.cell(row, collector_market_value_col).value = '' + collector_cleared_count += 1 + print(f" ✗ Cleared values - no matching booster type found") + + # Small delay between requests + time.sleep(random.uniform(10, 20)) + + # Save workbook + print(f"\n{'='*80}") + print(f"Saving workbook...") + wb.save(workbook_name) + + print(f"\nComplete!") + print(f"Processed: {processed_count} entries") + print(f"Updated: {updated_count} entries") + print(f"Play fields cleared: {play_cleared_count} entries (no matching data)") + 
def get_eur_to_gbp_rate(fallback_rate=0.85):
    """Fetch the current EUR to GBP conversion rate.

    Args:
        fallback_rate: Rate returned when the lookup fails or yields an
            unusable value. Defaults to 0.85.

    Returns:
        The GBP value of one EUR as a positive float; ``fallback_rate`` if the
        request fails, the HTTP status is an error, or the payload does not
        contain a positive finite ``rates['GBP']`` number.
    """
    try:
        response = requests.get('https://api.exchangerate-api.com/v4/latest/EUR', timeout=10)
        # Error statuses raise here instead of surfacing later as a missing key.
        response.raise_for_status()
        rate = float(response.json()['rates']['GBP'])
        # Rejects zero, negatives, infinity and NaN (NaN fails both comparisons).
        if not 0 < rate < float('inf'):
            raise ValueError(f"unusable GBP rate: {rate!r}")
        return rate
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        print(f"Error fetching exchange rate: {e}")
        print(f"Using fallback rate: {fallback_rate}")
        return fallback_rate
chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-blink-features=AutomationControlled') + chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36') + chrome_options.add_argument('--window-size=1920,1080') + + try: + driver = webdriver.Chrome(options=chrome_options) + return driver + except Exception as e: + print(f"Error setting up Chrome driver: {e}") + print("Make sure Chrome and chromedriver are installed") + return None + +def scrape_cardmarket_prices_and_quantities_selenium(driver, url): + try: + print(f" Loading page...") + driver.get(url) + + # Wait for page to load + human reading time + time.sleep(random.uniform(20, 30)) + + print(f" Page title: {driver.title}") + + offer_container_selector = '#table > div:nth-child(1) > div.table-body > .row.article-row.g-0:nth-child(1) > div.col-offer' + price_selector = 'div.price-container > div > div:nth-child(1) > span:nth-child(1)' + quantity_selector = 'div.amount-container > span:nth-child(1)' + price_quantity_pairs = [] + try: + offer_containers = driver.find_elements(By.CSS_SELECTOR, offer_container_selector) + print(f" Offer container selector: Found {len(offer_containers)} elements") + for offer_container in offer_containers: + price_element = offer_container.find_element(By.CSS_SELECTOR, price_selector) + price_text = price_element.text + if '€' in price_text and re.search(r'\d', price_text): + print(f" ✓ Found price: {price_text}") + else: + price_text = None + + quantity_element = offer_container.find_element(By.CSS_SELECTOR, quantity_selector) + quantity_text = quantity_element.text + + if price_text is None or quantity_text is None: + continue + price_quantity_pairs.append({ + 'price': parse_cardmarket_price(price_text = price_text) + , 'quantity': parse_cardmarket_quantity(quantity_text = quantity_text) + }) + except Exception as e: + print(f" Price selector 
def _select_cardmarket_offer_price(price_quantity_pairs, bulk_quantity=8):
    """Pick the EUR price to record from scraped Cardmarket offers.

    Preference order: the first offer with at least ``bulk_quantity`` items and
    a parsed price; otherwise the first priced offer when every offer has a
    quantity of 2 or less, or the first priced offer holding the largest
    quantity. A quantity that failed to parse (``None``) counts as 0.

    Returns:
        The chosen EUR price, or ``None`` when no suitable offer has a price.
    """
    offers = [(pair.get('price'), pair.get('quantity') or 0) for pair in price_quantity_pairs]

    max_quantity = 0
    for eur_price, quantity in offers:
        print(f" Found price: €{eur_price}")
        print(f" Found quantity: {quantity}")
        max_quantity = max(max_quantity, quantity)
        if quantity >= bulk_quantity and eur_price:
            return eur_price

    print(f"Offer with quantity >= {bulk_quantity} not found")
    for eur_price, quantity in offers:
        if (max_quantity <= 2 or quantity == max_quantity) and eur_price:
            return eur_price
    return None


def main():
    """Refresh Cardmarket sale prices for booster products in the workbook.

    Reads the 'Sourcing' and 'Product' sheets of 'TCG Sole Trader Copy.xlsx'.
    For every sourcing row whose source is "Card Market" and whose product is
    flagged as a booster, the sale-price cell is cleared, the source link is
    scraped, and the chosen offer price (converted from EUR to GBP) is written
    back. The workbook is saved once all rows have been processed.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'
    product_sheet_name = 'Product'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    for required_sheet_name in (sourcing_sheet_name, product_sheet_name):
        if required_sheet_name not in wb.sheetnames:
            print(f"Error: Sheet '{required_sheet_name}' not found")
            return

    sourcing_sheet = wb[sourcing_sheet_name]
    product_sheet = wb[product_sheet_name]

    # Each sheet keeps its own start row: the two tables need not begin on
    # the same row, and the sourcing header must be read from its own sheet.
    sourcing_start_row = None
    for row in range(1, sourcing_sheet.max_row + 1):
        if (sourcing_sheet.cell(row, 1).value == 'tbl_Sourcing'
                or 'Source Name' in str(sourcing_sheet.cell(row, 3).value)):
            sourcing_start_row = row + 1
            break
    if sourcing_start_row is None:
        # Fallback: the header may sit in column 2 within the first rows.
        for row in range(1, min(20, sourcing_sheet.max_row + 1)):
            if 'Source Name' in str(sourcing_sheet.cell(row, 2).value):
                sourcing_start_row = row + 1
                break

    product_start_row = None
    for row in range(1, product_sheet.max_row + 1):
        if (product_sheet.cell(row, 1).value == 'tbl_Product'
                or 'Product Id' in str(product_sheet.cell(row, 1).value)):
            product_start_row = row + 1
            break

    if sourcing_start_row is None:
        print("Error: Could not find table 'tbl_Sourcing' or 'Source Name' column")
        return
    if product_start_row is None:
        print("Error: Could not find table 'tbl_Product' or 'Product Id' column")
        return

    sourcing_header_row = sourcing_start_row - 1
    product_header_row = product_start_row - 1

    source_name_col = None
    source_link_col = None
    source_unit_price_col = None
    source_is_available_col = None
    source_product_id_col = None
    for col in range(1, sourcing_sheet.max_column + 1):
        header = str(sourcing_sheet.cell(sourcing_header_row, col).value).strip()
        if 'Source Name' in header:
            source_name_col = col
        elif 'Source Link' in header:
            source_link_col = col
        elif 'Sale Price' in header:
            source_unit_price_col = col
        elif 'Active' in header:
            source_is_available_col = col
        elif 'Product Id' in header:
            source_product_id_col = col

    product_id_col = None
    product_is_booster_col = None
    for col in range(1, product_sheet.max_column + 1):
        header = str(product_sheet.cell(product_header_row, col).value).strip()
        if 'Is Booster' in header:
            product_is_booster_col = col
        elif 'Product Id' in header:
            product_id_col = col

    print(f"Starting from row {sourcing_start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")
    print(f"Product Columns - Id: {product_id_col}, Is Booster: {product_is_booster_col}")

    if not all([source_name_col, source_link_col, source_unit_price_col, source_is_available_col, source_product_id_col, product_id_col, product_is_booster_col]):
        print("Error: Could not find required columns")
        return

    # Build the product -> is-booster lookup once instead of rescanning the
    # Product sheet for every sourcing row. The first occurrence of an id wins.
    booster_by_product_id = {}
    for product_row in range(product_start_row, product_sheet.max_row + 1):
        product_id = product_sheet.cell(product_row, product_id_col).value
        if product_id is None or product_id in booster_by_product_id:
            continue
        flag_text = str(product_sheet.cell(product_row, product_is_booster_col).value).upper()
        booster_by_product_id[product_id] = (flag_text == "TRUE")

    eur_to_gbp = get_eur_to_gbp_rate()
    print(f"Using EUR to GBP rate: {eur_to_gbp}")

    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    try:
        processed_count = 0
        updated_count = 0

        for row in range(sourcing_start_row, sourcing_sheet.max_row + 1):
            source_name = sourcing_sheet.cell(row, source_name_col).value
            source_link = sourcing_sheet.cell(row, source_link_col).value
            source_product_id = sourcing_sheet.cell(row, source_product_id_col).value
            # Skip incomplete rows.
            if not source_name or not source_link or not source_product_id:
                continue

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            product_is_booster = booster_by_product_id.get(source_product_id, False)
            print(f"product is booster: {product_is_booster}")

            if not (product_is_booster and source_name == "Card Market" and str(source_link).strip()):
                continue

            price_cell = sourcing_sheet.cell(row, source_unit_price_col)
            # Cleared first so a failed scrape does not leave a stale price.
            price_cell.value = None

            processed_count += 1
            print(f"\n{'='*60}")
            print(f"Processing row {row}: {source_link}")
            print(f"{'='*60}")

            price_quantity_pairs = scrape_cardmarket_prices_and_quantities_selenium(driver, source_link)
            if not price_quantity_pairs:
                print(f" Error: Could not find price on page")
                continue

            eur_price = _select_cardmarket_offer_price(price_quantity_pairs)
            if eur_price is None:
                print(f" Error: Could not parse price")
                continue

            gbp_price = eur_price * eur_to_gbp
            print(f" Converted: €{eur_price:.2f} → £{gbp_price:.2f}")
            price_cell.value = gbp_price
            updated_count += 1
            print(f"output row: {row}, value: {price_cell.value}")

        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Card Market entries")
        print(f"Updated: {updated_count} prices")
        print(datetime.now())

    finally:
        driver.quit()
def parse_chaoscards_price(price_text):
    """Convert a Chaos Cards price string such as '£141.30' to a float in GBP.

    Thousands separators ('£1,234.56') and whole-pound prices ('£5') are
    handled; the first number found in the text is used.

    Args:
        price_text: Raw text of the price element; may be empty or ``None``.

    Returns:
        The price in pounds as a float, or ``None`` when no number is present.
    """
    if not price_text:
        return None
    # Digits with optional comma grouping and an optional decimal part.
    match = re.search(r'\d[\d,]*(?:\.\d+)?', price_text)
    if match is None:
        return None
    try:
        return float(match.group().replace(',', ''))
    except ValueError:
        return None
def main():
    """Refresh Chaos Cards prices and availability in the 'Sourcing' sheet.

    For every row of 'TCG Sole Trader Copy.xlsx' / 'Sourcing' whose source name
    is "Chaos Cards" and which has a link, the unit-cost and active cells are
    reset, the page is scraped with a freshly started browser, and the price
    (plus ``ITEM_SHIPPING_COST_IN``) and availability are written back.
    Processing stops at the first row with neither a source name nor a link.
    The workbook is saved after the loop, including when the browser could not
    be restarted part-way through.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    if sourcing_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sourcing_sheet_name}' not found")
        return
    sourcing_sheet = wb[sourcing_sheet_name]

    # The header row is either marked with 'tbl_Sourcing' in column 1 or
    # carries 'Source Name' in column 3; data starts on the following row.
    start_row = None
    for row in range(1, sourcing_sheet.max_row + 1):
        if (sourcing_sheet.cell(row, 1).value == 'tbl_Sourcing'
                or 'Source Name' in str(sourcing_sheet.cell(row, 3).value)):
            start_row = row + 1
            break

    if start_row is None:
        print("Error: Could not find table 'tbl_Sourcing'")
        return

    header_row = start_row - 1
    source_name_col = None
    source_link_col = None
    source_unit_price_col = None
    source_is_available_col = None
    source_product_id_col = None
    for col in range(1, sourcing_sheet.max_column + 1):
        header = str(sourcing_sheet.cell(header_row, col).value).strip()
        if 'Source Name' in header:
            source_name_col = col
        elif 'Source Link' in header:
            source_link_col = col
        elif 'Source Unit Cost' in header:
            source_unit_price_col = col
        elif 'Active' in header:
            source_is_available_col = col
        elif 'Product Id' in header:
            source_product_id_col = col

    print(f"Starting from row {start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")

    if not all([source_name_col, source_link_col, source_unit_price_col, source_is_available_col, source_product_id_col]):
        print("Error: Could not find required columns")
        return

    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    processed_count = 0
    updated_count = 0
    driver_used = False
    try:
        for row in range(start_row, sourcing_sheet.max_row + 1):
            source_name = sourcing_sheet.cell(row, source_name_col).value
            source_link = sourcing_sheet.cell(row, source_link_col).value
            source_product_id = sourcing_sheet.cell(row, source_product_id_col).value

            if not source_name and not source_link:
                break

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            if source_name != "Chaos Cards" or not source_link or not str(source_link).strip():
                continue

            # A fresh browser is used for every scraped page, but only for
            # rows that are actually scraped.
            if driver_used:
                driver.quit()
                driver = setup_driver()
                if not driver:
                    print("Error: Could not restart browser; saving progress so far")
                    break
            driver_used = True

            # Reset first so a failed scrape leaves the row marked unavailable.
            sourcing_sheet.cell(row, source_unit_price_col).value = None
            sourcing_sheet.cell(row, source_is_available_col).value = "FALSE"

            processed_count += 1
            print(f"\n{'='*60}")
            print(f"Processing row {row}: {source_link}")
            print(f"{'='*60}")

            gbp_price, active = scrape_chaoscards_price_selenium(driver, source_link)

            if gbp_price is None or active is None:
                print(f" Error: Could not find price on page")
                continue

            print(f" Found price: {gbp_price}, active: {active}")
            if not gbp_price:
                print(f" Error: Could not parse price")
                continue

            sourcing_sheet.cell(row, source_unit_price_col).value = gbp_price + ITEM_SHIPPING_COST_IN
            sourcing_sheet.cell(row, source_is_available_col).value = "TRUE" if active else "FALSE"
            updated_count += 1

        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Chaos Cards entries")
        print(f"Updated: {updated_count} prices")

    finally:
        # driver is None when a restart failed; nothing to shut down then.
        if driver:
            driver.quit()
def parse_gameslore_price(price_text):
    """Convert a Games Lore price string such as '£141.30' to a float in GBP.

    Thousands separators ('£1,234.56') and whole-pound prices ('£5') are
    handled; the first number found in the text is used.

    Args:
        price_text: Raw text of the price element; may be empty or ``None``.

    Returns:
        The price in pounds as a float, or ``None`` when no number is present.
    """
    if not price_text:
        return None
    # Digits with optional comma grouping and an optional decimal part.
    match = re.search(r'\d[\d,]*(?:\.\d+)?', price_text)
    if match is None:
        return None
    try:
        return float(match.group().replace(',', ''))
    except ValueError:
        return None
def main():
    """Refresh Games Lore prices and availability in the 'Sourcing' sheet.

    Locates the sourcing table in 'TCG Sole Trader Copy.xlsx', then for each
    row whose source name is "Games Lore" and which has a link: resets the
    unit-cost and active cells, scrapes the page, and writes back the price
    plus ``ITEM_SHIPPING_COST_IN`` and the availability flag. Processing stops
    at the first row with neither a source name nor a link. The workbook is
    saved once the loop completes.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    if sourcing_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sourcing_sheet_name}' not found")
        return
    sheet = wb[sourcing_sheet_name]

    # Header row: 'tbl_Sourcing' in column 1, or 'Source Name' in column 3.
    start_row = None
    for candidate in range(1, sheet.max_row + 1):
        if sheet.cell(candidate, 1).value == 'tbl_Sourcing' or 'Source Name' in str(sheet.cell(candidate, 3).value):
            start_row = candidate + 1
            break

    if start_row is None:
        print("Error: Could not find table 'tbl_Sourcing'")
        return

    # Map each header keyword to its column. Keywords are tested in this
    # order per cell, and a later column overwrites an earlier match.
    header_row = start_row - 1
    header_keywords = ('Source Name', 'Source Link', 'Source Unit Cost', 'Active', 'Product Id')
    columns = dict.fromkeys(header_keywords)
    for col in range(1, sheet.max_column + 1):
        title = str(sheet.cell(header_row, col).value).strip()
        keyword = next((kw for kw in header_keywords if kw in title), None)
        if keyword is not None:
            columns[keyword] = col

    source_name_col = columns['Source Name']
    source_link_col = columns['Source Link']
    source_unit_price_col = columns['Source Unit Cost']
    source_is_available_col = columns['Active']
    source_product_id_col = columns['Product Id']

    print(f"Starting from row {start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")

    if not all([source_name_col, source_link_col, source_unit_price_col, source_is_available_col, source_product_id_col]):
        print("Error: Could not find required columns")
        return

    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    try:
        processed_count = 0
        updated_count = 0

        for row in range(start_row, sheet.max_row + 1):
            source_name = sheet.cell(row, source_name_col).value
            source_link = sheet.cell(row, source_link_col).value
            source_product_id = sheet.cell(row, source_product_id_col).value

            if not (source_name or source_link):
                break

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            is_target = source_name == "Games Lore" and source_link and str(source_link).strip()
            if not is_target:
                continue

            price_cell = sheet.cell(row, source_unit_price_col)
            active_cell = sheet.cell(row, source_is_available_col)
            price_cell.value = None
            active_cell.value = "FALSE"

            processed_count += 1
            print(f"\n{'='*60}")
            print(f"Processing row {row}: {source_link}")
            print(f"{'='*60}")

            gbp_price, active = scrape_gameslore_price_selenium(driver, source_link)

            if gbp_price is None or active is None:
                print(f" Error: Could not find price on page")
                continue

            print(f" Found price: {gbp_price}, active: {active}")
            if not gbp_price:
                print(f" Error: Could not parse price")
                continue

            price_cell.value = gbp_price + ITEM_SHIPPING_COST_IN
            active_cell.value = "TRUE" if active else "FALSE"
            updated_count += 1

        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Games Lore entries")
        print(f"Updated: {updated_count} prices")

    finally:
        driver.quit()
def parse_magicmadhouse_price(price_text):
    """Parse a displayed price string into a float amount.

    Accepts both UK-style ("£1,141.30") and continental-style ("141,30 €")
    formatting, as well as whole-number prices ("£141"). The currency symbol
    is ignored; the caller decides which currency the amount is in.

    Args:
        price_text: Text scraped from the price element, or None.

    Returns:
        The numeric amount as a float, or None when the text is empty or
        contains no recognisable number.
    """
    if not price_text:
        return None

    # Grab the numeric run (digits plus '.'/',' separators), ending on a digit.
    match = re.search(r'[\d.,]*\d', str(price_text))
    if match is None:
        return None
    number = match.group(0)

    # A trailing separator followed by one or two digits is the decimal part;
    # any other separator is a thousands grouping and is simply dropped.
    # (The previous implementation always divided by 100, which turned
    # "£141" into 1.41 and "£1,250" into 12.50.)
    decimal = re.search(r'[.,](\d{1,2})$', number)
    if decimal:
        whole = number[:decimal.start()]
        fraction = decimal.group(1)
    else:
        whole = number
        fraction = ''
    whole = re.sub(r'[.,]', '', whole)

    try:
        return float(f"{whole}.{fraction}") if fraction else float(whole)
    except ValueError:
        return None
def _find_sourcing_start_row(sheet):
    """Return the first data row of the sourcing table, or None if not found.

    The header row is the first row whose column 1 holds 'tbl_Sourcing' or
    whose column 3 contains 'Source Name'; data starts on the row after it.
    """
    for row in range(1, sheet.max_row + 1):
        if (sheet.cell(row, 1).value == 'tbl_Sourcing'
                or 'Source Name' in str(sheet.cell(row, 3).value)):
            return row + 1
    return None


def _find_sourcing_columns(sheet, header_row):
    """Map each required sourcing field to its 1-based column (None if absent).

    Each header cell is matched against the needles in order and assigned to
    the first one it contains; a later column overrides an earlier one.
    """
    needles = (
        ('Source Name', 'name'),
        ('Source Link', 'link'),
        ('Source Unit Cost', 'unit_price'),
        ('Active', 'is_available'),
        ('Product Id', 'product_id'),
    )
    columns = {key: None for _, key in needles}
    for col in range(1, sheet.max_column + 1):
        header = str(sheet.cell(header_row, col).value).strip()
        for needle, key in needles:
            if needle in header:
                columns[key] = col
                break
    return columns


def main():
    """Refresh Magic Madhouse prices in the 'Sourcing' sheet of the workbook.

    Loads the workbook, locates the sourcing table and its columns, then for
    every row whose source is "Magic Madhouse" and has a link:
    clears the unit cost, marks the row inactive, scrapes the product page,
    and on success writes price + ITEM_SHIPPING_COST_IN and the availability
    flag ("TRUE"/"FALSE"). The workbook is saved in place once all rows have
    been visited. Progress and errors are reported on stdout; the function
    returns None in every case.
    """
    workbook_name = 'TCG Sole Trader Copy.xlsx'
    sourcing_sheet_name = 'Sourcing'

    print("Loading workbook...")
    wb = load_workbook(workbook_name)

    if sourcing_sheet_name not in wb.sheetnames:
        print(f"Error: Sheet '{sourcing_sheet_name}' not found")
        return
    sourcing_sheet = wb[sourcing_sheet_name]

    start_row = _find_sourcing_start_row(sourcing_sheet)
    if start_row is None:
        print("Error: Could not find table 'tbl_Sourcing'")
        return

    # The header row sits immediately above the first data row.
    columns = _find_sourcing_columns(sourcing_sheet, start_row - 1)
    source_name_col = columns['name']
    source_link_col = columns['link']
    source_unit_price_col = columns['unit_price']
    source_is_available_col = columns['is_available']
    source_product_id_col = columns['product_id']

    print(f"Starting from row {start_row}")
    print(f"Sourcing Columns - Source Name: {source_name_col}, Source Link: {source_link_col}, Source Unit Cost: {source_unit_price_col}, Active: {source_is_available_col}, Product Id: {source_product_id_col}")

    if not all(columns.values()):
        print("Error: Could not find required columns")
        return

    print("Setting up browser automation (browser will be visible)...")
    driver = setup_driver()
    if not driver:
        return

    try:
        processed_count = 0
        updated_count = 0

        for row in range(start_row, sourcing_sheet.max_row + 1):
            source_name = sourcing_sheet.cell(row, source_name_col).value
            source_link = sourcing_sheet.cell(row, source_link_col).value
            source_product_id = sourcing_sheet.cell(row, source_product_id_col).value

            # The first row with neither a name nor a link ends the table.
            if not source_name and not source_link:
                break

            print(f"found source: {source_name} - product: {source_product_id} - link: {source_link}")

            link = str(source_link).strip() if source_link else ''
            if source_name != "Magic Madhouse" or not link:
                continue

            # Reset the row first so a failed scrape leaves it marked inactive
            # rather than carrying a stale price.
            sourcing_sheet.cell(row, source_unit_price_col).value = None
            sourcing_sheet.cell(row, source_is_available_col).value = "FALSE"

            processed_count += 1
            print(f"\n{'='*60}")
            print(f"Processing row {row}: {link}")
            print(f"{'='*60}")

            # Pass the stripped link so stray whitespace in the cell cannot
            # break navigation.
            gbp_price, active = scrape_magicmadhouse_price_selenium(driver, link)

            if gbp_price is None or active is None:
                print(f" Error: Could not find price on page")
                continue

            print(f" Found price: {gbp_price}, active: {active}")
            if not gbp_price:
                print(f" Error: Could not parse price")
                continue

            sourcing_sheet.cell(row, source_unit_price_col).value = gbp_price + ITEM_SHIPPING_COST_IN
            sourcing_sheet.cell(row, source_is_available_col).value = "TRUE" if active else "FALSE"
            updated_count += 1

        print(f"\n{'='*60}")
        print(f"Saving workbook...")
        wb.save(workbook_name)

        print(f"\nComplete!")
        print(f"Processed: {processed_count} Magic Madhouse entries")
        print(f"Updated: {updated_count} prices")

    finally:
        # Always release the browser, even if scraping or saving fails.
        driver.quit()
def parse_card_data(cards):
    """Flatten raw card dicts into one row-dict per card for Excel export.

    Args:
        cards: Sequence of card dicts; each may carry optional 'card_sets'
            (list of dicts) and 'card_prices' (list of dicts) entries.

    Returns:
        A list of dicts, one per card, with a fixed key order: the basic card
        fields, then the set columns (each a ', '-joined string over all of
        the card's sets, '' when there are none), then the price columns
        taken from the first 'card_prices' entry ('' when missing).
    """
    print(f"Processing {len(cards)} cards...")

    set_columns = (
        ('Set Name', 'set_name'),
        ('Set Code', 'set_code'),
        ('Set Rarity', 'set_rarity'),
        ('Set Price', 'set_price'),
    )
    price_columns = (
        ('TCGPlayer Price', 'tcgplayer_price'),
        ('Cardmarket Price', 'cardmarket_price'),
        ('eBay Price', 'ebay_price'),
        ('Amazon Price', 'amazon_price'),
    )

    def _as_text(value):
        # str.join only accepts strings; a None or numeric field would
        # otherwise abort the whole export with a TypeError.
        return '' if value is None else str(value)

    parsed_cards = []
    for card in cards:
        card_info = {
            'ID': card.get('id'),
            'Name': card.get('name'),
            'Type': card.get('type'),
            'Human Readable Type': card.get('humanReadableCardType', ''),
            'Frame Type': card.get('frameType', ''),
            'Description': card.get('desc', ''),
            'Race': card.get('race', ''),
            'Archetype': card.get('archetype', ''),
            'ATK': card.get('atk', ''),
            'DEF': card.get('def', ''),
            'Level': card.get('level', ''),
            'Attribute': card.get('attribute', ''),
            'Scale': card.get('scale', ''),
            'Linkval': card.get('linkval', ''),
            'YGOPRODeck URL': card.get('ygoprodeck_url', ''),
        }

        # A missing or empty 'card_sets' joins to '' for every set column.
        card_sets = card.get('card_sets') or []
        for column, key in set_columns:
            card_info[column] = ', '.join(
                _as_text(card_set.get(key, '')) for card_set in card_sets
            )

        # Only the first price entry is used; fall back to an empty one.
        prices = (card.get('card_prices') or [{}])[0]
        for column, key in price_columns:
            card_info[column] = prices.get(key, '')

        parsed_cards.append(card_info)

    return parsed_cards
def main():
    """Run the importer end to end: fetch, flatten, export.

    Prints a banner, fetches the card list, and stops with a message when
    nothing came back. Otherwise the cards are flattened and written to a
    timestamped 'yugioh_cards_<YYYYMMDD_HHMMSS>.xlsx' file.
    """
    print("Yu-Gi-Oh Card Data Importer")
    print("=" * 50)

    raw_cards = fetch_yugioh_cards()
    if raw_cards:
        rows = parse_card_data(raw_cards)

        # Timestamp the output so repeated runs never overwrite each other.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        export_to_excel(rows, f"yugioh_cards_{stamp}.xlsx")

        print("\nDone! You can now open the file in LibreOffice Calc or Excel.")
    else:
        print("Failed to fetch card data. Exiting.")