From 752bb52e024cab17c64276a046cbd5c842499009 Mon Sep 17 00:00:00 2001 From: Teddy Middleton-Smith Date: Mon, 12 Jan 2026 00:13:26 +0000 Subject: [PATCH] Product Scraper - async still broken. --- product_scraping/product_scraper.py | 138 ++++++++++++++++------------ 1 file changed, 79 insertions(+), 59 deletions(-) diff --git a/product_scraping/product_scraper.py b/product_scraping/product_scraper.py index 475d897..20bb66e 100644 --- a/product_scraping/product_scraper.py +++ b/product_scraping/product_scraper.py @@ -511,9 +511,23 @@ class Cost_Fetcher: processed_count = 0 updated_count = 0 self.product_sources = self.workbook_container.get_sourcing_entries() - sourced_products = await self.scrape_with_browser_pool() + # sourced_products = await self.scrape_with_browser_pool() + # for sourced_product in sourced_products: + # for product_source in self.product_sources: + # sourced_product = await self.fetch_single(product_source = product_source) + # for index_product_source, product_source in self.product_sources.iterrows(): + # sourced_product = await self.fetch_single(product_source = product_source) + # Create tasks for parallel execution + tasks = [] + for index_product_source, product_source in self.product_sources.iterrows(): + task = self.fetch_single(product_source = product_source) + tasks.append(task) + + # Execute all tasks in parallel + sourced_products = await asyncio.gather(*tasks) + + # Process results for sourced_product in sourced_products: - # self.workbook_container.workbook.cell(index_row, self.workbook_container.IND) index_row = sourced_product[self.workbook_container.NAME_COLUMN_INDEX_ROW] unit_cost = sourced_product[self.workbook_container.NAME_COLUMN_UNIT_COST] unit_price = sourced_product[self.workbook_container.index_column_unit_price_sourcing] @@ -534,9 +548,10 @@ class Cost_Fetcher: print(f"Updated: {updated_count} costs") except Exception as e: print(f"Error: {e}") - + """ async def scrape_with_browser_pool(self): count_domains = len(self.domain_names) + async with async_playwright() as p: browsers = [await p.chromium.launch(headless = False) for _ in range(count_domains)] @@ -552,62 +567,67 @@ class Cost_Fetcher: finally: for browser in browsers: await browser.close() - async def fetch_single(self, browser, product_source): - print(f'Product source: {product_source}') - index_row = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_INDEX_ROW] - source_name = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_SOURCE_NAME] - source_link = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_LINK] - index_domain = None - try: - index_domain = self.get_index_domain_from_name(source_name) - except: - return self.make_result_data_json(index_row = index_row) - domain_details = self.domain_details[source_name] - self.workbook_container.clear_row_sourcing_sheet(index_row = index_row) - Cost_Fetcher.log_processing_new_row( - index_row = index_row - , source_link = source_link - ) - - cost = None - price = None - active = None - if source_name == self.NAME_DOMAIN_CARD_MARKET: - while (self.active_row_indices[index_domain] is None or time.time() - domain_details[self.ACCESSED_LAST_ON_FLAG] < random.uniform(30, 40)): - await asyncio.sleep(random.uniform(3, 5)) - self.active_row_indices[index_domain] = index_row - if product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_PRODUCT_IS_BOOSTER]: - price_quantity_pairs = await self.product_scrapers[index_domain].scrape_prices_and_quantities_playwright_cardmarket(browser = browser, url = source_link, eur_to_gbp_rate = self.eur_to_gbp_rate) - price = self.get_sale_price_from_price_quantity_pairs(price_quantity_pairs = price_quantity_pairs) - cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_cardmarket(browser = browser, url = source_link, eur_to_gbp_rate = self.eur_to_gbp_rate) - elif source_name == self.NAME_DOMAIN_CHAOS_CARDS: - while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(30, 40)): - await asyncio.sleep(random.uniform(3, 5)) - self.active_row_indices[index_domain] = index_row - cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_chaoscards(browser = browser, url = source_link) - elif source_name == self.NAME_DOMAIN_GAMES_LORE: - while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(10, 20)): - await asyncio.sleep(random.uniform(3, 5)) - self.active_row_indices[index_domain] = index_row - cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_gameslore(browser = browser, url = source_link) - elif source_name == self.NAME_DOMAIN_MAGIC_MADHOUSE: - while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(10, 20)): - await asyncio.sleep(random.uniform(3, 5)) - self.active_row_indices[index_domain] = index_row - cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_magicmadhouse(browser = browser, url = source_link) - - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] = time.time() - self.active_row_indices[index_domain] = None - - if ((cost is None and price is None) or active is None): - print(f" Error: Could not find cost on page") - - return self.make_result_data_json( - index_row = index_row - , cost = cost - , price = price - , active = active - ) + """ + async def fetch_single(self, product_source): # browser, + async with async_playwright() as p: + browser = await p.chromium.launch(headless = False) + print(f'Product source: {product_source}') + index_row = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_INDEX_ROW] + source_name = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_SOURCE_NAME] + source_link = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_LINK] + index_domain = None + try: + index_domain = self.get_index_domain_from_name(source_name) + except: + await browser.close() + return self.make_result_data_json(index_row = index_row) + domain_details = self.domain_details[source_name] + self.workbook_container.clear_row_sourcing_sheet(index_row = index_row) + Cost_Fetcher.log_processing_new_row( + index_row = index_row + , source_link = source_link + ) + + cost = None + price = None + active = None + if source_name == self.NAME_DOMAIN_CARD_MARKET: + while (self.active_row_indices[index_domain] is None or time.time() - domain_details[self.ACCESSED_LAST_ON_FLAG] < random.uniform(30, 40)): + await asyncio.sleep(random.uniform(3, 5)) + self.active_row_indices[index_domain] = index_row + if product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_PRODUCT_IS_BOOSTER]: + price_quantity_pairs = await self.product_scrapers[index_domain].scrape_prices_and_quantities_playwright_cardmarket(browser = browser, url = source_link, eur_to_gbp_rate = self.eur_to_gbp_rate) + price = self.get_sale_price_from_price_quantity_pairs(price_quantity_pairs = price_quantity_pairs) + cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_cardmarket(browser = browser, url = source_link, eur_to_gbp_rate = self.eur_to_gbp_rate) + elif source_name == self.NAME_DOMAIN_CHAOS_CARDS: + while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(30, 40)): + await asyncio.sleep(random.uniform(3, 5)) + self.active_row_indices[index_domain] = index_row + cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_chaoscards(browser = browser, url = source_link) + elif source_name == self.NAME_DOMAIN_GAMES_LORE: + while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(10, 20)): + await asyncio.sleep(random.uniform(3, 5)) + self.active_row_indices[index_domain] = index_row + cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_gameslore(browser = browser, url = source_link) + elif source_name == self.NAME_DOMAIN_MAGIC_MADHOUSE: + while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(10, 20)): + await asyncio.sleep(random.uniform(3, 5)) + self.active_row_indices[index_domain] = index_row + cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_magicmadhouse(browser = browser, url = source_link) + + self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] = time.time() + self.active_row_indices[index_domain] = None + + if ((cost is None and price is None) or active is None): + print(f" Error: Could not find cost on page") + + await browser.close() + return self.make_result_data_json( + index_row = index_row + , cost = cost + , price = price + , active = active + ) @classmethod def make_result(cls, url, success, data, error): return {