Product Scraper - async still broken.

This commit is contained in:
2026-01-12 00:13:26 +00:00
parent 72ee0bb104
commit 752bb52e02

View File

@@ -511,9 +511,23 @@ class Cost_Fetcher:
processed_count = 0 processed_count = 0
updated_count = 0 updated_count = 0
self.product_sources = self.workbook_container.get_sourcing_entries() self.product_sources = self.workbook_container.get_sourcing_entries()
sourced_products = await self.scrape_with_browser_pool() # sourced_products = await self.scrape_with_browser_pool()
# for sourced_product in sourced_products:
# for product_source in self.product_sources:
# sourced_product = await self.fetch_single(product_source = product_source)
# for index_product_source, product_source in self.product_sources.iterrows():
# sourced_product = await self.fetch_single(product_source = product_source)
# Create tasks for parallel execution
tasks = []
for index_product_source, product_source in self.product_sources.iterrows():
task = self.fetch_single(product_source = product_source)
tasks.append(task)
# Execute all tasks in parallel
sourced_products = await asyncio.gather(*tasks)
# Process results
for sourced_product in sourced_products: for sourced_product in sourced_products:
# self.workbook_container.workbook.cell(index_row, self.workbook_container.IND)
index_row = sourced_product[self.workbook_container.NAME_COLUMN_INDEX_ROW] index_row = sourced_product[self.workbook_container.NAME_COLUMN_INDEX_ROW]
unit_cost = sourced_product[self.workbook_container.NAME_COLUMN_UNIT_COST] unit_cost = sourced_product[self.workbook_container.NAME_COLUMN_UNIT_COST]
unit_price = sourced_product[self.workbook_container.index_column_unit_price_sourcing] unit_price = sourced_product[self.workbook_container.index_column_unit_price_sourcing]
@@ -534,9 +548,10 @@ class Cost_Fetcher:
print(f"Updated: {updated_count} costs") print(f"Updated: {updated_count} costs")
except Exception as e: except Exception as e:
print(f"Error: {e}") print(f"Error: {e}")
"""
async def scrape_with_browser_pool(self): async def scrape_with_browser_pool(self):
count_domains = len(self.domain_names) count_domains = len(self.domain_names)
async with async_playwright() as p: async with async_playwright() as p:
browsers = [await p.chromium.launch(headless = False) for _ in range(count_domains)] browsers = [await p.chromium.launch(headless = False) for _ in range(count_domains)]
@@ -552,7 +567,10 @@ class Cost_Fetcher:
finally: finally:
for browser in browsers: for browser in browsers:
await browser.close() await browser.close()
async def fetch_single(self, browser, product_source): """
async def fetch_single(self, product_source): # browser,
async with async_playwright() as p:
browser = await p.chromium.launch(headless = False)
print(f'Product source: {product_source}') print(f'Product source: {product_source}')
index_row = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_INDEX_ROW] index_row = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_INDEX_ROW]
source_name = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_SOURCE_NAME] source_name = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_SOURCE_NAME]
@@ -561,6 +579,7 @@ class Cost_Fetcher:
try: try:
index_domain = self.get_index_domain_from_name(source_name) index_domain = self.get_index_domain_from_name(source_name)
except: except:
await browser.close()
return self.make_result_data_json(index_row = index_row) return self.make_result_data_json(index_row = index_row)
domain_details = self.domain_details[source_name] domain_details = self.domain_details[source_name]
self.workbook_container.clear_row_sourcing_sheet(index_row = index_row) self.workbook_container.clear_row_sourcing_sheet(index_row = index_row)
@@ -602,6 +621,7 @@ class Cost_Fetcher:
if ((cost is None and price is None) or active is None): if ((cost is None and price is None) or active is None):
print(f" Error: Could not find cost on page") print(f" Error: Could not find cost on page")
await browser.close()
return self.make_result_data_json( return self.make_result_data_json(
index_row = index_row index_row = index_row
, cost = cost , cost = cost