Product Scraper - async still broken.
This commit is contained in:
@@ -511,9 +511,23 @@ class Cost_Fetcher:
|
|||||||
processed_count = 0
|
processed_count = 0
|
||||||
updated_count = 0
|
updated_count = 0
|
||||||
self.product_sources = self.workbook_container.get_sourcing_entries()
|
self.product_sources = self.workbook_container.get_sourcing_entries()
|
||||||
sourced_products = await self.scrape_with_browser_pool()
|
# sourced_products = await self.scrape_with_browser_pool()
|
||||||
|
# for sourced_product in sourced_products:
|
||||||
|
# for product_source in self.product_sources:
|
||||||
|
# sourced_product = await self.fetch_single(product_source = product_source)
|
||||||
|
# for index_product_source, product_source in self.product_sources.iterrows():
|
||||||
|
# sourced_product = await self.fetch_single(product_source = product_source)
|
||||||
|
# Create tasks for parallel execution
|
||||||
|
tasks = []
|
||||||
|
for index_product_source, product_source in self.product_sources.iterrows():
|
||||||
|
task = self.fetch_single(product_source = product_source)
|
||||||
|
tasks.append(task)
|
||||||
|
|
||||||
|
# Execute all tasks in parallel
|
||||||
|
sourced_products = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
# Process results
|
||||||
for sourced_product in sourced_products:
|
for sourced_product in sourced_products:
|
||||||
# self.workbook_container.workbook.cell(index_row, self.workbook_container.IND)
|
|
||||||
index_row = sourced_product[self.workbook_container.NAME_COLUMN_INDEX_ROW]
|
index_row = sourced_product[self.workbook_container.NAME_COLUMN_INDEX_ROW]
|
||||||
unit_cost = sourced_product[self.workbook_container.NAME_COLUMN_UNIT_COST]
|
unit_cost = sourced_product[self.workbook_container.NAME_COLUMN_UNIT_COST]
|
||||||
unit_price = sourced_product[self.workbook_container.index_column_unit_price_sourcing]
|
unit_price = sourced_product[self.workbook_container.index_column_unit_price_sourcing]
|
||||||
@@ -534,9 +548,10 @@ class Cost_Fetcher:
|
|||||||
print(f"Updated: {updated_count} costs")
|
print(f"Updated: {updated_count} costs")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error: {e}")
|
print(f"Error: {e}")
|
||||||
|
"""
|
||||||
async def scrape_with_browser_pool(self):
|
async def scrape_with_browser_pool(self):
|
||||||
count_domains = len(self.domain_names)
|
count_domains = len(self.domain_names)
|
||||||
|
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
browsers = [await p.chromium.launch(headless = False) for _ in range(count_domains)]
|
browsers = [await p.chromium.launch(headless = False) for _ in range(count_domains)]
|
||||||
|
|
||||||
@@ -552,62 +567,67 @@ class Cost_Fetcher:
|
|||||||
finally:
|
finally:
|
||||||
for browser in browsers:
|
for browser in browsers:
|
||||||
await browser.close()
|
await browser.close()
|
||||||
async def fetch_single(self, browser, product_source):
|
"""
|
||||||
print(f'Product source: {product_source}')
|
async def fetch_single(self, product_source): # browser,
|
||||||
index_row = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_INDEX_ROW]
|
async with async_playwright() as p:
|
||||||
source_name = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_SOURCE_NAME]
|
browser = await p.chromium.launch(headless = False)
|
||||||
source_link = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_LINK]
|
print(f'Product source: {product_source}')
|
||||||
index_domain = None
|
index_row = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_INDEX_ROW]
|
||||||
try:
|
source_name = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_SOURCE_NAME]
|
||||||
index_domain = self.get_index_domain_from_name(source_name)
|
source_link = product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_LINK]
|
||||||
except:
|
index_domain = None
|
||||||
return self.make_result_data_json(index_row = index_row)
|
try:
|
||||||
domain_details = self.domain_details[source_name]
|
index_domain = self.get_index_domain_from_name(source_name)
|
||||||
self.workbook_container.clear_row_sourcing_sheet(index_row = index_row)
|
except:
|
||||||
Cost_Fetcher.log_processing_new_row(
|
await browser.close()
|
||||||
index_row = index_row
|
return self.make_result_data_json(index_row = index_row)
|
||||||
, source_link = source_link
|
domain_details = self.domain_details[source_name]
|
||||||
)
|
self.workbook_container.clear_row_sourcing_sheet(index_row = index_row)
|
||||||
|
Cost_Fetcher.log_processing_new_row(
|
||||||
cost = None
|
index_row = index_row
|
||||||
price = None
|
, source_link = source_link
|
||||||
active = None
|
)
|
||||||
if source_name == self.NAME_DOMAIN_CARD_MARKET:
|
|
||||||
while (self.active_row_indices[index_domain] is None or time.time() - domain_details[self.ACCESSED_LAST_ON_FLAG] < random.uniform(30, 40)):
|
cost = None
|
||||||
await asyncio.sleep(random.uniform(3, 5))
|
price = None
|
||||||
self.active_row_indices[index_domain] = index_row
|
active = None
|
||||||
if product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_PRODUCT_IS_BOOSTER]:
|
if source_name == self.NAME_DOMAIN_CARD_MARKET:
|
||||||
price_quantity_pairs = await self.product_scrapers[index_domain].scrape_prices_and_quantities_playwright_cardmarket(browser = browser, url = source_link, eur_to_gbp_rate = self.eur_to_gbp_rate)
|
while (self.active_row_indices[index_domain] is None or time.time() - domain_details[self.ACCESSED_LAST_ON_FLAG] < random.uniform(30, 40)):
|
||||||
price = self.get_sale_price_from_price_quantity_pairs(price_quantity_pairs = price_quantity_pairs)
|
await asyncio.sleep(random.uniform(3, 5))
|
||||||
cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_cardmarket(browser = browser, url = source_link, eur_to_gbp_rate = self.eur_to_gbp_rate)
|
self.active_row_indices[index_domain] = index_row
|
||||||
elif source_name == self.NAME_DOMAIN_CHAOS_CARDS:
|
if product_source[TCG_Sole_Trader_Workbook_Container.NAME_COLUMN_PRODUCT_IS_BOOSTER]:
|
||||||
while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(30, 40)):
|
price_quantity_pairs = await self.product_scrapers[index_domain].scrape_prices_and_quantities_playwright_cardmarket(browser = browser, url = source_link, eur_to_gbp_rate = self.eur_to_gbp_rate)
|
||||||
await asyncio.sleep(random.uniform(3, 5))
|
price = self.get_sale_price_from_price_quantity_pairs(price_quantity_pairs = price_quantity_pairs)
|
||||||
self.active_row_indices[index_domain] = index_row
|
cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_cardmarket(browser = browser, url = source_link, eur_to_gbp_rate = self.eur_to_gbp_rate)
|
||||||
cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_chaoscards(browser = browser, url = source_link)
|
elif source_name == self.NAME_DOMAIN_CHAOS_CARDS:
|
||||||
elif source_name == self.NAME_DOMAIN_GAMES_LORE:
|
while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(30, 40)):
|
||||||
while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(10, 20)):
|
await asyncio.sleep(random.uniform(3, 5))
|
||||||
await asyncio.sleep(random.uniform(3, 5))
|
self.active_row_indices[index_domain] = index_row
|
||||||
self.active_row_indices[index_domain] = index_row
|
cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_chaoscards(browser = browser, url = source_link)
|
||||||
cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_gameslore(browser = browser, url = source_link)
|
elif source_name == self.NAME_DOMAIN_GAMES_LORE:
|
||||||
elif source_name == self.NAME_DOMAIN_MAGIC_MADHOUSE:
|
while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(10, 20)):
|
||||||
while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(10, 20)):
|
await asyncio.sleep(random.uniform(3, 5))
|
||||||
await asyncio.sleep(random.uniform(3, 5))
|
self.active_row_indices[index_domain] = index_row
|
||||||
self.active_row_indices[index_domain] = index_row
|
cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_gameslore(browser = browser, url = source_link)
|
||||||
cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_magicmadhouse(browser = browser, url = source_link)
|
elif source_name == self.NAME_DOMAIN_MAGIC_MADHOUSE:
|
||||||
|
while (self.active_row_indices[index_domain] is None or time.time() - self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] < random.uniform(10, 20)):
|
||||||
self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] = time.time()
|
await asyncio.sleep(random.uniform(3, 5))
|
||||||
self.active_row_indices[index_domain] = None
|
self.active_row_indices[index_domain] = index_row
|
||||||
|
cost, active = await self.product_scrapers[index_domain].scrape_cost_and_active_playwright_magicmadhouse(browser = browser, url = source_link)
|
||||||
if ((cost is None and price is None) or active is None):
|
|
||||||
print(f" Error: Could not find cost on page")
|
self.domain_details[source_name][self.ACCESSED_LAST_ON_FLAG] = time.time()
|
||||||
|
self.active_row_indices[index_domain] = None
|
||||||
return self.make_result_data_json(
|
|
||||||
index_row = index_row
|
if ((cost is None and price is None) or active is None):
|
||||||
, cost = cost
|
print(f" Error: Could not find cost on page")
|
||||||
, price = price
|
|
||||||
, active = active
|
await browser.close()
|
||||||
)
|
return self.make_result_data_json(
|
||||||
|
index_row = index_row
|
||||||
|
, cost = cost
|
||||||
|
, price = price
|
||||||
|
, active = active
|
||||||
|
)
|
||||||
@classmethod
|
@classmethod
|
||||||
def make_result(cls, url, success, data, error):
|
def make_result(cls, url, success, data, error):
|
||||||
return {
|
return {
|
||||||
|
|||||||
Reference in New Issue
Block a user