```python
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date

today = date.today()
Date = today

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

URL = [
    'https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
    'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2',
]

data = []
for url in URL:
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(webpage.content, 'html.parser')
    bestseller = soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")')
    data.append({
        'Rank': bestseller.contents[2].get_text().split()[0],
        'Category': " ".join(bestseller.contents[2].get_text().split()[2:6]),
        'Sub-Category Rank': bestseller.contents[5].get_text().split()[0],
        'Sub-Category': " ".join(bestseller.contents[5].get_text().split()[2:6]),
        # ASIN
        'ASIN': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("ASIN")').contents[3].get_text(),
        # Product Title
        'Product Title': soup.find("span", attrs={"id": 'productTitle'}).text.strip(),
        'Date': Date,
    })

df = pd.DataFrame(data)
df['Rank'] = df['Rank'].str.replace('#', '')
df['Sub-Category Rank'] = df['Sub-Category Rank'].str.replace('#', '')

# append to local file (local_path is defined elsewhere in the notebook)
df.to_csv(local_path, mode='a', header=False, index=False)
```
I am trying to use the schedule library in a Jupyter notebook, since cron and the Windows Task Scheduler don't work for me. I want to execute this code every day at 8 AM. Can someone help me define the job? Thank you so much!
Answer
This one should work:
```python
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date
import datetime
import asyncio


def wait_for_clock(hour, minute, result=None):
    # Return an asyncio.sleep() that finishes at the next occurrence of hour:minute.
    t = datetime.datetime.combine(datetime.date.today(), datetime.time(hour, minute))
    now = datetime.datetime.now()
    if now >= t:
        t += datetime.timedelta(days=1)  # already past today's target, wait for tomorrow's
    delta = t - now
    return asyncio.sleep(delta.total_seconds(), result)


async def do_that():
    today = date.today()
    Date = today
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5',
    }
    URL = [
        'https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
        'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2',
    ]
    data = []
    for url in URL:
        webpage = requests.get(url, headers=headers)
        soup = BeautifulSoup(webpage.content, 'html.parser')
        bestseller = soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")')
        data.append({
            'Rank': bestseller.contents[2].get_text().split()[0],
            'Category': " ".join(bestseller.contents[2].get_text().split()[2:6]),
            'Sub-Category Rank': bestseller.contents[5].get_text().split()[0],
            'Sub-Category': " ".join(bestseller.contents[5].get_text().split()[2:6]),
            # ASIN
            'ASIN': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("ASIN")').contents[3].get_text(),
            # Product Title
            'Product Title': soup.find("span", attrs={"id": 'productTitle'}).text.strip(),
            'Date': Date,
        })
    df = pd.DataFrame(data)
    df['Rank'] = df['Rank'].str.replace('#', '')
    df['Sub-Category Rank'] = df['Sub-Category Rank'].str.replace('#', '')
    # append to local file (local_path is defined elsewhere)
    df.to_csv(local_path, mode='a', header=False, index=False)


if __name__ == '__main__':
    while True:
        asyncio.run(wait_for_clock(8, 0))
        asyncio.run(do_that())
```
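One caveat for the Jupyter use case from the question: the notebook kernel already runs an event loop, so `asyncio.run()` raises a `RuntimeError` inside a cell. A minimal sketch of the notebook variant, relying on IPython's support for top-level `await`:

```python
# In a Jupyter cell, await the coroutines directly instead of calling asyncio.run():
while True:
    await wait_for_clock(8, 0)  # sleeps until the next 8:00 AM
    await do_that()             # then scrapes and appends to the CSV
```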
Instead of the schedule library, I'm using my own code here to wait for the clock. I also wrote it in an "asynchronous" way, but you can change do_that to a sync function and then, at the bottom, just call do_that() instead of asyncio.run(do_that()).
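For comparison, if you do want the schedule library the question mentions, the job definition would look roughly like this (a minimal sketch; `scrape_job` is a hypothetical synchronous wrapper around the scraping code above):

```python
import time
import schedule  # pip install schedule


def scrape_job():
    # hypothetical: the scraping + CSV-append code from the question, as a sync function
    ...


schedule.every().day.at("08:00").do(scrape_job)  # run the job every day at 8:00 AM

while True:
    schedule.run_pending()  # execute any job whose scheduled time has come
    time.sleep(60)          # poll once a minute
```

Either way, the while loop has to keep running, so the notebook cell blocks in both versions.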