Skip to content
Advertisement

How to use schedule library to schedule script in Python

import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date

# Scrape the Best-Seller rank, category, sub-category rank, ASIN and title
# of two Amazon product pages and append one row per product to a local CSV.

# FIX: `local_path` was never defined, so the final to_csv call raised
# NameError.  Define the output file here.
local_path = 'amazon_rank.csv'

today = date.today()

Date = today

# Browser-like headers so Amazon serves the regular product page.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5'}
URL = ['https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
             'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2']
data = []
for url in URL:
    webpage = requests.get(url, headers=headers)
    # FIX: pass an explicit parser — omitting it triggers a bs4 warning and
    # can give different parse trees depending on which parsers are installed.
    soup = BeautifulSoup(webpage.content, 'html.parser')
    # Hoist the repeated "Best Seller" lookup out of the dict literal
    # (the original ran the same select_one four times per page).
    best_seller = soup.select_one(
        '#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")')
    data.append({
        # contents[2] holds the main-category rank line, e.g. "#123 in ...".
        'Rank': best_seller.contents[2].get_text().split()[0],
        'Category': " ".join(best_seller.contents[2].get_text().split()[2:6]),
        # contents[5] holds the sub-category rank line.
        'Sub-Category Rank': best_seller.contents[5].get_text().split()[0],
        'Sub-Category': " ".join(best_seller.contents[5].get_text().split()[2:6]),
        # ASIN
        'ASIN': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("ASIN")').contents[3].get_text(),
        # Product Title
        'Product Title': soup.find("span", attrs={"id": 'productTitle'}).text.strip(),
        'Date': Date
    })

df = pd.DataFrame(data)
# Strip the leading '#' so the rank columns hold plain numbers.
df['Rank'] = df['Rank'].str.replace('#', '')
df['Sub-Category Rank'] = df['Sub-Category Rank'].str.replace('#', '')
# Append to the local file.  NOTE(review): header=False means the CSV never
# gets a header row, even when the file is first created — confirm intended.
df.to_csv(local_path, mode='a', header=False, index=False)

I am trying to use the schedule library in a Jupyter notebook, since Cron and the Windows Task Scheduler don't work for me. I want to execute this code every day at 8 am. Can someone help me define the job? Thank you so much!

Advertisement

Answer

This one should work:

import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date

import datetime
import asyncio

def wait_for_clock(hour, minute, result=None):
    """Return an awaitable that sleeps until the next wall-clock
    occurrence of ``hour:minute`` — today if that time is still ahead,
    otherwise tomorrow — and then resolves to ``result``.

    :param hour: target hour of day (0-23, local time)
    :param minute: target minute (0-59)
    :param result: value the returned awaitable resolves to
    """
    target = datetime.datetime.combine(
        datetime.date.today(),
        datetime.time(hour, minute)
    )

    now = datetime.datetime.now()

    # If the target time has already passed today, schedule for tomorrow.
    if now >= target:
        target += datetime.timedelta(days=1)

    # FIX: use total_seconds() instead of the manual
    # `delta.seconds + delta.microseconds * 0.000001` — it is exact and
    # would also stay correct if the delta ever spanned more than a day
    # (timedelta.seconds silently drops the .days component).
    delay = (target - now).total_seconds()

    return asyncio.sleep(delay, result)


async def do_that(local_path='amazon_rank.csv'):
    """Scrape the Best-Seller rank, category, sub-category rank, ASIN and
    title of two Amazon product pages and append one row per product to a
    local CSV file.

    :param local_path: output CSV path.  FIX: added as a defaulted
        parameter — the original referenced an undefined global
        ``local_path``, which raised NameError at the ``to_csv`` call.
    """
    today = date.today()

    Date = today

    # Browser-like headers so Amazon serves the regular product page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'}
    URL = ['https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
           'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2']
    data = []
    for url in URL:
        webpage = requests.get(url, headers=headers)
        # FIX: pass an explicit parser — omitting it triggers a bs4 warning
        # and can give different parse trees across environments.
        soup = BeautifulSoup(webpage.content, 'html.parser')
        # Hoist the repeated "Best Seller" lookup out of the dict literal
        # (the original ran the same select_one four times per page).
        best_seller = soup.select_one(
            '#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")')
        data.append({
            # contents[2] holds the main-category rank line, e.g. "#123 in ...".
            'Rank': best_seller.contents[2].get_text().split()[0],
            'Category': " ".join(best_seller.contents[2].get_text().split()[2:6]),
            # contents[5] holds the sub-category rank line.
            'Sub-Category Rank': best_seller.contents[5].get_text().split()[0],
            'Sub-Category': " ".join(best_seller.contents[5].get_text().split()[2:6]),
            # ASIN
            'ASIN': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("ASIN")').contents[3].get_text(),
            # Product Title
            'Product Title': soup.find("span", attrs={"id": 'productTitle'}).text.strip(),
            'Date': Date
        })

    df = pd.DataFrame(data)
    # Strip the leading '#' so the rank columns hold plain numbers.
    df['Rank'] = df['Rank'].str.replace('#', '')
    df['Sub-Category Rank'] = df['Sub-Category Rank'].str.replace('#', '')
    # Append to the local file.  NOTE(review): header=False means the CSV
    # never gets a header row, even on first creation — confirm intended.
    df.to_csv(local_path, mode='a', header=False, index=False)


if __name__ == '__main__':
    # Daily driver: block until the next 08:00 local time, run one scrape,
    # then loop forever.  Each asyncio.run() call spins up (and tears down)
    # a fresh event loop, which is fine for this once-a-day cadence.
    while True:
        asyncio.run(wait_for_clock(8, 0))
        asyncio.run(do_that())

Instead of the schedule library, I'm using my own code here to wait for the clock. I also wrote it in an "asynchronous" way, but you can change the function do_that to be synchronous and then, at the bottom, call "do_that()" instead of asyncio.run(do_that()).

User contributions licensed under: CC BY-SA
4 people found this helpful
Advertisement