Python

import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date

today = date.today()

Date = today

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
           'Accept-Language': 'en-US, en;q=0.5'}
URL = ['https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
       'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2']
data = []
for url in URL:
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(webpage.content, 'html.parser')  # name a parser explicitly to avoid the bs4 warning
    data.append({
        # Best Sellers rank and category, parsed from the detail bullets
        'Rank': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].get_text().split()[0],
        'Category': " ".join(soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].get_text().split()[2:6]),
        'Sub-Category Rank': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[5].get_text().split()[0],
        'Sub-Category': " ".join(soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[5].get_text().split()[2:6]),
        # ASIN
        'ASIN': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("ASIN")').contents[3].get_text(),
        # Product Title
        'Product Title': soup.find("span", attrs={"id": 'productTitle'}).text.strip(),
        'Date': Date
    })

df = pd.DataFrame(data)
df['Rank'] = df['Rank'].str.replace('#', '')
df['Sub-Category Rank'] = df['Sub-Category Rank'].str.replace('#', '')
# append to a local file (local_path must be defined elsewhere)
df.to_csv(local_path, mode='a', header=False, index=False)
I am trying to use the schedule library in a Jupyter notebook, since cron and the Windows Task Scheduler don't work for me. I want to execute this code every day at 8am. Can someone help me define the job? Thank you so much!
Answer
This one should work:
Python

import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date

import datetime
import asyncio


def wait_for_clock(hour, minute, result=None):
    # next occurrence of hour:minute, either later today or tomorrow
    t = datetime.datetime.combine(
        datetime.date.today(),
        datetime.time(hour, minute)
    )

    tt = datetime.datetime.now()

    if tt >= t:
        t += datetime.timedelta(days=1)

    delta = t - tt
    # delta is always under one day here, so total_seconds() matches
    # the original delta.seconds + delta.microseconds * 0.000001
    delta_sec = delta.total_seconds()

    # return an awaitable that sleeps until the target time
    return asyncio.sleep(delta_sec, result)


async def do_that():
    today = date.today()

    Date = today

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'}
    URL = ['https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
           'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2']
    data = []
    for url in URL:
        webpage = requests.get(url, headers=headers)
        soup = BeautifulSoup(webpage.content, 'html.parser')
        data.append({
            'Rank': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].get_text().split()[0],
            'Category': " ".join(soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].get_text().split()[2:6]),
            'Sub-Category Rank': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[5].get_text().split()[0],
            'Sub-Category': " ".join(soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[5].get_text().split()[2:6]),
            # ASIN
            'ASIN': soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("ASIN")').contents[3].get_text(),
            # Product Title
            'Product Title': soup.find("span", attrs={"id": 'productTitle'}).text.strip(),
            'Date': Date
        })

    df = pd.DataFrame(data)
    df['Rank'] = df['Rank'].str.replace('#', '')
    df['Sub-Category Rank'] = df['Sub-Category Rank'].str.replace('#', '')
    # append to a local file (local_path must be defined elsewhere)
    df.to_csv(local_path, mode='a', header=False, index=False)


if __name__ == '__main__':
    while True:
        asyncio.run(wait_for_clock(8, 0))
        asyncio.run(do_that())
Instead of the schedule library, I am using my own code here to wait for the clock. I also wrote it in an "asynchronous" way, but you can change do_that to be a synchronous function and then, at the bottom, just call do_that() instead of asyncio.run(do_that()).
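One caveat, since you mentioned Jupyter: the notebook kernel already runs its own event loop, so asyncio.run() raises a RuntimeError there ("asyncio.run() cannot be called from a running event loop"). In a notebook cell you can rely on IPython's top-level await instead; a minimal sketch, assuming wait_for_clock and do_that are defined as above:

Python

# in a Jupyter cell -- the kernel's existing event loop drives these awaits
while True:
    await wait_for_clock(8, 0)
    await do_that()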
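And if you would rather stick with the schedule library from your question, the job definition would look roughly like this. A minimal sketch, assuming the scraping code above is wrapped in a hypothetical synchronous function scrape_rankings():

Python

import time
import schedule

def scrape_rankings():
    # hypothetical wrapper around the requests/BeautifulSoup/to_csv code above
    ...

# register the job to run every day at 08:00 local time
schedule.every().day.at("08:00").do(scrape_rankings)

# schedule does not run jobs on its own; this loop must keep running
while True:
    schedule.run_pending()
    time.sleep(30)  # check twice a minute whether a job is due

Note that the cell running this loop blocks the notebook, which is the same trade-off as the while True loop above.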