Can someone help me modify this script so that it also scrapes the URL associated with each job? The goal is that, when browsing the .csv file in a spreadsheet, I can click the link if I want more information about a job. Thank you in advance.
Python
x
45
45
1
import requests
2
from bs4 import BeautifulSoup
3
import pandas as pd
4
5
6
def extract(page):
7
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
8
url= f'https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius={page}'
9
r = requests.get(url, headers)
10
soup = BeautifulSoup(r.content, 'html.parser')
11
return soup
12
13
def transform(soup):
14
divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
15
for item in divs:
16
title = item.find('a').text.strip()
17
company = item.find('span', class_ = 'company').text.strip()
18
try:
19
salary = item.find('span', class_ = 'salaryText').text.strip()
20
except:
21
salary = ''
22
summary = item.find('div', class_ = 'summary').text.strip().replace('n', '')
23
24
job = {
25
'title': title,
26
'company': company,
27
'salary': salary,
28
'summary': summary
29
30
}
31
joblist.append(job)
32
return
33
34
35
joblist = []
36
37
for i in range(0,90,10):
38
print(f'Getting page, {i}')
39
c = extract(0)
40
transform(c)
41
42
df = pd.DataFrame(joblist)
43
print(df.head())
44
df.to_csv('jobs.csv')
45
Advertisement
Answer
You can use one of these:
Python
1
5
1
url = 'https://www.indeed.com' + item.find('a')['href']
2
url = 'https://www.indeed.com' + item.find('a').get('href')
3
url = 'https://www.indeed.com' + item.find('a').attrs['href']
4
url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
5
BTW:
You always load the same page. To get the next page you have to use start=...
in the URL.
And you can make this more readable by using a dictionary and params=
in requests:
Python
1
11
11
1
payload = {
2
'q': 'Dispensary',
3
'l': 'Denver,+CO',
4
'radius': 0,
5
'start': page,
6
}
7
8
url= 'https://www.indeed.com/jobs'
9
10
r = requests.get(url, params=payload, headers=headers)
11
Working code:
Python
1
67
67
1
import requests
2
from bs4 import BeautifulSoup
3
import pandas as pd
4
5
6
def extract(start):
7
headers = {
8
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
9
}
10
11
payload = {
12
'q': 'Dispensary',
13
'l': 'Denver,+CO',
14
'radius': 0,
15
'start': start,
16
}
17
18
url= 'https://www.indeed.com/jobs'
19
20
r = requests.get(url, params=payload, headers=headers)
21
22
soup = BeautifulSoup(r.content, 'html.parser')
23
24
return soup
25
26
27
def transform(soup, joblist):
28
divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
29
30
for item in divs:
31
title = item.find('a').text.strip()
32
33
url = 'https://www.indeed.com' + item.find('a')['href']
34
#url = 'https://www.indeed.com' + item.find('a').get('href')
35
#url = 'https://www.indeed.com' + item.find('a').attrs['href']
36
#url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
37
38
company = item.find('span', class_ = 'company').text.strip()
39
40
try:
41
salary = item.find('span', class_ = 'salaryText').text.strip()
42
except:
43
salary = ''
44
45
summary = item.find('div', class_ = 'summary').text.strip().replace('n', '')
46
47
joblist.append({
48
'title': title,
49
'url': url,
50
'company': company,
51
'salary': salary,
52
'summary': summary
53
})
54
55
# --- main ---
56
57
joblist = []
58
59
for start in range(0, 90, 10):
60
print('Getting page', start)
61
c = extract(start)
62
transform(c, joblist)
63
64
df = pd.DataFrame(joblist)
65
df.to_csv('jobs.csv')
66
print(df.head())
67