Can someone help me modify this script so that it also scraps the URL associated with each job. The purpose would be when browsing the .csv file in a spreadsheet I can click on the link if I would like to know more information about the job. Thank you in advance.
import requests
from bs4 import BeautifulSoup
import pandas as pd
def extract(page):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
url= f'https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius={page}'
r = requests.get(url, headers)
soup = BeautifulSoup(r.content, 'html.parser')
return soup
def transform(soup):
divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
for item in divs:
title = item.find('a').text.strip()
company = item.find('span', class_ = 'company').text.strip()
try:
salary = item.find('span', class_ = 'salaryText').text.strip()
except:
salary = ''
summary = item.find('div', class_ = 'summary').text.strip().replace('n', '')
job = {
'title': title,
'company': company,
'salary': salary,
'summary': summary
}
joblist.append(job)
return
joblist = []
for i in range(0,90,10):
print(f'Getting page, {i}')
c = extract(0)
transform(c)
df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')
Advertisement
Answer
You can use one of this
url = 'https://www.indeed.com' + item.find('a')['href']
url = 'https://www.indeed.com' + item.find('a').get('href')
url = 'https://www.indeed.com' + item.find('a').attrs['href']
url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
BTW:
You always load the same page. To get next page you have to use start=... in url.
And you can do this more readable using dictionary and params= in requests
payload = {
'q': 'Dispensary',
'l': 'Denver,+CO',
'radius': 0,
'start': page,
}
url= 'https://www.indeed.com/jobs'
r = requests.get(url, params=payload, headers=headers)
Working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
def extract(start):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
}
payload = {
'q': 'Dispensary',
'l': 'Denver,+CO',
'radius': 0,
'start': start,
}
url= 'https://www.indeed.com/jobs'
r = requests.get(url, params=payload, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
return soup
def transform(soup, joblist):
divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
for item in divs:
title = item.find('a').text.strip()
url = 'https://www.indeed.com' + item.find('a')['href']
#url = 'https://www.indeed.com' + item.find('a').get('href')
#url = 'https://www.indeed.com' + item.find('a').attrs['href']
#url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
company = item.find('span', class_ = 'company').text.strip()
try:
salary = item.find('span', class_ = 'salaryText').text.strip()
except:
salary = ''
summary = item.find('div', class_ = 'summary').text.strip().replace('n', '')
joblist.append({
'title': title,
'url': url,
'company': company,
'salary': salary,
'summary': summary
})
# --- main ---
joblist = []
for start in range(0, 90, 10):
print('Getting page', start)
c = extract(start)
transform(c, joblist)
df = pd.DataFrame(joblist)
df.to_csv('jobs.csv')
print(df.head())