Can someone help me modify this script so that it also scrapes the URL associated with each job? The purpose is that, when browsing the .csv file in a spreadsheet, I can click on the link if I would like more information about the job. Thank you in advance.
import requests
from bs4 import BeautifulSoup
import pandas as pd


def extract(page):
    """Fetch one Indeed results page and return it parsed as BeautifulSoup.

    ``page`` is the 0-based result offset (Indeed paginates in steps of 10).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    }
    # BUG FIX: the offset was interpolated into `radius=` (a search radius),
    # which never changes the page; Indeed paginates with `start=`.
    url = f'https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius=0&start={page}'
    # BUG FIX: `requests.get(url, headers)` passed the dict positionally, where
    # it is interpreted as `params`, not HTTP headers — it must be a keyword.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup


def transform(soup):
    """Append one dict per job card found in *soup* to the global ``joblist``."""
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        anchor = item.find('a')
        title = anchor.text.strip()
        # Absolute link to the posting, so it is clickable from the CSV.
        link = 'https://www.indeed.com' + anchor['href']
        company = item.find('span', class_='company').text.strip()
        # Not every card shows a salary; only the missing-tag case is expected.
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except AttributeError:
            salary = ''
        # BUG FIX: replace('n', '') removed every letter "n" from the summary;
        # stripping newlines was the intent.
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')
        job = {
            'title': title,
            'url': link,
            'company': company,
            'salary': salary,
            'summary': summary,
        }
        joblist.append(job)
    return


joblist = []
for i in range(0, 90, 10):
    print(f'Getting page, {i}')
    # BUG FIX: was extract(0), which fetched the first page nine times.
    c = extract(i)
    transform(c)

df = pd.DataFrame(joblist)
print(df.head())
# index=False keeps the spreadsheet free of a meaningless row-number column.
df.to_csv('jobs.csv', index=False)
Advertisement
Answer
You can use one of these:
# Four equivalent ways to read the "href" attribute of the first <a> tag in the
# card (pick any one); the site root is prepended to make the link absolute.
url = 'https://www.indeed.com' + item.find('a')['href']
url = 'https://www.indeed.com' + item.find('a').get('href')
url = 'https://www.indeed.com' + item.find('a').attrs['href']
url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
BTW:
You always load the same page. To get the next page you have to add start=...
to the URL.
And you can make this more readable by using a dictionary and params=
in requests:
# Query-string parameters; passing them via `params=` lets requests do the
# URL encoding instead of hand-building the query string.
payload = {
    'q': 'Dispensary',
    # NOTE(review): `params=` percent-encodes the '+' to %2B — 'Denver, CO'
    # is probably intended here; verify the final URL requests builds.
    'l': 'Denver,+CO',
    'radius': 0,
    'start': page,  # 0-based pagination offset
}
url = 'https://www.indeed.com/jobs'
r = requests.get(url, params=payload, headers=headers)
Working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd


def extract(start):
    """Fetch one Indeed results page and return it parsed as BeautifulSoup.

    ``start`` is the 0-based result offset (Indeed paginates in steps of 10).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    }
    # `params=` URL-encodes each value for us.
    # BUG FIX: the value was 'Denver,+CO' — requests percent-encodes the '+'
    # to %2B, producing l=Denver%2C%2BCO; the literal 'Denver, CO' is what
    # the original hand-built URL (l=Denver%2C+CO) meant.
    payload = {
        'q': 'Dispensary',
        'l': 'Denver, CO',
        'radius': 0,
        'start': start,
    }
    url = 'https://www.indeed.com/jobs'
    r = requests.get(url, params=payload, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup


def transform(soup, joblist):
    """Append one dict per job card found in *soup* to *joblist* (mutated in place)."""
    divs = soup.find_all('div', class_='jobsearch-SerpJobCard')
    for item in divs:
        anchor = item.find('a')
        title = anchor.text.strip()
        # Absolute link to the posting, so it is clickable from the CSV.
        url = 'https://www.indeed.com' + anchor['href']
        company = item.find('span', class_='company').text.strip()
        # Not every card shows a salary; only the missing-tag case is expected,
        # so catch AttributeError specifically instead of a bare except.
        try:
            salary = item.find('span', class_='salaryText').text.strip()
        except AttributeError:
            salary = ''
        # BUG FIX: replace('n', '') removed every letter "n" from the summary;
        # stripping newlines was the intent.
        summary = item.find('div', class_='summary').text.strip().replace('\n', '')
        joblist.append({
            'title': title,
            'url': url,
            'company': company,
            'salary': salary,
            'summary': summary,
        })


# --- main ---

joblist = []
for start in range(0, 90, 10):
    print('Getting page', start)
    c = extract(start)
    transform(c, joblist)

df = pd.DataFrame(joblist)
# index=False keeps the spreadsheet free of a meaningless row-number column.
df.to_csv('jobs.csv', index=False)
print(df.head())