
How do I capture the URL of each job so I can open the full job description when looking at the CSV file?

Can someone help me modify this script so that it also scrapes the URL associated with each job? The purpose is that, when browsing the .csv file in a spreadsheet, I can click on the link if I want to know more about the job. Thank you in advance.

import requests
from bs4 import BeautifulSoup
import pandas as pd


def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
    url= f'https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius={page}'
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup):
    divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
    for item in divs:
        title = item.find('a').text.strip()
        company = item.find('span', class_ = 'company').text.strip()
        try:
            salary = item.find('span', class_ = 'salaryText').text.strip()
        except:
            salary = ''
        summary = item.find('div', class_ = 'summary').text.strip().replace('\n', '')
        
        job = {
            'title': title,
            'company': company,
            'salary': salary,
            'summary': summary
            
            }
        joblist.append(job)
    return
        

joblist = []

for i in range(0,90,10):
    print(f'Getting page, {i}')
    c = extract(0)
    transform(c)

df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')


Answer

You can use one of these:

url = 'https://www.indeed.com' + item.find('a')['href']
url = 'https://www.indeed.com' + item.find('a').get('href')
url = 'https://www.indeed.com' + item.find('a').attrs['href']
url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
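
All four lines do the same thing. If you want something that also tolerates an href that is already absolute, urljoin from the standard library covers both cases (just an optional variation, the plain concatenation above is enough for Indeed's relative links):

from urllib.parse import urljoin

href = item.find('a')['href']
url = urljoin('https://www.indeed.com', href)   # joins relative hrefs, leaves full URLs untouched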

BTW:

You always load the same page. To get the next page you have to pass start=... in the URL.
You can make this more readable by building the query as a dictionary and passing it with params= in requests:

payload = {
    'q': 'Dispensary',
    'l': 'Denver, CO',   # plain text; requests URL-encodes it when building the query
    'radius': 0,
    'start': page,    
}

url = 'https://www.indeed.com/jobs'

r = requests.get(url, params=payload, headers=headers)
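
As a quick sanity check you can print r.url after the request; requests builds the query string from the payload itself, so it should come out roughly like the URL you were formatting by hand:

print(r.url)
# e.g. https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius=0&start=0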

Working code:

import requests
from bs4 import BeautifulSoup
import pandas as pd


def extract(start):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    }
    
    payload = {
        'q': 'Dispensary',
        'l': 'Denver, CO',   # plain text; requests URL-encodes it when building the query
        'radius': 0,
        'start': start,    
    }
    
    url = 'https://www.indeed.com/jobs'
    
    r = requests.get(url, params=payload, headers=headers)
    
    soup = BeautifulSoup(r.content, 'html.parser')
    
    return soup


def transform(soup, joblist):
    divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
    
    for item in divs:
        title = item.find('a').text.strip()
        
        url = 'https://www.indeed.com' + item.find('a')['href']
        #url = 'https://www.indeed.com' + item.find('a').get('href')
        #url = 'https://www.indeed.com' + item.find('a').attrs['href']
        #url = 'https://www.indeed.com' + item.find('a').attrs.get('href')

        company = item.find('span', class_ = 'company').text.strip()
        
        try:
            salary = item.find('span', class_ = 'salaryText').text.strip()
        except AttributeError:
            # no salaryText span on this job card
            salary = ''
        
        summary = item.find('div', class_ = 'summary').text.strip().replace('\n', '')
        
        joblist.append({
            'title': title,
            'url': url,
            'company': company,
            'salary': salary,
            'summary': summary
        })
        
# --- main ---

joblist = []

for start in range(0, 90, 10):
    print('Getting page', start)
    c = extract(start)
    transform(c, joblist)

df = pd.DataFrame(joblist)
df.to_csv('jobs.csv')
print(df.head())
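
One more optional step, since the whole point is clicking the links from a spreadsheet: a plain URL in a CSV cell is not always rendered as a clickable link. A common workaround is to wrap the URL in a HYPERLINK formula before saving (this assumes Excel/LibreOffice evaluate formulas found in CSV cells, which is their default behaviour); index=False also drops the extra pandas index column:

# optional: turn the url column into HYPERLINK formulas so Excel/LibreOffice
# show clickable links when the CSV is opened
df['url'] = df['url'].apply(lambda u: f'=HYPERLINK("{u}")')
df.to_csv('jobs.csv', index=False)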