So far I have written the code below to extract all the information, but I can't figure out how to extract the link. I have tried with a for loop, but I get different links. I really hope someone can point me in the right direction.
JavaScript
x
10
10
1
def extract(page, tag):
    """Fetch one page of jobindex.dk search results and parse it.

    Args:
        page: Result page number placed in the ``page`` query parameter.
        tag: Search term placed in the ``q`` query parameter.

    Returns:
        A ``BeautifulSoup`` tree built from the response body.
    """
    # No stray spaces around "=" or "&": they would become part of the
    # parameter names/values and the site would not see "page" or "q".
    url = f"https://www.jobindex.dk/jobsoegning?page={page}&q={tag}"

    # NOTE(review): ``headers`` is not defined in this snippet; presumably it
    # is a module-level dict of HTTP headers - confirm.
    # It must be passed by keyword: the second positional argument of
    # requests.get() is ``params``, not ``headers``.
    r = requests.get(url, headers=headers)

    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")

    return soup
10
JavaScript
1
22
22
1
def transform(soup):
    """Collect the job postings of a parsed results page into ``joblist``.

    Every ``div`` with class ``jobsearch-result`` becomes one dict that is
    appended to the externally defined ``joblist``; nothing is returned.

    Args:
        soup: Parsed results page, as returned by ``extract``.
    """
    divs = soup.find_all("div", class_="jobsearch-result")
    for item in divs:
        # Look up each tag kind once and index into the result.
        bold_tags = item.find_all("b")
        paragraphs = item.find_all("p")

        title = bold_tags[0].text.strip()
        company = bold_tags[1].text.strip()
        published_date = item.find("time").text.strip()
        summary = paragraphs[1].text.strip()
        job_location = paragraphs[0].text.strip()
        # NOTE: this is the line the question is about. find_all() matches
        # tag *names*, and "href" is an attribute, not a tag, so this is
        # always an empty result rather than the posting's link.
        job_url = item.find_all("href")

        job = {
            "title" : title,
            "company" : company,
            "published_date" : published_date,
            "summary" : summary,
            "job_location" : job_location,
            "Job_url" : job_url
        }
        joblist.append(job)

    return
22
Advertisement
Answer
You can combine an attribute = value CSS selector with the contains (*)
operator to target the data-click
attribute by a substring. Add the :has pseudo-class to that selector
to specify that the element with the matched data-click
attribute must have an immediate child b
tag, which restricts matches to those containing the bold job title.
JavaScript
1
2
1
[data-click*="u="]:has(> b)
2
JavaScript
1
45
45
1
import requests
2
from bs4 import BeautifulSoup
3
4
def extract(page, tag):
    """Fetch one page of jobindex.dk search results and parse it.

    Args:
        page: Result page number sent as the ``page`` query parameter.
        tag: Search term sent as the ``q`` query parameter.

    Returns:
        A ``BeautifulSoup`` tree built from the response body.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    url = "https://www.jobindex.dk/jobsoegning"

    # The second positional argument of requests.get() is ``params``, so the
    # headers must be passed by keyword; otherwise they are appended to the
    # query string and the User-Agent header is never sent.
    # Building the query through ``params`` also URL-encodes the search term.
    r = requests.get(url, params={"page": page, "q": tag}, headers=headers)

    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")

    return soup
15
16
def transform(soup):
    """Collect the job postings of a parsed results page into ``joblist``.

    Every ``div`` with class ``jobsearch-result`` becomes one dict that is
    appended to the module-level ``joblist``; nothing is returned.

    Args:
        soup: Parsed results page, as returned by ``extract``.
    """
    divs = soup.find_all("div", class_="jobsearch-result")

    for item in divs:
        # Look up each tag kind once and index into the result.
        bold_tags = item.find_all("b")
        paragraphs = item.find_all("p")

        title = bold_tags[0].text.strip()
        company = bold_tags[1].text.strip()
        published_date = item.find("time").text.strip()
        summary = paragraphs[1].text.strip()
        job_location = paragraphs[0].text.strip()

        # Element whose data-click attribute contains "u=" and that has a
        # direct <b> child. select_one() returns None when nothing matches,
        # so guard the lookup instead of letting one odd result abort the
        # whole page with a TypeError.
        link = item.select_one('[data-click*="u="]:has(> b)')
        job_url = link.get("href") if link is not None else None

        job = {
            "title" : title,
            "company" : company,
            "published_date" : published_date,
            "summary" : summary,
            "job_location" : job_location,
            "Job_url" : job_url
        }
        joblist.append(job)

    return
39
40
# Shared accumulator: transform() appends one dict per job posting here.
joblist = []

# Fetch and parse the first results page for the "python" search term,
# collect its postings, then display everything that was gathered.
soup = extract(1, "python")
transform(soup)
print(joblist)
45