I am trying to create a data frame from a dictionary I have and it gives me an error that says:
> ValueError: could not broadcast input array from shape (3) into shape (1)
Here is the code:
```python
import requests
import pandas as pd
from bs4 import BeautifulSoup
from requests.api import request
from selenium import webdriver
from bs4 import Tag, NavigableString

baseurl = "https://www.olx.com.eg/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}

# collect the links to the individual job ads from the listing pages
product_links = []
for x in range(1, 13):
    r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    product_list = soup.findAll("div", class_="ads__item")
    for item in product_list:
        for link in item.findAll("a", href=True):
            product_links.append(link['href'])

# drop the '#' placeholder links
for thing in product_links:
    if '#' in product_links:
        product_links.remove('#')

# test_link = 'https://www.olx.com.eg/ad/-IDcjqyP.html'

# scrape the details of each job ad
for link in product_links:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    job_title = soup.find('h1', class_="brkword")
    job_location = soup.find('strong', class_="c2b")
    job_date = soup.find('span', class_="pdingleft10 brlefte5")
    try:
        seniority = soup.find_all('td', class_='value')[0].text.strip()
    except:
        print("")
    try:
        full_or_part = soup.find_all('td', class_='value')[1].text.strip()
    except:
        print("")
    try:
        education_level = soup.find_all('td', class_='value')[2].text.strip()
    except:
        print("")
    try:
        sector = soup.find_all('td', class_='value')[3].text.strip()
    except:
        print("")
    description = soup.find_all('p', class_='pding10')

df = {
    "Job Title": job_title,
    "Job Location": job_location,
    "Post Date": job_date,
    "Seniority Level": seniority,
    "Full or Part time": full_or_part,
    "Educational Level": education_level,
    "Sector": sector,
    "Job Description": description
}
job_data = pd.DataFrame(df)
```
Please tell me how I can transform the data I have into a data frame so I can export it to a CSV. I was scraping this jobs website and the scrape itself worked, returning 500 jobs in the dictionary, but I was unable to turn that dictionary into a DataFrame, so that later on I can export it to a CSV file and do some analysis on it.
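To make the goal concrete, here is a made-up example of the kind of table I want to end up with (the column names match my dictionary, but the values below are placeholders, not real scraped data):

```python
import pandas as pd

# One dictionary per job ad, collected into a list (placeholder values only)
rows = [
    {"Job Title": "Accountant", "Job Location": "Cairo", "Post Date": "Today"},
    {"Job Title": "Sales Rep", "Job Location": "Giza", "Post Date": "Yesterday"},
]

job_data = pd.DataFrame(rows)             # one row per job ad
job_data.to_csv("jobs.csv", index=False)  # export for later analysis
```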
Answer
To create a DataFrame from the job ads, you can try the next example (some column names will need to be renamed from Arabic to English though; see the rename sketch at the end):
```python
import requests
import pandas as pd
from bs4 import BeautifulSoup

baseurl = "https://www.olx.com.eg/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}

product_links = []
for x in range(1, 2):  # <-- increase the range here
    r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    product_list = soup.findAll("div", class_="ads__item")
    for item in product_list:
        for link in item.findAll("a", href=True):
            if link["href"] != "#":
                product_links.append(link["href"])

all_data = []
for link in product_links:
    print(f"Getting {link} ...")
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "lxml")

    d = {}
    job_title = soup.find("h1").get_text(strip=True)
    job_location = soup.find("strong", class_="c2b")
    job_date = soup.find("span", class_="pdingleft10 brlefte5")

    d["title"] = job_title
    d["location"] = job_location.get_text(strip=True) if job_location else "N/A"
    d["date"] = job_date.get_text(strip=True) if job_date else "N/A"

    for table in soup.select("table.item"):
        d[table.th.get_text(strip=True)] = table.td.get_text(strip=True)

    all_data.append(d)

job_data = pd.DataFrame(all_data)
print(job_data)
job_data.to_csv("data.csv", index=False)
Creates data.csv (screenshot from LibreOffice omitted).
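As noted above, the columns taken from the ad's detail table come out with the site's Arabic labels. If you want English headers before exporting, you can rename them with `DataFrame.rename`; a minimal sketch (the Arabic keys below are placeholders, check `job_data.columns` for the actual labels on your run):

```python
# Map the site's column labels to English ones before exporting.
# The keys here are placeholders; use the real labels from job_data.columns.
english_names = {
    "المستوى المهني": "Seniority Level",
    "نوع الدوام": "Full or Part time",
}

job_data = job_data.rename(columns=english_names)
job_data.to_csv("data.csv", index=False)
```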