I am trying to create a data frame from a dictionary I have and it gives me an error that says:
> ValueError: could not broadcast input array from shape (3) into shape (1)
Here is the code:
# Scrapes OLX Egypt job listings: collects ad links from the listing pages,
# visits each ad, and tries to build a pandas DataFrame from the scraped fields.
# NOTE(review): the original snippet was collapsed onto a single line; the
# line breaks and loop nesting below are reconstructed from the syntax and
# should be confirmed against the author's original file.
import requests
from bs4 import BeautifulSoup
from requests.api import request
from selenium import webdriver
from bs4 import Tag, NavigableString

baseurl = "https://www.olx.com.eg/"
headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36" }

# Step 1: gather the href of every <a> inside each "ads__item" div on
# listing pages 1..12.
product_links = []
for x in range(1,13):
    r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    product_list = soup.findAll("div", class_="ads__item")
    for item in product_list:
        for link in item.findAll("a",href=True):
            product_links.append(link['href'])

# Drop the placeholder '#' links.
# NOTE(review): this removes items from product_links while iterating over it.
for thing in product_links:
    if '#' in product_links:
        product_links.remove('#')

# test_link = 'https://www.olx.com.eg/ad/-IDcjqyP.html'

# Step 2: fetch every ad page and pull out the individual fields.
for link in product_links:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    # soup.find() yields a single element (or None), not its text.
    job_title = soup.find('h1',class_="brkword")
    job_location = soup.find('strong',class_="c2b")
    job_date = soup.find('span',class_="pdingleft10 brlefte5")
    # Each lookup below indexes into the list of <td class="value"> cells.
    # When the lookup fails, only an empty line is printed, so the variable
    # keeps whatever value it had before (or stays undefined).
    try:
        seniority = soup.find_all('td',class_='value')[0].text.strip()
    except:
        print("")
    try:
        full_or_part = soup.find_all('td',class_='value')[1].text.strip()
    except:
        print("")
    try:
        education_level = soup.find_all('td',class_='value')[2].text.strip()
    except:
        print("")
    try:
        sector = soup.find_all('td',class_='value')[3].text.strip()
    except:
        print("")
    # find_all() yields a list-like collection of elements, not a string.
    description = soup.find_all('p',class_='pding10')
    # NOTE(review): this dict is rebuilt from scratch for every ad, so only
    # one ad's values are held in it at a time. It mixes single elements,
    # plain strings and a list of elements - presumably the cause of the
    # reported "could not broadcast" ValueError; confirm.
    df = {
        "Job Title" : job_title,
        "Job Location" : job_location,
        "Post Date" : job_date,
        "Seniority Level" : seniority,
        "Full or Part time" : full_or_part,
        "Educational Level" : education_level,
        "Sector" : sector,
        "Job Description" : description
    }

# NOTE(review): `pd` is never imported in this snippet (`import pandas as pd`
# is missing).
job_data = pd.DataFrame(df)
Please tell me how I can transform the data I have into a data frame so I can export it to a CSV file. I was trying to scrape this jobs website, and the scraping itself succeeded, returning 500 jobs in the dictionary. Unfortunately, I was not able to turn that data into a DataFrame, which I need so that I can later export it to a CSV file and do some analysis on it.
To create a DataFrame from the job ads, you can try the following example (some column names need to be renamed from Arabic to English, though):
"""Scrape OLX Egypt job ads into a pandas DataFrame and export them to CSV.

The script first collects the ad links from the listing pages, then visits
every ad and stores its fields in one dict per ad. Building the DataFrame
from a *list of dicts* gives one row per ad; fields that are missing from a
particular ad simply end up as NaN in that row.
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup

baseurl = "https://www.olx.com.eg/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}
# Seconds to wait for a response; without a timeout a single stalled
# connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30


def _text_or_na(tag):
    """Return the stripped text of *tag*, or "N/A" when the lookup found nothing."""
    return tag.get_text(strip=True) if tag is not None else "N/A"


# Step 1: collect the links to the individual ads from the listing pages.
product_links = []
for x in range(1, 2):  # <-- increase the range here
    r = requests.get(
        f"{baseurl}jobs/?page={x}", headers=headers, timeout=REQUEST_TIMEOUT
    )
    soup = BeautifulSoup(r.content, "lxml")
    for item in soup.find_all("div", class_="ads__item"):
        for link in item.find_all("a", href=True):
            # "#" entries are placeholders, not real ad URLs.
            if link["href"] != "#":
                product_links.append(link["href"])

# Step 2: visit every ad and store its fields as one dict per ad.
all_data = []
for link in product_links:
    print(f"Getting {link} ...")
    response = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, "lxml")

    # Every lookup may come back empty, so all three fields share the same
    # "N/A" fallback instead of raising AttributeError on a missing element.
    d = {
        "title": _text_or_na(soup.find("h1")),
        "location": _text_or_na(soup.find("strong", class_="c2b")),
        "date": _text_or_na(soup.find("span", class_="pdingleft10 brlefte5")),
    }

    # Each "table.item" holds one header/value pair; the header text becomes
    # the column name, so the set of columns depends on the ads scraped.
    for table in soup.select("table.item"):
        if table.th is None or table.td is None:
            continue  # incomplete pair: nothing usable to store
        d[table.th.get_text(strip=True)] = table.td.get_text(strip=True)

    all_data.append(d)

# Step 3: one row per ad, then export.
job_data = pd.DataFrame(all_data)
print(job_data)
job_data.to_csv("data.csv", index=False)
Creates data.csv
(screenshot from LibreOffice):