I am trying to create a data frame from a dictionary I have and it gives me an error that says:
> ValueError: could not broadcast input array from shape (3) into shape (1)
Here is the code:
```python
import requests
import pandas as pd
from bs4 import BeautifulSoup
from requests.api import request
from selenium import webdriver
from bs4 import Tag, NavigableString

baseurl = "https://www.olx.com.eg/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}

# collect the links to the individual job ads from the listing pages
product_links = []
for x in range(1, 13):
    r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    product_list = soup.findAll("div", class_="ads__item")
    for item in product_list:
        for link in item.findAll("a", href=True):
            product_links.append(link['href'])

# drop the '#' placeholder links
for thing in product_links:
    if '#' in product_links:
        product_links.remove('#')

# test_link = 'https://www.olx.com.eg/ad/-IDcjqyP.html'

# scrape the details of each job ad
for link in product_links:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    job_title = soup.find('h1', class_="brkword")
    job_location = soup.find('strong', class_="c2b")
    job_date = soup.find('span', class_="pdingleft10 brlefte5")
    try:
        seniority = soup.find_all('td', class_='value')[0].text.strip()
    except:
        print("")
    try:
        full_or_part = soup.find_all('td', class_='value')[1].text.strip()
    except:
        print("")
    try:
        education_level = soup.find_all('td', class_='value')[2].text.strip()
    except:
        print("")
    try:
        sector = soup.find_all('td', class_='value')[3].text.strip()
    except:
        print("")
    description = soup.find_all('p', class_='pding10')

df = {
    "Job Title": job_title,
    "Job Location": job_location,
    "Post Date": job_date,
    "Seniority Level": seniority,
    "Full or Part time": full_or_part,
    "Educational Level": education_level,
    "Sector": sector,
    "Job Description": description
}
job_data = pd.DataFrame(df)
```
Please tell me how I can transform the data I have into a data frame so I can export it to a CSV. I was scraping this jobs website and the scrape itself worked, returning 500 jobs in the dictionary, but I was unable to turn that dictionary into a DataFrame, so that later on I can export it to a CSV file and do some analysis on it.
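To make the goal concrete, here is a made-up example of the kind of table I want to end up with (the column names match my dictionary, but the values below are placeholders, not real scraped data):

```python
import pandas as pd

# One dictionary per job ad, collected into a list (placeholder values only)
rows = [
    {"Job Title": "Accountant", "Job Location": "Cairo", "Post Date": "Today"},
    {"Job Title": "Sales Rep", "Job Location": "Giza", "Post Date": "Yesterday"},
]

job_data = pd.DataFrame(rows)             # one row per job ad
job_data.to_csv("jobs.csv", index=False)  # export for later analysis
```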
Answer
To create a DataFrame from the job ads, you can try the next example (some column names will need to be renamed from Arabic to English though; see the rename sketch at the end):
```python
import requests
import pandas as pd
from bs4 import BeautifulSoup

baseurl = "https://www.olx.com.eg/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
}

product_links = []
for x in range(1, 2):  # <-- increase the range here
    r = requests.get(f"https://www.olx.com.eg/jobs/?page={x}", headers=headers)
    soup = BeautifulSoup(r.content, "lxml")
    product_list = soup.findAll("div", class_="ads__item")
    for item in product_list:
        for link in item.findAll("a", href=True):
            if link["href"] != "#":
                product_links.append(link["href"])

all_data = []
for link in product_links:
    print(f"Getting {link} ...")
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "lxml")

    d = {}
    job_title = soup.find("h1").get_text(strip=True)
    job_location = soup.find("strong", class_="c2b")
    job_date = soup.find("span", class_="pdingleft10 brlefte5")

    d["title"] = job_title
    d["location"] = job_location.get_text(strip=True) if job_location else "N/A"
    d["date"] = job_date.get_text(strip=True) if job_date else "N/A"

    for table in soup.select("table.item"):
        d[table.th.get_text(strip=True)] = table.td.get_text(strip=True)

    all_data.append(d)

job_data = pd.DataFrame(all_data)
print(job_data)
job_data.to_csv("data.csv", index=False)
Creates data.csv (screenshot from LibreOffice omitted).
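As noted above, the columns taken from the ad's detail table come out with the site's Arabic labels. If you want English headers before exporting, you can rename them with `DataFrame.rename`; a minimal sketch (the Arabic keys below are placeholders, check `job_data.columns` for the actual labels on your run):

```python
# Map the site's column labels to English ones before exporting.
# The keys here are placeholders; use the real labels from job_data.columns.
english_names = {
    "المستوى المهني": "Seniority Level",
    "نوع الدوام": "Full or Part time",
}

job_data = job_data.rename(columns=english_names)
job_data.to_csv("data.csv", index=False)
```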