
How to separate data per column when writing web scraping results to Excel

I know how to separate the data into columns when each row looks like:

x, y, z

But I can’t figure out how to do it when a field itself contains commas, as in:

Doe, John, BookName, Year, abstract with commas, links. 
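
For the simple case I just write each row as a list and the csv module puts one value per column, roughly like this (a minimal sketch with made-up values):

import csv

# Made-up values: three plain fields per row, none containing a comma.
rows = [['x1', 'y1', 'z1'], ['x2', 'y2', 'z2']]

with open('simple.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['x', 'y', 'z'])  # header row
    writer.writerows(rows)            # one field per Excel column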

This is what the data looks like in Excel after the scrape.
This is what I wanted it to look like.
This is my code:

import requests
from bs4 import BeautifulSoup
import csv
import urllib3

# The requests are sent with verify=False, so silence the TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


fakdep = '165'  # collection handle on the repository
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    # The offset is already part of the URL; send the request with our headers.
    result = requests.get(url, headers=headers, verify=False)
    
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li','ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        # Rejoin the multi-line fields so stray newlines are removed.
        title = ''.join(it.find('a', href=True).text.strip().split('\n'))
        year = it.find('span', 'date').text
        abstract = ''.join(it.find('div', 'artifact-abstract').text.strip().split('\n'))
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
# Assumes the results/ directory already exists.
with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='') as f:
    thewriter = csv.writer(f)
    thewriter.writerow(kepala)
    for d in datas:
        thewriter.writerow(d)


Answer

This is my suggestion; I would need to know an offset to be able to test it.

A CSV separated by semicolons is far easier to split into columns in Excel, because the commas inside the abstract no longer act as field separators.

import requests
from bs4 import BeautifulSoup
import csv
import urllib3

# The requests are sent with verify=False, so silence the TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


fakdep = '165'  # collection handle on the repository
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    # The offset is already part of the URL; send the request with our headers.
    result = requests.get(url, headers=headers, verify=False)
    
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li','ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        # replace('\n', '') drops stray newlines from the multi-line fields.
        title = it.find('a', href=True).text.strip().replace('\n', '')
        year = it.find('span', 'date').text
        abstract = it.find('div', 'artifact-abstract').text.strip().replace('\n', '')
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
# The semicolon delimiter keeps Excel from splitting on the commas inside fields.
with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='') as f:
    thewriter = csv.writer(f, delimiter=';')
    thewriter.writerow(kepala)
    for d in datas:
        thewriter.writerow(d)
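
If you would rather keep the comma as the delimiter, another option is to force quoting: fields containing commas are wrapped in double quotes, and Excel (when comma is its list separator) keeps them in a single column. A minimal sketch, with a made-up row standing in for the scraped datas list:

import csv

# Hypothetical row standing in for one entry of `datas`.
rows = [['Doe, John', 'BookName', '2020', 'abstract, with, commas',
         'https://example.com/handle/1']]

with open('results_quoted.csv', 'w', newline='') as f:
    # QUOTE_ALL wraps every field in double quotes; embedded commas
    # then stay inside their field when Excel parses the file.
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow(['Author', 'Title', 'Year', 'Abstract', 'Link'])
    writer.writerows(rows)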