I know how to separate it when the data looks like:
x, y, z
But I can’t figure out how to do it when the data format is like:
Doe, John, BookName, Year, abstract with commas, links.
This is what the data looks like in Excel after the scrape.
This is what I wanted it to look like.
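As far as I can tell, the commas inside the abstract collide with the commas that separate the fields. Python's csv module handles this by quoting any field that contains the delimiter, so the row round-trips correctly in code; it seems to be Excel that, depending on locale settings, splits on a different separator (or doesn't split at all). A minimal, self-contained check (the sample values are made up):

import csv
import io

# a row whose author and abstract fields contain commas, like the scraped data
row = ['Doe, John', 'BookName', '2020', 'abstract, with, commas', 'https://example.com']

buf = io.StringIO()
csv.writer(buf).writerow(row)   # fields containing commas get wrapped in quotes
print(buf.getvalue())           # "Doe, John",BookName,2020,"abstract, with, commas",https://example.com

buf.seek(0)
print(next(csv.reader(buf)))    # reads back as the original five fields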
This is my code:
import csv

import requests
from bs4 import BeautifulSoup
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    # note: the page number is appended onto the offset that is already in the URL
    result = requests.get(url + str(page), verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')
    # each search result is an <li class="ds-artifact-item">
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        title = ''.join(it.find('a', href=True).text.strip().split('\n'))
        year = it.find('span', 'date').text
        abstract = ''.join(it.find('div', 'artifact-abstract').text.strip().split('\n'))
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
thewriter = csv.writer(open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline=''))
thewriter.writerow(kepala)
for d in datas:
    thewriter.writerow(d)
Answer
This is my suggestion; I would need to know an offset to be able to test it.
A CSV separated by semicolons will be far easier to split in Excel, since the abstracts contain commas but no semicolons.
import csv

import requests
from bs4 import BeautifulSoup
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    # pass the headers that were defined but never used above
    result = requests.get(url + str(page), headers=headers, verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        # strip embedded newlines so each field stays on one CSV line
        title = it.find('a', href=True).text.strip().replace('\n', '')
        year = it.find('span', 'date').text
        abstract = it.find('div', 'artifact-abstract').text.strip().replace('\n', '')
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
# semicolon delimiter: the abstracts contain commas, but no semicolons
with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='') as f:
    thewriter = csv.writer(f, delimiter=';')
    thewriter.writerow(kepala)
    for d in datas:
        thewriter.writerow(d)
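To check the result without opening Excel, you can read the file back with the same delimiter. A minimal sketch; the filename results/165_0.csv is an assumption following the script's naming scheme (fakdep=165, offset=0):

import csv

# hypothetical path, matching the script's 'results/{fakdep}_{offset}.csv' pattern
with open('results/165_0.csv', newline='') as f:
    for row in csv.reader(f, delimiter=';'):
        print(row)  # each row comes back as a list: [Author, Title, Year, Abstract, Link]

If every row prints as a five-element list, the semicolon file is well-formed and the remaining step is only telling Excel to split on semicolons when importing.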