I know how to separate it when the data looks like:
JavaScript
x
2
1
x, y, z
2
But I can’t figure out how to do it when the data format is like:
JavaScript
1
2
1
Doe, John, BookName, Year, abstract with commas, links.
2
This is what the data looks like in Excel after the scrape.
This is what I wanted it to look like.
This is my code
JavaScript
1
37
37
1
"""Scrape repository metadata (author, title, year, abstract, link) from
repositori.usu.ac.id and write it to a CSV file under results/."""

from unittest import result  # NOTE(review): stray auto-import, shadowed below — safe to delete
import requests
from bs4 import BeautifulSoup
import csv
import urllib3.request

# Silence the warning caused by requests.get(..., verify=False) below,
# which deliberately skips TLS certificate verification.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    # BUG FIX: actually send the headers built above — they were previously unused.
    result = requests.get(url + str(page), headers=headers, verify=False)

    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        # BUG FIX: the original split on the letter 'n' (a mangled '\n'),
        # which deleted every "n" from the text. Splitting on whitespace and
        # re-joining collapses the multi-line HTML text onto a single line.
        title = ' '.join(it.find('a', href=True).text.strip().split())
        year = it.find('span', 'date').text
        abstract = ' '.join(it.find('div', 'artifact-abstract').text.strip().split())
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
# BUG FIX: open the file in a with-block so it is flushed and closed
# deterministically; the original leaked the file handle.
with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='') as csvfile:
    thewriter = csv.writer(csvfile)
    thewriter.writerow(kepala)
    thewriter.writerows(datas)
Advertisement
Answer
This is my suggestion. I will need to know an offset to be able to test it.
A CSV separated by semi-colons will be far easier to separate in Excel.
JavaScript
1
37
37
1
"""Scrape repository metadata (author, title, year, abstract, link) from
repositori.usu.ac.id and write it to a semicolon-delimited CSV so Excel
splits the columns correctly even when fields contain commas."""

from unittest import result  # NOTE(review): stray auto-import, shadowed below — safe to delete
import requests
from bs4 import BeautifulSoup
import csv
import urllib3.request

# Silence the warning caused by requests.get(..., verify=False) below,
# which deliberately skips TLS certificate verification.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    # BUG FIX: actually send the headers built above — they were previously unused.
    result = requests.get(url + str(page), headers=headers, verify=False)

    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        author = it.find('span', 'author h4').text
        # BUG FIX: the original replaced the literal two-character string '/n'
        # (a typo for '\n'), which matches nothing, and then wrapped the result
        # in a pointless ''.join(...) over the string's characters. Replace
        # real newlines with a space to flatten multi-line HTML text.
        title = it.find('a', href=True).text.strip().replace('\n', ' ')
        year = it.find('span', 'date').text
        abstract = it.find('div', 'artifact-abstract').text.strip().replace('\n', ' ')
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
# Semicolon delimiter so Excel separates columns even when the abstract
# itself contains commas. BUG FIX: with-block closes the file deterministically.
with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='') as csvfile:
    thewriter = csv.writer(csvfile, delimiter=';')
    thewriter.writerow(kepala)
    thewriter.writerows(datas)