I’m new to python and I’m doing it for my project. Can someone help me save it to an excel file?.
This is needed for multiple site URLs so need to add each information into new rows in excel. Sample HTML code is attached below. Please help me on saving it to excel rows and columns and how to iterate it using for loop.
Advertisement
Answer
I would suggest you just use openpyxl
directly rather than via Pandas, this would give you much greater control over how your Excel file would be formatted.
Here is how you could build up multiple row in an Excel file:
JavaScript
x
76
76
1
import requests
2
from bs4 import BeautifulSoup
3
from xlwt import Workbook
4
import openpyxl
5
from openpyxl.styles.borders import Border, Side
6
from openpyxl.utils import get_column_letter
7
from openpyxl.styles import Alignment
8
9
website_url = "https://www.example.com/"
10
res = requests.get(website_url, verify=False)
11
soup = BeautifulSoup(res.text, 'lxml')
12
Links = soup.find_all("a", {"class": "jobTitleLink"},)
13
url = [tag.get('href') for tag in Links]
14
wb = openpyxl.Workbook()
15
16
# Write a header row
17
columns = [
18
("SL No", 10),
19
("Job Title", 25),
20
("Company Name", 20),
21
("Posted on", 13),
22
("Closing on", 13),
23
("Location", 20),
24
("Description", 40),
25
("Skills", 70),
26
("Link Email", 30),
27
]
28
29
thin_border = Border(left=Side(style='thin'), right=Side(
30
style='thin'), top=Side(style='thin'), bottom=Side(style='thin'))
31
ws = wb.active
32
33
for col_number, (value, width), in enumerate(columns, start=1):
34
ws.cell(column=col_number, row=1, value=value).border = thin_border
35
ws.column_dimensions[get_column_letter(col_number)].width = width
36
37
row_number = 2
38
39
# get the first link in the entire page
40
# get value of the href attribute
41
42
for x in url[1:5]:
43
res = requests.get(f'https://www.example/com/{x}', verify=False)
44
soup = BeautifulSoup(res.text, 'lxml')
45
data = []
46
47
for div_block in soup.find_all('div', class_='block', style=None):
48
data.append([line.strip() for line in div_block.stripped_strings])
49
50
li_fr = soup.find('li', class_="fr")
51
company_name = li_fr.a.text
52
location = list(li_fr.find_next_sibling('li').stripped_strings)[1]
53
54
# Write a data row
55
row = [
56
'', # SL No
57
data[0][0], # Job title
58
company_name, # Company name
59
data[1][1],
60
data[2][1],
61
location,
62
data[4][1],
63
'n'.join(data[5][1:]),
64
data[3][1],
65
]
66
67
for col_number, value in enumerate(row, start=1):
68
cell = ws.cell(column=col_number, row=row_number, value=value)
69
cell.border = thin_border
70
cell.alignment = Alignment(wrapText=True)
71
72
row_number += 1
73
74
wb.save('output.xlsx')
75
print('Saved all the data')
76
This would give you an Excel sheet looking like:
With extra work you can apply any styling you prefer.