I have code that collects the text of all paragraphs from a div and appends it to a list as a new element for each car model year. I would like to build a dictionary instead, containing values in the following form:
JavaScript
x
2
1
d = { 'reviewer_name': 'xyz', 'car_model' : '2017 Audi A4', 'review_content' : 'all paragraphs from the div which is already visible in the code' }
2
There should be one such dictionary for each year, so if I specify the years 2017 and 2018, I would like an entry for each of those years.
JavaScript
1
22
22
1
from bs4 import BeautifulSoup
2
import requests
3
import pandas as pd
4
import time
5
from fake_useragent import UserAgent
6
import random
7
8
# Collect the review text of each Audi A4 model-year page on caranddriver.com.
# `articles` ends up with one string per year whose page has a review body.
articles = []
ua = UserAgent()
# User-Agent string taken from fake_useragent's `safari` attribute.
header = {'User-Agent': str(ua.safari)}
for i in range(2017, 2019):  # model years 2017 and 2018
    url = f'https://www.caranddriver.com/audi/a4-{i}'
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, headers=header, timeout=30)
    print(response)
    html_soup = BeautifulSoup(response.text, 'lxml')

    # find() returns None when no matching div exists; skip that year
    # instead of crashing with AttributeError.
    review_body = html_soup.find('div', attrs={'class': 'review-body-content'})
    if review_body is None:
        print(f'No review body found at {url}')
        continue

    # One line per <p>: the text nodes inside a paragraph are concatenated,
    # and the paragraphs are separated by newlines.
    article_text = '\n'.join(
        ''.join(paragraph.find_all(string=True))
        for paragraph in review_body.find_all('p')
    )
    articles.append(article_text)
22
Advertisement
Answer
Here you go: build a dictionary for each year, then append that dictionary to your list.
JavaScript
1
41
41
1
from bs4 import BeautifulSoup
2
import requests
3
import pandas as pd
4
import time
5
from fake_useragent import UserAgent
6
import random
7
import re
8
9
# Build one record per Audi A4 model-year page on caranddriver.com and load
# the records into a DataFrame. Each record holds the reviewer name, the car
# model (the page's <h1>), and the review text.
articles = []
ua = UserAgent()
# User-Agent string taken from fake_useragent's `safari` attribute.
header = {'User-Agent': str(ua.safari)}

# Loop-invariant patterns, compiled once.
# Matches from a literal ".css" to the last "}" on the same line --
# presumably inline style rules that end up inside the paragraph text.
css_residue = re.compile(r'\.css.*}')
review_header_class = re.compile("^review-header-inner")
author_href = re.compile("^/author")

for i in range(2017, 2020):  # model years 2017, 2018 and 2019
    url = f'https://www.caranddriver.com/audi/a4-{i}'
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, headers=header, timeout=30)
    print(response)
    html_soup = BeautifulSoup(response.text, 'lxml')

    # find() returns None when no matching div exists; skip that year
    # instead of crashing with AttributeError.
    review_body = html_soup.find('div', attrs={'class': 'review-body-content'})
    if review_body is None:
        print(f'No review body found at {url}')
        continue

    # One line per <p>: the text nodes inside a paragraph are concatenated,
    # and the paragraphs are separated by newlines.
    article_text = '\n'.join(
        ''.join(paragraph.find_all(string=True))
        for paragraph in review_body.find_all('p')
    )
    article_text = css_residue.sub('', article_text)
    article_text = article_text.strip()

    # Fall back to 'NA' when the header div or its <h1> is missing.
    header_div = html_soup.find('div', class_=review_header_class)
    title = header_div.find('h1') if header_div is not None else None
    car_model = title.text if title is not None else 'NA'

    # Same 'NA' fallback when no link whose href starts with "/author" exists;
    # an explicit None check replaces the former catch-all except clause.
    author_link = html_soup.find('a', {'href': author_href})
    reviewer_name = author_link.text if author_link is not None else 'NA'

    row = {
        'reviewer_name': reviewer_name,
        'car_model': car_model,
        'review_content': article_text,
    }
    articles.append(row)

# One row per scraped year; the dictionary keys become the columns.
df = pd.DataFrame(articles)
41
Output:
JavaScript
1
8
1
print(df)
2
reviewer_name review_content
3
0 NA The A4 embodies everything we love about Audi:
4
1 NA The 2018 Audi A4 is perhaps the most well-roun
5
2 Drew Dorian Audi's A4 has proven to be a wündercar that ou...
6
7
[3 rows x 3 columns]
8