JavaScript
x
24
24
1
import bs4
2
import requests
3
import re
4
5
# Download the news listing page and parse its HTML.
r = requests.get('https://www.the961.com/latest-news/lebanon-news/').text

soup = bs4.BeautifulSoup(r, 'lxml')

# Walk every <article> element found on the listing page.
for article in soup.find_all('article'):

    title = article.h3.text
    print(title)

    # The date and author are printed only when the matching <span> exists.
    date = article.find('span', class_='byline-part date')
    if date: print('Date:', date.text)

    author = article.find('span', class_="byline-part author")
    if author: print('Author:', author.text)

    # Take the URL from the headline's <a> tag and download that page.
    link = article.find('h3', class_='title').a['href']
    link_r = requests.get(link).text

    # Parsed HTML of the linked article page.
    soup_link = bs4.BeautifulSoup(link_r, 'lxml')
24
# Scraping the link from the title, then opening that link and trying to scrape the whole article. I'm very new to this, so I don't know what to do!
JavaScript
1
7
1
# Look at every <article> element of the linked page.
for article in soup_link.find_all('article'):
    # find() returns only the first <p> inside the article, or None when
    # the article contains no <p> at all.
    paragraph = article.find('p')
    print(paragraph)


print()
7
Advertisement
Answer
On some pages the <p> tags are not under an <article>, and therefore the lookup returns None. Instead, to scrape all the paragraphs (and <li> tags, if they exist), use the following CSS selector: .entry-content > p, .entry-content li.
To use a CSS selector, call the .select() method instead of .find_all().
In your code example:
JavaScript
1
29
29
1
import bs4
2
import requests
3
4
5
# Download the news listing page and parse its HTML. The timeout keeps the
# script from hanging forever if the server stops responding.
r = requests.get(
    "https://www.the961.com/latest-news/lebanon-news/", timeout=10
).text
soup = bs4.BeautifulSoup(r, "lxml")

# Walk every <article> element found on the listing page.
for article in soup.find_all("article"):
    title = article.h3.text
    print(title)

    # The date and author are printed only when the matching <span> exists.
    date = article.find("span", class_="byline-part date")
    if date:
        print("Date:", date.text)

    author = article.find("span", class_="byline-part author")
    if author:
        # The trailing "\n" leaves a blank line before the article text.
        print("Author:", author.text, "\n")

    # Take the URL from the headline's <a> tag and download that page.
    link = article.find("h3", class_="title").a["href"]
    link_r = requests.get(link, timeout=10).text
    soup_link = bs4.BeautifulSoup(link_r, "lxml")

    # Select all `p` tags (and `li`) under the class `entry-content`
    for page in soup_link.select(".entry-content > p, .entry-content li"):
        print(page.get_text(strip=True))

    # Visual separator between one article and the next.
    print("-" * 80)
    print()
29