Hello Stack Overflow contributors!
I want to scrape multiple pages of a news website; it shows an error message during this step
Python

response = requests.get(page, headers = user_agent)
The error message is
AttributeError: 'int' object has no attribute 'get'
The lines of code are
Python

user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko'}

#controlling the crawl-rate
start_time = time()
request = 0

def scrape(url):
    urls = [url + str(x) for x in range(0,10)]
    for page in urls:
        response = requests.get(page, headers = user_agent)
        print(page)

called with:

print(scrape('https://nypost.com/search/China+COVID-19/page/'))
More specifically, this page and pages next to it are what I want to scrape:
https://nypost.com/search/China+COVID-19/page/1/?orderby=relevance
Any help would be greatly appreciated!!
Advertisement
Answer
For me this code runs okay. I did have to put `request` inside your function. Make sure you do not mix up the module `requests` with your variable `request`.
Python
from random import randint
from time import sleep, time
from warnings import warn

import requests
from bs4 import BeautifulSoup as bs
# Browser-like User-Agent header so the site serves its normal HTML
# instead of blocking the crawler outright.
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko'}

# controlling the crawl-rate: reference point for request-frequency stats
start_time = time()
def scrape(url):
    """Fetch search-result pages 0-9 under *url*, throttled to roughly one
    request every 8-15 seconds.

    Each response is parsed with BeautifulSoup into ``soup_page``, but the
    snippet stops short of using the parsed result, so the function
    returns None.

    Requires module-level ``user_agent`` and ``start_time``, plus the
    ``requests``, ``warn`` and ``bs`` imports at the top of the file
    (without them every call dies with NameError).
    """
    # Per-call request counter. Deliberately named `request` (singular) —
    # do not confuse it with the `requests` module.
    request = 0
    urls = [f"{url}{x}" for x in range(0,10)]
    params = {
        "orderby": "relevance",
    }
    for page in urls:
        response = requests.get(url=page,
                                headers=user_agent,
                                params=params)

        # pause the loop to keep the crawl rate polite
        sleep(randint(8,15))

        # monitor the requests: count and overall frequency since start_time
        request += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} request/s'.format(request, request/elapsed_time))
        # clear_output(wait = True)

        # throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(request, response.status_code))

        # Break the loop if the number of requests is greater than expected
        # (defensive cap; unreachable with only 10 URLs, but kept for safety)
        if request > 72:
            warn('Number of request was greater than expected.')
            break

        # parse the content (currently only bound locally — see docstring)
        soup_page = bs(response.text, 'lxml')
# Crawl result pages 0-9 for the "China COVID-19" search. scrape() has no
# return statement, so this call also prints "None" once the crawl ends.
print(scrape('https://nypost.com/search/China+COVID-19/page/'))