I was wondering whether my requests are being blocked by the website and whether I need to set a proxy. I first tried to close the HTTP connection, but that failed. I also tried to test my code, but now it produces no output. Maybe if I use a proxy everything will be OK? Here is the code.
```python
import requests
from urllib.parse import urlencode
import json
from bs4 import BeautifulSoup
import re
from html.parser import HTMLParser
from multiprocessing import Pool
from requests.exceptions import RequestException
import time


def get_page_index(offset, keyword):
    #headers = {'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers={'Connection': 'close'})
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e)

def parse_page_index(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            url = item.get('article_url')
            if url and len(url) < 100:
                yield url

def get_page_detail(url):
    try:
        response = requests.get(url, headers={'Connection': 'close'})
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e)

def parse_page_detail(html):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    pattern = re.compile(r'articleInfo: (.*?)},', re.S)
    pattern_abstract = re.compile(r'abstract: (.*?).', re.S)
    res = re.search(pattern, html)
    res_abstract = re.search(pattern_abstract, html)
    if res and res_abstract:
        data = res.group(1).replace(r".replace(/<br />|n|r/ig, '')", "") + '}'
        abstract = res_abstract.group(1).replace(r"'", "")
        content = re.search(r'content: (.*?),', data).group(1)
        source = re.search(r'source: (.*?),', data).group(1)
        time_pattern = re.compile(r'time: (.*?)}', re.S)
        date = re.search(time_pattern, data).group(1)
        date_today = time.strftime('%Y-%m-%d')
        img = re.findall(r'src="(.*?)"', content)
        if date[1:11] == date_today and len(content) > 50 and img:
            return {
                'title': title,
                'content': content,
                'source': source,
                'date': date,
                'abstract': abstract,
                'img': img[0]
            }

def main(offset):
    flag = 1
    html = get_page_index(offset, '光伏')
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            data = parse_page_detail(html)
            if data:
                html_parser = HTMLParser()
                cwl = html_parser.unescape(data.get('content'))
                data['content'] = cwl
                print(data)
                print(data.get('img'))
                flag += 1
                if flag == 5:
                    break


if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i*20 for i in range(10)])
```
98
and the error is here:
```
HTTPConnectionPool(host='tech.jinghua.cn', port=80): Max retries exceeded with url: /zixun/20160720/f191549.shtml (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x00000000048523C8>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
```
By the way, when I first tested my code, everything was OK! Thanks in advance!
Answer
It seems to me you're hitting the connection limit of the HTTPConnectionPool, since you start 10 workers at the same time.
Try one of the following:
- Increase the request timeout (in seconds): `requests.get(url, timeout=5)`
- Close the response explicitly with `response.close()`: instead of returning `response.text` directly, assign the response to a variable, read its text, close the response, and then return the text (see the sketch below).
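Below is a minimal sketch of both suggestions applied to `get_page_detail`. The timeout value of 5 seconds is an assumption, not something prescribed by the question:

```python
import requests
from requests.exceptions import RequestException

def get_page_detail(url):
    try:
        # timeout=5 is an assumed value; tune it for your network.
        response = requests.get(url, headers={'Connection': 'close'}, timeout=5)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            text = response.text  # read the body first...
            response.close()      # ...then release the connection explicitly
            return text
        response.close()
        return None
    except RequestException as e:
        print(e)
        return None
```

Reusing a single `requests.Session` inside each worker process is another common way to keep the number of simultaneously open connections down, though that goes beyond the two fixes above.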