# Scrapy News Crawler
```python
# Importing the Scrapy library
import scrapy


# Defining the spider's URL and headers
class DawnSpider(scrapy.Spider):
    name = 'dawn'
    allowed_domains = ['www.dawn.com']  # Channel link
    # start_urls = ['https://www.dawn.com/archive/2022-02-09']
    # url = ['https://www.dawn.com']
    # page = 1

    # Defining a function to set headers and the link from where to start scraping
    def start_requests(self):
        yield scrapy.Request(url='https://www.dawn.com/archive/2022-03-21', callback=self.parse,
                             headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'})

    # Getting news headlines and their links
    def parse(self, response):
        titles = response.xpath("//h2[@class = 'story__title text-6 font-bold font-merriweather pt-1 pb-2 ']/a")

        for title in titles:
            headline = title.xpath(".//text()").get()
            headline_link = title.xpath(".//@href").get()
            # Iterating news headline links
            yield response.follow(url=headline_link, callback=self.parse_headline, meta={'heading': headline},
                                  headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'})

        # Code for going to previous pages
        prev_page = response.xpath("//li[1]/a/@href").get()
        prev = 'https://www.dawn.com' + str(prev_page)

        yield scrapy.Request(url=prev, callback=self.parse,
                             headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'})

    # Iterating headline links and getting headline details and date/time
    def parse_headline(self, response):
        headline = response.request.meta['heading']
        # logging.info(response.url)
        full_detail = response.xpath("//div[contains(@class , story__content)]/p[1]")
        date_and_time = response.xpath("//span[@class='timestamp--date']/text()").get()
        for detail in full_detail:
            data = detail.xpath(".//text()").get()
            yield {
                'headline': headline,
                'date_and_time': date_and_time,
                'details': data
            }
```
# Python script (separate file)

```python
from scrapy import cmdline

cmdline.execute("scrapy crawl dawn -o data.csv".split(" "))
```
# Answer
- Instead of running your spider with `cmdline.execute`, you can run it with `CrawlerProcess`; read about common practices. You can see `main.py` below as an example.
- You can declare the headers once.
- You're getting a lot of 403s, so you should add a download delay to avoid getting banned.
- You can use feeds export for the CSV file.
- It's possible you're interrupting the writing of the CSV file, but it's only a guess.

Here's a working example (I checked it with `'CLOSESPIDER_ITEMCOUNT': 10`, so give it some time when you run it).
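If you want to reproduce that quick test, you can temporarily add the item cap to the spider's `custom_settings` (a minimal sketch; `CLOSESPIDER_ITEMCOUNT` makes Scrapy close the spider after that many items have been scraped, so drop it for a full crawl):

```python
# Testing-only variant of the settings used in spider.py below:
# the crawl stops after 10 scraped items instead of walking the whole archive.
custom_settings = {
    'DOWNLOAD_DELAY': 0.8,
    'FEEDS': {'data.csv': {'format': 'csv'}},
    'CLOSESPIDER_ITEMCOUNT': 10,  # remove this line for a full run
}
```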
spider.py:
```python
# Importing the Scrapy library
import scrapy


# Defining the spider's URL and headers
class DawnSpider(scrapy.Spider):
    name = 'dawn'
    allowed_domains = ['dawn.com']  # Channel link
    # start_urls = ['https://www.dawn.com/archive/2022-02-09']
    # url = ['https://www.dawn.com']
    # page = 1

    # Per-spider settings: throttle requests and export items to CSV via a feed
    custom_settings = {
        'DOWNLOAD_DELAY': 0.8,
        'FEEDS': {'data.csv': {'format': 'csv'}},
    }

    # Headers declared once and reused for every request
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": "scribe=true",
        "DNT": "1",
        "Host": "www.dawn.com",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Sec-GPC": "1",
        "TE": "trailers",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }

    def start_requests(self):
        yield scrapy.Request(url='https://www.dawn.com/archive/2022-03-21', headers=self.headers)

    # Getting news headlines and their links
    def parse(self, response):
        titles = response.xpath("//h2[@class = 'story__title text-6 font-bold font-merriweather pt-1 pb-2 ']/a")

        for title in titles:
            headline = title.xpath(".//text()").get()
            headline_link = title.xpath(".//@href").get()
            # Iterating news headline links; cb_kwargs passes the headline
            # into parse_headline as a keyword argument
            yield response.follow(url=headline_link, callback=self.parse_headline,
                                  cb_kwargs={'headline': headline}, headers=self.headers)

        # Code for going to previous pages; stop when there is no earlier page link
        prev_page = response.xpath("//li[1]/a/@href").get()
        if prev_page:
            prev = 'https://www.dawn.com' + str(prev_page)
            yield scrapy.Request(url=prev, callback=self.parse, headers=self.headers)

    def parse_headline(self, response, headline):
        # logging.info(response.url)
        full_detail = response.xpath("//div[contains(@class , story__content)]/p[1]")
        date_and_time = response.xpath("//span[@class='timestamp--date']/text()").get()
        for detail in full_detail:
            data = detail.xpath(".//text()").get()
            yield {
                'headline': headline,
                'date_and_time': date_and_time,
                'details': data
            }
```
main.py:
```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


if __name__ == "__main__":
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl('dawn')
    process.start()
```
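With both files in place, you can start the crawl with `python main.py` instead of the `scrapy crawl` command (this assumes `main.py` sits in the project root next to `scrapy.cfg`, so `get_project_settings()` can find the project's settings module). The `FEEDS` setting in the spider then writes the scraped items to `data.csv` as the crawl runs.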