# Scrapy News Crawler
```python
# Importing Scrapy library
import scrapy

# Defining spider's URL and headers
class DawnSpider(scrapy.Spider):
    name = 'dawn'
    allowed_domains = ['www.dawn.com']  # Channel link
    # start_urls = ['https://www.dawn.com/archive/2022-02-09']
    # url = ['https://www.dawn.com']
    # page = 1

    # Defining function to set headers and set the link from where to start scraping
    def start_requests(self):
        yield scrapy.Request(url='https://www.dawn.com/archive/2022-03-21', callback=self.parse,
                             headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'})

    # Getting news headlines and their links
    def parse(self, response):
        titles = response.xpath("//h2[@class = 'story__title text-6 font-bold font-merriweather pt-1 pb-2 ']/a")
        for title in titles:
            headline = title.xpath(".//text()").get()
            headline_link = title.xpath(".//@href").get()
            # Iterating news headline links
            yield response.follow(url=headline_link, callback=self.parse_headline,
                                  meta={'heading': headline},
                                  headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'})
        # Code for going to previous pages
        prev_page = response.xpath("//li[1]/a/@href").get()
        prev = 'https://www.dawn.com' + str(prev_page)
        yield scrapy.Request(url=prev, callback=self.parse,
                             headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'})

    # Iterating headline links and getting headline details and date/time
    def parse_headline(self, response):
        headline = response.request.meta['heading']
        # logging.info(response.url)
        full_detail = response.xpath("//div[contains(@class , story__content)]/p[1]")
        date_and_time = response.xpath("//span[@class='timestamp--date']/text()").get()
        for detail in full_detail:
            data = detail.xpath(".//text()").get()
            yield {
                'headline': headline,
                'date_and_time': date_and_time,
                'details': data
            }
```
Python script (separate file):

```python
from scrapy import cmdline

cmdline.execute("scrapy crawl dawn -o data.csv".split(" "))
```
## Answer
- Instead of running your spider with `cmdline.execute`, you can run it with `CrawlerProcess`; read about common practices in the Scrapy docs. You can see `main.py` below as an example.
- You can declare the headers once.
- You're getting a lot of 403s, so you should add a download delay to avoid getting banned (a dynamic alternative is sketched after this list).
- You can use feeds export for the csv file.
- It's possible you're interrupting the writing of the csv file, but that's only a guess.
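
If a fixed download delay feels too blunt, Scrapy's built-in AutoThrottle extension adjusts the delay to the server's response times instead. A minimal sketch of the relevant settings, with illustrative numbers that are assumptions rather than values tuned for dawn.com:

```python
# settings.py (or the spider's custom_settings) - AutoThrottle sketch;
# the numbers here are illustrative, not tuned for this site
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0          # initial delay between requests
AUTOTHROTTLE_MAX_DELAY = 10.0           # upper bound when the server slows down
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0   # average concurrent requests per remote server
RETRY_HTTP_CODES = [403, 429, 500, 502, 503, 504]  # also retry 403s that hint at throttling
```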
Here's a working example (I checked it with `'CLOSESPIDER_ITEMCOUNT': 10`, so give it some time when you run it).
spider.py:
```python
# Importing Scrapy library
import scrapy

# Defining spider's URL and headers
class DawnSpider(scrapy.Spider):
    name = 'dawn'
    allowed_domains = ['dawn.com']  # Channel link
    # start_urls = ['https://www.dawn.com/archive/2022-02-09']
    # url = ['https://www.dawn.com']
    # page = 1

    custom_settings = {
        'DOWNLOAD_DELAY': 0.8,
        'FEEDS': {'data.csv': {'format': 'csv'}},
    }

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": "scribe=true",
        "DNT": "1",
        "Host": "www.dawn.com",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Sec-GPC": "1",
        "TE": "trailers",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }

    def start_requests(self):
        yield scrapy.Request(url='https://www.dawn.com/archive/2022-03-21', headers=self.headers)

    # Getting news headlines and their links
    def parse(self, response):
        titles = response.xpath("//h2[@class = 'story__title text-6 font-bold font-merriweather pt-1 pb-2 ']/a")
        for title in titles:
            headline = title.xpath(".//text()").get()
            headline_link = title.xpath(".//@href").get()
            # Iterating news headline links
            yield response.follow(url=headline_link, callback=self.parse_headline,
                                  cb_kwargs={'headline': headline}, headers=self.headers)
        # Code for going to previous pages
        prev_page = response.xpath("//li[1]/a/@href").get()
        if prev_page:
            prev = 'https://www.dawn.com' + str(prev_page)
            yield scrapy.Request(url=prev, callback=self.parse, headers=self.headers)

    def parse_headline(self, response, headline):
        # logging.info(response.url)
        full_detail = response.xpath("//div[contains(@class , story__content)]/p[1]")
        date_and_time = response.xpath("//span[@class='timestamp--date']/text()").get()
        for detail in full_detail:
            data = detail.xpath(".//text()").get()
            yield {
                'headline': headline,
                'date_and_time': date_and_time,
                'details': data
            }
```
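
Note the switch from `meta={'heading': headline}` to `cb_kwargs={'headline': headline}`: `cb_kwargs` hands the value to the callback as a plain keyword argument, so `parse_headline` can declare it as a parameter instead of digging it out of `response.request.meta`. A stripped-down sketch of the pattern, where the spider name, URL, and field names are placeholders:

```python
import scrapy

class CbKwargsSketch(scrapy.Spider):
    name = 'cb_kwargs_sketch'          # placeholder spider
    start_urls = ['https://example.com']  # placeholder URL

    def parse(self, response):
        # Pass extracted data forward as a keyword argument
        yield response.follow('/detail', callback=self.parse_detail,
                              cb_kwargs={'headline': 'some title'})

    def parse_detail(self, response, headline):
        # 'headline' arrives as a normal parameter, no meta lookup needed
        yield {'headline': headline, 'url': response.url}
```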
main.py:
```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl('dawn')
    process.start()
```
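
If you want to reproduce the quick `CLOSESPIDER_ITEMCOUNT` check mentioned above, you can cap the run from `main.py` before starting the crawl; a sketch reusing the same structure:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    settings = get_project_settings()
    # Stop after ~10 items so a test run finishes quickly
    settings.set('CLOSESPIDER_ITEMCOUNT', 10)
    process = CrawlerProcess(settings)
    process.crawl('dawn')
    process.start()
```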