Skip to content
Advertisement

How can I scrape all 1000 novels? My code scrapes only 691.

URL to scrape the genres from

# Download the landing page and collect the <li> items of the list nested
# under the first entry of the first `.nav-list` element.
url = requests.get('http://books.toscrape.com/index.html')
soup = BeautifulSoup(url.text, 'html.parser')
nav_root = soup.select('.nav-list')[0]
navlist = nav_root.find('li').find('ul').find_all('li')

List of the genres on the website

genre_list = []  # filled below with the stripped, lower-cased link text of each navlist entry

List for scraped Novels

novel_list = []  # one dict per scraped novel, keyed 'Title', 'Rating', 'Price', 'Genre'

For Loop to loop through navlist to append genre into the genre_list

# Record the link text of every navigation entry, trimmed and lower-cased.
genre_list.extend(
    entry.find('a').getText().strip().lower() for entry in navlist
)

For Loop to loop through the genre_list

# For every genre, request its category pages one by one and append a dict
# (title, rating, price, genre) to novel_list for each novel found.
# NOTE(review): the category URL is built from the genre name exactly as stored
# in genre_list (lower-cased, inner spaces kept) plus the list position + 2.
# Presumably this only matches the site's real category paths for some genres,
# which would explain the missing novels — verify against the hrefs in navlist.
for x, y in enumerate(genre_list):

    # Page number used in the "page…{count}.html" URLs of this genre.
    count = 1
    
    # NOTE(review): as shown here the f-string literal below is split across two
    # lines, so the line break and indentation sit between "page" and "{count}";
    # confirm the intended separator against the site's real page URLs.
    url_1=requests.get(f'http://books.toscrape.com/catalogue/category/books/{y}_{x+2}/page 
          {count}.html')
    # Second candidate for the same genre: its index.html.
    url_2 = requests.get(f'http://books.toscrape.com/catalogue/category/books/{y}_{x+2}/index.html')
    url = url_1
    # Use index.html when the numbered page answers 404; the else branch only
    # re-assigns the value url already holds.
    if url.status_code == 404:
        url = url_2
    else:
        url = url_1
    # NOTE(review): the loop runs while the Response object is truthy —
    # presumably requests makes error responses falsy; confirm in its docs.
    while url:

        soup1 = BeautifulSoup(url.text, 'html.parser')
        # Every element with class `product_pod` is treated as one novel.
        novel = soup1.select('.product_pod')
        # Advance the page number for the request at the bottom of the loop.
        count += 1
       

for loop to loop each novel to get its title, cost, ratings

        for i, j in enumerate(novel):
            # Candidate rating words; each is looked up as a CSS class below.
            rate_list = ['One', 'Two', 'Three', 'Four', 'Five']
            novel_1 = novel[i].find('h3')
            title = novel_1.find('a').get('title')
            # strip('Â') removes that character from both ends of the price text.
            # NOTE(review): looks like a workaround for a mis-decoded '£' — confirm
            # the encoding used for url.text.
            cost = novel[i].find('p', class_='price_color').getText().strip('Â')
            

Loop to find novels of a particular rating

            # A novel is recorded once for every rating word that matches a <p>
            # class inside it.
            for rating in rate_list:
                rate = novel[i].find('p', class_=f'{rating}')
                if rate:
                    novel_list.append({'Title': title, 'Rating': rating, 'Price': cost, 'Genre': y})
        # A genre that was served from index.html is not paged any further.
        if url == url_2:
            break
        else:
            # Otherwise request the next numbered page (same split-literal caveat
            # as for url_1 above).
            url=requests.get(f'http://books.toscrape.com/catalogue/category/books/{y}_{x + 2}/page
                {count}.html')

Writing these novels into my CSV file

# Dump the collected novels to novel.csv: a header row, then one row per dict.
with open('novel.csv', mode='w', encoding="utf-8", newline='') as csv_file:
    columns = ['Title', 'Rating', 'Price', 'Genre']
    csv_writer = csv.DictWriter(csv_file, fieldnames=columns)
    csv_writer.writeheader()
    csv_writer.writerows(novel_list)

Advertisement

Answer

Use a for loop over the page numbers. Then load the collected data into a pandas DataFrame and write it out with to_csv.

Code:

from bs4 import BeautifulSoup
import requests
import pandas as pd
# Walk the 50 paginated catalogue pages, collect the title, star rating and
# price of every book, print the resulting DataFrame and save it to
# Titlebooks.csv.
url='http://books.toscrape.com/catalogue/page-{}.html'

title=[]
ratings=[]
cost=[]
for page in range(1,51):
    finalurl=url.format(page)
    response=requests.get(finalurl, timeout=30)
    # Fail loudly on a missing or broken page instead of silently parsing an
    # error document and ending up with fewer rows than expected.
    response.raise_for_status()
    # Parse the raw bytes so BeautifulSoup works out the document's encoding
    # itself. Going through `.text` mis-decoded the page: the price gained a
    # stray leading character (previously cut off with `[1:]`) and non-ASCII
    # titles came out garbled.
    soup=BeautifulSoup(response.content,'html.parser')
    # Read all three fields from the same product element so they can never
    # get out of step, which zipping three independent selections could.
    for book in soup.select('.product_pod'):
        title.append(book.select_one('.image_container >a>img')['alt'])
        # The class list is e.g. ['star-rating', 'Three']; the last entry is
        # the rating word.
        ratings.append(book.select_one('p.star-rating').attrs['class'][-1])
        # With correct decoding the text is already "£51.77".
        cost.append(book.select_one('p.price_color').get_text(strip=True))

df = pd.DataFrame({"Title":title,"Ratings":ratings,"Cost":cost})
print(df)
df.to_csv('Titlebooks.csv')

Output on console:

      Cost Ratings                                              Title
0    £51.77   Three                               A Light in the Attic
1    £53.74     One                                 Tipping the Velvet
2    £50.10     One                                         Soumission
3    £47.82    Four                                      Sharp Objects
4    £54.23    Five              Sapiens: A Brief History of Humankind
5    £22.65     One                                    The Requiem Red
6    £33.34    Four  The Dirty Little Secrets of Getting Your Dream...
7    £17.93   Three  The Coming Woman: A Novel Based on the Life of...
8    £22.60    Four  The Boys in the Boat: Nine Americans and Their...
9    £52.15     One                                    The Black Maria
10   £13.99     Two     Starving Hearts (Triangular Trade Trilogy, #1)
11   £20.66    Four                              Shakespeare's Sonnets
12   £17.46    Five                                        Set Me Free
13   £52.29    Five  Scott Pilgrim's Precious Little Life (Scott Pi...
14   £35.02    Five                          Rip it Up and Start Again
15   £57.25   Three  Our Band Could Be Your Life: Scenes from the A...
16   £23.88     One                                               Olio
17   £37.59     One  Mesaerion: The Best Science Fiction Stories 18...
18   £51.33     Two                       Libertarianism for Beginners
19   £45.17     Two                            It's Only the Himalayas
20   £12.84     One                                        In Her Wake
21   £37.32     Two                                    How Music Works
22   £30.52   Three  Foolproof Preserving: A Guide to Small Batch J...
23   £25.27    Five                         Chase Me (Paris Nights #2)
24   £34.53    Five                                         Black Dust
25   £54.64   Three                      Birdsong: A Story in Pictures
26   £22.50   Three  America's Cradle of Quarterbacks: Western Penn...
27   £53.13   Three                     Aladdin and His Wonderful Lamp
28   £40.30    Five  Worlds Elsewhere: Journeys Around Shakespeareâ...
29   £44.18    Four                                     Wall and Piece
..      ...     ...                                                ...
970  £24.89   Three                                  Lord of the Flies
971  £58.99   Three                           Listen to Me (Fusion #1)
972  £57.20    Five                      Kitchens of the Great Midwest
973  £38.43    Five                                          Jane Eyre
974  £34.74    Four                                  Imperfect Harmony
975  £40.44    Four                             Icing (Aces Hockey #2)
976  £45.24   Three  Hawkeye, Vol. 1: My Life as a Weapon (Hawkeye #1)
977  £34.96    Four  Having the Barbarian's Baby (Ice Planet Barbar...
978  £56.76    Four               Giant Days, Vol. 1 (Giant Days #1-4)
979  £40.28    Five           Fruits Basket, Vol. 1 (Fruits Basket #1)
980  £38.00     Two                                       Frankenstein
981  £28.80   Three                   Forever Rockers (The Rocker #12)
982  £39.24   Three                        Fighting Fate (Fighting #6)
983  £32.93     Two                                               Emma
984  £51.32   Three                                    Eat, Pray, Love
985  £47.09    Five                    Deep Under (Walker Security #1)
986  £28.42    Four  Choosing Our Religion: The Spiritual Lives of ...
987  £22.85   Three  Charlie and the Chocolate Factory (Charlie Buc...
988  £41.24     One          Charity's Cross (Charles Towne Belles #4)
989  £39.07    Five                                       Bright Lines
990  £29.82     One           Bridget Jones's Diary (Bridget Jones #1)
991  £37.26    Four                      Bounty (Colorado Mountain #7)
992  £20.30   Three               Blood Defense (Samantha Brinkman #1)
993  £34.65    Five  Bleach, Vol. 1: Strawberry and the Soul Reaper...
994  £43.38     One                               Beyond Good and Evil
995  £55.53     One  Alice in Wonderland (Alice's Adventures in Won...
996  £57.06    Four   Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)
997  £16.97    Five  A Spy's Devotion (The Regency Spies of London #1)
998  £53.98     One                1st to Die (Women's Murder Club #1)
999  £26.08    Five                 1,000 Places to See Before You Die

[1000 rows x 3 columns]

If you don’t want to hardcode the final page count, try this:

from bs4 import BeautifulSoup
import requests
import pandas as pd

# Read the total page count from the landing page's pager, then walk every
# catalogue page, collect the title, star rating and price of each book, print
# the resulting DataFrame and save it to Titlebooks.csv.
response=requests.get("http://books.toscrape.com/index.html", timeout=30)
response.raise_for_status()
# Parse the raw bytes so BeautifulSoup works out the document's encoding itself
# instead of relying on the decoding behind `.text`.
soup=BeautifulSoup(response.content,'html.parser')
#Get the total page count from the pager text ("Page 1 of 50" -> 50).
#When there is no pager element, assume the catalogue has a single page.
pager=soup.select_one('.current')
pagecount=int(pager.text.split('of')[-1].strip()) if pager else 1

title=[]
ratings=[]
cost=[]
for page in range(1,pagecount+1):
    finalurl="http://books.toscrape.com/catalogue/page-{}.html".format(page)
    response=requests.get(finalurl, timeout=30)
    # Fail loudly on a missing or broken page instead of silently parsing an
    # error document and ending up with fewer rows than expected.
    response.raise_for_status()
    # `.text` mis-decoded the page: the price gained a stray leading character
    # (previously cut off with `[1:]`) and non-ASCII titles came out garbled.
    soup=BeautifulSoup(response.content,'html.parser')
    # Read all three fields from the same product element so they can never
    # get out of step, which zipping three independent selections could.
    for book in soup.select('.product_pod'):
        title.append(book.select_one('.image_container >a>img')['alt'])
        # The class list is e.g. ['star-rating', 'Three']; the last entry is
        # the rating word.
        ratings.append(book.select_one('p.star-rating').attrs['class'][-1])
        # With correct decoding the text is already "£51.77".
        cost.append(book.select_one('p.price_color').get_text(strip=True))

df = pd.DataFrame({"Title":title,"Ratings":ratings,"Cost":cost})
print(df)
df.to_csv('Titlebooks.csv')
User contributions licensed under: CC BY-SA
5 People found this is helpful
Advertisement