# Fetch the landing page and pull the genre entries out of its sidebar.
url = requests.get('http://books.toscrape.com/index.html')
soup = BeautifulSoup(url.text, 'html.parser')
navlist = (
    soup.select('.nav-list')[0]
    .find('li')
    .find('ul')
    .findAll('li')
)

# List for scraped novels.
novel_list = []

# List of genres on the website: the text of each sidebar link,
# trimmed and lower-cased.
genre_list = [entry.find('a').getText().strip().lower() for entry in navlist]
# Loop through genre_list and scrape every listing page of every genre,
# appending one dict per book (Title, Rating, Price, Genre) to novel_list.

# A book's star rating is exposed as a CSS class on one of its <p> elements.
rate_list = ['One', 'Two', 'Three', 'Four', 'Five']

for x, y in enumerate(genre_list):
    # The URL slug uses hyphens where the displayed genre name has spaces;
    # a raw space in the path makes the request fail.
    slug = y.replace(' ', '-')
    # NOTE(review): assumes category ids follow sidebar order starting at 2,
    # as the original x + 2 did -- confirm against the site.
    base = f'http://books.toscrape.com/catalogue/category/books/{slug}_{x + 2}'

    # index.html is the first listing page of a genre; any further pages
    # are named page-2.html, page-3.html, ...
    count = 1
    url = requests.get(f'{base}/index.html')

    # A Response is falsy when its status code is an error (4xx/5xx), so
    # the loop ends at the first page that does not exist.
    while url:
        # Decode as UTF-8 explicitly: with requests' fallback encoding the
        # pound sign comes out prefixed by a stray 'Â' and non-ASCII titles
        # are garbled.
        url.encoding = 'utf-8'
        soup1 = BeautifulSoup(url.text, 'html.parser')
        novel = soup1.select('.product_pod')

        # Loop over each novel to get its title, cost and rating.
        for book in novel:
            title = book.find('h3').find('a').get('title')
            cost = book.find('p', class_='price_color').getText().strip()

            # Find which rating class this novel carries.
            for rating in rate_list:
                if book.find('p', class_=rating):
                    novel_list.append({
                        'Title': title,
                        'Rating': rating,
                        'Price': cost,
                        'Genre': y,
                    })
                    # A book has a single rating; stop at the first match.
                    break

        count += 1
        url = requests.get(f'{base}/page-{count}.html')
# Write the scraped novels to novel.csv: a header row, then one row per novel.
fieldnames = ['Title', 'Rating', 'Price', 'Genre']
with open('novel.csv', 'w', encoding="utf-8", newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(novel_list)
Advertisement
Answer
Use a for loop over the page numbers, collect the title, rating and cost of each book into lists, load those lists into a pandas DataFrame, and then write it out with to_csv.
Code:
# Scrape title, rating and cost of every book from the 50 catalogue pages
# and save them to Titlebooks.csv via a pandas DataFrame.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'http://books.toscrape.com/catalogue/page-{}.html'
title = []
ratings = []
cost = []

for page in range(1, 51):
    finalurl = url.format(page)
    response = requests.get(finalurl)
    # Decode as UTF-8 explicitly: with requests' fallback encoding the
    # price text gains a stray leading 'Â' and non-ASCII titles are garbled.
    response.encoding = 'utf-8'
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')

    # The three selectors each yield one element per book, in page order.
    for t, r, c in zip(soup.select('.image_container >a>img'),
                       soup.select('p.star-rating'),
                       soup.select('p.price_color')):
        title.append(t['alt'])
        # The last CSS class of the rating element is the rating word
        # ('One' .. 'Five').
        ratings.append(r.attrs['class'][-1])
        # With the text decoded correctly there is no junk character to
        # slice off; the cost keeps its currency sign (e.g. '£51.77').
        cost.append(c.text)

df = pd.DataFrame({"Title": title, "Ratings": ratings, "Cost": cost})
print(df)
df.to_csv('Titlebooks.csv')
Output on console:
Cost Ratings Title 0 £51.77 Three A Light in the Attic 1 £53.74 One Tipping the Velvet 2 £50.10 One Soumission 3 £47.82 Four Sharp Objects 4 £54.23 Five Sapiens: A Brief History of Humankind 5 £22.65 One The Requiem Red 6 £33.34 Four The Dirty Little Secrets of Getting Your Dream... 7 £17.93 Three The Coming Woman: A Novel Based on the Life of... 8 £22.60 Four The Boys in the Boat: Nine Americans and Their... 9 £52.15 One The Black Maria 10 £13.99 Two Starving Hearts (Triangular Trade Trilogy, #1) 11 £20.66 Four Shakespeare's Sonnets 12 £17.46 Five Set Me Free 13 £52.29 Five Scott Pilgrim's Precious Little Life (Scott Pi... 14 £35.02 Five Rip it Up and Start Again 15 £57.25 Three Our Band Could Be Your Life: Scenes from the A... 16 £23.88 One Olio 17 £37.59 One Mesaerion: The Best Science Fiction Stories 18... 18 £51.33 Two Libertarianism for Beginners 19 £45.17 Two It's Only the Himalayas 20 £12.84 One In Her Wake 21 £37.32 Two How Music Works 22 £30.52 Three Foolproof Preserving: A Guide to Small Batch J... 23 £25.27 Five Chase Me (Paris Nights #2) 24 £34.53 Five Black Dust 25 £54.64 Three Birdsong: A Story in Pictures 26 £22.50 Three America's Cradle of Quarterbacks: Western Penn... 27 £53.13 Three Aladdin and His Wonderful Lamp 28 £40.30 Five Worlds Elsewhere: Journeys Around Shakespeareâ... 29 £44.18 Four Wall and Piece .. ... ... ... 970 £24.89 Three Lord of the Flies 971 £58.99 Three Listen to Me (Fusion #1) 972 £57.20 Five Kitchens of the Great Midwest 973 £38.43 Five Jane Eyre 974 £34.74 Four Imperfect Harmony 975 £40.44 Four Icing (Aces Hockey #2) 976 £45.24 Three Hawkeye, Vol. 1: My Life as a Weapon (Hawkeye #1) 977 £34.96 Four Having the Barbarian's Baby (Ice Planet Barbar... 978 £56.76 Four Giant Days, Vol. 1 (Giant Days #1-4) 979 £40.28 Five Fruits Basket, Vol. 
1 (Fruits Basket #1) 980 £38.00 Two Frankenstein 981 £28.80 Three Forever Rockers (The Rocker #12) 982 £39.24 Three Fighting Fate (Fighting #6) 983 £32.93 Two Emma 984 £51.32 Three Eat, Pray, Love 985 £47.09 Five Deep Under (Walker Security #1) 986 £28.42 Four Choosing Our Religion: The Spiritual Lives of ... 987 £22.85 Three Charlie and the Chocolate Factory (Charlie Buc... 988 £41.24 One Charity's Cross (Charles Towne Belles #4) 989 £39.07 Five Bright Lines 990 £29.82 One Bridget Jones's Diary (Bridget Jones #1) 991 £37.26 Four Bounty (Colorado Mountain #7) 992 £20.30 Three Blood Defense (Samantha Brinkman #1) 993 £34.65 Five Bleach, Vol. 1: Strawberry and the Soul Reaper... 994 £43.38 One Beyond Good and Evil 995 £55.53 One Alice in Wonderland (Alice's Adventures in Won... 996 £57.06 Four Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1) 997 £16.97 Five A Spy's Devotion (The Regency Spies of London #1) 998 £53.98 One 1st to Die (Women's Murder Club #1) 999 £26.08 Five 1,000 Places to See Before You Die [1000 rows x 3 columns]
If you don’t want to hardcode the final page count, try this instead:
# Scrape title, rating and cost of every book in the catalogue, reading the
# number of pages from the site instead of hard-coding it, and save the
# result to Titlebooks.csv via a pandas DataFrame.
from bs4 import BeautifulSoup
import requests
import pandas as pd

res = requests.get("http://books.toscrape.com/index.html").text
soup = BeautifulSoup(res, 'html.parser')

# Get the total page count: the pager text ends with "of <total>".
pagecount = soup.select_one('.current').text.split('of')[-1].strip()

title = []
ratings = []
cost = []

for page in range(1, int(pagecount) + 1):
    finalurl = "http://books.toscrape.com/catalogue/page-{}.html".format(page)
    response = requests.get(finalurl)
    # Decode as UTF-8 explicitly: with requests' fallback encoding the
    # price text gains a stray leading 'Â' and non-ASCII titles are garbled.
    response.encoding = 'utf-8'
    res = response.text
    soup = BeautifulSoup(res, 'html.parser')

    # The three selectors each yield one element per book, in page order.
    for t, r, c in zip(soup.select('.image_container >a>img'),
                       soup.select('p.star-rating'),
                       soup.select('p.price_color')):
        title.append(t['alt'])
        # The last CSS class of the rating element is the rating word
        # ('One' .. 'Five').
        ratings.append(r.attrs['class'][-1])
        # With the text decoded correctly there is no junk character to
        # slice off; the cost keeps its currency sign (e.g. '£51.77').
        cost.append(c.text)

df = pd.DataFrame({"Title": title, "Ratings": ratings, "Cost": cost})
print(df)
df.to_csv('Titlebooks.csv')