I'm making a Python web scraper for a project. It's getting all the info that I want, but the only problem is that it does this for the first profile only, without getting the others.
I tried to find the problem but I'm stuck; any kind of advice would be helpful.
import requests
import pandas
from bs4 import BeautifulSoup

base_url = "https://www.ratemds.com/best-doctors/?page=1"

for page in range(1, 2, 1):
    r = requests.get(base_url)
    c = r.content
    soup = BeautifulSoup(c, 'html.parser')
    all = soup.find_all("div", {"class": "search-item doctor-profile"})
    l = []
    for item in all:
        d = {}
        d["Name"] = item.find("a", {"class": "search-item-doctor-link"}).text
        d["Phone Number"] = item.find("div", {"class": "search-item-specialty"}).text
        n = item.find("a", {"class": "search-item-doctor-link"})
        a = n.get('href')
        new_url = "https://www.ratemds.com" + a
        r1 = requests.get(new_url)
        c1 = r1.content
        soup1 = BeautifulSoup(c1, 'html.parser')
        sve = soup1.find_all("div", {"class": "col-sm-3 col-md-4 search-item-extra"})
        for profil in sve:
            try:
                d["Phone Number"] = profil.find("meta", itemprop="telephone")["content"]
            except:
                d["Phone Number"] = None
            try:
                d["Address"] = profil.find("meta", itemprop="streetAddress")["content"]
            except:
                d["Address"] = None
            try:
                d["Website"] = profil.find("a", itemprop="sameAs")["href"]
            except:
                d["Website"] = None
            pass
    l.append(d)

df = pandas.DataFrame(l)
df.to_csv("123.csv")
print(df)
Answer
Here is your code with a couple of adjustments:
import requests
import pandas
from bs4 import BeautifulSoup

base_url = "https://www.ratemds.com/best-doctors/?page={}"  # Changed base url to a template

# Moved the list of dicts outside of the main loop
l = []
for page in range(1, 5):
    r = requests.get(base_url.format(page))  # substitute 'page' variable in base_url
    c = r.content
    soup = BeautifulSoup(c, 'html.parser')
    all = soup.find_all("div", {"class": "search-item doctor-profile"})
    for item in all:
        d = {}
        d["Name"] = item.find("a", {"class": "search-item-doctor-link"}).text
        d["Phone Number"] = item.find("div", {"class": "search-item-specialty"}).text
        n = item.find("a", {"class": "search-item-doctor-link"})
        a = n.get('href')
        new_url = "https://www.ratemds.com" + a
        r1 = requests.get(new_url)
        c1 = r1.content
        soup1 = BeautifulSoup(c1, 'html.parser')
        sve = soup1.find_all("div", {"class": "col-sm-3 col-md-4 search-item-extra"})
        for profil in sve:
            try:
                d["Phone Number"] = profil.find("meta", itemprop="telephone")["content"]
            except:
                d["Phone Number"] = None
            try:
                d["Address"] = profil.find("meta", itemprop="streetAddress")["content"]
            except:
                d["Address"] = None
            try:
                d["Website"] = profil.find("a", itemprop="sameAs")["href"]
            except:
                d["Website"] = None
        l.append(d)  # indented this line to append within this loop

df = pandas.DataFrame(l)  # use 'pandas' to match the import above
df.to_csv("123.csv")
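As a side note, the three repeated try/except blocks can be collapsed into a small helper. This is just a sketch to illustrate the idea (the helper name get_itemprop is my own, not part of BeautifulSoup): Tag.get() returns None instead of raising when an attribute is missing, so the bare excepts aren't needed.

import time

def get_itemprop(profile, tag_name, prop, attr):
    # Find the first tag matching itemprop=prop and return the requested
    # attribute, or None if either the tag or the attribute is missing.
    node = profile.find(tag_name, itemprop=prop)
    return node.get(attr) if node else None

# The body of the `for profil in sve:` loop then becomes:
for profil in sve:
    d["Phone Number"] = get_itemprop(profil, "meta", "telephone", "content")
    d["Address"] = get_itemprop(profil, "meta", "streetAddress", "content")
    d["Website"] = get_itemprop(profil, "a", "sameAs", "href")

time.sleep(1)  # optional: pause between profile requests to be polite to the server

Since you're making one extra request per doctor, the optional sleep (and reusing a single requests.Session for all calls) also reduces the chance of the site throttling you.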