
Python web scraper: my script is only printing the first one, instead of all?

I'm making a Python web scraper for a project. It's getting all the info that I want, but the problem is that it only does this for the first profile and never gets the others.

I've tried to find the problem but I'm stuck. Any kind of advice would be helpful.

    import requests
    import pandas
    from bs4 import BeautifulSoup
    
    
    base_url = "https://www.ratemds.com/best-doctors/?page=1"
    for page in range(1, 2, 1):
        r = requests.get(base_url)
        c = r.content
        soup = BeautifulSoup(c, 'html.parser')
        all = soup.find_all("div", {"class": "search-item doctor-profile"})
        l = []
        for item in all:
            d = {}
            d["Name"] = item.find("a", {"class": "search-item-doctor-link"}).text
            d["Phone Number"] = item.find("div", {"class": "search-item-specialty"}).text
            n = item.find("a", {"class": "search-item-doctor-link"})
            a = n.get('href')
            new_url = ("https://www.ratemds.com"+a)
            r1 = requests.get(new_url)
            c1 = r1.content
            soup1 = BeautifulSoup(c1, 'html.parser')
            sve = soup1.find_all("div", {"class": "col-sm-3 col-md-4 search-item-extra"})
            for profil in sve:
                try:
                    d["Phone Number"] = profil.find("meta", itemprop = "telephone")["content"]
                except:
                    d["Phone Number"] = None
                try:
                    d["Adress"] =  profil.find("meta", itemprop = "streetAddress")["content"]
                except:
                    d["Adress"] = None
                try:
                    d["Website"] =  profil.find("a", itemprop = "sameAs")["href"]
                except:
                    d["Website"] = None
                pass
    l.append(d)
    df = pandas.DataFrame(l)
    df.to_csv("123.csv")
    print(df)


Answer

Here is your code with a couple of adjustments:

import requests
import pandas
from bs4 import BeautifulSoup

base_url = "https://www.ratemds.com/best-doctors/?page={}"  # Change base url to this
# Moved the list of dicts outside of the main loop
l = []

for page in range(1, 5):
    r = requests.get(base_url.format(page))   #  substitute 'page' variable in base_url
    c = r.content
    soup = BeautifulSoup(c, 'html.parser')
    all = soup.find_all("div", {"class": "search-item doctor-profile"})
    for item in all:
        d = {}
        d["Name"] = item.find("a", {"class": "search-item-doctor-link"}).text
        d["Phone Number"] = item.find("div", {"class": "search-item-specialty"}).text
        n = item.find("a", {"class": "search-item-doctor-link"})
        a = n.get('href')
        new_url = ("https://www.ratemds.com"+a)
        r1 = requests.get(new_url)
        c1 = r1.content
        soup1 = BeautifulSoup(c1, 'html.parser')
        sve = soup1.find_all("div", {"class": "col-sm-3 col-md-4 search-item-extra"})
        for profil in sve:
            try:
                d["Phone Number"] = profil.find("meta", itemprop = "telephone")["content"]
            except:
                d["Phone Number"] = None
            try:
                d["Adress"] =  profil.find("meta", itemprop = "streetAddress")["content"]
            except:
                d["Adress"] = None
            try:
                d["Website"] =  profil.find("a", itemprop = "sameAs")["href"]
            except:
                d["Website"] = None
            pass
        l.append(d)  # indented this line to append within this loop

df = pandas.DataFrame(l)
df.to_csv("123.csv")
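The two adjustments that matter are substituting the page number into the URL with base_url.format(page) (otherwise every request fetches page 1), and keeping l = [] outside the loops while appending d inside the item loop, so one dictionary is kept per profile rather than a single one. Here is a minimal sketch with toy data (not the scraper itself, and the names records/d are just illustrative) showing why the placement of the append matters:

records = []
for item in ["a", "b", "c"]:
    d = {"value": item}
records.append(d)      # outside the loop: only the last dict survives
print(records)         # [{'value': 'c'}]

records = []
for item in ["a", "b", "c"]:
    d = {"value": item}
    records.append(d)  # inside the loop: one dict per item
print(records)         # [{'value': 'a'}, {'value': 'b'}, {'value': 'c'}]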