I want to extract the name, position and email of every person listed on the webpage. I managed to extract the name and position, but the email cannot be extracted because you have to contact each person to learn it, so I just want to extract each person's contact URL instead. Every person has a different contact URL. I want the CSV file to contain a third column, Contact, in each person's row, so that clicking the contact entry takes me to that person's contact page. Here is my code:
"""Scrape the staff directory into FCPS.csv: name, position and contact URL."""
import requests
from bs4 import BeautifulSoup
from csv import writer
from urllib.parse import urljoin

url = 'https://fairfaxhs.fcps.edu/staff-directory?field_last_name_from=&field_last_name_to=&items_per_page=10&keywords=&page={page}'


def cell_text(row, css_class):
    """Return the stripped text of the row's <td> with that class, or 'N/A'."""
    cell = row.find('td', class_=css_class)
    return cell.get_text(strip=True) if cell else 'N/A'


def contact_url(row, page_url):
    """Return the absolute URL of the link in the row's contact cell, or 'N/A'."""
    cell = row.find('td', class_='views-field views-field-rendered-item')
    link = cell.find('a', href=True) if cell else None
    # The href is resolved against the listing page so the CSV holds a full,
    # clickable address even when the page only provides a relative path.
    return urljoin(page_url, link['href']) if link else 'N/A'


# Open the file once and write the header once. Mode 'w' replaces the output of
# a previous run instead of appending a second header and duplicate rows.
with open('FCPS.csv', 'w', encoding='utf8', newline='') as csv_file:
    thewriter = writer(csv_file)
    thewriter.writerow(['Name', 'Position', 'Contact'])

    for page in range(0, 30):
        page_url = url.format(page=page)
        response = requests.get(page_url, timeout=30)
        # Fail loudly on an HTTP error rather than parsing an error page and
        # silently producing no rows for it.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Every page, including page 0, contributes rows.
        for row in soup.find_all('tr'):
            # Rows without any <td> (presumably the table's header row)
            # carry no staff data.
            if row.find('td') is None:
                continue
            name = cell_text(row, 'views-field views-field-field-last-name')
            position = cell_text(row, 'views-field views-field-field-staff-title')
            thewriter.writerow([name, position, contact_url(row, page_url)])
Advertisement
Answer
You can grab the data you need from each listing page and from each person's detail page,
as in the following example:
"""Collect name, position and contact URL from each staff profile into out.csv."""
import pandas as pd
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

url = 'https://fairfaxhs.fcps.edu/staff-directory?field_last_name_from=&field_last_name_to=&items_per_page=10&keywords=&page={page}'


def text_or_na(node):
    """Return the stripped text of a parsed element, or 'N/A' when it is missing."""
    return node.get_text(strip=True) if node is not None else 'N/A'


data = []
for page in range(0, 30):
    page_url = url.format(page=page)
    soup = BeautifulSoup(requests.get(page_url, timeout=30).text, 'lxml')

    # A listing page without a table has nothing to follow.
    if soup.table is None:
        continue

    for cell in soup.table.select('tr td[class="views-field views-field-rendered-item"]'):
        link = cell.find('a', href=True)
        if link is None:
            continue
        # Resolve the profile link against the listing page to get a full URL.
        u = urljoin(page_url, link['href'])

        # Handle failures per profile: one unreachable page must not silently
        # discard the remaining people on the same listing page.
        try:
            profile = requests.get(u, timeout=30)
            profile.raise_for_status()
        except requests.RequestException as exc:
            print(f'Skipping {u}: {exc}')
            continue

        soup2 = BeautifulSoup(profile.text, 'lxml')
        data.append({
            'Name': text_or_na(soup2.select_one('h1.node__title.fcps-color--dark11')),
            'Position': text_or_na(soup2.select_one('h1+div')),
            'contact_url': u,
        })

# to_csv returns None when given a path, so its result is not kept.
pd.DataFrame(data).to_csv('out.csv', index=False)