I’m very new to web scraping and am trying to build an algorithm to pull all of the information from my school’s course catalog. What I have so far is:
import requests
from bs4 import BeautifulSoup
import selenium
from selenium.webdriver.support.ui import Select
from selenium import webdriver
import os
import time
from selenium.common import exceptions

driver = webdriver.Chrome()
url = "https://webapps.lsa.umich.edu/CrsMaint/Public/CB_PublicBulletin.aspx?crselevel=ug/robots.txt"
driver.get(url)
time.sleep(1)

driver.find_element_by_xpath('//*[@id="ContentPlaceHolder1_ddlPage"]/option[4]').click()
driver.find_element_by_xpath('//*[@name="ctl00$ContentPlaceHolder1$ddlTerm"]/option[1]').click()
driver.find_element_by_xpath('//*[@name="ctl00$ContentPlaceHolder1$ddlSubject"]/option[8]').click()
driver.find_element_by_xpath('//*[@id="ContentPlaceHolder1_btnSearch"]').click()
I had a lot more, but I keep running into Selenium errors about not being able to locate elements even though the selectors look correct. Can anyone get me on the right track? I’m trying to pull all of the information!
Cheers
Answer
I’ve played around with your code and used it as a base for something a bit more like what you’d expect.
Try this:
import textwrap

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.headless = False  # set to True to run without opening a browser window
driver = webdriver.Chrome(options=options)

driver.get("https://webapps.lsa.umich.edu/CrsMaint/Public/CB_PublicBulletin.aspx")

# Make the same selections as in your code, then run the search.
driver.find_element_by_xpath('//*[@id="ContentPlaceHolder1_ddlPage"]/option[4]').click()
driver.find_element_by_xpath('//*[@name="ctl00$ContentPlaceHolder1$ddlTerm"]/option[1]').click()
driver.find_element_by_xpath('//*[@name="ctl00$ContentPlaceHolder1$ddlSubject"]/option[8]').click()
driver.find_element_by_xpath('//*[@id="ContentPlaceHolder1_btnSearch"]').click()

# Every course entry lives in a <td valign="top"> cell of the results page.
tables = BeautifulSoup(driver.page_source, "html.parser").find_all("td", {"valign": "top"})


def wrapper(text: str, width: int = 120) -> str:
    # Wrap long text so the terminal output stays readable.
    return "\n".join(textwrap.wrap(text, width=width)) + "\n"


for table in tables:
    try:
        title = table.find("b").getText(strip=True)
        course_info = " ".join(table.find("i").text.split())
        desc = table.find("p").getText(strip=True)
        urls = [f"{a.text.strip()} - {a['href']}" for a in table.find_all("a")]
        print(title)
        print(wrapper(course_info))
        print(wrapper(desc))
        print("\n".join(urls) if urls else "No URLs found.")
        print("-" * 120)
    except AttributeError:
        # Cells that aren't full course entries are missing <b>/<i>/<p>; skip them.
        continue

driver.close()
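A side note on the “unable to locate element” errors you were seeing: this page posts back whenever you change a dropdown, so elements can load late or go stale, and a fixed time.sleep() is fragile. An explicit wait is more robust, and since you already import Select, you can use it to pick dropdown options instead of clicking option nodes by XPath. Here is a minimal sketch of both ideas, assuming the newer Selenium 4 find_element/By API (the find_element_by_xpath calls above are from the older API); the element names and IDs are the same ones from your code:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://webapps.lsa.umich.edu/CrsMaint/Public/CB_PublicBulletin.aspx")

# Wait up to 10 seconds for each element instead of sleeping a fixed amount.
wait = WebDriverWait(driver, 10)

# Drive the dropdown through Select rather than clicking an <option> by XPath.
term = Select(wait.until(EC.presence_of_element_located(
    (By.NAME, "ctl00$ContentPlaceHolder1$ddlTerm"))))
term.select_by_index(0)  # same as option[1] in the XPath above

search = wait.until(EC.element_to_be_clickable(
    (By.ID, "ContentPlaceHolder1_btnSearch")))
search.click()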
Back to the script above: in my terminal, the output (just a small part of it, as there’s a lot) looks like this: