Use Beautiful Soup to unify #text after a tag

Question

I&#8217;m using Beautiful Soup to put in a excel table some infos from a website. The bold titles are shown in the head columns while the text after the colon appear in the rows. What I&#8217;m doing is finding the text and searching for next_sibling &#8211;> The problem is that in some cases the text afte…

Accepted Answer

Parsing this document isn&#8217;t pretty. Probable the document is hand-written in Word and then exported to HTML:import requestsimport pandas as pdfrom bs4 import BeautifulSoupurl = "http://www.letteraturenordiche.it/danimarca.htm"soup = BeautifulSoup(requests.get(url).content, "html.parser")# preprocess the document:# remove all whitespaces:for w in soup.find_all(text=True):    if not w.strip():        w.extract()# unwrap not necessary tags:for t in soup.select("i, font, span"):    t.unwrap()# merge NavigableStrings together:soup.smooth()data = []for t in soup.select("table"):    title = t.p.get_text(separator=" ", strip=True)    year = (        t.select_one('b:-soup-contains("Anno:")')        .find_next_sibling(text=True)        .strip()    )    author = (        t.find_previous("hr", attrs={"size": "6"})        .find_previous("p")        .get_text(strip=True)    )    editor = (        t.select_one('b:-soup-contains("Editore:")')        .find_next_sibling(text=True)        .strip()    )    pages = (        t.select_one('b:-soup-contains("Pagine:")')        .find_next_sibling(text=True)        .strip()    )    notes = (        t.select_one('b:-soup-contains("Note:", "Comprende")')        .find_next_sibling(text=True)        .strip()    )    original_title = t.select_one(        'b:-soup-contains("Titolo Original", "Titolo original", "Titoli originali")'    )    if not original_title:        original_title = t.find(lambda t: t.text.strip() == ":")    if not original_title:        original_title = ""    else:        original_title = original_title.find_next_sibling(text=True).strip()    data.append((title, year, author, editor, pages, notes, original_title))df = pd.DataFrame(    data,    columns=[        "title",        "year",        "author",        "editor",        "pages",        "notes",        "original_title",    ],)df["title"] = df["title"].str.replace(r"r?n", " ", regex=True)df["author"] = df["author"].str.replace(r"r?n", " ", regex=True)print(df)df.to_csv("data.csv", index=False)Creates the dataframe and saves it as data.csv (screenshot from LibreOffice):

Advertisement

Answer