Scraping with requests_html

Question

I am trying to scrape the following website: https://www.kayak-polo.info/kphistorique.php?Group=CE&lang=en. My code is below. I am trying to get the text inside the caption element (as shown in the screenshot). However, I believe I cannot find the tag because it has no closing tag, and I think that is why the text is not being returned. For clarity purposes, I [...]

Accepted Answer

Here is one possible solution:

from time import time
from typing import Generator
from requests_html import HTMLSession
from requests_html import HTMLResponse


def get_competition_types(html: HTMLResponse) -> Generator[None, None, str]:
    return (i.attrs.get('value') for i in html.html.find('select[name="Group"] option'))


def get_competition_urls(url: str, comp_types: Generator[None, None, str]) -> Generator[None, None, str]:
    return (f'{url}?Group={_type}&lang=en' for _type in comp_types)


def get_data(competition_url: str, session: HTMLSession) -> None:
    response = session.get(competition_url)
    print(competition_url)
    article_data = response.html.find('article.tab-pane')
    for article in article_data:
        for data in (i.text.split('\n') for i in article.find('div caption')):
            if len(data) > 1:
                print(f"{data[0]} {article.find('h3')[0].text.split()[1]} {data[1]}\n")
            else:
                print(f"{data[0]} {article.find('h3')[0].text.split()[1]}\n")


session = HTMLSession()
url = 'https://www.kayak-polo.info/kphistorique.php'
html = session.get(url)
start = time()
competition_types = get_competition_types(html)
competition_urls = get_competition_urls(url, competition_types)
for url in competition_urls:
    get_data(url, session)
print(f"Total time: {round(time()-start, 3)}")

The performance of this solution (processing all 4960 elements) is 55 sec.

Output:

ECA European Championships - Catania (ITA) 2021 Men
ECA European Championships - Catania (ITA) 2021 Women
ECA European Championships - Catania (ITA) 2021 U21 Men

Solution based on ThreadPoolExecutor:

from time import time
from itertools import repeat
from typing import Generator
from requests_html import HTMLSession
from requests_html import HTMLResponse
from concurrent.futures import ThreadPoolExecutor


def get_competition_types(html: HTMLResponse) -> Generator[None, None, str]:
    return (i.attrs.get('value') for i in html.html.find('select[name="Group"] option'))


def get_competition_urls(url: str, comp_types: Generator[None, None, str]) -> Generator[None, None, str]:
    return (f'{url}?Group={_type}&lang=en' for _type in comp_types)


def get_data(competition_url: str, session: HTMLSession) -> None:
    response = session.get(competition_url)
    print(competition_url)
    article_data = response.html.find('article.tab-pane')
    for article in article_data:
        for data in (i.text.split('\n') for i in article.find('div caption')):
            if len(data) > 1:
                print(f"{data[0]} {article.find('h3')[0].text.split()[1]} {data[1]}\n")
            else:
                print(f"{data[0]} {article.find('h3')[0].text.split()[1]}\n")


session = HTMLSession()
url = 'https://www.kayak-polo.info/kphistorique.php'
html = session.get(url)
start = time()
competition_types = get_competition_types(html)
competition_urls = get_competition_urls(url, competition_types)
with ThreadPoolExecutor() as executor:
    executor.map(get_data, list(competition_urls), repeat(session))
print(f"Total time: {round(time()-start, 3)}")

The performance of this solution (processing all 4960 elements) is ~35 sec.

And of course, since this solution works with threads, the output from the different competitions will be interleaved.

Output:

European Championships - Sheffield (GBR) 1993 Women
Coupe d'Europe des Nations - Strasbourg (FRA) 1990 Men
European Club Championship - Duisbourg (GER) 2021 Men

Advertisement

Answer