Web scraping problem when passing a function as a parameter in a function

Question

Hello, I've created two functions that work well when called alone. But when I try to use a for loop with these functions, I get a problem with my parameter. The first function searches and gets a link to pass to the second one. The second function scrapes a link. Both functions worked when I tested them on a link.

Accepted Answer

# Review notes on the code as posted in the question.
#
# Some things I noticed:
#   * Cases where something is not found are not handled, e.g.
#     'PARIS-SAINT-GERMAIN-FOOTBALL' will fail as a search term whereas
#     'PARIS SAINT GERMAIN FOOTBALL' will not.
#   * Opportunities for simplification are missed, e.g. creating a dataframe
#     by looping over tr then td when read_html could be used on the table
#     directly; using find_all when a single table or a tag is needed.
#   * Variables are overwritten in loops, and there are typos, e.g.
#         for tr in table_rows:
#             td = tr.find_all('td')
#             row = row = [tr.text.strip() for tr in td]  # presumably a typo with row = row
#   * The dataframe is never tested for being empty.
#   * Incorrect urls may be generated by using 'https://www.verif.com/',
#     because the next part that is concatenated on starts with "/" as well.
#   * Variable naming is inconsistent, e.g. what is single_item? The function
#     I see is called single_text.
#
# These are just some observations and there is certainly still room for
# improvement.

import io
import time
from typing import Optional, Sequence, Tuple
from urllib.parse import quote, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

BASE_URL = 'https://www.verif.com'
HEADERS = {'User-Agent': 'Mozilla/5.0'}
# requests applies no timeout by default, so a stalled server would block forever.
REQUEST_TIMEOUT = 30
# Pause (in seconds) after every search request.
CRAWL_DELAY = 5


def searchsport(terme: str) -> Tuple[str, str]:
    """Fetch the search results page for *terme*.

    Args:
        terme: The search term to look up.

    Returns:
        A ``(terme, html)`` tuple: the term as given and the page's HTML text.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an error status (via ``raise_for_status``).
    """
    # Percent-encode the term so characters such as '/', '?' or '#' cannot
    # change the structure of the URL it is embedded in.
    encoded = quote(terme, safe='')
    url = f'{BASE_URL}/recherche/{encoded}/1/ca/d/?ville=null'
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return terme, response.text


def crawl(keyword: str) -> str:
    """Return the link of the first search result for *keyword*.

    Args:
        keyword: The search term passed on to ``searchsport``.

    Returns:
        The absolute URL of the first result, or ``''`` when the request
        failed or the page contained no result link.
    """
    try:
        keyword, html = searchsport(keyword)
    except requests.RequestException as e:
        # Always hand back a value: without this assignment-on-failure the
        # original code died with "local variable referenced before assignment".
        print(e)
        return ''
    finally:
        # Runs on success and on failure alike, so every search is followed
        # by the same pause.
        time.sleep(CRAWL_DELAY)

    soup = bs(html, 'lxml')
    # select_one picks a single anchor; looping over every td (as the
    # question's code did) just overwrote the link when more than one matched.
    a_tag = soup.select_one('td.verif_col1 a[href]')
    if a_tag is None:
        # No result, e.g. crawl('PARIS-SAINT-GERMAIN-FOOTBALL') instead of
        # crawl('PARIS SAINT GERMAIN FOOTBALL').
        return ''
    # Collecting hrefs in a list served no purpose, and splitting the href
    # would give "list index out of range". The href is already a relative
    # link starting with "/", so no extra "/" is needed after ".com";
    # urljoin also copes with an href that is already absolute.
    return urljoin(BASE_URL, a_tag['href'])


def single_text(item_url: str) -> pd.DataFrame:
    """Scrape the first ``.table`` element of *item_url* into a DataFrame.

    Args:
        item_url: Absolute URL of the page to scrape.

    Returns:
        The parsed table, or an empty DataFrame when the request fails, the
        page has no ``.table`` element, or that element holds no HTML table.
    """
    try:
        source_code = requests.get(item_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        source_code.raise_for_status()
    except requests.RequestException as e:
        # Same best-effort policy as crawl(): report and return "nothing".
        print(e)
        return pd.DataFrame()
    print('nivo1 ok')

    plain_text = source_code.text  # the page's HTML with all of its tags
    soup = bs(plain_text, features="lxml")
    print('nivo2 ok')

    table = soup.select_one('.table')  # only the table element is of interest
    if table is None:
        return pd.DataFrame()
    try:
        # Work directly with the table and pandas instead of looping over
        # rows and cells. Literal HTML must be wrapped in StringIO: passing
        # a plain string to read_html is deprecated in recent pandas.
        return pd.read_html(io.StringIO(str(table)))[0]
    except ValueError:
        # read_html raises ValueError when it finds no <table> to parse
        # (the '.table' class may sit on a non-table element).
        return pd.DataFrame()


# Returning df from main()
def main(terms: Optional[Sequence[str]] = None) -> pd.DataFrame:
    """Search each term, scrape its first result and return the last table.

    Args:
        terms: Search terms to process. Defaults to the two example terms
            used in the answer.

    Returns:
        The DataFrame scraped for the last term that produced a link; an
        empty DataFrame when no term produced one.
    """
    if terms is None:
        terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL']

    # Start with an empty frame so the return below is always defined, even
    # when no search yields a link.
    df = pd.DataFrame()
    for term in terms:
        item_url = crawl(term)
        if not item_url:
            continue
        print(item_url)
        # The question called single_item here; the function is single_text.
        df = single_text(item_url)
        if not df.empty:  # test if the dataframe is empty before using it
            print(df.head(1))
    return df


if __name__ == '__main__':
    df = main()
    print(df)

Advertisement

Answer