Hello I’ve created two functions that work well well called alone. But when I try to use a for loop with these functions I got a problem with my parameter.
First function to search and get link to pass to the second one.
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'} def searchsport(terme): url = 'https://www.verif.com/recherche/{}/1/ca/d/?ville=null'.format(terme) response = requests.get(url, headers= USER_AGENT) response.raise_for_status() return terme, response.text def crawl(keyword): if __name__ == '__main__': try: keyword, html = searchsport(keyword) soup = bs(html,'html.parser') table = soup.find_all('td', attrs={'class': 'verif_col1'}) premier = [] for result in table: link = result.find('a', href=True) premier.append(link) truelink = 'https://www.verif.com/'+str(premier[0]).split('"')[1] #print("le lien", truelink) except Exception as e: print(e) finally: time.sleep(10) return truelink
Second function to scrape a link.
def single_text(item_url): source_code = requests.get(item_url) print('nivo1 ok') plain_text = source_code.text # La page en html avec toutes ces balises soup = bs(plain_text,features="lxml" ) print('nivo2 ok') table = soup.find('table',{'class':"table infoGen hidden-smallDevice"}) # on cherche que la balise table print('nivo1 ok', 'n', table) table_rows = table.find_all('tr') # les données de tables sont dans les celulles tr #print(table_rows) l = [] for tr in table_rows: td = tr.find_all('td') row = row = [tr.text.strip() for tr in td] l.append(row) # On enleve certains caractères unitiles df = pd.DataFrame(l) return df
All these function worked when I tested them on a link.
Now I have a csv file with name of companies using searchsport() to search in website and the returned link is passed to single_text() to scrape.
for keyword in list(pd.read_csv('sport.csv').name): l = crawl(keyword) print(l) # THIS PRINT THE LINK single_item(l) # HERE I GOT THE PROBLEME
Error:
nivo1 ok nivo2 ok nivo1 ok None --------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-55-263d95d6748c> in <module> 3 l = crawl(keyword) 4 ----> 5 single_item(item_url=l) <ipython-input-53-6d3b5c1b1ee8> in single_item(item_url) 7 table = soup.find('table',{'class':"table infoGen hidden-smallDevice"}) # on cherche que la balise table 8 print('nivo1 ok', 'n', table) ----> 9 table_rows = table.find_all('tr') # les données de tables sont dans les celulles tr 10 #print(table_rows) 11 AttributeError: 'NoneType' object has no attribute 'find_all'
When I run this I got a df.
single_item(item_url="https://www.verif.com/societe/COMPANYNAME-XXXXXXXXX/").head(1)
My expected results should be two DataFrame for every keyword. Why it doesn’t work?
Advertisement
Answer
So I have noted throughout the code some of the problems I saw with your code as posted.
Some things I noticed:
Not handling cases of where something is not found e.g. ‘PARIS-SAINT-GERMAIN-FOOTBALL’ will fail whereas ‘PARIS SAINT GERMAIN FOOTBALL’ as a search term will not
Opportunities for simplification missed e.g. creating a dataframe by looping tr
then td
when could just use read_html
on table
; Using find_all when a single table
or a
tag is needed
Overwriting variables in loops as well as typos e.g.
for tr in table_rows: td = tr.find_all('td') row = row = [tr.text.strip() for tr in td] # presumable a typo with row = row
Not testing if a dataframe is empty
Risking generating incorrect urls by using 'https://www.verif.com/'
as the next part you concatenate on starts with “/” as well
Inconsistent variable naming e.g. what is single_item
? The function I see is called single_text
.
These are just some observations and there is certainly still room for improvement.
import requests, time from bs4 import BeautifulSoup as bs import pandas as pd def searchsport(terme): url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null' response = requests.get(url, headers = {'User-Agent':'Mozilla/5.0'}) response.raise_for_status() return terme, response.text def crawl(keyword): try: keyword, html = searchsport(keyword) soup = bs(html,'lxml') a_tag = soup.select_one('td.verif_col1 a[href]') # your code before when looping tds would just overwrite truelink if more than one found. Instead if a_tag is None: #handle case of no result e.g. with using crawl('PARIS-SAINT-GERMAIN-FOOTBALL') instead of #crawl('PARIS SAINT GERMAIN FOOTBALL') truelink = '' else: # print(a_tag['href']) # adding to the list premier served no purpose. Using split on href would result in list index out of range truelink = f'https://www.verif.com{a_tag["href"]}' #relative link already so no extra / after .com except Exception as e: print(e) truelink = '' #handle case of 'other' fail. Make sure there is an assigment finally: time.sleep(5) return truelink #unless try succeeded this would have failed with local variable referenced before assignment def single_text(item_url): source_code = requests.get(item_url, headers = {'User-Agent':'Mozilla/5.0'}) print('nivo1 ok') plain_text = source_code.text # La page en html avec toutes ces balises soup = bs(plain_text,features="lxml") print('nivo2 ok') table = soup.select_one('.table') # on cherche que la balise table #print('nivo1 ok', 'n', table) if table is None: df = pd.DataFrame() else: df = pd.read_html(str(table))[0] #simplify to work direct with table and pandas;avoid your loops return df def main(): terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL'] for term in terms: item_url = crawl(term) if item_url: print(item_url) df = single_text(item_url) # what is single_item in your question? There is single_text if not df.empty: #test if dataframe is empty print(df.head(1)) if __name__ == '__main__': main()
Returning df from main()
import requests, time from bs4 import BeautifulSoup as bs import pandas as pd def searchsport(terme): url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null' response = requests.get(url, headers = {'User-Agent':'Mozilla/5.0'}) response.raise_for_status() return terme, response.text def crawl(keyword): try: keyword, html = searchsport(keyword) soup = bs(html,'lxml') a_tag = soup.select_one('td.verif_col1 a[href]') # your code before when looping tds would just overwrite truelink if more than one found. Instead if a_tag is None: #handle case of no result e.g. with using crawl('PARIS-SAINT-GERMAIN-FOOTBALL') instead of #crawl('PARIS SAINT GERMAIN FOOTBALL') truelink = '' else: # print(a_tag['href']) # adding to the list premier served no purpose. Using split on href would result in list index out of range truelink = f'https://www.verif.com{a_tag["href"]}' #relative link already so no extra / after .com except Exception as e: print(e) truelink = '' #handle case of 'other' fail. Make sure there is an assigment finally: time.sleep(5) return truelink #unless try succeeded this would have failed with local variable referenced before assignment def single_text(item_url): source_code = requests.get(item_url, headers = {'User-Agent':'Mozilla/5.0'}) print('nivo1 ok') plain_text = source_code.text # La page en html avec toutes ces balises soup = bs(plain_text,features="lxml") print('nivo2 ok') table = soup.select_one('.table') # on cherche que la balise table #print('nivo1 ok', 'n', table) if table is None: df = pd.DataFrame() else: df = pd.read_html(str(table))[0] #simplify to work direct with table and pandas;avoid your loops return df def main(): terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL'] for term in terms: item_url = crawl(term) if item_url: #print(item_url) df = single_text(item_url) # what is single_item in your question? There is single_text return df if __name__ == '__main__': df = main() print(df)