Hello, I've created two functions that work well when called on their own. But when I try to use them inside a for loop, I get a problem with my parameter.
The first function searches the site and gets the link to pass to the second one.
# Browser-like User-Agent header sent with the search request in searchsport().
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
def searchsport(terme):
    """Fetch the verif.com search-results page for a search term.

    Args:
        terme: Search term inserted into the search URL.

    Returns:
        A ``(terme, html)`` tuple: the term as passed in and the text of
        the response body.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status``).
    """
    url = 'https://www.verif.com/recherche/{}/1/ca/d/?ville=null'.format(terme)
    response = requests.get(url, headers= USER_AGENT)
    response.raise_for_status()
    return terme, response.text
def crawl(keyword):
    """Search for *keyword* and build the URL of the first result link.

    NOTE(review): the indentation of this listing was lost; the nesting
    below is a reconstruction and should be confirmed against the original.
    """
    # NOTE(review): this guard sits inside the function body, so the search
    # only runs when the module is executed as a script.
    if __name__ == '__main__':
        try:
            keyword, html = searchsport(keyword)
            soup = bs(html,'html.parser')
            # Every result cell carrying the class "verif_col1".
            table = soup.find_all('td', attrs={'class': 'verif_col1'})
            premier = []
            for result in table:
                link = result.find('a', href=True)
                premier.append(link)
                # Takes the first quoted value in the tag's string form
                # (presumably the href) and appends it to the site root.
                truelink = 'https://www.verif.com/'+str(premier[0]).split('"')[1]
                #print("le lien", truelink)
        except Exception as e:
            print(e)
        finally:
            # Pause for 10 seconds after every attempt, successful or not.
            time.sleep(10)
        # NOTE(review): truelink is unbound here if the try block failed
        # before assigning it, or if no result cell was found.
        return truelink
Second function to scrape a link.
def single_text(item_url):
    """Download *item_url* and turn its company-info table into a DataFrame.

    Args:
        item_url: URL of the page to scrape.

    Returns:
        A DataFrame with one row per ``<tr>`` of the table whose class is
        ``"table infoGen hidden-smallDevice"``; each cell holds the stripped
        text of a ``<td>``.

    Raises:
        AttributeError: If the page has no such table, because ``soup.find``
            then returns ``None`` and ``None.find_all`` fails.
    """
    source_code = requests.get(item_url)
    print('nivo1 ok')
    plain_text = source_code.text # the page's HTML, with all of its tags
    soup = bs(plain_text,features="lxml" )
    print('nivo2 ok')
    table = soup.find('table',{'class':"table infoGen hidden-smallDevice"}) # look only for the table tag
    print('nivo1 ok', 'n', table)
    # NOTE(review): table is None when the lookup above finds nothing; the next line is not guarded against that.
    table_rows = table.find_all('tr') # the table data lives in the tr rows
    #print(table_rows)
    l = []
    for tr in table_rows:
        td = tr.find_all('td')
        row = row = [tr.text.strip() for tr in td]
        l.append(row)
    # Original note: "remove some unneeded characters" (no such step appears in this listing).
    df = pd.DataFrame(l)
    return df
All these functions worked when I tested them on a single link.
Now I have a CSV file with the names of companies. I use searchsport() to search the website for each name, and the returned link is passed to single_text() to be scraped.
# Look up every company name from the CSV and scrape the page found for it.
# NOTE(review): single_item is not defined in the code shown; presumably it is
# the function listed as single_text — confirm.
for keyword in list(pd.read_csv('sport.csv').name):
    l = crawl(keyword)
    print(l) # this prints the link
    single_item(l) # this is where the error is raised
Error:
nivo1 ok
nivo2 ok
nivo1 ok
None
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-55-263d95d6748c> in <module>
3 l = crawl(keyword)
4
----> 5 single_item(item_url=l)
<ipython-input-53-6d3b5c1b1ee8> in single_item(item_url)
7 table = soup.find('table',{'class':"table infoGen hidden-smallDevice"}) # on cherche que la balise table
8 print('nivo1 ok', 'n', table)
----> 9 table_rows = table.find_all('tr') # les données de tables sont dans les celulles tr
10 #print(table_rows)
11
AttributeError: 'NoneType' object has no attribute 'find_all'
When I run the call below directly, I do get a DataFrame:
# NOTE(review): single_item is not defined in the code shown; presumably it is
# the function listed as single_text — confirm.
single_item(item_url="https://www.verif.com/societe/COMPANYNAME-XXXXXXXXX/").head(1)
My expected result should be two DataFrames for every keyword. Why doesn't it work?
Advertisement
Answer
So I have noted throughout the code some of the problems I saw with your code as posted.
Some things I noticed:
Not handling cases where something is not found, e.g. ‘PARIS-SAINT-GERMAIN-FOOTBALL’ will fail as a search term whereas ‘PARIS SAINT GERMAIN FOOTBALL’ will not
Opportunities for simplification missed, e.g. creating a dataframe by looping over tr and then td when you could just use read_html on the table; using find_all when only a single table or a tag is needed
Overwriting variables in loops as well as typos e.g.
for tr in table_rows:
    td = tr.find_all('td')
    row = row = [tr.text.strip() for tr in td] # presumably a typo with row = row; the comprehension also reuses the loop name tr
Not testing if a dataframe is empty
Risking generating incorrect URLs by using 'https://www.verif.com/' as the base, because the next part you concatenate onto it starts with “/” as well
Inconsistent naming, e.g. what is single_item? The function I see is called single_text.
These are just some observations and there is certainly still room for improvement.
import requests, time
from bs4 import BeautifulSoup as bs
import pandas as pd
def searchsport(terme, timeout=10):
    """Fetch the verif.com search-results page for a search term.

    Args:
        terme: Company name (or any other search text) to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(terme, html)`` tuple: the term exactly as passed in and the
        body of the results page.

    Raises:
        requests.RequestException: On connection failure or timeout.
        requests.HTTPError: If the server answers with an error status.
    """
    # Imported locally so the function does not depend on the import header.
    from urllib.parse import quote

    # The term becomes a path segment, so characters such as "/", "?" or "#"
    # must be percent-encoded or they would change the shape of the URL.
    encoded = quote(str(terme), safe='')
    url = f'https://www.verif.com/recherche/{encoded}/1/ca/d/?ville=null'
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        # Without a timeout, requests can block forever on a stalled server.
        timeout=timeout,
    )
    response.raise_for_status()
    return terme, response.text
def crawl(keyword, delay=5):
    """Return the URL of the first company page found for *keyword*.

    Args:
        keyword: Search term passed to ``searchsport``.
        delay: Seconds to pause after the lookup, successful or not, so that
            consecutive calls do not hit the site back to back.

    Returns:
        The absolute URL of the first search result, or ``''`` when the
        search found nothing (e.g. 'PARIS-SAINT-GERMAIN-FOOTBALL' instead of
        'PARIS SAINT GERMAIN FOOTBALL') or the lookup failed.
    """
    # Imported locally so the function does not depend on the import header.
    from urllib.parse import urljoin

    # A single default covers both "no result" and "request failed", so the
    # return below can never hit an unassigned local.
    truelink = ''
    try:
        _, html = searchsport(keyword)
        soup = bs(html, 'lxml')
        # Only the first matching link is needed, so select_one is enough.
        a_tag = soup.select_one('td.verif_col1 a[href]')
        if a_tag is not None:
            # urljoin copes with relative ("/societe/...") and absolute
            # hrefs alike and never produces a doubled or missing slash.
            truelink = urljoin('https://www.verif.com', a_tag['href'])
    except Exception as e:
        # Best effort: report the problem and fall through to the default.
        print(e)
    finally:
        time.sleep(delay)
    return truelink
def single_text(item_url, timeout=10):
    """Scrape the first ``.table`` element of a company page into a DataFrame.

    Args:
        item_url: Absolute URL of the company page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The table as parsed by ``pandas.read_html``, or an empty DataFrame
        when the page has no such element or it holds no parsable table.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Imported locally so the function does not depend on the import header.
    from io import StringIO

    source_code = requests.get(
        item_url,
        headers={'User-Agent': 'Mozilla/5.0'},
        # Without a timeout, requests can block forever on a stalled server.
        timeout=timeout,
    )
    print('nivo1 ok')
    soup = bs(source_code.text, features="lxml")
    print('nivo2 ok')
    table = soup.select_one('.table')
    if table is None:
        return pd.DataFrame()
    try:
        # read_html wants a file-like object; passing literal HTML as a bare
        # string is deprecated in recent pandas releases.
        return pd.read_html(StringIO(str(table)))[0]
    except ValueError:
        # Raised when the matched element contains no <table> to parse.
        return pd.DataFrame()
def main():
    """Demo run: try two spellings of a club name and print what is found.

    For each spelling that yields a company URL, the URL is printed, the
    page is scraped, and the first row of a non-empty result is printed.
    """
    for search_term in ('PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL'):
        company_url = crawl(search_term)
        if not company_url:
            # Nothing found for this spelling; move on to the next one.
            continue
        print(company_url)
        company_df = single_text(company_url)
        if company_df.empty:
            continue
        print(company_df.head(1))
# Run the demo only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()
Returning df from main()
import requests, time
from bs4 import BeautifulSoup as bs
import pandas as pd
def searchsport(terme, timeout=10):
    """Fetch the verif.com search-results page for a search term.

    Args:
        terme: Company name (or any other search text) to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(terme, html)`` tuple: the term exactly as passed in and the
        body of the results page.

    Raises:
        requests.RequestException: On connection failure or timeout.
        requests.HTTPError: If the server answers with an error status.
    """
    # Imported locally so the function does not depend on the import header.
    from urllib.parse import quote

    # The term becomes a path segment, so characters such as "/", "?" or "#"
    # must be percent-encoded or they would change the shape of the URL.
    encoded = quote(str(terme), safe='')
    url = f'https://www.verif.com/recherche/{encoded}/1/ca/d/?ville=null'
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        # Without a timeout, requests can block forever on a stalled server.
        timeout=timeout,
    )
    response.raise_for_status()
    return terme, response.text
def crawl(keyword, delay=5):
    """Return the URL of the first company page found for *keyword*.

    Args:
        keyword: Search term passed to ``searchsport``.
        delay: Seconds to pause after the lookup, successful or not, so that
            consecutive calls do not hit the site back to back.

    Returns:
        The absolute URL of the first search result, or ``''`` when the
        search found nothing (e.g. 'PARIS-SAINT-GERMAIN-FOOTBALL' instead of
        'PARIS SAINT GERMAIN FOOTBALL') or the lookup failed.
    """
    # Imported locally so the function does not depend on the import header.
    from urllib.parse import urljoin

    # A single default covers both "no result" and "request failed", so the
    # return below can never hit an unassigned local.
    truelink = ''
    try:
        _, html = searchsport(keyword)
        soup = bs(html, 'lxml')
        # Only the first matching link is needed, so select_one is enough.
        a_tag = soup.select_one('td.verif_col1 a[href]')
        if a_tag is not None:
            # urljoin copes with relative ("/societe/...") and absolute
            # hrefs alike and never produces a doubled or missing slash.
            truelink = urljoin('https://www.verif.com', a_tag['href'])
    except Exception as e:
        # Best effort: report the problem and fall through to the default.
        print(e)
    finally:
        time.sleep(delay)
    return truelink
def single_text(item_url, timeout=10):
    """Scrape the first ``.table`` element of a company page into a DataFrame.

    Args:
        item_url: Absolute URL of the company page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The table as parsed by ``pandas.read_html``, or an empty DataFrame
        when the page has no such element or it holds no parsable table.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Imported locally so the function does not depend on the import header.
    from io import StringIO

    source_code = requests.get(
        item_url,
        headers={'User-Agent': 'Mozilla/5.0'},
        # Without a timeout, requests can block forever on a stalled server.
        timeout=timeout,
    )
    print('nivo1 ok')
    soup = bs(source_code.text, features="lxml")
    print('nivo2 ok')
    table = soup.select_one('.table')
    if table is None:
        return pd.DataFrame()
    try:
        # read_html wants a file-like object; passing literal HTML as a bare
        # string is deprecated in recent pandas releases.
        return pd.read_html(StringIO(str(table)))[0]
    except ValueError:
        # Raised when the matched element contains no <table> to parse.
        return pd.DataFrame()
def main():
    """Look up each demo term and return the last company table scraped.

    Returns:
        The DataFrame for the last term that yielded a company URL, or an
        empty DataFrame when no term produced one.
    """
    terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL']
    # Start from an empty frame so the return below is always defined, even
    # when every lookup comes back without a URL.
    df = pd.DataFrame()
    for term in terms:
        item_url = crawl(term)
        if item_url:
            df = single_text(item_url)
    return df
# When executed directly, run the demo and print the DataFrame it returns.
if __name__ == '__main__':
    df = main()
    print(df)