I'm trying to get the "All Splits" row of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (the HTML is shown in the picture). My code returns the "All Splits" label text instead of the numbers I'm looking for. How do I change the lookups in the GetStats function to get the numbers instead of the first-column descriptors?
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv

# Build the list of Hollinger statistics pages: pages 1-8 for the years
# 2003 and 2004.
urls = []
data = []
for year in range(2003, 2005):
    for page in range(1, 9):
        url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
        if url is not None:
            urls.append(url)


def GetData(url):
    """Scrape one Hollinger statistics page and print the first player found.

    Downloads ``url``, parses it with the ``lxml`` parser, and reads the
    first player link, the first ``td.sortcell`` cell and the first player
    id. It then builds a splits URL for that id, calls the nested
    ``GetStats`` helper on a hard-coded splits URL, and prints the results.
    Nothing is returned.

    NOTE(review): the block structure was reconstructed from a paste whose
    line breaks were lost. ``GetStats`` is assumed to be nested here because
    the statements after it use this function's locals -- confirm.
    """
    names_list = []  # names of players
    pers = []  # player efficiency ratings
    playeridlist = []  # list of player ids used to build the per-player stats urls
    statsurls = []  # list of urls generated to get player stats

    # makes a pattern for the function to look for
    # NOTE(review): the pattern reads '(d+)', which matches literal 'd'
    # characters; a '\d+' digit class was presumably intended -- confirm.
    pattern = re.compile('playerId=(d+)')

    # sets up the soup object
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')

    # finds the player's name and adds it to the list
    # (find() returns only the first matching tag, so one name per page)
    names = soup.find(lambda tag: tag.name == 'a' and 'playerId' in tag['href'])
    bodytext = names.text
    names_list.append(bodytext)

    # finds the player efficiency rating and adds it to the list
    pertag = soup.find('td', class_='sortcell')
    per = pertag.text
    pers.append(per)

    # finds the player id: everything after 'playerId=' in the link's href
    names = soup.find('a', href=pattern)
    player_id = names['href'].split('playerId=')[1]
    playeridlist.append(player_id)

    # uses the player id to make a list of new urls for that player and get stats
    # NOTE(review): `year` is not a parameter of GetData; it resolves to the
    # module-level loop variable, which is 2004 once the loop above has
    # finished, regardless of the year in `url`.
    for player_id in playeridlist:
        statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
        if statsurl is not None:
            statsurls.append(statsurl)

    # parses the splits page to get stats
    def GetStats(statsurl):
        """Download a player splits page and print values from its data-idx 1 row.

        NOTE(review): this is the lookup the question asks about.
        ``find('td', ...)`` returns only the first matching cell of the row,
        and iterating over that single tag walks its children rather than
        the row's other cells. That is consistent with the reported output
        of the row label instead of the numbers.
        """
        # TODO: go back and add a thread-executor statement within GetData
        # below this.
        statsreq = requests.get(statsurl)
        statssoup = BeautifulSoup(statsreq.text, 'lxml')
        focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
        playerstathtml = focusing_search.find('td', class_='Table__TD')
        stat_values = [playerstats.text for playerstats in playerstathtml]
        print(stat_values)

    # Hard-coded test call; the urls collected in `statsurls` are not used yet.
    GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")
    #name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))
    print(f"{bodytext}: {per}")
    print(player_id)


# Driver: run against a single page instead of the full `urls` list.
GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')
Advertisement
Answer
To get the `all_splits` stats from:
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
This is what I did:
- I grabbed the table body using `soup.select`.
- Then I grabbed the headings and relevant stats by iterating through the columns/rows.
The list comprehension provides the text in list format, which is easy to convert to a dataframe.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch a player's per-game splits page and load its "All Splits" row into a
# one-row DataFrame whose columns are the table's heading cells.
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'

# requests has no default timeout, and an HTTP error page would otherwise be
# parsed as if it were the stats page, so fail fast on both.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Table bodies inside the page's scrollable table container.
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
if not t:
    raise SystemExit(f"No splits table found at {url}")

# The script reads the first row as the headings and the second row as the
# "All Splits" values, so both must be present.
rows = t[0].find_all('tr')
if len(rows) < 2:
    raise SystemExit(f"Expected a heading row and a data row, found {len(rows)} row(s)")

headings = [h.text for h in rows[0].find_all('td')]
all_splits = [h.text for h in rows[1].find_all('td')]

df = pd.DataFrame([all_splits], columns=headings)
print(df)