I'm trying to get the “All Splits” row of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (the HTML is shown in the picture). My code returns the “All Splits” text instead of the numbers I'm looking for. How do I change the lookups in the GetStats function to get the numbers instead of the first-column descriptors?
Python
x
71
71
1
import requests
2
from bs4 import BeautifulSoup
3
import re
4
from concurrent.futures import ThreadPoolExecutor
5
import pandas as pd
6
import csv
7
8
# Hollinger statistics listing pages to scrape: pages 1-8 for each of the
# 2003 and 2004 seasons.
urls = []
data = []  # not populated anywhere in the visible code

# NOTE: plain ``for`` loops (rather than a comprehension) are kept on purpose:
# they leave ``year`` defined at module level after the loop, and GetData
# refers to a module-level ``year``.
for year in range(2003, 2005):
    for page in range(1, 9):
        # An f-string can never be None, so the URL is appended unconditionally.
        urls.append(
            f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
        )
16
17
def GetData(url):
    """Scrape the first listed player from a Hollinger statistics page.

    Downloads ``url``, reads the first player link and the first
    ``td.sortcell`` value (the player efficiency rating), builds the ESPN
    splits URL for that player, runs the nested ``GetStats`` helper on a
    fixed sample URL, and prints the player's name, rating and id.

    Args:
        url: Address of a Hollinger statistics page. If it contains a
            ``/year/<digits>`` segment, that season is used when building
            the per-player stats URLs; otherwise the module-level ``year``
            is used.

    Returns:
        None. Results are printed.
    """
    names_list = []  # names of players
    pers = []  # player efficiency ratings
    playeridlist = []  # player ids used to build the per-player stats URLs
    statsurls = []  # URLs generated to fetch player stats

    # Matches the numeric player id embedded in a player link. A raw string
    # with ``\d`` is required; a bare ``d+`` would only match literal "d"s.
    pattern = re.compile(r'playerId=(\d+)')

    # Use the season of the page being scraped. Relying only on the
    # module-level ``year`` would always yield the last value of the
    # URL-building loop, whatever page was passed in.
    year_match = re.search(r'/year/(\d+)', url)
    season = year_match.group(1) if year_match else year

    # Set up the soup for the listing page.
    req = requests.get(url, timeout=30)
    soup = BeautifulSoup(req.text, 'lxml')

    # Find the player's name and add it to the list. ``tag.get`` avoids a
    # KeyError on anchors that have no href attribute.
    names = soup.find(
        lambda tag: tag.name == 'a' and 'playerId' in tag.get('href', '')
    )
    bodytext = names.text
    names_list.append(bodytext)

    # Find the player efficiency rating and add it to the list.
    pertag = soup.find('td', class_='sortcell')
    per = pertag.text
    pers.append(per)

    # Find the player id.
    names = soup.find('a', href=pattern)
    player_id = names['href'].split('playerId=')[1]
    playeridlist.append(player_id)

    # Use each player id to build the URL of that player's stats page.
    for pid in playeridlist:
        statsurls.append(
            f"https://insider.espn.com/nba/player/splits/_/id/{pid}/type/nba/year/{season}/category/perGame"
        )

    # Parses a player's stats page.
    # TODO: run this through a thread pool executor from within GetData.
    def GetStats(statsurl):
        """Print the values read from the ``data-idx="1"`` row of a stats page."""
        statsreq = requests.get(statsurl, timeout=30)
        statssoup = BeautifulSoup(statsreq.text, 'lxml')
        focusing_search = statssoup.find(
            'tr',
            class_='Table__TR Table__TR--sm Table__even',
            attrs={'data-idx': '1'},
        )
        # NOTE: find() returns only the FIRST matching <td>, and iterating a
        # single tag walks that cell's children. This is the lookup the
        # question asks about: it yields the row label rather than the stat
        # values. find_all('td', class_='Table__TD') would return every cell.
        playerstathtml = focusing_search.find('td', class_='Table__TD')
        stat_values = [playerstats.text for playerstats in playerstathtml]
        print(stat_values)

    GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")

    #name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))

    print(f"{bodytext}: {per}")
    print(player_id)
68
69
# Run the scraper against page 1 of the 2003 Hollinger statistics listing.
GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')
70
71
Advertisement
Answer
To get the all_splits stats from
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
this is what I did:
- I grabbed the table body using soup.select.
- Then I grabbed the headings and the relevant stats by iterating through the rows and their cells.
Each list comprehension returns the cell text as a list, which is easy to convert to a DataFrame.
Code:
Python
1
14
14
1
import requests
2
from bs4 import BeautifulSoup
3
import pandas as pd
4
5
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'

# Fail fast on network problems: without a timeout the request can block
# indefinitely, and without raise_for_status() an HTTP error page would be
# parsed as if it were the stats page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Table bodies of the scrollable stats table(s) inside the main container.
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
if not t:
    raise ValueError(f"stats table not found at {url}")

# Row 0 holds the column headings; row 1 is the row read as "all splits".
rows = t[0].find_all('tr')
if len(rows) < 2:
    raise ValueError(f"expected a heading row and a stats row at {url}")

headings = [h.text for h in rows[0].find_all('td')]
all_splits = [h.text for h in rows[1].find_all('td')]

# A single-row DataFrame: one column per heading.
df = pd.DataFrame([all_splits], columns=headings)
print(df)
14