Skip to content
Advertisement

Trying to get data from a table using beautifulsoup in python

I'm trying to get the “all splits” row of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (the HTML is shown in the picture). My code returns the ‘all splits’ label text instead of the numbers I’m looking for. How should I change the lookups in the GetStats function to get the numbers instead of the first-column descriptors?

import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv

# Script overview: build the list of Hollinger statistics page URLs, then
# (in GetData) scrape one page for a player's name, PER and player id, and
# (in GetStats) scrape that player's per-game splits page.
urls = []
data = []

# Seasons 2003 and 2004, pages 1 through 8 of each season.
for year in range(2003, 2005):
    for page in range(1, 9):
        url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
        # NOTE(review): an f-string is never None, so this check always passes.
        if url is not None:
            urls.append(url)

    # NOTE(review): as indented, this def sits inside the `for year` loop and is
    # re-created on every iteration; it was presumably meant to be top-level.
    #
    # GetData(url): download one statistics page, pull out a player's name,
    # efficiency rating and player id, and print them. Returns nothing.
    # Every lookup below uses soup.find, which returns only the first match,
    # so a single player is recorded per call.
    def GetData(url):
        names_list = []  # names of players
        pers = []  # player efficiency ratings
        playeridlist = []  # player ids used to build the per-player stats URLs
        statsurls = []  # URLs generated to fetch each player's stats

        # Pattern used to locate links that carry a player id.
        # NOTE(review): 'd+' matches literal "d" characters; '\d+' (digits) was
        # probably intended and the backslash lost — confirm against the original.
        pattern = re.compile('playerId=(d+)')

        # Download the page and parse it with the lxml parser.
        req = requests.get(url)
        soup = BeautifulSoup(req.text, 'lxml')

        # Find the first <a> whose href mentions 'playerId' and record its text.
        # NOTE(review): tag['href'] raises KeyError for an <a> without an href
        # attribute; tag.get('href', '') would avoid that.
        names = soup.find(lambda tag: tag.name == 'a' and 'playerId' in tag['href'])
        bodytext = names.text
        names_list.append(bodytext)

        # Find the first <td class="sortcell"> and record its text as the
        # player efficiency rating.
        pertag = soup.find('td', class_='sortcell')
        per = pertag.text
        pers.append(per)

        # Find the first <a> whose href matches the pattern and take everything
        # after 'playerId=' as the player id.
        names = soup.find('a', href=pattern)
        player_id = names['href'].split('playerId=')[1]
        playeridlist.append(player_id)

        # Build one splits URL per collected player id.
        # NOTE(review): `year` is not a parameter; it is the module-level loop
        # variable, which holds 2004 once the loop above has finished,
        # regardless of the year embedded in `url`.
        for player_id in playeridlist:
            statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
            # NOTE(review): always true, same as the check on `url` above.
            if statsurl is not None:
                statsurls.append(statsurl)

        # GetStats(statsurl): download a player's splits page, locate the row
        # with data-idx="1", and print the text extracted from it.
        def GetStats(statsurl):  # TODO: run this through a thread pool executor from within GetData
            statsreq = requests.get(statsurl)
            statssoup = BeautifulSoup(statsreq.text, 'lxml')
            # First <tr> with these classes and data-idx="1".
            focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
            # find() returns a single <td> (the first cell of that row), not all
            # of the row's cells; find_all() would return every cell.
            playerstathtml = focusing_search.find('td', class_='Table__TD')
            # Iterating over a Tag yields its children, so this collects the
            # text of whatever is nested inside that one cell.
            # NOTE(review): presumably the first matching row belongs to the
            # label column, which would explain getting the row label instead
            # of the numbers — verify against the page markup.
            stat_values = [playerstats.text for playerstats in playerstathtml]
            print(stat_values)

   # NOTE(review): this line is indented 3 spaces, which matches no enclosing
   # block and is an IndentationError as written; presumably it belongs at
   # 8 spaces, inside GetData.
   GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")


        #name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))


        # Report what was scraped for this page.
        print(f"{bodytext}: {per}")
        print(player_id)

# Run the scraper against page 1 of the 2003 season.
GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')

Advertisement

Answer

To get the all_splits stats from: https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame

This is what I did:

  • I grabbed the table body using soup.select
  • Then I grabbed the headings and relevant stats by iterating through the columns/rows.

The list comprehension provides the text in list format, which is easy to convert to a dataframe.

Code:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch one player's per-game splits page and load its "All Splits" row into
# a single-row DataFrame whose columns are the table's heading cells.
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'

# Bound the request time and fail on HTTP errors rather than parsing an
# error page as if it were the stats table.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Select the body of the table nested in the div.Table__Scroller container.
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
if not t:
    raise RuntimeError(f"No splits table found at {url}")

# Look the rows up once: row 0 holds the headings, row 1 the "All Splits" stats.
rows = t[0].find_all('tr')
if len(rows) < 2:
    raise RuntimeError(
        f"Expected a heading row and a stats row at {url}, found {len(rows)} row(s)"
    )

headings = [h.text for h in rows[0].find_all('td')]
all_splits = [h.text for h in rows[1].find_all('td')]

df = pd.DataFrame([all_splits], columns=headings)
print(df)

Output:

all_splits

User contributions licensed under: CC BY-SA
5 people found this helpful
Advertisement