Scraping Basketball Data

Posted on August 08, 2020 in NBA-Basketball

To analyze the free-throw percentage of NBA players in the bubble, I first had to find the data.

basketball-reference.com has a trove of historical data, so I decided to see if I could write a web scraper using Selenium to find the box scores for every game in the 2020 season.

I started by getting the schedule page for each month of the 2020 season. On this schedule page, we can find the links to the box scores. Selenium was used to crawl through the table and find all box score URLs.

Then we accessed each box score URL and saved the basic box score. Finally, the data was added to a dictionary and pickled for offline usage.

import csv
import datetime
import os
import pickle
import time
from datetime import timedelta
from io import StringIO
from os import path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By


def get_box_score_from_schedule_page(url: str) -> list:
    """
    Accesses the schedule page for a month and returns all of the box score urls in a list
    :param url: a string such as https://www.basketball-reference.com/leagues/NBA_2020_games-october.html
    :return: list of the ``href`` values of the box score links, in table row order;
             rows that have no link in their sixth cell are skipped
    """
    # Load the schedule page in the shared module-level browser
    browser.get(url)
    boxscore_urls = list()
    # find_element(By...) is used instead of the find_element_by_* helpers,
    # which were removed in Selenium 4.3; this form works on Selenium 3 and 4.
    schedule_table = browser.find_element(By.CLASS_NAME, 'suppress_glossary')
    for row in schedule_table.find_elements(By.XPATH, "tbody/tr"):
        # Skip rows whose class is 'thead' (presumably header rows repeated
        # inside the table body -- verify against the page markup)
        if row.get_attribute('class') == 'thead':
            continue
        # Query the link cell once; an empty result means the row has no box score link
        links = row.find_elements(By.XPATH, 'td[6]/a')
        if links:
            boxscore_urls.append(links[0].get_attribute('href'))
    return boxscore_urls

def get_basic_box_score_stats(url: str) -> None:
    """
    Gets the basic box score stats for the given URL

    Every table with class ``sortable`` whose id contains ``game-basic`` is parsed
    into a DataFrame and stored in the module-level ``boxscores_dict``.
    :param url: box score page URL; the last path segment without its extension
                becomes the first half of the dictionary key
    :return: None (results are written into ``boxscores_dict``)
    """
    browser.get(url)
    # find_elements(By...) works on Selenium 3 and 4; find_elements_by_class_name
    # was removed in Selenium 4.3.
    for table in browser.find_elements(By.CLASS_NAME, 'sortable'):
        # Fetch the id once; fall back to '' so a table without an id is simply skipped
        table_id = table.get_attribute("id") or ""
        if "game-basic" not in table_id:
            continue
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated since pandas 2.1, while file-like input works on all versions.
        frames = pd.read_html(StringIO(table.get_attribute("outerHTML")), header=1, index_col=0)
        # Drop the 'Reserves' and 'Team Totals' index rows so they are not stored as players
        df = frames[0].drop(['Reserves', 'Team Totals'])
        # Remove rows whose first stat column reads 'Did Not Play'
        # NOTE(review): other non-playing markers may exist on the site and would
        # not be filtered here -- confirm against real pages.
        df = df[~df.iloc[:, 0].isin(['Did Not Play'])]
        # Key = <page file name without extension>-<second dash-separated part of the table id>
        key = url.split("/")[-1].split(".")[0] + "-" + table_id.split("-")[1]
        boxscores_dict[key] = df


# Create webdriver
option = webdriver.ChromeOptions()
# Two ASCII hyphens: the original em dash + hyphen does not form the --incognito switch
option.add_argument("--incognito")

# NOTE(review): executable_path='/' looks like a placeholder for the chromedriver
# binary path -- confirm and set it before running.
browser = webdriver.Chrome(executable_path='/', chrome_options=option)

# Create array of months in the season
months_2020_season = ['october', 'november', 'december', 'january', 'february', 'march', 'july', 'august']
# Override: only August is scraped; remove this line to process the full list above
months_2020_season = ['august']

# example https://www.basketball-reference.com/leagues/NBA_2020_games-october.html
website_header = 'https://www.basketball-reference.com/leagues/NBA_2020_games-'

# All output lives in ./data; create it so the first run does not fail on open()
working_dir = os.getcwd()
working_dir = working_dir + "/data"
os.makedirs(working_dir, exist_ok=True)
processed_log = working_dir + '/box_score_processed.csv'

# URLs already scraped by earlier runs (first column of the log); a set gives
# O(1) membership tests. Blank lines in the log are ignored.
boxscores_processed = set()
if path.exists(processed_log):
    with open(processed_log, newline='') as f:
        for row in csv.reader(f):
            if row:
                boxscores_processed.add(row[0])

try:
    for month in months_2020_season:
        url = website_header + month + '.html'
        boxscore_urls = get_box_score_from_schedule_page(url)

        pickle_path = working_dir + '/basic_box_score_' + month + "_2020_season.pickle"

        # get_basic_box_score_stats() fills this module-level dict.
        # Seed it from the month's existing pickle: URLs already in the log are
        # skipped below, so starting empty would overwrite earlier results.
        # The pickle is this script's own output; never load one from an untrusted source.
        boxscores_dict = dict()
        if path.exists(pickle_path):
            with open(pickle_path, 'rb') as handle:
                boxscores_dict = pickle.load(handle)

        try:
            with open(processed_log, 'a') as fd:
                for boxscore in boxscore_urls:
                    if boxscore in boxscores_processed:
                        continue
                    # Pause between page loads to avoid hammering the site
                    time.sleep(1)
                    get_basic_box_score_stats(boxscore)
                    fd.write(boxscore + '\n')
                    boxscores_processed.add(boxscore)
        finally:
            # Persist even if scraping failed part-way, so every URL recorded in
            # the log also has its data saved.
            with open(pickle_path, 'wb') as handle:
                pickle.dump(boxscores_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
finally:
    # Always shut down the browser and its driver process
    browser.quit()