Getting Started with Statsbomb Data

1) Get the Data

Go to https://github.com/statsbomb/open-data, click the green ‘Clone or download’ button and choose ‘Download ZIP’ to save the archive to your hard drive. Then unpack the ZIP file.

2) Take the time to read their terms and conditions.

From their github page:

“Whilst we are keen to share data and facilitate research, we also urge you to be responsible with the data. Please register your details on https://www.statsbomb.com/resource-centre and read our User Agreement carefully.”

https://towardsdatascience.com/advanced-sports-visualization-with-pandas-matplotlib-and-seaborn-9c16df80a81b

is a great article to get started with StatsBomb data; it showcases pandas’ json_normalize function.
That function provides a super easy way to take the nested JSON structure and put it into a nice and orderly DataFrame.

3) Process the data and combine it into DataFrames

This will create four DataFrames: one for all events, one for all freeze frames, one for all lineup information (including the minutes played), and one with the information for all matches.

So far I have relied on R to process the data (FC_rStats made a great series of posts here: https://github.com/FCrSTATS/StatsBomb_WomensData).
Now I can get all the information the StatsBomb data provides with Python, including all the freeze-frame information.

import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
from os import listdir
from os.path import isfile, join

# Set mypath to your open-data-master/data/ path.
# Keep the trailing slash: the code below builds every file path by plain
# string concatenation (mypath + 'events/' + file, mypath + '....hdf').
# The value below is only a placeholder (relative to the working directory);
# replace it with the location of your unpacked download.
mypath = 'open-data-master/data/'


# EVENTS AND FREEZE-FRAMES
def _coordinate(locations, position):
    """Return entry `position` of every coordinate list in `locations`.

    Values that are not lists (NaN for events without that location) or that
    are too short (shot end locations without a z part) yield NaN, so the
    result always has one value per input element.
    """
    return [loc[position] if isinstance(loc, list) and len(loc) > position else np.nan
            for loc in locations]


# Keep only the '.json' files. This skips stray entries such as macOS'
# '.DS_Store' without a try/except and guarantees that file[:-5] below strips
# exactly the '.json' suffix.
files = [f for f in listdir(mypath+'events/')
         if f.endswith('.json') and isfile(join(mypath+'events/', f))]

dfs = {}
ffs = {}

for file in files:
    # the file name without '.json' is used as the match id
    match_id = file[:-5]
    # JSON is UTF-8; be explicit so player names survive on platforms whose
    # default encoding differs
    with open(mypath+'events/'+file, encoding='utf-8') as data_file:
        data = json.load(data_file)

    #get the nested structure into a dataframe
    df = json_normalize(data, sep = "_").assign(match_id = match_id)
    #store the dataframe in a dictionary with the match id as key
    dfs[match_id] = df.set_index('id')
    shots = df.loc[df['type_name'] == 'Shot'].set_index('id')

    #get the freeze frame information for every shot in the df
    for id_, row in shots.iterrows():
        # .get() also covers matches in which the column does not exist at all
        frame = row.get('shot_freeze_frame')
        if not isinstance(frame, list) or not frame:
            # shot without freeze-frame data (the cell is NaN): nothing to store
            continue
        ff = json_normalize(frame, sep = "_")
        if 'location' not in ff.columns:
            continue
        # replace the [x, y] list column by two scalar columns and tag every
        # row with the id of the shot it belongs to
        locations = ff.pop('location')
        ff = ff.assign(x = _coordinate(locations, 0),
                       y = _coordinate(locations, 1),
                       id = id_)
        ffs[id_] = ff

#concatenate all the dictionaries
#this creates a multi-index with the dictionary key as first level
df = pd.concat(dfs, axis = 0)

#split locations into x and y components
df['location_x'] = _coordinate(df['location'], 0)
df['location_y'] = _coordinate(df['location'], 1)
df['pass_end_location_x'] = _coordinate(df['pass_end_location'], 0)
df['pass_end_location_y'] = _coordinate(df['pass_end_location'], 1)

#split the shot_end_locations into x,y and z components (some don't include the z-part,
#those rows get NaN for z; rows without a shot end location get NaN throughout)
for axis_position, axis_name in enumerate('xyz'):
    df['shot_end_location_' + axis_name] = _coordinate(df['shot_end_location'], axis_position)

events_df = df.drop(['location', 'pass_end_location', 'shot_end_location'], axis = 1)

#concatenate all the Freeze Frame dataframes
ff_df = pd.concat(ffs, axis = 0)


# MATCHES
# Keep only the '.json' files; this skips stray entries such as macOS'
# '.DS_Store' and makes the file[:-5] slice below strip exactly '.json'.
# NOTE(review): only files lying directly inside matches/ are picked up. If
# your copy of the data nests them in sub-folders, nothing is found here and
# pd.concat below fails on an empty dict - confirm against your download.
files = [f for f in listdir(mypath+'matches/')
         if f.endswith('.json') and isfile(join(mypath+'matches/', f))]

matches_dfs = {}
for file in files:
    # JSON is UTF-8; be explicit so team/stadium names survive on platforms
    # whose default encoding differs
    with open(mypath+'matches/'+file, encoding='utf-8') as data_file:
        data = json.load(data_file)
    #get the nested structure into a dataframe
    df_ = json_normalize(data, sep = "_")
    #store the dataframe in a dictionary, keyed by the file name without
    #'.json' (the competition id, according to the original author)
    matches_dfs[file[:-5]] = df_

#the dictionary keys become the first level of the resulting multi-index
matches_df = pd.concat(matches_dfs)


# LINEUPS w Minutes played
# Keep only the '.json' files; this skips stray entries such as macOS'
# '.DS_Store' and makes the file[:-5] slice below strip exactly '.json'.
files = [f for f in listdir(mypath+'lineups/')
         if f.endswith('.json') and isfile(join(mypath+'lineups/', f))]

dfs = {}

for file in files:
    # JSON is UTF-8; be explicit so player names survive on platforms whose
    # default encoding differs
    with open(mypath+'lineups/'+file, encoding='utf-8') as data_file:
        data = json.load(data_file)
    #get the nested structure into a dataframe: one row per team, the
    #'lineup' cell holds that team's list of players
    df = json_normalize(data, sep = "_").assign(match_id = file[:-5])
    #expand every team's player list and tag each player with team and match
    #(loops over all rows instead of hard-coding exactly two teams)
    dfs[file[:-5]] = pd.concat([
        json_normalize(team['lineup'], sep = "_").assign(
            team_id = team['team_id'],
            team_name = team['team_name'],
            match_id = team['match_id'])
        for _, team in df.iterrows()])

lineups_df = pd.concat(list(dfs.values()))

# get the lengths of matches (minute of the last recorded event per match)
match_lengths = events_df.groupby('match_id')['minute'].max()

# get all substitutions
# after reset_index the unnamed first index level of events_df (the match key)
# shows up as 'level_0'; it becomes the new index under the name 'match_id'
substitutions = events_df.loc[events_df.substitution_outcome_name.notnull(),
                               ['minute', 'player_name', 'substitution_replacement_name']].\
                    reset_index().\
                    drop('id', axis = 1).\
                    rename(columns = {'level_0': 'match_id'}).\
                    set_index(['match_id'])

# assign all minutes played to the lineups_df
# every listed player starts out with the full match length ...
# NOTE(review): a listed player who never takes part in a substitution keeps
# the full match length - presumably fine if lineups only list players who
# started; verify against the data.
a = lineups_df.reset_index().set_index('match_id').assign(minutes_played = match_lengths)

# ... and every substitution then only subtracts the part of the match the
# player was NOT on the pitch. Subtracting (rather than overwriting with the
# substitution minute) keeps the result correct for a substitute who is
# later substituted off again.
for idx, row in substitutions.iterrows():
    in_match = a.index == idx
    went_off = in_match & (a.player_name == row.player_name)
    came_on = in_match & (a.player_name == row.substitution_replacement_name)
    # leaving early costs the minutes that remain after the substitution
    a.loc[went_off, 'minutes_played'] -= match_lengths[idx] - row.minute
    # entering late costs the minutes played before the substitution
    a.loc[came_on, 'minutes_played'] -= row.minute

lineups_df = a.reset_index().set_index(['match_id', 'index'])

Save the data

HDF files provide an easy and fast way to store and read larger datasets.

# Write each of the four tables to its own HDF file next to the raw data,
# using the given key inside the file.
for frame, filename, hdf_key in (
        (events_df, 'Statsbomb_Data_df.hdf', 'df'),
        (ff_df, 'Statsbomb_Data_ff_df.hdf', 'ff_df'),
        (matches_df, 'Statsbomb_Data_matches_df.hdf', 'matches_df'),
        (lineups_df, 'Statsbomb_Data_lineups_df.hdf', 'lineups_df'),
):
    frame.to_hdf(mypath + filename, key = hdf_key)

Read the data

# Load the four stored tables back, in the same order they were saved:
# events, freeze frames, matches, lineups.
df, ff_df, matches_df, lineups_df = (
    pd.read_hdf(mypath + filename)
    for filename in (
        'Statsbomb_Data_df.hdf',
        'Statsbomb_Data_ff_df.hdf',
        'Statsbomb_Data_matches_df.hdf',
        'Statsbomb_Data_lineups_df.hdf',
    )
)
# preview the first rows of the events table
df.head()
50_50_outcome_id 50_50_outcome_name bad_behaviour_card_id bad_behaviour_card_name ball_receipt_outcome_id ball_receipt_outcome_name ball_recovery_offensive ball_recovery_recovery_failure block_deflection block_offensive ... type_id type_name under_pressure location_x location_y pass_end_location_x pass_end_location_y shot_end_location_x shot_end_location_y shot_end_location_z
id
19714 85e3649d-96cf-43b0-9327-8f7e847f9c2d NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 35 Starting XI NaN NaN NaN NaN NaN NaN NaN NaN
a9f001b3-0beb-4fe3-9562-eaf5298a69b8 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 35 Starting XI NaN NaN NaN NaN NaN NaN NaN NaN
2b624578-b4b5-4bfd-a8c0-531d9717ac19 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 18 Half Start NaN NaN NaN NaN NaN NaN NaN NaN
e4989789-13c9-48ca-8406-5587f40be36e NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 18 Half Start NaN NaN NaN NaN NaN NaN NaN NaN
226163ce-4a29-45b0-90f5-6eb02594223b NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 30 Pass NaN 61.0 41.0 50.0 35.0 NaN NaN NaN

5 rows × 121 columns

# Preview the first rows of the freeze-frame table (output pasted below).
ff_df.head()
player_id player_name position_id position_name teammate x y id
000e60b5-955a-4c75-8874-f8b5e4579abf 0 15614 Sophie Elizabeth Bradley-Auckland 4 Center Back False 109.0 41.0 000e60b5-955a-4c75-8874-f8b5e4579abf
1 15618 Jasmine Matthews 5 Left Center Back False 106.0 43.0 000e60b5-955a-4c75-8874-f8b5e4579abf
2 15626 Anke Preuß 1 Goalkeeper False 119.0 43.0 000e60b5-955a-4c75-8874-f8b5e4579abf
3 15629 Satara Murray 3 Right Center Back False 103.0 27.0 000e60b5-955a-4c75-8874-f8b5e4579abf
4 15616 Kim Little 15 Left Center Midfield True 100.0 34.0 000e60b5-955a-4c75-8874-f8b5e4579abf
# Preview the first rows of the matches table (output pasted below).
matches_df.head()
away_score away_team_away_team_id away_team_away_team_name competition_competition_id competition_competition_name competition_country_name data_version home_score home_team_home_team_id home_team_home_team_name kick_off last_updated match_date match_id match_status referee_name season_season_id season_season_name stadium_name
37 0 0 746 Manchester City WFC 37 FA Women's Super League England 1.0.3 0 971 Chelsea LFC 15:00:00.000 2018-10-16T17:04:34.945 2018-09-09 19714 available None 4 2018/2019 Cherry Red Records Fans' Stadium
1 0 967 Everton LFC 37 FA Women's Super League England 1.0.3 1 969 Birmingham City WFC 15:00:00.000 2018-09-14T14:42:50.415805 2018-09-09 19718 available None 4 2018/2019 Damson Park
2 0 970 Yeovil Town LFC 37 FA Women's Super League England 1.0.3 4 974 Reading WFC 15:00:00.000 2018-10-16T17:02:59.817 2018-09-09 19716 available None 4 2018/2019 Adams Park
3 0 966 Liverpool WFC 37 FA Women's Super League England 1.0.3 5 968 Arsenal WFC 13:30:00.000 2018-10-16T17:03:47.733 2018-09-09 19717 available None 4 2018/2019 Meadow Park
4 1 973 Bristol City WFC 37 FA Women's Super League England 1.0.3 0 965 Brighton & Hove Albion WFC 15:00:00.000 2018-10-16T17:05:18.428 2018-09-09 19715 available None 4 2018/2019 Broadfield Stadium
# Preview the first rows of the lineups table, including minutes_played
# (output pasted below).
lineups_df.head()
country_id country_name jersey_number player_id player_name team_id team_name minutes_played
match_id index
7298 0 220.0 Sweden 16 4633 Magdalena Ericsson 971 Chelsea LFC 94
1 241.0 United States of America 19 4634 Crystal Dunn 971 Chelsea LFC 94
2 171.0 Norway 2 4636 Maria Thorisdottir 971 Chelsea LFC 12
3 68.0 England 24 4638 Drew Spence 971 Chelsea LFC 55
4 171.0 Norway 18 4639 Maren Mjelde 971 Chelsea LFC 94