Getting Started with StatsBomb Data
1) Get the Data
Go to https://github.com/statsbomb/open-data, click the green ‘Clone or download’ button and choose ‘Download ZIP’. Save the archive to your hard drive and unpack it.
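If you prefer to script this step, the snippet below is a minimal sketch that downloads and unpacks the archive with the standard library. It assumes the repository's default branch is still called master and extracts into the current directory; the archive is fairly large, so this can take a while.
import io
import zipfile
import urllib.request
# GitHub serves a ZIP snapshot of a branch under /archive/<branch>.zip
url = 'https://github.com/statsbomb/open-data/archive/master.zip'
# download the archive into memory and unpack it;
# this creates an 'open-data-master/' folder in the current directory
with urllib.request.urlopen(url) as response:
    archive = zipfile.ZipFile(io.BytesIO(response.read()))
archive.extractall('.')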
2) Take the time to read their terms and conditions.
From their GitHub page:
“Whilst we are keen to share data and facilitate research, we also urge you to be responsible with the data. Please register your details on https://www.statsbomb.com/resource-centre and read our User Agreement carefully.”
This article is a great starting point for working with StatsBomb data and showcases pandas’ json_normalize function:
https://towardsdatascience.com/advanced-sports-visualization-with-pandas-matplotlib-and-seaborn-9c16df80a81b
json_normalize gives us an easy way to take the nested JSON structure and turn it into a tidy DataFrame.
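To see what json_normalize does, here is a tiny, made-up record loosely shaped like a StatsBomb event (not real data): nested dictionaries become flat, underscore-separated columns.
from pandas import json_normalize  # pandas >= 0.25; older versions: from pandas.io.json import json_normalize
# a toy nested record, loosely shaped like a StatsBomb event
record = [{
    'id': 'abc-123',
    'type': {'id': 30, 'name': 'Pass'},
    'location': [61.0, 41.0],
}]
# nested dicts are flattened into columns such as type_id and type_name;
# list values like location stay in a single cell (we split those ourselves below)
flat = json_normalize(record, sep = '_')
print(flat)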
3) Process the data and combine it into DataFrames
This will create one DataFrame for all events, one for all freeze frames, one for all lineup information (including minutes played), and one for all matches.
So far I had relied on R to process the data (FC_rStats made a great series of posts here: https://github.com/FCrSTATS/StatsBomb_WomensData); now I can get all the information StatsBomb provides with Python, including all the freeze frame information.
import json
import pandas as pd
import numpy as np
from pandas import json_normalize  # on pandas < 0.25 use: from pandas.io.json import json_normalize
from os import listdir
from os.path import isfile, join
# set mypath to your open-data-master/data/ path
mypath = ''
# EVENTS AND FREEZE-FRAMES
files = [f for f in listdir(mypath+'events/') if isfile(join(mypath+'events/', f))]
try:  # if you're on macOS like I am, this file might mess with you, so try removing it
    files.remove('.DS_Store')
except ValueError:
    pass
dfs = {}
ffs = {}
for file in files:
    with open(mypath+'events/'+file) as data_file:
        data = json.load(data_file)
    # get the nested structure into a DataFrame
    df = json_normalize(data, sep = "_").assign(match_id = file[:-5])
    # store the DataFrame in a dictionary with the match id as key (strip '.json' from the filename)
    dfs[file[:-5]] = df.set_index('id')
    shots = df.loc[df['type_name'] == 'Shot'].set_index('id')
    # get the freeze frame information for every shot in the match
    for id_, row in shots.iterrows():
        try:
            ff = json_normalize(row.shot_freeze_frame, sep = "_")
            # split the freeze frame locations into x and y components
            ff = ff.assign(x = ff.apply(lambda x: x.location[0], axis = 1)).\
                 assign(y = ff.apply(lambda x: x.location[1], axis = 1)).\
                 drop('location', axis = 1).\
                 assign(id = id_)
            ffs[id_] = ff
        except Exception:
            # some shots come without a freeze frame, skip those
            pass
#concatenate all the dictionaries
#this creates a multi-index with the dictionary key as first level
df = pd.concat(dfs, axis = 0)
#split locations into x and y components
df[['location_x', 'location_y']] = df['location'].apply(pd.Series)
df[['pass_end_location_x', 'pass_end_location_y']] = df['pass_end_location'].apply(pd.Series)
# split shot_end_location into x, y and z components (some shots don't include the z part)
df['shot_end_location_x'], df['shot_end_location_y'], df['shot_end_location_z'] = np.nan, np.nan, np.nan
end_locations = np.vstack(df.loc[df.type_name == 'Shot'].shot_end_location.apply(
    lambda x: x if len(x) == 3 else x + [np.nan]).values)
df.loc[df.type_name == 'Shot', 'shot_end_location_x'] = end_locations[:, 0]
df.loc[df.type_name == 'Shot', 'shot_end_location_y'] = end_locations[:, 1]
df.loc[df.type_name == 'Shot', 'shot_end_location_z'] = end_locations[:, 2]
events_df = df.drop(['location', 'pass_end_location', 'shot_end_location'], axis = 1)
#concatenate all the Freeze Frame dataframes
ff_df = pd.concat(ffs, axis = 0)
# MATCHES
files = [f for f in listdir(mypath+'matches/') if isfile(join(mypath+'matches/', f))]
try:
    files.remove('.DS_Store')
except ValueError:
    pass
matches_dfs = {}
for file in files:
    with open(mypath+'matches/'+file) as data_file:
        data = json.load(data_file)
    # get the nested structure into a DataFrame
    df_ = json_normalize(data, sep = "_")
    # store the DataFrame in a dictionary with the competition id as key
    matches_dfs[file[:-5]] = df_
matches_df = pd.concat(matches_dfs)
# LINEUPS w Minutes played
files = [f for f in listdir(mypath+'lineups/') if isfile(join(mypath+'lineups/', f))]
try:  # again, remove macOS's .DS_Store if it is there
    files.remove('.DS_Store')
except ValueError:
    pass
dfs = {}
for file in files:
    with open(mypath+'lineups/'+file) as data_file:
        data = json.load(data_file)
    # get the nested structure into a DataFrame (one row per team)
    df = json_normalize(data, sep = "_").assign(match_id = file[:-5])
    # unpack the nested lineup of each of the two teams
    df_1 = json_normalize(df.lineup.iloc[0], sep = "_").assign(
        team_id = df.team_id.iloc[0],
        team_name = df.team_name.iloc[0],
        match_id = df.match_id.iloc[0])
    df_2 = json_normalize(df.lineup.iloc[1], sep = "_").assign(
        team_id = df.team_id.iloc[1],
        team_name = df.team_name.iloc[1],
        match_id = df.match_id.iloc[1])
    dfs[file[:-5]] = pd.concat([df_1, df_2])
lineups_df = pd.concat(dfs.values())
# get the lengths of matches
match_lengths = events_df.groupby('match_id')['minute'].max()
# get all substitutions
substitutions = events_df.loc[events_df.substitution_outcome_name.notnull(),
                              ['minute', 'player_name', 'substitution_replacement_name']].\
    reset_index().\
    drop('id', axis = 1).\
    rename(columns = {'level_0': 'match_id'}).\
    set_index(['match_id'])
# assign minutes played to the lineups_df: every player starts with the full match length,
# then each substitution corrects the minutes of the player coming off and of the replacement
a = lineups_df.reset_index().set_index('match_id').assign(minutes_played = match_lengths)
for idx, row in substitutions.iterrows():
    a.loc[(a.index == idx) & (a.player_name == row.player_name), 'minutes_played'] = row.minute
    a.loc[(a.index == idx) & (a.player_name == row.substitution_replacement_name), 'minutes_played'] = \
        a.loc[(a.index == idx) & (a.player_name == row.substitution_replacement_name), 'minutes_played'] - row.minute
lineups_df = a.reset_index().set_index(['match_id', 'index'])
Save the data
HDF files provide a fast and convenient way to store and reload larger DataFrames (pandas’ to_hdf requires the PyTables package).
events_df.to_hdf(mypath+'Statsbomb_Data_df.hdf', key = 'df')
ff_df.to_hdf(mypath+'Statsbomb_Data_ff_df.hdf', key = 'ff_df')
matches_df.to_hdf(mypath+'Statsbomb_Data_matches_df.hdf', key = 'matches_df')
lineups_df.to_hdf(mypath+'Statsbomb_Data_lineups_df.hdf', key = 'lineups_df')
Read the data
df = pd.read_hdf(mypath+'Statsbomb_Data_df.hdf')
ff_df = pd.read_hdf(mypath+'Statsbomb_Data_ff_df.hdf')
matches_df = pd.read_hdf(mypath+'Statsbomb_Data_matches_df.hdf')
lineups_df = pd.read_hdf(mypath+'Statsbomb_Data_lineups_df.hdf')
df.head()
| | id | 50_50_outcome_id | 50_50_outcome_name | bad_behaviour_card_id | bad_behaviour_card_name | ball_receipt_outcome_id | ball_receipt_outcome_name | ball_recovery_offensive | ball_recovery_recovery_failure | block_deflection | block_offensive | ... | type_id | type_name | under_pressure | location_x | location_y | pass_end_location_x | pass_end_location_y | shot_end_location_x | shot_end_location_y | shot_end_location_z |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19714 | 85e3649d-96cf-43b0-9327-8f7e847f9c2d | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 35 | Starting XI | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| | a9f001b3-0beb-4fe3-9562-eaf5298a69b8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 35 | Starting XI | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| | 2b624578-b4b5-4bfd-a8c0-531d9717ac19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 18 | Half Start | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| | e4989789-13c9-48ca-8406-5587f40be36e | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 18 | Half Start | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| | 226163ce-4a29-45b0-90f5-6eb02594223b | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 30 | Pass | NaN | 61.0 | 41.0 | 50.0 | 35.0 | NaN | NaN | NaN |
5 rows × 121 columns
ff_df.head()
| | | player_id | player_name | position_id | position_name | teammate | x | y | id |
|---|---|---|---|---|---|---|---|---|---|
| 000e60b5-955a-4c75-8874-f8b5e4579abf | 0 | 15614 | Sophie Elizabeth Bradley-Auckland | 4 | Center Back | False | 109.0 | 41.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
| | 1 | 15618 | Jasmine Matthews | 5 | Left Center Back | False | 106.0 | 43.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
| | 2 | 15626 | Anke Preuß | 1 | Goalkeeper | False | 119.0 | 43.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
| | 3 | 15629 | Satara Murray | 3 | Right Center Back | False | 103.0 | 27.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
| | 4 | 15616 | Kim Little | 15 | Left Center Midfield | True | 100.0 | 34.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
matches_df.head()
| | | away_score | away_team_away_team_id | away_team_away_team_name | competition_competition_id | competition_competition_name | competition_country_name | data_version | home_score | home_team_home_team_id | home_team_home_team_name | kick_off | last_updated | match_date | match_id | match_status | referee_name | season_season_id | season_season_name | stadium_name |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 37 | 0 | 0 | 746 | Manchester City WFC | 37 | FA Women's Super League | England | 1.0.3 | 0 | 971 | Chelsea LFC | 15:00:00.000 | 2018-10-16T17:04:34.945 | 2018-09-09 | 19714 | available | None | 4 | 2018/2019 | Cherry Red Records Fans' Stadium |
| | 1 | 0 | 967 | Everton LFC | 37 | FA Women's Super League | England | 1.0.3 | 1 | 969 | Birmingham City WFC | 15:00:00.000 | 2018-09-14T14:42:50.415805 | 2018-09-09 | 19718 | available | None | 4 | 2018/2019 | Damson Park |
| | 2 | 0 | 970 | Yeovil Town LFC | 37 | FA Women's Super League | England | 1.0.3 | 4 | 974 | Reading WFC | 15:00:00.000 | 2018-10-16T17:02:59.817 | 2018-09-09 | 19716 | available | None | 4 | 2018/2019 | Adams Park |
| | 3 | 0 | 966 | Liverpool WFC | 37 | FA Women's Super League | England | 1.0.3 | 5 | 968 | Arsenal WFC | 13:30:00.000 | 2018-10-16T17:03:47.733 | 2018-09-09 | 19717 | available | None | 4 | 2018/2019 | Meadow Park |
| | 4 | 1 | 973 | Bristol City WFC | 37 | FA Women's Super League | England | 1.0.3 | 0 | 965 | Brighton & Hove Albion WFC | 15:00:00.000 | 2018-10-16T17:05:18.428 | 2018-09-09 | 19715 | available | None | 4 | 2018/2019 | Broadfield Stadium |
lineups_df.head()
| match_id | index | country_id | country_name | jersey_number | player_id | player_name | team_id | team_name | minutes_played |
|---|---|---|---|---|---|---|---|---|---|
| 7298 | 0 | 220.0 | Sweden | 16 | 4633 | Magdalena Ericsson | 971 | Chelsea LFC | 94 |
| | 1 | 241.0 | United States of America | 19 | 4634 | Crystal Dunn | 971 | Chelsea LFC | 94 |
| | 2 | 171.0 | Norway | 2 | 4636 | Maria Thorisdottir | 971 | Chelsea LFC | 12 |
| | 3 | 68.0 | England | 24 | 4638 | Drew Spence | 971 | Chelsea LFC | 55 |
| | 4 | 171.0 | Norway | 18 | 4639 | Maren Mjelde | 971 | Chelsea LFC | 94 |
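With these DataFrames in place it is easy to link them back together. As a small sketch (using whichever shot happens to come first in the events DataFrame), you can look up the freeze frame of a shot via its event id:
# pick one shot from the events DataFrame
shots = df.loc[df.type_name == 'Shot']
shot_id = shots.index.get_level_values('id')[0]
# ff_df carries the shot's event id both in its first index level and in its 'id' column
freeze_frame = ff_df.loc[ff_df.id == shot_id]
print(freeze_frame[['player_name', 'position_name', 'teammate', 'x', 'y']])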