Getting Started with StatsBomb Data
1) Get the Data
Go to https://github.com/statsbomb/open-data, click the green ‘Clone or download’ button and choose ‘Download ZIP’. Save the archive to your hard drive and unpack it.
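If you prefer to script this step, the snippet below is a minimal sketch that downloads and unpacks the archive with the standard library. It assumes the repository's default branch is still called master and extracts into the current directory; the archive is fairly large, so this can take a while.
import io
import zipfile
import urllib.request
# GitHub serves a ZIP snapshot of a branch under /archive/<branch>.zip
url = 'https://github.com/statsbomb/open-data/archive/master.zip'
# download the archive into memory and unpack it;
# this creates an 'open-data-master/' folder in the current directory
with urllib.request.urlopen(url) as response:
    archive = zipfile.ZipFile(io.BytesIO(response.read()))
archive.extractall('.')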
2) Take the time to read their terms and conditions.
From their GitHub page:
“Whilst we are keen to share data and facilitate research, we also urge you to be responsible with the data. Please register your details on https://www.statsbomb.com/resource-centre and read our User Agreement carefully.”
This article is a great starting point for working with StatsBomb data and showcases pandas’ json_normalize function:
https://towardsdatascience.com/advanced-sports-visualization-with-pandas-matplotlib-and-seaborn-9c16df80a81b
json_normalize gives us an easy way to take the nested JSON structure and turn it into a tidy DataFrame.
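To see what json_normalize does, here is a tiny, made-up record loosely shaped like a StatsBomb event (not real data): nested dictionaries become flat, underscore-separated columns.
from pandas import json_normalize  # pandas >= 0.25; older versions: from pandas.io.json import json_normalize
# a toy nested record, loosely shaped like a StatsBomb event
record = [{
    'id': 'abc-123',
    'type': {'id': 30, 'name': 'Pass'},
    'location': [61.0, 41.0],
}]
# nested dicts are flattened into columns such as type_id and type_name;
# list values like location stay in a single cell (we split those ourselves below)
flat = json_normalize(record, sep = '_')
print(flat)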
3) Process the data and combine it into DataFrames
This will create one DataFrame for all events, one for all freeze frames, one for all lineup information (including minutes played), and one for all matches.
So far I had relied on R to process the data (FC_rStats made a great series of posts here: https://github.com/FCrSTATS/StatsBomb_WomensData); now I can get all the information StatsBomb provides with Python, including all the freeze frame information.
import json
import pandas as pd
import numpy as np
from pandas import json_normalize  # on pandas < 0.25 use: from pandas.io.json import json_normalize
from os import listdir
from os.path import isfile, join
# set mypath to your open-data-master/data/ path
mypath = ''
# EVENTS AND FREEZE-FRAMES
files = [f for f in listdir(mypath+'events/') if isfile(join(mypath+'events/', f))]
try:  # if you're on macOS like I am, this file might mess with you, so try removing it
    files.remove('.DS_Store')
except ValueError:
    pass
dfs = {}
ffs = {}
for file in files:
    with open(mypath+'events/'+file) as data_file:
        data = json.load(data_file)
    # get the nested structure into a DataFrame
    df = json_normalize(data, sep = "_").assign(match_id = file[:-5])
    # store the DataFrame in a dictionary with the match id as key (strip '.json' from the filename)
    dfs[file[:-5]] = df.set_index('id')
    shots = df.loc[df['type_name'] == 'Shot'].set_index('id')
    # get the freeze frame information for every shot in the match
    for id_, row in shots.iterrows():
        try:
            ff = json_normalize(row.shot_freeze_frame, sep = "_")
            # split the freeze frame locations into x and y components
            ff = ff.assign(x = ff.apply(lambda x: x.location[0], axis = 1)).\
                 assign(y = ff.apply(lambda x: x.location[1], axis = 1)).\
                 drop('location', axis = 1).\
                 assign(id = id_)
            ffs[id_] = ff
        except Exception:
            # some shots come without a freeze frame, skip those
            pass
#concatenate all the dictionaries
#this creates a multi-index with the dictionary key as first level
df = pd.concat(dfs, axis = 0)
#split locations into x and y components
df[['location_x', 'location_y']] = df['location'].apply(pd.Series)
df[['pass_end_location_x', 'pass_end_location_y']] = df['pass_end_location'].apply(pd.Series)
# split shot_end_location into x, y and z components (some shots don't include the z part)
df['shot_end_location_x'], df['shot_end_location_y'], df['shot_end_location_z'] = np.nan, np.nan, np.nan
end_locations = np.vstack(df.loc[df.type_name == 'Shot'].shot_end_location.apply(
    lambda x: x if len(x) == 3 else x + [np.nan]).values)
df.loc[df.type_name == 'Shot', 'shot_end_location_x'] = end_locations[:, 0]
df.loc[df.type_name == 'Shot', 'shot_end_location_y'] = end_locations[:, 1]
df.loc[df.type_name == 'Shot', 'shot_end_location_z'] = end_locations[:, 2]
events_df = df.drop(['location', 'pass_end_location', 'shot_end_location'], axis = 1)
#concatenate all the Freeze Frame dataframes
ff_df = pd.concat(ffs, axis = 0)
# MATCHES
files = [f for f in listdir(mypath+'matches/') if isfile(join(mypath+'matches/', f))]
try:
    files.remove('.DS_Store')
except ValueError:
    pass
matches_dfs = {}
for file in files:
    with open(mypath+'matches/'+file) as data_file:
        data = json.load(data_file)
    # get the nested structure into a DataFrame
    df_ = json_normalize(data, sep = "_")
    # store the DataFrame in a dictionary with the competition id as key
    matches_dfs[file[:-5]] = df_
matches_df = pd.concat(matches_dfs)
# LINEUPS w Minutes played
files = [f for f in listdir(mypath+'lineups/') if isfile(join(mypath+'lineups/', f))]
try:  # again, remove macOS's .DS_Store if it is there
    files.remove('.DS_Store')
except ValueError:
    pass
dfs = {}
for file in files:
    with open(mypath+'lineups/'+file) as data_file:
        data = json.load(data_file)
    # get the nested structure into a DataFrame (one row per team)
    df = json_normalize(data, sep = "_").assign(match_id = file[:-5])
    # unpack the nested lineup of each of the two teams
    df_1 = json_normalize(df.lineup.iloc[0], sep = "_").assign(
        team_id = df.team_id.iloc[0],
        team_name = df.team_name.iloc[0],
        match_id = df.match_id.iloc[0])
    df_2 = json_normalize(df.lineup.iloc[1], sep = "_").assign(
        team_id = df.team_id.iloc[1],
        team_name = df.team_name.iloc[1],
        match_id = df.match_id.iloc[1])
    dfs[file[:-5]] = pd.concat([df_1, df_2])
lineups_df = pd.concat(dfs.values())
# get the lengths of matches
match_lengths = events_df.groupby('match_id')['minute'].max()
# get all substitutions
substitutions = events_df.loc[events_df.substitution_outcome_name.notnull(),
                              ['minute', 'player_name', 'substitution_replacement_name']].\
    reset_index().\
    drop('id', axis = 1).\
    rename(columns = {'level_0': 'match_id'}).\
    set_index(['match_id'])
# assign minutes played to the lineups_df: every player starts with the full match length,
# then each substitution corrects the minutes of the player coming off and of the replacement
a = lineups_df.reset_index().set_index('match_id').assign(minutes_played = match_lengths)
for idx, row in substitutions.iterrows():
    a.loc[(a.index == idx) & (a.player_name == row.player_name), 'minutes_played'] = row.minute
    a.loc[(a.index == idx) & (a.player_name == row.substitution_replacement_name), 'minutes_played'] = \
        a.loc[(a.index == idx) & (a.player_name == row.substitution_replacement_name), 'minutes_played'] - row.minute
lineups_df = a.reset_index().set_index(['match_id', 'index'])
Save the data
HDF files provide a fast and convenient way to store and reload larger DataFrames (pandas’ to_hdf requires the PyTables package).
events_df.to_hdf(mypath+'Statsbomb_Data_df.hdf', key = 'df')
ff_df.to_hdf(mypath+'Statsbomb_Data_ff_df.hdf', key = 'ff_df')
matches_df.to_hdf(mypath+'Statsbomb_Data_matches_df.hdf', key = 'matches_df')
lineups_df.to_hdf(mypath+'Statsbomb_Data_lineups_df.hdf', key = 'lineups_df')
Read the data
df = pd.read_hdf(mypath+'Statsbomb_Data_df.hdf')
ff_df = pd.read_hdf(mypath+'Statsbomb_Data_ff_df.hdf')
matches_df = pd.read_hdf(mypath+'Statsbomb_Data_matches_df.hdf')
lineups_df = pd.read_hdf(mypath+'Statsbomb_Data_lineups_df.hdf')
df.head()
| | id | 50_50_outcome_id | 50_50_outcome_name | bad_behaviour_card_id | bad_behaviour_card_name | ball_receipt_outcome_id | ball_receipt_outcome_name | ball_recovery_offensive | ball_recovery_recovery_failure | block_deflection | block_offensive | ... | type_id | type_name | under_pressure | location_x | location_y | pass_end_location_x | pass_end_location_y | shot_end_location_x | shot_end_location_y | shot_end_location_z |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19714 | 85e3649d-96cf-43b0-9327-8f7e847f9c2d | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 35 | Starting XI | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| | a9f001b3-0beb-4fe3-9562-eaf5298a69b8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 35 | Starting XI | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| | 2b624578-b4b5-4bfd-a8c0-531d9717ac19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 18 | Half Start | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| | e4989789-13c9-48ca-8406-5587f40be36e | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 18 | Half Start | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| | 226163ce-4a29-45b0-90f5-6eb02594223b | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 30 | Pass | NaN | 61.0 | 41.0 | 50.0 | 35.0 | NaN | NaN | NaN |
5 rows × 121 columns
ff_df.head()
| | | player_id | player_name | position_id | position_name | teammate | x | y | id |
|---|---|---|---|---|---|---|---|---|---|
| 000e60b5-955a-4c75-8874-f8b5e4579abf | 0 | 15614 | Sophie Elizabeth Bradley-Auckland | 4 | Center Back | False | 109.0 | 41.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
| | 1 | 15618 | Jasmine Matthews | 5 | Left Center Back | False | 106.0 | 43.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
| | 2 | 15626 | Anke Preuß | 1 | Goalkeeper | False | 119.0 | 43.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
| | 3 | 15629 | Satara Murray | 3 | Right Center Back | False | 103.0 | 27.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
| | 4 | 15616 | Kim Little | 15 | Left Center Midfield | True | 100.0 | 34.0 | 000e60b5-955a-4c75-8874-f8b5e4579abf |
matches_df.head()
| | | away_score | away_team_away_team_id | away_team_away_team_name | competition_competition_id | competition_competition_name | competition_country_name | data_version | home_score | home_team_home_team_id | home_team_home_team_name | kick_off | last_updated | match_date | match_id | match_status | referee_name | season_season_id | season_season_name | stadium_name |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 37 | 0 | 0 | 746 | Manchester City WFC | 37 | FA Women's Super League | England | 1.0.3 | 0 | 971 | Chelsea LFC | 15:00:00.000 | 2018-10-16T17:04:34.945 | 2018-09-09 | 19714 | available | None | 4 | 2018/2019 | Cherry Red Records Fans' Stadium |
| | 1 | 0 | 967 | Everton LFC | 37 | FA Women's Super League | England | 1.0.3 | 1 | 969 | Birmingham City WFC | 15:00:00.000 | 2018-09-14T14:42:50.415805 | 2018-09-09 | 19718 | available | None | 4 | 2018/2019 | Damson Park |
| | 2 | 0 | 970 | Yeovil Town LFC | 37 | FA Women's Super League | England | 1.0.3 | 4 | 974 | Reading WFC | 15:00:00.000 | 2018-10-16T17:02:59.817 | 2018-09-09 | 19716 | available | None | 4 | 2018/2019 | Adams Park |
| | 3 | 0 | 966 | Liverpool WFC | 37 | FA Women's Super League | England | 1.0.3 | 5 | 968 | Arsenal WFC | 13:30:00.000 | 2018-10-16T17:03:47.733 | 2018-09-09 | 19717 | available | None | 4 | 2018/2019 | Meadow Park |
| | 4 | 1 | 973 | Bristol City WFC | 37 | FA Women's Super League | England | 1.0.3 | 0 | 965 | Brighton & Hove Albion WFC | 15:00:00.000 | 2018-10-16T17:05:18.428 | 2018-09-09 | 19715 | available | None | 4 | 2018/2019 | Broadfield Stadium |
lineups_df.head()
| match_id | index | country_id | country_name | jersey_number | player_id | player_name | team_id | team_name | minutes_played |
|---|---|---|---|---|---|---|---|---|---|
| 7298 | 0 | 220.0 | Sweden | 16 | 4633 | Magdalena Ericsson | 971 | Chelsea LFC | 94 |
| | 1 | 241.0 | United States of America | 19 | 4634 | Crystal Dunn | 971 | Chelsea LFC | 94 |
| | 2 | 171.0 | Norway | 2 | 4636 | Maria Thorisdottir | 971 | Chelsea LFC | 12 |
| | 3 | 68.0 | England | 24 | 4638 | Drew Spence | 971 | Chelsea LFC | 55 |
| | 4 | 171.0 | Norway | 18 | 4639 | Maren Mjelde | 971 | Chelsea LFC | 94 |
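With these DataFrames in place it is easy to link them back together. As a small sketch (using whichever shot happens to come first in the events DataFrame), you can look up the freeze frame of a shot via its event id:
# pick one shot from the events DataFrame
shots = df.loc[df.type_name == 'Shot']
shot_id = shots.index.get_level_values('id')[0]
# ff_df carries the shot's event id both in its first index level and in its 'id' column
freeze_frame = ff_df.loc[ff_df.id == shot_id]
print(freeze_frame[['player_name', 'position_name', 'teammate', 'x', 'y']])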