-
Notifications
You must be signed in to change notification settings - Fork 2
/
Masterplot.py
385 lines (319 loc) · 16.9 KB
/
Masterplot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import matplotlib as mpl
mpl.rcParams['font.family'] = 'Candara'
mpl.rcParams['font.weight'] = 'bold'
mpl.rcParams['font.size'] = 12
from adjustText.adjustText import adjust_text
# Object-oriented design: the DataInput class below wraps the csv file and exposes methods to operate on it
class DataInput:
    """Load a player-stats CSV and filter, inspect, save or plot it.

    Attributes:
        data: The table as read from the CSV (indexed by ``Rk``) after the
            name clean-up done in ``__init__``. Filtering never modifies it.
        filtereddf: Working copy that ``filterdata`` narrows down and that
            ``showdata``, ``save_new_csv`` and ``plot`` operate on.
    """

    # Identity / bookkeeping columns that are not plottable statistics.
    _NON_STAT_COLUMNS = ('Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', '90s')

    def __init__(self, filepath_data):
        """Read the CSV at *filepath_data* and tidy its text columns.

        ``Player`` keeps the part after the backslash with hyphens turned
        into spaces, ``Comp`` drops its first whitespace-separated token and
        ``Nation`` keeps only its last token. If any of these steps fails a
        message is printed and the remaining steps are skipped; the data is
        still loaded.
        """
        self.data = pd.read_csv(filepath_data, index_col='Rk')
        try:
            self.data['Player'] = self.data['Player'].apply(
                lambda x: ' '.join(x.split('\\')[1].split('-')))
            self.data['Comp'] = self.data['Comp'].apply(lambda x: ' '.join(x.split()[1:]))
            self.data['Nation'] = self.data['Nation'].apply(lambda x: x.split()[-1])
        except (KeyError, IndexError, AttributeError, TypeError):
            # Missing column, value without the expected separator, or a
            # non-string (e.g. NaN) cell.
            print(f'Error in loading file:{filepath_data}')
        self.filtereddf = self.data.copy()

    def _rows(self, filtered_dataset):
        """Return the filtered frame when *filtered_dataset* is truthy, else the original."""
        return self.filtereddf if filtered_dataset else self.data

    def showdata(self, no=5):
        """Print the first *no* rows of the filtered data."""
        print(self.filtereddf.head(no))

    def filterdata(self, min_90s=5, *pos_to_remove, team: object = False,
                   league: object = False, nation: object = False):
        """Narrow ``filtereddf`` in place.

        Filters are cumulative: each call works on the result of the previous
        one, not on the original data.

        Args:
            min_90s: Minimum value of the ``90s`` column a row must have.
            *pos_to_remove: ``Pos`` values whose rows are dropped.
            team: Keep only rows whose ``Squad`` equals this value.
            league: Keep only rows whose ``Comp`` equals this value.
            nation: Keep only rows whose ``Nation`` equals this value.
        """
        frame = self.filtereddf
        frame = frame[~frame['Pos'].isin(pos_to_remove)]
        frame = frame[frame['90s'] >= min_90s]
        # Falsy values (the defaults) mean "do not filter on this column".
        for column, wanted in (('Squad', team), ('Comp', league), ('Nation', nation)):
            if wanted:
                frame = frame[frame[column] == wanted]
        self.filtereddf = frame

    def show_different_positions(self, filtered_dataset=False):
        """Print the distinct ``Pos`` values of the original or the filtered data."""
        print(self._rows(filtered_dataset)['Pos'].unique())

    def show_different_nations(self, filtered_dataset=False):
        """Print the distinct ``Nation`` values of the original or the filtered data."""
        print(self._rows(filtered_dataset)['Nation'].unique())

    def show_relevant_columns(self, all_cols=False):
        """Print every column name, or only the statistic columns (the default)."""
        columns = list(self.data.columns)
        if not all_cols:
            columns = [col for col in columns if col not in self._NON_STAT_COLUMNS]
        print(columns)

    @property
    def show_leaguenames(self):
        """Print the distinct ``Comp`` values of the original data (accessed as an attribute)."""
        print(self.data['Comp'].unique())

    def show_teams_in_league(self, leaguename):
        """Print the distinct ``Squad`` values of rows whose ``Comp`` is *leaguename*."""
        print(self.data[self.data['Comp'] == leaguename]['Squad'].unique())

    def show_players_in_team(self, teamname):
        """Print ``Player``, ``90s`` and ``Pos`` for rows whose ``Squad`` is *teamname*."""
        squad = self.data[self.data['Squad'] == teamname]
        print(squad[['Player', '90s', 'Pos']].reset_index(drop=True))

    def save_new_csv(self, filename='new_csv', filepath=os.getcwd()):
        """Write the filtered data to ``<filepath>/<filename>.csv``.

        Args:
            filename: File name without the ``.csv`` extension.
            filepath: Target directory. The default is the working directory
                at the time this module was imported.
        """
        target = os.path.join(filepath, filename + '.csv')
        try:
            self.filtereddf.to_csv(target, encoding='utf-8-sig')
        except OSError:
            print('An error occurred in saving the file')
        else:
            print('File has been saved as {0} at {1}'.format(filename, filepath))

    @staticmethod
    def _default_title(df, on_x, on_y, no_of_players, season, p90statement):
        """Build the automatic plot title from what the plotted rows have in common."""
        def only_value(column):
            # nunique() ignores missing values, so skip them here as well.
            return str(df[column].dropna().iloc[0])

        # Only mention "top N" when a limit was actually requested.
        group = f'top {no_of_players} players' if no_of_players else 'players'
        nation = ''
        if df['Nation'].nunique() == 1:
            nation = only_value('Nation') + ' nationality'

        if df['Comp'].nunique() > 1 and df['Squad'].nunique() > 1:
            scope = "across Europe's top 5 leagues"
            if nation:
                scope += ' of ' + nation
        else:
            league = only_value('Comp') if df['Comp'].nunique() == 1 else ''
            team = ''
            if df['Squad'].nunique() == 1:
                # A single team makes the league name redundant.
                team = only_value('Squad')
                league = ''
            parts = [part for part in (league, team, nation) if part]
            scope = 'in ' + ' and of '.join(parts) if parts else ''

        heading = f'{on_x} vs {on_y} {group}'
        if scope:
            heading += ' ' + scope
        return f'{heading} for {season}{p90statement}'

    def plot(self, on_x, on_y, per90=False, no_of_players=False, season='20-21', plot_on_page=True,
             custom_headers=False, figure_size=(12, 8), point_col='red', text_col='black',
             bgcolor='white', annotate=True, player_focus=None, save=False, add_xref=False,
             add_yref=False, add_refline=False, repel=True):
        """Draw a scatter plot of two columns of the filtered data.

        Args:
            on_x: Column plotted on the x axis.
            on_y: Column plotted on the y axis.
            per90: Divide each column by ``90s`` first, unless its name ends
                in ``%`` or ``90``.
            no_of_players: Keep only the top N rows, ranked by the product of
                the min-max scaled x and y values. Skipped when
                ``annotate`` is off and ``player_focus`` is set.
            season: Season text used in the automatic title.
            plot_on_page: Call ``plt.show()`` at the end.
            custom_headers: Prompt on stdin for the title and axis labels.
            figure_size: ``(width, height)`` passed to ``plt.subplots``.
            point_col: Colour of the scatter points.
            text_col: Colour of labels, axis titles and title.
            bgcolor: Background colour of figure and axes.
            annotate: Label every point with its ``Player`` value.
            player_focus: With ``annotate`` off, highlight and label the rows
                whose ``Player`` equals this value.
            save: Prompt on stdin for a location and save the figure as PNG.
            add_xref: x position of a vertical dashed reference line.
            add_yref: y position of a horizontal dashed reference line.
            add_refline: ``(slope, intercept)`` of a dashed reference line.
            repel: Place labels with ``adjust_text``
                (https://github.com/Phlya/adjustText) so they do not overlap,
                instead of using a fixed offset.
        """
        def skips_per90(column):
            # Percentages and already-per-90 stats must not be divided again.
            return column.endswith(('%', '90'))

        def given(value):
            # 0 is a legitimate reference position, so only False/None mean "off".
            return value is not False and value is not None

        df = self.filtereddf.copy().reset_index(drop=True)
        x_axis, y_axis = on_x, on_y
        p90statement = ''
        if per90:
            p90statement = ' per 90'
            # dict.fromkeys de-duplicates so on_x == on_y is divided only once.
            for column in dict.fromkeys((on_x, on_y)):
                if not skips_per90(column):
                    df[column] = df[column] / df['90s']
            if not skips_per90(on_x):
                x_axis += p90statement
            if not skips_per90(on_y):
                y_axis += p90statement

        if no_of_players and (annotate or not player_focus) and not df.empty:
            scaled = MinMaxScaler().fit_transform(df[[on_x, on_y]])
            score = pd.Series(scaled.prod(axis=1), index=df.index)
            best = score.sort_values(ascending=False).index[:no_of_players]
            df = df.loc[best].reset_index(drop=True)

        title = self._default_title(df, on_x, on_y, no_of_players, season, p90statement)
        if custom_headers:
            title = input('Title header: ')
            x_axis = input('X axis title: ')
            y_axis = input('Y axis title: ')

        fig, ax = plt.subplots(figsize=figure_size)
        ax.patch.set_facecolor(bgcolor)
        fig.set_facecolor(bgcolor)
        # Label offset: 1/50th of the plotted range on each axis.
        x_dist = (df[on_x].max() - df[on_x].min()) / 50
        y_dist = (df[on_y].max() - df[on_y].min()) / 50
        label_size = 12

        # SCATTERPLOT (denser plots get more transparent points)
        alpha = 0.75 if len(df.index) < 30 else 0.375
        sns.scatterplot(x=on_x, y=on_y, data=df, color=point_col, edgecolor='black',
                        alpha=alpha, ax=ax)
        ax.grid(True, axis='both', c='gray', ls='--', alpha=0.5)

        # TEXT ANNOTATIONS
        if annotate:
            label_style = dict(color=text_col, size=label_size)
            points = zip(df[on_x], df[on_y], df['Player'])
            if repel:
                texts = [ax.text(x, y, name, fontdict=label_style) for x, y, name in points]
                adjust_text(texts, force_points=0.2, force_text=0.2,
                            expand_points=(1, 1), expand_text=(1, 1),
                            arrowprops=dict(arrowstyle="-", color='white', lw=0.5, alpha=0))
            else:
                for x, y, name in points:
                    ax.text(x - x_dist, y + y_dist, name, fontdict=label_style)
        elif player_focus:
            focus_rows = df[df['Player'] == player_focus]
            if focus_rows.empty:
                print(f'Player {player_focus} was not found in the filtered data')
            for _, row in focus_rows.iterrows():
                ax.scatter(row[on_x], row[on_y], s=50, c='black', ec='black', zorder=2, alpha=0.8)
                ax.text(row[on_x] - x_dist, row[on_y] + y_dist, row['Player'], size=14)

        # FOOTER
        fig.text(0.01, 0.015, 'Made using FootballFbrefPlotting by Shreyas Khatri/@khatri_shreyas. ' +
                 'Code repo present at https://github.com/shreyas7kha/FootballFbrefPlotting.',
                 size=10, weight='bold')

        # SET TITLES AND AXES LABELS
        ax.set_xlabel(x_axis, weight='heavy', family='Century Gothic', size=15, color=text_col)
        ax.set_ylabel(y_axis, weight='heavy', family='Century Gothic', size=15, color=text_col)
        ax.set_title(title.upper(), weight='heavy', family='Century Gothic', size=20, color=text_col)

        # REFERENCE LINES
        if given(add_xref):
            ax.axvline(add_xref, ls='--')
        if given(add_yref):
            ax.axhline(add_yref, ls='--')
        if add_refline:
            slope, intercept = add_refline
            x_vals = np.array(ax.get_xlim())
            ax.plot(x_vals, intercept + slope * x_vals, '--')

        if save:
            file_path = input('Choose filepath you want to save in or click enter if in same directory: ')
            filename = input('Save file as(don\'t add the .png): ')
            if not file_path.strip():
                file_path = os.getcwd()
            answer = input('Do you want to save the image in an uncreated sub directory:')
            # startswith() tolerates an empty answer, which [0] did not.
            if answer.strip().lower().startswith('y'):
                file_path = os.path.join(file_path, input('Name of the sub directory: '))
            try:
                os.makedirs(file_path, exist_ok=True)
                fig.savefig(os.path.join(file_path, filename + '.png'), facecolor=bgcolor, dpi=300)
            except OSError:
                print('An error occurred in saving the plot')
            else:
                print(f'File was saved succesfully at {file_path} with name {filename}')

        if plot_on_page:
            plt.show()
# Often we want to build a plot from two different csv files; the TwoDataInput class below does that.
# TwoDataInput takes 2 csvs as input and merges them; all the methods stay the same as in DataInput.
# combine_and_cleancolumns is a preliminary helper that merges both dataframes and cleans the result.
def combine_and_cleancolumns(d1, d2):
    """Merge two player tables on ``Player`` and ``Squad`` and tidy the result.

    Columns present in both frames are taken from *d1* only, so the result
    carries every column of *d1* followed by the columns unique to *d2*.
    Afterwards ``Player`` keeps the part after the backslash with hyphens
    turned into spaces, ``Comp`` drops its first whitespace-separated token
    and ``Nation`` keeps only its last token.

    Args:
        d1: Left DataFrame; must contain ``Player`` and ``Squad``.
        d2: Right DataFrame; must contain ``Player`` and ``Squad``.

    Returns:
        The merged DataFrame. If the text clean-up fails a message is printed
        and the frame is returned as far as it was cleaned.
    """
    keys = ['Player', 'Squad']
    # Dropping d2's duplicate columns before merging avoids the _x/_y
    # suffixes entirely, so genuine column names are never rewritten.
    d2_only = [col for col in d2.columns if col in keys or col not in d1.columns]
    data = pd.merge(d1, d2[d2_only], on=keys)
    try:
        data['Player'] = data['Player'].apply(lambda x: ' '.join(x.split('\\')[1].split('-')))
        data['Comp'] = data['Comp'].apply(lambda x: ' '.join(x.split()[1:]))
        data['Nation'] = data['Nation'].apply(lambda x: x.split()[-1])
    except (KeyError, IndexError, AttributeError, TypeError):
        # Missing column, value without the expected separator, or a
        # non-string (e.g. NaN) cell.
        print('Error in cleaning the Player, Comp and Nation columns of the merged data')
    return data
class TwoDataInput(DataInput):
    """DataInput built from two CSV files merged into one table.

    Attributes:
        d1: First CSV as read, indexed by ``Rk``.
        d2: Second CSV as read, indexed by ``Rk``.
        data: Result of ``combine_and_cleancolumns(d1, d2)``.
        filtereddf: Working copy of ``data`` used by the inherited methods.
    """

    def __init__(self, filepath_data1, filepath_data2):
        """Read both CSV files and merge them."""
        # DataInput.__init__ is deliberately not called: it would read each
        # file a second time and build data/filtereddf values that are
        # replaced by the merged table right below.
        self.d1 = pd.read_csv(filepath_data1, index_col='Rk')
        self.d2 = pd.read_csv(filepath_data2, index_col='Rk')
        self.data = combine_and_cleancolumns(self.d1, self.d2)
        self.filtereddf = self.data.copy()
# A function which can read data from a Fbref webpage
def readfromhtml(filepath):
    """Read the first HTML table found at *filepath* into a cleaned DataFrame.

    Steps, in order:
      1. Flatten tuple (multi-level) headers to their second level; when that
         name is already taken, use ``"<first level> <second level>"``.
      2. Drop rows whose ``Player`` cell is the literal ``'Player'``.
      3. Replace missing values with the string ``'0'`` and index by ``Rk``.
      4. ``Comp`` drops its first whitespace-separated token and ``Nation``
         keeps only its last token; on failure a message is printed.
      5. Convert every column that parses cleanly to a numeric dtype.

    Args:
        filepath: Anything ``pandas.read_html`` accepts (URL, path, ...).

    Returns:
        The cleaned DataFrame.
    """
    def numeric_or_original(column):
        # Replacement for the deprecated pd.to_numeric(errors='ignore').
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df = pd.read_html(filepath)[0]

    flat_names = []
    for col in df.columns:
        if not isinstance(col, tuple):
            # Single-level header: nothing to flatten.
            flat_names.append(col)
        elif col[1] not in flat_names:
            flat_names.append(col[1])
        else:
            flat_names.append(col[0] + ' ' + col[1])
    df.columns = flat_names

    df = df[df['Player'] != 'Player']
    df = df.fillna('0')
    df = df.set_index('Rk', drop=True)
    try:
        df['Comp'] = df['Comp'].apply(lambda x: ' '.join(x.split()[1:]))
        df['Nation'] = df['Nation'].astype(str).apply(lambda x: x.split()[-1])
    except (KeyError, IndexError, AttributeError):
        print(f'Error in uploading file:{filepath}')
    return df.apply(numeric_or_original)
# Similar to the data input, this particular class takes information directly from
# Fbref's website based on url
class DataInputFromWebpage(DataInput):
    """DataInput whose table comes from ``readfromhtml`` instead of a CSV file.

    ``DataInput.__init__`` is not invoked; ``data`` and ``filtereddf`` are
    set here directly.
    """

    def __init__(self, filepath_data):
        """Load the table at *filepath_data* and keep a working copy of it."""
        table = readfromhtml(filepath_data)
        self.data = table
        self.filtereddf = table.copy()
# This is the two class extension extended to both files from Fbref's website
class TwoDataInputFromWebpage(DataInput):
    """DataInput built from two tables read with ``readfromhtml`` and merged.

    The two tables are joined with pandas' default merge, i.e. on every
    column name they share. ``DataInput.__init__`` is not invoked.
    """

    def __init__(self, filepath_data1, filepath_data2):
        """Load both tables, keep them as ``d1``/``d2`` and store their merge."""
        first = readfromhtml(filepath_data1)
        second = readfromhtml(filepath_data2)
        self.d1 = first
        self.d2 = second
        self.data = first.merge(second)
        self.filtereddf = self.data.copy()
from bs4 import BeautifulSoup as soup
# If you want all data for the big 5 leagues, you just need to run this function with
# the filepath where you want to save all the files
def save_all_csvs(base_url='https://fbref.com/en/comps/Big5/Big-5-European-Leagues-Stats',
                  filepath=os.getcwd()):
    """Download every table linked from *base_url* and save each one as a CSV.

    The links are taken from the fifth ``<ul>`` element of the page. Each
    linked page is read with ``readfromhtml`` and written to
    ``<filepath>/<name>.csv``, where ``<name>`` is the seventh ``/``-separated
    piece of the link's absolute URL.

    Args:
        base_url: Page that lists the links to follow.
        filepath: Target directory. The default is the working directory at
            the time this module was imported.

    Raises:
        requests.HTTPError: If *base_url* answers with an error status.
        IndexError: If the page has fewer than five ``<ul>`` elements.
    """
    from urllib.parse import urljoin

    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    page = soup(response.content, 'lxml')
    lists = page.find_all('ul')
    if len(lists) < 5:
        raise IndexError('Expected at least 5 <ul> elements at ' + base_url)

    # urljoin resolves site-relative links against whatever host base_url
    # uses, instead of assuming the host part is exactly 17 characters long.
    urls = [urljoin(base_url, link['href']) for link in lists[4].find_all('a', href=True)]
    for url in urls:
        df = readfromhtml(url)
        filename = url.split('/')[6]
        try:
            df.to_csv(os.path.join(filepath, filename + '.csv'), encoding='utf-8-sig')
        except OSError:
            print('An error occurred in saving the file')
        else:
            print('File has been saved as {0} at {1}'.format(filename, filepath))