-
Notifications
You must be signed in to change notification settings - Fork 7
/
nba.py
62 lines (52 loc) · 2.32 KB
/
nba.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
import numpy as np
import scipy.stats as stats
import re
def clear_data(string1):
    """Strip bracketed footnote markers such as ``[note 1]`` from text.

    A marker is an opening bracket, zero or more lowercase ASCII letters,
    a single space, one or more digits, and a closing bracket.

    Args:
        string1: The text to clean.

    Returns:
        ``string1`` with every marker removed. Text that contains no
        marker is returned unchanged.
    """
    # A single substitution pass removes *every* marker. Searching once and
    # str.replace-ing the matched text would leave behind any later marker
    # whose text differs from the first one (e.g. "[note 1]" then "[note 2]").
    return re.sub(r'\[[a-z]* [0-9]+\]', '', string1)
def clear_nba_data(string1):
    """Strip the parenthesised number marker (and optional ``*``) from text.

    Removes an optional ``*`` followed by one space and a parenthesised,
    possibly empty, run of digits, e.g. ``"Raptors* (1)"`` -> ``"Raptors"``
    and ``"Celtics (2)"`` -> ``"Celtics"``.

    Args:
        string1: The text to clean.

    Returns:
        ``string1`` with every such marker removed. Text that contains no
        marker is returned unchanged.
    """
    # One pattern covers both the starred and unstarred form, and re.sub
    # removes every occurrence in a single pass instead of searching twice.
    # The separator class also accepts a non-breaking space (\xa0).
    # NOTE(review): scraped tables often use NBSP there -- confirm against
    # the actual CSV contents.
    return re.sub(r'\*?[ \xa0]\([0-9]*\)', '', string1)
def get_area(team):
    """Look up the metropolitan area for a team via ``nba_cities``.

    Walks the index labels of the module-level ``nba_cities`` frame in
    order and picks the first label that contains ``team`` as a substring.

    Args:
        team: Text to search for inside the index labels.

    Returns:
        The ``'Metropolitan area'`` value stored under the first matching
        label, or ``None`` when no label contains ``team``.
    """
    for label in nba_cities.index.values:
        if team not in label:
            continue
        return nba_cities.at[label, 'Metropolitan area']
    return None
def get_nba_data():
    """Return the module-level ``out_df`` frame.

    ``out_df`` is assigned by the top-level statements of this module, so
    the name is only resolvable once those statements have run.
    """
    return out_df
# Script: compute each area's 2018 win ratio from nba.csv, join it with the
# area's population column from the Wikipedia table, and correlate the two.

population_by_region = []  # pass in metropolitan area population from cities
win_loss_by_region = []  # pass in win/loss ratio from nba_df in the same order as cities["Metropolitan area"]

nba_df = pd.read_csv("assets/nba.csv")
cities = pd.read_html("assets/wikipedia_data.html")[1]

# Both frames below are selections of a parent frame and get new columns
# assigned further down. Taking an explicit copy makes them independent
# objects, so those assignments do not raise pandas' SettingWithCopy warning.
cities = cities.iloc[:-1, [0, 3, 5, 6, 7, 8]].copy()  # drop the last row, keep six columns
nba_df = nba_df[nba_df['year'] == 2018].copy()  # get only 2018 stats no need of dropping rows

population = cities[['Metropolitan area', 'Population (2016 est.)[8]']]
population = population.set_index('Metropolitan area')

# Index the areas by their cleaned NBA cell so get_area() can search the
# labels; the '—' and '' labels are discarded.
cities['NBA'] = cities['NBA'].apply(clear_data)
nba_cities = cities[['Metropolitan area', 'NBA']].set_index('NBA')
nba_cities = nba_cities.drop(['—', ''], axis=0)

# Clean the team name, take its last space-separated word, and map that
# word to a metropolitan area.
nba_df['team'] = nba_df['team'].apply(clear_nba_data)
nba_df['area'] = nba_df['team'].apply(lambda x: x.split(" ")[-1])
nba_df['area'] = nba_df['area'].apply(get_area)

# One record per area: wins / (wins + losses), summed over the area's rows.
# NOTE(review): get_area() returns None for unmatched teams; groupby is
# expected to leave those rows out (default dropna) -- confirm.
out = []
for group, frame in nba_df.groupby('area'):
    total_wins = np.sum(pd.to_numeric(frame['W']))
    total_losses = np.sum(pd.to_numeric(frame['L']))
    total_matches = total_wins + total_losses
    ratio = (total_wins / total_matches)
    out_dict = {
        'Area': group,
        'Ratio': ratio
    }
    out.append(out_dict)

new_df = pd.DataFrame(out)
new_df = new_df.set_index('Area')

# Inner join on the area index keeps only areas present in both frames, so
# the two lists extracted below are aligned row for row.
out_df = pd.merge(new_df, population, how="inner", left_index=True, right_index=True)
out_df['Population (2016 est.)[8]'] = pd.to_numeric(out_df['Population (2016 est.)[8]'])

population_by_region = out_df['Population (2016 est.)[8]'].to_list()
win_loss_by_region = out_df['Ratio'].to_list()

# pearsonr returns (statistic, p-value); only the coefficient is kept.
corr = stats.pearsonr(population_by_region, win_loss_by_region)[0]