-
Notifications
You must be signed in to change notification settings - Fork 1
/
Data_Handler.py
163 lines (132 loc) · 5.89 KB
/
Data_Handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import pandas as pd
import random
import os
import matplotlib.pyplot as plt
from Utils.Utilities import Predictors_Finder
"""
This file handles data loading as well as building shorter datasets.
It also runs some data analysis.
"""
def Initializer(
    original_dataframe = "Data/signed_predictors_all_wide_with_returns.csv",
    short_data_trial = False,
    shorten_data = "../Data/shorter_data.csv",
    predictors_names = "Data/predictors.csv",
    predictors_per_permno = "Data/permno_predictors.csv",
):
    """
    Load the working dataset and, for the full dataset, make sure the two
    predictor summary files exist on disk.
    :param original_dataframe: path of the full CSV dataset
    :param short_data_trial: if true, load the shortened dataset instead of the full one
    :param shorten_data: path of the shortened CSV dataset
    :param predictors_names: path of the file produced by Predictors_Finder
    :param predictors_per_permno: path of the file produced by Data_Analizer
    :return: the loaded pandas DataFrame
    :raises FileNotFoundError: if short_data_trial is true and shorten_data does not exist
    """
    # Short trial: the reduced file must already exist, nothing is generated here.
    if short_data_trial:
        if not os.path.isfile(shorten_data):
            raise FileNotFoundError(f"File '{shorten_data}' not found, Please load the original set and generate the shorten dataset using Data_shorter")
        frame = Load(shorten_data, False)
        print("Data from a shorten dataset has been loaded")
        # Report how many distinct stocks the reduced dataset holds
        permno_count = frame['permno'].nunique()
        print("This dataset contains",permno_count, " different permnos")
        return frame

    # Full run: load everything and preview the first rows.
    frame = Load(original_dataframe, True)
    print("Data from the original file has been loaded")

    # NOTE(review): the summary-file checks below are assumed to belong to the
    # full-dataset branch only - confirm against the intended nesting.
    if os.path.isfile(predictors_names):
        print(f"predictors list already exists as {predictors_names}")
    else:
        Predictors_Finder(data = frame, filename = predictors_names)
        print(f"Full predictors list has been generated and saved as {predictors_names}")

    if os.path.isfile(predictors_per_permno):
        print(f"Predictors per permno file already exists as {predictors_per_permno}")
    else:
        Data_Analizer(data=frame, filename=predictors_per_permno)
        print(f"Full predictors per permno file has been generated and saved as {predictors_per_permno}")

    return frame
def Load(filename = "Data/signed_predictors_all_wide_with_returns.csv", show_head = False):
    """
    Read a CSV file into a pandas DataFrame, optionally printing a preview.
    :param filename: path of the CSV file to read
    :param show_head: if true, print the first rows of the loaded data
    :return: the loaded pandas DataFrame
    """
    frame = pd.read_csv(filename)
    if not show_head:
        return frame
    # Preview requested: show the first few rows before handing the data back
    print(frame.head())
    return frame
def Firm_Extractor(permnos,data):
    """
    Retrieve all rows associated with specific permnos from the DataFrame.
    :param permnos: a single permno (any scalar, e.g. a Python or numpy integer) or a list-like of permnos
    :param data: DataFrame with a 'permno' column
    :return: A DataFrame containing all rows of data whose 'permno' is among permnos.
    """
    # isin() only accepts list-likes. Wrapping every scalar (not just the
    # built-in int) also covers numpy integers, e.g. a value read back from
    # the 'permno' column, which are not instances of int.
    if not pd.api.types.is_list_like(permnos):
        permnos = [permnos]
    # Keep only the rows whose permno is one of the requested ones
    return data[data['permno'].isin(permnos)]
def Data_Analizer(data, filename = "Data/permno_predictors.csv"):
    """
    Print summary statistics of the dataset, plot the number of observations
    per stock, and save for each permno the list of columns that hold at
    least one non-null value.
    :param data: DataFrame with 'permno' and 'yyyymm' columns; every other column is inspected
    :param filename: path of the CSV file to write (columns 'permno' and 'Associated Columns')
    :return: None
    """
    # 1. Count the number of distinct dates (yyyymm) observed per stock (permno)
    obs_per_stock = data.groupby('permno')['yyyymm'].nunique()
    print(obs_per_stock)

    # 2. Plot the number of observations per stock
    # NOTE(review): plt.plot draws the counts against the permno index as a
    # line; it is not a density estimate although the labels say so - confirm
    # which chart is wanted before changing either the plot or the labels.
    plt.figure(figsize=(12, 6))
    plt.plot(obs_per_stock)
    plt.xlabel('Number of years of observations per Stock')
    plt.ylabel('Density')
    plt.title('Density Plot of Number of Observations per Stock')
    plt.show()

    # 3. Descriptive statistics: distribution of the counts, dataset shape,
    #    total number of missing cells, number of distinct stocks and dates
    print(obs_per_stock.describe())
    print(data.shape)
    print(data.isna().sum().sum())
    print(data['permno'].nunique())
    print(data['yyyymm'].nunique())

    # 4. For each permno, list the columns (identifiers excluded) that contain
    #    at least one non-null value. Only the names are kept: storing the
    #    per-permno sub-tables as well would hold a second copy of the whole
    #    dataset in memory for no use.
    associated_column_names = {}
    for permno, group in data.groupby("permno"):
        candidates = group.drop(columns=["permno", "yyyymm"])
        associated_column_names[permno] = candidates.columns[candidates.notnull().any()].tolist()

    # One row per permno; the list of column names is written as its string form
    permno_predictors = pd.DataFrame(list(associated_column_names.items()), columns=['permno', 'Associated Columns'])
    permno_predictors.to_csv(filename, index=False)
    return None
def Data_Shorter(data, nbr_asset, save = False, filename = "Data/shorter_data.csv"):
    """
    Take the whole dataset and return a smaller one restricted to a reproducible
    random selection of permnos, to work faster.
    If specified, the new dataframe is also saved as a CSV file.
    :param data: original dataset, with a 'permno' column
    :param nbr_asset: number of different permnos to keep (capped at the number available)
    :param save: boolean, whether to save the new dataframe as a CSV file
    :param filename: path to save the CSV file
    :return: new dataframe containing only the rows of the selected permnos
    """
    permno_unique = data['permno'].unique()
    # Never ask for more assets than the dataset actually contains
    nbr_asset = min(nbr_asset, len(permno_unique))
    # A private generator with a fixed seed draws the same permnos as seeding
    # the module-level generator would, without resetting the random state
    # of the rest of the program as a side effect.
    rng = random.Random(28)
    permno_list = rng.sample(list(permno_unique), nbr_asset)
    shorter_data = Firm_Extractor(permno_list, data)
    # Save the DataFrame as a CSV file
    if save:
        shorter_data.to_csv(filename, index=False)
    return shorter_data