#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
This script maximises a campaign's revenue.
The context and data are from the KDD Cup 98 Competition.
'''
import numpy as np
import pandas as pd
from pydoc import help
from scipy.stats.stats import pearsonr
from sklearn.linear_model import LinearRegression, LogisticRegression
# Reads project's classes
from lib.importer import Importer
from lib.preprocessor import Preprocessor
from lib.analyser import Analyser
from lib.utils import Performance
if __name__ == '__main__':
    #### Importation ####
    # Loads configuration and sets the regression target (donation amount).
    cfg = Importer.get_cfg()
    cfg['target'] = 'TARGET_D'
    # Loads raw data
    raw_dat = Importer.get_raw_dat(cfg)

    # Builds a reduced, class-balanced subset: every donor plus an
    # equal-sized slice of non-donors (workaround so the script runs on
    # modest hardware).
    # TODO: optimise such that this workaround is not necessary
    pos = raw_dat[raw_dat.TARGET_B == 1]
    # Take exactly pos.shape[0] negatives. The previous slice
    # [1:pos.shape[0]] skipped row 0 and yielded one row too few.
    neg = raw_dat[raw_dat.TARGET_B == 0][:pos.shape[0]]
    # pd.concat replaces the removed DataFrame.append.
    raw_dat = pd.concat([pos, neg], ignore_index=True)

    #### Exploratory Analysis ####
    # !!! It's already done at donors.py !!! #
    # Correlation between TARGET_D and the predictors.
    # sort_values replaces the removed in-place Series.sort; the bare
    # expression that followed had no effect in a script, so print it.
    TARGET_D_corr = raw_dat.corr()["TARGET_D"].copy()
    TARGET_D_corr = TARGET_D_corr.sort_values(ascending=False)
    print(TARGET_D_corr)
    # TODO: see how donations are distributed among age groups
    # TODO: see how donations are distributed per gender
    # The majority of the donations are smaller than 20 dollars.

    #### Preprocessing ####
    # Gets some redundant variables based on variance, sparsity & common sense
    redundant_vars = Analyser.get_redundant_vars(cfg, raw_dat)
    # Drops redundant cols
    dat = raw_dat.drop(redundant_vars, axis=1)
    # Imputes missing values on the reduced frame. The original called
    # fill_nans(raw_dat), silently discarding the column drop above.
    dat = Preprocessor.fill_nans(dat)
    # Shuffles whole rows. The original dat.apply(np.random.permutation)
    # permuted every column independently, destroying the row-wise link
    # between predictors and targets.
    dat = dat.iloc[np.random.permutation(dat.shape[0])].reset_index(drop=True)

    #### Feature Selection ####
    # Gets important variables; TARGET_B is kept as the classification label.
    important_vars = Analyser.get_important_vars(cfg, dat)
    important_vars.extend(['TARGET_B'])
    # Changes categorical vars to a numerical (one-hot) form
    # TODO: find a faster alternative, or clever cleaning to the vars before
    feats = pd.get_dummies(dat)
    # Drops the non-important variables
    feats = feats[important_vars]

    # Train/test split, 50% / 50% (the old comment claimed 70/30 while the
    # code cut at .5). The old slices [1:cut] and [(cut + 1):-1] also lost
    # the first row, the row at the cut and the last row.
    cut = int(feats.shape[0] * .5)
    train = feats[:cut].drop(['TARGET_B', 'TARGET_D'], axis=1)
    y_train = feats.TARGET_B[:cut]
    test = feats[cut:].drop(['TARGET_B', 'TARGET_D'], axis=1)
    y_test = feats.TARGET_B[cut:]

    #### Model Selection ####
    # Do cross-validation Grid Search to find the optimal parameters
    # TODO

    #### Get Estimated Donors ####
    # Logistic regression to predict who the donors are (TARGET_B).
    # TODO: do cross validation training
    clf = LogisticRegression(verbose=1, max_iter=200)
    clf = clf.fit(train.values, y_train.values)
    # Testing
    y_test_pred = clf.predict(test.values)
    # Confusion Matrix
    print(pd.crosstab(
        y_test, y_test_pred, rownames=['actual'], colnames=['preds']))
    # Gets performance
    perf_model = Performance.get_perf(y_test, y_test_pred)

    # Keeps only the test rows predicted to be donors. TARGET_D stays in,
    # because it is the target of the regression below. The slice must
    # match the one used to build `test`, so the boolean mask lines up.
    sub_feats = feats[cut:].drop(['TARGET_B'], axis=1)
    sub_feats = sub_feats[y_test_pred == 1]
    # 50/50 split of the predicted donors for the amount model.
    # TODO: do cross validation here
    cut = int(sub_feats.shape[0] * .5)
    train = sub_feats[:cut]
    test = sub_feats[cut:]

    #### For the estimated donors predict how much they will donate ####
    # TODO: cross validation
    # Training
    clf = LinearRegression(n_jobs=-1)
    clf = clf.fit(train.drop('TARGET_D', axis=1).values,
                  train.TARGET_D.values)
    # Testing
    y_test_pred = clf.predict(test.drop('TARGET_D', axis=1).values)
    # Pearson correlation between predicted and actual donation amounts.
    print(pearsonr(y_test_pred, test.TARGET_D.values))