diff --git a/README.md b/README.md
index 3b0b316..9ad7795 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,15 @@
 # housing-submarkets-cdmx
 Supplementary material for peer review
+
+
+This repository contains the scripts we used to perform the analysis in _An interdisciplinary framework for the comparison of geography aware housing market segmentation models using web scraped listing price data for Mexico City_.
+
+All scripts are in the _src_ folder:
+
+* `Accessibility.py` contains the code used to model accessibility to amenities
+* `Clustering.py` contains all clustering code used to define submarkets
+* `Outliers_LOF.py` contains the code used to filter outliers
+* `Prediction.py` contains the XGBoost model fitting procedure
+* `Welch_Test.py` contains the code used to obtain confidence intervals for prices in submarkets
+
+All data needed to run the scripts can be found in the `data` folder.
\ No newline at end of file
diff --git a/data/listing_price_data.zip b/data/listing_price_data.zip
new file mode 100644
index 0000000..dabfad3
Binary files /dev/null and b/data/listing_price_data.zip differ
diff --git a/src/Accessibility.py b/src/Accessibility.py
new file mode 100644
index 0000000..22033cd
--- /dev/null
+++ b/src/Accessibility.py
@@ -0,0 +1,117 @@
+#############################################################################
+##### Accessibility
+# This file produces an accessibility model for the services defined in the OFERTA file
+#############################################################################
+
+#### Imports
+from h3 import h3
+import numpy as np
+import pandas as pd
+import geopandas as gpd
+from shapely.geometry import Polygon, Point
+from access import Access, weights, Datasets
+
+import warnings
+warnings.filterwarnings('ignore')
+
+def id2poly(hex_id):
+    boundary = [x[::-1] for x in h3.h3_to_geo_boundary(hex_id)]
+    return Polygon(boundary)
+
+#############################################################################
+##### Inputs
+
+Entrada = '../data/'
+Salida = '../data/'
+
+#############################################################################
+##### Base hexagons
+
+HEX_CDMX = gpd.read_file(Entrada + 'HEX_CDMX.shp')
+
+HOGARES_ALL = gpd.read_file(Entrada + 'HOGARES_ALL.shp')
+
+OFERTA = gpd.read_file(Entrada + 'office.shp')
+
+#############################################################################
+##### Join services to hexagons
+
+Hex_Unidad = gpd.sjoin( HEX_CDMX,
+                        OFERTA,
+                        how = "right",
+                        op = 'contains')
+
+#############################################################################
+##### Count services by hexagon
+
+Hex_Unidad = Hex_Unidad.groupby('hex')['name'].count().reset_index()
+Hex_Unidad['geometry'] = Hex_Unidad['hex'].apply(id2poly)
+Hex_Unidad = gpd.GeoDataFrame( Hex_Unidad,
+                               geometry = Hex_Unidad['geometry'])
+Hex_Unidad.crs = "EPSG:4326"
+Hex_Unidad = Hex_Unidad.to_crs("EPSG:4326")
+Hex_Unidad = Hex_Unidad.to_crs(6372)
+
+#############################################################################
+##### SPATIAL ACCESSIBILITY : GRAVITY MODEL
+
+Unidad_access = Access( demand_df = HOGARES_ALL,
+                        demand_index = "hex",
+                        demand_value = "TOTHOG",
+                        supply_df = Hex_Unidad,
+                        supply_index = "hex",
+                        supply_value = "name")
+
+
+Unidad_access.create_euclidean_distance( name = "euclidean_neighbors",
+                                         threshold = 5000, ### threshold in metres
+                                         centroid_d = True,
+                                         centroid_o = True)
+
+gravity = weights.gravity( scale = 30,
+                           alpha = -2,
+                           min_dist = 10)
+
+weighted = Unidad_access.weighted_catchment( name = "gravity",
+                                             weight_fn = gravity)
+
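+#############################################################################
+##### Note: shape of the gravity decay (illustrative sketch only)
+# The catchment above down-weights each supply point by distance. The exact
+# curve comes from `access.weights.gravity`; as a rough, assumed sketch of the
+# kind of decay configured above (an assumption, not taken from that package's
+# source), a weight of the form scale * max(d, min_dist) ** alpha means that
+# with scale = 30, alpha = -2 and min_dist = 10 the contribution of a service
+# falls roughly with the square of its distance in metres.
+
+def gravity_decay_sketch(d, scale = 30, alpha = -2, min_dist = 10):
+    """Hypothetical stand-in for the decay curve assumed above (not the package's code)."""
+    return scale * max(d, min_dist) ** alpha
+
+# e.g. gravity_decay_sketch(100), gravity_decay_sketch(1000), gravity_decay_sketch(5000)
+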
+############################################################################# +##### GRAVITY MODEL WEIGHTS + +weighted = weighted.reset_index() +weighted ['geometry'] = weighted ['hex'].apply(id2poly) +weighted = gpd.GeoDataFrame( weighted, + geometry = weighted['geometry']) +weighted.crs = "EPSG:4326" +weighted = weighted.to_crs("EPSG:4326") +weighted = weighted.rename({'gravity_name':'gravity_value'}, axis=1) +weighted['gravity_value'] = weighted['gravity_value'].fillna(0) + +print('Max value',weighted.gravity_value.max()) +print('Min value',weighted.gravity_value.min()) +print('Weighted shape',weighted.shape) + +############################################################################# +##### PLOT + +BASE = HOGARES_ALL.to_crs(4326).plot( color="black", + markersize=7, + figsize=(11, 11)) + +weighted.plot( ax = BASE, + column = "gravity_value", + scheme = 'Quantiles', + cmap = 'viridis', + classification_kwds = {'k':8}) + +OFERTA.to_crs(4326).plot( ax = BASE, + color = "red", + markersize = 1) + +############################################################################# +##### Save as csv + +weighted[[ 'hex', 'gravity_value']].to_csv(Salida + "Oficinas_SA.csv", index = False) + +############################################################################# diff --git a/src/Clustering.py b/src/Clustering.py new file mode 100644 index 0000000..d891a41 --- /dev/null +++ b/src/Clustering.py @@ -0,0 +1,209 @@ +############################################################################# +##### KMEANS, TSNE Y AZP + +############################################################################# + +#### Imports + +import libpysal +from h3 import h3 +import numpy as np +import pandas as pd +import geopandas as gpd +from spopt.region import AZP +import matplotlib.pyplot as plt +from shapely.geometry import Polygon +import seaborn as sns + +import warnings +warnings.filterwarnings('ignore') + +from sklearn.preprocessing import PowerTransformer + +from sklearn.manifold import TSNE +from sklearn.cluster import KMeans +from scipy.cluster.hierarchy import ward, fcluster +from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer + +def id2poly(hex_id): + boundary = [x[::-1] for x in h3.h3_to_geo_boundary(hex_id)] + return Polygon(boundary) + +############################################################################# +##### Input - output paths + +Entradas = '' +Salidas = '' + +############################################################################# +##### + +#### Input dataset must have all acessibility and geographic variables +#### + +DATA_DEPART = pd.read_csv(Entradas + 'DEPART_DIM.csv') + +DATA_CASAS = pd.read_csv(Entradas + 'CASA_DIM.csv') + +#### Example for apartments dataset + +DATA_DPTO = DATA_DEPART.groupby('hex').agg({'superficie':'mean', + 'habitacion':'mean', + 'baños':'mean', + 'Parques_SA':'mean', + 'Angel_SA':'mean', + 'Teatros_SA':'mean', + 'SuperM_SA':'mean', + 'Restau_SA':'mean', + 'Oficina_SA':'mean', + 'MetroBus_SA':'mean', + 'Metro_SA':'mean', + 'Hospital_SA':'mean', + 'GyMFree_SA':'mean', + 'GyM_SA':'mean', + 'Cinema_SA':'mean', + 'EscSecu_SA':'mean', + 'EscPr_SA':'mean', + 'EscMS_SA':'mean', + 'EcoBici_SA':'mean', + 'Delitos':'mean', + 'Socioeco':'mean'}).reset_index() + +LABEL = DATA_DPTO['hex'] +DIMENSIONS = DATA_DPTO.drop(['hex'], axis=1) + + +S_S = PowerTransformer() +Var_Scales = pd.DataFrame(S_S.fit_transform(DIMENSIONS), + columns = DIMENSIONS.columns) + + +############################################################################# +##### KMEANS 
CLUSTER
+
+
+def K_model(Dimensiones_tran, numero_min, numero_max, Title):
+    ### Input:
+    ### -> Transformed explanatory variables
+    ### -> Minimum number of clusters
+    ### -> Maximum number of clusters
+    model = KMeans()
+    visualizer = KElbowVisualizer( model,
+                                   k = (numero_min, numero_max),
+                                   timings = True)
+    visualizer.fit(Dimensiones_tran)
+    visualizer.show(outpath = Title, dpi = 300)
+
+
+K_model(Var_Scales, 2, 10, 'CURVA DATA')
+
+
+def K_means(Dimensiones_tran, numero_cluster, semilla):
+    ### Input:
+    ### -> Transformed explanatory variables
+    ### -> Number of clusters
+    ### -> Random-state seed
+    kmeans = KMeans(n_clusters = numero_cluster, random_state = semilla)
+    kmeans.fit(Dimensiones_tran)
+    clusters = kmeans.predict(Dimensiones_tran)
+    ### Output:
+    ### -> Cluster labels
+    return (clusters)
+
+Clusters_K = K_means(Var_Scales, 5, 42)
+
+#############################################################################
+##### Ward CLUSTER
+
+def T_NSE(Dimensiones_tran, Numero_compo, Semilla, Perplexity):
+    ### Input:
+    ### -> Transformed explanatory variables
+    ### -> Number of components; must be below 4 (barnes_hut algorithm, based on quad-trees and oct-trees)
+    ### -> Random-state seed
+    ### -> Perplexity, in the range (5 - 50) suggested by van der Maaten & Hinton
+    tsn = TSNE(n_components = Numero_compo, random_state = Semilla, perplexity = Perplexity)
+    res_tsne = tsn.fit_transform(Dimensiones_tran)
+    fig = plt.figure(figsize = (25, 25))
+    ax1 = fig.add_subplot(3, 3, 1)
+    axpl_1 = sns.scatterplot(x = res_tsne[:, 0], y = res_tsne[:, 1])
+    ax2 = fig.add_subplot(3, 3, 2)
+    axpl_2 = sns.scatterplot(x = res_tsne[:, 0], y = res_tsne[:, 2])
+    ### Output:
+    ### -> t-SNE values of the reduced components
+    return (res_tsne)
+
+TNSE_Model = T_NSE(Var_Scales, 3, 42, 50)
+
+### Cluster t-SNE : WARD
+
+def Cluster_TNSE(Modelo_tSNE, Cluster_max):
+    ### Input:
+    ### -> Array with the t-SNE components obtained from the dimensionality reduction
+    ### -> Number of clusters, assigned under fcluster's maxclust criterion
+    link = ward(Modelo_tSNE)
+    Cluster_Values = fcluster(link, t = Cluster_max, criterion = 'maxclust')
+    fig = plt.figure(figsize = (25, 25))
+    ax1 = fig.add_subplot(3, 3, 1)
+    pd.value_counts(Cluster_Values).plot(kind = 'barh')
+    ax2 = fig.add_subplot(3, 3, 2)
+    axpl_2 = sns.scatterplot(x = Modelo_tSNE[:, 0], y = Modelo_tSNE[:, 1], hue = Cluster_Values, palette = "Set2")
+    ax3 = fig.add_subplot(3, 3, 3)
+    axpl_3 = sns.scatterplot(x = Modelo_tSNE[:, 0], y = Modelo_tSNE[:, 2], hue = Cluster_Values, palette = "Set2")
+    ### Output:
+    ### -> t-SNE cluster labels
+    return (Cluster_Values)
+
+
+TNSE_Cluster = Cluster_TNSE(TNSE_Model, 5)
+
+
+### CLUSTER LABELS (KMEANS + WARD) ON THE DIMENSIONS TABLE
+
+DATA_DPTO["K_Means"] = Clusters_K
+DATA_DPTO["t_SNE"] = TNSE_Cluster
+
+
+#############################################################################
+##### AZP CLUSTER
+
+Var_Scales['hex'] = LABEL.values   ### restore the hexagon id that was dropped before scaling
+Var_Scales['geometry'] = Var_Scales['hex'].apply(id2poly)
+Var_Scales = gpd.GeoDataFrame( Var_Scales,
+                               geometry = Var_Scales['geometry'])
+Var_Scales.crs = "EPSG:4326"
+Var_Scales = Var_Scales.to_crs("EPSG:4326").to_crs(6372)   ### 'hex' is kept as a column so it can be merged back below
+
+
+COLUMN_DIMEN = ['superficie', 'habitacion', 'baños', 'Parques_SA',
+                'Angel_SA', 'Teatros_SA', 'SuperM_SA', 'Restau_SA', 'Oficina_SA',
+                'MetroBus_SA', 'Metro_SA', 'Hospital_SA', 'GyMFree_SA', 'GyM_SA',
+                'Cinema_SA', 'EscSecu_SA', 'EscPr_SA', 'EscMS_SA',
+                'EcoBici_SA', 'Delitos', 'Socioeco']
+
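+##### Note on AZP (context for the block below): unlike K-Means and Ward on
+##### t-SNE, AZP only merges hexagons that are neighbours under the supplied
+##### spatial weights, so the resulting submarkets are spatially contiguous.
+##### The k = 8 nearest-neighbour graph used below is a modelling choice; the
+##### toy helper here is a hypothetical, self-contained sketch (not part of
+##### the analysis) of what such a KNN weights object encodes.
+
+def knn_weights_sketch():
+    """Toy illustration: each observation is linked to its k nearest neighbours,
+    and AZP only merges linked observations."""
+    from shapely.geometry import Point   # local import, used only in this sketch
+    toy = gpd.GeoDataFrame({'v': [1.0, 2.0, 3.0, 4.0]},
+                           geometry = [Point(0, 0), Point(0, 1), Point(1, 1), Point(5, 5)])
+    w_toy = libpysal.weights.KNN.from_dataframe(toy, k = 2)
+    return w_toy.neighbors   # e.g. {0: [1, 2], 1: [0, 2], 2: [1, 0], 3: [2, 1]}
+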
+############################################################################# +##### AZP CLUSTER + +KKN_MATRIX = libpysal.weights.KNN.from_dataframe(Var_Scales, k = 8) + +model = AZP(Var_Scales, KKN_MATRIX, COLUMN_DIMEN, 5) +model.solve() + +Var_Scales['AZP'] = model.labels_ + +DATA_ALL = DATA_DPTO.merge( Var_Scales[['hex', 'AZP']], + left_on = 'hex', + right_on = 'hex', + how ='left') + +DPTO_CLUSTER_ALL = DATA_DEPART.merge( DATA_ALL[['hex', + 'AZP', + 'K_Means', + 't_SNE']], + left_on = 'hex', + right_on = 'hex', + how = 'left') + + +############################################################################# +##### Write as csv + +DPTO_CLUSTER_ALL.to_csv(Salidas + "DPTO_CLUSTER_ALL.csv", index = False) diff --git a/src/Outliers_LOF.py b/src/Outliers_LOF.py new file mode 100644 index 0000000..09f1a70 --- /dev/null +++ b/src/Outliers_LOF.py @@ -0,0 +1,123 @@ +############################################################################# +##### OUTLIER DETECTION +##### LOCAL OUTLIER FACTOR (LOF) +############################################################################# + +#### Imports + +import numpy as np +import pandas as pd +import geopandas as gpd + +import warnings +warnings.filterwarnings('ignore') + +### OUTLIER + +from sklearn.neighbors import LocalOutlierFactor + + +############################################################################# +##### Input - output paths + +Entradas = '../data' +Salidas = '../data' + +############################################################################# +##### + +INMUEBLES = gpd.read_file(Entradas + 'INMUEBLES_VENTA.shp') +HEX_CDMX = gpd.read_file(Entradas + 'HEX_CDMX.shp') + + +############################################################################# +##### House outliers + +CASAS_VENTA = INMUEBLES[INMUEBLES['Oferta'].isin(['Casa'])] + +CLF_CASA = LocalOutlierFactor( n_neighbors = 300, + contamination ='auto') + +X_CASAS = CASAS_VENTA[['precio','superficie','habitacion','baños']].values + +Y_CASAS = CLF_CASA.fit_predict(X_CASAS) + +DF_CASAS = pd.DataFrame(Y_CASAS, columns=['outlier']) + +OUTLIER_CASAS = pd.DataFrame(CASAS_VENTA[[ 'id','precio', + 'superficie','habitacion', + 'baños']]) + +OUTLIER_CASAS['Outlier'] = DF_CASAS.values + +OUTLIER_CASAS = OUTLIER_CASAS.query('Outlier == 1') + +OUT_LIER = CASAS_VENTA.merge( OUTLIER_CASAS[['id','Outlier']], + left_on = 'id', + right_on = 'id', + how ='right') + +OUT_LIER = OUT_LIER.to_crs(6372) + +##### Hex codes + +HEX_CASA = gpd.sjoin( HEX_CDMX, + OUT_LIER, + how="right", + op='contains') + +HEX_CASA = HEX_CASA[HEX_CASA['hex'].notna()] + +HEX_CASA = HEX_CASA.drop([ 'index_left', 'Outlier', + 'geometry'], axis=1) + +############################################################################# +##### Apartment Outliers + +DEPAS_VENTA = INMUEBLES[INMUEBLES['Oferta'].isin(['Departamento'])] + +CLF_DEPA = LocalOutlierFactor( n_neighbors = 300, + contamination ='auto') + +X_DEPA = DEPAS_VENTA[['precio','superficie','habitacion','baños']].values + +Y_DEPA = CLF_DEPA.fit_predict(X_DEPA) + +DF_DEPA= pd.DataFrame(Y_DEPA, columns=['outlier']) + +OUTLIER_DEPA = pd.DataFrame(DEPAS_VENTA[[ 'id','precio', + 'superficie','habitacion', + 'baños']]) + +OUTLIER_DEPA['Outlier'] = DF_DEPA.values + +OUTLIER_DEPA = OUTLIER_DEPA.query('Outlier == 1') + +OUT_LIED = DEPAS_VENTA.merge( OUTLIER_DEPA[['id','Outlier']], + left_on = 'id', + right_on = 'id', + how ='right') + +OUT_LIED = OUT_LIED.to_crs(6372) + +##### Assign hex code + +HEX_DPTO = gpd.sjoin( HEX_CDMX, + OUT_LIED, + how="right", + op='contains') + 
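+# Note: LocalOutlierFactor.fit_predict returns 1 for inliers and -1 for
+# outliers, so the `Outlier == 1` filters above keep only the listings judged
+# consistent with their 300 nearest neighbours in (precio, superficie,
+# habitacion, baños) space. The right spatial join then attaches the H3
+# hexagon id ('hex') containing each remaining listing, and the NaN filter
+# below drops listings that fall outside the hexagon grid.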
+HEX_DPTO = HEX_DPTO[HEX_DPTO['hex'].notna()] + +HEX_DPTO = HEX_DPTO.drop([ 'index_left', 'Outlier', + 'geometry'], axis=1) + + +############################################################################# +##### Write as csv + +### CASAS +HEX_CASA.to_csv(Salidas + "CASAS_EN_VENTA.csv", index = False) + +### DEPARTAMENTOS +HEX_DPTO.to_csv(Salidas + "DEPAS_EN_VENTA.csv", index = False) diff --git a/src/Prediction.py b/src/Prediction.py new file mode 100644 index 0000000..30e0100 --- /dev/null +++ b/src/Prediction.py @@ -0,0 +1,524 @@ +############################################################################# +##### XGBOOST, SHAP, Confidence intervals +############################################################################# +##### Imports + +import shap +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib as mpl +import plotly.express as px +from pandas import DataFrame +import matplotlib.pyplot as plt +import plotly.graph_objects as go +from plotly.subplots import make_subplots + +############################################################################# +##### Figure size +# ============================================================================== +mpl.rcParams["figure.figsize"] = (10,6) +sns.set_context("paper") + +############################################################################# +##### Standarization +# ============================================================================== + +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import RobustScaler +from sklearn.preprocessing import PowerTransformer + + +import multiprocessing +from xgboost import XGBRegressor +from sklearn.linear_model import RidgeCV +from sklearn.model_selection import KFold +from sklearn.metrics import mean_squared_error +from sklearn.metrics import mean_absolute_error +from sklearn.model_selection import GridSearchCV +from sklearn.model_selection import ParameterGrid +from sklearn.model_selection import RepeatedKFold +from sklearn.model_selection import cross_val_score +from sklearn.model_selection import train_test_split +from sklearn.inspection import permutation_importance +from sklearn.ensemble import GradientBoostingRegressor + + +############################################################################# +##### SHAP +shap.initjs() + +############################################################################# +##### DROP COLUMNS + +def DROP_COLUMN (BASE_USO): + feature_drop = ['hex', 'Entidad_cvgeo', 'Entidad_n', 'Municipio_cvgeo', + 'Municipio_n', 'Colonia_clave', 'Colonia_n', 'Colonia_CP', 'Tipo', + 'Oferta','Longitud','Latitud'] + datasets = [BASE_USO] + for df in datasets: + df.drop(feature_drop, axis = 1, inplace = True) + return (BASE_USO) + + +############################################################################# +##### Transform input data +# ============================================================================== +def tran_values(Tabla_uso, Transformacion, testsize, rstate): + ### Input: + ### -> Base de datos + ### -> Tipo de transformación de datos + ### -> Tamaño de la muestra para la prueba (Test size) + ### -> Aleatorización (Random state) + Etiqueta = np.log1p(Tabla_uso['precio']) + Var_explica = Tabla_uso.drop(['precio'],axis=1) + SS = Transformacion() + V_Scaler = pd.DataFrame(SS.fit_transform(Var_explica),columns = Var_explica.columns) + X_train, X_test, y_train, y_test = train_test_split(V_Scaler, Etiqueta, test_size=testsize, + random_state 
= 42, shuffle = True) + + print ('Shapes', '\n', 'X_train: ', X_train.shape, '\n', 'y_train: ', y_train.shape, '\n', 'X_test: ', X_test.shape, + '\n', 'y_test: ', y_test.shape,) + ### Output: + ### -> Muestra de variables explicativas para entrenamiento (X_train) + ### -> Muestra de etiquetas para entrenamiento (y_train) + ### -> Muestra de variables explicativas para prueba (X_test) + ### -> Muestra de etiquetas para prueba (y_test) + ### -> Se imprime la forma de los datos + return (X_train, y_train, X_test, y_test) + + + +############################################################################# +##### XGBOOST +# ============================================================================== + +def XGBtest (Xtrain ,ytrain, nstimators, nsplits, nrepeats): + ### Input: + ### -> Muestra de variables explicativas para entrenamiento (X_train) + ### -> Muestra de etiquetas para entrenamiento (y_train) + ### -> Número máximo de estimadores + ### -> Número de pliegues (división de la muestra) + ### -> Número de veces que es necesario repetir el validador cruzado. + + + + # part 01: Hyperparameters + # ============================================================================== + param_grid = {'max_depth' : [None, 1, 3, 5, 10, 20], + 'subsample' : [0.5, 1], + 'learning_rate' : [0.001, 0.01, 0.1], + 'booster' : ['gbtree'] + } +# ================================================================================== + # part 02: validation data + # ============================================================================== + np.random.seed(123) + idx_validacion = np.random.choice(Xtrain.shape[0], size= int(Xtrain.shape[0]*0.1), replace=False) + X_val = Xtrain.iloc[idx_validacion, :].copy() + y_val = ytrain.iloc[idx_validacion].copy() + X_train_grid = Xtrain.reset_index(drop = True).drop(idx_validacion, axis = 0).copy() + y_train_grid = ytrain.reset_index(drop = True).drop(idx_validacion, axis = 0).copy() + +# ================================================================================== + # Part 03: training params + # metodo = .fit() + # ============================================================================== + fit_params = {"early_stopping_rounds" : 10, + "eval_metric" : "rmse", + "eval_set" : [(X_val, y_val)], + "verbose" : 0 + } +# ================================================================================== + # Part 04: grid search + # ============================================================================== + grid = GridSearchCV(estimator=XGBRegressor(n_estimators = nstimators, random_state = 123), + param_grid = param_grid, + scoring = 'neg_root_mean_squared_error', + n_jobs = multiprocessing.cpu_count() - 1, + cv = RepeatedKFold(n_splits=nsplits, n_repeats= nrepeats , random_state=123), + refit = True, + verbose = 0, + return_train_score = True + ) + + + + + GridFit = grid.fit(X = X_train_grid, y = y_train_grid, **fit_params) + +# ================================================================================== + # Part 05: # results + # ============================================================================== + resultados = pd.DataFrame(GridFit.cv_results_) + Tresultados = resultados.filter(regex = '(param.*|mean_t|std_t)') .drop(columns = 'params') .sort_values('mean_test_score', ascending = False) .head(4) + ### Output: + ### -> grid.fit en validación cruzada, mejores metricas de aprendizaje + ### -> Tabla de resultados con los mejores parametros + return (GridFit,Tresultados) + + +############################################################################# 
+##### Best parameters on trainig data + +# ============================================================================== +def BestParam (Xtest, ytest, GridFit): + ### Input: + ### -> Varianles explicativas de prueba + ### -> Variable predictiva de prueba + ### -> Resultados en grid.fit, en el aprednizaje de los datos +# ================================================================================== + # ============================================================================== + print("----------------------------------------") + print("Mejores hiperparámetros encontrados (cv)") + print("----------------------------------------") + print(GridFit.best_params_, ":", GridFit.best_score_, GridFit.scoring) + +# ================================================================================== + # ============================================================================== + n_arboles_incluidos = len(GridFit.best_estimator_.get_booster().get_dump()) + print("----------------------------------------") + print(f"Número de árboles incluidos en el modelo: {n_arboles_incluidos}") + +# ================================================================================== + # Estimate error on validation data + # ============================================================================== + F_Model = GridFit.best_estimator_ + predictor = F_Model.predict(data = Xtest) + rmse = mean_squared_error( y_true = ytest, y_pred = predictor, squared = False ) + print("----------------------------------------") + print(f"El error RMSE de test es: {round(rmse, 3)}") + print("----------------------------------------") + + ### Output: + ### -> Mejor estimador en grid.fit, mejores parametros de aprendizaje (F Model) + ### -> Valores de predicción (Y_prediction) + return (F_Model, predictor) + + +############################################################################# +##### SHAP + +def SHAP_INFO(F_Model,X_test): + explainer = shap.TreeExplainer(F_Model) + shap_values = explainer.shap_values(X_test) + return (shap_values) + +############################################################################# +##### SHAP Swarm plots + +def SHAP_ABEJAS(shap_values,X_test): + shap.summary_plot(shap_values, X_test, show = False, cmap = 'copper_r') + plt.savefig(dpi = 300, bbox_inches = 'tight') + +### VIOLIN +def SHAP_VIOLIN(shap_values,X_test): + shap.summary_plot(shap_values, X_test, plot_type='violin') + + + +### Predictor importance from random forest +# ============================================================================== +def Ypred_Import(FModl, Xtrainer, Fdrop): + ### Input: + ### -> Mejores resultados en los parametros de aprendizaje ".best_estimator_" + ### -> Variables explicativas de entrenamiento (X_train) + ### -> Variables omitidas en set de entrenamiento (Drop X_train) + importancia_predictores = pd.DataFrame( + {'predictor': Xtrainer.drop(Fdrop, axis = 1).columns, + 'importancia': FModl.feature_importances_}) + print("Importancia de los predictores en el modelo") + print("-------------------------------------------") + Ta_impo = importancia_predictores.sort_values('importancia', ascending=False) + ### Output: + ### -> Dataframe de importancia de variables explicativas en el modelo de predicción de manera ascendente. 
+ return (Ta_impo) + + +### Predictor importance under permutations +# ============================================================================== + +def Permu_Import (FModl, Xtrain, ytrain, nrepeat, rstate): + ### Input: + ### -> Mejores resultados en los parametros de aprendizaje ".best_estimator_" + ### -> Variables explicativas de entrenamiento (X_train) + ### -> Variables omitidas en set de entrenamiento (X_train) + ### -> Número de repeticiones + ### -> Random state en permutaciones + importancia = permutation_importance( + estimator = FModl, + X = Xtrain, + y = ytrain, + n_repeats = nrepeat, + scoring = 'neg_root_mean_squared_error', + n_jobs = multiprocessing.cpu_count() - 1, + random_state = rstate) + df_importancia = pd.DataFrame({k: importancia[k] for k in ['importances_mean', 'importances_std']}) + df_importancia['feature'] = Xtrain.columns + print("Importancia bajo permutaciones") + print("-------------------------------------------") + Per_Impo = df_importancia.sort_values('importances_mean', ascending=False) + ### Output: + ### -> Parametros de importancia de permutaciones + ### -> Tabla relacionada a variables explicativas y su importancia por permutaciones" + return (importancia, Per_Impo) + + +### Permutations plot +# ============================================================================== +def Graph_Permu (Xtrain, Fdrop,importancia): + ### Imput: + ### -> Variables explicativas de entrenamiento (X_train) + ### -> variables omitidas en el set de entrenamiento (F_drop) + ### -> Importancia cde las variables calculada en las permutaciones + fig, ax = plt.subplots() + sorted_idx = importancia.importances_mean.argsort() + ax.boxplot(importancia.importances[sorted_idx].T, + vert = False, + labels = Xtrain.drop(Fdrop,axis=1).columns[sorted_idx] ) + ax.set_title('Importancia de los predictores (train)') + ax.set_xlabel('Incremento del error tras la permutación') + fig.tight_layout(); + ### Output: + ### -> Grafica de importancia de predictores vs incremento de error tras permutación + + +### MAPE +# ============================================================================== +def Mape(y_true, y_pred): + ### Input: + ### -> Variable predictiva (etiqueta) de prueba (y_test) + ### -> Resultados de predicción del modelo (y_hat) + def percentage_error(actual, predicted): + res = np.empty(actual.shape) + for j in range(actual.shape[0]): + if actual[j] != 0: + res[j] = (actual[j] - predicted[j]) / actual[j] + else: + res[j] = predicted[j] / np.mean(actual) + return res +#===================================================================================== + MAE =mean_absolute_error(np.asarray((y_true)), np.asarray((y_pred))) + MAE_0 =mean_absolute_error(np.asarray(np.expm1(y_true)), np.asarray(np.expm1(y_pred))) +#===================================================================================== + MSE = mean_squared_error(np.asarray((y_true)), np.asarray((y_pred)), squared=True) + MSE_0 = mean_squared_error(np.asarray(np.expm1(y_true)), np.asarray(np.expm1(y_pred)), squared=True) +#===================================================================================== + RMSE = np.sqrt(MSE) + RMSE_0 = np.sqrt(MSE_0) +#===================================================================================== + MAPE = np.mean(np.abs(percentage_error(np.asarray((y_true)), np.asarray((y_pred))))) * 100 + MAPE_0 = np.mean(np.abs(percentage_error(np.asarray(np.expm1(y_true)), np.asarray(np.expm1(y_pred))))) * 100 
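+    # Note: MAPE here is mean(|y - y_hat| / y) * 100, with the percentage_error
+    # fallback dividing by mean(y) whenever an actual value is 0. Metrics
+    # suffixed _0 are on the original price scale (np.expm1 undoes the log1p
+    # transform applied to 'precio'); the unsuffixed ones are on the log1p scale.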
+#===================================================================================== + print("Performance measures") + print("-------------------------------------------") + print("-------------------------------------------") + print ('Mean absolute error (log1p) :', round(MAE, 3), ) + print("-------------------------------------------") + print ('Mean Squared Error (log1p) :', round(MSE, 3),) + print("-------------------------------------------") + print ('Root Mean Squared Error (log1p) :', round(RMSE, 3),) + print("-------------------------------------------") + print ('Mean Absolute Percentage Error (log1p) :', round(MAPE, 3), '%') + print("-------------------------------------------") + print("-------------------------------------------") + print("-------------------------------------------") + print ('Mean absolute error (Normal) :', round(MAE_0, 3), ) + print("-------------------------------------------") + print ('Mean Squared Error (Normal) :', round(MSE_0, 3),) + print("-------------------------------------------") + print ('Root Mean Squared Error(Normal) :', round(RMSE_0, 3),) + print("-------------------------------------------") + print ('Mean Absolute Percentage Error (Normal) :', round(MAPE_0, 3), '%') + print("-------------------------------------------") + ### Output: + ### -> Metrica del error porcentual medio absoluto + return (MAPE, MAPE_0) + + +### Prediction plots +# ============================================================================== +def Predicciones(ytest, yhat): + predicciones = ytest.reset_index() + predicciones = predicciones.rename(columns={"id": "Id_Inmueble", "precio": "Actual"}) + predicciones['Prediccion'] = pd.Series(np.reshape(yhat, (yhat.shape[0]))) + predicciones['diff'] = predicciones['Prediccion'] - predicciones['Actual'] + +# ============================================================================== + plt.figure(figsize=(18,5)) + plt.plot(predicciones.Actual,'bo', markersize=4, color='#00A287', alpha=0.7, label='Actual') + plt.plot(predicciones.Prediccion, linewidth=0.6, color='#FF6700', linestyle ="-", alpha=0.7, label='Prediccion') + plt.legend() + plt.title('Actual and prediction values') +# ============================================================================== + plt.figure(figsize=(18,5)) + sns.distplot(predicciones['diff'], + kde=True, + rug=True, + rug_kws={"color": "#01939A", "alpha": 1, "linewidth": 0.2, "height":0.1}, + kde_kws={"color": "#FF4540", "alpha": 0.3, "linewidth": 2, "shade": True}) + plt.title('Distribution of differences between actual and prediction') + +# ============================================================================== + plt.figure(figsize=(18,5)) + sns.regplot(x=predicciones.Actual, y=predicciones.Prediccion, + line_kws={"color":"#9040D5","alpha":1,"lw":4}, + marker="+", + scatter_kws={"color": "#A68F00"}) + plt.title('Relationship between actual and predicted values') + plt.show() + + + return (predicciones) + + +### Prediction intervals + +# ============================================================================== +### LOSS & UPPER QUANTILE MODEL + +def LoUp_Loss (Xtrain ,ytrain, v_alpha, nstima, nsplit, nrept): + ### Input: + ### -> Muestra de variables explicativas para entrenamiento (X_train) + ### -> Muestra de etiquetas para entrenamiento (y_train) + ### -> Número máximo de estimadores + ### -> Número de pliegues (división de la muestra) + ### -> Número de veces que es necesario repetir el validador cruzado. 
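+    ### -> v_alpha: quantile level of the quantile (pinball) loss. Fitting this
+    ###    model once with a low alpha and once with a high alpha (e.g. 0.05 and
+    ###    0.95) gives the lower and upper bounds later combined into the
+    ###    prediction intervals; the exact alphas are chosen by the caller and
+    ###    are not fixed in this file.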
+ param_grid = {'max_features' : ['auto', 'sqrt', 'log2'], + 'max_depth' : [None, 1, 3, 5, 10, 20], + 'subsample' : [0.5, 1], + 'learning_rate' : [0.001, 0.01, 0.1] + } + grid = GridSearchCV( + estimator = GradientBoostingRegressor( + loss ="quantile", + alpha = v_alpha, + n_estimators = nstima, + random_state = 123, + # Activación de la parada temprana + validation_fraction = 0.1, + n_iter_no_change = 5, + tol = 0.0001 + ), + param_grid = param_grid, + scoring = 'neg_root_mean_squared_error', + n_jobs = multiprocessing.cpu_count() - 1, + cv = RepeatedKFold(n_splits=nsplit, n_repeats=nrept, random_state=123), + refit = True, + verbose = 0, + return_train_score = True + ) + LoUp_M = grid.fit(X = Xtrain, y = ytrain) + + # ================================================================================== + # Part 05: # Results + # ============================================================================== + resultados = pd.DataFrame(LoUp_M.cv_results_) + LoUp_Table = resultados.filter(regex = '(param.*|mean_t|std_t)') \ + .drop(columns = 'params') \ + .sort_values('mean_test_score', ascending = False) \ + .head(4) + ### Output: + ### -> grid.fit en validación cruzada, mejores metricas de aprendizaje + ### -> Tabla de resultados con los mejores parametros + return (LoUp_M,LoUp_Table) + + +### +### Quantile Loss results +# ============================================================================== +def Best_Qua (Xtest, ytest, GridFit): + # Mejores hiperparámetros por validación cruzada + print("----------------------------------------") + print("Mejores hiperparámetros encontrados (cv)") + print("----------------------------------------") + print(GridFit.best_params_, ":", GridFit.best_score_, GridFit.scoring) + # Número de árboles del modelo final (early stopping) + # ============================================================================== + print("----------------------------------------") + print(f"Número de árboles del modelo: {GridFit.best_estimator_.n_estimators_}") + print("----------------------------------------") + # Error de test del modelo final + # ============================================================================== + F_LowUp = GridFit.best_estimator_ + LwUp_predi = F_LowUp.predict(X = Xtest) + rmse = mean_squared_error( + y_true = ytest, + y_pred = LwUp_predi, + squared = False + ) + print("----------------------------------------") + print(f"El error (rmse) de test es: {round(rmse, 3)}") + print("----------------------------------------") + return (F_LowUp, LwUp_predi) +### Output: + ### -> Mejor estimador en grid.fit, mejores parametros de aprendizaje (F Model) + ### -> Valores de predicción (Y_prediction) + +# ============================================================================== +def Frame_Predic (T_Predic,yUpper,yLow): + ### Input: + ### -> Tabla de predicciones XGboost con campos index, actual, prediccion y diferencia (y_prediction) + ### -> Predicciones de perdida en cuantil inferior (y_Low) + ### -> Predicciones de perdida en cualtil superior (y_upper) + T_Prediccions = T_Predic.copy() + T_Prediccions['y_Upper'] = pd.Series(np.reshape(yUpper, (yUpper.shape[0]))) + T_Prediccions['y_Low'] = pd.Series(np.reshape(yLow, (yLow.shape[0]))) + T_export = T_Prediccions.copy() + T_export['Actual'] = np.expm1(T_export.Actual) + T_export['Prediccion'] = np.expm1(T_export.Prediccion) + T_export['diff'] = T_export['Actual'] - T_export['Prediccion'] + T_export['y_Low'] = np.expm1(T_export.y_Low) + T_export['y_Upper'] = np.expm1(T_export.y_Upper) + T_export = 
T_export.rename(columns={"Prediccion": "Y_Pred", + "diff": "Diff", + "y_Low": "Y_Low", + "y_Upper": "Y_Upper"}) + return (T_export, T_Prediccions) +# ============================================================================== + + +def Graph_Trace(T_Pred,HTML_Name): + + trace1=go.Scatter(x = T_Pred.index, y = np.expm1(T_Pred.y_Upper), + name = 'high', + fill = None, + mode = 'lines', + line=dict(color='#00BB3F', width=.5)) + + trace2=go.Scatter(x = T_Pred.index, y = np.expm1(T_Pred.y_Low), + name = 'low', + fill = 'tonexty', + mode = 'lines', + line=dict(color='#00BB3F', width=.5)) + + trace3 = go.Scatter(x=T_Pred.index, y = np.expm1(T_Pred.Actual), + mode='markers', + marker=dict(symbol='circle', color='#FF7F00', size = 3), + name = "Actual") + + trace4 = go.Scatter(x=T_Pred.index, y= np.expm1(T_Pred.Prediccion), + mode='markers', + marker=dict(symbol='circle', color='#044367', size = 3), + name="Prediction") + + data=[trace1,trace2,trace3,trace4 ] + fig = go.Figure(data=data) + + fig.update_layout(hovermode='x unified') + fig['layout']['xaxis']['title']='Inmueble' + fig['layout']['yaxis']['title']='Precio' + + + fig.update_layout( title="House Prices XGBOOST", legend_title="Simbología", + font=dict( family="Averta light, monospace", + size=10, + color="black") ) + + fig.update_xaxes(showspikes=True, spikecolor="purple", spikesnap="cursor", spikemode="across") + fig.update_yaxes(showspikes=True, spikecolor="purple", spikethickness = 1) + fig.update_layout(spikedistance=1000, hoverdistance=100) + + fig.show() + fig.write_html(HTML_Name) diff --git a/src/Welch_Test.py b/src/Welch_Test.py new file mode 100644 index 0000000..d182d33 --- /dev/null +++ b/src/Welch_Test.py @@ -0,0 +1,181 @@ +############################################################################# +##### Welch tests + +############################################################################# + +#### Imports + +import numpy as np +import pandas as pd +import seaborn as sns +from matplotlib import rc +import matplotlib.pyplot as plt +from scipy.stats import ttest_ind +from itertools import combinations +from statannot import add_stat_annotation + + +############################################################################# +##### Input - output paths + +Entradas = '../data' +Salidas = '../data' + +############################################################################# +##### WELCH + +def Welch_test(Data, Dimension_cluster, dimension_analisis): + if ((Data.columns[-1] == "AZP")|(Data.columns[-1] == "K_Means")): + Data[Dimension_cluster] = Data[Dimension_cluster].replace([0, 1, 2, 3, 4],[1, 2, 3, 4, 5]) + DATA = Data[[ Dimension_cluster, dimension_analisis]] + + Sub_01 = DATA[DATA[Dimension_cluster] == 1][dimension_analisis].dropna() + Sub_02 = DATA[DATA[Dimension_cluster] == 2][dimension_analisis].dropna() + Sub_03 = DATA[DATA[Dimension_cluster] == 3][dimension_analisis].dropna() + Sub_04 = DATA[DATA[Dimension_cluster] == 4][dimension_analisis].dropna() + Sub_05 = DATA[DATA[Dimension_cluster] == 5][dimension_analisis].dropna() + + Sub_01_err = 1.96*(Sub_01.std())/(np.sqrt(Sub_01.shape[0])) + Sub_02_err = 1.96*(Sub_02.std())/(np.sqrt(Sub_02.shape[0])) + Sub_03_err = 1.96*(Sub_03.std())/(np.sqrt(Sub_03.shape[0])) + Sub_04_err = 1.96*(Sub_04.std())/(np.sqrt(Sub_04.shape[0])) + Sub_05_err = 1.96*(Sub_05.std())/(np.sqrt(Sub_05.shape[0])) + + + New_Data = pd.DataFrame([['01', Sub_01.mean(), Sub_01_err], + ['02', Sub_02.mean(), Sub_02_err], + ['03', Sub_03.mean(), Sub_03_err], + ['04', Sub_04.mean(), Sub_04_err], + 
['05', Sub_05.mean(), Sub_05_err]], + columns=['x', 'y', 'yerr']) + else: + DATA = Data[[ Dimension_cluster, dimension_analisis]] + + Sub_01 = DATA[DATA[Dimension_cluster] == 1][dimension_analisis].dropna() + Sub_02 = DATA[DATA[Dimension_cluster] == 2][dimension_analisis].dropna() + Sub_03 = DATA[DATA[Dimension_cluster] == 3][dimension_analisis].dropna() + Sub_04 = DATA[DATA[Dimension_cluster] == 4][dimension_analisis].dropna() + Sub_05 = DATA[DATA[Dimension_cluster] == 5][dimension_analisis].dropna() + + Sub_01_err = 1.96*(Sub_01.std())/(np.sqrt(Sub_01.shape[0])) + Sub_02_err = 1.96*(Sub_02.std())/(np.sqrt(Sub_02.shape[0])) + Sub_03_err = 1.96*(Sub_03.std())/(np.sqrt(Sub_03.shape[0])) + Sub_04_err = 1.96*(Sub_04.std())/(np.sqrt(Sub_04.shape[0])) + Sub_05_err = 1.96*(Sub_05.std())/(np.sqrt(Sub_05.shape[0])) + + New_Data = pd.DataFrame([['01', Sub_01.mean(), Sub_01_err], + ['02', Sub_02.mean(), Sub_02_err], + ['03', Sub_03.mean(), Sub_03_err], + ['04', Sub_04.mean(), Sub_04_err], + ['05', Sub_05.mean(), Sub_05_err]], + columns=['x', 'y', 'yerr']) + + return DATA, New_Data + +############################################################################# +##### WELCH Plots + +def MedErr_graph (Data, Title): + + def formatter(x, pos): + return str(round(x / 1e6, 1)) + " MDP" + + color = ['#A65E1A','#E9C27D','#B2B2B2','#80CDC1','#018571'] + + sns.set(font_scale = 1.5, font="Algerian") + sns.set_style("white") + + fig = plt.figure (figsize = (16,8), facecolor = "white") + + ax = sns.barplot(x = Data.x, + y = Data.y, + yerr = Data.yerr, + palette = color) + ax.set(yticks=[Data.y.mean(), Data.y.max()]) + ax.yaxis.set_tick_params(labelsize = 18) + ax.yaxis.set_major_formatter(formatter) + ax.xaxis.label.set_visible(False) + plt.ylabel(' $\overline{X}_{Price}$',fontsize = 18) + sns.despine() + plt.savefig(Title, dpi=300, bbox_inches='tight') + + +############################################################################# +##### t-TEST & p-VALUE + +def testWelch(Data): + + dp_drup = dict(tuple(Data.drop_duplicates(inplace=False).groupby(Data.columns[0]))) + + def t_test(pair): + results= ttest_ind(dp_drup[pair[0]][Data.columns[-1]], + dp_drup[pair[1]][Data.columns[-1]]) + return (results) + + all_combinations = list(combinations(list(dp_drup.keys()), 2)) + t_test = pd.DataFrame([t_test(i) for i in all_combinations]) + Combina = pd.DataFrame(all_combinations, columns =['Ld', 'Jm']) + + New_Data = pd.concat([t_test, Combina], axis=1) + + New_Data = New_Data.replace({1:'01', 2:'02', + 3:'03', 4:'04', + 5:'05'}) + + New_Data = New_Data[['Ld', 'Jm', 'statistic','pvalue']] + + return(New_Data) + + +############################################################################# +##### WELCH plots (p-VALUES) + +def Welch_full (Table, Test, Title): + + def formatter(x, pos): + return str(round(x / 1e6, 1)) + " MDP" + + color = ['#A65E1A','#E9C27D','#B2B2B2','#80CDC1','#018571'] + + sns.set(font_scale = 1, font="Algerian") + sns.set_style("white") + + fig = plt.figure (figsize = (16,8), facecolor = "white") + + ax = sns.barplot(x = Table.x, + y = Table.y, + yerr = Table.yerr, + palette = color) + + add_stat_annotation(ax, + x = Table.x, + y = Table.y, + order = Table.x, + box_pairs = Test[['Ld', 'Jm']].values.tolist(), + perform_stat_test = False, + pvalues = Test["pvalue"].values.tolist(), + test = None, + text_format = 'simple', + loc='outside', + verbose=2); + + ax.yaxis.set_major_formatter(formatter) + ax.xaxis.label.set_visible(False) + plt.ylabel(' $\overline{X}_{Price}$') + plt.title(Title, fontsize = 
12)
+    sns.despine()
+    plt.savefig(Title, dpi = 300, bbox_inches = 'tight')
+
+
+#############################################################################
+##### Example
+
+PRUEBA = pd.read_csv(Entradas + '/TSNE_DEPAS_DEPAS.csv')
+
+DATA, TABLE = Welch_test(PRUEBA, 't_SNE', 'precio')
+
+MedErr_graph(TABLE, 'TITULO SALIDA')
+
+TEST = testWelch(DATA)
+
+Welch_full(TABLE, TEST, 'TITULO SALIDA')
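+
+#############################################################################
+##### Illustrative single comparison (sketch only, not part of the pipeline)
+
+# A minimal sketch of one pairwise comparison on the example data loaded
+# above; it assumes PRUEBA contains the 'precio' and 't_SNE' columns used in
+# the example call. Passing equal_var = False makes scipy's ttest_ind run
+# Welch's unequal-variance t-test (testWelch above calls ttest_ind with its
+# default settings).
+
+grupo_1 = PRUEBA.loc[PRUEBA['t_SNE'] == 1, 'precio'].dropna()
+grupo_2 = PRUEBA.loc[PRUEBA['t_SNE'] == 2, 'precio'].dropna()
+stat_w, pval_w = ttest_ind(grupo_1, grupo_2, equal_var = False)
+print('Welch t-statistic:', round(stat_w, 3), 'p-value:', round(pval_w, 4))
+
+# The 95% interval plotted per submarket corresponds to mean ± 1.96 * std / sqrt(n),
+# matching the yerr construction in Welch_test above.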