Commit

Added script files.

plablo09 committed Dec 20, 2022
1 parent f1ab368 commit 7c2e78b
Showing 7 changed files with 1,167 additions and 0 deletions.
13 changes: 13 additions & 0 deletions README.md
@@ -1,2 +1,15 @@
# housing-submarkets-cdmx
Supplementary material for peer review


This repository contains the scripts we used to perform the analysis in _An interdisciplinary framework for the comparison of geography aware housing market segmentation models using web scraped listing price data for Mexico City_

All scripts are in the _src_ folder:

* Accessibility.py contains the code used to model accessibility to amenities
* Clustering.py contains all the clustering code used to define the submarkets
* Outliers_LOF.py contains the code used to filter outliers
* Prediction.py contains the XGBoost model fitting procedure
* Welch_Test.py contains the code used to obtain confidence intervals for prices within submarkets

All data needed to run the scripts can be found in the `data` folder; a minimal loading sketch is shown below.
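
A minimal sketch of loading the listing data (this assumes the archive wraps a single CSV file, which pandas can read directly; the internal filename and column layout are not shown here):

```python
import pandas as pd

# Assumes data/listing_price_data.zip contains a single CSV file;
# pandas reads such archives directly.
listings = pd.read_csv('data/listing_price_data.zip', compression='zip')
print(listings.shape)
```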
Binary file added data/listing_price_data.zip
Binary file not shown.
117 changes: 117 additions & 0 deletions src/Accessibility.py
@@ -0,0 +1,117 @@
#############################################################################
##### Accessibility
# This file produces an accessibility model for the services defined in the OFERTA file
#############################################################################

#### Imports
from h3 import h3
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, Point
from access import Access, weights

import warnings
warnings.filterwarnings('ignore')

def id2poly(hex_id):
    # h3_to_geo_boundary returns (lat, lng) pairs; shapely expects (lng, lat),
    # hence the reversal with [::-1]
    boundary = [x[::-1] for x in h3.h3_to_geo_boundary(hex_id)]
    return Polygon(boundary)

#############################################################################
##### Inputs

Entrada = '../data/'   # trailing slash: the paths below are built by concatenation
Salida = '../data/'

#############################################################################
##### Base hexagons

HEX_CDMX = gpd.read_file(Entrada + 'HEX_CDMX.shp')        # base H3 hexagons

HOGARES_ALL = gpd.read_file(Entrada + 'HOGARES_ALL.shp')  # households (demand side)

OFERTA = gpd.read_file(Entrada + 'office.shp')            # services (supply side)

#############################################################################
##### Join services to hexagons

Hex_Unidad = gpd.sjoin(HEX_CDMX,
                       OFERTA,
                       how = "right",
                       op = 'contains')  # 'op' was renamed 'predicate' in geopandas >= 0.10

#############################################################################
##### Count services by hexagon

Hex_Unidad = Hex_Unidad.groupby('hex')['name'].count().reset_index()
Hex_Unidad['geometry'] = Hex_Unidad['hex'].apply(id2poly)
Hex_Unidad = gpd.GeoDataFrame(Hex_Unidad,
                              geometry = Hex_Unidad['geometry'],
                              crs = "EPSG:4326")
Hex_Unidad = Hex_Unidad.to_crs(6372)   # EPSG:6372: Mexico ITRF2008 / LCC, in metres

#############################################################################
##### SPATIAL ACCESSIBILITY: GRAVITY MODEL

Unidad_access = Access(demand_df = HOGARES_ALL,
                       demand_index = "hex",
                       demand_value = "TOTHOG",
                       supply_df = Hex_Unidad,
                       supply_index = "hex",
                       supply_value = "name")


Unidad_access.create_euclidean_distance(name = "euclidean_neighbors",
                                        threshold = 5000,   ### units: metres
                                        centroid_d = True,
                                        centroid_o = True)

gravity = weights.gravity( scale = 30,
alpha = -2,
min_dist = 10)

weighted = Unidad_access.weighted_catchment( name = "gravity",
weight_fn = gravity)
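
# Illustrative sketch, not part of the original script: weighted_catchment
# applies the weight function to each pairwise distance, so the object
# returned by weights.gravity can be called directly to inspect its decay.
# The distances below are arbitrary examples in metres.
for d in [10, 100, 500, 1000, 5000]:
    print('distance (m):', d, '-> gravity weight:', gravity(d))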


#############################################################################
##### GRAVITY MODEL WEIGHTS

weighted = weighted.reset_index()
weighted['geometry'] = weighted['hex'].apply(id2poly)
weighted = gpd.GeoDataFrame(weighted,
                            geometry = weighted['geometry'],
                            crs = "EPSG:4326")
# weighted_catchment names its output column '<name>_<supply column>',
# here 'gravity_name'; rename it to something clearer
weighted = weighted.rename({'gravity_name': 'gravity_value'}, axis=1)
weighted['gravity_value'] = weighted['gravity_value'].fillna(0)

print('Max value',weighted.gravity_value.max())
print('Min value',weighted.gravity_value.min())
print('Weighted shape',weighted.shape)

#############################################################################
##### PLOT

BASE = HOGARES_ALL.to_crs(4326).plot( color="black",
markersize=7,
figsize=(11, 11))

weighted.plot(ax = BASE,
              column = "gravity_value",
              scheme = 'Quantiles',   # classification schemes require mapclassify
              cmap = 'viridis',
              classification_kwds = {'k': 8})

OFERTA.to_crs(4326).plot( ax = BASE,
color = "red",
markersize = 1)

#############################################################################
##### Save as csv

weighted[[ 'hex', 'gravity_value']].to_csv(Salida + "Oficinas_SA.csv", index = False)

#############################################################################
209 changes: 209 additions & 0 deletions src/Clustering.py
@@ -0,0 +1,209 @@
#############################################################################
##### KMEANS, t-SNE AND AZP

#############################################################################

#### Imports

import libpysal
from h3 import h3
import numpy as np
import pandas as pd
import geopandas as gpd
from spopt.region import AZP
import matplotlib.pyplot as plt
from shapely.geometry import Polygon
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import PowerTransformer

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import ward, fcluster
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

def id2poly(hex_id):
boundary = [x[::-1] for x in h3.h3_to_geo_boundary(hex_id)]
return Polygon(boundary)

#############################################################################
##### Input - output paths

Entradas = ''   # path to the input data folder
Salidas = ''    # path for the outputs

#############################################################################
#####

#### The input dataset must contain all accessibility and geographic variables
####

DATA_DEPART = pd.read_csv(Entradas + 'DEPART_DIM.csv')

DATA_CASAS = pd.read_csv(Entradas + 'CASA_DIM.csv')

#### Example for apartments dataset

DATA_DPTO = DATA_DEPART.groupby('hex').agg({'superficie':'mean',
'habitacion':'mean',
'baños':'mean',
'Parques_SA':'mean',
'Angel_SA':'mean',
'Teatros_SA':'mean',
'SuperM_SA':'mean',
'Restau_SA':'mean',
'Oficina_SA':'mean',
'MetroBus_SA':'mean',
'Metro_SA':'mean',
'Hospital_SA':'mean',
'GyMFree_SA':'mean',
'GyM_SA':'mean',
'Cinema_SA':'mean',
'EscSecu_SA':'mean',
'EscPr_SA':'mean',
'EscMS_SA':'mean',
'EcoBici_SA':'mean',
'Delitos':'mean',
'Socioeco':'mean'}).reset_index()

LABEL = DATA_DPTO['hex']   # keep the hex ids; they are re-attached for the AZP step
DIMENSIONS = DATA_DPTO.drop(['hex'], axis=1)


# PowerTransformer (Yeo-Johnson by default, with standardization) maps each
# feature to a more Gaussian-like distribution before clustering
S_S = PowerTransformer()
Var_Scales = pd.DataFrame(S_S.fit_transform(DIMENSIONS),
                          columns = DIMENSIONS.columns)


#############################################################################
##### KMEANS CLUSTER


def K_model(Dimensiones_tran, numero_min, numero_max, Title):
    ### Input:
    ### -> transformed explanatory variables
    ### -> minimum number of clusters
    ### -> maximum number of clusters
    model = KMeans()
    visualizer = KElbowVisualizer(model,
                                  k = (numero_min, numero_max),
                                  timings = True)
    visualizer.fit(Dimensiones_tran)
    visualizer.show(outpath = Title, dpi = 300)


K_model(Var_Scales, 2, 10, 'CURVA DATA')
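
# Optional diagnostic, a sketch that is not part of the original pipeline:
# the imports above include SilhouetteVisualizer, which can complement the
# elbow curve for a candidate number of clusters (here 5, as used below).
sil_visualizer = SilhouetteVisualizer(KMeans(n_clusters = 5, random_state = 42))
sil_visualizer.fit(Var_Scales)
sil_visualizer.show(outpath = 'SILHOUETTE DATA', dpi = 300)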


def K_means(Dimensiones_tran, numero_cluster, semilla):
    ### Input:
    ### -> transformed explanatory variables
    ### -> number of clusters
    ### -> random-state seed
    kmeans = KMeans(n_clusters = numero_cluster, random_state = semilla)
    kmeans.fit(Dimensiones_tran)
    clusters = kmeans.predict(Dimensiones_tran)
    ### Output:
    ### -> cluster labels
    return clusters

Clusters_K = K_means(Var_Scales, 5, 42)

#############################################################################
##### t-SNE + WARD CLUSTER

def T_NSE(Dimensiones_tran, Numero_compo, Semilla, Perplexity):
    ### Input:
    ### -> transformed explanatory variables
    ### -> number of components; must be below 4 (the barnes_hut algorithm is based on quad-trees and oct-trees)
    ### -> random-state seed
    ### -> perplexity, within the 5-50 range suggested by van der Maaten & Hinton
    tsn = TSNE(n_components = Numero_compo, random_state = Semilla, perplexity = Perplexity)
    res_tsne = tsn.fit_transform(Dimensiones_tran)
    fig = plt.figure(figsize=(25, 25))
    ax1 = fig.add_subplot(3, 3, 1)
    axpl_1 = sns.scatterplot(x = res_tsne[:, 0], y = res_tsne[:, 1])
    ax2 = fig.add_subplot(3, 3, 2)
    axpl_2 = sns.scatterplot(x = res_tsne[:, 0], y = res_tsne[:, 2])
    ### Output:
    ### -> t-SNE values of the reduced components
    return res_tsne

TNSE_Model = T_NSE(Var_Scales, 3, 42, 50)

### t-SNE clusters: Ward linkage

def Cluster_TNSE(Modelo_tSNE, Cluster_max):
    ### Input:
    ### -> array with the t-SNE components from the dimensionality reduction
    ### -> number of clusters, used with fcluster's 'maxclust' criterion
    link = ward(Modelo_tSNE)
    Cluster_Values = fcluster(link, t = Cluster_max, criterion = 'maxclust')
    fig = plt.figure(figsize = (25, 25))
    ax1 = fig.add_subplot(3, 3, 1)
    # pd.value_counts is deprecated in recent pandas; go through a Series instead
    pd.Series(Cluster_Values).value_counts().plot(kind = 'barh')
    ax2 = fig.add_subplot(3, 3, 2)
    axpl_2 = sns.scatterplot(x = Modelo_tSNE[:, 0], y = Modelo_tSNE[:, 1], hue = Cluster_Values, palette = "Set2")
    ax3 = fig.add_subplot(3, 3, 3)
    axpl_3 = sns.scatterplot(x = Modelo_tSNE[:, 0], y = Modelo_tSNE[:, 2], hue = Cluster_Values, palette = "Set2")
    ### Output:
    ### -> t-SNE cluster labels
    return Cluster_Values


TNSE_Cluster = Cluster_TNSE(TNSE_Model, 5)


### ADD THE CLUSTER LABELS (KMEANS + WARD) TO THE DIMENSIONS TABLE

DATA_DPTO["K_Means"] = Clusters_K
DATA_DPTO["t_SNE"] = TNSE_Cluster


#############################################################################
##### AZP CLUSTER

# Var_Scales was built from DIMENSIONS, which dropped the hex ids;
# re-attach them from LABEL, and keep 'hex' as a column so the merges below can use it
Var_Scales['hex'] = LABEL
Var_Scales['geometry'] = Var_Scales['hex'].apply(id2poly)
Var_Scales = gpd.GeoDataFrame(Var_Scales,
                              geometry = Var_Scales['geometry'],
                              crs = "EPSG:4326")
Var_Scales = Var_Scales.to_crs(6372)   # EPSG:6372: Mexico ITRF2008 / LCC, in metres


COLUMN_DIMEN = ['superficie', 'habitacion', 'baños', 'Parques_SA',
'Angel_SA', 'Teatros_SA', 'SuperM_SA', 'Restau_SA', 'Oficina_SA',
'MetroBus_SA', 'Metro_SA', 'Hospital_SA', 'GyMFree_SA', 'GyM_SA',
'Cinema_SA', 'EscSecu_SA', 'EscPr_SA', 'EscMS_SA',
'EcoBici_SA', 'Delitos', 'Socioeco']

#############################################################################
##### AZP: SPATIAL WEIGHTS AND MODEL FIT

KNN_MATRIX = libpysal.weights.KNN.from_dataframe(Var_Scales, k = 8)

model = AZP(Var_Scales, KNN_MATRIX, COLUMN_DIMEN, 5)
model.solve()

Var_Scales['AZP'] = model.labels_
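
# Quick sanity check, mirroring the print diagnostics in Accessibility.py
# (a sketch, not part of the original script): size of each AZP submarket.
print(pd.Series(model.labels_).value_counts())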

DATA_ALL = DATA_DPTO.merge(Var_Scales[['hex', 'AZP']],
                           on = 'hex',
                           how = 'left')

DPTO_CLUSTER_ALL = DATA_DEPART.merge(DATA_ALL[['hex',
                                               'AZP',
                                               'K_Means',
                                               't_SNE']],
                                     on = 'hex',
                                     how = 'left')


#############################################################################
##### Write as csv

DPTO_CLUSTER_ALL.to_csv(Salidas + "DPTO_CLUSTER_ALL.csv", index = False)