Showing 7 changed files with 1,167 additions and 0 deletions.
@@ -1,2 +1,15 @@
# housing-submarkets-cdmx
Supplementary material for peer review

This repository contains the scripts we used to perform the analysis in _An interdisciplinary framework for the comparison of geography aware housing market segmentation models using web scraped listing price data for Mexico City_.

All scripts are in the _src_ folder:

* Accesibility.py contains the code used to model accessibility to amenities
* Clustering.py contains all the clustering code used to define submarkets
* Outliers_LOF.py contains the code to filter outliers
* Prediction.py contains the XGBoost model fitting procedure
* Welch_Test.py contains the code to obtain confidence intervals for prices in submarkets

All data needed to run the scripts can be found in the `data` folder.
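A minimal, hypothetical driver for running the whole pipeline from the _src_ folder. The run order is an assumption about how the outputs feed each other, each script is standalone, and `run_all.py` is not part of the repository:

    # run_all.py -- hypothetical helper, not part of this repository
    import runpy

    # Assumed order: accessibility surfaces -> outlier filtering -> clustering
    # -> price prediction -> submarket price intervals
    for script in ["Accesibility.py", "Outliers_LOF.py", "Clustering.py",
                   "Prediction.py", "Welch_Test.py"]:
        runpy.run_path(script, run_name="__main__")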
Binary file not shown.
@@ -0,0 +1,117 @@
#############################################################################
##### Accessibility
# This file produces an accessibility model for the services defined in the OFERTA file
#############################################################################

#### Imports
from h3 import h3
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, Point
from access import Access, weights, Datasets

import warnings
warnings.filterwarnings('ignore')

def id2poly(hex_id):
    # h3_to_geo_boundary returns (lat, lng) pairs; reverse each to (lng, lat) for shapely
    boundary = [x[::-1] for x in h3.h3_to_geo_boundary(hex_id)]
    return Polygon(boundary)

#############################################################################
##### Inputs

Entrada = '../data/'   # trailing slash: file names are concatenated below
Salida = '../data/'

#############################################################################
##### Base hexagons

HEX_CDMX = gpd.read_file(Entrada + 'HEX_CDMX.shp')

HOGARES_ALL = gpd.read_file(Entrada + 'HOGARES_ALL.shp')

OFERTA = gpd.read_file(Entrada + 'office.shp')

#############################################################################
##### Join services to hexagons

Hex_Unidad = gpd.sjoin( HEX_CDMX,
                        OFERTA,
                        how = "right",
                        op = 'contains')   # note: newer geopandas renames 'op' to 'predicate'

#############################################################################
##### Count services by hexagon

Hex_Unidad = Hex_Unidad.groupby('hex')['name'].count().reset_index()
Hex_Unidad['geometry'] = Hex_Unidad['hex'].apply(id2poly)
Hex_Unidad = gpd.GeoDataFrame( Hex_Unidad,
                               geometry = Hex_Unidad['geometry'])
Hex_Unidad = Hex_Unidad.set_crs("EPSG:4326")   # H3 boundaries are WGS84
Hex_Unidad = Hex_Unidad.to_crs(6372)           # project to a metric CRS for Mexico
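# At this point Hex_Unidad has one row per H3 cell that contains at least one
# service, with 'name' holding the count of services in that cell; cells with
# no service are simply absent here, which is one reason the gravity scores
# are later filled with 0.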

#############################################################################
##### SPATIAL ACCESSIBILITY : GRAVITY MODEL

Unidad_access = Access( demand_df = HOGARES_ALL,
                        demand_index = "hex",
                        demand_value = "TOTHOG",   ### TOTHOG: total households per hexagon
                        supply_df = Hex_Unidad,
                        supply_index = "hex",
                        supply_value = "name")


Unidad_access.create_euclidean_distance( name = "euclidean_neighbors",
                                         threshold = 5000,   ### units: metres
                                         centroid_d = True,
                                         centroid_o = True)

gravity = weights.gravity( scale = 30,
                           alpha = -2,
                           min_dist = 10)
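# access.weights.gravity builds a power-law distance-decay weight, roughly
# w(d) = scale * d ** alpha with d floored at min_dist. With scale = 30 and
# alpha = -2, a service ~100 m away contributes about 30 / 100**2 = 0.003 to a
# hexagon's score, while one ~1,000 m away contributes about 0.00003, so the
# score is dominated by services close to the hexagon within the 5 km threshold.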

weighted = Unidad_access.weighted_catchment( name = "gravity",
                                             weight_fn = gravity)


#############################################################################
##### GRAVITY MODEL WEIGHTS

weighted = weighted.reset_index()
weighted['geometry'] = weighted['hex'].apply(id2poly)
weighted = gpd.GeoDataFrame( weighted,
                             geometry = weighted['geometry'])
weighted = weighted.set_crs("EPSG:4326")
# the catchment score comes back in a column combining the catchment name and
# the supply column ('gravity_name'); rename it for clarity
weighted = weighted.rename({'gravity_name': 'gravity_value'}, axis=1)
weighted['gravity_value'] = weighted['gravity_value'].fillna(0)

print('Max value', weighted.gravity_value.max())
print('Min value', weighted.gravity_value.min())
print('Weighted shape', weighted.shape)

#############################################################################
##### PLOT

BASE = HOGARES_ALL.to_crs(4326).plot( color = "black",
                                      markersize = 7,
                                      figsize = (11, 11))

weighted.plot( ax = BASE,
               column = "gravity_value",
               scheme = 'Quantiles',
               cmap = 'viridis',
               classification_kwds = {'k': 8})

OFERTA.to_crs(4326).plot( ax = BASE,
                          color = "red",
                          markersize = 1)

#############################################################################
##### Save as csv

weighted[['hex', 'gravity_value']].to_csv(Salida + "Oficinas_SA.csv", index = False)
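# Each amenity layer is presumably run through this same script, producing one
# '<amenity>_SA.csv' per service type; these become the *_SA accessibility
# columns (e.g. Oficina_SA) that Clustering.py expects in its input table.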

#############################################################################
@@ -0,0 +1,209 @@
#############################################################################
##### KMEANS, TSNE AND AZP

#############################################################################

#### Imports

import libpysal
from h3 import h3
import numpy as np
import pandas as pd
import geopandas as gpd
from spopt.region import AZP
import matplotlib.pyplot as plt
from shapely.geometry import Polygon
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import PowerTransformer

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import ward, fcluster
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

def id2poly(hex_id):
    boundary = [x[::-1] for x in h3.h3_to_geo_boundary(hex_id)]
    return Polygon(boundary)

#############################################################################
##### Input - output paths

Entradas = ''
Salidas = ''

#############################################################################
#####

#### Input dataset must have all accessibility and geographic variables
####

DATA_DEPART = pd.read_csv(Entradas + 'DEPART_DIM.csv')

DATA_CASAS = pd.read_csv(Entradas + 'CASA_DIM.csv')

#### Example for the apartments dataset

DATA_DPTO = DATA_DEPART.groupby('hex').agg({'superficie': 'mean',
                                            'habitacion': 'mean',
                                            'baños': 'mean',
                                            'Parques_SA': 'mean',
                                            'Angel_SA': 'mean',
                                            'Teatros_SA': 'mean',
                                            'SuperM_SA': 'mean',
                                            'Restau_SA': 'mean',
                                            'Oficina_SA': 'mean',
                                            'MetroBus_SA': 'mean',
                                            'Metro_SA': 'mean',
                                            'Hospital_SA': 'mean',
                                            'GyMFree_SA': 'mean',
                                            'GyM_SA': 'mean',
                                            'Cinema_SA': 'mean',
                                            'EscSecu_SA': 'mean',
                                            'EscPr_SA': 'mean',
                                            'EscMS_SA': 'mean',
                                            'EcoBici_SA': 'mean',
                                            'Delitos': 'mean',
                                            'Socioeco': 'mean'}).reset_index()

LABEL = DATA_DPTO['hex']
DIMENSIONS = DATA_DPTO.drop(['hex'], axis=1)


S_S = PowerTransformer()
Var_Scales = pd.DataFrame(S_S.fit_transform(DIMENSIONS),
                          columns = DIMENSIONS.columns)
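# PowerTransformer defaults to the Yeo-Johnson transform with standardisation,
# so every attribute and accessibility column is made roughly Gaussian with
# zero mean and unit variance before the distance-based clustering below.
# A quick optional check of the effect (not in the original script):
# print(DIMENSIONS.skew().round(2))
# print(Var_Scales.skew().round(2))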

#############################################################################
##### KMEANS CLUSTER


def K_model(Dimensiones_tran, numero_min, numero_max, Title):
    ### Input:
    ### -> transformed explanatory variables
    ### -> minimum number of clusters
    ### -> maximum number of clusters
    model = KMeans()
    visualizer = KElbowVisualizer( model,
                                   k = (numero_min, numero_max),
                                   timings = True)
    visualizer.fit(Dimensiones_tran)
    visualizer.show(outpath = Title, dpi = 300)


K_model(Var_Scales, 2, 10, 'CURVA DATA')


def K_means(Dimensiones_tran, numero_cluster, semilla):
    ### Input:
    ### -> transformed explanatory variables
    ### -> number of clusters
    ### -> random-state seed
    kmeans = KMeans(n_clusters = numero_cluster, random_state = semilla)
    kmeans.fit(Dimensiones_tran)
    clusters = kmeans.predict(Dimensiones_tran)
    ### Output:
    ### -> cluster labels
    return (clusters)

Clusters_K = K_means(Var_Scales, 5, 42)
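# SilhouetteVisualizer is imported above but not used as committed; a minimal
# sketch of how it could complement the elbow plot, using the same k = 5 and
# seed = 42 chosen above (kept commented out so the script's behaviour is
# unchanged):
# sil = SilhouetteVisualizer(KMeans(n_clusters = 5, random_state = 42))
# sil.fit(Var_Scales)
# sil.show()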

#############################################################################
##### Ward CLUSTER

def T_NSE(Dimensiones_tran, Numero_compo, Semilla, Perplexity):
    ### Input:
    ### -> transformed explanatory variables
    ### -> number of components; must be below 4 (barnes_hut algorithm, based on quad-trees and oct-trees)
    ### -> random-state seed
    ### -> perplexity, in the 5-50 range suggested by van der Maaten & Hinton
    tsn = TSNE(n_components = Numero_compo, random_state = Semilla, perplexity = Perplexity)
    res_tsne = tsn.fit_transform(Dimensiones_tran)
    fig = plt.figure(figsize = (25, 25))
    ax1 = fig.add_subplot(3, 3, 1)
    axpl_1 = sns.scatterplot(x = res_tsne[:, 0], y = res_tsne[:, 1])
    ax2 = fig.add_subplot(3, 3, 2)
    axpl_2 = sns.scatterplot(x = res_tsne[:, 0], y = res_tsne[:, 2])
    ### Output:
    ### -> t-SNE values of the reduced components
    return (res_tsne)

TNSE_Model = T_NSE(Var_Scales, 3, 42, 50)

### Cluster t-SNE : WARD

def Cluster_TNSE(Modelo_tSNE, Cluster_max):
    ### Input:
    ### -> array with the t-SNE components from the dimensionality reduction
    ### -> number of clusters, under fcluster's 'maxclust' criterion
    link = ward(Modelo_tSNE)
    Cluster_Values = fcluster(link, t = Cluster_max, criterion = 'maxclust')
    fig = plt.figure(figsize = (25, 25))
    ax1 = fig.add_subplot(3, 3, 1)
    pd.Series(Cluster_Values).value_counts().plot(kind = 'barh')
    ax2 = fig.add_subplot(3, 3, 2)
    axpl_2 = sns.scatterplot(x = Modelo_tSNE[:, 0], y = Modelo_tSNE[:, 1], hue = Cluster_Values, palette = "Set2")
    ax3 = fig.add_subplot(3, 3, 3)
    axpl_3 = sns.scatterplot(x = Modelo_tSNE[:, 0], y = Modelo_tSNE[:, 2], hue = Cluster_Values, palette = "Set2")
    ### Output:
    ### -> t-SNE cluster labels
    return (Cluster_Values)


TNSE_Cluster = Cluster_TNSE(TNSE_Model, 5)
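# Optional check (not in the original script): how much the K-means and
# Ward / t-SNE partitions agree, e.g. via the adjusted Rand index:
# from sklearn.metrics import adjusted_rand_score
# print(adjusted_rand_score(Clusters_K, TNSE_Cluster))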


### CLUSTER LABELS (KMEANS + WARD) IN THE DIMENSIONS TABLE

DATA_DPTO["K_Means"] = Clusters_K
DATA_DPTO["t_SNE"] = TNSE_Cluster


#############################################################################
##### AZP CLUSTER

# Re-attach the hexagon id (dropped before scaling) so the geometry can be
# rebuilt from the H3 index
Var_Scales['hex'] = LABEL
Var_Scales['geometry'] = Var_Scales['hex'].apply(id2poly)
Var_Scales = gpd.GeoDataFrame( Var_Scales,
                               geometry = Var_Scales['geometry'])
Var_Scales = Var_Scales.set_crs("EPSG:4326").to_crs(6372).set_index('hex')


COLUMN_DIMEN = ['superficie', 'habitacion', 'baños', 'Parques_SA',
                'Angel_SA', 'Teatros_SA', 'SuperM_SA', 'Restau_SA', 'Oficina_SA',
                'MetroBus_SA', 'Metro_SA', 'Hospital_SA', 'GyMFree_SA', 'GyM_SA',
                'Cinema_SA', 'EscSecu_SA', 'EscPr_SA', 'EscMS_SA',
                'EcoBici_SA', 'Delitos', 'Socioeco']

#############################################################################
##### AZP CLUSTER

# k-nearest-neighbour spatial weights over the hexagons; an H3 cell has at
# most 6 direct neighbours, so k = 8 also links cells across small gaps
KKN_MATRIX = libpysal.weights.KNN.from_dataframe(Var_Scales, k = 8)

model = AZP(Var_Scales, KKN_MATRIX, COLUMN_DIMEN, 5)
model.solve()

Var_Scales['AZP'] = model.labels_

# 'hex' is the index of Var_Scales here, so bring it back as a column before merging
DATA_ALL = DATA_DPTO.merge( Var_Scales.reset_index()[['hex', 'AZP']],
                            left_on = 'hex',
                            right_on = 'hex',
                            how = 'left')

DPTO_CLUSTER_ALL = DATA_DEPART.merge( DATA_ALL[['hex',
                                                'AZP',
                                                'K_Means',
                                                't_SNE']],
                                      left_on = 'hex',
                                      right_on = 'hex',
                                      how = 'left')


#############################################################################
##### Write as csv

DPTO_CLUSTER_ALL.to_csv(Salidas + "DPTO_CLUSTER_ALL.csv", index = False)