Commit

Added script files.

plablo09 committed Dec 20, 2022
1 parent f1ab368 commit 7c2e78b
Showing 7 changed files with 1,167 additions and 0 deletions.
13 changes: 13 additions & 0 deletions README.md
@@ -1,2 +1,15 @@
# housing-submarkets-cdmx
Supplementary material for peer review


This repository contains the scripts we used to perform the analysis in _An interdisciplinary framework for the comparison of geography aware housing market segmentation models using web scraped listing price data for Mexico City_

All scripts are in the _src_ folder:

* Accessibility.py contains the code used to model accessibility to amenities
* Clustering.py contains all the clustering code used to define the submarkets
* Outliers_LOF.py contains the code used to filter outliers
* Prediction.py contains the XGBoost model fitting procedure
* Welch_Test.py contains the code used to obtain confidence intervals for prices within submarkets

All data needed to run the scripts can be found in the `data` folder; a minimal loading sketch is shown below.
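
A minimal sketch of loading the listing data (this assumes the archive wraps a single CSV file, which pandas can read directly; the internal filename and column layout are not shown here):

```python
import pandas as pd

# Assumes data/listing_price_data.zip contains a single CSV file;
# pandas reads such archives directly.
listings = pd.read_csv('data/listing_price_data.zip', compression='zip')
print(listings.shape)
```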
Binary file added data/listing_price_data.zip
Binary file not shown.
117 changes: 117 additions & 0 deletions src/Accessibility.py
@@ -0,0 +1,117 @@
#############################################################################
##### Accessibility
# This file produces an accessibility model for the services defined in the OFERTA file
#############################################################################

#### Imports
from h3 import h3
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, Point
from access import Access, weights

import warnings
warnings.filterwarnings('ignore')

def id2poly(hex_id):
    # h3_to_geo_boundary returns (lat, lng) pairs; shapely expects (lng, lat),
    # hence the reversal with [::-1]
    boundary = [x[::-1] for x in h3.h3_to_geo_boundary(hex_id)]
    return Polygon(boundary)

#############################################################################
##### Inputs

Entrada = '../data/'   # trailing slash: the paths below are built by concatenation
Salida = '../data/'

#############################################################################
##### Base hexagons

HEX_CDMX = gpd.read_file(Entrada + 'HEX_CDMX.shp')        # base H3 hexagons

HOGARES_ALL = gpd.read_file(Entrada + 'HOGARES_ALL.shp')  # households (demand side)

OFERTA = gpd.read_file(Entrada + 'office.shp')            # services (supply side)

#############################################################################
##### Join services to hexagons

Hex_Unidad = gpd.sjoin(HEX_CDMX,
                       OFERTA,
                       how = "right",
                       op = 'contains')  # 'op' was renamed 'predicate' in geopandas >= 0.10

#############################################################################
##### Count services by hexagon

Hex_Unidad = Hex_Unidad.groupby('hex')['name'].count().reset_index()
Hex_Unidad['geometry'] = Hex_Unidad['hex'].apply(id2poly)
Hex_Unidad = gpd.GeoDataFrame(Hex_Unidad,
                              geometry = Hex_Unidad['geometry'],
                              crs = "EPSG:4326")
Hex_Unidad = Hex_Unidad.to_crs(6372)   # EPSG:6372: Mexico ITRF2008 / LCC, in metres

#############################################################################
##### SPATIAL ACCESSIBILITY: GRAVITY MODEL

Unidad_access = Access(demand_df = HOGARES_ALL,
                       demand_index = "hex",
                       demand_value = "TOTHOG",
                       supply_df = Hex_Unidad,
                       supply_index = "hex",
                       supply_value = "name")


Unidad_access.create_euclidean_distance(name = "euclidean_neighbors",
                                        threshold = 5000,   ### units: metres
                                        centroid_d = True,
                                        centroid_o = True)

gravity = weights.gravity( scale = 30,
alpha = -2,
min_dist = 10)

weighted = Unidad_access.weighted_catchment( name = "gravity",
weight_fn = gravity)
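
# Illustrative sketch, not part of the original script: weighted_catchment
# applies the weight function to each pairwise distance, so the object
# returned by weights.gravity can be called directly to inspect its decay.
# The distances below are arbitrary examples in metres.
for d in [10, 100, 500, 1000, 5000]:
    print('distance (m):', d, '-> gravity weight:', gravity(d))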


#############################################################################
##### GRAVITY MODEL WEIGHTS

weighted = weighted.reset_index()
weighted['geometry'] = weighted['hex'].apply(id2poly)
weighted = gpd.GeoDataFrame(weighted,
                            geometry = weighted['geometry'],
                            crs = "EPSG:4326")
# weighted_catchment names its output column '<name>_<supply column>',
# here 'gravity_name'; rename it to something clearer
weighted = weighted.rename({'gravity_name': 'gravity_value'}, axis=1)
weighted['gravity_value'] = weighted['gravity_value'].fillna(0)

print('Max value',weighted.gravity_value.max())
print('Min value',weighted.gravity_value.min())
print('Weighted shape',weighted.shape)

#############################################################################
##### PLOT

BASE = HOGARES_ALL.to_crs(4326).plot( color="black",
markersize=7,
figsize=(11, 11))

weighted.plot(ax = BASE,
              column = "gravity_value",
              scheme = 'Quantiles',   # classification schemes require mapclassify
              cmap = 'viridis',
              classification_kwds = {'k': 8})

OFERTA.to_crs(4326).plot( ax = BASE,
color = "red",
markersize = 1)

#############################################################################
##### Save as csv

weighted[[ 'hex', 'gravity_value']].to_csv(Salida + "Oficinas_SA.csv", index = False)

#############################################################################
209 changes: 209 additions & 0 deletions src/Clustering.py
@@ -0,0 +1,209 @@
#############################################################################
##### KMEANS, t-SNE AND AZP

#############################################################################

#### Imports

import libpysal
from h3 import h3
import numpy as np
import pandas as pd
import geopandas as gpd
from spopt.region import AZP
import matplotlib.pyplot as plt
from shapely.geometry import Polygon
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import PowerTransformer

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import ward, fcluster
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

def id2poly(hex_id):
boundary = [x[::-1] for x in h3.h3_to_geo_boundary(hex_id)]
return Polygon(boundary)

#############################################################################
##### Input - output paths

Entradas = ''   # path to the input data folder
Salidas = ''    # path for the outputs

#############################################################################
#####

#### The input dataset must contain all accessibility and geographic variables
####

DATA_DEPART = pd.read_csv(Entradas + 'DEPART_DIM.csv')

DATA_CASAS = pd.read_csv(Entradas + 'CASA_DIM.csv')

#### Example for apartments dataset

DATA_DPTO = DATA_DEPART.groupby('hex').agg({'superficie':'mean',
'habitacion':'mean',
'baños':'mean',
'Parques_SA':'mean',
'Angel_SA':'mean',
'Teatros_SA':'mean',
'SuperM_SA':'mean',
'Restau_SA':'mean',
'Oficina_SA':'mean',
'MetroBus_SA':'mean',
'Metro_SA':'mean',
'Hospital_SA':'mean',
'GyMFree_SA':'mean',
'GyM_SA':'mean',
'Cinema_SA':'mean',
'EscSecu_SA':'mean',
'EscPr_SA':'mean',
'EscMS_SA':'mean',
'EcoBici_SA':'mean',
'Delitos':'mean',
'Socioeco':'mean'}).reset_index()

LABEL = DATA_DPTO['hex']   # keep the hex ids; they are re-attached for the AZP step
DIMENSIONS = DATA_DPTO.drop(['hex'], axis=1)


# PowerTransformer (Yeo-Johnson by default, with standardization) maps each
# feature to a more Gaussian-like distribution before clustering
S_S = PowerTransformer()
Var_Scales = pd.DataFrame(S_S.fit_transform(DIMENSIONS),
                          columns = DIMENSIONS.columns)


#############################################################################
##### KMEANS CLUSTER


def K_model(Dimensiones_tran, numero_min, numero_max, Title):
    ### Input:
    ### -> transformed explanatory variables
    ### -> minimum number of clusters
    ### -> maximum number of clusters
    model = KMeans()
    visualizer = KElbowVisualizer(model,
                                  k = (numero_min, numero_max),
                                  timings = True)
    visualizer.fit(Dimensiones_tran)
    visualizer.show(outpath = Title, dpi = 300)


K_model(Var_Scales, 2, 10, 'CURVA DATA')
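
# Optional diagnostic, a sketch that is not part of the original pipeline:
# the imports above include SilhouetteVisualizer, which can complement the
# elbow curve for a candidate number of clusters (here 5, as used below).
sil_visualizer = SilhouetteVisualizer(KMeans(n_clusters = 5, random_state = 42))
sil_visualizer.fit(Var_Scales)
sil_visualizer.show(outpath = 'SILHOUETTE DATA', dpi = 300)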


def K_means(Dimensiones_tran, numero_cluster, semilla):
    ### Input:
    ### -> transformed explanatory variables
    ### -> number of clusters
    ### -> random-state seed
    kmeans = KMeans(n_clusters = numero_cluster, random_state = semilla)
    kmeans.fit(Dimensiones_tran)
    clusters = kmeans.predict(Dimensiones_tran)
    ### Output:
    ### -> cluster labels
    return clusters

Clusters_K = K_means(Var_Scales, 5, 42)

#############################################################################
##### t-SNE + WARD CLUSTER

def T_NSE(Dimensiones_tran, Numero_compo, Semilla, Perplexity):
    ### Input:
    ### -> transformed explanatory variables
    ### -> number of components; must be below 4 (the barnes_hut algorithm is based on quad-trees and oct-trees)
    ### -> random-state seed
    ### -> perplexity, within the 5-50 range suggested by van der Maaten & Hinton
    tsn = TSNE(n_components = Numero_compo, random_state = Semilla, perplexity = Perplexity)
    res_tsne = tsn.fit_transform(Dimensiones_tran)
    fig = plt.figure(figsize=(25, 25))
    ax1 = fig.add_subplot(3, 3, 1)
    axpl_1 = sns.scatterplot(x = res_tsne[:, 0], y = res_tsne[:, 1])
    ax2 = fig.add_subplot(3, 3, 2)
    axpl_2 = sns.scatterplot(x = res_tsne[:, 0], y = res_tsne[:, 2])
    ### Output:
    ### -> t-SNE values of the reduced components
    return res_tsne

TNSE_Model = T_NSE(Var_Scales, 3, 42, 50)

### t-SNE clusters: Ward linkage

def Cluster_TNSE(Modelo_tSNE, Cluster_max):
    ### Input:
    ### -> array with the t-SNE components from the dimensionality reduction
    ### -> number of clusters, used with fcluster's 'maxclust' criterion
    link = ward(Modelo_tSNE)
    Cluster_Values = fcluster(link, t = Cluster_max, criterion = 'maxclust')
    fig = plt.figure(figsize = (25, 25))
    ax1 = fig.add_subplot(3, 3, 1)
    # pd.value_counts is deprecated in recent pandas; go through a Series instead
    pd.Series(Cluster_Values).value_counts().plot(kind = 'barh')
    ax2 = fig.add_subplot(3, 3, 2)
    axpl_2 = sns.scatterplot(x = Modelo_tSNE[:, 0], y = Modelo_tSNE[:, 1], hue = Cluster_Values, palette = "Set2")
    ax3 = fig.add_subplot(3, 3, 3)
    axpl_3 = sns.scatterplot(x = Modelo_tSNE[:, 0], y = Modelo_tSNE[:, 2], hue = Cluster_Values, palette = "Set2")
    ### Output:
    ### -> t-SNE cluster labels
    return Cluster_Values


TNSE_Cluster = Cluster_TNSE(TNSE_Model, 5)


### ADD THE CLUSTER LABELS (KMEANS + WARD) TO THE DIMENSIONS TABLE

DATA_DPTO["K_Means"] = Clusters_K
DATA_DPTO["t_SNE"] = TNSE_Cluster


#############################################################################
##### AZP CLUSTER

# Var_Scales was built from DIMENSIONS, which dropped the hex ids;
# re-attach them from LABEL, and keep 'hex' as a column so the merges below can use it
Var_Scales['hex'] = LABEL
Var_Scales['geometry'] = Var_Scales['hex'].apply(id2poly)
Var_Scales = gpd.GeoDataFrame(Var_Scales,
                              geometry = Var_Scales['geometry'],
                              crs = "EPSG:4326")
Var_Scales = Var_Scales.to_crs(6372)   # EPSG:6372: Mexico ITRF2008 / LCC, in metres


COLUMN_DIMEN = ['superficie', 'habitacion', 'baños', 'Parques_SA',
'Angel_SA', 'Teatros_SA', 'SuperM_SA', 'Restau_SA', 'Oficina_SA',
'MetroBus_SA', 'Metro_SA', 'Hospital_SA', 'GyMFree_SA', 'GyM_SA',
'Cinema_SA', 'EscSecu_SA', 'EscPr_SA', 'EscMS_SA',
'EcoBici_SA', 'Delitos', 'Socioeco']

#############################################################################
##### AZP: SPATIAL WEIGHTS AND MODEL FIT

KNN_MATRIX = libpysal.weights.KNN.from_dataframe(Var_Scales, k = 8)

model = AZP(Var_Scales, KNN_MATRIX, COLUMN_DIMEN, 5)
model.solve()

Var_Scales['AZP'] = model.labels_
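
# Quick sanity check, mirroring the print diagnostics in Accessibility.py
# (a sketch, not part of the original script): size of each AZP submarket.
print(pd.Series(model.labels_).value_counts())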

DATA_ALL = DATA_DPTO.merge(Var_Scales[['hex', 'AZP']],
                           on = 'hex',
                           how = 'left')

DPTO_CLUSTER_ALL = DATA_DEPART.merge(DATA_ALL[['hex',
                                               'AZP',
                                               'K_Means',
                                               't_SNE']],
                                     on = 'hex',
                                     how = 'left')


#############################################################################
##### Write as csv

DPTO_CLUSTER_ALL.to_csv(Salidas + "DPTO_CLUSTER_ALL.csv", index = False)