# cleaning,_fe_and_eda.py

# -*- coding: utf-8 -*-
"""Cleaning, FE and EDA.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JiMcmZs-9Gu1MxQ_8lr5z6pdRCCj8G8F
"""

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the listings CSV straight from GitHub.
url = 'https://raw.githubusercontent.com/IvanVC21/Tijuana-house-prices/main/tijuana_prices.csv'
df = pd.read_csv(url)

# Notebook-style inspection; these bare expressions do nothing when run as a script.
df.head(10)

df.shape

# Columns with the most missing values.
df.isnull().sum().sort_values(ascending=False).head()

# Listings without a parking-spot count are filled with 1.
# Assign the filled column back rather than calling fillna(inplace=True) on
# the column selection: that chained in-place call emits a FutureWarning in
# pandas 2.x and does not modify df at all once Copy-on-Write is enabled.
df['parkingSpots'] = df['parkingSpots'].fillna(1)

# Any row that still has a missing value in some other column is discarded.
df.dropna(inplace=True)
df.isnull().sum()

# Share of the remaining listings that carry the most frequent neighborhood value.
# The denominator is the current row count instead of a hard-coded number, so
# the percentage stays correct if the source data or the cleaning steps change.
# NOTE(review): assumes the most frequent value is the city-level placeholder
# removed below — confirm against the value_counts() output.
most_common_count = df['neighborhood'].value_counts().max()
print("% of neighborhoods listed as 'Tijuana, Baja California':", (most_common_count/len(df))*100)

df['neighborhood'].value_counts()

# Remove listings whose neighborhood is only the city-level placeholder.
df = df.drop(df.index[df['neighborhood'] == 'Tijuana, Baja California Norte'])

df.shape

# Divisor applied to prices whose currency flag is 'MN' to put them on the same
# scale as the other rows.
# NOTE(review): 'MN' presumably means Mexican pesos and 20 an approximate
# MXN-per-USD rate — confirm and update if the rate matters.
MXN_PER_USD = 20

is_mn = df['currency'] == 'MN'

# Convert only the 'price' column of the 'MN' rows; work on an explicit copy so
# the assignment never writes through to (or warns about) the original frame.
df_mn = df[is_mn].copy()
df_mn['price'] = df_mn['price'] / MXN_PER_USD
df_mn.head()

# Keep the untouched rows, then stitch both parts back together in the
# original row order.
df = df[~is_mn]
df.shape

prices = [df, df_mn]
df = pd.concat(prices).sort_index()

df.head()

# Every price is now on one scale, so the currency flag is no longer needed.
df = df.drop(columns=['currency'])

df.dtypes

df.head()

# Radius (km) that scales the haversine central angle into a distance.
R = 6373.0

# Reference points as (latitude, longitude) in decimal degrees.
# NOTE(review): "SY" and "OT" presumably stand for the San Ysidro and Otay
# border crossings — confirm.
REF_POINTS = {
    'SY': (32.5414378, -117.0275893),
    'OT': (32.5457130, -116.9379029),
}


def _haversine_km(lat_deg, lon_deg, ref_lat_deg, ref_lon_deg, radius_km=R):
    """Return the haversine distance from each (lat, lon) to a reference point.

    Args:
        lat_deg: Latitudes in decimal degrees (scalar, array or Series).
        lon_deg: Longitudes in decimal degrees, same shape as ``lat_deg``.
        ref_lat_deg: Latitude of the reference point in decimal degrees.
        ref_lon_deg: Longitude of the reference point in decimal degrees.
        radius_km: Sphere radius in kilometres.

    Returns:
        Distances in kilometres, with the same shape/index as the inputs.
    """
    lat = np.radians(lat_deg)
    lon = np.radians(lon_deg)
    ref_lat = np.radians(ref_lat_deg)
    ref_lon = np.radians(ref_lon_deg)

    dlat = ref_lat - lat
    dlon = ref_lon - lon

    a = np.sin(dlat / 2)**2 + np.cos(lat) * np.cos(ref_lat) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius_km * c


# Compute the distances directly instead of inserting constant and intermediate
# columns into df at fixed positions and dropping them afterwards; the
# positional inserts fail on frames with fewer columns or when re-run.
for tag, (ref_lat, ref_lon) in REF_POINTS.items():
    df[f'distance_{tag} (km)'] = _haversine_km(df['lat'], df['lon'], ref_lat, ref_lon)

df.head()

from sklearn.cluster import KMeans

# Coordinates only, as an independent copy so a label column can be added to it
# without touching df.
X = df.loc[:, ['lat', 'lon']].copy()
X.head(10)

# Feature matrix shared by the elbow curve and the final model.
coords = X[['lat', 'lon']]

# Elbow curve: fit one model per candidate cluster count and record its score.
# The candidates are fitted on both coordinates, i.e. on the same features the
# final clustering below uses (fitting on latitude alone would describe a
# different problem).
K_clusters = range(1, 10)
kmeans = [KMeans(n_clusters=k) for k in K_clusters]
score = [model.fit(coords).score(coords) for model in kmeans]
# Visualize
plt.plot(K_clusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

# Final model with four clusters. A single fit_predict both fits the model and
# yields the label of every listing; fitting first and predicting again only
# repeated the work.
# NOTE: no random_state is set, so the integer id given to each cluster is
# arbitrary and may differ between runs.
kmeans = KMeans(n_clusters = 4, init ='k-means++')
X['cluster_label'] = kmeans.fit_predict(coords)
centers = kmeans.cluster_centers_ # Coordinates of cluster centers.
labels = kmeans.labels_ # Labels of each point
X.head(10)

# Listings coloured by cluster, with the cluster centres overlaid in black.
X.plot.scatter(x = 'lat', y = 'lon', c=labels, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)

# Attach the cluster label to the main frame as the 'location' column.
X = X[['cluster_label']]
dfs = [df, X]
df = pd.concat(dfs, axis = 1)
df.rename(columns = {'cluster_label':'location'}, inplace = True)
df

# Per-cluster subsets, kept for inspection.
loc0 = df[df['location']==0]
loc1 = df[df['location']==1]
loc2 = df[df['location']==2]
loc3 = df[df['location']==3]

# Mean longitude of each cluster, printed in cluster-id order.
for part in (loc0, loc1, loc2, loc3):
    print(part['lon'].mean())

# Region names ordered from the smallest (most westerly) mean longitude to the
# largest (most easterly).
REGION_NAMES = ['West', 'Mid-West', 'Mid-East', 'East']

# Name each cluster from its position along the longitude axis instead of
# tying names to fixed cluster ids: the ids come from the clustering step and
# are not guaranteed to be the same from one run to the next.
mean_lon = df.groupby('location')['lon'].mean().sort_values()
if len(mean_lon) != len(REGION_NAMES):
    raise ValueError(
        f"expected {len(REGION_NAMES)} location clusters, found {len(mean_lon)}"
    )
region_by_cluster = dict(zip(mean_lon.index, REGION_NAMES))

df['location'] = df['location'].map(region_by_cluster)

# Keep only the modelling columns, then one-hot encode the remaining
# non-numeric column(s).
df = df[['price', 'bedrooms', 'bathrooms', 'parkingSpots', 'propertySize', 'distance_SY (km)','distance_OT (km)', 'location']]

df = pd.get_dummies(df)
df.head()

# Price distribution.
dist = sns.displot(df.price, aspect = 2)
dist.set(xlabel = "Price (Millions of USD)", ylabel = "Number of houses")

# Correlation heatmap. sns.heatmap draws into the currently active axes, which
# at this point belong to the displot figure when the file runs as a script;
# open a fresh figure so the heatmap does not paint over the histogram.
plt.rcParams['figure.figsize'] = (10, 5)
plt.figure()
sns.heatmap(df.corr(), cmap = 'coolwarm', annot = True)

plt.show()

# Price cut-offs at the 95th and 5th percentiles.
max_threshold = df['price'].quantile(0.95)
max_threshold

# Listings above the upper cut-off.
df[df['price']>max_threshold]

min_threshold = df['price'].quantile(0.05)
min_threshold

# Listings below the lower cut-off.
df[df['price']<min_threshold]

# Keep listings strictly between the two cut-offs. Take an explicit copy:
# df2 is modified below, and assigning into a filtered selection of df raises
# SettingWithCopyWarning (and is ambiguous about which frame gets changed).
df2 = df[(df.price < max_threshold) & (df.price > min_threshold)].copy()
df2.shape

df2.columns

# Round both distance features to whole kilometres.
df2['distance_SY (km)'] = df2['distance_SY (km)'].round(0)
df2['distance_OT (km)'] = df2['distance_OT (km)'].round(0)
df2

df2.columns

# Persist the cleaned dataset, without the index column.
df2.to_csv(r'EDA.csv', index=False)