-
Notifications
You must be signed in to change notification settings - Fork 1
/
featureEngineering.py
146 lines (112 loc) · 5.74 KB
/
featureEngineering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from __future__ import division

import collections
import csv
import math

import numpy as np
import pandas as pd
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

geolocator = Nominatim(user_agent="Eventure")
# Input CSV locations handed to prepareDataForWatsonTraining at the bottom of
# this file. They are relative paths, so they resolve against the current
# working directory of the process.
trainingCsvPath = "Documents/Events/finalCsvs/training.csv"
usersCsvPath = "Documents/Events/finalCsvs/users.csv"
eventsCsvPath = "Documents/Events/finalCsvs/events.csv"
#--------------------------------------------------------------------------------
# BHATTACHARYA
#--------------------------------------------------------------------------------
def compute_relevence_feature(userPastEvents, upcomingEventName, eventsCsv):
    """Score how similar an upcoming event is to a user's past events.

    Builds one TF-IDF vector for the user's past events and one for the
    upcoming event, then compares the two with cosine similarity.

    Args:
        userPastEvents: Comma-separated names of the user's past events.
        upcomingEventName: Name of the event being scored.
        eventsCsv: Events table, passed through to the vector helpers.

    Returns:
        The result of ``computeCosineSimilarity`` for the two vectors.
    """
    pastEventsTfidfVector = getPastEventsTfidfVector(userPastEvents, eventsCsv)
    upcomingEventTfidfVector = getEventTfidfVector(upcomingEventName, eventsCsv)
    eventsCosineSimilarity = computeCosineSimilarity(
        pastEventsTfidfVector, upcomingEventTfidfVector
    )
    # Return the value that was actually computed; the previous code returned
    # an undefined name and raised NameError on every call.
    return eventsCosineSimilarity
def getPastEventsTfidfVector(userPastEvents, eventsCsv):
    """Build one TF-IDF vector from the descriptions of a user's past events.

    The raw descriptions of every listed past event are looked up in
    ``eventsCsv``, joined into a single document and vectorised with
    ``getVector``.

    Args:
        userPastEvents: Comma-separated past event names. A non-string value
            (such as the NaN pandas produces for an empty cell) is treated as
            "no past events".
        eventsCsv: DataFrame with ``event_name`` and ``description`` columns.

    Returns:
        The vector returned by ``getVector`` for the combined description.

    Raises:
        ValueError: If none of the past events has a text description in
            ``eventsCsv``, so there is nothing to vectorise.
    """
    if not isinstance(userPastEvents, str):
        userPastEvents = ''

    # Index descriptions by normalised name once instead of rescanning the
    # table per past event. setdefault keeps the first row for a given name.
    descriptionByName = {}
    for _, eventRecord in eventsCsv.iterrows():
        eventName = eventRecord['event_name']
        description = eventRecord['description']
        # Skip rows whose name or description cell is empty (NaN, not str).
        if isinstance(eventName, str) and isinstance(description, str):
            descriptionByName.setdefault(eventName.strip().lower(), description)

    pastEventsDescriptions = []
    for eventName in userPastEvents.split(","):
        description = descriptionByName.get(eventName.strip().lower())
        if description is not None:
            pastEventsDescriptions.append(description)

    if not pastEventsDescriptions:
        raise ValueError(
            "no descriptions found for past events: %r" % (userPastEvents,)
        )

    # Join the raw text (space-separated so neighbouring descriptions do not
    # fuse into one token) and vectorise it as a single document. The old
    # code tried to add already-vectorised matrices onto a str.
    return getVector([" ".join(pastEventsDescriptions)])
def getEventTfidfVector(eventName_1, eventsCsv):
    """Return the TF-IDF vector of the first event whose name matches.

    Names are compared after stripping surrounding whitespace and
    lower-casing both sides.

    Args:
        eventName_1: Name of the event to look up.
        eventsCsv: DataFrame with ``event_name`` and ``description`` columns.

    Returns:
        ``getVector`` applied to the matching row's description, or ``None``
        when no row matches.
    """
    for _, row in eventsCsv.iterrows():
        candidate = row['event_name'].strip().lower()
        if candidate != eventName_1.strip().lower():
            continue
        # First match wins: vectorise its description as a one-document corpus.
        return getVector([row['description']])
    return None
def getVector(eventDesc):
    """Fit a fresh TF-IDF model on ``eventDesc`` and encode its first document.

    Args:
        eventDesc: Sequence of text documents; the vocabulary is learned from
            all of them, but only ``eventDesc[0]`` is encoded.

    Returns:
        The result of ``TfidfVectorizer.transform`` for the first document.
    """
    # fit() returns the vectorizer itself, so the learned vocabulary can be
    # used straight away to encode the first document.
    fittedVectorizer = TfidfVectorizer().fit(eventDesc)
    return fittedVectorizer.transform([eventDesc[0]])
def computeCosineSimilarity(vector_1, vector_2):
    """Return scikit-learn's ``cosine_similarity`` of the two vectors."""
    similarity = cosine_similarity(vector_1, vector_2)
    return similarity
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
def _findLastRecordByName(table, nameColumn, wantedName):
    """Return the last row of *table* whose *nameColumn* matches, or None.

    The comparison is case-insensitive. Cells that are not strings (pandas
    yields NaN for empty cells) can never match and are skipped.
    """
    wantedLower = wantedName.lower()
    found = None
    for _, record in table.iterrows():
        candidate = record[nameColumn]
        if isinstance(candidate, str) and candidate.lower() == wantedLower:
            # Keep scanning so that, with duplicate names, the last row wins.
            found = record
    return found


def computeFeatures(tUserName, tEventName, usersCsvPath, eventsCsvPath):
    """Compute the feature row for one (user, event) training pair.

    Args:
        tUserName: User name to look up in the users CSV (case-insensitive).
        tEventName: Event name to look up in the events CSV
            (case-insensitive).
        usersCsvPath: Path of the CSV providing ``user_name``,
            ``user_location`` and ``past_event_names``.
        eventsCsvPath: Path of the CSV providing ``event_name`` and
            ``event_location``; the loaded table is also handed to the
            relevance computation.

    Returns:
        ``[tUserName, tEventName, proximity, relevance]`` where proximity
        comes from ``calculateProximity`` and relevance from
        ``calculateRelevanceBetweenCurrentAndPastEvents``.

    Raises:
        ValueError: If the user or the event is not present in its CSV.
    """
    print("user: ", tUserName, " event: ", tEventName)
    usersCsv = pd.read_csv(usersCsvPath)
    eventsCsv = pd.read_csv(eventsCsvPath)

    userRecord = _findLastRecordByName(usersCsv, 'user_name', tUserName)
    if userRecord is None:
        # Fail with a clear message instead of an UnboundLocalError later on.
        raise ValueError("user not found in users CSV: %r" % (tUserName,))
    userLocation = userRecord['user_location']
    print("user loc: ", userLocation)
    userPastEvents = userRecord['past_event_names']

    eventRecord = _findLastRecordByName(eventsCsv, 'event_name', tEventName)
    if eventRecord is None:
        raise ValueError("event not found in events CSV: %r" % (tEventName,))
    eventLocation = eventRecord['event_location']
    print("event loc: ", eventLocation)
    upcomingEventName = eventRecord['event_name'].lower()

    tProximity = calculateProximity(userLocation, eventLocation)
    tRelevance = calculateRelevanceBetweenCurrentAndPastEvents(
        upcomingEventName, userPastEvents, eventsCsv
    )
    print("tRelevance: ", tRelevance)

    return [tUserName, tEventName, tProximity, tRelevance]
def calculateProximity(userLocation, eventLocation):
    """Return the geodesic distance in miles between two place descriptions.

    Both locations are geocoded with the module-level ``geolocator`` and the
    distance between the resulting coordinates is measured with ``geodesic``.

    Args:
        userLocation: Free-text location of the user.
        eventLocation: Free-text location of the event.

    Returns:
        Distance between the two geocoded points, in miles.

    Raises:
        ValueError: If the geocoder returns no result for either location.
    """
    uLocation = geolocator.geocode(userLocation)
    eLocation = geolocator.geocode(eventLocation)
    print("uloc: ", uLocation)
    print("eloc: ", eLocation)

    # geocode() yields None when nothing matches; report which input failed
    # instead of crashing on a None attribute access.
    if uLocation is None:
        raise ValueError("could not geocode user location: %r" % (userLocation,))
    if eLocation is None:
        raise ValueError("could not geocode event location: %r" % (eventLocation,))

    # Keep the full floating-point coordinates. Truncating them to whole
    # degrees (as int() did) can shift each point by tens of miles.
    userLatitude = uLocation.latitude
    userLongitude = uLocation.longitude
    print("user lat: ", userLatitude)
    eventLatitude = eLocation.latitude
    eventLongitude = eLocation.longitude
    print("event lat: ", eventLatitude)

    userCoordinates = (userLatitude, userLongitude)
    print("user coordinates: ", userCoordinates)
    eventCoordinates = (eventLatitude, eventLongitude)
    print("event coordinates: ", eventCoordinates)

    distance = geodesic(userCoordinates, eventCoordinates).miles
    print("distance: ", distance)
    return distance
def calculateRelevanceBetweenCurrentAndPastEvents(upcomingEventName, userPastEvents, eventsCsv):
    """Delegate to ``compute_relevence_feature`` with the arguments reordered.

    This function takes the upcoming event first, while the delegate expects
    the past events first; passing by keyword keeps the mapping explicit.
    """
    return compute_relevence_feature(
        userPastEvents=userPastEvents,
        upcomingEventName=upcomingEventName,
        eventsCsv=eventsCsv,
    )
def prepareDataForWatsonTraining(trainingCsvPath, usersCsvPath, eventsCsvPath,
                                 outputCsvPath='Documents/Events/finalCsvs/testTrainingData.csv'):
    """Write one feature row per training pair to a CSV file.

    Each row of the training CSV supplies a ``user_name`` and an
    ``event_name``; ``computeFeatures`` turns the pair into a feature vector
    whose first four entries are written as one output row.

    Args:
        trainingCsvPath: CSV with ``user_name`` and ``event_name`` columns.
        usersCsvPath: Users CSV path, forwarded to ``computeFeatures``.
        eventsCsvPath: Events CSV path, forwarded to ``computeFeatures``.
        outputCsvPath: Destination file; defaults to the location that was
            previously hard-coded. The file is overwritten.

    Returns:
        The file object that was written. It has already been closed by the
        time the caller receives it.
    """
    trainingCsv = pd.read_csv(trainingCsvPath)

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; otherwise extra blank rows can appear on
    # platforms that translate '\n'.
    with open(outputCsvPath, mode='w', newline='') as testTrainingData:
        csvWriter = csv.writer(testTrainingData, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        for _, trainingDataRow in trainingCsv.iterrows():
            featureVector = computeFeatures(
                trainingDataRow['user_name'],
                trainingDataRow['event_name'],
                usersCsvPath,
                eventsCsvPath,
            )
            csvWriter.writerow(featureVector[:4])
    return testTrainingData
# Module-level entry point: the whole export runs as soon as this file is
# executed or imported, using the CSV paths defined near the top of the file.
finalTrainingData = prepareDataForWatsonTraining(
    trainingCsvPath=trainingCsvPath,
    usersCsvPath=usersCsvPath,
    eventsCsvPath=eventsCsvPath,
)