-
Notifications
You must be signed in to change notification settings - Fork 0
/
recommender.py
122 lines (98 loc) · 4.76 KB
/
recommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import pandas as pd
import numpy as np
import pickle
import datetime as dt
from sklearn.metrics.pairwise import cosine_similarity
#Function to get articles closest in topic/subtopic/date published to input article
def get_recs(article_index, n=5, topic_df=None, dists=None):
    """Recommend up to ``n`` articles related to the article at ``article_index``.

    Candidates are ranked by similarity to the query article and scanned in
    windows (the 100 most similar first, then 50 more at a time). Only
    candidates sharing at least one entry of ``all_topics`` with the query
    are eligible. Within each window they are emitted in this priority order:

    1. same subtopic, same year
    2. same subtopic, different year
    3. different subtopic, same year
    4. different subtopic, different year

    Args:
        article_index: Row of the query article. It is used both as a row
            position in ``dists`` and as a label for ``topic_df.loc``, so the
            frame is expected to carry a default 0..N-1 index.
        n: Maximum number of recommendations to return.
        topic_df: DataFrame with ``title``, ``url``, ``all_topics``,
            ``subtopic`` and ``year`` columns (plus the topic-weight columns
            when ``dists`` is not supplied).
        dists: Optional precomputed pairwise similarity matrix. When omitted
            it is computed from the topic-weight columns of ``topic_df``.

    Returns:
        A tuple ``(recs, title_list, url_list)`` of equal-length lists holding
        the recommended row indices, their titles and their URLs. Fewer than
        ``n`` items are returned when not enough eligible articles exist.

    Raises:
        TypeError: If ``topic_df`` is not provided.
    """
    if topic_df is None:
        raise TypeError('topic_df is required')

    # Honor a caller-supplied similarity matrix; only fall back to computing
    # it (an O(N^2) operation) when none was passed in.
    if dists is None:
        topic_weights = topic_df[['Neuroscience/ Behavioral Sci.', 'Astronomy',
                                  'Climate Science', 'Diseases/ Epidemics/ Viruses',
                                  'Optics/ Electronics/ Photonics/ Device Physics',
                                  'Drug Discovery/ Pharmaceuticals',
                                  'Genetics/Genomics', 'Ocean Sciences/ Geology',
                                  'Stem Cells/ Cloning', 'Agriculture/ Plant Sciences',
                                  'Cellular Bio./ Molecular Bio.', 'Evolution/ Archaeology',
                                  'Phyics/ Particle Physics/ Quantum Physics',
                                  'Space Travel/ Exploration',
                                  'Wildlife/ Conservation/ Biodiversity',
                                  'Planetary Science/ Solar System']]
        dists = cosine_similarity(topic_weights)

    print(topic_df.loc[article_index, 'title'])

    # Most similar first. The query article is removed explicitly instead of
    # assuming it sorts last, which does not hold when similarities tie.
    ranked = np.argsort(dists[article_index])[::-1]
    ranked = ranked[ranked != article_index]

    # Query-article attributes are loop invariants; look them up once.
    query_topics = set(topic_df.loc[article_index, 'all_topics'])
    query_subtopic = topic_df.loc[article_index, 'subtopic']
    query_year = topic_df.loc[article_index, 'year']

    recs = []
    start, stop = 0, 100
    # Stop when enough recommendations were found OR the candidates ran out,
    # so a sparse dataset can no longer cause an endless loop.
    while len(recs) < n and start < len(ranked):
        tiers = ([], [], [], [])
        for ind in ranked[start:stop]:
            if not query_topics & set(topic_df.loc[ind, 'all_topics']):
                continue
            same_subtopic = bool(query_subtopic == topic_df.loc[ind, 'subtopic'])
            same_year = bool(query_year == topic_df.loc[ind, 'year'])
            tier = (0 if same_subtopic else 2) + (0 if same_year else 1)
            tiers[tier].append(ind)

        for tier_members in tiers:
            for ind in tier_members:
                if len(recs) >= n:
                    break
                print(topic_df.loc[ind, 'subtopic'], topic_df.loc[ind, 'year'])
                recs.append(ind)

        # Advance to the next 50 unseen candidates; windows never overlap, so
        # no article can be recommended twice.
        start, stop = stop, stop + 50

    print('\nRecs:\n')
    title_list = []
    url_list = []
    for rec in recs:
        title_list.append(topic_df.loc[rec, 'title'])
        url_list.append(topic_df.loc[rec, 'url'])
        print(topic_df.loc[rec, 'title'])
    return recs, title_list, url_list
#Load topic/subtopic dataframe, calculate cosine similarity matrix
def get_df_and_dists():
    """Load the topic/subtopic dataframe and its cosine-similarity matrix.

    Reads the pickled dataframe from the file ``subtopic_df`` in the current
    working directory, resets its index in place, and computes the pairwise
    cosine similarity between rows over the sixteen topic-weight columns.

    Returns:
        A tuple ``(topic_df, dists)``: the loaded dataframe and the matrix
        returned by ``cosine_similarity`` for its topic-weight columns.
    """
    # NOTE(security): pickle.load can execute arbitrary code; 'subtopic_df'
    # must only ever come from a trusted source.
    # The context manager closes the file handle, which the bare
    # pickle.load(open(...)) form left open.
    with open('subtopic_df', 'rb') as handle:
        topic_df = pickle.load(handle)
    topic_df.reset_index(inplace=True)
    topic_weights = topic_df[['Neuroscience/ Behavioral Sci.', 'Astronomy',
                              'Climate Science', 'Diseases/ Epidemics/ Viruses',
                              'Optics/ Electronics/ Photonics/ Device Physics',
                              'Drug Discovery/ Pharmaceuticals',
                              'Genetics/Genomics', 'Ocean Sciences/ Geology',
                              'Stem Cells/ Cloning', 'Agriculture/ Plant Sciences',
                              'Cellular Bio./ Molecular Bio.', 'Evolution/ Archaeology',
                              'Phyics/ Particle Physics/ Quantum Physics',
                              'Space Travel/ Exploration',
                              'Wildlife/ Conservation/ Biodiversity',
                              'Planetary Science/ Solar System']]
    dists = cosine_similarity(topic_weights)
    return topic_df, dists
#Load and return topic/subtopic dataframe
def get_df():
    """Load and return the topic/subtopic dataframe.

    Reads the pickled dataframe from the file ``subtopic_df`` in the current
    working directory and resets its index in place, so rows are labelled
    0..N-1 and the previous index is kept as a column.

    Returns:
        The loaded dataframe with a fresh default index.
    """
    # NOTE(security): pickle.load can execute arbitrary code; 'subtopic_df'
    # must only ever come from a trusted source.
    # The context manager closes the file handle, which the bare
    # pickle.load(open(...)) form left open.
    with open('subtopic_df', 'rb') as handle:
        topic_df = pickle.load(handle)
    topic_df.reset_index(inplace=True)
    return topic_df