-
Notifications
You must be signed in to change notification settings - Fork 0
/
12_run_caseolap_score.py
82 lines (57 loc) · 2.72 KB
/
12_run_caseolap_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
'''
The purpose of this file is to produce CaseOLAP scores for the entities
based on their hits in each document (pmid2pcount_path) and the documents'
category (category2pmids_path).
'''
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, json
from caseolap._12_caseolap_score import *
'''
Parameters
'''
# Input data directories
cat2pmids_path = './data/metadata_category2pmids.json' # {CategoryName:[PMID,...],...}
pmid2entity2count_path = './data/metadata_pmid2entity2count.json' # {PMID:{Entity:Count,...},...}
category_names_path = 'config/textcube_config.json' # ['CategoryName1',...]
# Output data path
result_dir = 'result/' # Main folder where the results from this section will be stored
logFilePath = './log/caseolap_score_log.txt' # Logs #PMIDs for each category
caseolap_name = 'caseolap' # Name of dataframe/spreadsheet for the caseolap scores
'''
Main Code
'''
if __name__ == '__main__':
    # Load the category->PMIDs and PMID->entity-count mappings, then run the
    # full CaseOLAP pipeline: popularity score, distinctiveness score, and the
    # combined CaseOLAP score. Per-category PMID counts are written to the log.
    #
    # Context managers replace the original bare open() calls: the two
    # json.load(open(...)) expressions leaked file descriptors, and the
    # manually closed logfile would stay open (and possibly lose buffered
    # output) if any pipeline step raised.
    with open(cat2pmids_path, 'r') as fh:
        category2pmids = json.load(fh)           # {CategoryName: [PMID, ...], ...}
    with open(pmid2entity2count_path, 'r') as fh:
        pmid2entity2count = json.load(fh)        # {PMID: {Entity: Count, ...}, ...}

    with open(logFilePath, 'w') as logfile:
        ''' Initial Calculations'''
        # Initialize object with input data
        C = Caseolap(category2pmids, pmid2entity2count, result_dir, category_names_path, logfile)
        # Print info on categories and their number of publications
        C.print_categories2pmid(dump=True, verbose=True)
        # Map Category to its PMIDs to its Entities to the Entity's Counts
        C.map_category2pmid2entity2count()
        # Save all entities
        C.get_all_entities(dump=True, verbose=True)

        ''' Popularity Score (note: relies on some previous sections above)'''
        # Get the entity counts for each category
        C.get_entity_counts_per_category()
        # Maps category to its entities to their counts (includes zero count entities)
        C.category2entity2tf_finder()
        # Calculate the popularity scores for all entities
        C.calculate_all_popularity_scores(dump=True)

        ''' Distinctiveness Score (note: relies on some previous sections above)'''
        # Map entities to the count of their PMIDs
        C.category2entity2num_pmids_finder()
        # Calculate normalized term frequencies
        C.calculate_category2entity2ntf()
        # Calculate normalized document frequencies
        C.calculate_category2entity2ndf()
        # Calculate ratio of normalized term frequency over normalized document frequency
        C.calculate_entity2ntf_ndf_ratio()
        # Calculate distinctiveness score
        C.calculate_all_distinctiveness_scores(dump=True)

        '''Final Score'''
        # Calculate CaseOLAP Score (combine popularity & distinctiveness)
        C.calculate_caseolap_score(caseolap_name, dump=True)