-
Notifications
You must be signed in to change notification settings - Fork 0
/
corpora.py
92 lines (71 loc) · 2.57 KB
/
corpora.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
Utilities for handling corpora JSON files.
Example data is provided under CC0 terms here:
https://github.com/FreeDelete-Software/corpora
Important notes on data files:
- JSON data is intended to be covered under separate copyright terms.
- All JSON files should be stored in the 'corpora' directory.
- Data files which are improperly formatted *will* cause issues.
- The unit tests in test_corpora.py perform some basic sanity checks on data.
"""
import json
class CorpusObject(list):
    """
    Object which represents the contents of a corpus.

    The object is a list whose items are the records of the corpus. Two
    attributes describe the corpus that was imported most recently:

    - name: the corpus name passed to import_corpus(), or None if nothing
      has been imported yet.
    - description: the value of the top-level "description" key of the
      corpus file, or None if the key is absent or nothing has been
      imported yet.
    """

    # Class-level defaults so that both attributes can be read before the
    # first import_corpus() call instead of raising AttributeError.
    name = None
    description = None

    def import_corpus(self, corpus_name: str) -> None:
        """
        Imports a corpus by name into the object, replacing its contents.

        !NOTE! - This won't work if you specify a path or extension.
        Importing my_corpus.json would look like this:
            my_corpus_object.import_corpus("my_corpus")

        The file "corpora/<corpus_name>.json" is opened relative to the
        current working directory. Its top-level <corpus_name> key must
        hold the records; every dict found there becomes a CorpusRecord
        and any item that is not a dict is skipped.

        The existing contents are only replaced once the file has been
        read and converted, so a failed import leaves the object as it was.

        Raises:
            OSError: the file could not be opened.
            json.JSONDecodeError: the file is not valid JSON.
            TypeError: the top-level <corpus_name> key is missing, null or
                not iterable.
        """
        filename = "corpora/%s.json" % corpus_name
        # JSON text is UTF-8, so do not depend on the platform's default
        # encoding when decoding the file.
        with open(filename, encoding="utf-8") as json_file:
            corpus_file = json.load(json_file)

        raw_records = corpus_file.get(corpus_name)
        if raw_records is None:
            # Iterating None would raise TypeError anyway; keep the same
            # exception type but say what is actually wrong with the file.
            raise TypeError(
                "%s does not contain any records under its top-level %r key"
                % (filename, corpus_name)
            )

        # Convert data items to CorpusRecord class *before* touching self,
        # so malformed data cannot leave the object half-cleared.
        records = [
            CorpusRecord(item)
            for item in raw_records
            if isinstance(item, dict)
        ]

        # Replace existing contents in place
        self[:] = records
        # Set attributes
        self.description = corpus_file.get("description")
        self.name = corpus_name

    def get_field_matches(self, field_name: str, match_list) -> list:
        """
        Returns records with a given <field_name> that
        contains any *exact* items in a given <match_list>.

        Membership is tested with the `in` operator against the record's
        <field_name> value. Records are returned in corpus order, and a
        record that compares equal to one already collected is not added
        again.

        Raises:
            KeyError: a record has no <field_name> (only reached when
                <match_list> is not empty).
        """
        results = []
        for record in self:
            # any() stops at the first matching item instead of testing the
            # rest of match_list for a record that has already matched.
            matched = any(item in record[field_name] for item in match_list)
            if matched and record not in results:
                results.append(record)
        return results

    def get_combined_list_field(self, field_name: str) -> list:
        """
        Combines lists from all records stored in a given <field_name>.

        Returns a sorted list in which every distinct item appears once.

        Raises:
            KeyError: a record has no <field_name>.
        """
        results = []
        for record in self:
            for item in record[field_name]:
                # List membership rather than a set, so that items are not
                # required to be hashable.
                if item not in results:
                    results.append(item)
        results.sort()
        return results

    def get_all_field_values(self, field_name: str) -> list:
        """
        Returns a list of values from all records stored in a specified
        <field_name>, one value per record and in corpus order.

        Raises:
            KeyError: a record has no <field_name>.
        """
        return [record[field_name] for record in self]
class CorpusRecord(dict):
    """
    A single record of a corpus.

    Currently just a placeholder: a plain dict subclass that adds no
    behavior of its own.
    """
class CompendiumObject(dict):
    """
    Currently just a placeholder: a plain dict subclass that adds no
    behavior of its own.
    """