-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbooleanAND.py
106 lines (82 loc) · 3.32 KB
/
booleanAND.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import click
import json
import re
from typing import List, Dict, Tuple
from utils.booleanAND_utils import validate_paths
# Constant second column of every emitted result line. The TREC results
# format expects the literal "Q0" (letter Q, digit zero); the previous value
# used the letter O.
Q0 = "Q0"
# Run tag written as the last column of every emitted result line.
RUNTAG = "ctiscareAND"
def load_json_file(file_path: str) -> Dict:
    """Parse the JSON document stored at *file_path* and return it.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(file_path, encoding="utf-8") as file:
        return json.load(file)
def process_query_topics(query_topics: Dict[str, str]) -> Dict[str, List[str]]:
    """Tokenize each query topic into lowercase word tokens.

    Args:
        query_topics: mapping of topic id (a string that parses as an int) to
            the raw query text.

    Returns:
        Mapping of topic id to its list of lowercase tokens, in query order.
        A fixed set of topic ids is skipped entirely (the reason for the
        exclusion is not visible in this file).

    Raises:
        ValueError: if a topic id cannot be converted to ``int``.
    """
    excluded_topics = {416, 423, 437, 444, 447}
    search_tokens = {}
    for key, value in query_topics.items():
        if int(key) in excluded_topics:
            continue
        # "_" counts as a word character for ``\w``, so turn it into a
        # separator first.
        cleaned_query = value.replace("_", " ")
        # Extract runs of word characters directly. Substituting separators
        # and then splitting left empty-string tokens behind whenever the
        # query started or ended with punctuation/whitespace (or was empty).
        search_tokens[key] = [
            token.lower() for token in re.findall(r"\w+", cleaned_query)
        ]
    return search_tokens
def load_lexicon(file_path: str) -> Dict[str, int]:
    """Read a one-term-per-line file and map each term to its line number.

    Numbering starts at 1 for the first line. If a term occurs on several
    lines, the last occurrence determines its id.
    """
    term_ids = {}
    with open(file_path) as handle:
        for position, term in enumerate(handle.read().splitlines(), start=1):
            term_ids[term] = position
    return term_ids
def load_index_registrar(file_path: str) -> Dict[int, str]:
    """Read a line-oriented file and map each zero-based line index to its text.

    Line terminators are stripped; the first line gets key 0.
    """
    with open(file_path) as handle:
        entries = handle.read().splitlines()
    return dict(enumerate(entries))
def search_inverted_index(
    search_tokens: Dict[str, List[str]],
    lexicon: Dict[str, int],
    inverted_index: Dict[str, List[int]],
    index_registrar: Dict[int, str],
) -> List[Dict]:
    """Intersect the postings of every known query term, topic by topic.

    Args:
        search_tokens: topic id -> query tokens.
        lexicon: token -> term id.
        inverted_index: term id (as a string key) -> postings list.
        index_registrar: document id -> document label, forwarded to
            ``create_final_results``.

    Returns:
        The concatenated result rows for all topics, in topic order. Tokens
        absent from the lexicon are ignored; a topic with no known token
        contributes no rows.
    """
    hits = []
    for topic_id, query_terms in search_tokens.items():
        known_terms = [term for term in query_terms if term in lexicon]
        if not known_terms:
            continue

        postings = []
        for term in known_terms:
            entry = inverted_index.get(str(lexicon[term]), [])
            # Keep every other element starting at index 0 — presumably the
            # list is stored flat as [doc_id, count, doc_id, count, ...];
            # confirm against the index builder.
            postings.append(entry[0::2])

        first, *rest = postings
        matching_docs = set(first).intersection(*rest)
        hits += create_final_results(topic_id, matching_docs, index_registrar)
    return hits
def create_final_results(
    topic_id: str, doc_ids: set, index_registrar: Dict[int, str]
) -> List[Dict]:
    """Build one result row per matching document for a single topic.

    Rows follow the iteration order of *doc_ids*. Ranks count up from 1 and
    scores count down from ``len(doc_ids)``; both are stored as strings. Key
    insertion order of each row is topicID, Q0, docno, rank, score, runtag.

    Raises:
        KeyError: if a document id is missing from *index_registrar*.
    """
    total = len(doc_ids)
    rows = []
    for position, doc_id in enumerate(doc_ids):
        row = {
            "topicID": topic_id,
            "Q0": Q0,
            "docno": index_registrar[doc_id],
            "rank": str(position + 1),
            "score": str(total - position),
            "runtag": RUNTAG,
        }
        rows.append(row)
    return rows
def write_results_to_file(output_file_path: str, final_results: List[Dict]):
    """Append each result row to *output_file_path*, one line per row.

    A line is the row's values, in dict order, joined by single spaces. The
    file is opened in append mode, so existing content is preserved.
    """
    with open(output_file_path, "a") as out:
        for record in final_results:
            fields = " ".join(record.values())
            out.write(f"{fields}\n")
@click.command()
@click.argument("index_directory_path", nargs=1, required=False)
@click.argument("query_file_path", nargs=1, required=False)
@click.argument("output_file_path", nargs=1, required=False)
def main(index_directory_path: str, query_file_path: str, output_file_path: str):
    # CLI entry point: validate the three paths, load the queries and the
    # on-disk index files, run the AND search, and write the result rows.
    # (Kept as comments rather than a docstring so the command's --help text
    # is unchanged.)
    validate_paths(index_directory_path, query_file_path, output_file_path)

    # Queries: topic id -> token list.
    topics = load_json_file(query_file_path)
    tokens_by_topic = process_query_topics(topics)

    # Index artifacts, all read from the index directory.
    lexicon_path = f"{index_directory_path}/lexicon.txt"
    term_ids = load_lexicon(lexicon_path)
    registrar_path = f"{index_directory_path}/index_registrar.txt"
    doc_labels = load_index_registrar(registrar_path)
    postings_path = f"{index_directory_path}/inverted_index.json"
    postings = load_json_file(postings_path)

    rows = search_inverted_index(tokens_by_topic, term_ids, postings, doc_labels)
    write_results_to_file(output_file_path, rows)
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()