This repository has been archived by the owner on Mar 12, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 24
/
api_text_search.py
108 lines (91 loc) · 3.51 KB
/
api_text_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json
import requests
from config import settings
import utils
def extract(word="witchcraft", snippets=True):
    """
    Get cases that have the word you're looking for and save them as JSON.

    Queries the API's ``cases`` endpoint with ``full_case=true`` and
    ``search=<word>``, follows the ``next`` link of every page, groups the
    cases by jurisdiction slug and writes the resulting dict to
    ``<settings.DATA_DIR>/<word>.json``.

    Args:
        word: search term sent to the API; also used as the output file name.
        snippets: if True, save only the context of the word (a short
            snippet plus ``times_appeared``); otherwise save the entire
            casebody under the ``casebody`` key.

    Returns:
        None. The results are written to disk and the path is reported
        through ``utils.print_info``.

    Notes:
        When a case's casebody cannot be read, ``context`` (or ``casebody``)
        is set to False and a warning is printed once per call.
        When the casebody is readable but does not contain the word,
        ``context`` is "" and ``times_appeared`` is 0.
    """
    headers = {'AUTHORIZATION': 'Token {}'.format(settings.API_KEY)}
    # Let requests build the query string so that search terms containing
    # spaces, '&', '#', ... are percent-encoded instead of corrupting the URL.
    response = requests.get(
        utils.get_api_url() + 'cases',
        params={'full_case': 'true', 'search': word},
        headers=headers,
    )
    res = response.json()

    word_results = {}
    warning_printed = False

    while True:
        results = res.get('results')
        if results is None:
            # An error payload instead of a result page: stop paginating but
            # still write whatever was collected from earlier pages.
            utils.print_info("\nWarning: unexpected API response (no 'results' field); stopping.")
            print("\nResponse:", res)
            break

        for case in results:
            case_data = {
                "id": case["id"],
                "name": case["name"],
                "name_abbreviation": case["name_abbreviation"],
                "context": "",
                "decision_date": case["decision_date"],
                "url": case["url"],
                "citations": case["citations"],
                "times_appeared": 0,
            }
            jur_slug = case["jurisdiction"]["slug"]

            try:
                # Add all opinions and the head matter up to one giant string.
                # This is done in both modes because it doubles as a check
                # that the casebody was actually delivered.
                body = case['casebody']['data']
                text = ''.join(opinion['text'] for opinion in body['opinions'])
                text += body['head_matter']

                if snippets:
                    try:
                        context, times_appeared = get_word_context(word, text)
                    except IndexError:
                        # The search matched the case but the word is not in
                        # the opinion/head-matter text: that is "no snippet",
                        # not a failed download.
                        context, times_appeared = "", 0
                    case_data["context"] = context or ""
                    case_data["times_appeared"] = times_appeared or 0
                else:
                    case_data['casebody'] = case['casebody']
            except Exception as e:
                # Best effort: keep the case's metadata and flag the missing
                # body with False so the rest of the page is still saved.
                if snippets:
                    case_data['context'] = False
                    case_data['times_appeared'] = 0
                else:
                    case_data['casebody'] = False
                if not warning_printed:
                    utils.print_info("\nWarning: Something went wrong -- your daily limit may have run out.\nPlease check your account: https://case.law/user/details")
                    print("\nError:", e)
                    warning_printed = True

            word_results.setdefault(jur_slug, []).append(case_data)

        # The last page has no 'next' link; stop explicitly instead of
        # relying on a failing request to end the loop.
        next_url = res.get('next')
        if not next_url:
            break
        try:
            res = requests.get(next_url, headers=headers).json()
        except (requests.RequestException, ValueError) as e:
            # Network failure or a non-JSON body: keep what we already have.
            utils.print_info("\nWarning: could not fetch the next page of results; saving what was collected so far.")
            print("\nError:", e)
            break

    filename = "%s/%s.json" % (settings.DATA_DIR, word)
    with open(filename, "w") as f:
        json.dump(word_results, f)
    utils.print_info("\n>> Written to file %s" % filename)
def get_word_context(word, casebody, window=10):
    """
    Return snippet around word and times the word appeared in the case.

    The text is split on single spaces and every token is checked, because
    the word might be a substring of a token or be surrounded by
    punctuation or OCR noise. Matching is case-insensitive.

    Args:
        word: term to look for.
        casebody: text to search.
        window: the snippet starts up to ``window`` tokens before the first
            match and ends ``window`` tokens after the start of the match
            (the matching token included). Defaults to 10.

    Returns:
        A ``(snippet, times_appeared)`` tuple. ``snippet`` is built from the
        original (not lowercased) tokens around the first match and
        ``times_appeared`` is the number of tokens that contain the word.
        ``("", 0)`` is returned when no token contains the word.
    """
    words = casebody.split(" ")
    # Lowercase both sides; lowercasing only the text would make a
    # capitalised search term impossible to match.
    needle = word.lower()
    indexes = [i for i, token in enumerate(words) if needle in token.lower()]
    if not indexes:
        return "", 0

    # for now keep it semi-simple with one instance
    first = indexes[0]
    start = max(first - window, 0)
    end = min(first + window, len(words))
    return " ".join(words[start:end]), len(indexes)