-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathmention_statute_sentence.py
144 lines (120 loc) · 6.08 KB
/
mention_statute_sentence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
"""
Created on Wed May 5 11:47:19 2021
@author: Paheli
"""
# -*- coding: utf-8 -*-
"""
Created on Sat May 1 18:05:13 2021
@author: Paheli
"""
import re
import string
import os
from tqdm import tqdm
# Module-level lookup tables used by the statute-mention extractor.
#
# actlist: every non-empty act name read from the data file.
# tokens:  act name -> list of surface forms under which it may be cited.

# Deletes all ASCII punctuation except commas; built once, reused per line.
_PUNCT_EXCEPT_COMMA = str.maketrans('', '', string.punctuation.replace(",", ""))

actlist = set()
tokens = {}
with open("current-acts.txt", "r") as all_acts:
    for line in all_acts:
        line = line.rstrip("\n").translate(_PUNCT_EXCEPT_COMMA)
        if not line:
            # Blank (or punctuation-only) lines carry no act name.
            continue
        actlist.add(line)
        if line == "Constitution of India 1950":
            # The Constitution gets a hand-written entry below instead.
            continue
        a = [line]
        # Second form: the name without its last space-separated token
        # (presumably the year -- verify against current-acts.txt).
        # rpartition avoids the ValueError rindex raised on one-word names.
        head, sep, _ = line.rpartition(" ")
        if sep and head:
            a.append(head)
        tokens[line] = a

# Hand-curated surface forms for the most frequently cited statutes.
tokens["Constitution of India, 1950"] = [
    "Constitution of India, 1950",
    "Constitution of India",
    "Constitution",
]
tokens["Code of Criminal Procedure, 1898"] = [
    "Code of Criminal Procedure, 1898",
    "Criminal Procedure Code, 1898",
    "Cr.PC, 1898",
    "Cr.P.C., 1898",
    "Cr.P.C",
    "Cr.P.C.",
    "Cr.PC",
    "Criminal Procedure Code",
]
tokens["Code of Criminal Procedure, 1973"] = [
    "Code of Criminal Procedure, 1973",
    "Criminal Procedure Code, 1973",
    "Cr.PC, 1973",
    "Cr.P.C., 1973",
]
tokens["Code of Civil Procedure, 1908"] = [
    "Code of Civil Procedure, 1908",
    "Code of Civil Procedure",
    "Civil Procedure Code",
    "CPC",
    "C.P.C",
    "civil procedure code",
]
# NOTE(review): the trailing-comma forms are what the previous literal
# concatenation ("IPC"',') actually produced; they are kept so existing
# matches are unchanged. The plain forms are added for parity with the other
# acts -- confirm this was the intent.
tokens["Indian Penal Code, 1860"] = [
    "Indian Penal Code, 1860",
    "Indian Penal Code",
    "Indian Penal Code,",
    "Penal Code",
    "Penal Code,",
    "IPC",
    "IPC,",
    "I.P.C",
    "I.P.C,",
    "I.P.C.",
    "I.P.C.,",
    "I. P. C.",
    "penal code",
]

# Abbreviated keyword -> the keyword it stands for.
abbrv = {"s.": "section", "ss.": "section", "art.": "article", "arts.": "article"}

# Keywords that introduce a provision reference, singular and plural.
name1 = "article"
name2 = "articles"
name3 = "section"
name4 = "sections"
def _provision_mention(text, singular, plural, act):
    """Return "<keyword> <span> <act>" for the first keyword-to-act span, or None.

    The plural keyword is preferred whenever it occurs in ``text`` (the plural
    contains the singular, so the singular is only used when the plural is
    absent). Mentions of 50 or more space-separated words are rejected.
    """
    if plural in text:
        keyword = plural
    elif singular in text:
        keyword = singular
    else:
        return None
    # Escape both parts: act forms such as "I.P.C." contain regex
    # metacharacters that must be matched literally.
    pattern = rf"{re.escape(keyword)}\s(.*?)\s{re.escape(act)}"
    found = re.search(pattern, text)
    if found is None:
        return None
    mention = f"{keyword} {found.group(1)} {act}"
    if len(mention.split(" ")) >= 50:
        return None
    return mention


def get_statute_mention(line: str) -> set:
    """Collect the statute mentions found in one sentence.

    Every act surface form from the module-level ``tokens`` that occurs in
    ``line`` is tried in turn. For forms naming the Constitution the
    ``name1``/``name2`` keywords (article/articles) are searched, for all
    other acts ``name3``/``name4`` (section/sections). A hit contributes the
    string "<keyword> <text in between> <act form>".

    Act forms and keywords are matched case-sensitively; only the
    "is this the Constitution" test ignores case. Abbreviated keywords
    (the ``abbrv`` table) are not expanded.

    Args:
        line: The sentence to scan; a trailing newline is ignored.

    Returns:
        A set of mention strings, empty when nothing is found.
    """
    answer = set()
    # "u/" is shorthand for "under" (e.g. "u/section 302").
    text = line.rstrip("\n").replace("u/", "under ")

    for variations in tokens.values():
        for var in variations:
            if not var or var not in text:
                continue

            # Widen the act form with a preceding "of the " / "of " when the
            # sentence has it, so the prefix ends up in the act part of the
            # mention rather than in the captured span.
            act = var
            if "of the " + act in text:
                act = "of the " + act
            if "of " + act in text:
                act = "of " + act

            # Case-insensitive on purpose: the Constitution forms in
            # ``tokens`` are capitalised.
            if "constitution" in act.lower():
                mention = _provision_mention(text, name1, name2, act)
            else:
                mention = _provision_mention(text, name3, name4, act)

            if mention is not None:
                answer.add(mention)
    return answer