-
Notifications
You must be signed in to change notification settings - Fork 0
/
LawPreprocessing.py
73 lines (66 loc) · 2.04 KB
/
LawPreprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 27 18:54:57 2022
Provides the methods to preprocess a law by computing the lemma of its content.
@author: ClaudiaLorusso
"""
import pandas as pd
def count_occurences(law, pattern):
    """
    Return how many times ``pattern`` appears inside ``law``.

    The counting is delegated to ``str.count``, so occurrences are
    counted without overlapping.

    :param law: string
        law's content to search in
    :param pattern: string
        substring to look for
    :return: integer
        number of non-overlapping occurrences of the substring
    """
    return law.count(pattern)
def preprocess_law(law):
    """
    Compute the lemma of a law's content.

    The text goes through ``clean_text``, then ``remove_new_lines`` and
    finally ``preprocess_lemma``, all taken from the project's
    ``Preprocessing`` module.

    :param law: string
        content of the law
    :return: string
        lemma of the law, as returned by ``preprocess_lemma``
    """
    # The helpers are imported when the function is called, not at module load.
    from Preprocessing import clean_text, remove_new_lines, preprocess_lemma

    return preprocess_lemma(remove_new_lines(clean_text(law)))
def get_df_laws_lemma(path_law=""):
    """
    Build a one-row DataFrame holding the lemma of the law stored in a file.

    The content and the name of the file come from
    ``FileHandler.get_content``; the content is then turned into its lemma
    by ``preprocess_law``. The returned frame is indexed by the file name
    (the index is called "name"), keeps the lemma in the "body" column and
    repeats the file name in an additional "name" column:
    name      |body        |name
    laws_name  law's lemma  laws_name

    :param path_law: string
        path of the file containing the law
    :return: DataFrame or None
        dataframe containing the law's lemma; None when a ValueError is
        caught while processing the file (a warning is printed instead
        of propagating the exception)
    """
    from FileHandler import get_content

    try:
        content, law_name = get_content(path_law)
        law_lemma = preprocess_law(content)

        # Start from an empty frame whose only data column is "body".
        frame = pd.DataFrame(columns=["name", "body"]).set_index("name")
        # Adding the row by label fills "body" with the lemma.
        frame.loc[law_name] = law_lemma
        # Assigning to a missing column label creates the extra "name" column.
        frame.loc[law_name, "name"] = law_name
        # Set the index name explicitly after the row has been added.
        frame.index.name = "name"
        return frame
    except ValueError:
        print("ValueError: WARNING, The file you selected maybe protected by password.\nPlease select another file.")
        return None
# Test
# Remove the triple quotes below to run this module as a quick manual test.
"""
if __name__ == '__main__':
df = get_df_laws_lemma("laws\\[2015-2020]LeggiRegionePuglia\\LR_10.2016.pdf")
print(df)
"""