# --------------------------------
# 2nd Delivery of the PRI project
# 86379 - Ana Evans
# 86389 - Artur Guimarães
# 86417 - Francisco Rosa
# --------------------------------
import os
import pickle
import re
from bs4 import BeautifulSoup
# -----------------------------------------------------------------------------
# write_to_file() - Small auxiliary function to pickle data to a file
# -----------------------------------------------------------------------------
def write_to_file(dic, filename):
    # Note: the data is written with pickle (binary), despite the .txt extension
    with open('material/saved_data/{}.txt'.format(filename), 'wb') as write_f:
        pickle.dump(dic, write_f)
# -----------------------------------------------------------------------------
# read_from_file() - Small auxiliary function to unpickle data from a file
# -----------------------------------------------------------------------------
def read_from_file(filename):
    with open('material/saved_data/{}.txt'.format(filename), 'rb') as read_f:
        return pickle.load(read_f)
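# A minimal usage sketch of the two helpers above; the 'example' filename and the
# dictionary contents are assumptions for illustration, and the directory
# material/saved_data/ must already exist for the open() calls to succeed.
def _demo_pickle_round_trip():
    data = {'doc_id': 19960901, 'tokens': ['reuters', 'corpus']}
    write_to_file(data, 'example')        # pickles the dict to material/saved_data/example.txt
    restored = read_from_file('example')  # unpickles the same object back
    assert restored == data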
#--------------------------------------------------
# get_xml_files_recursively - Auxiliary function to get_files_from_directory
#
# Input: path - The path to the parent directory or file from which to start the recursion
#        judged_documents - Optional set of document ids; when given, only files whose
#                           numeric name prefix is in the set are kept
#
# Behaviour: Builds a list with the path to every file that is a hierarchical child of the
# parent directory, recursively visiting each child in a post-order traversal
#
# Output: A list with the paths to each child file
#--------------------------------------------------
def get_xml_files_recursively(path, judged_documents, **kwargs):
    files_list = []
    directory_list = os.listdir(path)
    for f in directory_list:
        # Assumes path ends with '/'; n_path keeps that invariant for the recursive call
        n_path = '{}{}/'.format(path, f)
        if os.path.isdir(n_path):
            files_list.extend(get_xml_files_recursively(n_path, judged_documents, **kwargs))
        elif judged_documents is not None:
            # File names look like '<doc_id>news<...>'; keep only judged documents
            if int(f.split('news')[0]) in judged_documents:
                files_list.append(re.sub('//', '/', '{}/{}'.format(path, f)))
        else:
            files_list.append(re.sub('//', '/', '{}/{}'.format(path, f)))
    return files_list
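# A small sketch of how the recursive listing might be called; the directory layout
# ('material/rcv1/' holding date folders of files named like '19960901news...') is an
# assumption inferred from the parsing code below, not confirmed by the project.
def _demo_recursive_listing():
    judged = {19960901, 19961015}  # hypothetical set of judged document ids
    all_files = get_xml_files_recursively('material/rcv1/', None)       # keep every file
    judged_files = get_xml_files_recursively('material/rcv1/', judged)  # keep only judged ids
    print(len(all_files), len(judged_files))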
# -------------------------------------------------
# get_files_from_directory - Recursively gets all files from a directory or file path, parsing
# the files from XML into objects and splitting them into D_Test and D_Train under the
# conditions specified by our project
#
# Input: path - The path to the parent directory or file from which to start the search
#        judged_documents - Optional set of document ids used to filter the files
#        kwargs - Optional 'set' key ('test' or 'train') to parse only one of the two splits
#
# Behaviour: It starts by building a list with the path to every file that is a hierarchical
# child of the parent directory, recursively visiting each child in a post-order traversal.
# It then parses each file from XML into a runtime object using the BeautifulSoup library,
# stripping the <copyright> and <codes> elements. Finally it splits the dataset into the
# D_Test and D_Train sets according to the date identifier in each file's path
# (D_Test -> identifier > 1996-09-30, D_Train -> identifier <= 1996-09-30)
#
# Output: A tuple with the lists of file objects present in D_Test and D_Train
# -------------------------------------------------
def get_files_from_directory(path, judged_documents, **kwargs):
    file_list = get_xml_files_recursively(path, judged_documents, **kwargs)
    parsed_files_test = []
    parsed_files_train = []
    if 'set' in kwargs and kwargs['set'] == 'test':
        for f in file_list:
            # The date folder is assumed to be the third path component,
            # e.g. 'material/rcv1/19960901/...'
            date_identifier = int(f.split('/')[2])
            if date_identifier <= 19960930:
                continue
            with open(f, 'r') as open_file:
                parsed_file = BeautifulSoup(open_file.read(), 'lxml')
            if parsed_file.copyright is not None:
                parsed_file.copyright.decompose()
            if parsed_file.codes is not None:
                parsed_file.codes.decompose()
            parsed_files_test.append(parsed_file)
    elif 'set' in kwargs and kwargs['set'] == 'train':
        for f in file_list:
            date_identifier = int(f.split('/')[2])
            # file_list is assumed to be date-ordered, so we can stop at the
            # first document that belongs to the test set
            if date_identifier > 19960930:
                break
            with open(f, 'r') as open_file:
                parsed_file = BeautifulSoup(open_file.read(), 'lxml')
            if parsed_file.copyright is not None:
                parsed_file.copyright.decompose()
            if parsed_file.codes is not None:
                parsed_file.codes.decompose()
            parsed_files_train.append(parsed_file)
    else:
        for f in file_list:
            date_identifier = int(f.split('/')[2])
            with open(f, 'r') as open_file:
                parsed_file = BeautifulSoup(open_file.read(), 'lxml')
            if parsed_file.copyright is not None:
                parsed_file.copyright.decompose()
            if parsed_file.codes is not None:
                parsed_file.codes.decompose()
            if date_identifier <= 19960930:
                parsed_files_train.append(parsed_file)
            else:
                parsed_files_test.append(parsed_file)
    return (parsed_files_test, parsed_files_train)
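# A hedged usage sketch of the full pipeline; 'material/rcv1/' is a placeholder path,
# and the date split relies on the folder layout assumed above (date as the third
# path component).
def _demo_train_test_split():
    # Parse everything and split by date in one pass
    d_test, d_train = get_files_from_directory('material/rcv1/', None)
    print('test: {} docs, train: {} docs'.format(len(d_test), len(d_train)))

    # Or parse only one split via the 'set' keyword; the other list comes back empty
    d_test_only, _ = get_files_from_directory('material/rcv1/', None, set='test')
    print('test-only pass: {} docs'.format(len(d_test_only)))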