-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain_use_preprocessed.py
67 lines (51 loc) · 1.93 KB
/
main_use_preprocessed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pickle
import os
import pandas as pd
import ast
from classes.preprocessing import Preprocessing
from datetime import datetime
# DESCRIPTION:
# This is the file where preprocessed datasets are loaded and reused.
# It reads a previously generated CSV, runs the Preprocessing pipeline on it,
# shuffles the result, and writes it to a timestamped output directory.

# --- Generated data case ---
PATH_TEXTS = "generated_text/14.03.2021_15.38/generated_dataset_1000.csv"
# directory = os.fsencode(PATH_TEXTS)

# Capture a single timestamp so date and time cannot disagree if the two
# strftime calls straddle a minute/day boundary.
now = datetime.now()
date = now.strftime('%d.%m.%Y')
time = now.strftime('%H.%M')

generated = pd.read_csv(PATH_TEXTS)
print(generated)
# The "text" column holds Python-literal token lists serialized as strings;
# literal_eval safely parses them back (no arbitrary code execution).
generated["text"] = generated["text"].apply(ast.literal_eval)

pp_generated = Preprocessing(
    generated,
    date,
    time,
    analysis="text",
    news_type="generated",
    duplicate_rows_removal=False, lowercasing=False, tokenization=False,
    lemmatization=False, noise_removal=False, stemming=False,
    stopword_removal=False, entity_recognition=False, data_augmentation=False,
    word2vec=True, doc2vec=False, aggregation=True
)  # here you can set the configuration
gen = pp_generated.run_pipeline()

# Rebuild a dataframe from the aggregated vectors, reattach the labels,
# and shuffle; reset_index keeps the pre-shuffle row order as "old index".
dataframe = pd.DataFrame(gen.aggregated, columns=["text"])
dataframe["membership"] = generated["membership"]
dataset = pp_generated.shuffle(dataframe).reset_index()
dataset.columns = ["old index", "text", "membership"]
dataset.index.name = "index"
cardinality = len(dataset)

# Write the shuffled dataset to a per-run timestamped directory.
outdir = "generated_datasets/" + date + "_" + time
outname = "generated_dataset_" + str(cardinality) + ".csv"
os.makedirs(outdir, exist_ok=True)  # atomic; avoids the exists/create race
fullname = os.path.join(outdir, outname)
dataset.to_csv(fullname, index=True)

# --- Classical preprocessed data case ---
# fullname = "preprocessed_datasets/text/28.12.2020_18.48/dataset_fake.pickle"
# file = open(fullname, "rb")
# dataset_fake = pickle.load(file)
#
# fullname = "preprocessed_datasets/text/28.12.2020_18.48/dataset_true.pickle"
# file = open(fullname, "rb")
# dataset_true = pickle.load(file)
#
# print(len(dataset_fake.aggregated))
# print(len(dataset_true.aggregated))