-
Notifications
You must be signed in to change notification settings - Fork 24
/
run_discover.py
106 lines (84 loc) · 5.86 KB
/
run_discover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Created by Zhao Xinwei.
# 2017.05.??.
# Used to load the corpora and execute the new word discovering algorithm.
from argparse import ArgumentParser
from collections import namedtuple
import jieba
from discover_utils import *
from discoverer import Discoverer
# Defaults intended for a corpus of about 4000 lines.
# Each four-element list is ordered as
# [term frequency, aggregation coefficient, max neighboring entropy, min neighboring entropy].
default_iteration, default_verbose = 2, 0
default_latin = [10, 0, 0, 0]
default_bigram = [10, 50, 0, 1]
default_unigram2 = [10, 2, 0, 1]
# Same values as the two-character case, kept as a separate list object.
default_unigram3 = list(default_unigram2)
# Command-line interface.
# The summary is passed as `description` rather than `usage`, so argparse still
# generates the usage synopsis that lists every argument, and the program name
# shown in that synopsis is the script name instead of a free-text title.
arg_parser = ArgumentParser(
    description='New Words Discovery: discover new words from corpus according to term frequency, '
                'aggregation coefficient, max neighboring entropy and min neighboring entropy.')
arg_parser.add_argument(
    'input_path',
    help='The path to the corpus. It should be a plain text file or a dir containing only plain text files.')
arg_parser.add_argument('output_path', help='The path to generate the reports.')
# By default the dictionary shipped inside the installed jieba package is used.
arg_parser.add_argument(
    '--dictionary_path', default=os.path.join(os.path.dirname(jieba.__file__), 'dict.txt'),
    help='The path to the dictionary (text), each line of which contains item, POS-tag and frequency, '
         'separated by spaces. Terms satisfying the filter condition but in the dictionary are not '
         'considered as new words.')
# The four threshold options below share one layout:
# term frequency, aggregation coefficient, max neighboring entropy, min neighboring entropy.
# All of them are parsed as float so that fractional thresholds are accepted everywhere.
arg_parser.add_argument(
    '--latin', nargs=4, default=default_latin, type=float,
    help='The parameters include term frequency, aggregation coefficient, max neighboring entropy and '
         'min neighboring entropy, which also applies for --bigram, --unigram_2 and --unigram_3. '
         'This argument set thresholds for latin words, including pure digits, pure letters and the '
         'combination of letters and digits such as "iphone 7".')
arg_parser.add_argument(
    '--bigram', nargs=4, default=default_bigram, type=float,
    help='Bigrams are defined as words that are composed of two unigram terms. '
         'Reference argument --latin for further help.')
arg_parser.add_argument(
    '--unigram_2', nargs=4, default=default_unigram2, type=float,
    help='A term which is composed of two Chinese characters and cannot be divided into other words. '
         'Reference argument --latin for further help.')
arg_parser.add_argument(
    '--unigram_3', nargs=4, default=default_unigram3, type=float,
    help='A term which is composed of three Chinese characters and cannot be divided into other words. '
         'Reference argument --latin for further help.')
arg_parser.add_argument(
    '--iteration', default=default_iteration, type=int,
    help='The next iteration will base its dictionary as the original dictionary plus the new words '
         'discovered in the last iteration.')
arg_parser.add_argument(
    '--verbose', default=default_verbose, choices=[0, 1, 2], type=int,
    help="Determines the verbosity of the reports. *** 0: only new word items and their term frequency.*** 1: min neighboring entropy and max neighboring entropy are supplemented. *** 2:left and right neighboring entropy are added.")
# Parse the command line, load the corpus and make sure the report directory exists.
args = arg_parser.parse_args()
# At least one iteration is required: with none, no statistics are collected and
# the final concatenation of per-iteration reports has nothing to work on.
# Reject the value here, before the corpus is loaded.
if args.iteration < 1:
    arg_parser.error('--iteration must be at least 1.')
documents, corpus_name = load_lines_of_documents(args.input_path)
# Reports are written to a sub-directory named after the corpus.
output_home = join(args.output_path, corpus_name)
if not os.path.exists(output_home):
    logger.info('Output path does not exists and created.')
    os.makedirs(output_home)
# Bundle the four threshold values of every word category into a named record.
# The mapping is keyed by 'bigram', 'latin' and the integers 2 and 3
# (filled from --unigram_2 and --unigram_3 respectively).
threshold_parameter = namedtuple('threshold_parameter', ['tf', 'agg_coef', 'max_entropy', 'min_entropy'])
threshold_parameters = {
    'bigram': threshold_parameter(*args.bigram),
    'latin': threshold_parameter(*args.latin),
    2: threshold_parameter(*args.unigram_2),
    3: threshold_parameter(*args.unigram_3),
}
# Known vocabulary; it is extended with the words reported by each iteration.
dictionary = load_dictionary(args.dictionary_path)
discoverer = Discoverer(save_segmentation=False)
# Statistics returned by `generate_report`, one entry per iteration, in order.
stats_ind = []
import time

# Banner logged at the start of every iteration; `{}` receives the 1-based iteration number.
iteration_banner = """
**********************************************************************
commencing iteration {}...
**********************************************************************
"""
for round_number in range(1, args.iteration + 1):
    # NOTE(review): the reason for this one-second pause is not evident from this file.
    time.sleep(1)
    logger.info(iteration_banner.format(round_number))
    discoverer.fit(documents, corpus_name + ' [{}]'.format(round_number))
    discoverer.get_new_unigrams(dictionary)
    new_words, current_stats = generate_report(
        output_home, discoverer.new_unigram_stats, discoverer.bigram_stats, threshold_parameters,
        corpus_name=corpus_name, iteration=round_number, verbose=args.verbose)
    # Feed the new words back into both the dictionary and jieba,
    # so the next iteration starts from the enlarged vocabulary.
    dictionary += new_words
    stats_ind.append(current_stats)
    for word in new_words:
        jieba.add_word(word)
# Write the overall reports: the results of all iterations concatenated
# and sorted in descending order of the `by` column.
by = 'tf'


def _merge_iterations(frames):
    """Concatenate the per-iteration pandas objects and sort them by `by`, largest first."""
    return pd.concat(frames).sort_values(by=by, ascending=False)


overall_latin_new_unigram_stats = _merge_iterations([stats['latin'] for stats in stats_ind])
overall_new_bigrams_stats = _merge_iterations([stats['bigram'] for stats in stats_ind])
output_stats(join(output_home, 'overall_latin.csv'), overall_latin_new_unigram_stats)
output_stats(join(output_home, 'overall_bigrams.csv'), overall_new_bigrams_stats)

# One report per key of 'chinese_unigram'; the keys are taken from the first
# iteration's statistics (presumably the word lengths -- confirm against generate_report).
for each_length in stats_ind[0]['chinese_unigram']:
    per_length_frames = [stats['chinese_unigram'][each_length] for stats in stats_ind]
    overall_chinese_sub_unigrams_verbose = _merge_iterations(per_length_frames)
    report_path = join(output_home, 'overall_chinese_unigrams@{}.csv'.format(each_length))
    output_stats(report_path, overall_chinese_sub_unigrams_verbose)