-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_select_words.rb
30 lines (22 loc) · 714 Bytes
/
1_select_words.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
require_relative 'config'
require_relative 'lib/transliterate'
# Builds the final word list from the raw "word count" input lines:
# - normalizes each word with Transliterate.to_latin
# - merges the duplicates that normalization creates by summing their counts
# - orders words by total count (highest first), removes blacklisted entries
#   and keeps the first Config.words_count of them
totals = Hash.new(0)
Config.file.input.read.strip.each_line do |line|
  raw_word, raw_count = line.strip.split(' ', 2)
  # Integer() raises on a missing or malformed count rather than coercing it to 0.
  totals[Transliterate.to_latin(raw_word)] += Integer(raw_count)
end

# The hash keeps first-seen order, so the sort starts from input order.
ranked = totals.sort_by { |_word, total| -total }.map(&:first)

# NOTE(review): both blacklist reads are combined with `+` and queried with
# `include?`; presumably each returns an array of words — confirm against Config.
banned = Config.file.generated_blacklist.read + Config.file.manual_blacklist.read
allowed = ranked.reject { |word| banned.include?(word) }

selection = allowed.take(Config.words_count)
Config.file.words.write(selection)