klassextant.rb
#!/usr/bin/ruby
# Code for analyzing 18,000+ class names
require 'lingua/stemmer'
load 'klassextant_export.rb'
class Klassextant
  attr_reader :base, :analytes
  attr_accessor :debug
  KNOWN_PREFIXES = ['ADT','AS','FB','GL','HPP','ID','MN','MQTT','NS','NUX','PYM','NFX','OTD','QP','SSO','UI','URL','UFI']
  include KlassextantExport
  def initialize(filename="FBclasses.txt", opt={})
    @known_prefixes = opt[:known_prefixes] || KNOWN_PREFIXES
    @debug = opt[:debug] || false
    klass_load(filename)
  end
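  # Split each class name into a candidate prefix (the leading capitals minus the
  # last one, e.g. "FBSomeView" -> "FB") and its remaining CamelCase parts.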
  def analytes
    @analytes ||= @base.map do |c|
      prefix = (c.match(/^_?([A-Z]*)[A-Z][a-z]/) || [])[1]
      {
        :prefix      => prefix,
        :class_parts => c.sub(/#{prefix}/,'').split(/([A-Z]{1,})/).reject(&:empty?).each_slice(2).map(&:join)
      }
    end
  end
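  # Group the per-class analyses by prefix: { "FB" => { :class_parts => [...] }, ... }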
  def analyte_tree
    @analyte_tree ||= analytes.reduce({}) do |baseh, an|
      baseh[an[:prefix]] ||= {}
      baseh[an[:prefix]][:class_parts] ||= []
      baseh[an[:prefix]][:class_parts] << an[:class_parts]
      baseh
    end
  end
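  # Every distinct CamelCase part seen across the class names, excluding parts
  # that are themselves prefixes.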
  def parts
    @parts ||= analytes.flat_map {|a| a[:class_parts] }.sort.uniq - analyte_tree.keys
  end
  def part_stems
    @part_stems ||= parts.
      reject{|p| p.scan(/^[a-z]/).any? }.
      map{|p| Lingua.stemmer p }.
      reject{|stem| stem.sub(/[a-z0-9]/,'').size == 1 }.
      push(*@known_prefixes). # handcrafting
      sort
  end
  def stem_tree
    @stem_tree ||= parts.group_by do |p|
      part_stems.find {|stem| p.start_with? stem } || p
    end
  end
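  # For each stem, record the positions at which its parts occur within class
  # names, then derive per-stem statistics (:clusters, :total_use, :avg_pos and
  # the most frequent position, stored as :median).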
  def stem_counts
    unless @stem_counts
      @stem_counts = {}
      analyte_tree.values.each do |k|
        k[:class_parts].each do |cp|
          cp.each_with_index do |part, idx|
            @stem_counts[part_stem_lookup[part]] ||= {:positions => []}
            @stem_counts[part_stem_lookup[part]][:positions].push idx
          end
        end
      end
      @stem_counts.each_value do |counts|
        counts[:clusters]  = counts[:positions].group_by{|w| w }.reduce({}) {|se, g| se[g.first] = g.last.size; se }
        counts[:total_use] = counts[:clusters].collect(&:last).inject(&:+)
        counts[:avg_pos]   = counts[:positions].inject(&:+) / counts[:positions].size.to_f # mean position
        counts[:median]    = counts[:clusters].max_by(&:last).first # most frequent position
      end
    end
    @stem_counts
  end
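  # Inverse of stem_tree: maps each part back to the stem it was grouped under.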
  def part_stem_lookup
    @part_stem_lookup ||= stem_tree.reduce({}) do |lookup, (stem, parts)|
      parts.each {|part| lookup[part] = stem }
      lookup
    end
  end
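  # Candidate prefixes taken straight from the raw names: the leading capitals
  # minus the final one, which begins the class name proper.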
  def prefixes
    @prefixes ||= @base.map {|b| b.scan(/^[A-Z]*/).first[0..-2] }.uniq.sort.reject(&:empty?)
  end
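  # Cluster prefixes that share their first two letters, dropping clusters that
  # consist of a single short prefix.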
  def prefix_tree
    @prefix_tree ||= prefixes.
      map{|b| b[0..1] }.
      uniq.reduce({}) do |acc, pre|
        acc[pre] = prefixes.group_by{|p| p.start_with?(pre) && p.size > 2 }[true]
        acc
      end.
      reject do |k, v|
        v.nil? ||
          (v.collect(&:size).min < 4 && v.size == 1)
      end
  end
  private
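  # Read the class list, one name per line, stripping the trailing ".h" and any
  # underscores or dashes.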
  def klass_load(filename)
    @base = File.readlines(filename).
      map(&:strip).
      map{|c| c.sub(/\.h$/,'').gsub(/[_-]/,'') }
  end
end
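# A minimal usage sketch (assumes a local FBclasses.txt listing one header name,
# e.g. "FBSomeView.h", per line):
#
#   k = Klassextant.new("FBclasses.txt")
#   k.prefixes     # => candidate prefixes such as "FB"
#   k.prefix_tree  # prefixes clustered by their first two letters
#   k.stem_counts  # per-stem usage statistics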