Skip to content

Commit

Permalink
Merge pull request #182 from CocaineCong/feature-bm25
Browse files Browse the repository at this point in the history
refactor: extract the segment with weight module
  • Loading branch information
vcaesar authored Nov 5, 2023
2 parents 2e51ba3 + bdb140e commit 10459e0
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 45 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,4 @@ _testmain.go
.glide/
examples/dict/embed/embed
examples/dict/embed/main
oryxBuildBinary
46 changes: 6 additions & 40 deletions hmm/idf/tag_extracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,43 +6,9 @@ import (
"unicode/utf8"

"github.com/go-ego/gse"
"github.com/go-ego/gse/hmm/segment"
)

// Segment is a word together with its weight.
type Segment struct {
	text   string
	weight float64
}

// Text reports the word stored in the segment.
func (seg Segment) Text() string { return seg.text }

// Weight reports the weight stored in the segment.
func (seg Segment) Weight() float64 { return seg.weight }

// Segments is a slice of Segment. Its Len, Less and Swap methods have the
// signatures required by sort.Interface.
type Segments []Segment

// Len reports how many segments the slice holds.
func (list Segments) Len() int { return len(list) }

// Less reports whether the element at index i orders before the element at
// index j: the smaller weight comes first, and when the weights are equal
// the texts are compared instead.
func (list Segments) Less(i, j int) bool {
	a, b := list[i], list[j]
	if a.weight != b.weight {
		return a.weight < b.weight
	}

	return a.text < b.text
}

// Swap exchanges the elements at indexes i and j.
func (list Segments) Swap(i, j int) { list[i], list[j] = list[j], list[i] }

// TagExtracter is extract tags struct.
type TagExtracter struct {
seg gse.Segmenter
Expand Down Expand Up @@ -82,7 +48,7 @@ func (t *TagExtracter) LoadStopWords(fileName ...string) error {
}

// ExtractTags extract the topK key words from text.
func (t *TagExtracter) ExtractTags(text string, topK int) (tags Segments) {
func (t *TagExtracter) ExtractTags(text string, topK int) (tags segment.Segments) {
freqMap := make(map[string]float64)

for _, w := range t.seg.Cut(text, true) {
Expand Down Expand Up @@ -110,13 +76,13 @@ func (t *TagExtracter) ExtractTags(text string, topK int) (tags Segments) {
freqMap[k] = v / total
}

ws := make(Segments, 0)
var s Segment
ws := make(segment.Segments, 0)
var s segment.Segment
for k, v := range freqMap {
if freq, _, ok := t.Idf.Freq(k); ok {
s = Segment{text: k, weight: freq * v}
s = segment.Segment{Text: k, Weight: freq * v}
} else {
s = Segment{text: k, weight: t.Idf.median * v}
s = segment.Segment{Text: k, Weight: t.Idf.median * v}
}
ws = append(ws, s)
}
Expand Down
11 changes: 6 additions & 5 deletions hmm/idf/textrank.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

"github.com/go-ego/gse"
"github.com/go-ego/gse/hmm/pos"
"github.com/go-ego/gse/hmm/segment"
)

const dampingFactor = 0.85
Expand Down Expand Up @@ -81,7 +82,7 @@ func (u *undirectWeightedGraph) addEdge(start, end string, weight float64) {
edge{start: end, end: start, weight: weight})
}

func (u *undirectWeightedGraph) rank() Segments {
func (u *undirectWeightedGraph) rank() segment.Segments {
if !sort.IsSorted(u.keys) {
sort.Sort(u.keys)
}
Expand Down Expand Up @@ -124,10 +125,10 @@ func (u *undirectWeightedGraph) rank() Segments {
}
}

result := make(Segments, 0)
result := make(segment.Segments, 0)
for n, w := range ws {
result = append(result,
Segment{text: n, weight: (w - minRank/10.0) / (maxRank - minRank/10.0)},
segment.Segment{Text: n, Weight: (w - minRank/10.0) / (maxRank - minRank/10.0)},
)
}

Expand All @@ -137,7 +138,7 @@ func (u *undirectWeightedGraph) rank() Segments {

// TextRankWithPOS extracts keywords from text using TextRank algorithm.
// Parameter allowPOS allows a []string pos list.
func (t *TextRanker) TextRankWithPOS(text string, topK int, allowPOS []string) Segments {
func (t *TextRanker) TextRankWithPOS(text string, topK int, allowPOS []string) segment.Segments {
posFilt := make(map[string]int)
for _, pos1 := range allowPOS {
posFilt[pos1] = 1
Expand Down Expand Up @@ -181,6 +182,6 @@ func (t *TextRanker) TextRankWithPOS(text string, topK int, allowPOS []string) S

// TextRank extract keywords from text using TextRank algorithm.
// Parameter topK specify how many top keywords to be returned at most.
func (t *TextRanker) TextRank(text string, topK int) Segments {
func (t *TextRanker) TextRank(text string, topK int) segment.Segments {
return t.TextRankWithPOS(text, topK, defaultAllowPOS)
}
36 changes: 36 additions & 0 deletions hmm/segment/segment.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package segment

// Segment is a word together with its weight.
type Segment struct {
	Text   string
	Weight float64
}

// GetText reports the word stored in the segment.
func (seg Segment) GetText() string { return seg.Text }

// GetWeight reports the weight stored in the segment.
func (seg Segment) GetWeight() float64 { return seg.Weight }

// Segments is a slice of Segment. Its Len, Less and Swap methods have the
// signatures required by sort.Interface.
type Segments []Segment

// Len reports how many segments the slice holds.
func (list Segments) Len() int { return len(list) }

// Less reports whether the element at index i orders before the element at
// index j: the smaller Weight comes first, and when the weights are equal
// the Text values are compared instead.
func (list Segments) Less(i, j int) bool {
	a, b := list[i], list[j]
	if a.Weight != b.Weight {
		return a.Weight < b.Weight
	}

	return a.Text < b.Text
}

// Swap exchanges the elements at indexes i and j.
func (list Segments) Swap(i, j int) { list[i], list[j] = list[j], list[i] }

0 comments on commit 10459e0

Please sign in to comment.