Skip to content

Commit

Permalink
v0.4.4
Browse files Browse the repository at this point in the history
- Use Golang sort package
- Now able to use custom weighing and scoring algorithm
- Now use pagerank as default scoring algorithm
- Update readme
- Update test
- Update benchmark
  • Loading branch information
didasy committed Jun 12, 2016
1 parent 1e02e75 commit b765f91
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 153 deletions.
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,16 @@
tldr is a golang package to summarize a text automatically using [lexrank](http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html) algorithm.

### How?
There are two main steps in lexrank, weighing, and ranking. tldr have three weighing and two ranking algorithm included, they are Jaccard coeficient, Hamming distance, and PageRank, centrality, respectively. The default settings use Hamming distance and centrality.
There are two main steps in lexrank: weighing and ranking. tldr includes two weighing algorithms (Jaccard coefficient and Hamming distance) and two ranking algorithms (PageRank and centrality). The default settings use Hamming distance and PageRank.

### Is This Fast?
Test it yourself, my system is i3-3217@1.8GHz with single channel 4GB RAM using Ubuntu 15.10 with kernel 4.5.0
```
$ go test -bench . -benchmem -benchtime 5s -cpu 4
BenchmarkSummarizeCentralityHamming-4 1000 6583990 ns/op 401311 B/op 3549 allocs/op
BenchmarkSummarizeCentralityJaccard-4 300 28813583 ns/op 3449792 B/op 12546 allocs/op
BenchmarkSummarizePagerankHamming-4 1000 7122519 ns/op 420581 B/op 3729 allocs/op
BenchmarkSummarizePagerankJaccard-4 300 30523688 ns/op 3468906 B/op 12725 allocs/op
ok github.com/JesusIslam/tldr 38.566s
BenchmarkSummarizeCentralityHamming-4 2000 6429340 ns/op 401204 B/op 3551 allocs/op
BenchmarkSummarizeCentralityJaccard-4 200 30036357 ns/op 3449461 B/op 12543 allocs/op
BenchmarkSummarizePagerankHamming-4 1000 7015008 ns/op 420665 B/op 3731 allocs/op
BenchmarkSummarizePagerankJaccard-4 200 31066764 ns/op 3469629 B/op 12737 allocs/op
```
So, not bad huh?

Expand Down
121 changes: 0 additions & 121 deletions heap_sort.go

This file was deleted.

4 changes: 2 additions & 2 deletions result.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Lucas just announced that Beijing-based MAD Architects will design the museum, while Chicago firm Studio Gang Architects will be responsible for the surrounding landscape and a pedestrian bridge that links nearby peninsula Northerly Island with the city.

In honor of the Museum of Narrative Art and its star-studded cast of architects, here's a roundup of articles from Architizer that feature Star Wars-related architecture:

Jeff Bennett's Wars on Kinkade are hilarious paintings that ravage the peaceful landscapes of Thomas Kinkade with the brutal destruction of Star Wars.

Lucas just announced that Beijing-based MAD Architects will design the museum, while Chicago firm Studio Gang Architects will be responsible for the surrounding landscape and a pedestrian bridge that links nearby peninsula Northerly Island with the city.

These products were inspired by the movie and blend pop culture memorabilia with high design, including Hans Solo Carbonite Coffee Tables, Emperor Thrones, and an AT-AT Triple Bunk Bed.
41 changes: 41 additions & 0 deletions sort.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package tldr

// ByWeight is a slice of edges that can be handed to the sort package; it
// orders the edges by ascending weight.
type ByWeight []*Edge

// Len reports how many edges the collection holds.
func (edges ByWeight) Len() int {
	count := len(edges)
	return count
}

// Swap exchanges the edges stored at positions i and j.
func (edges ByWeight) Swap(i, j int) {
	held := edges[i]
	edges[i] = edges[j]
	edges[j] = held
}

// Less reports whether the edge at position i has a strictly smaller weight
// than the edge at position j.
func (edges ByWeight) Less(i, j int) bool {
	left, right := edges[i].weight, edges[j].weight
	return left < right
}

// ByScore is a slice of ranks that can be handed to the sort package; it
// orders the ranks by ascending score.
type ByScore []*Rank

// Len reports how many ranks the collection holds.
func (ranks ByScore) Len() int {
	count := len(ranks)
	return count
}

// Swap exchanges the ranks stored at positions i and j.
func (ranks ByScore) Swap(i, j int) {
	held := ranks[i]
	ranks[i] = ranks[j]
	ranks[j] = held
}

// Less reports whether the rank at position i has a strictly smaller score
// than the rank at position j.
func (ranks ByScore) Less(i, j int) bool {
	left, right := ranks[i].score, ranks[j].score
	return left < right
}

// ReverseEdge reverses the order of the edges in num in place. An empty or
// single-element slice is left untouched.
func ReverseEdge(num []*Edge) {
	last := len(num) - 1
	half := len(num) / 2
	// Walk the front half and exchange each element with its mirror position.
	for front := 0; front < half; front++ {
		back := last - front
		num[front], num[back] = num[back], num[front]
	}
}

// ReverseRank reverses the order of the ranks in num in place. An empty or
// single-element slice is left untouched.
func ReverseRank(num []*Rank) {
	last := len(num) - 1
	half := len(num) / 2
	// Walk the front half and exchange each element with its mirror position.
	for front := 0; front < half; front++ {
		back := last - front
		num[front], num[back] = num[back], num[front]
	}
}
76 changes: 52 additions & 24 deletions tldr.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ package tldr
import (
"errors"
"github.com/alixaxel/pagerank"
"sort"
"strings"
"unicode"
)
Expand All @@ -22,20 +23,23 @@ type Bag struct {
Ranks []int

MaxCharacters int
Algorithm string // "centrality" or "pagerank"
Weighing string // "hamming" or "jaccard"
Algorithm string // "centrality" or "pagerank" or "custom"
Weighing string // "hamming" or "jaccard" or "custom"
Damping float64
Tolerance float64
Threshold float64
SentencesDistanceThreshold float64

customAlgorithm func(e []*Edge) []int
customWeighing func(src, dst []int) float64

vectorLength int
}

// The default values of each settings
const (
VERSION = "0.4.3"
DEFAULT_ALGORITHM = "centrality"
VERSION = "0.4.4"
DEFAULT_ALGORITHM = "pagerank"
DEFAULT_WEIGHING = "hamming"
DEFAULT_DAMPING = 0.85
DEFAULT_TOLERANCE = 0.0001
Expand Down Expand Up @@ -68,26 +72,48 @@ func (bag *Bag) Set(m int, d, t, th, sth float64, alg, w string) {
bag.SentencesDistanceThreshold = sth
}

// SetDictionary installs a caller-supplied dictionary on the bag. This is
// useful if you already have your own dictionary (example: from your database).
// dict is a map[string]int where the key is the word and the value is the
// word's position in the vector, starting from 1.
// Summarize only builds a dictionary from the text when bag.Dict is empty, so
// a non-empty dictionary set here is used as-is.
func (bag *Bag) SetDictionary(dict map[string]int) {
bag.Dict = dict
}

// SetCustomAlgorithm stores f as the bag's custom ranking function.
// Summarize calls f with the bag's edges and uses the returned slice as the
// sentence ranks when bag.Algorithm is "custom". The leading entries of that
// slice are taken as the top sentences, so f should presumably return sentence
// indexes ordered best first.
// This method only stores f; it does not change bag.Algorithm.
func (bag *Bag) SetCustomAlgorithm(f func(e []*Edge) []int) {
bag.customAlgorithm = f
}

// SetCustomWeighing stores f as the bag's custom weighing function.
// When the "custom" weighing is selected, f is called with the vectors of a
// source and a destination node and its result becomes the weight of the edge
// between them.
// This method only stores f; it does not change bag.Weighing.
func (bag *Bag) SetCustomWeighing(f func(src, dst []int) float64) {
bag.customWeighing = f
}

// Summarize the text to num sentences
func (bag *Bag) Summarize(text string, num int) (string, error) {
if len(text) < 1 {
return "", nil
}

bag.CreateSentences(text)
bag.CreateDictionary(text)
bag.CreateNodes()
bag.CreateEdges()
bag.createSentences(text)

// If the user has already provided a dictionary, skip creating one
if len(bag.Dict) < 1 {
bag.createDictionary(text)
}

bag.createNodes()
bag.createEdges()

switch bag.Algorithm {
case "centrality":
bag.Centrality()
bag.centrality()
break
case "pagerank":
bag.PageRank()
bag.pageRank()
break
case "custom":
bag.Ranks = bag.customAlgorithm(bag.Edges)
break
default:
bag.Centrality()
bag.pageRank()
}

// if no ranks, return error
Expand All @@ -104,7 +130,7 @@ func (bag *Bag) Summarize(text string, num int) (string, error) {
// get only top num of ranks
idx := bag.Ranks[:num]
// sort it ascending by how the sentences appeared on the original text
HeapSortInt(idx)
sort.Ints(idx)
var res string
for i, _ := range idx {
res += (bag.OriginalSentences[idx[i]] + " ")
Expand Down Expand Up @@ -137,7 +163,7 @@ type Rank struct {
score float64
}

func (bag *Bag) Centrality() {
func (bag *Bag) centrality() {
// first remove edges under Threshold weight
var newEdges []*Edge
for _, edge := range bag.Edges {
Expand All @@ -146,7 +172,7 @@ func (bag *Bag) Centrality() {
}
}
// sort them by weight descending
HeapSortEdge(newEdges)
sort.Sort(ByWeight(newEdges))
ReverseEdge(newEdges)
rankBySrc := make([]int, len(newEdges))
for i, v := range newEdges {
Expand All @@ -165,7 +191,7 @@ func (bag *Bag) Centrality() {
bag.Ranks = uniq
}

func (bag *Bag) PageRank() {
func (bag *Bag) pageRank() {
// first remove edges under Threshold weight
var newEdges []*Edge
for _, edge := range bag.Edges {
Expand All @@ -183,8 +209,8 @@ func (bag *Bag) PageRank() {
graph.Rank(bag.Damping, bag.Tolerance, func(sentenceIndex uint32, rank float64) {
ranks = append(ranks, &Rank{int(sentenceIndex), rank})
})
// sort ranks into an array of sentence index, by rank descending
HeapSortRank(ranks)
// sort ranks into an array of sentence index, by score descending
sort.Sort(ByScore(ranks))
ReverseRank(ranks)
idx := make([]int, len(ranks))
for i, v := range ranks {
Expand All @@ -200,7 +226,7 @@ type Edge struct {
weight float64 // weight of the similarity between two sentences
}

func (bag *Bag) CreateEdges() {
func (bag *Bag) createEdges() {
for i, src := range bag.Nodes {
for j, dst := range bag.Nodes {
// don't compare same node
Expand All @@ -215,10 +241,12 @@ func (bag *Bag) CreateEdges() {
differentElements := SymmetricDifference(src.vector, dst.vector)
weight = float64(len(differentElements))
break
case "custom":
weight = bag.customWeighing(src.vector, dst.vector)
break
default:
// defaulted to hamming
commonElements := Intersection(src.vector, dst.vector)
weight = 1.0 - float64(len(commonElements))/((float64(bag.vectorLength)*2)-float64(len(commonElements)))
differentElements := SymmetricDifference(src.vector, dst.vector)
weight = float64(len(differentElements))
}
edge := &Edge{i, j, weight}
bag.Edges = append(bag.Edges, edge)
Expand All @@ -243,7 +271,7 @@ type Node struct {
*/
}

func (bag *Bag) CreateNodes() {
func (bag *Bag) createNodes() {
bag.vectorLength = len(bag.Dict)
for i, sentence := range bag.BagOfWordsPerSentence {
// vector length is len(dict)
Expand All @@ -266,7 +294,7 @@ func (bag *Bag) CreateNodes() {
}
}

func (bag *Bag) CreateSentences(text string) {
func (bag *Bag) createSentences(text string) {
// trim all spaces
text = strings.TrimSpace(text)
// tokenize text as sentences
Expand All @@ -288,7 +316,7 @@ func (bag *Bag) CreateSentences(text string) {
UniqSentences(bag.BagOfWordsPerSentence, bag.SentencesDistanceThreshold)
}

func (bag *Bag) CreateDictionary(text string) {
func (bag *Bag) createDictionary(text string) {
// trim all spaces
text = strings.TrimSpace(text)
// lowercase the text
Expand Down
Loading

0 comments on commit b765f91

Please sign in to comment.