v0.4.4

- Use Golang sort package - Now able to use custom weighing and scoring algorithm - Now use pagerank as default scoring algorithm - Update readme - Update test - Update benchmark
didasy · Jun 12, 2016 · b765f91 · b765f91
1 parent 1e02e75
commit b765f91
Show file tree

Hide file tree

Showing 6 changed files with 104 additions and 153 deletions.
diff --git a/README.md b/README.md
@@ -9,17 +9,16 @@
 tldr is a golang package to summarize a text automatically using [lexrank](http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html) algorithm.
 
 ### How?
-There are two main steps in lexrank, weighing, and ranking. tldr have three weighing and two ranking algorithm included, they are Jaccard coeficient, Hamming distance, and PageRank, centrality, respectively. The default settings use Hamming distance and centrality.
+There are two main steps in lexrank: weighing and ranking. tldr includes two weighing algorithms (Jaccard coefficient and Hamming distance) and two ranking algorithms (PageRank and centrality). The default settings use Hamming distance and PageRank.
 
 ### Is This Fast?
 Test it yourself. My system is an i3-3217@1.8GHz with single-channel 4GB RAM, running Ubuntu 15.10 with kernel 4.5.0.
 ```
 $ go test -bench . -benchmem -benchtime 5s -cpu 4
-BenchmarkSummarizeCentralityHamming-4	    1000	   6583990 ns/op	  401311 B/op	    3549 allocs/op
-BenchmarkSummarizeCentralityJaccard-4	     300	  28813583 ns/op	 3449792 B/op	   12546 allocs/op
-BenchmarkSummarizePagerankHamming-4  	    1000	   7122519 ns/op	  420581 B/op	    3729 allocs/op
-BenchmarkSummarizePagerankJaccard-4  	     300	  30523688 ns/op	 3468906 B/op	   12725 allocs/op
-ok  	github.com/JesusIslam/tldr	38.566s
+BenchmarkSummarizeCentralityHamming-4	    2000	   6429340 ns/op	  401204 B/op	    3551 allocs/op
+BenchmarkSummarizeCentralityJaccard-4	     200	  30036357 ns/op	 3449461 B/op	   12543 allocs/op
+BenchmarkSummarizePagerankHamming-4  	    1000	   7015008 ns/op	  420665 B/op	    3731 allocs/op
+BenchmarkSummarizePagerankJaccard-4  	     200	  31066764 ns/op	 3469629 B/op	   12737 allocs/op
 ```
 So, not bad huh?
 

diff --git a/heap_sort.go b/heap_sort.go
diff --git a/result.txt b/result.txt
@@ -1,7 +1,7 @@
+Lucas just announced that Beijing-based MAD Architects will design the museum, while Chicago firm Studio Gang Architects will be responsible for the surrounding landscape and a pedestrian bridge that links nearby peninsula Northerly Island with the city. 
+
 In honor of the Museum of Narrative Art and its star-studded cast of architects, here's a roundup of articles from Architizer that feature Star Wars-related architecture:
 
 Jeff Bennett's Wars on Kinkade are hilarious paintings that ravage the peaceful landscapes of Thomas Kinkade with the brutal destruction of Star Wars. 
 
-Lucas just announced that Beijing-based MAD Architects will design the museum, while Chicago firm Studio Gang Architects will be responsible for the surrounding landscape and a pedestrian bridge that links nearby peninsula Northerly Island with the city. 
-
 These products were inspired by the movie and blend pop culture memorabilia with high design, including Hans Solo Carbonite Coffee Tables, Emperor Thrones, and an AT-AT Triple Bunk Bed.
diff --git a/sort.go b/sort.go
@@ -0,0 +1,41 @@
+package tldr
+// ByWeight adapts []*Edge to sort.Interface, ordering edges by ascending weight.
+type ByWeight []*Edge
+// Len returns the number of edges in the slice.
+func (b ByWeight) Len() int {
+	return len(b)
+}
+// Swap exchanges the edges at positions i and j.
+func (b ByWeight) Swap(i, j int) {
+	b[i], b[j] = b[j], b[i]
+}
+// Less reports whether the edge at i has a strictly smaller weight than the edge at j.
+func (b ByWeight) Less(i, j int) bool {
+	return b[i].weight < b[j].weight
+}
+// ByScore adapts []*Rank to sort.Interface, ordering ranks by ascending score.
+type ByScore []*Rank
+// Len returns the number of ranks in the slice.
+func (b ByScore) Len() int {
+	return len(b)
+}
+// Swap exchanges the ranks at positions i and j.
+func (b ByScore) Swap(i, j int) {
+	b[i], b[j] = b[j], b[i]
+}
+// Less reports whether the rank at i has a strictly smaller score than the rank at j.
+func (b ByScore) Less(i, j int) bool {
+	return b[i].score < b[j].score
+}
+// ReverseEdge reverses num in place; applied after an ascending sort it yields descending order.
+func ReverseEdge(num []*Edge) {
+	for i, j := 0, len(num)-1; i < j; i, j = i+1, j-1 { // walk inward from both ends until the indices meet
+		num[i], num[j] = num[j], num[i]
+	}
+}
+// ReverseRank reverses num in place; applied after an ascending sort it yields descending order.
+func ReverseRank(num []*Rank) {
+	for i, j := 0, len(num)-1; i < j; i, j = i+1, j-1 { // walk inward from both ends until the indices meet
+		num[i], num[j] = num[j], num[i]
+	}
+}
diff --git a/tldr.go b/tldr.go
@@ -9,6 +9,7 @@ package tldr
 import (
 	"errors"
 	"github.com/alixaxel/pagerank"
+	"sort"
 	"strings"
 	"unicode"
 )
@@ -22,20 +23,23 @@ type Bag struct {
 	Ranks                 []int
 
 	MaxCharacters              int
-	Algorithm                  string // "centrality" or "pagerank"
-	Weighing                   string // "hamming" or "jaccard"
+	Algorithm                  string // "centrality" or "pagerank" or "custom"
+	Weighing                   string // "hamming" or "jaccard" or "custom"
 	Damping                    float64
 	Tolerance                  float64
 	Threshold                  float64
 	SentencesDistanceThreshold float64
 
+	customAlgorithm func(e []*Edge) []int
+	customWeighing  func(src, dst []int) float64
+
 	vectorLength int
 }
 
 // The default value of each setting
 const (
-	VERSION                              = "0.4.3"
-	DEFAULT_ALGORITHM                    = "centrality"
+	VERSION                              = "0.4.4"
+	DEFAULT_ALGORITHM                    = "pagerank"
 	DEFAULT_WEIGHING                     = "hamming"
 	DEFAULT_DAMPING                      = 0.85
 	DEFAULT_TOLERANCE                    = 0.0001
@@ -68,26 +72,48 @@ func (bag *Bag) Set(m int, d, t, th, sth float64, alg, w string) {
 	bag.SentencesDistanceThreshold = sth
 }
 
+// SetDictionary sets bag.Dict to dict; Summarize builds its own dictionary only while bag.Dict is empty, so use this to supply one (example: from your database).
+// Dictionary is a map[string]int where the key is the word and int is the position in vector, starting from 1.
+func (bag *Bag) SetDictionary(dict map[string]int) {
+	bag.Dict = dict
+}
+// SetCustomAlgorithm stores f as the ranking function; when Algorithm is "custom", Summarize calls it with bag.Edges and assigns the result to bag.Ranks.
+func (bag *Bag) SetCustomAlgorithm(f func(e []*Edge) []int) {
+	bag.customAlgorithm = f
+}
+// SetCustomWeighing stores f as the edge weight function; the "custom" case in createEdges calls it with the source and destination node vectors.
+func (bag *Bag) SetCustomWeighing(f func(src, dst []int) float64) {
+	bag.customWeighing = f
+}
+
 // Summarize the text to num sentences
 func (bag *Bag) Summarize(text string, num int) (string, error) {
 	if len(text) < 1 {
 		return "", nil
 	}
 
-	bag.CreateSentences(text)
-	bag.CreateDictionary(text)
-	bag.CreateNodes()
-	bag.CreateEdges()
+	bag.createSentences(text)
+
+	// If the user has already provided a dictionary, skip creating one
+	if len(bag.Dict) < 1 {
+		bag.createDictionary(text)
+	}
+
+	bag.createNodes()
+	bag.createEdges()
 
 	switch bag.Algorithm {
 	case "centrality":
-		bag.Centrality()
+		bag.centrality()
 		break
 	case "pagerank":
-		bag.PageRank()
+		bag.pageRank()
+		break
+	case "custom":
+		bag.Ranks = bag.customAlgorithm(bag.Edges)
 		break
 	default:
-		bag.Centrality()
+		bag.pageRank()
 	}
 
 	// if no ranks, return error
@@ -104,7 +130,7 @@ func (bag *Bag) Summarize(text string, num int) (string, error) {
 	// get only top num of ranks
 	idx := bag.Ranks[:num]
 	// sort it ascending by how the sentences appeared on the original text
-	HeapSortInt(idx)
+	sort.Ints(idx)
 	var res string
 	for i, _ := range idx {
 		res += (bag.OriginalSentences[idx[i]] + " ")
@@ -137,7 +163,7 @@ type Rank struct {
 	score float64
 }
 
-func (bag *Bag) Centrality() {
+func (bag *Bag) centrality() {
 	// first remove edges under Threshold weight
 	var newEdges []*Edge
 	for _, edge := range bag.Edges {
@@ -146,7 +172,7 @@ func (bag *Bag) Centrality() {
 		}
 	}
 	// sort them by weight descending
-	HeapSortEdge(newEdges)
+	sort.Sort(ByWeight(newEdges))
 	ReverseEdge(newEdges)
 	rankBySrc := make([]int, len(newEdges))
 	for i, v := range newEdges {
@@ -165,7 +191,7 @@ func (bag *Bag) Centrality() {
 	bag.Ranks = uniq
 }
 
-func (bag *Bag) PageRank() {
+func (bag *Bag) pageRank() {
 	// first remove edges under Threshold weight
 	var newEdges []*Edge
 	for _, edge := range bag.Edges {
@@ -183,8 +209,8 @@ func (bag *Bag) PageRank() {
 	graph.Rank(bag.Damping, bag.Tolerance, func(sentenceIndex uint32, rank float64) {
 		ranks = append(ranks, &Rank{int(sentenceIndex), rank})
 	})
-	// sort ranks into an array of sentence index, by rank descending
-	HeapSortRank(ranks)
+	// sort ranks into an array of sentence index, by score descending
+	sort.Sort(ByScore(ranks))
 	ReverseRank(ranks)
 	idx := make([]int, len(ranks))
 	for i, v := range ranks {
@@ -200,7 +226,7 @@ type Edge struct {
 	weight float64 // weight of the similarity between two sentences
 }
 
-func (bag *Bag) CreateEdges() {
+func (bag *Bag) createEdges() {
 	for i, src := range bag.Nodes {
 		for j, dst := range bag.Nodes {
 			// don't compare same node
@@ -215,10 +241,12 @@ func (bag *Bag) CreateEdges() {
 					differentElements := SymmetricDifference(src.vector, dst.vector)
 					weight = float64(len(differentElements))
 					break
+				case "custom":
+					weight = bag.customWeighing(src.vector, dst.vector)
+					break
 				default:
-					// defaulted to jaccard
-					commonElements := Intersection(src.vector, dst.vector)
-					weight = 1.0 - float64(len(commonElements))/((float64(bag.vectorLength)*2)-float64(len(commonElements)))
+					differentElements := SymmetricDifference(src.vector, dst.vector)
+					weight = float64(len(differentElements))
 				}
 				edge := &Edge{i, j, weight}
 				bag.Edges = append(bag.Edges, edge)
@@ -243,7 +271,7 @@ type Node struct {
 	*/
 }
 
-func (bag *Bag) CreateNodes() {
+func (bag *Bag) createNodes() {
 	bag.vectorLength = len(bag.Dict)
 	for i, sentence := range bag.BagOfWordsPerSentence {
 		// vector length is len(dict)
@@ -266,7 +294,7 @@ func (bag *Bag) CreateNodes() {
 	}
 }
 
-func (bag *Bag) CreateSentences(text string) {
+func (bag *Bag) createSentences(text string) {
 	// trim all spaces
 	text = strings.TrimSpace(text)
 	// tokenize text as sentences
@@ -288,7 +316,7 @@ func (bag *Bag) CreateSentences(text string) {
 	UniqSentences(bag.BagOfWordsPerSentence, bag.SentencesDistanceThreshold)
 }
 
-func (bag *Bag) CreateDictionary(text string) {
+func (bag *Bag) createDictionary(text string) {
 	// trim all spaces
 	text = strings.TrimSpace(text)
 	// lowercase the text