-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikiassignment.go
116 lines (98 loc) · 3.25 KB
/
wikiassignment.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// Package wikiassignment provides utility functions for automatically assigning Wikipedia pages to topics.
package wikiassignment
import (
"context"
"io"
"github.com/negapedia/wikiassignment/nationalization"
"github.com/negapedia/wikidump"
"github.com/RoaringBitmap/roaring"
"github.com/ebonetti/absorbingmarkovchain"
)
// From transforms the semantic graph built from the latest Wikipedia dump of
// the given language into a page-topic assignment.
//
// tmpDir is handed to wikidump.Latest and to the absorbing Markov chain; lang
// selects both the dump and the nationalization data. The topic seeds are the
// category IDs listed for every nationalization topic, and the nationalization
// filter IDs are applied as a single non-whitelist Filter with Dept 1.
//
// On success page2Topic maps every assigned page ID to a topic ID (each topic
// is mapped to itself) and namespaces lists the IDs of the topic, category and
// article nodes of the graph. On failure page2Topic is nil, namespaces is the
// zero value and err is non-nil.
func From(ctx context.Context, tmpDir, lang string) (page2Topic map[uint32]uint32, namespaces struct{ Topics, Categories, Articles []uint32 }, err error) {
	latestDump, err := wikidump.Latest(tmpDir, lang, "metahistory7zdump", "pagetable", "redirecttable", "categorylinkstable", "pagelinkstable")
	if err != nil {
		return nil, namespaces, err
	}

	// dumps opens the named dump table and exposes it as CSV; closing the
	// returned reader closes the underlying raw dump reader.
	dumps := func(name string) (io.ReadCloser, error) {
		rawReader, err := latestDump.Open(name)(ctx)
		if err != nil {
			return nil, err
		}
		return readClose{wikidump.SQL2CSV(rawReader), rawReader.Close}, nil
	}

	// Named nat (not nationalization) so the package identifier is not shadowed.
	nat, err := nationalization.New(lang)
	if err != nil {
		return nil, namespaces, err
	}

	topicAssignments := map[uint32][]uint32{}
	for _, t := range nat.Topics {
		for _, p := range t.Categories {
			topicAssignments[t.ID] = append(topicAssignments[t.ID], p.ID)
		}
	}

	filters := []uint32{}
	for _, p := range nat.Filters {
		filters = append(filters, p.ID)
	}

	data := amcData{}
	chain := chainFrom(ctx, tmpDir, SemanticGraphSources{dumps, topicAssignments, []Filter{{false, filters, 1}}}, &data)
	// chainFrom reports a build failure through data.err and returns a nil
	// chain in that case, so the error must be checked before the chain is used.
	if data.err != nil {
		return nil, namespaces, data.err
	}

	page2Topic, err = chain.AbsorptionAssignments(ctx)
	if err != nil {
		return nil, namespaces, err
	}

	namespaces.Topics = data.namespace2Ids[TopicNamespaceID].ToArray()
	namespaces.Categories = data.namespace2Ids[CategoryNamespaceID].ToArray()
	namespaces.Articles = data.namespace2Ids[ArticleNamespaceID].ToArray()

	// Topics are the absorbing nodes: make each of them point to itself.
	for _, t := range namespaces.Topics {
		page2Topic[t] = t
	}
	return page2Topic, namespaces, nil
}
// Filter represents a filter to be applied to the semantic graph before it is
// transformed into an assignment.
type Filter struct {
// IsWhitelist presumably selects whitelist (true) versus blacklist (false)
// semantics; the code interpreting it is not in this file — TODO confirm.
IsWhitelist bool
// Parents holds the page IDs the filter is anchored on; From fills it with
// the IDs of the nationalization filters.
Parents []uint32
// Dept is presumably a depth limit relative to Parents ("Dept" looks like a
// truncation of "Depth", kept as is for API compatibility) — TODO confirm.
Dept int
}
// amcData carries the side results of chainFrom back to its caller: the graph
// build error, if any, and the node IDs grouped by namespace.
type amcData struct {
// err is set by chainFrom when building the semantic graph fails.
err error
// namespace2Ids maps a namespace ID (e.g. TopicNamespaceID) to the bitmap of
// node IDs belonging to that namespace; set by chainFrom on a successful build.
namespace2Ids map[int]*roaring.Bitmap
}
// chainFrom builds the semantic graph described by d and wraps it into an
// absorbing Markov chain whose absorbing nodes are the topic nodes.
//
// If the build fails, the error is stored in amcd.err and nil is returned.
// Otherwise amcd.namespace2Ids receives the per-namespace node sets and the
// returned chain spans the union of all of them.
func chainFrom(ctx context.Context, tmpDir string, d SemanticGraphSources, amcd *amcData) *absorbingmarkovchain.AbsorbingMarkovChain {
	graph, catDistance, idsByNamespace, err := d.Build(ctx)
	if err != nil {
		amcd.err = err
		return nil
	}
	amcd.namespace2Ids = idsByNamespace

	topics := idsByNamespace[TopicNamespaceID]
	articles := idsByNamespace[ArticleNamespaceID]

	// The chain's node set is the union of every namespace's IDs.
	allNodes := roaring.NewBitmap()
	for _, ids := range idsByNamespace {
		allNodes.Or(ids)
	}

	successors := func(from uint32) []uint32 { return graph[from] }

	// linkWeight yields a weight <= 1 for the edge from -> to.
	linkWeight := func(from, to uint32) (float64, error) {
		if articles.Contains(to) {
			// Penalized link (this link was added by pagelink).
			return 1.0 / 200, nil
		}
		// Valuable link (this link was added by categorylink).
		// gap is expected to be non-negative; the weight is 1 iff gap == 0.
		gap := catDistance[to] + 1 - catDistance[from]
		return 1 / float64(1+10*gap), nil
	}

	return absorbingmarkovchain.New(tmpDir, allNodes, topics, successors, linkWeight)
}
// readClose turns an io.Reader into an io.ReadCloser by pairing it with a
// separately supplied close function.
type readClose struct {
	io.Reader
	// Closer is invoked by Close; it must be non-nil.
	Closer func() error
}

// Close runs the stored Closer and returns whatever it reports.
func (rc readClose) Close() error {
	closeFn := rc.Closer
	return closeFn()
}