forked from trec-kba/kba-corpus
-
Notifications
You must be signed in to change notification settings - Fork 0
/
kba.thrift
105 lines (99 loc) · 3.19 KB
/
kba.thrift
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/**
* This set of thrift structures is analogous to the JSON schemas
* defined in http://trec-kba.org/schemas/v1.0/
*
* The comments below should be enough to interact with the text of
* the corpus. The JSON schemas contain additional details,
* especially for the SourceMetadata, which is stored in the thrift as
* a JSON string using the schemas linked below.
*/
namespace java kba
namespace py kba
/**
* ContentItem is the thrift analog of
* http://trec-kba.org/schemas/v1.0/content-item.json
*
* The JSON version has a 'stages' property that contains descriptions
* **and also names** of additional properties on the ContentItem.
* That was overly flexible. Each content-item in the KBA corpus can
* have a 'cleansed' and 'ner' property. 'cleansed' is generated from
* 'raw', and 'ner' is generated from 'cleansed.' Generally,
* 'cleansed' is a tag-stripped version of 'raw', and 'ner' is the
* output of a named entity recognizer that generates
* one-word-per-line output.
*
* For the kba-stream-corpus-2012, the specific tag-stripping and NER
* configurations were:
* 'raw' --> boilerpipe 1.2.0 ArticleExtractor --> 'cleansed'
*
* 'cleansed' --> Stanford CoreNLP ver 1.2.0 with annotators
* {tokenize, cleanxml, ssplit, pos, lemma, ner}, property
* pos.maxlen=100 --> 'ner'
*/
struct ContentItem {
// One piece of content (e.g. a title or body) in up to three forms:
// the source bytes plus two optional derived forms.

// Source bytes from which 'cleansed' is generated.
1: binary raw,
// NOTE(review): presumably names the character encoding of 'raw' --
// not described in this file; confirm against content-item.json.
2: string encoding,
// Optional. Generated from 'raw'; generally a tag-stripped version of it.
3: optional binary cleansed,
// Optional. Generated from 'cleansed'; named entity recognizer output
// in one-word-per-line form.
4: optional binary ner,
}
/**
* SourceMetadata is a JSON string with one of these schemas
*
* - http://trec-kba.org/schemas/v1.0/news-metadata.json
* - http://trec-kba.org/schemas/v1.0/linking-metadata.json
* - http://trec-kba.org/schemas/v1.0/social-metadata.json
*
* where the schema that applies is selected by the string found in
* CorpusItem.source: 'news', 'linking', or 'social'
*
*/
typedef binary SourceMetadata // declared as binary: the JSON text is carried as bytes
/**
* CorpusItem is the thrift equivalent of
* http://trec-kba.org/schemas/v1.0/corpus-item.json
*/
struct CorpusItem {
// One document of the corpus: identifiers/URLs, a source label, three
// ContentItem parts, and source-specific metadata.

// NOTE(review): the meanings of doc_id, abs_url, schost and original_url
// are not described in this file -- confirm against corpus-item.json.
1: string doc_id,
2: binary abs_url,
3: string schost,
4: binary original_url,
// 'news', 'social', or 'linking'; selects the schema of source_metadata.
5: string source,
// Each part is a ContentItem: raw bytes plus optional cleansed/ner forms.
6: ContentItem title,
7: ContentItem body,
8: ContentItem anchor,
// JSON string following the metadata schema that matches 'source'.
9: SourceMetadata source_metadata,
}
/**
* StreamTime is a timestamp measured in seconds since the 1970 epoch.
* 'news', 'linking', and 'social' each have slightly different ways
* of generating these timestamps. See details:
* http://trec-kba.org/kba-stream-corpus-2012.shtml
*/
struct StreamTime {
// Timestamp in seconds since the 1970 epoch, stored as a double.
1: double epoch_ticks,
// NOTE(review): presumably the same instant rendered as a UTC ("Zulu")
// string; the exact format is not specified in this file -- confirm.
2: string zulu_timestamp,
}
/**
* This is the primary interface to the data. StreamItem is the
* thrift equivalent of
* http://trec-kba.org/schemas/v1.0/stream-item.json
*
* which extends corpus-item.json. For better or worse, thrift does
* not support inheritance for structs, so this copies the first nine
* fields of CorpusItem and then adds two more fields.
*/
struct StreamItem {
// Fields 1-9 repeat CorpusItem's fields with the same ids, names and
// types; keep the two structs in sync when either changes.
1: string doc_id,
2: binary abs_url,
3: string schost,
4: binary original_url,
// 'news', 'social', or 'linking'; selects the schema of source_metadata.
5: string source,
6: ContentItem title,
7: ContentItem body,
8: ContentItem anchor,
// JSON string following the metadata schema that matches 'source'.
9: SourceMetadata source_metadata,
// Fields 10-11 are the stream-specific additions.
// stream_id is the actual unique identifier for the stream corpus,
// stream_id = '%d-%s' % (int(stream_time.epoch_ticks), doc_id)
10: string stream_id,
// Timestamp of this item; its epoch_ticks feeds the stream_id above.
11: StreamTime stream_time,
}