// kba.thrift


/**
 * This set of thrift structures is analogous to the JSON schemas
 * defined in http://trec-kba.org/schemas/v1.0/
 * 
 * The comments below should be enough to interact with the text of
 * the corpus.  The JSON schemas contain additional details,
 * especially for the SourceMetadata, which is stored in the thrift as
 * a JSON string using the schemas linked below.
 */
namespace java kba
namespace py kba

/**
 * ContentItem is the thrift analog of
 * http://trec-kba.org/schemas/v1.0/content-item.json
 *
 * The JSON version has a 'stages' property that contains descriptions
 * **and also names** of additional properties on the ContentItem.
 * That was overly flexible.  Each content-item in the KBA corpus can
 * have a 'cleansed' and 'ner' property.  'cleansed' is generated from
 * 'raw', and 'ner' is generated from 'cleansed'.  Generally,
 * 'cleansed' is a tag-stripped version of 'raw', and 'ner' is the
 * output of a named entity recognizer that generates
 * one-word-per-line output.
 *
 * For the kba-stream-corpus-2012, the specific tag-stripping and NER
 * configurations were:
 *   'raw' --> boilerpipe 1.2.0 ArticleExtractor --> 'cleansed'
 *
 *   'cleansed' --> Stanford CoreNLP ver 1.2.0 with annotators
 *        {tokenize, cleanxml, ssplit, pos, lemma, ner}, property
 *        pos.maxlen=100 --> 'ner'
 */
struct ContentItem {
  // Original content bytes; 'cleansed' is generated from this field.
  1: binary          raw;

  // NOTE(review): presumably the character encoding of 'raw' --
  // confirm against content-item.json.
  2: string          encoding;

  // Generally a tag-stripped version of 'raw'.  May be absent.
  3: optional binary cleansed;

  // Named entity recognizer output generated from 'cleansed', in
  // one-word-per-line form.  May be absent.
  4: optional binary ner;
}

/**
 * SourceMetadata is a JSON string with one of these schemas
 *
 *  - http://trec-kba.org/schemas/v1.0/news-metadata.json
 *  - http://trec-kba.org/schemas/v1.0/linking-metadata.json
 *  - http://trec-kba.org/schemas/v1.0/social-metadata.json
 *
 * The schema that applies is selected by the string found in
 * CorpusItem.source, which is one of 'news', 'linking', or 'social'.
 *
 */
// Declared as binary (raw bytes) even though the payload is described
// as a JSON string.
// NOTE(review): the byte encoding of the JSON text is not stated here
// -- confirm before decoding.
typedef binary SourceMetadata

/**
 * CorpusItem is the thrift equivalent of
 * http://trec-kba.org/schemas/v1.0/corpus-item.json
 */
struct CorpusItem {
  // Document identifier; StreamItem.stream_id is built from this value
  // together with the stream time.
  1: string         doc_id;

  // NOTE(review): presumably the absolute form of the document URL,
  // carried as raw bytes -- confirm against corpus-item.json.
  2: binary         abs_url;

  // NOTE(review): meaning of 'schost' is not documented in this file
  // -- confirm against corpus-item.json.
  3: string         schost;

  // NOTE(review): presumably the URL as originally seen, carried as
  // raw bytes -- confirm against corpus-item.json.
  4: binary         original_url;

  // One of 'news', 'linking', or 'social'; selects the JSON schema
  // used by source_metadata.
  5: string         source;

  // Content parts of the document, each with its own raw/cleansed/ner
  // stages.
  6: ContentItem    title;
  7: ContentItem    body;
  8: ContentItem    anchor;

  // JSON string whose schema depends on 'source'.
  9: SourceMetadata source_metadata;
}

/**
 * StreamTime is a timestamp measured in seconds since the 1970 epoch.
 * 'news', 'linking', and 'social' each have slightly different ways
 * of generating these timestamps.  See details:
 * http://trec-kba.org/kba-stream-corpus-2012.shtml
 */
struct StreamTime {
  // Seconds since the 1970 epoch, as a floating-point value.
  1: double epoch_ticks;

  // NOTE(review): presumably the same instant rendered as a UTC
  // ("Zulu") timestamp string; the exact format is not specified in
  // this file -- confirm against the corpus documentation.
  2: string zulu_timestamp;
}

/**
 * This is the primary interface to the data.  StreamItem is the
 * thrift equivalent of
 * http://trec-kba.org/schemas/v1.0/stream-item.json 
 * 
 * which extends corpus-item.json.  For better or worse, thrift does
 * not support inheritance on structs, so this copies all nine fields
 * of CorpusItem and then adds two more fields.
 */
struct StreamItem {
  // Fields 1-9 repeat the fields of CorpusItem with the same ids,
  // types, and names.
  1:  string         doc_id;
  2:  binary         abs_url;
  3:  string         schost;
  4:  binary         original_url;
  // One of 'news', 'linking', or 'social'.
  5:  string         source;
  6:  ContentItem    title;
  7:  ContentItem    body;
  8:  ContentItem    anchor;
  // JSON string whose schema depends on 'source'.
  9:  SourceMetadata source_metadata;

  // Fields added on top of the CorpusItem fields.
  //
  // stream_id is the actual unique identifier for the stream corpus,
  // stream_id = '%d-%s' % (int(stream_time.epoch_ticks), doc_id)
  10: string         stream_id;

  // Timestamp of this item in the stream; its epoch_ticks value is
  // the one used to build stream_id.
  11: StreamTime     stream_time;
}