// feature_index.h -- forked from taku910/crfpp
// (129 lines, 110 loc, 3.62 KB)
//
// CRF++ -- Yet Another CRF toolkit
//
// $Id: feature_index.h 1588 2007-02-12 09:03:39Z taku $;
//
// Copyright(C) 2005-2007 Taku Kudo <taku@chasen.org>
//
#ifndef CRFPP_FEATURE_INDEX_H_
#define CRFPP_FEATURE_INDEX_H_
#include <vector>
#include <map>
#include <iostream>
#include "common.h"
#include "scoped_ptr.h"
#include "feature_cache.h"
#include "path.h"
#include "node.h"
#include "freelist.h"
#include "mmap.h"
#include "darts.h"
namespace CRFPP {
class TaggerImpl;
// Bundles the storage helpers declared below: a FeatureCache, a FreeList of
// characters, and arrays of FreeLists for Path and Node objects.
//
// NOTE(review): the member functions are only declared here; their behaviour
// is defined in the corresponding .cpp file.  The thread_id parameters and
// the scoped_array members suggest one Path/Node pool per worker thread --
// confirm against the implementation.
class Allocator {
 public:
  // Construction / destruction.
  Allocator();
  explicit Allocator(size_t thread_num);
  virtual ~Allocator();

  // Accessors.
  size_t thread_num() const;
  FeatureCache *feature_cache() const;

  // Allocation helpers.  Presumably the returned objects are carved out of
  // the free lists owned by this allocator and must not be deleted by the
  // caller -- verify in the implementation.
  char *strdup(const char *str);
  Path *newPath(size_t thread_id);
  Node *newNode(size_t thread_id);

  // Reset helpers: clear() takes no thread id, clear_freelist() takes one.
  void clear();
  void clear_freelist(size_t thread_id);

 private:
  // Shared set-up routine (presumably called by both constructors).
  void init();

  // Data members.  Declaration order is kept as-is because it determines
  // construction and destruction order.
  size_t thread_num_;
  scoped_ptr<FeatureCache> feature_cache_;
  scoped_ptr<FreeList<char> > char_freelist_;
  scoped_array<FreeList<Path> > path_freelist_;
  scoped_array<FreeList<Node> > node_freelist_;
};
// Abstract base class shared by EncoderFeatureIndex and DecoderFeatureIndex.
//
// It stores the feature templates, the list of output strings (y_), the
// weight arrays (alpha_ / alpha_float_) and the cost factor, and declares
// the routines that build features for a TaggerImpl and compute Node / Path
// costs.  Derived classes supply getID(), which maps a feature string to an
// integer id.
class FeatureIndex {
 public:
  // Model format version, taken from the MODEL_VERSION macro.
  static const unsigned int version = MODEL_VERSION;

  // Zero-initialises every scalar member; cost_factor_ starts at 1.0.
  // (No `explicit`: it has no useful effect on a zero-argument constructor.)
  FeatureIndex()
      : maxid_(0),
        alpha_(0),
        alpha_float_(0),
        cost_factor_(1.0),
        xsize_(0),
        check_max_xsize_(false),
        max_xsize_(0) {}
  virtual ~FeatureIndex() {}

  // Returns maxid_.
  size_t size() const { return maxid_; }
  // Returns xsize_ (presumably the number of input columns -- confirm).
  size_t xsize() const { return xsize_; }
  // Returns the number of entries in y_.
  size_t ysize() const { return y_.size(); }
  // Returns the i-th entry of y_ as a C string.  No bounds check is done;
  // the pointer stays valid only while that string is not modified.
  const char *y(size_t i) const { return y_[i].c_str(); }

  // Stores the pointer only (no copy): the caller must keep the array alive
  // for as long as this index uses it.
  void set_alpha(const double *alpha) { alpha_ = alpha; }
  const double *alpha() const { return alpha_; }
  // Read-only accessor; const-qualified so that it is callable through a
  // const FeatureIndex, consistent with alpha().
  const float *alpha_float() const { return alpha_float_; }

  void set_cost_factor(double cost_factor) { cost_factor_ = cost_factor; }
  double cost_factor() const { return cost_factor_; }

  // Cost computation for a lattice node / path (defined in the .cpp file).
  void calcCost(Node *node) const;
  void calcCost(Path *path) const;

  // Feature construction for a tagger (defined in the .cpp file).
  bool buildFeatures(TaggerImpl *tagger) const;
  void rebuildFeatures(TaggerImpl *tagger) const;

  // Returns the text held by what_.
  // NOTE(review): left non-const because whatlog::str() is declared in
  // another header and may itself be non-const -- confirm before changing.
  const char *what() { return what_.str(); }

  const char *getTemplate() const;

 protected:
  // Maps a feature string to an integer id; implemented by derived classes.
  virtual int getID(const char *str) const = 0;

  // Template-expansion helpers (defined in the .cpp file).  getIndex takes
  // the pattern cursor `p` by reference, so it may advance it.
  const char *getIndex(const char *&p,
                       size_t pos,
                       const TaggerImpl &tagger) const;
  bool applyRule(string_buffer *os,
                 const char *pattern,
                 size_t pos,
                 const TaggerImpl &tagger) const;

  // Declaration order below is significant: it must match the constructor's
  // initialiser list.
  mutable unsigned int maxid_;      // mutable: writable from const members
  const double *alpha_;             // not owned, see set_alpha()
  const float *alpha_float_;
  double cost_factor_;
  unsigned int xsize_;
  bool check_max_xsize_;
  mutable unsigned int max_xsize_;  // mutable: writable from const members
  std::vector<std::string> unigram_templs_;
  std::vector<std::string> bigram_templs_;
  std::vector<std::string> y_;
  std::string templs_;
  whatlog what_;
};
// FeatureIndex variant that keeps its string-to-id table in a std::map
// (dic_).  NOTE(review): presumably the training-side index, given the
// save / convert / shrink operations -- the bodies live in the .cpp file.
class EncoderFeatureIndex : public FeatureIndex {
 public:
  // Opens the index from a template file and a model file.
  bool open(const char *template_filename, const char *model_filename);

  // Writes the index to `filename`; `emit_textmodelfile` presumably requests
  // an additional text dump -- confirm in the implementation.
  bool save(const char *filename, bool emit_textmodelfile);

  // Converts the model in `text_filename` into `binary_filename`.
  bool convert(const char *text_filename, const char *binary_filename);

  // NOTE(review): presumably drops features seen fewer than `freq` times and
  // renumbers the remainder -- confirm in the implementation.
  void shrink(size_t freq, Allocator *allocator);

 private:
  // Overrides the pure virtual FeatureIndex::getID.
  virtual int getID(const char *str) const;

  bool openTemplate(const char *filename);
  bool openTagSet(const char *filename);

  // Feature string -> (int, unsigned int) pair; presumably (id, frequency).
  // Declared mutable so that the const getID() is able to update it.
  mutable std::map<std::string, std::pair<int, unsigned int> > dic_;
};
// FeatureIndex variant whose members are an Mmap<char> and a
// Darts::DoubleArray.  NOTE(review): presumably the read-only, decode-time
// index backed by a model file or an in-memory buffer -- the bodies live in
// the .cpp file.
class DecoderFeatureIndex : public FeatureIndex {
 public:
  // Opens the index from the model file at `model_filename`.
  bool open(const char *model_filename);

  // Opens the index from a buffer of `size` bytes starting at `buf`.
  // NOTE(review): whether the buffer is copied or merely referenced is not
  // visible here -- confirm before freeing `buf` early.
  bool openFromArray(const char *buf, size_t size);

 private:
  // Overrides the pure virtual FeatureIndex::getID.
  virtual int getID(const char *str) const;

  // Relative order of the data members is kept unchanged.
  Mmap<char> mmap_;
  Darts::DoubleArray da_;
};
}
#endif