forked from ixa-ehu/ixa-pipe-nerc
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrainParams.properties
188 lines (152 loc) · 8.3 KB
/
trainParams.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Sample machine learning properties file
# Choose between MAXENT and PERCEPTRON
Algorithm=PERCEPTRON
Iterations=500
Cutoff=0
Threads=4
##################################################
#### Custom parameters added by ixa-pipe-nerc ####
##################################################
# Languages supported: de, en, es, eu, it, nl
Language=en
# ClearAdaptiveFeatures: ONLY WORKS with CONLL formats!!
# Specify if adaptive features are cleared in the training and/or evaluation data.
# Options are: 'yes', 'no', 'docstart'. The first two will reset the
# features every sentence whereas the 'docstart' option will look for -DOCSTART-
# marks in the data to clear the adaptive features.
# Crossvalidation only works if ClearTrainingFeatures is set to 'yes'.
# If commented out both values default to 'no'
ClearTrainingFeatures=yes
ClearEvaluationFeatures=no
# TrainingCorpus: paths to the training set (TrainSet) and the test set (TestSet).
# NOTE(review): these are user-specific absolute paths; adjust them to your
# environment before training.
TrainSet=/home/ragerri/experiments/nerc/conll03/eng.train
TestSet=/home/ragerri/experiments/nerc/conll03/eng.testb
# CorpusFormat: format of the training corpus. Options are: conll02, conll03,
# opennlp.
# NOTE(review): TrainSet and TestSet above live in a 'conll03' directory, yet the
# format selected here is conll02 -- confirm that the files really are in
# conll02 format.
CorpusFormat=conll02
# OutputModel: if commented out, ixa-pipe-nerc will save the model with the
# name of this properties file
OutputModel=trainParams.bin
# Named Entity types: if commented out, all NE types found in the training
# corpus are used. Otherwise, separate the types with commas, e.g.,
# location,organization,person,misc.
# NOTE: the name of each NE type must match the corpus exactly; for example, if
# B-ORG appears in the corpus, the parameter must contain ORG, not organization,
# and so on.
#Types=location,organization,person,misc
# Beamsize 1 amounts to greedy search
BeamSize=3
# Sequence codec used to code named entity spans: Choose between BIO and BILOU.
# If commented out, it defaults to BILOU.
#SequenceCodec=BIO
##################
#### FEATURES ####
##################
# Window: left and right window range from the current token. TokenFeatures
# and TokenClassFeatures depend on the window range specified here. If
# commented out, it will default to 2:2.
Window=2:2
# TokenFeatures: include current token (both in original and lowercase form)
TokenFeatures=yes
# TokenClassFeatures: include token shape features (capitalization, digits,
# etc.; see the TokenClassFeatureGenerator class for details).
TokenClassFeatures=yes
# WordShapeSuperSenseFeatures: token shape features as implemented by
# Ciaramita and Altun (2006).
WordShapeSuperSenseFeatures=yes
# OutcomePriorFeatures: maps the underlying previous outcomes
OutcomePriorFeatures=yes
# PreviousMapFeatures: takes into account previous decisions and adds them as
# features
PreviousMapFeatures=yes
# SentenceFeatures: add first and last words of sentence as features.
SentenceFeatures=yes
# PrefixFeatures: takes the first 3 and 4 characters of current token as features.
PrefixFeatures=yes
# SuffixFeatures: takes last 4 characters of current token as feature.
SuffixFeatures=yes
# BigramClassFeatures: adds bigram features based on tokens and their class
# features.
BigramClassFeatures=yes
# TrigramClassFeatures: add trigram features based on tokens and their class
# features.
TrigramClassFeatures=no
# FourgramClassFeatures: add fourgram features based on tokens and their
# class features.
FourgramClassFeatures=no
# FivegramClassFeatures: add fivegram features based on tokens and their class
# features.
FivegramClassFeatures=no
# CharNgramFeatures: min and maximum length for character ngrams of current
# token. If value is yes, specify the desired range in CharNgramFeaturesRange.
# If Range is commented out, it defaults to 2:5 when this feature is "yes".
CharNgramFeatures=no
CharNgramFeaturesRange=2:5
# DictionaryFeatures: add features if token found in some gazetteers. Comment
# it out to deactivate this feature. Note that every file in the directory
# provided as parameter will be taken to be a dictionary. The dictionary format
# needs to be 'named entity\tclass' (tab-separated).
DictionaryFeatures=/home/ragerri/javacode/ixa-pipe-nerc/nerc-resources/en/dictionaries
# BrownClusterFeatures: add features using Brown clusters
# Comment it out to deactivate this feature. NOTE: you can add multiple
# clustering lexicons by chaining them with a comma.
BrownClusterFeatures=/home/ragerri/javacode/ixa-pipe-nerc/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt
# ClarkClusterFeatures: add features using Clark (2003) clusters. If value is uncommented,
# specify the location of the clustering lexicon in Clark format. NOTE: you can add multiple
# clustering lexicons by chaining them with a comma.
ClarkClusterFeatures=/home/ragerri/resources/reuters-rcv1/clark/reuters-rcv1.tok.punct.lower.300
# Word2VecClusterFeatures: add features using word2vec clusters. If value is
# uncommented, specify the location of the clustering lexicon in word2vec format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
Word2VecClusterFeatures=/home/ragerri/clusters.large.txt
############################
## Morphological Features ##
############################
# Morpho Features: add pos tag and lemma as a feature. It is required to provide
# the ixa-pipe-pos model and a plain text word\tlemma\tpostag dictionary for it to work.
# MorphoFeaturesRange specifies the combination of features to be used: options are 'pos',
# 'posclass' and 'lemma' in that strict order. For example, if 'pos,posclass,lemma' is
# chosen then all three types of features will be used. If 'pos,no,no' is chosen, then
# only pos tag features are active. If 'pos,no,lemma' then pos tag and lemma features,
# and so on.
# If MorphoFeatures is commented out, none of these features are used.
MorphoFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/javacode/ixa-pipe-pos/src/main/resources/lemmatizer-dicts/freeling/en-lemmatizer.dict
MorphoFeaturesRange=pos,posclass,lemma
# MFSFeatures: add Most Frequent Sense as features.
# It is required to provide an ixa-pipe-pos model, a plain text word\tlemma\tpostag
# dictionary and a lexicon containing the most frequent sense information, where
# each entry is of the form word#pos\tfreq#sense. For example, house#n\t1098#noun.artifact.
# The value is a comma-separated list of these three resources.
# These features include Morphological and SuperSense features, therefore,
# DO NOT COMBINE THEM with MORPHO OR SUPERSENSE FEATURES!!!
# NOTE(review): MorphoFeatures and SuperSenseFeatures are both active in this
# file, which contradicts the warning above -- confirm which of the three is
# intended and comment out the others.
MFSFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models-1.3.0/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/freeling/en-lemmatizer.txt,/home/ragerri/resources/supersense/supersenses.wn20
MFSFeaturesRange=pos,posclass,lemma,mfs,monosemic
# SuperSenseFeatures: add Ciaramita and Altun (2006) super sense tagging features.
# It is required to provide an ixa-pipe-pos model, a plain text word\tlemma\tpostag
# dictionary and a lexicon containing the most frequent sense information, where
# each entry is of the form word#pos\tfreq#sense. For example, house#n\t1098#noun.artifact.
# The value is a comma-separated list of these three resources.
# These features include Morphological and MFS features, therefore,
# DO NOT COMBINE THEM with MORPHO OR MFS FEATURES!!!
# NOTE(review): MorphoFeatures and MFSFeatures are both active in this file,
# which contradicts the warning above -- confirm which of the three is intended
# and comment out the others.
SuperSenseFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models-1.3.0/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/freeling/en-lemmatizer.txt,/home/ragerri/resources/supersense/supersenses.wn20
SuperSenseFeaturesRange=mfs,monosemic
#####################################
#### CROSS VALIDATION PARAMETERS ####
#####################################
# Cross Validation Folds; if commented out it defaults to 10 cross validation
# folds.
Folds=5
# Evaluation type: choose between 'detailed' and 'error'; only for cross-validation.
# It defaults to detailed evaluation.
EvaluationType=detailed