# trainParams.properties

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sample machine learning properties file
# Choose between MAXENT and PERCEPTRON
Algorithm=PERCEPTRON
Iterations=500
Cutoff=0
Threads=4

##################################################
#### Custom parameters added by ixa-pipe-nerc ####
##################################################

# Languages supported: de, en, es, eu, it, nl
Language=en

# ClearAdaptiveFeatures: ONLY WORKS with CONLL formats!!
# Specify if adaptive features are cleared in the training and/or evaluation data.
# Options are: 'yes', 'no', 'docstart'. The first two will reset the
# features every sentence whereas the 'docstart' option will look for -DOCSTART-
# marks in the data to clear the adaptive features.
# Cross-validation only works if ClearTrainingFeatures is set to 'yes'.
# If commented out, both values default to 'no'.
ClearTrainingFeatures=yes
ClearEvaluationFeatures=no

# TrainSet / TestSet: paths to the training and test corpora.
TrainSet=/home/ragerri/experiments/nerc/conll03/eng.train
TestSet=/home/ragerri/experiments/nerc/conll03/eng.testb

# CorpusFormat: format of the training corpus.
# Options are: conll02, conll03, opennlp
# NOTE(review): the corpus paths above are under a conll03/ directory while
# CorpusFormat is set to conll02 -- confirm that this is intended.
CorpusFormat=conll02

# OutputModel: if commented out, ixa-pipe-nerc will save the model with the
# name of this properties file
OutputModel=trainParams.bin

# Named Entity types: if not active, all NE types in the training corpus are
# used. Otherwise, separate them with commas, e.g.,
# location,organization,person,misc.
# NOTE: the name of the NE type needs to be exact; namely, if B-ORG appears in
# the corpus, then the parameter needs to contain ORG, not organization, and
# so on.
#Types=location,organization,person,misc

# Beamsize 1 amounts to greedy search
BeamSize=3

# Sequence codec used to code named entity spans: Choose between BIO and BILOU.
# If commented out, it defaults to BILOU.
#SequenceCodec=BIO

##################
#### FEATURES ####
##################

# Window: left and right window range from the current token. TokenFeatures
# and TokenClassFeatures depend on the window range specified here. If
# commented out, it will default to 2:2.
Window=2:2

# TokenFeatures: include current token (both in original and lowercase form)
TokenFeatures=yes

# TokenClassFeatures: include token shape features (capitalization, digits,
# etc.); see the TokenClassFeatureGenerator class for details.
TokenClassFeatures=yes

# WordShapeSuperSenseFeatures: token shape features as implemented by
# Ciaramita and Altun (2006).
WordShapeSuperSenseFeatures=yes

# OutcomePriorFeatures: maps the underlying previous outcomes
OutcomePriorFeatures=yes

# PreviousMapFeatures: takes into account previous decisions and adds them as
# features
PreviousMapFeatures=yes

# SentenceFeatures: add first and last words of sentence as features.
SentenceFeatures=yes

# PrefixFeatures: takes the first 3 and the first 4 characters of the current
# token as features.
PrefixFeatures=yes

# SuffixFeatures: takes last 4 characters of current token as feature.
SuffixFeatures=yes

# BigramClassFeatures: adds bigram features based on tokens and their class
# features.
BigramClassFeatures=yes

# TrigramClassFeatures: add trigram features based on tokens and their class
# features.
TrigramClassFeatures=no

# FourgramClassFeatures: add fourgram features based on tokens and their
# class features.
FourgramClassFeatures=no

# FivegramClassFeatures: add fivegram features based on tokens and their class
# features.
FivegramClassFeatures=no

# CharNgramFeatures: min and maximum length for character ngrams of current
# token. If value is yes, specify the desired range in CharNgramFeaturesRange.
# If Range is commented out, it defaults to 2:5 when this feature is "yes".
CharNgramFeatures=no
CharNgramFeaturesRange=2:5

# DictionaryFeatures: add features if the token is found in some gazetteers.
# Comment it out to deactivate this feature. Note that every file in the
# directory provided as parameter will be taken to be a dictionary. The
# dictionary format needs to be 'named entity\tclass' (tab-separated).
DictionaryFeatures=/home/ragerri/javacode/ixa-pipe-nerc/nerc-resources/en/dictionaries

# BrownClusterFeatures: add features using Brown clusters
# Comment it out to deactivate this feature. NOTE: you can add multiple
# clustering lexicons by chaining them with a comma.
BrownClusterFeatures=/home/ragerri/javacode/ixa-pipe-nerc/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt

# ClarkClusterFeatures: add features using Clark (2003) clusters. If value is uncommented,
# specify the location of the clustering lexicon in Clark format. NOTE: you can add multiple
# clustering lexicons by chaining them with a comma.
ClarkClusterFeatures=/home/ragerri/resources/reuters-rcv1/clark/reuters-rcv1.tok.punct.lower.300

# Word2VecClusterFeatures: add features using word2vec clusters. If value is
# uncommented, specify the location of the clustering lexicon in word2vec format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
Word2VecClusterFeatures=/home/ragerri/clusters.large.txt

############################
## Morphological Features ##
############################

# NOTE(review): MorphoFeatures, MFSFeatures and SuperSenseFeatures are all
# active below, although the MFSFeatures and SuperSenseFeatures descriptions
# say they must not be combined with each other or with MorphoFeatures.
# Presumably only one of the three should be left uncommented -- confirm
# which one is intended before training with this file.

# Morpho Features: add pos tag and lemma as a feature. It is required to provide
# the ixa-pipe-pos model and a plain text word\tlemma\tpostag dictionary for it to work.
# MorphoFeaturesRange specifies the combination of features to be used: options are 'pos',
# 'posclass' and 'lemma' in that strict order. For example, if 'pos,posclass,lemma' is
# chosen then all three types of features will be used. If 'pos,no,no' is chosen, then
# only pos tag features are active. If 'pos,no,lemma' then pos tag and lemma features,
# and so on.
# If MorphoFeatures is commented out, none of these features are used.
MorphoFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/javacode/ixa-pipe-pos/src/main/resources/lemmatizer-dicts/freeling/en-lemmatizer.dict
MorphoFeaturesRange=pos,posclass,lemma

# MFSFeatures: add Most Frequent Sense as features.
# It is required to provide an ixa-pipe-pos model, a plain text word\tlemma\tpostag
# dictionary and a lexicon containing the most frequent sense information, where
# each entry is of the form word#pos\tfreq#sense. For example, house#n\t1098#noun.artifact.
# These features include Morphological and SuperSense features; therefore,
# DO NOT COMBINE THEM with MORPHO OR SUPERSENSE FEATURES!!!
MFSFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models-1.3.0/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/freeling/en-lemmatizer.txt,/home/ragerri/resources/supersense/supersenses.wn20
MFSFeaturesRange=pos,posclass,lemma,mfs,monosemic

# SuperSenseFeatures: add Ciaramita and Altun (2006) super sense tagging features.
# It is required to provide an ixa-pipe-pos model, a plain text word\tlemma\tpostag
# dictionary and a lexicon containing the most frequent sense information, where
# each entry is of the form word#pos\tfreq#sense. For example, house#n\t1098#noun.artifact.
# These features include Morphological and MFS features; therefore,
# DO NOT COMBINE THEM with MORPHO OR MFS FEATURES!!!
SuperSenseFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models-1.3.0/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/freeling/en-lemmatizer.txt,/home/ragerri/resources/supersense/supersenses.wn20
SuperSenseFeaturesRange=mfs,monosemic

#####################################
#### CROSS VALIDATION PARAMETERS ####
#####################################

# Cross Validation Folds; if commented out it defaults to 10 cross validation
# folds.
Folds=5
# Evaluation type: choose between 'detailed' and 'error'; only for cross-validation.
# It defaults to detailed evaluation.
EvaluationType=detailed