// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph;
import java.util.HashMap;
import java.util.Map;
import marmot.core.Options;
import marmot.util.StringUtils.Mode;
public class MorphOptions extends Options {
private static final long serialVersionUID = 1L;
public static final String TRAIN_FILE = "train-file";
public static final String TEST_FILE = "test-file";
public static final String RARE_WORD_MAX_FREQ = "rare-word-max-freq";
public static final String SHAPE_TRIE_PATH = "shape-trie-path";
public static final String MODEL_FILE = "model-file";
public static final String RESTRICT_TRANSITIONS = "restrict-transitions";
//public static final String SUBPOS_TO_POS = "subpos-to-pos";
public static final String SHAPE = "shape";
public static final String TAG_MORPH = "tag-morph";
public static final String PRED_FILE = "pred-file";
public static final String OBSERVED_FEATURE = "observed-feature";
public static final String SPLIT_MORPH = "split-morph";
public static final String SUBTAG_SEPARATOR = "subtag-separator";
public static final String TYPE_DICT = "type-dict";
public static final String SPLIT_POS = "split-pos";
public static final String FLOAT_TYPE_DICT = "type-embeddings";
public static final String FORM_NORMALIZATION = "form-normalization";
public static final String NUM_CHUNKS = "num-chunks";
public static final String SPECIAL_SIGNATURE = "special-signature";
public static final String INTERNAL_ANALYZER = "internal-analyzer";
public static final String NUM_FOLDS = "num-folds";
public static final String USE_DEFAULT_FEATURES = "use-default-features";
public static final String USE_HASH_VECTOR = "use-hash-vector";
public static final String FEATURE_TEMPLATES = "feature-templates";
public static final String MAX_AFFIX_LENGTH = "max-affix-length";
public static final String LEMMATIZE = "lemmatize";
public static final String LEMMA_UNIGRAM_FILE = "lemma-unigram-file";
public static final String GOLD_LEMMA = "gold-lemma";
public static final String LEMMA_PRETRAINING = "lemma-pretraining";
public static final String MARGINALIZE_LEMMAS = "marginalize-lemmas";
public static final String LEMMA_ASPELL_LANG = "lemma-aspell-lang";
public static final String LEMMA_ASPELL_PATH = "lemma-aspell-path";
public static final String LEMMA_USE_SHAPE_LEXICON= "lemma-use-shape-lexicon";
public static final String LEMMA_USE_MORPH = "lemma-use-morph";
public static final String LEMMATIZER_FILE = "lemmatizer-file";
public static final String LEMMAS_IGNORE_FEATURES = "lemma-ignore-features";
public static final String LEMMA_PREPRUNING_EXTRACTION_ = "lemma-prepruning-extraction";
public static final String USE_HASH_FEATURE_TABLE = "use-hash-feature-table";
public static final String LEMMA_CLUSTER_FILE = "lemma-cluster-file";
public static final String LEMMA_TAG_DEPENDENT = "lemma-tag-dependent";
public static final String LEMMA_LEMMING_GENERATOR = "lemma-use-lemming-generator";
public static final String RESTRICT_POS_TAGS_TO_SEEN_COMBINATIONS = "restrict-pos-tags-to-seen-combinations";
private static final Map<String, String> DEFALUT_VALUES_ = new HashMap<String, String>();
private static final Map<String, String> COMMENTS_ = new HashMap<String, String>();
static {
DEFALUT_VALUES_.put(TRAIN_FILE, "");
COMMENTS_.put(TRAIN_FILE, "Input training file");
DEFALUT_VALUES_.put(TEST_FILE, "");
COMMENTS_.put(TEST_FILE, "Input test file. (optional for training)");
DEFALUT_VALUES_.put(PRED_FILE, "");
COMMENTS_.put(PRED_FILE, "Output prediction file in CoNLL09. (optional for training)");
DEFALUT_VALUES_.put(RARE_WORD_MAX_FREQ, "10");
COMMENTS_.put(RARE_WORD_MAX_FREQ, "Maximal frequency of a rare word.");
DEFALUT_VALUES_.put(SHAPE_TRIE_PATH, "");
COMMENTS_.put(SHAPE_TRIE_PATH, "Path to the shape trie. Will be created if non-existent.");
DEFALUT_VALUES_.put(MODEL_FILE, "");
COMMENTS_.put(MODEL_FILE, "Output model file.");
DEFALUT_VALUES_.put(RESTRICT_TRANSITIONS, "true");
COMMENTS_.put(RESTRICT_TRANSITIONS, "Whether to only allow POS -> MORPH transitions that have been seen during training.");
DEFALUT_VALUES_.put(SHAPE, "false");
COMMENTS_.put(SHAPE, "Whether to use shape features.");
DEFALUT_VALUES_.put(TAG_MORPH, "true");
COMMENTS_.put(TAG_MORPH, "Whether to train a morphological tagger or a POS tagger.");
DEFALUT_VALUES_.put(OBSERVED_FEATURE, "true");
COMMENTS_.put(OBSERVED_FEATURE, "Whether to use the observed feature. Have a look at the paper!");
DEFALUT_VALUES_.put(SPLIT_POS, "false");
COMMENTS_.put(SPLIT_POS, "Whether to split POS tags. See subtag-separator. Have a look at the paper!");
DEFALUT_VALUES_.put(SPLIT_MORPH, "true");
COMMENTS_.put(SPLIT_MORPH, "Whether to split MORPH tags. See subtag-separator. Have a look at the paper!");
DEFALUT_VALUES_.put(SUBTAG_SEPARATOR, "\\|");
COMMENTS_.put(SUBTAG_SEPARATOR, "Regular expression to use for splitting tags. (Has to work with Java's String.split)");
DEFALUT_VALUES_.put(TYPE_DICT, "");
COMMENTS_.put(TYPE_DICT, "Word type dictionary file (optional)");
DEFALUT_VALUES_.put(FLOAT_TYPE_DICT, "");
COMMENTS_.put(FLOAT_TYPE_DICT, "Word type embeddings file (optional)");
DEFALUT_VALUES_.put(FORM_NORMALIZATION, "none");
COMMENTS_.put(FORM_NORMALIZATION, "Whether to normalize word forms before tagging.");
DEFALUT_VALUES_.put(NUM_CHUNKS, "5");
COMMENTS_.put(NUM_CHUNKS, "Number of chunks. CrossAnnotator only.");
DEFALUT_VALUES_.put(SPECIAL_SIGNATURE, "false");
COMMENTS_.put(SPECIAL_SIGNATURE, "Whether to mark if a word contains a special character in the word signature.");
DEFALUT_VALUES_.put(INTERNAL_ANALYZER, "");
COMMENTS_.put(INTERNAL_ANALYZER, "Use an internal morphological analyzer. Currently supported: 'ar' for AraMorph (Arabic)");
DEFALUT_VALUES_.put(NUM_FOLDS, "10");
COMMENTS_.put(NUM_FOLDS, "Number of folds used for estimation of open word classes.");
DEFALUT_VALUES_.put(USE_DEFAULT_FEATURES, "true");
COMMENTS_.put(USE_DEFAULT_FEATURES, "Whether to extract default features such as prefixes, suffixes, word forms.");
DEFALUT_VALUES_.put(USE_HASH_VECTOR, "true");
COMMENTS_.put(USE_HASH_VECTOR, "Whether to use a hashed feature vector. Saves memory decreases accuracy.");
DEFALUT_VALUES_.put(FEATURE_TEMPLATES, "form,rare,affix,context,sig,bigrams");
COMMENTS_.put(FEATURE_TEMPLATES, "Comma separated list, activates individual templates.");
DEFALUT_VALUES_.put(MAX_AFFIX_LENGTH, "10");
COMMENTS_.put(MAX_AFFIX_LENGTH, "Max affix length to use in feature extraction.");
DEFALUT_VALUES_.put(LEMMATIZE, "false");
COMMENTS_.put(LEMMATIZE, "Train joint tagger + lemmatizer.");
DEFALUT_VALUES_.put(LEMMA_UNIGRAM_FILE, "");
COMMENTS_.put(LEMMA_UNIGRAM_FILE, "Is passed to lemma ranker model.");
DEFALUT_VALUES_.put(GOLD_LEMMA, "false");
COMMENTS_.put(GOLD_LEMMA, "Use only gold lemma.");
DEFALUT_VALUES_.put(LEMMA_PRETRAINING, "false");
COMMENTS_.put(LEMMA_PRETRAINING, "Pretrain tagger without lemma features.");
DEFALUT_VALUES_.put(MARGINALIZE_LEMMAS, "false");
COMMENTS_.put(MARGINALIZE_LEMMAS, "Marginalize over lemmas during viterbi decoding.");
DEFALUT_VALUES_.put(LEMMAS_IGNORE_FEATURES, "");
COMMENTS_.put(LEMMAS_IGNORE_FEATURES, "Features to ignore in the lemma model.");
DEFALUT_VALUES_.put(LEMMA_ASPELL_LANG, "");
COMMENTS_.put(LEMMA_ASPELL_LANG, "Passed to lemma model.");
DEFALUT_VALUES_.put(LEMMA_ASPELL_PATH, "");
COMMENTS_.put(LEMMA_ASPELL_PATH, "Passed to lemma model.");
DEFALUT_VALUES_.put(LEMMA_USE_SHAPE_LEXICON, "false");
COMMENTS_.put(LEMMA_USE_SHAPE_LEXICON, "Passed to lemma model.");
DEFALUT_VALUES_.put(LEMMA_USE_MORPH, "true");
COMMENTS_.put(LEMMA_USE_MORPH, "Passed to lemma model.");
DEFALUT_VALUES_.put(LEMMATIZER_FILE, "");
COMMENTS_.put(LEMMATIZER_FILE, "Use this pipeline lemmatizer to lemmatizer after tagging.");
DEFALUT_VALUES_.put(LEMMA_PREPRUNING_EXTRACTION_, "true");
COMMENTS_.put(LEMMA_PREPRUNING_EXTRACTION_, "Add lemmas before or after pruning.");
DEFALUT_VALUES_.put(USE_HASH_FEATURE_TABLE, "false");
COMMENTS_.put(USE_HASH_FEATURE_TABLE, "Less memory usage sligtly less accurate.");
DEFALUT_VALUES_.put(LEMMA_CLUSTER_FILE, "");
COMMENTS_.put(LEMMA_CLUSTER_FILE, "Passed to lemma model.");
DEFALUT_VALUES_.put(LEMMA_TAG_DEPENDENT, "false");
COMMENTS_.put(LEMMA_TAG_DEPENDENT, "Passed to lemma model.");
DEFALUT_VALUES_.put(LEMMA_LEMMING_GENERATOR, "0");
COMMENTS_.put(LEMMA_LEMMING_GENERATOR, "Passed to lemma model.");
DEFALUT_VALUES_.put(RESTRICT_POS_TAGS_TO_SEEN_COMBINATIONS, "false");
COMMENTS_.put(RESTRICT_POS_TAGS_TO_SEEN_COMBINATIONS, "Restrict the possible pos tags of a word to the combinations seen in the training set.");
}
public MorphOptions() {
super();
putAll(DEFALUT_VALUES_);
}
public String getTrainFile() {
return getProperty(TRAIN_FILE);
}
public String getTestFile() {
return getProperty(TEST_FILE);
}
public int getRareWordMaxFreq() {
return Integer.parseInt(getProperty(RARE_WORD_MAX_FREQ));
}
public boolean getRestricTransitions() {
return Boolean.parseBoolean(getProperty(RESTRICT_TRANSITIONS));
}
public boolean getTagMorph() {
return Boolean.valueOf(getProperty(TAG_MORPH));
}
public String getPredFile() {
return getProperty(PRED_FILE);
}
public boolean getShape() {
return Boolean.parseBoolean(getProperty(SHAPE));
}
public String getShapeTriePath() {
return getProperty(SHAPE_TRIE_PATH);
}
public String getModelFile() {
return getProperty(MODEL_FILE);
}
public boolean getObservedFeature() {
return Boolean.parseBoolean(getProperty(OBSERVED_FEATURE));
}
public boolean getSplitMorphs() {
return Boolean.parseBoolean(getProperty(SPLIT_MORPH));
}
public String getSubTagSeparator() {
return getProperty(SUBTAG_SEPARATOR);
}
public String getMorphDict() {
return getProperty(TYPE_DICT);
}
public String getFloatTypeDict() {
return getProperty(FLOAT_TYPE_DICT);
}
public boolean getSplitPos() {
return Boolean.parseBoolean(getProperty(SPLIT_POS));
}
protected void usage() {
super.usage();
System.err.println("Morph Options:");
usage(DEFALUT_VALUES_, COMMENTS_);
System.err.println();
}
public Mode getNormalizeForms() {
return Mode.valueOf(getProperty(FORM_NORMALIZATION));
}
public int getNumChunks() {
return Integer.parseInt(getProperty(NUM_CHUNKS));
}
public boolean getSpecialSignature() {
return Boolean.valueOf(getProperty(SPECIAL_SIGNATURE));
}
public String getInternalAnalyzer() {
String prop = getProperty(INTERNAL_ANALYZER);
if (prop.isEmpty()) {
return null;
}
return prop;
}
public int getNumFolds() {
return Integer.parseInt(getProperty(NUM_FOLDS));
}
public boolean getUseDefaultFeatures() {
return Boolean.parseBoolean(getProperty(USE_DEFAULT_FEATURES));
}
public boolean getUseHashVector() {
return Boolean.parseBoolean(getProperty(USE_HASH_VECTOR));
}
public String getFeatureTemplates() {
return getProperty(FEATURE_TEMPLATES);
}
public int getMaxAffixLength() {
return Integer.valueOf(getProperty(MAX_AFFIX_LENGTH));
}
public boolean getLemmatizer() {
return Boolean.valueOf(getProperty(LEMMATIZE));
}
public String getLemmaUnigramFile() {
return getProperty(LEMMA_UNIGRAM_FILE);
}
public boolean getGoldLemma() {
return Boolean.valueOf(getProperty(GOLD_LEMMA));
}
public boolean getLemmaPretraining() {
return Boolean.valueOf(getProperty(LEMMA_PRETRAINING));
}
public boolean getMarginalizeLemmas() {
return Boolean.valueOf(getProperty(MARGINALIZE_LEMMAS));
}
public String getLemmaIgnoreFeatures() {
return getProperty(LEMMAS_IGNORE_FEATURES);
}
public String getLemmaAspellPath() {
return getProperty(LEMMA_ASPELL_PATH);
}
public String getLemmaAspellLang() {
return getProperty(LEMMA_ASPELL_LANG);
}
public boolean getLemmaUseShapeLexicon() {
return Boolean.parseBoolean(getProperty(LEMMA_USE_SHAPE_LEXICON));
}
public boolean getLemmaUseMorph() {
return Boolean.parseBoolean(getProperty(LEMMA_USE_MORPH));
}
public String getLemmatizerFile() {
return getProperty(LEMMATIZER_FILE);
}
public boolean getUseHashFeatureTable() {
return Boolean.valueOf(getProperty(USE_HASH_FEATURE_TABLE));
}
public String getLemmaClusterFile() {
return getProperty(LEMMA_CLUSTER_FILE);
}
public boolean getLemmaPrePruningExtraction() {
return Boolean.valueOf(getProperty(LEMMA_PREPRUNING_EXTRACTION_));
}
public boolean getLemmaTagDependent() {
return Boolean.valueOf(getProperty(LEMMA_TAG_DEPENDENT));
}
public int getLemmaUseLemmingGenerator() {
return Integer.valueOf(getProperty(LEMMA_LEMMING_GENERATOR ));
}
public boolean getRestrictPosTagsToSeenCombinations() {
return Boolean.valueOf(getProperty(RESTRICT_POS_TAGS_TO_SEEN_COMBINATIONS));
}
}