package edu.stanford.nlp.tagger.maxent;
import java.io.*;
import java.util.Map;
import java.util.Properties;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
/**
* Reads and stores configuration information for a POS tagger.
*
* <i>Implementation note:</i> To add a new parameter: (1) define a default
* String value, (2) add it to defaultValues map, (3) add line to constructor,
* (4) add getter method, (5) add to dump() method, (6) add to printGenProps()
* method, (7) add to class javadoc of MaxentTagger.
*
* @author William Morgan
* @author Anna Rafferty
* @author Michel Galley
*/
public class TaggerConfig extends Properties /* Inherits implementation of Serializable! */ {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(TaggerConfig.class);
private static final long serialVersionUID = -4136407850147157497L;
public enum Mode {
TRAIN, TEST, TAG, DUMP
}
/* defaults. sentenceDelimiter might be null; the others all have non-null values. */
public static final String
SEARCH = "qn",
TAG_SEPARATOR = "/",
TOKENIZE = "true",
DEBUG = "false",
ITERATIONS = "100",
ARCH = "",
WORD_FUNCTION = "",
RARE_WORD_THRESH = "5",
MIN_FEATURE_THRESH = "5",
CUR_WORD_MIN_FEATURE_THRESH = "2",
RARE_WORD_MIN_FEATURE_THRESH = "10",
VERY_COMMON_WORD_THRESH = "250",
OCCURRING_TAGS_ONLY = "false",
POSSIBLE_TAGS_ONLY = "false",
SIGMA_SQUARED = String.valueOf(0.5),
ENCODING = "UTF-8",
LEARN_CLOSED_CLASS = "false",
CLOSED_CLASS_THRESHOLD = "40",
VERBOSE = "false",
VERBOSE_RESULTS = "true",
SGML = "false",
LANG = "",
TOKENIZER_FACTORY = "",
XML_INPUT = "",
TAG_INSIDE = "",
APPROXIMATE = "-1.0",
TOKENIZER_OPTIONS = "",
DEFAULT_REG_L1 = "1.0",
OUTPUT_FILE = "",
OUTPUT_FORMAT = "slashTags",
OUTPUT_FORMAT_OPTIONS = "",
NTHREADS = "1";
public static final String ENCODING_PROPERTY = "encoding",
TAG_SEPARATOR_PROPERTY = "tagSeparator";
private static final Map<String, String> defaultValues = Generics.newHashMap();
static {
defaultValues.put("arch", ARCH);
defaultValues.put("wordFunction", WORD_FUNCTION);
defaultValues.put("closedClassTags", "");
defaultValues.put("closedClassTagThreshold", CLOSED_CLASS_THRESHOLD);
defaultValues.put("search", SEARCH);
defaultValues.put(TAG_SEPARATOR_PROPERTY, TAG_SEPARATOR);
defaultValues.put("tokenize", TOKENIZE);
defaultValues.put("debug", DEBUG);
defaultValues.put("iterations", ITERATIONS);
defaultValues.put("rareWordThresh", RARE_WORD_THRESH);
defaultValues.put("minFeatureThresh", MIN_FEATURE_THRESH);
defaultValues.put("curWordMinFeatureThresh", CUR_WORD_MIN_FEATURE_THRESH);
defaultValues.put("rareWordMinFeatureThresh", RARE_WORD_MIN_FEATURE_THRESH);
defaultValues.put("veryCommonWordThresh", VERY_COMMON_WORD_THRESH);
defaultValues.put("occurringTagsOnly", OCCURRING_TAGS_ONLY);
defaultValues.put("possibleTagsOnly", POSSIBLE_TAGS_ONLY);
defaultValues.put("sigmaSquared", SIGMA_SQUARED);
defaultValues.put(ENCODING_PROPERTY, ENCODING);
defaultValues.put("learnClosedClassTags", LEARN_CLOSED_CLASS);
defaultValues.put("verbose", VERBOSE);
defaultValues.put("verboseResults", VERBOSE_RESULTS);
defaultValues.put("openClassTags", "");
defaultValues.put("lang", LANG);
defaultValues.put("tokenizerFactory", TOKENIZER_FACTORY);
defaultValues.put("xmlInput", XML_INPUT);
defaultValues.put("tagInside", TAG_INSIDE);
defaultValues.put("sgml", SGML);
defaultValues.put("approximate", APPROXIMATE);
defaultValues.put("tokenizerOptions", TOKENIZER_OPTIONS);
defaultValues.put("regL1", DEFAULT_REG_L1);
defaultValues.put("outputFile", OUTPUT_FILE);
defaultValues.put("outputFormat", OUTPUT_FORMAT);
defaultValues.put("outputFormatOptions", OUTPUT_FORMAT_OPTIONS);
defaultValues.put("nthreads", NTHREADS);
}
/**
* This constructor is just for creating an instance with default values.
* Used internally.
*/
private TaggerConfig() {
super();
this.putAll(defaultValues);
}
/**
* We force you to pass in a TaggerConfig rather than any other
* superclass so that we know the arg error checking has already occurred
*/
public TaggerConfig(TaggerConfig old) {
super(old);
}
public TaggerConfig(String... args) {
this(StringUtils.argsToProperties(args));
}
public TaggerConfig(Properties props) {
// load up the default properties
this();
/* Try and use the default properties from the model */
// Properties modelProps = new Properties();
// TaggerConfig oldConfig = new TaggerConfig(); // loads default values in oldConfig
if (! props.containsKey("trainFile")) {
String name = props.getProperty("model");
if (name == null) {
name = props.getProperty("dump");
}
if (name != null) {
try {
log.info("Loading default properties from tagger " + name);
DataInputStream in = new DataInputStream(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(name));
this.putAll(TaggerConfig.readConfig(in)); // overwrites defaults with any serialized values.
in.close();
} catch (Exception e) {
throw new RuntimeIOException("No such trained tagger config file found: " + name);
}
}
}
setProperties(props);
}
public void setProperties(Properties props) {
if (props.getProperty("") != null) {
throw new RuntimeException("unknown argument(s): \"" + props.getProperty("") + '\"');
}
if (props.getProperty("genprops") != null) {
printGenProps(System.out);
System.exit(0);
}
if (props.containsKey("mode") && props.containsKey("file")) {
this.setProperty("mode", props.getProperty("mode"));
this.setProperty("file", props.getProperty("file"));
} else if (props.containsKey("trainFile")) {
//Training mode
this.setProperty("mode", Mode.TRAIN.toString());
this.setProperty("file", props.getProperty("trainFile", "").trim());
} else if (props.containsKey("testFile")) {
//Testing mode
this.setProperty("mode", Mode.TEST.toString());
this.setProperty("file", props.getProperty("testFile", "").trim());
} else if (props.containsKey("textFile")) {
//Tagging mode
this.setProperty("mode", Mode.TAG.toString());
this.setProperty("file", props.getProperty("textFile", "").trim());
} else if (props.containsKey("dump")) {
this.setProperty("mode", Mode.DUMP.toString());
// this.setProperty("file", props.getProperty("dump").trim());
props.setProperty("model", props.getProperty("dump").trim());
} else {
this.setProperty("mode", Mode.TAG.toString());
this.setProperty("file", "stdin");
}
//for any mode other than train, we load a classifier, which means we load a config - model always needs to be specified
//on command line/in props file
//Get the path to the model (or the path where you'd like to save the model); this is necessary for training, testing, and tagging
this.setProperty("model", props.getProperty("model", this.getProperty("model", "")).trim());
if ( ! (this.getMode() == Mode.DUMP) && this.getProperty("model").isEmpty()) {
throw new RuntimeException("'model' parameter must be specified");
}
this.setProperty("search", props.getProperty("search", this.getProperty("search")).trim().toLowerCase());
String srch = this.getProperty("search");
if ( ! (srch.equals("cg") || srch.equals("iis") || srch.equals("owlqn") || srch.equals("qn") || srch.equals("owlqn2"))) {
throw new RuntimeException("'search' must be one of 'iis', 'cg', 'qn' or 'owlqn' or 'owlqn2': " + srch);
}
this.setProperty("sigmaSquared", props.getProperty("sigmaSquared", this.getProperty("sigmaSquared")));
this.setProperty(TAG_SEPARATOR_PROPERTY, props.getProperty(TAG_SEPARATOR_PROPERTY, this.getProperty(TAG_SEPARATOR_PROPERTY)));
this.setProperty("iterations", props.getProperty("iterations", this.getProperty("iterations")));
this.setProperty("rareWordThresh", props.getProperty("rareWordThresh", this.getProperty("rareWordThresh")));
this.setProperty("minFeatureThresh", props.getProperty("minFeatureThresh", this.getProperty("minFeatureThresh")));
this.setProperty("curWordMinFeatureThresh", props.getProperty("curWordMinFeatureThresh", this.getProperty("curWordMinFeatureThresh")));
this.setProperty("rareWordMinFeatureThresh", props.getProperty("rareWordMinFeatureThresh", this.getProperty("rareWordMinFeatureThresh")));
this.setProperty("veryCommonWordThresh", props.getProperty("veryCommonWordThresh", this.getProperty("veryCommonWordThresh")));
this.setProperty("occurringTagsOnly", props.getProperty("occurringTagsOnly", this.getProperty("occurringTagsOnly", OCCURRING_TAGS_ONLY)));
this.setProperty("possibleTagsOnly", props.getProperty("possibleTagsOnly", this.getProperty("possibleTagsOnly")));
this.setProperty("lang", props.getProperty("lang", this.getProperty("lang")));
this.setProperty("openClassTags", props.getProperty("openClassTags", this.getProperty("openClassTags")).trim());
this.setProperty("closedClassTags", props.getProperty("closedClassTags", this.getProperty("closedClassTags")).trim());
this.setProperty("learnClosedClassTags", props.getProperty("learnClosedClassTags", this.getProperty("learnClosedClassTags")));
this.setProperty("closedClassTagThreshold", props.getProperty("closedClassTagThreshold", this.getProperty("closedClassTagThreshold")));
this.setProperty("arch", props.getProperty("arch", this.getProperty("arch")));
if (this.getMode() == Mode.TRAIN && this.getProperty("arch").isEmpty()) {
throw new IllegalArgumentException("No architecture specified; " +
"set the -arch flag with " +
"the features to be used");
}
this.setProperty("wordFunction", props.getProperty("wordFunction", this.getProperty("wordFunction", WORD_FUNCTION)));
this.setProperty("tokenize", props.getProperty("tokenize", this.getProperty("tokenize")));
this.setProperty("tokenizerFactory", props.getProperty("tokenizerFactory", this.getProperty("tokenizerFactory")));
this.setProperty("debugPrefix", props.getProperty("debugPrefix", this.getProperty("debugPrefix", "")));
this.setProperty("debug", props.getProperty("debug", DEBUG));
this.setProperty(ENCODING_PROPERTY, props.getProperty(ENCODING_PROPERTY, this.getProperty(ENCODING_PROPERTY)));
this.setProperty("sgml", props.getProperty("sgml", this.getProperty("sgml")));
this.setProperty("verbose", props.getProperty("verbose", this.getProperty("verbose")));
this.setProperty("verboseResults", props.getProperty("verboseResults", this.getProperty("verboseResults")));
this.setProperty("regL1", props.getProperty("regL1", this.getProperty("regL1")));
//this is a property that is stored (not like the general properties)
this.setProperty("xmlInput", props.getProperty("xmlInput", this.getProperty("xmlInput")).trim());
this.setProperty("tagInside", props.getProperty("tagInside", this.getProperty("tagInside"))); //this isn't something we save from time to time
this.setProperty("approximate", props.getProperty("approximate", this.getProperty("approximate"))); //this isn't something we save from time to time
this.setProperty("tokenizerOptions", props.getProperty("tokenizerOptions", this.getProperty("tokenizerOptions"))); //this isn't something we save from time to time
this.setProperty("outputFile", props.getProperty("outputFile", this.getProperty("outputFile")).trim()); //this isn't something we save from time to time
this.setProperty("outputFormat", props.getProperty("outputFormat", this.getProperty("outputFormat")).trim()); //this isn't something we save from time to time
this.setProperty("outputFormatOptions", props.getProperty("outputFormatOptions", this.getProperty("outputFormatOptions")).trim()); //this isn't something we save from time to time
this.setProperty("nthreads", props.getProperty("nthreads", this.getProperty("nthreads", NTHREADS)).trim());
String sentenceDelimiter = props.getProperty("sentenceDelimiter", this.getProperty("sentenceDelimiter"));
if (sentenceDelimiter != null) {
// this isn't something we save from time to time.
// It is only relevant when tagging text files.
// In fact, we let this one be null, as it really is useful to
// let the null value represent no sentence delimiter.
this.setProperty("sentenceDelimiter", sentenceDelimiter);
}
}
public String getModel() { return getProperty("model"); }
public String getFile() { return getProperty("file"); }
public String getOutputFile() { return getProperty("outputFile"); }
public String getOutputFormat() { return getProperty("outputFormat"); }
public String[] getOutputOptions() { return getProperty("outputFormatOptions").split("\\s*,\\s*"); }
public boolean getOutputVerbosity() {
return getOutputOptionsContains("verbose");
}
public boolean getOutputLemmas() {
return getOutputOptionsContains("lemmatize");
}
public boolean keepEmptySentences() {
return getOutputOptionsContains("keepEmptySentences");
}
public boolean getOutputOptionsContains(String sought) {
String[] options = getOutputOptions();
for (String option : options) {
if (option.equals(sought)) {
return true;
}
}
return false;
}
public String getSearch() { return getProperty("search"); }
public double getSigmaSquared() { return Double.parseDouble(getProperty("sigmaSquared")); }
public int getIterations() { return Integer.parseInt(getProperty("iterations")); }
public int getRareWordThresh() { return Integer.parseInt(getProperty("rareWordThresh")); }
public int getMinFeatureThresh() { return Integer.parseInt(getProperty("minFeatureThresh")); }
public int getCurWordMinFeatureThresh() { return Integer.parseInt(getProperty("curWordMinFeatureThresh")); }
public int getRareWordMinFeatureThresh() { return Integer.parseInt(getProperty("rareWordMinFeatureThresh")); }
public int getVeryCommonWordThresh() { return Integer.parseInt(getProperty("veryCommonWordThresh")); }
public boolean occurringTagsOnly() { return Boolean.parseBoolean(getProperty("occurringTagsOnly")); }
public boolean possibleTagsOnly() { return Boolean.parseBoolean(getProperty("possibleTagsOnly")); }
public String getLang() { return getProperty("lang"); }
public String[] getOpenClassTags() {
return wsvStringToStringArray(getProperty("openClassTags"));
}
public String[] getClosedClassTags() {
return wsvStringToStringArray(getProperty("closedClassTags"));
}
private static String[] wsvStringToStringArray(String str) {
if (StringUtils.isNullOrEmpty(str)) {
return StringUtils.EMPTY_STRING_ARRAY;
} else {
return str.split("\\s+");
}
}
public boolean getLearnClosedClassTags() { return Boolean.parseBoolean(getProperty("learnClosedClassTags")); }
public int getClosedTagThreshold() { return Integer.parseInt(getProperty("closedClassTagThreshold")); }
public String getArch() { return getProperty("arch"); }
public String getWordFunction() { return getProperty("wordFunction"); }
public boolean getDebug() { return Boolean.parseBoolean(getProperty("debug")); }
public String getDebugPrefix() { return getProperty("debugPrefix"); }
public String getTokenizerFactory() { return getProperty("tokenizerFactory"); }
public static String getDefaultTagSeparator() { return TAG_SEPARATOR; }
public final String getTagSeparator() { return getProperty(TAG_SEPARATOR_PROPERTY); }
public boolean getTokenize() { return Boolean.parseBoolean(getProperty("tokenize")); }
public String getEncoding() { return getProperty(ENCODING_PROPERTY); }
public double getRegL1() { return Double.parseDouble(getProperty("regL1")); }
public String[] getXMLInput() {
return wsvStringToStringArray(getProperty("xmlInput"));
}
public boolean getVerbose() { return Boolean.parseBoolean(getProperty("verbose")); }
public boolean getVerboseResults() { return Boolean.parseBoolean(getProperty("verboseResults")); }
public boolean getSGML() { return Boolean.parseBoolean(getProperty("sgml")); }
public int getNThreads() { return Integer.parseInt(getProperty("nthreads")); }
/** Return a regex of XML elements to tag inside of. This may return an
* empty String, but never null.
*
* @return A regex of XML elements to tag inside of
*/
public String getTagInside() {
String str = getProperty("tagInside");
if (str == null) {
return "";
}
return str;
}
public String getTokenizerOptions() { return getProperty("tokenizerOptions"); }
public boolean getTokenizerInvertible() {
String tokenizerOptions = getTokenizerOptions();
if (tokenizerOptions != null &&
tokenizerOptions.matches("(^|.*,)invertible=true"))
return true;
return getOutputVerbosity() || getOutputLemmas();
}
/**
* Returns a default score to be used for each tag that is incompatible with
* the current word (e.g., the tag CC for the word "apple"). Using a default
* score may slightly decrease performance for some languages (e.g., Chinese and
* German), but allows the tagger to run considerably faster (since the computation
* of the normalization term Z requires much less feature extraction). This approximation
* does not decrease performance in English (on the WSJ). If this function returns
* 0.0, the tagger will compute exact scores.
*
* @return default score
*/
public double getDefaultScore() {
String approx = getProperty("approximate");
if ("false".equalsIgnoreCase(approx)) {
return -1.0;
} else if ("true".equalsIgnoreCase(approx)) {
return 1.0;
} else {
return Double.parseDouble(approx);
}
}
public void dump() { dump(new PrintWriter(System.err)); }
public void dump(PrintStream stream) {
PrintWriter pw = new PrintWriter(stream);
dump(pw);
}
private void dump(PrintWriter pw) {
pw.println(" model = " + getProperty("model"));
pw.println(" arch = " + getProperty("arch"));
pw.println(" wordFunction = " + getProperty("wordFunction"));
if (this.getMode() == Mode.TRAIN || this.getMode() == Mode.DUMP) {
pw.println(" trainFile = " + getProperty("file"));
} else if (this.getMode() == Mode.TAG) {
pw.println(" textFile = " + getProperty("file"));
} else if (this.getMode() == Mode.TEST) {
pw.println(" testFile = " + getProperty("file"));
}
pw.println(" closedClassTags = " + getProperty("closedClassTags"));
pw.println(" closedClassTagThreshold = " + getProperty("closedClassTagThreshold"));
pw.println(" curWordMinFeatureThresh = " + getProperty("curWordMinFeatureThresh"));
pw.println(" debug = " + getProperty("debug"));
pw.println(" debugPrefix = " + getProperty("debugPrefix"));
pw.println(" " + TAG_SEPARATOR_PROPERTY + " = " +
getProperty(TAG_SEPARATOR_PROPERTY));
pw.println(" " + ENCODING_PROPERTY + " = " +
getProperty(ENCODING_PROPERTY));
pw.println(" iterations = " + getProperty("iterations"));
pw.println(" lang = " + getProperty("lang"));
pw.println(" learnClosedClassTags = " + getProperty("learnClosedClassTags"));
pw.println(" minFeatureThresh = " + getProperty("minFeatureThresh"));
pw.println(" openClassTags = " + getProperty("openClassTags"));
pw.println("rareWordMinFeatureThresh = " + getProperty("rareWordMinFeatureThresh"));
pw.println(" rareWordThresh = " + getProperty("rareWordThresh"));
pw.println(" search = " + getProperty("search"));
pw.println(" sgml = " + getProperty("sgml"));
pw.println(" sigmaSquared = " + getProperty("sigmaSquared"));
pw.println(" regL1 = " + getProperty("regL1"));
pw.println(" tagInside = " + getProperty("tagInside"));
pw.println(" tokenize = " + getProperty("tokenize"));
pw.println(" tokenizerFactory = " + getProperty("tokenizerFactory"));
pw.println(" tokenizerOptions = " + getProperty("tokenizerOptions"));
pw.println(" verbose = " + getProperty("verbose"));
pw.println(" verboseResults = " + getProperty("verboseResults"));
pw.println(" veryCommonWordThresh = " + getProperty("veryCommonWordThresh"));
pw.println(" xmlInput = " + getProperty("xmlInput"));
pw.println(" outputFile = " + getProperty("outputFile"));
pw.println(" outputFormat = " + getProperty("outputFormat"));
pw.println(" outputFormatOptions = " + getProperty("outputFormatOptions"));
pw.println(" nthreads = " + getProperty("nthreads"));
pw.flush();
}
@Override
public String toString() {
StringWriter sw = new StringWriter(200);
PrintWriter pw = new PrintWriter(sw);
dump(pw);
return sw.toString();
}
/**
* This returns the sentence delimiter used when tokenizing text
* using the tokenizer requested in this config. In general, it is
* assumed the tokenizer doesn't need a sentence delimiter.... If you
* use the whitespace tokenizer, though, a newline breaks sentences.
*
* @return A null String unless tokenize is false and then the String
*/
public String getSentenceDelimiter() {
String delimiter = getProperty("sentenceDelimiter");
if (delimiter == null && !getTokenize()) {
delimiter = "\n";
}
return delimiter;
}
/**
* Returns whether or not we should use stdin for reading when
* tagging data. For now, this returns true iff the filename given
* was "stdin".
* (TODO: kind of ugly)
*/
public boolean useStdin() {
return getFile().trim().equalsIgnoreCase("stdin");
}
/**
* Prints out the automatically generated props file - in its own
* method to make code above easier to read
*/
private static void printGenProps(PrintStream out) {
out.println("## Sample properties file for maxent tagger. This file is used for three main");
out.println("## operations: training, testing, and tagging. It may also be used to dump");
out.println("## the contents of a model.");
out.println("## To train or test a model, or to tag something, run:");
out.println("## java edu.stanford.nlp.tagger.maxent.MaxentTagger -prop <properties file>");
out.println("## Arguments can be overridden on the commandline, e.g.:");
out.println("## java ....MaxentTagger -prop <properties file> -testFile /other/file ");
out.println();
out.println("# Model file name (created at train time; used at tag and test time)");
out.println("# (you can leave this blank and specify it on the commandline with -model)");
out.println("# model = ");
out.println();
out.println("# Path to file to be operated on (trained from, tested against, or tagged)");
out.println("# Specify -textFile <filename> to tag text in the given file, -trainFile <filename> to");
out.println("# to train a model using data in the given file, or -testFile <filename> to test your");
out.println("# model using data in the given file. Alternatively, you may specify");
out.println("# -dump <filename> to dump the parameters stored in a model or ");
out.println("# -convertToSingleFile <filename> to save an old, multi-file model (specified as -model)");
out.println("# to the new single file format. The new model will be saved in the file filename.");
out.println("# If you choose to convert an old file, you must specify ");
out.println("# the correct 'arch' parameter used to create the original model.");
out.println("# trainFile = ");
out.println();
out.println("# Path to outputFile to write tagged output to.");
out.println("# If empty, stdout is used.");
out.println("# outputFile = " + OUTPUT_FILE);
out.println();
out.println("# Output format. One of: slashTags (default), xml, or tsv");
out.println("# outputFormat = " + OUTPUT_FORMAT);
out.println();
out.println("# Output format options. Comma separated list.");
out.println("# currently \"lemmatize\" and \"keepEmptySentences\" are supported.");
out.println("# outputFormatOptions = " + OUTPUT_FORMAT_OPTIONS);
out.println();
out.println("# Tag separator character that separates word and pos tags");
out.println("# (for both training and test data) and used for");
out.println("# separating words and tags in slashTags format output.");
out.println("# tagSeparator = " + TAG_SEPARATOR);
out.println();
out.println("# Encoding format in which files are stored. If left blank, UTF-8 is assumed.");
out.println("# encoding = " + ENCODING);
out.println();
out.println("# A couple flags for controlling the amount of output:");
out.println("# - print extra debugging information:");
out.println("# verbose = " + VERBOSE);
out.println("# - print intermediate results:");
out.println("# verboseResults = " + VERBOSE_RESULTS);
out.println("######### parameters for tag and test operations #########");
out.println();
out.println("# Class to use for tokenization. Default blank value means Penn Treebank");
out.println("# tokenization. If you'd like to just assume that tokenization has been done,");
out.println("# and the input is whitespace-tokenized, use");
out.println("# edu.stanford.nlp.process.WhitespaceTokenizer or set tokenize to false.");
out.println("# tokenizerFactory = ");
out.println();
out.println("# Options to the tokenizer. A comma separated list.");
out.println("# This depends on what the tokenizer supports.");
out.println("# For PTBTokenizer, you might try options like americanize=false");
out.println("# or asciiQuotes (for German!).");
out.println("# tokenizerOptions = ");
out.println();
out.println("# Whether to tokenize text for tag and test operations. Default is true.");
out.println("# If false, your text must already be whitespace tokenized.");
out.println("# tokenize = " + TOKENIZE);
out.println();
out.println("# Write debugging information (words, top words, unknown words). Useful for");
out.println("# error analysis. Default is false.");
out.println("# debug = "+ DEBUG);
out.println();
out.println("# Prefix for debugging output (if debug == true). Default is to use the");
out.println("# filename from 'file'");
out.println("# debugPrefix = ");
out.println();
out.println("######### parameters for training #########");
out.println();
out.println("# model architecture: This is one or more comma separated strings, which");
out.println("# specify which extractors to use. Some of them take one or more integer");
out.println("# or string ");
out.println("# (file path) arguments in parentheses, written as m, n, and s below:");
out.println("# 'left3words', 'left5words', 'bidirectional', 'bidirectional5words',");
out.println("# 'generic', 'sighan2005', 'german', 'words(m,n)', 'wordshapes(m,n)',");
out.println("# 'biwords(m,n)', 'lowercasewords(m,n)', 'vbn(n)', distsimconjunction(s,m,n)',");
out.println("# 'naacl2003unknowns', 'naacl2003conjunctions', 'distsim(s,m,n)',");
out.println("# 'suffix(n)', 'prefix(n)', 'prefixsuffix(n)', 'capitalizationsuffix(n)',");
out.println("# 'wordshapes(m,n)', 'unicodeshapes(m,n)', 'unicodeshapeconjunction(m,n)',");
out.println("# 'lctagfeatures', 'order(k)', 'chinesedictionaryfeatures(s)'.");
out.println("# These keywords determines the features extracted. 'generic' is language independent.");
out.println("# distsim: Distributional similarity classes can be an added source of information");
out.println("# about your words. An English distsim file is included, or you can use your own.");
out.println("# arch = ");
out.println();
out.println("# 'wordFunction'. A function applied to the text before training or tagging.");
out.println("# For example, edu.stanford.nlp.util.LowercaseFunction");
out.println("# This function turns all the words into lowercase");
out.println("# The function must implement java.util.function.Function<String, String>");
out.println("# Blank means no preprocessing function");
out.println("# wordFunction = ");
out.println();
out.println("# 'language'. This is really the tag set which is used for the");
out.println("# list of open-class tags, and perhaps deterministic tag");
out.println("# expansion). Currently we have 'english', 'arabic', 'german', 'chinese'");
out.println("# or 'polish' predefined. For your own language, you can specify ");
out.println("# the same information via openClassTags or closedClassTags below");
out.println("# (only ONE of these three options may be specified). ");
out.println("# 'english' means UPenn English treebank tags. 'german' is STTS");
out.println("# 'chinese' is CTB, and Arabic is an expanded Bies mapping from the ATB");
out.println("# 'polish' means some tags that some guy on the internet once used. ");
out.println("# See the TTags class for more information.");
out.println("# lang = ");
out.println();
out.println("# a space-delimited list of open-class parts of speech");
out.println("# alternatively, you can specify language above to use a pre-defined list or specify the closed class tags (below)");
out.println("# openClassTags = ");
out.println();
out.println("# a space-delimited list of closed-class parts of speech");
out.println("# alternatively, you can specify language above to use a pre-defined list or specify the open class tags (above)");
out.println("# closedClassTags = ");
out.println();
out.println("# A boolean indicating whether you would like the trained model to set POS tags as closed");
out.println("# based on their frequency in training; default is false. The frequency threshold can be set below. ");
out.println("# This option is ignored if any of {openClassTags, closedClassTags, lang} are specified.");
out.println("# learnClosedClassTags = ");
out.println();
out.println("# Used only if learnClosedClassTags=true. Tags that have fewer tokens than this threshold are");
out.println("# considered closed in the trained model.");
out.println("# closedClassTagThreshold = ");
out.println();
out.println("# search method for optimization. Normally use the default 'qn'. choices: 'qn' (quasi-Newton),");
out.println("# 'cg' (conjugate gradient, 'owlqn' (L1 regularization) or 'iis' (improved iterative scaling)");
out.println("# search = " + SEARCH);
out.println();
out.println("# for conjugate gradient or quasi-Newton search, sigma-squared smoothing/regularization");
out.println("# parameter. if left blank, the default is 0.5, which is usually okay");
out.println("# sigmaSquared = " + SIGMA_SQUARED);
out.println();
out.println("# for OWLQN search, regularization");
out.println("# parameter. if left blank, the default is 1.0, which is usually okay");
out.println("# regL1 = " + DEFAULT_REG_L1);
out.println();
out.println("# For improved iterative scaling, the number of iterations, otherwise ignored");
out.println("# iterations = " + ITERATIONS);
out.println();
out.println("# rare word threshold. words that occur less than this number of");
out.println("# times are considered rare words.");
out.println("# rareWordThresh = " + RARE_WORD_THRESH);
out.println();
out.println("# minimum feature threshold. features whose history appears less");
out.println("# than this number of times are ignored.");
out.println("# minFeatureThresh = " + MIN_FEATURE_THRESH);
out.println();
out.println("# current word feature threshold. words that occur more than this");
out.println("# number of times will generate features with all of their occurring");
out.println("# tags.");
out.println("# curWordMinFeatureThresh = " + CUR_WORD_MIN_FEATURE_THRESH);
out.println();
out.println("# rare word minimum feature threshold. features of rare words whose histories");
out.println("# appear less than this times will be ignored.");
out.println("# rareWordMinFeatureThresh = " + RARE_WORD_MIN_FEATURE_THRESH);
out.println();
out.println("# very common word threshold. words that occur more than this number of");
out.println("# times will form an equivalence class by themselves. ignored unless");
out.println("# you are using equivalence classes.");
out.println("# veryCommonWordThresh = " + VERY_COMMON_WORD_THRESH);
out.println();
out.println("# sgml = ");
out.println("# tagInside = ");
out.println();
out.println("# testFile and textFile can use multiple threads to process text.");
out.println("# nthreads = " + NTHREADS);
}
public Mode getMode() {
if (!containsKey("mode")) {
return null;
}
return Mode.valueOf(getProperty("mode"));
}
/** Serialize the TaggerConfig.
*
* @param os Where to write this TaggerConfig
* @throws IOException If any IO problems
*/
public void saveConfig(OutputStream os) throws IOException {
ObjectOutputStream out = new ObjectOutputStream(os);
out.writeObject(this);
}
/** Read in a TaggerConfig.
*
* @param stream Where to read from
* @return The TaggerConfig
* @throws IOException Misc IOError
* @throws ClassNotFoundException Class error
*/
public static TaggerConfig readConfig(DataInputStream stream)
throws IOException, ClassNotFoundException
{
ObjectInputStream in = new ObjectInputStream(stream);
return (TaggerConfig) in.readObject();
}
}