package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.trees.CompositeTreeTransformer;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TreeTransformer;
import java.util.function.Function;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.ReflectionLoading;
import edu.stanford.nlp.util.StringUtils;
import java.io.*;
import java.util.*;

/**
 * This class contains options to the parser which MUST be the SAME at
 * both training and testing (parsing) time in order for the parser to
 * work properly. It also contains an object which stores the options
 * used by the parser at training time and an object which contains
 * default options for test use.
 *
 * @author Dan Klein
 * @author Christopher Manning
 * @author John Bauer
 */
public class Options implements Serializable {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(Options.class);

  public Options() {
    this(new EnglishTreebankParserParams());
  }

  public Options(TreebankLangParserParams tlpParams) {
    this.tlpParams = tlpParams;
  }

  /**
   * Set options based on a String array in the style of
   * command-line flags. This method goes through the array until it ends,
   * processing options, as for {@link #setOption}.
   *
   * @param flags Array of options (or a varargs list of arguments).
   *     The options passed in should be specified like command-line arguments,
   *     including an initial minus sign; for example,
   *     {"-outputFormat", "typedDependencies", "-maxLength", "70"}
   * @throws IllegalArgumentException If an unknown flag is passed in
   */
  public void setOptions(String... flags) {
    setOptions(flags, 0, flags.length);
  }

  /**
   * Set options based on a String array in the style of
   * command-line flags. This method goes through the array until it ends,
   * processing options, as for {@link #setOption}.
   *
   * @param flags Array of options. The options passed in should be specified
   *     like command-line arguments, including an initial minus sign; for example,
   *     {"-outputFormat", "typedDependencies", "-maxLength", "70"}
   * @param startIndex The index in the array to begin processing options at
   * @param endIndexPlusOne A number one greater than the last array index at
   *     which options should be processed
   * @throws IllegalArgumentException If an unknown flag is passed in
   */
  public void setOptions(final String[] flags, final int startIndex, final int endIndexPlusOne) {
    for (int i = startIndex; i < endIndexPlusOne;) {
      i = setOption(flags, i);
    }
  }

  /**
   * Set options based on a String array in the style of
   * command-line flags. This method goes through the array until it ends,
   * processing options, as for {@link #setOption}.
   *
   * @param flags Array of options (or a varargs list of arguments).
   *     The options passed in should be specified like command-line arguments,
   *     including an initial minus sign; for example,
   *     {"-outputFormat", "typedDependencies", "-maxLength", "70"}
   * @throws IllegalArgumentException If an unknown flag is passed in
   */
  public void setOptionsOrWarn(String... flags) {
    setOptionsOrWarn(flags, 0, flags.length);
  }

  /**
   * Set options based on a String array in the style of
   * command-line flags. This method goes through the array until it ends,
   * processing options, as for {@link #setOption}.
   *
   * @param flags Array of options. The options passed in should be specified
   *     like command-line arguments, including an initial minus sign; for example,
   *     {"-outputFormat", "typedDependencies", "-maxLength", "70"}
   * @param startIndex The index in the array to begin processing options at
   * @param endIndexPlusOne A number one greater than the last array index at
   *     which options should be processed
   * @throws IllegalArgumentException If an unknown flag is passed in
   */
  public void setOptionsOrWarn(final String[] flags, final int startIndex, final int endIndexPlusOne) {
    for (int i = startIndex; i < endIndexPlusOne;) {
      i = setOptionOrWarn(flags, i);
    }
  }

  /**
   * Set an option based on a String array in the style of
   * command-line flags. The option may
   * be either one known by the Options object, or one recognized by the
   * TreebankLangParserParams which has already been set up inside the Options
   * object, and then the option is set in the language-particular
   * TreebankLangParserParams.
   * Note that despite this method being an instance method, many flags
   * are actually set as static class variables in the Train and Test
   * classes (this should be fixed some day).
   * Some options (there are many others; see the source code):
   * <ul>
   * <li> <code>-maxLength n</code> set the maximum length sentence to parse (inclusively)
   * <li> <code>-printTT</code> print the training trees in raw, annotated, and annotated+binarized form. Useful for debugging and other miscellany.
   * <li> <code>-printAnnotated filename</code> use only in conjunction with -printTT. Redirects printing of annotated training trees to <code>filename</code>.
   * <li> <code>-forceTags</code> when the parser is tested against a set of gold standard trees, use the tagged yield, instead of just the yield, as input.
   * </ul>
   *
   * @param flags An array of options arguments, command-line style. E.g. {"-maxLength", "50"}.
   * @param i The index in flags to start at when processing an option
   * @return The index in flags of the position after the last element used in
   *     processing this option. If the current array position cannot be processed as a valid
   *     option, then a warning message is printed to stderr and the return value is <code>i+1</code>
   */
  public int setOptionOrWarn(String[] flags, int i) {
    int j = setOptionFlag(flags, i);
    if (j == i) {
      j = tlpParams.setOptionFlag(flags, i);
    }
    if (j == i) {
      log.info("WARNING! lexparser.Options: Unknown option ignored: " + flags[i]);
      j++;
    }
    return j;
  }

  /**
   * Set an option based on a String array in the style of
   * command-line flags. The option may
   * be either one known by the Options object, or one recognized by the
   * TreebankLangParserParams which has already been set up inside the Options
   * object, and then the option is set in the language-particular
   * TreebankLangParserParams.
   * Note that despite this method being an instance method, many flags
   * are actually set as static class variables in the Train and Test
   * classes (this should be fixed some day).
   * Some options (there are many others; see the source code):
   * <ul>
   * <li> <code>-maxLength n</code> set the maximum length sentence to parse (inclusively)
   * <li> <code>-printTT</code> print the training trees in raw, annotated, and annotated+binarized form. Useful for debugging and other miscellany.
   * <li> <code>-printAnnotated filename</code> use only in conjunction with -printTT. Redirects printing of annotated training trees to <code>filename</code>.
   * <li> <code>-forceTags</code> when the parser is tested against a set of gold standard trees, use the tagged yield, instead of just the yield, as input.
   * </ul>
   *
   * @param flags An array of options arguments, command-line style. E.g. {"-maxLength", "50"}.
   * @param i The index in flags to start at when processing an option
   * @return The index in flags of the position after the last element used in
   *     processing this option.
   * @throws IllegalArgumentException If the current array position cannot be
   *     processed as a valid option
   */
  public int setOption(String[] flags, int i) {
    int j = setOptionFlag(flags, i);
    if (j == i) {
      j = tlpParams.setOptionFlag(flags, i);
    }
    if (j == i) {
      throw new IllegalArgumentException("Unknown option: " + flags[i]);
    }
    return j;
  }

  /**
   * Set an option in this object, based on a String array in the style of
   * command-line flags. The option is only processed with respect to
   * options directly known by the Options object.
   * Some options (there are many others; see the source code):
   * <ul>
   * <li> <code>-maxLength n</code> set the maximum length sentence to parse (inclusively)
   * <li> <code>-printTT</code> print the training trees in raw, annotated, and annotated+binarized form. Useful for debugging and other miscellany.
   * <li> <code>-printAnnotated filename</code> use only in conjunction with -printTT. Redirects printing of annotated training trees to <code>filename</code>.
   * <li> <code>-forceTags</code> when the parser is tested against a set of gold standard trees, use the tagged yield, instead of just the yield, as input.
   * </ul>
   *
   * @param args An array of options arguments, command-line style. E.g. {"-maxLength", "50"}.
   * @param i The index in args to start at when processing an option
   * @return The index in args of the position after the last element used in
   *     processing this option, or the value i unchanged if a valid option couldn't
   *     be processed starting at position i.
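   *
   *     <p>A hypothetical sketch of this index contract (flag values are
   *     illustrative only; the call is shown as it would look from a subclass
   *     or same-package caller, since this method is protected):
   *     <pre>{@code
   *     Options op = new Options();
   *     String[] flags = { "-maxLength", "70", "-someUnknownFlag" };
   *     int next = op.setOptionFlag(flags, 0);     // consumes "-maxLength" and "70"; returns 2
   *     int same = op.setOptionFlag(flags, next);  // flag not recognized here; returns next unchanged
   *     }</pre>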
*/ protected int setOptionFlag(String[] args, int i) { if (args[i].equalsIgnoreCase("-PCFG")) { doDep = false; doPCFG = true; i++; } else if (args[i].equalsIgnoreCase("-dep")) { doDep = true; doPCFG = false; i++; } else if (args[i].equalsIgnoreCase("-factored")) { doDep = true; doPCFG = true; testOptions.useFastFactored = false; i++; } else if (args[i].equalsIgnoreCase("-fastFactored")) { doDep = true; doPCFG = true; testOptions.useFastFactored = true; i++; } else if (args[i].equalsIgnoreCase("-noRecoveryTagging")) { testOptions.noRecoveryTagging = true; i++; } else if (args[i].equalsIgnoreCase("-useLexiconToScoreDependencyPwGt")) { testOptions.useLexiconToScoreDependencyPwGt = true; i++; } else if (args[i].equalsIgnoreCase("-useSmoothTagProjection")) { useSmoothTagProjection = true; i++; } else if (args[i].equalsIgnoreCase("-useUnigramWordSmoothing")) { useUnigramWordSmoothing = true; i++; } else if (args[i].equalsIgnoreCase("-useNonProjectiveDependencyParser")) { testOptions.useNonProjectiveDependencyParser = true; i++; } else if (args[i].equalsIgnoreCase("-maxLength") && (i + 1 < args.length)) { testOptions.maxLength = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-MAX_ITEMS") && (i + 1 < args.length)) { testOptions.MAX_ITEMS = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-trainLength") && (i + 1 < args.length)) { // train on only short sentences trainOptions.trainLengthLimit = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-lengthNormalization")) { testOptions.lengthNormalization = true; i++; } else if (args[i].equalsIgnoreCase("-iterativeCKY")) { testOptions.iterativeCKY = true; i++; } else if (args[i].equalsIgnoreCase("-vMarkov") && (i + 1 < args.length)) { int order = Integer.parseInt(args[i + 1]); if (order <= 1) { trainOptions.PA = false; trainOptions.gPA = false; } else if (order == 2) { trainOptions.PA = true; trainOptions.gPA = false; } else if (order >= 3) { trainOptions.PA = true; trainOptions.gPA = true; } i += 2; } else if (args[i].equalsIgnoreCase("-vSelSplitCutOff") && (i + 1 < args.length)) { trainOptions.selectiveSplitCutOff = Double.parseDouble(args[i + 1]); trainOptions.selectiveSplit = trainOptions.selectiveSplitCutOff > 0.0; i += 2; } else if (args[i].equalsIgnoreCase("-vSelPostSplitCutOff") && (i + 1 < args.length)) { trainOptions.selectivePostSplitCutOff = Double.parseDouble(args[i + 1]); trainOptions.selectivePostSplit = trainOptions.selectivePostSplitCutOff > 0.0; i += 2; } else if (args[i].equalsIgnoreCase("-deleteSplitters") && (i+1 < args.length)) { String[] toDel = args[i+1].split(" *, *"); trainOptions.deleteSplitters = Generics.newHashSet(Arrays.asList(toDel)); i += 2; } else if (args[i].equalsIgnoreCase("-postSplitWithBaseCategory")) { trainOptions.postSplitWithBaseCategory = true; i += 1; } else if (args[i].equalsIgnoreCase("-vPostMarkov") && (i + 1 < args.length)) { int order = Integer.parseInt(args[i + 1]); if (order <= 1) { trainOptions.postPA = false; trainOptions.postGPA = false; } else if (order == 2) { trainOptions.postPA = true; trainOptions.postGPA = false; } else if (order >= 3) { trainOptions.postPA = true; trainOptions.postGPA = true; } i += 2; } else if (args[i].equalsIgnoreCase("-hMarkov") && (i + 1 < args.length)) { int order = Integer.parseInt(args[i + 1]); if (order >= 0) { trainOptions.markovOrder = order; trainOptions.markovFactor = true; } else { trainOptions.markovFactor = false; } i += 2; } else if (args[i].equalsIgnoreCase("-distanceBins") && 
(i + 1 < args.length)) { int numBins = Integer.parseInt(args[i + 1]); if (numBins <= 1) { distance = false; } else if (numBins == 4) { distance = true; coarseDistance = true; } else if (numBins == 5) { distance = true; coarseDistance = false; } else { throw new IllegalArgumentException("Invalid value for -distanceBin: " + args[i+1]); } i += 2; } else if (args[i].equalsIgnoreCase("-noStop")) { genStop = false; i++; } else if (args[i].equalsIgnoreCase("-nonDirectional")) { directional = false; i++; } else if (args[i].equalsIgnoreCase("-depWeight") && (i + 1 < args.length)) { testOptions.depWeight = Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-printPCFGkBest") && (i + 1 < args.length)) { testOptions.printPCFGkBest = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-evalPCFGkBest") && (i + 1 < args.length)) { testOptions.evalPCFGkBest = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-printFactoredKGood") && (i + 1 < args.length)) { testOptions.printFactoredKGood = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-smoothTagsThresh") && (i + 1 < args.length)) { lexOptions.smoothInUnknownsThreshold = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-unseenSmooth") && (i + 1 < args.length)) { testOptions.unseenSmooth = Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-fractionBeforeUnseenCounting") && (i + 1 < args.length)) { trainOptions.fractionBeforeUnseenCounting = Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-hSelSplitThresh") && (i + 1 < args.length)) { trainOptions.HSEL_CUT = Integer.parseInt(args[i + 1]); trainOptions.hSelSplit = trainOptions.HSEL_CUT > 0; i += 2; } else if (args[i].equalsIgnoreCase("-nohSelSplit")) { trainOptions.hSelSplit = false; i += 1; } else if (args[i].equalsIgnoreCase("-tagPA")) { trainOptions.tagPA = true; i += 1; } else if (args[i].equalsIgnoreCase("-noTagPA")) { trainOptions.tagPA = false; i += 1; } else if (args[i].equalsIgnoreCase("-tagSelSplitCutOff") && (i + 1 < args.length)) { trainOptions.tagSelectiveSplitCutOff = Double.parseDouble(args[i + 1]); trainOptions.tagSelectiveSplit = trainOptions.tagSelectiveSplitCutOff > 0.0; i += 2; } else if (args[i].equalsIgnoreCase("-tagSelPostSplitCutOff") && (i + 1 < args.length)) { trainOptions.tagSelectivePostSplitCutOff = Double.parseDouble(args[i + 1]); trainOptions.tagSelectivePostSplit = trainOptions.tagSelectivePostSplitCutOff > 0.0; i += 2; } else if (args[i].equalsIgnoreCase("-noTagSplit")) { trainOptions.noTagSplit = true; i += 1; } else if (args[i].equalsIgnoreCase("-uwm") && (i + 1 < args.length)) { lexOptions.useUnknownWordSignatures = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-unknownSuffixSize") && (i + 1 < args.length)) { lexOptions.unknownSuffixSize = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-unknownPrefixSize") && (i + 1 < args.length)) { lexOptions.unknownPrefixSize = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-uwModelTrainer") && (i + 1 < args.length)) { lexOptions.uwModelTrainer = args[i+1]; i += 2; } else if (args[i].equalsIgnoreCase("-openClassThreshold") && (i + 1 < args.length)) { trainOptions.openClassTypesThreshold = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-unary") && i+1 < args.length) { trainOptions.markUnary = Integer.parseInt(args[i+1]); i += 2; } else 
if (args[i].equalsIgnoreCase("-unaryTags")) { trainOptions.markUnaryTags = true; i += 1; } else if (args[i].equalsIgnoreCase("-mutate")) { lexOptions.smartMutation = true; i += 1; } else if (args[i].equalsIgnoreCase("-useUnicodeType")) { lexOptions.useUnicodeType = true; i += 1; } else if (args[i].equalsIgnoreCase("-rightRec")) { trainOptions.rightRec = true; i += 1; } else if (args[i].equalsIgnoreCase("-noRightRec")) { trainOptions.rightRec = false; i += 1; } else if (args[i].equalsIgnoreCase("-preTag")) { testOptions.preTag = true; i += 1; } else if (args[i].equalsIgnoreCase("-forceTags")) { testOptions.forceTags = true; i += 1; } else if (args[i].equalsIgnoreCase("-taggerSerializedFile")) { testOptions.taggerSerializedFile = args[i+1]; i += 2; } else if (args[i].equalsIgnoreCase("-forceTagBeginnings")) { testOptions.forceTagBeginnings = true; i += 1; } else if (args[i].equalsIgnoreCase("-noFunctionalForcing")) { testOptions.noFunctionalForcing = true; i += 1; } else if (args[i].equalsIgnoreCase("-scTags")) { dcTags = false; i += 1; } else if (args[i].equalsIgnoreCase("-dcTags")) { dcTags = true; i += 1; } else if (args[i].equalsIgnoreCase("-basicCategoryTagsInDependencyGrammar")) { trainOptions.basicCategoryTagsInDependencyGrammar = true; i+= 1; } else if (args[i].equalsIgnoreCase("-evalb")) { testOptions.evalb = true; i += 1; } else if (args[i].equalsIgnoreCase("-v") || args[i].equalsIgnoreCase("-verbose")) { testOptions.verbose = true; i += 1; } else if (args[i].equalsIgnoreCase("-outputFilesDirectory") && i+1 < args.length) { testOptions.outputFilesDirectory = args[i+1]; i += 2; } else if (args[i].equalsIgnoreCase("-outputFilesExtension") && i+1 < args.length) { testOptions.outputFilesExtension = args[i+1]; i += 2; } else if (args[i].equalsIgnoreCase("-outputFilesPrefix") && i+1 < args.length) { testOptions.outputFilesPrefix = args[i+1]; i += 2; } else if (args[i].equalsIgnoreCase("-outputkBestEquivocation") && i+1 < args.length) { testOptions.outputkBestEquivocation = args[i+1]; i += 2; } else if (args[i].equalsIgnoreCase("-writeOutputFiles")) { testOptions.writeOutputFiles = true; i += 1; } else if (args[i].equalsIgnoreCase("-printAllBestParses")) { testOptions.printAllBestParses = true; i += 1; } else if (args[i].equalsIgnoreCase("-outputTreeFormat") || args[i].equalsIgnoreCase("-outputFormat")) { testOptions.outputFormat = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-outputTreeFormatOptions") || args[i].equalsIgnoreCase("-outputFormatOptions")) { testOptions.outputFormatOptions = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-addMissingFinalPunctuation")) { testOptions.addMissingFinalPunctuation = true; i += 1; } else if (args[i].equalsIgnoreCase("-flexiTag")) { lexOptions.flexiTag = true; i += 1; } else if (args[i].equalsIgnoreCase("-lexiTag")) { lexOptions.flexiTag = false; i += 1; } else if (args[i].equalsIgnoreCase("-useSignatureForKnownSmoothing")) { lexOptions.useSignatureForKnownSmoothing = true; i += 1; } else if (args[i].equalsIgnoreCase("-wordClassesFile")) { lexOptions.wordClassesFile = args[i+1]; i += 2; } else if (args[i].equalsIgnoreCase("-compactGrammar")) { trainOptions.compactGrammar = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-markFinalStates")) { trainOptions.markFinalStates = args[i + 1].equalsIgnoreCase("true"); i += 2; } else if (args[i].equalsIgnoreCase("-leftToRight")) { trainOptions.leftToRight = args[i + 1].equals("true"); i += 2; } else if (args[i].equalsIgnoreCase("-cnf")) { forceCNF = true; i 
+= 1; } else if(args[i].equalsIgnoreCase("-smoothRules")) { trainOptions.ruleSmoothing = true; trainOptions.ruleSmoothingAlpha = Double.valueOf(args[i+1]); i += 2; } else if (args[i].equalsIgnoreCase("-nodePrune") && i+1 < args.length) { nodePrune = args[i+1].equalsIgnoreCase("true"); i += 2; } else if (args[i].equalsIgnoreCase("-noDoRecovery")) { testOptions.doRecovery = false; i += 1; } else if (args[i].equalsIgnoreCase("-acl03chinese")) { trainOptions.markovOrder = 1; trainOptions.markovFactor = true; // no increment } else if (args[i].equalsIgnoreCase("-wordFunction")) { wordFunction = ReflectionLoading.loadByReflection(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-acl03pcfg")) { doDep = false; doPCFG = true; // lexOptions.smoothInUnknownsThreshold = 30; trainOptions.markUnary = 1; trainOptions.PA = true; trainOptions.gPA = false; trainOptions.tagPA = true; trainOptions.tagSelectiveSplit = false; trainOptions.rightRec = true; trainOptions.selectiveSplit = true; trainOptions.selectiveSplitCutOff = 400.0; trainOptions.markovFactor = true; trainOptions.markovOrder = 2; trainOptions.hSelSplit = true; lexOptions.useUnknownWordSignatures = 2; lexOptions.flexiTag = true; // DAN: Tag double-counting is BAD for PCFG-only parsing dcTags = false; // don't increment i so it gets language specific stuff as well } else if (args[i].equalsIgnoreCase("-jenny")) { doDep = false; doPCFG = true; // lexOptions.smoothInUnknownsThreshold = 30; trainOptions.markUnary = 1; trainOptions.PA = false; trainOptions.gPA = false; trainOptions.tagPA = false; trainOptions.tagSelectiveSplit = false; trainOptions.rightRec = true; trainOptions.selectiveSplit = false; // trainOptions.selectiveSplitCutOff = 400.0; trainOptions.markovFactor = false; // trainOptions.markovOrder = 2; trainOptions.hSelSplit = false; lexOptions.useUnknownWordSignatures = 2; lexOptions.flexiTag = true; // DAN: Tag double-counting is BAD for PCFG-only parsing dcTags = false; // don't increment i so it gets language specific stuff as well } else if (args[i].equalsIgnoreCase("-goodPCFG")) { doDep = false; doPCFG = true; // op.lexOptions.smoothInUnknownsThreshold = 30; trainOptions.markUnary = 1; trainOptions.PA = true; trainOptions.gPA = false; trainOptions.tagPA = true; trainOptions.tagSelectiveSplit = false; trainOptions.rightRec = true; trainOptions.selectiveSplit = true; trainOptions.selectiveSplitCutOff = 400.0; trainOptions.markovFactor = true; trainOptions.markovOrder = 2; trainOptions.hSelSplit = true; lexOptions.useUnknownWordSignatures = 2; lexOptions.flexiTag = true; // DAN: Tag double-counting is BAD for PCFG-only parsing dcTags = false; String[] delSplit = { "-deleteSplitters", "VP^NP,VP^VP,VP^SINV,VP^SQ" }; if (this.setOptionFlag(delSplit, 0) != 2) { log.info("Error processing deleteSplitters"); } // don't increment i so it gets language specific stuff as well } else if (args[i].equalsIgnoreCase("-linguisticPCFG")) { doDep = false; doPCFG = true; // op.lexOptions.smoothInUnknownsThreshold = 30; trainOptions.markUnary = 1; trainOptions.PA = true; trainOptions.gPA = false; trainOptions.tagPA = true; // on at the moment, but iffy trainOptions.tagSelectiveSplit = false; trainOptions.rightRec = false; // not for linguistic trainOptions.selectiveSplit = true; trainOptions.selectiveSplitCutOff = 400.0; trainOptions.markovFactor = true; trainOptions.markovOrder = 2; trainOptions.hSelSplit = true; lexOptions.useUnknownWordSignatures = 5; // different from acl03pcfg lexOptions.flexiTag = false; // different from acl03pcfg // DAN: 
Tag double-counting is BAD for PCFG-only parsing dcTags = false; // don't increment i so it gets language specific stuff as well } else if (args[i].equalsIgnoreCase("-ijcai03")) { doDep = true; doPCFG = true; trainOptions.markUnary = 0; trainOptions.PA = true; trainOptions.gPA = false; trainOptions.tagPA = false; trainOptions.tagSelectiveSplit = false; trainOptions.rightRec = false; trainOptions.selectiveSplit = true; trainOptions.selectiveSplitCutOff = 300.0; trainOptions.markovFactor = true; trainOptions.markovOrder = 2; trainOptions.hSelSplit = true; trainOptions.compactGrammar = 0; /// cdm: May 2005 compacting bad for factored? lexOptions.useUnknownWordSignatures = 2; lexOptions.flexiTag = false; dcTags = true; // op.nodePrune = true; // cdm: May 2005: this doesn't help // don't increment i so it gets language specific stuff as well } else if (args[i].equalsIgnoreCase("-goodFactored")) { doDep = true; doPCFG = true; trainOptions.markUnary = 0; trainOptions.PA = true; trainOptions.gPA = false; trainOptions.tagPA = false; trainOptions.tagSelectiveSplit = false; trainOptions.rightRec = false; trainOptions.selectiveSplit = true; trainOptions.selectiveSplitCutOff = 300.0; trainOptions.markovFactor = true; trainOptions.markovOrder = 2; trainOptions.hSelSplit = true; trainOptions.compactGrammar = 0; /// cdm: May 2005 compacting bad for factored? lexOptions.useUnknownWordSignatures = 5; // different from ijcai03 lexOptions.flexiTag = false; dcTags = true; // op.nodePrune = true; // cdm: May 2005: this doesn't help // don't increment i so it gets language specific stuff as well } else if (args[i].equalsIgnoreCase("-chineseFactored")) { // Single counting tag->word rewrite is also much better for Chinese // Factored. Bracketing F1 goes up about 0.7%. dcTags = false; lexOptions.useUnicodeType = true; trainOptions.markovOrder = 2; trainOptions.hSelSplit = true; trainOptions.markovFactor = true; trainOptions.HSEL_CUT = 50; // trainOptions.openClassTypesThreshold=1; // so can get unseen punctuation // trainOptions.fractionBeforeUnseenCounting=0.0; // so can get unseen punctuation // don't increment i so it gets language specific stuff as well } else if (args[i].equalsIgnoreCase("-arabicFactored")) { doDep = true; doPCFG = true; dcTags = false; // "false" seems to help Arabic about 0.1% F1 trainOptions.markovFactor = true; trainOptions.markovOrder = 2; trainOptions.hSelSplit = true; trainOptions.HSEL_CUT = 75; // 75 bit better than 50, 100 a bit worse trainOptions.PA = true; trainOptions.gPA = false; trainOptions.selectiveSplit = true; trainOptions.selectiveSplitCutOff = 300.0; trainOptions.markUnary = 1; // Helps PCFG and marginally factLB // trainOptions.compactGrammar = 0; // Doesn't seem to help or only 0.05% F1 lexOptions.useUnknownWordSignatures = 9; lexOptions.unknownPrefixSize = 1; lexOptions.unknownSuffixSize = 1; testOptions.MAX_ITEMS = 500000; // Arabic sentences are long enough that this helps a fraction // don't increment i so it gets language specific stuff as well } else if (args[i].equalsIgnoreCase("-frenchFactored")) { doDep = true; doPCFG = true; dcTags = false; //wsg2011: Setting to false improves F1 by 0.5% trainOptions.markovFactor = true; trainOptions.markovOrder = 2; trainOptions.hSelSplit = true; trainOptions.HSEL_CUT = 75; trainOptions.PA = true; trainOptions.gPA = false; trainOptions.selectiveSplit = true; trainOptions.selectiveSplitCutOff = 300.0; trainOptions.markUnary = 0; //Unary rule marking bad for french..setting to 0 gives +0.3 F1 lexOptions.useUnknownWordSignatures 
= 1; lexOptions.unknownPrefixSize = 1; lexOptions.unknownSuffixSize = 2; } else if (args[i].equalsIgnoreCase("-chinesePCFG")) { trainOptions.markovOrder = 2; trainOptions.markovFactor = true; trainOptions.HSEL_CUT = 5; trainOptions.PA = true; trainOptions.gPA = true; trainOptions.selectiveSplit = false; doDep = false; doPCFG = true; // Single counting tag->word rewrite is also much better for Chinese PCFG // Bracketing F1 is up about 2% and tag accuracy about 1% (exact by 6%) dcTags = false; // no increment } else if (args[i].equalsIgnoreCase("-printTT") && (i+1 < args.length)) { trainOptions.printTreeTransformations = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-printAnnotatedRuleCounts")) { trainOptions.printAnnotatedRuleCounts = true; i++; } else if (args[i].equalsIgnoreCase("-printAnnotatedStateCounts")) { trainOptions.printAnnotatedStateCounts = true; i++; } else if (args[i].equalsIgnoreCase("-printAnnotated") && (i + 1 < args.length)) { try { trainOptions.printAnnotatedPW = tlpParams.pw(new FileOutputStream(args[i + 1])); } catch (IOException ioe) { trainOptions.printAnnotatedPW = null; } i += 2; } else if (args[i].equalsIgnoreCase("-printBinarized") && (i + 1 < args.length)) { try { trainOptions.printBinarizedPW = tlpParams.pw(new FileOutputStream(args[i + 1])); } catch (IOException ioe) { trainOptions.printBinarizedPW = null; } i += 2; } else if (args[i].equalsIgnoreCase("-printStates")) { trainOptions.printStates = true; i++; } else if (args[i].equalsIgnoreCase("-preTransformer") && (i + 1 < args.length)) { String[] classes = args[i + 1].split(","); i += 2; if (classes.length == 1) { trainOptions.preTransformer = ReflectionLoading.loadByReflection(classes[0], this); } else if (classes.length > 1) { CompositeTreeTransformer composite = new CompositeTreeTransformer(); trainOptions.preTransformer = composite; for (String clazz : classes) { TreeTransformer transformer = ReflectionLoading.loadByReflection(clazz, this); composite.addTransformer(transformer); } } } else if (args[i].equalsIgnoreCase("-taggedFiles") && (i + 1 < args.length)) { trainOptions.taggedFiles = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-predictSplits")) { // This is an experimental (and still in development) // reimplementation of Berkeley's state splitting grammar. 
trainOptions.predictSplits = true; trainOptions.compactGrammar = 0; i++; } else if (args[i].equalsIgnoreCase("-splitCount")) { trainOptions.splitCount = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-splitRecombineRate")) { trainOptions.splitRecombineRate = Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-trainingThreads") || args[i].equalsIgnoreCase("-nThreads")) { trainOptions.trainingThreads = Integer.parseInt(args[i + 1]); testOptions.testingThreads = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-testingThreads")) { testOptions.testingThreads = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-evals")) { testOptions.evals = StringUtils.stringToProperties(args[i+1], testOptions.evals); i += 2; } else if (args[i].equalsIgnoreCase("-fastFactoredCandidateMultiplier")) { testOptions.fastFactoredCandidateMultiplier = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-fastFactoredCandidateAddend")) { testOptions.fastFactoredCandidateAddend = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-quietEvaluation")) { testOptions.quietEvaluation = true; i += 1; } else if (args[i].equalsIgnoreCase("-noquietEvaluation")) { testOptions.quietEvaluation = false; i += 1; } else if (args[i].equalsIgnoreCase("-simpleBinarizedLabels")) { trainOptions.simpleBinarizedLabels = true; i += 1; } else if (args[i].equalsIgnoreCase("-noRebinarization")) { trainOptions.noRebinarization = true; i += 1; } else if (args[i].equalsIgnoreCase("-dvKBest")) { trainOptions.dvKBest = Integer.parseInt(args[i + 1]); rerankerKBest = trainOptions.dvKBest; i += 2; } else if (args[i].equalsIgnoreCase("-regCost")) { trainOptions.regCost = Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-dvIterations") || args[i].equalsIgnoreCase("-trainingIterations")) { trainOptions.trainingIterations = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-stalledIterationLimit")) { trainOptions.stalledIterationLimit = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-dvBatchSize") || args[i].equalsIgnoreCase("-batchSize")) { trainOptions.batchSize = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-qnIterationsPerBatch")) { trainOptions.qnIterationsPerBatch = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-qnEstimates")) { trainOptions.qnEstimates = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-qnTolerance")) { trainOptions.qnTolerance = Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-debugOutputFrequency")) { trainOptions.debugOutputFrequency = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-maxTrainTimeSeconds")) { trainOptions.maxTrainTimeSeconds = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-dvSeed") || args[i].equalsIgnoreCase("-randomSeed")) { trainOptions.randomSeed = Long.parseLong(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-wordVectorFile")) { lexOptions.wordVectorFile = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-numHid")) { lexOptions.numHid = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-learningRate")) { trainOptions.learningRate = Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-deltaMargin")) { trainOptions.deltaMargin = 
Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-unknownNumberVector")) { trainOptions.unknownNumberVector = true; i += 1; } else if (args[i].equalsIgnoreCase("-noUnknownNumberVector")) { trainOptions.unknownNumberVector = false; i += 1; } else if (args[i].equalsIgnoreCase("-unknownDashedWordVectors")) { trainOptions.unknownDashedWordVectors = true; i += 1; } else if (args[i].equalsIgnoreCase("-noUnknownDashedWordVectors")) { trainOptions.unknownDashedWordVectors = false; i += 1; } else if (args[i].equalsIgnoreCase("-unknownCapsVector")) { trainOptions.unknownCapsVector = true; i += 1; } else if (args[i].equalsIgnoreCase("-noUnknownCapsVector")) { trainOptions.unknownCapsVector = false; i += 1; } else if (args[i].equalsIgnoreCase("-unknownChineseYearVector")) { trainOptions.unknownChineseYearVector = true; i += 1; } else if (args[i].equalsIgnoreCase("-noUnknownChineseYearVector")) { trainOptions.unknownChineseYearVector = false; i += 1; } else if (args[i].equalsIgnoreCase("-unknownChineseNumberVector")) { trainOptions.unknownChineseNumberVector = true; i += 1; } else if (args[i].equalsIgnoreCase("-noUnknownChineseNumberVector")) { trainOptions.unknownChineseNumberVector = false; i += 1; } else if (args[i].equalsIgnoreCase("-unknownChinesePercentVector")) { trainOptions.unknownChinesePercentVector = true; i += 1; } else if (args[i].equalsIgnoreCase("-noUnknownChinesePercentVector")) { trainOptions.unknownChinesePercentVector = false; i += 1; } else if (args[i].equalsIgnoreCase("-dvSimplifiedModel")) { trainOptions.dvSimplifiedModel = true; i += 1; } else if (args[i].equalsIgnoreCase("-scalingForInit")) { trainOptions.scalingForInit = Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-rerankerKBest")) { rerankerKBest = Integer.parseInt(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-baseParserWeight")) { baseParserWeight = Double.parseDouble(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-unkWord")) { trainOptions.unkWord = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-lowercaseWordVectors")) { trainOptions.lowercaseWordVectors = true; i += 1; } else if (args[i].equalsIgnoreCase("-noLowercaseWordVectors")) { trainOptions.lowercaseWordVectors = false; i += 1; } else if (args[i].equalsIgnoreCase("-transformMatrixType")) { trainOptions.transformMatrixType = TrainOptions.TransformMatrixType.valueOf(args[i + 1]); i += 2; } else if (args[i].equalsIgnoreCase("-useContextWords")) { trainOptions.useContextWords = true; i += 1; } else if (args[i].equalsIgnoreCase("-noUseContextWords")) { trainOptions.useContextWords = false; i += 1; } else if (args[i].equalsIgnoreCase("-trainWordVectors")) { trainOptions.trainWordVectors = true; i += 1; } else if (args[i].equalsIgnoreCase("-noTrainWordVectors")) { trainOptions.trainWordVectors = false; i += 1; } else if (args[i].equalsIgnoreCase("-markStrahler")) { trainOptions.markStrahler = true; i += 1; } return i; } public static class LexOptions implements Serializable { /** * Whether to use suffix and capitalization information for unknowns. * Within the BaseLexicon model options have the following meaning: * 0 means a single unknown token. 1 uses suffix, and capitalization. * 2 uses a variant (richer) form of signature. Good. * Use this one. Using the richer signatures in versions 3 or 4 seems * to have very marginal or no positive value. * 3 uses a richer form of signature that mimics the NER word type * patterns. 4 is a variant of 2. 
     * 5 is another with more English-specific morphology (good for English unknowns!).
     * 6-9 are options for Arabic. 9 codes some patterns for numbers and
     * derivational morphology, but also supports unknownPrefixSize and
     * unknownSuffixSize.
     * For German, 0 means a single unknown token, and non-zero means to use
     * capitalization of the first letter and a suffix of length
     * unknownSuffixSize.
     */
    public int useUnknownWordSignatures = 0;

    /**
     * RS: file for Turian's word vectors.
     * The default value is an example of size 25 word vectors on the nlp machines.
     */
    public static final String DEFAULT_WORD_VECTOR_FILE = "/scr/nlp/deeplearning/datasets/turian/embeddings-scaled.EMBEDDING_SIZE=25.txt";
    public String wordVectorFile = DEFAULT_WORD_VECTOR_FILE;

    /**
     * Number of hidden units in the word vectors. A setting of 0
     * will make it try to extract the size from the data file.
     */
    public int numHid = 0;

    /**
     * Words more common than this are tagged with MLE P(t|w). Default 100. The
     * smoothing is sufficiently slight that changing this has little effect.
     * But set this to 0 to be able to use the parser as a vanilla PCFG with
     * no smoothing (not as a practical parser but for exposition or debugging).
     */
    public int smoothInUnknownsThreshold = 100;

    /**
     * Smarter smoothing for rare words.
     */
    public boolean smartMutation = false;

    /**
     * Make use of unicode code point types in smoothing.
     */
    public boolean useUnicodeType = false;

    /** For certain Lexicons, a certain number of word-final letters are
     *  used to subclassify the unknown token. This gives the number of
     *  letters.
     */
    public int unknownSuffixSize = 1;

    /** For certain Lexicons, a certain number of word-initial letters are
     *  used to subclassify the unknown token. This gives the number of
     *  letters.
     */
    public int unknownPrefixSize = 1;

    /**
     * Model for unknown words that the lexicon should use. This is the
     * name of a class.
     */
    public String uwModelTrainer; // = null;

    /* If this option is false, then all words that were seen in the training
     * data (even once) are constrained to only have seen tags. That is,
     * mle is used for the lexicon.
     * If this option is true, then if a word has been seen more than
     * smoothInUnknownsThreshold, then it will still only get tags with which
     * it has been seen, but rarer words will get all tags for which the
     * unknown word model (or smart mutation) does not give a score of -Inf.
     * This will normally be all open class tags.
     * If floodTags is invoked by the parser, all other tags will also be
     * given a minimal non-zero, non-infinite probability.
     */
    public boolean flexiTag = false;

    /** Whether to use the signature, rather than just being unknown, as the prior in
     *  known word smoothing. Currently only works if turned on for English.
     */
    public boolean useSignatureForKnownSmoothing;

    /** A file of word class data which may be used for smoothing,
     *  normally instead of hand-specified signatures.
     */
    public String wordClassesFile;

    private static final long serialVersionUID = 2805351374506855632L;

    private static final String[] params = { "useUnknownWordSignatures",
                                             "smoothInUnknownsThreshold",
                                             "smartMutation",
                                             "useUnicodeType",
                                             "unknownSuffixSize",
                                             "unknownPrefixSize",
                                             "flexiTag",
                                             "useSignatureForKnownSmoothing",
                                             "wordClassesFile" };

    @Override
    public String toString() {
      return params[0] + " " + useUnknownWordSignatures + "\n"
        + params[1] + " " + smoothInUnknownsThreshold + "\n"
        + params[2] + " " + smartMutation + "\n"
        + params[3] + " " + useUnicodeType + "\n"
        + params[4] + " " + unknownSuffixSize + "\n"
        + params[5] + " " + unknownPrefixSize + "\n"
        + params[6] + " " + flexiTag + "\n"
        + params[7] + " " + useSignatureForKnownSmoothing + "\n"
        + params[8] + " " + wordClassesFile + "\n";
    }

    public void readData(BufferedReader in) throws IOException {
      for (int i = 0; i < params.length; i++) {
        String line = in.readLine();
        int idx = line.indexOf(' ');
        String key = line.substring(0, idx);
        String value = line.substring(idx + 1);
        if ( ! key.equalsIgnoreCase(params[i])) {
          log.info("Yikes!!! Expected " + params[i] + " got " + key);
        }
        switch (i) {
        case 0:
          useUnknownWordSignatures = Integer.parseInt(value);
          break;
        case 1:
          smoothInUnknownsThreshold = Integer.parseInt(value);
          break;
        case 2:
          smartMutation = Boolean.parseBoolean(value);
          break;
        case 3:
          useUnicodeType = Boolean.parseBoolean(value);
          break;
        case 4:
          unknownSuffixSize = Integer.parseInt(value);
          break;
        case 5:
          unknownPrefixSize = Integer.parseInt(value);
          break;
        case 6:
          flexiTag = Boolean.parseBoolean(value);
          break;
        case 7:
          useSignatureForKnownSmoothing = Boolean.parseBoolean(value);
          break;
        case 8:
          wordClassesFile = value;
          break;
        }
      }
    }

  } // end class LexOptions

  public LexOptions lexOptions = new LexOptions();

  /**
   * The treebank-specific parser parameters to use.
   */
  public TreebankLangParserParams tlpParams;

  /**
   * @return The treebank language pack for the treebank the parser
   *     is trained on.
   */
  public TreebankLanguagePack langpack() {
    return tlpParams.treebankLanguagePack();
  }

  /**
   * Forces parsing with a strictly CNF grammar -- unary chains are converted
   * to XP&YP symbols and back.
   */
  public boolean forceCNF = false;

  /**
   * Do a PCFG parse of the sentence. If both variables are on,
   * also do a combined parse of the sentence.
   */
  public boolean doPCFG = true;

  /**
   * Do a dependency parse of the sentence.
   */
  public boolean doDep = true;

  /**
   * If true, any child can be the head (seems rather bad!).
   */
  public boolean freeDependencies = false;

  /**
   * Whether the dependency grammar considers left/right direction. Good.
   */
  public boolean directional = true;

  public boolean genStop = true;

  public boolean useSmoothTagProjection = false;
  public boolean useUnigramWordSmoothing = false;

  /**
   * Use distance bins in the dependency calculations.
   */
  public boolean distance = true;

  /**
   * Use coarser distance (4 bins) in dependency calculations.
   */
  public boolean coarseDistance = false;

  /**
   * "Double count" tag-to-word rewrites in both the PCFG and the Dep parser. Good for
   * combined parsing only (it used to not kick in for PCFG parsing). This
   * option is only used at Test time, but it is now in Options, so the
   * correct choice for a grammar is recorded by a serialized parser.
   * You should turn this off for a vanilla PCFG parser.
   */
  public boolean dcTags = true;

  /**
   * If true, inside the factored parser, remove any node from the final
   * chosen tree which improves the PCFG score. This was added as the
   * dependency factor tends to encourage 'deep' trees.
   */
  public boolean nodePrune = false;

  public TrainOptions trainOptions = newTrainOptions();

  /** Separated out so subclasses of Options can override */
  public TrainOptions newTrainOptions() {
    return new TrainOptions();
  }

  /**
   * Note that the TestOptions is transient. This means that whatever
   * options get set at creation time are forgotten when the parser is
   * serialized. If you want an option to be remembered when the
   * parser is reloaded, put it in either TrainOptions or in this
   * class itself.
   */
  public transient TestOptions testOptions = newTestOptions();

  /** Separated out so subclasses of Options can override */
  public TestOptions newTestOptions() {
    return new TestOptions();
  }

  /**
   * A function that maps words used in training and testing to new
   * words. For example, it could be a function to lowercase text,
   * such as edu.stanford.nlp.util.LowercaseFunction (which makes the
   * parser case insensitive). This function is applied in
   * LexicalizedParserQuery.parse and in the training methods which
   * build a new parser.
   */
  public Function<String, String> wordFunction = null;

  /**
   * If the parser has a reranker, it looks at this many trees when
   * building the reranked list.
   */
  public int rerankerKBest = 100;

  /**
   * If reranking sentences, we can use the score from the original
   * parser as well. This tells us how much weight to give that score.
   */
  public double baseParserWeight = 0.0;

  /**
   * Making the TestOptions transient means it won't even be
   * constructed when you deserialize an Options, so we need to
   * construct it on our own when deserializing.
   */
  private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
    in.defaultReadObject();
    testOptions = newTestOptions();
  }

  public void display() {
    //    try {
    log.info("Options parameters:");
    writeData(new PrintWriter(System.err));
    /*    } catch (IOException e) {
      e.printStackTrace();
    }*/
  }

  public void writeData(Writer w) { //throws IOException {
    PrintWriter out = new PrintWriter(w);
    StringBuilder sb = new StringBuilder();
    sb.append(lexOptions.toString());
    sb.append("parserParams ").append(tlpParams.getClass().getName()).append("\n");
    sb.append("forceCNF ").append(forceCNF).append("\n");
    sb.append("doPCFG ").append(doPCFG).append("\n");
    sb.append("doDep ").append(doDep).append("\n");
    sb.append("freeDependencies ").append(freeDependencies).append("\n");
    sb.append("directional ").append(directional).append("\n");
    sb.append("genStop ").append(genStop).append("\n");
    sb.append("distance ").append(distance).append("\n");
    sb.append("coarseDistance ").append(coarseDistance).append("\n");
    sb.append("dcTags ").append(dcTags).append("\n");
    sb.append("nPrune ").append(nodePrune).append("\n");
    out.print(sb.toString());
    out.flush();
  }

  /**
   * Populates data in this Options from the character stream.
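   * The layout read here is assumed to mirror what {@link #writeData} produces:
   * first the {@link LexOptions} lines, then one {@code key value} line per field,
   * in the order parserParams, forceCNF (optional, kept for backwards compatibility
   * with older files), doPCFG, doDep, freeDependencies, directional, genStop,
   * distance, coarseDistance, dcTags, and nPrune, followed by a blank line.
   * A hypothetical fragment of such a stream (values are illustrative only):
   * <pre>
   * parserParams edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams
   * forceCNF false
   * doPCFG true
   * doDep true
   * </pre>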
   * @param in The Reader
   * @throws IOException If there is a problem reading data
   */
  public void readData(BufferedReader in) throws IOException {
    String line, value;
    // skip old variables if still present
    lexOptions.readData(in);
    line = in.readLine();
    value = line.substring(line.indexOf(' ') + 1);
    try {
      tlpParams = (TreebankLangParserParams) Class.forName(value).newInstance();
    } catch (Exception e) {
      IOException ioe = new IOException("Problem instantiating parserParams: " + line);
      ioe.initCause(e);
      throw ioe;
    }
    line = in.readLine();
    // ensure backwards compatibility
    if (line.matches("^forceCNF.*")) {
      value = line.substring(line.indexOf(' ') + 1);
      forceCNF = Boolean.parseBoolean(value);
      line = in.readLine();
    }
    value = line.substring(line.indexOf(' ') + 1);
    doPCFG = Boolean.parseBoolean(value);
    line = in.readLine();
    value = line.substring(line.indexOf(' ') + 1);
    doDep = Boolean.parseBoolean(value);
    line = in.readLine();
    value = line.substring(line.indexOf(' ') + 1);
    freeDependencies = Boolean.parseBoolean(value);
    line = in.readLine();
    value = line.substring(line.indexOf(' ') + 1);
    directional = Boolean.parseBoolean(value);
    line = in.readLine();
    value = line.substring(line.indexOf(' ') + 1);
    genStop = Boolean.parseBoolean(value);
    line = in.readLine();
    value = line.substring(line.indexOf(' ') + 1);
    distance = Boolean.parseBoolean(value);
    line = in.readLine();
    value = line.substring(line.indexOf(' ') + 1);
    coarseDistance = Boolean.parseBoolean(value);
    line = in.readLine();
    value = line.substring(line.indexOf(' ') + 1);
    dcTags = Boolean.parseBoolean(value);
    line = in.readLine();
    if ( ! line.matches("^nPrune.*")) {
      throw new RuntimeException("Expected nPrune, found: " + line);
    }
    value = line.substring(line.indexOf(' ') + 1);
    nodePrune = Boolean.parseBoolean(value);
    line = in.readLine(); // get rid of last line
    if (line.length() != 0) {
      throw new RuntimeException("Expected blank line, found: " + line);
    }
  }

  private static final long serialVersionUID = 4L;

} // end class Options
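
/*
 * A minimal usage sketch (hypothetical flag values; in typical use these options
 * are set for you by the LexicalizedParser command-line handling rather than
 * constructed directly):
 *
 *   Options op = new Options();                  // defaults to EnglishTreebankParserParams
 *   op.setOptions("-maxLength", "70", "-PCFG");  // same flag syntax as the command line
 *   op.display();                                // echoes the current settings to stderr
 */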