package edu.stanford.nlp.coref.hybrid; import java.io.File; import java.util.Arrays; import java.util.HashSet; import java.util.Locale; import java.util.Properties; import java.util.Set; import edu.stanford.nlp.coref.hybrid.sieve.Sieve.ClassifierType; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.PropertiesUtils; /** * Properties for the hybrid coref system. * * @author Heeyoung Lee * @author Kevin Clark */ public class HybridCorefProperties { // public enum CorefInputType { RAW, CONLL, ACE, MUC } // general public static final String LANG_PROP = "coref.language"; private static final String SIEVES_PROP = "coref.sieves"; private static final String SCORE_PROP = "coref.doScore"; private static final String THREADS_PROP = "coref.threadCount"; private static final String POSTPROCESSING_PROP = "coref.postprocessing"; private static final String SEED_PROP = "coref.seed"; private static final String CONLL_AUTO_PROP = "coref.conll.auto"; private static final String USE_SEMANTICS_PROP = "coref.useSemantics"; // load semantics if true public static final String CURRENT_SIEVE_FOR_TRAIN_PROP = "coref.currentSieveForTrain"; private static final String STORE_TRAINDATA_PROP = "coref.storeTrainData"; private static final String ADD_MISSING_ANNOTATIONS = "coref.addMissingAnnotations"; // logging & system check & analysis private static final String DEBUG_PROP = "coref.debug"; public static final String LOG_PROP = "coref.logFile"; private static final String TIMER_PROP = "coref.checkTime"; private static final String MEMORY_PROP = "coref.checkMemory"; private static final String PRINT_MDLOG_PROP = "coref.print.md.log"; private static final String CALCULATE_IMPORTANCE_PROP = "coref.calculateFeatureImportance"; private static final String DO_ANALYSIS_PROP = "coref.analysis.doAnalysis"; private static final String ANALYSIS_SKIP_MTYPE_PROP = "coref.analysis.skip.mType"; private static final String ANALYSIS_SKIP_ATYPE_PROP = "coref.analysis.skip.aType"; // data & io public static final String STATES_PROP = "coref.states"; public static final String DEMONYM_PROP = "coref.demonym"; public static final String ANIMATE_PROP = "coref.animate"; public static final String INANIMATE_PROP = "coref.inanimate"; public static final String MALE_PROP = "coref.male"; public static final String NEUTRAL_PROP = "coref.neutral"; public static final String FEMALE_PROP = "coref.female"; public static final String PLURAL_PROP = "coref.plural"; public static final String SINGULAR_PROP = "coref.singular"; public static final String GENDER_NUMBER_PROP = "coref.big.gender.number"; public static final String COUNTRIES_PROP = "coref.countries"; public static final String STATES_PROVINCES_PROP = "coref.states.provinces"; public static final String DICT_LIST_PROP = "coref.dictlist"; public static final String DICT_PMI_PROP = "coref.dictpmi"; public static final String SIGNATURES_PROP = "coref.signatures"; public static final String LOAD_WORD_EMBEDDING_PROP = "coref.loadWordEmbedding"; private static final String WORD2VEC_PROP = "coref.path.word2vec"; private static final String WORD2VEC_SERIALIZED_PROP = "coref.path.word2vecSerialized"; private static final String PATH_SERIALIZED_PROP = "coref.path.serialized"; // models private static final String PATH_MODEL_PROP = "coref.SIEVENAME.model"; // sieve option private static final String CLASSIFIER_TYPE_PROP = "coref.SIEVENAME.classifierType"; private static final String NUM_TREE_PROP = "coref.SIEVENAME.numTrees"; private static final String NUM_FEATURES_PROP = "coref.SIEVENAME.numFeatures"; private static final String TREE_DEPTH_PROP = "coref.SIEVENAME.treeDepth"; private static final String MAX_SENT_DIST_PROP = "coref.SIEVENAME.maxSentDist"; private static final String MTYPE_PROP = "coref.SIEVENAME.mType"; private static final String ATYPE_PROP = "coref.SIEVENAME.aType"; private static final String DOWNSAMPLE_RATE_PROP = "coref.SIEVENAME.downsamplingRate"; private static final String THRES_FEATURECOUNT_PROP = "coref.SIEVENAME.thresFeatureCount"; private static final String FEATURE_SELECTION_PROP = "coref.SIEVENAME.featureSelection"; private static final String THRES_MERGE_PROP = "coref.SIEVENAME.merge.thres"; private static final String THRES_FEATURE_SELECTION_PROP = "coref.SIEVENAME.pmi.thres"; private static final String DEFAULT_PRONOUN_AGREEMENT_PROP = "coref.defaultPronounAgreement"; // features private static final String USE_BASIC_FEATURES_PROP = "coref.SIEVENAME.useBasicFeatures"; private static final String COMBINE_OBJECTROLE_PROP = "coref.SIEVENAME.combineObjectRole"; private static final String USE_MD_FEATURES_PROP = "coref.SIEVENAME.useMentionDetectionFeatures"; private static final String USE_DCOREFRULE_FEATURES_PROP = "coref.SIEVENAME.useDcorefRuleFeatures"; private static final String USE_POS_FEATURES_PROP = "coref.SIEVENAME.usePOSFeatures"; private static final String USE_LEXICAL_FEATURES_PROP = "coref.SIEVENAME.useLexicalFeatures"; private static final String USE_WORD_EMBEDDING_FEATURES_PROP = "coref.SIEVENAME.useWordEmbeddingFeatures"; public static final Locale LANGUAGE_DEFAULT = Locale.ENGLISH; /** if true, remove appositives, predicate nominatives in post processing */ public static final boolean REMOVE_APPOSITION_PREDICATENOMINATIVES = true; /** if true, remove singletons in post processing */ public static final boolean REMOVE_SINGLETONS = true; // current list of dcoref sieves private static final Set<String> dcorefSieveNames = new HashSet<>(Arrays.asList("MarkRole", "DiscourseMatch", "ExactStringMatch", "RelaxedExactStringMatch", "PreciseConstructs", "StrictHeadMatch1", "StrictHeadMatch2", "StrictHeadMatch3", "StrictHeadMatch4", "RelaxedHeadMatch", "PronounMatch", "SpeakerMatch", "ChineseHeadMatch")); private HybridCorefProperties() {} // static methods/ constants public static boolean doScore(Properties props) { return PropertiesUtils.getBool(props, SCORE_PROP, false); } public static boolean checkTime(Properties props) { return PropertiesUtils.getBool(props, TIMER_PROP, false); } public static boolean checkMemory(Properties props) { return PropertiesUtils.getBool(props, MEMORY_PROP, false); } public static int getThreadCounts(Properties props) { return PropertiesUtils.getInt(props, THREADS_PROP, Runtime.getRuntime().availableProcessors()); } public static Locale getLanguage(Properties props) { String lang = PropertiesUtils.getString(props, LANG_PROP, "en"); if(lang.equalsIgnoreCase("en") || lang.equalsIgnoreCase("english")) return Locale.ENGLISH; else if(lang.equalsIgnoreCase("zh") || lang.equalsIgnoreCase("chinese")) return Locale.CHINESE; else throw new RuntimeException("unsupported language"); } public static boolean printMDLog(Properties props) { return PropertiesUtils.getBool(props, PRINT_MDLOG_PROP, false); } public static boolean doPostProcessing(Properties props) { return PropertiesUtils.getBool(props, POSTPROCESSING_PROP, false); } /** if true, use conll auto files, else use conll gold files */ public static boolean useCoNLLAuto(Properties props) { return PropertiesUtils.getBool(props, CONLL_AUTO_PROP, true); } public static String getPathModel(Properties props, String sievename) { return props.getProperty(PATH_SERIALIZED_PROP) + File.separator + props.getProperty(PATH_MODEL_PROP.replace("SIEVENAME", sievename), "MISSING_MODEL_FOR_"+sievename); } public static boolean debug(Properties props) { return PropertiesUtils.getBool(props, DEBUG_PROP, false); } public static ClassifierType getClassifierType(Properties props, String sievename) { if(dcorefSieveNames.contains(sievename)) return ClassifierType.RULE; if(sievename.toLowerCase().endsWith("-rf")) return ClassifierType.RF; if(sievename.toLowerCase().endsWith("-oracle")) return ClassifierType.ORACLE; String classifierType = PropertiesUtils.getString(props, CLASSIFIER_TYPE_PROP.replace("SIEVENAME", sievename), null); return ClassifierType.valueOf(classifierType); } public static double getMergeThreshold(Properties props, String sievename) { String key = THRES_MERGE_PROP.replace("SIEVENAME", sievename); return PropertiesUtils.getDouble(props, key, 0.3); } public static void setMergeThreshold(Properties props, String sievename, double value) { String key = THRES_MERGE_PROP.replace("SIEVENAME", sievename); props.setProperty(key, String.valueOf(value)); } public static int getNumTrees(Properties props, String sievename) { return PropertiesUtils.getInt(props, NUM_TREE_PROP.replace("SIEVENAME", sievename), 100); } public static int getSeed(Properties props) { return PropertiesUtils.getInt(props, SEED_PROP, 1); } public static int getNumFeatures(Properties props, String sievename) { return PropertiesUtils.getInt(props, NUM_FEATURES_PROP.replace("SIEVENAME", sievename), 30); } public static int getTreeDepth(Properties props, String sievename) { return PropertiesUtils.getInt(props, TREE_DEPTH_PROP.replace("SIEVENAME", sievename), 0); } public static boolean calculateFeatureImportance(Properties props) { return PropertiesUtils.getBool(props, CALCULATE_IMPORTANCE_PROP, false); } public static int getMaxSentDistForSieve(Properties props, String sievename) { return PropertiesUtils.getInt(props, MAX_SENT_DIST_PROP.replace("SIEVENAME", sievename), 1000); } public static Set<MentionType> getMentionType(Properties props, String sievename) { return getMentionTypes(props, MTYPE_PROP.replace("SIEVENAME", sievename)); } public static Set<MentionType> getAntecedentType(Properties props, String sievename) { return getMentionTypes(props, ATYPE_PROP.replace("SIEVENAME", sievename)); } private static Set<MentionType> getMentionTypes(Properties props, String propKey) { if(!props.containsKey(propKey) || props.getProperty(propKey).equalsIgnoreCase("all")){ return new HashSet<>(Arrays.asList(MentionType.values())); } Set<MentionType> types = new HashSet<>(); for(String type : props.getProperty(propKey).trim().split(",\\s*")) { if(type.toLowerCase().matches("i|you|we|they|it|she|he")) type = "PRONOMINAL"; types.add(MentionType.valueOf(type)); } return types; } public static double getDownsamplingRate(Properties props, String sievename) { return PropertiesUtils.getDouble(props, DOWNSAMPLE_RATE_PROP.replace("SIEVENAME", sievename), 1); } public static int getFeatureCountThreshold(Properties props, String sievename) { return PropertiesUtils.getInt(props, THRES_FEATURECOUNT_PROP.replace("SIEVENAME", sievename), 20); } public static boolean useBasicFeatures(Properties props, String sievename) { return PropertiesUtils.getBool(props, USE_BASIC_FEATURES_PROP.replace("SIEVENAME", sievename), true); } public static boolean combineObjectRoles(Properties props, String sievename) { return PropertiesUtils.getBool(props, COMBINE_OBJECTROLE_PROP.replace("SIEVENAME", sievename), true); } public static boolean useMentionDetectionFeatures(Properties props, String sievename) { return PropertiesUtils.getBool(props, USE_MD_FEATURES_PROP.replace("SIEVENAME", sievename), true); } public static boolean useDcorefRules(Properties props, String sievename) { return PropertiesUtils.getBool(props, USE_DCOREFRULE_FEATURES_PROP.replace("SIEVENAME", sievename), true); } public static boolean usePOSFeatures(Properties props, String sievename) { return PropertiesUtils.getBool(props, USE_POS_FEATURES_PROP.replace("SIEVENAME", sievename), true); } public static boolean useLexicalFeatures(Properties props, String sievename) { return PropertiesUtils.getBool(props, USE_LEXICAL_FEATURES_PROP.replace("SIEVENAME", sievename), true); } public static boolean useWordEmbedding(Properties props, String sievename) { return PropertiesUtils.getBool(props, USE_WORD_EMBEDDING_FEATURES_PROP.replace("SIEVENAME", sievename), true); } private static Set<String> getMentionTypeStr(Properties props, String sievename, String whichMention) { Set<String> strs = Generics.newHashSet(); String propKey = whichMention; if (!props.containsKey(propKey)) { String prefix = "coref." + sievename + "."; propKey = prefix + propKey; } if(props.containsKey(propKey)) strs.addAll(Arrays.asList(props.getProperty(propKey).split(","))); return strs; } public static Set<String> getMentionTypeStr(Properties props, String sievename) { return getMentionTypeStr(props, sievename, "mType"); } public static Set<String> getAntecedentTypeStr(Properties props, String sievename) { return getMentionTypeStr(props, sievename, "aType"); } public static String getSieves(Properties props) { return PropertiesUtils.getString(props, SIEVES_PROP, "SpeakerMatch,PreciseConstructs,pp-rf,cc-rf,pc-rf,ll-rf,pr-rf"); } public static String getPathSerialized(Properties props) { return props.getProperty(PATH_SERIALIZED_PROP); } public static boolean doPMIFeatureSelection(Properties props, String sievename) { return PropertiesUtils.getString(props, FEATURE_SELECTION_PROP.replace("SIEVENAME", sievename), "pmi").equalsIgnoreCase("pmi"); } public static double getPMIThres(Properties props, String sievename) { return PropertiesUtils.getDouble(props, THRES_FEATURE_SELECTION_PROP.replace("SIEVENAME", sievename), 0.0001); } public static boolean doAnalysis(Properties props) { return PropertiesUtils.getBool(props, DO_ANALYSIS_PROP, false); } public static String getSkipMentionType(Properties props) { return PropertiesUtils.getString(props, ANALYSIS_SKIP_MTYPE_PROP, null); } public static String getSkipAntecedentType(Properties props) { return PropertiesUtils.getString(props, ANALYSIS_SKIP_ATYPE_PROP, null); } public static boolean useSemantics(Properties props) { return PropertiesUtils.getBool(props, USE_SEMANTICS_PROP, false); } public static String getPathSerializedWordVectors(Properties props) { return PropertiesUtils.getString(props, WORD2VEC_SERIALIZED_PROP, "/scr/nlp/data/coref/wordvectors/en/vector.ser.gz"); } public static String getCurrentSieveForTrain(Properties props) { return PropertiesUtils.getString(props, CURRENT_SIEVE_FOR_TRAIN_PROP, null); } // public static String getCurrentSieve(Properties props) { // return PropertiesUtils.getString(props, CURRENT_SIEVE_PROP, null); // } public static boolean loadWordEmbedding(Properties props) { return PropertiesUtils.getBool(props, LOAD_WORD_EMBEDDING_PROP, true); } public static String getPathWord2Vec(Properties props) { return PropertiesUtils.getString(props, WORD2VEC_PROP, null); } public static String getGenderNumber(Properties props) { return PropertiesUtils.getString(props, GENDER_NUMBER_PROP, "edu/stanford/nlp/models/dcoref/gender.data.gz"); } public static boolean storeTrainData(Properties props) { return PropertiesUtils.getBool(props, STORE_TRAINDATA_PROP, false); } public static boolean useDefaultPronounAgreement(Properties props){ return PropertiesUtils.getBool(props, HybridCorefProperties.DEFAULT_PRONOUN_AGREEMENT_PROP,false); } public static boolean addMissingAnnotations(Properties props) { return PropertiesUtils.getBool(props, ADD_MISSING_ANNOTATIONS, false); } }