package edu.stanford.nlp.patterns; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.lang.reflect.Field; import java.util.*; import java.util.Map.Entry; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.tokensregex.Env; import edu.stanford.nlp.ling.tokensregex.NodePattern; import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; import edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.PatternScoring; import edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.WordScoring; import edu.stanford.nlp.patterns.dep.DepPatternFactory; import edu.stanford.nlp.patterns.surface.SurfacePatternFactory; import edu.stanford.nlp.process.WordShapeClassifier; import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.stats.Counters; import edu.stanford.nlp.util.*; import edu.stanford.nlp.util.ArgumentParser.Option; import edu.stanford.nlp.util.TypesafeMap.Key; import edu.stanford.nlp.util.logging.Redwood; import javax.json.Json; import javax.json.JsonArrayBuilder; import javax.json.JsonObjectBuilder; public class ConstantsAndVariables implements Serializable { private static final long serialVersionUID = 1L; /** * Maximum number of iterations to run */ @Option(name = "numIterationsForPatterns") public Integer numIterationsForPatterns = 10; /** * Maximum number of patterns learned in each iteration */ @Option(name = "numPatterns") public int numPatterns = 10; /** * The output directory where the justifications of learning patterns and * phrases would be saved. These are needed for visualization */ @Option(name = "outDir") public String outDir = null; /** * Cached file of all patterns for all tokens */ @Option(name = "allPatternsDir") public String allPatternsDir = null; /** * If all patterns should be computed. 
Otherwise patterns are read from * allPatternsFile */ @Option(name = "computeAllPatterns") public boolean computeAllPatterns = true; // @Option(name = "removeRedundantPatterns") // public boolean removeRedundantPatterns = true; /** * Pattern Scoring mechanism. See {@link PatternScoring} for options. */ @Option(name = "patternScoring") public PatternScoring patternScoring = PatternScoring.PosNegUnlabOdds; /** * Threshold for learning a pattern */ @Option(name = "thresholdSelectPattern") public double thresholdSelectPattern = 1.0; // /** // * Do not learn patterns that do not extract any unlabeled tokens (kind of // * useless) // */ // @Option(name = "discardPatternsWithNoUnlabSupport") // public boolean discardPatternsWithNoUnlabSupport = true; /** * Currently, does not work correctly. TODO: make this work. Ideally this * would label words only when they occur in the context of any learned * pattern. This comment seems old. Test it! */ @Option(name = "restrictToMatched") public boolean restrictToMatched = false; /** * Label words that are learned so that in further iterations we have more * information */ @Option(name = "usePatternResultAsLabel") public boolean usePatternResultAsLabel = true; /** * Debug flag for learning patterns. 0 means no output, 1 means necessary output, 2 means necessary output+some justification, 3 means extreme debug output */ @Option(name = "debug") public int debug = 1; /** * Do not learn patterns in which the neighboring words have the same label. * Deprecated! */ //@Option(name = "ignorePatWithLabeledNeigh") //public boolean ignorePatWithLabeledNeigh = false; /** * Save this run as ... 
*/ @Option(name = "identifier") public String identifier = "getpatterns"; /** * Use the actual dictionary matching phrase(s) instead of the token word or * lemma in calculating the stats */ @Option(name = "useMatchingPhrase") public boolean useMatchingPhrase = true; /** * Reduce pattern threshold (=0.8*current_value) to extract as many patterns * as possible (still restricted by <code>numPatterns</code>) */ @Option(name = "tuneThresholdKeepRunning") public boolean tuneThresholdKeepRunning = false; /** * Maximum number of words to learn */ @Option(name = "maxExtractNumWords") public int maxExtractNumWords = Integer.MAX_VALUE; /** * use the seed dictionaries and the new words learned for the other labels in * the previous iterations as negative */ @Option(name = "useOtherLabelsWordsasNegative") public boolean useOtherLabelsWordsasNegative = true; /** * If not null, write the output like * "w1 w2 <label1> w3 <label2>w4</label2> </label1> w5 ... " if w3 w4 have * label1 and w4 has label 2 */ @Option(name = "markedOutputTextFile") String markedOutputTextFile = null; /** * If you want output of form "word\tlabels-separated-by-comma" in newlines */ @Option(name="columnOutputFile") String columnOutputFile = null; /** * Lowercase the context words/lemmas */ @Option(name = "matchLowerCaseContext") public static boolean matchLowerCaseContext = true; /** * Initials of all POS tags to use if * <code>usePOS4Pattern</code> is true, separated by comma. */ @Option(name = "targetAllowedTagsInitialsStr") public String targetAllowedTagsInitialsStr = null; public Map<String, Set<String>> allowedTagsInitials = null; /** * Allowed NERs for labels. Format is label1,NER1,NER11;label2,NER2,NER21,NER22;label3,... 
* <code>useTargetNERRestriction</code> flag should be true */ @Option(name = "targetAllowedNERs") public String targetAllowedNERs = null; public Map<String, Set<String>> allowedNERsforLabels = null; /** * Number of words to learn in each iteration */ @Option(name = "numWordsToAdd") public int numWordsToAdd = 10; @Option(name = "thresholdNumPatternsApplied") public double thresholdNumPatternsApplied = 2; @Option(name = "wordScoring") public WordScoring wordScoring = WordScoring.WEIGHTEDNORM; @Option(name = "thresholdWordExtract") public double thresholdWordExtract = 0.2; public boolean justify = false; /** * Sigma for L2 regularization in Logisitic regression, if a classifier is * used to score phrases */ @Option(name = "LRSigma") public double LRSigma = 1.0; /** * English words that are not labeled when labeling using seed dictionaries */ @Option(name = "englishWordsFiles") public String englishWordsFiles = null; private Set<String> englishWords = new HashSet<>(); /** * Words to be ignored when learning phrases if * <code>removePhrasesWithStopWords</code> or * <code>removeStopWordsFromSelectedPhrases</code> is true. Also, these words * are considered negative when scoring a pattern (similar to * othersemanticclasses). */ @Option(name = "commonWordsPatternFiles") public String commonWordsPatternFiles = null; private Set<String> commonEngWords = null; /** * List of dictionary phrases that are negative for all labels to be learned. * Format is file_1,file_2,... 
where file_i has each phrase in a different * line * */ @Option(name = "otherSemanticClassesFiles") public String otherSemanticClassesFiles = null; // set of words that are considered negative for all classes private Set<CandidatePhrase> otherSemanticClassesWords = null; /** * Seed dictionary, set in the class that uses this class */ private Map<String, Set<CandidatePhrase>> seedLabelDictionary = new HashMap<>(); /** * Just the set of labels */ private Set<String> labels = new HashSet<>(); private Map<String, Class<? extends TypesafeMap.Key<String>>> answerClass = null; /** * Can be used only when using the API - using the appropriate constructor. * Tokens with specified classes set (has to be boolean return value, even * though this variable says object) will be ignored. */ @SuppressWarnings("rawtypes") private Map<String, Map<Class, Object>> ignoreWordswithClassesDuringSelection = null; /** * These classes will be generalized. It can only be used via the API using * the appropriate constructor. All label classes are by default generalized. */ @SuppressWarnings("rawtypes") private static Map<String, Class> generalizeClasses = new HashMap<>(); /** * Minimum length of words that can be matched fuzzily */ @Option(name = "minLen4FuzzyForPattern") public int minLen4FuzzyForPattern = 6; /** * Do not learn phrases that match this regex. */ @Option(name = "wordIgnoreRegex") public String wordIgnoreRegex = "[^a-zA-Z]*"; /** * Number of threads */ @Option(name = "numThreads") public int numThreads = 1; /** * Words that are not learned. Patterns are not created around these words. * And, if useStopWordsBeforeTerm in {@link edu.stanford.nlp.patterns.surface.CreatePatterns} is true. 
*/ @Option(name = "stopWordsPatternFiles", gloss = "stop words") public String stopWordsPatternFiles = null; private static Set<CandidatePhrase> stopWords = null; /** * Environment for {@link TokenSequencePattern} */ public Map<String, Env> env = new HashMap<>(); public static Env globalEnv = TokenSequencePattern.getNewEnv(); /** * */ @Option(name = "removeStopWordsFromSelectedPhrases") public boolean removeStopWordsFromSelectedPhrases = false; /** * */ @Option(name = "removePhrasesWithStopWords") public boolean removePhrasesWithStopWords = false; private boolean alreadySetUp = false; /** * Cluster file, in which each line is word/phrase<tab>clusterid */ @Option(name = "wordClassClusterFile") String wordClassClusterFile = null; private Map<String, Integer> wordClassClusters = new HashMap<>(); /** * General cluster file, if you wanna use it somehow, in which each line is * word/phrase<tab>clusterid */ @Option(name = "generalWordClassClusterFile") String generalWordClassClusterFile = null; private Map<String, Integer> generalWordClassClusters = null; // @Option(name = "includeExternalFeatures") // public boolean includeExternalFeatures = false; @Option(name = "externalFeatureWeightsFile") public String externalFeatureWeightsDir = null; @Option(name = "doNotApplyPatterns") public boolean doNotApplyPatterns = false; /** * If score for a pattern is square rooted */ @Option(name = "sqrtPatScore") public boolean sqrtPatScore = false; /** * Remove patterns that have number of unlabeled words is less than this. */ @Option(name = "minUnlabPhraseSupportForPat") public int minUnlabPhraseSupportForPat = 0; /** * Remove patterns that have number of positive words less than this. 
*/ @Option(name = "minPosPhraseSupportForPat") public int minPosPhraseSupportForPat = 1; /** * For example, if positive seed dict contains "cancer" and "breast cancer" then "breast" is included as negative */ @Option(name="addIndvWordsFromPhrasesExceptLastAsNeg") public boolean addIndvWordsFromPhrasesExceptLastAsNeg = false; /** * Cached files */ private ConcurrentHashMap<String, Double> editDistanceFromEnglishWords = new ConcurrentHashMap<>(); /** * Cached files */ private ConcurrentHashMap<String, String> editDistanceFromEnglishWordsMatches = new ConcurrentHashMap<>(); /** * Cached files */ private ConcurrentHashMap<String, Double> editDistanceFromOtherSemanticClasses = new ConcurrentHashMap<>(); /** * Cached files */ private ConcurrentHashMap<String, String> editDistanceFromOtherSemanticClassesMatches = new ConcurrentHashMap<>(); /** * Cached files */ private ConcurrentHashMap<String, Double> editDistanceFromThisClass = new ConcurrentHashMap<>(); /** * Cached files */ private ConcurrentHashMap<String, String> editDistanceFromThisClassMatches = new ConcurrentHashMap<>(); private ConcurrentHashMap<String, Counter<String>> wordShapesForLabels = new ConcurrentHashMap<>(); String channelNameLogger = "settingUp"; public Map<String, Counter<Integer>> distSimWeights = new HashMap<>(); public Map<String, Counter<CandidatePhrase>> dictOddsWeights = new HashMap<>(); @Option(name="invertedIndexClass", gloss="another option is Lucene backed, which is not included in the CoreNLP release. Contact us to get a copy (distributed under Apache License).") public Class<? 
extends SentenceIndex> invertedIndexClass = InvertedIndexByTokens.class; /** * Where the inverted index (either in memory or lucene) is stored */ @Option(name="invertedIndexDirectory") public String invertedIndexDirectory; @Option(name="clubNeighboringLabeledWords") public boolean clubNeighboringLabeledWords = false; @Option(name="patternType") public PatternFactory.PatternType patternType = PatternFactory.PatternType.SURFACE; @Option(name="subsampleUnkAsNegUsingSim", gloss="When learning a classifier, remove phrases from unknown phrases that are too close to the positive phrases") public boolean subsampleUnkAsNegUsingSim = false; // @Option(name="subSampleUnkAsNegUsingSimPercentage", gloss="When using subsampleUnkAsNegUsingSim, select bottom %") // public double subSampleUnkAsNegUsingSimPercentage = 0.95; @Option(name="expandPositivesWhenSampling", gloss="when sampling for learning feature wts for learning phrases, expand the positives") public boolean expandPositivesWhenSampling = false; @Option(name="expandNegativesWhenSampling", gloss="when sampling for learning feature wts for learning phrases, expand the negatives") public boolean expandNegativesWhenSampling = false; @Option(name="similarityThresholdHighPrecision", gloss="used for expanding positives") public double similarityThresholdHighPrecision = 0.7; @Option(name="positiveSimilarityThresholdLowPrecision", gloss="used for not choosing close unknowns as positives") public double positiveSimilarityThresholdLowPrecision = 0.5; // @Option(name="subSampleUnkAsPosUsingSimPercentage", gloss="When using expandPositivesWhenSampling, select top % after applying the threshold") // public double subSampleUnkAsPosUsingSimPercentage = 0.05; @Option(name="wordVectorFile", gloss = "if using word vectors for computing similarities") public String wordVectorFile = null; @Option(name="useWordVectorsToComputeSim", gloss="use vectors directly instead of word classes for computing similarity") public boolean 
useWordVectorsToComputeSim; @Option(name="logFileVectorSimilarity", gloss="To store vectors for selected/almost-selected positive and negative words") String logFileVectorSimilarity = null; @Option(name="goldEntitiesEvalFiles", gloss="label1,gold_list_of_entities_file;label2,...") public String goldEntitiesEvalFiles = null; @Option(name="evaluate") public boolean evaluate = false; Map<String, Map<String, Boolean>> goldEntities = new HashMap<>(); @Option(name="featureCountThreshold") public int featureCountThreshold = 1; @Option(name="expandPhrasesNumTopSimilar", gloss="k in kNN") public int expandPhrasesNumTopSimilar = 1; /** * Whether to do a fuzzy matching when matching seeds to text. You can tune minLen4FuzzyForPattern parameter. */ @Option(name="fuzzyMatch") public boolean fuzzyMatch = false; /** * Ignore case when matching seed words. It's a map so something like {name->true,place->false} */ @Option(name="ignoreCaseSeedMatch") public Map<String, String> ignoreCaseSeedMatch = new HashMap<>(); @Option(name="sentsOutFile") public String sentsOutFile = null; @Option(name="savePatternsWordsDir") public boolean savePatternsWordsDir = true; @Option(name="learn") public boolean learn = true; public Set<String> getLabels() { return labels; } // public void addLearnedWords(String trainLabel, Counter<CandidatePhrase> identifiedWords) { // if(!learnedWords.containsKey(trainLabel)) // learnedWords.put(trainLabel, new ClassicCounter<CandidatePhrase>()); // this.learnedWords.get(trainLabel).addAll(identifiedWords); // } public Map<String, String> getAllOptions() { Map<String, String> values = new HashMap<>(); if(props != null) props.forEach( (x,y) -> values.put(x.toString(),y == null?"null":y.toString())); Class<?> thisClass; try { thisClass = Class.forName(this.getClass().getName()); Field[] aClassFields = thisClass.getDeclaredFields(); for(Field f : aClassFields){ if(f.getType().getClass().isPrimitive() || Arrays.binarySearch(GetPatternsFromDataMultiClass.printOptionClass, 
f.getType()) >= 0){ String fName = f.getName(); Object fvalue = f.get(this); values.put(fName, fvalue == null ? "null" : fvalue.toString()); } } } catch (Exception e) { e.printStackTrace(); } return values; } public boolean hasSeedWordOrOtherSem(CandidatePhrase p) { for(Map.Entry<String, Set<CandidatePhrase>> seeds: this.seedLabelDictionary.entrySet()){ if(seeds.getValue().contains(p)) return true; } if(otherSemanticClassesWords.contains(p)) return true; return false; } public TreeMap<Integer, Counter<CandidatePhrase>> getLearnedWordsEachIter(String label) { return learnedWordsEachIter.get(label); } public Map<String, TreeMap<Integer, Counter<CandidatePhrase>>> getLearnedWordsEachIter() { return learnedWordsEachIter; } public void setLearnedWordsEachIter(TreeMap<Integer, Counter<CandidatePhrase>> words, String label) { this.learnedWordsEachIter.put(label, words); } //PatternFactory.PatternType.SURFACE; // public PatternIndex getPatternIndex() { // return patternIndex; // } // // public void setPatternIndex(PatternIndex patternIndex) { // this.patternIndex = patternIndex; // } static public class ScorePhraseMeasures implements Comparable { String name; static int num = 0; int numObj; static Map<String, ScorePhraseMeasures> createdObjects = new ConcurrentHashMap<>(); public static ScorePhraseMeasures create(String n){ if(createdObjects.containsKey(n)) return createdObjects.get(n); else return new ScorePhraseMeasures(n); } private ScorePhraseMeasures(String n){ this.name= n; numObj = num++; createdObjects.put(n, this); } @Override public String toString(){return name;} @Override public boolean equals(Object o){ if(! 
(o instanceof ScorePhraseMeasures)) return false; return ((ScorePhraseMeasures)o).numObj == (this.numObj); } static final ScorePhraseMeasures DISTSIM = new ScorePhraseMeasures("DistSim"); static final ScorePhraseMeasures GOOGLENGRAM = new ScorePhraseMeasures("GoogleNGram"); static final ScorePhraseMeasures PATWTBYFREQ=new ScorePhraseMeasures("PatWtByFreq"); static final ScorePhraseMeasures EDITDISTSAME=new ScorePhraseMeasures("EditDistSame"); static final ScorePhraseMeasures EDITDISTOTHER =new ScorePhraseMeasures("EditDistOther"); static final ScorePhraseMeasures DOMAINNGRAM =new ScorePhraseMeasures("DomainNgram"); static final ScorePhraseMeasures SEMANTICODDS =new ScorePhraseMeasures("SemanticOdds"); static final ScorePhraseMeasures WORDSHAPE = new ScorePhraseMeasures("WordShape"); static final ScorePhraseMeasures WORDVECPOSSIMAVG = new ScorePhraseMeasures("WordVecPosSimAvg"); static final ScorePhraseMeasures WORDVECPOSSIMMAX = new ScorePhraseMeasures("WordVecPosSimMax"); static final ScorePhraseMeasures WORDVECNEGSIMAVG = new ScorePhraseMeasures("WordVecNegSimAvg"); static final ScorePhraseMeasures WORDVECNEGSIMMAX = new ScorePhraseMeasures("WordVecNegSimMax"); static final ScorePhraseMeasures ISFIRSTCAPITAL = new ScorePhraseMeasures("IsFirstLetterCapital"); static final ScorePhraseMeasures WORDSHAPESTR = new ScorePhraseMeasures("WordShapeStr"); static final ScorePhraseMeasures BOW = new ScorePhraseMeasures("Word"); @Override public int compareTo(Object o) { if(!(o instanceof ScorePhraseMeasures)) return -1; else return o.toString().compareTo(this.toString()); } } /** * Keeps only one label for each token, whichever has the longest */ @Option(name="removeOverLappingLabelsFromSeed") public boolean removeOverLappingLabelsFromSeed = false; /** * Only works if you have single label. And the word classes are given. */ @Option(name = "usePhraseEvalWordClass") public boolean usePhraseEvalWordClass = false; /** * Only works if you have single label. 
And the word vectors are given. */ @Option(name = "usePhraseEvalWordVector") public boolean usePhraseEvalWordVector = false; /** * use google tf-idf for learning phrases. Need to also provide googleNgram_dbname, * googleNgram_username and googleNgram_host */ @Option(name = "usePhraseEvalGoogleNgram") public boolean usePhraseEvalGoogleNgram = false; /** * use domain tf-idf for learning phrases */ @Option(name = "usePhraseEvalDomainNgram") public boolean usePhraseEvalDomainNgram = false; /** * use \sum_allpat pattern_wt_that_extracted_phrase/phrase_freq for learning * phrases */ @Option(name = "usePhraseEvalPatWtByFreq") public boolean usePhraseEvalPatWtByFreq = true; /** * odds of the phrase freq in the label dictionary vs other dictionaries */ @Option(name = "usePhraseEvalSemanticOdds") public boolean usePhraseEvalSemanticOdds = false; /** * Edit distance between this phrase and the other phrases in the label * dictionary */ @Option(name = "usePhraseEvalEditDistSame") public boolean usePhraseEvalEditDistSame = false; /** * Edit distance between this phrase and other phrases in other dictionaries */ @Option(name = "usePhraseEvalEditDistOther") public boolean usePhraseEvalEditDistOther = false; @Option(name = "usePhraseEvalWordShape", gloss="% of phrases of that label that have the same word shape") public boolean usePhraseEvalWordShape = false; @Option(name="usePhraseEvalWordShapeStr", gloss="uses the word shape str as a feature") public boolean usePhraseEvalWordShapeStr = false; @Option(name="usePhraseEvalFirstCapital", gloss="words starts with a capital letter") public boolean usePhraseEvalFirstCapital; /** * use bag of words */ @Option(name="usePhraseEvalBOW") public boolean usePhraseEvalBOW = false; /** * Used only if {@link #patternScoring} is <code>PhEvalInPat</code> or * <code>PhEvalInPat</code>. See usePhrase* for meanings. 
*/ @Option(name = "usePatternEvalWordClass") public boolean usePatternEvalWordClass = false; /** * Used only if {@link #patternScoring} is <code>PhEvalInPat</code> or * <code>PhEvalInPat</code>. See usePhrase* for meanings. */ @Option(name = "usePatternEvalWordShape") public boolean usePatternEvalWordShape = false; @Option(name="usePatternEvalWordShapeStr", gloss="uses the word shape str as a feature") public boolean usePatternEvalWordShapeStr = false; @Option(name="usePatternEvalFirstCapital", gloss="words starts with a capital letter") public boolean usePatternEvalFirstCapital; /** * Used only if {@link #patternScoring} is <code>PhEvalInPat</code> or * <code>PhEvalInPat</code>. See usePhrase* for meanings. */ @Option(name = "usePatternEvalGoogleNgram") public boolean usePatternEvalGoogleNgram = false; /** * Used only if {@link #patternScoring} is <code>PhEvalInPat</code> or * <code>PhEvalInPat</code>. See usePhrase* for meanings. Need to also provide googleNgram_dbname, * googleNgram_username and googleNgram_host */ @Option(name = "usePatternEvalDomainNgram") public boolean usePatternEvalDomainNgram = false; /** * Used only if {@link #patternScoring} is <code>PhEvalInPat</code> or * <code>PhEvalInPatLogP</code>. See usePhrase* for meanings. */ @Option(name = "usePatternEvalSemanticOdds") public boolean usePatternEvalSemanticOdds = false; /** * Used only if {@link #patternScoring} is <code>PhEvalInPat</code> or * <code>PhEvalInPatLogP</code>. See usePhrase* for meanings. */ @Option(name = "usePatternEvalEditDistSame") public boolean usePatternEvalEditDistSame = false; /** * Used only if {@link #patternScoring} is <code>PhEvalInPat</code> or * <code>PhEvalInPatLogP</code>. See usePhrase* for meanings. 
*/ @Option(name = "usePatternEvalEditDistOther") public boolean usePatternEvalEditDistOther = false; /** * use bag of words */ @Option(name="usePatternEvalBOW") public boolean usePatternEvalBOW = false; /** * These are used to learn weights for features if using logistic regression. * Percentage of non-labeled tokens selected as negative. */ @Option(name = "perSelectRand") public double perSelectRand = 0.01; /** * These are used to learn weights for features if using logistic regression. * Percentage of negative tokens selected as negative. */ @Option(name = "perSelectNeg") public double perSelectNeg = 1; /** * Especially useful for multi word phrase extraction. Do not extract a phrase * if any word is labeled with any other class. */ @Option(name = "doNotExtractPhraseAnyWordLabeledOtherClass") public boolean doNotExtractPhraseAnyWordLabeledOtherClass = true; /** * You can save the inverted index. Lucene index is saved by default to <code>invertedIndexDirectory</code> if given. */ @Option(name="saveInvertedIndex") public boolean saveInvertedIndex = false; /** * You can load the inverted index using this file. * If false and using lucene index, the existing directory is deleted and new index is made. */ @Option(name="loadInvertedIndex") public boolean loadInvertedIndex = false; @Option(name = "storePatsForEachToken", gloss="used for storing patterns in PSQL/MEMORY/LUCENE") public PatternForEachTokenWay storePatsForEachToken = PatternForEachTokenWay.MEMORY; // // @Option(name = "storePatsIndex", gloss="used for storing patterns index") // public PatternIndexWay storePatsIndex = PatternIndexWay.MEMORY; @Option(name="sampleSentencesForSufficientStats",gloss="% sentences to use for learning pattterns" ) double sampleSentencesForSufficientStats = 1.0; // /** // * Directory where to save the sentences ser files. 
// */ // @Option(name="saveSentencesSerDir") // public File saveSentencesSerDir = null; // // public boolean usingDirForSentsInIndex = false; // @Option(name = "wekaOptions") // public String wekaOptions = ""; public static String backgroundSymbol = "O"; int wordShaper = WordShapeClassifier.WORDSHAPECHRIS2; private ConcurrentHashMap<String, String> wordShapeCache = new ConcurrentHashMap<>(); public SentenceIndex invertedIndex; public static String extremedebug = "extremePatDebug"; public static String minimaldebug = "minimaldebug"; Properties props; public enum PatternForEachTokenWay {MEMORY, LUCENE, DB}; public enum PatternIndexWay {MEMORY, OPENHFT, LUCENE}; public List<String> functionWords = Arrays.asList("a","an","the","of","at","on","in","he","she","him","her","they","them","and","no","not","nor","as","do"); public ConstantsAndVariables(Properties props, Set<String> labels, Map<String, Class<? extends Key<String>>> answerClass, Map<String, Class> generalizeClasses, Map<String, Map<Class, Object>> ignoreClasses) throws IOException { this.labels = labels; for(String label: labels){ this.seedLabelDictionary.put(label, new HashSet<>()); } this.answerClass = answerClass; this.generalizeClasses = generalizeClasses; if(this.generalizeClasses == null) this.generalizeClasses = new HashMap<>(); this.generalizeClasses.putAll(answerClass); this.ignoreWordswithClassesDuringSelection = ignoreClasses; setUp(props); } public ConstantsAndVariables(Properties props, Map<String, Set<CandidatePhrase>> labelDictionary, Map<String, Class<? extends Key<String>>> answerClass, Map<String, Class> generalizeClasses, Map<String, Map<Class, Object>> ignoreClasses) throws IOException { //make the list unmodifiable! 
for(Entry<String, Set<CandidatePhrase>> en2: labelDictionary.entrySet()){ seedLabelDictionary.put(en2.getKey(), Collections.unmodifiableSet(en2.getValue())); } this.labels = labelDictionary.keySet(); this.answerClass = answerClass; this.generalizeClasses = generalizeClasses; if(this.generalizeClasses == null) this.generalizeClasses = new HashMap<>(); this.generalizeClasses.putAll(answerClass); this.ignoreWordswithClassesDuringSelection = ignoreClasses; setUp(props); } public ConstantsAndVariables(Properties props, Set<String> labels, Map<String, Class<? extends TypesafeMap.Key<String>>> answerClass) throws IOException { this.labels = labels; for(String label: labels){ this.seedLabelDictionary.put(label, new HashSet<>()); } this.answerClass = answerClass; this.generalizeClasses = new HashMap<>(); this.generalizeClasses.putAll(answerClass); setUp(props); } public ConstantsAndVariables(Properties props, String label, Class<? extends TypesafeMap.Key<String>> answerClass) throws IOException { this.labels = new HashSet<>(); this.labels.add(label); this.seedLabelDictionary.put(label, new HashSet<>()); this.answerClass = new HashMap<>(); this.answerClass.put(label, answerClass); this.generalizeClasses = new HashMap<>(); this.generalizeClasses.putAll(this.answerClass); setUp(props); } public ConstantsAndVariables(Properties props, Set<String> labels, Map<String, Class<? 
extends TypesafeMap.Key<String>>> answerClass, Map<String, Class> generalizeClasses) throws IOException { this.labels = labels; for(String label: labels){ this.seedLabelDictionary.put(label, new HashSet<>()); } this.answerClass = answerClass; this.generalizeClasses = generalizeClasses; if(this.generalizeClasses == null) this.generalizeClasses = new HashMap<>(); this.generalizeClasses.putAll(answerClass); setUp(props); } @SuppressWarnings("rawtypes") public void setUp(Properties props) throws IOException { if (alreadySetUp) { return; } Redwood.log(Redwood.DBG, "Setting up ConstantsAndVariables"); ArgumentParser.fillOptions(this, props); ArgumentParser.fillOptions(PatternFactory.class, props); ArgumentParser.fillOptions(SurfacePatternFactory.class, props); ArgumentParser.fillOptions(DepPatternFactory.class, props); if (wordIgnoreRegex != null && !wordIgnoreRegex.isEmpty()) { Redwood.log(Redwood.DBG, "Ignore word regex is " + wordIgnoreRegex); PatternFactory.ignoreWordRegex = Pattern.compile(wordIgnoreRegex); } for (String label : labels) { env.put(label, TokenSequencePattern.getNewEnv()); // env.get(label).bind("answer", answerClass.get(label)); for (Entry<String, Class<? 
extends Key<String>>> en : this.answerClass .entrySet()) { env.get(label).bind(en.getKey(), en.getValue()); } for (Entry<String, Class> en : generalizeClasses.entrySet()) env.get(label).bind(en.getKey(), en.getValue()); } Redwood.log(Redwood.DBG, channelNameLogger, "Running with debug output"); stopWords = new HashSet<>(); if(stopWordsPatternFiles != null) { Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger, "Reading stop words from " + stopWordsPatternFiles); for (String stopwfile : stopWordsPatternFiles.split("[;,]")) { for(String word: IOUtils.readLines(stopwfile)){ if(!word.trim().isEmpty()) stopWords.add(CandidatePhrase.createOrGet(word.trim())); } } } englishWords = new HashSet<>(); if(englishWordsFiles != null) { System.out.println("Reading english words from " + englishWordsFiles); for (String englishWordsFile : englishWordsFiles.split("[;,]")) englishWords.addAll(IOUtils.linesFromFile(englishWordsFile)); } if (commonWordsPatternFiles != null) { commonEngWords = Collections.synchronizedSet(new HashSet<>()); for (String file : commonWordsPatternFiles.split("[;,]")) commonEngWords.addAll(IOUtils.linesFromFile(file)); } if (otherSemanticClassesFiles != null) { if (otherSemanticClassesWords == null) otherSemanticClassesWords = Collections .synchronizedSet(new HashSet<>()); for (String file : otherSemanticClassesFiles.split("[;,]")) { for (File f : listFileIncludingItself(file)) { for (String w : IOUtils.readLines(f)) { String[] t = w.split("\\s+"); if (t.length <= PatternFactory.numWordsCompoundMax) otherSemanticClassesWords.add(CandidatePhrase.createOrGet(w)); } } } System.out.println("Size of othersemantic class variables is " + otherSemanticClassesWords.size()); } else { otherSemanticClassesWords = Collections.synchronizedSet(new HashSet<>()); System.out.println("Size of othersemantic class variables is " + 0); } String stopStr = "/"; int i = 0; for (CandidatePhrase s : stopWords) { if (i > 0) stopStr += "|"; stopStr += 
Pattern.quote(s.getPhrase().replaceAll("\\\\", "\\\\\\\\")); i++; } stopStr += "/"; for (String label : labels) { env.get(label).bind("$FILLER", "/" + StringUtils.join(PatternFactory.fillerWords, "|") + "/"); env.get(label).bind("$STOPWORD", stopStr); env.get(label).bind("$MOD", "[{tag:/JJ.*/}]"); if (matchLowerCaseContext){ env.get(label).setDefaultStringMatchFlags(NodePattern.CASE_INSENSITIVE); env.get(label).setDefaultStringPatternFlags(Pattern.CASE_INSENSITIVE); } env.get(label).bind("OTHERSEM", PatternsAnnotations.OtherSemanticLabel.class); env.get(label).bind("grandparentparsetag", CoreAnnotations.GrandparentAnnotation.class); } if (wordClassClusterFile != null) { wordClassClusters = new HashMap<>(); for (String line : IOUtils.readLines(wordClassClusterFile)) { String[] t = line.split("\t"); wordClassClusters.put(t[0], Integer.parseInt(t[1])); } } if (generalWordClassClusterFile != null) { setGeneralWordClassClusters(new HashMap<>()); for (String line : IOUtils.readLines(generalWordClassClusterFile)) { String[] t = line.split("\t"); getGeneralWordClassClusters().put(t[0], Integer.parseInt(t[1])); } } if(targetAllowedTagsInitialsStr!= null){ allowedTagsInitials = new HashMap<>(); for(String labelstr : targetAllowedTagsInitialsStr.split(";")){ String[] t = labelstr.split(","); Set<String> st = new HashSet<>(); for(int j = 1; j < t.length; j++) st.add(t[j]); allowedTagsInitials.put(t[0], st); } } if(PatternFactory.useTargetNERRestriction && targetAllowedNERs !=null){ allowedNERsforLabels = new HashMap<>(); for(String labelstr : targetAllowedNERs.split(";")){ String[] t = labelstr.split(","); Set<String> st = new HashSet<>(); for(int j = 1; j < t.length; j++) st.add(t[j]); allowedNERsforLabels.put(t[0], st); } } for(String label: labels){ learnedWordsEachIter.put(label, new TreeMap<>()); } if(usePhraseEvalGoogleNgram || usePatternEvalDomainNgram) { Data.usingGoogleNgram = true; ArgumentParser.fillOptions(GoogleNGramsSQLBacked.class, props); } 
if(goldEntitiesEvalFiles !=null && evaluate) goldEntities = readGoldEntities(goldEntitiesEvalFiles); alreadySetUp = true; }

  /**
   * Resolves a path to the files it denotes.
   *
   * @param file path of a regular file or of a directory
   * @return a one-element list holding the file itself when the path is not a
   *         directory; otherwise the result of
   *         {@code IOUtils.iterFilesRecursive} for that directory
   */
  public static Iterable<File> listFileIncludingItself(String file) {
    File f = new File(file);
    if (f.isDirectory()) {
      return IOUtils.iterFilesRecursive(f);
    }
    return Collections.singletonList(f);
  }

  /**
   * Reads gold entities used for evaluation.
   *
   * <p>The format of {@code goldEntitiesEvalFiles} is assumed to be the same as
   * the seed words files: {@code label,file;label2,file2;...}. Each file lists
   * one entity per line; incorrect entities are marked with {@code #} at the
   * end of the entity. Learned entities not present in the gold file are
   * considered negative by the caller.
   *
   * <p>Blank lines and blank {@code ;}-separated entries are skipped. The text
   * left after removing a trailing {@code #} is trimmed, so {@code "foo #"} is
   * stored under the key {@code "foo"}.
   *
   * @param goldEntitiesEvalFiles the specification string; may be {@code null}
   * @return map from label to (entity to {@code true} if correct,
   *         {@code false} if marked incorrect); empty when the argument is
   *         {@code null}
   * @throws IllegalArgumentException if an entry has no {@code ,file} part
   */
  static Map<String, Map<String, Boolean>> readGoldEntities(String goldEntitiesEvalFiles) {
    Map<String, Map<String, Boolean>> goldWords = new HashMap<>();
    if (goldEntitiesEvalFiles == null) {
      return goldWords;
    }
    for (String spec : goldEntitiesEvalFiles.split(";")) {
      if (spec.trim().isEmpty()) {
        continue; // tolerate "a,f1;;b,f2"
      }
      String[] parts = spec.split(",");
      if (parts.length < 2) {
        throw new IllegalArgumentException(
            "Expected 'label,file' in goldEntitiesEvalFiles but found '" + spec + "'");
      }
      String label = parts[0];
      String goldFile = parts[1];
      Map<String, Boolean> goldWords4Label = new HashMap<>();
      for (String rawLine : IOUtils.readLines(goldFile)) {
        String line = rawLine.trim();
        if (line.isEmpty()) {
          continue;
        }
        if (line.endsWith("#")) {
          // Drop the marker and any whitespace that separated it from the entity.
          String entity = line.substring(0, line.length() - 1).trim();
          if (!entity.isEmpty()) {
            goldWords4Label.put(entity, false);
          }
        } else {
          goldWords4Label.put(line, true);
        }
      }
      goldWords.put(label, goldWords4Label);
    }
    return goldWords;
  }

  /**
   * Streams sentences together with the file they were read from.
   *
   * <p>In batch mode the iterator walks {@code Data.sentsFiles} and
   * deserializes one file per call to {@link #next()}. Otherwise it yields
   * exactly one element: {@code Data.sents} paired with a {@code File} built
   * from {@code Data.inMemorySaveFileLocation}.
   */
  static public class DataSentsIterator implements Iterator<Pair<Map<String, DataInstance>, File>> {

    /** Set once the single in-memory element has been handed out. */
    boolean readInMemory = false;
    /** Iterator over the sentence files; only set in batch mode. */
    Iterator<File> sentfilesIter = null;
    boolean batchProcessSents;

    public DataSentsIterator(boolean batchProcessSents) {
      this.batchProcessSents = batchProcessSents;
      if (batchProcessSents) {
        sentfilesIter = Data.sentsFiles.iterator();
      }
    }

    @Override
    public boolean hasNext() {
      return batchProcessSents ? sentfilesIter.hasNext() : !readInMemory;
    }

    /**
     * @throws NoSuchElementException if the iteration is exhausted
     * @throws RuntimeException wrapping the {@code IOException} or
     *         {@code ClassNotFoundException} raised while reading a file
     */
    @Override
    public Pair<Map<String, DataInstance>, File> next() {
      if (!hasNext()) {
        // Honour the Iterator contract instead of re-serving the in-memory data.
        throw new NoSuchElementException();
      }
      if (!batchProcessSents) {
        readInMemory = true;
        return new Pair<>(Data.sents, new File(Data.inMemorySaveFileLocation));
      }
      try {
        File f = sentfilesIter.next();
        return new Pair<>(IOUtils.readObjectFromFile(f), f);
      } catch (IOException | ClassNotFoundException e) {
        throw new RuntimeException(e);
      }
    }
  }

  /** @return the word-shape counters, keyed by label */
  public Map<String, Counter<String>> getWordShapesForLabels() {
    return wordShapesForLabels;
  }

  public static Map<String, Class> getGeneralizeClasses() {
    return generalizeClasses;
  }

  public static Set<CandidatePhrase> getStopWords() {
    return stopWords;
  }

  /**
   * Increments, for {@code label}, the count of the word shape of every phrase
   * in {@code words}. Shapes are looked up in (and added to) the word shape
   * cache so each distinct phrase is classified once.
   *
   * @param label label whose shape counter is updated; a counter is created
   *              if the label has none yet
   * @param words phrases whose shapes are counted
   */
  public void addWordShapes(String label, Set<CandidatePhrase> words) {
    Counter<String> shapeCounts =
        this.wordShapesForLabels.computeIfAbsent(label, k -> new ClassicCounter<>());
    for (CandidatePhrase candidate : words) {
      String phrase = candidate.getPhrase();
      String shape = wordShapeCache.get(phrase);
      if (shape == null) {
        shape = WordShapeClassifier.wordShape(phrase, wordShaper);
        wordShapeCache.put(phrase, shape);
      }
      shapeCounts.incrementCount(shape);
    }
  }

  public Map<String, Set<CandidatePhrase>> getSeedLabelDictionary() {
    return this.seedLabelDictionary;
  }

  /** Learned phrases per label, keyed by iteration number. */
  Map<String, TreeMap<Integer, Counter<CandidatePhrase>>> learnedWordsEachIter = new HashMap<>();

  /**
   * Returns all phrases learned so far for {@code label}, flattened across
   * iterations with {@code Counters.flatten}.
   *
   * @param label the label to look up
   * @return the flattened counter; an empty counter if nothing is recorded for
   *         the label, in which case an empty per-iteration map is registered
   *         for it
   */
  public Counter<CandidatePhrase> getLearnedWords(String label) {
    TreeMap<Integer, Counter<CandidatePhrase>> perIteration = learnedWordsEachIter.get(label);
    if (perIteration == null) {
      // Check before flattening: the flatten call is not given a null map.
      learnedWordsEachIter.put(label, new TreeMap<>());
      return new ClassicCounter<>();
    }
    return Counters.flatten(perIteration);
  }

  /** Builds a JSON array holding the surface string of each phrase. */
  private static JsonArrayBuilder phrasesAsJsonArray(Iterable<CandidatePhrase> phrases) {
    JsonArrayBuilder arr = Json.createArrayBuilder();
    for (CandidatePhrase phrase : phrases) {
      arr.add(phrase.getPhrase());
    }
    return arr;
  }

  /**
   * @return a JSON object mapping every label to the array of all phrases
   *         learned for it
   */
  public String getLearnedWordsAsJson() {
    JsonObjectBuilder obj = Json.createObjectBuilder();
    for (String label : getLabels()) {
      obj.add(label, phrasesAsJsonArray(getLearnedWords(label).keySet()));
    }
    return obj.build().toString();
  }

  /**
   * @return a JSON object mapping every label to the array of phrases learned
   *         in its last recorded iteration; the array is empty when no
   *         iteration has been recorded for the label
   */
  public String getLearnedWordsAsJsonLastIteration() {
    JsonObjectBuilder obj = Json.createObjectBuilder();
    for (String label : getLabels()) {
      NavigableMap<Integer, Counter<CandidatePhrase>> perIteration = getLearnedWordsEachIter(label);
      // lastEntry() is null for an empty map, i.e. before the first iteration.
      Entry<Integer, Counter<CandidatePhrase>> last =
          perIteration == null ? null : perIteration.lastEntry();
      Set<CandidatePhrase> phrases =
          last == null ? Collections.<CandidatePhrase>emptySet() : last.getValue().keySet();
      obj.add(label, phrasesAsJsonArray(phrases));
    }
    return obj.build().toString();
  }

  /**
   * @param words phrases per label
   * @return a JSON object mapping every label to the array of phrases that
   *         {@code words} holds for it; the array is empty when {@code words}
   *         has no entry for the label
   */
  public String getSetWordsAsJson(Map<String, Counter<CandidatePhrase>> words) {
    JsonObjectBuilder obj = Json.createObjectBuilder();
    for (String label : getLabels()) {
      Counter<CandidatePhrase> forLabel = words.get(label);
      Set<CandidatePhrase> phrases =
          forLabel == null ? Collections.<CandidatePhrase>emptySet() : forLabel.keySet();
      obj.add(label, phrasesAsJsonArray(phrases));
    }
    return obj.build().toString();
  }

  public Set<String> getEnglishWords() {
    return this.englishWords;
  }

  public Set<String> getCommonEngWords() {
    return this.commonEngWords;
  }

  public Set<CandidatePhrase> getOtherSemanticClassesWords() {
    return this.otherSemanticClassesWords;
  }

  public void setOtherSemanticClassesWords(Set<CandidatePhrase> other) {
    this.otherSemanticClassesWords = other;
  }

  public Map<String, Integer> getWordClassClusters() {
    return this.wordClassClusters;
  }

  /**
   * Finds the phrase in {@code words} closest to {@code ph}.
   *
   * <p>Returns immediately on an exact match (distance 0) or on the first
   * phrase at distance 1. A distance of -1 from
   * {@code EditDistanceDamerauLevenshteinLike.editDistance(e, ph, 3)} is mapped
   * to {@link #editDistMax} (presumably it signals "further than the limit 3";
   * confirm against that class).
   *
   * @return the closest phrase and its distance; {@code (ph, editDistMax)} if
   *         no phrase scored below {@code editDistMax}
   */
  private Pair<String, Double> getEditDist(Collection<CandidatePhrase> words, String ph) {
    double bestDist = editDistMax;
    String bestPhrase = ph;
    for (CandidatePhrase candidate : words) {
      String other = candidate.getPhrase();
      if (other.equals(ph)) {
        return new Pair<>(ph, 0.0);
      }
      double dist = EditDistanceDamerauLevenshteinLike.editDistance(other, ph, 3);
      if (dist == 1) {
        return new Pair<>(other, dist);
      }
      if (dist == -1) {
        dist = editDistMax;
      }
      if (dist < bestDist) {
        bestDist = dist;
        bestPhrase = other;
      }
    }
    return new Pair<>(bestPhrase, bestDist);
  }

  /** Sentinel distance meaning "no close phrase found". */
  final double editDistMax = 1000;

  /**
   * Use this option if you are limited by memory; ignored if fileFormat is ser.
   */
  @Option(name="batchProcessSents")
  public boolean batchProcessSents = false;

  @Option(name="writeMatchedTokensFiles")
  public boolean writeMatchedTokensFiles = false;

  @Option(name="writeMatchedTokensIdsForEachPhrase")
  public boolean writeMatchedTokensIdsForEachPhrase = false;

  /**
   * Finds the seed or learned phrase of {@code label} closest to {@code ph}.
   * Results are not cached (an earlier per-phrase cache is disabled).
   *
   * @param label  label whose seed and learned phrases are searched
   * @param ph     phrase to compare
   * @param minLen phrases shorter than this are not compared
   * @return closest phrase and distance; {@code (ph, editDistMax)} if
   *         {@code ph} is shorter than {@code minLen} or nothing is close
   */
  public Pair<String, Double> getEditDistanceFromThisClass(String label, String ph, int minLen) {
    if (ph.length() < minLen) {
      return new Pair<>(ph, editDistMax);
    }
    Set<CandidatePhrase> seeds = seedLabelDictionary.get(label);
    // A label without a seed set contributes no seed phrases.
    Set<CandidatePhrase> words = seeds == null ? new HashSet<>() : new HashSet<>(seeds);
    words.addAll(getLearnedWords(label).keySet());
    Pair<String, Double> closest = getEditDist(words, ph);
    assert (!closest.first().isEmpty());
    return new Pair<>(closest.first(), closest.second());
  }

  /**
   * Finds the phrase closest to {@code ph} among the other-semantic-class
   * words and the seed/learned phrases of every label except {@code label}.
   * Results are not cached (an earlier per-phrase cache is disabled).
   *
   * @param label  the label to exclude
   * @param ph     phrase to compare
   * @param minLen phrases shorter than this are not compared
   * @return closest phrase and distance; {@code (ph, editDistMax)} if
   *         {@code ph} is shorter than {@code minLen} or nothing is close
   */
  public Pair<String, Double> getEditDistanceFromOtherClasses(String label, String ph, int minLen) {
    if (ph.length() < minLen) {
      return new Pair<>(ph, editDistMax);
    }
    Pair<String, Double> closest = getEditDist(otherSemanticClassesWords, ph);
    String bestPhrase = closest.first();
    double bestDist = closest.second();
    for (String other : labels) {
      if (other.equals(label)) {
        continue;
      }
      Pair<String, Double> match = getEditDistanceFromThisClass(other, ph, minLen);
      if (match.second() < bestDist) {
        bestDist = match.second();
        bestPhrase = match.first();
      }
    }
    assert (!bestPhrase.isEmpty());
    return new Pair<>(bestPhrase, bestDist);
  }

  public ConcurrentHashMap<String, Double> getEditDistanceFromEnglishWords() {
    return this.editDistanceFromEnglishWords;
  }

  public ConcurrentHashMap<String, String> getEditDistanceFromEnglishWordsMatches() {
    return this.editDistanceFromEnglishWordsMatches;
  }

  /**
   * Converts a closest-match result into a ratio: 1.0 when the distance is
   * {@link #editDistMax}, otherwise the distance divided by the length of the
   * longer of {@code g} and the matched phrase.
   */
  private double editDistanceRatio(Pair<String, Double> match, String g) {
    String matched = match.first();
    double dist = match.second();
    assert (!matched.isEmpty());
    if (dist == editDistMax) {
      return 1.0;
    }
    return dist / (double) Math.max(g.length(), matched.length());
  }

  /**
   * @return edit-distance ratio of {@code g} to the closest phrase outside
   *         {@code label} (minimum phrase length 4); 1.0 if none is close
   */
  public double getEditDistanceScoresOtherClass(String label, String g) {
    return editDistanceRatio(getEditDistanceFromOtherClasses(label, g, 4), g);
  }

  /**
   * 1 if {@code g} lies within edit distance of a phrase outside
   * {@code label} (ratio below 0.2), 0 if it is not close to any.
   *
   * @param label the label to exclude
   * @param g     phrase to score
   * @return 1 or 0
   */
  public double getEditDistanceScoresOtherClassThreshold(String label, String g) {
    return getEditDistanceScoresOtherClass(label, g) < 0.2 ? 1 : 0;
  }

  /**
   * 1 if {@code g} lies within edit distance of a seed or learned phrase of
   * {@code label} (ratio below 0.2), 0 if it is not close to any.
   *
   * @param label the label to search
   * @param g     phrase to score
   * @return 1 or 0
   */
  public double getEditDistanceScoresThisClassThreshold(String label, String g) {
    return getEditDistanceScoresThisClass(label, g) < 0.2 ? 1 : 0;
  }

  /**
   * @return edit-distance ratio of {@code g} to the closest seed or learned
   *         phrase of {@code label} (minimum phrase length 4); 1.0 if none is
   *         close
   */
  public double getEditDistanceScoresThisClass(String label, String g) {
    return editDistanceRatio(getEditDistanceFromThisClass(label, g, 4), g);
  }

  /**
   * @return {@code true} if the words are equal, or if {@code w2} is longer
   *         than {@code minLen4Fuzzy} and {@code new EditDistance(true)}
   *         scores the pair at exactly 1
   */
  public static boolean isFuzzyMatch(String w1, String w2, int minLen4Fuzzy) {
    if (w1.equals(w2)) {
      return true;
    }
    if (w2.length() <= minLen4Fuzzy) {
      return false;
    }
    // Only build the scorer once the cheap checks have failed.
    return new EditDistance(true).score(w1, w2) == 1;
  }

  /**
   * @return the first phrase in {@code words} (in the set's iteration order)
   *         that fuzzily matches {@code w}, or {@code null} if none does
   */
  public static CandidatePhrase containsFuzzy(Set<CandidatePhrase> words,
      CandidatePhrase w, int minLen4Fuzzy) {
    for (CandidatePhrase candidate : words) {
      if (isFuzzyMatch(candidate.getPhrase(), w.getPhrase(), minLen4Fuzzy)) {
        return candidate;
      }
    }
    return null;
  }

  public Map<String, Integer> getGeneralWordClassClusters() {
    return generalWordClassClusters;
  }

  public void setGeneralWordClassClusters(
      Map<String, Integer> generalWordClassClusters) {
    this.generalWordClassClusters = generalWordClassClusters;
  }

  public Map<String, String> getWordShapeCache() {
    return wordShapeCache;
  }

  public Map<String, Class<? extends Key<String>>> getAnswerClass() {
    return answerClass;
  }

  public Map<String, Map<Class, Object>> getIgnoreWordswithClassesDuringSelection() {
    return ignoreWordswithClassesDuringSelection;
  }

  /**
   * Adds seed phrases to an existing label. The label's seed set is replaced
   * by a new unmodifiable set containing the old and the new phrases.
   *
   * @param label label already present in the seed dictionary
   * @param seeds phrases to add
   * @throws Exception if the label is not present in the seed dictionary
   */
  public void addSeedWords(String label, Collection<CandidatePhrase> seeds) throws Exception {
    if (!seedLabelDictionary.containsKey(label)) {
      throw new Exception("label not present in the model");
    }
    Set<CandidatePhrase> seedWords = new HashSet<>(seedLabelDictionary.get(label));
    seedWords.addAll(seeds);
    seedLabelDictionary.put(label, Collections.unmodifiableSet(seedWords));
  }
}