package edu.stanford.nlp.patterns.surface;

import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.patterns.surface.GetPatternsFromDataMultiClass.PatternScoring;
import edu.stanford.nlp.patterns.surface.GetPatternsFromDataMultiClass.WordScoring;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.EditDistance;
import edu.stanford.nlp.util.Execution.Option;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.TypesafeMap;
import edu.stanford.nlp.util.TypesafeMap.Key;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Holds all the option flags, seed dictionaries, caches and shared state used by the
 * bootstrapped pattern/phrase learning pipeline (see {@link GetPatternsFromDataMultiClass}).
 * Call {@link #setUp(Properties)} once after the label dictionary and option fields have
 * been populated; it compiles regexes, reads the word-list files and binds the
 * TokensRegex environments.
 */
public class ConstantsAndVariables implements Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * Maximum number of iterations to run
   */
  @Option(name = "numIterationsForPatterns")
  public Integer numIterationsForPatterns = 10;

  /**
   * Maximum number of patterns learned in each iteration
   */
  @Option(name = "numPatterns")
  public int numPatterns = 10;

  /**
   * The output directory where the justifications of learning patterns and
   * phrases would be saved. These are needed for visualization
   */
  @Option(name = "outDir")
  public String outDir = null;

  /**
   * Cached file of all patterns for all tokens
   */
  @Option(name = "allPatternsFile")
  public String allPatternsFile = null;

  /**
   * If all patterns should be computed. Otherwise patterns are read from
   * allPatternsFile
   */
  @Option(name = "computeAllPatterns")
  public boolean computeAllPatterns = true;

  /**
   * Pattern Scoring mechanism. See {@link PatternScoring} for options.
   */
  @Option(name = "patternScoring")
  public PatternScoring patternScoring = PatternScoring.PosNegUnlabOdds;

  /**
   * Threshold for learning a pattern
   */
  @Option(name = "thresholdSelectPattern")
  public double thresholdSelectPattern = 1.0;

  /**
   * Currently, does not work correctly. TODO: make this work. Ideally this
   * would label words only when they occur in the context of any learned
   * pattern
   */
  @Option(name = "restrictToMatched")
  public boolean restrictToMatched = false;

  /**
   * Label words that are learned so that in further iterations we have more
   * information
   */
  @Option(name = "usePatternResultAsLabel")
  public boolean usePatternResultAsLabel = true;

  /**
   * Debug flag for learning patterns. 0 means no output, 1 means necessary
   * output, 2 means necessary output+some justification, 3 means extreme
   * debug output
   */
  @Option(name = "debug")
  public int debug = 1;

  /**
   * Do not learn patterns in which the neighboring words have the same label.
   */
  @Option(name = "ignorePatWithLabeledNeigh")
  public boolean ignorePatWithLabeledNeigh = false;

  /**
   * Save this run as ...
   */
  @Option(name = "identifier")
  public String identifier = "getpatterns";

  /**
   * Use the actual dictionary matching phrase(s) instead of the token word or
   * lemma in calculating the stats
   */
  @Option(name = "useMatchingPhrase")
  public boolean useMatchingPhrase = true;

  /**
   * Reduce pattern threshold (=0.8*current_value) to extract as many patterns
   * as possible (still restricted by <code>numPatterns</code>)
   */
  @Option(name = "tuneThresholdKeepRunning")
  public boolean tuneThresholdKeepRunning = false;

  /**
   * Maximum number of words to learn
   */
  @Option(name = "maxExtractNumWords")
  public int maxExtractNumWords = Integer.MAX_VALUE;

  /**
   * use the seed dictionaries and the new words learned for the other labels in
   * the previous iterations as negative
   */
  @Option(name = "useOtherLabelsWordsasNegative")
  public boolean useOtherLabelsWordsasNegative = true;

  /**
   * If not null, write the output like
   * "w1 w2 &lt;label1&gt; w3 &lt;label2&gt;w4&lt;/label2&gt; &lt;/label1&gt; w5 ... "
   * if w3 w4 have label1 and w4 has label 2
   */
  @Option(name = "markedOutputTextFile")
  String markedOutputTextFile = null;

  /**
   * Use lemma instead of words for the context tokens
   */
  @Option(name = "useLemmaContextTokens")
  public boolean useLemmaContextTokens = true;

  /**
   * Lowercase the context words/lemmas
   */
  @Option(name = "matchLowerCaseContext")
  public boolean matchLowerCaseContext = true;

  /**
   * Add NER restriction to the target phrase in the patterns
   */
  @Option(name = "useTargetNERRestriction")
  public boolean useTargetNERRestriction = false;

  /**
   * Initials of all POS tags to use if <code>usePOS4Pattern</code> is true,
   * separated by comma.
   */
  @Option(name = "targetAllowedTagsInitialsStr")
  public String targetAllowedTagsInitialsStr = null;

  // Parsed form of targetAllowedTagsInitialsStr; filled in by setUp().
  public Map<String, Set<String>> allowedTagsInitials = null;

  /**
   * Allowed NERs for labels. Format is
   * label1,NER1,NER11;label2,NER2,NER21,NER22;label3,...
   * <code>useTargetNERRestriction</code> flag should be true
   */
  @Option(name = "targetAllowedNERs")
  public String targetAllowedNERs = null;

  // Parsed form of targetAllowedNERs; filled in by setUp().
  public Map<String, Set<String>> allowedNERsforLabels = null;

  /**
   * Adds the parent's tag from the parse tree to the target phrase in the patterns
   */
  @Option(name = "useTargetParserParentRestriction")
  public boolean useTargetParserParentRestriction = false;

  /**
   * If the NER tag of the context tokens is not the background symbol,
   * generalize the token with the NER tag
   */
  @Option(name = "useContextNERRestriction")
  public boolean useContextNERRestriction = false;

  /**
   * Number of words to learn in each iteration
   */
  @Option(name = "numWordsToAdd")
  public int numWordsToAdd = 10;

  @Option(name = "weightDomainFreq")
  public int weightDomainFreq = 10;

  @Option(name = "thresholdNumPatternsApplied")
  public double thresholdNumPatternsApplied = 2;

  @Option(name = "wordScoring")
  public WordScoring wordScoring = WordScoring.WEIGHTEDNORM;

  @Option(name = "thresholdWordExtract")
  public double thresholdWordExtract = 0.2;

  public boolean justify = false;

  /**
   * Sigma for L2 regularization in Logistic regression, if a classifier is
   * used to score phrases
   */
  @Option(name = "LRSigma")
  public double LRSigma = 1.0;

  /**
   * English words that are not labeled when labeling using seed dictionaries
   */
  @Option(name = "englishWordsFiles")
  public String englishWordsFiles = null;

  private Set<String> englishWords = null;

  /**
   * Words to be ignored when learning phrases if
   * <code>removePhrasesWithStopWords</code> or
   * <code>removeStopWordsFromSelectedPhrases</code> is true. Also, these words
   * are considered negative when scoring a pattern (similar to
   * othersemanticclasses).
   */
  @Option(name = "commonWordsPatternFiles")
  public String commonWordsPatternFiles = null;

  private Set<String> commonEngWords = null;

  /**
   * List of dictionary phrases that are negative for all labels to be learned.
   * Format is file_1,file_2,... where file_i has each phrase in a different
   * line
   */
  @Option(name = "otherSemanticClassesFiles")
  public String otherSemanticClassesFiles = null;

  // set of words that are considered negative for all classes
  private Set<String> otherSemanticClasses = null;

  /**
   * Seed dictionary, set in the class that uses this class
   */
  private Map<String, Set<String>> labelDictionary = new HashMap<String, Set<String>>();

  public Map<String, Class<? extends TypesafeMap.Key<String>>> answerClass = null;

  /**
   * Can be used only when using the API - using the appropriate constructor.
   * Tokens with specified classes set (has to be boolean return value, even
   * though this variable says object) will be ignored.
   */
  @SuppressWarnings("rawtypes")
  public Map<String, Map<Class, Object>> ignoreWordswithClassesDuringSelection = null;

  /**
   * These classes will be generalized. It can only be used via the API using
   * the appropriate constructor. All label classes are by default generalized.
   */
  @SuppressWarnings("rawtypes")
  private Map<String, Class> generalizeClasses = new HashMap<String, Class>();

  /**
   * Minimum length of words that can be matched fuzzily
   */
  @Option(name = "minLen4FuzzyForPattern")
  public int minLen4FuzzyForPattern = 6;

  /**
   * Do not learn phrases that match this regex.
   */
  @Option(name = "wordIgnoreRegex")
  public String wordIgnoreRegex = "[^a-zA-Z]*";

  /**
   * Number of threads
   */
  @Option(name = "numThreads")
  public int numThreads = 1;

  /**
   * Words that are not learned. Patterns are not created around these words.
   * And, if useStopWordsBeforeTerm in {@link CreatePatterns} is true.
   */
  @Option(name = "stopWordsPatternFiles", gloss = "stop words")
  public String stopWordsPatternFiles = null;

  private Set<String> stopWords = null;

  public List<String> fillerWords = Arrays.asList("a", "an", "the", "`", "``",
      "'", "''");

  /**
   * Environment for {@link TokenSequencePattern}
   */
  public Map<String, Env> env = new HashMap<String, Env>();

  /**
   * What phrases to ignore. By default matches nothing ("a^" can never match).
   */
  public Pattern ignoreWordRegex = Pattern.compile("a^");

  @Option(name = "removeStopWordsFromSelectedPhrases")
  public boolean removeStopWordsFromSelectedPhrases = false;

  @Option(name = "removePhrasesWithStopWords")
  public boolean removePhrasesWithStopWords = false;

  // Guard so setUp() runs its file reading and env binding only once.
  private boolean alreadySetUp = false;

  /**
   * Cluster file, in which each line is word/phrase&lt;tab&gt;clusterid
   */
  @Option(name = "wordClassClusterFile")
  String wordClassClusterFile = null;

  private Map<String, Integer> wordClassClusters = null;

  /**
   * General cluster file, if you wanna use it somehow, in which each line is
   * word/phrase&lt;tab&gt;clusterid
   */
  @Option(name = "generalWordClassClusterFile")
  String generalWordClassClusterFile = null;

  private Map<String, Integer> generalWordClassClusters = null;

  @Option(name = "includeExternalFeatures")
  public boolean includeExternalFeatures = false;

  @Option(name = "externalFeatureWeightsFile")
  public String externalFeatureWeightsFile = null;

  @Option(name = "doNotApplyPatterns")
  public boolean doNotApplyPatterns = false;

  @Option(name = "numWordsCompound")
  public int numWordsCompound = 2;

  /**
   * If score for a pattern is square rooted
   */
  @Option(name = "sqrtPatScore")
  public boolean sqrtPatScore = false;

  /**
   * Remove patterns that have number of unlabeled words is less than this.
   */
  @Option(name = "minUnlabPhraseSupportForPat")
  public int minUnlabPhraseSupportForPat = 0;

  /**
   * Remove patterns that have number of positive words less than this.
   */
  @Option(name = "minPosPhraseSupportForPat")
  public int minPosPhraseSupportForPat = 1;

  /**
   * For example, if positive seed dict contains "cancer" and "breast cancer"
   * then "breast" is included as negative
   */
  @Option(name = "addIndvWordsFromPhrasesExceptLastAsNeg")
  public boolean addIndvWordsFromPhrasesExceptLastAsNeg = false;

  /**
   * Cached edit distances: phrase -> min distance to an English dictionary word.
   */
  private ConcurrentHashMap<String, Double> editDistanceFromEnglishWords = new ConcurrentHashMap<String, Double>();

  /**
   * Cached edit distances: phrase -> closest English dictionary word.
   */
  private ConcurrentHashMap<String, String> editDistanceFromEnglishWordsMatches = new ConcurrentHashMap<String, String>();

  /**
   * Cached edit distances: phrase -> min distance to the other-semantic-class words.
   */
  private ConcurrentHashMap<String, Double> editDistanceFromOtherSemanticClasses = new ConcurrentHashMap<String, Double>();

  /**
   * Cached edit distances: phrase -> closest other-semantic-class word.
   */
  private ConcurrentHashMap<String, String> editDistanceFromOtherSemanticClassesMatches = new ConcurrentHashMap<String, String>();

  /**
   * Cached edit distances: phrase -> min distance to this label's dictionary words.
   */
  private ConcurrentHashMap<String, Double> editDistanceFromThisClass = new ConcurrentHashMap<String, Double>();

  /**
   * Cached edit distances: phrase -> closest word in this label's dictionary.
   */
  private ConcurrentHashMap<String, String> editDistanceFromThisClassMatches = new ConcurrentHashMap<String, String>();

  private Map<String, Counter<String>> wordShapesForLabels = new HashMap<String, Counter<String>>();

  String channelNameLogger = "settingUp";

  public Map<String, Counter<Integer>> distSimWeights = new HashMap<String, Counter<Integer>>();
  public Map<String, Counter<String>> dictOddsWeights = new HashMap<String, Counter<String>>();

  public enum ScorePhraseMeasures {
    DISTSIM, GOOGLENGRAM, PATWTBYFREQ, EDITDISTSAME, EDITDISTOTHER, DOMAINNGRAM, SEMANTICODDS, WORDSHAPE
  };

  /**
   * Only works if you have single label. And the word classes are given.
   */
  @Option(name = "usePhraseEvalWordClass")
  public boolean usePhraseEvalWordClass = false;

  /**
   * use google tf-idf for learning phrases
   */
  @Option(name = "usePhraseEvalGoogleNgram")
  public boolean usePhraseEvalGoogleNgram = false;

  /**
   * use domain tf-idf for learning phrases
   */
  @Option(name = "usePhraseEvalDomainNgram")
  public boolean usePhraseEvalDomainNgram = false;

  /**
   * use \sum_allpat pattern_wt_that_extracted_phrase/phrase_freq for learning
   * phrases
   */
  @Option(name = "usePhraseEvalPatWtByFreq")
  public boolean usePhraseEvalPatWtByFreq = false;

  /**
   * odds of the phrase freq in the label dictionary vs other dictionaries
   */
  @Option(name = "usePhraseEvalSemanticOdds")
  public boolean usePhraseEvalSemanticOdds = false;

  /**
   * Edit distance between this phrase and the other phrases in the label
   * dictionary
   */
  @Option(name = "usePhraseEvalEditDistSame")
  public boolean usePhraseEvalEditDistSame = false;

  /**
   * Edit distance between this phrase and other phrases in other dictionaries
   */
  @Option(name = "usePhraseEvalEditDistOther")
  public boolean usePhraseEvalEditDistOther = false;

  @Option(name = "usePhraseEvalWordShape")
  public boolean usePhraseEvalWordShape = false;

  /**
   * Used only if {@link #patternScoring} is one of the phrase-evaluation-based
   * scorings (the <code>PhEvalInPat</code> variants). See usePhrase* for meanings.
   */
  @Option(name = "usePatternEvalWordClass")
  public boolean usePatternEvalWordClass = false;

  /**
   * Used only if {@link #patternScoring} is one of the phrase-evaluation-based
   * scorings (the <code>PhEvalInPat</code> variants). See usePhrase* for meanings.
   */
  @Option(name = "usePatternEvalWordShape")
  public boolean usePatternEvalWordShape = false;

  /**
   * Used only if {@link #patternScoring} is one of the phrase-evaluation-based
   * scorings (the <code>PhEvalInPat</code> variants). See usePhrase* for meanings.
   */
  @Option(name = "usePatternEvalGoogleNgram")
  public boolean usePatternEvalGoogleNgram = false;

  /**
   * Used only if {@link #patternScoring} is one of the phrase-evaluation-based
   * scorings (the <code>PhEvalInPat</code> variants). See usePhrase* for meanings.
   */
  @Option(name = "usePatternEvalDomainNgram")
  public boolean usePatternEvalDomainNgram = false;

  /**
   * Used only if {@link #patternScoring} is one of the phrase-evaluation-based
   * scorings (the <code>PhEvalInPat</code> variants). See usePhrase* for meanings.
   */
  @Option(name = "usePatternEvalSemanticOdds")
  public boolean usePatternEvalSemanticOdds = false;

  /**
   * Used only if {@link #patternScoring} is one of the phrase-evaluation-based
   * scorings (the <code>PhEvalInPat</code> variants). See usePhrase* for meanings.
   */
  @Option(name = "usePatternEvalEditDistSame")
  public boolean usePatternEvalEditDistSame = false;

  /**
   * Used only if {@link #patternScoring} is one of the phrase-evaluation-based
   * scorings (the <code>PhEvalInPat</code> variants). See usePhrase* for meanings.
   */
  @Option(name = "usePatternEvalEditDistOther")
  public boolean usePatternEvalEditDistOther = false;

  /**
   * These are used to learn weights for features if using logistic regression.
   * Percentage of non-labeled tokens selected as negative.
   */
  @Option(name = "perSelectRand")
  public double perSelectRand = 0.01;

  /**
   * These are used to learn weights for features if using logistic regression.
   * Percentage of negative tokens selected as negative.
   */
  @Option(name = "perSelectNeg")
  public double perSelectNeg = 1;

  /**
   * Especially useful for multi word phrase extraction. Do not extract a phrase
   * if any word is labeled with any other class.
   */
  @Option(name = "doNotExtractPhraseAnyWordLabeledOtherClass")
  public boolean doNotExtractPhraseAnyWordLabeledOtherClass = true;

  /**
   * You can save the inverted index to this file
   */
  @Option(name = "saveInvertedIndexDir")
  public String saveInvertedIndexDir = null;

  /**
   * You can load the inv index using this file
   */
  @Option(name = "loadInvertedIndexDir")
  public String loadInvertedIndexDir = null;

  /**
   * Directory where to save the sentences ser files.
   */
  @Option(name = "saveSentencesSerDir")
  public String saveSentencesSerDir = null;

  public boolean usingDirForSentsInIndex = false;

  String backgroundSymbol = "O";

  int wordShaper = WordShapeClassifier.WORDSHAPECHRIS2;

  private Map<String, String> wordShapeCache = new HashMap<String, String>();

  public InvertedIndexByTokens invertedIndex;

  public static String extremedebug = "extremePatDebug";
  public static String minimaldebug = "minimaldebug";

  // NOTE(review): never assigned in this class; presumably set by a caller — verify.
  Properties props;

  double editDistMax = 100;

  /**
   * Use this option if you are limited by memory ; ignored if fileFormat is ser.
   */
  @Option(name = "batchProcessSents")
  public boolean batchProcessSents = false;

  @Option(name = "writeMatchedTokensFiles")
  public boolean writeMatchedTokensFiles = false;

  /**
   * One-time initialization: compiles the phrase-ignore regex, creates a
   * TokensRegex {@link Env} per label (binding answer classes and generalize
   * classes), reads the stop-word / English-word / common-word /
   * other-semantic-class files, binds $FILLER, $STOPWORD, $MOD etc. into each
   * environment, loads the word-class cluster files, and parses the allowed
   * tag-initials and allowed-NER option strings. A no-op if already set up.
   *
   * @param props properties for the run (stored semantics handled by callers)
   * @throws IOException if any of the configured word-list files cannot be read
   */
  @SuppressWarnings("rawtypes")
  public void setUp(Properties props) throws IOException {
    if (alreadySetUp) {
      return;
    }

    if (wordIgnoreRegex != null && !wordIgnoreRegex.isEmpty())
      ignoreWordRegex = Pattern.compile(wordIgnoreRegex);

    for (String label : labelDictionary.keySet()) {
      env.put(label, TokenSequencePattern.getNewEnv());
      for (Entry<String, Class<? extends Key<String>>> en : this.answerClass
          .entrySet()) {
        env.get(label).bind(en.getKey(), en.getValue());
      }
      for (Entry<String, Class> en : generalizeClasses.entrySet())
        env.get(label).bind(en.getKey(), en.getValue());
    }

    Redwood.log(Redwood.DBG, channelNameLogger, "Running with debug output");

    // FIX: guard against null file lists (both default to null) instead of
    // NPE-ing on split(), consistent with commonWordsPatternFiles below.
    stopWords = new HashSet<String>();
    if (stopWordsPatternFiles != null) {
      Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger,
          "Reading stop words from " + stopWordsPatternFiles);
      for (String stopwfile : stopWordsPatternFiles.split("[;,]"))
        stopWords.addAll(IOUtils.linesFromFile(stopwfile));
    }

    englishWords = new HashSet<String>();
    if (englishWordsFiles != null) {
      System.out.println("Reading english words from " + englishWordsFiles);
      for (String englishWordsFile : englishWordsFiles.split("[;,]"))
        englishWords.addAll(IOUtils.linesFromFile(englishWordsFile));
    }

    if (commonWordsPatternFiles != null) {
      commonEngWords = Collections.synchronizedSet(new HashSet<String>());
      for (String file : commonWordsPatternFiles.split("[;,]"))
        commonEngWords.addAll(IOUtils.linesFromFile(file));
    }

    if (otherSemanticClassesFiles != null) {
      if (otherSemanticClasses == null)
        otherSemanticClasses = Collections
            .synchronizedSet(new HashSet<String>());
      for (String file : otherSemanticClassesFiles.split("[;,]")) {
        for (String w : IOUtils.linesFromFile(file)) {
          // Only keep phrases short enough to be extracted as compounds.
          String[] t = w.split("\\s+");
          if (t.length <= this.numWordsCompound)
            otherSemanticClasses.add(w);
        }
      }
      System.out.println("Size of othersemantic class variables is "
          + otherSemanticClasses.size());
    } else {
      otherSemanticClasses = Collections.synchronizedSet(new HashSet<String>());
      System.out.println("Size of othersemantic class variables is " + 0);
    }

    // Build a TokensRegex alternation "/w1|w2|.../" of the stop words,
    // quoting each word (backslashes doubled first so quoting survives).
    String stopStr = "/";
    int i = 0;
    for (String s : stopWords) {
      if (i > 0)
        stopStr += "|";
      stopStr += Pattern.quote(s.replaceAll("\\\\", "\\\\\\\\"));
      i++;
    }
    stopStr += "/";

    for (String label : labelDictionary.keySet()) {
      env.get(label).bind("$FILLER",
          "/" + StringUtils.join(fillerWords, "|") + "/");
      env.get(label).bind("$STOPWORD", stopStr);
      env.get(label).bind("$MOD", "[{tag:/JJ.*/}]");
      if (matchLowerCaseContext)
        env.get(label).setDefaultStringPatternFlags(Pattern.CASE_INSENSITIVE);
      env.get(label).bind("OTHERSEM",
          PatternsAnnotations.OtherSemanticLabel.class);
      env.get(label).bind("grandparentparsetag",
          CoreAnnotations.GrandparentAnnotation.class);
    }

    if (wordClassClusterFile != null) {
      wordClassClusters = new HashMap<String, Integer>();
      for (String line : IOUtils.readLines(wordClassClusterFile)) {
        String[] t = line.split("\t");
        wordClassClusters.put(t[0], Integer.parseInt(t[1]));
      }
    }

    if (generalWordClassClusterFile != null) {
      setGeneralWordClassClusters(new HashMap<String, Integer>());
      for (String line : IOUtils.readLines(generalWordClassClusterFile)) {
        String[] t = line.split("\t");
        getGeneralWordClassClusters().put(t[0], Integer.parseInt(t[1]));
      }
    }

    // Format: label1,tag1,tag2;label2,tag3;...
    if (targetAllowedTagsInitialsStr != null) {
      allowedTagsInitials = new HashMap<String, Set<String>>();
      for (String labelstr : targetAllowedTagsInitialsStr.split(";")) {
        String[] t = labelstr.split(",");
        Set<String> st = new HashSet<String>();
        for (int j = 1; j < t.length; j++)
          st.add(t[j]);
        allowedTagsInitials.put(t[0], st);
      }
    }

    // Format: label1,NER1,NER2;label2,NER3;...
    if (useTargetNERRestriction && targetAllowedNERs != null) {
      allowedNERsforLabels = new HashMap<String, Set<String>>();
      for (String labelstr : targetAllowedNERs.split(";")) {
        String[] t = labelstr.split(",");
        Set<String> st = new HashSet<String>();
        for (int j = 1; j < t.length; j++)
          st.add(t[j]);
        allowedNERsforLabels.put(t[0], st);
      }
    }

    alreadySetUp = true;
  }

  public Map<String, Counter<String>> getWordShapesForLabels() {
    return wordShapesForLabels;
  }

  public void setWordShapesForLabels(
      Map<String, Counter<String>> wordShapesForLabels) {
    this.wordShapesForLabels = wordShapesForLabels;
  }

  @SuppressWarnings("rawtypes")
  public void addGeneralizeClasses(Map<String, Class> gen) {
    this.generalizeClasses.putAll(gen);
  }

  @SuppressWarnings("rawtypes")
  public Map<String, Class> getGeneralizeClasses() {
    return this.generalizeClasses;
  }

  public Set<String> getStopWords() {
    return stopWords;
  }

  /**
   * Increments the word-shape counts for the given label by the shapes of the
   * given words, using {@link #wordShapeCache} to avoid recomputing shapes.
   */
  public void addWordShapes(String label, Set<String> words) {
    if (!this.wordShapesForLabels.containsKey(label)) {
      this.wordShapesForLabels.put(label, new ClassicCounter<String>());
    }
    for (String w : words) {
      String ws = null;
      if (wordShapeCache.containsKey(w))
        ws = wordShapeCache.get(w);
      else {
        ws = WordShapeClassifier.wordShape(w, wordShaper);
        wordShapeCache.put(w, ws);
      }
      wordShapesForLabels.get(label).incrementCount(ws);
    }
  }

  /**
   * Replaces the seed dictionary. If word-shape scoring is enabled, the
   * word-shape counts are rebuilt from the new seed sets.
   */
  public void setLabelDictionary(Map<String, Set<String>> seedSets) {
    this.labelDictionary = seedSets;
    if (usePhraseEvalWordShape || usePatternEvalWordShape) {
      this.wordShapesForLabels.clear();
      for (Entry<String, Set<String>> en : seedSets.entrySet())
        addWordShapes(en.getKey(), en.getValue());
    }
  }

  public Map<String, Set<String>> getLabelDictionary() {
    return this.labelDictionary;
  }

  /**
   * Adds words to an existing label's dictionary (the label must already be
   * present), updating word-shape counts if word-shape scoring is enabled.
   */
  public void addLabelDictionary(String label, Set<String> words) {
    this.labelDictionary.get(label).addAll(words);
    if (usePhraseEvalWordShape || usePatternEvalWordShape)
      addWordShapes(label, words);
  }

  public Set<String> getEnglishWords() {
    return this.englishWords;
  }

  public Set<String> getCommonEngWords() {
    return this.commonEngWords;
  }

  public Set<String> getOtherSemanticClasses() {
    return this.otherSemanticClasses;
  }

  public void setOtherSemanticClasses(Set<String> other) {
    this.otherSemanticClasses = other;
  }

  public Map<String, Integer> getWordClassClusters() {
    return this.wordClassClusters;
  }

  /**
   * Finds the word in {@code words} closest to {@code ph} by (bounded)
   * Damerau-Levenshtein distance. Returns (ph, 0) on an exact match, and
   * short-circuits on distance 1; a distance of -1 from the bounded
   * computation means "over the bound" and is treated as {@link #editDistMax}.
   */
  private Pair<String, Double> getEditDist(Set<String> words, String ph) {
    double minD = editDistMax;
    String minPh = ph;
    for (String e : words) {
      if (e.equals(ph))
        return new Pair<String, Double>(ph, 0.0);

      double d = EditDistanceDamerauLevenshteinLike.editDistance(e, ph, 3);
      if (d == 1)
        return new Pair<String, Double>(e, d);
      if (d == -1)
        d = editDistMax;
      if (d < minD) {
        minD = d;
        minPh = e;
      }
    }
    return new Pair<String, Double>(minPh, minD);
  }

  /**
   * Closest word (and its edit distance) from the given label's dictionary to
   * {@code ph}; phrases shorter than {@code minLen} get distance
   * {@link #editDistMax}. Results are cached.
   */
  public Pair<String, Double> getEditDistanceFromThisClass(String label,
      String ph, int minLen) {
    if (ph.length() < minLen)
      return new Pair<String, Double>(ph, editDistMax);
    if (editDistanceFromThisClass.containsKey(ph))
      return new Pair<String, Double>(editDistanceFromThisClassMatches.get(ph),
          editDistanceFromThisClass.get(ph));

    Pair<String, Double> minD = getEditDist(labelDictionary.get(label), ph);
    double minDtotal = minD.second();
    String minPh = minD.first();
    assert (!minPh.isEmpty());
    editDistanceFromThisClass.putIfAbsent(ph, minDtotal);
    editDistanceFromThisClassMatches.putIfAbsent(ph, minPh);
    return new Pair<String, Double>(minPh, minDtotal);
  }

  /**
   * Closest word (and its edit distance) from the other-semantic-class set to
   * {@code ph}; phrases shorter than {@code minLen} get distance
   * {@link #editDistMax}. Results are cached.
   */
  public Pair<String, Double> getEditDistanceFromOtherSemanticClasses(
      String ph, int minLen) {
    if (ph.length() < minLen)
      return new Pair<String, Double>(ph, editDistMax);
    if (editDistanceFromOtherSemanticClasses.containsKey(ph))
      return new Pair<String, Double>(
          editDistanceFromOtherSemanticClassesMatches.get(ph),
          editDistanceFromOtherSemanticClasses.get(ph));

    Pair<String, Double> minD = getEditDist(otherSemanticClasses, ph);
    double minDtotal = minD.second();
    String minPh = minD.first();
    assert (!minPh.isEmpty());
    editDistanceFromOtherSemanticClasses.putIfAbsent(ph, minDtotal);
    editDistanceFromOtherSemanticClassesMatches.putIfAbsent(ph, minPh);
    return new Pair<String, Double>(minPh, minDtotal);
  }

  /**
   * Minimum edit distance from {@code ph} to the common English words (falling
   * back to the other-semantic-class set when the best match is worse than 2).
   * Phrases shorter than {@code minLen} get {@link #editDistMax}. Cached.
   */
  public double getEditDistanceFromEng(String ph, int minLen) {
    if (ph.length() < minLen)
      return editDistMax;
    if (editDistanceFromEnglishWords.containsKey(ph))
      return editDistanceFromEnglishWords.get(ph);

    Pair<String, Double> d = getEditDist(commonEngWords, ph);
    double minD = d.second();
    String minPh = d.first();
    if (d.second() > 2) {
      Pair<String, Double> minD2 = getEditDist(otherSemanticClasses, ph);
      if (minD2.second < minD) {
        minD = minD2.second();
        minPh = minD2.first();
      }
    }
    editDistanceFromEnglishWords.putIfAbsent(ph, minD);
    editDistanceFromEnglishWordsMatches.putIfAbsent(ph, minPh);
    return minD;
  }

  public ConcurrentHashMap<String, Double> getEditDistanceFromEnglishWords() {
    return this.editDistanceFromEnglishWords;
  }

  public ConcurrentHashMap<String, String> getEditDistanceFromEnglishWordsMatches() {
    return this.editDistanceFromEnglishWordsMatches;
  }

  /**
   * Edit distance of {@code g} to the closest other-semantic-class word,
   * normalized by the length of that closest word.
   */
  public double getEditDistanceScoresOtherClass(String g) {
    double editDist;
    String editDistPh;
    if (editDistanceFromOtherSemanticClasses.containsKey(g)) {
      editDist = editDistanceFromOtherSemanticClasses.get(g);
      editDistPh = editDistanceFromOtherSemanticClassesMatches.get(g);
    } else {
      Pair<String, Double> editMatch = getEditDistanceFromOtherSemanticClasses(
          g, 4);
      editDist = editMatch.second();
      editDistPh = editMatch.first();
    }
    assert (!editDistPh.isEmpty());
    return editDist / (double) editDistPh.length();
  }

  /**
   * 1 if lies in edit distance, 0 if not close to any words
   *
   * @param g phrase to score
   * @return 1 if the normalized distance is below 0.2, else 0
   */
  public double getEditDistanceScoresOtherClassThreshold(String g) {
    double editDistRatio = getEditDistanceScoresOtherClass(g);
    if (editDistRatio < 0.2)
      return 1;
    else
      return 0;
  }

  /**
   * 1 if the normalized distance to this label's dictionary is below 0.2, else 0.
   */
  public double getEditDistanceScoresThisClassThreshold(String label, String g) {
    double editDistRatio = getEditDistanceScoresThisClass(label, g);
    if (editDistRatio < 0.2)
      return 1;
    else
      return 0;
  }

  /**
   * Edit distance of {@code g} to the closest word in this label's dictionary,
   * normalized by the length of that closest word.
   */
  public double getEditDistanceScoresThisClass(String label, String g) {
    double editDist;
    String editDistPh;
    if (editDistanceFromThisClass.containsKey(g)) {
      editDist = editDistanceFromThisClass.get(g);
      editDistPh = editDistanceFromThisClassMatches.get(g);
    } else {
      Pair<String, Double> editMatch = getEditDistanceFromThisClass(label, g, 4);
      editDist = editMatch.second();
      editDistPh = editMatch.first();
    }
    assert (!editDistPh.isEmpty());
    return editDist / (double) editDistPh.length();
  }

  /**
   * True if the two words are equal, or if {@code w2} is longer than
   * {@code minLen4Fuzzy} and the two are within edit distance 1.
   */
  public static boolean isFuzzyMatch(String w1, String w2, int minLen4Fuzzy) {
    EditDistance editDistance = new EditDistance(true);
    if (w1.equals(w2))
      return true;
    if (w2.length() > minLen4Fuzzy) {
      double d = editDistance.score(w1, w2);
      if (d == 1) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the first word in {@code words} that fuzzily matches {@code w}
   * (see {@link #isFuzzyMatch}), or null if none does.
   */
  public static String containsFuzzy(Set<String> words, String w,
      int minLen4Fuzzy) {
    for (String w1 : words) {
      if (isFuzzyMatch(w1, w, minLen4Fuzzy))
        return w1;
    }
    return null;
  }

  public Map<String, Integer> getGeneralWordClassClusters() {
    return generalWordClassClusters;
  }

  public void setGeneralWordClassClusters(
      Map<String, Integer> generalWordClassClusters) {
    this.generalWordClassClusters = generalWordClassClusters;
  }

  public Map<String, String> getWordShapeCache() {
    return wordShapeCache;
  }
}