package edu.stanford.nlp.patterns.dep;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.function.Function;
import java.util.stream.Collectors;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.patterns.CandidatePhrase;
import edu.stanford.nlp.patterns.ConstantsAndVariables;
import edu.stanford.nlp.patterns.DataInstance;
import edu.stanford.nlp.patterns.Pattern;
import edu.stanford.nlp.patterns.PatternFactory;
import edu.stanford.nlp.patterns.PatternsAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Triple;

/**
 * Applies dependency patterns to sentences.
 *
 * @author sonalg
 * @version 11/1/14
 */
public class ApplyDepPatterns<E extends Pattern> implements
    Callable<Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>>> {

  private String label;
  private Map<SemgrexPattern, E> patterns;
  private List<String> sentids;
  private boolean removeStopWordsFromSelectedPhrases;
  private boolean removePhrasesWithStopWords;
  private ConstantsAndVariables constVars;
  private Map<String, DataInstance> sents; // = null;

  public ApplyDepPatterns(Map<String, DataInstance> sents, List<String> sentids, Map<SemgrexPattern, E> patterns,
      String label, boolean removeStopWordsFromSelectedPhrases, boolean removePhrasesWithStopWords,
      ConstantsAndVariables cv) {
    this.sents = sents;
    this.patterns = patterns;
    this.sentids = sentids;
    this.label = label;
    this.removeStopWordsFromSelectedPhrases = removeStopWordsFromSelectedPhrases;
    this.removePhrasesWithStopWords = removePhrasesWithStopWords;
    this.constVars = cv;
  }
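  /*
   * Usage sketch (illustrative only; the surrounding variable names are assumed, not part of
   * this class): instances of this Callable are typically submitted to an ExecutorService by
   * the pattern-learning driver, and the resulting pairs are merged across threads, e.g.
   *
   *   ExecutorService pool = Executors.newFixedThreadPool(numThreads);
   *   ApplyDepPatterns<DepPattern> task = new ApplyDepPatterns<>(sents, sentIds, patterns, label,
   *       removeStopWordsFromSelectedPhrases, removePhrasesWithStopWords, constVars);
   *   Pair<TwoDimensionalCounter<CandidatePhrase, DepPattern>,
   *        CollectionValuedMap<DepPattern, Triple<String, Integer, Integer>>> result =
   *       pool.submit(task).get();
   *
   * The first element of the pair counts how often each pattern extracted each candidate
   * phrase; the second maps each pattern to the (sentence id, start token, end token) spans
   * it matched.
   */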
  @Override
  public Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>> call()
      throws Exception {
    // CollectionValuedMap<String, Integer> tokensMatchedPattern = new CollectionValuedMap<String, Integer>();
    TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();

    for (String sentid : sentids) {
      DataInstance sent = sents.get(sentid);
      List<CoreLabel> tokens = sent.getTokens();

      for (Map.Entry<SemgrexPattern, E> pEn : patterns.entrySet()) {
        if (pEn.getKey() == null)
          throw new RuntimeException("Why is the pattern " + pEn + " null?");

        SemanticGraph graph = ((DataInstanceDep) sent).getGraph();

        // SemgrexMatcher m = pEn.getKey().matcher(graph);
        // TokenSequenceMatcher m = pEn.getKey().matcher(sent);
        // // Setting this find type can save time in searching -- greedy and reluctant quantifiers are not enforced
        // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
        // Higher branch values make the search faster but use more memory
        // m.setBranchLimit(5);

        Collection<ExtractedPhrase> matched = getMatchedTokensIndex(graph, pEn.getKey(), sent, label);

        for (ExtractedPhrase match : matched) {
          int s = match.startIndex;
          int e = match.endIndex + 1;

          String phrase = "";
          String phraseLemma = "";
          boolean useWordNotLabeled = false;
          boolean doNotUse = false;

          // if the neighboring words are already labeled with this label, club them together with the match
          if (constVars.clubNeighboringLabeledWords) {
            for (int i = s - 1; i >= 0; i--) {
              if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label)
                  && (e - i + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
                s = i;
                // System.out.println("for phrase " + match + " clubbing earlier word. new s is " + s);
              } else
                break;
            }
            for (int i = e; i < tokens.size(); i++) {
              if (tokens.get(i).get(constVars.getAnswerClass().get(label)).equals(label)
                  && (i - s + 1) <= PatternFactory.numWordsCompoundMapped.get(label)) {
                e = i;
                // System.out.println("for phrase " + match + " clubbing next word. new e is " + e);
              } else
                break;
            }
          }

          // to make sure we discard phrases with stop words in between, but include the ones in which
          // stop words were removed at the ends (when removeStopWordsFromSelectedPhrases is true)
          boolean[] addedindices = new boolean[e - s];
          // Arrays.fill(addedindices, false); // get for free on array initialization

          for (int i = s; i < e; i++) {
            CoreLabel l = tokens.get(i);
            l.set(PatternsAnnotations.MatchedPattern.class, true);

            if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
              l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());

            Pattern pSur = pEn.getValue();
            assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
            assert l.get(PatternsAnnotations.MatchedPatterns.class) != null :
                "How come the MatchedPatterns class is null for the token? The classes in the key set are " + l.keySet();
            l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);

            for (Map.Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
              if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                doNotUse = true;
              }
            }

            boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
            if (removePhrasesWithStopWords && containsStop) {
              doNotUse = true;
            } else {
              if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                if (label == null || l.get(constVars.getAnswerClass().get(label)) == null
                    || !l.get(constVars.getAnswerClass().get(label)).equals(label)) {
                  useWordNotLabeled = true;
                }
                phrase += " " + l.word();
                phraseLemma += " " + l.lemma();
                addedindices[i - s] = true;
              }
            }
          }

          // discard phrases where a token in the middle was dropped, i.e. an unmatched
          // token sandwiched between two matched ones
          for (int i = 1; i < addedindices.length - 1; i++) {
            if (addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
              doNotUse = true;
              break;
            }
          }

          if (!doNotUse && useWordNotLabeled) {
            matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
            phrase = phrase.trim();
            phraseLemma = phraseLemma.trim();
            allFreq.incrementCount(CandidatePhrase.createOrGet(phrase, phraseLemma, match.getFeatures()), pEn.getValue(), 1.0);
          }
        }
      }
    }
    return new Pair<>(allFreq, matchedTokensByPat);
  }

  private final Function<CoreLabel, Boolean> matchingWordRestriction = coreLabel -> matchedRestriction(coreLabel, label);
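  /*
   * For reference, a minimal sketch of the Semgrex matching that the helper below delegates
   * to ExtractPhraseFromPattern (the pattern string is a made-up example; SemgrexPattern.compile,
   * matcher, find, and getNode are the standard API, applied to the sentence's SemanticGraph):
   *
   *   SemgrexPattern p = SemgrexPattern.compile("{} >nsubj {}=node");
   *   SemgrexMatcher m = p.matcher(graph);
   *   while (m.find()) {
   *     IndexedWord w = m.getNode("node"); // the token bound to the "node" name
   *   }
   */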
  private Collection<ExtractedPhrase> getMatchedTokensIndex(SemanticGraph graph, SemgrexPattern pattern,
      DataInstance sent, String label) {
    // TODO: look at the ignoreCommonTags flag
    ExtractPhraseFromPattern extract = new ExtractPhraseFromPattern(false, PatternFactory.numWordsCompoundMapped.get(label));
    Collection<IntPair> outputIndices = new ArrayList<>();
    boolean findSubTrees = true;
    List<CoreLabel> tokensC = sent.getTokens();
    // TODO: see if you can get rid of this (only used for matchedGraphs)
    List<String> tokens = tokensC.stream().map(CoreLabel::word).collect(Collectors.toList());
    List<String> outputPhrases = new ArrayList<>();
    List<ExtractedPhrase> extractedPhrases = new ArrayList<>();

    // features for an extracted phrase: one feature per grammatical relation from the matched node to a parent
    Function<Pair<IndexedWord, SemanticGraph>, Counter<String>> extractFeatures = wordAndGraph -> {
      // TODO: make more features
      Counter<String> feat = new ClassicCounter<>();
      IndexedWord vertex = wordAndGraph.first();
      SemanticGraph g = wordAndGraph.second();
      for (Pair<GrammaticalRelation, IndexedWord> en : g.parentPairs(vertex)) {
        feat.incrementCount("PARENTREL-" + en.first());
      }
      return feat;
    };

    extract.getSemGrexPatternNodes(graph, tokens, outputPhrases, outputIndices, pattern, findSubTrees,
        extractedPhrases, constVars.matchLowerCaseContext, matchingWordRestriction);

    /*
    // TODO: probably a bad idea to add ALL ngrams
    Collection<ExtractedPhrase> outputIndicesMaxPhraseLen = new ArrayList<ExtractedPhrase>();
    for (IntPair o : outputIndices) {
      int min = o.get(0);
      int max = o.get(1);
      for (int i = min; i <= max; i++) {
        CoreLabel t = tokensC.get(i);
        String phrase = t.word();
        if (!matchedRestriction(t, label))
          continue;
        for (int ngramSize = 1; ngramSize < PatternFactory.numWordsCompound; ++ngramSize) {
          int j = i + ngramSize - 1;
          if (j > max)
            break;
          CoreLabel tokenj = tokensC.get(j);
          if (ngramSize > 1)
            phrase += " " + tokenj.word();
          if (matchedRestriction(tokenj, label)) {
            outputIndicesMaxPhraseLen.add(new ExtractedPhrase(i, j, phrase));
            // outputIndicesMaxPhraseLen.add(new IntPair(i, j));
          }
        }
      }
    }
    */
    // System.out.println("extracted phrases are " + extractedPhrases + " and output indices are " + outputIndices);
    return extractedPhrases;
  }

  private boolean matchedRestriction(CoreLabel coreLabel, String label) {
    boolean use = false;
    if (PatternFactory.useTargetNERRestriction) {
      for (String s : constVars.allowedNERsforLabels.get(label)) {
        if (coreLabel.get(CoreAnnotations.NamedEntityTagAnnotation.class).matches(s)) {
          use = true;
          break;
        }
      }
    } else {
      // not restricting by NER
      use = true;
    }

    if (use) {
      String tag = coreLabel.tag();
      if (constVars.allowedTagsInitials != null && constVars.allowedTagsInitials.containsKey(label)) {
        for (String allowed : constVars.allowedTagsInitials.get(label)) {
          use = tag.startsWith(allowed);
          if (use)
            break;
        }
      }
    }

    if (constVars.debug >= 4) {
      String restrictions = (PatternFactory.useTargetNERRestriction ? constVars.allowedNERsforLabels.get(label) : "")
          + " and " + PatternFactory.useTargetNERRestriction + " and "
          + (constVars.allowedTagsInitials != null ? constVars.allowedTagsInitials.get(label) : "");
      if (use)
        System.out.println(coreLabel.word() + " matched restriction " + restrictions);
      else
        System.out.println(coreLabel.word() + " did not match restriction " + restrictions);
    }

    return use;
  }
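  /*
   * A minimal illustration of the stop-word test below (hypothetical values):
   *
   *   CoreLabel tok = new CoreLabel();
   *   tok.setWord("the");
   *   tok.setLemma("the");
   *   Set<String> common = new HashSet<>(Arrays.asList("the", "a", "of"));
   *   containsStopWord(tok, common, null); // -> true, since the lemma is in the common-words set
   *
   * In call() above, a true result either discards the whole phrase
   * (removePhrasesWithStopWords) or drops the token from the selected phrase
   * (removeStopWordsFromSelectedPhrases).
   */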
  private static boolean containsStopWord(CoreLabel l, Set<String> commonEngWords, java.util.regex.Pattern ignoreWordRegex) {
    // if (useWordResultCache.containsKey(l.word()))
    //   return useWordResultCache.get(l.word());

    if ((commonEngWords != null && (commonEngWords.contains(l.lemma()) || commonEngWords.contains(l.word())))
        || (ignoreWordRegex != null && ignoreWordRegex.matcher(l.lemma()).matches())) {
      // || (ignoreWords != null && (ignoreWords.contains(l.lemma()) || ignoreWords.contains(l.word())))
      // useWordResultCache.putIfAbsent(l.word(), false);
      return true;
    }

    /*
    if (l.word().length() >= minLen4Fuzzy) {
      try {
        String matchedFuzzy = NoisyLabelSentences.containsFuzzy(commonEngWords, l.word(), minLen4Fuzzy);
        if (matchedFuzzy != null) {
          synchronized (commonEngWords) {
            commonEngWords.add(l.word());
            System.out.println("word is " + l.word() + " and matched fuzzy with " + matchedFuzzy);
          }
          useWordResultCache.putIfAbsent(l.word(), false);
          return false;
        }
      } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Exception while fuzzy matching " + l.word());
      }
    }
    */
    // useWordResultCache.putIfAbsent(l.word(), true);
    return false;
  }

}
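/*
 * Note on matchedRestriction (illustrative, assumed configuration): with
 * PatternFactory.useTargetNERRestriction = true and constVars.allowedNERsforLabels.get("disease")
 * containing "DISEASE", a token whose NamedEntityTagAnnotation is "DISEASE" passes the
 * restriction, while a token tagged "O" is rejected. allowedTagsInitials additionally
 * restricts candidates by POS-tag prefix, e.g. the initial "N" admits NN, NNS, NNP, and NNPS.
 */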