package edu.stanford.nlp.patterns;

import java.io.*;
import java.util.*;
import java.util.function.Function;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.patterns.surface.Token;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.ArgumentParser;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Creates an inverted index of (classkey:value) =&gt; {sentid1, sentid2, ...}.
 *
 * <p>Each token of an added sentence is turned into one or more
 * {@code key:value} strings; every such string is mapped to the set of ids of
 * the sentences in which it was seen. Queries intersect those sets.
 *
 * @author Sonal Gupta (sonalg@stanford.edu)
 */
public class InvertedIndexByTokens<E extends Pattern> extends SentenceIndex<E> implements Serializable {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(InvertedIndexByTokens.class);

  private static final long serialVersionUID = 1L;

  /** Maps a combined {@code key:value} string to the ids of the sentences that contain it. */
  Map<String, Set<String>> index;

  /**
   * Creates an empty index.
   *
   * @param props options applied to this instance via {@link ArgumentParser#fillOptions}
   * @param stopWords passed to the superclass; consulted when adding and querying
   * @param transformSentenceToString maps a token to the key/value pairs that get indexed for it
   */
  public InvertedIndexByTokens(Properties props, Set<String> stopWords,
                               Function<CoreLabel, Map<String, String>> transformSentenceToString) {
    super(stopWords, transformSentenceToString);
    ArgumentParser.fillOptions(this, props);
    index = new HashMap<>();
  }

  /**
   * Creates an index backed by an existing map (used when loading a saved index).
   *
   * @param props options applied to this instance via {@link ArgumentParser#fillOptions}
   * @param stopWords passed to the superclass; consulted when adding and querying
   * @param transformSentenceToString maps a token to the key/value pairs that get indexed for it
   * @param index the map to use as the index; it is stored as-is, not copied
   */
  public InvertedIndexByTokens(Properties props, Set<String> stopWords,
                               Function<CoreLabel, Map<String, String>> transformSentenceToString,
                               Map<String, Set<String>> index) {
    super(stopWords, transformSentenceToString);
    ArgumentParser.fillOptions(this, props);
    this.index = index;
  }

  /**
   * Indexes every sentence in {@code sents}.
   *
   * @param sents sentence id to data instance; the tokens of each instance are indexed
   * @param addProcessedText whether to also index each token's processed text
   */
  @Override
  public void add(Map<String, DataInstance> sents, boolean addProcessedText) {
    for (Map.Entry<String, DataInstance> sEn : sents.entrySet()) {
      add(sEn.getValue().getTokens(), sEn.getKey(), addProcessedText);
    }
  }

  /**
   * Indexes one sentence and increments the sentence counter.
   *
   * @param sent the tokens of the sentence
   * @param sentId the id under which the sentence is recorded
   * @param addProcessedText whether to also index each token's processed text
   */
  @Override
  protected void add(List<CoreLabel> sent, String sentId, boolean addProcessedText) {
    numAllSentences++;
    for (CoreLabel l : sent) {
      Map<String, String> addThis = this.transformCoreLabeltoString.apply(l);
      for (Map.Entry<String, String> en : addThis.entrySet()) {
        add(combineKeyValue(en.getKey(), en.getValue()), sentId);
      }
      if (addProcessedText) {
        String val = Token.getKeyForClass(PatternsAnnotations.ProcessedTextAnnotation.class) + ":"
            + l.get(PatternsAnnotations.ProcessedTextAnnotation.class);
        // NOTE(review): the stop-word check here is applied to the combined "key:value"
        // string, whereas getFileSentIds checks the bare value -- confirm which is intended.
        if (!stopWords.contains(val.toLowerCase())) {
          add(val, sentId);
        }
      }
    }
  }

  @Override
  public void finishUpdating() {
    // nothing to do right now!
  }

  /**
   * Indexes one more sentence without indexing the tokens' processed text.
   *
   * @param tokens the tokens of the sentence
   * @param sentid the id under which the sentence is recorded
   */
  @Override
  public void update(List<CoreLabel> tokens, String sentid) {
    add(tokens, sentid, false);
  }

  /** Records that the sentence {@code sentid} contains the index key {@code w}. */
  void add(String w, String sentid) {
    index.computeIfAbsent(w, k -> new HashSet<>()).add(sentid);
  }

  /** Builds the index key for a key/value pair: {@code key + ":" + value}. */
  String combineKeyValue(String key, String value) {
    return key + ":" + value;
  }

  /**
   * Returns the ids of the sentences that contain all of the given words.
   * Values whose lower-cased form is a stop word are ignored.
   *
   * @param relevantWords key to the values that must all be present in a sentence
   * @return the intersection of the sentence-id sets of all non-stop-word entries;
   *         an empty set as soon as one entry is missing from the index;
   *         {@code null} if there was no non-stop-word entry at all
   */
  public Set<String> getFileSentIds(CollectionValuedMap<String, String> relevantWords) {
    Set<String> sentids = null;
    for (Map.Entry<String, Collection<String>> en : relevantWords.entrySet()) {
      for (String en2 : en.getValue()) {
        if (stopWords.contains(en2.toLowerCase())) {
          continue;
        }
        String w = combineKeyValue(en.getKey(), en2);
        Set<String> st = index.get(w);
        if (st == null) {
          // The index has no sentences for w, so no sentence can contain all the words.
          return Collections.emptySet();
        }
        if (sentids == null) {
          // Copy so that callers never receive (and cannot mutate) a set owned by the index.
          sentids = new HashSet<>(st);
        } else {
          sentids = CollectionUtils.intersection(sentids, st);
        }
      }
    }
    return sentids;
  }

  /**
   * Returns, for each pattern, the ids of the sentences matching its relevant words.
   *
   * @param pats the patterns to look up
   * @return pattern to the result of {@link #getFileSentIds} for that pattern's relevant words
   */
  public Map<E, Set<String>> getFileSentIdsFromPats(Collection<E> pats) {
    Map<E, Set<String>> sents = new HashMap<>();
    for (E pat : pats) {
      Set<String> ids = getFileSentIds(pat.getRelevantWords());
      Redwood.log(ConstantsAndVariables.extremedebug, "For pattern with index " + pat + " extracted the following sentences from the index " + ids);
      sents.put(pat, ids);
    }
    return sents;
  }

  /**
   * Builds a new index over the given sentences, indexing processed text as well.
   *
   * @param sentences sentence id to tokens; may be {@code null} or empty
   * @param props options applied to the new index
   * @param stopWords stop words for the new index
   * @param dir not used by this method
   * @param transformCoreLabeltoString maps a token to the key/value pairs that get indexed for it
   * @return the new index
   */
  public static InvertedIndexByTokens createIndex(Map<String, List<CoreLabel>> sentences, Properties props,
      Set<String> stopWords, String dir,
      Function<CoreLabel, Map<String, String>> transformCoreLabeltoString) {
    InvertedIndexByTokens<?> inv = new InvertedIndexByTokens<>(props, stopWords, transformCoreLabeltoString);

    if (sentences != null) {
      // The values are token lists, not DataInstance objects, so they must go through the
      // token-list overload of add; the Map overload would fail when it calls getTokens().
      for (Map.Entry<String, List<CoreLabel>> sEn : sentences.entrySet()) {
        inv.add(sEn.getValue(), sEn.getKey(), true);
      }
    }
    System.out.println("Created index with size " + inv.size() + ". Don't worry if it's zero and you are using batch process sents.");
    return inv;
  }

  /** Looks up the sentence ids for each pattern; see {@link #getFileSentIdsFromPats}. */
  @Override
  public Map<E, Set<String>> queryIndex(Collection<E> patterns) {
    return getFileSentIdsFromPats(patterns);
  }

  /**
   * Serializes the index map to {@code dir + "/map.ser"}, creating {@code dir} if needed.
   *
   * @param dir the directory to write into
   * @throws RuntimeException wrapping the {@link IOException} if writing fails
   */
  @Override
  public void saveIndex(String dir) {
    try {
      IOUtils.ensureDir(new File(dir));
      IOUtils.writeObjectToFile(index, dir + "/map.ser");
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Loads an index previously written by {@link #saveIndex}. Called by SentenceIndex.loadIndex.
   *
   * @param props options applied to the loaded index
   * @param stopwords stop words for the loaded index
   * @param dir the directory containing {@code map.ser}
   * @param transformSentenceToString maps a token to the key/value pairs that get indexed for it
   * @return an index backed by the deserialized map
   * @throws RuntimeException if the map cannot be read; the original exception is the cause
   */
  public static InvertedIndexByTokens loadIndex(Properties props, Set<String> stopwords, String dir,
      Function<CoreLabel, Map<String, String>> transformSentenceToString) {
    try {
      Map<String, Set<String>> index = IOUtils.readObjectFromFile(dir + "/map.ser");
      System.out.println("Loading inverted index from " + dir);
      return new InvertedIndexByTokens<>(props, stopwords, transformSentenceToString, index);
    } catch (Exception e) {
      // Keep the original message text, but also chain the cause so the stack trace survives.
      throw new RuntimeException("Cannot load the inverted index. " + e, e);
    }
  }
}