package edu.stanford.nlp.patterns.surface;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Set;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
/**
* Creates an inverted index of (word or lemma) => {file1 => {sentid1,
* sentid2,.. }, file2 => {sentid1, sentid2, ...}}.
*
* (The FileBackedCache option is commented out because it currently doesn't
* support changing the values.)
*
* @author Sonal Gupta (sonalg@stanford.edu)
*
*/
public class InvertedIndexByTokens implements Serializable {

  private static final long serialVersionUID = 1L;

  /** token (word or lemma) =&gt; {file name =&gt; ids of the sentences in that file containing the token}. */
  Map<String, Hashtable<String, Set<String>>> index;

  /** When true, tokens are lowercased before being indexed and before pattern tokens are looked up. */
  boolean convertToLowercase;

  // boolean filebacked;

  /** Never null after construction (defaults to an empty set); {@code specialWords} may be null. */
  Set<String> stopWords, specialWords;

  // static int numfilesindiskbacked = 10000;

  /** Total number of (token, file, sentence id) entries currently held in {@link #index}. */
  int numAllEntries = 0;

  boolean batchProcessSents = false;

  /** Optional directory prefix prepended (with a "/" separator) to every file name passed to {@code add}. */
  String filenamePrefix = null;

  /**
   * Creates an empty in-memory index.
   *
   * @param invertedIndexDir not used by the in-memory implementation (kept for the commented-out
   *        file-backed variant)
   * @param lc whether to lowercase tokens
   * @param stopWords stop words; {@code null} is treated as an empty set
   * @param specialWords words that are never used for lookups from patterns; may be {@code null}
   * @param batchProcessSents value later reported by {@link #isBatchProcessed()}
   * @param dirName prefix prepended to file names when indexing; may be {@code null}
   */
  public InvertedIndexByTokens(File invertedIndexDir, boolean lc, Set<String> stopWords, Set<String> specialWords,
      boolean batchProcessSents, String dirName) {
    // if (filebacked)
    // index = new FileBackedCache<StringwithConsistentHashCode,
    // Hashtable<String, Set<String>>>(invertedIndexDir, numfilesindiskbacked);
    // else
    // memory mapped
    this(new HashMap<String, Hashtable<String, Set<String>>>(), lc, stopWords, specialWords, batchProcessSents,
        dirName);
  }

  /**
   * Wraps an existing index map (used by {@link #loadIndex(String)}).
   *
   * <p>The entry counter is recomputed from {@code index}, so {@link #numAllEntries()} is correct
   * for an index that already contains data.
   *
   * @param index the token =&gt; file =&gt; sentence ids map to wrap (not copied)
   * @param lc whether to lowercase tokens
   * @param stopWords stop words; {@code null} is treated as an empty set
   * @param specialWords words that are never used for lookups from patterns; may be {@code null}
   * @param batchProcessSents value later reported by {@link #isBatchProcessed()}
   * @param dirName prefix prepended to file names when indexing; may be {@code null}
   */
  public InvertedIndexByTokens(Map<String, Hashtable<String, Set<String>>> index, boolean lc, Set<String> stopWords,
      Set<String> specialWords, boolean batchProcessSents, String dirName) {
    this.index = index;
    this.convertToLowercase = lc;
    this.batchProcessSents = batchProcessSents;
    this.stopWords = (stopWords == null) ? new HashSet<String>() : stopWords;
    this.specialWords = specialWords;
    this.filenamePrefix = dirName;
    this.numAllEntries = countEntries(index);
  }

  /** Sums the sizes of all sentence-id sets in {@code index}; a null map counts as empty. */
  private static int countEntries(Map<String, Hashtable<String, Set<String>>> index) {
    int total = 0;
    if (index == null) {
      return total;
    }
    for (Hashtable<String, Set<String>> perFile : index.values()) {
      if (perFile == null) {
        continue;
      }
      for (Set<String> ids : perFile.values()) {
        total += ids.size();
      }
    }
    return total;
  }

  /**
   * Indexes every token of every sentence in {@code sents} under the given file name.
   *
   * @param sents sentence id =&gt; tokens of that sentence
   * @param filename file the sentences belong to; {@link #filenamePrefix} is prepended when set
   * @param indexLemma index {@code CoreLabel.lemma()} instead of {@code CoreLabel.word()}
   */
  void add(Map<String, List<CoreLabel>> sents, String filename, boolean indexLemma) {
    if (filenamePrefix != null) {
      filename = filenamePrefix + (filenamePrefix.endsWith("/") ? "" : "/") + filename;
    }

    for (Map.Entry<String, List<CoreLabel>> sEn : sents.entrySet()) {
      String sentId = sEn.getKey();
      for (CoreLabel l : sEn.getValue()) {
        String w = indexLemma ? l.lemma() : l.word();
        if (convertToLowercase) {
          w = w.toLowerCase();
        }

        Hashtable<String, Set<String>> perFile = index.get(w);
        boolean newToken = (perFile == null);
        if (newToken) {
          perFile = new Hashtable<String, Set<String>>();
        }

        Set<String> sentids = perFile.get(filename);
        if (sentids == null) {
          sentids = new HashSet<String>();
          perFile.put(filename, sentids);
        }

        // Only count the entry when the sentence id was not already recorded for this token/file.
        if (sentids.add(sentId)) {
          numAllEntries++;
        }

        if (newToken) {
          index.put(w, perFile);
        }
      }
    }
  }

  /**
   * Returns the internal file =&gt; sentence ids map for {@code word}.
   *
   * @return the live map stored in the index, or {@code null} if the word is not indexed
   */
  public Map<String, Set<String>> getFileSentIds(String word) {
    return index.get(word);
  }

  /**
   * Returns the union, per file, of the sentence ids of all the given words.
   *
   * @param words tokens to look up exactly as stored in the index
   * @return a new file =&gt; sentence ids map
   * @throws RuntimeException if any of the words is not present in the index
   */
  public Map<String, Set<String>> getFileSentIds(Set<String> words) {
    Hashtable<String, Set<String>> sentids = new Hashtable<String, Set<String>>();
    for (String w : words) {
      Hashtable<String, Set<String>> st = index.get(w);
      if (st == null) {
        throw new RuntimeException("How come the index does not have sentences for " + w);
      }
      for (Map.Entry<String, Set<String>> en : st.entrySet()) {
        Set<String> merged = sentids.get(en.getKey());
        if (merged == null) {
          merged = new HashSet<String>();
          sentids.put(en.getKey(), merged);
        }
        merged.addAll(en.getValue());
      }
    }
    return sentids;
  }

  /**
   * Collects the context tokens of the given patterns and returns the sentences containing them.
   *
   * <p>For each pattern, the tokens that are neither stop words nor special words are used. If a
   * pattern has no such token, all of its tokens are used instead. Special words are always
   * removed before the lookup. A {@code null} special-word set is treated as empty.
   *
   * @param pats patterns whose previous/next context tokens are looked up
   * @return file =&gt; sentence ids, as returned by {@link #getFileSentIds(Set)}
   * @throws RuntimeException if a collected token is not present in the index
   */
  public Map<String, Set<String>> getFileSentIdsFromPats(Set<SurfacePattern> pats) {
    Set<String> relevantWords = new HashSet<String>();
    for (SurfacePattern p : pats) {
      Set<String> relwordsThisPat = new HashSet<String>();
      collectTokens(p.getSimplerTokensNext(), relwordsThisPat);
      collectTokens(p.getSimplerTokensPrev(), relwordsThisPat);

      boolean nonStopW = false;
      for (String w : relwordsThisPat) {
        boolean special = (specialWords != null) && specialWords.contains(w);
        if (!stopWords.contains(w) && !special) {
          relevantWords.add(w);
          nonStopW = true;
        }
      }
      // If the pat contains just the stop words, add all the stop words!
      if (!nonStopW) {
        relevantWords.addAll(relwordsThisPat);
      }
    }
    if (specialWords != null) {
      relevantWords.removeAll(specialWords);
    }
    return getFileSentIds(relevantWords);
  }

  /** Adds the trimmed (and, if configured, lowercased) non-empty tokens to {@code into}; null array is a no-op. */
  private void collectTokens(String[] tokens, Set<String> into) {
    if (tokens == null) {
      return;
    }
    for (String s : tokens) {
      s = s.trim();
      if (convertToLowercase) {
        s = s.toLowerCase();
      }
      if (!s.isEmpty()) {
        into.add(s);
      }
    }
  }

  /** Returns the special-word set passed at construction (may be {@code null}). */
  public Set<String> getSpecialWordsList() {
    return this.specialWords;
  }

  /**
   * Writes the index to {@code dir}: the parameters go to {@code param.txt} (one per line), and the
   * stop words, special words and index map go to {@code stopwords.ser}, {@code specialwords.ser}
   * and {@code map.ser} via {@code IOUtils.writeObjectToFile}.
   *
   * <p>Note that {@link #numAllEntries} is not written; it is recomputed from the map on load.
   *
   * @param dir existing directory to write into
   * @throws IOException if any of the files cannot be written
   */
  public void saveIndex(String dir) throws IOException {
    BufferedWriter w = new BufferedWriter(new FileWriter(dir + "/param.txt"));
    try {
      w.write(String.valueOf(convertToLowercase) + "\n");
      w.write(String.valueOf(this.batchProcessSents) + "\n");
      // A null prefix is written as the text "null"; loadIndex maps that text back to null.
      w.write(this.filenamePrefix + "\n");
    } finally {
      // Close even when a write fails so the file handle is not leaked.
      w.close();
    }
    IOUtils.writeObjectToFile(this.stopWords, dir + "/stopwords.ser");
    IOUtils.writeObjectToFile(this.specialWords, dir + "/specialwords.ser");
    // if (!filebacked)
    IOUtils.writeObjectToFile(index, dir + "/map.ser");
  }

  /**
   * Reads back an index written by {@link #saveIndex(String)}.
   *
   * @param dir directory containing {@code param.txt}, {@code stopwords.ser},
   *        {@code specialwords.ser} and {@code map.ser}
   * @return the reconstructed index
   * @throws RuntimeException wrapping the underlying failure (kept as the cause) if loading fails
   */
  public static InvertedIndexByTokens loadIndex(String dir) {
    try {
      List<String> lines = IOUtils.linesFromFile(dir + "/param.txt");
      boolean lc = Boolean.parseBoolean(lines.get(0));
      boolean batchProcessSents = Boolean.parseBoolean(lines.get(1));
      String filenameprefix = lines.get(2);
      if (filenameprefix.equals("null")) {
        filenameprefix = null;
      }
      Set<String> stopwords = IOUtils.readObjectFromFile(dir + "/stopwords.ser");
      Set<String> specialwords = IOUtils.readObjectFromFile(dir + "/specialwords.ser");
      // if (!filebacked)
      Map<String, Hashtable<String, Set<String>>> index = IOUtils.readObjectFromFile(dir + "/map.ser");
      // else
      // index = new FileBackedCache<StringwithConsistentHashCode,
      // Hashtable<String, Set<String>>>(dir + "/cache", numfilesindiskbacked);
      return new InvertedIndexByTokens(index, lc, stopwords, specialwords, batchProcessSents, filenameprefix);
    } catch (Exception e) {
      // Same message as before, but keep the original exception as the cause for its stack trace.
      throw new RuntimeException("Cannot load the inverted index. " + e, e);
    }
  }

  /** Returns the number of distinct tokens in the index. */
  public int size() {
    return index.size();
  }

  /** Returns the {@code batchProcessSents} flag given at construction. */
  public boolean isBatchProcessed() {
    return this.batchProcessSents;
  }

  /** Returns the total number of (token, file, sentence id) entries in the index. */
  public int numAllEntries() {
    return this.numAllEntries;
  }

  /** Returns the live key set of the index, i.e. all indexed tokens. */
  public Set<String> getKeySet() {
    return index.keySet();
  }
}