package edu.stanford.nlp.patterns;
import java.io.File;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.ArgumentParser.Option;
import edu.stanford.nlp.util.logging.Redwood;
/**
 * Holder for corpus-wide, mutable static state shared by the pattern-learning
 * (SPIED/GetPatternsFromDataMultiClass) pipeline: the sentence map, raw n-gram
 * frequency counts over the data, and scaling ratios against Google/domain
 * n-gram frequencies.
 *
 * <p>NOTE(review): all state is public static and mutated from multiple places;
 * this class is NOT thread-safe except where a concurrent collection is used.
 */
public class Data {

  /** Ratio of the domain-ngram corpus size to this data's raw-frequency total; 1 until computed. */
  public static double ratioDomainNgramFreqWithDataFreq = 1;

  /** Raw frequency of every 1..numWordsCompound-gram seen in the data; filled by computeRawFreqIfNull. */
  static public Counter<CandidatePhrase> rawFreq = null;

  /** Files holding the sentences when batch processing is used. */
  public static List<File> sentsFiles = null;

  // when using batch processing, map from sentid to the file that has that sentence
  public static Map<String, File> sentId2File = null;

  /** In-memory sentence map: sentence id -> parsed sentence. */
  public static Map<String, DataInstance> sents = null;

  // save the in-memory sents to this file
  public static String inMemorySaveFileLocation = "";

  public static Counter<CandidatePhrase> processedDataFreq = null;

  /** Raw n-gram counts loaded from domainNGramsFile by loadDomainNGrams. */
  public static Counter<String> domainNGramRawFreq = new ClassicCounter<>();

  /** Ratio of the Google n-gram corpus size to this data's raw-frequency total; 1 until computed. */
  public static double ratioGoogleNgramFreqWithDataFreq = 1;

  @Option(name = "domainNGramsFile")
  public static String domainNGramsFile = null;

  static boolean usingGoogleNgram = false;

  /** phrase -> (sentence id -> token offsets where the phrase matched); concurrent: written from worker threads. */
  public static Map<String, Map<String, List<Integer>>> matchedTokensForEachPhrase = new ConcurrentHashMap<>();

  /**
   * Computes raw n-gram frequencies over the whole data set, iterating over
   * batches when {@code batchProcess} is true.
   *
   * @param numWordsCompound maximum n-gram length to count (counts 1..n grams)
   * @param batchProcess whether sentences are read batch-by-batch from files
   */
  public static void computeRawFreqIfNull(int numWordsCompound, boolean batchProcess) {
    ConstantsAndVariables.DataSentsIterator iter = new ConstantsAndVariables.DataSentsIterator(batchProcess);
    while (iter.hasNext()) {
      computeRawFreqIfNull(iter.next().first(), numWordsCompound);
    }
  }

  /**
   * Counts every 1..numWordsCompound consecutive-token phrase in the given
   * sentences into {@link #rawFreq}, then refreshes the Google/domain n-gram
   * frequency ratios.
   *
   * @param sents sentence id -> sentence to count over
   * @param numWordsCompound maximum n-gram length to count
   */
  public static void computeRawFreqIfNull(Map<String, DataInstance> sents, int numWordsCompound) {
    Redwood.log(Redwood.DBG, "Computing raw freq for every 1-" + numWordsCompound + " consecutive words");
    // Lazily initialize, as the method name promises; previously this NPE'd if
    // rawFreq had not been set up by the caller.
    if (rawFreq == null) {
      rawFreq = new ClassicCounter<>();
    }
    for (DataInstance l : sents.values()) {
      List<List<CoreLabel>> ngrams = CollectionUtils.getNGrams(l.getTokens(), 1, numWordsCompound);
      for (List<CoreLabel> n : ngrams) {
        // Join the tokens with single spaces (StringBuilder instead of += in a loop).
        StringBuilder phrase = new StringBuilder();
        for (CoreLabel c : n) {
          phrase.append(' ').append(c.word());
        }
        String s = phrase.toString().trim();
        if (!s.isEmpty()) {
          Data.rawFreq.incrementCount(CandidatePhrase.createOrGet(s));
        }
      }
    }
    if (usingGoogleNgram)
      setRatioGoogleNgramFreqWithDataFreq();
    // Guard the denominator so an empty data set doesn't store Infinity/NaN.
    if (domainNGramRawFreq != null && domainNGramRawFreq.size() > 0 && Data.rawFreq.totalCount() > 0)
      ratioDomainNgramFreqWithDataFreq = domainNGramRawFreq.totalCount() / Data.rawFreq.totalCount();
  }

  /**
   * Sets {@link #ratioGoogleNgramFreqWithDataFreq} from the Google unigram
   * total divided by this data's raw-frequency total.
   * Assumes {@link #rawFreq} is non-null and non-empty — call
   * {@link #computeRawFreqIfNull(int, boolean)} first.
   */
  public static void setRatioGoogleNgramFreqWithDataFreq() {
    ratioGoogleNgramFreqWithDataFreq = GoogleNGramsSQLBacked.getTotalCount(1) / Data.rawFreq.totalCount();
    Redwood.log(ConstantsAndVariables.minimaldebug, "Data", "ratioGoogleNgramFreqWithDataFreq is " + ratioGoogleNgramFreqWithDataFreq);
  }

  /**
   * Loads domain n-gram counts from the tab-separated {@link #domainNGramsFile}
   * (phrase TAB count per line) into {@link #domainNGramRawFreq}.
   * No-op if the counter is already populated.
   *
   * @throws IllegalStateException if domainNGramsFile was not set
   */
  public static void loadDomainNGrams() {
    // Explicit check instead of assert: asserts are disabled unless -ea is
    // passed, which previously let a null file slide into IOUtils.readLines.
    if (domainNGramsFile == null) {
      throw new IllegalStateException("domainNGramsFile is not set; provide the domainNGramsFile option");
    }
    if (domainNGramRawFreq == null || domainNGramRawFreq.size() == 0) {
      for (String line : IOUtils.readLines(domainNGramsFile)) {
        String[] t = line.split("\t");
        domainNGramRawFreq.setCount(t[0], Double.valueOf(t[1]));
      }
      Redwood.log(ConstantsAndVariables.minimaldebug, "Data", "loading freq from domain ngram file " + domainNGramsFile);
    }
  }
}