package info.ephyra.answerselection.filters;
import info.ephyra.io.Logger;
import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.OpenNLP;
import info.ephyra.nlp.SnowballStemmer;
import info.ephyra.querygeneration.Query;
import info.ephyra.querygeneration.generators.BagOfWordsG;
import info.ephyra.questionanalysis.AnalyzedQuestion;
import info.ephyra.questionanalysis.KeywordExtractor;
import info.ephyra.questionanalysis.QuestionNormalizer;
import info.ephyra.search.Result;
import info.ephyra.trec.TREC13To16Parser;
import info.ephyra.trec.TRECTarget;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
/**
* <p>A web term importance filter that counts term frequencies in a Wikipedia
* article on the target of the question.</p>
*
* <p>This class extends the class <code>WebTermImportanceFilter</code>.</p>
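*
* <p>A minimal usage sketch, mirroring the <code>main</code> method below (the
* <code>results</code> array is assumed to be given and is illustrative only):
* the filter is created with a normalization mode and applied to an array of
* <code>Result</code> objects whose query carries the question target as its
* original query string.</p>
*
* <pre>
* WikipediaTermImportanceFilter filter =
*     new WikipediaTermImportanceFilter(NO_NORMALIZATION, NO_NORMALIZATION, false);
* Result[] reranked = filter.apply(results);
* </pre>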
*
* @author Guido Sautter
* @version 2008-02-15
*/
public class WikipediaTermImportanceFilter extends WebTermImportanceFilter {
// protected static final String person = "person";
// protected static final String organization = "organization";
// protected static final String location = "location";
// protected static final String event = "event";
//
// public static final int NO_NORMALIZATION = 0;
// public static final int LINEAR_LENGTH_NORMALIZATION = 1;
// public static final int SQUARE_ROOT_LENGTH_NORMALIZATION = 2;
// public static final int LOG_LENGTH_NORMALIZATION = 3;
// public static final int LOG_10_LENGTH_NORMALIZATION = 4;
// protected static final String WIKIPEDIA = "wikipedia";
/**
* Creates a new Wikipedia term importance filter.
*
* @param normalizationMode mode used to normalize sentence scores by length
* @param tfNormalizationMode mode used to normalize the term frequencies
* @param isCombined true iff the scores of this filter are combined with the scores of other filters
*/
public WikipediaTermImportanceFilter(int normalizationMode, int tfNormalizationMode, boolean isCombined) {
super(normalizationMode, tfNormalizationMode, isCombined);
}
/**
* Returns the specified target itself as the only target string.
*
* @see info.ephyra.answerselection.filters.WebTermImportanceFilter#getTargets(java.lang.String)
*/
@Override
public String[] getTargets(String target) {
String[] targets = {target};
return targets;
}
/**
* Counts the terms for the first of the specified targets; any further targets
* are ignored since the Wikipedia lookup fetches a single article.
*
* @see info.ephyra.answerselection.filters.WebTermImportanceFilter#getTermCounters(java.lang.String[])
*/
@Override
public HashMap<String, TermCounter> getTermCounters(String[] targets) {
if (targets.length == 0) return new HashMap<String, TermCounter>();
return this.getTermCounters(targets[0]);
}
/**
* Fetches the English Wikipedia article on the target and counts how often
* each (stemmed, lower-cased) term occurs in the article text.
*
* @param target the target of the question
* @return a HashMap mapping the stemmed terms in the article to their
*         frequencies
*/
public HashMap<String, TermCounter> getTermCounters(String target) {
HashMap<String, TermCounter> rawTermCounters = null;
BufferedReader reader = null;
try {
// fetch the English Wikipedia article on the target (spaces become underscores in the URL)
String url = "http://en.wikipedia.org/wiki/" + target.replaceAll("\\s", "_");
URLConnection connection = new URL(url).openConnection();
connection.setDoInput(true);
// the article is only read, so no output stream is required
connection.setUseCaches(false);
connection.setRequestProperty("User-Agent", "Ephyra");
connection.connect();
reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
rawTermCounters = new HashMap<String, TermCounter>();
// strip the HTML markup and count the stemmed terms between the tags
boolean inTag = false;
int c = 0;
StringBuffer term = new StringBuffer();
while ((c = reader.read()) != -1) {
if (c == '<') {
inTag = true;
// an opening tag terminates the current term
if (term.length() != 0) {
String stemmedTerm = SnowballStemmer.stem(term.toString().toLowerCase());
if (TEST_TERM_DOWNLOAD)
System.out.println(stemmedTerm);
if (!rawTermCounters.containsKey(stemmedTerm))
rawTermCounters.put(stemmedTerm, new TermCounter());
rawTermCounters.get(stemmedTerm).increment(1);
term = new StringBuffer();
}
} else if (c == '>') {
inTag = false;
} else if (!inTag) {
// whitespace and control characters terminate the current term
if (c < 33) {
if (term.length() != 0) {
String stemmedTerm = SnowballStemmer.stem(term.toString().toLowerCase());
if (TEST_TERM_DOWNLOAD)
System.out.println(stemmedTerm);
if (!rawTermCounters.containsKey(stemmedTerm))
rawTermCounters.put(stemmedTerm, new TermCounter());
rawTermCounters.get(stemmedTerm).increment(1);
term = new StringBuffer();
}
} else term.append((char) c);
}
}
} catch (IOException e) {
e.printStackTrace();
} finally {
// release the connection to the Wikipedia server
if (reader != null)
try {
reader.close();
} catch (IOException ioe) { /* ignore */ }
}
return rawTermCounters;
}
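// A minimal, hypothetical check of the term counting above (the target and
// the term are illustrative only): fetch the counters for a target and look
// up the raw frequency of a single stemmed term.
//
//   HashMap<String, TermCounter> counters =
//       new WikipediaTermImportanceFilter(NO_NORMALIZATION, NO_NORMALIZATION, false)
//           .getTermCounters("Abraham Lincoln");
//   TermCounter counter = counters.get(SnowballStemmer.stem("president"));
//   System.out.println(counter == null ? 0 : counter.getValue());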
// /**
// * Increment the score of each result snippet for each word in it according
// * to the number of top-100 web search engine snippets containing this
// * particular word. This favors snippets that provide information given
// * frequently and thus likely to be more important with regard to the
// * target.
// *
// * @param results array of <code>Result</code> objects
// * @return extended array of <code>Result</code> objects
// */
// @SuppressWarnings("unchecked")
// public Result[] apply(Result[] results) {
//
// // catch empty result
// if (results.length == 0) return results;
//
// // produce target variations
// String target = results[0].getQuery().getOriginalQueryString();
//
// System.out.println("WikipediaTermImportanceFilter:\n processing target '" + target + "'");
//
// HashMap<String, TermCounter> rawTermCounters = this.cacheLookup(target);
//
// // cache miss
// if (rawTermCounters == null) {
// rawTermCounters = this.getTermCounters(target);
// this.cache(target, rawTermCounters);
// }
//
// // something's wrong, rely on other filters
// if (rawTermCounters == null) return results;
//
// // get target tokens
// HashSet<String> rawTargetTerms = new HashSet<String>();
// String[] targetTokens = OpenNLP.tokenize(target);
// for (String tt : targetTokens)
// if (Character.isLetterOrDigit(tt.charAt(0)))
// rawTargetTerms.add(tt);
//
// // stem terms, collect target terms
// HashMap<String, TermCounter> termCounters = new HashMap<String, TermCounter>();//this.getTermCounters(targets);
// HashSet<String> targetTerms = new HashSet<String>();
// ArrayList<String> rawTerms = new ArrayList<String>(rawTermCounters.keySet());
// for (String rawTerm : rawTerms) {
//
// String stemmedTerm = SnowballStemmer.stem(rawTerm.toLowerCase());
// if (!termCounters.containsKey(stemmedTerm))
// termCounters.put(stemmedTerm, new TermCounter());
// termCounters.get(stemmedTerm).increment(rawTermCounters.get(rawTerm).getValue());
//
// if (rawTargetTerms.contains(rawTerm))
// targetTerms.add(stemmedTerm);
// }
//
//
// // score results
// ArrayList<Result> resultList = new ArrayList<Result>();
// boolean goOn;
// do {
// goOn = false;
// ArrayList<Result> rawResults = new ArrayList<Result>();
//
// // score all results
// for (Result r : results) {
// if (r.getScore() != Float.NEGATIVE_INFINITY) {
//
// // tokenize sentence
// String[] sentence = NETagger.tokenize(r.getAnswer());
// float importance = 0;
//
// // scan sentence for terms from web result
// for (int i = 0; i < sentence.length; i++) {
// String term = sentence[i];
// if ((term.length() > 1)/* && !StringUtils.isSubsetKeywords(term, r.getQuery().getAnalyzedQuestion().getQuestion()) && !FunctionWords.lookup(term)*/) {
// term = SnowballStemmer.stem(term.toLowerCase());
// TermCounter count = termCounters.get(term);
// if (count != null) {
// int wc = WordFrequencies.lookup(term);
// importance += (count.getValue() / Math.max(wc, 1));
// }
// }
// }
//
// // TODO don't throw out 0-scored results for combining approaches
// if (importance > 0) {
// if (this.normalizationMode == NO_NORMALIZATION)
// r.setScore(importance);
// else if (this.normalizationMode == LINEAR_LENGTH_NORMALIZATION)
// r.setScore(importance / sentence.length); // try normalized score
// else if (this.normalizationMode == SQUARE_ROOT_LENGTH_NORMALIZATION)
// r.setScore(importance / ((float) Math.sqrt(sentence.length))); // try normalized score
// else if (this.normalizationMode == LOG_LENGTH_NORMALIZATION)
// r.setScore(importance / (1 + ((float) Math.log(sentence.length)))); // try normalized score
// else if (this.normalizationMode == LOG_10_LENGTH_NORMALIZATION)
// r.setScore(importance / (1 + ((float) Math.log10(sentence.length)))); // try normalized score
//
// rawResults.add(r);
// }
// }
// }
//
// if (rawResults.size() != 0) {
//
// // find top result
// Collections.sort(rawResults);
// Collections.reverse(rawResults);
// Result top = rawResults.remove(0);
// resultList.add(top);
//
// // decrement scores of top result terms
// String[] sentence = NETagger.tokenize(top.getAnswer());
// for (int i = 0; i < sentence.length; i++) {
// String term = SnowballStemmer.stem(sentence[i].toLowerCase());
// TermCounter count = termCounters.get(term);
//
// if (count != null) {
// if (targetTerms.contains(term)) count.divideValue(1);
// else count.divideValue(2);
//
// if (count.getValue() == 0) termCounters.remove(term);
// }
// }
//
// // prepare remaining results for next round
// results = rawResults.toArray(new Result[rawResults.size()]);
// goOn = true;
// }
//
// } while (goOn);
//
// Collections.sort(resultList);
// Collections.reverse(resultList);
//
// // set position-dependent extra score for combining approaches
// float eScore = 100;
// for (Result r : resultList) {
// r.addExtraScore((this.getClass().getName() + this.normalizationMode), eScore);
// eScore *= 0.9f;
// }
//
// return resultList.toArray(new Result[resultList.size()]);
// }
// private static String lastTarget = null;
// private static HashMap<String, TermCounter> lastTargetTermCounters = null;
//
// private void cache(String target, HashMap<String, TermCounter> termCounters) {
// System.out.println("WikipediaTermImportanceFilter: caching web lookup result for target '" + target + "'");
// lastTarget = target;
// lastTargetTermCounters = termCounters;
// }
//
// private HashMap<String, TermCounter> cacheLookup(String target) {
// System.out.println("WikipediaTermImportanceFilter: doing cache lookup result for target '" + target + "'");
// if (target.equals(lastTarget)) {
// System.out.println(" --> cache hit");
// return lastTargetTermCounters;
// } else {
// System.out.println(" --> cache miss, last target is '" + lastTarget + "'");
// return null;
// }
// }
/** flag that enables debug output of the terms downloaded from Wikipedia (set in {@link #main(String[])}) */
protected static boolean TEST_TERM_DOWNLOAD = false;
public static void main(String[] args) {
TEST_TERM_DOWNLOAD = true;
MsgPrinter.enableStatusMsgs(true);
MsgPrinter.enableErrorMsgs(true);
// the TREC target file is expected as the only command line argument
if (args.length < 1) {
MsgPrinter.printErrorMsg("Usage: WikipediaTermImportanceFilter <TREC target file>");
System.exit(1);
}
// create tokenizer
MsgPrinter.printStatusMsg("Creating tokenizer...");
if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
MsgPrinter.printErrorMsg("Could not create tokenizer.");
// LingPipe.createTokenizer();
// // create sentence detector
// MsgPrinter.printStatusMsg("Creating sentence detector...");
// if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
// MsgPrinter.printErrorMsg("Could not create sentence detector.");
// LingPipe.createSentenceDetector();
// create stemmer
MsgPrinter.printStatusMsg("Creating stemmer...");
SnowballStemmer.create();
// // create part of speech tagger
// MsgPrinter.printStatusMsg("Creating POS tagger...");
// if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz",
// "res/nlp/postagger/opennlp/tagdict"))
// MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
// if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
// "train-wsj-0-18.holder"))
// MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");
// // create chunker
// MsgPrinter.printStatusMsg("Creating chunker...");
// if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/" +
// "EnglishChunk.bin.gz"))
// MsgPrinter.printErrorMsg("Could not create chunker.");
// create named entity taggers
MsgPrinter.printStatusMsg("Creating NE taggers...");
NETagger.loadListTaggers("res/nlp/netagger/lists/");
NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
MsgPrinter.printStatusMsg(" ...loading models");
// if (!NETagger.loadNameFinders("res/nlp/netagger/opennlp/"))
// MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
// if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
// MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
MsgPrinter.printStatusMsg(" ...done");
WikipediaTermImportanceFilter wtif = new WikipediaTermImportanceFilter(NO_NORMALIZATION, NO_NORMALIZATION, false);
TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);
for (TRECTarget target : targets) {
String question = target.getTargetDesc();
// query generation
MsgPrinter.printGeneratingQueries();
String qn = QuestionNormalizer.normalize(question);
MsgPrinter.printNormalization(qn); // print normalized question string
Logger.logNormalization(qn); // log normalized question string
String[] kws = KeywordExtractor.getKeywords(qn);
AnalyzedQuestion aq = new AnalyzedQuestion(question);
aq.setKeywords(kws);
aq.setFactoid(false);
Query[] queries = new BagOfWordsG().generateQueries(aq);
for (int q = 0; q < queries.length; q++)
queries[q].setOriginalQueryString(question);
Result[] results = new Result[1];
results[0] = new Result("This would be the answer", queries[0]);
wtif.apply(results);
}
}
}