package info.ephyra.trec;
import info.ephyra.OpenEphyra;
import info.ephyra.answerselection.AnswerSelection;
import info.ephyra.answerselection.filters.AnswerPatternFilter;
import info.ephyra.answerselection.filters.AnswerProjectionFilter;
import info.ephyra.answerselection.filters.AnswerTypeFilter;
import info.ephyra.answerselection.filters.DuplicateFilter;
import info.ephyra.answerselection.filters.FactoidSubsetFilter;
import info.ephyra.answerselection.filters.FactoidsFromPredicatesFilter;
import info.ephyra.answerselection.filters.PredicateExtractionFilter;
import info.ephyra.answerselection.filters.QuestionKeywordsFilter;
import info.ephyra.answerselection.filters.ResultLengthFilter;
import info.ephyra.answerselection.filters.ScoreCombinationFilter;
import info.ephyra.answerselection.filters.ScoreNormalizationFilter;
import info.ephyra.answerselection.filters.ScoreSorterFilter;
import info.ephyra.answerselection.filters.StopwordFilter;
import info.ephyra.answerselection.filters.TruncationFilter;
import info.ephyra.answerselection.filters.WebDocumentFetcherFilter;
import info.ephyra.io.Logger;
import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.semantics.ontologies.Ontology;
import info.ephyra.nlp.semantics.ontologies.WordNet;
import info.ephyra.querygeneration.QueryGeneration;
import info.ephyra.querygeneration.generators.BagOfTermsG;
import info.ephyra.querygeneration.generators.BagOfWordsG;
import info.ephyra.querygeneration.generators.PredicateG;
import info.ephyra.querygeneration.generators.QuestionInterpretationG;
import info.ephyra.querygeneration.generators.QuestionReformulationG;
import info.ephyra.questionanalysis.AnalyzedQuestion;
import info.ephyra.questionanalysis.QuestionAnalysis;
import info.ephyra.questionanalysis.QuestionNormalizer;
import info.ephyra.search.Result;
import info.ephyra.search.Search;
import info.ephyra.search.searchers.BingKM;
import info.ephyra.search.searchers.IndriKM;
import java.util.ArrayList;
/**
 * <p>A variant of <code>OpenEphyra</code> tuned for the TREC evaluation. Each
 * question is first run against the local corpus (Indri indices and servers)
 * and then against the Web; the corpus results are handed to an
 * <code>AnswerProjectionFilter</code> in the Web pipeline. If no answers are
 * found, the question is assumed to ask for a proper name and both pipelines
 * are rerun to improve the recall.</p>
 *
 * <p>This class extends <code>OpenEphyra</code>.</p>
 *
 * @author Nico Schlaefer
 * @version 2008-01-26
 */
public class OpenEphyraCorpus extends OpenEphyra {
    /**
     * Entry point of Ephyra. Turns on status and error messages, enables
     * logging to <code>log/OpenEphyraCorpus</code>, creates the engine and
     * starts the command line interface.
     *
     * @param args command line arguments are ignored
     */
    public static void main(String[] args) {
        // status and error messages go to the console
        MsgPrinter.enableStatusMsgs(true);
        MsgPrinter.enableErrorMsgs(true);

        // logging must be configured before the engine is created
        Logger.setLogfile("log/OpenEphyraCorpus");
        Logger.enableLogging(true);

        // create the engine and hand control to the command line interface
        OpenEphyraCorpus ephyra = new OpenEphyraCorpus();
        ephyra.commandLine("");
    }

    /**
     * Resets the dictionaries and ontologies used by the question analysis
     * and registers a freshly created <code>WordNet</code> instance as both
     * the dictionary for term extraction and the ontology for term expansion.
     */
    private static void resetQuestionAnalysis() {
        Ontology wordNet = new WordNet();

        // dictionaries for term extraction
        QuestionAnalysis.clearDictionaries();
        QuestionAnalysis.addDictionary(wordNet);

        // ontologies for term expansion
        QuestionAnalysis.clearOntologies();
        QuestionAnalysis.addOntology(wordNet);
    }

    /**
     * Clears the registered query generators and re-registers the set shared
     * by the corpus and the Web pipeline, in their original order.
     */
    private static void resetQueryGenerators() {
        QueryGeneration.clearQueryGenerators();
        QueryGeneration.addQueryGenerator(new BagOfWordsG());
        QueryGeneration.addQueryGenerator(new BagOfTermsG());
        QueryGeneration.addQueryGenerator(new PredicateG());
        QueryGeneration.addQueryGenerator(new QuestionInterpretationG());
        QueryGeneration.addQueryGenerator(new QuestionReformulationG());
    }

    /**
     * Clears all answer filters and registers the answer extraction filters
     * shared by the corpus and the Web pipeline. The filters are applied in
     * the order in which they are added.
     */
    private static void resetExtractionFilters() {
        AnswerSelection.clearFilters();
        AnswerSelection.addFilter(new AnswerTypeFilter());
        AnswerSelection.addFilter(new AnswerPatternFilter());
        AnswerSelection.addFilter(new WebDocumentFetcherFilter());
        AnswerSelection.addFilter(new PredicateExtractionFilter());
        AnswerSelection.addFilter(new FactoidsFromPredicatesFilter());
        AnswerSelection.addFilter(new TruncationFilter());
    }

    /**
     * Initializes the pipeline for factoid questions, using a local corpus as
     * a knowledge source. One <code>IndriKM</code> is registered for every
     * entry of <code>IndriKM.getIndriIndices()</code> and of
     * <code>IndriKM.getIndriServers()</code>. Only answer extraction filters
     * are installed; no answer selection filters are added here.
     */
    protected void initFactoidCorpus() {
        // question analysis and query generation
        resetQuestionAnalysis();
        resetQueryGenerators();

        // search
        // - knowledge miners for unstructured knowledge sources
        Search.clearKnowledgeMiners();
        for (String[] indices : IndriKM.getIndriIndices()) {
            Search.addKnowledgeMiner(new IndriKM(indices, false));
        }
        for (String[] servers : IndriKM.getIndriServers()) {
            Search.addKnowledgeMiner(new IndriKM(servers, true));
        }
        // - knowledge annotators for (semi-)structured knowledge sources
        Search.clearKnowledgeAnnotators();

        // answer extraction (the corpus pipeline has no selection filters)
        resetExtractionFilters();
    }

    /**
     * Initializes the pipeline for factoid questions, using the Web as a
     * knowledge source. In addition to the answer extraction filters, the
     * answer selection filters are installed, including an
     * <code>AnswerProjectionFilter</code> that receives the corpus results.
     *
     * @param resultsCorp results retrieved from the corpus
     */
    protected void initFactoidWeb(Result[] resultsCorp) {
        // question analysis and query generation
        resetQuestionAnalysis();
        resetQueryGenerators();

        // search
        // - knowledge miners for unstructured knowledge sources
        Search.clearKnowledgeMiners();
        Search.addKnowledgeMiner(new BingKM());
        // Search.addKnowledgeMiner(new GoogleKM());
        // Search.addKnowledgeMiner(new YahooKM());
        // - knowledge annotators for (semi-)structured knowledge sources
        Search.clearKnowledgeAnnotators();

        // answer extraction filters, followed by the answer selection filters
        // (the filters are applied in this order)
        resetExtractionFilters();
        AnswerSelection.addFilter(new StopwordFilter());
        AnswerSelection.addFilter(new QuestionKeywordsFilter());
        AnswerSelection.addFilter(new AnswerProjectionFilter(resultsCorp));
        AnswerSelection.addFilter(new ScoreNormalizationFilter(NORMALIZER));
        AnswerSelection.addFilter(new ScoreCombinationFilter());
        AnswerSelection.addFilter(new FactoidSubsetFilter());
        AnswerSelection.addFilter(new DuplicateFilter());
        AnswerSelection.addFilter(new ScoreSorterFilter());
        AnswerSelection.addFilter(new ResultLengthFilter());
    }

    /**
     * Forces the answer type of the given question to
     * <code>NEproperName</code> and runs the corpus pipeline followed by the
     * Web pipeline. The corpus run uses no limit on the number of answers and
     * an absolute threshold of 0 (to keep only factoid answers).
     *
     * @param aq analyzed question, its answer types are overwritten
     * @param maxAnswers maximum number of answers for the Web run
     * @param absThresh absolute threshold for scores in the Web run
     * @return array of results of the Web run
     */
    private Result[] rerunForProperNames(AnalyzedQuestion aq, int maxAnswers,
                                         float absThresh) {
        // assume that the question asks for a proper name
        aq.setAnswerTypes(new String[] {"NEproperName"});

        // get corpus answers (only factoid answers)
        initFactoidCorpus();
        Result[] properNamesCorp = runPipeline(aq, Integer.MAX_VALUE, 0);

        // get web answers and project them
        initFactoidWeb(properNamesCorp);
        return runPipeline(aq, maxAnswers, absThresh);
    }

    /**
     * Asks Ephyra a factoid question and returns up to <code>maxAnswers</code>
     * results that have a score of at least <code>absThresh</code>. This
     * method is optimized for the TREC evaluation: if no answers are found
     * and the question analysis did not determine any answer type, the
     * question is assumed to ask for a proper name and the pipelines are run
     * a second time.
     *
     * @param question factoid question
     * @param maxAnswers maximum number of answers
     * @param absThresh absolute threshold for scores
     * @return array of results
     */
    public Result[] askFactoid(String question, int maxAnswers,
                               float absThresh) {
        // set up the corpus pipeline before the question is analyzed
        initFactoidCorpus();

        // analyze question
        MsgPrinter.printAnalyzingQuestion();
        AnalyzedQuestion analyzed = QuestionAnalysis.analyze(question);

        // get corpus answers without any limit or score threshold
        Result[] corpusResults = runPipeline(analyzed, Integer.MAX_VALUE,
                Float.NEGATIVE_INFINITY);

        // get web answers and project them
        initFactoidWeb(corpusResults);
        Result[] answers = runPipeline(analyzed, maxAnswers, absThresh);

        // rerun only if nothing was found and the answer type is unknown
        // (the answer types are not queried when there are results)
        boolean rerun = answers.length == 0
                && analyzed.getAnswerTypes().length == 0;
        if (rerun) {
            answers = rerunForProperNames(analyzed, maxAnswers, absThresh);
        }
        return answers;
    }

    /**
     * Asks Ephyra a list question and returns results that have a score of at
     * least <code>relThresh * top score</code>, where the top score is the
     * score of the first result. This method is optimized for the TREC
     * evaluation: if no answers are found, the question is assumed to ask for
     * proper names and the pipelines are run again.
     *
     * @param question list question
     * @param relThresh relative threshold for scores
     * @return array of results
     */
    public Result[] askList(String question, float relThresh) {
        // treat the normalized list question like a factoid question
        String listQuestion = QuestionNormalizer.transformList(question);
        Result[] candidates = askFactoid(listQuestion, Integer.MAX_VALUE, 0);

        if (candidates.length == 0) {
            // assume that question asks for proper names
            AnalyzedQuestion analyzed = QuestionAnalysis.analyze(listQuestion);
            candidates = rerunForProperNames(analyzed, Integer.MAX_VALUE, 0);
        }

        // nothing to filter
        if (candidates.length == 0) {
            return new Result[0];
        }

        // get results with a score of at least relThresh * top score
        float best = candidates[0].getScore();
        ArrayList<Result> kept = new ArrayList<Result>();
        for (Result candidate : candidates) {
            if (candidate.getScore() >= relThresh * best) {
                kept.add(candidate);
            }
        }
        return kept.toArray(new Result[kept.size()]);
    }
}