package info.ephyra;
import info.ephyra.answerselection.AnswerSelection;
import info.ephyra.answerselection.filters.AnswerPatternFilter;
import info.ephyra.answerselection.filters.AnswerTypeFilter;
import info.ephyra.answerselection.filters.DuplicateFilter;
import info.ephyra.answerselection.filters.FactoidSubsetFilter;
import info.ephyra.answerselection.filters.FactoidsFromPredicatesFilter;
import info.ephyra.answerselection.filters.PredicateExtractionFilter;
import info.ephyra.answerselection.filters.QuestionKeywordsFilter;
import info.ephyra.answerselection.filters.ScoreCombinationFilter;
import info.ephyra.answerselection.filters.ScoreNormalizationFilter;
import info.ephyra.answerselection.filters.ScoreSorterFilter;
import info.ephyra.answerselection.filters.StopwordFilter;
import info.ephyra.answerselection.filters.TruncationFilter;
import info.ephyra.answerselection.filters.WebDocumentFetcherFilter;
import info.ephyra.io.Logger;
import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.LingPipe;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.OpenNLP;
import info.ephyra.nlp.SnowballStemmer;
import info.ephyra.nlp.StanfordNeTagger;
import info.ephyra.nlp.StanfordParser;
import info.ephyra.nlp.indices.FunctionWords;
import info.ephyra.nlp.indices.IrregularVerbs;
import info.ephyra.nlp.indices.Prepositions;
import info.ephyra.nlp.indices.WordFrequencies;
import info.ephyra.nlp.semantics.ontologies.Ontology;
import info.ephyra.nlp.semantics.ontologies.WordNet;
import info.ephyra.querygeneration.Query;
import info.ephyra.querygeneration.QueryGeneration;
import info.ephyra.querygeneration.generators.BagOfTermsG;
import info.ephyra.querygeneration.generators.BagOfWordsG;
import info.ephyra.querygeneration.generators.PredicateG;
import info.ephyra.querygeneration.generators.QuestionInterpretationG;
import info.ephyra.querygeneration.generators.QuestionReformulationG;
import info.ephyra.questionanalysis.AnalyzedQuestion;
import info.ephyra.questionanalysis.QuestionAnalysis;
import info.ephyra.questionanalysis.QuestionInterpreter;
import info.ephyra.questionanalysis.QuestionNormalizer;
import info.ephyra.search.Result;
import info.ephyra.search.Search;
import info.ephyra.search.searchers.IndriKM;
import info.ephyra.search.searchers.IndriDocumentKM;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.eclipse.jetty.server.nio.SelectChannelConnector;
import org.eclipse.jetty.server.Connector;
import org.eclipse.jetty.server.Request;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.server.handler.AbstractHandler;
import org.eclipse.jetty.util.thread.QueuedThreadPool;
import java.net.URLDecoder;
import java.io.PrintWriter;
import java.io.IOException;
import java.util.ArrayList;
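/**
 * <code>OpenEphyraServer</code> exposes the OpenEphyra question answering (QA)
 * framework as a Jetty HTTP handler: each request carries a question in its
 * query string and the top answer is written to the response body.
 *
 * @author Nico Schlaefer
 * @version 2008-03-23
 */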
public class OpenEphyraServer extends AbstractHandler {
/** Factoid question type; also the prefix (followed by ':') that selects it in a request. */
protected static final String FACTOID = "FACTOID";
/** List question type; also the prefix (followed by ':') that selects it in a request. */
protected static final String LIST = "LIST";
/** Maximum number of factoid answers requested by the HTTP handler. */
protected static final int FACTOID_MAX_ANSWERS = 1;
/** Absolute threshold for factoid answer scores used by the HTTP handler. */
protected static final float FACTOID_ABS_THRESH = 0;
/** Relative threshold for list answer scores (fraction of top score). */
protected static final float LIST_REL_THRESH = 0.1f;
/**
 * Path of the serialized classifier for score normalization. The path is
 * relative and is handed to <code>ScoreNormalizationFilter</code> as-is,
 * i.e. it is not prefixed with {@link #dir}.
 */
public static final String NORMALIZER =
"res/scorenormalization/classifiers/" +
"AdaBoost70_" +
"Score+Extractors_" +
"TREC10+TREC11+TREC12+TREC13+TREC14+TREC15+TREC8+TREC9" +
".serialized";
/**
 * The directory of Ephyra, required when Ephyra is used as an API. It is
 * prepended verbatim to resource paths in the constructor.
 */
protected String dir;
/**
 * Handles one HTTP request: extracts the question from the query string,
 * runs the QA pipeline and writes the answer of the first result to the
 * response body.
 *
 * <p>The query string is split at '=' and the second token is URL-decoded
 * (UTF-8) and used as the question, e.g. <code>?query=Who+wrote+Hamlet</code>.
 * The parameter name is not inspected. The question may be prefixed with
 * <code>FACTOID:</code> or <code>LIST:</code> (case-insensitive) to select
 * the question type; without a prefix it is treated as a factoid question.</p>
 *
 * <p>Responses:</p>
 * <ul>
 * <li>no query string: empty body, status 200;</li>
 * <li>query string without a value or with an invalid percent-escape:
 *     status 400 and a short error message;</li>
 * <li>otherwise: status 200 and either the answer or an apology when the
 *     pipeline produced no answer.</li>
 * </ul>
 *
 * <p>NOTE(review): {@link #askFactoid(String, int, float)} re-registers the
 * static pipeline components on every call while the server is started
 * with a thread pool; confirm that concurrent requests cannot interfere
 * with each other.</p>
 *
 * @param target      request target (unused)
 * @param baseRequest Jetty request, marked as handled by this method
 * @param request     servlet request providing the query string
 * @param response    servlet response the answer is written to
 * @throws IOException      if the response cannot be written
 * @throws ServletException declared by the handler contract
 */
@Override
public void handle(String target,
        Request baseRequest,
        HttpServletRequest request,
        HttpServletResponse response)
        throws IOException, ServletException {
    String queryString = request.getQueryString();
    System.out.println("Query str: " + queryString);

    response.setContentType("text/html;charset=utf-8");
    baseRequest.setHandled(true);

    if (queryString == null) {
        // no question at all: answer with an empty page
        response.setStatus(HttpServletResponse.SC_OK);
        return;
    }

    // the question is the value following the first '='; a query string
    // such as "query" or "query=" has no such value
    String[] tokens = queryString.split("=");
    String question = null;
    if (tokens.length > 1) {
        try {
            question = URLDecoder.decode(tokens[1], "UTF-8");
        } catch (IllegalArgumentException e) {
            // malformed percent-escape, reported as a bad request below
            question = null;
        }
    }
    if (question == null) {
        response.setStatus(HttpServletResponse.SC_BAD_REQUEST);
        PrintWriter err = response.getWriter();
        err.println("Malformed request: expected a query string of the "
                + "form name=question.");
        err.close();
        return;
    }

    // response
    response.setStatus(HttpServletResponse.SC_OK);
    PrintWriter out = response.getWriter();
    // kept from the original: flush the response before the pipeline runs
    out.flush();

    // determine question type and extract question string
    String type;
    if (question.matches("(?i)" + FACTOID + ":.*+")) {
        // factoid question
        type = FACTOID;
        question = question.split(":", 2)[1].trim();
    } else if (question.matches("(?i)" + LIST + ":.*+")) {
        // list question
        type = LIST;
        question = question.split(":", 2)[1].trim();
    } else {
        // question type unspecified
        type = FACTOID;  // default type
    }

    // ask question
    Result[] results = new Result[0];
    if (type.equals(FACTOID)) {
        Logger.logFactoidStart(question);
        results = askFactoid(question, FACTOID_MAX_ANSWERS,
                FACTOID_ABS_THRESH);
        Logger.logResults(results);
        Logger.logFactoidEnd();
    } else if (type.equals(LIST)) {
        Logger.logListStart(question);
        results = askList(question, LIST_REL_THRESH);
        Logger.logResults(results);
        Logger.logListEnd();
    }

    // only the first result is reported; an empty result set means that
    // no answer was found and must not be indexed
    String answer = null;
    if (results != null && results.length > 0 && results[0] != null) {
        answer = results[0].getAnswer();
    }
    if (answer != null) {
        out.println(answer);
    } else {
        out.println("Sorry, I cannot answer your question.");
    }
    out.close();
}
/**
 * Entry point of Ephyra. Initializes the engine and starts the web service
 * interface.
 *
 * <p>The size of the connector's thread pool is read from the environment
 * variable <code>THREADS</code>. If the variable is unset or blank, a
 * <code>QueuedThreadPool</code> with its default size is used.</p>
 *
 * @param args optional <code>host port</code> pair; when fewer than two
 *             arguments are given the server binds to
 *             <code>localhost:8080</code>
 * @throws IllegalArgumentException if <code>THREADS</code> is set but is
 *             not a positive integer
 * @throws NumberFormatException if the port argument is not an integer
 * @throws Exception if the server fails to start
 */
public static void main(String[] args) throws Exception {
    // enable output of status and error messages
    MsgPrinter.enableStatusMsgs(true);
    MsgPrinter.enableErrorMsgs(true);

    // set log file and enable logging
    Logger.setLogfile("log/OpenEphyra");
    Logger.enableLogging(true);

    // listen address: defaults unless both host and port are supplied
    String addr = "localhost";
    int port = 8080;
    if (args.length > 1) {
        addr = args[0];
        port = Integer.parseInt(args[1]);
    }

    // thread pool: sized by THREADS if set, library default otherwise
    String threadsEnv = System.getenv("THREADS");
    QueuedThreadPool pool;
    if (threadsEnv == null || threadsEnv.trim().isEmpty()) {
        pool = new QueuedThreadPool();
    } else {
        int numThreads;
        try {
            numThreads = Integer.parseInt(threadsEnv.trim());
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "Environment variable THREADS must be a positive "
                    + "integer, but was \"" + threadsEnv + "\"", e);
        }
        if (numThreads < 1) {
            throw new IllegalArgumentException(
                    "Environment variable THREADS must be a positive "
                    + "integer, but was \"" + threadsEnv + "\"");
        }
        pool = new QueuedThreadPool(numThreads);
    }

    Server server = new Server();
    SelectChannelConnector con1 = new SelectChannelConnector();
    con1.setHost(addr);
    con1.setPort(port);
    con1.setThreadPool(pool);
    con1.setMaxIdleTime(30000);
    con1.setRequestHeaderSize(8192);
    server.setConnectors(new Connector[]{con1});

    // constructing the handler loads all models and resources
    server.setHandler(new OpenEphyraServer());

    server.start();
    server.join();
}
/**
 * <p>Creates a new instance of Ephyra and initializes the system.</p>
 *
 * <p>For use as a standalone system. Delegates to
 * {@link #OpenEphyraServer(String)} with an empty directory prefix, so all
 * resource paths are passed to the loaders as relative paths.</p>
 */
protected OpenEphyraServer() {
this("");
}
/**
 * <p>Creates a new instance of Ephyra and initializes the system.</p>
 *
 * <p>For use as an API.</p>
 *
 * <p>Loads, in order: tokenizer, sentence detectors, stemmer, POS tagger,
 * chunker, syntactic parser, named entity taggers, WordNet, word indices,
 * query reformulators, question patterns and answer patterns. A component
 * that fails to load is reported through <code>MsgPrinter</code>;
 * construction continues with the remaining components.</p>
 *
 * @param dir directory of Ephyra; it is prepended verbatim to every
 *            resource path, so it must be empty or end with a path
 *            separator
 */
public OpenEphyraServer(String dir) {
    this.dir = dir;

    MsgPrinter.printInitializing();

    // create tokenizer
    MsgPrinter.printStatusMsg("Creating tokenizer...");
    if (!OpenNLP.createTokenizer(dir +
            "res/nlp/tokenizer/opennlp/EnglishTok.bin.gz")) {
        MsgPrinter.printErrorMsg("Could not create tokenizer.");
    }

    // create sentence detector
    MsgPrinter.printStatusMsg("Creating sentence detector...");
    if (!OpenNLP.createSentenceDetector(dir +
            "res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz")) {
        MsgPrinter.printErrorMsg("Could not create sentence detector.");
    }
    LingPipe.createSentenceDetector();

    // create stemmer
    MsgPrinter.printStatusMsg("Creating stemmer...");
    SnowballStemmer.create();

    // create part of speech tagger
    MsgPrinter.printStatusMsg("Creating POS tagger...");
    if (!OpenNLP.createPosTagger(
            dir + "res/nlp/postagger/opennlp/tag.bin.gz",
            dir + "res/nlp/postagger/opennlp/tagdict")) {
        MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
    }

    // create chunker
    MsgPrinter.printStatusMsg("Creating chunker...");
    if (!OpenNLP.createChunker(dir +
            "res/nlp/phrasechunker/opennlp/EnglishChunk.bin.gz")) {
        MsgPrinter.printErrorMsg("Could not create chunker.");
    }

    // create syntactic parser
    MsgPrinter.printStatusMsg("Creating syntactic parser...");
    try {
        StanfordParser.initialize();
    } catch (Exception e) {
        // report the cause instead of discarding it
        MsgPrinter.printErrorMsg("Could not create Stanford parser: " + e);
    }

    // create named entity taggers
    MsgPrinter.printStatusMsg("Creating NE taggers...");
    NETagger.loadListTaggers(dir + "res/nlp/netagger/lists/");
    NETagger.loadRegExTaggers(dir + "res/nlp/netagger/patterns.lst");
    MsgPrinter.printStatusMsg("  ...loading models");
    // the Stanford tagger is only initialized if that has not happened yet
    if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init()) {
        MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
    }
    MsgPrinter.printStatusMsg("  ...done");

    // create WordNet dictionary
    MsgPrinter.printStatusMsg("Creating WordNet dictionary...");
    if (!WordNet.initialize(dir +
            "res/ontologies/wordnet/file_properties.xml")) {
        MsgPrinter.printErrorMsg("Could not create WordNet dictionary.");
    }

    // load function words (numbers are excluded)
    MsgPrinter.printStatusMsg("Loading function words...");
    if (!FunctionWords.loadIndex(dir +
            "res/indices/functionwords_nonumbers")) {
        MsgPrinter.printErrorMsg("Could not load function words.");
    }

    // load prepositions
    MsgPrinter.printStatusMsg("Loading prepositions...");
    if (!Prepositions.loadIndex(dir +
            "res/indices/prepositions")) {
        MsgPrinter.printErrorMsg("Could not load prepositions.");
    }

    // load irregular verbs
    MsgPrinter.printStatusMsg("Loading irregular verbs...");
    if (!IrregularVerbs.loadVerbs(dir + "res/indices/irregularverbs")) {
        MsgPrinter.printErrorMsg("Could not load irregular verbs.");
    }

    // load word frequencies
    MsgPrinter.printStatusMsg("Loading word frequencies...");
    if (!WordFrequencies.loadIndex(dir + "res/indices/wordfrequencies")) {
        MsgPrinter.printErrorMsg("Could not load word frequencies.");
    }

    // load query reformulators
    MsgPrinter.printStatusMsg("Loading query reformulators...");
    if (!QuestionReformulationG.loadReformulators(dir +
            "res/reformulations/")) {
        MsgPrinter.printErrorMsg("Could not load query reformulators.");
    }

    // load question patterns
    MsgPrinter.printStatusMsg("Loading question patterns...");
    if (!QuestionInterpreter.loadPatterns(dir +
            "res/patternlearning/questionpatterns/")) {
        MsgPrinter.printErrorMsg("Could not load question patterns.");
    }

    // load answer patterns
    MsgPrinter.printStatusMsg("Loading answer patterns...");
    if (!AnswerPatternFilter.loadPatterns(dir +
            "res/patternlearning/answerpatterns/")) {
        MsgPrinter.printErrorMsg("Could not load answer patterns.");
    }
}
/**
 * Initializes the pipeline for factoid questions.
 *
 * <p>Clears and re-registers, through static methods, the dictionaries and
 * ontologies for question analysis, the query generators, the knowledge
 * miners and annotators for search, and the answer selection filters. The
 * registration order below is significant for the filters, which are
 * applied in the order in which they are added.</p>
 *
 * <p>NOTE(review): the registries are reset through static calls, so this
 * configuration appears to be shared by all instances and threads -
 * confirm before calling it concurrently.</p>
 */
protected void initFactoid() {
// question analysis
// (a single WordNet instance serves as both dictionary and ontology)
Ontology wordNet = new WordNet();
// - dictionaries for term extraction
QuestionAnalysis.clearDictionaries();
QuestionAnalysis.addDictionary(wordNet);
// - ontologies for term expansion
QuestionAnalysis.clearOntologies();
QuestionAnalysis.addOntology(wordNet);
// query generation
QueryGeneration.clearQueryGenerators();
QueryGeneration.addQueryGenerator(new BagOfWordsG());
QueryGeneration.addQueryGenerator(new BagOfTermsG());
QueryGeneration.addQueryGenerator(new PredicateG());
QueryGeneration.addQueryGenerator(new QuestionInterpretationG());
QueryGeneration.addQueryGenerator(new QuestionReformulationG());
// search
// - knowledge miners for unstructured knowledge sources
//   (one Indri knowledge miner per entry returned by getIndriIndices())
Search.clearKnowledgeMiners();
for (String[] indriIndices : IndriKM.getIndriIndices())
Search.addKnowledgeMiner(new IndriKM(indriIndices, false));
// - knowledge annotators for (semi-)structured knowledge sources
//   (none are registered; the Wikipedia annotator below is disabled)
Search.clearKnowledgeAnnotators();
/* Search.addKnowledgeAnnotator(new WikipediaKA("list.txt")); */
// answer extraction and selection
// (the filters are applied in this order)
AnswerSelection.clearFilters();
// - answer extraction filters
AnswerSelection.addFilter(new AnswerTypeFilter());
AnswerSelection.addFilter(new AnswerPatternFilter());
AnswerSelection.addFilter(new PredicateExtractionFilter());
AnswerSelection.addFilter(new FactoidsFromPredicatesFilter());
AnswerSelection.addFilter(new TruncationFilter());
// - answer selection filters
AnswerSelection.addFilter(new StopwordFilter());
AnswerSelection.addFilter(new QuestionKeywordsFilter());
AnswerSelection.addFilter(new ScoreNormalizationFilter(NORMALIZER));
AnswerSelection.addFilter(new ScoreCombinationFilter());
AnswerSelection.addFilter(new FactoidSubsetFilter());
AnswerSelection.addFilter(new DuplicateFilter());
AnswerSelection.addFilter(new ScoreSorterFilter());
}
/**
 * Executes the three pipeline stages for an analyzed question: query
 * generation, search and answer selection.
 *
 * <p>The limits <code>maxAnswers</code> and <code>absThresh</code> are
 * handed to the answer selection stage, which is expected to return at
 * most <code>maxAnswers</code> results with a score of at least
 * <code>absThresh</code>.</p>
 *
 * @param aq analyzed question
 * @param maxAnswers maximum number of answers
 * @param absThresh absolute threshold for scores
 * @return array of results produced by the answer selection stage
 */
protected Result[] runPipeline(AnalyzedQuestion aq, int maxAnswers,
        float absThresh) {
    // stage 1: turn the analyzed question into queries
    MsgPrinter.printGeneratingQueries();
    final Query[] generated = QueryGeneration.getQueries(aq);

    // stage 2: retrieve candidate results for the queries
    MsgPrinter.printSearching();
    final Result[] candidates = Search.doSearch(generated);

    // stage 3: extract and select the answers
    MsgPrinter.printSelectingAnswers();
    return AnswerSelection.getResults(candidates, maxAnswers, absThresh);
}
/**
 * Returns the directory of Ephyra, i.e. the value that was passed to the
 * constructor (the empty string for the no-argument constructor).
 *
 * @return directory
 */
public String getDir() {
return dir;
}
/**
 * Answers a factoid question: sets up the factoid pipeline, analyzes the
 * question and runs the pipeline with the given limits.
 *
 * @param question factoid question
 * @param maxAnswers maximum number of answers
 * @param absThresh absolute threshold for scores
 * @return array of results returned by the pipeline
 */
public Result[] askFactoid(String question, int maxAnswers,
        float absThresh) {
    // the pipeline is (re-)configured for every question
    initFactoid();

    MsgPrinter.printAnalyzingQuestion();
    final AnalyzedQuestion analyzed = QuestionAnalysis.analyze(question);

    return runPipeline(analyzed, maxAnswers, absThresh);
}
/**
 * Convenience variant for factoid questions that yields only the first
 * result, requesting a single answer with an absolute score threshold of
 * zero.
 *
 * @param question factoid question
 * @return the first result, or <code>null</code> if there is none
 */
public Result askFactoid(String question) {
    final Result[] answers = askFactoid(question, 1, 0);
    if (answers.length == 0) {
        return null;
    }
    return answers[0];
}
/**
 * Answers a list question. The question is first rewritten by
 * <code>QuestionNormalizer.transformList</code> and then asked as a
 * factoid question without a limit on the number of answers. Of the
 * returned results, those whose score reaches
 * <code>relThresh * score of the first result</code> are kept, in their
 * original order.
 *
 * @param question list question
 * @param relThresh relative threshold for scores
 * @return array of results that pass the relative threshold; empty if the
 *         pipeline returned no results
 */
public Result[] askList(String question, float relThresh) {
    final String listQuestion = QuestionNormalizer.transformList(question);
    final Result[] candidates =
            askFactoid(listQuestion, Integer.MAX_VALUE, 0);

    final ArrayList<Result> kept = new ArrayList<Result>();
    if (candidates.length > 0) {
        // the first result supplies the reference score
        final float cutoff = relThresh * candidates[0].getScore();
        for (int i = 0; i < candidates.length; i++) {
            final Result candidate = candidates[i];
            if (candidate.getScore() >= cutoff) {
                kept.add(candidate);
            }
        }
    }

    return kept.toArray(new Result[kept.size()]);
}
}