package info.ephyra; import info.ephyra.answerselection.AnswerSelection; import info.ephyra.answerselection.filters.AnswerPatternFilter; import info.ephyra.answerselection.filters.AnswerTypeFilter; import info.ephyra.answerselection.filters.DuplicateFilter; import info.ephyra.answerselection.filters.FactoidSubsetFilter; import info.ephyra.answerselection.filters.FactoidsFromPredicatesFilter; import info.ephyra.answerselection.filters.PredicateExtractionFilter; import info.ephyra.answerselection.filters.QuestionKeywordsFilter; import info.ephyra.answerselection.filters.ScoreCombinationFilter; import info.ephyra.answerselection.filters.ScoreNormalizationFilter; import info.ephyra.answerselection.filters.ScoreSorterFilter; import info.ephyra.answerselection.filters.StopwordFilter; import info.ephyra.answerselection.filters.TruncationFilter; import info.ephyra.answerselection.filters.WebDocumentFetcherFilter; import info.ephyra.io.Logger; import info.ephyra.io.MsgPrinter; import info.ephyra.nlp.LingPipe; import info.ephyra.nlp.NETagger; import info.ephyra.nlp.OpenNLP; import info.ephyra.nlp.SnowballStemmer; import info.ephyra.nlp.StanfordNeTagger; import info.ephyra.nlp.StanfordParser; import info.ephyra.nlp.indices.FunctionWords; import info.ephyra.nlp.indices.IrregularVerbs; import info.ephyra.nlp.indices.Prepositions; import info.ephyra.nlp.indices.WordFrequencies; import info.ephyra.nlp.semantics.ontologies.Ontology; import info.ephyra.nlp.semantics.ontologies.WordNet; import info.ephyra.querygeneration.Query; import info.ephyra.querygeneration.QueryGeneration; import info.ephyra.querygeneration.generators.BagOfTermsG; import info.ephyra.querygeneration.generators.BagOfWordsG; import info.ephyra.querygeneration.generators.PredicateG; import info.ephyra.querygeneration.generators.QuestionInterpretationG; import info.ephyra.querygeneration.generators.QuestionReformulationG; import info.ephyra.questionanalysis.AnalyzedQuestion; import info.ephyra.questionanalysis.QuestionAnalysis; import info.ephyra.questionanalysis.QuestionInterpreter; import info.ephyra.questionanalysis.QuestionNormalizer; import info.ephyra.search.Result; import info.ephyra.search.Search; import info.ephyra.search.searchers.BingKM; import info.ephyra.search.searchers.IndriKM; import java.util.ArrayList; /** * <code>OpenEphyra</code> is an open framework for question answering (QA). * * @author Nico Schlaefer * @version 2008-03-23 */ public class OpenEphyra { /** Factoid question type. */ protected static final String FACTOID = "FACTOID"; /** List question type. */ protected static final String LIST = "LIST"; /** Maximum number of factoid answers. */ protected static final int FACTOID_MAX_ANSWERS = 1; /** Absolute threshold for factoid answer scores. */ protected static final float FACTOID_ABS_THRESH = 0; /** Relative threshold for list answer scores (fraction of top score). */ protected static final float LIST_REL_THRESH = 0.1f; /** Serialized classifier for score normalization. */ public static final String NORMALIZER = "res/scorenormalization/classifiers/" + "AdaBoost70_" + "Score+Extractors_" + "TREC10+TREC11+TREC12+TREC13+TREC14+TREC15+TREC8+TREC9" + ".serialized"; /** The directory of Ephyra, required when Ephyra is used as an API. */ protected String dir; /** * Entry point of Ephyra. Initializes the engine and starts the command line * interface. * * @param args command line arguments are ignored */ public static void main(String[] args) { // enable output of status and error messages MsgPrinter.enableStatusMsgs(true); MsgPrinter.enableErrorMsgs(true); MsgPrinter.printStatusMsg("Arg:"+ args[0]); if (args.length != 1) return; // set log file and enable logging Logger.setLogfile("log/OpenEphyra"); Logger.enableLogging(true); // initialize Ephyra and start command line interface (new OpenEphyra()).commandLine(args[0].trim()); } /** * <p>Creates a new instance of Ephyra and initializes the system.</p> * * <p>For use as a standalone system.</p> */ protected OpenEphyra() { this(""); } /** * <p>Creates a new instance of Ephyra and initializes the system.</p> * * <p>For use as an API.</p> * * @param dir directory of Ephyra */ public OpenEphyra(String dir) { this.dir = dir; MsgPrinter.printInitializing(); // create tokenizer MsgPrinter.printStatusMsg("Creating tokenizer..."); if (!OpenNLP.createTokenizer(dir + "res/nlp/tokenizer/opennlp/EnglishTok.bin.gz")) MsgPrinter.printErrorMsg("Could not create tokenizer."); // LingPipe.createTokenizer(); // create sentence detector MsgPrinter.printStatusMsg("Creating sentence detector..."); if (!OpenNLP.createSentenceDetector(dir + "res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz")) MsgPrinter.printErrorMsg("Could not create sentence detector."); LingPipe.createSentenceDetector(); // create stemmer MsgPrinter.printStatusMsg("Creating stemmer..."); SnowballStemmer.create(); // create part of speech tagger MsgPrinter.printStatusMsg("Creating POS tagger..."); if (!OpenNLP.createPosTagger( dir + "res/nlp/postagger/opennlp/tag.bin.gz", dir + "res/nlp/postagger/opennlp/tagdict")) MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger."); // if (!StanfordPosTagger.init(dir + "res/nlp/postagger/stanford/" + // "wsj3t0-18-bidirectional/train-wsj-0-18.holder")) // MsgPrinter.printErrorMsg("Could not create Stanford POS tagger."); // create chunker MsgPrinter.printStatusMsg("Creating chunker..."); if (!OpenNLP.createChunker(dir + "res/nlp/phrasechunker/opennlp/EnglishChunk.bin.gz")) MsgPrinter.printErrorMsg("Could not create chunker."); // create syntactic parser MsgPrinter.printStatusMsg("Creating syntactic parser..."); // if (!OpenNLP.createParser(dir + "res/nlp/syntacticparser/opennlp/")) // MsgPrinter.printErrorMsg("Could not create OpenNLP parser."); try { StanfordParser.initialize(); } catch (Exception e) { MsgPrinter.printErrorMsg("Could not create Stanford parser."); } // create named entity taggers MsgPrinter.printStatusMsg("Creating NE taggers..."); NETagger.loadListTaggers(dir + "res/nlp/netagger/lists/"); NETagger.loadRegExTaggers(dir + "res/nlp/netagger/patterns.lst"); MsgPrinter.printStatusMsg(" ...loading models"); // if (!NETagger.loadNameFinders(dir + "res/nlp/netagger/opennlp/")) // MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger."); if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init()) MsgPrinter.printErrorMsg("Could not create Stanford NE tagger."); MsgPrinter.printStatusMsg(" ...done"); // create linker // MsgPrinter.printStatusMsg("Creating linker..."); // if (!OpenNLP.createLinker(dir + "res/nlp/corefresolver/opennlp/")) // MsgPrinter.printErrorMsg("Could not create linker."); // create WordNet dictionary MsgPrinter.printStatusMsg("Creating WordNet dictionary..."); if (!WordNet.initialize(dir + "res/ontologies/wordnet/file_properties.xml")) MsgPrinter.printErrorMsg("Could not create WordNet dictionary."); // load function words (numbers are excluded) MsgPrinter.printStatusMsg("Loading function verbs..."); if (!FunctionWords.loadIndex(dir + "res/indices/functionwords_nonumbers")) MsgPrinter.printErrorMsg("Could not load function words."); // load prepositions MsgPrinter.printStatusMsg("Loading prepositions..."); if (!Prepositions.loadIndex(dir + "res/indices/prepositions")) MsgPrinter.printErrorMsg("Could not load prepositions."); // load irregular verbs MsgPrinter.printStatusMsg("Loading irregular verbs..."); if (!IrregularVerbs.loadVerbs(dir + "res/indices/irregularverbs")) MsgPrinter.printErrorMsg("Could not load irregular verbs."); // load word frequencies MsgPrinter.printStatusMsg("Loading word frequencies..."); if (!WordFrequencies.loadIndex(dir + "res/indices/wordfrequencies")) MsgPrinter.printErrorMsg("Could not load word frequencies."); // load query reformulators MsgPrinter.printStatusMsg("Loading query reformulators..."); if (!QuestionReformulationG.loadReformulators(dir + "res/reformulations/")) MsgPrinter.printErrorMsg("Could not load query reformulators."); // load answer types // MsgPrinter.printStatusMsg("Loading answer types..."); // if (!AnswerTypeTester.loadAnswerTypes(dir + // "res/answertypes/patterns/answertypepatterns")) // MsgPrinter.printErrorMsg("Could not load answer types."); // load question patterns MsgPrinter.printStatusMsg("Loading question patterns..."); if (!QuestionInterpreter.loadPatterns(dir + "res/patternlearning/questionpatterns/")) MsgPrinter.printErrorMsg("Could not load question patterns."); // load answer patterns MsgPrinter.printStatusMsg("Loading answer patterns..."); if (!AnswerPatternFilter.loadPatterns(dir + "res/patternlearning/answerpatterns/")) MsgPrinter.printErrorMsg("Could not load answer patterns."); } /** * Reads a line from the command prompt. * * @return user input */ protected String readLine() { try { return new java.io.BufferedReader(new java.io.InputStreamReader(System.in)).readLine(); } catch(java.io.IOException e) { return new String(""); } } /** * Initializes the pipeline for factoid questions. */ protected void initFactoid() { // question analysis Ontology wordNet = new WordNet(); // - dictionaries for term extraction QuestionAnalysis.clearDictionaries(); QuestionAnalysis.addDictionary(wordNet); // - ontologies for term expansion QuestionAnalysis.clearOntologies(); QuestionAnalysis.addOntology(wordNet); // query generation QueryGeneration.clearQueryGenerators(); QueryGeneration.addQueryGenerator(new BagOfWordsG()); QueryGeneration.addQueryGenerator(new BagOfTermsG()); QueryGeneration.addQueryGenerator(new PredicateG()); QueryGeneration.addQueryGenerator(new QuestionInterpretationG()); QueryGeneration.addQueryGenerator(new QuestionReformulationG()); // search // - knowledge miners for unstructured knowledge sources Search.clearKnowledgeMiners(); // Search.addKnowledgeMiner(new BingKM()); // Search.addKnowledgeMiner(new GoogleKM()); // Search.addKnowledgeMiner(new YahooKM()); for (String[] indriIndices : IndriKM.getIndriIndices()) Search.addKnowledgeMiner(new IndriKM(indriIndices, false)); // for (String[] indriServers : IndriKM.getIndriServers()) // Search.addKnowledgeMiner(new IndriKM(indriServers, true)); // - knowledge annotators for (semi-)structured knowledge sources Search.clearKnowledgeAnnotators(); // answer extraction and selection // (the filters are applied in this order) AnswerSelection.clearFilters(); // - answer extraction filters AnswerSelection.addFilter(new AnswerTypeFilter()); AnswerSelection.addFilter(new AnswerPatternFilter()); //AnswerSelection.addFilter(new WebDocumentFetcherFilter()); AnswerSelection.addFilter(new PredicateExtractionFilter()); AnswerSelection.addFilter(new FactoidsFromPredicatesFilter()); AnswerSelection.addFilter(new TruncationFilter()); // - answer selection filters AnswerSelection.addFilter(new StopwordFilter()); AnswerSelection.addFilter(new QuestionKeywordsFilter()); AnswerSelection.addFilter(new ScoreNormalizationFilter(NORMALIZER)); AnswerSelection.addFilter(new ScoreCombinationFilter()); AnswerSelection.addFilter(new FactoidSubsetFilter()); AnswerSelection.addFilter(new DuplicateFilter()); AnswerSelection.addFilter(new ScoreSorterFilter()); } /** * Runs the pipeline and returns an array of up to <code>maxAnswers</code> * results that have a score of at least <code>absThresh</code>. * * @param aq analyzed question * @param maxAnswers maximum number of answers * @param absThresh absolute threshold for scores * @return array of results */ protected Result[] runPipeline(AnalyzedQuestion aq, int maxAnswers, float absThresh) { // query generation MsgPrinter.printGeneratingQueries(); Query[] queries = QueryGeneration.getQueries(aq); // search MsgPrinter.printSearching(); Result[] results = Search.doSearch(queries); // answer selection MsgPrinter.printSelectingAnswers(); results = AnswerSelection.getResults(results, maxAnswers, absThresh); return results; } /** * Returns the directory of Ephyra. * * @return directory */ public String getDir() { return dir; } /** * <p>A command line interface for Ephyra.</p> * * <p>Repeatedly queries the user for a question, asks the system the * question and prints out and logs the results.</p> * * <p>The command <code>exit</code> can be used to quit the program.</p> */ public void commandLine(String query_input) { // while (true) { // query user for question, quit if user types in "exit" // MsgPrinter.printQuestionPrompt(); // String question = readLine().trim(); String question = query_input.trim(); if (question.equalsIgnoreCase("exit")) System.exit(0); // determine question type and extract question string String type; if (question.matches("(?i)" + FACTOID + ":.*+")) { // factoid question type = FACTOID; question = question.split(":", 2)[1].trim(); } else if (question.matches("(?i)" + LIST + ":.*+")) { // list question type = LIST; question = question.split(":", 2)[1].trim(); } else { // question type unspecified type = FACTOID; // default type } // ask question Result[] results = new Result[0]; if (type.equals(FACTOID)) { Logger.logFactoidStart(question); results = askFactoid(question, FACTOID_MAX_ANSWERS, FACTOID_ABS_THRESH); Logger.logResults(results); Logger.logFactoidEnd(); } else if (type.equals(LIST)) { Logger.logListStart(question); results = askList(question, LIST_REL_THRESH); Logger.logResults(results); Logger.logListEnd(); } // print answers MsgPrinter.printAnswers(results); //} } /** * Asks Ephyra a factoid question and returns up to <code>maxAnswers</code> * results that have a score of at least <code>absThresh</code>. * * @param question factoid question * @param maxAnswers maximum number of answers * @param absThresh absolute threshold for scores * @return array of results */ public Result[] askFactoid(String question, int maxAnswers, float absThresh) { // initialize pipeline initFactoid(); // analyze question MsgPrinter.printAnalyzingQuestion(); AnalyzedQuestion aq = QuestionAnalysis.analyze(question); // get answers Result[] results = runPipeline(aq, maxAnswers, absThresh); return results; } /** * Asks Ephyra a factoid question and returns a single result or * <code>null</code> if no answer could be found. * * @param question factoid question * @return single result or <code>null</code> */ public Result askFactoid(String question) { Result[] results = askFactoid(question, 1, 0); return (results.length > 0) ? results[0] : null; } /** * Asks Ephyra a list question and returns results that have a score of at * least <code>relThresh * top score</code>. * * @param question list question * @param relThresh relative threshold for scores * @return array of results */ public Result[] askList(String question, float relThresh) { question = QuestionNormalizer.transformList(question); Result[] results = askFactoid(question, Integer.MAX_VALUE, 0); // get results with a score of at least relThresh * top score ArrayList<Result> confident = new ArrayList<Result>(); if (results.length > 0) { float topScore = results[0].getScore(); for (Result result : results) if (result.getScore() >= relThresh * topScore) confident.add(result); } return confident.toArray(new Result[confident.size()]); } }