package info.ephyra.patternlearning; import info.ephyra.answerselection.filters.AnswerPatternFilter; import info.ephyra.io.MsgPrinter; import info.ephyra.nlp.NETagger; import info.ephyra.nlp.OpenNLP; import info.ephyra.nlp.SnowballStemmer; import info.ephyra.nlp.StanfordNeTagger; import info.ephyra.nlp.indices.FunctionWords; import info.ephyra.nlp.indices.IrregularVerbs; import info.ephyra.nlp.indices.Prepositions; import info.ephyra.nlp.semantics.ontologies.WordNet; import info.ephyra.querygeneration.Query; import info.ephyra.querygeneration.generators.QuestionInterpretationG; import info.ephyra.questionanalysis.QuestionInterpretation; import info.ephyra.questionanalysis.QuestionInterpreter; import info.ephyra.questionanalysis.QuestionNormalizer; import info.ephyra.search.Result; import info.ephyra.search.Search; import info.ephyra.search.searchers.BingKM; import info.ephyra.trec.TREC8To12Parser; import info.ephyra.trec.TRECAnswer; import info.ephyra.trec.TRECPattern; import info.ephyra.trec.TRECQuestion; import info.ephyra.util.FileUtils; import info.ephyra.util.RegexConverter; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Hashtable; /** * A pattern learning tool for Ephyra. * * @author Nico Schlaefer * @version 2006-04-04 */ public class PatternLearner { /** Support threshold for answer patterns. */ private static final float SUPPORT_THRESH = 0.0001f; /** Confidence threshold for answer patterns. */ private static final float CONFIDENCE_THRESH = 0.01f; /** Question strings. */ private static String[] qss; /** Maps questions or query strings to answers. */ private static Hashtable<String, String> ass; /** Maps questions or query strings to patterns for correct answers. */ private static Hashtable<String, String> regexs; /** * Loads the questions, answers and patterns from TREC files. * * @param qFile name of the file containing the questions * @param aFile name of the file containing the answers or an empty string * @param pFile name of the file containing the patterns or an empty string */ private static void loadTRECData(String qFile, String aFile, String pFile) { ass = new Hashtable<String, String>(); regexs = new Hashtable<String, String>(); // load questions from file TRECQuestion[] questions = TREC8To12Parser.loadQuestions(qFile); qss = new String[questions.length]; for (int i = 0; i < questions.length; i++) qss[i] = questions[i].getQuestionString(); // if an answer file is provided, // load answers and derive patterns if (!aFile.equals("")) { TRECAnswer[] answers = TREC8To12Parser.loadTREC9Answers(aFile); String answer; for (int i = 0; i < questions.length; i++) { answer = answers[i].getAnswerString(); ass.put(qss[i], answer); if (pFile.equals("")) regexs.put(qss[i], RegexConverter.strToRegex(answer)); } } // if a patterns file is provided, // load patterns and derive answer strings if (!pFile.equals("")) { TRECPattern[] patterns = TREC8To12Parser.loadPatternsAligned(pFile); String pattern; for (int i = 0; i < questions.length; i++) if ((i < patterns.length) && (patterns[i] != null)) { pattern = patterns[i].getRegexs()[0]; regexs.put(qss[i], pattern); if (aFile.equals("")) ass.put(qss[i], RegexConverter.regexToQueryStr(pattern)); } } } /** * Interprets the questions and writes target-context-answer-regex * tuples to resource files. * * @param dir target directory * @return <code>true</code>, iff the interpretations could be written to * resource files */ private static boolean interpretQuestions(String dir) { boolean success = true; for (int i = 0; i < qss.length; i++) { // print original question string MsgPrinter.printQuestionString(qss[i]); // normalize question String qn = QuestionNormalizer.normalize(qss[i]); // stem verbs and nouns String stemmed = QuestionNormalizer.stemVerbsAndNouns(qn); // print normalized and stemmed question string MsgPrinter.printNormalization(stemmed); // interpret question QuestionInterpretation[] qis = QuestionInterpreter.interpret(qn, stemmed); MsgPrinter.printInterpretations(qis); for (QuestionInterpretation qi : qis) if (!saveInterpretation(dir, qi, ass.get(qss[i]), regexs.get(qss[i]))) success = false; } return success; } /** * Saves a question interpretation, an answer string and a regular * expression that describes a correct answer to a file. * * @param dir target directory * @param qi question interpretation * @param as answer string * @param regex regular expression * @return <code>true</code>, iff the tuple could be saved */ private static boolean saveInterpretation(String dir, QuestionInterpretation qi, String as, String regex) { try { File file = new File(dir + "/" + qi.getProperty()); // form tuple String tuple = qi.getTarget(); for (String context : qi.getContext()) tuple += "#" + context; tuple += "#" + as; tuple += "#" + regex; // first check if the tuple already exists in the file if (file.exists()) { BufferedReader in = new BufferedReader(new FileReader(file)); while (in.ready()) if (tuple.equalsIgnoreCase(in.readLine())) return true; in.close(); } // append new tuple PrintWriter out = new PrintWriter(new FileOutputStream(file, true)); out.println(tuple); out.close(); } catch (IOException e) { return false; } return true; } /** * Loads target-context-answer-regex tuples from resource files and forms * queries. * * @param dir directory containing the target-context-answer-regex tuples * @return queries formed from the tuples */ private static Query[] formQueries(String dir) { QuestionInterpretationG queryGenerator = new QuestionInterpretationG(); ArrayList<Query> results = new ArrayList<Query>(); File[] files = FileUtils.getFiles(dir); BufferedReader in; String[] tuple, context, kws; String prop, line, target, as, regex, queryString; QuestionInterpretation qi; Query query; try { for (File file : files) { prop = file.getName(); in = new BufferedReader(new FileReader(file)); while (in.ready()) { line = in.readLine().trim(); if (line.length() == 0 || line.startsWith("//")) continue; // skip blank lines and comments // extract interpretation, answer string and pattern tuple = line.split("#", -1); target = tuple[0]; context = new String[tuple.length - 3]; for (int i = 1; i < tuple.length - 2; i++) context[i - 1] = tuple[i]; as = tuple[tuple.length - 2]; regex = tuple[tuple.length - 1]; // complement answer string or regular expression if (as.equals("")) as = RegexConverter.regexToQueryStr(regex); else if (regex.equals("")) regex = RegexConverter.strToRegex(as); // create query object qi = new QuestionInterpretation(target, context, prop); kws = new String[] {"\"" + as + "\""}; queryString = queryGenerator.queryString(target, context, kws); query = new Query(queryString, null, 0); query.setInterpretation(qi); // store query, answer and regular expression results.add(query); ass.put(queryString, as); regexs.put(queryString, regex); } } } catch (IOException e) { return new Query[0]; } return results.toArray(new Query[results.size()]); } /** * Fetches text passages from knowledge sources. * * @param queries the queries sent to the searchers * @return results from the searchers */ private static Result[] fetchPassages(Query[] queries) { return Search.doSearch(queries); } /** * Extracts answer patterns from the text passages in the search results. * * @param results search results */ private static void extractPatterns(Result[] results) { String as; for (Result result : results) { as = ass.get(result.getQuery().getQueryString()); PatternExtractor.extract(result, as); } } /** * Saves answer patterns to resource files. * * @param dir target directory * @return <code>true</code>, iff the answer patterns could be saved */ private static boolean savePatterns(String dir) { return AnswerPatternFilter.savePatterns(dir); } /** * Loads answer patterns from resource files. * * @param dir directory containing the answer patterns * @return <code>true</code>, iff the answer patterns could be loaded */ private static boolean loadPatterns(String dir) { return AnswerPatternFilter.loadPatterns(dir); } /** * Assesses the answer patterns on the text passages in the * <code>Result</code> objects. * * @param results search results */ private static void assessPatterns(Result[] results) { String regex; for (Result result : results) { regex = regexs.get(result.getQuery().getQueryString()); AnswerPatternFilter.assessPatterns(result, regex); } } /** * Drops answer patterns that have a low support or confidence. */ private static void filterPatterns() { // drop answer patterns that have a low support AnswerPatternFilter.dropLowSupport(SUPPORT_THRESH); // drop answer patterns that have a low confidence AnswerPatternFilter.dropLowConfidence(CONFIDENCE_THRESH); } /** * Initializes the pattern learning tool. */ public static void init() { MsgPrinter.printInitializing(); // create tokenizer MsgPrinter.printStatusMsg("Creating tokenizer..."); if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/" + "EnglishTok.bin.gz")) MsgPrinter.printErrorMsg("Could not create tokenizer."); // LingPipe.createTokenizer(); // create sentence detector MsgPrinter.printStatusMsg("Creating sentence detector..."); if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/" + "opennlp/EnglishSD.bin.gz")) MsgPrinter.printErrorMsg("Could not create sentence detector."); // LingPipe.createSentenceDetector(); // create stemmer MsgPrinter.printStatusMsg("Creating stemmer..."); SnowballStemmer.create(); // create part of speech tagger MsgPrinter.printStatusMsg("Creating POS tagger..."); if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz", "res/nlp/postagger/opennlp/tagdict")) MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger."); // if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" + // "train-wsj-0-18.holder")) // MsgPrinter.printErrorMsg("Could not create Stanford POS tagger."); // create chunker MsgPrinter.printStatusMsg("Creating chunker..."); if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/" + "EnglishChunk.bin.gz")) MsgPrinter.printErrorMsg("Could not create chunker."); // create syntactic parser // MsgPrinter.printStatusMsg("Creating syntactic parser..."); // if (!OpenNLP.createParser("res/nlp/syntacticparser/opennlp/")) // MsgPrinter.printErrorMsg("Could not create OpenNLP parser."); // try { // StanfordParser.initialize(); // } catch (Exception e) { // MsgPrinter.printErrorMsg("Could not create Stanford parser."); // } // create named entity taggers MsgPrinter.printStatusMsg("Creating NE taggers..."); NETagger.loadListTaggers("res/nlp/netagger/lists/"); NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst"); MsgPrinter.printStatusMsg(" ...loading models"); // if (!NETagger.loadNameFinders("res/nlp/netagger/opennlp/")) // MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger."); if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init()) MsgPrinter.printErrorMsg("Could not create Stanford NE tagger."); MsgPrinter.printStatusMsg(" ...done"); // create linker // MsgPrinter.printStatusMsg("Creating linker..."); // if (!OpenNLP.createLinker("res/nlp/corefresolver/opennlp/")) // MsgPrinter.printErrorMsg("Could not create linker."); // create WordNet dictionary MsgPrinter.printStatusMsg("Creating WordNet dictionary..."); if (!WordNet.initialize("res/ontologies/wordnet/file_properties.xml")) MsgPrinter.printErrorMsg("Could not create WordNet dictionary."); // load function words (numbers are excluded) MsgPrinter.printStatusMsg("Loading function verbs..."); if (!FunctionWords.loadIndex("res/indices/functionwords_nonumbers")) MsgPrinter.printErrorMsg("Could not load function words."); // load prepositions MsgPrinter.printStatusMsg("Loading prepositions..."); if (!Prepositions.loadIndex("res/indices/prepositions")) MsgPrinter.printErrorMsg("Could not load prepositions."); // load irregular verbs MsgPrinter.printStatusMsg("Loading irregular verbs..."); if (!IrregularVerbs.loadVerbs("res/indices/irregularverbs")) MsgPrinter.printErrorMsg("Could not load irregular verbs."); // load question patterns MsgPrinter.printStatusMsg("Loading question patterns..."); if (!QuestionInterpreter.loadPatterns("res/patternlearning/" + "questionpatterns/")) MsgPrinter.printErrorMsg("Could not load question patterns."); // add knowledge miners used to fetch text passages for pattern learning MsgPrinter.printStatusMsg("Adding BingKM..."); Search.addKnowledgeMiner(new BingKM()); // MsgPrinter.printStatusMsg("Adding GoogleKM..."); // Search.addKnowledgeMiner(new GoogleKM()); // MsgPrinter.printStatusMsg("Adding YahooKM..."); // Search.addKnowledgeMiner(new YahooKM()); // MsgPrinter.printStatusMsg("Adding IndriKMs..."); // for (String[] indriIndices : IndriKM.getIndriIndices()) // Search.addKnowledgeMiner(new IndriKM(indriIndices, false)); // for (String[] indriServers : IndriKM.getIndriServers()) // Search.addKnowledgeMiner(new IndriKM(indriServers, true)); } /** * Loads the TREC data, interprets the questions and writes * target-context-answer-regex tuples to files. * * @param qFile name of the file containing the questions * @param aFile name of the file containing the answers or an empty string * @param pFile name of the file containing the patterns or an empty string * @return <code>true, iff the TREC data could be interpreted */ public static boolean interpret(String qFile, String aFile, String pFile) { // load TREC data MsgPrinter.printLoadingTRECData(); loadTRECData(qFile, aFile, pFile); // interpret TREC questions and save interpretations to files MsgPrinter.printInterpretingQuestions(); return interpretQuestions("res/patternlearning/interpretations"); } /** * Loads target-context-answer-regex tuples from resource files, forms * queries, fetches text passages, extracts answer patterns and writes them * to resource files. * * @return <code>true</code>, iff the answer patterns could be extracted */ public static boolean extract() { // load tuples and form queries MsgPrinter.printFormingQueries(); ass = new Hashtable<String, String>(); regexs = new Hashtable<String, String>(); Query[] queries; ArrayList<Query> queryList = new ArrayList<Query>(); queries = formQueries("res/patternlearning/interpretations"); for (Query query : queries) queryList.add(query); queries = formQueries("res/patternlearning/interpretations_extract"); for (Query query : queries) queryList.add(query); queries = queryList.toArray(new Query[queryList.size()]); // fetch text passages MsgPrinter.printFetchingPassages(); Result[] results = fetchPassages(queries); // extract answer patterns MsgPrinter.printExtractingPatterns(); extractPatterns(results); // save answer patterns MsgPrinter.printSavingPatterns(); return savePatterns("res/patternlearning/answerpatterns_extract"); } /** * Loads target-context-answer-regex tuples and answer patterns from * resource files, forms queries from the tuples, fetches text passages, * assesses the answer patterns on the text passages and writes them to * resource files. * * @return <code>true</code>, iff the answer patterns could be assessed */ public static boolean assess() { // load answer patterns MsgPrinter.printLoadingPatterns(); if (!loadPatterns("res/patternlearning/answerpatterns_extract")) return false; // load tuples and form queries MsgPrinter.printFormingQueries(); ass = new Hashtable<String, String>(); regexs = new Hashtable<String, String>(); Query[] queries; ArrayList<Query> queryList = new ArrayList<Query>(); queries = formQueries("res/patternlearning/interpretations"); for (Query query : queries) queryList.add(query); queries = formQueries("res/patternlearning/interpretations_assess"); for (Query query : queries) queryList.add(query); queries = queryList.toArray(new Query[queryList.size()]); // fetch text passages MsgPrinter.printFetchingPassages(); Result[] results = fetchPassages(queries); // assess answer patterns MsgPrinter.printAssessingPatterns(); assessPatterns(results); // save answer patterns MsgPrinter.printSavingPatterns(); return savePatterns("res/patternlearning/answerpatterns_assess"); } /** * Loads answer patterns from resource files, drops patterns with a low * support or confidence and writes the remaining patterns back to resource * files. * * @return <code>true</code>, iff the answer patterns could be filtered */ public static boolean filter() { // load answer patterns MsgPrinter.printLoadingPatterns(); if (!loadPatterns("res/patternlearning/answerpatterns_assess")) return false; // drop patterns with low support/confidence MsgPrinter.printFilteringPatterns(); filterPatterns(); // save answer patterns MsgPrinter.printSavingPatterns(); return savePatterns("res/patternlearning/answerpatterns"); } /** * <p>Entry point of the program.</p> * * <p>Learns and assesses answer patterns using questions and patterns from * the TREC QA track as training data.</p> * * @param args argument 1: name of the question file<br> * argument 2: name of the answer file<br> * argument 3: name of the file containing the patterns */ public static void main(String[] args) { // enable output of status and error messages MsgPrinter.enableStatusMsgs(true); MsgPrinter.enableErrorMsgs(true); // if (args.length < 3) { // MsgPrinter.printUsage("java PatternLearner question_file " + // "answer_file pattern_file"); // System.exit(1); // } // initialize the system init(); // interpret TREC data // interpret(args[0], args[1], args[2]); // extract answer patterns extract(); // assess answer patterns assess(); // filter answer patterns filter(); } }