package info.ephyra.answerselection.filters;

import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.OpenNLP;
import info.ephyra.nlp.SnowballStemmer;
import info.ephyra.nlp.StanfordNeTagger;
import info.ephyra.search.searchers.GoogleKM;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;

import com.google.soap.search.GoogleSearch;
import com.google.soap.search.GoogleSearchFault;
import com.google.soap.search.GoogleSearchResult;
import com.google.soap.search.GoogleSearchResultElement;

/**
 * <p>A web term importance filter that counts term frequencies in text snippets
 * retrieved with the Google search engine.</p>
 * 
 * <p>This class extends the class <code>WebTermImportanceFilter</code>.</p>
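 * 
 * <p>A minimal usage sketch, mirroring {@link #main(String[])}; it assumes the
 * OpenNLP tokenizer and the Snowball stemmer have been initialized and that
 * <code>GOOGLE_KEY</code> holds a valid license key:</p>
 * <pre>
 *   GoogleTermImportanceFilter filter = new GoogleTermImportanceFilter(
 *       NO_NORMALIZATION, NO_NORMALIZATION, false);
 *   String[] targets = filter.getTargets("Warren Moon");
 *   HashMap&lt;String, TermCounter&gt; counters = filter.getTermCounters(targets);
 * </pre>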
 * 
 * @author Guido Sautter
 * @version 2008-02-15
 */
public class GoogleTermImportanceFilter extends WebTermImportanceFilter {
	/** Google license key. */
	private static final String GOOGLE_KEY = "Enter your Google license key.";
	
	/** Maximum total number of search results. */
	private static final int MAX_RESULTS_TOTAL = 250;
	
	/** Maximum number of search results per query. */
	private static final int MAX_RESULTS_PERQUERY = 10;
	
	/** Number of retries if a search fails. */
	private static final int RETRIES = 50;
	
	/**
	 * @param normalizationMode the normalization mode
	 * @param tfNormalizationMode the term frequency normalization mode
	 * @param isCombined whether the two modes are combined
	 */
	public GoogleTermImportanceFilter(int normalizationMode,
			int tfNormalizationMode, boolean isCombined) {
		super(normalizationMode, tfNormalizationMode, isCombined);
	}
	
	/**
	 * @see info.ephyra.answerselection.filters.WebTermImportanceFilter#getTermCounters(java.lang.String[])
	 */
	@Override
	public HashMap<String, TermCounter> getTermCounters(String[] targets) {
		HashMap<String, TermCounter> termCounters = new HashMap<String, TermCounter>();
		
		// process targets
		for (String target : targets) {
			
//			// process wikipedia lookup target
//			if (target.endsWith(WIKIPEDIA)) {
//				
//				// get snippets from google
//				GoogleSearch search = new GoogleSearch();
//				if (TEST_TARGET_GENERATION) System.out.println("Got search ...");
//				
//				// set license key
//				search.setKey(GOOGLE_KEY);
//				if (TEST_TARGET_GENERATION) System.out.println(" - key is " + GOOGLE_KEY);
//				
//				// set search string
//				search.setQueryString(target);
//				if (TEST_TARGET_GENERATION) System.out.println(" - target is " + target);
//				
//				// set language to English only
//				search.setLanguageRestricts("English");
//				if (TEST_TARGET_GENERATION) System.out.println(" - language set");
//				
//				// set hit position of first search result
//				search.setStartResult(0);
//				if (TEST_TARGET_GENERATION) System.out.println(" - start result set to " + 0);
//				
//				// set maximum number of search results
//				search.setMaxResults(MAX_RESULTS_PERQUERY);
//				if (TEST_TARGET_GENERATION) System.out.println(" - max results set");
//				
//				// perform search
//				GoogleSearchResult googleResult = null;
//				int retries = 0;
//				while (googleResult == null)
//					try {
//						googleResult = search.doSearch();
//					} catch (GoogleSearchFault e) {
//						MsgPrinter.printSearchError(e);  // print search error message
//						
//						if (retries == RETRIES) {
//							MsgPrinter.printErrorMsg("\nSearch failed.");
//							System.exit(1);
//						}
//						retries++;
//						
//						try {
//							GoogleKM.sleep(1000);
//						} catch (InterruptedException ie) {}
//					}
//				
//				// get snippets
//				GoogleSearchResultElement[] elements = googleResult.getResultElements();
//				if (TEST_TARGET_GENERATION) System.out.println(" - got results: " + elements.length);
//				
//				for (int i = 0; i < elements.length; i++) {
//					String url = elements[i].getURL();
//					
//					// get article from wikipedia and extract terms
//					if (url.toLowerCase().indexOf(WIKIPEDIA.toLowerCase()) != -1) try {
//						BufferedReader br = new BufferedReader(
//								new InputStreamReader(new URL(url).openStream()));
//						String line;
//						while ((line = br.readLine()) != null) {
//							if (line.startsWith("<p>")) {
//								String plain = line.toLowerCase().replaceAll("\\<[^\\>]++\\>", " ");
//								plain = plain.replaceAll("\\&\\#39\\;", "'");
//								if (TEST_TARGET_GENERATION) System.out.println(" - plain: " + plain);
//								
//								// tokenize sentence
//								String[] sentence = NETagger.tokenize(plain);
//								
//								// stem the tokens and count the terms
//								for (int s = 0; s < sentence.length; s++) {
//									String term = SnowballStemmer.stem(sentence[s].toLowerCase());
//									if (term.length() > 1) {
//										if (!termCounters.containsKey(term))
//											termCounters.put(term, new TermCounter());
//										termCounters.get(term).increment();
//									}
//								}
//							}
//						}
//					} catch (IOException ioe) {}
//				}
//			}
//			
//			// process other target
//			else
			
			// successively fetch the top MAX_RESULTS_TOTAL snippets, MAX_RESULTS_PERQUERY at a time
			for (int startResult = 0; startResult < MAX_RESULTS_TOTAL; startResult += MAX_RESULTS_PERQUERY) {
				
				// get snippets from google
				GoogleSearch search = new GoogleSearch();
				if (TEST_TARGET_GENERATION) System.out.println("Got search ...");
				
				// set license key
				search.setKey(GOOGLE_KEY);
				if (TEST_TARGET_GENERATION) System.out.println(" - key is " + GOOGLE_KEY);
				
				// set search string
				search.setQueryString(target);
				if (TEST_TARGET_GENERATION) System.out.println(" - target is " + target);
				
				// set language to English only
				search.setLanguageRestricts("English");
				if (TEST_TARGET_GENERATION) System.out.println(" - language set");
				
				// set hit position of first search result
				search.setStartResult(startResult);
				if (TEST_TARGET_GENERATION) System.out.println(" - start result set to " + startResult);
				
				// set maximum number of search results
				search.setMaxResults(MAX_RESULTS_PERQUERY);
				if (TEST_TARGET_GENERATION) System.out.println(" - max results set");
				
				// perform search, retrying up to RETRIES times before giving up
				GoogleSearchResult googleResult = null;
				int retries = 0;
				while (googleResult == null)
					try {
						googleResult = search.doSearch();
					} catch (GoogleSearchFault e) {
						MsgPrinter.printSearchError(e);  // print search error message
						
						if (retries == RETRIES) {
							MsgPrinter.printErrorMsg("\nSearch failed.");
//							System.exit(1);
							return termCounters;
						}
						retries++;
						
						try {
							GoogleKM.sleep(1000);
						} catch (InterruptedException ie) {}
					}
				
				// get snippets
				GoogleSearchResultElement[] elements = googleResult.getResultElements();
				if (TEST_TARGET_GENERATION) System.out.println(" - got results: " + elements.length);
				
				// parse google snippets
				int lengthSum = 0;
				for (int i = 0; i < elements.length; i++) {
//					if (TEST_TARGET_GENERATION) System.out.println(" - summary: " + elements[i].getSummary());
//					if (TEST_TARGET_GENERATION) System.out.println(" - snippet: " + elements[i].getSnippet());
					
					// strip HTML tags and decode apostrophes
					String plain = elements[i].getSnippet().replaceAll("\\<[^\\>]++\\>", " ");
					plain = plain.replaceAll("\\&\\#39\\;", "'");
					if (TEST_TARGET_GENERATION) System.out.println(" - plain: " + plain);
					
					// tokenize sentence
					String[] sentence = NETagger.tokenize(plain);
//					String[] sentence = NETagger.tokenize(elements[i].getSnippet());
//					String[] sentence = NETagger.tokenize(elements[i].getSummary());
					lengthSum += sentence.length;
					
					// stem the tokens and count the terms
					for (int s = 0; s < sentence.length; s++) {
						String term = SnowballStemmer.stem(sentence[s].toLowerCase());
						if (term.length() > 1) {
							if (!termCounters.containsKey(term))
								termCounters.put(term, new TermCounter());
							termCounters.get(term).increment();
						}
					}
				}
			}
		}
		
		return termCounters;
	}
	
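	/**
	 * Standalone test entry point: initializes the required NLP components,
	 * builds term counters for the example target "Warren Moon" and prints the
	 * resulting terms with their frequencies.
	 * 
	 * @param args not used (a commented-out TREC test would read a target file
	 *             name from <code>args[0]</code>)
	 */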
	public static void main(String[] args) {
		TEST_TARGET_GENERATION = true;
		
		MsgPrinter.enableStatusMsgs(true);
		MsgPrinter.enableErrorMsgs(true);
		
		// create tokenizer
		MsgPrinter.printStatusMsg("Creating tokenizer...");
		if (!OpenNLP.createTokenizer("res/nlp/tokenizer/opennlp/EnglishTok.bin.gz"))
			MsgPrinter.printErrorMsg("Could not create tokenizer.");
//		LingPipe.createTokenizer();
		
		// create sentence detector
//		MsgPrinter.printStatusMsg("Creating sentence detector...");
//		if (!OpenNLP.createSentenceDetector("res/nlp/sentencedetector/opennlp/EnglishSD.bin.gz"))
//			MsgPrinter.printErrorMsg("Could not create sentence detector.");
//		LingPipe.createSentenceDetector();
		
		// create stemmer
		MsgPrinter.printStatusMsg("Creating stemmer...");
		SnowballStemmer.create();
		
		// create part of speech tagger
		MsgPrinter.printStatusMsg("Creating POS tagger...");
		if (!OpenNLP.createPosTagger("res/nlp/postagger/opennlp/tag.bin.gz",
				"res/nlp/postagger/opennlp/tagdict"))
			MsgPrinter.printErrorMsg("Could not create OpenNLP POS tagger.");
//		if (!StanfordPosTagger.init("res/nlp/postagger/stanford/" +
//				"train-wsj-0-18.holder"))
//			MsgPrinter.printErrorMsg("Could not create Stanford POS tagger.");
		
		// create chunker
		MsgPrinter.printStatusMsg("Creating chunker...");
		if (!OpenNLP.createChunker("res/nlp/phrasechunker/opennlp/" +
				"EnglishChunk.bin.gz"))
			MsgPrinter.printErrorMsg("Could not create chunker.");
		
		// create named entity taggers
		MsgPrinter.printStatusMsg("Creating NE taggers...");
		NETagger.loadListTaggers("res/nlp/netagger/lists/");
		NETagger.loadRegExTaggers("res/nlp/netagger/patterns.lst");
		MsgPrinter.printStatusMsg(" ...loading models");
//		if (!NETagger.loadNameFinders("res/nlp/netagger/opennlp/"))
//			MsgPrinter.printErrorMsg("Could not create OpenNLP NE tagger.");
		if (!StanfordNeTagger.isInitialized() && !StanfordNeTagger.init())
			MsgPrinter.printErrorMsg("Could not create Stanford NE tagger.");
		MsgPrinter.printStatusMsg(" ...done");
		
//		WebTermImportanceFilter wtif = new TargetGeneratorTest();
//		TRECTarget[] targets = TREC13To16Parser.loadTargets(args[0]);
//		for (TRECTarget target : targets) {
//			String question = target.getTargetDesc();
//			
//			// query generation
//			MsgPrinter.printGeneratingQueries();
//			String qn = QuestionNormalizer.normalize(question);
//			MsgPrinter.printNormalization(qn);  // print normalized question string
//			Logger.logNormalization(qn);  // log normalized question string
//			String[] kws = KeywordExtractor.getKeywords(qn);
//			AnalyzedQuestion aq = new AnalyzedQuestion(question);
//			aq.setKeywords(kws);
//			aq.setFactoid(false);
//			
//			Query[] queries = new BagOfWordsG().generateQueries(aq);
//			for (int q = 0; q < queries.length; q++)
//				queries[q].setOriginalQueryString(question);
//			
//			Result[] results = new Result[1];
//			results[0] = new Result("This would be the answer", queries[0]);
//			wtif.apply(results);
//		}
		
		GoogleTermImportanceFilter gtif = new GoogleTermImportanceFilter(
				NO_NORMALIZATION, NO_NORMALIZATION, false);
		String[] targets = gtif.getTargets("Warren Moon");
		final HashMap<String, TermCounter> termCounters = gtif.getTermCounters(targets);
		
		// sort the terms by descending frequency, breaking ties alphabetically
		ArrayList<String> termList = new ArrayList<String>(termCounters.keySet());
		Collections.sort(termList, new Comparator<String>() {
			public int compare(String o1, String o2) {
				int tc1 = termCounters.get(o1).getValue();
				int tc2 = termCounters.get(o2).getValue();
				return ((tc1 == tc2) ? o1.compareTo(o2) : (tc2 - tc1));
			}
		});
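		
		// print each term with its frequency in the order established above and
		// count the terms that occur at least five times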
//		Iterator<String> terms = termCounters.keySet().iterator();
		Iterator<String> terms = termList.iterator();
		int atLeast5 = 0;
		while (terms.hasNext()) {
			String term = terms.next();
			int tc = termCounters.get(term).getValue();
			System.out.println(term + ": " + tc);
			if (tc > 4) atLeast5++;  // i.e. tc >= 5
		}
		System.out.println("At least 5 times: " + atLeast5);
	}
}