package info.ephyra.answerselection.filters;
import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.SnowballStemmer;
import info.ephyra.search.searchers.YahooKM;
import java.math.BigInteger;
import java.util.HashMap;
import com.yahoo.search.SearchClient;
import com.yahoo.search.WebSearchRequest;
import com.yahoo.search.WebSearchResult;
/**
* <p>A web term importance filter that counts term frequencies in text snippets
* retrieved with the Yahoo search engine.</p>
*
* <p>This class extends the class <code>WebTermImportanceFilter</code>.</p>
*
* @author Guido Sautter
* @version 2008-02-15
*/
public class YahooTermImportanceFilter extends WebTermImportanceFilter {
/** Yahoo application ID, allows 5,000 queries per day and IP address. */
private static final String YAHOO_ID = "questionanswering";
// /** Maximum total number of search results. */
// private static final int MAX_RESULTS_TOTAL = 600;
/** Maximum number of search results per query. */
private static final int MAX_RESULTS_PERQUERY = 100;
/** Number of retries if search fails. */
private static final int RETRIES = 60;
/**
* @param normalizationMode
* @param tfNormalizationMode
* @param isCombined
*/
public YahooTermImportanceFilter(int normalizationMode, int tfNormalizationMode, boolean isCombined) {
super(normalizationMode, tfNormalizationMode, isCombined);
}
/** @see info.ephyra.answerselection.filters.WebTermImportanceFilter#getTermCounters(java.lang.String[])
*/
@Override
public HashMap<String, TermCounter> getTermCounters(String[] targets) {
HashMap<String, TermCounter> termCounters = new HashMap<String, TermCounter>();
for (String target : targets) {
// get snippets from yahoo
SearchClient client = new SearchClient(YAHOO_ID);
// create request
WebSearchRequest request = new WebSearchRequest(target);
request.setLanguage("en"); // search for English pages only
request.setStart(BigInteger.valueOf(0));
request.setResults(MAX_RESULTS_PERQUERY);
// perform search
WebSearchResult[] searchResults = null;
int retries = 0;
while (searchResults == null)
try {
searchResults = client.webSearch(request).listResults();
} catch (Exception e) {
MsgPrinter.printSearchError(e); // print search error message
if (retries == RETRIES) {
MsgPrinter.printErrorMsg("\nSearch failed.");
System.exit(1);
}
retries++;
try {
YahooKM.sleep(1000);
} catch (InterruptedException ie) {}
}
// parse yahoo snippets
int lengthSum = 0;
for (int i = 0; i < searchResults.length; i++) {
String summary = searchResults[i].getSummary();
if (summary != null) {
// tokenize and tag sentence
String[] sentence = NETagger.tokenize(summary);
lengthSum += sentence.length;
// scan sentence for NPs
for (int s = 0; s < sentence.length; s++) {
String term = SnowballStemmer.stem(sentence[s].toLowerCase());
if (term.length() > 1) {
if (!termCounters.containsKey(term))
termCounters.put(term, new TermCounter());
termCounters.get(term).increment();
}
}
}
}
}
return termCounters;
}
}