package info.ephyra.search.searchers;
import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.SentenceExtractor;
import info.ephyra.search.Result;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.regex.Pattern;
/**
 * <p>A <code>KnowledgeAnnotator</code> for the Wikipedia online encyclopedia.
 * It answers a question for a definition by returning a sentence from the
 * corresponding Wikipedia web page.</p>
 *
 * <p>It runs as a separate thread, so several queries can be performed in
 * parallel.</p>
 *
 * <p>This class extends the class <code>KnowledgeAnnotator</code>.</p>
 *
 * @author Nico Schlaefer
 * @version 2005-09-28
 */
public class WikipediaKA extends KnowledgeAnnotator {
    /**
     * The URL of the Wikipedia search page. The URL-encoded search term is
     * appended to this prefix.
     */
    private static final String SEARCH_URL =
        "http://en.wikipedia.org/wiki/Special:Search?search=";

    /**
     * Name of the character encoding used both for encoding the query string
     * and for decoding the retrieved page.
     */
    private static final String ENCODING = "utf-8";

    /**
     * Protected constructor used by the <code>getCopy()</code> method.
     *
     * @param name name of the <code>KnowledgeAnnotator</code>
     * @param qPatterns question patterns
     * @param qContents descriptors of the relevant content of a question
     */
    protected WikipediaKA(String name, ArrayList<Pattern> qPatterns,
                          ArrayList<String> qContents) {
        super(name, qPatterns, qContents);
    }

    /**
     * Creates a <code>WikipediaKA</code> and calls the constructor of the
     * superclass that reads the question patterns from a file.
     *
     * @param filename file containing the question patterns
     * @throws IOException propagated from the superclass constructor
     */
    public WikipediaKA(String filename) throws IOException {
        super(filename);
    }

    /**
     * Queries Wikipedia for a definition and returns an array containing a
     * single <code>Result</code> object or an empty array, if the search
     * failed.
     *
     * <p>The page is scanned line by line. For each line that contains the
     * search term (case-insensitively), the first sentence extracted from
     * that line is tested: if it starts with the term, optionally preceded by
     * "a ", "an " or "the ", and ends with a period, it is returned as the
     * definition.</p>
     *
     * <p>Any exception raised while searching is reported through
     * <code>MsgPrinter.printSearchError()</code> and results in an empty
     * array; no exception is propagated to the caller.</p>
     *
     * @return array containing a single <code>Result</code> or an empty array
     */
    protected Result[] doSearch() {
        try {
            // compose URL for the search; the term is fully URL-encoded so
            // that characters such as '&', '#' or '%' cannot corrupt the
            // query string (blanks are still encoded as '+')
            String content = getContent();
            URL search =
                new URL(SEARCH_URL + URLEncoder.encode(content, ENCODING));

            // the term is matched literally: quoting prevents regex
            // metacharacters in the question content from changing the
            // pattern or causing a PatternSyntaxException; both patterns are
            // compiled once instead of once per line
            String term = Pattern.quote(content);
            Pattern containsTerm = Pattern.compile("(?i)" + term);
            Pattern definition =
                Pattern.compile("(?i)(an? |the )?" + term + ".*\\.");

            // retrieve document and extract answer sentence
            BufferedReader in = new BufferedReader(new InputStreamReader(
                search.openStream(), Charset.forName(ENCODING)));
            try {
                String line;
                // read until end of stream; ready() only tells whether a
                // read would block and is not an end-of-stream test
                while ((line = in.readLine()) != null) {
                    // line should contain the term
                    if (!containsTerm.matcher(line).find()) {
                        continue;
                    }

                    // extract first sentence, skipping lines that yield none
                    String[] sentences =
                        SentenceExtractor.getSentencesFromHtml(line);
                    if (sentences == null || sentences.length == 0
                            || sentences[0] == null) {
                        continue;
                    }
                    String sentence = sentences[0];

                    // sentence is really a definition of the term
                    if (definition.matcher(sentence).matches()) {
                        // create result from sentence
                        return getResult(sentence, search.toString());
                    }
                }
            } finally {
                // always release the connection, also on early return or
                // failure; a failing close must not discard a found result
                try {
                    in.close();
                } catch (IOException ignored) {
                    // nothing sensible can be done if closing fails
                }
            }
        } catch (Exception e) {
            MsgPrinter.printSearchError(e);  // print search error message
        }

        return new Result[0];  // search failed
    }

    /**
     * Returns a new instance of <code>WikipediaKA</code>. A new instance is
     * created for each query.
     *
     * @return new instance of <code>WikipediaKA</code>
     */
    public KnowledgeAnnotator getCopy() {
        return new WikipediaKA(name, qPatterns, qContents);
    }
}