package info.ephyra.search.searchers;
import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.SentenceExtractor;
import info.ephyra.search.Result;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* <p>A <code>KnowledgeAnnotator</code> for the CIA World Factbook. It answers a
* question about a country by extracting the information from the web page for
* that country.</p>
*
* <p>It runs as a separate thread, so several queries can be performed in
* parallel.</p>
*
* <p>This class extends the class <code>KnowledgeAnnotator</code>.</p>
*
* @author Nico Schlaefer
* @version 2005-09-29
*/
public class WorldFactbookKA extends KnowledgeAnnotator {
/** The URL of the CIA World Factbook. */
private static final String URL =
"https://www.cia.gov/library/publications/the-world-factbook/";
/** Country names and corresponding web pages. */
private Hashtable<String, String> countries =
new Hashtable<String, String>();
/**
* Protected constructor used by the <code>getCopy()</code> method.
*
* @param name name of the <code>KnowledgeAnnotator</code>
* @param qPatterns question patterns
* @param qContents descriptors of the relevant content of a question
*/
protected WorldFactbookKA(String name, ArrayList<Pattern> qPatterns,
ArrayList<String> qContents) {
super(name, qPatterns, qContents);
}
/**
* <p>Creates a <code>WorldFactbookKA</code> and calls the constructor of
* the superclass that reads the question patterns from a file.</p>
*
* <p>Furthermore, a list of the available countries and the URLs of the
* corresponding web pages are retrieved from the Factbook.</p>
*
* @param filename file containing the question patterns
*/
public WorldFactbookKA(String filename) throws IOException {
super(filename);
try {
URL factbook = new URL(URL); // URL of the main page
BufferedReader in;
String line;
in = new BufferedReader(new InputStreamReader(factbook.openStream(),
Charset.forName("iso-8859-1")));
Pattern p = Pattern.compile(".*<option\\s*value=\"(.*)\"\\s*>(.*)" +
"</option>.*");
Matcher m;
while (in.ready()) {
line = in.readLine();
m = p.matcher(line);
if (m.matches()) // (country, url) pair found
countries.put(m.group(2).toLowerCase(), m.group(1));
}
in.close();
}
catch (Exception e) {
MsgPrinter.printSearchError(e); // print search error message
}
}
/**
* Searches the World Factbook for country details and returns an array
* containing a single <code>Result</code> object or an empty array, if the
* search failed.
*
* @return array containing a single <code>Result</code> or an empty array
*/
protected Result[] doSearch() {
try {
// get country name and demanded information
String[] content = getContent().split("#");
String info = content[0];
String country = content[1];
// get URL of country web page
String countryPage = countries.get(country.toLowerCase());
if (countryPage == null) return new Result[0];
URL page = new URL(URL + countryPage);
// retrieve document
BufferedReader in;
String html = "";
in = new BufferedReader(new InputStreamReader(page.openStream(),
Charset.forName("iso-8859-1")));
while (in.ready()) {
html += in.readLine() + " ";
}
in.close();
// extract information
Pattern p = Pattern.compile("(?i).*" + info + ":</div>\\s*</td>" +
"\\s*<td .*?>(.*?)</td>.*");
Matcher m = p.matcher(html);
if (m.matches()) {
// extract sentence
String sentence = SentenceExtractor.getSentencesFromHtml(m.group(1))[0];
// create result from that sentence
return getResult(sentence, page.toString());
}
}
catch (Exception e) {
MsgPrinter.printSearchError(e); // print search error message
}
return new Result[0]; // search failed
}
/**
* Returns a new instance of <code>WorldFactbookKA</code>. A new instance is
* created for each query.
*
* @return new instance of <code>WorldFactbookKA</code>
*/
public KnowledgeAnnotator getCopy() {
WorldFactbookKA ka = new WorldFactbookKA(name, qPatterns, qContents);
ka.countries = countries; // also copy list of countries
return ka;
}
}