package info.ephyra.search.searchers;
import info.ephyra.io.MsgPrinter;
import info.ephyra.search.Result;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lemurproject.indri.ParsedDocument;
import lemurproject.indri.QueryEnvironment;
import lemurproject.indri.ScoredExtentResult;
/**
* <p>A <code>KnowledgeMiner</code> that deploys the Indri IR system to
* search a local text corpus. In contrast to <code>IndriKM</code>, whole
* documents are returned instead of paragraphs.</p>
*
* <p>It runs as a separate thread, so several queries can be performed in
* parallel.</p>
*
* <p>This class extends the class <code>KnowledgeMiner</code>.</p>
*
* @author Nico Schlaefer
* @version 2007-07-26
*/
public class IndriDocumentKM extends KnowledgeMiner {
	/** Maximum total number of search results. */
	private static final int MAX_RESULTS_TOTAL = 20;
	/** Maximum number of search results per query. */
	private static final int MAX_RESULTS_PERQUERY = 20;
	/** Maximum number of documents fetched at a time. */
	private static final int MAX_DOCS = 20;

	/**
	 * <p>Regular expression that matches characters that cause problems in
	 * Indri queries and thus should be removed from query strings.</p>
	 *
	 * <p>Indri allows the following characters:
	 * <ul>
	 * <li>'\u0080'..'\u00ff'</li>
	 * <li>'a'..'z'</li>
	 * <li>'A'..'Z'</li>
	 * <li>'0'..'9'</li>
	 * <li>'_'</li>
	 * <li>'-'</li>
	 * <li>'.' (only allowed if in between digits)</li>
	 * <li>whitespaces</li>
	 * <li>'"'</li>
	 * </ul>
	 * However, for some of the special characters Indri fails to retrieve
	 * results and therefore they are excluded: this expression only keeps
	 * word characters as defined by <code>\w</code>, '.', whitespaces and
	 * '"'.
	 * </p>
	 */
	private static final String FORBIDDEN_CHAR = "[^\\w\\.\\s\"]";

	/**
	 * Regular expression for a single operand of an <code>OR</code> or
	 * <code>AND</code> expression: a parenthesized group (optionally prefixed
	 * with an operator name such as <code>#or</code>, so that the output of
	 * an earlier rewriting pass is treated as one operand), a quoted phrase or
	 * a token without whitespaces and parentheses.
	 */
	private static final String TERM =
		"(?:(?:#\\w++)?\\([^\\(\\)]*+\\)|\"[^\"]*+\"|[^\\s\\(\\)]++)";

	/** Matches character entities such as <code>&amp;amp;</code>. */
	private static final Pattern ENTITY = Pattern.compile("&\\w++;");
	/** Compiled form of <code>FORBIDDEN_CHAR</code>. */
	private static final Pattern FORBIDDEN = Pattern.compile(FORBIDDEN_CHAR);
	/** Matches a sequence of operands separated by <code>" OR "</code>. */
	private static final Pattern OR_SEQUENCE =
		Pattern.compile("(?:" + TERM + " OR )++" + TERM);
	/** Matches a sequence of operands separated by <code>" AND "</code>. */
	private static final Pattern AND_SEQUENCE =
		Pattern.compile("(?:" + TERM + " AND )++" + TERM);
	/**
	 * Matches the keyword <code>OR</code> after a blank, but not the prefix of
	 * a longer token such as <code>ORANGE</code>.
	 */
	private static final Pattern OR_KEYWORD = Pattern.compile(" OR(?!\\w)");
	/**
	 * Matches the keyword <code>AND</code> after a blank, but not the prefix
	 * of a longer token such as <code>ANDES</code>.
	 */
	private static final Pattern AND_KEYWORD = Pattern.compile(" AND(?!\\w)");
	/** Matches a quoted phrase, capturing the text between the quotes. */
	private static final Pattern PHRASE = Pattern.compile("\"([^\"]*+)\"");

	/** Directories of Indri indices. */
	private String[] indriDirs;
	/** URLs of Indri servers. */
	private String[] indriUrls;

	/**
	 * Replaces each match of the pattern in place by
	 * <code>prefix + group + ")"</code>.
	 *
	 * @param pattern pattern to search for
	 * @param qs string to rewrite
	 * @param prefix text that is put in front of the group, e.g.
	 *               <code>"#or("</code>
	 * @param group index of the capturing group that is wrapped
	 * @return rewritten string
	 */
	private static String wrap(Pattern pattern, String qs, String prefix,
			int group) {
		Matcher m = pattern.matcher(qs);
		// StringBuffer overloads are used for compatibility with old JDKs
		StringBuffer rewritten = new StringBuffer(qs.length() + 16);
		while (m.find()) {
			// each match is rewritten exactly once at its own position;
			// the replacement is quoted so that it is inserted literally
			String wrapped = prefix + m.group(group) + ")";
			m.appendReplacement(rewritten, Matcher.quoteReplacement(wrapped));
		}
		m.appendTail(rewritten);
		return rewritten.toString();
	}

	/**
	 * Returns a representation of the query string that is suitable for Indri.
	 * <ul>
	 * <li>Character entities and unsupported characters are replaced by
	 *     blanks, and '.' is dropped unless it is in between digits.</li>
	 * <li><code>a OR b</code> is transformed into <code>#or(a b)</code>.</li>
	 * <li><code>a AND b</code> is transformed into
	 *     <code>#combine(a b)</code>; an <code>#or(...)</code> group is
	 *     treated as a single operand.</li>
	 * <li><code>"a b"</code> is transformed into <code>#1(a b)</code>.</li>
	 * </ul>
	 *
	 * @param qs query string, must not be <code>null</code>
	 * @return query string for Indri
	 */
	public static String transformQueryString(String qs) {
		// drop characters that are not properly supported by Indri
		qs = ENTITY.matcher(qs).replaceAll(" ");
		qs = FORBIDDEN.matcher(qs).replaceAll(" ");

		// '.' is only allowed in between digits
		int length = qs.length();
		StringBuilder kept = new StringBuilder(length);
		for (int i = 0; i < length; i++) {
			char c = qs.charAt(i);
			boolean betweenDigits = i > 0 && i < length - 1
				&& Character.isDigit(qs.charAt(i - 1))
				&& Character.isDigit(qs.charAt(i + 1));
			if (c != '.' || betweenDigits) kept.append(c);
		}
		qs = kept.toString();

		// replace ... OR ... by #or(... ...)
		qs = wrap(OR_SEQUENCE, qs, "#or(", 0);
		qs = OR_KEYWORD.matcher(qs).replaceAll("");

		// replace ... AND ... by #combine(... ...)
		qs = wrap(AND_SEQUENCE, qs, "#combine(", 0);
		qs = AND_KEYWORD.matcher(qs).replaceAll("");

		// replace "..." by #1(...)
		return wrap(PHRASE, qs, "#1(", 1);
	}

	/**
	 * Creates a new Indri knowledge miner and sets the directories of indices
	 * or the URLs of servers.
	 *
	 * @param locations directories of indices or URLs of servers
	 * @param isServers <code>true</code> iff the first parameter provides URLs
	 *                  of servers
	 */
	public IndriDocumentKM(String[] locations, boolean isServers) {
		if (isServers) indriUrls = locations;
		else indriDirs = locations;
	}

	/**
	 * Returns the maximum total number of search results.
	 *
	 * @return maximum total number of search results
	 */
	protected int getMaxResultsTotal() {
		return MAX_RESULTS_TOTAL;
	}

	/**
	 * Returns the maximum number of search results per query.
	 *
	 * @return maximum number of search results per query
	 */
	protected int getMaxResultsPerQuery() {
		return MAX_RESULTS_PERQUERY;
	}

	/**
	 * Queries the Indri indices or servers and returns an array containing up
	 * to <code>MAX_RESULTS_PERQUERY</code> search results. The results are
	 * whole documents.
	 *
	 * <p>If neither indices nor servers have been set or if the search fails,
	 * an error message is printed and the JVM is terminated with exit code
	 * 1.</p>
	 *
	 * @return Indri search results
	 */
	protected Result[] doSearch() {
		try {
			// create query environment
			QueryEnvironment env = new QueryEnvironment();

			// add Indri indices or servers (indices take precedence)
			if (indriDirs != null && indriDirs.length > 0) {
				for (String indriDir : indriDirs) env.addIndex(indriDir);
			} else if (indriUrls != null && indriUrls.length > 0) {
				for (String indriUrl : indriUrls) env.addServer(indriUrl);
			} else {
				MsgPrinter.printErrorMsg("Directories of Indri indices or " +
						"URLs of Indri servers required.");
				System.exit(1);
			}

			// run an Indri query, returning up to MAX_RESULTS_PERQUERY results
			ScoredExtentResult[] results =
				env.runQuery(transformQueryString(query.getQueryString()),
						MAX_RESULTS_PERQUERY);

			// get documents and document numbers
			String[] docs = new String[results.length];
			for (int start = 0; start < results.length; start += MAX_DOCS) {
				// fetch MAX_DOCS documents at a time (for memory efficiency)
				int count = Math.min(MAX_DOCS, results.length - start);
				ScoredExtentResult[] partResults =
					new ScoredExtentResult[count];
				System.arraycopy(results, start, partResults, 0, count);

				ParsedDocument[] documents = env.documents(partResults);
				for (int j = 0; j < count; j++)
					docs[start + j] = documents[j].text;
			}
			String[] docNos = env.documentMetadata(results, "docno");

			// close query environment
			env.close();

			// return results
			return getResults(docs, docNos, false);
		} catch (Exception e) {
			MsgPrinter.printSearchError(e);  // print search error message

			MsgPrinter.printErrorMsg("\nSearch failed.");
			System.exit(1);

			return null;  // only required by the compiler
		}
	}

	/**
	 * Returns a new instance of <code>IndriDocumentKM</code>. A new instance is
	 * created for each query.
	 *
	 * @return new instance of <code>IndriDocumentKM</code>
	 */
	public KnowledgeMiner getCopy() {
		if (indriDirs != null)
			return new IndriDocumentKM(indriDirs, false);
		else
			return new IndriDocumentKM(indriUrls, true);
	}
}