package info.ephyra.search.searchers;
import info.ephyra.io.MsgPrinter;
import info.ephyra.search.Result;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lemurproject.indri.ParsedDocument;
import lemurproject.indri.QueryEnvironment;
import lemurproject.indri.ScoredExtentResult;
/**
 * <p>A <code>KnowledgeMiner</code> that deploys the Indri IR system to
 * search a local text corpus. Each search result is the text of one extent
 * returned by Indri.</p>
 *
 * <p>It runs as a separate thread, so several queries can be performed in
 * parallel.</p>
 *
 * <p>This class extends the class <code>KnowledgeMiner</code>.</p>
 *
 * @author Nico Schlaefer
 * @version 2007-07-26
 */
public class IndriKM extends KnowledgeMiner {
    /** Maximum total number of search results. */
    private static final int MAX_RESULTS_TOTAL = 20;
    /** Maximum number of search results per query. */
    private static final int MAX_RESULTS_PERQUERY = 20;
    /** Maximum number of documents fetched at a time. */
    private static final int MAX_DOCS = 20;

    /**
     * <p>Regular expression that matches characters that cause problems in
     * Indri queries and thus should be removed from query strings.</p>
     *
     * <p>Indri allows the following characters:
     * <ul>
     * <li>U+0080..U+00FF</li>
     * <li>'a'..'z'</li>
     * <li>'A'..'Z'</li>
     * <li>'0'..'9'</li>
     * <li>'_'</li>
     * <li>'-'</li>
     * <li>'.' (only allowed if in between digits)</li>
     * <li>whitespaces</li>
     * <li>'"'</li>
     * </ul>
     * However, for some of the special characters Indri fails to retrieve
     * results and therefore they are excluded: this expression keeps only
     * word characters (<code>\w</code>), '.', whitespace and '"', so '-' and
     * the range U+0080..U+00FF are dropped as well.
     * </p>
     */
    private static final String FORBIDDEN_CHAR = "[^\\w\\.\\s\"]";

    /** Compiled form of {@link #FORBIDDEN_CHAR}. */
    private static final Pattern FORBIDDEN_CHAR_PATTERN =
            Pattern.compile(FORBIDDEN_CHAR);

    /** Matches entity-like sequences such as <code>&amp;amp;</code>. */
    private static final Pattern ENTITY = Pattern.compile("&\\w++;");

    /**
     * A single operand of a boolean expression: a parenthesized group without
     * nested parentheses, a quoted phrase, or a run of characters that are
     * neither whitespace nor parentheses.
     */
    private static final String TERM =
            "(\\([^\\(\\)]*+\\)|\"[^\"]*+\"|[^\\s\\(\\)]++)";

    /** Two or more operands joined by <code>" OR "</code>. */
    private static final Pattern OR_CHAIN =
            Pattern.compile("(" + TERM + " OR )++" + TERM);

    /** Two or more operands joined by <code>" AND "</code>. */
    private static final Pattern AND_CHAIN =
            Pattern.compile("(" + TERM + " AND )++" + TERM);

    /** The keyword OR preceded by a blank and not followed by a word character. */
    private static final Pattern OR_KEYWORD = Pattern.compile(" OR\\b");

    /** The keyword AND preceded by a blank and not followed by a word character. */
    private static final Pattern AND_KEYWORD = Pattern.compile(" AND\\b");

    /** A phrase in double quotes; group 1 is the text between the quotes. */
    private static final Pattern PHRASE = Pattern.compile("\"([^\"]*+)\"");

    /** Directories of Indri indices. */
    private String[] indriDirs;
    /** URLs of Indri servers. */
    private String[] indriUrls;

    /**
     * Gets a list of all Indri index directories that have been specified with
     * the settings 'INDRI_INDEX', 'INDRI_INDEX2', 'INDRI_INDEX3' etc. Each
     * setting is read from the system property of that name; if the property
     * is missing or empty, the environment variable of the same name is used
     * instead. One setting can specify multiple indices (separated by ';')
     * which are queried with the same knowledge miner.
     *
     * <p>The numbered settings are read in order starting at 2 and reading
     * stops at the first one that is missing or empty. A missing 'INDRI_INDEX'
     * does not prevent 'INDRI_INDEX2' from being read.</p>
     *
     * @return Indri index directories grouped by knowledge miners
     */
    public static String[][] getIndriIndices() {
        return collectLocations("INDRI_INDEX", true);
    }

    /**
     * Gets a list of all Indri server URLs that have been specified with
     * environment variables 'INDRI_SERVER', 'INDRI_SERVER2', 'INDRI_SERVER3'
     * etc. One environment variable can specify multiple servers (separated
     * by ';') which are queried with the same knowledge miner.
     *
     * @return Indri server URLs grouped by knowledge miners
     */
    public static String[][] getIndriServers() {
        return collectLocations("INDRI_SERVER", false);
    }

    /**
     * Reads the setting <code>baseKey</code> and then <code>baseKey2</code>,
     * <code>baseKey3</code>, ... until a numbered setting is missing or empty,
     * splitting each value at ';'.
     *
     * @param baseKey name of the first setting
     * @param checkProperties whether system properties are consulted before
     *                        environment variables
     * @return one array of locations per setting that was found
     */
    private static String[][] collectLocations(String baseKey,
            boolean checkProperties) {
        ArrayList<String[]> groups = new ArrayList<String[]>();

        String value = lookupSetting(baseKey, checkProperties);
        if (value != null && value.length() > 0) {
            groups.add(value.split(";"));
        }

        for (int i = 2; ; i++) {
            value = lookupSetting(baseKey + i, checkProperties);
            if (value == null || value.length() == 0) {
                break;
            }
            groups.add(value.split(";"));
        }

        return groups.toArray(new String[groups.size()][]);
    }

    /**
     * Looks up a single setting.
     *
     * @param key name of the setting
     * @param checkProperties if <code>true</code>, a non-empty system property
     *                        takes precedence over the environment variable
     * @return the value, or <code>null</code> if it is not defined
     */
    private static String lookupSetting(String key, boolean checkProperties) {
        if (checkProperties) {
            String property = System.getProperty(key);
            if (property != null && property.length() > 0) {
                return property;
            }
        }
        return System.getenv(key);
    }

    /**
     * Returns a representation of the query string that is suitable for Indri.
     *
     * <p>The transformation removes entity-like sequences and unsupported
     * characters, keeps '.' only between two digits, rewrites
     * <code>a OR b</code> as <code>#or(a b)</code>, <code>a AND b</code> as
     * <code>#combine(a b)</code> and <code>"a b"</code> as
     * <code>#1(a b)</code>, and finally wraps the whole query in
     * <code>#combine(...)</code>.</p>
     *
     * @param qs query string, must not be <code>null</code>
     * @return query string for Indri
     */
    public static String transformQueryString(String qs) {
        // drop characters that are not properly supported by Indri
        qs = ENTITY.matcher(qs).replaceAll(" ");
        qs = FORBIDDEN_CHAR_PATTERN.matcher(qs).replaceAll(" ");

        // '.' is only allowed in between digits
        StringBuilder kept = new StringBuilder(qs.length());
        int last = qs.length() - 1;
        for (int i = 0; i <= last; i++) {
            char c = qs.charAt(i);
            boolean betweenDigits = i > 0 && i < last
                    && Character.isDigit(qs.charAt(i - 1))
                    && Character.isDigit(qs.charAt(i + 1));
            if (c != '.' || betweenDigits) {
                kept.append(c);
            }
        }
        qs = kept.toString();

        // NOTE(review): an #or(...) group created by the first pass is not
        // treated as a single operand by the AND pass, so a query that mixes
        // both operators (e.g. "x AND a OR b") is not rewritten cleanly.

        // replace ... OR ... by #or(... ...)
        qs = wrapMatches(OR_CHAIN, qs, "#or", 0);
        qs = OR_KEYWORD.matcher(qs).replaceAll("");

        // replace ... AND ... by #combine(... ...)
        qs = wrapMatches(AND_CHAIN, qs, "#combine", 0);
        qs = AND_KEYWORD.matcher(qs).replaceAll("");

        // replace "..." by #1(...)
        qs = wrapMatches(PHRASE, qs, "#1", 1);

        return "#combine(" + qs + ")";
    }

    /**
     * Replaces every match of <code>pattern</code> in <code>text</code> by
     * <code>operator(group)</code> in a single left-to-right pass, so that
     * each match is rewritten exactly once and text outside the matches is
     * left untouched.
     *
     * @param pattern expression to search for
     * @param text text to rewrite
     * @param operator Indri operator placed in front of the parentheses
     * @param group index of the capturing group that becomes the operand list
     * @return the rewritten text
     */
    private static String wrapMatches(Pattern pattern, String text,
            String operator, int group) {
        Matcher m = pattern.matcher(text);
        StringBuffer out = new StringBuffer(text.length() + 16);
        while (m.find()) {
            String wrapped = operator + "(" + m.group(group) + ")";
            // quote the replacement so that it is always inserted literally
            m.appendReplacement(out, Matcher.quoteReplacement(wrapped));
        }
        m.appendTail(out);
        return out.toString();
    }

    /**
     * Creates a new Indri knowledge miner and sets the directories of indices
     * or the URLs of servers.
     *
     * @param locations directories of indices or URLs of servers
     * @param isServers <code>true</code> iff the first parameter provides URLs
     *                  of servers
     */
    public IndriKM(String[] locations, boolean isServers) {
        if (isServers) {
            indriUrls = locations;
        } else {
            indriDirs = locations;
        }
    }

    /**
     * Returns the maximum total number of search results.
     *
     * @return maximum total number of search results
     */
    protected int getMaxResultsTotal() {
        return MAX_RESULTS_TOTAL;
    }

    /**
     * Returns the maximum number of search results per query.
     *
     * @return maximum number of search results per query
     */
    protected int getMaxResultsPerQuery() {
        return MAX_RESULTS_PERQUERY;
    }

    /**
     * Queries the Indri indices or servers and returns an array containing up
     * to <code>MAX_RESULTS_PERQUERY</code> search results.
     *
     * <p>Index directories take precedence over server URLs. If neither is
     * configured, an error is printed and the JVM is terminated. The query
     * environment is closed whether or not the search succeeds.</p>
     *
     * @return Indri search results, or <code>null</code> if the search failed
     */
    protected Result[] doSearch() {
        QueryEnvironment env = null;
        try {
            // create query environment
            env = new QueryEnvironment();

            // add Indri indices or servers
            if (indriDirs != null && indriDirs.length > 0) {
                for (String indriDir : indriDirs) {
                    env.addIndex(indriDir);
                }
            } else if (indriUrls != null && indriUrls.length > 0) {
                for (String indriUrl : indriUrls) {
                    env.addServer(indriUrl);
                }
            } else {
                MsgPrinter.printErrorMsg("Directories of Indri indices or " +
                        "URLs of Indri servers required.");
                System.exit(1);
            }

            // run an Indri query, returning up to MAX_RESULTS_PERQUERY results
            String indriQuery = transformQueryString(query.getQueryString());
            System.out.println("@@@@@@@@@@" + indriQuery);
            ScoredExtentResult[] results =
                    env.runQuery(indriQuery, MAX_RESULTS_PERQUERY);

            // get passages, fetching at most MAX_DOCS documents at a time
            // (for memory efficiency)
            String[] passages = new String[results.length];
            for (int i = 0; i < results.length; i += MAX_DOCS) {
                int batchSize = Math.min(MAX_DOCS, results.length - i);
                ScoredExtentResult[] partResults =
                        new ScoredExtentResult[batchSize];
                System.arraycopy(results, i, partResults, 0, batchSize);

                ParsedDocument[] documents = env.documents(partResults);
                for (int j = 0; j < batchSize; j++) {
                    // the extent is given in term positions; the positions
                    // table of the document maps them to offsets in the text
                    int passageBegin = partResults[j].begin;
                    int passageEnd = partResults[j].end;
                    int byteBegin = documents[j].positions[passageBegin].begin;
                    int byteEnd = documents[j].positions[passageEnd - 1].end;

                    byte[] doc = documents[j].text.getBytes("UTF-8");
                    passages[i + j] = extractPassage(doc, byteBegin, byteEnd);
                }
            }

            // get document numbers
            String[] docNos = env.documentMetadata(results, "docno");

            // return results
            return getResults(passages, docNos, false);
        } catch (Exception e) {
            MsgPrinter.printSearchError(e);  // print search error message
            MsgPrinter.printErrorMsg("\nSearch failed.");
            return null;
        } finally {
            // close query environment, also if the search failed
            closeQuietly(env);
        }
    }

    /**
     * Decodes the bytes <code>[byteBegin, byteEnd)</code> of a UTF-8 encoded
     * document. Offsets beyond the end of the document are clamped to its
     * length so that the passage never contains padding.
     *
     * @param doc UTF-8 encoded document text
     * @param byteBegin offset of the first byte of the passage
     * @param byteEnd offset after the last byte of the passage
     * @return the passage text
     * @throws java.io.UnsupportedEncodingException if UTF-8 is not supported
     */
    private static String extractPassage(byte[] doc, int byteBegin,
            int byteEnd) throws java.io.UnsupportedEncodingException {
        int end = Math.min(byteEnd, doc.length);
        int begin = Math.min(byteBegin, end);
        return new String(doc, begin, end - begin, "UTF-8");
    }

    /**
     * Closes a query environment and reports, but does not propagate, any
     * failure to do so.
     *
     * @param env query environment to close, may be <code>null</code>
     */
    private static void closeQuietly(QueryEnvironment env) {
        if (env == null) {
            return;
        }
        try {
            env.close();
        } catch (Exception e) {
            MsgPrinter.printSearchError(e);
        }
    }

    /**
     * Returns a new instance of <code>IndriKM</code>. A new instance is created
     * for each query.
     *
     * @return new instance of <code>IndriKM</code>
     */
    public KnowledgeMiner getCopy() {
        if (indriDirs != null) {
            return new IndriKM(indriDirs, false);
        }
        return new IndriKM(indriUrls, true);
    }
}