IndriKM.java example

Explorer
lucida-master
- lucida
package info.ephyra.search.searchers;

import info.ephyra.io.MsgPrinter;
import info.ephyra.search.Result;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import lemurproject.indri.ParsedDocument;
import lemurproject.indri.QueryEnvironment;
import lemurproject.indri.ScoredExtentResult;

/**
 * <p>A <code>KnowledgeMiner</code> that deploys the Indri IR system to
 * search a local text corpus. The search results are paragraphs.</p>
 * 
 * <p>It runs as a separate thread, so several queries can be performed in
 * parallel.</p>
 * 
 * <p>This class extends the class <code>KnowledgeMiner</code>.</p>
 * 
 * @author Nico Schlaefer
 * @version 2007-07-26
 */
public class IndriKM extends KnowledgeMiner {
	/** Maximum total number of search results. */
	private static final int MAX_RESULTS_TOTAL = 20;
	/** Maximum number of search results per query. */
	private static final int MAX_RESULTS_PERQUERY = 20;
	/** Maximum number of documents fetched at a time. */
	private static final int MAX_DOCS = 20;
	/**
	 * <p>Regular expression that matches characters that cause problems in
	 * Indri queries and thus should be removed from query strings.</p>
	 * 
	 * <p>Indri allows the following characters:
	 * <ul>
	 * <li>'\u0080'..'\u00ff'</li>
	 * <li>'a'..'z'</li>
	 * <li>'A'..'Z'</li>
	 * <li>'0'..'9'</li>
	 * <li>'_'</li>
	 * <li>'-'</li>
	 * <li>'.' (only allowed if in between digits)</li>
	 * <li>whitespaces</li>
	 * <li>'"'</li>
	 * </ul>
	 * However, for some of the special characters Indri fails to retrieve
	 * results and therefore they are excluded.
	 * </p>
	 */
	private static final String FORBIDDEN_CHAR = "[^\\w\\.\\s\"]";
	
	/** Matches character entities such as <code>&amp;amp;</code>. */
	private static final Pattern ENTITY = Pattern.compile("&\\w++;");
	/** Compiled form of {@link #FORBIDDEN_CHAR}. */
	private static final Pattern FORBIDDEN = Pattern.compile(FORBIDDEN_CHAR);
	/**
	 * A single operand of a boolean expression: an Indri operator call such as
	 * <code>#or(...)</code>, a parenthesized group, a quoted phrase or a plain
	 * token. The operator-call alternative must come first, otherwise
	 * <code>#or</code> and its argument list are matched as separate operands.
	 */
	private static final String OPERAND =
		"(?:#\\w++\\([^()]*+\\)|\\([^()]*+\\)|\"[^\"]*+\"|[^\\s()]++)";
	/** Matches a chain of operands joined by " OR ". */
	private static final Pattern OR_GROUP =
		Pattern.compile("(?:" + OPERAND + " OR )++" + OPERAND);
	/** Matches a chain of operands joined by " AND ". */
	private static final Pattern AND_GROUP =
		Pattern.compile("(?:" + OPERAND + " AND )++" + OPERAND);
	/** Matches the keyword " OR", but not the beginning of a longer word. */
	private static final Pattern OR_KEYWORD = Pattern.compile(" OR(?!\\w)");
	/** Matches the keyword " AND", but not the beginning of a longer word. */
	private static final Pattern AND_KEYWORD = Pattern.compile(" AND(?!\\w)");
	/** Matches a phrase in double quotes; group 1 is the quoted text. */
	private static final Pattern PHRASE = Pattern.compile("\"([^\"]*+)\"");
	
	/** Directories of Indri indices (<code>null</code> if servers are used). */
	private final String[] indriDirs;
	/** URLs of Indri servers (<code>null</code> if indices are used). */
	private final String[] indriUrls;
	
	/**
	 * Splits a ';'-separated list of locations and appends it as one group.
	 * 
	 * @param groups list that receives the group
	 * @param spec ';'-separated locations, may be <code>null</code> or empty
	 * @return <code>true</code> iff <code>spec</code> was non-empty and a
	 *         group has been added
	 */
	private static boolean addLocationGroup(ArrayList<String[]> groups,
			String spec) {
		if (spec == null || spec.length() == 0) return false;
		
		groups.add(spec.split(";"));
		return true;
	}
	
	/**
	 * Gets a list of all Indri index directories that have been specified with
	 * system property 'INDRI_INDEX', 'INDRI_INDEX2', 'INDRI_INDEX3' etc.
	 * One property can specify multiple indices (separated by ';') which are
	 * queried with the same knowledge miner. The numbered entries are scanned
	 * in ascending order and the scan stops at the first one that is not set.
	 * For backward compatibility a numbered entry that is not set as a system
	 * property is looked up as an environment variable of the same name.
	 * Note: The system property "INDRI_INDEX" is set in lucida.handler.QAServiceHandler.
	 * 
	 * @return Indri index directories grouped by knowledge miners
	 */
	public static String[][] getIndriIndices() {
		ArrayList<String[]> indices = new ArrayList<String[]>();
		
		// the primary entry is optional and does not end the scan if missing
		addLocationGroup(indices, System.getProperty("INDRI_INDEX"));
		for (int i = 2; ; i++) {
			String name = "INDRI_INDEX" + i;
			String spec = System.getProperty(name);
			if (spec == null || spec.length() == 0)
				spec = System.getenv(name);  // legacy configuration
			if (!addLocationGroup(indices, spec)) break;
		}
		
		return indices.toArray(new String[indices.size()][]);
	}
	
	/**
	 * Gets a list of all Indri server URLs that have been specified with
	 * environment variables 'INDRI_SERVER', 'INDRI_SERVER2', 'INDRI_SERVER3'
	 * etc. One environment variable can specify multiple servers (separated by
	 * ';') which are queried with the same knowledge miner. The numbered
	 * variables are scanned in ascending order and the scan stops at the first
	 * one that is not set.
	 * 
	 * @return Indri server URLs grouped by knowledge miners
	 */
	public static String[][] getIndriServers() {
		ArrayList<String[]> servers = new ArrayList<String[]>();
		
		// the primary entry is optional and does not end the scan if missing
		addLocationGroup(servers, System.getenv("INDRI_SERVER"));
		for (int i = 2; ; i++)
			if (!addLocationGroup(servers, System.getenv("INDRI_SERVER" + i)))
				break;
		
		return servers.toArray(new String[servers.size()][]);
	}
	
	/**
	 * Removes all dots except those that are enclosed by two digits.
	 * 
	 * @param s string to clean
	 * @return string without stray dots
	 */
	private static String dropStrayDots(String s) {
		int length = s.length();
		StringBuilder kept = new StringBuilder(length);
		
		for (int i = 0; i < length; i++) {
			char c = s.charAt(i);
			boolean decimalPoint = c == '.' && i > 0 && i < length - 1 &&
				Character.isDigit(s.charAt(i - 1)) &&
				Character.isDigit(s.charAt(i + 1));
			if (c != '.' || decimalPoint) kept.append(c);
		}
		
		return kept.toString();
	}
	
	/**
	 * Wraps each match of a pattern into an Indri operator call. The text is
	 * rewritten by match position, so each match is wrapped exactly once even
	 * if the same text occurs several times.
	 * 
	 * @param text text to rewrite
	 * @param pattern pattern that identifies the spans to wrap
	 * @param opening opening of the operator call, e.g. <code>"#or("</code>
	 * @param group capturing group whose text is placed inside the call
	 * @return rewritten text
	 */
	private static String wrapMatches(String text, Pattern pattern,
			String opening, int group) {
		StringBuilder rewritten = new StringBuilder(text.length() + 16);
		Matcher m = pattern.matcher(text);
		int copiedUpTo = 0;
		
		while (m.find()) {
			rewritten.append(text, copiedUpTo, m.start());
			rewritten.append(opening).append(m.group(group)).append(')');
			copiedUpTo = m.end();
		}
		rewritten.append(text, copiedUpTo, text.length());
		
		return rewritten.toString();
	}
	
	/**
	 * Returns a representation of the query string that is suitable for Indri.
	 * Character entities and unsupported characters are replaced by blanks,
	 * dots are only kept between digits, <code>... OR ...</code> becomes
	 * <code>#or(... ...)</code>, <code>... AND ...</code> becomes
	 * <code>#combine(... ...)</code>, <code>"..."</code> becomes
	 * <code>#1(...)</code> and the result is wrapped in
	 * <code>#combine(...)</code>. OR groups are formed before AND groups, so
	 * <code>a AND b OR c</code> yields <code>#combine(a #or(b c))</code>.
	 * 
	 * @param qs query string, must not be <code>null</code>
	 * @return query string for Indri
	 */
	public static String transformQueryString(String qs) {
		// drop characters that are not properly supported by Indri
		// ('.' is only allowed in between digits)
		qs = ENTITY.matcher(qs).replaceAll(" ");
		qs = FORBIDDEN.matcher(qs).replaceAll(" ");
		qs = dropStrayDots(qs);
		
		// replace ... OR ... by #or(... ...)
		qs = wrapMatches(qs, OR_GROUP, "#or(", 0);
		qs = OR_KEYWORD.matcher(qs).replaceAll("");
		
		// replace ... AND ... by #combine(... ...)
		qs = wrapMatches(qs, AND_GROUP, "#combine(", 0);
		qs = AND_KEYWORD.matcher(qs).replaceAll("");
		
		// replace "..." by #1(...)
		qs = wrapMatches(qs, PHRASE, "#1(", 1);
		
		// form the final query
		return "#combine(" + qs + ")";
	}
	
	/**
	 * Cuts a passage out of a document. The offsets refer to the UTF-8
	 * encoding of the document text and are clamped to its length, so an
	 * out-of-range passage is shortened instead of being padded or causing an
	 * error.
	 * 
	 * @param docText text of the document
	 * @param byteBegin offset of the first byte of the passage
	 * @param byteEnd offset of the byte after the passage
	 * @return passage text
	 * @throws java.io.UnsupportedEncodingException if UTF-8 is not supported
	 */
	private static String extractPassage(String docText, int byteBegin,
			int byteEnd) throws java.io.UnsupportedEncodingException {
		byte[] doc = docText.getBytes("UTF-8");
		int begin = Math.max(0, Math.min(byteBegin, doc.length));
		int end = Math.max(begin, Math.min(byteEnd, doc.length));
		
		// decode with the same charset that was used for encoding
		return new String(doc, begin, end - begin, "UTF-8");
	}
	
	/**
	 * Closes a query environment and reports a failure to do so instead of
	 * propagating it.
	 * 
	 * @param env query environment, may be <code>null</code>
	 */
	private static void closeQuietly(QueryEnvironment env) {
		if (env == null) return;
		
		try {
			env.close();
		} catch (Exception e) {
			MsgPrinter.printSearchError(e);
		}
	}
	
	/**
	 * Creates a new Indri knowledge miner and sets the directories of indices
	 * or the URLs of servers.
	 * 
	 * @param locations directories of indices or URLs of servers
	 * @param isServers <code>true</code> iff the first parameter provides URLs
	 *                  of servers
	 */
	public IndriKM(String[] locations, boolean isServers) {
		this.indriUrls = isServers ? locations : null;
		this.indriDirs = isServers ? null : locations;
	}
	
	/**
	 * Returns the maximum total number of search results.
	 * 
	 * @return maximum total number of search results
	 */
	protected int getMaxResultsTotal() {
		return MAX_RESULTS_TOTAL;
	}
	
	/**
	 * Returns the maximum number of search results per query.
	 * 
	 * @return maximum number of search results per query
	 */
	protected int getMaxResultsPerQuery() {
		return MAX_RESULTS_PERQUERY;
	}
	
	/**
	 * Queries the Indri indices or servers and returns an array containing up
	 * to <code>MAX_RESULTS_PERQUERY</code> search results. The query
	 * environment is closed in all cases. If neither indices nor servers have
	 * been configured, an error is printed and the JVM is terminated.
	 * 
	 * @return Indri search results or <code>null</code> if the search failed
	 */
	protected Result[] doSearch() {
		QueryEnvironment env = null;
		try {
			// create query environment
			env = new QueryEnvironment();
			
			// add Indri indices or servers
			if (indriDirs != null && indriDirs.length > 0) {
				for (String indriDir : indriDirs) env.addIndex(indriDir);
			} else if (indriUrls != null && indriUrls.length > 0) {
				for (String indriUrl : indriUrls) env.addServer(indriUrl);
			} else {
				MsgPrinter.printErrorMsg("Directories of Indri indices or " +
						"URLs of Indri servers required.");
				System.exit(1);
			}
			
			// run an Indri query, returning up to MAX_RESULTS_PERQUERY results
			String indriQuery = transformQueryString(query.getQueryString());
			System.out.println("@@@@@@@@@@" + indriQuery);  // debug trace
			ScoredExtentResult[] results =
				env.runQuery(indriQuery, MAX_RESULTS_PERQUERY);
			
			// get passages and document numbers
			String[] passages = new String[results.length];
			for (int i = 0; i < results.length; i += MAX_DOCS) {
				// fetch MAX_DOCS documents at a time (for memory efficiency)
				ScoredExtentResult[] partResults =
					new ScoredExtentResult[Math.min(MAX_DOCS, results.length - i)];
				System.arraycopy(results, i, partResults, 0, partResults.length);
				
				ParsedDocument[] documents = env.documents(partResults);
				
				for (int j = 0; j < partResults.length; j++) {
					// translate term positions into byte offsets
					int byteBegin =
						documents[j].positions[partResults[j].begin].begin;
					int byteEnd =
						documents[j].positions[partResults[j].end - 1].end;
					
					passages[i + j] =
						extractPassage(documents[j].text, byteBegin, byteEnd);
				}
			}
			String[] docNos = env.documentMetadata(results, "docno");
			
			// return results
			return getResults(passages, docNos, false);
		} catch (Exception e) {
			MsgPrinter.printSearchError(e);  // print search error message
			
			MsgPrinter.printErrorMsg("\nSearch failed.");
			
			return null;
		} finally {
			// close query environment, also if the search failed
			closeQuietly(env);
		}
	}
	
	/**
	 * Returns a new instance of <code>IndriKM</code>. A new instance is created
	 * for each query.
	 * 
	 * @return new instance of <code>IndriKM</code>
	 */
	public KnowledgeMiner getCopy() {
		if (indriDirs != null)
			return new IndriKM(indriDirs, false);
		else
			return new IndriKM(indriUrls, true);
	}
}