/*
* DSQuery.java
*
* Version: $Revision: 3705 $
*
* Date: $Date: 2009-04-11 17:02:24 +0000 (Sat, 11 Apr 2009) $
*
* Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
* Institute of Technology. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of the Hewlett-Packard Company nor the name of the
* Massachusetts Institute of Technology nor the names of their
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
package org.dspace.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.TokenMgrError;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.core.LogManager;
import org.dspace.sort.SortOption;
// issues
// need to filter query string for security
// cmd line query needs to process args correctly (seems to split them up)
/**
* DSQuery contains various static methods for performing queries on indices,
* for collections and communities.
*
*/
public class DSQuery
{
// Result types: string forms of the resource type codes. ITEM, COLLECTION
// and COMMUNITY are the decimal text of the matching Constants values.
// NOTE(review): "999" for ALL looks like a sentinel meaning "any type";
// it is not used inside this class - confirm against callers.
static final String ALL = "999";
static final String ITEM = "" + Constants.ITEM;
static final String COLLECTION = "" + Constants.COLLECTION;
static final String COMMUNITY = "" + Constants.COMMUNITY;
// cache a Lucene IndexSearcher for more efficient searches
// (created and replaced in getSearcher(), released in close())
private static IndexSearcher searcher = null;
// value of the "search.dir" configuration property, set in the static initializer
private static String indexDir = null;
// value of the "search.operator" configuration property; doQuery() treats
// null or "OR" as OR and any other value as AND
private static String operator = null;
// index version (from IndexReader.getCurrentVersion) recorded when the
// cached searcher was opened; despite the name it is not a timestamp
private static long lastModified;
/** log4j logger */
private static Logger log = Logger.getLogger(DSQuery.class);
// One-time configuration: applies the optional "search.max-clauses" limit to
// Lucene and reads the index directory and default operator settings.
static
{
    String maxClauses = ConfigurationManager.getProperty("search.max-clauses");
    if (maxClauses != null)
    {
        try
        {
            BooleanQuery.setMaxClauseCount(Integer.parseInt(maxClauses.trim()));
        }
        catch (IllegalArgumentException e)
        {
            // Covers both a non-numeric value (NumberFormatException) and a
            // value Lucene rejects. An exception escaping a static
            // initializer would make the whole class unusable, so a bad
            // setting is reported and the clause limit is left unchanged.
            // (log is initialised by the field declaration that precedes
            // this initializer, so it is safe to use here.)
            log.warn("DSQuery: ignoring invalid search.max-clauses value '"
                    + maxClauses + "'", e);
        }
    }

    indexDir = ConfigurationManager.getProperty("search.dir");
    operator = ConfigurationManager.getProperty("search.operator");
}
/**
 * Do a query, returning a QueryResults object.
 * <p>
 * The query string is normalised first (null/empty replaced by a
 * placeholder, leading handle prefixes and leading asterisks removed),
 * then parsed with the configured default operator and run against the
 * cached searcher. The total hit count is always recorded; only the window
 * of hits selected by the start offset and page size of <code>args</code>
 * is copied into the handle, id and type lists, which are kept the same
 * length as each other.
 * <p>
 * Problems with the query itself (unparsable string, too many clauses,
 * non-numeric type or id stored in a hit) are not thrown: they are logged
 * and reported through the error message of the returned object.
 *
 * @param c context
 * @param args query arguments in QueryArgs object
 *
 * @return query results QueryResults
 * @throws IOException propagated from opening or searching the index
 */
public static QueryResults doQuery(Context c, QueryArgs args)
        throws IOException
{
    String querystring = args.getQuery();
    QueryResults qr = new QueryResults();
    List hitHandles = new ArrayList();
    List hitIds = new ArrayList();
    List hitTypes = new ArrayList();

    // set up the QueryResults object
    qr.setHitHandles(hitHandles);
    qr.setHitIds(hitIds);
    qr.setHitTypes(hitTypes);
    qr.setStart(args.getStart());
    qr.setPageSize(args.getPageSize());
    qr.setEtAl(args.getEtAl());

    // massage the query string a bit
    querystring = checkEmptyQuery(querystring); // change nulls to a placeholder term
    querystring = stripHandles(querystring);    // remove handle prefixes from query string
    querystring = stripAsterisk(querystring);   // remove asterisks from beginning of terms

    try
    {
        // grab a searcher, and do the search
        Searcher indexSearcher = getSearcher(c);
        QueryParser qp = new QueryParser("default", DSIndexer.getAnalyzer());
        log.debug("Final query string: " + querystring);

        if (operator == null || operator.equals("OR"))
        {
            qp.setDefaultOperator(QueryParser.OR_OPERATOR);
        }
        else
        {
            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        }

        Query myquery = qp.parse(querystring);
        Hits hits = null;

        try
        {
            SortField[] sortFields;
            if (args.getSortOption() == null)
            {
                // No explicit sort: group by resource type, then relevance.
                sortFields = new SortField[] {
                        new SortField("search.resourcetype", true),
                        new SortField(null, SortField.SCORE, SortOption.ASCENDING.equals(args.getSortOrder()))
                };
            }
            else
            {
                // Explicit sort: resource type, the chosen sort field, then relevance.
                sortFields = new SortField[] {
                        new SortField("search.resourcetype", true),
                        new SortField("sort_" + args.getSortOption().getName(), SortOption.DESCENDING.equals(args.getSortOrder())),
                        SortField.FIELD_SCORE
                };
            }
            hits = indexSearcher.search(myquery, new Sort(sortFields));
        }
        catch (Exception e)
        {
            // Lucene can throw an exception if it is unable to determine a sort type from the specified field
            // Provide a fall back that just works on relevancy.
            // The exception is passed to the logger so the cause is not lost.
            log.error("Unable to use speficied sort option: " + (args.getSortOption() == null ? "type/relevance": args.getSortOption().getName()), e);
            hits = indexSearcher.search(myquery, new Sort(SortField.FIELD_SCORE));
        }

        // set total number of hits
        qr.setHitCount(hits.length());

        // We now have a bunch of hits - snip out a 'window'
        // defined in start, count and return the handles
        // from that window
        int start = args.getStart();
        int pageSize = args.getPageSize();

        // first, are there enough hits?
        if (start < hits.length())
        {
            // get as many as we can, up to the window size
            int hitsRemaining = hits.length() - start;
            int hitsToProcess = Math.min(hitsRemaining, pageSize);
            int end = start + hitsToProcess;

            for (int i = start; i < end; i++)
            {
                Document d = hits.doc(i);
                String resourceId = d.get("search.resourceid");
                String resourceType = d.get("search.resourcetype");
                String handleText = d.get("handle");
                String handleType = d.get("type");

                // Parse everything that can fail BEFORE touching the result
                // lists, so a NumberFormatException cannot leave them with
                // different lengths.
                int type = Integer.parseInt(resourceType != null ? resourceType : handleType);
                Integer id = (resourceId == null) ? null : Integer.valueOf(resourceId);

                if (type != Constants.ITEM && type != Constants.COLLECTION
                        && type != Constants.COMMUNITY)
                {
                    // An unrecognised type used to add a handle and id but
                    // no type, misaligning the parallel lists. Skip the hit
                    // entirely instead.
                    log.warn(LogManager.getHeader(c, "Unknown resource type in search hit",
                            "type=" + type + ",handle=" + handleText));
                    continue;
                }

                hitTypes.add(Integer.valueOf(type));
                hitHandles.add(handleText);
                hitIds.add(id);
            }
        }
    }
    catch (NumberFormatException e)
    {
        log.warn(LogManager.getHeader(c, "Number format exception", "" + e));
        qr.setErrorMsg("number-format-exception");
    }
    catch (ParseException e)
    {
        // a parse exception - log and return empty results with an error message
        log.warn(LogManager.getHeader(c, "Invalid search string", "" + e));
        qr.setErrorMsg("invalid-search-string");
    }
    catch (TokenMgrError tme)
    {
        // Similar to parse exception
        log.warn(LogManager.getHeader(c, "Invalid search string", "" + tme));
        qr.setErrorMsg("invalid-search-string");
    }
    catch (BooleanQuery.TooManyClauses e)
    {
        log.warn(LogManager.getHeader(c, "Query too broad", e.toString()));
        qr.setErrorMsg("query-too-broad");
    }

    return qr;
}
/**
 * Substitute a placeholder term for a missing query.
 *
 * @param myquery the raw query string, may be null
 * @return "empty_query_string" when the input is null, "" or "()";
 *         otherwise the input unchanged
 */
static String checkEmptyQuery(String myquery)
{
    boolean nothingToSearch = (myquery == null)
            || "".equals(myquery)
            || "()".equals(myquery);

    return nothingToSearch ? "empty_query_string" : myquery;
}
/**
 * Rewrite a query so that it can be lower-cased without losing its boolean
 * operators: the space-delimited words AND, OR and NOT are replaced by the
 * symbolic forms &amp;&amp;, || and ! and the result is converted to lower
 * case. This was a workaround for a Lucene wildcard bug and is no longer
 * applied by doQuery.
 *
 * @param myquery the query string, must not be null
 * @return the lower-cased query with symbolic boolean operators
 * @deprecated no longer required with Lucene upgrades
 */
static String workAroundLuceneBug(String myquery)
{
    // The operators must be swapped out while they are still upper case,
    // i.e. before the lower-casing step.
    String symbolic = myquery.replace(" AND ", " && ");
    symbolic = symbolic.replace(" OR ", " || ");
    symbolic = symbolic.replace(" NOT ", " ! ");

    return symbolic.toLowerCase();
}
/**
 * Remove a handle prefix from the very start of a query string. Optional
 * leading whitespace followed by "http://hdl.handle.net/" is dropped first,
 * then optional leading whitespace followed by "hdl:". Prefixes that occur
 * anywhere else in the string are left alone.
 *
 * @param myquery the query string, must not be null
 * @return the query without a leading handle prefix
 */
static String stripHandles(String myquery)
{
    // Both patterns are anchored at the start of the input, so each can
    // match at most once.
    String withoutResolverUrl = myquery.replaceFirst("^\\s*http://hdl\\.handle\\.net/", "");
    String withoutScheme = withoutResolverUrl.replaceFirst("^\\s*hdl:", "");

    return withoutScheme;
}
/**
 * Remove asterisks that would otherwise start a query term. An asterisk
 * (or a run of them) is dropped when it is at the start of the string,
 * after a whitespace character, after an opening parenthesis or after a
 * colon. Asterisks elsewhere, such as a trailing wildcard, are kept.
 * <p>
 * A whitespace character that precedes a removed asterisk is replaced by a
 * single space.
 *
 * @param myquery the query string, must not be null
 * @return the query with leading asterisks removed from its terms
 */
static String stripAsterisk(String myquery)
{
    // query strings (or words) beginning with "*" cause an error in the
    // parser. "\\*+" removes the whole run, so "**foo" does not leave a
    // second asterisk at the front of the term.
    return myquery.replaceAll("^\\*+", "")
            .replaceAll("\\s\\*+", " ")
            .replaceAll("\\(\\*+", "(")
            .replaceAll(":\\*+", ":");
}
/**
 * Do a query, restricted to a collection.
 * <p>
 * The query held by <code>args</code> is replaced by a combined query that
 * requires both the original query and a match on the collection's
 * location term ("l" followed by the collection ID). Note that this
 * modifies the <code>args</code> object passed in.
 *
 * @param c
 *            context
 * @param args
 *            query args
 * @param coll
 *            collection to restrict to
 *
 * @return QueryResults same results as doQuery, restricted to a collection
 * @throws IOException propagated from the unrestricted doQuery
 */
public static QueryResults doQuery(Context c, QueryArgs args,
        Collection coll) throws IOException
{
    String querystring = checkEmptyQuery(args.getQuery());

    // stripHandles() only matches at the very start of the string, so a
    // handle prefix has to be removed here, before "+(" is put in front of
    // the query. Re-check for emptiness in case only the prefix was given.
    querystring = checkEmptyQuery(stripHandles(querystring));

    String location = "l" + coll.getID();
    String newquery = "+(" + querystring + ") +location:\"" + location + "\"";

    args.setQuery(newquery);

    return doQuery(c, args);
}
/**
 * Do a query, restricted to a community.
 * <p>
 * The query held by <code>args</code> is replaced by a combined query that
 * requires both the original query and a match on the community's location
 * term ("m" followed by the community ID). Note that this modifies the
 * <code>args</code> object passed in.
 *
 * @param c
 *            context
 * @param args
 *            query args
 * @param comm
 *            community to restrict to
 *
 * @return QueryResults same results as doQuery, restricted to a community
 * @throws IOException propagated from the unrestricted doQuery
 */
public static QueryResults doQuery(Context c, QueryArgs args, Community comm)
        throws IOException
{
    String querystring = checkEmptyQuery(args.getQuery());

    // stripHandles() only matches at the very start of the string, so a
    // handle prefix has to be removed here, before "+(" is put in front of
    // the query. Re-check for emptiness in case only the prefix was given.
    querystring = checkEmptyQuery(stripHandles(querystring));

    String location = "m" + comm.getID();
    String newquery = "+(" + querystring + ") +location:\"" + location + "\"";

    args.setQuery(newquery);

    return doQuery(c, args);
}
/**
 * Do a query, printing results to stdout. Largely for testing, but it is
 * useful. Each hit in the default-sized first page of results is printed
 * as the type name, a tab and the handle. Any exception is caught and
 * reported on stdout rather than thrown.
 *
 * @param query the query string to run
 */
public static void doCMDLineQuery(String query)
{
    System.out.println("Command line query: " + query);
    System.out.println("Only reporting default-sized results list");

    Context c = null;
    try
    {
        c = new Context();

        QueryArgs args = new QueryArgs();
        args.setQuery(query);

        QueryResults results = doQuery(c, args);

        Iterator i = results.getHitHandles().iterator();
        Iterator j = results.getHitTypes().iterator();

        // The two lists are walked in step; stop when either runs out
        // rather than failing on a missing element.
        while (i.hasNext() && j.hasNext())
        {
            String thisHandle = (String) i.next();
            Integer thisType = (Integer) j.next();

            // look up the display name of the type
            String type = Constants.typeText[thisType.intValue()];

            System.out.println(type + "\t" + thisHandle);
        }
    }
    catch (Exception e)
    {
        System.out.println("Exception caught: " + e);
    }
    finally
    {
        // Nothing is written through this context, so release it without
        // committing instead of leaving it open until the JVM exits.
        if (c != null && c.isValid())
        {
            c.abort();
        }
    }
}
/**
 * Close any IndexSearcher that is currently open.
 * <p>
 * Synchronized on the class, like getSearcher, because both methods read
 * and replace the same cached searcher. The cached reference is cleared
 * even if closing fails, so a searcher that could not be closed cleanly is
 * never handed out again; the failure itself is only logged.
 */
public static synchronized void close()
{
    if (searcher == null)
    {
        return;
    }

    try
    {
        searcher.close();
    }
    catch (IOException ioe)
    {
        log.error("DSQuery: Unable to close open IndexSearcher", ioe);
    }
    finally
    {
        searcher = null;
    }
}
/**
 * Command line entry point: runs the given query and prints the hits.
 * <p>
 * A shell splits an unquoted multi-word query into several arguments, so
 * all arguments are joined with single spaces to rebuild the query. With
 * exactly one argument the behaviour is the same as passing it directly.
 * Does nothing when no arguments are given.
 *
 * @param args the words of the query
 */
public static void main(String[] args)
{
    if (args.length == 0)
    {
        return;
    }

    StringBuilder query = new StringBuilder(args[0]);
    for (int i = 1; i < args.length; i++)
    {
        query.append(' ').append(args[i]);
    }

    doCMDLineQuery(query.toString());
}
/*--------- protected methods ----------*/
/**
 * Get the IndexReader that backs the shared searcher. The searcher is
 * obtained through getSearcher (with no context), so it is opened or
 * refreshed first if necessary.
 *
 * @return the reader of the current searcher
 * @throws IOException propagated from getSearcher
 */
protected static IndexReader getIndexReader()
        throws IOException
{
    IndexSearcher current = getSearcher(null);
    return current.getIndexReader();
}
/**
 * Get an IndexSearcher, reusing the cached one whenever possible (which
 * gives much better performance). If the version of the index no longer
 * matches the version recorded when the cached searcher was opened, the
 * cached searcher is closed and a new one is opened on the index
 * directory.
 *
 * @param c context (not used by this method)
 * @return the shared searcher for the current index
 * @throws IOException if the index version cannot be read or a new
 *         searcher cannot be opened
 */
protected static synchronized IndexSearcher getSearcher(Context c)
        throws IOException
{
    // The version is only consulted when there is a cached searcher to
    // validate.
    boolean stale = (searcher != null)
            && (IndexReader.getCurrentVersion(indexDir) != lastModified);

    if (stale)
    {
        try
        {
            // Release the out-of-date searcher
            searcher.close();
        }
        catch (IOException ioe)
        {
            // A failure to close is logged but not fatal: the reference
            // is dropped below and a fresh searcher is opened regardless.
            log.warn("DSQuery: Unable to check for updated index", ioe);
        }
        finally
        {
            searcher = null;
        }
    }

    // Cached searcher is still current - hand it straight back.
    if (searcher != null)
    {
        return searcher;
    }

    // Either this is the first call or the old searcher was discarded
    // above: remember the index version, then open a new searcher.
    lastModified = IndexReader.getCurrentVersion(indexDir);
    searcher = new IndexSearcher(indexDir)
    {
        /*
         * TODO: Has Lucene fixed this bug yet?
         * Lucene doesn't release read locks in
         * windows properly on finalize. Our hack:
         * extend IndexSearcher to force close().
         */
        protected void finalize() throws Throwable
        {
            this.close();
            super.finalize();
        }
    };

    return searcher;
}
}
// it's now up to the display page to do the right thing displaying
// items & communities & collections