/** * The contents of this file are subject to the license and copyright * detailed in the LICENSE and NOTICE files at the root of the source * tree and available online at * * http://www.dspace.org/license/ */ package org.dspace.search; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.TokenMgrError; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.dspace.content.Collection; import org.dspace.content.Community; import org.dspace.core.ConfigurationManager; import org.dspace.core.Constants; import org.dspace.core.Context; import org.dspace.core.LogManager; import org.dspace.sort.SortOption; // issues // need to filter query string for security // cmd line query needs to process args correctly (seems to split them up) /** * DSQuery contains various static methods for performing queries on indices, * for collections and communities. 
* */ public class DSQuery { // Result types static final String ALL = "999"; static final String ITEM = "" + Constants.ITEM; static final String COLLECTION = "" + Constants.COLLECTION; static final String COMMUNITY = "" + Constants.COMMUNITY; // cache a Lucene IndexSearcher for more efficient searches private static IndexSearcher searcher = null; private static String indexDir = null; private static String operator = null; private static long lastModified; /** log4j logger */ private static Logger log = Logger.getLogger(DSQuery.class); static { String maxClauses = ConfigurationManager.getProperty("search.max-clauses"); if (maxClauses != null) { BooleanQuery.setMaxClauseCount(Integer.parseInt(maxClauses)); } indexDir = ConfigurationManager.getProperty("search.dir"); operator = ConfigurationManager.getProperty("search.operator"); } /** * Do a query, returning a QueryResults object * * @param c context * @param args query arguments in QueryArgs object * * @return query results QueryResults */ public static QueryResults doQuery(Context c, QueryArgs args) throws IOException { String querystring = args.getQuery(); QueryResults qr = new QueryResults(); List<String> hitHandles = new ArrayList<String>(); List<Integer> hitIds = new ArrayList<Integer>(); List<Integer> hitTypes = new ArrayList<Integer>(); // set up the QueryResults object qr.setHitHandles(hitHandles); qr.setHitIds(hitIds); qr.setHitTypes(hitTypes); qr.setStart(args.getStart()); qr.setPageSize(args.getPageSize()); qr.setEtAl(args.getEtAl()); // massage the query string a bit querystring = checkEmptyQuery(querystring); // change nulls to an empty string // We no longer need to work around the Lucene bug with recent versions //querystring = workAroundLuceneBug(querystring); // logicals changed to && ||, etc. 
querystring = stripHandles(querystring); // remove handles from query string querystring = stripAsterisk(querystring); // remove asterisk from beginning of string try { // grab a searcher, and do the search IndexSearcher searcher = getSearcher(c); QueryParser qp = new QueryParser(Version.LUCENE_33, "default", DSIndexer.getAnalyzer()); log.debug("Final query string: " + querystring); if (operator == null || operator.equals("OR")) { qp.setDefaultOperator(QueryParser.OR_OPERATOR); } else { qp.setDefaultOperator(QueryParser.AND_OPERATOR); } Query myquery = qp.parse(querystring); //Retrieve enough docs to get all the results we need ! TopDocs hits = performQuery(args, searcher, myquery, args.getPageSize() * (args.getStart() + 1)); // set total number of hits qr.setHitCount(hits.totalHits); // We now have a bunch of hits - snip out a 'window' // defined in start, count and return the handles // from that window // first, are there enough hits? if (args.getStart() < hits.totalHits) { // get as many as we can, up to the window size // how many are available after snipping off at offset 'start'? int hitsRemaining = hits.totalHits - args.getStart(); int hitsToProcess = (hitsRemaining < args.getPageSize()) ? hitsRemaining : args.getPageSize(); for (int i = args.getStart(); i < (args.getStart() + hitsToProcess); i++) { Document d = searcher.doc(hits.scoreDocs[i].doc); String resourceId = d.get("search.resourceid"); String resourceType = d.get("search.resourcetype"); String handleText = d.get("handle"); String handleType = d.get("type"); switch (Integer.parseInt( resourceType != null ? resourceType : handleType)) { case Constants.ITEM: hitTypes.add(Constants.ITEM); break; case Constants.COLLECTION: hitTypes.add(Constants.COLLECTION); break; case Constants.COMMUNITY: hitTypes.add(Constants.COMMUNITY); break; } hitHandles.add( handleText ); hitIds.add( resourceId == null ? 
null: Integer.parseInt(resourceId) ); } } } catch (NumberFormatException e) { log.warn(LogManager.getHeader(c, "Number format exception", "" + e)); qr.setErrorMsg("number-format-exception"); } catch (ParseException e) { // a parse exception - log and return null results log.warn(LogManager.getHeader(c, "Invalid search string", "" + e)); qr.setErrorMsg("invalid-search-string"); } catch (TokenMgrError tme) { // Similar to parse exception log.warn(LogManager.getHeader(c, "Invalid search string", "" + tme)); qr.setErrorMsg("invalid-search-string"); } catch(BooleanQuery.TooManyClauses e) { log.warn(LogManager.getHeader(c, "Query too broad", e.toString())); qr.setErrorMsg("query-too-broad"); } return qr; } private static TopDocs performQuery(QueryArgs args, IndexSearcher searcher, Query myquery, int max) throws IOException { TopDocs hits; try { if (args.getSortOption() == null) { SortField[] sortFields = new SortField[] { new SortField("search.resourcetype", SortField.INT, true), new SortField(null, SortField.SCORE, SortOption.ASCENDING.equals(args.getSortOrder())) }; hits = searcher.search(myquery, max, new Sort(sortFields)); } else { SortField[] sortFields = new SortField[] { new SortField("search.resourcetype", SortField.INT, true), new SortField("sort_" + args.getSortOption().getName(), SortField.STRING, SortOption.DESCENDING.equals(args.getSortOrder())), SortField.FIELD_SCORE }; hits = searcher.search(myquery, max, new Sort(sortFields)); } } catch (Exception e) { // Lucene can throw an exception if it is unable to determine a sort time from the specified field // Provide a fall back that just works on relevancy. log.error("Unable to use speficied sort option: " + (args.getSortOption() == null ? 
"type/relevance": args.getSortOption().getName())); hits = searcher.search(myquery, max, new Sort(SortField.FIELD_SCORE)); } return hits; } static String checkEmptyQuery(String myquery) { if (myquery == null || myquery.equals("()") || myquery.equals("")) { myquery = "empty_query_string"; } return myquery; } /** * Workaround Lucene bug that breaks wildcard searching. * This is no longer required with Lucene upgrades. * * @param myquery * @return * @deprecated */ static String workAroundLuceneBug(String myquery) { // Lucene currently has a bug which breaks wildcard // searching when you have uppercase characters. // Here we substitute the boolean operators -- which // have to be uppercase -- before transforming the // query string to lowercase. return myquery.replaceAll(" AND ", " && ") .replaceAll(" OR ", " || ") .replaceAll(" NOT ", " ! ") .toLowerCase(); } static String stripHandles(String myquery) { // Drop beginning pieces of full handle strings return myquery.replaceAll("^\\s*http://hdl\\.handle\\.net/", "") .replaceAll("^\\s*hdl:", ""); } static String stripAsterisk(String myquery) { // query strings (or words) beginning with "*" cause a null pointer error return myquery.replaceAll("^\\*", "") .replaceAll("\\s\\*", " ") .replaceAll("\\(\\*", "(") .replaceAll(":\\*", ":"); } /** * Do a query, restricted to a collection * * @param c * context * @param args * query args * @param coll * collection to restrict to * * @return QueryResults same results as doQuery, restricted to a collection */ public static QueryResults doQuery(Context c, QueryArgs args, Collection coll) throws IOException { String querystring = args.getQuery(); querystring = checkEmptyQuery(querystring); String location = "l" + (coll.getID()); String newquery = "+(" + querystring + ") +location:\"" + location + "\""; args.setQuery(newquery); return doQuery(c, args); } /** * Do a query, restricted to a community * * @param c * context * @param args * query args * @param comm * community to restrict 
to * * @return QueryResults same results as doQuery, restricted to a collection */ public static QueryResults doQuery(Context c, QueryArgs args, Community comm) throws IOException { String querystring = args.getQuery(); querystring = checkEmptyQuery(querystring); String location = "m" + (comm.getID()); String newquery = "+(" + querystring + ") +location:\"" + location + "\""; args.setQuery(newquery); return doQuery(c, args); } /** * Do a query, printing results to stdout largely for testing, but it is * useful */ public static void doCMDLineQuery(String query) { System.out.println("Command line query: " + query); System.out.println("Only reporting default-sized results list"); try { Context c = new Context(); QueryArgs args = new QueryArgs(); args.setQuery(query); QueryResults results = doQuery(c, args); Iterator i = results.getHitHandles().iterator(); Iterator j = results.getHitTypes().iterator(); while (i.hasNext()) { String thisHandle = (String) i.next(); Integer thisType = (Integer) j.next(); String type = Constants.typeText[thisType]; // also look up type System.out.println(type + "\t" + thisHandle); } } catch (Exception e) { System.out.println("Exception caught: " + e); } } /** * Close any IndexSearcher that is currently open. */ public static synchronized void close() { if (searcher != null) { try { searcher.close(); searcher = null; } catch (IOException ioe) { log.error("DSQuery: Unable to close open IndexSearcher", ioe); } } } public static void main(String[] args) { if (args.length > 0) { DSQuery.doCMDLineQuery(args[0]); } } /*--------- protected methods ----------*/ /** * get an IndexReader. * @throws IOException */ protected static IndexReader getIndexReader() throws IOException { return getSearcher(null).getIndexReader(); } /** * get an IndexSearcher, hopefully a cached one (gives much better * performance.) 
checks to see if the index has been modified - if so, it * creates a new IndexSearcher */ protected static synchronized IndexSearcher getSearcher(Context c) throws IOException { // If we have already opened a searcher, check to see if the index has been updated // If it has, we need to close the existing searcher - we will open a new one later Directory searchDir = FSDirectory.open(new File(indexDir)); if (searcher != null && lastModified != IndexReader.getCurrentVersion(searchDir)) { try { // Close the cached IndexSearcher searcher.close(); } catch (IOException ioe) { // Index is probably corrupt. Log the error, but continue to either: // 1) Return existing searcher (may yet throw exception, no worse than throwing here) log.warn("DSQuery: Unable to check for updated index", ioe); } finally { searcher = null; } } // There is no existing searcher - either this is the first execution, // or the index has been updated and we closed the old index. if (searcher == null) { // So, open a new searcher lastModified = IndexReader.getCurrentVersion(searchDir); String osName = System.getProperty("os.name"); if (osName != null && osName.toLowerCase().contains("windows")) { searcher = new IndexSearcher(searchDir){ /* * TODO: Has Lucene fixed this bug yet? * Lucene doesn't release read locks in * windows properly on finalize. Our hack * extend IndexSearcher to force close(). */ @Override protected void finalize() throws Throwable { this.close(); super.finalize(); } }; } else { searcher = new IndexSearcher(searchDir); } } return searcher; } } // it's now up to the display page to do the right thing displaying // items & communities & collections