/* * AndQuery.java * * Copyright (c) 2007-2011, The University of Sheffield. * * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), * and is free software, licenced under the GNU Lesser General Public License, * Version 3, June 2007 (also included with this distribution as file * LICENCE-LGPL3.html). * * Valentin Tablan, 04 Jul 2009 * * $Id$ */ package gate.mimir.search.query; import gate.mimir.search.QueryEngine; import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue; import java.io.IOException; import java.util.*; import org.apache.log4j.Logger; /** * Query Node for AND queries. Finds the shortest document intervals that * contain hits from several sub-queries. */ public class AndQuery implements QueryNode { private static final long serialVersionUID = -5565830708202297074L; public static class AndQueryExecutor extends AbstractIntersectionQueryExecutor { /** * Creates a query executor for an AND query. * * @param engine * the {@link QueryEngine} to be used. * @param query * the query being executed. * @throws IOException * if there are problems accessing the index. */ public AndQueryExecutor(AndQuery query, QueryEngine engine) throws IOException { super(engine, query, query.nodes); this.query = query; } /** * Static logger. */ private static final Logger logger = Logger.getLogger(AndQueryExecutor.class); /* * (non-Javadoc) * * @see gate.mimir.search.query.QueryExecutor#close() */ public void close() throws IOException { super.close(); hitsOnCurrentDocument = null; query = null; } /* * (non-Javadoc) * * @see gate.mimir.search.query.QueryExecutor#nextDocument(int) */ public long nextDocument(long greaterThan) throws IOException { hitsOnCurrentDocument = null; return super.nextDocument(greaterThan); } /** * @throws IOException */ protected void getHitsOnCurrentDocument() throws IOException { // start with an empty list hitsOnCurrentDocument = new LinkedList<Binding[]>(); // The algorithm is: // 1) obtain the hit list from each sub-executor // 2) filter non-minimal intervals // 3) apply the algorithm from // http://www.springerlink.com/content/1267481874833210/ // an array of hit lists, one from each executor. LinkedLists are used // explicitly for performance reasons. LinkedList<Binding>[] subHits = new LinkedList[executors.length]; for(int i = 0; i < executors.length; i++) { LinkedList<Binding> hits = new LinkedList<Binding>(); Binding aHit = executors[i].nextHit(); if(aHit == null){ logger.warn("Malfunction in AND operator (or one of the sub-nodes):\n" + "No input sub-hits from " + nodes[i].toString() + " on document " + latestDocument + "!"); } while(aHit != null) { // filter all the non minimal intervals, i.e.: // remove all previous elements (if any) that have a greater end // offset while(!hits.isEmpty() && (hits.getLast().getTermPosition() + hits.getLast().getLength() >= aHit.getTermPosition() + aHit.getLength())) { hits.removeLast(); } hits.add(aHit); aHit = executors[i].nextHit(); } subHits[i] = hits; // if one of the sub-executors has no results, return if(subHits[i].isEmpty()){ // this should never happen logger.warn("Malfunction in AND operator (or one of the sub-nodes):\n" + "No ouput sub-hits from " + nodes[i].toString() + "!"); return; } } // The algorithm is: // function next begin // 1 if !(Q is full) then return null; // 2 do // 3 c = span(Q); // 4 advance(Q) // 5 while Q is full and span(Q) in c ; // 6 // 7 while Q is full and c in span(Q) do // 8 advance(Q) // 9 end; // 10 return c // 11 end; // Where: // The queue in the algorithm is a double priority queue (a queue with two // order relations). However, the second order relation is based on // interval end-offsets, we only ever use it to obtain the last element, // and the intervals come in ascending order. So we can replace the second // order by simply keeping track of the maximum end offset. // - advance(Q) means replacing the top element with a new one from the // same // source sub-executor; // - span(Q) is the span covered by all elements in the queue (defined as // the left extreme of the queue primary top, and the right extreme of // the queue secondary top. // construct the initial candidate solution (by taking the first sub-hit // from each list // the candidate result hit (contains one hit hit from each sub-executor) Binding[] candidateHits = new Binding[executors.length]; // create the queue ObjectHeapSemiIndirectPriorityQueue<Binding> queue = new ObjectHeapSemiIndirectPriorityQueue<Binding>(candidateHits, new Comparator<Binding>() { /** * Compares intervals (bindings). An interval is smaller * than another if it starts before or it extends it. In * other words, the top interval in the queue will * always be one that potentially extends the candidate * span either to the left or right. * * @param o1 * @param o2 * @return */ public int compare(Binding b1, Binding b2) { int start1 = b1.getTermPosition(); int start2 = b2.getTermPosition(); if(start1 < start2) { return -1; } else if(start1 == start2) { int end1 = start1 + b1.getLength(); int end2 = start2 + b2.getLength(); // note the inversion! return end2 - end1; } else { // start2 > start1 return 1; } } }); int maxRight = Integer.MIN_VALUE; for(int i = 0; i < subHits.length; i++) { // first hit from each sub-executor becomes part of the candidate // solution candidateHits[i] = subHits[i].removeFirst(); int currentRight = candidateHits[i].getTermPosition() + candidateHits[i].getLength(); if(currentRight > maxRight) maxRight = currentRight; queue.enqueue(i); } boolean done = false; // each loop is a call to the "next" function above bigwhile: while(!done) { // step 1 -> improve the current candidate solution int solutionLeft; int solutionRight; int first; step1: while(true) { first = queue.first(); if(subHits[first].isEmpty()) { // no more inputs -> save current solution and exit hitsOnCurrentDocument.add(candidateHits); break bigwhile; } else { // advance Queue Binding oldTop = candidateHits[first]; int oldFirst = first; candidateHits[first] = subHits[first].removeFirst(); queue.changed(); first = queue.first(); // if span(Q) > candidate, exit step1 //the span(Q) is defined by the left extreme of the new top, and //the right extreme of the newly added element (the one at oldFirst) if(candidateHits[first].getTermPosition() < oldTop.getTermPosition() || candidateHits[oldFirst].getTermPosition() + candidateHits[oldFirst].getLength() > maxRight) { // cannot improve current solution any more // save the solution Binding[] solution = new Binding[candidateHits.length]; for(int i = 0; i < solution.length; i++) { solution[i] = (i == oldFirst) ? oldTop : candidateHits[i]; } solutionLeft = oldTop.getTermPosition(); solutionRight = maxRight; hitsOnCurrentDocument.add(solution); // update maxRight int currentRight = candidateHits[first].getTermPosition() + candidateHits[first].getLength(); if(currentRight > maxRight) maxRight = currentRight; // and exit step1 break step1; } } } // step 2 while c in Span(Q) while(candidateHits[first].getTermPosition() <= solutionLeft && candidateHits[first].getTermPosition() + candidateHits[first].getLength() >= solutionRight) { if(subHits[first].isEmpty()) { // no more input hits -> we're done break bigwhile; } else { // advance the queue candidateHits[first] = subHits[first].removeFirst(); queue.changed(); first = queue.first(); } }// step2 while } } /* * (non-Javadoc) * * @see gate.mimir.search.query.QueryExecutor#nextHit() */ public Binding nextHit() throws IOException { if(closed) return null; if(hitsOnCurrentDocument == null){ getHitsOnCurrentDocument(); } if(hitsOnCurrentDocument.isEmpty()) return null; else { // get the first hit, and construct the corresponding bindings array for // it // note that the sub-hits are in no particular order int start = Integer.MAX_VALUE; int end = Integer.MIN_VALUE; Binding[] hitSlots = hitsOnCurrentDocument.remove(0); // there will be one contained binding for each sub-query, plus // all of their own contained bindings. int containedBindsCount = executors.length; for(Binding aSubHit : hitSlots) { Binding[] containedB = aSubHit.getContainedBindings(); containedBindsCount += containedB == null ? 0 : containedB.length; if(aSubHit.getTermPosition() < start) { start = aSubHit.getTermPosition(); } if(aSubHit.getTermPosition() + aSubHit.getLength() > end) { end = aSubHit.getTermPosition() + aSubHit.getLength(); } } Binding[] containedBindings = null; if(engine.isSubBindingsEnabled()){ containedBindings = new Binding[containedBindsCount]; // position to write into containedBindings int cbIdx = 0; for(Binding aSubHit : hitSlots) { containedBindings[cbIdx++] = aSubHit; if(aSubHit.getContainedBindings() != null) { System.arraycopy(aSubHit.getContainedBindings(), 0, containedBindings, cbIdx, aSubHit.getContainedBindings().length); cbIdx += aSubHit.getContainedBindings().length; } } } return new Binding(query, hitSlots[0].getDocumentId(), start, end - start, containedBindings); } } /** * The query being executed. */ private AndQuery query; /** * The list of available hits for the current document (i.e. the latest * document returned by {@link #nextDocument(int)}. Each entry is an array * of {@link Binding} values, one from each sub-query. */ protected List<Binding[]> hitsOnCurrentDocument; }// public static class AndQueryExecutor /** * Constructs a new short-AND operator from an array of sub-nodes. * * @param nodes * the sub-queries for this operator. */ public AndQuery(QueryNode... nodes) { this.nodes = nodes; } /* * (non-Javadoc) * * @see gate.mimir.search.query.QueryNode#getQueryExecutor(gate.mimir.search. * QueryEngine) */ public QueryExecutor getQueryExecutor(QueryEngine engine) throws IOException { return new AndQueryExecutor(this, engine); } /** * Gets the sub-queries for this AND query. * * @return the nodes */ public QueryNode[] getNodes() { return nodes; } public String toString() { StringBuilder str = new StringBuilder("AND ("); if(nodes != null){ for(int i = 0; i < nodes.length -1; i++){ str.append(nodes[i].toString()); str.append(", "); } str.append(nodes[nodes.length -1].toString()); } str.append(")"); return str.toString(); } /** * The list of sub-queries. */ private QueryNode[] nodes; }