/* * SequenceQuery.java * * Copyright (c) 2007-2011, The University of Sheffield. * * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), * and is free software, licenced under the GNU Lesser General Public License, * Version 3, June 2007 (also included with this distribution as file * LICENCE-LGPL3.html). * * Valentin Tablan, 06 Mar 2009 * * $Id$ */ package gate.mimir.search.query; import gate.mimir.search.QueryEngine; import java.io.IOException; import java.io.Serializable; import java.text.NumberFormat; import java.util.*; import java.util.concurrent.BlockingQueue; /** * Implementation for phrase queries. A phrase query consists of a sequence of * other sub-queries. */ public class SequenceQuery implements QueryNode { private static final long serialVersionUID = -2331332813532064881L; /** * A gap in a sequence query, represented the number of permitted extra terms * between the results of two consecutive sub-queries. * These can be used for unconstrained query segments (e.g. {Token}[2,5] in * JAPE notation). * A Gap is defined by a minimum and a maximum number of permitted terms, * hence a gap with <code>min=max=n</code> would only allow a fixed number * of <code>n</code> terms. A Gap with * <code>max={@link Integer#MAX_VALUE}</code> allows arbitrarily long gaps. * A gap with <code>min=max=0</code> allows no space between sub-query * results. */ public static class Gap implements Serializable { public static final long serialVersionUID = 4792642842105791776L; /** * Creates a new {@link Gap}. * @param min the minimum number of terms required. * @param max the maximum number of terms permitted. */ public Gap(int min, int max) { if(min > max) throw new IllegalArgumentException( "Value for min larger than value for max!"); this.min = min; this.max = max; } int min; int max; /** * Checks whether two given term positions can be the start and the end of * this gap (i.e. if the distance between them corresponds to the gap * specification). * The check performed is that the <code>end<code> value needs to be * between (start + min + 1) and (start + max +1). This means a gap of zero * requires <code>end == start + 1</code>. * * @param start the start position for the gap. * @param end the end position for the gap. * @return <code>0</code> if the start and end positions are accepted by * this gap, a negative number if the end position is too small, a positive * number if the end position is too large. */ public int check(int start, int end){ if(end < start + min + 1) return -1; else if(end > start + max + 1) return 1; else return 0; } public int getMin() { return min; } public int getMax() { return max; } } public static class SequenceQueryExecutor extends AbstractIntersectionQueryExecutor{ /** * @param engine * @param query * @throws IOException */ public SequenceQueryExecutor(SequenceQuery query, QueryEngine engine) throws IOException { super(engine, query, query.nodes); this.query = query; //initialise the internal data hitsOnCurrentDocument = new LinkedList<Binding[]>(); candidateHits = new List[executors.length]; } /** * The query being executed. */ private SequenceQuery query; /** * A list of hits on the current document. */ protected List<Binding[]> hitsOnCurrentDocument; /** * An array of lists of hits, one list for each of the {@link #executors}. */ protected List<Binding>[] candidateHits; /* (non-Javadoc) * @see gate.mimir.search.query.QueryExecutor#close() */ public void close() throws IOException { super.close(); //release all pointers hitsOnCurrentDocument = null; query = null; } /* (non-Javadoc) * @see gate.mimir.search.query.QueryExecutor#nextDocument(int) */ public long nextDocument(long greaterThan) throws IOException { if(closed) return latestDocument = -1; if(latestDocument == -1) return latestDocument; //we've just been asked to change documents -> old hits not current any more hitsOnCurrentDocument.clear(); while(hitsOnCurrentDocument.isEmpty()){ long nextDocFromSuper = super.nextDocument(greaterThan); if(nextDocFromSuper < 0){ //no more documents return nextDocFromSuper; }else{ //We have a common document from super. //Now confirm if there is a match in that document. getHitsOnCurrentDocumentv3(); } } return latestDocument; } protected void getHitsOnCurrentDocumentv3() throws IOException{ //For each candidate hit, we keep either: //> the index of the candidate in the next list that forms a chain, or //> -1, if no such chain is possible. int[][] marks = new int[executors.length][]; //All the hits on the current document, one row for each executor Binding[][] hits = new Binding[executors.length][]; //prepare the candidateHits data //for each executor, we accumulate the hits on the current document //in its corresponding list. for(int i = 0; i < executors.length; i++){ //get all the hits on the current document List<Binding> hitsFromOneExecutor = new ArrayList<Binding>(); Binding subHit = executors[i].nextHit(); while(subHit != null){ hitsFromOneExecutor.add(subHit); subHit = executors[i].nextHit(); } hits[i] = hitsFromOneExecutor.toArray(new Binding[hitsFromOneExecutor.size()]); //create the marks array for this list of candidates marks[i] = new int[hits[i].length]; } //we mark from right to left, starting from the penultimate list //for the last list, all elements are marked as valid //VT: the following lines removed, as Java initialises arrays with 0 // for(int i = 0; i < marks[marks.length -1].length; i++){ // marks[marks.length -1][i] = 0; // } int currentSlot = hits.length - 2; while(currentSlot >= 0){ for(int i = 0 ; i < hits[currentSlot].length; i++ ){ //for each candidate hit Binding aHit = hits[currentSlot][i]; int minStart = aHit.getTermPosition() + aHit.getLength() + query.gaps[currentSlot].min; int maxStart = aHit.getTermPosition() + aHit.getLength() + query.gaps[currentSlot].max; //find the first hit in the next list and store its pointer marks[currentSlot][i] = computeMark(minStart, maxStart, hits[currentSlot+1], marks[currentSlot + 1]); } currentSlot--; } //we collect results from left to right hitsOnCurrentDocument.clear(); //for each marked starting point on the first slot, start enumerating the //hits for(int i = 0; i < marks[0].length; i++){ if(marks[0][i] >= 0){ Binding[] slots = new Binding[executors.length]; extractHitsRec(query, hitsOnCurrentDocument, hits, marks, 0, i, slots); } } } /** * Recursively extracts the hits * @param hits the arrays of hits, one row for each of the {@link #executors}. * @param marks the marks for the hits * @param currentSlot which slot to fill at this stage (used for recursion). * @param currentHit for the current slot, which candidate hit to start from. * @param slots the array of slots that needs filling. * @see #computeMark(int, int, Binding[], int[]) */ protected static void extractHitsRec(SequenceQuery query, List<Binding[]> results, Binding[][] hits, int[][] marks, int currentSlot, int currentHit, Binding[] slots){ slots[currentSlot] = hits[currentSlot][currentHit]; if(currentSlot == slots.length -1){ //we have a full result, no more recursion needed results.add(slots); }else{ //recursive call for the first next candidate extractHitsRec(query, results, hits, marks, currentSlot + 1, marks[currentSlot][currentHit], slots); //find all other candidates for next slot for(int i = marks[currentSlot][currentHit] +1; i < marks[currentSlot + 1].length; i++){ //if the gap accepts this next candidate, and it is valid, //add it to solution and do recursive call if(query.gaps[currentSlot].check( slots[currentSlot].getTermPosition() + slots[currentSlot].getLength() -1, hits[currentSlot + 1][i].getTermPosition()) == 0){ if(marks[currentSlot + 1][i] >= 0){ //copy the slots array Binding[] newSlots = new Binding[slots.length]; for(int j = 0; j <= currentSlot; j++){ newSlots[j] = slots[j]; } extractHitsRec(query, results, hits, marks, currentSlot + 1, i, newSlots); } }else{ //no more next candidates break; } } } } /** * Computes the mark for a current hit. The mark for a hit is: * - a pointer to a hit from the next slot candidates (if there is a chain * of hits from the current hit to the end) * - otherwise, -1 * The next hit pointed to is the first hit that can cause matches. * * @param minStart the minimum acceptable start position for the next hit * @param maxStart the maximum acceptable start position for the next hit * @param hits the list of candidate hits (hits on the next slot) * @param marks the list of marks for the candidate hits * @return the mark for the current hit */ protected static int computeMark(int minStart, int maxStart, Binding[] hits, int[] marks){ //binary search for the first valid next hit int low = 0; int high = hits.length -1; while(low <= high){ int mid = (low + high) >>> 1; if(hits[mid].getTermPosition() < minStart){ //mid hit too low -> look in second half low = mid + 1; }else if(hits[mid].getTermPosition() > maxStart){ //mid hit too large -> look in first half high = mid - 1; }else{ //Mid hit in the right range -> scroll up to find the first good hit. //A good hit is one that: // - starts between minStart and maxStart, and // - has a non-negative mark itself int lastValidMark = -1; int newMid = mid; while(newMid >= 0 && hits[newMid].getTermPosition() >= minStart){ if(marks[newMid] >= 0) lastValidMark = newMid; newMid--; } //if not found, scroll down while the starting position is still valid if(lastValidMark < 0){ newMid = mid + 1; while(newMid < hits.length && hits[newMid].getTermPosition() <= maxStart){ if(marks[newMid] >= 0){ lastValidMark = newMid; break; } newMid++; } } return lastValidMark; } } return -1; } protected void getHitsOnCurrentDocumentv2() throws IOException{ //We use the hitsOnCurrentDocument list as a list of possible suffixes, //sorted by start offset. This starts with the hits on the last slot, and //grows by adding new slot fillers at the start. //After we've filled the first slot, this list contains //the results. hitsOnCurrentDocument.clear(); //prepare the candidateHits data //for each executor, we accumulate the hits on the current document //in its corresponding list. for(int i = 0; i < executors.length; i++){ //get all the hits on the current document candidateHits[i] = new LinkedList<Binding>(); Binding subHit = executors[i].nextHit(); while(subHit != null){ candidateHits[i].add(subHit); subHit = executors[i].nextHit(); } } //start filling slots in the suffixes map. //we begin with the list of candidates from the last executor. int currentSlot = executors.length -1; for(Binding aSlotFiller : candidateHits[currentSlot]){ Binding[] aSuffix = new Binding[executors.length]; aSuffix[currentSlot] = aSlotFiller; hitsOnCurrentDocument.add(aSuffix); } //now continue filling the previous slot, until we've reached the 0 //position while(currentSlot > 0){ List<Binding[]> newSuffixes = new LinkedList<Binding[]>(); currentSlot--; for(Binding aSlotFiller : candidateHits[currentSlot]){ // int minStart = aSlotFiller.getTermPosition() + // aSlotFiller.getLength() + query.gaps[currentSlot].min; // int maxStart = aSlotFiller.getTermPosition() + // aSlotFiller.getLength() + query.gaps[currentSlot].max; //find suffixes that start between minStart and maxStart boolean suffixUsed = false; for(Binding[] aSuffix : hitsOnCurrentDocument){ if(query.gaps[currentSlot].check( aSlotFiller.getTermPosition() + aSlotFiller.getLength() -1, aSuffix[currentSlot + 1].getTermPosition()) == 0){ suffixUsed = true; //we managed to left-extend a suffix if(aSuffix[currentSlot] == null){ //this is the first time we're extending this suffix -> //we can re-use the same array aSuffix[currentSlot] = aSlotFiller; newSuffixes.add(aSuffix); }else{ //we need to create a copy Binding[] newSuffix = new Binding[executors.length]; newSuffix[currentSlot] = aSlotFiller; for(int i = currentSlot +1; i < newSuffix.length; i++){ newSuffix[i] = aSuffix[i]; } newSuffixes.add(newSuffix); } } } } hitsOnCurrentDocument = newSuffixes; }//while(currentSlot > 0) } /** * Attempts to find all the matches on the current document. When this * method is called, all the values in {@link #nextDocument(int)} are the * same (the current document). * * This method will use hits from the {@link #candidateHits} array (and * replenish any hit lists as necessary). In the process it will discard * any hits from those lists that are not useful (refer to lower documentIDs * or are not accepted by the gap restrictions). * * When the method returns, all the possible hits on the current document * have been saved in the {@link #hitsOnCurrentDocument} list. * * This method is a non-recursive implementation of the backtracking * algorithm. * * @throws IOException */ protected void getHitsOnCurrentDocumentOldBCK() throws IOException{ hitsOnCurrentDocument.clear(); //prepare the candidateHits data //for each executor, we accumulate the hits on the current document //in its corresponding list. for(int i = 0; i < executors.length; i++){ //get all the hits on the current document candidateHits[i] = new LinkedList<Binding>(); Binding subHit = executors[i].nextHit(); while(subHit != null){ candidateHits[i].add(subHit); subHit = executors[i].nextHit(); } } //prepare the backtracking machine //get an array of iterators for the sub-hits Iterator<Binding>[] hitIters = new Iterator[executors.length]; for(int i = 0; i < executors.length; i++){ hitIters[i] = candidateHits[i].iterator(); } //start the backtracking machine //we try to fill one of these: Binding[] slots = new Binding[executors.length]; //the slot that needs to be filled next int currentSlot = 0; while(currentSlot >= 0){ //we try to fill the current slot slots[currentSlot] = null; if(hitIters[currentSlot].hasNext()){ slots[currentSlot] = hitIters[currentSlot].next(); //validate the current slot candidate //check document if(currentSlot == 0){ //we just filled the first slot -> no tests to preform }else{ //we 're not on first position //Optimisation (beam search): //first remove all candidate hits on the current slot for which the //starting position is smaller than that of the previous slot. //This is safe to do, as the previous slot position will only get //larger. //VT: if the candidates list is short, this will make it slower // if(candidateHits[currentSlot].size() > 100){ int lowerBound = slots[currentSlot -1].getTermPosition(); while(slots[currentSlot].getTermPosition() <= lowerBound){ hitIters[currentSlot].remove(); if(hitIters[currentSlot].hasNext()){ slots[currentSlot] = hitIters[currentSlot].next(); }else{ slots[currentSlot] = null; break; } } // } //use the gap to check position if(slots[currentSlot] != null){ while(query.gaps[currentSlot - 1].check( slots[currentSlot -1].getTermPosition() + slots[currentSlot -1].getLength() -1, slots[currentSlot].getTermPosition()) != 0){ if(hitIters[currentSlot].hasNext()){ slots[currentSlot] = hitIters[currentSlot].next(); }else{ slots[currentSlot] = null; break; } } } } } if(slots[currentSlot] == null){ //we couldn't fill the current slot -> go back //reset the iterator for the current slot hitIters[currentSlot] = candidateHits[currentSlot].iterator(); //...and go back one step currentSlot--; }else{ //we successfully filled the current slot if(currentSlot == executors.length -1){ //we have a full hit hitsOnCurrentDocument.add(slots); //create a new copy of the first length-1 elements Binding[] newSlots = new Binding[executors.length]; System.arraycopy(slots, 0, newSlots, 0, slots.length -1); slots = newSlots; //..and stay on the same slot }else{ //not a full hit yet ->advance to next slot currentSlot++; } } } } /* (non-Javadoc) * @see gate.mimir.search.query.QueryExecutor#nextHit() */ public Binding nextHit() throws IOException { if(closed) return null; if(hitsOnCurrentDocument.isEmpty()) return null; else{ //get the first hit, and construct the corresponding bindings array for //it Binding[] hitSlots = hitsOnCurrentDocument.remove(0); Binding[] containedBindings = null; if(engine.isSubBindingsEnabled()){ //there will be one contained binding for each sub-query, plus //all of their own contained bindings. int containedBindsCount = executors.length; for(Binding aSubHit : hitSlots){ Binding[] containedB = aSubHit.getContainedBindings(); containedBindsCount += containedB == null ? 0 : containedB.length; } containedBindings = new Binding[containedBindsCount]; //position to write into containedBindings int cbIdx =0; for(Binding aSubHit : hitSlots){ containedBindings[cbIdx++] = aSubHit; if(aSubHit.getContainedBindings() != null){ System.arraycopy(aSubHit.getContainedBindings(), 0, containedBindings, cbIdx, aSubHit.getContainedBindings().length); cbIdx += aSubHit.getContainedBindings().length; } } } int length = hitSlots[hitSlots.length -1].getTermPosition() + hitSlots[hitSlots.length -1].getLength() - hitSlots[0].getTermPosition(); return new Binding(query, hitSlots[0].getDocumentId(), hitSlots[0].getTermPosition(), length, containedBindings); } } } /** * Constructor from an array of {@link QueryNode}s. * @param nodes the array of {@link QueryNode} representing the sequence of * sub-queries. * @param gaps an array of {@link Gap} objects, representing the gaps permitted * between the sub-query results. The gap at position <code>n</code> in this * array represents a space permitted <b>after</b> the sub-query at position * <code>n</code> in the <code>nodes<code> array. This means that the length * of the <code>gaps<code> array should be <code>nodes.length -1</code>. If * the provided parameter value is a shorter array, it will be padded with * zero-length gaps; if it is longer, the extra gaps at the end will be * ignored. If this parameter is set to <code>null</code>, then no gaps are * permitted: the result for sub-query <code>n+1</code> needs to start with * the term following the one where the result for sub-query <code>n</code> * ends. This is equivalent to providing an array filled with zero-length * gaps. */ public SequenceQuery(Gap[] gaps, QueryNode... nodes) { this.nodes = nodes; this.gaps = gaps; //normalise the gaps array normaliseGaps(); } /** * Makes sure all required Gap values are defined. */ protected void normaliseGaps(){ if(gaps == null){ //no gaps provided -> create a new array of nulls. gaps = new Gap[nodes.length -1]; } else if(gaps.length != (nodes.length -1)){ //wrong size -> create a new array and copy all available useful elements Gap[] oldGaps = gaps; gaps = new Gap[nodes.length -1]; System.arraycopy(oldGaps, 0, gaps, 0, Math.min(oldGaps.length, gaps.length)); } //now replace all nulls with zero gaps. for(int i = 0; i < gaps.length; i++){ if(gaps[i] == null) gaps[i] = ZERO_GAP; } } /** * The sub-queries of this sequence. */ private QueryNode[] nodes; /** * The gaps for this query. */ private Gap[] gaps; /** * A cached copy of the zero-length gap, which is ferquently used. */ private static final Gap ZERO_GAP = new Gap(0, 0); /* (non-Javadoc) * @see gate.mimir.search.query.QueryNode#getQueryExecutor(gate.mimir.search.QueryEngine) */ public QueryExecutor getQueryExecutor(QueryEngine engine) throws IOException { return new SequenceQueryExecutor(this, engine); } /** * @return the nodes */ public QueryNode[] getNodes() { return nodes; } /** * Obtains a {@link Gap} that allows a number of terms between the min and * max values (inclusive). * @param min the minimum number of term required for this gap. * @param max the maximum number of terms permitted for this gap. * @return an appropriately built {@link Gap} object. */ public static Gap getGap(int min, int max){ return new Gap(min, max); } public String toString() { StringBuilder str = new StringBuilder("SEQUENCE ("); if(nodes != null){ for(int i = 0; i < nodes.length -1; i++){ str.append(nodes[i].toString()); str.append(", "); } str.append(nodes[nodes.length -1].toString()); } str.append(")"); return str.toString(); } public Gap[] getGaps() { return gaps; } }