/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.accumulo.core.iterators.user; import java.io.IOException; import java.util.Base64; import java.util.Collection; import java.util.Collections; import java.util.Map; import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.data.ArrayByteSequence; import org.apache.accumulo.core.data.ByteSequence; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.PartialKey; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.IteratorEnvironment; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; import org.apache.accumulo.core.util.TextUtil; import org.apache.hadoop.io.Text; /** * This iterator facilitates document-partitioned indexing. It involves grouping a set of documents together and indexing those documents into a single row of * an Accumulo table. This allows a tablet server to perform boolean AND operations on terms in the index. * * The table structure should have the following form: * * row: shardID, colfam: term, colqual: docID * * When you configure this iterator with a set of terms (column families), it will return only the docIDs that appear with all of the specified terms. The * result will have an empty column family, as follows: * * row: shardID, colfam: (empty), colqual: docID * * This iterator is commonly used with BatchScanner or AccumuloInputFormat, to parallelize the search over all shardIDs. * * This iterator will *ignore* any columnFamilies passed to {@link #seek(Range, Collection, boolean)} as it performs intersections over terms. Extending classes * should override the {@link TermSource#seekColfams} in their implementation's {@link #init(SortedKeyValueIterator, Map, IteratorEnvironment)} method. * * An example of using the IntersectingIterator is available at https://github.com/apache/accumulo-examples/blob/master/docs/shard.md */ public class IntersectingIterator implements SortedKeyValueIterator<Key,Value> { protected Text nullText = new Text(); protected Text getPartition(Key key) { return key.getRow(); } protected Text getTerm(Key key) { return key.getColumnFamily(); } protected Text getDocID(Key key) { return key.getColumnQualifier(); } protected Key buildKey(Text partition, Text term) { return new Key(partition, (term == null) ? nullText : term); } protected Key buildKey(Text partition, Text term, Text docID) { return new Key(partition, (term == null) ? nullText : term, docID); } protected Key buildFollowingPartitionKey(Key key) { return key.followingKey(PartialKey.ROW); } public static class TermSource { public SortedKeyValueIterator<Key,Value> iter; public Text term; public Collection<ByteSequence> seekColfams; public boolean notFlag; public TermSource(TermSource other) { this.iter = other.iter; this.term = other.term; this.notFlag = other.notFlag; this.seekColfams = other.seekColfams; } public TermSource(SortedKeyValueIterator<Key,Value> iter, Text term) { this(iter, term, false); } public TermSource(SortedKeyValueIterator<Key,Value> iter, Text term, boolean notFlag) { this.iter = iter; this.term = term; this.notFlag = notFlag; // The desired column families for this source is the term itself this.seekColfams = Collections.<ByteSequence> singletonList(new ArrayByteSequence(term.getBytes(), 0, term.getLength())); } public String getTermString() { return (this.term == null) ? "Iterator" : this.term.toString(); } } protected TermSource[] sources; int sourcesCount = 0; Range overallRange; // query-time settings protected Text currentPartition = null; protected Text currentDocID = new Text(emptyByteArray); static final byte[] emptyByteArray = new byte[0]; protected Key topKey = null; protected Value value = new Value(emptyByteArray); public IntersectingIterator() {} @Override public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) { return new IntersectingIterator(this, env); } private IntersectingIterator(IntersectingIterator other, IteratorEnvironment env) { if (other.sources != null) { sourcesCount = other.sourcesCount; sources = new TermSource[sourcesCount]; for (int i = 0; i < sourcesCount; i++) { sources[i] = new TermSource(other.sources[i].iter.deepCopy(env), other.sources[i].term); } } } @Override public Key getTopKey() { return topKey; } @Override public Value getTopValue() { // we don't really care about values return value; } @Override public boolean hasTop() { return currentPartition != null; } // precondition: currentRow is not null private boolean seekOneSource(int sourceID) throws IOException { // find the next key in the appropriate column family that is at or beyond the cursor (currentRow, currentCQ) // advance the cursor if this source goes beyond it // return whether we advanced the cursor // within this loop progress must be made in one of the following forms: // - currentRow or currentCQ must be increased // - the given source must advance its iterator // this loop will end when any of the following criteria are met // - the iterator for the given source is pointing to the key (currentRow, columnFamilies[sourceID], currentCQ) // - the given source is out of data and currentRow is set to null // - the given source has advanced beyond the endRow and currentRow is set to null boolean advancedCursor = false; if (sources[sourceID].notFlag) { while (true) { if (sources[sourceID].iter.hasTop() == false) { // an empty column that you are negating is a valid condition break; } // check if we're past the end key int endCompare = -1; // we should compare the row to the end of the range if (overallRange.getEndKey() != null) { endCompare = overallRange.getEndKey().getRow().compareTo(sources[sourceID].iter.getTopKey().getRow()); if ((!overallRange.isEndKeyInclusive() && endCompare <= 0) || endCompare < 0) { // an empty column that you are negating is a valid condition break; } } int partitionCompare = currentPartition.compareTo(getPartition(sources[sourceID].iter.getTopKey())); // check if this source is already at or beyond currentRow // if not, then seek to at least the current row if (partitionCompare > 0) { // seek to at least the currentRow Key seekKey = buildKey(currentPartition, sources[sourceID].term); sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); continue; } // check if this source has gone beyond currentRow // if so, this is a valid condition for negation if (partitionCompare < 0) { break; } // we have verified that the current source is positioned in currentRow // now we must make sure we're in the right columnFamily in the current row // Note: Iterators are auto-magically set to the correct columnFamily if (sources[sourceID].term != null) { int termCompare = sources[sourceID].term.compareTo(getTerm(sources[sourceID].iter.getTopKey())); // check if this source is already on the right columnFamily // if not, then seek forwards to the right columnFamily if (termCompare > 0) { Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID); sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); continue; } // check if this source is beyond the right columnFamily // if so, then this is a valid condition for negating if (termCompare < 0) { break; } } // we have verified that we are in currentRow and the correct column family // make sure we are at or beyond columnQualifier Text docID = getDocID(sources[sourceID].iter.getTopKey()); int docIDCompare = currentDocID.compareTo(docID); // If we are past the target, this is a valid result if (docIDCompare < 0) { break; } else if (docIDCompare > 0) { // if this source is not yet at the currentCQ then advance in this source // seek forwards Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID); sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); continue; } else { // docIDCompare == 0 // if we are equal to the target, this is an invalid result. // Force the entire process to go to the next row. // We are advancing column 0 because we forced that column to not contain a ! // when we did the init() sources[0].iter.next(); advancedCursor = true; break; } } } else { while (true) { if (sources[sourceID].iter.hasTop() == false) { currentPartition = null; // setting currentRow to null counts as advancing the cursor return true; } // check if we're past the end key int endCompare = -1; // we should compare the row to the end of the range if (overallRange.getEndKey() != null) { endCompare = overallRange.getEndKey().getRow().compareTo(sources[sourceID].iter.getTopKey().getRow()); if ((!overallRange.isEndKeyInclusive() && endCompare <= 0) || endCompare < 0) { currentPartition = null; // setting currentRow to null counts as advancing the cursor return true; } } int partitionCompare = currentPartition.compareTo(getPartition(sources[sourceID].iter.getTopKey())); // check if this source is already at or beyond currentRow // if not, then seek to at least the current row if (partitionCompare > 0) { // seek to at least the currentRow Key seekKey = buildKey(currentPartition, sources[sourceID].term); sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); continue; } // check if this source has gone beyond currentRow // if so, advance currentRow if (partitionCompare < 0) { currentPartition.set(getPartition(sources[sourceID].iter.getTopKey())); currentDocID.set(emptyByteArray); advancedCursor = true; continue; } // we have verified that the current source is positioned in currentRow // now we must make sure we're in the right columnFamily in the current row // Note: Iterators are auto-magically set to the correct columnFamily if (sources[sourceID].term != null) { int termCompare = sources[sourceID].term.compareTo(getTerm(sources[sourceID].iter.getTopKey())); // check if this source is already on the right columnFamily // if not, then seek forwards to the right columnFamily if (termCompare > 0) { Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID); sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); continue; } // check if this source is beyond the right columnFamily // if so, then seek to the next row if (termCompare < 0) { // we're out of entries in the current row, so seek to the next one // byte[] currentRowBytes = currentRow.getBytes(); // byte[] nextRow = new byte[currentRowBytes.length + 1]; // System.arraycopy(currentRowBytes, 0, nextRow, 0, currentRowBytes.length); // nextRow[currentRowBytes.length] = (byte)0; // // we should reuse text objects here // sources[sourceID].seek(new Key(new Text(nextRow),columnFamilies[sourceID])); if (endCompare == 0) { // we're done currentPartition = null; // setting currentRow to null counts as advancing the cursor return true; } Key seekKey = buildFollowingPartitionKey(sources[sourceID].iter.getTopKey()); sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); continue; } } // we have verified that we are in currentRow and the correct column family // make sure we are at or beyond columnQualifier Text docID = getDocID(sources[sourceID].iter.getTopKey()); int docIDCompare = currentDocID.compareTo(docID); // if this source has advanced beyond the current column qualifier then advance currentCQ and return true if (docIDCompare < 0) { currentDocID.set(docID); advancedCursor = true; break; } // if this source is not yet at the currentCQ then seek in this source if (docIDCompare > 0) { // seek forwards Key seekKey = buildKey(currentPartition, sources[sourceID].term, currentDocID); sources[sourceID].iter.seek(new Range(seekKey, true, null, false), sources[sourceID].seekColfams, true); continue; } // this source is at the current row, in its column family, and at currentCQ break; } } return advancedCursor; } @Override public void next() throws IOException { if (currentPartition == null) { return; } // precondition: the current row is set up and the sources all have the same column qualifier // while we don't have a match, seek in the source with the smallest column qualifier sources[0].iter.next(); advanceToIntersection(); } protected void advanceToIntersection() throws IOException { boolean cursorChanged = true; while (cursorChanged) { // seek all of the sources to at least the highest seen column qualifier in the current row cursorChanged = false; for (int i = 0; i < sourcesCount; i++) { if (currentPartition == null) { topKey = null; return; } if (seekOneSource(i)) { cursorChanged = true; break; } } } topKey = buildKey(currentPartition, nullText, currentDocID); } public static String stringTopKey(SortedKeyValueIterator<Key,Value> iter) { if (iter.hasTop()) return iter.getTopKey().toString(); return ""; } private static final String columnFamiliesOptionName = "columnFamilies"; private static final String notFlagOptionName = "notFlag"; /** * @return encoded columns */ protected static String encodeColumns(Text[] columns) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < columns.length; i++) { sb.append(Base64.getEncoder().encodeToString(TextUtil.getBytes(columns[i]))); sb.append('\n'); } return sb.toString(); } /** * @return encoded flags */ protected static String encodeBooleans(boolean[] flags) { byte[] bytes = new byte[flags.length]; for (int i = 0; i < flags.length; i++) { if (flags[i]) bytes[i] = 1; else bytes[i] = 0; } return Base64.getEncoder().encodeToString(bytes); } protected static Text[] decodeColumns(String columns) { String[] columnStrings = columns.split("\n"); Text[] columnTexts = new Text[columnStrings.length]; for (int i = 0; i < columnStrings.length; i++) { columnTexts[i] = new Text(Base64.getDecoder().decode(columnStrings[i])); } return columnTexts; } /** * @return decoded flags */ protected static boolean[] decodeBooleans(String flags) { // return null of there were no flags if (flags == null) return null; byte[] bytes = Base64.getDecoder().decode(flags); boolean[] bFlags = new boolean[bytes.length]; for (int i = 0; i < bytes.length; i++) { if (bytes[i] == 1) bFlags[i] = true; else bFlags[i] = false; } return bFlags; } @Override public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException { Text[] terms = decodeColumns(options.get(columnFamiliesOptionName)); boolean[] notFlag = decodeBooleans(options.get(notFlagOptionName)); if (terms.length < 1) { throw new IllegalArgumentException("IntersectionIterator requires one or more columns families"); } // Scan the not flags. // There must be at least one term that isn't negated // And we are going to re-order such that the first term is not a ! term if (notFlag == null) { notFlag = new boolean[terms.length]; for (int i = 0; i < terms.length; i++) notFlag[i] = false; } if (notFlag[0]) { for (int i = 1; i < notFlag.length; i++) { if (notFlag[i] == false) { Text swapFamily = new Text(terms[0]); terms[0].set(terms[i]); terms[i].set(swapFamily); notFlag[0] = false; notFlag[i] = true; break; } } if (notFlag[0]) { throw new IllegalArgumentException("IntersectionIterator requires at lest one column family without not"); } } sources = new TermSource[terms.length]; sources[0] = new TermSource(source, terms[0]); for (int i = 1; i < terms.length; i++) { sources[i] = new TermSource(source.deepCopy(env), terms[i], notFlag[i]); } sourcesCount = terms.length; } @Override public void seek(Range range, Collection<ByteSequence> seekColumnFamilies, boolean inclusive) throws IOException { overallRange = new Range(range); currentPartition = new Text(); currentDocID.set(emptyByteArray); // seek each of the sources to the right column family within the row given by key for (int i = 0; i < sourcesCount; i++) { Key sourceKey; if (range.getStartKey() != null) { if (range.getStartKey().getColumnQualifier() != null) { sourceKey = buildKey(getPartition(range.getStartKey()), sources[i].term, range.getStartKey().getColumnQualifier()); } else { sourceKey = buildKey(getPartition(range.getStartKey()), sources[i].term); } // Seek only to the term for this source as a column family sources[i].iter.seek(new Range(sourceKey, true, null, false), sources[i].seekColfams, true); } else { // Seek only to the term for this source as a column family sources[i].iter.seek(range, sources[i].seekColfams, true); } } advanceToIntersection(); } /** * @deprecated since 1.6.0 */ @Deprecated public void addSource(SortedKeyValueIterator<Key,Value> source, IteratorEnvironment env, Text term, boolean notFlag) { // Check if we have space for the added Source if (sources == null) { sources = new TermSource[1]; } else { // allocate space for node, and copy current tree. // TODO: Should we change this to an ArrayList so that we can just add() ? - ACCUMULO-1309 TermSource[] localSources = new TermSource[sources.length + 1]; int currSource = 0; for (TermSource myTerm : sources) { // TODO: Do I need to call new here? or can I just re-use the term? - ACCUMULO-1309 localSources[currSource] = new TermSource(myTerm); currSource++; } sources = localSources; } sources[sourcesCount] = new TermSource(source.deepCopy(env), term, notFlag); sourcesCount++; } /** * Encode the columns to be used when iterating. */ public static void setColumnFamilies(IteratorSetting cfg, Text[] columns) { if (columns.length < 1) throw new IllegalArgumentException("Must supply at least one term to intersect"); cfg.addOption(IntersectingIterator.columnFamiliesOptionName, IntersectingIterator.encodeColumns(columns)); } /** * Encode columns and NOT flags indicating which columns should be negated (docIDs will be excluded if matching negated columns, instead of included). */ public static void setColumnFamilies(IteratorSetting cfg, Text[] columns, boolean[] notFlags) { if (columns.length < 1) throw new IllegalArgumentException("Must supply at least one terms to intersect"); if (columns.length != notFlags.length) throw new IllegalArgumentException("columns and notFlags arrays must be the same length"); setColumnFamilies(cfg, columns); cfg.addOption(IntersectingIterator.notFlagOptionName, IntersectingIterator.encodeBooleans(notFlags)); } }