/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.io; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil; import org.apache.hadoop.hive.ql.io.IOContext.Comparison; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; /** This class prepares an IOContext, and provides the ability to perform a binary search on the * data. The binary search can be used by setting the value of inputFormatSorted in the * MapreduceWork to true, but it should only be used if the data is going to a FilterOperator, * which filters by comparing a value in the data with a constant, using one of the comparisons * =, <, >, <=, >=. If the RecordReader's underlying format is an RCFile, this object can perform * a binary search to find the block to begin reading from, and stop reading once it can be * determined no other entries will match the filter. */ public abstract class HiveContextAwareRecordReader<K, V> implements RecordReader<K, V> { private static final Log LOG = LogFactory.getLog(HiveContextAwareRecordReader.class.getName()); private boolean initDone = false; private long rangeStart; private long rangeEnd; private long splitEnd; private long previousPosition = -1; private boolean wasUsingSortedSearch = false; private String genericUDFClassName = null; private final List<Comparison> stopComparisons = new ArrayList<Comparison>(); protected RecordReader recordReader; protected JobConf jobConf; protected boolean isSorted = false; public HiveContextAwareRecordReader(JobConf conf) throws IOException { this(null, conf); } public HiveContextAwareRecordReader(RecordReader recordReader) { this.recordReader = recordReader; } public HiveContextAwareRecordReader(RecordReader recordReader, JobConf conf) throws IOException { this.recordReader = recordReader; this.jobConf = conf; } public void setRecordReader(RecordReader recordReader) { this.recordReader = recordReader; } /** * Close this {@link InputSplit} to future operations. * * @throws IOException */ public abstract void doClose() throws IOException; private IOContext ioCxtRef = null; @Override public void close() throws IOException { doClose(); initDone = false; ioCxtRef = null; } @Override public boolean next(K key, V value) throws IOException { if(!initDone) { throw new IOException("Hive IOContext is not inited."); } updateIOContext(); try { return doNext(key, value); } catch (IOException e) { ioCxtRef.setIOExceptions(true); throw e; } } protected void updateIOContext() throws IOException { long pointerPos = this.getPos(); if (!ioCxtRef.isBlockPointer) { ioCxtRef.currentBlockStart = pointerPos; ioCxtRef.currentRow = 0; return; } ioCxtRef.currentRow++; if (ioCxtRef.nextBlockStart == -1) { ioCxtRef.nextBlockStart = pointerPos; ioCxtRef.currentRow = 0; } if (pointerPos != ioCxtRef.nextBlockStart) { // the reader pointer has moved to the end of next block, or the end of // current record. ioCxtRef.currentRow = 0; if (ioCxtRef.currentBlockStart == ioCxtRef.nextBlockStart) { ioCxtRef.currentRow = 1; } ioCxtRef.currentBlockStart = ioCxtRef.nextBlockStart; ioCxtRef.nextBlockStart = pointerPos; } } public IOContext getIOContext() { return IOContext.get(); } public void initIOContext(long startPos, boolean isBlockPointer, String inputFile) { ioCxtRef = this.getIOContext(); ioCxtRef.currentBlockStart = startPos; ioCxtRef.isBlockPointer = isBlockPointer; ioCxtRef.inputFile = inputFile; LOG.info("Processing file " + inputFile); initDone = true; } public void initIOContext(FileSplit split, JobConf job, Class inputFormatClass) throws IOException { this.initIOContext(split, job, inputFormatClass, null); } public void initIOContext(FileSplit split, JobConf job, Class inputFormatClass, RecordReader recordReader) throws IOException { boolean blockPointer = false; long blockStart = -1; FileSplit fileSplit = (FileSplit) split; Path path = fileSplit.getPath(); FileSystem fs = path.getFileSystem(job); if (inputFormatClass.getName().contains("SequenceFile")) { SequenceFile.Reader in = new SequenceFile.Reader(fs, path, job); blockPointer = in.isBlockCompressed(); in.sync(fileSplit.getStart()); blockStart = in.getPosition(); in.close(); } else if (recordReader instanceof RCFileRecordReader) { blockPointer = true; blockStart = ((RCFileRecordReader) recordReader).getStart(); } else if (inputFormatClass.getName().contains("RCFile")) { blockPointer = true; RCFile.Reader in = new RCFile.Reader(fs, path, job); in.sync(fileSplit.getStart()); blockStart = in.getPosition(); in.close(); } this.initIOContext(blockStart, blockPointer, path.makeQualified(fs).toString()); this.initIOContextSortedProps(split, recordReader, job); } public void initIOContextSortedProps(FileSplit split, RecordReader recordReader, JobConf job) { this.getIOContext().resetSortingValues(); this.isSorted = jobConf.getBoolean("hive.input.format.sorted", false); this.rangeStart = split.getStart(); this.rangeEnd = split.getStart() + split.getLength(); this.splitEnd = rangeEnd; if (recordReader instanceof RCFileRecordReader && rangeEnd != 0 && this.isSorted) { // Binary search only works if we know the size of the split, and the recordReader is an // RCFileRecordReader this.getIOContext().setUseSorted(true); this.getIOContext().setIsBinarySearching(true); this.wasUsingSortedSearch = true; } else { // Use the defalut methods for next in the child class this.isSorted = false; } } @Override public float getProgress() throws IOException { if (this.getIOContext().isBinarySearching()) { return 0; } else { return recordReader.getProgress(); } } public boolean doNext(K key, V value) throws IOException { if (this.isSorted) { if (this.getIOContext().shouldEndBinarySearch() || (!this.getIOContext().useSorted() && this.wasUsingSortedSearch)) { beginLinearSearch(); this.wasUsingSortedSearch = false; this.getIOContext().setEndBinarySearch(false); } if (this.getIOContext().useSorted()) { if (this.genericUDFClassName == null && this.getIOContext().getGenericUDFClassName() != null) { setGenericUDFClassName(this.getIOContext().getGenericUDFClassName()); } if (this.getIOContext().isBinarySearching()) { // Proceed with a binary search if (this.getIOContext().getComparison() != null) { switch (this.getIOContext().getComparison()) { case GREATER: case EQUAL: // Indexes have only one entry per value, could go linear from here, if we want to // use this for any sorted table, we'll need to continue the search rangeEnd = previousPosition; break; case LESS: rangeStart = previousPosition; break; default: break; } } long position = (rangeStart + rangeEnd) / 2; sync(position); long newPosition = getSyncedPosition(); // If the newPosition is the same as the previousPosition, we've reached the end of the // binary search, if the new position at least as big as the size of the split, any // matching rows must be in the final block, so we can end the binary search. if (newPosition == previousPosition || newPosition >= splitEnd) { this.getIOContext().setIsBinarySearching(false); sync(rangeStart); } previousPosition = newPosition; } else if (foundAllTargets()) { // Found all possible rows which will not be filtered return false; } } } try { return recordReader.next(key, value); } catch (Exception e) { return HiveIOExceptionHandlerUtil.handleRecordReaderNextException(e, jobConf); } } private void sync(long position) throws IOException { ((RCFileRecordReader)recordReader).sync(position); ((RCFileRecordReader)recordReader).resetBuffer(); } private long getSyncedPosition() throws IOException { return recordReader.getPos(); } /** * This uses the name of the generic UDF being used by the filter to determine whether we should * perform a binary search, and what the comparisons we should use to signal the end of the * linear scan are. * @param genericUDFClassName * @throws IOException */ private void setGenericUDFClassName(String genericUDFClassName) throws IOException { this.genericUDFClassName = genericUDFClassName; if (genericUDFClassName.equals(GenericUDFOPEqual.class.getName())) { stopComparisons.add(Comparison.GREATER); } else if (genericUDFClassName.equals(GenericUDFOPLessThan.class.getName())) { stopComparisons.add(Comparison.EQUAL); stopComparisons.add(Comparison.GREATER); if (this.getIOContext().isBinarySearching()) { beginLinearSearch(); } } else if (genericUDFClassName.equals(GenericUDFOPEqualOrLessThan.class.getName())) { stopComparisons.add(Comparison.GREATER); if (this.getIOContext().isBinarySearching()) { beginLinearSearch(); } } else if (genericUDFClassName.equals(GenericUDFOPGreaterThan.class.getName()) || genericUDFClassName.equals(GenericUDFOPEqualOrGreaterThan.class.getName())) { // Do nothing } else { // This is an unsupported operator LOG.debug(genericUDFClassName + " is not the name of a supported class. " + "Continuing linearly."); if (this.getIOContext().isBinarySearching()) { beginLinearSearch(); } } } /** * This should be called after the binary search is finished and before the linear scan begins * @throws IOException */ private void beginLinearSearch() throws IOException { sync(rangeStart); this.getIOContext().setIsBinarySearching(false); this.wasUsingSortedSearch = false; } /** * Returns true if the current comparison is in the list of stop comparisons, i.e. we've found * all records which won't be filtered * @return true if the current comparison is found */ public boolean foundAllTargets() { if (this.getIOContext().getComparison() == null || !stopComparisons.contains(this.getIOContext().getComparison())) { return false; } return true; } }