/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.index; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Iterables; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.List; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.HiveInputFormat.HiveInputSplit; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.mapred.FileSplit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public final class SplitFilter { public static final Logger LOG = LoggerFactory.getLogger(SplitFilter.class); private final IndexResult indexResult; private final long maxInputSize; public SplitFilter(IndexResult indexResult, long maxInputSize) { this.indexResult = indexResult; this.maxInputSize = maxInputSize; } public List<HiveInputSplit> filter(HiveInputSplit[] splits) throws IOException { long sumSplitLengths = 0; List<HiveInputSplit> newSplits = new ArrayList<>(); Arrays.sort(splits, new HiveInputSplitComparator()); for (HiveInputSplit split : splits) { LOG.info("split start : " + split.getStart()); LOG.info("split end : " + (split.getStart() + split.getLength())); try { if (indexResult.contains(split)) { HiveInputSplit newSplit = split; if (isAdjustmentRequired(newSplits, split)) { newSplit = adjustSplit(split); } sumSplitLengths += newSplit.getLength(); if (sumSplitLengths > maxInputSize) { String messageTemplate = "Size of data to read during a compact-index-based query " + "exceeded the maximum of %d set in %s"; throw new IOException(String.format(messageTemplate, maxInputSize, HiveConf.ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_SIZE.varname)); } newSplits.add(newSplit); } } catch (HiveException e) { throw new RuntimeException("Unable to get metadata for input table split " + split.getPath(), e); } } LOG.info("Number of input splits: {}, new input splits: {}, sum of split lengths: {}", splits.length, newSplits.size(), sumSplitLengths); return newSplits; } private boolean isAdjustmentRequired(List<HiveInputSplit> newSplits, HiveInputSplit split) { return (split.inputFormatClassName().contains("RCFile") || split.inputFormatClassName().contains("SequenceFile")) && split.getStart() > 0 && !doesOverlap(newSplits, split.getPath(), adjustStart(split.getStart())); } private boolean doesOverlap(List<HiveInputSplit> newSplits, Path path, long start) { if (newSplits.isEmpty()) { return false; } HiveInputSplit lastSplit = Iterables.getLast(newSplits); if (lastSplit.getPath().equals(path)) { return lastSplit.getStart() + lastSplit.getLength() > start; } return false; } private long adjustStart(long start) { return start > SequenceFile.SYNC_INTERVAL ? start - SequenceFile.SYNC_INTERVAL : 0; } private HiveInputSplit adjustSplit(HiveInputSplit split) throws IOException { long adjustedStart = adjustStart(split.getStart()); return new HiveInputSplit(new FileSplit(split.getPath(), adjustedStart, split.getStart() - adjustedStart + split.getLength(), split.getLocations()), split.inputFormatClassName()); } @VisibleForTesting static final class HiveInputSplitComparator implements Comparator<HiveInputSplit> { @Override public int compare(HiveInputSplit o1, HiveInputSplit o2) { int pathCompare = comparePath(o1.getPath(), o2.getPath()); if (pathCompare != 0) { return pathCompare; } return Long.compare(o1.getStart(), o2.getStart()); } private int comparePath(Path p1, Path p2) { return p1.compareTo(p2); } } }