/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.core.client.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.ScannerBase;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.util.Pair;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * This class holds a batch scan configuration for a table. It contains all the properties needed to specify how rows should be returned from the table.
 */
public class InputTableConfig implements Writable {

  private List<IteratorSetting> iterators;
  private List<Range> ranges;
  private Collection<Pair<Text,Text>> columns;

  private boolean autoAdjustRanges = true;
  private boolean useLocalIterators = false;
  private boolean useIsolatedScanners = false;
  private boolean offlineScan = false;
  private SamplerConfiguration samplerConfig = null;

  public InputTableConfig() {}

  /**
   * Creates a batch scan config object out of a previously serialized batch scan config object.
   *
   * @param input
   *          the data input of the serialized batch scan config
   */
  public InputTableConfig(DataInput input) throws IOException {
    readFields(input);
  }

  /**
   * Sets the input ranges to scan for this table.
   *
   * @param ranges
   *          the ranges that will be mapped over
   * @since 1.6.0
   */
  public InputTableConfig setRanges(List<Range> ranges) {
    this.ranges = ranges;
    return this;
  }

  /**
   * Returns the ranges to be queried in the configuration
   */
  public List<Range> getRanges() {
    return ranges != null ? ranges : new ArrayList<>();
  }

  /**
   * Restricts the columns that will be mapped over for this job for the default input table.
   *
   * @param columns
   *          a pair of {@link Text} objects corresponding to column family and column qualifier. If the column qualifier is null, the entire column family is
   *          selected. An empty set is the default and is equivalent to scanning all columns.
   * @since 1.6.0
   */
  public InputTableConfig fetchColumns(Collection<Pair<Text,Text>> columns) {
    this.columns = columns;
    return this;
  }

  /**
   * Returns the columns to be fetched for this configuration
   */
  public Collection<Pair<Text,Text>> getFetchedColumns() {
    return columns != null ? columns : new HashSet<>();
  }
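  // A minimal usage sketch for the setters above (illustrative only, not part of this class):
  // restricting a scan to one row range and one column family. The row keys and the "cf" family
  // name are assumed example values, and java.util.Collections would need to be imported.
  //
  //   InputTableConfig config = new InputTableConfig();
  //   config.setRanges(Collections.singletonList(new Range("a-row", "z-row")));
  //   config.fetchColumns(Collections.singleton(new Pair<Text,Text>(new Text("cf"), null)));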
  /**
   * Sets the iterators to be used when scanning this table.
   *
   * @param iterators
   *          the configurations for the iterators
   * @since 1.6.0
   */
  public InputTableConfig setIterators(List<IteratorSetting> iterators) {
    this.iterators = iterators;
    return this;
  }

  /**
   * Returns the iterators to be set on this configuration
   */
  public List<IteratorSetting> getIterators() {
    return iterators != null ? iterators : new ArrayList<>();
  }

  /**
   * Controls the automatic adjustment of ranges for this job. This feature merges overlapping ranges, then splits them to align with tablet boundaries.
   * Disabling this feature will cause exactly one Map task to be created for each specified range.
   *
   * <p>
   * By default, this feature is <b>enabled</b>.
   *
   * @param autoAdjustRanges
   *          the feature is enabled if true, disabled otherwise
   * @see #setRanges(java.util.List)
   * @since 1.6.0
   */
  public InputTableConfig setAutoAdjustRanges(boolean autoAdjustRanges) {
    this.autoAdjustRanges = autoAdjustRanges;
    return this;
  }

  /**
   * Determines whether a configuration has auto-adjust ranges enabled.
   *
   * @return false if the feature is disabled, true otherwise
   * @since 1.6.0
   * @see #setAutoAdjustRanges(boolean)
   */
  public boolean shouldAutoAdjustRanges() {
    return autoAdjustRanges;
  }

  /**
   * Controls the use of the {@link org.apache.accumulo.core.client.ClientSideIteratorScanner} in this job. Enabling this feature will cause the iterator stack
   * to be constructed within the Map task, rather than within the Accumulo TServer. To use this feature, all classes needed for those iterators must be
   * available on the classpath for the task.
   *
   * <p>
   * By default, this feature is <b>disabled</b>.
   *
   * @param useLocalIterators
   *          the feature is enabled if true, disabled otherwise
   * @since 1.6.0
   */
  public InputTableConfig setUseLocalIterators(boolean useLocalIterators) {
    this.useLocalIterators = useLocalIterators;
    return this;
  }

  /**
   * Determines whether a configuration uses local iterators.
   *
   * @return true if the feature is enabled, false otherwise
   * @since 1.6.0
   * @see #setUseLocalIterators(boolean)
   */
  public boolean shouldUseLocalIterators() {
    return useLocalIterators;
  }
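  // A sketch of attaching a server-side iterator through setIterators (illustrative; the
  // priority 20, the name "vers", and the choice of VersioningIterator are assumed example
  // values, and java.util.Collections would need to be imported):
  //
  //   IteratorSetting versioning = new IteratorSetting(20, "vers",
  //       "org.apache.accumulo.core.iterators.user.VersioningIterator");
  //   config.setIterators(Collections.singletonList(versioning));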
  /**
   * Enables reading offline tables. By default, this feature is disabled and only online tables are scanned. This will make the map reduce job directly read
   * the table's files. If the table is not offline, then the job will fail. If the table comes online during the map reduce job, it is likely that the job
   * will fail.
   *
   * <p>
   * To use this option, the map reduce user will need access to read the Accumulo directory in HDFS.
   *
   * <p>
   * Reading the offline table will create the scan-time iterator stack in the map process, so any iterators that are configured for the table will need to be
   * on the mapper's classpath. The accumulo-site.xml may need to be on the mapper's classpath if HDFS or the Accumulo directory in HDFS are non-standard.
   *
   * <p>
   * One way to use this feature is to clone a table, take the clone offline, and use the clone as the input table for a map reduce job. If you plan to map
   * reduce over the data many times, it may be better to compact the table, clone it, take it offline, and use the clone for all map reduce jobs. The reason
   * to do this is that compaction will reduce each tablet in the table to one file, and it is faster to read from one file.
   *
   * <p>
   * There are two possible advantages to reading a table's files directly out of HDFS. First, you may see better read performance. Second, it will support
   * speculative execution better. When reading an online table, speculative execution can put more load on an already slow tablet server.
   *
   * <p>
   * By default, this feature is <b>disabled</b>.
   *
   * @param offlineScan
   *          the feature is enabled if true, disabled otherwise
   * @since 1.6.0
   */
  public InputTableConfig setOfflineScan(boolean offlineScan) {
    this.offlineScan = offlineScan;
    return this;
  }

  /**
   * Determines whether a configuration has the offline table scan feature enabled.
   *
   * @return true if the feature is enabled, false otherwise
   * @since 1.6.0
   * @see #setOfflineScan(boolean)
   */
  public boolean isOfflineScan() {
    return offlineScan;
  }

  /**
   * Controls the use of the {@link org.apache.accumulo.core.client.IsolatedScanner} in this job.
   *
   * <p>
   * By default, this feature is <b>disabled</b>.
   *
   * @param useIsolatedScanners
   *          the feature is enabled if true, disabled otherwise
   * @since 1.6.0
   */
  public InputTableConfig setUseIsolatedScanners(boolean useIsolatedScanners) {
    this.useIsolatedScanners = useIsolatedScanners;
    return this;
  }

  /**
   * Determines whether a configuration has isolation enabled.
   *
   * @return true if the feature is enabled, false otherwise
   * @since 1.6.0
   * @see #setUseIsolatedScanners(boolean)
   */
  public boolean shouldUseIsolatedScanners() {
    return useIsolatedScanners;
  }

  /**
   * Sets the sampler configuration to use when reading from the table.
   *
   * @see ScannerBase#setSamplerConfiguration(SamplerConfiguration)
   * @see InputFormatBase#setSamplerConfiguration(org.apache.hadoop.mapreduce.Job, SamplerConfiguration)
   *
   * @since 1.8.0
   */
  public void setSamplerConfiguration(SamplerConfiguration samplerConfiguration) {
    this.samplerConfig = samplerConfiguration;
  }

  /**
   * Returns the sampler configuration for this configuration, or null if no sampler was set.
   *
   * @since 1.8.0
   */
  public SamplerConfiguration getSamplerConfiguration() {
    return samplerConfig;
  }

  @Override
  public void write(DataOutput dataOutput) throws IOException {
    if (iterators != null) {
      dataOutput.writeInt(iterators.size());
      for (IteratorSetting setting : iterators)
        setting.write(dataOutput);
    } else {
      dataOutput.writeInt(0);
    }
    if (ranges != null) {
      dataOutput.writeInt(ranges.size());
      for (Range range : ranges)
        range.write(dataOutput);
    } else {
      dataOutput.writeInt(0);
    }
    if (columns != null) {
      dataOutput.writeInt(columns.size());
      for (Pair<Text,Text> column : columns) {
        if (column.getSecond() == null) {
          dataOutput.writeInt(1);
          column.getFirst().write(dataOutput);
        } else {
          dataOutput.writeInt(2);
          column.getFirst().write(dataOutput);
          column.getSecond().write(dataOutput);
        }
      }
    } else {
      dataOutput.writeInt(0);
    }
    dataOutput.writeBoolean(autoAdjustRanges);
    dataOutput.writeBoolean(useLocalIterators);
    dataOutput.writeBoolean(useIsolatedScanners);
    dataOutput.writeBoolean(offlineScan);
    // serialize the sampler configuration, if present: a presence flag, the sampler class name,
    // and the option map (equals() and hashCode() consider this field, so it must round-trip)
    if (samplerConfig == null) {
      dataOutput.writeBoolean(false);
    } else {
      dataOutput.writeBoolean(true);
      dataOutput.writeUTF(samplerConfig.getSamplerClassName());
      Map<String,String> options = samplerConfig.getOptions();
      dataOutput.writeInt(options.size());
      for (Map.Entry<String,String> option : options.entrySet()) {
        dataOutput.writeUTF(option.getKey());
        dataOutput.writeUTF(option.getValue());
      }
    }
  }
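  // A round-trip sketch exercising write() above and readFields() below via the DataInput
  // constructor (illustrative only; the java.io stream classes used here would need imports):
  //
  //   ByteArrayOutputStream baos = new ByteArrayOutputStream();
  //   config.write(new DataOutputStream(baos));
  //   InputTableConfig copy = new InputTableConfig(
  //       new DataInputStream(new ByteArrayInputStream(baos.toByteArray())));
  //   assert config.equals(copy);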
  @Override
  public void readFields(DataInput dataInput) throws IOException {
    // load iterators
    int iterSize = dataInput.readInt();
    if (iterSize > 0)
      iterators = new ArrayList<>();
    for (int i = 0; i < iterSize; i++)
      iterators.add(new IteratorSetting(dataInput));
    // load ranges
    int rangeSize = dataInput.readInt();
    if (rangeSize > 0)
      ranges = new ArrayList<>();
    for (int i = 0; i < rangeSize; i++) {
      Range range = new Range();
      range.readFields(dataInput);
      ranges.add(range);
    }
    // load columns
    int columnSize = dataInput.readInt();
    if (columnSize > 0)
      columns = new HashSet<>();
    for (int i = 0; i < columnSize; i++) {
      int numPairs = dataInput.readInt();
      Text colFam = new Text();
      colFam.readFields(dataInput);
      if (numPairs == 1) {
        columns.add(new Pair<Text,Text>(colFam, null));
      } else if (numPairs == 2) {
        Text colQual = new Text();
        colQual.readFields(dataInput);
        columns.add(new Pair<>(colFam, colQual));
      }
    }
    autoAdjustRanges = dataInput.readBoolean();
    useLocalIterators = dataInput.readBoolean();
    useIsolatedScanners = dataInput.readBoolean();
    offlineScan = dataInput.readBoolean();
    // load the sampler configuration, if one was serialized (mirrors write())
    if (dataInput.readBoolean()) {
      samplerConfig = new SamplerConfiguration(dataInput.readUTF());
      int numOptions = dataInput.readInt();
      for (int i = 0; i < numOptions; i++)
        samplerConfig.addOption(dataInput.readUTF(), dataInput.readUTF());
    }
  }

  @Override
  public boolean equals(Object o) {
    if (this == o)
      return true;
    if (o == null || getClass() != o.getClass())
      return false;

    InputTableConfig that = (InputTableConfig) o;

    if (autoAdjustRanges != that.autoAdjustRanges)
      return false;
    if (offlineScan != that.offlineScan)
      return false;
    if (useIsolatedScanners != that.useIsolatedScanners)
      return false;
    if (useLocalIterators != that.useLocalIterators)
      return false;
    if (columns != null ? !columns.equals(that.columns) : that.columns != null)
      return false;
    if (iterators != null ? !iterators.equals(that.iterators) : that.iterators != null)
      return false;
    if (ranges != null ? !ranges.equals(that.ranges) : that.ranges != null)
      return false;
    if (samplerConfig != null ? !samplerConfig.equals(that.samplerConfig) : that.samplerConfig != null)
      return false;
    return true;
  }

  @Override
  public int hashCode() {
    int result = 31 * (iterators != null ? iterators.hashCode() : 0);
    result = 31 * result + (ranges != null ? ranges.hashCode() : 0);
    result = 31 * result + (columns != null ? columns.hashCode() : 0);
    result = 31 * result + (autoAdjustRanges ? 1 : 0);
    result = 31 * result + (useLocalIterators ? 1 : 0);
    result = 31 * result + (useIsolatedScanners ? 1 : 0);
    result = 31 * result + (offlineScan ? 1 : 0);
    result = 31 * result + (samplerConfig == null ? 0 : samplerConfig.hashCode());
    return result;
  }
}
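// An end-to-end sketch of how this class is typically consumed (illustrative only: it assumes
// AccumuloMultiTableInputFormat from this package, example table names, and a previously
// configured org.apache.hadoop.mapreduce.Job named "job"):
//
//   Map<String,InputTableConfig> configs = new HashMap<>();
//   configs.put("events", new InputTableConfig().setAutoAdjustRanges(true));
//   configs.put("archive", new InputTableConfig().setOfflineScan(true));
//   AccumuloMultiTableInputFormat.setInputTableConfigs(job, configs);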