/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.core.client.rfile;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Collection;
import java.util.Map;
import java.util.Map.Entry;
import java.util.function.Predicate;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.admin.TableOperations;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;
import org.apache.accumulo.core.client.summary.Summarizer;
import org.apache.accumulo.core.client.summary.SummarizerConfiguration;
import org.apache.accumulo.core.client.summary.Summary;
import org.apache.accumulo.core.client.summary.Summary.FileStatistics;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
/**
* RFile is Accumulo's internal storage format for Key Value pairs. This class is a Factory that enables creating a {@link Scanner} for reading and a
* {@link RFileWriter} for writing RFiles.
*
* <p>
* The {@link Scanner} created by this class makes it easy to experiment with real data from a live system on a developer's workstation. Also the {@link Scanner}
* can be used to write tools to analyze Accumulo's raw data.
*
* @since 1.8.0
*/
public class RFile {
/**
 * First step of the scanner builder. Selects the required input source(s) that RFile data will be read from.
 *
 * @since 1.8.0
 */
public interface InputArguments {
  /**
   * Selects the RFile sources to read. If more than one source is given, the {@link Scanner} that is eventually built presents a merged view of them.
   *
   * @param inputs
   *          one or more RFiles to read.
   * @return this
   */
  ScannerOptions from(RFileSource... inputs);

  /**
   * Selects the RFiles to read. If more than one file is given, the {@link Scanner} that is eventually built presents a merged view of them.
   *
   * @param files
   *          one or more RFiles to read.
   * @return this
   */
  ScannerFSOptions from(String... files);
}
/**
 * Step of the scanner builder that additionally allows choosing the FileSystem the RFile(s) are read from. Every option of {@link ScannerOptions} is
 * available here as well.
 *
 * @since 1.8.0
 */
public interface ScannerFSOptions extends ScannerOptions {
  /**
   * Supplies the FileSystem used to open the RFiles. This is optional; when it is not called, the FileSystem is constructed using configuration on the
   * classpath.
   *
   * @param fs
   *          use this FileSystem to open files.
   * @return this
   */
  ScannerOptions withFileSystem(FileSystem fs);
}
/**
 * This is an intermediate interface in a larger builder pattern. Supports setting optional parameters for reading RFile(s) and building a scanner over
 * RFile(s).
 *
 * @since 1.8.0
 */
public interface ScannerOptions {
  /**
   * By default the {@link Scanner} created will set up the default Accumulo system iterators. The iterators do things like the following:
   *
   * <ul>
   * <li>Suppress deleted data</li>
   * <li>Filter based on {@link Authorizations}</li>
   * <li>Filter columns specified by functions like {@link Scanner#fetchColumn(Text, Text)} and {@link Scanner#fetchColumnFamily(Text)}</li>
   * </ul>
   *
   * <p>
   * Calling this method will turn off these system iterators and allow reading the raw data in an RFile. When reading the raw data, delete data and delete
   * markers may be seen. Delete markers are {@link Key}s with the delete flag set.
   *
   * <p>
   * Disabling system iterators will cause {@link #withAuthorizations(Authorizations)}, {@link Scanner#fetchColumn(Text, Text)}, and
   * {@link Scanner#fetchColumnFamily(Text)} to throw runtime exceptions.
   *
   * @return this
   */
  ScannerOptions withoutSystemIterators();

  /**
   * The authorizations passed here will be used to filter Keys, from the {@link Scanner}, based on the content of the column visibility field.
   *
   * @param auths
   *          scan with these authorizations
   * @return this
   */
  ScannerOptions withAuthorizations(Authorizations auths);

  /**
   * Enabling this option will cache RFile data in memory. This option is useful when doing lots of random accesses.
   *
   * @param cacheSize
   *          the size of the data cache in bytes.
   * @return this
   */
  ScannerOptions withDataCache(long cacheSize);

  /**
   * Enabling this option will cache RFile indexes in memory. Index data within a RFile is used to find data when seeking to a {@link Key}. This option is
   * useful when doing lots of random accesses.
   *
   * @param cacheSize
   *          the size of the index cache in bytes.
   * @return this
   */
  ScannerOptions withIndexCache(long cacheSize);

  /**
   * This option allows limiting the {@link Scanner} from reading data outside of a given range. A scanner will not see any data outside of this range even
   * if the RFile(s) have data outside the range.
   *
   * @param range
   *          the range the {@link Scanner} is limited to.
   * @return this
   */
  ScannerOptions withBounds(Range range);

  /**
   * Construct the {@link Scanner} with iterators specified in a table's properties. Properties for a table can be obtained by calling
   * {@link TableOperations#getProperties(String)}
   *
   * @param props
   *          iterable over Accumulo table key value properties.
   * @return this
   */
  ScannerOptions withTableProperties(Iterable<Entry<String,String>> props);

  /**
   * Same as {@link #withTableProperties(Iterable)}, but accepts the table properties as a map.
   *
   * @see #withTableProperties(Iterable)
   * @param props
   *          a map instead of an Iterable
   * @return this
   */
  ScannerOptions withTableProperties(Map<String,String> props);

  /**
   * Finishes the builder.
   *
   * @return a Scanner over RFile using the specified options.
   */
  Scanner build();
}
/**
 * Starting point of the builder used to create a {@link Scanner} that reads one or more RFiles.
 *
 * @return the first step of the scanner builder, on which the input sources are chosen.
 */
public static InputArguments newScanner() {
  final InputArguments scannerBuilder = new RFileScannerBuilder();
  return scannerBuilder;
}
/**
 * First step of the summary reading builder. Selects the required input source(s) that RFile summary data will be read from.
 *
 * @since 2.0.0
 */
public interface SummaryInputArguments {
  /**
   * Selects the RFile sources to read. If more than one source is given, their summary data is merged.
   *
   * @param inputs
   *          one or more RFiles to read.
   * @return this
   */
  SummaryOptions from(RFileSource... inputs);

  /**
   * Selects the RFiles to read. If more than one file is given, their summary data is merged.
   *
   * @param files
   *          one or more RFiles to read.
   * @return this
   */
  SummaryFSOptions from(String... files);
}
/**
 * Step of the summary reading builder that additionally allows choosing the FileSystem the RFile summary data is read from. Every option of
 * {@link SummaryOptions} is available here as well.
 *
 * @since 2.0.0
 */
public interface SummaryFSOptions extends SummaryOptions {
  /**
   * Supplies the FileSystem used to open the RFiles. This is optional; when it is not called, the FileSystem is constructed using configuration on the
   * classpath.
   *
   * @param fs
   *          use this FileSystem to open files.
   * @return this
   */
  SummaryOptions withFileSystem(FileSystem fs);
}
/**
 * This is an intermediate interface in a larger builder pattern. Allows setting options for retrieving summary data and then reading it.
 *
 * @since 2.0.0
 */
public interface SummaryOptions {
  /**
   * Restricts reading to a subset of the summary data in a file. When a file holds lots of separate summaries, reading only a subset may be faster.
   *
   * @param summarySelector
   *          Only read summary data that was generated with configuration that this predicate matches.
   * @return this
   */
  SummaryOptions selectSummaries(Predicate<SummarizerConfiguration> summarySelector);

  /**
   * Summary data may possibly be stored at a more granular level than the entire file, although there is no guarantee of this. If the data was stored at a
   * more granular level, then setting a start row will get a subset of the summary data. The subset will very likely be an inaccurate approximation.
   *
   * @param startRow
   *          A non-null start row. The start row is exclusive.
   * @return this
   *
   * @see FileStatistics#getExtra()
   */
  SummaryOptions startRow(Text startRow);

  /**
   * Variant of {@link #startRow(Text)} taking characters.
   *
   * @param startRow
   *          the start row, which is encoded using UTF-8. The start row is exclusive.
   * @return this
   * @see #startRow(Text)
   */
  SummaryOptions startRow(CharSequence startRow);

  /**
   * Summary data may possibly be stored at a more granular level than the entire file, although there is no guarantee of this. If the data was stored at a
   * more granular level, then setting an end row will get a subset of the summary data. The subset will very likely be an inaccurate approximation.
   *
   * @param endRow
   *          A non-null end row. The end row is inclusive.
   * @return this
   *
   * @see FileStatistics#getExtra()
   */
  SummaryOptions endRow(Text endRow);

  /**
   * Variant of {@link #endRow(Text)} taking characters.
   *
   * @param endRow
   *          the end row, which is encoded using UTF-8. The end row is inclusive.
   * @return this
   * @see #endRow(Text)
   */
  SummaryOptions endRow(CharSequence endRow);

  /**
   * Reads summary data from file.
   *
   * @return The summary data in the file that satisfied the selection criteria.
   * @throws IOException
   *           if an I/O error occurs while reading the summary data.
   */
  Collection<Summary> read() throws IOException;
}
/**
 * Starting point of the builder used to read summary data from RFiles.
 *
 * @return the first step of the summary reading builder, on which the input sources are chosen.
 * @since 2.0.0
 */
public static SummaryInputArguments summaries() {
  final SummaryInputArguments summaryRetriever = new RFileSummariesRetriever();
  return summaryRetriever;
}
/**
 * This is an intermediate interface in a larger builder pattern. Supports setting the required output sink to write a RFile to.
 *
 * @since 1.8.0
 */
public interface OutputArguments {
  /**
   * Write the RFile data to a named file.
   *
   * @param filename
   *          name of file to write RFile data
   * @return this
   */
  WriterFSOptions to(String filename);

  /**
   * Write the RFile data to an output stream.
   *
   * @param out
   *          output stream to write RFile data
   * @return this
   */
  WriterOptions to(OutputStream out);
}
/**
 * Step of the writer builder that additionally allows choosing the FileSystem the RFile is written to. Every option of {@link WriterOptions} is available
 * here as well.
 *
 * @since 1.8.0
 */
public interface WriterFSOptions extends WriterOptions {
  /**
   * Supplies the FileSystem used to open the file that the RFile is written to. This is optional; when it is not called, the FileSystem is constructed using
   * configuration on the classpath.
   *
   * @param fs
   *          use this FileSystem to open files.
   * @return this
   */
  WriterOptions withFileSystem(FileSystem fs);
}
/**
 * This is an intermediate interface in a larger builder pattern. Supports setting optional parameters for creating a RFile and building a
 * {@link RFileWriter}.
 *
 * @since 1.8.0
 */
public interface WriterOptions {
  /**
   * Enable generating summary data in the created RFile by running {@link Summarizer}s based on the specified configuration.
   *
   * @param summarizerConf
   *          Configuration for summarizer to run.
   * @return this
   * @since 2.0.0
   */
  WriterOptions withSummarizers(SummarizerConfiguration... summarizerConf);

  /**
   * An option to store sample data in the generated RFile.
   *
   * @param samplerConf
   *          configuration to use when generating sample data.
   * @throws IllegalArgumentException
   *           if table properties were previously specified and the table properties also specify a sampler.
   * @return this
   */
  WriterOptions withSampler(SamplerConfiguration samplerConf);

  /**
   * Create an RFile using the same configuration as an Accumulo table. Properties for a table can be obtained by calling
   * {@link TableOperations#getProperties(String)}
   *
   * @param props
   *          iterable over Accumulo table key value properties.
   * @throws IllegalArgumentException
   *           if sampler was previously specified and the table properties also specify a sampler.
   * @return this
   */
  WriterOptions withTableProperties(Iterable<Entry<String,String>> props);

  /**
   * Same as {@link #withTableProperties(Iterable)}, but accepts the table properties as a map.
   *
   * @see #withTableProperties(Iterable)
   * @param props
   *          a map instead of an Iterable
   * @return this
   */
  WriterOptions withTableProperties(Map<String,String> props);

  /**
   * Set the maximum size of the cache of previously seen visibilities.
   *
   * @param maxSize
   *          As keys are added to an RFile the visibility field is validated. Validating the visibility field requires parsing it. In order to make
   *          validation faster, previously seen visibilities are cached. This option allows setting the maximum size of this cache.
   * @return this
   */
  WriterOptions withVisibilityCacheSize(int maxSize);

  /**
   * Finishes the builder.
   *
   * @return a new {@link RFileWriter} created with the options previously specified.
   * @throws IOException
   *           if an I/O error occurs while building the writer.
   */
  RFileWriter build() throws IOException;
}
/**
 * Starting point of the builder used to create a new RFile writer.
 *
 * @return the first step of the writer builder, on which the output sink is chosen.
 */
public static OutputArguments newWriter() {
  final OutputArguments writerBuilder = new RFileWriterBuilder();
  return writerBuilder;
}
}