/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.kvstore.lib;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.kvstore.KeyValueStore;
import org.kiji.mapreduce.kvstore.KeyValueStoreReader;
import org.kiji.mapreduce.kvstore.framework.KeyValueStoreConfiguration;

/**
 * KeyValueStore implementation that reads delimited records from text files.
 *
 * <p>Each line of an input text file is made available as a record in the KeyValueStore.
 * Lines are separated into keys and values by the first instance of the <i>delimiter</i> string.
 * The second (and additional) instances of the delimiter string in the line are part of the
 * value, and are not treated specially.</p>
 *
 * <p>Input lines are read through BufferedReader.readLine(). A line is considered to be
 * terminated by any of a line feed ('\n'), carriage return ('\r'), or a carriage return
 * followed immediately by a linefeed.</p>
 *
 * <p>Line termination characters are removed before data is put in the KeyValueStore.</p>
 *
 * <p>By default, keys and values are tab-delimited. The delimiter may be set by the
 * <tt>withDelimiter()</tt> method of the TextFileKeyValueStore.Builder instance.
 * The delimiter may be multiple characters long.
 * The following rules are applied when breaking lines into keys and values:</p>
 * <ul>
 *   <li>Keys and values are separated by the first instance of the delimiter in the line.
 *       Further instances of the delimiter string are retained as part of the value.</li>
 *   <li>A key followed immediately by a line terminator has a <tt>null</tt> value associated
 *       with it. <tt>containsKey()</tt> will return <tt>true</tt> for this key, and
 *       <tt>get()</tt> will return <tt>null</tt>.</li>
 *   <li>A key followed by the delimiter and then the line terminator has the empty string
 *       (<tt>""</tt>) as its value.</li>
 *   <li>A line that begins with a delimiter has the empty string as its key; the remainder
 *       of the line following the delimiter is the value.</li>
 *   <li><tt>null</tt> is never allowed or possible as a key.</li>
 *   <li>A blank line uses the above rules as follows: the key is the empty string
 *       (<tt>""</tt>), and the value is <tt>null</tt>.</li>
 *   <li>The last line in the file does not need to be newline-terminated; this is treated
 *       like any other line in the file.</li>
 * </ul>
 *
 * <h3>XML Configuration</h3>
 *
 * <p>A kvstores XML file may contain the following properties when specifying the
 * behavior of this class:</p>
 * <ul>
 *   <li><tt>dcache</tt> - True if files should be accessed by jobs through the
 *       DistributedCache.</li>
 *   <li><tt>delim</tt> - The delimiter string that separates keys and values within
 *       a line of text. (Default is a tab character.)</li>
 *   <li><tt>paths</tt> - A comma-separated list of HDFS paths to files backing this store.</li>
 * </ul>
 *
 * <h3>Default Values</h3>
 * <ul>
 *   <li>By default, use of the DistributedCache is enabled.</li>
 *   <li>You must specify the paths to read. It is an error to leave this unconfigured.</li>
 *   <li>Files will be read using a new <tt>Configuration</tt> object if you do not specify
 *       your own.</li>
 *   <li>The delimiter between the key and the value in the line of text defaults to a tab
 *       character. You may specify any other UTF-8 character or sequence of UTF-8 characters.
 *       Note that the MapReduce <tt>TextOutputFormat</tt> emits tab-delimited key-value
 *       pairs.</li>
 * </ul>
 */
@ApiAudience.Public
@ApiStability.Evolving
public final class TextFileKeyValueStore implements Configurable, KeyValueStore<String, String> {

  /** The configuration variable for the delimiter. */
  private static final String CONF_DELIMITER_KEY = "delim";

  /**
   * Records are tab-delimited by default, to be compatible with TextOutputFormat,
   * and function like KeyValueTextInputFormat.
   */
  public static final String DEFAULT_DELIMITER = "\t";

  /** Helper object to manage backing files. */
  private final FileStoreHelper mFileHelper;

  /** The delimiter string to use. */
  private String mDelim;

  /** true if the user has called open(); cannot call initFromConf() after that. */
  private boolean mOpened;

  /**
   * A Builder-pattern class that configures and creates new TextFileKeyValueStore
   * instances. You should use this to specify the input to this KeyValueStore.
   * Call the build() method to return a new, configured TextFileKeyValueStore instance.
   */
  @ApiAudience.Public
  @ApiStability.Evolving
  public static final class Builder {
    /** Builder for the helper that tracks input paths, configuration and dcache use. */
    private FileStoreHelper.Builder mFileBuilder;

    /** The key/value delimiter the built store will use. */
    private String mDelim;

    /**
     * Private, default constructor. Call the builder() method of this KeyValueStore
     * to get a new builder instance.
     */
    private Builder() {
      mFileBuilder = FileStoreHelper.builder();
      mDelim = DEFAULT_DELIMITER;
    }

    /**
     * Specifies the delimiter between the key and the value on a line in the file.
     *
     * @param delim the delimiter string to use. Must be neither null nor empty.
     * @return this Builder instance.
     * @throws IllegalArgumentException if delim is null or the empty string.
     */
    public Builder withDelimiter(String delim) {
      if (null == delim || delim.isEmpty()) {
        throw new IllegalArgumentException("Cannot use empty delimiter");
      }
      mDelim = delim;
      return this;
    }

    /**
     * Sets the Hadoop configuration instance to use.
     *
     * @param conf The configuration.
     * @return This builder instance.
     */
    public Builder withConfiguration(Configuration conf) {
      mFileBuilder.withConfiguration(conf);
      return this;
    }

    /**
     * Adds a path to the list of files to load.
     *
     * @param path The input file/directory path.
     * @return This builder instance.
     */
    public Builder withInputPath(Path path) {
      mFileBuilder.withInputPath(path);
      return this;
    }

    /**
     * Replaces the current list of files to load with the set of files
     * specified as an argument.
     *
     * @param paths The input file/directory paths.
     * @return This builder instance.
     */
    public Builder withInputPaths(List<Path> paths) {
      mFileBuilder.withInputPaths(paths);
      return this;
    }

    /**
     * Sets a flag indicating the use of the DistributedCache to distribute
     * input files.
     *
     * @param enabled true if the DistributedCache should be used, false otherwise.
     * @return This builder instance.
     */
    public Builder withDistributedCache(boolean enabled) {
      mFileBuilder.withDistributedCache(enabled);
      return this;
    }

    /**
     * Build a new TextFileKeyValueStore instance.
     *
     * @return the initialized KeyValueStore.
     */
    public TextFileKeyValueStore build() {
      return new TextFileKeyValueStore(this);
    }
  }

  /**
   * Creates a new TextFileKeyValueStore.Builder instance that can be used
   * to configure and create a new KeyValueStore.
   *
   * @return a new Builder instance.
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Reflection-only constructor. Used only for reflection. You should create and configure
   * TextFileKeyValueStore instances by using a builder; call TextFileKeyValueStore.builder()
   * to get a new builder instance.
   */
  public TextFileKeyValueStore() {
    this(builder());
  }

  /**
   * Main constructor used by the builder; creates a new TextFileKeyValueStore to read text files.
   *
   * @param builder the builder to configure from.
   */
  private TextFileKeyValueStore(Builder builder) {
    mFileHelper = builder.mFileBuilder.build();
    mDelim = builder.mDelim;
    mOpened = false;
  }

  /**
   * {@inheritDoc}
   *
   * @throws IllegalStateException if open() has already been called on this store.
   */
  @Override
  public void setConf(Configuration conf) {
    if (mOpened) {
      // Don't allow mutation after we start using this store for reads.
      throw new IllegalStateException(
          "Cannot set the configuration after a reader has been opened");
    }
    mFileHelper.setConf(conf);
  }

  /**
   * {@inheritDoc}
   *
   * <p>Returns a new Configuration constructed from the file helper's configuration,
   * rather than the helper's own instance.</p>
   */
  @Override
  public Configuration getConf() {
    return new Configuration(mFileHelper.getConf());
  }

  /**
   * {@inheritDoc}
   *
   * <p>Writes the delimiter under the <tt>delim</tt> key and then delegates to the
   * file helper to store its own settings.</p>
   *
   * @throws IOException if the delimiter is null or empty.
   */
  @Override
  public void storeToConf(KeyValueStoreConfiguration conf) throws IOException {
    if (null == mDelim || mDelim.isEmpty()) {
      throw new IOException("Cannot use empty delimiter");
    }
    conf.set(CONF_DELIMITER_KEY, mDelim);
    mFileHelper.storeToConf(conf);
  }

  /**
   * {@inheritDoc}
   *
   * <p>Reads the delimiter from the <tt>delim</tt> key, falling back to
   * {@link #DEFAULT_DELIMITER} if it is not set. The delimiter is not validated here;
   * an empty delimiter is rejected when a reader is opened.</p>
   *
   * @throws IllegalStateException if open() has already been called on this store.
   */
  @Override
  public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    if (mOpened) {
      throw new IllegalStateException("Cannot reinitialize; already opened a reader.");
    }
    mFileHelper.initFromConf(conf);
    mDelim = conf.get(CONF_DELIMITER_KEY, DEFAULT_DELIMITER);
  }

  /**
   * {@inheritDoc}
   *
   * <p>The store is marked as opened before the reader is constructed, so it rejects
   * further setConf()/initFromConf() calls even if constructing the reader throws.</p>
   */
  @Override
  public KeyValueStoreReader<String, String> open() throws IOException {
    mOpened = true;
    return new Reader(mFileHelper.getConf(), mFileHelper.getExpandedInputPaths(), mDelim);
  }

  /**
   * Reads an entire text file of records into memory, and indexes it by the key field.
   * Key and value fields are separated by the first occurrence of the 'delimiter' string
   * in a line. If the delimiter does not exist, then the entire line is taken to be the
   * key and the value is 'null'. (Note that lines that end with the delimiter string
   * will have the non-null empty string as a value.)
   *
   * <p>Lookups for a key <i>K</i> will return the first record in the file where the key field
   * has value <i>K</i>. Where multiple files back the reader, the order in which files
   * are processed is undefined.</p>
   */
  @ApiAudience.Private
  private static final class Reader implements KeyValueStoreReader<String, String> {
    /** A map from keys to values loaded from the input files; null once closed. */
    private Map<String, String> mMap;

    /** The delimiter string. */
    private final String mDelim;

    /**
     * Constructs a key value reader over text file(s). All files are read eagerly and
     * completely into memory by this constructor.
     *
     * @param conf The Hadoop configuration.
     * @param paths The path to the text file(s).
     * @param delim the delimiter string.
     * @throws IOException If the delimiter is null or empty, or the files cannot be read.
     */
    public Reader(Configuration conf, List<Path> paths, String delim) throws IOException {
      if (null == delim || delim.isEmpty()) {
        throw new IOException("Cannot use null/empty delimiter.");
      }

      mDelim = delim;
      mMap = new HashMap<String, String>();

      for (Path path : paths) {
        loadFile(conf, path);
      }
    }

    /**
     * Loads every line of a single file into the lookup map.
     *
     * @param conf The Hadoop configuration used to resolve the path's FileSystem.
     * @param path The file to read, decoded as UTF-8.
     * @throws IOException If the file cannot be opened or read.
     */
    private void loadFile(Configuration conf, Path path) throws IOException {
      FileSystem fs = path.getFileSystem(conf);
      BufferedReader reader = null;
      try {
        reader = new BufferedReader(new InputStreamReader(fs.open(path), "UTF-8"));
        for (String line = reader.readLine(); null != line; line = reader.readLine()) {
          addRecord(line);
        }
      } catch (UnsupportedEncodingException uee) {
        // Can't open as UTF-8? Java is quite broken if we get here
        throw new IOException(uee);
      } finally {
        IOUtils.closeQuietly(reader);
      }
    }

    /**
     * Splits one line into a key and value and records it, unless the key is already present.
     *
     * @param line A single input line with its line terminator already removed.
     */
    private void addRecord(String line) {
      final String key;
      final String val;

      int delimPos = line.indexOf(mDelim);
      if (-1 == delimPos) {
        // No delimiter in the line.
        key = line; // Whole line is the key.
        val = null; // The value is null.
      } else {
        key = line.substring(0, delimPos);
        val = line.substring(delimPos + mDelim.length());
      }

      // The first record for a key wins. This must be a containsKey() test rather than a
      // null check on get(), because null is a legitimate stored value.
      if (!mMap.containsKey(key)) {
        mMap.put(key, val);
      }
    }

    /** {@inheritDoc} */
    @Override
    public boolean isOpen() {
      return null != mMap;
    }

    /**
     * {@inheritDoc}
     *
     * @throws IOException if this reader has been closed.
     */
    @Override
    public String get(String key) throws IOException {
      if (!isOpen()) {
        throw new IOException("Reader is closed");
      }
      return mMap.get(key);
    }

    /**
     * {@inheritDoc}
     *
     * @throws IOException if this reader has been closed.
     */
    @Override
    public boolean containsKey(String key) throws IOException {
      if (!isOpen()) {
        throw new IOException("Reader is closed");
      }
      return mMap.containsKey(key);
    }

    /**
     * {@inheritDoc}
     *
     * <p>Drops the reference to the in-memory map; subsequent lookups throw IOException.</p>
     */
    @Override
    public void close() throws IOException {
      mMap = null;
    }
  }
}