/*
* Cloud9: A MapReduce Library for Hadoop
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package edu.umd.cloud9.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import tl.lin.data.pair.PairOfWritables;
/**
 * Class containing a number of utility methods for manipulating SequenceFiles.
 *
 * <p>Every method materializes its complete result in memory, so callers reading large
 * files should pass a {@code max} bound. The overloads that do not take a
 * {@link FileSystem} obtain one from a freshly constructed {@link Configuration}.
 */
public class SequenceFileUtils {
  private static final TupleFactory TUPLE_FACTORY = TupleFactory.getInstance();

  private SequenceFileUtils() {}

  /**
   * Reads all key-value pairs from a SequenceFile on the default file system.
   *
   * @param path path to file
   * @return list of key-value pairs
   * @throws IOException if the file system cannot be obtained or the file cannot be read
   */
  public static <K extends Writable, V extends Writable> List<PairOfWritables<K, V>> readFile(Path path)
      throws IOException {
    return readFile(path, FileSystem.get(new Configuration()), Integer.MAX_VALUE);
  }

  /**
   * Reads key-value pairs from a SequenceFile on the default file system, up to a maximum
   * number.
   *
   * @param path path to file
   * @param max maximum of key-value pairs to read
   * @return list of key-value pairs
   * @throws IOException if the file system cannot be obtained or the file cannot be read
   */
  public static <K extends Writable, V extends Writable>
      List<PairOfWritables<K, V>> readFile(Path path, int max) throws IOException {
    return readFile(path, FileSystem.get(new Configuration()), max);
  }

  /**
   * Reads all key-value pairs from a SequenceFile.
   *
   * @param path path to file
   * @param fs file system holding the file
   * @return list of key-value pairs
   * @throws IOException if the file cannot be read
   */
  public static <K extends Writable, V extends Writable>
      List<PairOfWritables<K, V>> readFile(Path path, FileSystem fs) throws IOException {
    return readFile(path, fs, Integer.MAX_VALUE);
  }

  /**
   * Reads key-value pairs from a SequenceFile, up to a maximum number.
   *
   * <p>A fresh key and value object is allocated for every record, so the returned pairs
   * do not share state. Keys or values whose recorded class is a Pig {@link Tuple} are
   * created through the {@link TupleFactory}; all other classes are instantiated
   * reflectively through their no-argument constructor.
   *
   * @param path path to file
   * @param fs file system holding the file
   * @param max maximum of key-value pairs to read; a value of zero or less yields an
   *     empty list
   * @return list of key-value pairs
   * @throws IOException if the file cannot be opened, read or closed
   * @throws RuntimeException if a key or value object cannot be instantiated
   */
  public static <K extends Writable, V extends Writable>
      List<PairOfWritables<K, V>> readFile(Path path, FileSystem fs, int max) throws IOException {
    List<PairOfWritables<K, V>> list = new ArrayList<PairOfWritables<K, V>>();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
    try {
      Class<?> keyClass = reader.getKeyClass();
      Class<?> valueClass = reader.getValueClass();

      while (list.size() < max) {
        // New objects per record: the reader fills the instances it is handed, and the
        // list keeps references to them.
        K key = SequenceFileUtils.<K>newWritable(keyClass);
        V value = SequenceFileUtils.<V>newWritable(valueClass);
        if (!reader.next(key, value)) {
          break;
        }
        list.add(new PairOfWritables<K, V>(key, value));
      }
    } catch (IllegalAccessException e) {
      throw new RuntimeException("Error reading SequenceFile: " + e, e);
    } catch (InstantiationException e) {
      throw new RuntimeException("Error reading SequenceFile: " + e, e);
    } finally {
      // Release the underlying stream on every exit path, not only on success.
      reader.close();
    }

    return list;
  }

  /**
   * Reads all key-value pairs from a SequenceFile on the default file system into a
   * sorted map.
   *
   * @param path path to file
   * @return map from keys to values, ordered by the keys' natural ordering
   * @throws IOException if the file system cannot be obtained or the file cannot be read
   */
  public static <K extends Writable, V extends Writable>
      SortedMap<K, V> readFileIntoMap(Path path) throws IOException {
    return readFileIntoMap(path, FileSystem.get(new Configuration()), Integer.MAX_VALUE);
  }

  /**
   * Reads key-value pairs from a SequenceFile on the default file system into a sorted
   * map, up to a maximum number of pairs.
   *
   * @param path path to file
   * @param max maximum of key-value pairs to read
   * @return map from keys to values, ordered by the keys' natural ordering
   * @throws IOException if the file system cannot be obtained or the file cannot be read
   */
  public static <K extends Writable, V extends Writable> SortedMap<K, V>
      readFileIntoMap(Path path, int max) throws IOException {
    return readFileIntoMap(path, FileSystem.get(new Configuration()), max);
  }

  /**
   * Reads all key-value pairs from a SequenceFile into a sorted map.
   *
   * @param path path to file
   * @param fs file system holding the file
   * @return map from keys to values, ordered by the keys' natural ordering
   * @throws IOException if the file cannot be read
   */
  public static <K extends Writable, V extends Writable> SortedMap<K, V>
      readFileIntoMap(Path path, FileSystem fs) throws IOException {
    return readFileIntoMap(path, fs, Integer.MAX_VALUE);
  }

  /**
   * Reads key-value pairs from a SequenceFile into a sorted map, up to a maximum number
   * of pairs.
   *
   * <p>The map is a {@link TreeMap} using natural ordering, so keys must be mutually
   * comparable. When the same key occurs more than once, the value read last wins.
   *
   * @param path path to file
   * @param fs file system holding the file
   * @param max maximum of key-value pairs to read from the file (before duplicate keys
   *     are collapsed)
   * @return map from keys to values, ordered by the keys' natural ordering
   * @throws IOException if the file cannot be read
   */
  public static <K extends Writable, V extends Writable> SortedMap<K, V>
      readFileIntoMap(Path path, FileSystem fs, int max) throws IOException {
    SortedMap<K, V> map = new TreeMap<K, V>();

    for (PairOfWritables<K, V> pair : SequenceFileUtils.<K, V>readFile(path, fs, max)) {
      map.put(pair.getLeftElement(), pair.getRightElement());
    }

    return map;
  }

  /**
   * Reads all key-value pairs from every SequenceFile in a directory on the default file
   * system.
   *
   * @param path path to directory
   * @return list of key-value pairs
   * @throws RuntimeException if the file system cannot be obtained or read
   */
  public static <K extends Writable, V extends Writable> List<PairOfWritables<K, V>> readDirectory(Path path) {
    return readDirectory(path, defaultFileSystem(), Integer.MAX_VALUE);
  }

  /**
   * Reads key-value pairs from a directory containing SequenceFiles. A
   * maximum number of key-value pairs is read from each SequenceFile.
   *
   * <p>Directory entries whose name starts with an underscore are skipped; every other
   * entry is read as a SequenceFile.
   *
   * @param path path to directory
   * @param fs file system holding the directory
   * @param max maximum of key-value pairs to read per file
   * @return list of key-value pairs
   * @throws RuntimeException if listing the directory or reading one of its files fails
   */
  public static <K extends Writable, V extends Writable> List<PairOfWritables<K, V>> readDirectory(Path path, FileSystem fs, int max) {
    List<PairOfWritables<K, V>> list = new ArrayList<PairOfWritables<K, V>>();

    try {
      FileStatus[] stat = fs.listStatus(path);
      for (int i = 0; i < stat.length; ++i) {
        // Skip entries such as the '_log' directory: anything prefixed with '_'.
        if (stat[i].getPath().getName().startsWith("_")) {
          continue;
        }

        List<PairOfWritables<K, V>> pairs = readFile(stat[i].getPath(), fs, max);
        list.addAll(pairs);
      }
    } catch (IOException e) {
      throw new RuntimeException("Error reading the file system!", e);
    }

    return list;
  }

  /**
   * Reads all keys from a SequenceFile on the default file system.
   *
   * @param path path to file
   * @return list of keys
   * @throws RuntimeException if the file system cannot be obtained or the file cannot be
   *     read
   */
  public static <K extends Writable> List<K> readKeys(Path path) {
    return readKeys(path, defaultFileSystem(), Integer.MAX_VALUE);
  }

  /**
   * Reads keys from a SequenceFile on the default file system, up to a maximum number.
   *
   * @param path path to file
   * @param max maximum of keys to read
   * @return list of keys
   * @throws RuntimeException if the file system cannot be obtained or the file cannot be
   *     read
   */
  public static <K extends Writable> List<K> readKeys(Path path, int max) {
    return readKeys(path, defaultFileSystem(), max);
  }

  /**
   * Reads all keys from a SequenceFile.
   *
   * @param path path to file
   * @param fs file system holding the file
   * @return list of keys
   * @throws RuntimeException if the file cannot be read
   */
  public static <K extends Writable> List<K> readKeys(Path path, FileSystem fs) {
    return readKeys(path, fs, Integer.MAX_VALUE);
  }

  /**
   * Reads keys from a SequenceFile, up to a maximum number.
   *
   * <p>A fresh key object is allocated per record; a single value object is reused as a
   * scratch buffer because values are discarded. Both are instantiated reflectively
   * through their no-argument constructor.
   *
   * @param path path to file
   * @param fs file system holding the file
   * @param max maximum of keys to read; a value of zero or less yields an empty list
   * @return list of keys
   * @throws RuntimeException if anything goes wrong while opening, reading or closing
   *     the file; the original exception is attached as the cause
   */
  @SuppressWarnings("unchecked")
  public static <K extends Writable> List<K> readKeys(Path path, FileSystem fs, int max) {
    List<K> list = new ArrayList<K>();

    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
      try {
        Writable value = (Writable) reader.getValueClass().newInstance();

        while (list.size() < max) {
          K key = (K) reader.getKeyClass().newInstance();
          if (!reader.next(key, value)) {
            break;
          }
          list.add(key);
        }
      } finally {
        // Release the underlying stream on every exit path, not only on success.
        reader.close();
      }
    } catch (Exception e) {
      throw new RuntimeException("Error reading SequenceFile " + path, e);
    }

    return list;
  }

  /**
   * Reads all values from a SequenceFile on the default file system.
   *
   * @param path path to file
   * @return list of values
   * @throws RuntimeException if the file system cannot be obtained or the file cannot be
   *     read
   */
  public static <V extends Writable> List<V> readValues(Path path) {
    return readValues(path, defaultFileSystem(), Integer.MAX_VALUE);
  }

  /**
   * Reads values from a SequenceFile on the default file system, up to a maximum number.
   *
   * @param path path to file
   * @param max maximum of values to read
   * @return list of values
   * @throws RuntimeException if the file system cannot be obtained or the file cannot be
   *     read
   */
  public static <V extends Writable> List<V> readValues(Path path, int max) {
    return readValues(path, defaultFileSystem(), max);
  }

  /**
   * Reads all values from a SequenceFile.
   *
   * @param path path to file
   * @param fs file system holding the file
   * @return list of values
   * @throws RuntimeException if the file cannot be read
   */
  public static <V extends Writable> List<V> readValues(Path path, FileSystem fs) {
    return readValues(path, fs, Integer.MAX_VALUE);
  }

  /**
   * Reads values from a SequenceFile, up to a maximum number.
   *
   * <p>A fresh value object is allocated per record; a single key object is reused as a
   * scratch buffer because keys are discarded. Both are instantiated reflectively
   * through their no-argument constructor.
   *
   * @param path path to file
   * @param fs file system holding the file
   * @param max maximum of values to read; a value of zero or less yields an empty list
   * @return list of values
   * @throws RuntimeException if anything goes wrong while opening, reading or closing
   *     the file; the original exception is attached as the cause
   */
  @SuppressWarnings("unchecked")
  public static <V extends Writable> List<V> readValues(Path path, FileSystem fs, int max) {
    List<V> list = new ArrayList<V>();

    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
      try {
        Writable key = (Writable) reader.getKeyClass().newInstance();

        while (list.size() < max) {
          V value = (V) reader.getValueClass().newInstance();
          if (!reader.next(key, value)) {
            break;
          }
          list.add(value);
        }
      } finally {
        // Release the underlying stream on every exit path, not only on success.
        reader.close();
      }
    } catch (Exception e) {
      throw new RuntimeException("Error reading SequenceFile " + path, e);
    }

    return list;
  }

  /** Obtains the file system for a default configuration, rethrowing failures unchecked. */
  private static FileSystem defaultFileSystem() {
    try {
      return FileSystem.get(new Configuration());
    } catch (IOException e) {
      throw new RuntimeException("Unable to access the file system!", e);
    }
  }

  /** Creates an empty key or value: Pig tuples via the factory, anything else reflectively. */
  @SuppressWarnings("unchecked")
  private static <T extends Writable> T newWritable(Class<?> cls)
      throws InstantiationException, IllegalAccessException {
    if (Tuple.class.isAssignableFrom(cls)) {
      return (T) TUPLE_FACTORY.newTuple();
    }
    return (T) cls.newInstance();
  }
}