/*
* Licensed to Think Big Analytics, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Think Big Analytics, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2011 Think Big Analytics. All Rights Reserved.
*/
package tap;
import tap.compression.Compressions;
import tap.core.InfeasiblePlanException;
import tap.core.TapContext;
import tap.core.io.BinaryKey;
import tap.core.mapreduce.input.TapfileRecordReader;
import tap.core.mapreduce.io.BinaryWritable;
import tap.formats.*;
import tap.util.ObjectFactory;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Iterator;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
/**
 * A typed handle on a data location used by phases.
 *
 * <p>Depending on how it was constructed, a Pipe is used in one of three ways
 * visible in this class:
 * <ul>
 * <li>as a description of a DFS path (format, compression, prototype) that is
 * applied to a {@link JobConf} through {@link #setupInput(JobConf)} and
 * {@link #setupOutput(JobConf)};</li>
 * <li>as an input iterator, backed either by an {@code Iterator<AvroValue<T>>}
 * or, after {@link #setRecordReader(TapfileRecordReader)}, by a
 * {@link TapfileRecordReader};</li>
 * <li>as an output sink that forwards {@link #put(Object)} to a
 * {@link TapContext}.</li>
 * </ul>
 *
 * <p>Two pipes are equal when their paths are equal; no other state takes part
 * in {@link #equals(Object)} or {@link #hashCode()}.
 *
 * @param <T> type of the records flowing through the pipe
 */
@SuppressWarnings("deprecation")
public class Pipe<T> implements Iterable<T>, Iterator<T> {

    /** Number of leading bytes handed to the format signature checks. */
    private static final int HEADER_SNIFF_BYTES = 1000;

    private TapContext<T> context; // for OutPipe
    private Iterator<AvroValue<T>> values; // for InPipe
    Formats format = Formats.UNKNOWN_FORMAT;
    private Phase producer;
    protected String path;
    protected T prototype;
    String uncompressedPath;
    protected Compressions compression = null;
    protected boolean isCompressed = false;
    boolean isTempfile = false;
    // Lazily created by stat(); reset whenever the path changes or is deleted.
    private DFSStat stat;

    // to support subscribe API
    boolean isDirectFileAccess = false;
    TapfileRecordReader recordReader = null;
    BinaryKey binaryKey = null;
    BinaryWritable writable = null;

    // Pipe's reference to the job configuration
    Configuration conf = null;

    /**
     * Creates a pipe whose prototype is a new instance of {@code ofClass}.
     *
     * @param ofClass class to instantiate through {@link ObjectFactory}
     * @return a new pipe carrying the instantiated prototype
     * @throws RuntimeException wrapping any exception raised during instantiation
     */
    public static <T> Pipe<T> of(Class<? extends T> ofClass) {
        try {
            return new Pipe<T>(ObjectFactory.newInstance(ofClass));
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Creates a pipe around an existing prototype object.
     *
     * @param prototype sample object describing the record type
     * @return a new pipe carrying {@code prototype}
     */
    public static <T> Pipe<T> of(T prototype) {
        return new Pipe<T>(prototype);
    }

    /**
     * Creates a pipe from a prototype. Deprecated; the static {@code of}
     * factories construct pipes the same way.
     *
     * @param prototype sample object describing the record type; ignored when null
     */
    @Deprecated
    public Pipe(T prototype) {
        this.setPrototype(prototype);
    }

    /**
     * Creates a pipe bound to {@code path} and derives compression and format
     * from it via {@link #init()}.
     *
     * @param path location of the data
     */
    Pipe(String path) {
        setPath(path);
        init();
    }

    /**
     * Temporary inter-phase pipe
     *
     * @param isTemporary whether this pipe is a temporary file
     */
    Pipe(boolean isTemporary) {
        isTempfile = isTemporary;
    }

    /**
     * InPipe type constructor Reducer In pipe
     *
     * @param values iterator supplying the wrapped input values
     */
    public Pipe(Iterator<AvroValue<T>> values) {
        this.values = values;
    }

    /**
     * Setup job configuration (and cache it) on the Pipe
     *
     * @param conf the configuration to cache; must not be null
     * @throws IllegalArgumentException if {@code conf} is null
     */
    void setConf(Configuration conf) {
        if (null == conf) {
            throw new IllegalArgumentException("Please don't give us a null Configuration object");
        }
        this.conf = conf;
    }

    /**
     * @return The job configuration, or null if none has been set
     */
    Configuration getConf() {
        return conf;
    }

    /**
     * Generate and return DFS file stat info. The result is cached until the
     * path changes or {@link #clearAndPrepareOutput()} removes the path.
     *
     * @return The file status
     */
    DFSStat stat() {
        if (null == stat) {
            this.stat = new DFSStat(path, getConf());
        }
        return stat;
    }

    /**
     * Probe the file system to determine if this.path exists.
     *
     * @return the {@code exists} flag of {@link #stat()}
     */
    boolean exists() {
        return stat().exists;
    }

    /**
     * Determine if file(s) in path are obsolete. Used in generating a work
     * plan.
     *
     * <p>The path is considered current (not obsolete) as soon as one listed
     * entry is either a plain file acceptable for the pipe's format, or a
     * directory other than {@code _logs} / {@code _temporary}.
     *
     * @return True if obsolete
     */
    boolean isObsolete() {
        // this needs to be smart - we should encode in the file metadata
        // the dependents and their dates used
        // so we can verify that any existing antecedent is not newer and
        // declare victory...
        if (stat().exists) {
            for (FileStatus status : stat().getStatuses()) {
                String child = status.getPath().toString();
                if (!status.isDir()) {
                    // TODO add other types?
                    // Only Avro pipes are filtered by extension; any file counts otherwise.
                    if (getFormat() != Formats.AVRO_FORMAT || child.endsWith(".avro")) {
                        return false;
                    }
                } else if (!child.endsWith("/_logs") && !child.endsWith("/_temporary")) {
                    return false;
                }
            }
        }
        return true; // needs more work!
    }

    /**
     * Switches the pipe to direct file access (subscribe API): subsequent
     * {@link #hasNext()} / {@link #next()} calls are served by {@code reader}.
     *
     * @param reader record reader to pull records from
     */
    public void setRecordReader(TapfileRecordReader reader) {
        this.recordReader = reader;
        isDirectFileAccess = true;
        binaryKey = reader.createKey();
        writable = reader.createValue();
    }

    /**
     * Check the output location and remove it.
     *
     * <p>If the path exists it must not contain child directories other than
     * {@code _logs} and {@code _temporary}. If it does not exist it is created
     * first. In both cases the path is then deleted recursively.
     *
     * @throws IllegalArgumentException if the path contains other child directories
     * @throws RuntimeException wrapping any {@link IOException} from the file system
     */
    void clearAndPrepareOutput() {
        try {
            DFSStat current = stat();
            if (current.exists) {
                for (FileStatus status : current.getStatuses()) {
                    if (!status.isDir()) {
                        continue;
                    }
                    String child = status.getPath().toString();
                    if (!child.endsWith("/_logs") && !child.endsWith("/_temporary")) {
                        throw new IllegalArgumentException(
                                "Trying to overwrite directory with child directories: "
                                        + path);
                    }
                }
            } else {
                current.fs.mkdirs(current.dfsPath);
            }
            current.fs.delete(current.dfsPath, true);
            // The cached status described the path before deletion; drop it so
            // the next stat() call re-probes the file system.
            this.stat = null;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Make determination of this (input) pipe is valid. This test is only
     * useful during early binding.
     *
     * @return true for temporary pipes, wildcard paths, existing files, and
     *         existing directories
     */
    boolean isValidInput() {
        return isTempfile
                || hasWildcard()
                || isFile()
                || isSingleDir();
    }

    /**
     * @return true if the path exists and {@link #stat()} reports it as a file
     */
    boolean isFile() {
        return stat().exists && stat().isFile;
    }

    /**
     * @return true if the path exists, is not a file, and has no sub-directories
     *         (sub-directory detection is currently a stub, see {@link #hasSubdirs()})
     */
    boolean isSingleDir() {
        return stat().exists && !stat().isFile && !hasSubdirs();
    }

    /**
     * @return always false for now
     */
    boolean hasSubdirs() {
        return false; //TODO: Implement logic here if performance is acceptable
    }

    /**
     * @return true if the path contains one of the characters {@code *}, {@code ?} or {@code [}
     */
    boolean hasWildcard() {
        return path.contains("*")
                || path.contains("?")
                || path.contains("[");
    }

    /**
     * Removes the data at this pipe's path; delegates to
     * {@link #clearAndPrepareOutput()} and shares its restrictions.
     */
    public void delete() {
        clearAndPrepareOutput();
    }

    /**
     * Sets the prototype to an empty {@link String}.
     *
     * @return this
     */
    @SuppressWarnings("unchecked")
    public Pipe<T> stringFormat() {
        setPrototype((T) new String());
        return this;
    }

    /**
     * Selects {@link Formats#JSON_FORMAT}.
     *
     * @return this
     */
    public Pipe<T> jsonFormat() {
        this.setFormat(Formats.JSON_FORMAT);
        return this;
    }

    /**
     * Selects {@link Formats#AVRO_FORMAT}.
     *
     * @return this
     */
    public Pipe<T> avroFormat() {
        this.setFormat(Formats.AVRO_FORMAT);
        return this;
    }

    /**
     * Selects {@link Formats#TAPPROTO_FORMAT}.
     *
     * @return this
     */
    public Pipe<T> protoFormat() {
        this.setFormat(Formats.TAPPROTO_FORMAT);
        return this;
    }

    // Compression Methods

    /**
     * Enables compression using {@link Compressions#GZIP_COMPRESSION}.
     *
     * @return this
     */
    public Pipe<T> gzipCompression() {
        this.setCompression(Compressions.GZIP_COMPRESSION);
        return this;
    }

    /**
     * @return the configured compression, or null if none has been set
     */
    public Compressions getCompression() {
        return compression;
    }

    /**
     * Applies this pipe's format (and compression, when enabled) to {@code conf}
     * as job output settings.
     *
     * @param conf job configuration to update
     */
    void setupOutput(JobConf conf) {
        getFormat().getFileFormat().setupOutput(conf,
                getPrototype() == null ? null : getPrototype().getClass());
        if (this.isCompressed) {
            getCompression().getCompression().setupOutput(conf);
        }
    }

    /**
     * Applies this pipe's format (and compression, when enabled) to {@code conf}
     * as job input settings.
     *
     * @param conf job configuration to update
     */
    public void setupInput(JobConf conf) {
        getFormat().getFileFormat().setupInput(conf,
                getPrototype() == null ? null : getPrototype().getClass());
        if (this.isCompressed) {
            getCompression().getCompression().setupInput(conf);
        }
    }

    /**
     * Return timestamp of @path
     *
     * @return the timestamp reported by {@link #stat()}
     */
    public long getTimestamp() {
        return stat().timestamp;
    }

    /**
     * Files at the same location are deemed equal, however Pipe needs to warn
     * if there are inconsistencies.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        Pipe<?> other = (Pipe<?>) obj;
        if (path == null) {
            return other.path == null;
        }
        return path.equals(other.path);
    }

    /**
     * Hash derived from the path only, consistent with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result
                + ((path == null) ? 0x123c67ce : path.hashCode());
        return result;
    }

    /**
     * Turn on/off the Pipe's compression
     *
     * @param isCompressed
     *            true if compression is to be used
     * @return this
     */
    public Pipe<T> compressed(boolean isCompressed) {
        this.isCompressed = isCompressed;
        return this;
    }

    /**
     * @return whether another record is available from the record reader
     *         (direct file access) or from the value iterator
     */
    @Override
    public boolean hasNext() {
        if (isDirectFileAccess) {
            return recordReader.hasNext();
        }
        return this.values.hasNext();
    }

    /**
     * @return this pipe itself; the pipe is its own iterator
     */
    @Override
    public Iterator<T> iterator() {
        return this;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Get next object of Type <T> from @Pipe
     *
     * @return Object value
     * @throws RuntimeException wrapping an {@link IOException} raised by the
     *         record reader in direct file access mode
     */
    @Override
    @SuppressWarnings("unchecked")
    public T next() {
        if (isDirectFileAccess) {
            try {
                recordReader.next(binaryKey, writable);
                return (T) writable.get(); //is there anyway to get the type?
            } catch (IOException e) {
                // A failed read must not be reported as a null record.
                throw new RuntimeException("Failed to read next record from " + path, e);
            }
        }
        return this.values.next().datum();
    }

    /**
     * Alias for next()
     *
     * @return The next value in the Iterator
     */
    public T get() {
        return this.next();
    }

    /**
     * @return the context
     */
    public TapContext<T> getContext() {
        return context;
    }

    /**
     * @param context
     *            The context to set
     */
    public void setContext(TapContext<T> context) {
        this.context = context;
    }

    /**
     * Put value @out into output
     *
     * @param out
     *            The value to put
     */
    public void put(T out) {
        this.context.write(out);
    }

    /**
     * Put value @out into the output identified by {@code multiName}
     *
     * @param out
     *            The value to put
     * @param multiName
     *            name passed through to the context's write call
     */
    public void put(T out, String multiName) {
        this.context.write(out, multiName);
    }

    /**
     * @return The phase that produces this file.
     */
    public Phase getProducer() {
        return producer;
    }

    /**
     * @param producer the phase that produces this file
     */
    public void setProducer(Phase producer) {
        this.producer = producer;
    }

    /**
     * @return the path this pipe points at, possibly null
     */
    public String getPath() {
        return path;
    }

    /**
     * Points this pipe at {@code path}. Format and compression are not
     * re-derived by this call.
     *
     * @param path new location
     * @return this
     */
    public Pipe<T> at(String path) {
        setPath(path);
        return this;
    }

    @Override
    public String toString() {
        return path + ":" + super.toString();
    }

    /**
     * @return the prototype object, possibly null
     */
    public T getPrototype() {
        return prototype;
    }

    /**
     * Determines the format of the file at this pipe's path by inspecting its
     * leading bytes.
     *
     * @return the first format whose signature matches, or
     *         {@link Formats#UNKNOWN_FORMAT}
     * @throws IllegalArgumentException if the path is not an existing file
     * @throws FileNotFoundException declared for callers; raised by the file system
     * @throws IOException if the file cannot be read
     */
    public Formats getPipeFormat() throws FileNotFoundException, IOException, IllegalArgumentException {
        if (!isFile()) {
            throw new IllegalArgumentException("Pipe is not associated with a file, so has no format");
        }
        return sniffFileFormat(new Path(path));
    }

    /**
     * Reads the message class from the file at this pipe's path using
     * {@link TapfileRecordReader#readMessageClass}.
     *
     * @param job configuration used to access the file
     * @return the class reported by the record reader
     * @throws IllegalArgumentException if no path has been set
     * @throws IOException if the file cannot be read
     */
    public Class readPipeClassFromFile(Configuration job) throws IOException {
        if (path == null) {
            throw new IllegalArgumentException("specify file or directory for mapper before setting prototype");
        }
        return TapfileRecordReader.readMessageClass(job, new Path(path));
    }

    /**
     * Sets the prototype for a mapper input after checking it against the
     * data found at the path.
     *
     * <p>For a directory, the first listed entry is inspected (the second one
     * if the first is named {@code _SUCCESS}); an empty directory, or one that
     * holds only {@code _SUCCESS}, accepts the prototype without checks. The
     * sniffed file format replaces the pipe's current format.
     *
     * @param prototype sample object describing the expected record type
     * @throws IllegalArgumentException if no path has been set
     * @throws InfeasiblePlanException if the path cannot be listed or read, or
     *         the prototype does not fit the sniffed format
     */
    public void setPrototypeForMapperInput(T prototype) throws InfeasiblePlanException {
        if (path == null) {
            throw new IllegalArgumentException("specify file or directory for mapper before setting prototype");
        }
        Path p = new Path(path);
        if (isSingleDir()) {
            FileStatus[] status;
            try {
                // Resolve the file system from the path itself (as sniffFileFormat
                // does) so paths outside the default file system also work.
                status = p.getFileSystem(getConf()).listStatus(p);
            } catch (IOException ioexception) {
                throw new InfeasiblePlanException("invalid path");
            }
            if (status == null || status.length == 0) { //directory is empty
                this.prototype = prototype;
                return;
            }
            //read the first file in directory
            p = status[0].getPath();
            if ("_SUCCESS".equals(p.getName())) {
                if (status.length == 1) { //directory is empty except for _SUCCESS file
                    this.prototype = prototype;
                    return;
                }
                p = status[1].getPath();
            }
        }

        Formats sniffedFileFormat;
        try {
            sniffedFileFormat = sniffFileFormat(p);
        } catch (Exception e) {
            throw new InfeasiblePlanException(e.getMessage());
        }
        if (format != sniffedFileFormat) {
            //alert user?
            //override file extension
            format = sniffedFileFormat;
        }
        if (!format.getFileFormat().instanceOfCheck(prototype)) {
            throw new InfeasiblePlanException("Pipe prototype and file type are not compatible");
        }
        //can do additional checking i.e., read file to make sure it matches it file extension, make sure it contains the correct objects.
        this.prototype = prototype;
    }

    //NB this duplicates code found in TapfileRecodReader. Need to re-factor.
    /**
     * Reads up to {@link #HEADER_SNIFF_BYTES} leading bytes of {@code path} and
     * matches them against the known format signatures. Bytes beyond the end
     * of a shorter file are left as zero.
     *
     * @param path file to inspect
     * @return the matching format, or {@link Formats#UNKNOWN_FORMAT}
     * @throws IOException if the file cannot be opened or read
     */
    private Formats sniffFileFormat(Path path) throws IOException, FileNotFoundException {
        byte[] header = new byte[HEADER_SNIFF_BYTES];
        FileSystem fs = path.getFileSystem(this.getConf());
        FSDataInputStream in = fs.open(path);
        try {
            // A single read() may return fewer bytes than requested, so keep
            // reading until the buffer is full or the stream ends.
            int filled = 0;
            while (filled < header.length) {
                int count = in.read(header, filled, header.length - filled);
                if (count <= 0) {
                    break;
                }
                filled += count;
            }
        } finally {
            in.close();
        }
        return determineFileFormat(header);
    }

    /**
     * @param header leading bytes of a file
     * @return the first format whose signature accepts {@code header}, or
     *         {@link Formats#UNKNOWN_FORMAT}
     */
    private Formats determineFileFormat(byte[] header) {
        for (Formats candidate : Formats.values()) {
            if (candidate.getFileFormat().signature(header)) {
                return candidate;
            }
        }
        return Formats.UNKNOWN_FORMAT;
    }

    /**
     * Sets the prototype and re-derives compression and format via
     * {@link #init()}. A null prototype is ignored.
     *
     * @param prototype sample object describing the record type
     */
    public void setPrototype(T prototype) {
        if (prototype == null) {
            return;
        }
        this.prototype = prototype;
        init();
    }

    /**
     * @return the pipe's current format
     */
    public Formats getFormat() {
        return format;
    }

    /**
     * @param format the format to use
     */
    public void setFormat(Formats format) {
        this.format = format;
    }

    /**
     * @return the path with a trailing {@code .gz} removed, as computed by
     *         {@link #determineCompression()}; null until that has run with a
     *         non-null path
     */
    public String getUncompressedPath() {
        return uncompressedPath;
    }

    protected void setUncompressedPath(String uncompressedPath) {
        this.uncompressedPath = uncompressedPath;
    }

    /**
     * Changes the path and discards the cached file status, which described
     * the previous path.
     *
     * @param path new location
     */
    protected void setPath(String path) {
        this.path = path;
        this.stat = null;
    }

    /**
     * @return the first format whose file format reports a match for this
     *         pipe, or {@link Formats#UNKNOWN_FORMAT}
     */
    protected Formats determineFormat() {
        for (Formats f : Formats.values()) {
            FileFormat fileFormat = f.getFileFormat();
            if (fileFormat.matches(this)) {
                return f;
            }
        }
        return Formats.UNKNOWN_FORMAT;
    }

    /**
     * Based on path name, determine if file is compressed
     */
    protected void determineCompression() {
        if (null == path) {
            return;
        }
        if (this.path.endsWith(Compressions.GZIP_COMPRESSION.fileExtension())) {
            this.isCompressed = true;
            setCompression(Compressions.GZIP_COMPRESSION);
            // The dot is escaped so that only a literal ".gz" suffix is removed.
            this.setUncompressedPath(this.path.replaceAll("\\.gz$", ""));
        } else {
            this.setUncompressedPath(path);
        }
    }

    /**
     * Sets the compression and marks the pipe as compressed.
     *
     * @param compression compression to use
     * @return this
     */
    public Pipe<T> setCompression(Compressions compression) {
        this.isCompressed = true;
        this.compression = compression;
        return this;
    }

    /**
     * @return whether compression is enabled for this pipe
     */
    public boolean isCompressed() {
        return isCompressed;
    }

    /**
     * Derives compression from the path and then the format via
     * {@link #determineFormat()}.
     */
    protected void init() {
        determineCompression();
        setFormat(determineFormat());
    }
}