package com.scaleunlimited.cascading.local;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.tap.CompositeTap;
import cascading.tap.SinkMode;
import cascading.tap.local.FileTap;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryChainIterator;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.TupleEntrySchemeCollector;
import cascading.util.Util;
/**
 * A DirectoryTap is a Cascading local tap that represents a directory of files, similar to Lfs in the
 * Hadoop platform.
 *
 * <p>When the path is a directory, every regular file directly inside it is exposed as a child
 * {@link FileTap}, except files whose name starts with "." (e.g. {@code .part-00000.crc}) and the
 * {@code _SUCCESS} marker. When the path is a single file, that file is the only child tap.</p>
 */
@SuppressWarnings("serial")
public class DirectoryTap extends FileTap implements CompositeTap<FileTap> {

    /**
     * Adapts a {@link TupleEntryIterator} to an {@code Iterator<Tuple>} so it can be chained.
     *
     * <p>Implements {@link Closeable} so that whoever owns the adapter can release the wrapped
     * iterator. NOTE(review): this relies on {@link TupleEntryChainIterator} closing chained
     * iterators that are {@code Closeable} - confirm against the Cascading version in use.</p>
     */
    private static class TupleIterator implements Iterator<Tuple>, Closeable {

        private final TupleEntryIterator iterator;

        private TupleIterator(TupleEntryIterator iterator) {
            this.iterator = iterator;
        }

        @Override
        public boolean hasNext() {
            return iterator.hasNext();
        }

        @Override
        public Tuple next() {
            return iterator.next().getTuple();
        }

        @Override
        public void remove() {
            iterator.remove();
        }

        @Override
        public void close() throws IOException {
            iterator.close();
        }
    }

    // Lazily built list of child taps; reset to null whenever this tap deletes or replaces the path.
    private transient List<FileTap> _taps;

    /**
     * Creates a new DirectoryTap instance using the given {@link cascading.scheme.Scheme} and
     * directory (or file) {@code path}.
     *
     * @param scheme
     *            of type LocalScheme
     * @param path
     *            of type String
     */
    public DirectoryTap(Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String path) {
        super(scheme, path);
    }

    /**
     * Creates a new DirectoryTap instance using the given {@link cascading.scheme.Scheme},
     * directory (or file) {@code path}, and {@code SinkMode}.
     *
     * @param scheme
     *            of type LocalScheme
     * @param path
     *            of type String
     * @param sinkMode
     *            of type SinkMode
     */
    public DirectoryTap(Scheme<Properties, InputStream, OutputStream, ?, ?> scheme, String path, SinkMode sinkMode) {
        super(scheme, path, sinkMode);
    }

    /**
     * Returns the path of this tap, which is the same value as its identifier.
     *
     * @return the path passed to the constructor
     */
    public String getPath() {
        return getIdentifier();
    }

    /**
     * Opens every child tap for reading and returns one iterator that chains over all of them.
     *
     * @param flowProcess
     *            the flow process handed to each child tap
     * @param input
     *            currently ignored (see TODO below)
     * @return an iterator over the tuples of all child taps, in child-tap order
     * @throws IOException
     *             if any child tap cannot be opened; child iterators opened before the failure are closed
     */
    @SuppressWarnings("unchecked")
    @Override
    public TupleEntryIterator openForRead(FlowProcess<Properties> flowProcess, InputStream input) throws IOException {
        // TODO what to do about input? Why does MultiSourceTap check for input != null, and return first tap's TEI?
        List<FileTap> taps = getTaps();
        List<TupleIterator> iterators = new ArrayList<TupleIterator>(taps.size());

        boolean allOpened = false;
        try {
            for (FileTap tap : taps) {
                iterators.add(new TupleIterator(tap.openForRead(flowProcess)));
            }
            allOpened = true;
        } finally {
            if (!allOpened) {
                // Don't leak the readers that were opened before the failing tap.
                for (TupleIterator opened : iterators) {
                    try {
                        opened.close();
                    } catch (IOException ignored) {
                        // Best effort cleanup; the original failure is the one that propagates.
                    }
                }
            }
        }

        return new TupleEntryChainIterator(getSourceFields(), iterators.toArray(new Iterator[iterators.size()]));
    }

    /**
     * Returns the (cached) list of child taps for the current path, building it on first use.
     *
     * @return one FileTap per data file, sorted by path when the path is a directory
     * @throws IllegalArgumentException
     *             if the path doesn't exist, or is neither a directory nor a file
     * @throws IllegalStateException
     *             if the directory contents cannot be listed
     */
    private List<FileTap> getTaps() {
        if (_taps == null) {
            File dir = new File(getPath());
            List<FileTap> result = new ArrayList<FileTap>();

            if (!dir.exists()) {
                throw new IllegalArgumentException("Path provided doesn't exist: " + getPath());
            } else if (dir.isDirectory()) {
                File[] files = dir.listFiles();
                if (files == null) {
                    // listFiles() returns null on I/O error (or if the directory vanished).
                    throw new IllegalStateException("Unable to list files in directory: " + getPath());
                }

                // listFiles() makes no ordering guarantee; sort so part files are read in a stable order.
                Arrays.sort(files);

                for (File file : files) {
                    // Ignore .xxx files, like .part-00000.crc
                    if (file.isFile() && (!file.getName().startsWith(".")) && (!file.getName().equals("_SUCCESS"))) {
                        result.add(new FileTap(getScheme(), file.getAbsolutePath()));
                    }
                }
            } else if (dir.isFile()) {
                result.add(new FileTap(getScheme(), dir.getAbsolutePath()));
            } else {
                throw new IllegalArgumentException("Path provided isn't a directory or a file: " + getPath());
            }

            _taps = result;
        }

        return _taps;
    }

    /**
     * Deletes the given file or directory tree if it exists.
     *
     * @param path
     *            the file or directory to remove
     * @return true if the path didn't exist or was removed, false if a plain file couldn't be deleted
     * @throws IOException
     *             if a directory cannot be deleted
     */
    private static boolean deletePath(File path) throws IOException {
        if (!path.exists()) {
            return true;
        }

        if (path.isDirectory()) {
            FileUtils.deleteDirectory(path);
            return true;
        }

        return path.delete();
    }

    /**
     * Opens this tap for writing.
     *
     * <p>When no {@code output} stream is supplied, a {@code DirectoryFileOutputStream} for
     * "part-00000" is created; if the sink mode is {@link SinkMode#REPLACE}, whatever currently
     * exists at the path (file or directory) is deleted first.</p>
     *
     * @param flowProcess
     *            the flow process passed to the collector
     * @param output
     *            the stream to write to, or null to have one created
     * @return a collector that writes tuples using this tap's scheme
     * @throws IOException
     *             if the existing path cannot be deleted in REPLACE mode, or the output cannot be opened
     */
    @Override
    public TupleEntryCollector openForWrite(FlowProcess<Properties> flowProcess, OutputStream output) throws IOException {
        if (output == null) {
            if (getSinkMode() == SinkMode.REPLACE) {
                if (!deletePath(new File(getPath()))) {
                    throw new IOException("Unable to delete existing path: " + getPath());
                }
            }

            // The directory contents are about to change, so any cached child taps are stale.
            _taps = null;
            output = new DirectoryFileOutputStream(this, "part-00000", isUpdate());
        }

        return new TupleEntrySchemeCollector<Properties, OutputStream>(flowProcess, getScheme(), output, getPath());
    }

    /**
     * Method getSize returns the total size of all child taps of this tap.
     *
     * @param conf
     *            of type Properties
     * @return The sum of the sizes reported by each child tap.
     * @throws IOException
     */
    public long getSize(Properties conf) throws IOException {
        long totalSize = 0;
        for (FileTap tap : getTaps()) {
            totalSize += tap.getSize(conf);
        }
        return totalSize;
    }

    /**
     * Ensures the path exists, creating it (and any missing parents) as a directory if needed.
     *
     * @return true if the path already exists or the directory was created
     */
    @Override
    public boolean createResource(Properties conf) throws IOException {
        File dir = new File(getPath());
        return dir.exists() || dir.mkdirs();
    }

    /**
     * Deletes whatever exists at the path, whether it is a directory tree or a single file.
     *
     * @return true if nothing is left at the path, false if a plain file couldn't be deleted
     * @throws IOException
     *             if a directory cannot be deleted
     */
    @Override
    public boolean deleteResource(Properties conf) throws IOException {
        // Cached child taps would point at files that no longer exist.
        _taps = null;
        return deletePath(new File(getPath()));
    }

    /**
     * Nothing needs to be committed for this tap.
     *
     * @return always true
     */
    @Override
    public boolean commitResource(Properties conf) throws IOException {
        return true;
    }

    /**
     * Checks whether the path exists and, for a directory, whether every child tap's resource exists.
     *
     * @return false if the path is missing or any child tap reports its resource as missing
     */
    @Override
    public boolean resourceExists(Properties conf) throws IOException {
        // First see if the actual path exists, and is a directory.
        File pathDir = new File(getPath());
        if (!pathDir.exists()) {
            return false;
        } else if (pathDir.isFile()) {
            return true;
        }

        // Now we have to check for what's inside of the pathDir.
        for (FileTap tap : getTaps()) {
            if (!tap.resourceExists(conf)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Returns the most recent modification time among the child taps.
     *
     * @return the largest modified time reported by any child tap, or 0 if there are none
     */
    @Override
    public long getModifiedTime(Properties conf) throws IOException {
        long modified = 0;
        for (FileTap tap : getTaps()) {
            modified = Math.max(modified, tap.getModifiedTime(conf));
        }
        return modified;
    }

    /**
     * Returns the identifiers of the direct children (depth 1), not fully qualified.
     */
    @Override
    public String[] getChildIdentifiers(Properties conf) throws IOException {
        return getChildIdentifiers(conf, 1, false);
    }

    /**
     * Returns the paths found by descending {@code depth} levels below this tap's identifier.
     *
     * @param conf
     *            of type Properties
     * @param depth
     *            how many directory levels to descend; files found earlier are included as-is
     * @param fullyQualified
     *            if true, each path is converted to an absolute file URI string
     * @return the collected paths, or an empty array if the resource doesn't exist
     */
    @Override
    public String[] getChildIdentifiers(Properties conf, int depth, boolean fullyQualified) throws IOException {
        if (!resourceExists(conf)) {
            return new String[0];
        }

        Set<String> results = new LinkedHashSet<String>();
        getChildPaths(results, getIdentifier(), depth);

        String[] allPaths = results.toArray(new String[results.size()]);
        if (!fullyQualified) {
            return allPaths;
        }

        for (int i = 0; i < allPaths.length; i++) {
            allPaths[i] = fullyQualifyIdentifier(allPaths[i]);
        }

        return allPaths;
    }

    /**
     * Recursively collects paths under {@code identifier} into {@code results}.
     *
     * @param results
     *            receives each path reached at depth 0, and each file reached before that
     * @param identifier
     *            the path to inspect
     * @param depth
     *            remaining number of levels to descend
     * @return true if at least one path was added at or below {@code identifier}
     */
    private boolean getChildPaths(Set<String> results, String identifier, int depth) {
        File file = new File(identifier);
        if (depth == 0 || file.isFile()) {
            results.add(identifier);
            return true;
        }

        String[] paths = file.list();
        if (paths == null) {
            return false;
        }

        boolean result = false;
        for (String path : paths) {
            result |= getChildPaths(results, new File(file, path).getPath(), depth - 1);
        }

        return result;
    }

    /**
     * Converts a path into the string form of its absolute file URI.
     */
    private String fullyQualifyIdentifier(String identifier) {
        return new File(identifier).getAbsoluteFile().toURI().toString();
    }

    /**
     * Two DirectoryTaps are equal when the superclass considers them equal, they have the same
     * class, and their paths are equal.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!super.equals(obj)) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }

        String path = getPath();
        String otherPath = ((DirectoryTap) obj).getPath();
        return (path == null) ? (otherPath == null) : path.equals(otherPath);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = super.hashCode();
        result = prime * result + ((getPath() == null) ? 0 : getPath().hashCode());
        return result;
    }

    @Override
    public String toString() {
        if (getPath() != null) {
            return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[\"" + Util.sanitizeUrl(getPath()) + "\"]"; // sanitize
        } else {
            return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[not initialized]";
        }
    }

    /**
     * Returns an iterator over the child taps of this directory.
     *
     * @throws IllegalArgumentException
     *             if the path doesn't exist or is neither a directory nor a file
     */
    @Override
    public Iterator<FileTap> getChildTaps() {
        return getTaps().iterator();
    }

    /**
     * Returns the number of child taps of this directory.
     *
     * @throws IllegalArgumentException
     *             if the path doesn't exist or is neither a directory nor a file
     */
    @Override
    public long getNumChildTaps() {
        return getTaps().size();
    }
}