package edu.washington.escience.myria.io;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* A data source that pulls data from a specified URI. The URI may be: a path on the local file system; an HDFS link; a
* web link; an AWS link; and perhaps more.
*
* If the URI points to a directory, all files in that directory will be concatenated into a single {@link InputStream}.
*/
public class UriSource implements DataSource, Serializable {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;
  /** The logger for debug, trace, etc. messages in this class. */
  private static final org.slf4j.Logger LOGGER =
      org.slf4j.LoggerFactory.getLogger(UriSource.class);
  /** The Uniform Resource Identifier (URI) of the data source. */
  private final URI parsedUri;

  /**
   * Construct a source of data from the specified URI. The URI may be: a path on the local file
   * system; an HDFS link; a web link; an AWS link; and perhaps more.
   *
   * <p>If the URI points to a directory in HDFS, all files in that directory will be concatenated
   * into a single {@link InputStream}.
   *
   * @param uri the Uniform Resource Identifier (URI) of the data source. May not be null.
   * @throws URISyntaxException if the rewritten S3 URI cannot be constructed
   * @throws IllegalArgumentException if {@code uri} violates RFC 2396 (from {@link URI#create})
   */
  @JsonCreator
  public UriSource(@JsonProperty(value = "uri", required = true) final String uri)
      throws URISyntaxException {
    URI candidate =
        URI.create(Objects.requireNonNull(uri, "Parameter uri to UriSource may not be null"));
    /*
     * Force using the Hadoop S3A FileSystem. Constant-first equals() is null-safe: a scheme-less
     * (relative) URI simply skips the rewrite instead of throwing an NPE.
     */
    if ("s3".equals(candidate.getScheme())) {
      candidate =
          new URI(
              "s3a",
              candidate.getUserInfo(),
              candidate.getHost(),
              candidate.getPort(),
              candidate.getPath(),
              candidate.getQuery(),
              candidate.getFragment());
    }
    parsedUri = candidate;
  }

  @Override
  public InputStream getInputStream() throws IOException {
    final String scheme = parsedUri.getScheme();
    /* HTTP(S) links are read directly via URLConnection; everything else (file, hdfs, s3a, ...)
     * is delegated to the configured Hadoop file system. Constant-first equals() tolerates a
     * null scheme, which falls through to Hadoop's default file system resolution. */
    return ("http".equals(scheme) || "https".equals(scheme))
        ? parsedUri.toURL().openConnection().getInputStream()
        : getHadoopFileSystemInputStream(parsedUri);
  }

  /** @return the URI of this data source as a string, for JSON serialization. */
  @JsonProperty("uri")
  private String getUriString() {
    return parsedUri.toString();
  }

  /**
   * Get an input stream using the configured Hadoop file system for the given URI scheme. Glob
   * patterns are expanded, and all matching files are concatenated into a single stream.
   *
   * @param uri the URI (or glob) to open
   * @return a single {@link InputStream} concatenating all matched files
   * @throws FileNotFoundException if the URI/glob matches no files
   * @throws IOException if the file system or any file cannot be opened
   */
  private static InputStream getHadoopFileSystemInputStream(final URI uri) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(uri, conf);
    Path rootPath = new Path(uri);
    FileStatus[] statuses = fs.globStatus(rootPath);
    if (statuses == null || statuses.length == 0) {
      throw new FileNotFoundException(uri.toString());
    }
    List<InputStream> streams = new ArrayList<>(statuses.length);
    try {
      for (FileStatus status : statuses) {
        Path path = status.getPath();
        LOGGER.debug("Incorporating input file: {}", path);
        streams.add(fs.open(path));
      }
    } catch (IOException | RuntimeException e) {
      /* A partial failure must not leak the streams already opened. */
      for (InputStream stream : streams) {
        try {
          stream.close();
        } catch (IOException ignored) {
          /* Best-effort cleanup; the original failure is the one worth propagating. */
        }
      }
      throw e;
    }
    return new SequenceInputStream(java.util.Collections.enumeration(streams));
  }
}