package org.deeplearning4j.examples;

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.datavec.hadoop.conf.ConfigurationUtil;

/**
 * Base class for HDFS iterators. It holds the Hadoop {@link Configuration}, the resolved
 * {@link FileSystem} and a recursive file listing rooted at a given location.
 *
 * @author Ousmane A. Dia
 */
public class BaseDataSetIterator {

    /** Recursive listing of the files found under {@link #hdfsUrl}; replaced by {@link #initIterator(String)}. */
    protected volatile RemoteIterator<LocatedFileStatus> hdfsIterator;

    /** Hadoop configuration used to resolve the file system. */
    private final Configuration configuration;

    /** File system that owns {@link #hdfsUrl}; replaced by {@link #initIterator(String)}. */
    protected volatile FileSystem fs;

    /** Root location most recently passed to {@link #initIterator(String)}. */
    protected volatile String hdfsUrl;

    /**
     * Stores the configuration and immediately opens a recursive listing of {@code hdfsUrl}.
     *
     * <p>Note: this constructor calls {@link #initIterator(String)}, which is overridable; an
     * override runs before the subclass's own fields are initialized.
     *
     * @param configuration Hadoop configuration, e.g. the one returned by {@link #initialize(String)}
     * @param hdfsUrl root location whose files are listed recursively
     * @throws RuntimeException if the listing cannot be opened
     */
    public BaseDataSetIterator(Configuration configuration, String hdfsUrl) {
        this.configuration = configuration;
        initIterator(hdfsUrl);
    }

    /**
     * This method creates an instance of {@code org.apache.hadoop.conf.Configuration} to pass to the
     * constructor {@link BaseDataSetIterator#BaseDataSetIterator(Configuration, String)}.
     *
     * <p>The configuration is generated by {@code ConfigurationUtil.generateConfig(baseConfPath)};
     * the {@code fs.hdfs.impl} and {@code fs.file.impl} entries are then set explicitly to the
     * distributed and local file system implementations.
     *
     * @param baseConfPath Config path
     * @return an instance of {@code org.apache.hadoop.conf.Configuration}
     */
    public static final Configuration initialize(String baseConfPath) {
        Configuration configuration = ConfigurationUtil.generateConfig(baseConfPath);
        configuration.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        configuration.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
        return configuration;
    }

    /**
     * Returns the last non-empty {@code '/'}-separated segment of {@code path}.
     *
     * @param path a slash-separated path
     * @return the last non-empty segment, or {@code null} when the path contains no such segment
     *     (for example an empty string or a string made only of slashes)
     */
    protected String getRelativeFilename(String path) {
        String lastSegment = null;
        StringTokenizer segments = new StringTokenizer(path, "/");
        // StringTokenizer skips empty tokens, so trailing or doubled slashes are ignored.
        while (segments.hasMoreTokens()) {
            lastSegment = segments.nextToken();
        }
        return lastSegment;
    }

    /**
     * Adding this method to help reset the iterator (see {@link MDSIterator#reset()}).
     *
     * <p>Records {@code hdfsUrl}, resolves the file system for that location and opens a fresh
     * recursive listing of it.
     *
     * @param hdfsUrl root location whose files are listed recursively
     * @throws RuntimeException wrapping the {@link IOException} raised while resolving the file
     *     system or opening the listing
     */
    protected void initIterator(String hdfsUrl) {
        try {
            this.hdfsUrl = hdfsUrl;
            Path root = new Path(this.hdfsUrl);
            // Resolve the file system from the path itself rather than from the configuration's
            // default: a fully qualified URL whose scheme/authority differs from the default file
            // system would otherwise be rejected. Unqualified paths still use the default.
            fs = root.getFileSystem(configuration);
            hdfsIterator = fs.listFiles(root, true);
        } catch (IOException e) {
            throw new RuntimeException("Unable to list files under " + hdfsUrl, e);
        }
    }
}