package com.thinkaurelius.faunus.hdfs; import com.thinkaurelius.faunus.Tokens; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.io.compress.CompressionCodec; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.Iterator; import java.util.LinkedList; import java.util.NoSuchElementException; import java.util.Queue; /** * @author Marko A. Rodriguez (http://markorodriguez.com) */ public class TextFileLineIterator implements Iterator<String> { private final FileSystem fs; private final Queue<Path> paths; private final long totalLines; private long lines = 0; private BufferedReader reader = null; private String line; private CompressionCodec codec = new BZip2Codec(); public TextFileLineIterator(final FileSystem fs, final Queue<Path> paths, final long totalLines) throws IOException { this.fs = fs; this.totalLines = totalLines; this.paths = paths; } public TextFileLineIterator(final FileSystem fs, final FileStatus[] statuses, final long totalLines) throws IOException { this.fs = fs; this.totalLines = totalLines; this.paths = new LinkedList<Path>(); for (final FileStatus status : statuses) { this.paths.add(status.getPath()); } } public boolean hasNext() { if (null != line) return true; if (this.lines >= this.totalLines) return false; try { if (this.reader == null) if (this.paths.isEmpty()) return false; else this.reader = this.getUncompressedInputStream(); this.line = this.reader.readLine(); if (this.line != null) { this.lines++; return true; } else { this.reader.close(); if (this.paths.isEmpty()) this.reader = null; else this.reader = this.getUncompressedInputStream(); return this.hasNext(); } } catch (Exception e) { throw new RuntimeException(e.getMessage(), e); } } public String next() { if (null != line) { final String temp = line; line = null; return temp; } else if (this.hasNext()) { return this.next(); } else { throw new NoSuchElementException(); } } private BufferedReader getUncompressedInputStream() throws IOException { final Path path = this.paths.remove(); if (path.getName().endsWith(Tokens.BZ2)) return new BufferedReader(new InputStreamReader(this.codec.createInputStream(fs.open(path)))); else return new BufferedReader(new InputStreamReader(this.fs.open(path))); } public void remove() { throw new UnsupportedOperationException(); } }