/* * Copyright (c) 2011-2015 EPFL DATA Laboratory * Copyright (c) 2014-2015 The Squall Collaboration (see NOTICE) * * All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ch.epfl.data.squall.utilities; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.net.URI; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.Logger; public class SerializableHDFSFileInputStream extends InputStream implements CustomReader { // self-test public static void main(String args[]) throws IOException { final SerializableHDFSFileInputStream reader = new SerializableHDFSFileInputStream( "hdfs://localhost:9000/0.3/0.3/customer.tbl"); while (true) { final String l = reader.readLine(); if (l == null) break; LOG.info(l); } } /** * */ private static final long serialVersionUID = 1L; private static Logger LOG = Logger .getLogger(SerializableHDFSFileInputStream.class); // protected final File _file; // The _file to read from protected byte[] _buffer; // The _file _buffer protected long _filePtr = 0; // How many bytes already read // from the _file FileSystem _file; FSDataInputStream _fis; // _filePtr might also represent the logical start of file read protected int _bufferPtr = 0; // How many bytes into the // _buffer the user has read protected int _bufferSize = 0; // How many bytes of the _buffer // are being used protected boolean _eofReached = false; // Modified by Aleksandar // All the sections (except the first one) should omit firstLine characters. // The responsability of the previous section is to finish the previous // line. private boolean _omitFirstLine; // Logical end of the file based on (section, parts) private long _fileEndPtr; private URI _uri; // Accurate position into the file (updatet after each readLine method) private long _filePosition; protected static int DEFAULT_BUFFER_SIZE = 1 * 1024 * 1024; protected static int LOW_WATER_MARK = 1024; public SerializableHDFSFileInputStream(String URIstring, int bufferSize) throws IOException { this(URIstring, bufferSize, 0, 1); } public SerializableHDFSFileInputStream(String URIstring, int bufferSize, int section, int parts) throws IOException { _uri = URI.create(URIstring); Configuration conf = new Configuration(); _file = FileSystem.get(_uri, conf); _fis = _file.open(new Path(_uri)); _buffer = new byte[bufferSize]; setParameters(section, parts); fillBuffer(); } public SerializableHDFSFileInputStream(String URIstring) throws IOException { this(URIstring, DEFAULT_BUFFER_SIZE, 0, 1); } @Override public void close() { // ignore } public boolean eof() { return (_eofReached && (_bufferPtr >= _bufferSize)) // Modified by Aleksandar // if _fileEnd points to a first character after \n, // we still want the previous section to read it || (_filePosition > _fileEndPtr); } protected void fillBuffer() throws IOException { if (_eofReached) return; if (_bufferPtr < 0) throw new IOException("Invalid Buffer Pointer: " + _bufferPtr); if (_bufferPtr > 0) { // Do some housekeeping on our _buffer if (_bufferPtr < _bufferSize) { // Move any existing data to the front of the _buffer... // A circular _buffer would be faster, but we're really only // talking // about a few dozen bytes at a time. // Note that System.arraycopy is explicitly safe for // self-to-self copies. System.arraycopy(_buffer, _bufferPtr, _buffer, 0, _bufferSize - _bufferPtr); _bufferSize = _bufferSize - _bufferPtr; } else // If we've precisely exhausted our _buffer (_bufferPtr should // never be > // _bufferSize), then don't bother copying; _bufferSize = 0; _bufferPtr = 0; } _fis = _file.open(new Path(_uri)); int bytesRead = -100; try { if (_filePtr > 0) for (long i = 0; i < _filePtr; i += _fis.skip(_filePtr - i)) { } bytesRead = _fis.read(_buffer, _bufferSize, _buffer.length - _bufferSize); if (bytesRead < 0) _eofReached = true; else { _bufferSize += bytesRead; _filePtr += bytesRead; } } finally { _fis.close(); } } protected void fillBufferIfNeeded(int bytesRequested) throws IOException { final int currentBufferBytes = _bufferSize - _bufferPtr; if ((currentBufferBytes < LOW_WATER_MARK) && (bytesRequested > currentBufferBytes)) fillBuffer(); } protected int numberOfBytesToEOL(boolean canFail) throws IOException { int i; for (i = 0; i < _bufferSize - _bufferPtr; i++) { if (_buffer[i + _bufferPtr] == '\n') break; if (_buffer[i + _bufferPtr] == '\r') { if (i + 1 < _bufferSize) fillBuffer(); if ((i + 1 < _bufferSize - _bufferPtr) && (_buffer[i + 1 + _bufferPtr] == '\n')) i += 1; break; } } if (i >= _bufferSize - _bufferPtr) if (canFail) return -1; else { fillBuffer(); return numberOfBytesToEOL(true); } return i + 1; } @Override public int read() throws IOException { if (eof()) return -1; fillBufferIfNeeded(1); final int ret = _buffer[_bufferPtr]; _bufferPtr++; return ret; } @Override public int read(byte[] b, int off, int len) throws IOException { if (eof()) return -1; fillBufferIfNeeded(len); if (len > _buffer.length - _bufferSize) len = _buffer.length - _bufferSize; System.arraycopy(b, off, _buffer, _bufferPtr, len); _bufferPtr += len; return len; } @Override public String readLine() throws IOException { if (eof()) return null; int i = numberOfBytesToEOL(false); String ret = null; if ((i < 0) && (_bufferPtr < _bufferSize)) i = _bufferSize - _bufferPtr; if (i > 0) { int newlineChomp = 1; if (_buffer[_bufferPtr + i - 1] == '\n') if ((i > 1) && (_buffer[_bufferPtr + i - 2] == '\r')) newlineChomp++; // Modified by Aleksandar // For the first line in a section: // we neglect the line we are currently in (it might be only a part // of a line), // and send the next line. if (!_omitFirstLine) { ret = new String(_buffer, _bufferPtr, i - newlineChomp); _bufferPtr += i; _filePosition += i; } else { _bufferPtr += i; _filePosition += i; _omitFirstLine = false; return readLine(); } } return ret; } // Modified by Aleksandar private void setParameters(int section, int parts) throws IOException { if (section >= parts) throw new RuntimeException("The section can take value from 0 to " + (parts - 1)); final long fileSize = _file.getLength(new Path(_uri)); final long sectionSize = fileSize / parts; _filePtr = section * sectionSize; _filePosition = _filePtr; // for all the sections except the last one, the end is sectionSize far // from the beginning if (section == parts - 1) _fileEndPtr = fileSize; else _fileEndPtr = _filePtr + sectionSize; // for all the sections except the first one, we discard the first read // line if (section == 0) _omitFirstLine = false; else _omitFirstLine = true; } protected String stats() { return "bP=" + _bufferPtr + "; bS=" + _bufferSize + "; fP=" + _filePtr; } }