package org.basex.io.in; import static org.basex.util.Token.*; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.util.Arrays; import java.util.zip.ZipInputStream; import org.basex.io.IO; import org.basex.io.IOFile; import org.basex.util.list.ByteList; /** * This class uses a byte buffer to speed up input stream processing. * * @author BaseX Team 2005-12, BSD License * @author Christian Gruen */ public class BufferInput extends InputStream { /** UTF8 cache. */ private final byte[] cache = new byte[4]; /** Byte buffer. */ final byte[] buffer; /** Current buffer position. */ int bpos; /** Current buffer size. */ int bsize; /** Reference to the data input stream. */ private InputStream in; /** Default encoding for text files. */ private String enc = UTF8; /** Charset decoder. */ private CharsetDecoder csd; /** Total length of input to be processed (may be {@code 0}). */ private long length; /** Buffer marker to jump back (not available when set to {@code -1}. */ private int bmark; /** Number of read bytes. */ private int read; /** * Initializes the file reader. * @param file the file to be read * @throws IOException I/O Exception */ public BufferInput(final IOFile file) throws IOException { this(new FileInputStream(file.file())); length = file.length(); } /** * Initializes the file reader. * @param is input stream */ public BufferInput(final InputStream is) { this(is, IO.BLOCKSIZE); } /** * Initializes the file reader. * @param is input stream * @param bs buffer size */ public BufferInput(final InputStream is, final int bs) { buffer = new byte[bs]; in = is; } /** * Empty constructor. * @param buf buffer */ BufferInput(final byte[] buf) { buffer = buf; bsize = buf.length; length = bsize; } /** * Guesses the file encoding, based on the first characters. * @return encoding * @throws IOException I/O exception */ public final String encoding() throws IOException { final int a = read(); final int b = read(); final int c = read(); final int d = read(); int skip = 0; if(a == 0xFF && b == 0xFE) { // BOM: FF FE enc = UTF16LE; skip = 2; } else if(a == 0xFE && b == 0xFF) { // BOM: FE FF enc = UTF16BE; skip = 2; } else if(a == 0xEF && b == 0xBB && c == 0xBF) { // BOM: EF BB BF skip = 3; } else if(a == '<' && b == 0 && c == '?' && d == 0) { enc = UTF16LE; } else if(a == 0 && b == '<' && c == 0 && d == '?') { enc = UTF16BE; } reset(); for(int s = 0; s < skip; s++) read(); return enc; } /** * Sets a new encoding. * @param encoding encoding * @throws IOException I/O Exception */ public final void encoding(final String encoding) throws IOException { try { enc = normEncoding(encoding, enc); csd = Charset.forName(encoding).newDecoder(); } catch(final Exception ex) { throw new IOException(ex.toString()); } } /** * Returns the next byte (see {@link InputStream#read}. * {@code -1} is returned if all bytes have been read. * @return next byte * @throws IOException I/O exception */ @Override public int read() throws IOException { final int blen = buffer.length; final byte[] buf = buffer; if(bpos >= bsize) { if(bsize == 0 || bsize == blen) { // reset mark if buffer is full if(bsize == blen) bmark = -1; // buffer is empty or full: re-fill it bsize = 0; bpos = 0; } int r; while((r = in.read(buf, bsize, blen - bsize)) == 0); if(r < 0) return -1; bsize += r; read += r; } return buf[bpos++] & 0xFF; } /** * Reads a string from the input stream, suffixed by a {@code 0} byte. * @return string * @throws IOException I/O Exception */ public final String readString() throws IOException { final ByteList bl = new ByteList(); for(int l; (l = read()) > 0;) bl.add(l); return bl.toString(); } /** * Reads a byte array from the input stream, suffixed by a {@code 0} byte. * @return token * @throws IOException I/O Exception */ public final byte[] readBytes() throws IOException { final ByteList bl = new ByteList(); for(int l; (l = read()) > 0;) bl.add(l); return bl.toArray(); } /** * Returns the next character (code point), or {@code -1} if end of stream * is reached. Erroneous characters are ignored. * @return next character * @throws IOException I/O exception */ public final int readChar() throws IOException { final int ch = read(); if(ch == -1) return ch; // handle different encodings (comparing by references is safe here) final String e = enc; if(e == UTF16LE) return ch | read() << 8; if(e == UTF16BE) return ch << 8 | read(); if(ch < 0x80) return ch; if(e == UTF8) { final int cl = cl((byte) ch); cache[0] = (byte) ch; for(int c = 1; c < cl; ++c) cache[c] = (byte) read(); return cp(cache, 0); } // convert other encodings.. loop until all needed bytes have been read int p = 0; while(true) { if(p == 4) return -cache[0]; cache[p++] = (byte) ch; try { final CharBuffer cb = csd.decode( ByteBuffer.wrap(Arrays.copyOf(cache, p))); int i = 0; for(int c = 0; c < cb.limit(); ++c) i |= cb.get(c) << (c << 3); return i; } catch(final CharacterCodingException ex) { // ignore erroneous characters } } } @Override public final void close() throws IOException { if(in != null && !(in instanceof ZipInputStream)) in.close(); } /** * Returns the number of read bytes. * @return read bytes */ public final int size() { return read + bpos; } /** * Returns the input length. * @return input length */ public final long length() { return length; } /** * Sets the input length. * @param l input length */ public final void length(final long l) { length = l; } @Override public final boolean markSupported() { return true; } @Override public synchronized void mark(final int m) { bmark = bpos; } @Override public final synchronized void reset() throws IOException { if(bmark == -1) throw new IOException("Mark cannot be reset."); bpos = bmark; } }