HtmlInputStreamReader.java example

Explorer
htmlparser-master
/*
 * Copyright (c) 2007 Henri Sivonen
 * Copyright (c) 2013 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a 
 * copy of this software and associated documentation files (the "Software"), 
 * to deal in the Software without restriction, including without limitation 
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 * and/or sell copies of the Software, and to permit persons to whom the 
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in 
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.htmlparser.io;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

import nu.validator.htmlparser.common.ByteReadable;
import nu.validator.htmlparser.common.Heuristics;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.extra.ChardetSniffer;
import nu.validator.htmlparser.extra.IcuDetectorSniffer;
import nu.validator.htmlparser.impl.Tokenizer;

import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 * Be very careful with this class. It is not a general-purpose subclass of
 * <code>Reader</code>. Instead, it is the minimal implementation that does
 * what <code>Tokenizer</code> needs while being an instance of
 * <code>Reader</code>.
 * 
 * The only reason why this is a public class is that it needs to be visible to
 * test code in another package.
 * 
 * @version $Id$
 * @author hsivonen
 */
public final class HtmlInputStreamReader extends Reader implements
        ByteReadable, Locator {

    /**
     * Upper bound on the number of bytes that <code>readByte()</code> hands
     * out to the encoding sniffers; also used by <code>read(char[])</code>
     * to size reads until the charset boundary has been passed.
     */
    private static final int SNIFFING_LIMIT = 1024;

    /** The byte source being decoded. */
    private final InputStream inputStream;

    /** Receives warnings and errors from this class; checked for null before use. */
    private final ErrorHandler errorHandler;

    /** Source of line/column state for error locations; checked for null before use. */
    private final Tokenizer tokenizer;

    /** Told about the chosen encoding and the meta boundary; checked for null before use. */
    private final Driver driver;

    /** The decoder currently in use; replaced by <code>switchEncoding()</code>. */
    private CharsetDecoder decoder = null;

    /** While true, <code>readByte()</code> is permitted; <code>read(char[])</code> asserts it is false. */
    private boolean sniffing = true;

    /** Number of bytes that <code>readByte()</code> has loaded into <code>byteArray</code>. */
    private int limit = 0;

    /** Index of the next byte that <code>readByte()</code> returns. */
    private int position = 0;

    /** Running count of bytes consumed by the decoder in <code>read(char[])</code>. */
    private int bytesRead = 0;

    /** Set once the input stream has reported end of stream. */
    private boolean eofSeen = false;

    /** When true, the next pass of <code>read(char[])</code> fetches more bytes before decoding. */
    private boolean shouldReadBytes = false;

    /** Set once consumed plus buffered bytes have reached <code>SNIFFING_LIMIT</code>. */
    private boolean charsetBoundaryPassed = false;

    // Backing store for byteBuffer. Its length must be >= SNIFFING_LIMIT.
    private final byte[] byteArray = new byte[4096];

    /** Buffer view over <code>byteArray</code> that is fed to the decoder. */
    private final ByteBuffer byteBuffer = ByteBuffer.wrap(byteArray);

    /** When true, the next <code>read(char[])</code> call notifies the driver about the meta boundary. */
    private boolean needToNotifyTokenizer = false;

    /** Set when the final (end-of-input) decode has been started. */
    private boolean flushing = false;

    /** Line number last computed by <code>calculateLineAndCol()</code>; -1 until then. */
    private int line = -1;

    /** Column number last computed by <code>calculateLineAndCol()</code>; -1 until then. */
    private int col = -1;

    /**
     * Index in the current output array up to which line/column have been
     * accounted for; reset to 0 at the start of each <code>read(char[])</code>.
     */
    private int lineColPos;

    /** True when a U+FFFD could not be written because the output buffer was full. */
    private boolean hasPendingReplacementCharacter = false;

    /** Line-tracking state, seeded from the tokenizer: the next char starts a new line. */
    private boolean nextCharOnNewLine;

    /** Line-tracking state, seeded from the tokenizer: a CR has been seen. */
    private boolean prevWasCR;

    /**
     * Creates a reader that works out the character encoding by sniffing the
     * start of the stream. A byte order mark is tried first, then a
     * <code>meta</code> declaration, then (depending on
     * <code>heuristics</code>) the Chardet and ICU detectors. If nothing is
     * found, windows-1252 is used.
     * 
     * @param inputStream
     *            the byte stream to decode
     * @param errorHandler
     *            receives encoding-related warnings and errors; a
     *            <code>null</code> handler is skipped by this class's own
     *            reporting
     * @param tokenizer
     *            consulted for location information when reporting errors
     * @param driver
     *            informed of the chosen encoding unless <code>null</code>
     * @param heuristics
     *            selects which heuristic detectors may run
     * @throws IOException
     *             if reading fails or the error handler rejects a message
     * @throws SAXException
     *             declared for the sniffers (presumably raised by them)
     */
    public HtmlInputStreamReader(InputStream inputStream,
            ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
            Heuristics heuristics) throws SAXException, IOException {
        this.inputStream = inputStream;
        this.errorHandler = errorHandler;
        this.tokenizer = tokenizer;
        this.driver = driver;
        this.sniffing = true;

        Encoding encoding = (new BomSniffer(this)).sniff();
        if (encoding != null) {
            // A byte order mark was found: the encoding is certain.
            final boolean utf8 = (encoding == Encoding.UTF8);
            if (!utf8) {
                warn("Legacy encoding \u201C"
                        + encoding.getCanonName()
                        + "\u201D used. Documents should use UTF-8.");
            }
            if (driver != null) {
                driver.setEncoding(utf8 ? Encoding.UTF8 : Encoding.UTF16,
                        Confidence.CERTAIN);
            }
        } else {
            final boolean chardetEnabled = heuristics == Heuristics.CHARDET
                    || heuristics == Heuristics.ALL;
            final boolean icuEnabled = heuristics == Heuristics.ICU
                    || heuristics == Heuristics.ALL;

            // Rewind so the meta sniffer sees the bytes the BOM sniffer read.
            position = 0;
            encoding = (new MetaSniffer(errorHandler, this)).sniff(this);
            final boolean declared = (encoding != null);
            if (declared && encoding != Encoding.UTF8) {
                warn("Legacy encoding \u201C"
                        + encoding.getCanonName()
                        + "\u201D used. Documents should use UTF-8.");
            }

            // Heuristic detectors only run when nothing was declared.
            if (encoding == null && chardetEnabled) {
                encoding = (new ChardetSniffer(byteArray, limit)).sniff();
            }
            if (encoding == null && icuEnabled) {
                position = 0;
                encoding = (new IcuDetectorSniffer(this)).sniff();
            }
            sniffing = false;

            // Last resort default.
            if (encoding == null) {
                encoding = Encoding.WINDOWS1252;
            }
            if (!declared) {
                err("The character encoding was not declared. Proceeding using \u201C" + encoding.getCanonName() + "\u201D.");
            }
            if (driver != null) {
                driver.setEncoding(encoding, Confidence.TENTATIVE);
            }
        }

        // Hand the sniffed bytes over to the decoding phase.
        this.decoder = encoding.newDecoder();
        sniffing = false;
        position = 0;
        bytesRead = 0;
        byteBuffer.position(0);
        byteBuffer.limit(limit);
        initDecoder();
    }

    /**
     * Configures the current decoder to report both malformed input and
     * unmappable characters instead of silently replacing or ignoring them.
     */
    private void initDecoder() {
        final CodingErrorAction report = CodingErrorAction.REPORT;
        // Both configuration calls return the decoder itself, so they chain.
        this.decoder.onMalformedInput(report).onUnmappableCharacter(report);
    }

    /**
     * Creates a reader for a stream whose encoding is already known. No
     * sniffing takes place; the first <code>read(char[])</code> call fetches
     * bytes from the stream.
     * 
     * @param inputStream
     *            the byte stream to decode
     * @param errorHandler
     *            receives decoding errors; a <code>null</code> handler is
     *            skipped by this class's own reporting
     * @param tokenizer
     *            consulted for location information when reporting errors
     * @param driver
     *            notified about the meta boundary unless <code>null</code>
     * @param encoding
     *            the encoding whose decoder is used
     * @throws SAXException
     *             declared for signature compatibility
     * @throws IOException
     *             declared for signature compatibility
     */
    public HtmlInputStreamReader(InputStream inputStream,
            ErrorHandler errorHandler, Tokenizer tokenizer, Driver driver,
            Encoding encoding) throws SAXException, IOException {
        this.inputStream = inputStream;
        this.errorHandler = errorHandler;
        this.tokenizer = tokenizer;
        this.driver = driver;

        // Start directly in the decoding phase with an empty byte buffer.
        this.sniffing = false;
        this.position = 0;
        this.bytesRead = 0;
        this.byteBuffer.position(0);
        this.byteBuffer.limit(0);
        this.shouldReadBytes = true;

        this.decoder = encoding.newDecoder();
        initDecoder();
    }

    /**
     * Closes the underlying input stream.
     * 
     * @throws IOException
     *             if closing the stream fails
     */
    @Override
    public void close() throws IOException {
        this.inputStream.close();
    }

    /**
     * Decodes bytes from the underlying stream into <code>charArray</code>,
     * always writing from index 0.
     * 
     * Malformed and unmappable byte sequences are reported through the error
     * handler and replaced with U+FFFD. If the output is full when such a
     * sequence is found, the replacement character is deferred to the next
     * call. It is emitted before anything else, including the end-of-input
     * flush, so it is not lost when the error occurs in the final decode.
     * 
     * @param charArray
     *            the destination array; asserted to hold at least two chars
     * @return the number of chars written, or -1 when the input is exhausted
     *         and nothing was written
     * @throws IOException
     *             if reading fails or the error handler rejects a message
     */
    @Override public int read(char[] charArray) throws IOException {
        lineColPos = 0;
        assert !sniffing;
        assert charArray.length >= 2;
        if (needToNotifyTokenizer) {
            if (driver != null) {
                driver.notifyAboutMetaBoundary();
            }
            needToNotifyTokenizer = false;
        }
        CharBuffer charBuffer = CharBuffer.wrap(charArray);
        charBuffer.limit(charArray.length);
        charBuffer.position(0);
        // Emit a replacement character deferred by the previous call first.
        // This must precede the flush path: a replacement deferred by the
        // final decode would otherwise never be delivered.
        if (hasPendingReplacementCharacter) {
            charBuffer.put('\uFFFD');
            hasPendingReplacementCharacter = false;
        }
        if (flushing) {
            decoder.flush(charBuffer);
            // return -1 if zero
            int cPos = charBuffer.position();
            return cPos == 0 ? -1 : cPos;
        }
        for (;;) {
            if (shouldReadBytes) {
                // Append after whatever incomplete sequence was carried over.
                int oldLimit = byteBuffer.limit();
                int readLen;
                if (charsetBoundaryPassed) {
                    readLen = byteArray.length - oldLimit;
                } else {
                    // Do not read past the sniffing limit before the boundary
                    // has been reported.
                    readLen = SNIFFING_LIMIT - oldLimit;
                }
                int num = inputStream.read(byteArray, oldLimit, readLen);
                if (num == -1) {
                    eofSeen = true;
                    inputStream.close();
                } else {
                    byteBuffer.position(0);
                    byteBuffer.limit(oldLimit + num);
                }
                shouldReadBytes = false;
            }
            boolean finalDecode = false;
            for (;;) {
                int oldBytePos = byteBuffer.position();
                CoderResult cr = decoder.decode(byteBuffer, charBuffer,
                        finalDecode);
                bytesRead += byteBuffer.position() - oldBytePos;
                if (cr == CoderResult.OVERFLOW) {
                    // Decoder will remember surrogates
                    return charBuffer.position();
                } else if (cr == CoderResult.UNDERFLOW) {
                    int remaining = byteBuffer.remaining();
                    if (!charsetBoundaryPassed) {
                        if (bytesRead + remaining >= SNIFFING_LIMIT) {
                            needToNotifyTokenizer = true;
                            charsetBoundaryPassed = true;
                        }
                    }

                    // XXX what happens if the entire byte buffer consists of 
                    // a pathologically long malformed sequence?

                    // If the buffer was not fully consumed, there may be an
                    // incomplete byte sequence that needs to seed the next
                    // buffer.
                    if (remaining > 0) {
                        System.arraycopy(byteArray, byteBuffer.position(),
                                byteArray, 0, remaining);
                    }
                    byteBuffer.position(0);
                    byteBuffer.limit(remaining);
                    if (flushing) {
                        // The final decode was successful. Not sure if this
                        // ever happens.
                        // Let's get out in any case.
                        int cPos = charBuffer.position();
                        return cPos == 0 ? -1 : cPos;
                    } else if (eofSeen) {
                        // If there's something left, it isn't something that
                        // would be consumed in the middle of the stream.
                        // Rerun the loop once in the final mode.
                        shouldReadBytes = false;
                        finalDecode = true;
                        flushing = true;
                        continue;
                    } else {
                        // The usual stuff. Want more bytes next time.
                        shouldReadBytes = true;
                        int cPos = charBuffer.position();
                        if (cPos == 0) {
                            // No output. Read more bytes right away
                            break;
                        }
                        return cPos;
                    }
                } else {
                    // The result is in error. No need to test.
                    // Consume the offending bytes while building a hex dump
                    // of them for the error message.
                    StringBuilder sb = new StringBuilder();
                    for (int i = 0; i < cr.length(); i++) {
                        if (i > 0) {
                            sb.append(", ");
                        }
                        sb.append('\u201C');
                        sb.append(Integer.toHexString(byteBuffer.get() & 0xFF));
                        bytesRead++;
                        sb.append('\u201D');
                    }
                    if (charBuffer.hasRemaining()) {
                        charBuffer.put('\uFFFD');
                    } else {
                        // No room now; deliver it at the start of the next
                        // call.
                        hasPendingReplacementCharacter = true;
                    }
                    // Bring line/col up to date so the reported location
                    // reflects the chars decoded so far.
                    calculateLineAndCol(charBuffer);
                    if (cr.isMalformed()) {
                        err("Malformed byte sequence: " + sb + ".");
                    } else if (cr.isUnmappable()) {
                        err("Unmappable byte sequence: " + sb + ".");
                    } else {
                        throw new RuntimeException(
                                "CoderResult was none of overflow, underflow, malformed or unmappable.");
                    }
                    if (finalDecode) {
                        // These were the last bytes of input. Return without
                        // relooping.
                        // return -1 if zero
                        int cPos = charBuffer.position();
                        return cPos == 0 ? -1 : cPos;
                    }
                }
            }
        }
    }

    /**
     * Advances the <code>line</code> and <code>col</code> fields over the
     * chars written to <code>charBuffer</code> since the last call within
     * the current <code>read(char[])</code>. Does nothing when there is no
     * tokenizer.
     * 
     * On the first call of a read (<code>lineColPos == 0</code>) the state is
     * seeded from the tokenizer. A CR schedules a new line. An LF directly
     * after a CR belongs to the same line break and does not schedule
     * another one; any other LF does.
     * 
     * @param charBuffer
     *            the array-backed output buffer; chars before its position
     *            are scanned
     */
    private void calculateLineAndCol(CharBuffer charBuffer) {
        if (tokenizer != null) {
            if (lineColPos == 0) {
                line = tokenizer.getLine();
                col = tokenizer.getCol();
                nextCharOnNewLine = tokenizer.isNextCharOnNewLine();
                prevWasCR = tokenizer.isPrevCR();
            }

            char[] charArray = charBuffer.array();
            int i = lineColPos;
            while (i < charBuffer.position()) {
                if (nextCharOnNewLine) {
                    line++;
                    col = 1;
                    nextCharOnNewLine = false;
                } else {
                    col++;
                }

                char c = charArray[i];
                switch (c) {
                    case '\r':
                        nextCharOnNewLine = true;
                        prevWasCR = true;
                        break;
                    case '\n':
                        if (prevWasCR) {
                            // Second half of CRLF: the CR already scheduled
                            // the line break, so this LF takes no column.
                            col--;
                        } else {
                            nextCharOnNewLine = true;
                        }
                        prevWasCR = false;
                        break;
                    default:
                        // The flag means "the previous char was CR". Without
                        // clearing it here, every later bare LF would be
                        // mistaken for part of a CRLF pair.
                        prevWasCR = false;
                        break;
                }
                i++;
            }
            lineColPos = i;
        }
    }

    /**
     * Returns the next byte for an encoding sniffer, reading from the
     * underlying stream on demand. At most <code>SNIFFING_LIMIT</code> bytes
     * are ever handed out.
     * 
     * @return the next byte as a value in the range 0-255, or -1 when the
     *         sniffing limit or the end of the stream has been reached
     * @throws IOException
     *             if reading from the stream fails
     * @throws IllegalStateException
     *             if called when not in the sniffing state
     */
    public int readByte() throws IOException {
        if (!sniffing) {
            throw new IllegalStateException(
                    "readByte() called when not in the sniffing state.");
        }
        if (position == SNIFFING_LIMIT) {
            return -1;
        }
        if (position >= limit) {
            // Buffered bytes are used up; fetch more without exceeding the
            // sniffing limit.
            final int fetched = inputStream.read(byteArray, limit,
                    SNIFFING_LIMIT - limit);
            if (fetched == -1) {
                return -1;
            }
            limit += fetched;
        }
        return byteArray[position++] & 0xFF;
    }

    /**
     * Ad hoc experiment that prints how a reporting UTF-8 decoder behaves
     * with a one-char output buffer. It is fed first a four-byte sequence,
     * then input that begins with stray continuation bytes.
     * 
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
        final CharsetDecoder utf8Decoder = Charset.forName("UTF-8").newDecoder();
        utf8Decoder.onMalformedInput(CodingErrorAction.REPORT);
        utf8Decoder.onUnmappableCharacter(CodingErrorAction.REPORT);

        // Room for a single char only.
        final CharBuffer output = CharBuffer.wrap(new char[1]);

        final ByteBuffer fourByteSequence = ByteBuffer.wrap(new byte[] {
                (byte) 0xF0, (byte) 0x9D, (byte) 0x80, (byte) 0x80 });
        final ByteBuffer strayContinuations = ByteBuffer.wrap(new byte[] {
                (byte) 0xB8, (byte) 0x80, 0x63, 0x64, 0x65 });

        CoderResult outcome = utf8Decoder.decode(fourByteSequence, output,
                false);
        System.out.println(outcome);
        System.out.println(fourByteSequence);

        outcome = utf8Decoder.decode(strayContinuations, output, false);
        System.out.println(outcome);
        System.out.println(strayContinuations);
    }

    /**
     * Returns the column last computed for an error location.
     * 
     * @return the tracked column, or -1 when there is no tokenizer
     */
    public int getColumnNumber() {
        return (tokenizer == null) ? -1 : col;
    }

    /**
     * Returns the line last computed for an error location.
     * 
     * @return the tracked line, or -1 when there is no tokenizer
     */
    public int getLineNumber() {
        return (tokenizer == null) ? -1 : line;
    }

    /**
     * Returns the public identifier as reported by the tokenizer.
     * 
     * @return the tokenizer's public id, or <code>null</code> when there is
     *         no tokenizer
     */
    public String getPublicId() {
        return (tokenizer == null) ? null : tokenizer.getPublicId();
    }

    /**
     * Returns the system identifier as reported by the tokenizer.
     * 
     * @return the tokenizer's system id, or <code>null</code> when there is
     *         no tokenizer
     */
    public String getSystemId() {
        return (tokenizer == null) ? null : tokenizer.getSystemId();
    }

    /**
     * Reports an error to the error handler, if there is one, using this
     * reader as the locator.
     * 
     * @param message
     *            the error message
     * @throws IOException
     *             wrapping any <code>SAXException</code> thrown by the
     *             handler
     */
    private void err(String message) throws IOException {
        // TODO remove wrapping when changing read() to take a CharBuffer
        if (errorHandler == null) {
            return;
        }
        try {
            errorHandler.error(new SAXParseException(message, this));
        } catch (SAXException e) {
            IOException wrapped = new IOException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * Reports a warning to the error handler, if there is one, using this
     * reader as the locator.
     * 
     * @param message
     *            the warning message
     * @throws IOException
     *             wrapping any <code>SAXException</code> thrown by the
     *             handler
     */
    private void warn(String message) throws IOException {
        // TODO remove wrapping when changing read() to take a CharBuffer
        if (errorHandler == null) {
            return;
        }
        try {
            errorHandler.warning(new SAXParseException(message, this));
        } catch (SAXException e) {
            IOException wrapped = new IOException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * Returns the charset of the decoder currently in use.
     * 
     * @return the current decoder's charset
     */
    public Charset getCharset() {
        return this.decoder.charset();
    }

    /**
     * Not supported; only <code>read(char[])</code> is implemented.
     * 
     * @throws UnsupportedOperationException
     *             always
     * @see java.io.Reader#read()
     */
    @Override
    public int read() throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported; only <code>read(char[])</code> is implemented.
     * 
     * @throws UnsupportedOperationException
     *             always
     * @see java.io.Reader#read(char[], int, int)
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported; only <code>read(char[])</code> is implemented.
     * 
     * @throws UnsupportedOperationException
     *             always
     * @see java.io.Reader#read(java.nio.CharBuffer)
     */
    @Override
    public int read(CharBuffer target) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Replaces the current decoder with a fresh one for the given encoding
     * and configures it to report malformed and unmappable input.
     * 
     * @param newEnc
     *            the encoding to decode with from now on
     */
    public void switchEncoding(Encoding newEnc) {
        final CharsetDecoder replacement = newEnc.newDecoder();
        this.decoder = replacement;
        initDecoder();
    }
}