/* * Copyright (c) 1998-2011 Caucho Technology -- all rights reserved * * This file is part of Resin(R) Open Source * * Each copy or derived work must preserve the copyright notice and this * notice unmodified. * * Resin Open Source is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Resin Open Source is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty * of NON-INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with Resin Open Source; if not, write to the * Free SoftwareFoundation, Inc. * 59 Temple Place, Suite 330 * Boston, MA 02111-1307 USA * * @author Scott Ferguson */ package com.caucho.xml2.readers; import com.caucho.util.CharBuffer; import com.caucho.vfs.ReadStream; import com.caucho.xml2.XmlParser; import java.io.CharConversionException; import java.io.EOFException; import java.io.IOException; /** * A fast reader to convert bytes to characters for parsing XML. */ public class Utf8Reader extends XmlReader { /** * Create a new reader. */ public Utf8Reader() { } /** * Create a new reader with the given read stream. */ public Utf8Reader(XmlParser parser, ReadStream is) { super(parser, is); } /** * Read the next character, returning -1 on end of file.. */ public int read() throws IOException { int ch1 = _is.read(); if (ch1 == '\n') { _parser.setLine(++_line); return ch1; } else if (ch1 == '\r') { _parser.setLine(++_line); int ch2 = _is.read(); if (ch2 == '\n') return '\n'; if (ch2 < 0) { } else if (ch2 < 0x80) _parser.unread(ch2); else _parser.unread(readSecond(ch2)); return '\n'; } else if (ch1 < 0x80) return ch1; else return readSecond(ch1); } private int readSecond(int ch1) throws IOException { if ((ch1 & 0xe0) == 0xc0) { int ch2 = _is.read(); if (ch2 < 0) throw new EOFException("unexpected end of file in utf8 character"); else if ((ch2 & 0xc0) != 0x80) throw error(L.l("illegal utf8 encoding {0}", hex(ch1))); return ((ch1 & 0x1f) << 6) + (ch2 & 0x3f); } else if ((ch1 & 0xf0) == 0xe0) { int ch2 = _is.read(); int ch3 = _is.read(); if (ch2 < 0) throw new EOFException("unexpected end of file in utf8 character"); else if ((ch2 & 0xc0) != 0x80) throw error(L.l("illegal utf8 encoding at {0} {1} {2}", hex(ch1), hex(ch2), hex(ch3))); if (ch3 < 0) throw new EOFException("unexpected end of file in utf8 character"); else if ((ch3 & 0xc0) != 0x80) throw error(L.l("illegal utf8 encoding {0} {1} {2}", hex(ch1), hex(ch2), hex(ch3))); int ch = ((ch1 & 0x1f) << 12) + ((ch2 & 0x3f) << 6) + (ch3 & 0x3f); if (ch == 0xfeff) // handle some writers, e.g. microsoft return read(); else return ch; } else throw error(L.l("illegal utf8 encoding at {0}", hex(ch1))); } private String hex(int n) { n = n & 0xff; CharBuffer cb = CharBuffer.allocate(); cb.append("0x"); int d = n / 16; if (d >= 0 && d <= 9) cb.append((char) ('0' + d)); else cb.append((char) ('a' + d - 10)); d = n % 16; if (d >= 0 && d <= 9) cb.append((char) ('0' + d)); else cb.append((char) ('a' + d - 10)); return cb.close(); } private CharConversionException error(String msg) { String filename = _parser.getFilename(); int line = _parser.getLine(); if (filename != null) return new CharConversionException(filename + ":" + line + ": " + msg); else return new CharConversionException(msg); } }