/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.util.xml; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.util.regex.Matcher; import org.omegat.util.OConsts; import org.omegat.util.PatternConsts; /** * This class automatically detects encoding of an inner XML file and constructs * a Reader with appropriate encoding. * <p> * Detecting of encoding is done by reading a value from XML header * <code><?xml version="1.0" encoding="..."?></code> * <p> * If encoding isn't specified, or it is not supported by Java platform, the * file is opened in default system encoding (ISO-8859-2 in USA, Windows-1251 on * my OS). * * @author Maxym Mykhalchuk */ public class XMLReader extends Reader { /** Inner reader */ private BufferedReader reader; /** * Creates a new instance of XMLReader. If encoding cannot be detected, * falls back to default encoding of Operating System. * * @param fileName * - the file to read */ public XMLReader(String fileName) throws IOException { this(fileName, null); } /** * Creates a new instance of XMLReader. If encoding cannot be detected, * falls back to supplied <code>encoding</code>, or (if supplied null, or * supplied encoding is not supported by JVM) falls back to default encoding * of Operating System. * * @param fileName * The file to read. * @param encoding * The encoding to use if we can't autodetect. */ public XMLReader(String fileName, String encoding) throws IOException { reader = new BufferedReader(createReader(new FileInputStream(fileName), encoding)); } public XMLReader(InputStream inputStream, String encoding) throws IOException { reader = new BufferedReader(createReader(inputStream, encoding)); } /** * Returns the reader of the underlying file in the correct encoding. * * <p> * We can detect the following: * <ul> * <li>UTF-16 with BOM (byte order mark) * <li>UTF-8 with BOM (byte order mark) * <li>Any other encoding with 8-bit Latin symbols (e.g. Windows-1251, UTF-8 * etc), if it is specified using XML/HTML-style encoding declarations. * </ul> * * <p> * Note that we cannot detect UTF-16 encoding, if there's no BOM! */ private Reader createReader(InputStream inputStream, String defaultEncoding) throws IOException { // BOM detection BufferedInputStream is = new BufferedInputStream(inputStream); is.mark(OConsts.READ_AHEAD_LIMIT); int char1 = is.read(); int char2 = is.read(); int char3 = is.read(); String encoding = null; if (char1 == 0xFE && char2 == 0xFF) { encoding = "UTF-16BE"; } else if (char1 == 0xFF && char2 == 0xFE) { encoding = "UTF-16LE"; } else if (char1 == 0xEF && char2 == 0xBB && char3 == 0xBF) { encoding = "UTF-8"; } is.reset(); if (encoding != null) { return new InputStreamReader(is, encoding); } is.mark(OConsts.READ_AHEAD_LIMIT); byte[] buf = new byte[OConsts.READ_AHEAD_LIMIT]; int len = is.read(buf); if (len > 0) { String buffer = defaultEncoding == null ? new String(buf, 0, len, Charset.defaultCharset()) : new String(buf, 0, len, defaultEncoding); Matcher matcher_xml = PatternConsts.XML_ENCODING.matcher(buffer); if (matcher_xml.find()) { encoding = matcher_xml.group(1); } } is.reset(); if (encoding != null) { return new InputStreamReader(is, encoding); } // default encoding if we couldn't detect it ourselves try { return new InputStreamReader(is, defaultEncoding); } catch (Exception e) { return new InputStreamReader(is, Charset.defaultCharset()); } } @Override public void close() throws IOException { reader.close(); } boolean readFirstTime = true; @Override public int read(char[] cbuf, int off, int len) throws IOException { // BOM (byte order mark) bugfix if (readFirstTime) { readFirstTime = false; reader.mark(1); int ch = reader.read(); if (ch != 0xFEFF) reader.reset(); } return reader.read(cbuf, off, len); } }