/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk 2008 Didier Briel Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.filters3.xml; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.regex.Matcher; import org.omegat.util.OConsts; import org.omegat.util.PatternConsts; /** * This class automatically detects encoding of an inner XML file and constructs * a Reader with appropriate encoding. * <p> * Detecting of encoding is done first by reading a possible BOM, to detect * UTF-16 or UTF-8 then by reading a value from the XML header * <code><?xml version="1.0" encoding="..."?></code> * <p> * If encoding isn't specified, or it is not supported by the Java platform, the * file is opened in UTF-8, in compliance with the XML specifications * * * @author Maxym Mykhalchuk * @author Didier Briel */ public class XMLReader extends Reader { /** Inner reader */ private BufferedReader reader; /** Inner encoding. */ private String encoding; /** EOL chars used in source file. */ private String eol; /** Returns detected encoding. */ public String getEncoding() { return encoding; } /** Returns detected EOL chars. */ public String getEol() { return eol; } /** * Creates a new instance of XMLReader. If encoding cannot be detected, * falls back to default UTF-8. * * @param fileName * - the file to read */ public XMLReader(File file) throws IOException { this(file, null); } /** * Creates a new instance of XMLReader. If encoding cannot be detected, * falls back to supplied <code>encoding</code>, or (if supplied null, or * supplied encoding is not supported by JVM) falls back to UTF-8. * * @param fileName * The file to read. * @param encoding * The encoding to use if we can't autodetect. */ public XMLReader(File file, String encoding) throws IOException { reader = createReader(file, encoding); } /** * Returns the reader of the underlying file in the correct encoding. * * <p> * We can detect the following: * <ul> * <li>UTF-16 with BOM (byte order mark) * <li>UTF-8 with BOM (byte order mark) * <li>Any other encoding with 8-bit Latin symbols (e.g. Windows-1251, UTF-8 * etc), if it is specified using XML/HTML-style encoding declarations. * </ul> * * <p> * Note that we cannot detect UTF-16 encoding, if there's no BOM! */ private BufferedReader createReader(File file, String defaultEncoding) throws IOException { // BOM detection BufferedInputStream is = new BufferedInputStream(new FileInputStream(file)); is.mark(OConsts.READ_AHEAD_LIMIT); int char1 = is.read(); int char2 = is.read(); int char3 = is.read(); encoding = null; if (char1 == 0xFE && char2 == 0xFF) { encoding = "UTF-16BE"; } if (char1 == 0xFF && char2 == 0xFE) { encoding = "UTF-16LE"; } if (char1 == 0xEF && char2 == 0xBB && char3 == 0xBF) { encoding = "UTF-8"; } is.reset(); if (encoding != null) { return createReaderAndDetectEOL(is, encoding); } is.mark(OConsts.READ_AHEAD_LIMIT); byte[] buf = new byte[OConsts.READ_AHEAD_LIMIT]; int len = is.read(buf); if (len > 0) { String buffer = defaultEncoding == null ? new String(buf, 0, len, Charset.defaultCharset()) : new String(buf, 0, len, defaultEncoding); Matcher matcherXml = PatternConsts.XML_ENCODING.matcher(buffer); if (matcherXml.find()) { encoding = matcherXml.group(1); } } is.reset(); if (encoding != null) { return createReaderAndDetectEOL(is, encoding); } // UTF-8 if we couldn't detect it ourselves try { return createReaderAndDetectEOL(is, StandardCharsets.UTF_8.name()); } catch (Exception e) { return createReaderAndDetectEOL(is, null); } } private BufferedReader createReaderAndDetectEOL(InputStream is, String encoding) throws IOException { InputStreamReader isr = encoding == null ? new InputStreamReader(is, Charset.defaultCharset()) : new InputStreamReader(is, encoding); BufferedReader rd = new BufferedReader(isr, OConsts.READ_AHEAD_LIMIT); rd.mark(OConsts.READ_AHEAD_LIMIT); for (int i = 0; i < OConsts.READ_AHEAD_LIMIT; i++) { char ch = (char) rd.read(); if (ch == '\r' || ch == '\n') { if (eol == null) { eol = ""; } else if (eol.codePointAt(0) == ch) { // duplicate char - this is second line rd.reset(); return rd; } eol += ch; if (eol.codePointCount(0, eol.length()) == 2) { // second char - latest rd.reset(); return rd; } } else { if (eol != null) { rd.reset(); return rd; } } } // no eols found - assume '\n' eol = "\n"; rd.reset(); return rd; } public void close() throws IOException { reader.close(); } boolean readFirstTime = true; public int read(char[] cbuf, int off, int len) throws IOException { // BOM (byte order mark) bugfix if (readFirstTime) { readFirstTime = false; reader.mark(1); int ch = reader.read(); if (ch != 0xFEFF) { reader.reset(); } } return reader.read(cbuf, off, len); } }