/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
2015 Didier Briel
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.filters2.html2;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import org.omegat.util.OConsts;
import org.omegat.util.PatternConsts;
/**
* This class automatically detects the encoding of an HTML file and
* constructs a Reader with the appropriate encoding. Encoding detection is done
* by looking for a byte order mark (BOM), then for a possible
* <code><META http-equiv="content-type" content="text/html; charset=..."></code>
* declaration, and then for a value from the XML header (in case there is one)
* <code><?xml version="1.0" encoding="..."?></code>. If the encoding isn't
* specified, or it is not supported by the Java platform, the file is opened in
* the encoding passed to the constructor or, failing that, in the default
* system encoding (which depends on the operating system and its locale
* settings).
*
* @author Maxym Mykhalchuk
* @author Didier Briel
*/
public class HTMLReader extends Reader {
    /** Inner reader, decoding the file with the encoding chosen by {@link #createReader}. */
    private BufferedReader reader;

    /** Name of the encoding selected for reading the file; set while the reader is created. */
    private String encoding = null;

    /**
     * <code>true</code> until the first call to {@link #read(char[], int, int)},
     * which uses it to skip a leading byte order mark character exactly once.
     */
    boolean readFirstTime = true;

    /**
     * Creates a new instance of HTMLReader. If encoding cannot be detected,
     * falls back to supplied <code>encoding</code>, or (if supplied null, or
     * supplied encoding is not supported by JVM) falls back to default encoding
     * of Operating System.
     *
     * @param fileName
     *            The file to read.
     * @param encoding
     *            The encoding to use if we can't autodetect.
     * @throws IOException
     *             if the file cannot be opened or read.
     */
    public HTMLReader(String fileName, String encoding) throws IOException {
        reader = new BufferedReader(createReader(fileName, encoding));
    }

    /**
     * Returns encoding that was used to read the HTML file.
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Returns the reader of the underlying file in the correct encoding.
     *
     * <p>
     * We can detect the following:
     * <ul>
     * <li>UTF-16 with BOM (byte order mark)
     * <li>UTF-8 with BOM (byte order mark)
     * <li>Any other encoding with 8-bit Latin symbols (e.g. Windows-1251, UTF-8
     * etc), if it is specified using XML/HTML-style encoding declarations.
     * </ul>
     * <p>
     * Note that we cannot detect UTF-16 encoding, if there's no BOM!
     * <p>
     * The file stream is closed again if detection fails with an exception, so
     * a failed construction does not leak a file handle.
     *
     * @param fileName
     *            The file to open.
     * @param defaultEncoding
     *            The encoding to use when none can be detected; may be null or
     *            unsupported, in which case the platform default is used.
     * @throws IOException
     *             if the file cannot be opened or read.
     */
    private Reader createReader(String fileName, String defaultEncoding) throws IOException {
        BufferedInputStream is = new BufferedInputStream(new FileInputStream(fileName));
        boolean opened = false;
        try {
            Reader result = openWithDetectedEncoding(is, defaultEncoding);
            opened = true;
            return result;
        } finally {
            if (!opened) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // The exception that made detection fail is the one to report.
                }
            }
        }
    }

    /**
     * Detects the encoding of the stream content, records it in
     * {@link #encoding} and wraps the stream (rewound to its start) in a
     * reader using that encoding.
     */
    private Reader openWithDetectedEncoding(BufferedInputStream is, String defaultEncoding)
            throws IOException {
        // BOM detection
        is.mark(OConsts.READ_AHEAD_LIMIT);
        int byte1 = is.read();
        int byte2 = is.read();
        int byte3 = is.read();
        is.reset();

        String bomEncoding = null;
        if (byte1 == 0xFE && byte2 == 0xFF) {
            bomEncoding = "UTF-16BE";
        } else if (byte1 == 0xFF && byte2 == 0xFE) {
            bomEncoding = "UTF-16LE";
        } else if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
            bomEncoding = "UTF-8";
        }
        if (bomEncoding != null) {
            encoding = bomEncoding;
            return new InputStreamReader(is, bomEncoding);
        }

        // Resolve the caller's fallback once; null means "missing or unsupported".
        Charset fallbackCharset = lookupCharset(defaultEncoding);

        // Read the head of the file and look for an encoding declaration in it.
        is.mark(OConsts.READ_AHEAD_LIMIT);
        byte[] buf = new byte[OConsts.READ_AHEAD_LIMIT];
        int len = is.read(buf);
        // reset the inputstream to its start
        is.reset();

        String declaredEncoding = null;
        if (len > 0) {
            // An unsupported default encoding must not abort detection, so the
            // head is decoded with the platform charset in that case.
            Charset sniffCharset = fallbackCharset != null ? fallbackCharset : Charset.defaultCharset();
            declaredEncoding = findDeclaredEncoding(new String(buf, 0, len, sniffCharset));
        }

        // try the encoding specified in the file first
        Charset declaredCharset = lookupCharset(declaredEncoding);
        if (declaredCharset != null) {
            encoding = declaredEncoding;
            return new InputStreamReader(is, declaredCharset);
        }

        // then the default encoding supplied by the caller
        if (fallbackCharset != null) {
            encoding = defaultEncoding;
            return new InputStreamReader(is, fallbackCharset);
        }

        // finally the platform default
        Charset platformCharset = Charset.defaultCharset();
        encoding = platformCharset.name();
        return new InputStreamReader(is, platformCharset);
    }

    /**
     * Searches the decoded head of the file for an encoding declaration,
     * trying the HTML pattern, then the HTML5 pattern, then the XML pattern.
     *
     * @return the declared encoding name, or null if none of the patterns match.
     */
    private static String findDeclaredEncoding(String head) {
        Matcher htmlMatcher = PatternConsts.HTML_ENCODING.matcher(head);
        if (htmlMatcher.find()) {
            return htmlMatcher.group(1);
        }
        Matcher html5Matcher = PatternConsts.HTML5_ENCODING.matcher(head);
        if (html5Matcher.find()) {
            return html5Matcher.group(1);
        }
        Matcher xmlMatcher = PatternConsts.XML_ENCODING.matcher(head);
        if (xmlMatcher.find()) {
            return xmlMatcher.group(1);
        }
        return null;
    }

    /**
     * Resolves a charset name without throwing.
     *
     * @return the charset, or null if the name is null, syntactically illegal
     *         or not supported by this JVM.
     */
    private static Charset lookupCharset(String name) {
        if (name == null) {
            return null;
        }
        try {
            return Charset.isSupported(name) ? Charset.forName(name) : null;
        } catch (IllegalArgumentException e) {
            // Illegal charset names are treated the same as unsupported ones.
            return null;
        }
    }

    /**
     * Closes the inner reader and, through it, the underlying file.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

    /**
     * Reads characters into a portion of an array. On the very first call a
     * leading byte order mark character (U+FEFF), if present, is consumed so
     * that callers never see it.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        // BOM (byte order mark) bugfix
        if (readFirstTime) {
            readFirstTime = false;
            reader.mark(1);
            int ch = reader.read();
            if (ch != 0xFEFF) {
                // Not a BOM: put the character back
                reader.reset();
            }
        }
        return reader.read(cbuf, off, len);
    }
}