/* * Copyright (c) 2009 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package nu.validator.htmlparser.io; import java.io.IOException; import java.nio.charset.UnsupportedCharsetException; import nu.validator.htmlparser.common.ByteReadable; import nu.validator.htmlparser.impl.MetaScanner; import org.xml.sax.ErrorHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; public class MetaSniffer extends MetaScanner implements Locator { private Encoding characterEncoding = null; private final ErrorHandler errorHandler; private final Locator locator; private int line = 1; private int col = 0; private boolean prevWasCR = false; public MetaSniffer(ErrorHandler eh, Locator locator) { this.errorHandler = eh; this.locator = locator; this.characterEncoding = null; } /** * -1 means end. * @return * @throws IOException */ protected int read() throws IOException { int b = readable.readByte(); // [NOCPP[ switch (b) { case '\n': if (!prevWasCR) { line++; col = 0; } prevWasCR = false; break; case '\r': line++; col = 0; prevWasCR = true; break; default: col++; prevWasCR = false; break; } // ]NOCPP] return b; } /** * Main loop. * * @return * * @throws SAXException * @throws IOException * @throws */ public Encoding sniff(ByteReadable readable) throws SAXException, IOException { this.readable = readable; stateLoop(stateSave); return characterEncoding; } /** * @param string * @throws SAXException */ private void err(String message) throws SAXException { if (errorHandler != null) { SAXParseException spe = new SAXParseException(message, this); errorHandler.error(spe); } } /** * @param string * @throws SAXException */ private void warn(String message) throws SAXException { if (errorHandler != null) { SAXParseException spe = new SAXParseException(message, this); errorHandler.warning(spe); } } public int getColumnNumber() { return col; } public int getLineNumber() { return line; } public String getPublicId() { if (locator != null) { return locator.getPublicId(); } return null; } public String getSystemId() { if (locator != null) { return locator.getSystemId(); } return null; } protected boolean tryCharset(String encoding) throws SAXException { encoding = Encoding.toAsciiLowerCase(encoding); try { // XXX spec says only UTF-16 if ("utf-16".equals(encoding) || "utf-16be".equals(encoding) || "utf-16le".equals(encoding) || "utf-32".equals(encoding) || "utf-32be".equals(encoding) || "utf-32le".equals(encoding)) { this.characterEncoding = Encoding.UTF8; err("The internal character encoding declaration specified \u201C" + encoding + "\u201D which is not a rough superset of ASCII. Using \u201CUTF-8\u201D instead."); return true; } else { Encoding cs = Encoding.forName(encoding); String canonName = cs.getCanonName(); if (!cs.isAsciiSuperset()) { err("The encoding \u201C" + encoding + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration. Continuing the sniffing algorithm."); return false; } if (!cs.isRegistered()) { if (encoding.startsWith("x-")) { err("The encoding \u201C" + encoding + "\u201D is not an IANA-registered encoding. (Charmod C022)"); } else { err("The encoding \u201C" + encoding + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)"); } } else if (!cs.getCanonName().equals(encoding)) { err("The encoding \u201C" + encoding + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" + canonName + "\u201D. (Charmod C024)"); } if (cs.isShouldNot()) { warn("Authors should not use the character encoding \u201C" + encoding + "\u201D. It is recommended to use \u201CUTF-8\u201D."); } else if (cs.isObscure()) { warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); } Encoding actual = cs.getActualHtmlEncoding(); if (actual == null) { this.characterEncoding = cs; } else { warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D."); this.characterEncoding = actual; } return true; } } catch (UnsupportedCharsetException e) { err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will continue sniffing."); } return false; } }