/* * Copyright (c) 2006, 2007 Henri Sivonen * Copyright (c) 2007 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package nu.validator.htmlparser.extra; import nu.validator.htmlparser.common.CharacterHandler; import org.xml.sax.ErrorHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.UnicodeSet; /** * @version $Id$ * @author hsivonen */ public final class NormalizationChecker implements CharacterHandler { private ErrorHandler errorHandler; private Locator locator; /** * A thread-safe set of composing characters as per Charmod Norm. */ @SuppressWarnings("deprecation") private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet( "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze(); // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908 /** * A buffer for holding sequences overlap the SAX buffer boundary. */ private char[] buf = new char[128]; /** * A holder for the original buffer (for the memory leak prevention * mechanism). */ private char[] bufHolder = null; /** * The current used length of the buffer, i.e. the index of the first slot * that does not hold current data. */ private int pos; /** * Indicates whether the checker the next call to <code>characters()</code> * is the first call in a run. */ private boolean atStartOfRun; /** * Indicates whether the current run has already caused an error. */ private boolean alreadyComplainedAboutThisRun; /** * Emit an error. The locator is used. * * @param message the error message * @throws SAXException if something goes wrong */ public void err(String message) throws SAXException { if (errorHandler != null) { SAXParseException spe = new SAXParseException(message, locator); errorHandler.error(spe); } } /** * Returns <code>true</code> if the argument is a composing BMP character * or a surrogate and <code>false</code> otherwise. * * @param c a UTF-16 code unit * @return <code>true</code> if the argument is a composing BMP character * or a surrogate and <code>false</code> otherwise */ private static boolean isComposingCharOrSurrogate(char c) { if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) { return true; } return isComposingChar(c); } /** * Returns <code>true</code> if the argument is a composing character * and <code>false</code> otherwise. * * @param c a Unicode code point * @return <code>true</code> if the argument is a composing character * <code>false</code> otherwise */ private static boolean isComposingChar(int c) { return COMPOSING_CHARACTERS.contains(c); } /** * Constructor with mode selection. * * @param sourceTextMode whether the source text-related messages * should be enabled. */ public NormalizationChecker(Locator locator) { super(); start(); } /** * @see nu.validator.htmlparser.common.CharacterHandler#start() */ public void start() { atStartOfRun = true; alreadyComplainedAboutThisRun = false; pos = 0; } /** * @see nu.validator.htmlparser.common.CharacterHandler#characters(char[], int, int) */ public void characters(char[] ch, int start, int length) throws SAXException { if (alreadyComplainedAboutThisRun) { return; } if (atStartOfRun) { char c = ch[start]; if (pos == 1) { // there's a single high surrogate in buf if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) { err("Text run starts with a composing character."); } atStartOfRun = false; } else { if (length == 1 && UCharacter.isHighSurrogate(c)) { buf[0] = c; pos = 1; return; } else { if (UCharacter.isHighSurrogate(c)) { if (isComposingChar(UCharacter.getCodePoint(c, ch[start + 1]))) { err("Text run starts with a composing character."); } } else { if (isComposingCharOrSurrogate(c)) { err("Text run starts with a composing character."); } } atStartOfRun = false; } } } int i = start; int stop = start + length; if (pos > 0) { // there's stuff in buf while (i < stop && isComposingCharOrSurrogate(ch[i])) { i++; } appendToBuf(ch, start, i); if (i == stop) { return; } else { if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) { errAboutTextRun(); } pos = 0; } } if (i < stop) { start = i; i = stop - 1; while (i > start && isComposingCharOrSurrogate(ch[i])) { i--; } if (i > start) { if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) { errAboutTextRun(); } } appendToBuf(ch, i, stop); } } /** * Emits an error stating that the current text run or the source * text is not in NFC. * * @throws SAXException if the <code>ErrorHandler</code> throws */ private void errAboutTextRun() throws SAXException { err("Source text is not in Unicode Normalization Form C."); alreadyComplainedAboutThisRun = true; } /** * Appends a slice of an UTF-16 code unit array to the internal * buffer. * * @param ch the array from which to copy * @param start the index of the first element that is copied * @param end the index of the first element that is not copied */ private void appendToBuf(char[] ch, int start, int end) { if (start == end) { return; } int neededBufLen = pos + (end - start); if (neededBufLen > buf.length) { char[] newBuf = new char[neededBufLen]; System.arraycopy(buf, 0, newBuf, 0, pos); if (bufHolder == null) { bufHolder = buf; // keep the original around } buf = newBuf; } System.arraycopy(ch, start, buf, pos, end - start); pos += (end - start); } /** * @see nu.validator.htmlparser.common.CharacterHandler#end() */ public void end() throws SAXException { if (!alreadyComplainedAboutThisRun && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) { errAboutTextRun(); } if (bufHolder != null) { // restore the original small buffer to avoid leaking // memory if this checker is recycled buf = bufHolder; bufHolder = null; } } public void setErrorHandler(ErrorHandler errorHandler) { this.errorHandler = errorHandler; } }