/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.sax; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** * Content handler decorator that makes sure that the character events * ({@link #characters(char[], int, int)} or * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated * content handler contain only valid XML characters. All invalid characters * are replaced with spaces. * <p> * The XML standard defines the following Unicode character ranges as * valid XML characters: * <pre> * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] * </pre> * <p> * Note that currently this class only detects those invalid characters whose * UTF-16 representation fits a single char. Also, this class does not ensure * that the UTF-16 encoding of incoming characters is correct. */ public class SafeContentHandler extends ContentHandlerDecorator { /** * Replacement for invalid characters. */ private static final char[] REPLACEMENT = new char[] { ' ' }; /** * Internal interface that allows both character and * ignorable whitespace content to be filtered the same way. */ protected interface Output { void write(char[] ch, int start, int length) throws SAXException; } /** * Output through the {@link ContentHandler#characters(char[], int, int)} * method of the decorated content handler. */ private final Output charactersOutput = new Output() { public void write(char[] ch, int start, int length) throws SAXException { SafeContentHandler.super.characters(ch, start, length); } }; /** * Output through the * {@link ContentHandler#ignorableWhitespace(char[], int, int)} * method of the decorated content handler. */ private final Output ignorableWhitespaceOutput = new Output() { public void write(char[] ch, int start, int length) throws SAXException { SafeContentHandler.super.ignorableWhitespace(ch, start, length); } }; public SafeContentHandler(ContentHandler handler) { super(handler); } /** * Filters and outputs the contents of the given input buffer. Any * invalid characters in the input buffer area handled by sending a * replacement (a space character) to the given output. Any sequences * of valid characters are passed as-is to the given output. * * @param ch input buffer * @param start start offset within the buffer * @param length number of characters to read from the buffer * @param output output channel * @throws SAXException if the filtered characters could not be written out */ private void filter(char[] ch, int start, int length, Output output) throws SAXException { int end = start + length; for (int i = start; i < end; i++) { if (isInvalid(ch[i])) { // Output any preceding valid characters if (i > start) { output.write(ch, start, i - start); } // Output the replacement for this invalid character writeReplacement(output); // Continue with the rest of the array start = i + 1; } } // Output any remaining valid characters output.write(ch, start, end - start); } /** * Checks whether the given character (more accurately a UTF-16 code unit) * is an invalid XML character and should be replaced for output. * Subclasses can override this method to use an alternative definition * of which characters should be replaced in the XML output. * * @param ch character * @return <code>true</code> if the character should be replaced, * <code>false</code> otherwise */ protected boolean isInvalid(char ch) { // TODO: Correct handling of multi-word characters if (ch < 0x20) { return ch != 0x09 && ch != 0x0A && ch != 0x0D; } else { return ch >= 0xFFFE; } } /** * Outputs the replacement for an invalid character. Subclasses can * override this method to use a custom replacement. * * @param output where the replacement is written to * @throws SAXException if the replacement could not be written */ protected void writeReplacement(Output output) throws SAXException { output.write(REPLACEMENT, 0, REPLACEMENT.length); } //------------------------------------------------------< ContentHandler > @Override public void characters(char[] ch, int start, int length) throws SAXException { filter(ch, start, length, charactersOutput); } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { filter(ch, start, length, ignorableWhitespaceOutput); } }