package org.gbif.occurrence.util;

import java.io.FilterReader;
import java.io.IOException;
import java.io.Reader;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extends FilterReader to clean character streams of invalid xml characters while streaming. The set of valid
 * chars is defined by the w3c here: http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char.
 * <p>
 * The underlying reader delivers UTF-16 code units, so characters outside the basic multilingual plane arrive as
 * a high surrogate followed by a low surrogate. A well-formed pair always encodes a code point in the valid range
 * 0x10000-0x10FFFF and is passed through unchanged; an unpaired surrogate is dropped like any other invalid char.
 * <p>
 * Note that this sanitizing is for the entire xml stream - the problem of illegal characters within elements/CDATA
 * sections (e.g. &lt; &gt; &amp; ) is not handled by this reader.
 * <p>
 * All stateful operations synchronize on the reader's {@code lock}.
 * TODO: move to gbif-common project
 *
 * @author oliver
 */
public class XmlSanitizingReader extends FilterReader {

  private static final Logger LOG = LoggerFactory.getLogger(XmlSanitizingReader.class);

  /** Marker meaning that no low surrogate is waiting to be handed out. */
  private static final int NO_PENDING_CHAR = -1;

  private boolean endOfStreamReached = false;

  /**
   * Second half of a surrogate pair whose first half has already been returned, or
   * {@link #NO_PENDING_CHAR} when there is none.
   */
  private int pendingLowSurrogate = NO_PENDING_CHAR;

  /**
   * Wraps the given reader.
   *
   * @param in the character stream to sanitize
   */
  public XmlSanitizingReader(Reader in) {
    super(in);
    LOG.debug("Starting XmlSanitizingReader");
  }

  /**
   * Reads the next valid xml char (as a UTF-16 code unit), skipping any invalid ones.
   *
   * @return the char read, or -1 if the end of the stream has been reached
   * @throws IOException if the underlying reader fails or this reader has been closed
   */
  @Override
  public int read() throws IOException {
    synchronized (lock) {
      int nextChar = nextValidXmlChar();
      LOG.debug("call to read(), returning [{}]", nextChar);
      return nextChar;
    }
  }

  /**
   * Fills the buffer with up to {@code length} valid xml chars, skipping invalid ones. Keeps reading from the
   * underlying stream until {@code length} chars have been stored or the end of the stream is reached.
   * BufferedReader relies on -1 being returned only when the end of stream is hit before any char was read.
   *
   * @param buffer destination buffer
   * @param offset index in the buffer at which to start storing chars
   * @param length maximum number of chars to read
   * @return the number of chars stored, or -1 if the end of the stream was reached before any char was read
   * @throws IOException if the underlying reader fails or this reader has been closed
   * @throws IndexOutOfBoundsException if offset/length do not describe a range inside the buffer
   */
  @Override
  public int read(char[] buffer, int offset, int length) throws IOException {
    synchronized (lock) {
      LOG.debug("call to read(b, o, l) with l [{}]", length);
      ensureOpen();
      // Validate before touching the stream so a bad call does not consume (and lose) input.
      if (offset < 0 || length < 0 || length > buffer.length - offset) {
        throw new IndexOutOfBoundsException(
          "offset [" + offset + "], length [" + length + "], buffer size [" + buffer.length + "]");
      }

      int charsRead = 0;
      while (charsRead < length) {
        int nextChar = nextValidXmlChar();
        if (nextChar == -1) {
          break;
        }
        // nextValidXmlChar only returns UTF-16 code units, so the narrowing cast is lossless
        buffer[offset + charsRead] = (char) nextChar;
        charsRead++;
      }

      if (charsRead == 0 && length > 0) {
        LOG.debug("End of stream is first char read: returning -1");
        return -1;
      }
      if (charsRead < length) {
        LOG.debug("At end of stream having read [{}] of requested [{}]", charsRead, length);
      }
      return charsRead;
    }
  }

  /**
   * Skips up to {@code n} sanitized chars, i.e. chars that {@link #read()} would have returned. Invalid chars
   * encountered on the way are dropped and not counted.
   *
   * @param n the number of chars to skip
   * @return the number of chars actually skipped
   * @throws IllegalArgumentException if {@code n} is negative
   * @throws IOException if the underlying reader fails or this reader has been closed
   */
  @Override
  public long skip(long n) throws IOException {
    if (n < 0) {
      throw new IllegalArgumentException("skip value is negative");
    }
    synchronized (lock) {
      ensureOpen();
      long skipped = 0;
      // Going through nextValidXmlChar keeps the surrogate pair state consistent with the stream position.
      while (skipped < n && nextValidXmlChar() != -1) {
        skipped++;
      }
      return skipped;
    }
  }

  /**
   * Tells whether a char may be available. Because invalid chars are dropped while reading, a read following a
   * {@code true} result can still block if the chars currently available from the underlying reader are all
   * invalid.
   *
   * @return true if a buffered char is waiting, or the end of stream has not been seen and the underlying
   *         reader reports ready
   * @throws IOException if the underlying reader fails or this reader has been closed
   */
  @Override
  public boolean ready() throws IOException {
    synchronized (lock) {
      ensureOpen();
      if (pendingLowSurrogate != NO_PENDING_CHAR) {
        return true;
      }
      return !endOfStreamReached && in.ready();
    }
  }

  /**
   * Closes the underlying reader. Calling this on an already closed reader has no effect.
   *
   * @throws IOException if closing the underlying reader fails
   */
  @Override
  public void close() throws IOException {
    synchronized (lock) {
      if (in == null) {
        return;
      }
      in.close();
      in = null;
      pendingLowSurrogate = NO_PENDING_CHAR;
    }
  }

  /**
   * @return false always - mark/reset is not supported by this reader
   */
  @Override
  public boolean markSupported() {
    return false;
  }

  /**
   * Returns the next UTF-16 code unit that belongs to a valid xml char, dropping everything else.
   *
   * @return the next valid char, or -1 at end of stream
   * @throws IOException if the underlying reader fails or this reader has been closed
   */
  private int nextValidXmlChar() throws IOException {
    synchronized (lock) {
      ensureOpen();

      // Second half of a pair whose first half was handed out by the previous call.
      if (pendingLowSurrogate != NO_PENDING_CHAR) {
        int lowSurrogate = pendingLowSurrogate;
        pendingLowSurrogate = NO_PENDING_CHAR;
        return lowSurrogate;
      }

      int nextChar = in.read();
      // -1 means end of stream
      while (nextChar != -1) {
        if (Character.isHighSurrogate((char) nextChar)) {
          int following = in.read();
          if (following != -1 && Character.isLowSurrogate((char) following)) {
            // Well-formed pair: the encoded code point is always within 0x10000-0x10FFFF, which is valid xml.
            pendingLowSurrogate = following;
            return nextChar;
          }
          logDropped(nextChar);
          // The char after the unpaired surrogate has not been judged yet, so examine it next.
          nextChar = following;
          continue;
        }
        if (isValidXml(nextChar)) {
          return nextChar;
        }
        // Lone low surrogates end up here too, since they are outside every valid range.
        logDropped(nextChar);
        nextChar = in.read();
      }

      endOfStreamReached = true;
      return -1;
    }
  }

  /** Logs, at debug level, that the given char is being removed from the stream. */
  private void logDropped(int droppedChar) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Dropping invalid xml char [0x{}]", Integer.toHexString(droppedChar));
    }
  }

  /** Fails with an IOException, rather than a NullPointerException, when the reader has been closed. */
  private void ensureOpen() throws IOException {
    if (in == null) {
      throw new IOException("Stream closed");
    }
  }

  /**
   * Checks a value against the xml 1.0 Char production.
   *
   * @param charVal a code point (a single non-surrogate code unit is its own code point)
   * @return true if the value is allowed in an xml document
   */
  private boolean isValidXml(int charVal) {
    return charVal == 0x9
      || charVal == 0xA
      || charVal == 0xD
      || (charVal >= 0x20 && charVal <= 0xD7FF)
      || (charVal >= 0xE000 && charVal <= 0xFFFD)
      || (charVal >= 0x10000 && charVal <= 0x10FFFF);
  }
}