package cgeo.geocaching.files; import org.apache.commons.lang3.StringUtils; import java.io.FilterReader; import java.io.IOException; import java.io.Reader; /** * Filter reader which can filter out invalid XML characters and character references. * */ public class InvalidXMLCharacterFilterReader extends FilterReader { public InvalidXMLCharacterFilterReader(final Reader in) { super(in); } /** * Every overload of {@link Reader#read()} method delegates to this one so * it is enough to override only this one. <br /> * To skip invalid characters this method shifts only valid chars to left * and returns decreased value of the original read method. So after last * valid character there will be some unused chars in the buffer. * * @return Number of read valid characters or {@code -1} if end of the * underling reader was reached. */ @Override public int read(final char[] cbuf, final int off, final int len) throws IOException { final int read = super.read(cbuf, off, len); // check for end if (read == -1) { return -1; } // target position int pos = off - 1; int entityStart = -1; for (int readPos = off; readPos < off + read; readPos++) { boolean useChar = true; switch (cbuf[readPos]) { case '&': pos++; entityStart = readPos; break; case ';': pos++; if (entityStart >= 0) { final int entityLength = readPos - entityStart + 1; if (entityLength <= 5) { final String entity = new String(cbuf, entityStart, entityLength); if (StringUtils.startsWith(entity, "&#")) { final String numberString = StringUtils.substringBetween(entity, "&#", ";"); final int value; if (StringUtils.startsWith(numberString, "x")) { value = Integer.parseInt(numberString.substring(1), 16); } else { value = Integer.parseInt(numberString); } if (!isValidXMLChar((char) value)) { pos -= entityLength; useChar = false; } } } } break; default: if (isValidXMLChar(cbuf[readPos])) { pos++; } else { continue; } } // copy, and skip unwanted characters if (pos < readPos && useChar) { cbuf[pos] = cbuf[readPos]; } } return pos - off + 1; } private static boolean isValidXMLChar(final char c) { return c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD); } }