package org.gephi.project.io;
import java.io.FilterReader;
import java.io.IOException;
import java.io.Reader;
/**
* {@link FilterReader} to skip invalid xml version 1.0 characters. Valid
* Unicode chars for xml version 1.0 according to http://www.w3.org/TR/xml are
* #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]. In
* other words - any Unicode character, excluding the surrogate blocks, FFFE,
* and FFFF.
* <p>
* More details on the <a
* href="http://info.tsachev.org/2009/05/skipping-invalid-xml-character-with.html">blog</a>
*/
public class Xml10FilterReader extends FilterReader {

    /**
     * Creates filter reader which skips invalid xml characters.
     *
     * @param in original reader
     */
    public Xml10FilterReader(Reader in) {
        super(in);
    }

    /**
     * Reads a single character, dropping every character rejected by
     * {@code XMLChar.isValid}.
     * <p>
     * This override is required: {@link FilterReader#read()} forwards directly
     * to the wrapped reader's {@code read()} and does <em>not</em> go through
     * {@link #read(char[], int, int)}, so without it single-character reads
     * would bypass the filter entirely.
     *
     * @return the next valid character as an int, or <code>-1</code> if the
     *         end of the underlying reader was reached.
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read() throws IOException {
        int c;
        do {
            c = super.read();
            // The cast is safe here: c is either -1 (checked first) or a
            // value in the char range returned by the wrapped reader.
        } while (c != -1 && !XMLChar.isValid((char) c));
        return c;
    }

    /**
     * Reads characters into a portion of an array, keeping only those accepted
     * by {@code XMLChar.isValid}.
     * <p>
     * Valid characters are shifted to the left over the gaps left by invalid
     * ones, so the returned count may be smaller than what the underlying
     * reader delivered; array slots after the last valid character (up to the
     * raw count that was read) contain stale data and must be ignored.
     * <p>
     * If a whole chunk consists of invalid characters, the method reads again
     * instead of returning <code>0</code>, because callers commonly interpret
     * a zero-length read (for a non-zero <code>len</code>) as "no progress".
     * <p>
     * NOTE(review): characters are tested one <code>char</code> at a time, so
     * how surrogate halves of supplementary code points are treated depends on
     * {@code XMLChar.isValid} -- confirm against that helper.
     *
     * @param cbuf destination buffer
     * @param off offset at which to start storing characters
     * @param len maximum number of characters to read
     * @return number of valid characters stored, or <code>-1</code> if the end
     *         of the underlying reader was reached. <code>0</code> is returned
     *         only when the underlying reader itself returned <code>0</code>.
     * @throws IOException if the underlying reader fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        while (true) {
            int read = super.read(cbuf, off, len);
            /*
             * -1 means end of stream; 0 means the delegate made no progress
             * (e.g. len == 0). Pass both through unchanged so we never spin.
             */
            if (read <= 0) {
                return read;
            }
            int valid = compact(cbuf, off, read);
            if (valid > 0) {
                return valid;
            }
            // Everything in this chunk was filtered out: fetch the next chunk.
        }
    }

    /**
     * Moves the valid characters of <code>cbuf[off .. off + count)</code> to
     * the front of that range, preserving their order.
     *
     * @return number of valid characters now stored starting at <code>off</code>
     */
    private static int compact(char[] cbuf, int off, int count) {
        int writePos = off;
        int end = off + count;
        for (int readPos = off; readPos < end; readPos++) {
            char c = cbuf[readPos];
            if (XMLChar.isValid(c)) {
                // writePos <= readPos always holds, so this never overwrites
                // a character that has not been examined yet.
                cbuf[writePos++] = c;
            }
        }
        return writePos - off;
    }
}