package dima; import com.google.common.base.Charsets; import org.apache.commons.io.IOUtils; import org.apache.uima.UIMAException; import org.apache.uima.jcas.JCas; import org.xml.sax.SAXException; import java.io.IOException; import java.io.InputStream; /** * Date: 30.09.13 * Time: 17:30 * * @author Johannes Kirschnick * adapted from http://svn.apache.org/repos/asf/uima/uimaj/tags/uimaj-2.3.0/uimaj-2.3.0-07/uimaj-core/src/main/java/org/apache/uima/util/TCasToInlineXml.java */ public abstract class UIMAConverterHelper<T> { public abstract T serialize(JCas jCas) throws IOException, SAXException; public abstract JCas deserialize(InputStream inputStream, JCas newElement) throws IOException, UIMAException, InterruptedException, SAXException, ClassNotFoundException; public static String sanitizeString(String input) { // sanitize the content // get document text if(input != null) { char[] docCharArray = input.toCharArray(); replaceInvalidXmlChars(docCharArray); return String.valueOf(docCharArray); } return input; } private static void replaceInvalidXmlChars(char[] aChars) { for (int i = 0; i < aChars.length; i++) { if ((aChars[i] < 0x20 && aChars[i] != 0x09 && aChars[i] != 0x0A && aChars[i] != 0x0D) || (aChars[i] > 0xD7FF && aChars[i] < 0xE000) || aChars[i] == 0xFFFE || aChars[i] == 0xFFFF) { // System.out.println("Found invalid XML character: " + (int)aChars[i] + " at position " + // i); //temp aChars[i] = ' '; } } } public JCas deserialize(String input, JCas newElement) throws IOException, UIMAException, InterruptedException, SAXException, ClassNotFoundException { return deserialize(IOUtils.toInputStream(input, Charsets.UTF_8.name()), newElement); } }