/*
* XMIResultFormatter.java
*
* Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU General Public License.
*
* authors: Andreas Fay, Jannik Strötgen
* email: fay@stud.uni-heidelberg.de, stroetgen@uni-hd.de
*
* HeidelTime is a multilingual, cross-domain temporal tagger.
* For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
*/
package de.unihd.dbs.heideltime.standalone.components.impl;
import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.XMLSerializer;
import de.unihd.dbs.heideltime.standalone.components.ResultFormatter;
/**
* Result formatter based on XMI.
*
* @see {@link org.apache.uima.examples.xmi.XmiWriterCasConsumer}
*
* @author Andreas Fay, University of Heidelberg
* @version 1.0
*/
public class XMIResultFormatter implements ResultFormatter {
@Override
public String format(JCas jcas) throws Exception {
ByteArrayOutputStream outStream = null;
try {
// Write XMI
outStream = new ByteArrayOutputStream();
XmiCasSerializer ser = new XmiCasSerializer(jcas.getTypeSystem());
XMLSerializer xmlSer = new XMLSerializer(outStream, false);
ser.serialize(jcas.getCas(), xmlSer.getContentHandler());
// Convert output stream to string
// String newOut = outStream.toString("UTF-8");
String newOut = outStream.toString();
// System.err.println("NEWOUT:"+newOut);
//
// if (newOut.matches("^<\\?xml version=\"1.0\" encoding=\"UTF-8\"\\?>.*$")){
// newOut = newOut.replaceFirst("<\\?xml version=\"1.0\" encoding=\"UTF-8\"\\?>",
// "<\\?xml version=\"1.0\" encoding=\""+Charset.defaultCharset().name()+"\"\\?>");
// }
// if (newOut.matches("^.*?sofaString=\"(.*?)\".*$")){
// for (MatchResult r : findMatches(Pattern.compile("^(.*?sofaString=\")(.*?)(\".*)$"), newOut)){
// String stringBegin = r.group(1);
// String sofaString = r.group(2);
// System.err.println("SOFASTRING:"+sofaString);
// String stringEnd = r.group(3);
// // The sofaString is encoded as UTF-8.
// // However, at this point it has to be translated back into the defaultCharset.
// byte[] defaultDocText = new String(sofaString.getBytes(), "UTF-8").getBytes(Charset.defaultCharset().name());
// String docText = new String(defaultDocText);
// System.err.println("DOCTEXT:"+docText);
// newOut = stringBegin + docText + stringEnd;
//// newOut = newOut.replaceFirst("sofaString=\".*?\"", "sofaString=\"" + docText + "\"");
// }
// }
// System.err.println("NEWOUT:"+newOut);
return newOut;
} finally {
if (outStream != null) {
outStream.close();
}
}
}
/**
* Find all the matches of a pattern in a charSequence and return the
* results as list.
*
* @param pattern
* @param s
* @return
*/
public static Iterable<MatchResult> findMatches(Pattern pattern,
CharSequence s) {
List<MatchResult> results = new ArrayList<MatchResult>();
for (Matcher m = pattern.matcher(s); m.find();)
results.add(m.toMatchResult());
return results;
}
}