package jav.correctionBackend;
import java.io.*;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.lang3.StringEscapeUtils;
/**
*Copyright (c) 2012, IMPACT working group at the Centrum für Informations- und Sprachverarbeitung, University of Munich.
*All rights reserved.
*Redistribution and use in source and binary forms, with or without
*modification, are permitted provided that the following conditions are met:
*Redistributions of source code must retain the above copyright
*notice, this list of conditions and the following disclaimer.
*Redistributions in binary form must reproduce the above copyright
*notice, this list of conditions and the following disclaimer in the
*documentation and/or other materials provided with the distribution.
*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
*IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
*PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
*HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
*SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
*LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
*DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
*THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
*(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This file is part of the ocr-postcorrection tool developed
* by the IMPACT working group at the Centrum für Informations- und Sprachverarbeitung, University of Munich.
* For further information and contacts visit http://ocr.cis.uni-muenchen.de/
*
* @author thorsten (thorsten.vobl@googlemail.com)
*/
/**
 * Serialises a {@link Document} into an XML file.
 *
 * <p>The file is written as UTF-8 and has the following shape: a
 * {@code <document>} root, one {@code <page>} element per page of the
 * document and, inside each page, one {@code <token>} element per token.
 */
public class OCRXMLExporter {

    public OCRXMLExporter() {
    }

    /**
     * Writes all pages and tokens of {@code doc} to {@code filename}.
     *
     * <p>Tokens are numbered consecutively across the whole document
     * (starting at 0); that running number is emitted as {@code token_id}.
     *
     * <p>An {@link IOException} raised while opening, writing or closing the
     * file is logged at {@link Level#SEVERE} and not propagated, so the
     * target file may be missing or incomplete after such a failure.
     *
     * @param doc              document whose pages, tokens and candidates are exported
     * @param filename         path of the file to create or overwrite
     * @param exportCandidates if {@code true}, a {@code <cand>} element is written
     *                         for every candidate of every token that has candidates
     */
    public void export(Document doc, String filename, boolean exportCandidates) {
        BufferedWriter out = null;
        try {
            Writer w = new OutputStreamWriter(new FileOutputStream(filename), "UTF8");
            out = new BufferedWriter(w);

            out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
            out.write("<document>\n");

            int index = -1;
            Iterator<Page> pages = doc.pageIterator();
            while (pages.hasNext()) {
                Page p = pages.next();
                // The image path ends up inside an attribute value, so it has to be
                // escaped like every other piece of text; String.valueOf keeps the
                // textual rendering that plain string concatenation would produce.
                String imageFile = StringEscapeUtils.escapeXml(String.valueOf(p.getImageCanonical()));
                out.write("<page imageFile=\"" + imageFile + "\" sourceFile=\"\">\n");

                Iterator<Token> tokens = doc.tokenIterator(p);
                while (tokens.hasNext()) {
                    Token t = tokens.next();
                    index++;
                    writeToken(out, doc, t, index, exportCandidates);
                }
                out.write("</page>\n");
            }
            out.write("</document>\n");
            // Flush inside the try so that a failing final write is reported
            // by the main handler below.
            out.flush();
        } catch (IOException ex) {
            Logger.getLogger(OCRXMLExporter.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            // Always release the file handle, including when a write failed half-way.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException ex) {
                    Logger.getLogger(OCRXMLExporter.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        }
    }

    /**
     * Writes one complete {@code <token>} element, including its children.
     *
     * @param out              writer positioned inside the current {@code <page>} element
     * @param doc              document used to look up the token's candidates
     * @param t                token to write
     * @param index            running number emitted as {@code token_id}
     * @param exportCandidates whether {@code <cand>} children are written
     * @throws IOException if writing to {@code out} fails
     */
    private void writeToken(BufferedWriter out, Document doc, Token t, int index, boolean exportCandidates)
            throws IOException {
        SpecialSequenceType sst = t.getSpecialSeq();

        // Space and newline tokens carry a special_seq marker and a fixed
        // isNormal="false"; every other token reports its own isNormal flag.
        if (sst.equals(SpecialSequenceType.SPACE)) {
            out.write("<token token_id=\"" + index + "\" special_seq=\"space\" isNormal=\"false\">\n");
        } else if (sst.equals(SpecialSequenceType.NEWLINE)) {
            out.write("<token token_id=\"" + index + "\" special_seq=\"newline\" isNormal=\"false\">\n");
        } else {
            out.write("<token token_id=\"" + index + "\" isNormal=\"" + t.isNormal() + "\">\n");
        }

        // The token's own id is always written, independent of the running token_id.
        out.write("<ext_id>" + t.getID() + "</ext_id>\n");
        out.write("<wOCR>" + StringEscapeUtils.escapeXml(t.getWOCR()) + "</wOCR>\n");
        out.write("<wOCR_lc>" + StringEscapeUtils.escapeXml(t.getWOCR_lc()) + "</wOCR_lc>\n");
        out.write("<wCorr>" + StringEscapeUtils.escapeXml(t.getWCOR()) + "</wCorr>\n");

        // Coordinates are optional: only tokens with an image info box get them.
        TokenImageInfoBox b = t.getTokenImageInfoBox();
        if (b != null) {
            out.write("<coord l=\"" + b.getCoordinateLeft() + "\" t=\"" + b.getCoordinateTop()
                    + "\" r=\"" + b.getCoordinateRight() + "\" b=\"" + b.getCoordinateBottom() + "\"/>\n");
        }

        if (sst.equals(SpecialSequenceType.NORMAL) || sst.equals(SpecialSequenceType.HYPHENATED)) {
            out.write("<abbyy_suspicious value=\"" + t.isSuspicious() + "\"/>\n");
        }

        if (exportCandidates && t.getNumberOfCandidates() > 0) {
            Iterator<Candidate> cands = doc.candidateIterator(t.getID());
            while (cands.hasNext()) {
                Candidate cand = cands.next();
                // Suggestion and interpretation are written back to back, with no
                // separator added here.
                out.write("<cand>" + StringEscapeUtils.escapeXml(cand.getSuggestion())
                        + StringEscapeUtils.escapeXml(cand.getInterpretation())
                        + ",voteWeight=" + cand.getVoteweight()
                        + ",levDistance=" + cand.getDlev() + "</cand>\n");
            }
        }
        out.write("</token>\n");
    }
}