package edu.berkeley.cs.nlp.ocular.output;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.berkeley.cs.nlp.ocular.data.Document;
import edu.berkeley.cs.nlp.ocular.data.textreader.Charset;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType;
import edu.berkeley.cs.nlp.ocular.model.DecodeState;
import edu.berkeley.cs.nlp.ocular.model.transition.SparseTransitionModel.TransitionState;
import edu.berkeley.cs.nlp.ocular.util.StringHelper;
import tberg.murphy.fileio.f;
//import static org.apache.commons.lang3.StringEscapeUtils.escapeHtml3; // to escape all HTML special characters
import tberg.murphy.indexer.Indexer;
import tberg.murphy.util.Iterators;
/**
* @author Hannah Alpert-Abrams (halperta@gmail.com)
*/
public class AltoOutputWriter {
private Indexer<String> charIndexer;
private Indexer<String> langIndexer;
private int spaceCharIndex;
private int hyphenCharIndex;
public AltoOutputWriter(Indexer<String> charIndexer, Indexer<String> langIndexer) {
this.charIndexer = charIndexer;
this.langIndexer = langIndexer;
this.spaceCharIndex = charIndexer.getIndex(Charset.SPACE);
this.hyphenCharIndex = charIndexer.getIndex(Charset.HYPHEN);
}
public void write(int numLines, List<DecodeState>[] viterbiDecodeStates, Document doc, String outputFilenameBase, String inputDocPath, List<String> commandLineArgs, boolean outputNormalized, double lmPerplexity) {
String altoOutputFilename = outputFilenameBase + (outputNormalized ? "_norm" : "_dipl") + ".alto.xml";
SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss");
String imgFilename = doc.baseName();
StringBuffer outputBuffer = new StringBuffer();
outputBuffer.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
outputBuffer.append("<alto xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/standards/alto/v3/alto.xsd\" xmlns:emop=\"http://emop.tamu.edu\">\n");
outputBuffer.append(" <Description>\n");
outputBuffer.append(" <MeasurementUnit>pixel</MeasurementUnit>\n");
outputBuffer.append(" <sourceImageInformation>\n");
outputBuffer.append(" <fileName>"+imagePathToFilename(imgFilename)+"</fileName>\n"); //gives filename with extension
outputBuffer.append(" </sourceImageInformation>\n");
outputBuffer.append(" <OCRProcessing ID=\"Ocular0.0.3\">\n");
outputBuffer.append(" <preProcessingStep></preProcessingStep>\n");
outputBuffer.append(" <ocrProcessingStep>\n");
outputBuffer.append(" <processingDateTime>"+formatter.format(new Date())+"</processingDateTime>\n");
outputBuffer.append(" <processingStepSettings>"+StringHelper.join(commandLineArgs, " ")+"</processingStepSettings>\n");
outputBuffer.append(" <processingSoftware>\n");
outputBuffer.append(" <softwareCreator>Taylor Berg-Kirkpatrick, Greg Durrett, Dan Klein, Dan Garrette, Hannah Alpert-Abrams</softwareCreator>\n");
outputBuffer.append(" <softwareName>Ocular</softwareName>\n");
outputBuffer.append(" <softwareVersion>0.0.3</softwareVersion>\n");
outputBuffer.append(" </processingSoftware>\n");
outputBuffer.append(" </ocrProcessingStep>\n");
// outputBuffer.append(" <postProcessingStep>\n");
// outputBuffer.append(" <processingSoftware>\n");
// outputBuffer.append(" <softwareCreator>\n");
// outputBuffer.append(" Illinois Informatics Institute, University of Illinois at Urbana-Champaign http://www.informatics.illinois.edu\n");
// outputBuffer.append(" </softwareCreator>\n");
// outputBuffer.append(" <softwareName>PageCorrector</softwareName>\n");
// outputBuffer.append(" <softwareVersion>1.10.0-SNAPSHOT</softwareVersion>\n");
// outputBuffer.append(" </processingSoftware>\n");
// outputBuffer.append(" </postProcessingStep>\n");
outputBuffer.append(" </OCRProcessing>\n");
outputBuffer.append(" </Description>\n");
outputBuffer.append(" <Layout>\n");
outputBuffer.append(" <Page ID=\""+imageFilenameToId(imgFilename)+"\" PHYSICAL_IMG_NR=\""+imageFilenameToIdNumber(imgFilename)+"\">\n"); // ACCURACY=\""+lmPerplexity+"\"
outputBuffer.append(" <PrintSpace>\n");
outputBuffer.append(" <TextBlock ID=\"par_1\">\n");
boolean inWord = false; // (as opposed to a space)
int wordIndex = 0;
for (int line = 0; line < numLines; ++line) {
StringBuffer lineOutputBuffer = new StringBuffer();
boolean beginningOfLine = true;
@SuppressWarnings("unchecked")
Iterator<DecodeState> dsIterator = Iterators.concat(viterbiDecodeStates[line].iterator(), Iterators.<DecodeState>oneItemIterator(null));
List<DecodeState> wordBuffer = new ArrayList<DecodeState>();
int wordWidth = 0;
while (dsIterator.hasNext()) {
DecodeState ds = dsIterator.next();
boolean isSpace = ds != null ? ds.ts.getLmCharIndex() == spaceCharIndex && ds.ts.getGlyphChar().templateCharIndex == spaceCharIndex : true;
boolean isPunct = ds != null ? ds.ts.getLmCharIndex() != hyphenCharIndex && Charset.isPunctuationChar(charIndexer.getObject(ds.ts.getLmCharIndex())) : false;
boolean endOfSpan = (isSpace == inWord) || isPunct || !dsIterator.hasNext(); // end of word, contiguous space sequence, or line
if (endOfSpan) { // if we're at a transition point between spans, we need to write out the complete span's information
if (inWord) { // if we're completing a word (as opposed to a sequence of spaces)
if (!wordBuffer.isEmpty()) { // if there's wordiness to print out (hopefully this will always be true if we get to this point)
int languageIndex = wordBuffer.get(0).ts.getLanguageIndex();
String language = languageIndex >= 0 ? langIndexer.getObject(languageIndex) : "None";
StringBuffer diplomaticTranscriptionBuffer = new StringBuffer();
StringBuffer normalizedTranscriptionBuffer = new StringBuffer();
for (DecodeState wds : wordBuffer) {
TransitionState wts = wds.ts;
if (!wts.getGlyphChar().isElided()) {
diplomaticTranscriptionBuffer.append(Charset.unescapeChar(charIndexer.getObject(wts.getGlyphChar().templateCharIndex))); //w/ normalized ocular, we'll want to preserve things like "shorthand" or whatever.
}
if (wts.getGlyphChar().glyphType != GlyphType.DOUBLED) { // the first in a pair of doubled characters isn't part of the language model transcription
switch(wts.getType()) {
case RMRGN_HPHN_INIT:
normalizedTranscriptionBuffer.append(Charset.HYPHEN);
break;
case RMRGN_HPHN:
case LMRGN_HPHN:
break;
case LMRGN:
case RMRGN:
normalizedTranscriptionBuffer.append(Charset.SPACE);
break;
case TMPL:
String s = Charset.unescapeChar(charIndexer.getObject(wts.getLmCharIndex()));
//if (s.equals(Charset.LONG_S)) s = "s"; // don't use long-s in "normalized" transcriptions
normalizedTranscriptionBuffer.append(s);
}
}
}
String diplomaticTranscription = diplomaticTranscriptionBuffer.toString().trim();
String normalizedTranscription = normalizedTranscriptionBuffer.toString().trim(); //Use this to add in the norm
if (!diplomaticTranscription.isEmpty()) {
lineOutputBuffer.append(" <String ID=\"word_"+wordIndex+"\" WIDTH=\""+wordWidth+"\" CONTENT=\""+escapeCharactersForValidation(outputNormalized ? normalizedTranscription : diplomaticTranscription)+"\" LANG=\""+language+"\"");
if (!normalizedTranscription.equals(diplomaticTranscription)) {
lineOutputBuffer.append("> \n");
if (outputNormalized) {
lineOutputBuffer.append(" <ALTERNATIVE PURPOSE=\"Diplomatic\">"+escapeCharactersForValidation(diplomaticTranscription)+"</ALTERNATIVE>\n");
}
else {
lineOutputBuffer.append(" <ALTERNATIVE PURPOSE=\"Normalization\">"+escapeCharactersForValidation(normalizedTranscription)+"</ALTERNATIVE>\n");
}
lineOutputBuffer.append(" </String>\n");
}
else {
lineOutputBuffer.append("/> \n");
}
beginningOfLine = false;
wordIndex = wordIndex+1;
}
}
}
else { // ALTO does not accept spaces at the commencement of a line
if (!beginningOfLine) {
if (wordWidth > 0) {
lineOutputBuffer.append(" <SP WIDTH=\""+wordWidth+"\"/>\n");
}
}
}
// get ready to start a new span
wordBuffer.clear();
wordWidth = 0;
inWord = !isSpace;
}
// add the current state into the (existing or freshly-cleared) span buffer
wordBuffer.add(ds);
wordWidth += (ds != null ? ds.charAndPadWidth : 0);
}
if (lineOutputBuffer.length() > 0) {
outputBuffer.append(" <TextLine ID=\"line_"+(line+1)+"\">\n"); //Opening <TextLine>, assigning ID.
outputBuffer.append(lineOutputBuffer);
outputBuffer.append(" </TextLine>\n");
}
}
outputBuffer.append("</TextBlock>\n");
outputBuffer.append("</PrintSpace>\n");
outputBuffer.append("</Page>\n");
outputBuffer.append("</Layout>\n");
outputBuffer.append("</alto>\n");
String outputString = outputBuffer.toString();
System.out.println("Writing alto output to " + altoOutputFilename);
f.writeString(altoOutputFilename, outputString);
}
private String imageFilenameToId(String imageFilename) { //pl_blac_012_00013-800.jpg
String pattern = "(pl_[a-z]+_\\d+_\\d+).*";
Pattern r = Pattern.compile(pattern);
Matcher m = r.matcher(imageFilename);
if (m.find()) {
return m.group(1);
} else {
return "Error: page ID unknown";
}
}
private String imageFilenameToIdNumber(String imageFilename) { //pl_blac_012_00013-800.jpg
String pattern = "pl_[a-z]+_\\d+_(\\d+).*";
Pattern r = Pattern.compile(pattern);
Matcher m = r.matcher(imageFilename);
if (m.find()) {
return m.group(1);
} else {
return "Error: ID Number unknown";
}
}
private String imagePathToFilename(String imageFilename) { //pl_blac_012_00013-800.jpg
String pattern = ".*(pl_[a-z]+_\\d+_\\d+.*)";
Pattern r = Pattern.compile(pattern);
Matcher m = r.matcher(imageFilename);
if (m.find()) {
return m.group(1);
} else {
return "Error: filename unknown";
}
}
private String escapeCharactersForValidation(String inputText) {
return inputText
.replace("&", "&")
.replace(">", ">")
.replace("<", "<")
.replace("'", "'")
.replace("\"", """)
.replace("P\u0303", "Pį")
.replace("p\u0303", "pį")
.replace("Q\u0303", "Qį")
.replace("q\u0303", "qį");
}
// · Ampersand—&—&
// · greater-than—>—>
// · less-than—<—<
// · apostrophe—'—'
// · quote—"—"
}