package jav.correctionBackend;
import java.io.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that reads an hOCR file (parsed leniently through TagSoup) and
 * appends one {@link Token} per recognised word to a {@link Document}.
 *
 * <p>Elements that drive the handler:
 * <ul>
 * <li>{@code div.ocr_page} - its {@code title} may name the page image;</li>
 * <li>{@code span.ocr_line} - supplies the top/bottom coordinates and, for
 * every line but the first, emits a newline token;</li>
 * <li>{@code span.ocr_word} / {@code span.ocrx_word} - supplies the left/right
 * coordinates and the numeric id; the next text chunk becomes the token;</li>
 * <li>closing {@code p} - emits a newline token.</li>
 * </ul>
 * A text chunk consisting of exactly one blank additionally yields a space
 * token.
 *
 * <p>The handler keeps mutable per-parse state in its fields; nothing in this
 * class synchronises access to them.
 *
 * @author thorsten (thorsten.vobl@googlemail.com)
 */
public class HOCRParser extends DefaultHandler implements Parser {

    private static final Logger LOG = Logger.getLogger(HOCRParser.class.getName());

    /** Word text longer than this many chars is truncated. */
    private static final int MAX_TOKEN_LENGTH = 60;

    // Patterns are immutable, so they are compiled once instead of on every call.
    private static final Pattern ALNUM = Pattern.compile("[\\pL\\pM\\p{Nd}\\p{Nl}\\p{Pc}[\\p{InEnclosedAlphanumerics}&&\\p{So}]]+");
    private static final Pattern NEWLINES = Pattern.compile("[\n\r\f]+");
    private static final Pattern WHITESPACE = Pattern.compile("[\\p{Space}]+");
    private static final Pattern PUNCTUATION = Pattern.compile("[\\p{Punct}]+");
    private static final Pattern BBOX = Pattern.compile("bbox\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
    // [^"]* instead of a greedy .* so that a later quoted property in the same
    // title cannot be swallowed into the file name.
    private static final Pattern IMAGE_QUOTED = Pattern.compile("image\\s+\"([^\"]*)\"");
    private static final Pattern IMAGE_FILE = Pattern.compile("file\\s+(.*)");
    private static final Pattern ID_SUFFIX = Pattern.compile("_(\\d+)$");

    private int orig_id = 1;
    private SAXParser sx;
    private int tokenIndex_ = 0;
    private int top_ = 0;
    private int bottom_ = 0;
    private int left_ = 0;
    private int right_ = 0;
    private String temp_ = "";
    private int pages = 0;
    private boolean tokenIsToBeAdded = false;
    private Document doc_ = null;
    private Token temptoken_ = null;
    private String tempimage_ = null;
    /** Image file passed to {@link #parse}; used when a page names no image itself. */
    private String defaultImage_ = null;

    /**
     * Creates a parser that appends tokens to the given document.
     *
     * @param d document receiving the tokens
     */
    public HOCRParser(Document d) {
        this.doc_ = d;
        try {
            sx = new org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl().newSAXParser();
        } catch (ParserConfigurationException ex) {
            // sx stays null; parse() reports this instead of failing with an NPE.
            LOG.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Parses one hOCR file and adds its tokens to the document. Failures are
     * logged and otherwise ignored, so a broken file never throws to the caller.
     *
     * @param xmlFile path of the hOCR file
     * @param imgFile image file name used for tokens unless a page element
     * names its own image
     * @param encoding character encoding of {@code xmlFile}
     */
    @Override
    public void parse(String xmlFile, String imgFile, String encoding) {
        if (sx == null) {
            LOG.log(Level.SEVERE, "No SAX parser available, cannot parse {0}", xmlFile);
            return;
        }
        InputStream inputStream = null;
        try {
            inputStream = new FileInputStream(xmlFile);
            Reader reader = new InputStreamReader(inputStream, encoding);
            InputSource is = new InputSource(reader);
            this.defaultImage_ = imgFile;
            this.tempimage_ = imgFile;
            sx.parse(is, this);
        } catch (SAXException ex) {
            LOG.log(Level.SEVERE, "Could not parse " + xmlFile, ex);
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Could not read " + xmlFile, ex);
        } finally {
            // Closing the underlying stream releases the file handle on every path.
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException ex) {
                    LOG.log(Level.WARNING, "Could not close " + xmlFile, ex);
                }
            }
        }
    }

    /** Resets the "previous token" marker and advances the page counter. */
    @Override
    public void endDocument() {
        temptoken_ = null;
        pages++;
    }

    /** Tells whether the element is a {@code span} of class {@code ocr_word} or {@code ocrx_word}. */
    private static boolean isWord(String name, Attributes attrs) {
        if (!"span".equals(name)) {
            return false;
        }
        String cls = attrs.getValue("class");
        return "ocr_word".equals(cls) || "ocrx_word".equals(cls);
    }

    /** Tells whether the element is a {@code span} of class {@code ocr_line}. */
    private static boolean isLine(String name, Attributes attrs) {
        return "span".equals(name)
                && "ocr_line".equals(attrs.getValue("class"));
    }

    /** Tells whether the element is a {@code div} of class {@code ocr_page}. */
    private static boolean isPage(String name, Attributes attrs) {
        return "div".equals(name)
                && "ocr_page".equals(attrs.getValue("class"));
    }

    /**
     * Parses a run of ASCII digits. The callers' regexes guarantee digits only,
     * so the sole failure mode is overflow; 0 is returned then rather than
     * letting an unchecked exception escape the SAX callbacks.
     */
    private static int parseIntOrZero(String digits) {
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException ex) {
            return 0;
        }
    }

    /**
     * Extracts the four numbers following {@code bbox} in a title attribute.
     *
     * @param str title attribute value, may be {@code null}
     * @return the four numbers in the order they appear; all zeros when
     * {@code str} is {@code null} or contains no bbox
     */
    private static int[] parseBbox(String str) {
        int[] res = {0, 0, 0, 0};
        if (str != null) {
            Matcher m = BBOX.matcher(str);
            if (m.find()) {
                for (int i = 0; i < 4; ++i) {
                    res[i] = parseIntOrZero(m.group(i + 1));
                }
            }
        }
        return res;
    }

    /**
     * Extracts the image file name from a page title attribute, accepting
     * either {@code image "name"} or {@code file name}.
     *
     * @param str title attribute value, may be {@code null}
     * @return the file name, or the empty string when none is present
     */
    private static String parseImageFileName(String str) {
        if (str != null) {
            Matcher m = IMAGE_QUOTED.matcher(str);
            if (m.find()) {
                return m.group(1);
            }
            m = IMAGE_FILE.matcher(str);
            if (m.find()) {
                return m.group(1);
            }
        }
        return "";
    }

    /**
     * Extracts the trailing {@code _<digits>} of an id attribute.
     *
     * @param str id attribute value, may be {@code null}
     * @return the number, or 0 when {@code str} is {@code null} or has no such suffix
     */
    private static int parseId(String str) {
        int res = 0;
        if (str != null) {
            Matcher m = ID_SUFFIX.matcher(str);
            if (m.find()) {
                res = parseIntOrZero(m.group(1));
            }
        }
        return res;
    }

    /**
     * Creates a token without image information (used for newline and space
     * tokens), appends it to the document and advances the token index. The
     * token also becomes the "previous token" held in {@code temptoken_}.
     */
    private void addSpecialToken(String text, SpecialSequenceType type) {
        temptoken_ = new Token(text);
        temptoken_.setSpecialSeq(type);
        temptoken_.setIndexInDocument(tokenIndex_);
        temptoken_.setIsSuspicious(false);
        temptoken_.setIsCorrected(false);
        temptoken_.setIsNormal(false);
        temptoken_.setNumberOfCandidates(0);
        temptoken_.setPageIndex(pages);
        temptoken_.setTokenImageInfoBox(null);
        doc_.addToken(temptoken_);
        tokenIndex_++;
    }

    /**
     * Records coordinates/ids for word, line and page elements; a line start
     * also emits a newline token unless no token has been seen yet in this
     * document.
     */
    @Override
    public void startElement(String uri, String nname, String qName, Attributes atts) {
        if (isWord(nname, atts)) {
            orig_id = parseId(atts.getValue("id"));
            int[] bbox = parseBbox(atts.getValue("title"));
            this.left_ = bbox[0];
            this.right_ = bbox[2];
            this.tokenIsToBeAdded = true;
        } else if (isPage(nname, atts)) {
            // A page that names no image must not wipe out the image file the
            // caller handed to parse(); fall back to that one instead of "".
            String pageImage = parseImageFileName(atts.getValue("title"));
            this.tempimage_ = pageImage.isEmpty() ? this.defaultImage_ : pageImage;
        } else if (isLine(nname, atts)) {
            // beginning of new line, if not first line add newline token
            if (this.temptoken_ != null) {
                addSpecialToken("\n", SpecialSequenceType.NEWLINE);
            }
            int[] bbox = parseBbox(atts.getValue("title"));
            this.top_ = Math.max(0, bbox[1]);
            this.bottom_ = bbox[3];
        }
    }

    /** Emits a newline token at the end of every {@code p} element. */
    @Override
    public void endElement(String uri, String nname, String qName) {
        if ("p".equals(nname)) {
            addSpecialToken("\n", SpecialSequenceType.NEWLINE);
        }
    }

    /**
     * Classifies word text. Newlines are tested before general whitespace:
     * {@code \p{Space}} also matches {@code \n\r\f}, so testing it first would
     * make the NEWLINE case unreachable.
     */
    private static SpecialSequenceType classify(String text) {
        if (NEWLINES.matcher(text).matches()) {
            return SpecialSequenceType.NEWLINE;
        }
        if (WHITESPACE.matcher(text).matches()) {
            return SpecialSequenceType.SPACE;
        }
        if (PUNCTUATION.matcher(text).matches()) {
            return SpecialSequenceType.PUNCTUATION;
        }
        return SpecialSequenceType.NORMAL;
    }

    /** Cuts text to {@link #MAX_TOKEN_LENGTH} chars without splitting a surrogate pair. */
    private static String truncate(String text) {
        if (text.length() <= MAX_TOKEN_LENGTH) {
            return text;
        }
        int end = MAX_TOKEN_LENGTH;
        if (Character.isHighSurrogate(text.charAt(end - 1))) {
            end--;
        }
        return text.substring(0, end);
    }

    /**
     * Turns the first text chunk after a word element into a word token and
     * turns a chunk that is exactly one blank into a space token.
     *
     * <p>NOTE(review): SAX may deliver one text node in several chunks; only
     * the first chunk after a word start is used here. Confirm whether the
     * remaining chunks need to be collected.
     */
    @Override
    public void characters(char ch[], int start, int length) {
        this.temp_ = new String(ch, start, length);
        if (this.tokenIsToBeAdded) {
            this.temp_ = truncate(this.temp_);
            temptoken_ = new Token(this.temp_);
            temptoken_.setSpecialSeq(classify(temp_));
            temptoken_.setIndexInDocument(tokenIndex_);
            temptoken_.setIsSuspicious(false);
            temptoken_.setIsCorrected(false);
            temptoken_.setPageIndex(pages);
            temptoken_.setIsNormal(ALNUM.matcher(temp_).matches());
            temptoken_.setNumberOfCandidates(0);
            // if document has coordinates
            // (parseBbox never yields negative values, so this currently always holds)
            if (left_ >= 0) {
                TokenImageInfoBox tiib = new TokenImageInfoBox();
                tiib.setCoordinateBottom(bottom_);
                tiib.setCoordinateLeft(left_);
                tiib.setCoordinateRight(right_);
                tiib.setCoordinateTop(top_);
                tiib.setImageFileName(this.tempimage_);
                temptoken_.setTokenImageInfoBox(tiib);
            } else {
                temptoken_.setTokenImageInfoBox(null);
            }
            temptoken_.setOrigID(orig_id);
            doc_.addToken(temptoken_);
            tokenIndex_++;
            this.tokenIsToBeAdded = false;
        }
        // add a space token
        if (" ".equals(this.temp_)) {
            addSpecialToken(" ", SpecialSequenceType.SPACE);
        }
    }
}
//def coordsToAbbyCoords(hOCRCoords: ((Int,Int),(Int,Int)), p: Page) = {
// val ((leftDistance,topDistance),(hOCRRight,hOCRbottom)) = hOCRCoords
// val ((,),(pageRight, pageBottom)) = p.coordinates
//
// ((leftDistance,topDistance),(pageRight - hOCRRight, pageBottom - hOCRbottom))
// }