package jav.correctionBackend; import java.io.*; import java.util.regex.Pattern; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; /** *Copyright (c) 2012, IMPACT working group at the Centrum für Informations- und Sprachverarbeitung, University of Munich. *All rights reserved. *Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: *Redistributions of source code must retain the above copyright *notice, this list of conditions and the following disclaimer. *Redistributions in binary form must reproduce the above copyright *notice, this list of conditions and the following disclaimer in the *documentation and/or other materials provided with the distribution. *THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A *PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT *HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT *LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This file is part of the ocr-postcorrection tool developed * by the IMPACT working group at the Centrum für Informations- und Sprachverarbeitung, University of Munich. * For further information and contacts visit http://ocr.cis.uni-muenchen.de/ * * @author thorsten (thorsten.vobl@googlemail.com) */ public class AbbyyXMLParser extends DefaultHandler implements Parser { private int orig_id = 1; private int tokenIndex_ = 0; private int top_ = 0; private int bottom_ = 0; private int left_ = 0; private int right_ = 0; private int right_temp = 0; private int left_temp = 0; private String temp_ = ""; private String lastchar_; private String thischar_; private int pages = 0; private int position_ = 0; private boolean globalIsSuspicious = false; private boolean inVariant_ = false; private boolean isSuspicious_ = false; private boolean isDict_ = false; private Document doc_ = null; private Token temptoken_ = null; private String tempimage_ = null; private XMLReader xr; private Pattern myAlnum; public AbbyyXMLParser(Document d) { this.doc_ = d; // this.myAlnum = Pattern.compile("[\\p{Space}\\p{Punct}]"); this.myAlnum = Pattern.compile("[\\pL\\pM\\p{Nd}\\p{Nl}\\p{Pc}[\\p{InEnclosedAlphanumerics}&&\\p{So}]]+"); try { xr = XMLReaderFactory.createXMLReader(); xr.setContentHandler(this); xr.setErrorHandler(this); } catch (SAXException e1) { } } @Override public void parse(String filename, String imageFile, String encoding) { this.tempimage_ = imageFile; try { InputSource is = new InputSource(getReader(filename)); // is.setEncoding(encoding); xr.parse(is); } catch (IOException ex) { throw new RuntimeException(ex); } catch (SAXException ex) { throw new RuntimeException(ex); } } private final static int BOM_SIZE = 4; private Reader getReader(String path) throws IOException { PushbackInputStream is = new PushbackInputStream( new BufferedInputStream(new FileInputStream(path)), BOM_SIZE); byte[] bom = new byte[BOM_SIZE]; is.read(bom); // utf8 if ((bom[0] == (byte)0xef) && (bom[1] == (byte)0xbb) && (bom[2] == (byte)0xbf)) { is.unread(bom, 3, 1); } else if ((bom[0] == (byte)0xfe) && (bom[1] == (byte)0xff)) { is.unread(bom, 2, 2); } else if ((bom[0] == (byte)0xff) && (bom[1] == (byte)0xfe)) { is.unread(bom, 2, 2); } else if ((bom[0] == (byte)0x0) && (bom[1] == (byte)0x0) && (bom[2] == (byte)0xfe) && (bom[3] == (byte)0xff)) { /* do nothing */ } else if ((bom[0] == (byte)0xff) && (bom[1] == (byte)0xfe) && (bom[2] == (byte)0x0) && (bom[3] == (byte)0x0)) { /* do nothing */ } else { is.unread(bom, 0, BOM_SIZE); } return new InputStreamReader(is); } @Override public void startDocument() { // System.out.println("Parsing started."); // this.starttime_ = System.currentTimeMillis(); } @Override public void endDocument() { // System.out.println("Parsing ended. " + (System.currentTimeMillis() - this.starttime_)); } @Override public void startElement(String uri, String nname, String qName, Attributes atts) { if (qName.equals("document")) { } else if (qName.equals("page")) { } else if (qName.equals("block")) { } else if (qName.equals("region")) { } else if (qName.equals("rect")) { } else if (qName.equals("text")) { } else if (qName.equals("par")) { } else if (qName.equals("line")) { top_ = Integer.parseInt(atts.getValue("t")); if( top_ == -1) { top_ = 1; } bottom_ = Integer.parseInt(atts.getValue("b")); if( bottom_ == -1) { bottom_ = 1; } } else if (qName.equals("variantText")) { inVariant_ = true; } else if (qName.equals("formatting")) { } else if (qName.equals("charParams")) { // tempchar_ = new Character(this.tokenIndex_, position_); // tempchar_.setLeft(Integer.parseInt(atts.getValue("l"))); // tempchar_.setRight(Integer.parseInt(atts.getValue("r"))); // tempchar_.setIsSuspicious((atts.getValue("suspicious") != null)); this.isSuspicious_ = (atts.getValue("suspicious") != null); this.isDict_ = Boolean.parseBoolean(atts.getValue("wordFromDictionary")); System.out.println("charparams " + this.isSuspicious_ + " " + this.isDict_); // doc_.addCharacter(tempchar_); left_temp = Integer.parseInt(atts.getValue("l")); right_temp = Integer.parseInt(atts.getValue("r")); position_++; } } @Override public void endElement(String uri, String nname, String qName) { if (qName.equals("document")) { } else if (qName.equals("page")) { orig_id = 1; pages++; } else if (qName.equals("block")) { } else if (qName.equals("region")) { } else if (qName.equals("rect")) { } else if (qName.equals("text")) { } else if (qName.equals("par")) { temptoken_ = new Token("\n"); temptoken_.setSpecialSeq(SpecialSequenceType.NEWLINE); temptoken_.setIndexInDocument(tokenIndex_); temptoken_.setIsSuspicious(false); temptoken_.setIsCorrected(false); temptoken_.setIsNormal(false); temptoken_.setNumberOfCandidates(0); temptoken_.setPageIndex(pages); temptoken_.setTokenImageInfoBox(null); doc_.addToken(temptoken_); tokenIndex_++; temptoken_ = null; position_ = 0; left_ = 0; temp_ = ""; // at end of line, pushback actual token and add newline token } else if (qName.equals("line")) { if (!temp_.equals("")) { temptoken_ = new Token( temp_ ); if (temp_.matches("^[\\p{Space}]+$")) { temptoken_.setSpecialSeq(SpecialSequenceType.SPACE); } // else if (temp_.matches("^[\\p{Punct}]+$")) { // temptoken_.setSpecialSeq(SpecialSequenceType.PUNCTUATION); // } else if (temp_.matches("^[\n\r\f]+$")) { temptoken_.setSpecialSeq(SpecialSequenceType.NEWLINE); } else { temptoken_.setSpecialSeq(SpecialSequenceType.NORMAL); } temptoken_.setIndexInDocument(tokenIndex_); temptoken_.setIsSuspicious(this.globalIsSuspicious); temptoken_.setIsCorrected(false); temptoken_.setPageIndex(pages); temptoken_.setIsNormal(myAlnum.matcher(temp_).matches()); temptoken_.setNumberOfCandidates(0); if (left_ > 0 && !temptoken_.getSpecialSeq().equals(SpecialSequenceType.SPACE)) { TokenImageInfoBox tiib = new TokenImageInfoBox(); tiib.setCoordinateBottom(bottom_); tiib.setCoordinateLeft(left_); tiib.setCoordinateRight(right_); tiib.setCoordinateTop(top_); tiib.setImageFileName(this.tempimage_); temptoken_.setTokenImageInfoBox(tiib); } else { temptoken_.setTokenImageInfoBox(null); } temptoken_.setOrigID(orig_id); doc_.addToken(temptoken_); System.out.println("token add " + temptoken_.getWOCR() + " " + temptoken_.isSuspicious()); this.globalIsSuspicious = false; orig_id++; tokenIndex_++; } temptoken_ = new Token( "\n" ); temptoken_.setSpecialSeq(SpecialSequenceType.NEWLINE); temptoken_.setIndexInDocument(tokenIndex_); temptoken_.setIsSuspicious(false); temptoken_.setIsCorrected(false); temptoken_.setIsNormal(false); temptoken_.setNumberOfCandidates(0); temptoken_.setPageIndex(pages); temptoken_.setTokenImageInfoBox(null); doc_.addToken(temptoken_); tokenIndex_++; temptoken_ = null; position_ = 0; left_ = 0; temp_ = ""; } else if (qName.equals("variantText")) { inVariant_ = false; } else if (qName.equals("formatting")) { } else if (qName.equals("charParams")) { if (!inVariant_) { // tokenstring empty (happens at begin of document and after closing </line> and </par> tags) if (temp_.equals("")) { temp_ = thischar_; } else { // previous char alnum and actual char alnum -> attach thischar_ to tempstring if (myAlnum.matcher(lastchar_).matches() && myAlnum.matcher(thischar_).matches()) { temp_ += thischar_; // previous char non-alnum and actual char alnum -> pushback token, attach thischar_ to tempstring } else if (!myAlnum.matcher(lastchar_).matches() && myAlnum.matcher(thischar_).matches()) { temptoken_ = new Token( temp_ ); if (temp_.matches("^[\\p{Space}]+$")) { temptoken_.setSpecialSeq(SpecialSequenceType.SPACE); } else if (temp_.matches("^[\\p{Punct}]+$")) { temptoken_.setSpecialSeq(SpecialSequenceType.PUNCTUATION); } else if (temp_.matches("^[\n\r\f]+$")) { temptoken_.setSpecialSeq(SpecialSequenceType.NEWLINE); } else { temptoken_.setSpecialSeq(SpecialSequenceType.NORMAL); } temptoken_.setIndexInDocument(tokenIndex_); temptoken_.setIsSuspicious(this.globalIsSuspicious); temptoken_.setIsCorrected(false); temptoken_.setPageIndex(pages); temptoken_.setIsNormal(myAlnum.matcher(temp_).matches()); temptoken_.setNumberOfCandidates(0); // if document has coordinates if (left_ > 0 && !temptoken_.getSpecialSeq().equals(SpecialSequenceType.SPACE)) { TokenImageInfoBox tiib = new TokenImageInfoBox(); tiib.setCoordinateBottom(bottom_); tiib.setCoordinateLeft(left_); tiib.setCoordinateRight(right_); tiib.setCoordinateTop(top_); tiib.setImageFileName(this.tempimage_); temptoken_.setTokenImageInfoBox(tiib); } else { temptoken_.setTokenImageInfoBox(null); } temptoken_.setOrigID(orig_id); doc_.addToken(temptoken_); System.out.println("token add " + temptoken_.getWOCR() + " " + temptoken_.isSuspicious()); this.globalIsSuspicious = false; orig_id++; tokenIndex_++; temptoken_ = null; position_ = 0; left_ = 0; temp_ = thischar_; // previous char alnum and actual char non-alnum -> pushback token, attach thischar_ to tempstring } else if (myAlnum.matcher(lastchar_).matches() & !myAlnum.matcher(thischar_).matches()) { temptoken_ = new Token( temp_ ); if (temp_.matches("^[\\p{Space}]+$")) { temptoken_.setSpecialSeq(SpecialSequenceType.SPACE); } // else if (temp_.matches("^[\\p{Punct}]+$")) { // temptoken_.setSpecialSeq(SpecialSequenceType.PUNCTUATION); // } else if (temp_.matches("^[\n\r\f]+$")) { temptoken_.setSpecialSeq(SpecialSequenceType.NEWLINE); } else { temptoken_.setSpecialSeq(SpecialSequenceType.NORMAL); } temptoken_.setIndexInDocument(tokenIndex_); temptoken_.setIsSuspicious(this.globalIsSuspicious); temptoken_.setIsCorrected(false); temptoken_.setPageIndex(pages); temptoken_.setIsNormal(myAlnum.matcher(temp_).matches()); temptoken_.setNumberOfCandidates(0); // if document has coordinates if (left_ > 0 && !temptoken_.getSpecialSeq().equals(SpecialSequenceType.SPACE)) { TokenImageInfoBox tiib = new TokenImageInfoBox(); tiib.setCoordinateBottom(bottom_); tiib.setCoordinateLeft(left_); tiib.setCoordinateRight(right_); tiib.setCoordinateTop(top_); tiib.setImageFileName(this.tempimage_); temptoken_.setTokenImageInfoBox(tiib); } else { temptoken_.setTokenImageInfoBox(null); } temptoken_.setOrigID(orig_id); doc_.addToken(temptoken_); System.out.println("token add " + temptoken_.getWOCR() + " " + temptoken_.isSuspicious()); this.globalIsSuspicious = false; tokenIndex_++; orig_id++; temptoken_ = null; position_ = 0; temp_ = thischar_; left_ = 0; // previous char non-alnum and actual char non-alnum -> attach tempchar_ to token } else if (!myAlnum.matcher(lastchar_).matches() & !myAlnum.matcher(thischar_).matches()) { temp_ += thischar_; } } lastchar_ = thischar_; } if( this.isSuspicious_ && !this.isDict_ ) { System.out.println("global"); this.globalIsSuspicious = true; } // if left unset set it if (left_ == 0) { left_ = left_temp; } // set new right coordinate right_ = right_temp; } } /* * Assumption: abbyy xml output is charwise */ @Override public void characters(char ch[], int start, int length) { if (length > 1) { System.err.println("Error. Length > 1. " + new String(ch, start, length)); } thischar_ = new String(ch, start, length); } }