package jav.correctionBackend; import java.io.File; import java.io.IOException; import java.util.Iterator; import java.util.regex.Matcher; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; /** *Copyright (c) 2012, IMPACT working group at the Centrum für Informations- und Sprachverarbeitung, University of Munich. *All rights reserved. *Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: *Redistributions of source code must retain the above copyright *notice, this list of conditions and the following disclaimer. *Redistributions in binary form must reproduce the above copyright *notice, this list of conditions and the following disclaimer in the *documentation and/or other materials provided with the distribution. *THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A *PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT *HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT *LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This file is part of the ocr-postcorrection tool developed * by the IMPACT working group at the Centrum für Informations- und Sprachverarbeitung, University of Munich. * For further information and contacts visit http://ocr.cis.uni-muenchen.de/ * * @author thorsten (thorsten.vobl@googlemail.com) */ public class OCRXMLImporter { public OCRXMLImporter() { } public void simpleUpdateDocument(Document doc, String documentfile) { new SimpleImporter().parse(doc, documentfile); } public void importDocument(Document doc, String documentfile, String imgdir) { new OCRCXMLImporter().parse(doc, documentfile, imgdir); } public void importCandidates(Document doc, String documentfile) { new CandidateImporter().parse(doc, documentfile); } public void importProfile(Document doc, String profilefile) { new ProfileImporter().parse(doc, profilefile); } } class SimpleImporter extends DefaultHandler { private Document doc; private int tokenID; private String susp; private String norm; public void parse(Document d, String f) { if (d != null) { this.doc = d; XMLReader xr; try { xr = XMLReaderFactory.createXMLReader(); xr.setContentHandler(this); xr.setErrorHandler(this); xr.parse(f); } catch (SAXException ex) { } catch (IOException e) { } } else { throw new NullPointerException(); } } @Override public void startDocument() { } @Override public void endDocument() { } @Override public void startElement(String uri, String nname, String qname, Attributes atts) { if (nname.equals("token")) { tokenID = Integer.parseInt(atts.getValue("token_id")); norm = atts.getValue("isNormal"); } else if (nname.equals("abbyy_suspicious")) { susp = atts.getValue("value"); } } @Override public void endElement(String uri, String nname, String qname) { if (nname.equals("token")) { doc.setNormal(tokenID, norm); doc.setSuspicious(tokenID, susp); } } } class CandidateImporter extends DefaultHandler { private String content = ""; private Document doc; private int rank; private int tokenID; private String susp; private java.util.regex.Pattern pattern; private Candidate tempcand; public void parse(Document d, String f) { pattern = java.util.regex.Pattern.compile("(.*):\\{(.*),voteWeight=(.*),levDistance=(.*)"); if (d != null) { this.doc = d; this.doc.clearCandidates(); XMLReader xr; try { xr = XMLReaderFactory.createXMLReader(); xr.setContentHandler(this); xr.setErrorHandler(this); xr.parse(f); } catch (SAXException ex) { } catch (IOException e) { } } else { throw new NullPointerException(); } } @Override public void startDocument() { } @Override public void endDocument() { } @Override public void startElement(String uri, String nname, String qname, Attributes atts) { if (nname.equals("token")) { rank = 0; } else if (nname.equals("abbyy_suspicious")) { susp = atts.getValue("value"); } content = ""; } @Override public void endElement(String uri, String nname, String qname) { if (nname.equals("cand")) { Matcher matcher = pattern.matcher(content); if (matcher.matches()) { rank++; tempcand = new Candidate(tokenID, rank, matcher.group(1), matcher.group(2), Double.parseDouble(matcher.group(3)), Integer.parseInt(matcher.group(4))); doc.addCandidate(tempcand); if (rank == 1) { doc.setTopCandDLev(tokenID, Integer.parseInt(matcher.group(4))); doc.setTopSuggestion(tokenID, matcher.group(1)); } } } else if (nname.equals("token")) { doc.setNumCandidates(tokenID, rank); doc.setSuspicious(tokenID, susp); } else if( nname.equals("ext_id")) { if( !content.equals("")) { tokenID = Integer.parseInt(content); } } content = ""; } @Override public void characters(char ch[], int start, int length) { content += new String(ch, start, length); } } class ProfileImporter extends DefaultHandler { private Document doc; private int patternid; private int part; private Pattern temppattern; private PatternOccurrence tempocc; private boolean begin; public void parse(Document d, String f) { patternid = 0; if (d != null) { this.doc = d; XMLReader xr; try { xr = XMLReaderFactory.createXMLReader(); xr.setContentHandler(this); xr.setErrorHandler(this); xr.parse(f); } catch (SAXException ex) { } catch (IOException e) { } } else { throw new NullPointerException(); } } @Override public void startDocument() { } @Override public void endDocument() { } @Override public void startElement(String uri, String nname, String qname, Attributes atts) { if( qname.equals("ocr_errors")) { begin = true; } else if( qname.equals("pattern") && begin) { String left = atts.getValue("left"); String right = atts.getValue("right"); this.part = 0; this.temppattern = new Pattern(this.patternid, left, right, 0, 0); } else if (qname.equals("type")) { String wocr_lc = atts.getValue("wOCR_lc"); String wsuggest = atts.getValue("wSuggest"); int freq = Integer.parseInt(atts.getValue("freq")); tempocc = new PatternOccurrence(patternid, part, wocr_lc, wsuggest, freq, 0); this.part++; temppattern.addOccurence(tempocc, true); } } @Override public void endElement(String uri, String nname, String qname) { if (qname.equals("pattern") && begin) { doc.addPattern(temppattern); Iterator<PatternOccurrence> iter = temppattern.getOccurences().iterator(); while( iter.hasNext() ) { doc.addPatternOccurrence( iter.next() ); } this.patternid++; } } } class OCRCXMLImporter extends DefaultHandler { private String imgdir; private String content = ""; private Document doc; private SpecialSequenceType spec; private int rank; private int pages = 0; private String imgFilename; private int tokenIndex = -1; private java.util.regex.Pattern candpattern; private java.util.regex.Pattern fileNamePattern; private Candidate tempcand; private Token temptoken; private boolean isNormal; public void parse(Document d, String docfile, String imgdir) { candpattern = java.util.regex.Pattern.compile("(.*):\\{(.*),voteWeight=(.*),levDistance=(.*)"); fileNamePattern = java.util.regex.Pattern.compile(".*\\/(.*\\..*)"); if (d != null) { this.doc = d; this.imgdir = imgdir; XMLReader xr; try { xr = XMLReaderFactory.createXMLReader(); xr.setContentHandler(this); xr.setErrorHandler(this); xr.parse(docfile); } catch (SAXException ex) { } catch (IOException e) { } } else { throw new NullPointerException(); } } @Override public void startDocument() { } @Override public void endDocument() { } @Override public void startElement(String uri, String nname, String qname, Attributes atts) { if (nname.equals("page")) { if (this.imgdir != null) { Matcher m = fileNamePattern.matcher(atts.getValue("imageFile")); if (m.matches()) { File f = new File(imgdir + File.separator + m.group(1)); if (f.exists()) { try { imgFilename = f.getCanonicalPath(); } catch (IOException ex) { imgFilename = ""; } } else { imgFilename = ""; } } else { imgFilename = ""; } } else { imgFilename = ""; } } else if (nname.equals("token")) { rank = 0; tokenIndex = Integer.parseInt(atts.getValue("token_id")); isNormal = Boolean.parseBoolean(atts.getValue("isNormal")); String seq; if ((seq = atts.getValue("special_seq")) != null) { if (seq.equals("newline")) { spec = SpecialSequenceType.NEWLINE; } else if (seq.equals("space")) { spec = SpecialSequenceType.SPACE; } } else { spec = SpecialSequenceType.NORMAL; } } else if (nname.equals("coord")) { TokenImageInfoBox b = new TokenImageInfoBox(); b.setImageFileName(imgFilename); b.setCoordinateLeft(Integer.parseInt(atts.getValue("l"))); b.setCoordinateRight(Integer.parseInt(atts.getValue("r"))); b.setCoordinateTop(Integer.parseInt(atts.getValue("t"))); b.setCoordinateBottom(Integer.parseInt(atts.getValue("b"))); temptoken.setTokenImageInfoBox(b); } else if (nname.equals("abbyy_suspicious")) { temptoken.setIsSuspicious(Boolean.parseBoolean(atts.getValue("value"))); } content = ""; } @Override public void endElement(String uri, String nname, String qname) { // add wocr wcorr if (nname.equals("cand")) { rank++; Matcher matcher = candpattern.matcher(content); if (matcher.matches()) { tempcand = new Candidate(tokenIndex, rank, matcher.group(1), matcher.group(2), Double.parseDouble(matcher.group(3)), Integer.parseInt(matcher.group(4))); doc.addCandidate(tempcand); if (rank == 1) { temptoken.setTopCandDLev(Integer.parseInt(matcher.group(4))); temptoken.setTopSuggestion(matcher.group(1)); } } } else if (nname.equals("token")) { temptoken.setNumberOfCandidates(rank); doc.addToken(temptoken); } else if (nname.equals("wOCR")) { temptoken = new Token( content ); temptoken.setIndexInDocument(tokenIndex); temptoken.setIsNormal(isNormal); temptoken.setIsCorrected(false); temptoken.setSpecialSeq(spec); temptoken.setPageIndex(pages); } else if (nname.equals("wCorr")) { temptoken.setWCOR(content); } else if (nname.equals("page")) { pages++; } content = ""; } @Override public void characters(char ch[], int start, int length) { content += new String(ch, start, length); } }