package jav.correctionBackend; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; import java.util.HashMap; import java.util.logging.Level; import java.util.logging.Logger; import org.h2.jdbcx.JdbcConnectionPool; import org.netbeans.api.progress.ProgressHandle; /** *Copyright (c) 2012, IMPACT working group at the Centrum für Informations- und Sprachverarbeitung, University of Munich. *All rights reserved. *Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: *Redistributions of source code must retain the above copyright *notice, this list of conditions and the following disclaimer. *Redistributions in binary form must reproduce the above copyright *notice, this list of conditions and the following disclaimer in the *documentation and/or other materials provided with the distribution. *THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A *PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT *HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT *LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY *THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This file is part of the ocr-postcorrection tool developed * by the IMPACT working group at the Centrum für Informations- und Sprachverarbeitung, University of Munich. * For further information and contacts visit http://ocr.cis.uni-muenchen.de/ * * @author thorsten (thorsten.vobl@googlemail.com) */ public class CorrectionSystem { private JdbcConnectionPool jcp; private SpreadIndexDocument document; private Parser parser; public CorrectionSystem() { } public int openDocument(String dbPath) { int retval = 0; jcp = JdbcConnectionPool.create("jdbc:h2:"+dbPath+";AUTO_RECONNECT=TRUE;MVCC=true", "SA", ""); jcp.setMaxConnections(50); jcp.setLoginTimeout(0); this.document = new SpreadIndexDocument(jcp); document.loadNumberOfPagesFromDB(); document.loadNumberOfTokensFromDB(); return retval; } private int newDocDatabase(String dbPath) { int retval = -1; try { // File f = new File(dbPath + ".h2.db"); Statement s; jcp = JdbcConnectionPool.create("jdbc:h2:"+dbPath+";AUTO_RECONNECT=TRUE;MVCC=true", "SA", ""); jcp.setMaxConnections(50); jcp.setLoginTimeout(0); Connection conn = jcp.getConnection(); s = conn.createStatement(); s.execute("DROP TABLE token IF EXISTS"); // s.execute("DROP TABLE page IF EXISTS"); s.execute("DROP TABLE candidate IF EXISTS"); s.execute("DROP TABLE pattern IF EXISTS"); s.execute("DROP TABLE patternoccurrence IF EXISTS"); s.execute("DROP TABLE undoredo IF EXISTS"); s.execute("DROP TABLE correction_log IF EXISTS"); // s.execute("CREATE TABLE token( tokenID IDENTITY(0), indexInDocument INTEGER, orig_id INTEGER, wOCR VARCHAR(60), wCorr VARCHAR(60), isNormal BOOLEAN, isCorrected BOOLEAN, numCands SMALLINT, cleft SMALLINT, cright SMALLINT, ctop SMALLINT, cbottom SMALLINT, special_seq VARCHAR(20), imageFile VARCHAR(200), isSuspicious BOOLEAN, pageIndex SMALLINT, topSuggestion VARCHAR(50), topCandDLev SMALLINT)"); s.execute("CREATE TABLE token( tokenID INTEGER GENERATED BY DEFAULT AS IDENTITY (START WITH 0, INCREMENT BY 1) PRIMARY KEY, indexInDocument INTEGER, orig_id INTEGER, wOCR VARCHAR(60), wCorr VARCHAR(60), isNormal BOOLEAN, isCorrected BOOLEAN, numCands SMALLINT, cleft SMALLINT, cright SMALLINT, ctop SMALLINT, cbottom SMALLINT, special_seq VARCHAR(20), imageFile VARCHAR(200), isSuspicious BOOLEAN, pageIndex SMALLINT, topSuggestion VARCHAR(50), topCandDLev SMALLINT)"); // s.execute("CREATE TABLE page( index SMALLINT GENERATED BY DEFAULT AS IDENTITY (START WITH 0, INCREMENT BY 1) PRIMARY KEY, token_index_from INTEGER, token_index_to INTEGER, imageFile VARCHAR(200))"); s.execute("CREATE TABLE candidate( tokenID INTEGER, rank SMALLINT, suggestion VARCHAR(50), interpretation VARCHAR(200), voteweight REAL, dlev TINYINT, PRIMARY KEY (tokenID, rank))"); s.execute("CREATE TABLE pattern (patternID INTEGER GENERATED BY DEFAULT AS IDENTITY (START WITH 0, INCREMENT BY 1) PRIMARY KEY, leftpart VARCHAR(5), rightpart VARCHAR(5), freq INTEGER, corrected INTEGER)"); s.execute("CREATE TABLE patternoccurrence (patternID INTEGER, part INTEGER, PRIMARY KEY (patternID, part), wocr_lc VARCHAR(50), wsuggestion VARCHAR(50), freq INTEGER, corrected INTEGER)"); s.execute("CREATE TABLE undoredo( operation_id SMALLINT, part SMALLINT, type VARCHAR(10), PRIMARY KEY(operation_id, part, type), edit_type VARCHAR(20), sql_command VARCHAR(100))"); s.execute("CREATE TABLE correction_log( operation_id INTEGER GENERATED BY DEFAULT AS IDENTITY (START WITH 0, INCREMENT BY 1) PRIMARY KEY, user_name VARCHAR(20), operation_description VARCHAR(255))"); s.execute("CREATE INDEX IDX_indexInDoc ON TOKEN(indexInDocument, pageIndex, isNormal, isSuspicious)"); s.execute("CREATE INDEX IDX_indexInDoc_desc ON TOKEN(indexInDocument DESC, pageIndex, isNormal, isSuspicious)"); s.close(); conn.close(); retval = 0; } catch (SQLException ex) { ex.printStackTrace(); } return retval; } public int newDocumentFromOCRCXML( String dbPath, String ocrcxmlfile, String imagedir, ProgressHandle ph ) { int retval = -1; if( this.newDocDatabase(dbPath) == 0 ) { ph.progress("Loading"); jcp = JdbcConnectionPool.create("jdbc:h2:"+dbPath+";AUTO_RECONNECT=TRUE;MVCC=true", "SA", ""); jcp.setMaxConnections(50); jcp.setLoginTimeout(0); this.document = new SpreadIndexDocument(jcp); new OCRXMLImporter().importDocument(document, ocrcxmlfile, imagedir); document.loadNumberOfPagesFromDB(); document.loadNumberOfTokensFromDB(); retval = 0; } return retval; } public int newDocumentFromXML(String dbPath, String xmldir, String imagedir, FileType t, String encoding, ProgressHandle ph) { int retval = -1; if ( this.newDocDatabase(dbPath) == 0 ) { jcp = JdbcConnectionPool.create("jdbc:h2:"+dbPath+";AUTO_RECONNECT=TRUE;MVCC=true", "SA", ""); jcp.setMaxConnections(50); jcp.setLoginTimeout(0); FilenameFilter fil = null; this.document = new SpreadIndexDocument(jcp); if (t.equals(FileType.ABBYY_XML_DIR)) { parser = new AbbyyXMLParser(this.document); fil = new FilenameFilter() { @Override public boolean accept(File d, String name) { return name.endsWith(".xml"); } }; } else if (t.equals(FileType.HOCR)) { parser = new HOCRParser(this.document); fil = new FilenameFilter() { @Override public boolean accept(File d, String name) { return name.endsWith(".html"); } }; } else { // TODO throw error } File xmld = new File(xmldir); File imgd = new File(imagedir); String[] xmlfiles = xmld.list(fil); java.util.Arrays.sort(xmlfiles); HashMap<String, String> imgs = new HashMap<String, String>(); if (imagedir != null) { String[] imgfiles = imgd.list(new FilenameFilter() { @Override public boolean accept(File d, String name) { return name.endsWith(".tif") || name.endsWith(".jpg") || name.endsWith(".jpeg"); } }); for (int j = 0; j < imgfiles.length; j++) { imgs.put(imgfiles[j].substring(0, imgfiles[j].lastIndexOf(".")), imgfiles[j]); } } long time_start = System.currentTimeMillis(); for (int i = 0; i < xmlfiles.length; i++) { try { if (imgs.containsKey(xmlfiles[i].substring(0, xmlfiles[i].indexOf(".")))) { ph.progress("Parsing file: " + xmlfiles[i]); parser.parse(xmld.getCanonicalPath() + "/" + xmlfiles[i], imgs.get(xmlfiles[i].substring(0, xmlfiles[i].indexOf("."))), encoding); } else { ph.progress("Parsing file: " + xmlfiles[i]); parser.parse(xmld.getCanonicalPath() + "/" + xmlfiles[i], "", encoding); } } catch (IOException ex) { Logger.getLogger(CorrectionSystem.class.getName()).log(Level.SEVERE, null, ex); } } long duration = System.currentTimeMillis() - time_start; ph.progress("Done parsing. Time elapsed " + duration); document.loadNumberOfPagesFromDB(); document.loadNumberOfTokensFromDB(); retval = 0; } return retval; } public void importProfile( Document doc, String filename) { new ProfileImporter().parse(doc, filename); } public void closeDocument() { try { Connection conn = jcp.getConnection(); Statement stat = conn.createStatement(); stat.execute("SHUTDOWN COMPACT"); stat.close(); conn.close(); jcp.dispose(); } catch (SQLException ex) { Logger.getLogger(CorrectionSystem.class.getName()).log(Level.SEVERE, null, ex); } } public Document getDocument() { return this.document; } }