/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.uima.examples.cpe; import java.io.File; import java.io.IOException; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.SQLException; import java.sql.Statement; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.cas.FSIterator; import org.apache.uima.collection.CasConsumer_ImplBase; import org.apache.uima.examples.SourceDocumentInformation; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceProcessException; import org.apache.uima.util.ProcessTrace; import example.PersonTitle; /** * A simple CAS consumer that creates a Derby (Cloudscape) database in the file system. You can * obtain this database from http://incubator.apache.org/derby/ * * <p> * This CAS Consumer takes one parameters: * <ul> * <li><code>OutputDirectory</code> - path to directory which is the "System" directory for the * derby DB. </li> * </ul> * * It deletes all the databases at the system location (!!!), Creates a new database (takes the most * time - order of 10+ seconds) creates a table in the database to hold instances of the PersonTitle * annotation Adds entries for each PersonTitle annotation in each CAS to the database * * To use - add derby.jar to the classpath when you start the CPE GUI - run the CPE Gui and select * the Name Recognizer and Person Title Annotator aggregate. - a good sample collection reader is * the FileSystemCollectionReader, and - a good sample data is the <UIMA_HOME>/examples/data * * The processing is set up to handle multiple CASes. The end is indicated by using the * CollectionProcessComplete call. * * Batching of updates to the database is done. The batch size is set to 50. The larger size takes * more Java heap space, but perhaps runs more efficiently. * * The Table is populated with a slightly denormalized form of the data: the URI of the document is * included with every record. * * */ public class PersonTitleDBWriterCasConsumer extends CasConsumer_ImplBase { /** * Name of configuration parameter that must be set to the path of a directory into which the * Derby Database will be written. */ public static final String PARAM_OUTPUTDIR = "OutputDirectory"; public static final int MAX_URI_LENGTH = 80; public static final int MAX_TITLE_LENGTH = 20; public static final int DB_LOAD_BATCH_SIZE = 50; private int batchCounter = DB_LOAD_BATCH_SIZE; private File mOutputDir; private boolean firstCall = true; private static boolean firstEverCall = true; private PreparedStatement stmt; private Connection con; private long startTime; public void initialize() throws ResourceInitializationException { startTime = System.currentTimeMillis(); System.out .println("Time: " + (System.currentTimeMillis() - startTime) + " initialize() called"); mOutputDir = new File((String) getConfigParameterValue(PARAM_OUTPUTDIR)); if (!mOutputDir.exists()) { mOutputDir.mkdirs(); } // make this the derby home by setting system property System.setProperty("derby.system.home", mOutputDir.toString()); System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: Set derby system home to: '" + mOutputDir.toString() + "'"); } /** * Processes the CasContainer which was populated by the TextAnalysisEngines. <br> * In this case, the CAS is assumed to contain annotations of type PersonTitle, created with the * PersonTitleAnnotator. These Annotations are stored in a database table called PersonTitle. * * @param aCAS * CasContainer which has been populated by the TAEs * * @throws ResourceProcessException * if there is an error in processing the Resource * * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS) */ public void processCas(CAS aCAS) throws ResourceProcessException { System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: ProcessCas called"); JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new ResourceProcessException(e); } try { if (firstCall) { firstCall = false; System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: First Time Initiailization: "); // NOTE TO USERS: a better design will be to do the loading of the // driver in the initialize() method, where it can // throw a ResourceInitialization Exception if it can't // load (and perhaps even connect to) the database // load the driver // Depends on "derby.system.property" set in initialize // Different databases have different classes they load // This is the one for derby - for the "Embedded" database // (derby also has a network accessed database driver) if (firstEverCall) { firstEverCall = false; System.out .println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: Doing first process call ever (even during re-runs) initialization"); try { // note: newInstance() call is needed to reinitialize properly after // derby has been shutdown Class.forName("org.apache.derby.jdbc.EmbeddedDriver").newInstance(); System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: Loaded derby DB driver OK"); } catch (ClassNotFoundException e) { System.err.println("No driver found for derby - check class path."); } catch (InstantiationException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IllegalAccessException e) { // TODO Auto-generated catch block e.printStackTrace(); } } // create and connect to "ExamplePersonTitleDB" - // in derby, this is a directory name under the "System" // directory set with the above parameter // the System directory is passed to the JVM as a property // (see the derby documentation). Or - you can specify a // complete path, using //forward slashes on windows, such as // "jdbc:derby:c:/a/b/myDB". // NOTE TO USERS: In a real application you would probably // not delete the database and re-create it - this takes a long // time. If you want to get rid of the particular table you're // about to create if it already exists, use // the SQL "Drop" command // if database exists, delete it File db = new File(mOutputDir.toString() + "/ExamplePersonTitleDB"); if (db.exists()) { System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: First Time Initiailization: Deleting Database"); deleteDir(db); System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: First Time Initiailization: Database deleted"); } con = DriverManager.getConnection("jdbc:derby:ExamplePersonTitleDB;create=true"); System.out .println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: First Time Initiailization: Created the ExamplePersonTitleDB and connected to it."); // Databases typically use user-names and passwords; these can // be passed as //properties to the getConnection method. // drop the table in case it's already present // This isn't needed because we're starting from an empty database, // but leave here for tutorial reasons Statement sqlStmt = con.createStatement(); try { sqlStmt.execute("drop table PersonTitle"); } catch (SQLException e) { } sqlStmt.execute("create table PersonTitle(" + "uri varchar(" + MAX_URI_LENGTH + "), spannedText varchar(" + MAX_TITLE_LENGTH + "), beginOffset int, endOffset int)"); System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: First Time Initiailization: Created the PersonTitle table."); sqlStmt.close(); // free resources associated with this // statement // Entering data: use a Statement, or a PreparedStatement stmt = con.prepareStatement("insert into PersonTitle values (?, ?, ?, ?)"); // Writing out all instances of type uima.tcas.Annotation // Assume the variable "jcas" holds a reference to a JCas con.setAutoCommit(false); // need this for batch updating } // get the singleton instance of the SourceDocumentInformation SourceDocumentInformation sdi = (SourceDocumentInformation) jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator().next(); System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: Processing doc: '" + sdi.getUri() + "'"); stmt.setString(1, truncate(sdi.getUri(), MAX_URI_LENGTH)); for (FSIterator iter = jcas.getAnnotationIndex(PersonTitle.type).iterator(); iter.hasNext();) { PersonTitle pt = (PersonTitle) iter.next(); stmt.setString(2, truncate(pt.getCoveredText(), MAX_TITLE_LENGTH)); stmt.setInt(3, pt.getBegin()); stmt.setInt(4, pt.getEnd()); stmt.addBatch(); batchCounter--; if (batchCounter <= 0) { System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: Batch writing updates - process call"); stmt.executeBatch(); // NOTE TO USERS: Although we "commit" here, you may want // to delay committing until batchProcessComplete - or some // other logical point - to keep the DB in a more consistent // state (not partially updated). con.commit(); batchCounter = DB_LOAD_BATCH_SIZE; } } } catch (SQLException e) { try { // NOTE TO USERS: depending on your error recover logic, you'll // probably want to do both a rollback and a clearBatch if an // exception occurs. con.rollback(); } catch (SQLException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } try { stmt.clearBatch(); } catch (SQLException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } throw new ResourceProcessException(e); } } public void collectionProcessComplete(ProcessTrace arg0) throws ResourceProcessException, IOException { firstCall = true; try { if (batchCounter < DB_LOAD_BATCH_SIZE) { System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: Batch writing updates - processComplete call"); stmt.executeBatch(); con.commit(); batchCounter = DB_LOAD_BATCH_SIZE; } stmt.close(); con.close(); System.out.println("Time: " + (System.currentTimeMillis() - startTime) + " DB Writer: Sucessfully closed the connection - done."); } catch (SQLException e) { System.err.println("Unexpected SQL exception"); e.printStackTrace(); } try { DriverManager.getConnection("jdbc:derby:ExamplePersonTitleDB;shutdown=true"); } catch (SQLException e) { } // If we shut down the db - we get a "no suitable driver" SQL exception if rerunning try { firstEverCall = true; DriverManager.getConnection("jdbc:derby:;shutdown=true"); } catch (SQLException e) { } } private void deleteDir(File f) { if (f.isDirectory()) { String[] contents = f.list(); for (int i = 0; i < contents.length; i++) { deleteDir(new File(f.toString() + "/" + contents[i])); } } f.delete(); } private String truncate(String s, int length) { if (s.length() <= length) return s; return s.substring(0, length); } }