/* * EuroCarbDB, a framework for carbohydrate bioinformatics * * Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as * indicated by the @author tags or express copyright attribution * statements applied by the authors. * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * A copy of this license accompanies this distribution in the file LICENSE.txt. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * Last commit: $Rev: 1870 $ by $Author: david@nixbioinf.org $ on $Date:: 2010-02-23 #$ */ package org.eurocarbdb.util.carbbank; // stdlib imports import java.util.*; import java.io.*; // 3rd party imports import org.apache.log4j.Logger; import org.hibernate.Session; import org.hibernate.EntityMode; import org.dom4j.Element; import org.dom4j.io.XMLWriter; import org.dom4j.io.OutputFormat; // eurocarb imports import org.eurocarbdb.dataaccess.Eurocarb; import org.eurocarbdb.dataaccess.EntityManager; import org.eurocarbdb.dataaccess.HibernateEntityManager; import org.eurocarbdb.dataaccess.core.Reference; import org.eurocarbdb.dataaccess.core.JournalReference; import org.eurocarbdb.dataaccess.core.Contributor; import org.eurocarbdb.dataaccess.core.GlycanSequence; import org.eurocarbdb.dataaccess.core.BiologicalContext; import org.eurocarbdb.dataaccess.core.Disease; import org.eurocarbdb.dataaccess.exception.*; import org.eurocarbdb.util.carbbank.CarbbankParser; import org.eurocarbdb.util.carbbank.CarbbankRecord; // import org.eurocarbdb.dataaccess.hibernate.HibernateUtil; // static imports import static org.eurocarbdb.util.StringUtils.join; import static org.eurocarbdb.dataaccess.Eurocarb.getEntityManager; /* class CarbbankManager *//*************************************** * * A data loader and unloader for Carbbank structures, including * sequence, reference & biological context information. * * @see org.eurocarbdb.util.carbbank.CarbbankParser * @see org.eurocarbdb.util.carbbank.CarbbankRecord * @author mjh * @version $Rev: 1870 $ */ public class CarbbankManager { //~~~~~~~~~~~~~~~~~~~~~~~~~~ FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~// /** Logging handle. */ static final Logger log = Logger.getLogger( CarbbankManager.class ); /** Number of parsing/loading exceptions to tolerate until the * load process is aborted. */ private static final int ERROR_TOLERANCE = 100000; /** The stream from which we read Carbbank records. @see #getInputStream */ private InputStream instream = null; /** The stream to which we output Carbbank CSV once parsed. @see #getOutputStreamErrorSequences */ private PrintStream outstreamErrorSequences = null; /** This is the contributor that will be used when loading * (or unloading) Carbbank structures to the data store. * @see #getCarbbankContributor */ private static Contributor carbbankContributor = null; /** Carbbank parser instance. */ private CarbbankParser parser = new CarbbankParser(); /** Max number of entries to parse. Negative means parse all. */ private int loadLimit = -1; /** Specifies the first record that will be fully parsed. For example, * firstRecord=10 means the first record loaded will be record 10. */ private int firstRecord = 1; private static final String QUERY_GET_ALL_CARBBANK_STRUCTURES = "org.eurocarbdb.dataaccess.core.Contributor.GET_ALL_CARBBANK_STRUCTURES"; /** Number of records that parsed with errors. */ private int records_with_errors = 0; /** Number of records to save before committing a transaction. */ private int save_after = 25; //~~~~~~~~~~~~~~~~~~~~~~~~~ METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~~~// /** * Returns a {@link CarbbankParser} that may be used for parsing * a raw Carbbank file. */ public CarbbankParser getCarbbankParser() { assert parser != null; return parser; } /** * Returns an {@link InputStream} to a Carbbank raw data file. The data * file used is determined at runtime by the value of the Eurocarb * property 'carbbank.raw.file'. * @see Eurocarb#getProperty */ public InputStream getInputStream() { if ( instream == null ) { String filename = Eurocarb.getProperty("carbbank.raw.file"); log.info("opening local Carbbank file '" + filename + "'"); try { instream = new FileInputStream( filename ); } catch ( FileNotFoundException e ) { log.warn("Couldn't open file '" + filename + "': " + e ); return null; } } return instream; } public void setFirstRecord( int index ) { if ( index < 0 ) index = 0; firstRecord = index; } /** * Sets the passed {@link InputStream} from which Carbbank * raw data will be read. * @see #parseAndLoadCarbbank */ public void setInputStream( InputStream in ) { assert in != null; instream = in; } /** * Returns the {@link PrintStream} that will be used to output Carbbank * records that produce errors. If not set explicitly by * {@link #setOutputStreamErrorSequences} then the stream returned * will be directed to a file named by the Eurocarb property * <tt>'carbbank.errors.file'</tt>. * @see Eurocarb#getProperty * @throws DataAccessException if method cannot open file for writing */ public PrintStream getOutputStreamErrorSequences() throws DataAccessException { if ( outstreamErrorSequences == null ) { String filename = Eurocarb.getProperty("carbbank.errors.file"); if ( log.isDebugEnabled() ) log.debug("creating cache file '" + filename + "'"); try { outstreamErrorSequences = new PrintStream( new BufferedOutputStream( new FileOutputStream( filename ))); } catch ( Exception e ) { outstreamErrorSequences = null; String msg = "Caught exception while trying to open file '" + filename + "' for writing: " + e ; log.warn( msg ); throw new DataAccessException( msg ); } } return outstreamErrorSequences; } /** * Sets the passed {@link OutputStream} to which pre-parsed & cached * Carbbank data will be read (by the method {@link #parseAndLoadCarbbank}). */ public void setOutputStreamErrorSequences( PrintStream out ) { assert out != null; outstreamErrorSequences = out; } /** * Returns the canonical "Carbbank" contributor. * If a "Carbbank" contributor does not exist in the * current data store at the time this method is called, * then it will be created. The name of this contributor * is given by the Eurocarb property 'carbbank.contributor.name'. */ public static Contributor getCarbbankContributor() { // if ( carbbankContributor != null ) // return carbbankContributor; String contributor_name = Eurocarb.getProperty("carbbank.contributor.name"); if ( contributor_name == null ) { log.warn( "There is no value for property '" + "carbbank.contributor.name" + "' configured! Using last-resort value of 'Carbbank'" ); contributor_name = "Carbbank"; } if ( log.isDebugEnabled() ) log.debug( "Looking up the canonical Carbbank contributor " + "(contributor name '" + contributor_name + "')" ); carbbankContributor = Contributor.lookupExactName( contributor_name ); if ( carbbankContributor == null ) { log.debug( "A Carbbank contributor could not be found " + "in the current data store, creating it"); carbbankContributor = new Contributor(); carbbankContributor.setContributorName( contributor_name ); getEntityManager().store( carbbankContributor ); if ( log.isDebugEnabled() ) log.debug( "Carbbank contributor with name '" + carbbankContributor.getContributorName() + "', id '" + carbbankContributor.getContributorId() + "' successfully added to the data store" ); } return carbbankContributor; } /** * Sets a limit on the number of carbbank entries that will be * parsed and loaded via the {@link #parseAndLoadCarbbank} method. * Less than zero means 'load all'. */ public void setLoadLimit( int nmb_of_entries ) { loadLimit = nmb_of_entries; } /** * Parses and loads Carbbank data from raw file. Note that * this is much slower than the {@link #loadCarbbank} method, * which loads a pre-parsed version of Carbbank data. * @return number of carbbank entries parsed */ public int parseAndLoadCarbbank() throws IOException, DataAccessException { if ( firstRecord < 0 ) { log.info("Nothing to do!"); return 0; } EntityManager em = getEntityManager(); // all entries parsed will be added to eurocarb db under this contributor. Contributor c = this.getCarbbankContributor(); InputStream in = getInputStream(); parser.setInputStream( in ); int count = 0; assert c != null; while ( true ) { CarbbankRecord r = parser.parse(); if ( r == null ) break; count++; // skip records until we reach the first record specified by firstRecord if ( count < firstRecord ) { if ( log.isDebugEnabled() ) log.debug("skipping record " + count + "(<" + firstRecord + ")..."); continue; } // stop parsing if we've loaded more than loadLimit records. if ( loadLimit == 0 ) { log.debug("Load limit reached, stopping..."); break; } // check seq has not already been added to DB if ( recordAlreadySaved( r ) ) { log.debug("record already exists in DB, skipping..."); continue; } // otherwise process records as usual. // skip records with unparseable sequences GlycanSequence gs = null; try { gs = r.getGlycanSequence(); if ( gs == null ) throw new RuntimeException("GlycanSequence returned null"); if ( gs.getSequenceCt() == null ) throw new RuntimeException("GlycanSequence returned a null Glycoct sequence"); } catch ( Exception ex ) { logErrorRecord( r, ex, "Sequence unparseable" ); continue; } // get references for entry JournalReference jref = r.getJournalReference(); if ( jref == null ) { logErrorRecord( r, null, "Couldn't get a valid JournalReference" ); continue; } Reference ref = r.getEntryReference(); assert ref != null; // heavyweight biological context lookup List<BiologicalContext> bcs = r.getContexts(); // everythings ok so far, set associations between // objects and then save them all jref.setContributor( c ); ref.setContributor( c ); gs.setContributor( c ); gs.addReference( ref ); gs.addReference( jref ); for ( BiologicalContext bc : bcs ) { bc.addContributor(c, ""); gs.addBiologicalContext( bc ); } // save the whole object graph log.debug("attempting to save carbbank record..."); try { // update object with new information getEntityManager().update(jref); getEntityManager().update(ref); em.update( gs ); log.debug("record was saved successfully"); } catch ( Exception ex ) { log.warn("record not saved: " + ex.getMessage()); logErrorRecord( r, ex, "caught exception while trying to save" ); } loadLimit--; if ( (count % save_after) == 0 ) { periodicSaveProgress(); } } // end while if ( log.isInfoEnabled() ) { log.info( "Parsed " + count + " records, " + records_with_errors + " load error(s)" ); } return count; } protected void periodicSaveProgress() { log.info("saving progress..."); getEntityManager().endUnitOfWork(); getEntityManager().beginUnitOfWork(); } /** Returns true if given CarbbankRecord already exists in the DB. */ protected boolean recordAlreadySaved( CarbbankRecord r ) { assert r != null; int id = r.getCarbbankId(); assert id > 0; GlycanSequence existing = GlycanSequence.lookupByExternalRef("Carbbank", id ); return existing != null; } /** * Records that the given Carbbank record had a problem and/or threw * an error during import. * @param ex can be null */ protected void logErrorRecord( CarbbankRecord r, Exception ex, String msg ) { assert r != null; PrintStream out = this.getOutputStreamErrorSequences(); out.println( ";~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" ); out.println( "; carbbank id " + r.getCarbbankId() ); out.println( "; eurocarb reason for failure: " + msg ); if ( ex != null ) { out.println( "; exception was: " + ex.getClass().getSimpleName() + " - " + ex.getMessage() ); } out.println( r.getRawEntry() ); out.println(); out.println(); records_with_errors++; if ( records_with_errors > ERROR_TOLERANCE ) throw new DataAccessException( "Aborting load, too many errors"); } /** * Exports a freshly parsed & loaded Carbbank as CSV to the * {@link OutputStream} given by {@link #getOutputStreamErrorSequences}. */ public int exportCarbbank() { assert false: "TODO"; //PrintWriter out = getOutputWriter(); OutputStream out = this.getOutputStreamErrorSequences(); assert out != null; Session s = null; EntityManager em = Eurocarb.getEntityManager(); if ( em instanceof HibernateEntityManager ) { s = ((HibernateEntityManager) em).getHibernateSession(); } else { throw new RuntimeException( "Only Hibernate-backed EntityManagers " + "support bulk exporting Carbbank data" ); } assert s != null; Session dom4j = s.getSession( EntityMode.DOM4J ); String contrib_name = getCarbbankContributor().getContributorName(); log.debug("query for all carbbank structures..."); /* List structures = dom4j.getNamedQuery( QUERY_GET_ALL_CARBBANK_STRUCTURES ) .setParameter("name", contrib_name ) .list(); if ( log.isDebugEnabled() ) log.debug( "found " + structures.size() + " carbbank structures..." ); Element e = (Element) structures.get(0); */ Element e = (Element) dom4j.load( Disease.class, 9538 ); try { log.debug("generating XML..."); OutputFormat format = OutputFormat.createPrettyPrint(); XMLWriter writer = new XMLWriter( out, format ); writer.write( e ); } catch ( IOException ioex ) { log.warn( "Caught " + ioex.getClass().getName() + " while generating export XML" , ioex ); throw new RuntimeException( ioex ); } return 1; } /** * Loads previously-parsed Carbbank structures and associated * data into the current data store. This method requires a pre-parsed * version of the raw data, which is created when loading Carbbank * with the {@link parseAndLoadCarbbank} method. If this pre-parsed * data does not exist when this method is called a * {@link UnsupportedOperationException} is thrown. * * @return a string indicating success/failure. * @throws UnsupportedOperationException * if pre-parsed Carbbank data does not exist at time of calling. * @see EntityManager */ public int loadCarbank() { return 0; } /** * Unloads (deletes!) Carbbank structures and associated data from * the current data store. * @return a string indicating success/failure. */ public int unloadCarbbank() { Contributor c = this.getCarbbankContributor(); //TODO: getEntityManager().delete( c ); this.carbbankContributor = null; return 0; } /** * Saves the passed CarbbankRecord to the current data store. */ protected void storeCarbbankRecord( CarbbankRecord r ) { assert r != null; } public static class CLI { public static void main( String[] args ) throws IOException { getEntityManager().beginUnitOfWork(); CarbbankManager cm = new CarbbankManager(); int parsed = cm.parseAndLoadCarbbank(); getEntityManager().endUnitOfWork(); } } } // end class