package org.genedb.db.loading; import org.apache.log4j.Logger; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.sql.SQLException; import java.util.zip.GZIPInputStream; /** * This class deals with the mundane details of recursing over a directory * hierarchy looking for files whose names match a given pattern. It is the * superclass of the data-loading classes <code>LoadEmbl</code> and * <code>LoadOrthologues</code>. * * @author rh11 */ public abstract class FileProcessor { private static final Logger logger = Logger.getLogger(FileProcessor.class); /** * Get the value of a system property, throwing an exception if the * property has not been set. * * @param key the name of the system property * @return the value of the property * @throws MissingPropertyException if there is no such property */ protected static String getRequiredProperty(String key) throws MissingPropertyException { String value = System.getProperty(key); if (value == null) { throw new MissingPropertyException(key); } return value; } /** * Get the value of a system property, returning a default value instead * if the property has not been set. * * @param key the name of the system property * @param defaultValue the default value, to use if the property is not defined * @return the value of the property, or the supplied default if there is * no such property */ protected static String getPropertyWithDefault(String key, String defaultValue) { String value = System.getProperty(key); if (value == null) { return defaultValue; } return value; } /** * Check whether a specified system property has been set. This is used * for testing boolean-valued properties (flags). If the property is set, * but its value is the string "false", this method will also return false. * * @param key the name of the system property * @return <code>true</code> if the property is defined and its value is not * equal to "false". */ protected static boolean hasProperty(String key) { return null != System.getProperty(key) && !System.getProperty(key).equals("false"); } /** * Recurse over the directory structure, starting at the specified directory. Whenever * a file is encountered whose name matches the given pattern, <code>processFile</code> * is called on that file. * * @param inputDirectoryName the starting directory. This can also be the path to a file * in which case just that single file is processed whether or not its name matches * the supplied pattern. * @param fileNamePattern a regular expression pattern, used to filter the files by name * @throws IOException * @throws ParsingException */ protected void processFileOrDirectory(String inputDirectoryName, String fileNamePattern) throws IOException, ParsingException, SQLException { processFileOrDirectory(new File(inputDirectoryName), fileNamePattern); } private void processFileOrDirectory(File file, final String fileNamePattern) throws IOException, ParsingException, SQLException { if (file.isDirectory()) { String[] entries = file.list(new FilenameFilter() { public boolean accept(File dir, String name) { File file = new File(dir, name); /* * The Mac OS version of the 'tar' command will create a file ._foo * containing the extended attribute data of each file foo that is archived. * This has the annoying result that if, say, I edit some EMBL files in * TextMate (which sets an extended attribute on those files) and then use * tar to transfer these files to a Linux machine, the Linux machine * will then have various files with the extension .embl that are not * actually EMBL files and obviously cannot be parsed as such. * * Since the prefix ._ is almost never used for any other purpose, it * seems reasonable to ignore such files here, and so avoid the problem. */ if (name.startsWith("._")) { return false; } return file.isDirectory() || (file.isFile() && name.matches(fileNamePattern)); }}); for (String entry: entries) { processFileOrDirectory(new File(file, entry), fileNamePattern); } } else { processFileAndHandleExceptions(file); } } private void processFileAndHandleExceptions(File inputFile) throws IOException, ParsingException, SQLException { try { Reader reader = new FileReader(inputFile); if (inputFile.getName().endsWith(".gz")) { reader = new InputStreamReader(new GZIPInputStream(new FileInputStream(inputFile))); } processFile(inputFile, reader); reader.close(); } catch (ParsingException e) { e.setInputFile(inputFile); logger.error("Parsing error", e); skipRetryAbort(inputFile, e); } catch (org.springframework.dao.DataIntegrityViolationException e) { logger.error("Data integrity violation", e); /* * The cause of a Spring exception is a Hibernate exception, * and the cause of *that* is the underlying JDBC exception. * * On the other hand, we might be using JDBC directly (e.g. VulgarLoader), * in which case the cause of the Spring exception is just the JDBC * exception. */ Throwable cause = e.getCause().getCause(); if (cause == null) { cause = e.getCause(); } skipRetryAbort(inputFile, cause); } /* * We can also get a Hibernate exception directly here. * (I confess I don't understand why, but I have observed this. -rh11) */ catch (org.hibernate.exception.ConstraintViolationException e) { logger.error("Constraint violation", e); /* * The cause of the Hibernate exception is the underlying JDBC exception. */ Throwable cause = e.getCause(); skipRetryAbort(inputFile, cause); } } /** * Process the file. Subclasses should implement this method to do whatever is * appropriate with the file. * * @param inputFile the input file * @param reader a reader for the * @throws IOException * @throws ParsingException */ protected abstract void processFile(File inputFile, Reader reader) throws IOException, ParsingException; /** * Prompt the user to skip, retry or abort; and do the appropriate thing. * If the user chooses to skip, we return without doing anything. * If the user chooses to retry, we call {@link #processFile} recursively. * If the user chooses to abort, the exception we were passed is rethrown. * If we are unable to prompt the user (because neither a console nor a * windowing environment is available, or because the user kills the dialog * window) then an informative RuntimeException should be thrown. * * @param inputFile The input file * @param e The exception. Should be a ParsingException or a RuntimeException * @throws IOException Can only happen if the user retries, and an IOException * is encountered when the input file is subsequently reloaded. * @throws ParsingException If we were passed a ParsingException, it will be * rethrown if the user opts to abort. We may also pass on a ParsingException * from a recursive <code>processEmblFile</code> call (resulting from * a retry). */ private void skipRetryAbort(File inputFile, Throwable e) throws IOException, ParsingException, SQLException { switch (skipRetryAbort.getResponse(e)) { case SKIP: logger.info(String.format("Skipping file '%s'", inputFile)); break; case RETRY: logger.info(String.format("Retrying file '%s'", inputFile)); processFileAndHandleExceptions(inputFile); break; case ABORT: if (e instanceof ParsingException) { throw (ParsingException) e; } else if (e instanceof RuntimeException) { throw (RuntimeException) e; } else if (e instanceof SQLException) { throw (SQLException) e; } else { throw new RuntimeException("Unexpected exception (should not happen)", e); } } } private SkipRetryAbort skipRetryAbort = new SkipRetryAbort(); /** * Configure this file processor always to skip a file that causes * an error, without prompting the user. This may be used to do a mass * load of dirty data for testing purposes. It should not be used for * a live data load. */ public void alwaysSkip() { this.skipRetryAbort = new AlwaysSkip(); } } class MissingPropertyException extends Exception { MissingPropertyException(String propertyName) { super(String.format("Required property '%s' is missing", propertyName)); } }