package org.genedb.db.loading; import org.gmod.schema.feature.BACEnd; import org.gmod.schema.feature.Chromosome; import org.gmod.schema.feature.Contig; import org.gmod.schema.feature.EST; import org.gmod.schema.feature.Plasmid; import org.gmod.schema.feature.Supercontig; import org.apache.log4j.Logger; import org.springframework.context.ApplicationContext; import org.springframework.context.support.ClassPathXmlApplicationContext; import java.io.File; import java.io.IOException; import java.io.Reader; import java.sql.SQLException; /** * Recurse through a directory structure, loading any FASTA files we encounter. * The actual loading is done by {@link FastaLoader}. * * @author rh11 * */ public class LoadFasta extends FileProcessor { private static final Logger logger = Logger.getLogger(LoadFasta.class); /** * Recurse through a directory structure, loading each FASTA file we encounter. * Each FASTA file is loaded as a single supercontig (or other type of top-level * feature, as specified by the property <code>load.topLevel</load>). Each entry * in the file is loaded as a contig. * <p> * Takes no command-line arguments, but expects to find the system properties * <code>load.organismCommonName</code> and <code>load.inputDirectory</code>. * Optionally, the property <code>load.fileNamePattern</code> may contain a regular * expression aganst which file names are matched. If this property is not specified, * we default to <code>.*\.fasta</code>, which matches any file name with the extension * <code>.fasta</code>. * <p> * Other system properties control various options: * <ul> * <li> <code>load.overwriteExisting</code> can be set to <b>yes</b> * (delete the existing copy of the top-level feature before loading), * or <b>no</b> (skip top-level features that already exist). * <li> <code>load.topLevel</code> may be <code>chromosome</code>, <code>supercontig</code> * <code>contig</code>, <code>plasmid</code>, or <code>none</code>, * and determines the type of the top-level feature * created for each FASTA file. The default is <code>supercontig</code>. * The special value "none" indicates that no top-level feature should be * created for the file as a whole; rather each entry represents a top-level * feature in its own right. This is useful for loading FASTA files containing * EST or BAC end sequences. * <li> <code>load.entryType</code> may be <code>contig</code>, <code>EST</code>, <code>BAC_end</code>, * <code>supercontig</code> or <code>chromosome</code> * and determines the type of feature that is created for each entry in the file. * </ul> * * @param args ignored * @throws MissingPropertyException if a required system property (as detailed above) is missing * @throws ParsingException if a FASTA file cannot be parsed * @throws IOException if there's a problem opening or reading a file or directory */ public static void main(String[] args) throws MissingPropertyException, IOException, ParsingException, SQLException { if (args.length > 0) { logger.warn("Ignoring command-line arguments"); } String organismCommonName = getRequiredProperty("load.organismCommonName"); String inputDirectory = getRequiredProperty("load.inputDirectory"); String fileNamePattern = getPropertyWithDefault("load.fileNamePattern", ".*\\.fasta(?:\\.gz)?"); String overwriteExisting = getPropertyWithDefault("load.overwriteExisting", "no").toLowerCase(); String topLevelFeatureType = getPropertyWithDefault("load.topLevel", "supercontig"); String entryType = getPropertyWithDefault("load.entryType", "contig"); logger.info(String.format("Options: organismCommonName=%s, inputDirectory=%s, fileNamePattern=%s," + "overwriteExisting=%s, topLevel=%s, entry=%s", organismCommonName, inputDirectory, fileNamePattern, overwriteExisting, topLevelFeatureType, entryType)); LoadFasta loadFasta = new LoadFasta(organismCommonName, overwriteExisting, topLevelFeatureType, entryType); loadFasta.processFileOrDirectory(inputDirectory, fileNamePattern); } private FastaLoader loader; private LoadFasta(String organismCommonName, String overwriteExistingString, String topLevelFeatureType, String entryType) { FastaLoader.OverwriteExisting overwriteExisting; if (overwriteExistingString.equals("yes")) { overwriteExisting = FastaLoader.OverwriteExisting.YES; } else if (overwriteExistingString.equals("no")) { overwriteExisting = FastaLoader.OverwriteExisting.NO; } else { throw new RuntimeException("Unrecognised value for load.overwriteExisting: " + overwriteExistingString); } ApplicationContext applicationContext = new ClassPathXmlApplicationContext(new String[] {"Load.xml"}); this.loader = applicationContext.getBean("fastaLoader", FastaLoader.class); loader.setOrganismCommonName(organismCommonName); loader.setOverwriteExisting(overwriteExisting); if (topLevelFeatureType.equals("none")) { loader.setTopLevelFeatureClass(null); } else if (topLevelFeatureType.equals("chromosome")) { loader.setTopLevelFeatureClass(Chromosome.class); } else if (topLevelFeatureType.equals("supercontig")) { loader.setTopLevelFeatureClass(Supercontig.class); } else if (topLevelFeatureType.equals("contig")) { loader.setTopLevelFeatureClass(Contig.class); } else if (topLevelFeatureType.equals("plasmid")) { loader.setTopLevelFeatureClass(Plasmid.class); } else { throw new RuntimeException( String.format("Unrecognised value for load.topLevel: '%s'", topLevelFeatureType)); } if (entryType.equals("contig")) { loader.setEntryClass(Contig.class); } else if (entryType.equals("EST")) { loader.setEntryClass(EST.class); } else if (entryType.equals("BAC_end")) { loader.setEntryClass(BACEnd.class); /* Sometimes the FASTA files have supercontig or chromosome sequences. * Example, Schisto version 5 * nds, 9.11.2010 */ } else if (entryType.equals("supercontig")) { loader.setEntryClass(Supercontig.class); } else if (entryType.equals("chromosome")) { loader.setEntryClass(Chromosome.class); } else { throw new RuntimeException( String.format("Unrecognised value for load.entryType: '%s'", topLevelFeatureType)); } } @Override protected void processFile(File inputFile, Reader reader) throws IOException, ParsingException { String fileId = inputFile.getName(); int lastDotIndex = fileId.lastIndexOf('.'); if (lastDotIndex >= 0) { fileId = fileId.substring(0, lastDotIndex); } loader.load(fileId, new FastaFile(reader)); } }