package org.genedb.db.loading;
import org.gmod.schema.feature.BACEnd;
import org.gmod.schema.feature.Chromosome;
import org.gmod.schema.feature.Contig;
import org.gmod.schema.feature.EST;
import org.gmod.schema.feature.Gene;
import org.gmod.schema.feature.Plasmid;
import org.gmod.schema.feature.Supercontig;
import org.apache.log4j.AppenderSkeleton;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.sql.SQLException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Command-line entry point for loading EMBL files.
* The class {@link EmblLoader} is used to do the heavy lifting.
*
* @author rh11
*
*/
public class LoadEmbl extends FileProcessor {
private static final Logger logger = Logger.getLogger(LoadEmbl.class);
/**
* Recurse through a directory structure, loading each EMBL file we encounter.
* <p>
* Takes no command-line arguments, but expects to find the system properties
* <code>load.organismCommonName</code> and <code>load.startingDirectory</code>.
* Optionally, the property <code>load.fileNamePattern</code> may contain a regular
* expression aganst which file names are matched. If this property is not specified,
* we default to <code>.*\.embl(?:\\.gz)?</code>, which matches any file name with
* the extension <code>.embl</code> or <code>.embl.gz</code>.
* <p>
* Other system properties control various options:
* <ul>
* <li> <code>load.overwriteExisting</code> can be set to <b>yes</b>
* (delete the existing copy of the top-level feature before loading),
* <b>no</b> (skip top-level features that already exist),
* or <b>merge</b> (add the features from the file to the existing
* top-level feature, if there is one).
* <li> <code>load.topLevel</code> should be <code>chromosome</code>, <code>supercontig</code>
* <code>contig</code>, <code>plasmid</code>, <code>EST</code> or <code>BAC_end</code>,
* and determines the type of the top-level feature
* created for each EMBL file. The default is <code>supercontig</code>.
* <li> if <code>load.sloppyControlledCuration</code> is set to <code>true</code> (or
* any value other than <code>false</code>, in fact) then /controlled_curation
* qualifiers will be treated as essentially free-text fields, though
* <code>db_xref</code>s will still be extracted if possible. This is required
* for projects such as Staphylococcus aureus, whose controlled_curation qualifiers
* use a non-standard format.
* <li> if <code>load.goTermErrorsAreNotFatal</code> is set to <code>true</code> (or
* any value other than <code>false</code>, in fact) then errors loading /GO
* qualifiers will be reported, but loading will continue. This does <strong>not</strong>
* affect the parsing of /GO qualifiers: parsing errors will still be fatal, as
* usual. It affects situations where there is no GO term with the specified
* accession number in the database, for example.
* <li> <code>load.ignoreQualifiers</code> may be set to a comma-separated list of qualifiers
* to ignore. The qualifier name may be prefixed with a feature type, for example
* <code>-Dload.ignoreQualifiers=CDS:similarity</code> would cause all /similarity
* qualifiers on CDS features to be ignored.
* <li> <code>load.ignoreFeatures</code> may be set to a comma-separated list of feature types
* to ignore.
* </ul>
*
* @param args ignored
* @throws MissingPropertyException if a required system property (as detailed above) is missing
* @throws ParsingException if an EMBL file cannot be parsed
* @throws IOException if there's a problem opening or reading a file or directory
*/
public static void main(String[] args) throws MissingPropertyException, IOException, ParsingException, SQLException {
if (args.length > 0) {
logger.warn("Ignoring command-line arguments");
}
// PropertyConfigurator.configure("resources/classpath/log4j.loader.properties");
String organismCommonName = getRequiredProperty("load.organismCommonName");
String inputDirectory = getRequiredProperty("load.inputDirectory");
String fileNamePattern = getPropertyWithDefault("load.fileNamePattern", ".*\\.(embl|tab)(?:\\.gz)?");
String overwriteExisting = getPropertyWithDefault("load.overwriteExisting", "no").toLowerCase();
String topLevelFeatureType = getRequiredProperty("load.topLevel");
boolean sloppyControlledCuration = hasProperty("load.sloppyControlledCuration");
boolean goTermErrorsAreNotFatal = hasProperty("load.goTermErrorsAreNotFatal");
boolean quickAndDirty = hasProperty("load.quickAndDirty");
String ignoreQualifiers = getPropertyWithDefault("load.ignoreQualifiers", null);
String ignoreFeatures = getPropertyWithDefault("load.ignoreFeatures", null);
logger.info(String.format("Options: organismCommonName=%s, inputDirectory=%s, fileNamePattern=%s," +
"overwriteExisting=%s, topLevel=%s, sloppyControlledCuration=%b, goTermErrorsAreNotFatal=%b," +
"ignoreQualifiers=%s, ignoreFeatures=%s",
organismCommonName, inputDirectory, fileNamePattern, overwriteExisting,
topLevelFeatureType, sloppyControlledCuration, goTermErrorsAreNotFatal,
ignoreQualifiers, ignoreFeatures));
if (quickAndDirty) {
((AppenderSkeleton) Logger.getRootLogger().getAppender("stdout")).setThreshold(Level.WARN);
}
LoadEmbl loadEmbl = new LoadEmbl(organismCommonName, overwriteExisting,
topLevelFeatureType, sloppyControlledCuration, goTermErrorsAreNotFatal,
ignoreQualifiers, ignoreFeatures);
if (quickAndDirty) {
loadEmbl.quickAndDirty();
}
loadEmbl.processFileOrDirectory(inputDirectory, fileNamePattern);
}
private EmblLoader loader;
private static final Pattern ignoreQualifiersPattern = Pattern.compile("\\G(?:(\\w+):)?(\\w+)(?:,|\\Z)");
private static final Pattern ignoreFeaturesPattern = Pattern.compile("\\G\\s*(\\S+)\\s*(?:,|\\Z)");
private LoadEmbl(String organismCommonName, String overwriteExistingString, String topLevelFeatureType,
boolean sloppyControlledCuration, boolean goTermErrorsAreNotFatal, String ignoreQualifiers,
String ignoreFeatures) {
EmblLoader.OverwriteExisting overwriteExisting;
if (overwriteExistingString.equals("yes")) {
overwriteExisting = EmblLoader.OverwriteExisting.YES;
} else if (overwriteExistingString.equals("no")) {
overwriteExisting = EmblLoader.OverwriteExisting.NO;
} else if (overwriteExistingString.equals("merge")) {
overwriteExisting = EmblLoader.OverwriteExisting.MERGE;
} else {
throw new RuntimeException("Unrecognised value for load.overwriteExisting: " + overwriteExistingString);
}
ApplicationContext applicationContext = new ClassPathXmlApplicationContext(new String[] {"Load.xml"});
// set this to true if you want to find out what the actual database connection is.
boolean debug = false;
if (debug == true) {
org.apache.commons.dbcp.BasicDataSource ds = applicationContext.getBean("dataSource", org.apache.commons.dbcp.BasicDataSource.class);
logger.info("Connecting to " + ds.getUrl() + " with username " + ds.getUsername());
}
this.loader = applicationContext.getBean("emblLoader", EmblLoader.class);
loader.setOrganismCommonName(organismCommonName);
loader.setOverwriteExisting(overwriteExisting);
loader.setSloppyControlledCuration(sloppyControlledCuration);
loader.setGoTermErrorsAreNotFatal(goTermErrorsAreNotFatal);
if (topLevelFeatureType.equals("chromosome")) {
loader.setTopLevelFeatureClass(Chromosome.class);
} else if (topLevelFeatureType.equals("supercontig")) {
loader.setTopLevelFeatureClass(Supercontig.class);
} else if (topLevelFeatureType.equals("contig")) {
loader.setTopLevelFeatureClass(Contig.class);
} else if (topLevelFeatureType.equals("plasmid")) {
loader.setTopLevelFeatureClass(Plasmid.class);
} else if (topLevelFeatureType.equals("EST")) {
loader.setTopLevelFeatureClass(EST.class);
} else if (topLevelFeatureType.equals("BAC_end")) {
loader.setTopLevelFeatureClass(BACEnd.class);
} else if (topLevelFeatureType.equals("gene")) {
loader.setTopLevelFeatureClass(Gene.class);
} else {
throw new RuntimeException(
String.format("Unrecognised value for load.topLevel: '%s'", topLevelFeatureType));
}
if (ignoreQualifiers != null) {
Matcher ignoreQualifiersMatcher = ignoreQualifiersPattern.matcher(ignoreQualifiers);
int end = 0;
while (ignoreQualifiersMatcher.find()) {
end = ignoreQualifiersMatcher.end();
String feature = ignoreQualifiersMatcher.group(1);
String qualifier = ignoreQualifiersMatcher.group(2);
if (feature == null) {
loader.ignoreQualifier(qualifier);
} else {
loader.ignoreQualifier(qualifier, feature);
}
}
if (end < ignoreQualifiersMatcher.regionEnd()) {
throw new RuntimeException("Failed to parse load.ignoreQualifiers: " + ignoreQualifiers);
}
}
if (ignoreFeatures != null) {
Matcher ignoreFeaturesMatcher = ignoreFeaturesPattern.matcher(ignoreFeatures);
while (ignoreFeaturesMatcher.find()) {
loader.ignoreFeature(ignoreFeaturesMatcher.group(1));
}
}
}
private boolean continueOnError = false;
@Override
protected void processFile(File inputFile, Reader reader) throws IOException, ParsingException {
EmblFile emblFile = new EmblFile(inputFile, reader, continueOnError, loader.getOverwriteExisting());
loader.load(emblFile);
}
private void quickAndDirty() {
alwaysSkip();
loader.setContinueOnError(true);
continueOnError = true;
}
}