package org.genedb.db.loading;
import org.genedb.db.dao.OrganismDao;
import org.genedb.db.dao.SequenceDao;
import org.gmod.schema.feature.AbstractGene;
import org.gmod.schema.feature.Polypeptide;
import org.gmod.schema.feature.ProductiveTranscript;
import org.gmod.schema.feature.ProteinMatch;
import org.gmod.schema.feature.Transcript;
import org.gmod.schema.mapped.Analysis;
import org.gmod.schema.mapped.AnalysisFeature;
import org.gmod.schema.mapped.FeatureRelationship;
import org.gmod.schema.mapped.Organism;
import org.apache.log4j.Logger;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.orm.hibernate3.SessionFactoryUtils;
import org.springframework.transaction.annotation.Transactional;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Load orthologue data from a simple tab-separated file.
* Each line of the file has four or five fields:
* <ol>
* <li> Source organism,
* <li> source feature,
* <li> target organism or <code>cluster</code>,
* <li> target feature or cluster name,
* <li> the percentage identity (optional)
* </ol>
* The source feature field should contain the uniqueName of a transcript
* or polypeptide.
* For a simple orthologue, the third and fourth fields contain the name of the
* target organism and target feature (transcript or polypeptide);
* for orthologue clusters, the third field contains the word "cluster"
* and the fourth field the name of the cluster.
* <p>
* Details of the analysis should be specified as properties:
* <dl>
* <dt><code>load.dataset</code></dt><dd>The name of the orthologue dataset being loaded, e.g. "Plasmodium" (Required)</dd>
* <dt><code>load.analysis.program</code></dt><dd>The program used to perform the analysis (Required for automatic predictions)</dd>
* <dt><code>load.analysis.programVersion</code></dt><dd>The version of the program that was used (Defaults to "unknown")</dd>
* <dt><code>load.analysis.algorithm</code></dt><dd>The name of the algorithm (Optional)</dd>
* </dl>
* If no analysis program is specified, load.orthologues.manual must be specified to indictate that the file contains manually curated orthologues.
* <p>
* There are also a couple of optional properties that control the behaviour of the loader:
* <dl>
* <dt><code>load.orthologues.notFoundNotFatal</code></dt><dd>If this property is set, it is not a fatal error if a gene mentioned
* in the input file does not exist. This can be useful when gene models have been deleted or renamed since the orthologue
* data were generated.</dd>
* <dt><code>load.orthologues.geneNames</
* code></dt><dd>Treat the names in the input file as gene names, rather than polypeptide
* or transcript names.</dd>
* <dt><code>load.analysis.paralogues</code></dt><dd>Load as paralogues not orthologues. NB: Paralogues will overwrite orthologues with the same dataset name.</dd>
* <dt><code>load.orthologues.manual</code></dt><dd>Load as manually curated orthologues. load.analysisProgram is no longer required.</dd>
* </dl>
*
* @author rh11
*
*/
public class LoadOrthologues extends FileProcessor {
private static final Logger logger = Logger.getLogger(LoadOrthologues.class);
public static void main(String[] args) throws MissingPropertyException, IOException, ParsingException, SQLException {
if (args.length > 0) {
logger.warn("Ignoring command-line arguments");
}
String inputDirectory = getRequiredProperty("load.inputDirectory");
String fileNamePattern = getPropertyWithDefault("load.fileNamePattern", ".*\\.ortho");
String analysisProgram = getPropertyWithDefault("load.analysis.program", null);
String analysisProgramVersion = getPropertyWithDefault("load.analysis.programVersion", null);
String analysisAlgorithm = getPropertyWithDefault("load.analysis.algorithm", null);
String datasetName = getPropertyWithDefault("load.dataset", null);
boolean geneNames = hasProperty("load.orthologues.geneNames");
boolean notFoundNotFatal = hasProperty("load.orthologues.notFoundNotFatal");
boolean loadAsParalogues = hasProperty("load.orthologues.paralogues");
boolean manualOrthologues = hasProperty("load.orthologues.manual");
if (analysisProgram == null) {
if (analysisProgramVersion != null) {
throw new IllegalArgumentException("load.analysis.programVersion is specified, but load.analysis.program is not");
}
if (analysisAlgorithm != null) {
throw new IllegalArgumentException("load.analysis.algorithm is specified, but load.analysis.program is not");
}
if (manualOrthologues == false) {
throw new IllegalArgumentException("You must either specify load.analysis.program (for automatic predictions) or load.orthologues.manual (for manual orthologues)");
}
} else {
analysisProgramVersion = "unknown";
}
LoadOrthologues loadOrthologues = new LoadOrthologues(datasetName);
loadOrthologues.setAnalysisProperties(analysisProgram, analysisProgramVersion, analysisAlgorithm);
loadOrthologues.setGeneNames(geneNames);
loadOrthologues.setNotFoundNotFatal(notFoundNotFatal);
loadOrthologues.setLoadAsParalogues(loadAsParalogues);
loadOrthologues.processFileOrDirectory(inputDirectory, fileNamePattern);
}
private final OrthologuesLoader loader;
private LoadOrthologues(String datasetName) {
ApplicationContext applicationContext = new ClassPathXmlApplicationContext(new String[] {"Load.xml"});
this.loader = applicationContext.getBean("orthologuesLoader", OrthologuesLoader.class);
loader.setDatasetName(datasetName);
}
private void setAnalysisProperties(String analysisProgram, String analysisProgramVersion, String analysisAlgorithm) {
loader.setAnalysisProperties(analysisProgram, analysisProgramVersion, analysisAlgorithm);
}
private void setGeneNames(boolean geneNames) {
loader.setGeneNames(geneNames);
}
private void setNotFoundNotFatal(boolean notFoundNotFatal) {
loader.setNotFoundNotFatal(notFoundNotFatal);
}
private void setLoadAsParalogues(boolean loadAsParalogues) {
loader.setLoadAsParalogues(loadAsParalogues);
}
@Override
protected void processFile(File inputFile, Reader reader) throws IOException, ParsingException {
OrthologueFile orthologueFile = new OrthologueFile(inputFile, reader);
loader.load(orthologueFile);
}
}
class OrthologueFile {
class Line {
int lineNumber;
private String sourceOrganism;
private String sourceFeature;
private String targetOrganism;
private String targetFeature;
private Double identity = null;
private Line(int lineNumber, String line) throws ParsingException {
this.lineNumber = lineNumber;
String[] fields = line.split("\\t");
if (fields.length != 4 && fields.length != 5) {
throw new SyntaxError(file, lineNumber,
String.format("Wrong number of fields (%d)", fields.length));
}
sourceOrganism = fields[0];
sourceFeature = fields[1];
targetOrganism = fields[2];
targetFeature = fields[3];
if (fields.length > 4 && fields[4].length() > 0) {
try {
identity = Double.parseDouble(fields[4]);
} catch (NumberFormatException e) {
throw new SyntaxError(file, lineNumber,
String.format("Could not parse identity field '%s'", fields[4]));
}
if (identity < 0 || identity > 100) {
throw new DataError(file, lineNumber,
String.format("Value of identity field '%s' is out of range", fields[4]));
}
}
}
String getSourceOrganism() {
return sourceOrganism;
}
String getSourceFeature() {
return sourceFeature;
}
String getTargetOrganism() {
return targetOrganism;
}
String getTargetFeature() {
return targetFeature;
}
Double getIdentity() {
return identity;
}
}
private File file;
private List<Line> lines = new ArrayList<Line>();
public OrthologueFile(File file, Reader reader) throws IOException, ParsingException {
this.file = file;
BufferedReader br = new BufferedReader(reader);
String line;
int lineNumber = 0;
while (null != (line = br.readLine())) {
lineNumber ++;
lines.add(new Line(lineNumber, line));
}
}
public File file() {
return file;
}
public List<Line> lines() {
return lines;
}
public int numberOfLines() {
return lines.size();
}
}
@Transactional(rollbackFor=DataError.class) // Will also rollback for runtime exceptions, by default
class OrthologuesLoader {
private static final Logger logger = Logger.getLogger(OrthologuesLoader.class);
private static final int BATCH_SIZE = 50;
private SequenceDao sequenceDao;
private OrganismDao organismDao;
private SessionFactory sessionFactory;
private Organism dummyOrganism;
private Analysis analysis = null;
private boolean geneNames = false;
private boolean notFoundNotFatal = false;
private boolean loadAsParalogues = false;
private String datasetName;
public void setAnalysisProperties(String analysisProgram, String analysisProgramVersion, String analysisAlgorithm) {
if (analysisProgram == null) {
return;
}
analysis = new Analysis();
analysis.setProgram(analysisProgram);
analysis.setProgramVersion(analysisProgramVersion);
analysis.setAlgorithm(analysisAlgorithm);
}
public void setDatasetName(String datasetName) {
this.datasetName = datasetName;
}
/**
* If the geneNames property is set, then the identifiers in the file
* are assumed to refer to genes rather than transcripts. This only
* makes sense for bacteria, where the gene name uniquely identifies
* a transcript.
*
* @param geneNames
*/
void setGeneNames(boolean geneNames) {
this.geneNames = geneNames;
}
void setNotFoundNotFatal(boolean notFoundNotFatal) {
this.notFoundNotFatal = notFoundNotFatal;
}
void setLoadAsParalogues(boolean loadAsParalogues) {
this.loadAsParalogues = loadAsParalogues;
}
public void load(OrthologueFile orthologueFile) throws DataError {
Session session = SessionFactoryUtils.getSession(sessionFactory, false);
dummyOrganism = organismDao.getOrganismByCommonName("dummy");
if (analysis != null) {
persistAnalysis();
}
Map<String,Collection<Integer>> clustersByName = new HashMap<String,Collection<Integer>>();
int numberOfLines = orthologueFile.numberOfLines();
for (OrthologueFile.Line line: orthologueFile.lines()) {
logger.trace(String.format("[%d/%d] %s:%s -> %s:%s",
line.lineNumber, numberOfLines,
line.getSourceOrganism(), line.getSourceFeature(),
line.getTargetOrganism(), line.getTargetFeature()));
processLine(orthologueFile.file(), line, clustersByName);
if (line.lineNumber % BATCH_SIZE == 0) {
/* If we don't clear the session regularly,
* it becomes impossibly slow after a while.
*/
logger.trace("Flushing and clearing session");
session.flush();
session.clear();
}
}
loadClusters(clustersByName);
}
private void persistAnalysis() {
SessionFactoryUtils.getSession(sessionFactory, false).persist(analysis);
}
private void processLine(File file, OrthologueFile.Line line,
Map<String, Collection<Integer>> clustersByName) throws DataError
{
Polypeptide source = getPolypeptide(line.getSourceOrganism(), line.getSourceFeature(),
file, line.lineNumber);
if (source == null) {
// A return value of null indicates a non-fatal error
return;
}
String targetOrganismName = line.getTargetOrganism();
if (targetOrganismName.equalsIgnoreCase("cluster")) {
String cluster = line.getTargetFeature();
if (!clustersByName.containsKey(cluster)) {
clustersByName.put(cluster, new ArrayList<Integer>());
}
clustersByName.get(cluster).add(source.getFeatureId());
} else {
// Unclustered
Polypeptide target = getPolypeptide(line.getTargetOrganism(), line.getTargetFeature(),
file, line.lineNumber);
if (target != null) {
addOrthologue(source, target, line.getIdentity());
}
}
}
private void loadClusters(Map<String, Collection<Integer>> clustersByName) {
Session session = SessionFactoryUtils.getSession(sessionFactory, false);
int n = 0;
for (Map.Entry<String, Collection<Integer>> entry: clustersByName.entrySet()) {
String clusterName = entry.getKey();
Collection<Integer> polypeptideIds = entry.getValue();
loadCluster(clusterName, polypeptideIds, null, this.loadAsParalogues);
if (++n % BATCH_SIZE == 0) {
/* If we don't clear the session regularly,
* it becomes impossibly slow after a while.
*/
logger.trace("Flushing and clearing session");
session.flush();
session.clear();
}
}
}
private Polypeptide getPolypeptide(String organism, String uniqueName, File file, int lineNumber) throws DataError {
Transcript transcript;
if (geneNames) {
AbstractGene gene = sequenceDao.getFeatureByUniqueNameAndOrganismCommonName(uniqueName, organism, AbstractGene.class);
if (gene == null) {
String errorMessage = String.format("Could not find gene '%s' in organism '%s'", uniqueName, organism);
if (notFoundNotFatal) {
logger.error(errorMessage);
return null;
}
throw new DataError(errorMessage);
}
Collection<Transcript> transcripts = gene.getTranscripts();
if (transcripts.isEmpty()) {
logger.error(String.format("The gene '%s' does not have any transcripts", uniqueName));
return null;
}
if (transcripts.size() > 1) {
logger.error(String.format("The gene '%s' has %d transcripts", uniqueName, transcripts.size()));
return null;
}
transcript = transcripts.iterator().next();
} else {
transcript = sequenceDao.getFeatureByUniqueNameAndOrganismCommonName(uniqueName, organism, Transcript.class);
if (transcript == null) {
Polypeptide polypeptide = sequenceDao.getFeatureByUniqueNameAndOrganismCommonName(uniqueName, organism, Polypeptide.class);
if (polypeptide != null) {
return polypeptide;
}
String errorMessage = String.format("Could not find transcript feature '%s' in organism '%s'", uniqueName, organism);
if (notFoundNotFatal) {
logger.error(errorMessage);
return null;
}
throw new DataError(file, lineNumber, errorMessage);
}
}
if (!(transcript instanceof ProductiveTranscript)) {
logger.error(String.format("%s line %d: The transcript (%s) is non-coding. " +
"We can't currently store orthologues for non-coding transcripts",
file, lineNumber, transcript.getUniqueName()));
return null;
}
Polypeptide polypeptide = ((ProductiveTranscript) transcript).getProtein();
if (polypeptide == null) {
logger.error(String.format("%s line %d: The transcript (%s) has no associated polypeptide. Ignoring entry",
file, lineNumber, transcript.getUniqueName()));
return null;
}
return polypeptide;
}
private void addOrthologue(Polypeptide sourcePolypeptide, Polypeptide targetPolypeptide, Double identity) {
if (this.analysis != null) {
if (datasetName == null) {
throw new NullPointerException("The datasetName is null - did you call setDatasetName?");
}
// Since this is an algorithmically-derived orthologue, we add it as a cluster
String clusterName = String.format("%s: %s -> %s", datasetName,
sourcePolypeptide.getUniqueName(), targetPolypeptide.getUniqueName());
List<Integer> polypeptideIds = new ArrayList<Integer>(2);
Collections.addAll(polypeptideIds, sourcePolypeptide.getFeatureId(), targetPolypeptide.getFeatureId());
loadCluster(clusterName, polypeptideIds, identity, this.loadAsParalogues);
} else {
// Manually-curated orthologue. Add a simple orthologous_to/paralogous_to relationship in both directions.
Session session = SessionFactoryUtils.getSession(sessionFactory, false);
if (this.loadAsParalogues == true) {
session.persist(sourcePolypeptide.addParalogue(targetPolypeptide));
session.persist(targetPolypeptide.addParalogue(sourcePolypeptide));
} else {
session.persist(sourcePolypeptide.addOrthologue(targetPolypeptide));
session.persist(targetPolypeptide.addOrthologue(sourcePolypeptide));
}
}
}
private void loadCluster(String clusterName, Collection<Integer> polypeptideIds, Double identity, Boolean loadAsParalogues) {
Session session = SessionFactoryUtils.getSession(sessionFactory, false);
if (datasetName == null) {
throw new NullPointerException("The datasetName is null - did you call setDatasetName?");
}
String clusterUniqueName = String.format("%s:%s", datasetName, clusterName);
logger.trace(String.format("Loading orthologue cluster '%s' as '%s'", clusterName, clusterUniqueName));
ProteinMatch clusterFeature = new ProteinMatch(dummyOrganism, clusterUniqueName, true, false);
if (analysis != null) {
AnalysisFeature analysisFeature = clusterFeature.createAnalysisFeature(analysis);
analysisFeature.setIdentity(identity);
}
session.persist(clusterFeature);
for (int polypeptideId: polypeptideIds) {
logger.trace("Loading polypeptide " + polypeptideId);
Polypeptide p = (Polypeptide) session.load(Polypeptide.class, polypeptideId);
FeatureRelationship fr;
if (loadAsParalogues == true) {
logger.trace("Adding paralogue to cluster");
fr = clusterFeature.addParalogue(p);
}
else {
logger.trace("Adding orthologue to cluster");
fr = clusterFeature.addOrthologue(p);
}
logger.trace("Persisting FeatureRelationship");
session.persist(fr);
}
}
/* Spring setters */
public void setSequenceDao(SequenceDao sequenceDao) {
this.sequenceDao = sequenceDao;
}
public void setSessionFactory(SessionFactory sessionFactory) {
this.sessionFactory = sessionFactory;
}
public void setOrganismDao(OrganismDao organismDao) {
this.organismDao = organismDao;
}
}