package org.genedb.db.loading.auxiliary;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import java.util.Map.Entry;
import org.apache.log4j.Logger;
import org.gmod.schema.feature.AbstractExon;
import org.gmod.schema.feature.AbstractGene;
import org.gmod.schema.feature.Polypeptide;
import org.gmod.schema.feature.Transcript;
import org.gmod.schema.mapped.Feature;
import org.gmod.schema.mapped.FeatureLoc;
import org.gmod.schema.mapped.FeatureRelationship;
import org.hibernate.Session;
public class LocationLoader extends Loader {
private static final Logger logger = Logger.getLogger(LocationLoader.class);
private String delimiter = "\t";
private String sourceFeatureUniqueName;
private int n = 0;
private int nGenes = 0;
private class GFFFeature {
Integer fmin;
Integer fmax;
Integer phase;
Short strand;
String id;
public String toString() {
return String.format("%s\t%s\t%s\t%s\t%s", id, fmin, fmax, phase, strand);
}
}
@Override
protected void doLoad(InputStream inputStream, Session session) throws IOException {
Feature sourceFeature = sequenceDao.getFeatureByUniqueName(sourceFeatureUniqueName);
if (sourceFeature == null) {
throw new RuntimeException("Could not find sourceFeature " + sourceFeatureUniqueName);
}
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
String line;
// the GFF may have several lines per ID, pertaining to multiple exons
LinkedHashMap<String, List<GFFFeature>> features = new LinkedHashMap<String, List<GFFFeature>>();
while ((line = reader.readLine()) != null) {
if (line.startsWith("#")) {
continue;
}
if (line.length() > 0 && line.contains(delimiter)) {
String[] split = line.split(delimiter);
Integer fmin = Integer.parseInt(split[3]);
Integer fmax = Integer.parseInt(split[4]);
Short strand = null;
if (split[6].equals("-")) {
strand = -1;
} else if (split[6].equals("+")) {
strand = 1;
}
Integer phase = 0;
if (!split[7].equals(".")) {
phase = Integer.parseInt(split[7]);
}
String annotations = split[8];
String id = null;
for (String annotation : annotations.split(";")) {
String[] kv = annotation.split("=");
if (kv[0].equals("ID")) {
id = kv[1];
break;
}
}
if (id == null) {
throw new RuntimeException("Could not find an ID for this line '" + line + "'");
}
GFFFeature gf = new GFFFeature();
gf.fmin = fmin;
gf.fmax = fmax;
gf.strand = strand;
gf.phase = phase;
gf.id = id;
if (!features.containsKey(id)) {
features.put(id, new ArrayList<GFFFeature>());
}
features.get(id).add(gf);
}
}
List<String> warnings = new ArrayList<String>();
for (Entry<String, List<GFFFeature>> entry : features.entrySet()) {
String id = entry.getKey();
List<GFFFeature> gffFeatures = entry.getValue();
Feature feature = sequenceDao.getFeatureByUniqueName(id);
if (feature == null) {
logger.debug("This feature " + id + " does not exist... skipping");
continue;
}
assert (feature instanceof AbstractGene);
if (feature.getFeatureLocs() != null && feature.getFeatureLocs().size() > 0) {
logger.debug("This feature " + id + " has locs... skipping");
continue;
}
assert (feature.getFeatureLocs() == null);
final int expectedExonCount = gffFeatures.size();
int exonIndex = 0;
int minFmin = Integer.MAX_VALUE;
int maxFmax = 0;
for (FeatureRelationship rel : feature.getFeatureRelationshipsForObjectId()) {
Feature mRNA = rel.getSubjectFeature();
GFFFeature gf = gffFeatures.get(exonIndex);
assert (gf.id.equals(id));
if (gf.fmin < minFmin) {
minFmin = gf.fmin;
}
if (gf.fmax > maxFmax) {
maxFmax = gf.fmax;
}
loc(session, sourceFeature, gf, mRNA);
if (mRNA instanceof Transcript) {
Transcript transcript = (Transcript) mRNA;
Polypeptide p = transcript.getPolypeptide();
loc(session, sourceFeature, gf, p);
/*
* can't use getExons() here: 2011-12-19 15:44:16,295 ERROR
* [org.gmod.schema.mapped.Feature] -
* <getRankZeroFeatureLoc: Feature 'Tb11.v5.0111.1:exon:1'
* has no FeatureLocs> Exception in thread "main"
* java.lang.NullPointerException at
* org.gmod.schema.feature.Region.loadLoc(Region.java:56) at
* org.gmod.schema.feature.Region.compareTo(Region.java:70)
* at
* org.gmod.schema.feature.Region.compareTo(Region.java:1)
* at java.util.TreeMap.put(TreeMap.java:545) at
* java.util.TreeSet.add(TreeSet.java:238) at
* org.gmod.schema
* .feature.Transcript.getComponents(Transcript.java:209) at
* org
* .gmod.schema.feature.Transcript.getExons(Transcript.java
* :218) at
* org.genedb.db.loading.auxiliary.LocationLoader.doLoad
* (LocationLoader.java:169) at
* org.genedb.db.loading.auxiliary
* .Loader.load(Loader.java:79) at
* org.genedb.db.loading.auxiliary.Load.load(Load.java:141)
* at
* org.genedb.db.loading.auxiliary.Load.main(Load.java:94)
*
* for (AbstractExon exon : transcript.getExons()) {
*/
for (FeatureRelationship relation : transcript.getFeatureRelationshipsForObjectId()) {
Feature exon = relation.getSubjectFeature();
if (exon instanceof AbstractExon) {
// get the next exon location
gf = gffFeatures.get(exonIndex);
loc(session, sourceFeature, gf, exon);
exonIndex++;
// logger.info(exonIndex);
}
}
}
}
// the exonIndex gets incremented above AFTER each exon is found, so
// you don't need to add 1 to it here
if (expectedExonCount != exonIndex) {
warnings.add(String.format("The number of GFF lines for %s is %s, but should should equal the number of exons %s. Manual intervention needed.", feature.getUniqueName(), expectedExonCount, exonIndex));
}
GFFFeature gf = new GFFFeature();
gf.fmin = minFmin;
gf.fmax = maxFmax;
gf.strand = gffFeatures.get(0).strand;
gf.phase = gffFeatures.get(0).phase;
gf.id = id;
loc(session, sourceFeature, gf, feature);
nGenes++;
if (n % 50 == 1) {
logger.info("Clearing session");
session.flush();
session.clear();
}
n++;
}
for (String warning : warnings) {
logger.warn(warning);
}
logger.info("Processed " + nGenes + " genes, and " + n + " features");
session.flush();
session.clear();
}
private void loc(Session session, Feature sourceFeature, GFFFeature gf, Feature feature) {
List<FeatureLoc> locs = feature.getFeatureLocs();
if (locs != null && locs.size() > 0) {
logger.warn(feature.getUniqueName() + " already had locs, skipping...");
return;
}
logger.info(String.format("%s\t%s", feature.getUniqueName(), gf));
FeatureLoc loc = new FeatureLoc(sourceFeature, feature, gf.fmin, false, gf.fmax, false, gf.strand, gf.phase, 0, 0);
session.persist(loc);
}
@Override
protected Set<String> getOptionNames() {
Set<String> options = new HashSet<String>();
Collections.addAll(options, "sourceFeature");
return options;
}
@Override
protected boolean processOption(String optionName, String optionValue) {
logger.info(String.format("Setting option: '%s' :: '%s'", optionName, optionValue));
if (optionName.equals("sourceFeature")) {
sourceFeatureUniqueName = optionValue;
return true;
}
return false;
}
}