/*
* The MIT License (MIT)
*
* Copyright (c) 2007-2015 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broad.igv.feature.genome;
import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.feature.*;
import org.broad.igv.util.ParsingUtils;
import htsjdk.tribble.Feature;
import org.broad.igv.util.StringUtils;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Sequence defined by a Genbank (.gbk) file. These files contain a single sequence/chromosome/contig.
*/
public class GenbankParser {
private static Logger log = Logger.getLogger(GenbankParser.class);
private String path;
private String accession;
private byte[] sequence;
private List<Feature> features;
private String locusName;
private String[] aliases;
private static List<String> nameFields = Arrays.asList("gene");
/**
* @param path
*/
public GenbankParser(String path) throws IOException {
this.path = path;
readFeatures(true);
}
public GenbankParser() throws IOException {
}
public void readFeatures(boolean readSequence) throws IOException {
BufferedReader reader = null;
try {
reader = ParsingUtils.openBufferedReader(path);
readFeatures_(readSequence, reader);
} finally {
if (reader != null) reader.close();
}
}
public void readFeatures(InputStream inputStream, boolean readSequence) throws IOException {
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(inputStream));
readFeatures_(readSequence, reader);
} finally {
if (reader != null) reader.close();
}
}
private void readFeatures_(boolean readSequence, BufferedReader reader) throws IOException {
readLocus(reader);
String line = null;
do {
line = reader.readLine();
if (line.startsWith("ACCESSION")) {
readAccession(line);
} else if (line.startsWith("ALIASES")) {
readAliases(line);
}
}
while (line != null && !line.startsWith("FEATURES"));
readFeatures(reader);
if (readSequence) readOriginSequence(reader);
}
public byte[] getSequence(String chr, int qstart, int qend) {
if (sequence == null) {
return null;
} else {
final int start = Math.max(0, qstart); // qstart should never be < 0
final int end = Math.min(sequence.length, qend);
int len = end - start;
byte[] bytes = new byte[len];
Arrays.fill(bytes, (byte) 0);
int s = Math.max(start, 0);
System.arraycopy(sequence, s, bytes, 0, len); // This copy is a crime
return bytes;
}
}
/**
* Return the total sequence length. Added primarily for unit testing.
*
* @return
*/
public int getSequenceLenth() {
return sequence == null ? 0 : sequence.length;
}
/**
* Read the locus line
* LOCUS NT_030059 105338 bp DNA linear CON 28-OCT-2010
*/
private void readLocus(BufferedReader reader) throws IOException {
String line = reader.readLine();
String[] tokens = Globals.whitespacePattern.split(line);
if (!tokens[0].equalsIgnoreCase("LOCUS")) {
// throw exception
}
locusName = tokens[1].trim();
}
/**
* Read the acession line
* ACCESSION K03160
*
* @throws IOException
*/
private void readAccession(String line) {
String[] tokens = Globals.whitespacePattern.split(line);
if (tokens.length < 2) {
log.info("Genbank file missing ACCESSION number.");
} else {
accession = tokens[1].trim();
}
}
/**
* Read the sequence aliases line -- Note: this is an IGV extension
* ACCESSION K03160
*
* @throws IOException
*/
private void readAliases(String line) {
String[] tokens = Globals.whitespacePattern.split(line);
if (tokens.length < 2) {
//log.info("Genbank file missing ACCESSION number.");
} else {
aliases = Globals.commaPattern.split(tokens[1]);
}
}
/**
* Read the origin section. Example...
* <p/>
* ORIGIN
* 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
* 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
* 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
*
* @param reader
*/
private void readOriginSequence(BufferedReader reader) throws IOException {
String nextLine;
ByteArrayOutputStream buffer = new ByteArrayOutputStream(100000); // TODO -- we might know length
while ((nextLine = reader.readLine()) != null && !nextLine.startsWith("//")) {
nextLine = nextLine.trim();
String[] tokens = Globals.whitespacePattern.split(nextLine);
for (int i = 1; i < tokens.length; i++) {
buffer.write(tokens[i].getBytes());
}
}
sequence = buffer.toByteArray();
}
/**
* Return a string representing the chromosome/contig/sequence. We use the accession if it is defined, otherwise
* the first word in the LOCUS field.
*
* @return
*/
public String getChr() {
return accession == null ? locusName : accession;
}
/**
* FEATURES Location/Qualifiers
* source 1..105338
* /organism="Homo sapiens"
* /mol_type="genomic DNA"
* /db_xref="taxon:9606"
* /chromosome="10"
* gene 1..105338
* /gene="PTEN"
* /note="Derived by automated computational analysis using
* gene prediction method: BestRefseq."
* /db_xref="GeneID:5728"
* /db_xref="HGNC:9588"
* /db_xref="HPRD:03431"
* /db_xref="MIM:601728"
* <p/>
* CDS join(1033..1111,30588..30672,62076..62120,67609..67652,
* 69576..69814,88681..88822,94416..94582,97457..97681,
* 101850..102035)
* /gene="PTEN"
*
* @param reader
* @throws IOException
*/
private void readFeatures(BufferedReader reader) throws IOException {
String chr = getChr();
//Process features until "ORIGIN"
features = new ArrayList<Feature>();
BasicFeature f = null;
String currentLocQualifier = null;
String nextLine = null;
int errorCount = 0;
do {
nextLine = reader.readLine();
// TODO -- first line is source (required), has total length => use to size sequence
// TODO -- keys start at column 6, locations and qualifiers at column 22.
if (nextLine == null || nextLine.startsWith("ORIGIN")) {
break;
}
if(nextLine.length() < 6) {
if(errorCount < 10) {
log.error("Unexpected line in genbank file (skipping): " + nextLine);
}
errorCount++;
continue;
}
if (nextLine.charAt(5) != ' ') {
String featureType = nextLine.substring(5, 21).trim();
f = new BasicFeature();
f.setChr(chr);
f.setType(featureType);
currentLocQualifier = nextLine.substring(21);
if (!featureType.toLowerCase().equals("source")) {
features.add(f);
}
} else {
String tmp = nextLine.substring(21).trim();
if (tmp.length() > 0)
if (tmp.charAt(0) == '/') {
if (currentLocQualifier.charAt(0) == '/') {
String[] tokens = Globals.equalPattern.split(currentLocQualifier, 2);
if (tokens.length > 1) {
String keyName = tokens[0].length() > 1 ? tokens[0].substring(1) : "";
String value = StringUtils.stripQuotes(tokens[1]);
f.setAttribute(keyName, value);
if (nameFields.contains(keyName)) {
f.setName(value);
}
} else {
// TODO -- don't know how to interpret, log?
}
} else {
// location string TODO -- many forms of this to support
// Crude test for strand
Strand strand = currentLocQualifier.contains("complement") ? Strand.NEGATIVE : Strand.POSITIVE;
f.setStrand(strand);
// join and complement functions irrelevant
String joinString = currentLocQualifier.replace("join", "");
joinString = joinString.replace("order", "");
joinString = joinString.replace("complement", "");
joinString = joinString.replace("(", "");
joinString = joinString.replace(")", "");
if (joinString.contains("..")) {
joinString = joinString.replace("<", "");
joinString = joinString.replace(">", "");
List<Exon> exons = createExons(joinString, strand);
FeatureUtils.sortFeatureList(exons);
Exon firstExon = exons.get(0);
f.setStart(firstExon.getStart());
Exon lastExon = exons.get(exons.size() - 1);
f.setEnd(lastExon.getEnd());
if (exons.size() > 1) {
for (Exon exon : exons) {
f.addExon(exon);
}
}
} else {
// TODO Single locus for now, other forms possible
int start = Integer.parseInt(joinString) - 1;
int end = start + 1;
f.setStart(start);
f.setEnd(end);
}
}
currentLocQualifier = tmp;
} else {
currentLocQualifier = (currentLocQualifier == null ? tmp : currentLocQualifier + tmp);
}
}
}
while (true);
}
/**
* Create a list of Exon objects from the Embl join string. Apparently exons in embl
* format are represented by a single CDS record.
*
* @param joinString
*/
List<Exon> createExons(String joinString, Strand strand) {
String[] lociArray = joinString.split(",");
List<Exon> exons = new ArrayList(lociArray.length);
boolean isNegative = joinString.contains("complement");
for (String loci : lociArray) {
String[] tmp = loci.split("\\.\\.");
int exonStart = 0; // - (isNegative ? 0 : 1);
try {
exonStart = Integer.parseInt(tmp[0]) - 1;
} catch (NumberFormatException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
int exonEnd = exonStart + 1;
if (tmp.length > 1) {
exonEnd = Integer.parseInt(tmp[1]);
}
Exon r = new Exon(accession, exonStart, exonEnd, strand);
exons.add(r);
}
return exons;
}
public String getAccession() {
return accession;
}
public byte[] getSequence() {
return sequence;
}
public List<Feature> getFeatures() {
return features;
}
public String getLocusName() {
return locusName;
}
public String[] getAliases() {
return aliases;
}
}