/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer.genbank;
import java.util.List;
import java.util.ArrayList;
import java.util.Set;
import java.util.HashSet;
import java.util.HashMap;
import org.json.JSONObject;
import act.installer.sequence.SequenceEntry;
import java.io.File;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileReader;
import java.util.Iterator;
import java.util.LinkedHashMap;
import org.biojavax.SimpleNamespace;
import org.biojavax.bio.seq.RichSequence;
import org.biojavax.bio.seq.RichSequenceIterator;
import org.biojava.bio.seq.Feature;
import org.biojavax.Note;
import org.biojavax.RichObjectFactory;
import org.biojavax.RichAnnotation;
import org.biojavax.ontology.ComparableTerm;
/*
* This will process entries downloaded from:
* ftp://ftp.ncbi.nih.gov/genbank/ -- DOES NOT INCLUDE WGS
* Descriptions of the subsets bct (bacteria), env (environmental) etc:
* http://www.ncbi.nlm.nih.gov/genbank/htgs/divisions/
* gbbct1.seq - bacterial sequences
* gbenv1.seq - environmental sampling sequences
* gbest1.seq - expressed sequence tag (http://www.ncbi.nlm.nih.gov/dbEST/)
* stats: http://www.ncbi.nlm.nih.gov/genbank/dbest/dbest_summary/
* gbinv1.seq - invertebrate
* gbmam1.seq - mamalian
* gbpat1.seq - Patented sequences
* gbphg1.seq - phage
* gbpln1.seq - plant and fungi
* gbpri1.seq - primates
* gbrod1.seq - rodent
* gbsyn1.seq - synthetic
* gbvrl1.seq - viral
* gbvrt1.seq - other vertebrate
*
* Stats on distributions within the 547GB files from Jeff:
* https://docs.google.com/document/d/1qsXzUDcrXy6qZZZJlVRYTifR9lOEpiZQxQ-WknLRW6E/edit#heading=h.r8tyjsata0sr
* https://docs.google.com/presentation/d/1FKvATGlnkVKkB6ZOJuLWMMFII4pbqVqvrp-UT1sZa8g/edit#slide=id.gb9d05972_00
*
* ftp://ftp.ncbi.nih.gov/genbank/wgs/
* The readme ftp://ftp.ncbi.nih.gov/genbank/README.genbank tells us
* that whole genome shotgun sequences are available elsewhere.
* We will need this later when doing chem->org->genome mappings
*
*/
public class GenbankEntry {
JSONObject data;
public static Set<SequenceEntry> parsePossiblyMany(String gbFile) throws Exception {
Set<SequenceEntry> all_entries = new HashSet<SequenceEntry>();
read_all(gbFile);
return all_entries;
}
static int here = 0;
private static void here() { System.out.println("loc: " + here++); }
private GenbankEntry(JSONObject gbEntry) {
this.data = gbEntry;
}
// API is http://www.biojava.org/docs/api1.9.1/
private static void read_all(String gbFile) throws Exception {
BufferedReader br = new BufferedReader(new FileReader(gbFile));
SimpleNamespace ns = new SimpleNamespace("biojava");
// You can use any of the convenience methods found in the BioJava 1.6 API
RichSequenceIterator rsi = RichSequence.IOTools.readGenbankDNA(br,ns);
// contain more than a sequence, you need to iterate over rsi
while(rsi.hasNext()){
RichSequence rs = rsi.nextRichSequence();
print(rs);
}
}
// API is http://www.biojava.org/docs/api1.9.1/
private static void print(RichSequence seq) {
System.out.println( seq.getAccession() );
System.out.println( seq.getDescription() );
for (Feature f : seq.getFeatureSet()) {
print(f);
for(Iterator cfi = f.features(); cfi.hasNext(); ) {
Feature cf = (Feature)cfi.next();
print(cf);
}
}
System.out.println( seq.getTaxon() );
System.out.println( seq.getInternalSymbolList() );
}
// API is http://www.biojava.org/docs/api1.9.1/
private static void print(Feature f) {
System.out.println( "F " + f.getType() + " - " + f );
//Get the annotation of the feature
RichAnnotation ra = (RichAnnotation)f.getAnnotation();
//Use BioJava defined ComparableTerms
ComparableTerm geneTerm = new RichSequence.Terms().getGeneNameTerm();
ComparableTerm synonymTerm = new RichSequence.Terms().getGeneSynonymTerm();
//Create the required additional ComparableTerms
ComparableTerm locusTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm("locus_tag");
ComparableTerm productTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm("product");
ComparableTerm proteinIDTerm = RichObjectFactory.getDefaultOntology().getOrCreateTerm("protein_id");
for (Iterator <Note> it = ra.getNoteSet().iterator(); it.hasNext();){
Note note = it.next();
System.out.println("\tN " + note);
}
}
public static void main(String[] args) throws Exception {
read_all(args[0]);
}
}