/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package edu.mayo.bior.pipeline.Treat;
import java.io.IOException;
import java.util.List;
import com.tinkerpop.pipes.Pipe;
import com.tinkerpop.pipes.transform.IdentityPipe;
import com.tinkerpop.pipes.transform.TransformFunctionPipe;
import com.tinkerpop.pipes.util.Pipeline;
import edu.mayo.bior.util.BiorProperties;
import edu.mayo.pipes.JSON.DrillPipe;
import edu.mayo.pipes.JSON.RemoveAllJSONPipe;
import edu.mayo.pipes.JSON.lookup.LookupPipe;
import edu.mayo.pipes.JSON.tabix.OverlapPipe;
import edu.mayo.pipes.JSON.tabix.SameVariantPipe;
import edu.mayo.pipes.bioinformatics.VCF2VariantPipe;
import edu.mayo.pipes.history.ColumnMetaData;
import edu.mayo.pipes.history.History;
import edu.mayo.pipes.history.HistoryMetaData;
/**
* Get the variant information for the TREAT workflow from BioR
*
* @author m102417
*/
@SuppressWarnings ({"rawtypes", "unchecked"})
public class OverlapingFeaturesPipeline extends Pipeline implements Cleaner
{
private BiorProperties biorProps;
private String baseDir;
private boolean header = true;
private int deleteColCount = 0;
private int numCols = 0;
private int firstCol = 0;
// properties to extract...
private static final int kGeneName = 0;
private static final int kNCBIEntrezGeneID = kGeneName + 1;
private static final int kNCBICols = kNCBIEntrezGeneID + 1;
private static final String[] kGeneDrill = {"gene", "GeneID"}; // , "note"
private static final int kdbSNPBuild = 0;
private static final int kdbSNPSuspect = kdbSNPBuild + 1;
private static final int kdbSNPClinical = kdbSNPSuspect + 1;
private static final int kdbSNPDisease = kdbSNPClinical + 1;
private static final int kdbSNPAllele = kdbSNPDisease + 1;
private static final int kdbSNPID = kdbSNPAllele + 1;
private static final int kdbSNPRef = kdbSNPID + 1;
private static final int kdbSNPAlt = kdbSNPRef + 1;
private static final int kdbSNPCols = kdbSNPAlt + 5;
private static final String[] kDbSnpDrill = {"INFO.dbSNPBuildID", "INFO.SSR", "INFO.SCS", "INFO.CLN", "INFO.SAO", "_id",
"REF", "ALT", "INFO.SYN", "INFO.S3D", "INFO.U5", "INFO.NSF"};
private static final String[] kDbSnpSuspectLookup = {"unspecified", "Paralog", "byEST", "Para_EST", "oldAlign", "other"};
private static final String[] kDbSnpClinicalLookup = {"unknown", "untested", "non-pathogenic", "probable-non-pathogenic",
"probable-pathogenic", "pathogenic", "drug-response",
"histocompatibility", "other"};
private static final String[] kDbSnpAlleleLookup = {"unspecified", "Germline", "Somatic", "Both", "not-tested",
"tested-inconclusive", "other"};
private static final int kHGNCSymbol = 0;
private static final int kHGNCEntrezGeneID = kHGNCSymbol + 1;
private static final int kHGNCEnsemblGeneID = kHGNCEntrezGeneID + 1;
private static final int kHGNCCols = kHGNCEnsemblGeneID + 1;
private static final String[] kHGNCDrill = {"Approved_Symbol", "Entrez_Gene_ID", "Ensembl_Gene_ID"};
private static final int kCosmicID = 0;
private static final int kCosmicCDS = kCosmicID + 1;
private static final int kCosmicAA = kCosmicCDS + 1;
private static final int kCosmicStrand = kCosmicAA + 1;
private static final int kCosmicCols = kCosmicStrand + 1;
private static final String[] kCosmicDrill = {"Mutation_ID", "Mutation_CDS", "Mutation_AA", "Mutation_GRCh37_strand"};
private static final int kOMIMDisorder = 0;
private static final int kOMIMCols = kOMIMDisorder + 1;
private static final String[] kOMIMDrill = {"Disorders"};
private static final int kBlacklistScore = 0;
private static final int kBlacklistCols = kBlacklistScore + 1;
private static final String[] kBlacklistDrill = {"score"};
private static final int kConservationScore = 0;
private static final int kConservationCols = kConservationScore + 1;
private static final String[] kConservationDrill = {"score"};
private static final int kEnhancerScore = 0;
private static final int kEnhancerCols = kEnhancerScore + 1;
private static final String[] kEnhancerDrill = {"score"};
private static final int kTFBSScore = 0;
private static final int kTFBSCols = kTFBSScore + 1;
private static final String[] kTFBSDrill = {"score"};
private static final int kTSSScore = 0;
private static final int kTSSCols = kTSSScore + 1;
private static final String[] kTSSDrill = {"score"};
private static final int kUniqueScore = 0;
private static final int kUniqueCols = kUniqueScore + 1;
private static final String[] kUniqueDrill = {"score"};
private static final int kRegulationName = 0;
private static final int kRegulationCols = kRegulationName + 1;
private static final String[] kRegulationDrill = {"name"};
private static final int kRepeatName = 0;
private static final int kRepeatCols = kRepeatName + 1;
private static final String[] kRepeatDrill = {"repName"};
private static final int kMiRLandmark = 0;
private static final int kMiRType = kMiRLandmark + 1;
private static final int kMiRMinBP = kMiRType + 1;
private static final int kMiRMaxBP = kMiRMinBP + 1;
private static final int kMiRStrand = kMiRMaxBP + 1;
private static final int kMiRAcc = kMiRStrand + 1;
private static final int kMiRID = kMiRAcc + 1;
private static final int kMiRCols = kMiRID + 1;
private static final String[] kMiRBaseDrill = {"_landmark", "type", "_minBP", "_maxBP", "_strand", "ACC", "ID"};
private static final String kBlank = ".";
private static final int kScoreCutoff = 500;
private static final String kPlusStrand = "+";
/**
* Default constructor, IdentityPipe in and out, no cleaning
*
* @throws IOException
*/
public OverlapingFeaturesPipeline () throws IOException
{
biorProps = new BiorProperties ();
baseDir = biorProps.get ("fileBase");
init (new IdentityPipe (), new IdentityPipe (), false);
}
/**
* Standard Constructor, gets data from input, sends it to output, doesn't do any cleaning
*
* @param input Where gets vcf information from
* @param output Where sends the resulting information
* @throws IOException
*/
public OverlapingFeaturesPipeline (Pipe input, Pipe output) throws IOException
{
biorProps = new BiorProperties ();
baseDir = biorProps.get ("fileBase");
init (input, output, false);
}
/**
* Full constructor, gets data from input, sends it to output, does cleaning if clean is true
*
* @param input Where gets vcf information from
* @param output Where sends the resulting information
* @param clean If true, trims off the catalog data and adds the information in the TREAT format
* @throws IOException
*/
public OverlapingFeaturesPipeline (Pipe input, Pipe output, boolean clean) throws IOException
{
biorProps = new BiorProperties ();
baseDir = biorProps.get ("fileBase");
init (input, output, clean);
}
/**
* Full constructor, gets data from input, sends it to output, does cleaning if clean is true
*
* @param input Where gets vcf information from
* @param output Where sends the resulting information
* @param baseDir Base directory that holds all the catalog files, or null to use the one from properties
* @param clean If true, trims off the catalog data and adds the information in the TREAT format
* @throws IOException
*/
public OverlapingFeaturesPipeline (Pipe input, Pipe output, String baseDir, boolean clean) throws IOException
{
biorProps = new BiorProperties ();
if (baseDir != null)
this.baseDir = baseDir;
init (input, output, clean);
}
/**
* Sets up the catalogs and drills necessary to get the data
*
* @param input assumes that somehow input is converted to a history
* @param output Where the results will go
* @param clean If true, trims off the catalog data and adds one column per frequency source
* (i.e. 1kGenome_EUR, etc), with the information in the TREAT format
* @throws IOException
*/
public void init (Pipe input, Pipe output, boolean clean) throws IOException
{
Pipe cleaner;
if (clean)
cleaner = new TransformFunctionPipe<History, History> (new TreatPipe (this));
else
cleaner = new IdentityPipe<History> ();
if (baseDir == null)
baseDir = biorProps.get ("fileBase");
if (baseDir == null)
baseDir = "";
String genesFile = baseDir + biorProps.get ("genesFile");
String hgncFile = baseDir + biorProps.get ("hgncFile");
String dbsnpFile = baseDir + biorProps.get ("dbsnpFile");
String cosmicFile = baseDir + biorProps.get ("cosmicFile");
String hgncIndexFile = baseDir + biorProps.get ("hgncIndexFile");
String omimFile = baseDir + biorProps.get ("omimFile");
String omimIndexFile = baseDir + biorProps.get ("omimIndexFile");
String conservationFile = baseDir + biorProps.get ("conservationFile");
String repeatFile = baseDir + biorProps.get ("repeatFile");
String regulationFile = baseDir + biorProps.get ("regulationFile");
String uniqueFile = baseDir + biorProps.get ("uniqueFile");
String tssFile = baseDir + biorProps.get ("tssFile");
String tfbsFile = baseDir + biorProps.get ("tfbsFile");
String enhancerFile = baseDir + biorProps.get ("enhancerFile");
String blacklistedFile = baseDir + biorProps.get ("blacklistedFile");
String mirBaseFile = baseDir + biorProps.get ("mirBaseFile");
String[] geneDrill = kGeneDrill;
String[] hgncDrill = kHGNCDrill;
String[] dbSnpDrill = kDbSnpDrill;
String[] cosmicDrill = kCosmicDrill;
String[] omimDrill = kOMIMDrill;
String[] blacklistDrill = kBlacklistDrill;
String[] conservationDrill = kConservationDrill;
String[] enhancerDrill = kEnhancerDrill;
String[] tfbsDrill = kTFBSDrill;
String[] tssDrill = kTSSDrill;
String[] uniqueDrill = kUniqueDrill;
String[] repeatDrill = kRepeatDrill;
String[] regulationDrill = kRegulationDrill;
String[] mirBaseDrill = kMiRBaseDrill;
int posCol = -1;
// requires history as input, history as output
Pipeline p = new Pipeline (input, new VCF2VariantPipe (),
new OverlapPipe (genesFile), new DrillPipe (false, geneDrill),
new LookupPipe (hgncFile, hgncIndexFile, (posCol -= geneDrill.length) + 1),
new DrillPipe (false, hgncDrill),
new SameVariantPipe (dbsnpFile, posCol -= hgncDrill.length),
new DrillPipe (false, dbSnpDrill),
new SameVariantPipe (cosmicFile, posCol -= dbSnpDrill.length),
new DrillPipe (false, cosmicDrill),
new LookupPipe (omimFile, omimIndexFile, (posCol -= cosmicDrill.length) + 1),
new DrillPipe (false, omimDrill),
new OverlapPipe (blacklistedFile, posCol -= omimDrill.length),
new DrillPipe (false, blacklistDrill),
new OverlapPipe (conservationFile, posCol -= blacklistDrill.length),
new DrillPipe (false, conservationDrill),
new OverlapPipe (enhancerFile, posCol -= conservationDrill.length),
new DrillPipe (false, enhancerDrill),
new OverlapPipe (tfbsFile, posCol -= enhancerDrill.length),
new DrillPipe (false, tfbsDrill),
new OverlapPipe (tssFile, posCol -= tfbsDrill.length),
new DrillPipe (false, tssDrill),
new OverlapPipe (uniqueFile, posCol -= tssDrill.length),
new DrillPipe (false, uniqueDrill),
new OverlapPipe (repeatFile, posCol -= uniqueDrill.length),
new DrillPipe (false, repeatDrill),
new OverlapPipe (regulationFile, posCol -= repeatDrill.length),
new DrillPipe (false, regulationDrill),
new OverlapPipe (mirBaseFile, posCol -= regulationDrill.length),
new DrillPipe (false, mirBaseDrill),
new RemoveAllJSONPipe (), cleaner, output);
this.setPipes (p.getPipes ());
deleteColCount = mirBaseDrill.length - posCol;
}
/* (non-Javadoc)
* @see edu.mayo.bior.pipeline.Treat.Cleaner#doClean(edu.mayo.pipes.history.History)
*/
public History doClean (History history)
{
if (header)
{
header = false;
numCols = history.size ();
firstCol = numCols - deleteColCount + 1; // Skip the history column
// Now clean up the history metadata
HistoryMetaData metaData = History.getMetaData ();
List<ColumnMetaData> columns = metaData.getColumns ();
for (int i = numCols - 1; i >= firstCol; --i)
columns.remove (i);
addVariantColumns (columns);
}
int startCol = firstCol;
// String geneName = history.get (startCol + kGeneName);
// int ncbiEntrezGeneID = parseInt (history.get (startCol + kNCBIEntrezGeneID));
startCol += kNCBICols;
String geneSymbol = getString (history.get (startCol + kHGNCSymbol));
int entrezGeneID = parseInt (history.get (startCol + kHGNCEntrezGeneID));
String ensemblGeneID = getString (history.get (startCol + kHGNCEnsemblGeneID));
startCol += kHGNCCols;
int firstBuild = parseInt (history.get (startCol + kdbSNPBuild));
String suspectRegion = lookupString (history.get (startCol + kdbSNPSuspect), kDbSnpSuspectLookup);
String clinicalSig = lookupString (history.get (startCol + kdbSNPClinical), kDbSnpClinicalLookup);
String alleleOrigin = lookupString (history.get (startCol + kdbSNPAllele), kDbSnpAlleleLookup);
boolean diseaseVariant = Boolean.parseBoolean (history.get (startCol + kdbSNPDisease));
String dbSNPsID = getString (history.get (startCol + kdbSNPID));
String dbSNPsRef = getString (history.get (startCol + kdbSNPRef));
String dbSNPsAlt = getString (history.get (startCol + kdbSNPAlt));
startCol += kdbSNPCols;
int mutationID = parseInt (history.get (startCol + kCosmicID));
String cosmicCDS = history.get (startCol + kCosmicCDS);
String cosmicAA = history.get (startCol + kCosmicAA);
boolean strand = kPlusStrand.equals (history.get (startCol + kCosmicStrand));
startCol += kCosmicCols;
String omimDisease = getString (history.get (startCol + kOMIMDisorder));
startCol += kOMIMCols;
boolean blacklisted = isAboveCutoff (history.get (startCol + kBlacklistScore));
startCol += kBlacklistCols;
boolean conserved = isAboveCutoff (history.get (startCol + kConservationScore));
startCol += kConservationCols;
boolean enhancer = isAboveCutoff (history.get (startCol + kEnhancerScore));
startCol += kEnhancerCols;
boolean tfbs = isAboveCutoff (history.get (startCol + kTFBSScore));
startCol += kTFBSCols;
boolean tss = isAboveCutoff (history.get (startCol + kTSSScore));
startCol += kTSSCols;
boolean unique = isAboveCutoff (history.get (startCol + kUniqueScore));
startCol += kUniqueCols;
String name = getString (history.get (startCol + kRepeatName));
boolean repeat = !isEmpty (name);
startCol += kRepeatCols;
name = getString (history.get (startCol + kRegulationName));
boolean regulatory = !isEmpty (name);
startCol += kRegulationCols;
String landmark = getString (history.get (startCol + kMiRLandmark));
String type = getString (history.get (startCol + kMiRType));
int minBP = parseInt (history.get (startCol + kMiRMinBP));
int maxBP = parseInt (history.get (startCol + kMiRMaxBP));
boolean miRStrand = kPlusStrand.equals (history.get (startCol + kMiRStrand));
String acc = getString (history.get (startCol + kMiRAcc));
String id = getString (history.get (startCol + kMiRID));
startCol += kMiRCols;
// Got the information we needed, now clear it all out
for (int i = startCol - 1; i >= firstCol; --i)
history.remove (i);
addDbSNPs (dbSNPsID, dbSNPsRef, dbSNPsAlt, history);
addString (ensemblGeneID, history);
addString (geneSymbol, history);
addNonZeroInt (entrezGeneID, history);
addString (omimDisease, history);
addCosmic (mutationID, cosmicCDS, cosmicAA, strand, history);
addBoolean (blacklisted, history);
addBoolean (unique, history);
addBoolean (repeat, history);
addBoolean (diseaseVariant, history);
addmiRBase (landmark, type, minBP, maxBP, miRStrand, acc, id, history);
addString (suspectRegion, history);
addString (clinicalSig, history);
history.add (kBlank); // TODO polyphen2
addString (alleleOrigin, history);
addNonZeroInt (firstBuild, history);
history.add (kBlank); // TODO UniprotID
addBoolean (conserved, history);
addBoolean (regulatory, history);
addBoolean (tfbs, history);
addBoolean (tss, history);
addBoolean (enhancer, history);
return history;
}
private static final String[] kSNVHeader = {"dbSNP135", "dbSNP135Alleles", "Gene ID", "Gene Name", "Entrez_id", "OMIM Disease"};
private static final String[] kSNVHeaderGGPS = {"COSMIC", "BlacklistedRegion", "Alignability/Uniquness",
"Repeat_Region", "DiseaseVariant", "miRbase", "SNP_SuspectRegion",
"SNP_ClinicalSig", "polyphen2", "Variant_AlleleOrigin",
"First_dbSNP_Build", "UniprotID"};
protected static final String[] kSNVHeaderSift = {"Codons", "Transcript ID", "Protein ID", "Substitution",
"Region", "SNP Type", "Prediction", "Score", "Median Info",
"Gene ID", "Gene Name", "OMIM Disease", "Average Allele Freqs",
"User Comment", "SynonymousCodonUsage", "Difference"};
private static final String[] kSNVHeaderUCSC = {"conservation", "regulation", "tfbs", "tss", "enhancer"};
protected static final String kSNVHeaderSSeq = "# inDBSNPOrNot accession functionGVS " +
"aminoAcids proteinPosition polyPhen geneList " +
"Entrez_id Gene_title closest_transcript_id " +
"Tissue_specificity pathway GeneCards Kaviar_Variants";
protected static final String[] kSNVHeaderSEffect = {"Homozygous", "Bio_type", "accession", "Exon_ID", "Exon_Rank",
"Effect", "aminoAcids", "proteinPosition", "Codon_Degeneracy",
"geneList", "Entrez_id", "Gene_title", "Tissue_specificity",
"pathway", "GeneCards", "Kaviar_Variants"};
/**
* Add the Variant information columns to the history metadata
*
* @param columns Where to add them
*/
private void addVariantColumns (List<ColumnMetaData> columns)
{
for (String columnName : kSNVHeader)
columns.add (new ColumnMetaData (columnName));
for (String columnName : kSNVHeaderGGPS)
columns.add (new ColumnMetaData (columnName));
for (String columnName : kSNVHeaderUCSC)
columns.add (new ColumnMetaData (columnName));
}
/**
* Add the dbSNPs information, or maybe two blank strings
*
* @param dbSNPsID dbSNPs ID
* @param dbSNPsRef The dbSNPs ref base
* @param dbSNPsAlt The dbSNPs alt base
* @param history The history object to add to
*/
private void addDbSNPs (String dbSNPsID, String dbSNPsRef, String dbSNPsAlt, History history)
{
history.add (dbSNPsID);
if (!isEmpty (dbSNPsRef))
history.add (dbSNPsRef + "/" + dbSNPsAlt);
else
history.add (kBlank);
}
/**
* Add an int if it's greater than 0, or a blank string if it isn't
*
* @param theInt Int to add
* @param history The history object to add to
*/
private void addNonZeroInt (int theInt, History history)
{
if (theInt > 0)
history.add ("" + theInt);
else
history.add (kBlank);
}
/**
* Add aString if it's not null or empty, or a blank string if it is
*
* @param theString String to add
* @param history The history object to add to
*/
private void addString (String theString, History history)
{
if (!isEmpty (theString))
history.add (theString);
else
history.add (kBlank);
}
/**
* Add a boolean as either 1 (true) or 0 (false)
*
* @param theBool Value to use
* @param history The history object to add to
*/
private void addBoolean (boolean theBool, History history)
{
if (theBool)
history.add ("1");
else
history.add ("0");
}
/**
* Add a Cosmic entry, or else a blank string
*
* @param mutationID The mutation ID, if not greater than 0 will return a blank string
* @param cosmicCDS the CDS value
* @param cosmicAA The Amino Acid change
* @param strand The strand, true for plus
* @param history The history object to add to
*/
private void addCosmic (int mutationID, String cosmicCDS, String cosmicAA, boolean strand, History history)
{
if (mutationID <= 0)
{
history.add (kBlank);
return;
}
StringBuilder result = new StringBuilder ();
result.append (mutationID);
result.append (';');
result.append (cosmicCDS);
result.append (';');
result.append (cosmicAA);
if (strand)
result.append (";+");
else
result.append (";-");
history.add (result.toString ());
}
/**
* Add a miRBase entry, or else a blank string
*
* @param landmark miRBase Landmark, a Chromosome specifier
* @param type miRBase type, generally "miRNA"
* @param minBP Location on the Chromosome where the miRBase item starts
* @param maxBP Location on the Chromosome where the miRBase item ends
* @param miRStrand Which strand of the Chromosome where the miRBase item ends
* @param acc The accession for the entry in miRBase
* @param id The identifier for the entry in miRBase
* @param history The history object to add to
*/
private void addmiRBase (String landmark, String type, int minBP, int maxBP, boolean miRStrand, String acc, String id,
History history)
{
if (id == null)
{
history.add (kBlank);
return;
}
StringBuilder result = new StringBuilder ();
result.append (landmark);
result.append ('/');
result.append (type);
result.append ('/');
result.append (minBP);
result.append ('/');
result.append (maxBP);
if (miRStrand)
result.append ("/+/");
else
result.append ("/-/");
result.append (acc);
result.append ('/');
result.append (id);
history.add (result.toString ());
}
/**
* Test to see if a string is an int greater than or equal to the cutoff score
*
* @param testStr String to test
* @return True if it is, false if it isn't a valid int, or the int is too small
*/
private static final boolean isAboveCutoff (String testStr)
{
int score = parseInt (testStr);
return score >= kScoreCutoff;
}
/**
* Test a String, if it's not empty, and not ".", return it, otherwise return null
*
* @param theString String to test
* @return A String, or null
*/
private static final boolean isEmpty (String theString)
{
return ((theString == null) || theString.isEmpty ());
}
/**
* Test a String, if it's not empty, and not ".", return it, otherwise return null
*
* @param theString String to test
* @return A String, or null
*/
private static final String getString (String theString)
{
if ((theString == null) || theString.isEmpty ())
return null;
if (theString.equals (kBlank))
return null;
return theString;
}
/**
* Parse a String, returning the int represented, or 0 if not an int
*
* @param theInt String to parse. Must not be null
* @return An integer, 0 if parsing failed
*/
private static final int parseInt (String theInt)
{
int result = 0;
if (!theInt.equals (kBlank))
{
try
{
result = Integer.parseInt (theInt);
}
catch (NumberFormatException oops)
{
// Do nothing
}
}
return result;
}
/**
* Parse a String, getting an int. If that int gives a String from theLookup, will return it. If
* String doesn't parse to an int, or the int is negative or >= theLookup.length (), returns theLookup[0]
*
* @param theInt String to parse. Must not be null
* @param theLookup Array to get strings from. Must not be null or of length 0
* @return A string from theLookup
*/
private static final String lookupString (String theInt, String[] theLookup)
{
int result = 0;
if (!theInt.equals (kBlank))
{
try
{
result = Integer.parseInt (theInt);
}
catch (NumberFormatException oops)
{
// Do nothing
}
}
if (result > theLookup.length)
return theLookup[0];
return theLookup[result];
}
}