package edu.mayo.bior.pipeline.Treat;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.TimeoutException;
import com.tinkerpop.pipes.Pipe;
import com.tinkerpop.pipes.transform.TransformFunctionPipe;
import com.tinkerpop.pipes.util.Pipeline;
import edu.mayo.bior.cli.cmd.Cmds;
import edu.mayo.bior.cli.cmd.SNPEffCommand;
import edu.mayo.bior.pipeline.SNPEff.SNPEFFPipeline;
import edu.mayo.bior.pipeline.Treat.format.BgiFormatter;
import edu.mayo.bior.pipeline.Treat.format.CosmicFormatter;
import edu.mayo.bior.pipeline.Treat.format.DbsnpClinvarFormatter;
import edu.mayo.bior.pipeline.Treat.format.DbsnpFormatter;
import edu.mayo.bior.pipeline.Treat.format.EspFormatter;
import edu.mayo.bior.pipeline.Treat.format.Formatter;
import edu.mayo.bior.pipeline.Treat.format.FormatterPipeFunction;
import edu.mayo.bior.pipeline.Treat.format.HapmapFormatter;
import edu.mayo.bior.pipeline.Treat.format.HgncFormatter;
import edu.mayo.bior.pipeline.Treat.format.MirBaseFormatter;
import edu.mayo.bior.pipeline.Treat.format.NcbiGeneFormatter;
import edu.mayo.bior.pipeline.Treat.format.OmimFormatter;
import edu.mayo.bior.pipeline.Treat.format.SNPEffFormatter;
import edu.mayo.bior.pipeline.Treat.format.ThousandGenomesFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscBlacklistedFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscConservationFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscEnhancerFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscRegulationFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscRepeatFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscTfbsFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscTssFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscUniqueFormatter;
import edu.mayo.bior.pipeline.Treat.format.VEPFormatter;
import edu.mayo.bior.pipeline.Treat.format.VEPHgncFormatter;
import edu.mayo.bior.pipeline.VEP.VEPPipeline;
import edu.mayo.bior.util.BiorProperties;
import edu.mayo.bior.util.BiorProperties.Key;
import edu.mayo.bior.util.DependancyUtil;
import edu.mayo.exec.AbnormalExitException;
import edu.mayo.pipes.JSON.DrillPipe;
import edu.mayo.pipes.JSON.lookup.LookupPipe;
import edu.mayo.pipes.JSON.tabix.OverlapPipe;
import edu.mayo.pipes.JSON.tabix.SameVariantPipe;
import edu.mayo.pipes.bioinformatics.VCF2VariantPipe;
import edu.mayo.pipes.history.CompressPipe;
import edu.mayo.pipes.history.History;
import edu.mayo.pipes.string.TrimSpacesPipe;
import edu.mayo.pipes.util.FieldSpecification;
import edu.mayo.pipes.util.FieldSpecification.FieldDirection;
import edu.mayo.pipes.util.metadata.Metadata;
/**
 * BioR implementation of TREAT annotation module.
 * <p>
 * The complete pipe chain is assembled in the constructor: input lines are trimmed and
 * converted to a variant JSON column, one JSON column is appended per annotation source
 * that the configuration requires, the JSON columns are transformed into the final output
 * columns, and those output columns are compressed.
 * <p>
 * After construction, {@link #getMetadata()} returns the metadata for the columns the
 * user asked for.
 *
 * @author Greg Dougherty, duffp, Mike Meiners
 *
 */
public class TreatPipelineSingleThread extends Pipeline<History, History>
{
    /** Columns to output, as returned by {@code TreatUtils.loadConfig} for the config file. */
    private final List<String> mConfigColumnsToOutput;

    /** Helper used for config loading/validation, "is this pipe needed" checks and file lookups. */
    private final TreatUtils mUtils;

    // Metadata lines that would be generated by the JSON columns.
    // These will NOT be in the output, but we need to generate the appropriate columns from these metadata lines
    private List<Metadata> mMetadataToAdd = new ArrayList<Metadata>();

    /**
     * Catalog path (or tool location) recorded for each JSON column, kept in step with the
     * JSON column order list built in {@link #initPipes()}. Entries are {@code null} for
     * columns that are not used directly by formatters.
     */
    private final List<String> mCatalogForColumn = new ArrayList<String>();

    /**
     * Constructor that passes {@code null} as the config file path.
     *
     * @throws IOException
     * @throws AbnormalExitException
     * @throws TimeoutException
     * @throws BrokenBarrierException
     * @throws InterruptedException
     * @throws URISyntaxException
     */
    public TreatPipelineSingleThread() throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException, URISyntaxException {
        this(null);
    }

    /**
     * Constructor. Loads and validates the configured output columns, then builds the pipe chain.
     *
     * @param configFilePath path handed to {@code TreatUtils.loadConfig}; the no-arg constructor
     *                       passes {@code null} (NOTE(review): presumably that selects a default
     *                       column set - confirm in TreatUtils)
     * @throws IOException
     * @throws AbnormalExitException
     * @throws TimeoutException
     * @throws BrokenBarrierException
     * @throws InterruptedException
     * @throws URISyntaxException
     */
    public TreatPipelineSingleThread(String configFilePath) throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException, URISyntaxException {
        mUtils = new TreatUtils();
        mConfigColumnsToOutput = mUtils.loadConfig(configFilePath);
        mUtils.validateConfigFileColumns(mConfigColumnsToOutput);
        initPipes();
    }

    /**
     * Initializes what pipes will be used for this pipeline.
     * <p>
     * The order of the calls below matters: every annotation pipe locates the variant
     * column by counting back from the end of the line, and the lookup pipes drill into
     * whichever JSON column was appended immediately before them.
     *
     * @throws IOException
     * @throws AbnormalExitException
     * @throws TimeoutException
     * @throws BrokenBarrierException
     * @throws InterruptedException
     * @throws URISyntaxException
     */
    private void initPipes() throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException, URISyntaxException
    {
        // Raw Pipe type is intentional: this list is handed to setPipes(...) as-is.
        List<Pipe> pipeList = new ArrayList<Pipe>();

        // tracks the order of the added JSON columns
        List<JsonColumn> order = new ArrayList<JsonColumn>();

        addTrim(pipeList);
        addVcfToTjson(order, pipeList);

        // external tools
        addVep(order, pipeList);
        addVepHgnc(order, pipeList);
        addSnpEff(order, pipeList);

        addSameVariant(order, pipeList, new DbsnpFormatter(), Key.dbsnpFile, JsonColumn.DBSNP_ALL);
        addSameVariant(order, pipeList, new DbsnpClinvarFormatter(), Key.dbsnpClinvarFile, JsonColumn.DBSNP_CLINVAR);
        addSameVariant(order, pipeList, new CosmicFormatter(), Key.cosmicFile, JsonColumn.COSMIC);

        addOverlap(order, pipeList, new UcscBlacklistedFormatter(), Key.blacklistedFile, JsonColumn.UCSC_BLACKLISTED);
        addOverlap(order, pipeList, new UcscConservationFormatter(), Key.conservationFile, JsonColumn.UCSC_CONSERVATION);
        addOverlap(order, pipeList, new UcscEnhancerFormatter(), Key.enhancerFile, JsonColumn.UCSC_ENHANCER);
        addOverlap(order, pipeList, new UcscTfbsFormatter(), Key.tfbsFile, JsonColumn.UCSC_TFBS);
        addOverlap(order, pipeList, new UcscTssFormatter(), Key.tssFile, JsonColumn.UCSC_TSS);
        addOverlap(order, pipeList, new UcscUniqueFormatter(), Key.uniqueFile, JsonColumn.UCSC_UNIQUE);
        addOverlap(order, pipeList, new UcscRepeatFormatter(), Key.repeatFile, JsonColumn.UCSC_REPEAT);
        addOverlap(order, pipeList, new UcscRegulationFormatter(), Key.regulationFile, JsonColumn.UCSC_REGULATION);
        addOverlap(order, pipeList, new MirBaseFormatter(), Key.mirBaseFile, JsonColumn.MIRBASE);

        // allele frequency annotation
        addSameVariant(order, pipeList, new BgiFormatter(), Key.bgiFile, JsonColumn.BGI);
        addSameVariant(order, pipeList, new EspFormatter(), Key.espFile, JsonColumn.ESP);
        addSameVariant(order, pipeList, new HapmapFormatter(), Key.hapMapFile, JsonColumn.HAPMAP);
        addSameVariant(order, pipeList, new ThousandGenomesFormatter(), Key.kGenomeFile, JsonColumn.THOUSAND_GENOMES);

        // annotation requiring walking X-REFs
        addOverlap(order, pipeList, new NcbiGeneFormatter(), Key.genesFile, JsonColumn.NCBI_GENE);
        // Drill to add Entrez GeneID X-REF
        addLookup(order, pipeList, new HgncFormatter(), Key.hgncFile, JsonColumn.HGNC, Key.hgncIndexFile, "GeneID", "Entrez_Gene_ID");
        // Drill to add OMIM ID X-REF
        addLookup(order, pipeList, new OmimFormatter(), Key.omimFile, JsonColumn.OMIM, Key.omimIndexFile, "mapped_OMIM_ID", "MIM_Number");

        FormatterPipeFunction formatterPipe = new FormatterPipeFunction(order, mConfigColumnsToOutput);

        /* transform JSON cols into final output */
        pipeList.add(new TransformFunctionPipe(formatterPipe));

        // NOTE: Don't need a metadata object for the compress pipe since annotate always does a compress,
        // and this is taken care of when constructing the annotate metadata line.
        mMetadataToAdd = formatterPipe.getMetadataForUserColumns(mCatalogForColumn);

        /* specify final output cols to compress */
        FieldSpecification fSpec = new FieldSpecification(formatterPipe.getColumnsAdded().size() + "-", FieldDirection.RIGHT_TO_LEFT);

        /* compress to have 1-to-1 */
        pipeList.add(new CompressPipe(Cmds.Names.bior_compress.toString(), fSpec, "|", "\\|", true));

        this.setPipes(pipeList);
    }

    /**
     * Adds the pipe that trims spaces from the input.
     *
     * @param pipeList pipe chain being built
     */
    private void addTrim(List<Pipe> pipeList) {
        // Don't use the STDBUF option on the first command as this will cause an exception
        pipeList.add(new TrimSpacesPipe());
    }

    /**
     * Adds the VCF-to-variant pipe and registers its JSON column as the first JSON column.
     *
     * @param order    JSON column order being built
     * @param pipeList pipe chain being built
     */
    private void addVcfToTjson(List<JsonColumn> order, List<Pipe> pipeList) {
        // 1ST JSON column is the original variant
        order.add(JsonColumn.VARIANT);
        // Add "null" since this column is not used directly by formatters
        mCatalogForColumn.add(null);
        // Don't use the STDBUF option on the first command as this will cause an exception
        pipeList.add(new VCF2VariantPipe());
    }

    /**
     * Shared bookkeeping for catalog-backed JSON columns: verifies that the catalog is
     * available, appends the column to {@code order} and records the catalog path for it.
     *
     * @param order       JSON column order being built
     * @param catalogKey  property key of the catalog file
     * @param jsonColName JSON column that the catalog produces
     * @return the catalog path, resolved once so that callers can reuse it for their pipe
     * @throws FileNotFoundException declared for the missing-catalog check
     */
    private String registerCatalogColumn(List<JsonColumn> order, BiorProperties.Key catalogKey,
            JsonColumn jsonColName) throws FileNotFoundException
    {
        mUtils.throwErrorIfMissing(catalogKey, TreatUtils.FileType.catalog);
        order.add(jsonColName);
        String catalogPath = mUtils.getFile(catalogKey);
        mCatalogForColumn.add(catalogPath);
        return catalogPath;
    }

    /**
     * Adds a same-variant pipe for the given catalog, if the formatter is needed.
     *
     * @param order       JSON column order being built
     * @param pipeList    pipe chain being built
     * @param formatter   formatter whose columns decide whether this pipe is needed
     * @param catalogKey  property key of the catalog file
     * @param jsonColName JSON column that the catalog produces
     * @throws IOException
     */
    private void addSameVariant(List<JsonColumn> order, List<Pipe> pipeList,
            Formatter formatter, BiorProperties.Key catalogKey, JsonColumn jsonColName) throws IOException
    {
        if (!mUtils.isNeedPipe(formatter)) {
            return;
        }

        String catalogPath = registerCatalogColumn(order, catalogKey, jsonColName);

        // Using 1-order.size because we don't know how many columns the user passed in.
        // We want to reference the vcf2variant column, but it is easier to reference it from the end
        pipeList.add(new SameVariantPipe(catalogPath, 1 - order.size()));
    }

    /**
     * Adds an overlap pipe for the given catalog, if the formatter is needed.
     *
     * @param order       JSON column order being built
     * @param pipeList    pipe chain being built
     * @param formatter   formatter whose columns decide whether this pipe is needed
     * @param catalogKey  property key of the catalog file
     * @param jsonColName JSON column that the catalog produces
     * @throws IOException
     */
    private void addOverlap(List<JsonColumn> order, List<Pipe> pipeList,
            Formatter formatter, BiorProperties.Key catalogKey, JsonColumn jsonColName) throws IOException
    {
        if (!mUtils.isNeedPipe(formatter)) {
            return;
        }

        String catalogPath = registerCatalogColumn(order, catalogKey, jsonColName);

        // Using 1-order.size because we don't know how many columns the user passed in.
        // We want to reference the vcf2variant column, but it is easier to reference it from the end
        pipeList.add(new OverlapPipe(catalogPath, 1 - order.size()));
    }

    /**
     * Adds a drill pipe followed by a lookup pipe, if the formatter is needed. The drill
     * extracts {@code jsonPathToDrill} from the JSON column currently at the end of the
     * line; the lookup then uses that drilled value against the given catalog and index.
     *
     * @param order            JSON column order being built
     * @param pipeList         pipe chain being built
     * @param formatter        formatter whose columns decide whether these pipes are needed
     * @param catalogKey       property key of the catalog file to look up in
     * @param jsonColName      JSON column that the lookup produces
     * @param indexKey         property key of the index file used by the lookup
     * @param jsonPathToDrill  JSON path drilled out of the preceding JSON column
     * @param jsonPathToLookup JSON path on the lookup side; not referenced in this method
     * @throws FileNotFoundException
     */
    private void addLookup(List<JsonColumn> order, List<Pipe> pipeList, Formatter formatter,
            BiorProperties.Key catalogKey, JsonColumn jsonColName, BiorProperties.Key indexKey,
            String jsonPathToDrill, String jsonPathToLookup) throws FileNotFoundException
    {
        if (!mUtils.isNeedPipe(formatter)) {
            return;
        }

        mUtils.throwErrorIfMissing(catalogKey, TreatUtils.FileType.catalog);
        mUtils.throwErrorIfMissing(indexKey, TreatUtils.FileType.index);

        // Drill column, and keep the JSON
        // (note: this requires that we insert at end minus one position since the JSON stays on the end)
        // Ignore drilled column, since it is the JSON we lookup that we care about
        order.add(order.size() - 1, JsonColumn.IGNORE);
        // Add "null" since this column is not used directly by formatters
        // NOTE: drill swaps last 2 columns, so we need to make the second to last column null
        mCatalogForColumn.add(mCatalogForColumn.size() - 1, null);
        pipeList.add(new DrillPipe(true, new String[] {jsonPathToDrill}));

        // Now add the lookup column, which will be JSON
        // NOTE: Json path to lookup will likely be different from the path to drill
        order.add(jsonColName);
        String catalogPath = mUtils.getFile(catalogKey);
        mCatalogForColumn.add(catalogPath);
        // -2: the drilled value sits second from the end, just before the JSON it was drilled from
        pipeList.add(new LookupPipe(catalogPath, mUtils.getFile(indexKey), -2));
    }

    /**
     * Adds the SNPEff pipeline, if SNPEff columns are needed and SNPEff is installed.
     * When SNPEff is needed but not installed, a warning is printed to stderr and the
     * pipeline is built without it.
     *
     * @param order    JSON column order being built
     * @param pipeList pipe chain being built
     */
    private void addSnpEff(List<JsonColumn> order, List<Pipe> pipeList) throws IOException, URISyntaxException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException {
        // Since SNPEff takes a long time to load, AND that load is in the constructor, let's check if we need it first before calling the constructor
        if (!mUtils.isNeedPipe(new SNPEffFormatter())) {
            return;
        }

        // Not installed: warn and carry on without the SNPEff column (this does not abort).
        if (!DependancyUtil.isSNPEffInstalled()) {
            System.err.println("Warning: SnpEffect is listed as a required field, but is not installed.  Running without it...");
            return;
        }

        System.err.println("SNPEFF is requested, bior is starting it up, this will take about 1 min.");
        order.add(JsonColumn.SNPEFF);
        mCatalogForColumn.add("/tools/snpeff");
        // No arguments are passed to SNPEff. An earlier variant of this call passed
        // SNPEffCommand.DEFAULT_GENOME_VERSION as the only argument.
        pipeList.add(new SNPEFFPipeline(new String[0], true));
    }

    /**
     * Adds the drill + lookup that maps the VEP "Gene" value to HGNC via the Ensembl gene
     * index, if the VEP-HGNC columns are needed and VEP is installed. When they are needed
     * but VEP is not installed, a warning is printed to stderr and the pipeline is built
     * without them.
     *
     * @param order    JSON column order being built
     * @param pipeList pipe chain being built
     */
    private void addVepHgnc(List<JsonColumn> order, List<Pipe> pipeList) throws FileNotFoundException, IOException {
        // Since the drill and cut are for HGNC lookup, we must check if HGNC is needed before we perform the drill and cut
        VEPHgncFormatter vepHgncFormatter = new VEPHgncFormatter();
        if (!mUtils.isNeedPipe(vepHgncFormatter)) {
            return;
        }

        if (!DependancyUtil.isVEPInstalled()) {
            System.err.println("Warning: VEP is listed as a required field (HGNC fields are dependent on it), but is not installed.  Running without it...");
            return;
        }

        // addLookup repeats this check; it is kept here so that a missing index is still
        // reported before the catalog check inside addLookup runs.
        mUtils.throwErrorIfMissing(Key.hgncEnsemblGeneIndexFile, TreatUtils.FileType.index);
        addLookup(order, pipeList, vepHgncFormatter, Key.hgncFile, JsonColumn.VEP_HGNC,
                Key.hgncEnsemblGeneIndexFile, "Gene", "Ensembl_Gene_ID");
    }

    /**
     * Adds the VEP pipeline, if VEP columns are needed and VEP is installed. When VEP is
     * needed but not installed, a warning is printed to stderr and the pipeline is built
     * without it.
     *
     * @param order    JSON column order being built
     * @param pipeList pipe chain being built
     */
    private void addVep(List<JsonColumn> order, List<Pipe> pipeList) throws IOException, URISyntaxException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException {
        if (!mUtils.isNeedPipe(new VEPFormatter())) {
            return;
        }

        // Not installed: warn and carry on without the VEP column (this does not abort).
        if (!DependancyUtil.isVEPInstalled()) {
            System.err.println("Warning: VEP is listed as a required field, but is not installed.  Running without it...");
            return;
        }

        order.add(JsonColumn.VEP);
        mCatalogForColumn.add("/tools/vep");
        pipeList.add(new VEPPipeline(new String[0], true));
    }

    /** Use this after calling the constructor to get the metadata
     *  for ONLY the columns the user wants in the end,
     *  that will be used for HistoryInPipe
     *
     *  @return the metadata list produced while the pipes were initialized
     */
    public List<Metadata> getMetadata() {
        return mMetadataToAdd;
    }
}