package edu.mayo.bior.pipeline.Treat; import java.io.FileNotFoundException; import java.io.IOException; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.BrokenBarrierException; import java.util.concurrent.TimeoutException; import com.tinkerpop.pipes.Pipe; import com.tinkerpop.pipes.transform.TransformFunctionPipe; import com.tinkerpop.pipes.util.Pipeline; import edu.mayo.bior.cli.cmd.Cmds; import edu.mayo.bior.cli.cmd.SNPEffCommand; import edu.mayo.bior.pipeline.SNPEff.SNPEFFPipeline; import edu.mayo.bior.pipeline.Treat.format.BgiFormatter; import edu.mayo.bior.pipeline.Treat.format.CosmicFormatter; import edu.mayo.bior.pipeline.Treat.format.DbsnpClinvarFormatter; import edu.mayo.bior.pipeline.Treat.format.DbsnpFormatter; import edu.mayo.bior.pipeline.Treat.format.EspFormatter; import edu.mayo.bior.pipeline.Treat.format.Formatter; import edu.mayo.bior.pipeline.Treat.format.FormatterPipeFunction; import edu.mayo.bior.pipeline.Treat.format.HapmapFormatter; import edu.mayo.bior.pipeline.Treat.format.HgncFormatter; import edu.mayo.bior.pipeline.Treat.format.MirBaseFormatter; import edu.mayo.bior.pipeline.Treat.format.NcbiGeneFormatter; import edu.mayo.bior.pipeline.Treat.format.OmimFormatter; import edu.mayo.bior.pipeline.Treat.format.SNPEffFormatter; import edu.mayo.bior.pipeline.Treat.format.ThousandGenomesFormatter; import edu.mayo.bior.pipeline.Treat.format.UcscBlacklistedFormatter; import edu.mayo.bior.pipeline.Treat.format.UcscConservationFormatter; import edu.mayo.bior.pipeline.Treat.format.UcscEnhancerFormatter; import edu.mayo.bior.pipeline.Treat.format.UcscRegulationFormatter; import edu.mayo.bior.pipeline.Treat.format.UcscRepeatFormatter; import edu.mayo.bior.pipeline.Treat.format.UcscTfbsFormatter; import edu.mayo.bior.pipeline.Treat.format.UcscTssFormatter; import edu.mayo.bior.pipeline.Treat.format.UcscUniqueFormatter; import edu.mayo.bior.pipeline.Treat.format.VEPFormatter; import edu.mayo.bior.pipeline.Treat.format.VEPHgncFormatter; import edu.mayo.bior.pipeline.VEP.VEPPipeline; import edu.mayo.bior.util.BiorProperties; import edu.mayo.bior.util.BiorProperties.Key; import edu.mayo.bior.util.DependancyUtil; import edu.mayo.exec.AbnormalExitException; import edu.mayo.pipes.JSON.DrillPipe; import edu.mayo.pipes.JSON.lookup.LookupPipe; import edu.mayo.pipes.JSON.tabix.OverlapPipe; import edu.mayo.pipes.JSON.tabix.SameVariantPipe; import edu.mayo.pipes.bioinformatics.VCF2VariantPipe; import edu.mayo.pipes.history.CompressPipe; import edu.mayo.pipes.history.History; import edu.mayo.pipes.string.TrimSpacesPipe; import edu.mayo.pipes.util.FieldSpecification; import edu.mayo.pipes.util.FieldSpecification.FieldDirection; import edu.mayo.pipes.util.metadata.Metadata; /** * BioR implementation of TREAT annotation module. * * @author Greg Dougherty, duffp, Mike Meiners * */ public class TreatPipelineSingleThread extends Pipeline<History, History> { private List<String> mConfigColumnsToOutput; private TreatUtils mUtils; // Metadata lines that would be generated by the JSON columns. // These will NOT be in the output, but we need to generate the appropriate columns from these metadata lines private List<Metadata> mMetadataToAdd = new ArrayList<Metadata>(); private List<String> mCatalogForColumn = new ArrayList<String>(); /** * Constructor * * @throws IOException * @throws AbnormalExitException * @throws TimeoutException * @throws BrokenBarrierException * @throws InterruptedException * @throws URISyntaxException */ public TreatPipelineSingleThread() throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException, URISyntaxException { this(null); } public TreatPipelineSingleThread(String configFilePath) throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException, URISyntaxException { mUtils = new TreatUtils(); mConfigColumnsToOutput = mUtils.loadConfig(configFilePath); mUtils.validateConfigFileColumns(mConfigColumnsToOutput); initPipes(); } /** * Initializes what pipes will be used for this pipeline. * * @throws IOException * @throws AbnormalExitException * @throws TimeoutException * @throws BrokenBarrierException * @throws InterruptedException * @throws URISyntaxException */ private void initPipes() throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException, URISyntaxException { List<Pipe> pipeList = new ArrayList<Pipe>(); // tracks the order of the added JSON columns List<JsonColumn> order = new ArrayList<JsonColumn>(); addTrim(pipeList); addVcfToTjson( order, pipeList); addVep( order, pipeList); addVepHgnc( order, pipeList); addSnpEff( order, pipeList); addSameVariant(order, pipeList, new DbsnpFormatter(), Key.dbsnpFile, JsonColumn.DBSNP_ALL); addSameVariant(order, pipeList, new DbsnpClinvarFormatter(), Key.dbsnpClinvarFile, JsonColumn.DBSNP_CLINVAR); addSameVariant(order, pipeList, new CosmicFormatter(), Key.cosmicFile, JsonColumn.COSMIC); addOverlap( order, pipeList, new UcscBlacklistedFormatter(), Key.blacklistedFile, JsonColumn.UCSC_BLACKLISTED); addOverlap( order, pipeList, new UcscConservationFormatter(),Key.conservationFile, JsonColumn.UCSC_CONSERVATION); addOverlap( order, pipeList, new UcscEnhancerFormatter(), Key.enhancerFile, JsonColumn.UCSC_ENHANCER); addOverlap( order, pipeList, new UcscTfbsFormatter(), Key.tfbsFile, JsonColumn.UCSC_TFBS); addOverlap( order, pipeList, new UcscTssFormatter(), Key.tssFile, JsonColumn.UCSC_TSS); addOverlap( order, pipeList, new UcscUniqueFormatter(), Key.uniqueFile, JsonColumn.UCSC_UNIQUE); addOverlap( order, pipeList, new UcscRepeatFormatter(), Key.repeatFile, JsonColumn.UCSC_REPEAT); addOverlap( order, pipeList, new UcscRegulationFormatter(), Key.regulationFile, JsonColumn.UCSC_REGULATION); addOverlap( order, pipeList, new MirBaseFormatter(), Key.mirBaseFile, JsonColumn.MIRBASE); // allele frequency annotation addSameVariant(order, pipeList, new BgiFormatter(), Key.bgiFile, JsonColumn.BGI); addSameVariant(order, pipeList, new EspFormatter(), Key.espFile, JsonColumn.ESP); addSameVariant(order, pipeList, new HapmapFormatter(), Key.hapMapFile, JsonColumn.HAPMAP); addSameVariant(order, pipeList, new ThousandGenomesFormatter(), Key.kGenomeFile, JsonColumn.THOUSAND_GENOMES); // annotation requiring walking X-REFs addOverlap( order, pipeList, new NcbiGeneFormatter(), Key.genesFile, JsonColumn.NCBI_GENE); // Drill to add Entrez GeneID X-REF addLookup( order, pipeList, new HgncFormatter(), Key.hgncFile, JsonColumn.HGNC, Key.hgncIndexFile, "GeneID", "Entrez_Gene_ID"); // Drill to add OMIM ID X-REF addLookup( order, pipeList, new OmimFormatter(), Key.omimFile, JsonColumn.OMIM, Key.omimIndexFile, "mapped_OMIM_ID", "MIM_Number"); FormatterPipeFunction formatterPipe = new FormatterPipeFunction(order, mConfigColumnsToOutput); /* transform JSON cols into final output */ pipeList.add(new TransformFunctionPipe(formatterPipe)); // NOTE: Don't need a metadata object for the compress pipe since annotate always does a compress, // and this is taken care of when constructing the annotate metadata line. mMetadataToAdd = formatterPipe.getMetadataForUserColumns(mCatalogForColumn); /* specify final output cols to compress */ FieldSpecification fSpec = new FieldSpecification(formatterPipe.getColumnsAdded().size() + "-", FieldDirection.RIGHT_TO_LEFT); /* compress to have 1-to-1 */ pipeList.add(new CompressPipe(Cmds.Names.bior_compress.toString(), fSpec, "|", "\\|", true)); this.setPipes(pipeList); } private void addTrim(List<Pipe> pipeList) { // Don't use the STDBUF option on the first command as this will cause an exception pipeList.add(new TrimSpacesPipe()); } private void addVcfToTjson(List<JsonColumn> order, List<Pipe> pipeList) { // 1ST JSON column is the original variant order.add(JsonColumn.VARIANT); // Add "null" since this column is not used directly by formatters mCatalogForColumn.add(null); // Don't use the STDBUF option on the first command as this will cause an exception pipeList.add(new VCF2VariantPipe()); } private void addSameVariant(List<JsonColumn> order, List<Pipe> pipeList, Formatter formatter, BiorProperties.Key catalogKey, JsonColumn jsonColName) throws IOException { if(! mUtils.isNeedPipe(formatter)) return; mUtils.throwErrorIfMissing(catalogKey, TreatUtils.FileType.catalog); order.add(jsonColName); mCatalogForColumn.add(mUtils.getFile(catalogKey)); // Using 1-order.size because we don't know how many columns the user passed in. // We want to reference the vcf2variant column, but it is easier to reference it from the end pipeList.add(new SameVariantPipe(mUtils.getFile(catalogKey), 1-order.size())); } private void addOverlap(List<JsonColumn> order, List<Pipe> pipeList, Formatter formatter, BiorProperties.Key catalogKey, JsonColumn jsonColName) throws IOException { if( ! mUtils.isNeedPipe(formatter) ) return; mUtils.throwErrorIfMissing(catalogKey, TreatUtils.FileType.catalog); order.add(jsonColName); mCatalogForColumn.add(mUtils.getFile(catalogKey)); // Using 1-order.size because we don't know how many columns the user passed in. // We want to reference the vcf2variant column, but it is easier to reference it from the end pipeList.add(new OverlapPipe(mUtils.getFile(catalogKey), 1-order.size())); } private void addLookup(List<JsonColumn> order, List<Pipe> pipeList, Formatter formatter, BiorProperties.Key catalogKey, JsonColumn jsonColName, BiorProperties.Key indexKey, String jsonPathToDrill, String jsonPathToLookup ) throws FileNotFoundException { if( ! mUtils.isNeedPipe(formatter) ) return; mUtils.throwErrorIfMissing(catalogKey, TreatUtils.FileType.catalog); mUtils.throwErrorIfMissing(indexKey, TreatUtils.FileType.index); // Drill column, and keep the JSON // (note: this requires that we insert at end minus one position since the JSON stays on the end) // Ignore drilled column, since it is the JSON we lookup that we care about order.add(order.size()-1, JsonColumn.IGNORE); // Add "null" since this column is not used directly by formatters // NOTE: drill swaps last 2 columns, so we need to make the second to last column null mCatalogForColumn.add(mCatalogForColumn.size()-1, null); pipeList.add(new DrillPipe(true, new String[] {jsonPathToDrill})); // Now add the lookup column, which will be JSON // NOTE: Json path to lookup will likely be different from the path to drill order.add(jsonColName); mCatalogForColumn.add(mUtils.getFile(catalogKey)); pipeList.add(new LookupPipe(mUtils.getFile(catalogKey), mUtils.getFile(indexKey), -2)); } private void addSnpEff(List<JsonColumn> order, List<Pipe> pipeList) throws IOException, URISyntaxException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException { // Since SNPEff takes a long time to load, AND that load is in the constructor, let's check if we need it first before calling the constructor if(! mUtils.isNeedPipe(new SNPEffFormatter()) ) return; //check to see if it is even installed, if not bail! if(DependancyUtil.isSNPEffInstalled()){ System.err.println("SNPEFF is requested, bior is starting it up, this will take about 1 min."); order.add(JsonColumn.SNPEFF); mCatalogForColumn.add("/tools/snpeff"); //pipeList.add(new SNPEFFPipeline (new String[]{SNPEffCommand.DEFAULT_GENOME_VERSION}, true)); pipeList.add(new SNPEFFPipeline (new String[0], true)); } else { // Show warning: System.err.println("Warning: SnpEffect is listed as a required field, but is not installed. Running without it..."); } } private void addVepHgnc(List<JsonColumn> order, List<Pipe> pipeList) throws FileNotFoundException, IOException { // Since the drill and cut are for HGNC lookup, we must check if HGNC is needed before we perform the drill and cut VEPHgncFormatter vepHgncFormatter = new VEPHgncFormatter(); if( ! mUtils.isNeedPipe(vepHgncFormatter) ) return; if(DependancyUtil.isVEPInstalled()){ mUtils.throwErrorIfMissing(Key.hgncEnsemblGeneIndexFile, TreatUtils.FileType.index); addLookup(order, pipeList, vepHgncFormatter, Key.hgncFile, JsonColumn.VEP_HGNC, Key.hgncEnsemblGeneIndexFile, "Gene", "Ensembl_Gene_ID"); } else { // Show warning: System.err.println("Warning: VEP is listed as a required field (HGNC fields are dependent on it), but is not installed. Running without it..."); } } private void addVep(List<JsonColumn> order, List<Pipe> pipeList) throws IOException, URISyntaxException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException { if( ! mUtils.isNeedPipe(new VEPFormatter())) return; //if vep is not installed and they try to use it, then we need to bail! if(DependancyUtil.isVEPInstalled()){ order.add(JsonColumn.VEP); mCatalogForColumn.add("/tools/vep"); pipeList.add(new VEPPipeline (new String[0], true)); } else { // Show warning: System.err.println("Warning: VEP is listed as a required field, but is not installed. Running without it..."); } } /** Use this after calling the constructor to get the metadata * for ONLY the columns the user wants in the end, * that will be used for HistoryInPipe */ public List<Metadata> getMetadata() { return mMetadataToAdd; } }