package edu.mayo.bior.pipeline.Treat;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.TimeoutException;
import org.apache.log4j.Logger;
import com.tinkerpop.pipes.transform.TransformFunctionPipe;
import com.tinkerpop.pipes.util.Pipeline;
import edu.mayo.bior.cli.cmd.Cmds;
import edu.mayo.bior.pipeline.Treat.format.BgiFormatter;
import edu.mayo.bior.pipeline.Treat.format.CosmicFormatter;
import edu.mayo.bior.pipeline.Treat.format.DbsnpClinvarFormatter;
import edu.mayo.bior.pipeline.Treat.format.DbsnpFormatter;
import edu.mayo.bior.pipeline.Treat.format.EspFormatter;
import edu.mayo.bior.pipeline.Treat.format.Formatter;
import edu.mayo.bior.pipeline.Treat.format.FormatterPipeFunction;
import edu.mayo.bior.pipeline.Treat.format.HapmapFormatter;
import edu.mayo.bior.pipeline.Treat.format.HgncFormatter;
import edu.mayo.bior.pipeline.Treat.format.MirBaseFormatter;
import edu.mayo.bior.pipeline.Treat.format.NcbiGeneFormatter;
import edu.mayo.bior.pipeline.Treat.format.OmimFormatter;
import edu.mayo.bior.pipeline.Treat.format.SNPEffFormatter;
import edu.mayo.bior.pipeline.Treat.format.ThousandGenomesFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscBlacklistedFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscConservationFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscEnhancerFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscRegulationFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscRepeatFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscTfbsFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscTssFormatter;
import edu.mayo.bior.pipeline.Treat.format.UcscUniqueFormatter;
import edu.mayo.bior.pipeline.Treat.format.VEPFormatter;
import edu.mayo.bior.pipeline.Treat.format.VEPHgncFormatter;
import edu.mayo.bior.util.BiorProperties;
import edu.mayo.bior.util.BiorProperties.Key;
import edu.mayo.bior.util.DependancyUtil;
import edu.mayo.exec.AbnormalExitException;
import edu.mayo.pipes.history.CompressPipe;
import edu.mayo.pipes.history.History;
import edu.mayo.pipes.util.FieldSpecification;
import edu.mayo.pipes.util.FieldSpecification.FieldDirection;
import edu.mayo.pipes.util.metadata.Metadata;
/**
* BioR implementation of TREAT annotation module.
*
* @author Greg Dougherty, duffp, Mike Meiners
*
*/
public class TreatPipelineMultiCmd extends Pipeline<History, History>
{
private List<String> mConfigColumnsToOutput;
private TreatUtils mUtils;
private static Logger sLogger = Logger.getLogger(TreatPipelineMultiCmd.class);
private String mBiorLiteHome = "";
private String mBiorCmdDir = "";
// Metadata lines that would be generated by the JSON columns.
// These will NOT be in the output, but we need to generate the appropriate columns from these metadata lines
private List<Metadata> mMetadataToAdd = new ArrayList<Metadata>();
private List<String> mCatalogForColumn = new ArrayList<String>();
/**
* Constructor
*
* @throws IOException
* @throws AbnormalExitException
* @throws TimeoutException
* @throws BrokenBarrierException
* @throws InterruptedException
* @throws URISyntaxException
*/
public TreatPipelineMultiCmd() throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException, URISyntaxException {
this(null);
}
/**
* @param configFilePath
* @throws IOException
* @throws InterruptedException
* @throws BrokenBarrierException
* @throws TimeoutException
* @throws AbnormalExitException
* @throws URISyntaxException
*/
public TreatPipelineMultiCmd(String configFilePath) throws IOException, InterruptedException, BrokenBarrierException,
TimeoutException, AbnormalExitException, URISyntaxException
{
mUtils = new TreatUtils();
mConfigColumnsToOutput = mUtils.loadConfig(configFilePath);
mUtils.validateConfigFileColumns(mConfigColumnsToOutput);
initPipes();
}
private String generatedCommand = "";
/**
* Get the full command that will be used for bior_annotate
* @return The generated command
*/
public String getGeneratedCommand(){
return generatedCommand;
}
/**
* Initializes what pipes will be used for this pipeline.
* NOTE: The reason we construct a string of unix-style pipes instead of using the Java pipes themselves is that
* in Java we do not have the pipes setup for multi-threading, which the unix-style command would do for us.
* This can more than double the speed of the bior_annotate command!
*
* @throws IOException
* @throws AbnormalExitException
* @throws TimeoutException
* @throws BrokenBarrierException
* @throws InterruptedException
* @throws URISyntaxException
*/
@SuppressWarnings ({"rawtypes", "unchecked"})
private void initPipes() throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException, URISyntaxException
{
// tracks the order of the added JSON columns
List<JsonColumn> order = new ArrayList<JsonColumn>();
List<String> pipeList = new ArrayList<String>();
setBiorLiteCmdDir();
addTrim(pipeList);
addVcfToTjson( order, pipeList);
addVep( order, pipeList);
addVepHgnc( order, pipeList);
addSnpEff( order, pipeList);
addSameVariant(order, pipeList, new DbsnpFormatter(), Key.dbsnpFile, JsonColumn.DBSNP_ALL);
addSameVariant(order, pipeList, new DbsnpClinvarFormatter(), Key.dbsnpClinvarFile, JsonColumn.DBSNP_CLINVAR);
addSameVariant(order, pipeList, new CosmicFormatter(), Key.cosmicFile, JsonColumn.COSMIC);
addOverlap( order, pipeList, new UcscBlacklistedFormatter(), Key.blacklistedFile, JsonColumn.UCSC_BLACKLISTED);
addOverlap( order, pipeList, new UcscConservationFormatter(),Key.conservationFile, JsonColumn.UCSC_CONSERVATION);
addOverlap( order, pipeList, new UcscEnhancerFormatter(), Key.enhancerFile, JsonColumn.UCSC_ENHANCER);
addOverlap( order, pipeList, new UcscTfbsFormatter(), Key.tfbsFile, JsonColumn.UCSC_TFBS);
addOverlap( order, pipeList, new UcscTssFormatter(), Key.tssFile, JsonColumn.UCSC_TSS);
addOverlap( order, pipeList, new UcscUniqueFormatter(), Key.uniqueFile, JsonColumn.UCSC_UNIQUE);
addOverlap( order, pipeList, new UcscRepeatFormatter(), Key.repeatFile, JsonColumn.UCSC_REPEAT);
addOverlap( order, pipeList, new UcscRegulationFormatter(), Key.regulationFile, JsonColumn.UCSC_REGULATION);
addOverlap( order, pipeList, new MirBaseFormatter(), Key.mirBaseFile, JsonColumn.MIRBASE);
// allele frequency annotation
addSameVariant(order, pipeList, new BgiFormatter(), Key.bgiFile, JsonColumn.BGI);
addSameVariant(order, pipeList, new EspFormatter(), Key.espFile, JsonColumn.ESP);
addSameVariant(order, pipeList, new HapmapFormatter(), Key.hapMapFile, JsonColumn.HAPMAP);
addSameVariant(order, pipeList, new ThousandGenomesFormatter(), Key.kGenomeFile, JsonColumn.THOUSAND_GENOMES);
// annotation requiring walking X-REFs
addOverlap( order, pipeList, new NcbiGeneFormatter(), Key.genesFile, JsonColumn.NCBI_GENE);
// Drill to add Entrez GeneID X-REF
addLookup( order, pipeList, new HgncFormatter(), Key.hgncFile, JsonColumn.HGNC, Key.hgncIndexFile, "GeneID", "Entrez_Gene_ID");
// Drill to add OMIM ID X-REF
addLookup( order, pipeList, new OmimFormatter(), Key.omimFile, JsonColumn.OMIM, Key.omimIndexFile, "mapped_OMIM_ID", "MIM_Number");
// The many commands should be one string / one call
String pipesAsStr = pipeAsString(pipeList);
sLogger.info("bior_annotate pipeline long cmd: " + pipesAsStr);
//System.err.println(pipesAsStr);
this.generatedCommand = pipesAsStr;
if(this.generatedCommand.contains("bior_snpeff")){
System.err.println("SNPEFF is requested, bior is starting it up, this will take about 1 min.");
}
Map<String,String> envVars = new HashMap<String,String>();
sLogger.info("BIOR_LITE_HOME: " + mBiorLiteHome);
envVars.put("BIOR_LITE_HOME", mBiorLiteHome);
// Transform JSON cols into final output
FormatterPipeFunction formatterPipe = new FormatterPipeFunction(order, mConfigColumnsToOutput);
// NOTE: Don't need a metadata object for the compress pipe since annotate always does a compress,
// and this is taken care of when constructing the annotate metadata line.
mMetadataToAdd = formatterPipe.getMetadataForUserColumns(mCatalogForColumn);
// specify final output cols to compress - compress to have 1-to-1 variants match
FieldSpecification fSpec = new FieldSpecification(formatterPipe.getColumnsAdded().size() + "-", FieldDirection.RIGHT_TO_LEFT);
this.setPipes( new Pipeline(
new AnnotateEXE(new String[] { "/bin/sh", "-c", pipesAsStr }, envVars, mUtils.getMaxLinesInFlight(), mUtils.getTimeout(), order.size(), mUtils.getMaxAlts()),
new TransformFunctionPipe(formatterPipe),
//new PrintPipe(),
new CompressPipe(Cmds.Names.bior_compress.toString(), fSpec, "|", "\\|", true)
).getPipes() );
}
private void addTrim(List<String> pipeList) {
// Don't use the STDBUF option on the first command as this will cause an exception
String cmd = mBiorCmdDir + Cmds.Names.bior_trim_spaces + logFlag();
pipeList.add(cmd);
}
private void addVcfToTjson(List<JsonColumn> order, List<String> pipeList) {
// 1ST JSON column is the original variant
order.add(JsonColumn.VARIANT);
// Add "null" since this column is not used directly by formatters
mCatalogForColumn.add(null);
// Don't use the STDBUF option on the first command as this will cause an exception
String cmd = mBiorCmdDir + Cmds.Names.bior_vcf_to_tjson + logFlag();
pipeList.add(cmd);
}
private void addSameVariant(List<JsonColumn> order, List<String> pipeList,
Formatter formatter, BiorProperties.Key catalogKey, JsonColumn jsonColName) throws FileNotFoundException
{
if(! mUtils.isNeedPipe(formatter))
return;
mUtils.throwErrorIfMissing(catalogKey, TreatUtils.FileType.catalog);
order.add(jsonColName);
mCatalogForColumn.add(mUtils.getFile(catalogKey));
// Using 1-order.size because we don't know how many columns the user passed in.
// We want to reference the vcf2variant column, but it is easier to reference it from the end
String cmd = mBiorCmdDir + Cmds.Names.bior_same_variant + " -d " + mUtils.getFile(catalogKey) + " -c " + (1-order.size()) + logFlag();
pipeList.add(cmd);
}
private void addOverlap(List<JsonColumn> order, List<String> pipeList,
Formatter formatter, BiorProperties.Key catalogKey, JsonColumn jsonColName) throws FileNotFoundException
{
if( ! mUtils.isNeedPipe(formatter) )
return;
mUtils.throwErrorIfMissing(catalogKey, TreatUtils.FileType.catalog);
order.add(jsonColName);
mCatalogForColumn.add(mUtils.getFile(catalogKey));
// Using 1-order.size because we don't know how many columns the user passed in.
// We want to reference the vcf2variant column, but it is easier to reference it from the end
String cmd = mBiorCmdDir + Cmds.Names.bior_overlap + " -d " + mUtils.getFile(catalogKey) + " -c " + (1-order.size()) + logFlag();
pipeList.add(cmd);
}
private void addLookup(List<JsonColumn> order, List<String> pipeList, Formatter formatter,
BiorProperties.Key catalogKey, JsonColumn jsonColName, BiorProperties.Key indexKey,
String jsonPathToDrill, String jsonPathToLookup ) throws FileNotFoundException
{
if( ! mUtils.isNeedPipe(formatter) )
return;
mUtils.throwErrorIfMissing(catalogKey, TreatUtils.FileType.catalog);
mUtils.throwErrorIfMissing(indexKey, TreatUtils.FileType.index);
// Drill column, and keep the JSON
// (note: this requires that we insert at end minus one position since the JSON stays on the end)
// Ignore drilled column, since it is the JSON we lookup that we care about
order.add(order.size()-1, JsonColumn.IGNORE);
// Add "null" since this column is not used directly by formatters
// NOTE: drill swaps last 2 columns, so we need to make the second to last column null
mCatalogForColumn.add(mCatalogForColumn.size()-1, null);
String drillCmd = mBiorCmdDir + Cmds.Names.bior_drill + " -p " + jsonPathToDrill + " -k" + logFlag();
pipeList.add(drillCmd);
// Now add the lookup column, which will be JSON
// NOTE: Json path to lookup will likely be different from the path to drill
order.add(jsonColName);
mCatalogForColumn.add(mUtils.getFile(catalogKey));
String lookupCmd = mBiorCmdDir + Cmds.Names.bior_lookup + " -d " + mUtils.getFile(catalogKey)
+ " -i " + mUtils.getFile(indexKey) + " -p " + jsonPathToLookup + " -c -2" + logFlag();
pipeList.add(lookupCmd);
}
private void addSnpEff(List<JsonColumn> order, List<String> pipeList) throws IOException, URISyntaxException {
// Since SNPEff takes a long time to load, AND that load is in the constructor, let's check if we need it first before calling the constructor
if(! mUtils.isNeedPipe(new SNPEffFormatter()) )
return;
//check to see if it is even installed, if not bail!
if(DependancyUtil.isSNPEffInstalled()){
order.add(JsonColumn.SNPEFF);
mCatalogForColumn.add("/tools/snpeff");
String cmd = mBiorCmdDir + Cmds.Names.bior_snpeff + logFlag();
pipeList.add(cmd);
} else { // Show warning:
System.err.println("Warning: SnpEffect is listed as a required field, but is not installed. Running without it...");
}
}
private void addVepHgnc(List<JsonColumn> order, List<String> pipeList) throws FileNotFoundException, IOException {
// Since the drill and cut are for HGNC lookup, we must check if HGNC is needed before we perform the drill and cut
VEPHgncFormatter vepHgncFormatter = new VEPHgncFormatter();
if( ! mUtils.isNeedPipe(vepHgncFormatter) )
return;
if(DependancyUtil.isVEPInstalled()){
addLookup(order, pipeList, vepHgncFormatter, Key.hgncFile, JsonColumn.VEP_HGNC,
Key.hgncEnsemblGeneIndexFile, "Gene", "Ensembl_Gene_ID");
mUtils.throwErrorIfMissing(Key.hgncEnsemblGeneIndexFile, TreatUtils.FileType.index);
} else { // Show warning:
System.err.println("Warning: VEP is listed as a required field (HGNC fields are dependent on it), but is not installed. Running without it...");
}
}
private void addVep(List<JsonColumn> order, List<String> pipeList) throws IOException, URISyntaxException {
if( ! mUtils.isNeedPipe(new VEPFormatter()))
return;
//if vep is not installed and they try to use it, then we need to bail!
if(DependancyUtil.isVEPInstalled()){
order.add(JsonColumn.VEP);
mCatalogForColumn.add("/tools/vep");
String cmd = mBiorCmdDir + Cmds.Names.bior_vep + logFlag();
pipeList.add(cmd);
} else { // Show warning:
System.err.println("Warning: VEP is listed as a required field, but is not installed. Running without it...");
}
}
/** Given a list of separate pipe commands, build a string with all of them piped together using "|"
* Ex: "bior_vcf_to_json | bior_vep | bior_drill ...." */
private String pipeAsString(List<String> pipeCmds) {
StringBuilder bigPipe = new StringBuilder();
for(int i=0; i < pipeCmds.size(); i++) {
bigPipe.append(pipeCmds.get(i));
if( i < pipeCmds.size()-1 )
bigPipe.append(" | ");
}
return bigPipe.toString();
}
//==========================================================================
private String logFlag() {
boolean isLogOn = sLogger.isDebugEnabled() || sLogger.isInfoEnabled();
sLogger.info("If logging is on, then set it for all bior_annotate sub-commands. Is logging on? " + isLogOn);
return isLogOn ? " -l" : "";
}
private String setBiorLiteCmdDir() throws IOException {
mBiorLiteHome = System.getenv().get("BIOR_LITE_HOME");
// If not given, then auto-detect inside maven target folder
if( mBiorLiteHome == null || mBiorLiteHome.trim().length() == 0 ) {
File targetFolder = new File("target");
for (File f: targetFolder.listFiles()) {
if (f.isDirectory() && (f.getName().startsWith("bior_pipeline"))) {
mBiorLiteHome = f.getCanonicalPath();
break;
}
}
}
mBiorCmdDir = mBiorLiteHome + "/bin/";
if( ! new File(mBiorCmdDir).exists() )
throw new IOException("Could not find the directory containing the BioR commands!");
return mBiorCmdDir;
}
/** Use this after calling the constructor to get the metadata
* for ONLY the columns the user wants in the end,
* that will be used for HistoryInPipe
* @return List of {@link Metadata}
*/
public List<Metadata> getMetadata() {
return mMetadataToAdd;
}
}