/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package edu.mayo.bior.pipeline.VEP;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import com.tinkerpop.pipes.PipeFunction;
import edu.mayo.bior.util.BiorProperties;
import edu.mayo.bior.util.BiorProperties.Key;
import edu.mayo.exec.AbnormalExitException;
import edu.mayo.exec.UnixStreamCommand;
/**
* @author m102417
*/
public class VEPEXE implements PipeFunction<String,String>{
private static final Logger sLogger = Logger.getLogger(VEPEXE.class);
private UnixStreamCommand mVep;
// NOTE: A buffer size of "1" appears to be required when using streaming thru Java classes
// else the call will hang.
// (though when used separately on just the command line, 20-50 is most efficient)
private static final String VEP_BUFFER_SIZE = "1";
private static final int VCF_REF_COL = 3;
private static final int VCF_ALT_COL = 4;
// 10 second timeout for VEP writing response to STDOUT
private static final long RECEIVE_TIMEOUT = 10;
private int mLinesSent = 0;
public VEPEXE(String[] vepCmd) throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException {
final Map<String, String> NO_CUSTOM_ENV = Collections.emptyMap();
mVep = new UnixStreamCommand(getVEPCommand(vepCmd), NO_CUSTOM_ENV, true, true);
//-------------------------------------------------------------------------------
// NOTE: VEP will only work on 64-bit systems, when the "forcelinebuffering"
// flag at the end of this line is set to "true"
// To get it to work on 32-bit systems, change this flag to "false"
// HOWEVER, this should be tested thoroughly before putting it into production!
//-------------------------------------------------------------------------------
//mVep = new UnixStreamCommand(getVEPCommand(vepCmd), NO_CUSTOM_ENV, true, false);
mVep.launch();
mVep.send("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
//send some fake data to get the ball rolling...
mVep.send("chr1\t1588717\trs009\tG\tA\t0.0\t.\t.");
//and get out all of the header lines... dump them to /dev/null
mVep.receive();//#fileformat=VCFv4.0
mVep.receive();//##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|HGNC|DISTANCE|SIFT|PolyPhen|CELL_TYPE">
mVep.receive();//#CHROM POS ID REF ALT QUAL FILTER INFO
}
public VEPEXE() throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException{
this(getVEPCommand(null));
}
public static String[] getVEPCommand(String[] userOptions) throws IOException {
// See src/main/resources/bior.properties for example file to put into your user home directory
BiorProperties biorProps = new BiorProperties();
//VEP_COMMAND="$BIOR_VEP_PERL_HOME/perl $BIOR_VEP_HOME/variant_effect_predictor.pl -i /dev/stdin -o STDOUT -dir $BIOR_VEP_HOME/cache/ -vcf --hgnc -polyphen b -sift b --offline --buffer_size $VEP_BUFFER_SIZE"
// NOTE: Need to type-cast the Arrays.asList() to an ArrayList, otherwise the list will NOT be modifiable!!!
List<String> command = new ArrayList<String>(Arrays.asList(
// On Dan's Mac, first part of cmd: "/usr/bin/perl"
biorProps.get(Key.BiorVepPerl),
biorProps.get(Key.BiorVep),
"-i",
"/dev/stdin",
"-o",
"STDOUT",
"-dir",
biorProps.get(Key.BiorVepCache),
"-vcf",
"--hgnc",
"-polyphen",
"b",
"-sift",
"b",
"--offline",
"--buffer_size",
VEP_BUFFER_SIZE
));
if( userOptions != null )
command.addAll(Arrays.asList(userOptions));
command.addAll(getMacFlagsIfNecessary());
return command.toArray(new String[command.size()]);
}
private static List<String> getMacFlagsIfNecessary() {
String os = System.getProperty("os.name");
List<String> macFlags = new ArrayList<String>();
if (os.equals("Mac OS X"))
{
// MAC ONLY
// @see https://github.com/arq5x/gemini/blob/master/docs/content/functional_annotation.rst
//
// To use the cache, the gzip and zcat utilities are required. VEP uses zcat to
// decompress cached files. For systems where zcat may not be installed or may
// not work, the following option needs to be added along with the --cache option:
//
// "--compress gunzip -c"
//
macFlags.add("--compress");
macFlags.add("gunzip -c");
sLogger.info(String.format("Running on %s. Adding Mac-specific options.", os));
}
return macFlags;
}
public String compute(String vcfLine)
{
//sLogger.info("VEP in: " + vcfLine);
// For cases where VEP will not send a response to STDOUT,
// VEP will be bypassed
if (bypass(vcfLine))
{
sLogger.warn(String.format("WARNING!!!!! Line cannot be processed by VEP!!!!!! bypassing VCF line: %s", vcfLine));
return getFakeResponse(vcfLine);
}
try
{
sLogger.info("VEP - sending line #" + ++mLinesSent + ": " + firstXchars(vcfLine,100));
mVep.send(vcfLine);
sLogger.info("VEP - Line sent.");
try
{
sLogger.info("VEP - Receiving line...");
String result = mVep.receive(RECEIVE_TIMEOUT, TimeUnit.SECONDS);
String[] vepParts = result.split("\t");
sLogger.info("VEP - received line: " + firstXchars(result, 100) + "\t....\t" + vepParts[vepParts.length-1]);
return result;
}
catch (TimeoutException te)
{
sLogger.warn(String.format("Timeout of %s seconds reached for VCF line: %s",RECEIVE_TIMEOUT, vcfLine));
return getFakeResponse(vcfLine);
}
}
catch( RuntimeException runtimeExc)
{
terminate();
// Rethrow any runtime exceptions
throw runtimeExc;
}
catch (Exception ex)
{
terminate();
sLogger.error(ex);
}
// If we make it hear, then throw a NoSuchElementException
// However, since this is not a normal pipe, it may not reach this point
throw new NoSuchElementException();
}
private String firstXchars(String str, int charLimit) {
if(str == null || str.length() < charLimit )
return str;
return str.substring(0,charLimit);
}
/**
* Gets a "fake" response from VEP that is useful for when VEP is
* either bypassed entirely or VEP failed to send a response.
*
* @param vcfLine
* @return
*/
private String getFakeResponse(String vcfLine)
{
// return with blank CSQ field in INFO column
return vcfLine + "\tCSQ=VEPERR";
}
public void terminate() {
try {
this.mVep.terminate();
} catch(Exception e) {
sLogger.error("Error terminating VEPEXE pipe" + e);
}
}
/**
* Checks for cases where VEP will <b>not</b> send a response to STDOUT.
* These cases should be avoided as the {@link UnixStreamCommand#receive()}
* will hang indefinitely.
* <p/>
* The following cases cause VEP to not return a response and are checked:
* <ul>
* <li>absent (e.g. NULL) value for ALT column</li>
* <li>1 or more whitespace characters for ALT column</li>
* <li>"." character for ALT column</li>
* <li>ALT and REF columns match</li>
* </ul>
*
* @param line
* A single data line from a VCF file.
* @return
* True if VEP should be bypassed. False otherwise.
*/
private boolean bypass(String line)
{
String[] cols = line.split("\t");
// make sure we can access the ALT column
if (cols.length < (VCF_ALT_COL + 1))
{
return true;
}
final String ref = cols[VCF_REF_COL].trim();
final String alt = cols[VCF_ALT_COL].trim();
if (
// NULL or whitespace
(alt.length() == 0) ||
// dot
alt.equals(".") ||
// REF and ALT are same
alt.equals(ref)
)
{
return true;
}
else
{
return false;
}
}
}