/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package edu.mayo.bior.pipeline.SNPEff;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.TimeoutException;
import org.apache.log4j.Logger;
import ca.mcgill.mcb.pcingola.fileIterator.NeedlemanWunsch;
import ca.mcgill.mcb.pcingola.interval.Chromosome;
import ca.mcgill.mcb.pcingola.interval.Genome;
import ca.mcgill.mcb.pcingola.interval.Marker;
import ca.mcgill.mcb.pcingola.interval.SeqChange.ChangeType;
import ca.mcgill.mcb.pcingola.util.Gpr;
import ca.mcgill.mcb.pcingola.vcf.VcfGenotype;
import com.tinkerpop.pipes.PipeFunction;
import edu.mayo.bior.util.BiorProperties;
import edu.mayo.bior.util.BiorProperties.Key;
import edu.mayo.exec.AbnormalExitException;
import edu.mayo.exec.UnixStreamCommand;
/**
* @author m102417
*/
public class SNPEFFEXE implements PipeFunction<String,String>{
private static final Logger sLog = Logger.getLogger(SNPEFFEXE.class);
private UnixStreamCommand mSnpeff;
private static int mCmdTimeout = 10; // This should be updated in a call to the getCmdTimeout() method
public SNPEFFEXE(String[] snpEffCmd) throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException {
final Map<String, String> NO_CUSTOM_ENV = Collections.emptyMap();
mSnpeff = new UnixStreamCommand(getSnpEffCommand(snpEffCmd), NO_CUSTOM_ENV, true, false);
mSnpeff.launch();
mSnpeff.send("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
//send some fake data to get the ball rolling...
mSnpeff.send("chr1\t1588717\trs009\tG\tA\t0.0\t.\t.");
//and get out all of the header lines... dump them to /dev/null
mSnpeff.receive();
mSnpeff.receive();
mSnpeff.receive();
mSnpeff.receive();
mSnpeff.receive();
}
public SNPEFFEXE() throws IOException, InterruptedException, BrokenBarrierException, TimeoutException, AbnormalExitException{
this(getSnpEffCommand(null));
}
//a back door constructor that lets us access methods for testing without starting SNPEFF
public SNPEFFEXE(boolean silent){
}
public static int getCmdTimeout(BiorProperties props) {
int cmdTimeoutInSeconds = props.getAsInt(Key.TimeoutCommand, 30);
sLog.info("TimeoutCommand (seconds) = " + cmdTimeoutInSeconds);
if( cmdTimeoutInSeconds < 10 || cmdTimeoutInSeconds > 300 )
sLog.warn("WARNING: TimeoutCommand is set to " + cmdTimeoutInSeconds + ". It should probably be between 10 and 300. Too short and it may timeout for a long line and crash the program unnecessarily. Too long and it may wait a long time for a hung command to finish.");
return cmdTimeoutInSeconds;
}
/** NOTE: Caller no longer has to include the genome build as this will be specified in the bior.properties file */
public static String[] getSnpEffCommand(String[] userCmd) throws IOException {
// See src/main/resources/bior.properties for example file to put into your user home directory
BiorProperties biorProps = new BiorProperties();
mCmdTimeout = getCmdTimeout(biorProps);
//Old: java -Xmx4g -jar /data/snpEff/snpEff_3_1/snpEff.jar eff -c /data/snpEff/snpEff_3_1/snpEff.config -v GRCh37.68 example.vcf > output.vcf
//Latest: java -Xmx4g -jar $SnpEffJar eff -c $SnpEffConfig -v -o vcf -noLog -noStats
// Split the command from the properties file on spaces, ignoring any multiple spaces
// NOTE: Spaces in the path for the jar and config file should be handled ok here since we are replacing variables with their full paths after the split
final String[] command = biorProps.get(Key.SnpEffCmd).split("\\s+");
// Replace the $SnpEffJar and $SnpEffConfig in the command list
replace(command, "$" + Key.SnpEffJar.toString(), biorProps.get(Key.SnpEffJar));
replace(command, "$" + Key.SnpEffConfig.toString(), biorProps.get(Key.SnpEffConfig));
if (userCmd != null) {
return concat(command,userCmd);
} else {
return command;
}
}
private static void replace(String[] command, String stringToRemove, String stringToInsert) {
for(int i=0; i < command.length; i++) {
if( command[i].contains(stringToRemove) )
command[i] = command[i].replace(stringToRemove, stringToInsert);
}
}
public static String[] concat(String[] A, String[] B) {
List<String> list = new ArrayList<String>();
list.addAll(Arrays.asList(A));
list.addAll(Arrays.asList(B));
return list.toArray(new String[list.size()]);
}
public String compute(String a) {
try {
sLog.info("SnpEff in (before checking if SNPEFF can handle it):\n" + firstXchars(a, 100) + "....");
String error = canCreateSeqChange(a);
if(error == null){
sLog.info("SnpEff sending...");
mSnpeff.send(a);
String result = mSnpeff.receive();
String[] resultParts = result.split("\t");
sLog.info("SnpEff out: \n" + firstXchars(result,100) + "\t....\t" + resultParts[resultParts.length-1]);
return result;
}else {
System.err.println("SnpEff could not process line: " + a);
System.err.println(" " + error);
sLog.warn("SnpEff could not process line: " + a);
sLog.warn(" " + error);
//log.info("SnpEff out: " + a + "\t" + error);
return a + "\t" + error;
}
} catch (Exception ex) {
terminate();
sLog.error("SNPEff failed at line:"+a + "\n" + ex);
// Rethrow any runtime exceptions
throw new RuntimeException("SNPEff failed at line:"+a + "\n" + ex);
}
}
private String firstXchars(String str, int charLimit) {
if(str == null || str.length() < charLimit )
return str;
return str.substring(0,charLimit);
}
public void terminate() {
try {
this.mSnpeff.terminate();
} catch(Exception e) {
sLog.error("Error terminating SNPEFFEXE pipe" + e);
}
}
////
// BELOW THIS LINE WE ARE PULLING FORWARD THE INFORMATION FROM SNPEFF
////
private static final long serialVersionUID = 4226374412681243433L;
private String line; // Line from VCF file
private int lineNum; // Line number
private String chromosomeName; // Original chromosome name
private String ref;
private String[] alts;
private Double quality;
private String filterPass;
private String infoStr = "";
private HashMap<String, String> info;
private String format;
private ArrayList<VcfGenotype> vcfGenotypes = null;
private ChangeType changeType;
private String genotypeFields[]; // Raw fields from VCF file
private String genotypeFieldsStr; // Raw fields from VCF file (one string, tab separated)
private Marker parent;
protected Genome genome = new Genome();
/**
* This method was
* Find chromosome 'chromoName'. If it does not exists and 'createChromos' is true, the chromosome is created
* @param chromoName
* @return
*/
public Chromosome getChromosome(String chromoName) {
//if (createChromos) return genome.getOrCreateChromosome(chromoName);
return genome.getChromosome(chromoName);
}
/**
* Create a list of seqChanges frmo this VcfEntry
* @return
*/
// public List<SeqChange> seqChanges() {
// LinkedList<SeqChange> list = new LinkedList<SeqChange>();
//
// // Coverage
// int coverage = Gpr.parseIntSafe(getInfo("DP"));
//
// // Is it heterozygous, homozygous or undefined?
// Boolean isHetero = calcHetero();
//
// // Create one SeqChange for each alt
// for (String alt : alts) {
// Chromosome chr = (Chromosome) parent;
// SeqChange seqChange = createSeqChange(chr, start, ref, alt, strand, id, getQuality(), coverage);
// seqChange.setHeterozygous(isHetero);
// list.add(seqChange);
// }
//
// return list;
// }
private int parsepos(String posStr){
return Gpr.parseIntSafe(posStr) - 1;
}
//takes a line of VCF input and returns if the line can be processed by SNPEFF
public String canCreateSeqChange(String line){
// Parse line
String fields[] = line.split("\t", 10); // Only pare the fist 9 fields (i.e. do not parse genotypes)
// Is line OK?
if (fields.length >= 4) {
// Chromosome and position. VCF files are one-base, so inOffset should be 1.
chromosomeName = fields[0].trim();
Chromosome chromo = getChromosome(chromosomeName);
parent = chromo;
int start = parsepos(fields[1]);
ref = fields[3].toUpperCase(); // Reference
String altsStr = fields[4].toUpperCase();
parseAlts(altsStr);
// ID (e.g. might indicate dbSnp)
String id = fields[2];
int strand = 1;
// Quality
String qStr = fields[5];
if (!qStr.isEmpty()) quality = Gpr.parseDoubleSafe(qStr);
else quality = null;
filterPass = fields[6]; // Filter parameters
// INFO fields
if(fields.length>7){
infoStr = fields[7];
info = null;
}
//coverage
int coverage = Gpr.parseIntSafe(getInfo("DP"));
//Don't need this, we stripped it off in the step before
// Add genotype fields (lazy parse)
//if (fields.length > 9) genotypeFieldsStr = fields[9];
String error = null;
for (String altN : alts) {
error = canCreateSeqChange(chromo, start, ref, altN, strand, id, quality, coverage);
if(error != null){//if the alt is null, then it is ok, check the other ones, if it is not null, we have an error, return it!
return error;
}
}
return error;
}if(fields.length < 4){
return "Malformed VCF SNPEFFERR this line is too short to be valid VCF: " + line;
}
return null;
}
/**
* Create a seqChange
* @return A string, null if true - SNPEff can process the variant
* , message if false -SNPEff can not process the variant and the message contains the reason why
*/
private String canCreateSeqChange(Chromosome chromo, int start, String reference, String alt, int strand, String id, double quality, int coverage) {
// No change?
if (alt.isEmpty()) return null; //new SeqChange(chromo, start, reference, reference, strand, id, quality, coverage);
alt = alt.toUpperCase();
// Case: Structural variant
// 2 321682 . T <DEL> 6 PASS IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62
if (alt.startsWith("<DEL")) {
int end = start + reference.length() - 1;
// If there is an 'END' tag, we should use it
if ((getInfo("END") != null)) {
// Get 'END' field and do some sanity check
end = (int) getInfoInt("END");
if (end < start){//throw new RuntimeException("INFO field 'END' is before varaint's 'POS'\n\tEND : " + end + "\n\tPOS : " + start);
return "SNPEFFERR=INFO field 'END' is before varaint's 'POS'\n\tEND : " + end + "\n\tPOS : " + start;
}
}
// Create deletion string
// TODO: This should be changed. We should be using "imprecise" for these variants
int size = end - start + 1;
char change[] = new char[size];
for (int i = 0; i < change.length; i++)
change[i] = reference.length() > i ? reference.charAt(i) : 'N';
String ch = "-" + new String(change);
// Create SeqChange
return null;//new SeqChange(chromo, start, reference, ch, strand, id, quality, coverage);
}
// Case: SNP, MNP
// 20 3 . C G . PASS DP=100
// 20 3 . TC AT . PASS DP=100
if (reference.length() == alt.length()) {
int startDiff = Integer.MAX_VALUE;
String ch = "";
String ref = "";
for (int i = 0; i < reference.length(); i++) {
if (reference.charAt(i) != alt.charAt(i)) {
ref += reference.charAt(i);
ch += alt.charAt(i);
startDiff = Math.min(startDiff, i);
}
}
return null;//new SeqChange(chromo, start + startDiff, ref, ch, strand, id, quality, coverage);
}
// Case: Deletion
// 20 2 . TC T . PASS DP=100
// 20 2 . AGAC AAC . PASS DP=100
if (reference.length() > alt.length()) {
NeedlemanWunsch nw = new NeedlemanWunsch(alt, reference);
nw.align();
int startDiff = nw.getOffset();
String ref = "*";
String ch = nw.getAlignment();
if (!ch.startsWith("-")){
//throw new RuntimeException("Deletion '" + ch + "' does not start with '-'. This should never happen!");
return "SNPEFFERR=Deletion '" + ch + "' does not start with '-'. This should never happen!";
}
return null;//new SeqChange(chromo, start + startDiff, ref, ch, strand, id, quality, coverage);
}
// Case: Insertion of A { tC ; tCA } tC is the reference allele
// 20 2 . TC TCA . PASS DP=100
if (reference.length() < alt.length()) {
NeedlemanWunsch nw = new NeedlemanWunsch(alt, reference);
nw.align();
int startDiff = nw.getOffset();
String ch = nw.getAlignment();
String ref = "*";
if (!ch.startsWith("+")){
//throw new RuntimeException("Insertion '" + ch + "' does not start with '+'. This should never happen!");
return "SNPEFFERR=Insertion '" + ch + "' does not start with '+'. This should never happen!";
}
return null;//new SeqChange(chromo, start + startDiff, ref, ch, strand, id, quality, coverage);
}
// Other change type?
//throw new RuntimeException("Unsupported VCF change type '" + reference + "' => '" + alt + "'\nVcfEntry: " + this);
return "SNPEFFERR=Unsupported VCF change type '" + reference + "' => '" + alt + "'\nVcfEntry: " + this.line;
}
public String getInfo(String key) {
if (info == null) parseInfo();
return info.get(key);
}
/**
* Parse INFO fields
*/
void parseInfo() {
// Parse info entries
info = new HashMap<String, String>();
for (String inf : infoStr.split(";")) {
String vp[] = inf.split("=");
if (vp.length > 1) info.put(vp[0], vp[1]);
else info.put(vp[0], "true"); // A property that is present, but has no value (e.g. "INDEL")
}
}
/**
* Get info field as an long number
* The norm specifies data type as 'INT', that is why the name of this method might be not intuitive
* @param key
* @return
*/
public long getInfoInt(String key) {
if (info == null) parseInfo();
return Gpr.parseLongSafe(info.get(key));
}
/**
* Is this entry heterozygous?
*
* Infer Hom/Her if there is only one sample in the file.
* Otherwise the field is null.
*
* @return
*/
// public Boolean calcHetero() {
// // No genotyping information? => Use number of ALT fielsd
// if (genotypeFieldsStr == null) return isHeterozygous();
//
// Boolean isHetero = null;
//
// // If there is only one genotype field => parse fields
// if (genotypeFields == null) {
//
// // Are there more than two tabs? (i.e. more than one format field + one genotype field)
// int countFields, fromIndex;
// for (countFields = 0, fromIndex = 0; (fromIndex >= 0) && (countFields < 1); countFields++, fromIndex++)
// fromIndex = genotypeFieldsStr.indexOf('\t', fromIndex);
//
// // OK only one genotype field => Parse it in order to extract homo info.
// if (countFields == 1) parseGenotypes();
// }
//
// // OK only one genotype field => calculate if it is heterozygous
// if ((genotypeFields != null) && (genotypeFields.length == 1)) isHetero = getVcfGenotype(0).isHeterozygous();
//
// return isHetero;
// }
/**
* Parse ALT field
* @param altsStr
*/
private void parseAlts(String altsStr) {
if (altsStr.length() == 1) {
if (altsStr.equals("A") || altsStr.equals("C") || altsStr.equals("G") || altsStr.equals("T") || altsStr.equals(".")) {
alts = new String[] { altsStr };
} else if ("N".equals(altsStr)) { // aNy base
alts = new String[] {"A", "C", "G", "T"};
} else if ("B".equals(altsStr)) { // B: not A
alts = new String[] {"C", "G", "T"};
} else if ("D".equals(altsStr)) { // D: not C
alts = new String[] {"A", "G", "T"};
} else if ("H".equals(altsStr)) { // H: not G
alts = new String[] {"A", "C", "T"};
} else if ("V".equals(altsStr)) { // V: not T
alts = new String[] {"A", "C", "G"};
} else if ("M".equals(altsStr)) {
alts = new String[] {"A", "C"};
} else if ("R".equals(altsStr)) {
alts = new String[] {"A", "G"};
} else if ("W".equals(altsStr)) { // Weak
alts = new String[] {"A", "T"};
} else if ("S".equals(altsStr)) { // Strong
alts = new String[] {"C", "G"};
} else if ("Y".equals(altsStr)) {
alts = new String[] {"C", "T"};
} else if ("K".equals(altsStr)) {
alts = new String[] {"G", "T"};
} else if (".".equals(altsStr)) { // No alternative (same as reference)
alts = new String[] {ref};
} else {
throw new RuntimeException("WARNING: Unknown IUB code for SNP '" + altsStr + "'");
}
} else alts = altsStr.split(",");
// What type of change do we have?
int maxAltLen = Integer.MIN_VALUE, minAltLen = Integer.MAX_VALUE;
for (int i = 0; i < alts.length; i++) {
maxAltLen = Math.max(maxAltLen, alts[i].length());
minAltLen = Math.min(minAltLen, alts[i].length());
}
// Infer change type
if ((ref.length() == maxAltLen) && (ref.length() == minAltLen)) {
if (ref.length() == 1) changeType = ChangeType.SNP;
else changeType = ChangeType.MNP;
} else if (ref.length() > minAltLen) changeType = ChangeType.DEL;
else if (ref.length() < maxAltLen) changeType = ChangeType.INS;
else changeType = ChangeType.MIXED;
}
}