package edu.mayo.bior.pipeline.SNPEff;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import org.junit.Assert;
import org.junit.Test;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import edu.mayo.pipes.util.test.FileCompareUtils;
/**
* Utility test class for comparing the SNPEFF jar output with the biorsnpeff output.
* It assumes the standard 8-column header for the SNPEFF jar output,
* and a 9-column header for the output of bior_snpeffpipeline, whose 9th column holds the SNP effects as JSON.
* @author m106573
*
* Examples of how output files should look.
*
* BiorSNPEFF Output
* #CHROM POS ID REF ALT QUAL FILTER INFO SNPEff
* 21 26960070 rs116645811 G A . . A {"EFF":[{"Effect":"INTRON","Effect_impact":"MODIFIER","Functional_class":"NONE","Gene_name":"MRPL39","Gene_bioType":"protein_coding","Coding":"CODING","Transcript":"ENST00000352957"},{"Effect":"NON_SYNONYMOUS_CODING","Effect_impact":"MODERATE","Functional_class":"MISSENSE","Codon_change":"aCg/aTg","Amino_acid_change":"T334M","Gene_name":"MRPL39","Gene_bioType":"protein_coding","Coding":"CODING","Transcript":"ENST00000307301","Exon":"exon_21_26960013_26960101"}]}
*
*SNPEff Jar Output
*
*#CHROM POS ID REF ALT QUAL FILTER INFO
*21 26960070 rs116645811 G A 0.0 . A;EFF=INTRON(MODIFIER||||MRPL39|protein_coding|CODING|ENST00000352957|),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|aCg/aTg|T334M|MRPL39|protein_coding|CODING|ENST00000307301|exon_21_26960013_26960101)
*
*/
public class SNPEffOutputTest {

    /** Stand-in printed for the side that has no row at a given index. */
    private static final String MISSING_ROW = "<missing>";

    /**
     * Formats the jar's QUAL value with two decimals before a trailing ".00" is stripped.
     * The symbols are pinned to Locale.US so the decimal separator is always '.',
     * independent of the default locale of the machine running the test.
     */
    DecimalFormat mNumFormat = new DecimalFormat("#0.00", new DecimalFormatSymbols(Locale.US));

    /**
     * Compares one hard-coded bior row against the equivalent SNPEff jar row.
     *
     * @throws IOException declared by {@link #testOutput(List, List)}
     */
    @Test
    public void compareBiorToSnpJarCmd() throws IOException {
        // Columns use explicit \t escapes because testOutput splits every row on tabs.
        List<String> bior = Arrays.asList(
                "21\t26960070\trs116645811\tG\tA\t0\t.\tA\t"
                + "{\"EFF\":[{\"Effect\":\"INTRON\",\"Effect_impact\":\"MODIFIER\",\"Functional_class\":\"NONE\","
                + "\"Gene_name\":\"MRPL39\",\"Gene_bioType\":\"protein_coding\",\"Coding\":\"CODING\","
                + "\"Transcript\":\"ENST00000352957\"},"
                + "{\"Effect\":\"NON_SYNONYMOUS_CODING\",\"Effect_impact\":\"MODERATE\",\"Functional_class\":\"MISSENSE\","
                + "\"Codon_change\":\"aCg/aTg\",\"Amino_acid_change\":\"T334M\",\"Gene_name\":\"MRPL39\","
                + "\"Gene_bioType\":\"protein_coding\",\"Coding\":\"CODING\",\"Transcript\":\"ENST00000307301\","
                + "\"Exon\":\"exon_21_26960013_26960101\"}]}");
        List<String> jar = Arrays.asList(
                "21\t26960070\trs116645811\tG\tA\t0.00\t.\t"
                + "A;EFF=INTRON(MODIFIER||||MRPL39|protein_coding|CODING|ENST00000352957|),"
                + "NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|aCg/aTg|T334M|MRPL39|protein_coding|CODING|"
                + "ENST00000307301|exon_21_26960013_26960101)");
        Assert.assertTrue(testOutput(jar, bior));
    }

    /**
     * Loads both files through FileCompareUtils.loadFile and compares their contents.
     *
     * @param snpEffJarOutPath path of the SNPEff jar output file
     * @param biorSnpEffOutPath path of the bior snpeff output file
     * @return the result of {@link #testOutput(List, List)} on the loaded lines
     * @throws IOException if loading or comparing fails
     */
    public boolean testOutputFiles(String snpEffJarOutPath, String biorSnpEffOutPath) throws IOException {
        return testOutput(FileCompareUtils.loadFile(snpEffJarOutPath), FileCompareUtils.loadFile(biorSnpEffOutPath));
    }

    /**
     * Compares SNPEff jar output lines with bior snpeff output lines.
     * Each variant row of both inputs is rewritten into the same canonical string;
     * the two resulting lists must then be equal. Differences are printed to stdout.
     *
     * @param snpEffJarOut lines of the SNPEff jar output (effects inside the INFO column as EFF=...)
     * @param biorSnpEffOut lines of the bior snpeff output (effects as JSON in the last column)
     * @return true when both outputs yield the same canonical rows in the same order
     * @throws IOException declared for callers; nothing in this method performs I/O other than printing
     */
    public boolean testOutput(List<String> snpEffJarOut, List<String> biorSnpEffOut) throws IOException {
        List<String> biorSnpeffOutputList = new ArrayList<String>();
        for (String line : biorSnpEffOut) {
            if (isVariantLine(line)) {
                biorSnpeffOutputList.add(biorLineToString(line));
            }
        }

        List<String> snpeffJarOutputList = new ArrayList<String>();
        for (String line : snpEffJarOut) {
            if (isVariantLine(line)) {
                snpeffJarOutputList.add(jarLineToString(line));
            }
        }

        printMismatches(biorSnpeffOutputList, snpeffJarOutputList);
        return biorSnpeffOutputList.equals(snpeffJarOutputList);
    }

    /**
     * Tells whether a line is a variant row: it starts with an ASCII digit or with "chr".
     * Everything else (headers, blank lines) is ignored by the comparison.
     * NOTE(review): rows for unprefixed chromosomes such as X, Y or MT are skipped by this rule too.
     */
    private static boolean isVariantLine(String line) {
        if (line.length() == 0) {
            return false;
        }
        char first = line.charAt(0);
        return (first >= '0' && first <= '9') || line.startsWith("chr");
    }

    /** Canonical string for one tab-separated bior row; the JSON is taken from the last column. */
    private String biorLineToString(String line) {
        String[] data = line.split("\t");
        List<SNPEffectColInfo> effects = jsonToSnpEffectCol(data[data.length - 1]);
        return variantToString(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], effects);
    }

    /** Canonical string for one tab-separated SNPEff jar row, whose INFO column carries "EFF=...". */
    private String jarLineToString(String line) {
        String[] data = line.split("\t");

        // Text before "EFF=" is the original INFO value followed by ';'.
        // A row without an "EFF=" entry fails below with ArrayIndexOutOfBoundsException.
        String[] infovalue = data[7].split("EFF=");
        String beforeEff = infovalue[0];
        String infocol = ".";
        if (beforeEff.endsWith(";") && beforeEff.length() > 1) {
            infocol = beforeEff.substring(0, beforeEff.lastIndexOf(";"));
        }

        List<SNPEffectColInfo> effects = new ArrayList<SNPEffectColInfo>();
        for (String snpeffect : infovalue[1].split(",")) {
            effects.add(parseJarEffect(snpeffect));
        }

        String quality = mNumFormat.format(Double.parseDouble(data[5]));
        // If it ends in ".00", then remove that since Bior just integerizes the output
        quality = quality.replace(".00", "");

        return variantToString(data[0], data[1], data[2], data[3], data[4], quality, data[6], infocol, effects);
    }

    /** Parses one jar effect of the form EFFECT(impact|class|codon|aa|gene|biotype|coding|transcript|exon). */
    private static SNPEffectColInfo parseJarEffect(String snpeffect) {
        SNPEffectColInfo info = new SNPEffectColInfo();
        String[] nameAndFields = snpeffect.split("\\(");
        info.setEffect(nameAndFields[0].trim());

        String[] fields = nameAndFields[1].split("\\|");
        info.setEffect_impact(fields[0]);
        info.setFunctional_class(fields[1]);
        info.setCodon_change(fields[2]);
        info.setAmino_acid_change(fields[3]);
        info.setGene_name(fields[4]);
        info.setGene_bioType(fields[5]);
        info.setCoding(fields[6]);
        info.setTranscript(fields[7]);
        // The last field still carries the closing parenthesis of the effect.
        info.setExon(fields[8].replaceAll("\\)", ""));
        return info;
    }

    /**
     * Prints both list sizes, every differing row pair, and a match summary to stdout.
     * The lists may differ in length: a row that exists on one side only is reported
     * as a mismatch against a placeholder instead of raising IndexOutOfBoundsException.
     *
     * @param biorSnpeffOutputList canonical rows built from the bior output
     * @param snpeffJarOutputList canonical rows built from the SNPEff jar output
     */
    private void printMismatches(List<String> biorSnpeffOutputList, List<String> snpeffJarOutputList) {
        int biorSize = biorSnpeffOutputList.size();
        int jarSize = snpeffJarOutputList.size();
        System.out.println("snpeffJarOutputList size: " + jarSize);
        System.out.println("biorSnpeffOutputList size: " + biorSize);

        int numTotal = Math.max(biorSize, jarSize);
        int numMatch = 0;
        for (int i = 0; i < numTotal; i++) {
            String biorRow = i < biorSize ? biorSnpeffOutputList.get(i) : MISSING_ROW;
            String jarRow = i < jarSize ? snpeffJarOutputList.get(i) : MISSING_ROW;
            if (biorRow.equals(jarRow)) {
                numMatch++;
            } else {
                System.out.println("MISMATCH on line " + (i+1) + ":");
                System.out.println(" Bior: " + biorRow);
                System.out.println(" SnpEffJar: " + jarRow);
            }
        }
        System.out.println("Total: "+ numTotal + ", matches: " + numMatch);
    }

    /**
     * Builds the canonical string for one variant row.
     *
     * @param chrm chromosome column
     * @param pos position column
     * @param rsid id column
     * @param ref reference allele column
     * @param alt alternate allele column
     * @param qual quality column
     * @param filter filter column
     * @param info info column (without the effect annotation)
     * @param listcolsnp effects attached to the variant
     * @return the columns joined by single spaces, followed by the rendered effects
     */
    private String variantToString(String chrm, String pos, String rsid, String ref, String alt,
            String qual, String filter, String info, List<SNPEffectColInfo> listcolsnp) {
        return chrm + " " + pos + " " + rsid + " " + ref + " " + alt + " " + qual + " "
                + filter + " " + info + " " + this.convertListToString(listcolsnp);
    }

    /**
     * Parses the JSON effect column of a bior row.
     * The JSON is parsed once and the "EFF" member is looked up as a key, so a value that
     * merely contains the text "EFF" no longer selects the multi-effect branch.
     *
     * @param json JSON object, either {"EFF":[...]} or a single effect object
     * @return one SNPEffectColInfo per effect found
     */
    private static List<SNPEffectColInfo> jsonToSnpEffectCol(String json) {
        List<SNPEffectColInfo> snpeffectList = new ArrayList<SNPEffectColInfo>();
        JsonElement root = new JsonParser().parse(json);
        JsonObject rootObject = root.getAsJsonObject();

        if (rootObject.has("EFF")) {
            JsonElement eff = rootObject.get("EFF");
            if (eff.isJsonArray()) { // multiple effects are present
                for (JsonElement jsonElem : eff.getAsJsonArray()) {
                    snpeffectList.add(json2Obj(jsonElem));
                }
            } else { // "EFF" holds one effect that is not wrapped in an array
                snpeffectList.add(json2Obj(eff));
            }
        } else { // the object itself is the single effect
            snpeffectList.add(json2Obj(root));
        }
        return snpeffectList;
    }

    /**
     * Copies the effect members of a JSON object into a SNPEffectColInfo.
     * A Functional_class of "NONE" (any case) becomes the empty string, which is what
     * the jar side produces for the empty field in the fixture of this class.
     */
    private static SNPEffectColInfo json2Obj(JsonElement jsonElem) {
        SNPEffectColInfo snpEffectColInfo = new SNPEffectColInfo();
        String functionalClass = getJson(jsonElem, "Functional_class");
        if ("NONE".equalsIgnoreCase(functionalClass)) {
            functionalClass = "";
        }
        snpEffectColInfo.setFunctional_class(functionalClass);
        snpEffectColInfo.setCodon_change(getJson(jsonElem, "Codon_change"));
        snpEffectColInfo.setAmino_acid_change(getJson(jsonElem, "Amino_acid_change"));
        snpEffectColInfo.setGene_name(getJson(jsonElem, "Gene_name"));
        snpEffectColInfo.setGene_bioType(getJson(jsonElem, "Gene_bioType"));
        snpEffectColInfo.setCoding(getJson(jsonElem, "Coding"));
        snpEffectColInfo.setTranscript(getJson(jsonElem, "Transcript"));
        snpEffectColInfo.setExon(getJson(jsonElem, "Exon"));
        snpEffectColInfo.setEffect(getJson(jsonElem, "Effect"));
        snpEffectColInfo.setEffect_impact(getJson(jsonElem, "Effect_impact"));
        return snpEffectColInfo;
    }

    /**
     * Reads a member of a JSON object as a string.
     *
     * @return the member's string value, or "" when the member is absent or JSON null
     */
    private static String getJson(JsonElement jsonElem, String jsonPath) {
        JsonObject object = jsonElem.getAsJsonObject();
        if (!object.has(jsonPath)) {
            return "";
        }
        JsonElement member = object.get(jsonPath);
        return member.isJsonNull() ? "" : member.getAsString();
    }

    /**
     * String representation of a SNPEffectColInfo object, in the jar's
     * EFFECT(impact|class|codon|aa|gene|biotype|coding|transcript|exon) layout.
     *
     * @param snpeffectcolinf effect to render
     * @return String representation of the SNPEffectColInfo object
     */
    private String convertToString(SNPEffectColInfo snpeffectcolinf) {
        return snpeffectcolinf.getEffect() + "(" + snpeffectcolinf.getEffect_impact()
                + "|" + snpeffectcolinf.getFunctional_class() + "|" + snpeffectcolinf.getCodon_change()
                + "|" + snpeffectcolinf.getAmino_acid_change() + "|" + snpeffectcolinf.getGene_name()
                + "|" + snpeffectcolinf.getGene_bioType() + "|" + snpeffectcolinf.getCoding()
                + "|" + snpeffectcolinf.getTranscript() + "|" + snpeffectcolinf.getExon() + ")";
    }

    /**
     * String representation of a list of SNPEffectColInfo objects: the rendered
     * effects concatenated in list order with no separator.
     *
     * @param snpeffectlist effects to render
     * @return String
     */
    private String convertListToString(List<SNPEffectColInfo> snpeffectlist) {
        StringBuilder value = new StringBuilder();
        for (SNPEffectColInfo snpeffectcol : snpeffectlist) {
            value.append(convertToString(snpeffectcol));
        }
        return value.toString();
    }
}