/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package edu.mayo.bior.pipeline.Treat; import com.tinkerpop.pipes.PipeFunction; import edu.mayo.pipes.history.ColumnMetaData; import edu.mayo.pipes.history.History; import java.util.List; /** * * Treat is a stand along application for variant annotation. Variants come in * as strings in VCF format (VCF header is stripped) and annotation is provided * in the TREAT.xls format - this is a tab delimited file with columns defined below. * * @author dquest */ public class TreatFunction implements PipeFunction<String,History>{ int count =0; private ColumnMetaData cmd(String s){ return new ColumnMetaData(s); } private void setTreatMetadata(){ List<ColumnMetaData> columns = History.getMetaData().getColumns(); //strip out all columns --- they are wrong and this is an end user application columns.removeAll(columns); //IGV Link --depricated columns.add(cmd("Chr")); //input columns.add(cmd("Position")); //input columns.add(cmd("Ref")); //input VCF Standard columns.add(cmd("Alt")); //input VCF Standard columns.add(cmd("Quality")); //input columns.add(cmd("dbSNP130/132/135")); //dbSNP - rsID columns.add(cmd("dbSNP130/132/135Alleles")); //dbSNP columns.add(cmd("DiseaseVariant")); //dbSNP columns.add(cmd("HapMap_CEU_allele_freq")); //HapMap columns.add(cmd("1kgenome_CEU_allele_freq")); //1000Genome columns.add(cmd("HapMap_YRI_allele_freq")); //HapMap columns.add(cmd("1kgenome_YRI_allele_freq")); //1000Genome columns.add(cmd("HapMap_JPT+CHB_allele_freq")); //HapMap columns.add(cmd("1kgenome_JPT+CHB_allele_freq"));//1000Genome columns.add(cmd("BGI200_Danish")); //BGI columns.add(cmd("COSMIC")); //COSMIC columns.add(cmd("ESP6500_EUR_maf")); //ESP columns.add(cmd("ESP6500_AFR_maf")); //ESP //InCaptureKit --depricated //Following columns are in GenomeGPS //#AlternateHits --depricated //GenotypeClass --depricated //Alt-SupportedReads --depricated //Ref-SupportedReads --depricated //ReadDepth --depricated //CloseToIndel --depricated columns.add(cmd("Codons")); //VEP - SIFT columns.add(cmd("Transcript_ID")); //VEP - SIFT //Protein_ID --depricated columns.add(cmd("Substitution")); //VEP - SIFT //Region --depricated columns.add(cmd("SNP_Type")); //VEP - SIFT columns.add(cmd("Prediction")); //VEP - SIFT columns.add(cmd("Score")); //VEP - SIFT //Median_Info --depricated columns.add(cmd("Gene_ID")); //NCBIGene columns.add(cmd("Gene_Name")); //NCBIGene columns.add(cmd("OMIM_Disease")); //OMIM columns.add(cmd("Average_Allele_Freqs"));//HAPMAP //User Comment --depricated //SynonymousCodonUsage --depricated //Difference --depricated columns.add(cmd("BlacklistedRegion"));//UCSC columns.add(cmd("Alignability_Uniquness"));//UCSC columns.add(cmd("Repeat_Region"));//UCSC columns.add(cmd("miRbase")); //miRBASE columns.add(cmd("SNP_SuspectRegion"));//dbSNP columns.add(cmd("SNP_ClinicalSig"));//dbSNP columns.add(cmd("conservation"));//UCSC columns.add(cmd("regulation")); //UCSC columns.add(cmd("tfbs")); //UCSC columns.add(cmd("tss")); //UCSC columns.add(cmd("enhancer")); //UCSC columns.add(cmd("UniprotID")); //HGNC columns.add(cmd("polyphen2")); //VEP columns.add(cmd("Homozygous")); //SNPEFF columns.add(cmd("Bio_type")); //SNPEFF columns.add(cmd("accession")); //SNPEFF columns.add(cmd("Exon_ID")); //SNPEFF columns.add(cmd("Exon_Rank")); //SNPEFF columns.add(cmd("functionGVS"));//SNPEFF columns.add(cmd("aminoAcids")); //SNPEFF columns.add(cmd("proteinPosition"));//SNPEFF columns.add(cmd("Codon_Degeneracy"));//SNPEFF columns.add(cmd("geneList"));//SNPEFF columns.add(cmd("Entrez_id"));//SNPEFF columns.add(cmd("Gene_title")); //NCBIGene //Tissue_specificity --depricated //pathway --depricated //GeneCards --depricated //Kaviar_Variants --depricated } /** * compute takes in a VCF line (NO HEADER), and shreds it into a TREAT line * Columns in the input (e.g. sample columns) that are not needed for annotation * are thrown away... this way it works JUST LIKE THE LEGACY TREAT COMMAND. * Outputs from compute can not be compared to TREAT because TREAT processing * was wrong in many respects, most notably same variant did not compare alleles. * This version of TREAT removes many of the errors, and also drops columns * that where validated by the BioR working group as not being used often. * @param VCFLine * @return */ public History compute(String VCFLine) { History xlsLine = new History(); //Compute Treat and put the result in xlsLine (it is just an array of strings, history is getting set above) if(count == 0){ setTreatMetadata(); } count++; return xlsLine; } }