package edu.mayo.bior.pipeline; import java.text.ParseException; import java.util.*; import com.google.gson.Gson; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.JsonParser; import edu.mayo.pipes.history.ColumnMetaData; import edu.mayo.pipes.util.metadata.AddMetadataLines; import org.apache.log4j.Logger; import com.tinkerpop.pipes.AbstractPipe; import edu.mayo.pipes.history.History; import edu.mayo.pipes.history.HistoryMetaData; public class VCFGeneratorPipe extends AbstractPipe<History, History> { public final String DEFAULT_DESCRIPTION = "BioR property file missing description"; public final String DEFAULT_TYPE = "String"; public final String DEFAULT_NUMBER = "."; private static final Logger sLogger = Logger.getLogger(VCFGeneratorPipe.class); Map<Integer, String> biorindexes = new HashMap<Integer, String>(); boolean modifyMetadata = false; @Override protected History processNextStart() throws NoSuchElementException { History history = this.starts.next(); // Modify Metadata only once if (!modifyMetadata) { history = changeHeader(history); modifyMetadata = true; } history = modifyhistory(history, biorindexes); history = (History) removeAnnotationColumns(history, biorindexes); return history; } // public History changeHeader(History history){ // List<String> biorcolumnsFromMetadata = getBIORColumnsFromMetadata(History.getMetaData().getOriginalHeader()); // List<String> colsFromHeader = getBIORColumnsFromHeader(History.getMetaData().getColumns(), biorcolumnsFromMetadata); // return history; // } public History changeHeader(History history) { int totalcolumns = History.getMetaData().getColumns().size(); //System.err.println(History.getMetaData().getOriginalHeader().get(History.getMetaData().getOriginalHeader().size()-1)); List<String> biorcolumnsFromMetadata = getBIORColumnsFromMetadata(History.getMetaData().getOriginalHeader()); List<String> colsFromHeader = getBIORColumnsFromHeader(History.getMetaData().getColumns(), biorcolumnsFromMetadata); if (totalcolumns > 7 && History.getMetaData().getColumns().get(7).getColumnName().equalsIgnoreCase("INFO")) { biorindexes = getBiorColumnsIndexes(history, biorcolumnsFromMetadata); } if(headerLinesForHeaderKeys == null){ populateHeaderLinesForHeaderKeys(); } //checks if biorcolumns is not null if (biorcolumnsFromMetadata != null) { //Happy Path biorcolumnsFromMetadata === colsFromHeader if (biorcolumnsFromMetadata.containsAll(colsFromHeader) && biorcolumnsFromMetadata.size() == colsFromHeader.size()) { HistoryMetaData hmd = removeColumnHeader(History.getMetaData(), biorindexes); History.getMetaData().setOriginalHeader( addColumnheaders( hmd.getOriginalHeader(), null, null) ); //There are more biorcolumnsFromMetadata columns than colsFromHeader - remove the extra ##BIOR don't convert them into info, but convert those that are in colsFromHeader } else if (biorcolumnsFromMetadata.containsAll(colsFromHeader) && biorcolumnsFromMetadata.size() > colsFromHeader.size()) { List<String> biorcolumn = biorcolumnsFromMetadata; biorcolumn.removeAll(colsFromHeader); HistoryMetaData hmd = (removeColumnHeader(History.getMetaData(), biorindexes)); History.getMetaData().setOriginalHeader( addColumnheaders(hmd.getOriginalHeader(), null, biorcolumn) ); //There are move colsFromHeader and there is no available biorcolumnsFromMetadata - build default ##INFO for those that don't have metadata } else if (colsFromHeader.containsAll(biorcolumnsFromMetadata) && colsFromHeader.size() > biorcolumnsFromMetadata.size()) { List<String> addDefaultColumn = colsFromHeader; addDefaultColumn.removeAll(biorcolumnsFromMetadata); History.getMetaData().setOriginalHeader(addColumnheaders((removeColumnHeader(History.getMetaData(), biorindexes)).getOriginalHeader(), addDefaultColumn, null)); //biorcolumnsFromMetadata is a subset of colsFromHeader => we need to build some default ##INFO lines - so intersect the sets then build ##INFO based on the intersection } else if (!colsFromHeader.containsAll(biorcolumnsFromMetadata) || !biorcolumnsFromMetadata.containsAll(colsFromHeader)) { List<String> biorcolumn = biorcolumnsFromMetadata; biorcolumn.removeAll(colsFromHeader); List<String> addDefaultColumn = colsFromHeader; addDefaultColumn.removeAll(biorcolumnsFromMetadata); History.getMetaData().setOriginalHeader(addColumnheaders((removeColumnHeader(History.getMetaData(), biorindexes)).getOriginalHeader(), addDefaultColumn, biorcolumn)); } } else { //No ##BIOR available since bior //build all default ##INFO because we don't have any metadata History.getMetaData().setOriginalHeader(addColumnheaders((removeColumnHeader(History.getMetaData(), biorindexes)).getOriginalHeader(), colsFromHeader, null)); } return history; } /** * Returns a Map of BioR column index and name * * @param h - History object containing all metadata (currently static, but we may refactor) * @param biorcolumn - these are the IDs in the ##BIOR columns, most will be bior.X but some may be any arbitrary string (e.g. output from bior_annotate) * get these by calling getBIORColumnsFromMetadata(); * @return */ public Map<Integer, String> getBiorColumnsIndexes(History h, List<String> biorcolumn) { int totalcolumn = History.getMetaData().getColumns().size(); List<ColumnMetaData> columns = History.getMetaData().getColumns(); List<String> originalheader = History.getMetaData().getOriginalHeader(); Map<Integer, String> biorindex = new HashMap<Integer, String>(); int indexsize = originalheader.size(); String columnheader = originalheader.get(indexsize - 1); if (columnheader.startsWith("#CHROM")) { for (int i = 0; i < columns.size(); i++) { String colname = columns.get(i).getColumnName(); if (colname.contains("bior") || colname.contains("BIOR")) { biorindex.put(i, colname); } else if (biorcolumn.contains(colname)) { biorindex.put(i, colname); } } } return biorindex; } /* Add ##INFO columns to the Metadata lines * @param colmeta (List<String> of current metadata) - the current header as a list * @param retain (List of BioRColumns that has no ##BIOR...Constructs default ##INFO) --for these build a default info string * @param remove (List of BioRColumns that are only mentioned in ##BIOR but not in ColumnHeader(no data available)) -- in the metadata but there is not in column header * @returns List of modified metadata after adding ##INFO Columns */ public List<String> addColumnheaders(List<String> colmeta, List<String> retain, List<String> remove) { List<String> infoMeta = new ArrayList<String>(); List<String> biorList = new ArrayList<String>(); for (String info : colmeta) { //Places all BIOR Headers in a list if (info.startsWith("##BIOR=<ID")) { biorList.add(info); } if (info.startsWith("##BIOR=<ID") && info.contains("bior_drill") || info.startsWith("##BIOR=<ID") && info.contains("bior_annotate") || info.startsWith("##BIOR=<ID") && info.contains("bior_compress") ) { LinkedHashMap<String,String> attr = amdl.parseHeaderLine(info); //If there is nothing to remove, then add a new ##INFO if (remove == null ) { String newInfoRow =buildInfoFromBioRAttr(attr); //System.err.println(newInfoRow); infoMeta.add(newInfoRow); //If remove does not contain the current line, then add new ##INFO }else if(!remove.contains(attr.get("ID"))){ String newInfoRow =buildInfoFromBioRAttr(attr); //System.err.println(newInfoRow); infoMeta.add(newInfoRow); } } } // Builds default Info Metadata line when ##BIOR is not available if (retain != null && !retain.isEmpty()) { for (String key : retain) { String newInfoRow = buildDefaultINFO(key); //System.err.println(newInfoRow); infoMeta.add(newInfoRow); } } int index = lastInfoLineNumber(colmeta); //add the info columns before the #CHROM column header colmeta.addAll(index, infoMeta); colmeta.removeAll(biorList); return colmeta; } /** * for a line without a ##BIOR line and returns a ##INFO line * @return */ public String buildDefaultINFO(String id){ return buildINFO(id,".","String",DEFAULT_DESCRIPTION); } AddMetadataLines amdl = new AddMetadataLines(); /** * given a ##BIOR line, return a ##INFO line * @param biorLine- the bior line that we will turn into an info line * @return */ public String buildINFOFromBioR(String biorLine){ LinkedHashMap<String,String> attr = amdl.parseHeaderLine(biorLine); return buildInfoFromBioRAttr(attr); } /** * build a ##INFO row given a parsed ##BIOR row * @param attr the parsed hash of the ##BIOR row * @return */ public String buildInfoFromBioRAttr(LinkedHashMap<String,String> attr){ String fielddesc = attr.get(AddMetadataLines.BiorMetaControlledVocabulary.FIELDDESCRIPTION.toString()); if(fielddesc == null || fielddesc.length() < 1){ //if the description is empty //try to give it the description fielddesc = attr.get(AddMetadataLines.BiorMetaControlledVocabulary.DESCRIPTION.toString()); if(fielddesc == null || fielddesc.length() < 1){ attr.put(AddMetadataLines.BiorMetaControlledVocabulary.FIELDDESCRIPTION.toString(), DEFAULT_DESCRIPTION); }else { //give it the default attr.put(AddMetadataLines.BiorMetaControlledVocabulary.FIELDDESCRIPTION.toString(), fielddesc); } } String datatype = attr.get(AddMetadataLines.BiorMetaControlledVocabulary.DATATYPE.toString()); if(datatype == null || datatype.length() < 1){ attr.put(AddMetadataLines.BiorMetaControlledVocabulary.DATATYPE.toString(), DEFAULT_TYPE);//String }else if(datatype.equalsIgnoreCase(ColumnMetaData.Type.Boolean.toString())){ attr.put(AddMetadataLines.BiorMetaControlledVocabulary.DATATYPE.toString(), "Flag"); //booleans are represented as flags in VCF } String number = attr.get(AddMetadataLines.BiorMetaControlledVocabulary.NUMBER.toString()); if(number == null || number.length() < 1){ attr.put(AddMetadataLines.BiorMetaControlledVocabulary.NUMBER.toString(), "."); } return buildINFO( attr.get("ID"), attr.get(AddMetadataLines.BiorMetaControlledVocabulary.NUMBER.toString()), attr.get(AddMetadataLines.BiorMetaControlledVocabulary.DATATYPE.toString()), attr.get(AddMetadataLines.BiorMetaControlledVocabulary.FIELDDESCRIPTION.toString())); } /** * Construct a ##INFO line: * e.g. * ##INFO=<ID=BIOR.genes.GeneID,Number=.,Type=String,Description="something"> */ public String buildINFO(String id, String number, String type, String description ){ if(description == null){ description = DEFAULT_DESCRIPTION; } StringBuilder sb = new StringBuilder(); sb.append("##INFO=<ID="); sb.append(id); sb.append(",Number="); sb.append(number); sb.append(",Type="); sb.append(type); sb.append(",Description=\""); if(description.length() > 1){ sb.append(description); }else { sb.append(DEFAULT_DESCRIPTION); } sb.append("\">"); return sb.toString(); } /** * for a header line e.g. #CHROM, remove all bior/annotation columns. * * @param metaData * @param biorindexes2 * @return */ public HistoryMetaData removeColumnHeader(HistoryMetaData metaData, Map<Integer, String> biorindexes2) { List<ColumnMetaData> columns = metaData.getColumns(); List<Integer> indexes = new ArrayList<Integer>(biorindexes2.keySet()); Collections.sort(indexes); // 8 9 10 ... Collections.reverse(indexes); // 10 9 8 ... for (int j : indexes) { ColumnMetaData cmd = columns.get(j); //System.err.println(cmd.getColumnName()); columns.remove(cmd); } return metaData; } /** * Removes columns from history after appending them into INFO Column * * @param h -List<String> of history * @param biorindexes ---indexes of BioR columns that needs to be removed * @return Modified history String */ public History removeAnnotationColumns(History h, Map<Integer, String> biorindexes) { List<Integer> indexes = new ArrayList<Integer>(biorindexes.keySet()); Collections.sort(indexes); //8 9 10 ... Collections.reverse(indexes); //10 9 8 ... for (int j : indexes) { h.remove(j); } return h; } /** * Extracts the List of Bior Columns looking at Metadata (##BIOR lines) * * @param metadata - the original header lines including ##BIOR lines * @return */ public List<String> getBIORColumnsFromMetadata(List<String> metadata) { List<String> columns = new ArrayList<String>(); for (String c : metadata) { if (c.startsWith("##BIOR=<ID")) { String[] ast = c.split("<")[1].replace(">", "").split(",")[0].split("="); columns.add(ast[1].replace("\"", "")); } } return columns; } /** * Extracts the List of BioR Columns looking at column header * * @param header - header passed to historyIn e.g. #CHROM\tPOS\t...\tbior.foo\n * @param biorcolumn - any overides that are bior annotations, but are not prefixed with bior. (there must be a ##BIOR line designating them) * @return the columns from the header that are bior columns. */ public List<String> getBIORColumnsFromHeader(List<ColumnMetaData> header, List<String> biorcolumn) { List<String> columns = new ArrayList<String>(); int indexsize = header.size(); if (header.get(0).getColumnName().startsWith("CHROM")) { for (ColumnMetaData cmd : header) { String colname = cmd.getColumnName(); // String colname =History.getMetaData().getColumns().get(j).getColumnName(); if (colname.contains("bior") || colname.contains("BIOR")) { columns.add(colname); } else if (biorcolumn.contains(colname)) { columns.add(colname); } } } return columns; } /** * VCF Spec for info: * * INFO additional information: (String, no white-space, semi-colons, or equals-signs permitted; * commas are permitted only as delimiters for lists of values) * INFO fields are encoded as a semicolon-separated series of short keys with optional values in the format: * <key>=<data>[,data]. * Arbitrary keys are permitted, although the following sub-fields are reserved (albeit optional): AA : ancestral allele AC : allele count in genotypes, for each ALT allele, in the same order as listed AF : allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes AN : total number of alleles in called genotypes BQ : RMS base quality at this position CIGAR : cigar string describing how to align an alternate allele to the reference allele DB : dbSNP membership DP : combined depth across samples, e.g. DP=154 END : end position of the variant described in this record (for use with symbolic alleles) H2 : membership in hapmap2 H3 : membership in hapmap3 MQ : RMS mapping quality, e.g. MQ=52 MQ0 : Number of MAPQ == 0 reads covering this record NS : Number of samples with data SB : strand bias at this position SOMATIC : indicates that the record is a somatic mutation, for cancer genomics VALIDATED : validated by follow-up experiment 1000G : membership in 1000 Genomes */ public final String delimForLists = "|"; public String infoDataPair(String key, String value){ String newval = value; LinkedHashMap<String, String> attrs = headerLinesForHeaderKeys.get(key); //in the special case it is a json array if(newval.startsWith("[") && newval.endsWith("]") && newval.length() > 1){ newval = handleJsonArray(newval); }else if(attrs != null){ String delim = attrs.get(AddMetadataLines.BiorMetaControlledVocabulary.DELIMITER.toString()); if(delim.equalsIgnoreCase("|")){ //it is a regex, we need to fix it delim = "\\|"; } newval = newval.replaceAll(delim,","); }else if(newval.contains(",")){ newval = newval.replaceAll(",","|"); } if(newval.contains(" ")){ newval = newval.replaceAll(" ","_"); } if(newval.contains("=")){ newval = newval.replaceAll("=",":"); } if(value.contains(";")){ newval = newval.replaceAll(";",delimForLists); //if the raw data being inserted contains semi-colons - then replace with pipe "|" } StringBuilder sb = new StringBuilder(); //It is actually a Flag NOT a string if(value.equalsIgnoreCase("true") || value.equalsIgnoreCase("false")){ sb.append(";"); sb.append(key); }else { sb.append(";"); sb.append(key); sb.append("="); sb.append(newval); } return sb.toString(); } private Gson gson = new Gson(); public String handleJsonArray(String jarr){ StringBuilder sb = new StringBuilder(); JsonElement jelement = new JsonParser().parse("{ \"arr\" : " + jarr + "}"); JsonObject jobject = jelement.getAsJsonObject(); JsonArray jarray = jobject.getAsJsonArray("arr"); for(int i=0; i<jarray.size();i++){ JsonElement e = jarray.get(i); //if it is an array of strings if(jarr.contains("\"")){ sb.append(e.toString().replaceAll("\"","")); if(i<jarray.size()-1){ sb.append(","); } }else { // it is a number, we don't care about arrays of flags/booleans Double d = e.getAsDouble(); String dstring = d.toString(); if(dstring.endsWith(".0")){ sb.append(dstring.substring(0, dstring.length()-2)); }else { sb.append(dstring); } if(i<jarray.size()-1){ sb.append(","); } } } return sb.toString(); } /** * generate a key-value hash for the ##BIOR lines in the header (we have a similar hash for column number, but this one is for * those lines that specify that some column should be treated as an array. */ private HashMap<String,LinkedHashMap<String, String>> headerLinesForHeaderKeys = null; public void populateHeaderLinesForHeaderKeys(){ AddMetadataLines amdl = new AddMetadataLines(); headerLinesForHeaderKeys = new HashMap<String,LinkedHashMap<String, String>>(); List<String> header = History.getMetaData().getOriginalHeader(); for(String line : header){ String delim = AddMetadataLines.BiorMetaControlledVocabulary.DELIMITER.toString(); if(line.contains(delim)){ LinkedHashMap<String, String> kv = amdl.parseHeaderLine(line); headerLinesForHeaderKeys.put(kv.get(AddMetadataLines.BiorMetaControlledVocabulary.ID.toString()),kv); } } } /** * Modify the history string(VCF row) by appending the columns into INFO * * @param history ---Takes History list<String> * @param biorindexes2 ---corresponding bior indexes (columns that need to be pushed into INFO Column) * @return modified history ---History String after appending BIOR Columns into INFO column */ private History modifyhistory(History history, Map<Integer, String> biorindexes2) { // history. Set<Integer> indexes = biorindexes2.keySet(); Iterator<Integer> iterator = indexes.iterator(); while (iterator.hasNext()) { int value = iterator.next(); String val = null; //Checks sure the index of BioR is within the history if (value < history.size()) { val = history.get(value); if (val != null && !val.isEmpty() && !val.contentEquals(".") && !val.startsWith("{")) { String newValue = history.get(7).concat(infoDataPair( biorindexes2.get(value), val )); //TODO: potential performance issue! if (newValue.startsWith(".;")) history.set(7, newValue.replaceFirst(".;", "")); else history.set(7, newValue); } } } // history = removeColumns(history,biorindexes2); return history; } /** * returns the position of the last ##INFO row in the header * @param originalHeader all of the lines int the original header * @return */ public int lastInfoLineNumber(List<String> originalHeader){ int last = 0; int count =0; for(String headerLine : originalHeader){ if(headerLine.startsWith("##INFO")){ last = count; } count++; } if(last == 0){ return originalHeader.size() -1; } return last+1; } }