package org.molgenis.data.vcf.utils; import com.google.common.collect.Lists; import org.apache.commons.lang3.StringUtils; import org.molgenis.data.Entity; import org.molgenis.data.MolgenisDataException; import org.molgenis.data.MolgenisInvalidFormatException; import org.molgenis.data.meta.model.Attribute; import org.molgenis.data.meta.model.EntityType; import org.molgenis.data.vcf.VcfRepository; import org.molgenis.data.vcf.model.VcfAttributes; import java.io.BufferedWriter; import java.io.File; import java.io.IOException; import java.util.*; import java.util.stream.Collectors; import java.util.stream.StreamSupport; import static com.google.common.base.Joiner.on; import static com.google.common.base.Strings.isNullOrEmpty; import static com.google.common.collect.Iterables.transform; import static org.molgenis.data.meta.AttributeType.BOOL; import static org.molgenis.data.support.EntityTypeUtils.isReferenceType; import static org.molgenis.data.vcf.VcfRepository.DEFAULT_ATTRIBUTE_DESCRIPTION; import static org.molgenis.data.vcf.model.VcfAttributes.*; public class VcfWriterUtils { public static final String VARIANT = "VARIANT"; public static final String EFFECT = "EFFECT"; private static final char ANNOTATION_FIELD_SEPARATOR = ';'; private static final String SPACE_PIPE_SEPERATOR = " | "; private static final LinkedList<String> VCF_ATTRIBUTE_NAMES = new LinkedList<>( Arrays.asList(CHROM, POS, ID, REF, ALT, QUAL, FILTER)); private static final char PIPE_SEPARATOR = '|'; /** * Convert an vcfEntity to a VCF line Only output attributes that are in the attributesToInclude list, or all if * attributesToInclude is empty * * @param inputVcfFile * @param outputVCFWriter * @param addedAttributes * @throws IOException,MolgenisInvalidFormatException */ public static void writeVcfHeader(File inputVcfFile, BufferedWriter outputVCFWriter, List<Attribute> addedAttributes) throws MolgenisInvalidFormatException, IOException { writeVcfHeader(inputVcfFile, outputVCFWriter, addedAttributes, Collections.emptyList()); } /** * Checks for previous annotations * * @param inputVcfFile * @param outputVCFWriter * @param addedAttributes * @param attributesToInclude , the Attribute to write to the VCF file, if empty writes all attributes * @return * @throws MolgenisInvalidFormatException * @throws IOException */ public static void writeVcfHeader(File inputVcfFile, BufferedWriter outputVCFWriter, List<Attribute> addedAttributes, List<String> attributesToInclude) throws MolgenisInvalidFormatException, IOException { System.out.println("Detecting VCF column header..."); Scanner inputVcfFileScanner = new Scanner(inputVcfFile, "UTF-8"); String line = inputVcfFileScanner.nextLine(); Map<String, String> infoHeaderLinesMap = new LinkedHashMap<>(); if (line.startsWith(VcfRepository.PREFIX)) { line = processHeaders(outputVCFWriter, inputVcfFileScanner, line, infoHeaderLinesMap); System.out.println("\nHeader line found:\n" + line); checkColumnHeaders(outputVCFWriter, inputVcfFileScanner, line); writeInfoHeaders(outputVCFWriter, addedAttributes, attributesToInclude, infoHeaderLinesMap); writeColumnHeaders(outputVCFWriter, line); } else { outputVCFWriter.close(); inputVcfFileScanner.close(); throw new MolgenisInvalidFormatException( "Did not find ## on the first line, are you sure it is a VCF file?"); } inputVcfFileScanner.close(); } /** * Overload of writeToVcf to support a simpler call with only Entity and Writer. * * @param vcfEntity * @param writer * @throws MolgenisDataException * @throws IOException */ public static void writeToVcf(Entity vcfEntity, BufferedWriter writer) throws MolgenisDataException, IOException { writeToVcf(vcfEntity, new ArrayList<>(), new ArrayList<>(), writer); } /** * Convert an vcfEntity to a VCF line Only output attributes that are in the attributesToInclude list, or all if * attributesToInclude is empty * * @param vcfEntity * @param addedAttributes * @param attributesToInclude * @param writer * @throws IOException,Exception */ public static void writeToVcf(Entity vcfEntity, List<Attribute> addedAttributes, List<String> attributesToInclude, BufferedWriter writer) throws MolgenisDataException, IOException { addStandardFieldsToVcf(vcfEntity, writer); writeInfoData(vcfEntity, writer, addedAttributes, attributesToInclude); // if we have SAMPLE data, add to output VCF Iterable<Entity> sampleEntities = vcfEntity.getEntities(SAMPLES); if (sampleEntities != null) { addSampleEntitiesToVcf(sampleEntities, writer); } } // **************** // * Parse header * // **************** private static String processHeaders(BufferedWriter outputVCFWriter, Scanner inputVcfFileScanner, String line, Map<String, String> infoHeaderLinesMap) throws IOException { while (inputVcfFileScanner.hasNextLine()) { if (line.startsWith(VcfRepository.PREFIX + VcfAttributes.INFO)) { infoHeaderLinesMap.put(VcfUtils.getIdFromInfoField(line), line); } else if (line.startsWith(VcfRepository.PREFIX)) { outputVCFWriter.write(line); outputVCFWriter.newLine(); } else { break; } line = inputVcfFileScanner.nextLine(); System.out.print("."); } return line; } private static void checkColumnHeaders(BufferedWriter outputVCFWriter, Scanner inputVcfFileScanner, String line) throws IOException, MolgenisInvalidFormatException { if (!line.startsWith(CHROM)) { outputVCFWriter.close(); inputVcfFileScanner.close(); throw new MolgenisInvalidFormatException( "Header does not start with #CHROM, are you sure it is a VCF file?"); } } // **************** // * Write header * // **************** private static void writeColumnHeaders(BufferedWriter outputVCFWriter, String line) throws IOException { outputVCFWriter.write(line); outputVCFWriter.newLine(); } private static void writeInfoHeaders(BufferedWriter outputVCFWriter, List<Attribute> annotatorAttributes, List<String> attributesToInclude, Map<String, String> infoHeaderLinesMap) throws IOException { Map<String, Attribute> annotatorAttributesMap = VcfUtils.getAttributesMapFromList(annotatorAttributes); writeExistingInfoHeaders(outputVCFWriter, infoHeaderLinesMap, annotatorAttributesMap); writeAddedInfoHeaders(outputVCFWriter, attributesToInclude, annotatorAttributesMap, infoHeaderLinesMap); } private static void writeAddedInfoHeaders(BufferedWriter outputVCFWriter, List<String> attributesToInclude, Map<String, Attribute> annotatorAttributes, Map<String, String> infoHeaderLinesMap) throws IOException { for (Attribute annotatorInfoAttr : annotatorAttributes.values()) { if (attributesToInclude.isEmpty() || attributesToInclude.contains(annotatorInfoAttr.getName()) || isReferenceType(annotatorInfoAttr)) { outputVCFWriter .write(createInfoStringFromAttribute(annotatorAttributes.get(annotatorInfoAttr.getName()), attributesToInclude, infoHeaderLinesMap.get(annotatorInfoAttr.getName()))); outputVCFWriter.newLine(); } } } private static String createInfoStringFromAttribute(Attribute infoAttribute, List<String> attributesToInclude, String currentInfoField) { String attributeName = infoAttribute.getName(); StringBuilder sb = new StringBuilder(); sb.append("##INFO=<ID="); sb.append(attributeName); // FIXME: once we support list of primitives we can calculate based on combination of type and nillable sb.append(",Number=."); sb.append(",Type="); sb.append(VcfUtils.toVcfDataType(infoAttribute.getDataType())); sb.append(",Description=\""); // http://samtools.github.io/hts-specs/VCFv4.1.pdf --> "The Description // value must be surrounded by double-quotes. Double-quote character can be escaped with backslash \ and // backslash as \\." if (StringUtils.isBlank(infoAttribute.getDescription())) { if (isReferenceType(infoAttribute) && !attributeName.equals(SAMPLES)) { String currentAttributesString = currentInfoField != null ? currentInfoField .substring((currentInfoField.indexOf("'") + 1), currentInfoField.lastIndexOf("'")) : ""; writeRefAttributePartsToInfoDescription(infoAttribute, attributesToInclude, attributeName, sb, currentAttributesString); } else { sb.append(DEFAULT_ATTRIBUTE_DESCRIPTION); } } else { sb.append(infoAttribute.getDescription().replace("\"", "\\\"").replace("\n", " ")); } sb.append("\">"); return sb.toString(); } private static void writeRefAttributePartsToInfoDescription(Attribute infoAttribute, List<String> attributesToInclude, String attributeName, StringBuilder sb, String existingAttributes) { Iterable<Attribute> atomicAttributes = infoAttribute.getRefEntity().getAtomicAttributes(); sb.append(attributeName); sb.append(" annotations: '"); if (!existingAttributes.isEmpty()) { sb.append(existingAttributes); sb.append(" | "); } sb.append( refAttributesToString(atomicAttributes, attributesToInclude).replace("\\", "\\\\").replace("\"", "\\\"") .replace("\n", " ")); sb.append("'"); } private static String refAttributesToString(Iterable<Attribute> atomicAttributes, List<String> attributesToInclude) { Iterable<Attribute> attributes = StreamSupport.stream(atomicAttributes.spliterator(), false) .filter(attribute -> (attribute.isVisible() && isOutputAttribute(attribute, Lists.newArrayList(atomicAttributes), attributesToInclude))).collect(Collectors.toList()); return on(SPACE_PIPE_SEPERATOR).join(transform(attributes, Attribute::getName)); } private static void writeExistingInfoHeaders(BufferedWriter outputVCFWriter, Map<String, String> infoHeaderLinesMap, Map<String, Attribute> annotatorAttributes) throws IOException { for (String infoHeaderFieldKey : infoHeaderLinesMap.keySet()) { if (!annotatorAttributes.containsKey(infoHeaderFieldKey)) { outputVCFWriter.write(infoHeaderLinesMap.get(infoHeaderFieldKey)); outputVCFWriter.newLine(); } } } // *************** // * Write data * // *************** private static void addStandardFieldsToVcf(Entity vcfEntity, BufferedWriter writer) throws IOException { for (String attribute : VCF_ATTRIBUTE_NAMES) { Object value = vcfEntity.get(attribute); String stringValue = "."; if (value != null) stringValue = value.toString(); if (stringValue.isEmpty()) stringValue = "."; writer.write(stringValue); writer.write('\t'); } } private static void writeInfoData(Entity vcfEntity, BufferedWriter writer, List<Attribute> annotatorAttributes, List<String> attributesToInclude) throws IOException { boolean hasInfoFields = false; for (Attribute attribute : StreamSupport .stream(vcfEntity.getEntityType().getAllAttributes().spliterator(), false) .filter(attr -> !(VCF_ATTRIBUTE_NAMES.contains(attr.getName()) || attr.getName().equals(INFO))) .collect(Collectors.toList())) { if (isOutputAttribute(attribute, annotatorAttributes, attributesToInclude)) { hasInfoFields = writeSingleInfoField(vcfEntity, writer, hasInfoFields, attribute); } } String refEntityAttributesInfoFields = parseRefAttributesToDataString(vcfEntity, annotatorAttributes, attributesToInclude); if (!isNullOrEmpty(refEntityAttributesInfoFields)) { writer.append(refEntityAttributesInfoFields); hasInfoFields = true; } if (!hasInfoFields) { writer.append('.'); } } private static String parseRefAttributesToDataString(Entity vcfEntity, List<Attribute> annotatorAttributes, List<String> attributesToInclude) { Iterable<Attribute> attributes = vcfEntity.getEntityType().getAllAttributes(); StringBuilder refEntityInfoFields = new StringBuilder(); for (Attribute attribute : attributes) { String attributeName = attribute.getName(); if (isReferenceType(attribute) && !attributeName.equals(SAMPLES)) { // If the MREF field is empty, no effects were found, so we do not add an EFFECT field to this entity if (vcfEntity.get(attributeName) != null && isOutputAttribute(attribute, annotatorAttributes, attributesToInclude)) { parseRefFieldsToInfoField(vcfEntity.getEntities(attributeName), attribute, refEntityInfoFields, annotatorAttributes, attributesToInclude); } } } return refEntityInfoFields.toString(); } private static boolean writeSingleInfoField(Entity vcfEntity, BufferedWriter writer, boolean hasInfoFields, Attribute attribute) throws IOException { String infoAttrName = attribute.getName(); if (attribute.getDataType() == BOOL) { Boolean infoAttrBoolValue = vcfEntity.getBoolean(infoAttrName); if (infoAttrBoolValue != null && infoAttrBoolValue) { writer.append(infoAttrName); writer.append(ANNOTATION_FIELD_SEPARATOR); hasInfoFields = true; } } else if (!isReferenceType(attribute)) { Object infoAttrStringValue = vcfEntity.get(infoAttrName); if (infoAttrStringValue != null) { writer.append(infoAttrName); writer.append('='); writer.append(infoAttrStringValue.toString()); writer.append(ANNOTATION_FIELD_SEPARATOR); hasInfoFields = true; } } return hasInfoFields; } /** * Create a INFO field annotation and add values */ private static void parseRefFieldsToInfoField(Iterable<Entity> refEntities, Attribute attribute, StringBuilder refEntityInfoFields, List<Attribute> annotatorAttributes, List<String> attributesToInclude) { boolean secondValuePresent = false; for (Entity refEntity : refEntities) { Iterable<Attribute> refAttributes = refEntity.getEntityType().getAttributes(); if (!secondValuePresent) { refEntityInfoFields.append(attribute.getName()); refEntityInfoFields.append("="); addEntityValuesToRefEntityInfoField(refEntityInfoFields, refEntity, refAttributes, annotatorAttributes, attributesToInclude); } else { refEntityInfoFields.append(","); addEntityValuesToRefEntityInfoField(refEntityInfoFields, refEntity, refAttributes, annotatorAttributes, attributesToInclude); } secondValuePresent = true; } refEntityInfoFields.append(ANNOTATION_FIELD_SEPARATOR); } /** * Add the values of each EFFECT entity to the info field */ private static void addEntityValuesToRefEntityInfoField(StringBuilder refEntityInfoFields, Entity refEntity, Iterable<Attribute> refAttributes, List<Attribute> annotatorAttributes, List<String> attributesToInclude) { boolean previousValuePresent = false; for (Attribute refAttribute : refAttributes) { if (refAttribute.isVisible() && !isReferenceType(refAttribute) && isOutputAttribute(refAttribute, annotatorAttributes, attributesToInclude)) { if (previousValuePresent) refEntityInfoFields.append(PIPE_SEPARATOR); String value = refEntity.getString(refAttribute.getName()) == null ? "" : refEntity .getString(refAttribute.getName()); refEntityInfoFields.append(value); previousValuePresent = true; } } } // ***************** // * Write samples * // ***************** private static void addSampleEntitiesToVcf(Iterable<Entity> sampleEntities, BufferedWriter writer) throws IOException { boolean first = true; for (Entity sample : sampleEntities) { writer.append('\t'); if (first) { writeFormatString(writer, sample); } writeSampleData(writer, sample); first = false; } } private static void writeSampleData(BufferedWriter writer, Entity sample) throws IOException { StringBuilder sampleColumn = new StringBuilder(); if (sample.getEntityType().getAttribute(FORMAT_GT) != null) { String sampleAttrValue = sample.getString(FORMAT_GT); if (sampleAttrValue != null) { sampleColumn.append(sampleAttrValue); } else { sampleColumn.append("."); } } EntityType entityType = sample.getEntityType(); for (Attribute sampleAttribute : entityType.getAttributes()) { String sampleAttributeName = sampleAttribute.getName(); if (!sampleAttributeName.equals(FORMAT_GT) && !sampleAttributeName.equals(VcfRepository.ORIGINAL_NAME)) { // skip the field that were generated for the use of the entity within molgenis if (!sampleAttribute.equals(entityType.getIdAttribute()) && !sampleAttribute .equals(entityType.getLabelAttribute())) { if (sampleColumn.length() != 0) sampleColumn.append(":"); Object sampleAttrValue = sample.get(sampleAttributeName); if (sampleAttrValue != null) { sampleColumn.append(sampleAttrValue.toString()); } else { sampleColumn.append("."); } } } } writer.write(sampleColumn.toString()); } private static void writeFormatString(BufferedWriter writer, Entity sample) throws IOException { StringBuilder formatColumn = new StringBuilder(); // write GT first if available if (sample.getEntityType().getAttribute(FORMAT_GT) != null) { formatColumn.append(FORMAT_GT); } EntityType entityType = sample.getEntityType(); for (Attribute sampleAttribute : entityType.getAttributes()) { String sampleAttributeName = sampleAttribute.getName(); if (!sampleAttributeName.equals(FORMAT_GT) && !sampleAttributeName.equals(VcfRepository.ORIGINAL_NAME)) { // skip the field that were generated for the use of the entity within molgenis if (!sampleAttribute.equals(entityType.getIdAttribute()) && !sampleAttribute .equals(entityType.getLabelAttribute())) { if (formatColumn.length() != 0) formatColumn.append(':'); formatColumn.append(sampleAttributeName); } } } if (formatColumn.length() > 0) { formatColumn.append('\t'); writer.write(formatColumn.toString()); } else { throw new MolgenisDataException("Missing FORMAT information while trying to print first sample"); } } // ********************* // * Utility functions * // ********************* private static boolean isOutputAttribute(Attribute attribute, List<Attribute> addedAttributes, List<String> attributesToInclude) { List<Attribute> expandedAddedAttributes = new ArrayList<>(); for (Attribute annotatorAttr : addedAttributes) { if (isReferenceType(annotatorAttr)) expandedAddedAttributes.addAll(Lists.newArrayList(annotatorAttr.getRefEntity().getAtomicAttributes())); else expandedAddedAttributes.add(annotatorAttr); } List<String> annotatorAttributeNames = expandedAddedAttributes.stream().map(Attribute::getName) .collect(Collectors.toList()); // always write all fields that were not added by this annotation run. // else write the field if it was specified or if nothing was sepcified at all. return (!annotatorAttributeNames.contains(attribute.getName()) || attributesToInclude .contains(attribute.getName()) || attributesToInclude.isEmpty()) && attribute.isVisible() && !attribute .getName().equals(VcfAttributes.SAMPLES); } }