package org.molgenis.data.vcf.utils; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import com.google.common.collect.PeekingIterator; import com.google.common.io.BaseEncoding; import org.apache.commons.lang3.StringUtils; import org.molgenis.data.Entity; import org.molgenis.data.MolgenisDataException; import org.molgenis.data.meta.AttributeType; import org.molgenis.data.meta.model.Attribute; import org.molgenis.data.meta.model.AttributeFactory; import org.molgenis.data.meta.model.EntityType; import org.molgenis.data.meta.model.EntityTypeFactory; import org.molgenis.data.support.DynamicEntity; import org.molgenis.data.vcf.VcfRepository; import org.molgenis.data.vcf.datastructures.Sample; import org.molgenis.data.vcf.datastructures.Trio; import org.molgenis.vcf.meta.VcfMetaInfo; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import java.io.FileNotFoundException; import java.nio.charset.Charset; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; import static org.molgenis.data.meta.AttributeType.*; import static org.molgenis.data.vcf.model.VcfAttributes.*; import static org.molgenis.data.vcf.utils.VcfWriterUtils.VARIANT; @Component public class VcfUtils { @Autowired private EntityTypeFactory entityTypeFactory; @Autowired private AttributeFactory attributeFactory; /** * Creates a internal molgenis id from a vcf entity * * @param vcfEntity * @return the id */ public static String createId(Entity vcfEntity) { String idStr = StringUtils.strip(vcfEntity.get(CHROM).toString()) + "_" + StringUtils .strip(vcfEntity.get(POS).toString()) + "_" + StringUtils.strip(vcfEntity.get(REF).toString()) + "_" + StringUtils.strip(vcfEntity.get(ALT).toString()) + "_" + StringUtils .strip(vcfEntity.get(ID).toString()) + "_" + StringUtils .strip(vcfEntity.get(QUAL) != null ? vcfEntity.get(QUAL).toString() : "") + "_" + StringUtils .strip(vcfEntity.get(FILTER) != null ? vcfEntity.get(FILTER).toString() : ""); // use MD5 hash to prevent ids that are too long MessageDigest messageDigest; try { messageDigest = MessageDigest.getInstance("MD5"); } catch (NoSuchAlgorithmException e) { throw new RuntimeException(e); } byte[] md5Hash = messageDigest.digest(idStr.getBytes(Charset.forName("UTF-8"))); // convert MD5 hash to string ids that can be safely used in URLs return BaseEncoding.base64Url().omitPadding().encode(md5Hash); } public static String getIdFromInfoField(String line) { int idStartIndex = line.indexOf("ID=") + 3; int idEndIndex = line.indexOf(','); return line.substring(idStartIndex, idEndIndex); } public static List<Attribute> getAtomicAttributesFromList(Iterable<Attribute> outputAttrs) { List<Attribute> result = new ArrayList<>(); for (Attribute attribute : outputAttrs) { if (attribute.getDataType() == COMPOUND) { result.addAll(getAtomicAttributesFromList(attribute.getChildren())); } else { result.add(attribute); } } return result; } public static Map<String, Attribute> getAttributesMapFromList(Iterable<Attribute> outputAttrs) { Map<String, Attribute> attributeMap = new LinkedHashMap<>(); List<Attribute> attributes = getAtomicAttributesFromList(outputAttrs); for (Attribute attribute : attributes) { attributeMap.put(attribute.getName(), attribute); } return attributeMap; } protected static String toVcfDataType(AttributeType dataType) { switch (dataType) { case BOOL: return VcfMetaInfo.Type.FLAG.toString(); case LONG: case DECIMAL: return VcfMetaInfo.Type.FLOAT.toString(); case INT: return VcfMetaInfo.Type.INTEGER.toString(); case EMAIL: case ENUM: case HTML: case HYPERLINK: case STRING: case TEXT: case DATE: case DATE_TIME: case CATEGORICAL: case XREF: case CATEGORICAL_MREF: case MREF: case ONE_TO_MANY: return VcfMetaInfo.Type.STRING.toString(); case COMPOUND: case FILE: throw new RuntimeException("invalid vcf data type " + dataType); default: throw new RuntimeException("unsupported vcf data type " + dataType); } } /** * Get pedigree data from VCF Now only support child, father, mother No fancy data structure either Output: * result.put(childID, Arrays.asList(new String[]{motherID, fatherID})); * * @param inputVcfFileScanner * @return * @throws FileNotFoundException */ public static HashMap<String, Trio> getPedigree(Scanner inputVcfFileScanner) { HashMap<String, Trio> result = new HashMap<>(); while (inputVcfFileScanner.hasNextLine()) { String line = inputVcfFileScanner.nextLine(); // quit when we don't see header lines anymore if (!line.startsWith(VcfRepository.PREFIX)) { break; } // detect pedigree line // expecting e.g. ##PEDIGREE=<Child=100400,Mother=100402,Father=100401> if (line.startsWith("##PEDIGREE")) { System.out.println("Pedigree data line: " + line); String childID = null; String motherID = null; String fatherID = null; String lineStripped = line.replace("##PEDIGREE=<", "").replace(">", ""); String[] lineSplit = lineStripped.split(",", -1); for (String element : lineSplit) { if (element.startsWith("Child")) { childID = element.replace("Child=", ""); } else if (element.startsWith("Mother")) { motherID = element.replace("Mother=", ""); } else if (element.startsWith("Father")) { fatherID = element.replace("Father=", ""); } else { throw new MolgenisDataException( "Expected Child, Mother or Father, but found: " + element + " in line " + line); } } if (childID != null && motherID != null && fatherID != null) { // good result.put(childID, new Trio(new Sample(childID), new Sample(motherID), new Sample(fatherID))); } else { throw new MolgenisDataException("Missing Child, Mother or Father ID in line " + line); } } } return result; } }