package org.molgenis.data.annotation.core.filter; import com.google.common.base.Optional; import com.google.common.collect.*; import org.molgenis.data.Entity; import org.molgenis.data.MolgenisDataException; import org.molgenis.data.annotation.core.datastructures.Location; import org.molgenis.data.annotation.core.entity.ResultFilter; import org.molgenis.data.meta.model.Attribute; import org.molgenis.data.support.EntityTypeUtils; import org.molgenis.data.vcf.model.VcfAttributes; import java.util.*; import static com.google.common.collect.FluentIterable.from; import static java.util.Arrays.asList; import static org.molgenis.data.vcf.model.VcfAttributes.ALT; import static org.molgenis.data.vcf.model.VcfAttributes.REF; /** * TODO: Support multi-allelic combination fields. These fields contain not only info for ref-alt pairs, but also values * for all possible alt-alt pairs. For example, the "AC_Het" field in ExAC: * <p> * For 2 alt alleles, there are 3 values in the AC_Het field, the third being the count for T-A: 1 6536051 . C T,A * 11129.51 PASS AC_Adj=5,3;AC_Het=5,3,0;AC_Hom=0,0; Alt-allele combinations in AC_Het field occur in the following * order: C-T, C-A, T-A. * <p> * For 3 alt alleles, there are 6 values (3+2+1): 15 66641732 rs2063690 G C,A,T 35371281.87 PASS * AC_Adj=13570,3,2;AC_Het=11380,1,2,2,0,0;AC_Hom=1094,0,0; Alt-allele combinations in AC_Het field occur in the * following order: G-C, G-A, G-T, C-A, C-T, A-T. * <p> * For 4 alt alleles, there are 10 values (4+3+2+1): 21 45650009 rs3831401 T G,C,TG,A 8366813.26 PASS * AC_Adj=2528,3415,1,0;AC_Het=934,1240,0,0,725,1,0,0,0,0;AC_Hom=434,725,0,0; Alt-allele combinations in AC_Het field * occur in the following order: T-G, T-C, T-TG, T-A, G-C, G-TG, G-A, C-TG, C-A, TG-A. * <p> * <p> * <p> * TODO: Smart matching of non-obvious ref and alt alleles. Right now, we only 'string match' the exact values of ref * and alt alleles. However, depending on which other genotypes you call for a certain genomic position, the same * variant may be denoted in a different way. For example: "1 231094050 GA G" is the same variant as the GAA/GA in * "1 231094050 GAA GA,G", but to allow notation of the the AA deletion (in GAA/G), the ref was written as GAA instead * of GA. * <p> * This can be tricky. Consider this variant: 1 6529182 . TTCCTCC TTCC * <p> * And after some puzzling, you will find that it is seen in ExAC: 1 6529182 . TTCCTCCTCC * TTCCTCC,TTCC,T,TTCCTCCTCCTCC,TTCCTCCTCCTCCTCC,TTCCTCCTCCTCCTCCTCCTCC * <p> * But here denoted as "TTCCTCCTCC/TTCCTCC". In both cases, a TCC was deleted, but in ExAC this variant is trailed with * another TCC. Finding and parsing these variants to correctly match them against databases such as 1000 Genomes and * ExAC would be very valuable. */ public class MultiAllelicResultFilter implements ResultFilter { private final List<Attribute> attributes; private final boolean mergeMultilineResourceResults; private final VcfAttributes vcfAttributes; public MultiAllelicResultFilter(List<Attribute> alleleSpecificAttributes, boolean mergeMultilineResourceResults, VcfAttributes vcfAttributes) { this.attributes = alleleSpecificAttributes; this.mergeMultilineResourceResults = mergeMultilineResourceResults; this.vcfAttributes = vcfAttributes; } public MultiAllelicResultFilter(List<Attribute> alleleSpecificAttributes, VcfAttributes vcfAttributes) { this.attributes = alleleSpecificAttributes; this.mergeMultilineResourceResults = false; this.vcfAttributes = vcfAttributes; } @Override public Collection<Attribute> getRequiredAttributes() { return asList(vcfAttributes.getRefAttribute(), vcfAttributes.getAltAttribute()); } @Override public Optional<Entity> filterResults(Iterable<Entity> resourceEntities, Entity sourceEntity, boolean updateMode) { List<Entity> processedResults = new ArrayList<>(); String sourceRef = sourceEntity.getString(REF); if (sourceRef == null) { return Optional.absent(); } if (mergeMultilineResourceResults) { resourceEntities = merge(resourceEntities); } for (Entity resourceEntity : resourceEntities) { String resourceRef = resourceEntity.getString(REF); if (resourceRef.equals(sourceRef)) { processedResults.addAll(filter(sourceEntity, resourceEntity, "", "", updateMode)); } // example: ref AGG, input A, substring AGG from index 1, so GG is the postfix to use to match against this // reference else if (resourceRef.startsWith(sourceRef)) { String postFix = resourceRef.substring(sourceRef.length()); processedResults.addAll(filter(sourceEntity, resourceEntity, postFix, "", updateMode)); } // example: ref T, input TC, substring TC from index 1, so C is the postfix to use to match against this // input else if (sourceRef.startsWith(resourceRef)) { String postFix = sourceRef.substring(resourceRef.length()); processedResults.addAll(filter(sourceEntity, resourceEntity, "", postFix, updateMode)); } } return from(processedResults).first(); } private List<Entity> filter(Entity sourceEntity, Entity resourceEntity, String sourcePostfix, String resourcePostfix, boolean updateMode) { List<Entity> result = Lists.newArrayList(); Map<String, String> alleleValueMap = new HashMap<>(); Map<String, String> sourceAlleleValueMap = new HashMap<>(); String[] alts = resourceEntity.getString(VcfAttributes.ALT).split(","); String[] sourceAlts = sourceEntity.getString(VcfAttributes.ALT).split(","); for (Attribute attribute : attributes) { String[] values = resourceEntity.getString(attribute.getName()).split(",", -1); for (int i = 0; i < alts.length; i++) { alleleValueMap.put(alts[i] + resourcePostfix, values[i]); } // also compile a list of original source alleles and their values for use in 'update mode' if (updateMode && sourceEntity.get(attribute.getName()) != null) { Attribute sourceAttr = sourceEntity.getEntityType().getAttribute(attribute.getName()); if (EntityTypeUtils.isTextType(sourceAttr) || EntityTypeUtils.isStringType(sourceAttr)) { String[] sourceValues = sourceEntity.getString(attribute.getName()).split(",", -1); for (int i = 0; i < sourceAlts.length; i++) { sourceAlleleValueMap.put(sourceAlts[i] + resourcePostfix, sourceValues[i].isEmpty() ? "." : sourceValues[i]); } } else if (sourceAlts.length == 1) { sourceAlleleValueMap.put(sourceAlts[0], sourceEntity.get(attribute.getName()).toString()); } } StringBuilder newAttributeValue = new StringBuilder(); String[] annotatedEntityAltAlleles = sourceEntity.getString(ALT).split(","); for (int i = 0; i < annotatedEntityAltAlleles.length; i++) { if (i != 0) { newAttributeValue.append(","); } if (alleleValueMap.get(annotatedEntityAltAlleles[i] + sourcePostfix) != null) { newAttributeValue.append(alleleValueMap.get(annotatedEntityAltAlleles[i] + sourcePostfix)); } else { // default: no data in in resource, add "." for missing value // because we are not in update mode, don't check the original source if (updateMode == false) { newAttributeValue.append("."); } else { // we are in update mode, so let's check the source entity if there was an original value if (sourceAlleleValueMap.get(annotatedEntityAltAlleles[i] + sourcePostfix) != null) { newAttributeValue .append(sourceAlleleValueMap.get(annotatedEntityAltAlleles[i] + sourcePostfix)); } // if there was no original value either, add "." for missing value else { newAttributeValue.append("."); } } } } // add entity only if something was found, so no '.' or any multiple of '.,' (e.g. ".,.,.") if (!newAttributeValue.toString().matches("[\\.,]+")) { resourceEntity.set(attribute.getName(), newAttributeValue.toString()); result.add(resourceEntity); } } return result; } /** * Combine ALT information per reference allele (in VCF there is only 1 reference by letting ALT vary, but that * might not always be the case) * <p> * So we want to support this hypothetical example: * 3 300 G A 0.2|23.1 * 3 300 G T -2.4|0.123 * 3 300 G X -0.002|2.3 * 3 300 G C 0.5|14.5 * 3 300 GC A 0.2|23.1 * 3 300 GC T -2.4|0.123 * 3 300 C GX -0.002|2.3 * 3 300 C GC 0.5|14.5 * <p> * <p> * So we want to support this hypothetical example: 3 300 G A 0.2|23.1 3 300 G T -2.4|0.123 3 300 G X -0.002|2.3 3 * 300 G C 0.5|14.5 3 300 GC A 0.2|23.1 3 300 GC T -2.4|0.123 3 300 C GX -0.002|2.3 3 300 C GC 0.5|14.5 * <p> * and it should become: * <p> * 3 300 G A,T,X,C 0.2|23.1,-2.4|0.123,-0.002|2.3,0.5|14.5 3 300 GC A,T 0.2|23.1,-2.4|0.123 3 300 C GX,GC * -0.002|2.3,0.5|14.5 * <p> * <p> * 3 300 G A,T,X,C 0.2|23.1,-2.4|0.123,-0.002|2.3,0.5|14.5 * 3 300 GC A,T 0.2|23.1,-2.4|0.123 * 3 300 C GX,GC -0.002|2.3,0.5|14.5 * <p> * so that the multi-allelic filter can then find back the appropriate values as if it were a multi-allelic VCF line */ public Iterable<Entity> merge(Iterable<Entity> resourceEntities) { ArrayList<Entity> resourceEntitiesMerged = new ArrayList<Entity>(); PeekingIterator<Entity> resourceEntitiesIterator = Iterators.peekingIterator(resourceEntities.iterator()); if (!resourceEntitiesIterator.hasNext()) { return resourceEntitiesMerged; } Location location = Location.create(resourceEntitiesIterator.peek()); // collect entities to be merged by ref Multimap<String, Entity> refToMergedEntity = LinkedListMultimap.create(); while (resourceEntitiesIterator.hasNext()) { Entity resourceEntity = resourceEntitiesIterator.next(); // verify if all results have the same chrom & pos Location thisLoc = Location.create(resourceEntity); // at least chrom and pos have to be the same, ref may be different if (!location.equals(thisLoc)) { throw new MolgenisDataException("Mismatch in location! " + location + " vs " + thisLoc); } // add to map by ref, so we get [ref -> entities to be merged into one] refToMergedEntity.put(resourceEntity.getString(REF), resourceEntity); } // now iterate over map with refs and merge entities per ref for (String refKey : refToMergedEntity.keySet()) { boolean first = true; Entity mergeWithMe = null; for (Entity entityToBeMerged : refToMergedEntity.get(refKey)) { if (first) { // merge all following entities with the first one mergeWithMe = entityToBeMerged; first = false; } else { // concatenate alleles mergeWithMe.set(ALT, mergeWithMe.get(ALT).toString() + "," + entityToBeMerged.get(ALT).toString()); // concatenate allele specific attributes for (Attribute alleleSpecificAttributes : attributes) { String attrName = alleleSpecificAttributes.getName(); mergeWithMe.set(attrName, mergeWithMe.get(attrName).toString() + "," + entityToBeMerged.get(attrName).toString()); } } } resourceEntitiesMerged.add(mergeWithMe); } return resourceEntitiesMerged; } }