package org.molgenis.annotation.cmd.conversion; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import com.google.common.collect.PeekingIterator; import org.apache.commons.lang3.StringUtils; import org.molgenis.data.Entity; import org.molgenis.data.meta.AttributeType; import org.molgenis.data.meta.model.Attribute; import org.molgenis.data.meta.model.AttributeFactory; import org.molgenis.data.meta.model.EntityType; import org.molgenis.data.meta.model.EntityTypeFactory; import org.molgenis.data.support.DynamicEntity; import org.molgenis.data.vcf.VcfRepository; import org.molgenis.data.vcf.utils.VcfUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.stream.StreamSupport; import static org.molgenis.data.meta.AttributeType.MREF; import static org.molgenis.data.meta.AttributeType.STRING; import static org.molgenis.data.meta.AttributeType.XREF; import static org.molgenis.data.vcf.utils.VcfWriterUtils.EFFECT; import static org.molgenis.data.vcf.utils.VcfWriterUtils.VARIANT; /** * This class is used to convert from (MOLGENIS)SnpEff annotated VCF files * to the entity structure expected by Effects annotators (for example GAVIN), and the other way around * <p> * An effect is an entity containing information about the combination of a variant (CHROM POS REF ALT(single allele)) in a specific GENE */ @Component public class EffectStructureConverter { private EntityTypeFactory entityTypeFactory; private AttributeFactory attributeFactory; @Autowired public EffectStructureConverter(EntityTypeFactory entityTypeFactory, AttributeFactory attributeFactory) { this.entityTypeFactory = entityTypeFactory; this.attributeFactory = attributeFactory; } public Iterator<Entity> createVcfEntityStructure(Iterator<Entity> annotatedRecords) { return new Iterator<Entity>() { final PeekingIterator<Entity> effects = Iterators.peekingIterator(annotatedRecords); EntityType vcfVariantEntityType; EntityType effectEntityType; private void createResultEntityType(Entity effect, EntityType variantEMD) { if (vcfVariantEntityType == null || effectEntityType == null) { effectEntityType = effect.getEntityType(); vcfVariantEntityType = EntityType.newInstance(variantEMD); vcfVariantEntityType.addAttribute( attributeFactory.create().setName(EFFECT).setDataType(MREF).setRefEntity(effectEntityType)); } } @Override public boolean hasNext() { return effects.hasNext(); } @Override public Entity next() { Entity variant = null; String peekedId; List<Entity> effectsForVariant = Lists.newArrayList(); while (effects.hasNext()) { peekedId = effects.peek().getEntity(VARIANT).getIdValue().toString(); if (variant == null || variant.getIdValue().toString().equals(peekedId)) { Entity effect = effects.next(); variant = effect.getEntity(VARIANT); effectsForVariant.add(effect); } else { return createVcfEntityStructureForSingleEntity(variant, effectsForVariant); } } return createVcfEntityStructureForSingleEntity(variant, effectsForVariant); } private Entity createVcfEntityStructureForSingleEntity(Entity variant, List<Entity> effectsForVariant) { createResultEntityType(effectsForVariant.get(0), variant.getEntityType()); Entity newVariant = new DynamicEntity(vcfVariantEntityType); newVariant.set(variant); if (effectsForVariant.size() > 1) { newVariant.set(EFFECT, effectsForVariant); } else { // is this an empty effect entity? Entity effectForVariant = effectsForVariant.get(0); if (!isEmptyEffectEntity(effectForVariant)) newVariant.set(EFFECT, effectsForVariant); } return newVariant; } private boolean isEmptyEffectEntity(Entity effectEntity) { boolean isEmpty = true; for (Attribute effectAttribute : effectEntityType.getAtomicAttributes()) { //was an empty effect entity created? this entity can be recoginized by the fact that it only has a filled Id attribute and Variant xref if (effectAttribute.getName().equals(effectEntityType.getIdAttribute().getName()) || effectAttribute .getName().equals(VARIANT)) { } else if (effectEntity.get(effectAttribute.getName()) != null) { isEmpty = false; break; } } return isEmpty; } }; } public Stream<Entity> createVariantEffectStructure(String effectAttributeName, List<Attribute> annotatorAttributes, VcfRepository vcfRepository) { EntityType inputVcfEntityType = vcfRepository.getEntityType(); EntityType variantEntityType = removeAttributeAndCreateEntityTypeCopy( vcfRepository.getEntityType().getAttribute(EFFECT), vcfRepository.getEntityType()); Attribute effectsAttribute = inputVcfEntityType.getAttribute(effectAttributeName); String description = getEffectDescription(effectsAttribute); String[] step1 = description.split(":"); String effectEntityName = StringUtils.deleteWhitespace(step1[0]); String attributesString = step1[1].replaceAll("^\\s'|'$", ""); ArrayList<Attribute> effectFieldAttributeList = parseEffectAttributeDescription(attributesString, annotatorAttributes); EntityType effectsEntityType = createEffectsEntityType(effectFieldAttributeList, effectEntityName, annotatorAttributes); return StreamSupport.stream(vcfRepository.spliterator(), false).flatMap( entity -> createVariantEffectStructureForSingleEntity(effectsAttribute, effectFieldAttributeList, effectsEntityType, entity, variantEntityType)); } private EntityType removeAttributeAndCreateEntityTypeCopy(Attribute attributeToParse, EntityType inputEntityType) { EntityType newMeta = EntityType.newInstance(inputEntityType); newMeta.removeAttribute(attributeToParse); return newMeta; } private Stream<Entity> createVariantEffectStructureForSingleEntity(Attribute attributeToParse, ArrayList<Attribute> effectFieldAttributeList, EntityType effectsEntityType, Entity vcfInputEntity, EntityType variantEntityType) { List<Entity> results = new ArrayList<>(); Entity variantEntity = new DynamicEntity(variantEntityType); for (String attr : variantEntity.getAttributeNames()) { if (vcfInputEntity.getEntityType().getAttribute(attr) != null) { variantEntity.set(attr, vcfInputEntity.get(attr)); } } List<Entity> result = createEffectsEntitiesForSingleVariant(effectsEntityType, effectFieldAttributeList, vcfInputEntity.getString(attributeToParse.getName()), variantEntity).collect(Collectors.toList()); results.addAll(result); return results.stream(); } //Get the description of the field that needs to be parsed to determine the attributes of the Entity based on the attribute //for example, this line: //##INFO=<ID=EFFECT,Number=.,Type=String,Description="EFFECT annotations: 'Alt_Allele | Gene_Name | Annotation | Putative_impact | Gene_ID | Feature_type | Feature_ID | Transcript_biotype | Rank_total | HGVS_c | HGVS_p | cDNA_position | CDS_position | Protein_position | Distance_to_feature | Errors'"> public String getEffectDescription(Attribute effectAttributeToParse) { String description = effectAttributeToParse.getDescription(); if (description.indexOf(':') == -1) { throw new RuntimeException( "Unable to create entitystructure, missing semicolon in description of [" + effectAttributeToParse .getName() + "]"); } return description; } private EntityType createEffectsEntityType(ArrayList<Attribute> effectFieldAttributeList, String effectEntityName, List<Attribute> annotatorAttributes) { EntityType effectsEntityType = entityTypeFactory.create().setName(effectEntityName); effectsEntityType.addAttribute(attributeFactory.create().setName("identifier").setAuto(true).setVisible(false), EntityType.AttributeRole.ROLE_ID); effectsEntityType.addAttributes(effectFieldAttributeList); addAnnotatorAttributes(annotatorAttributes, effectsEntityType); effectsEntityType.addAttribute(attributeFactory.create().setName(VARIANT).setDataType(XREF)); return effectsEntityType; } //if annotator attributes not present add them //check if needed for annotators running on an already annotated file private void addAnnotatorAttributes(List<Attribute> annotatorAttributes, EntityType effectsEntityType) { for (Attribute attr : annotatorAttributes) { if (effectsEntityType.getAttribute(attr.getName()) == null) { effectsEntityType.addAttribute(attr); } } } //Create a map of attributes based on the pipe separated attribute names in the description private ArrayList<Attribute> parseEffectAttributeDescription(String attributesString, List<Attribute> annotatorAttributes) { String[] attributeStrings = attributesString.replaceAll("^\\s'|'$", "").split("\\|"); ArrayList<Attribute> attributeList = new ArrayList<>(); Map<String, Attribute> annotatorAttributeMap = VcfUtils.getAttributesMapFromList(annotatorAttributes); for (String attribute : attributeStrings) { AttributeType type = annotatorAttributeMap.containsKey(attribute) ? annotatorAttributeMap.get(attribute) .getDataType() : STRING; Attribute attr = attributeFactory.create().setName(StringUtils.deleteWhitespace(attribute)) .setDataType(type).setLabel(attribute); attributeList.add(attr); } return attributeList; } private static Stream<Entity> createEffectsEntitiesForSingleVariant(EntityType effectsEntityType, List<Attribute> effectFieldAttributeList, String descriptionFieldsString, Entity variantEntity) { List<Entity> listOfEffectsEntities = new ArrayList<>(); if (descriptionFieldsString == null) return listOfEffectsEntities.stream(); String[] descriptionFieldValues = descriptionFieldsString.split(","); for (String descriptionFieldValue : descriptionFieldValues) { String[] descriptionFieldPartValues = descriptionFieldValue.split("\\|", -1); DynamicEntity singleEffectsEntity = new DynamicEntity(effectsEntityType); int i = 0; for (Attribute attribute : effectFieldAttributeList) { if (i > descriptionFieldPartValues.length) { throw new RuntimeException( "Description of the attribute contains more values (pipe separated values) than the actual value"); } singleEffectsEntity.set(attribute.getName(), descriptionFieldPartValues[i]); i++; } singleEffectsEntity.set(VARIANT, variantEntity); listOfEffectsEntities.add(singleEffectsEntity); } return listOfEffectsEntities.stream(); } }