package org.molgenis.data.vcf.format;
import com.google.common.collect.Lists;
import org.apache.commons.lang3.StringUtils;
import org.molgenis.data.Entity;
import org.molgenis.data.MolgenisDataException;
import org.molgenis.data.meta.AttributeType;
import org.molgenis.data.meta.NameValidator;
import org.molgenis.data.meta.model.Attribute;
import org.molgenis.data.meta.model.AttributeFactory;
import org.molgenis.data.meta.model.EntityType;
import org.molgenis.data.meta.model.EntityTypeFactory;
import org.molgenis.data.support.DynamicEntity;
import org.molgenis.data.vcf.VcfRepository;
import org.molgenis.data.vcf.model.VcfAttributes;
import org.molgenis.data.vcf.utils.VcfUtils;
import org.molgenis.genotype.Allele;
import org.molgenis.genotype.GenotypeDataException;
import org.molgenis.vcf.VcfInfo;
import org.molgenis.vcf.VcfRecord;
import org.molgenis.vcf.VcfSample;
import org.molgenis.vcf.meta.VcfMeta;
import org.molgenis.vcf.meta.VcfMetaFormat;
import org.molgenis.vcf.meta.VcfMetaInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.math.BigDecimal;
import java.util.*;
import static com.google.common.collect.Iterables.size;
import static com.google.common.collect.Maps.newHashMapWithExpectedSize;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.toSet;
import static java.util.stream.StreamSupport.stream;
import static org.molgenis.data.meta.AttributeType.*;
import static org.molgenis.data.meta.model.EntityType.AttributeRole.*;
import static org.molgenis.data.vcf.VcfRepository.NAME;
import static org.molgenis.data.vcf.VcfRepository.ORIGINAL_NAME;
import static org.molgenis.data.vcf.model.VcfAttributes.*;
import static org.molgenis.util.EntityUtils.getTypedValue;
public class VcfToEntity
{
/**
 * Logger of this class
 */
private static final Logger LOG = LoggerFactory.getLogger(VcfToEntity.class);
/**
 * Format column consisting of a single dot. A record with this format is logged as having no samples instead of
 * being rejected for containing an unknown sample attribute.
 */
private static final String[] EMPTY_FORMAT = { "." };
/**
 * VCF header metadata, used to construct records from tokens and to look up sample names
 */
private final VcfMeta vcfMeta;
/**
 * Supplier of the attributes for the fixed VCF columns (CHROM, ALT, POS, REF, FILTER, QUAL, ID)
 */
private final VcfAttributes vcfAttributes;
/**
 * Factory used to create the record and sample entity types
 */
private final EntityTypeFactory entityTypeFactory;
/**
 * Factory used to create all attributes that are not supplied by {@link #vcfAttributes}
 */
private final AttributeFactory attrMetaFactory;
/**
 * Entity type of the sample entities, <code>null</code> if the VCF metadata contains no format metadata
 */
private final EntityType sampleEntityType;
/**
 * Entity type of the entities created for VCF records
 */
private final EntityType entityType;
/**
 * Performance: keys of the VCF record info columns of type 'Flag'
 */
private final Set<String> vcfInfoFlagFieldKeys;
/**
 * Performance: VCF record info column ID to attribute name map
 */
private final Map<String, String> infoFieldKeyToAttrNameMap;
/**
 * Creates a converter that turns VCF records into entities.
 * <p>
 * All metadata derived from the VCF header (flag keys, info key to attribute name mapping, sample entity type and
 * record entity type) is computed once, here.
 *
 * @param entityName        simple name of the entity type to create, also used to derive the sample entity type name
 * @param vcfMeta           VCF header metadata, its format metadata must not be <code>null</code>
 * @param vcfAttributes     supplier of the attributes for the fixed VCF columns
 * @param entityTypeFactory factory used to create entity types
 * @param attrMetaFactory   factory used to create attributes
 * @throws NullPointerException if any argument, or the format metadata of <code>vcfMeta</code>, is <code>null</code>
 */
public VcfToEntity(String entityName, VcfMeta vcfMeta, VcfAttributes vcfAttributes,
		EntityTypeFactory entityTypeFactory, AttributeFactory attrMetaFactory)
{
	// validate everything up front, in argument order
	requireNonNull(entityName);
	requireNonNull(vcfMeta);
	requireNonNull(vcfMeta.getFormatMeta());
	requireNonNull(vcfAttributes);
	requireNonNull(entityTypeFactory);
	requireNonNull(attrMetaFactory);

	this.vcfMeta = vcfMeta;
	this.vcfAttributes = vcfAttributes;
	this.entityTypeFactory = entityTypeFactory;
	this.attrMetaFactory = attrMetaFactory;

	// Order matters: createEntityType reads both the info key to attribute name map and the sample entity type,
	// so those have to be initialized first.
	this.vcfInfoFlagFieldKeys = determineVcfInfoFlagFields(vcfMeta);
	this.infoFieldKeyToAttrNameMap = createInfoFieldKeyToAttrNameMap(vcfMeta, entityName);
	this.sampleEntityType = createSampleEntityType(entityName, vcfMeta.getFormatMeta());
	this.entityType = createEntityType(entityName, vcfMeta);
}
/**
 * Builds the entity type for VCF records.
 * <p>
 * The type consists of the fixed VCF column attributes, a hidden internal identifier attribute (the ID attribute
 * of the type), one attribute per info metadata entry grouped under a nillable INFO compound, and - when a sample
 * entity type exists - a SAMPLES attribute referring to it.
 *
 * @param entityName simple name of the entity type
 * @param vcfMeta    VCF metadata providing the info metadata
 * @return entity type for VCF records
 */
private EntityType createEntityType(String entityName, VcfMeta vcfMeta)
{
	Attribute internalIdAttr = attrMetaFactory.create().setName(INTERNAL_ID).setDataType(STRING);
	internalIdAttr.setVisible(false);

	EntityType recordType = entityTypeFactory.create().setSimpleName(entityName);

	// fixed VCF columns, followed by the hidden identifier
	recordType.addAttribute(vcfAttributes.getChromAttribute());
	recordType.addAttribute(vcfAttributes.getAltAttribute());
	recordType.addAttribute(vcfAttributes.getPosAttribute());
	recordType.addAttribute(vcfAttributes.getRefAttribute());
	recordType.addAttribute(vcfAttributes.getFilterAttribute());
	recordType.addAttribute(vcfAttributes.getQualAttribute());
	recordType.addAttribute(vcfAttributes.getIdAttribute());
	recordType.addAttribute(internalIdAttr, ROLE_ID);

	// info fields: children of the INFO compound, which itself is added after its children
	Attribute infoCompound = attrMetaFactory.create().setName(INFO).setDataType(COMPOUND).setNillable(true);
	for (VcfMetaInfo infoMeta : vcfMeta.getInfoMeta())
	{
		String attrName = toAttributeName(infoMeta.getId());
		AttributeType attrType = vcfReaderFormatToMolgenisType(infoMeta);
		String attrDescription;
		if (StringUtils.isBlank(infoMeta.getDescription()))
		{
			attrDescription = VcfRepository.DEFAULT_ATTRIBUTE_DESCRIPTION;
		}
		else
		{
			attrDescription = infoMeta.getDescription();
		}
		Attribute infoAttr = attrMetaFactory.create().setName(attrName).setDataType(attrType)
				.setDescription(attrDescription).setAggregatable(true).setParent(infoCompound);
		recordType.addAttribute(infoAttr);
	}
	recordType.addAttribute(infoCompound);

	if (sampleEntityType == null)
	{
		return recordType;
	}
	Attribute samplesAttr = attrMetaFactory.create().setName(SAMPLES).setDataType(MREF)
			.setRefEntity(sampleEntityType).setLabel("SAMPLES");
	recordType.addAttribute(samplesAttr);
	return recordType;
}
/**
 * Builds the entity type for the samples of a VCF record.
 * <p>
 * The type is named <code>entityName + "_Sample"</code> and contains a hidden identifier attribute, a name
 * attribute (label and lookup attribute), one attribute per format metadata entry and an original name attribute.
 * <p>
 * Format identifiers that are reserved keywords (compared as-is and upper-cased) get a trailing underscore, and a
 * fixed set of special characters is replaced by underscores. The upper-casing is done with {@link Locale#ROOT}
 * so that the generated attribute names do not depend on the default locale of the JVM.
 *
 * @param entityName     simple name of the record entity type
 * @param formatMetaData VCF format metadata
 * @return sample entity type or <code>null</code> if there is no format metadata
 */
private EntityType createSampleEntityType(String entityName, Iterable<VcfMetaFormat> formatMetaData)
{
	if (!formatMetaData.iterator().hasNext())
	{
		// no format metadata: the VCF has no sample columns to model
		return null;
	}

	EntityType result = entityTypeFactory.create().setSimpleName(entityName + "_Sample");
	Attribute idAttr = attrMetaFactory.create().setName(ID).setAggregatable(true).setVisible(false);
	Attribute nameAttr = attrMetaFactory.create().setName(NAME).setDataType(TEXT).setAggregatable(true)
			.setNillable(false);
	Attribute originalNameAttr = attrMetaFactory.create().setName(ORIGINAL_NAME).setDataType(TEXT);

	result.addAttribute(idAttr, ROLE_ID);
	result.addAttribute(nameAttr, ROLE_LABEL, ROLE_LOOKUP);
	for (VcfMetaFormat meta : formatMetaData)
	{
		String name = meta.getId();
		// locale-independent upper-casing, otherwise e.g. a Turkish default locale maps 'i' to a dotted capital I
		if (NameValidator.KEYWORDS.contains(name) || NameValidator.KEYWORDS
				.contains(name.toUpperCase(Locale.ROOT)))
		{
			name = name + "_";
		}
		// NOTE(review): createSampleEntities looks attributes up by the raw format key, so a format identifier
		// that is renamed here would presumably not be found there - confirm
		Attribute attr = attrMetaFactory.create().setName(name.replaceAll("[-.*$&%^()#!@?]", "_"))
				.setDataType(vcfFieldTypeToMolgenisFieldType(meta)).setAggregatable(true)
				.setLabel(meta.getId());
		result.addAttribute(attr);
	}
	result.addAttribute(originalNameAttr);
	return result;
}
/**
 * Maps the type of a VCF info field to a MOLGENIS attribute type.
 * <p>
 * A field is a list if its number is one of the presets A, R, G or '.', or an integer greater than one. Lists of
 * floats and integers are stored as STRING because there is no list of primitives data type yet.
 *
 * @param vcfMetaInfo VCF info field metadata
 * @return MOLGENIS attribute type
 * @throws GenotypeDataException if the number is neither a preset nor a valid integer
 * @throws MolgenisDataException if the VCF info type is unknown
 */
private static AttributeType vcfReaderFormatToMolgenisType(VcfMetaInfo vcfMetaInfo)
{
	String number = vcfMetaInfo.getNumber();
	boolean multiValued;
	switch (number)
	{
		case "A":
		case "R":
		case "G":
		case ".":
			multiValued = true;
			break;
		default:
			try
			{
				multiValued = Integer.parseInt(number) > 1;
			}
			catch (NumberFormatException ex)
			{
				throw new GenotypeDataException("Error parsing length of vcf info field. " + number
						+ " is not a valid int or expected preset (A, R, G, .)", ex);
			}
			break;
	}

	// TODO support list of primitives datatype, until then lists of numbers are stored as STRING
	switch (vcfMetaInfo.getType())
	{
		case CHARACTER:
			return STRING;
		case FLAG:
			return BOOL;
		case FLOAT:
			return multiValued ? STRING : DECIMAL;
		case INTEGER:
			return multiValued ? STRING : INT;
		case STRING:
			return TEXT;
		default:
			throw new MolgenisDataException(format("Unknown vcf info type [%s]", vcfMetaInfo.getType()));
	}
}
/**
 * Maps the type of a VCF format field to a MOLGENIS attribute type.
 * <p>
 * A field is a list if its number is one of the presets A, R, G or '.', or an integer greater than one. Lists are
 * stored as STRING because there is no list of primitives data type yet.
 *
 * @param format VCF format field metadata
 * @return MOLGENIS attribute type
 * @throws GenotypeDataException if the number is neither a preset nor a valid integer
 * @throws MolgenisDataException if the VCF format type is unknown
 */
private static AttributeType vcfFieldTypeToMolgenisFieldType(VcfMetaFormat format)
{
	String number = format.getNumber();
	boolean isListValue;
	try
	{
		isListValue = number.equals("A") || number.equals("R") || number.equals("G") || number.equals(".")
				|| Integer.parseInt(number) > 1;
	}
	catch (NumberFormatException ex)
	{
		// this method handles FORMAT fields, the message used to wrongly refer to an info field
		throw new GenotypeDataException("Error parsing length of vcf format field. " + number
				+ " is not a valid int or expected preset (A, R, G, .)", ex);
	}

	// TODO support list of primitives datatype, until then every list is stored as STRING
	switch (format.getType())
	{
		case CHARACTER:
		case STRING:
			return STRING;
		case FLOAT:
			return isListValue ? STRING : DECIMAL;
		case INTEGER:
			return isListValue ? STRING : INT;
		default:
			throw new MolgenisDataException(format("Unknown vcf field type [%s]", format.getType()));
	}
}
/**
 * Converts the tokens of a VCF line to an entity.
 *
 * @param tokens tokens of a VCF line, wrapped in a {@link VcfRecord} together with the VCF metadata
 * @return entity for the VCF line
 */
public Entity toEntity(String[] tokens)
{
	VcfRecord vcfRecord = new VcfRecord(vcfMeta, tokens);
	return toEntity(vcfRecord);
}
/**
 * Converts a VCF record to an entity.
 * <p>
 * Sets the fixed VCF columns (alternate alleles and identifiers are joined with commas), the internal identifier
 * created from those columns, the info fields and - when a sample entity type exists - the sample entities.
 *
 * @param vcfRecord VCF record
 * @return entity for the VCF record
 */
public Entity toEntity(VcfRecord vcfRecord)
{
	Entity entity = new DynamicEntity(entityType);

	// fixed VCF columns
	entity.set(CHROM, vcfRecord.getChromosome());
	entity.set(ALT, StringUtils.join(Lists.transform(vcfRecord.getAlternateAlleles(), Allele::toString), ','));
	entity.set(POS, vcfRecord.getPosition());
	entity.set(REF, vcfRecord.getReferenceAllele().toString());
	entity.set(FILTER, vcfRecord.getFilterStatus());
	entity.set(QUAL, vcfRecord.getQuality());
	entity.set(ID, StringUtils.join(vcfRecord.getIdentifiers(), ','));

	// the internal identifier is created from the entity, so the columns above have to be set first
	String internalId = VcfUtils.createId(entity);
	entity.set(INTERNAL_ID, internalId);

	writeInfoFieldsToEntity(vcfRecord, entity);

	if (sampleEntityType == null)
	{
		return entity;
	}
	String positionAndAlt = entity.get(POS) + "_" + entity.get(ALT);
	entity.set(SAMPLES, createSampleEntities(vcfRecord, positionAndAlt, internalId));
	return entity;
}
/**
 * Creates one sample entity per sample of a VCF record.
 * <p>
 * For each sample the values of the format fields are converted to the type of the matching sample attribute.
 * The sample identifier is the record identifier followed by the sample index, the sample name is
 * <code>entityPosAlt + "_" + original sample name</code>.
 *
 * @param vcfRecord    VCF record
 * @param entityPosAlt position and alternate alleles of the record, used as prefix of the sample name
 * @param entityId     identifier of the record entity, used as prefix of the sample identifier
 * @return sample entities, empty if the record has no samples
 * @throws MolgenisDataException if the record format contains a field without sample attribute and the format is
 *                               not a single dot
 */
private List<Entity> createSampleEntities(VcfRecord vcfRecord, String entityPosAlt, String entityId)
{
	List<Entity> sampleEntities = new ArrayList<>();
	Iterator<VcfSample> vcfSamples = vcfRecord.getSamples().iterator();
	if (vcfRecord.getNrSamples() <= 0)
	{
		return sampleEntities;
	}

	// NOTE(review): assumes the metadata lists at least as many sample names as the record has samples - confirm
	Iterator<String> sampleNames = vcfMeta.getSampleNames().iterator();
	int sampleIndex = 0;
	while (vcfSamples.hasNext())
	{
		String[] format = vcfRecord.getFormat();
		VcfSample vcfSample = vcfSamples.next();
		Entity sampleEntity = new DynamicEntity(sampleEntityType);
		EntityType typeOfSample = sampleEntity.getEntityType();

		for (int fieldIndex = 0; fieldIndex < format.length; fieldIndex++)
		{
			String rawValue = vcfSample.getData(fieldIndex);
			// NOTE(review): lookup by raw format key; format identifiers renamed by createSampleEntityType
			// would presumably not be found here - confirm
			Attribute attr = typeOfSample.getAttribute(format[fieldIndex]);

			Object typedValue = null;
			if (attr == null)
			{
				if (!Arrays.equals(EMPTY_FORMAT, format))
				{
					throw new MolgenisDataException("Sample entity contains an attribute [" + format[fieldIndex]
							+ "] which is not specified in vcf headers");
				}
				LOG.debug("Found a dot as format, assuming no samples present");
			}
			else if (rawValue != null)
			{
				typedValue = getTypedValue(rawValue, attr);
			}
			sampleEntity.set(format[fieldIndex], typedValue);
		}

		sampleEntity.set(ID, entityId + sampleIndex);
		// FIXME remove entity ID from Sample label after #1400 is fixed, see also:
		// jquery.molgenis.table.js line 152
		String originalName = sampleNames.next();
		sampleEntity.set(NAME, entityPosAlt + "_" + originalName);
		sampleEntity.set(ORIGINAL_NAME, originalName);
		sampleEntities.add(sampleEntity);

		++sampleIndex;
	}
	return sampleEntities;
}
/**
 * Copies the info fields of a VCF record to the entity.
 * <p>
 * All info fields of type 'flag' are first set to false, because the info of a record does not have to contain
 * all flag fields. Flags that are present are then set to true, all other values are converted by
 * {@link #toInfoAttributeValue(Object)}. Info with key '.' (value not available) is skipped.
 *
 * @param vcfRecord VCF record to read the info fields from
 * @param entity    entity to write the info field values to
 * @throws RuntimeException if no attribute exists for an info field key
 */
private void writeInfoFieldsToEntity(VcfRecord vcfRecord, Entity entity)
{
	for (String flagKey : vcfInfoFlagFieldKeys)
	{
		entity.set(toAttributeName(flagKey), false);
	}

	for (VcfInfo vcfInfo : vcfRecord.getInformation())
	{
		if (!vcfInfo.getKey().equals("."))
		{
			// presence of a flag means true, the value of the info itself is not read in that case
			Object attrValue = vcfInfoFlagFieldKeys.contains(vcfInfo.getKey()) ? Boolean.TRUE
					: toInfoAttributeValue(vcfInfo.getVal());
			entity.set(toAttributeName(vcfInfo.getKey()), attrValue);
		}
	}
}

/**
 * Converts a VCF info value to the value stored in the entity.
 *
 * @param vcfInfoVal VCF info value, can be <code>null</code>
 * @return <code>null</code> for <code>null</code> and NaN, a comma-separated string for lists, a double for
 * floats, a string for characters and the unmodified value otherwise
 */
private static Object toInfoAttributeValue(Object vcfInfoVal)
{
	if (vcfInfoVal == null)
	{
		return null;
	}
	if (vcfInfoVal instanceof List<?>)
	{
		// TODO Use list data type once available (see http://www.molgenis.org/ticket/2681)
		return StringUtils.join((List<?>) vcfInfoVal, ',');
	}
	if (vcfInfoVal instanceof Float)
	{
		if (Float.isNaN((Float) vcfInfoVal))
		{
			return null;
		}
		// NOTE(review): an infinite float would presumably make the BigDecimal constructor throw a
		// NumberFormatException - confirm whether such values can occur
		return new BigDecimal(String.valueOf(vcfInfoVal)).doubleValue(); // TODO why not Double.valueOf(string)?
	}
	if (vcfInfoVal instanceof Character)
	{
		return vcfInfoVal.toString();
	}
	// VCF value type matches type expected for this MOLGENIS attribute type
	return vcfInfoVal;
}
/**
 * Returns the entity type of the entities created by this converter.
 *
 * @return entity type that was created on construction
 */
public EntityType getEntityType()
{
	return this.entityType;
}
/**
 * Looks up the attribute name that belongs to a VCF info field key.
 *
 * @param vcfInfoFieldKey VCF info field key
 * @return MOLGENIS attribute name
 * @throws RuntimeException if the key has no entry in the info field key to attribute name map
 */
private String toAttributeName(String vcfInfoFieldKey)
{
	String mappedName = infoFieldKeyToAttrNameMap.get(vcfInfoFieldKey);
	if (mappedName != null)
	{
		return mappedName;
	}
	throw new RuntimeException(format("Missing attribute for VCF info field [%s]", vcfInfoFieldKey));
}
/**
 * Collects the identifiers of all VCF info fields that are declared with type 'Flag'.
 *
 * @param vcfMeta VCF metadata
 * @return Set of identifiers of the VCF info fields of type 'Flag'
 */
private static Set<String> determineVcfInfoFlagFields(VcfMeta vcfMeta)
{
	Set<String> flagFieldKeys = new HashSet<>();
	for (VcfMetaInfo vcfInfoMeta : vcfMeta.getInfoMeta())
	{
		if (vcfInfoMeta.getType().equals(VcfMetaInfo.Type.FLAG))
		{
			flagFieldKeys.add(vcfInfoMeta.getId());
		}
	}
	return flagFieldKeys;
}
/**
 * Returns a mapping of VCF info field keys to MOLGENIS attribute names.
 * <p>
 * An info field key that is a reserved keyword (compared as-is and upper-cased) gets a trailing underscore. An
 * info field key that equals the name of a default VCF column gets the entity name as postfix. The upper-casing
 * is done with {@link Locale#ROOT} so that the attribute names do not depend on the default locale of the JVM.
 *
 * @param vcfMeta    VCF metadata
 * @param entityName entity name (that could be used to create a MOLGENIS attribute name)
 * @return map of VCF info field keys to MOLGENIS attribute names
 */
private static Map<String, String> createInfoFieldKeyToAttrNameMap(VcfMeta vcfMeta, String entityName)
{
	Map<String, String> infoFieldIdToAttrNameMap = newHashMapWithExpectedSize(size(vcfMeta.getInfoMeta()));
	for (VcfMetaInfo info : vcfMeta.getInfoMeta())
	{
		String infoId = info.getId();

		// according to the VCF standard it is allowed to have info columns with names that equal default VCF cols.
		// rename these info columns in the meta data to prevent collisions.
		String postFix = "";
		switch (infoId)
		{
			case INTERNAL_ID:
			case CHROM:
			case ALT:
			case POS:
			case REF:
			case FILTER:
			case QUAL:
			case ID:
				postFix = '_' + entityName;
				break;
			default:
				break;
		}

		String name = infoId;
		// locale-independent upper-casing, otherwise e.g. a Turkish default locale maps 'i' to a dotted capital I
		if (NameValidator.KEYWORDS.contains(name) || NameValidator.KEYWORDS
				.contains(name.toUpperCase(Locale.ROOT)))
		{
			name = name + '_';
		}
		infoFieldIdToAttrNameMap.put(infoId, name + postFix);
	}
	return infoFieldIdToAttrNameMap;
}
}