package org.molgenis.data.annotation.core.entity.impl;
import com.google.common.base.Predicates;
import com.google.common.collect.Iterables;
import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.MolgenisDataException;
import org.molgenis.data.annotation.core.RepositoryAnnotator;
import org.molgenis.data.annotation.core.entity.AnnotatorConfig;
import org.molgenis.data.annotation.core.entity.AnnotatorInfo;
import org.molgenis.data.annotation.core.entity.EntityAnnotator;
import org.molgenis.data.annotation.core.entity.impl.framework.QueryAnnotatorImpl;
import org.molgenis.data.annotation.core.entity.impl.framework.RepositoryAnnotatorImpl;
import org.molgenis.data.annotation.core.query.LocusQueryCreator;
import org.molgenis.data.annotation.core.resources.MultiResourceConfig;
import org.molgenis.data.annotation.core.resources.Resource;
import org.molgenis.data.annotation.core.resources.Resources;
import org.molgenis.data.annotation.core.resources.impl.MultiFileResource;
import org.molgenis.data.annotation.core.resources.impl.MultiResourceConfigImpl;
import org.molgenis.data.annotation.core.resources.impl.RepositoryFactory;
import org.molgenis.data.annotation.core.resources.impl.tabix.TabixVcfRepositoryFactory;
import org.molgenis.data.meta.model.Attribute;
import org.molgenis.data.meta.model.AttributeFactory;
import org.molgenis.data.meta.model.EntityTypeFactory;
import org.molgenis.data.vcf.model.VcfAttributes;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import static com.google.common.collect.Lists.newArrayList;
import static org.molgenis.data.annotation.web.settings.GoNLAnnotatorSettings.Meta.*;
import static org.molgenis.data.meta.AttributeType.STRING;
@Configuration
public class GoNLAnnotator implements AnnotatorConfig
{
public static final String NAME = "gonl";
public static final String GONL_GENOME_AF = "GoNL_AF";
public static final String GONL_GENOME_GTC = "GoNL_GTC";
public static final String GONL_AF_LABEL = "Genome of the netherlands allele frequency";
public static final String GONL_GTC_LABEL = "Genome of the netherlands Genotype counts frequency";
public static final String INFO_GTC = "GTC";
public static final String INFO_AN = "AN";
public static final String INFO_AC = "AC";
public static final String GONL_MULTI_FILE_RESOURCE = "gonlresources";
@Autowired
private Entity goNLAnnotatorSettings;
@Autowired
private DataService dataService;
@Autowired
private Resources resources;
@Autowired
private VcfAttributes vcfAttributes;
@Autowired
private AttributeFactory attributeFactory;
@Autowired
private EntityTypeFactory entityTypeFactory;
private RepositoryAnnotatorImpl annotator;
@Bean
public RepositoryAnnotator gonl()
{
annotator = new RepositoryAnnotatorImpl(NAME);
return annotator;
}
@Override
public void init()
{
List<Attribute> attributes = createGoNlOutputAttributes();
AnnotatorInfo thousandGenomeInfo = AnnotatorInfo
.create(AnnotatorInfo.Status.READY, AnnotatorInfo.Type.POPULATION_REFERENCE, NAME,
"What genetic variation is to be found in the Dutch indigenous population? "
+ "Detailed knowledge about this is not only interesting in itself, "
+ "it also helps to extract useful biomedical information from Dutch biobanks. "
+ "The Dutch biobank collaboration BBMRI-NL has initiated the extensive Rainbow Project “Genome of the Netherlands” (GoNL) "
+ "because it offers unique opportunities for science and for the development of new treatments and diagnostic techniques. "
+ "A close-up look at the DNA of 750 Dutch people-250 trio’s of two parents and an adult child-plus a "
+ "global genetic profile of large numbers of Dutch will disclose a wealth of new information, new insights, "
+ "and possible applications.", attributes);
LocusQueryCreator locusQueryCreator = new LocusQueryCreator(vcfAttributes);
EntityAnnotator entityAnnotator = new QueryAnnotatorImpl(GONL_MULTI_FILE_RESOURCE, thousandGenomeInfo,
locusQueryCreator, dataService, resources, (annotationSourceFileName) ->
{
goNLAnnotatorSettings.set(ROOT_DIRECTORY, annotationSourceFileName);
goNLAnnotatorSettings.set(FILEPATTERN, "gonl.chr%s.snps_indels.r5.vcf.gz");
goNLAnnotatorSettings.set(OVERRIDE_CHROMOSOME_FILES, "X:gonl.chrX.release4.gtc.vcf.gz");
goNLAnnotatorSettings.set(CHROMOSOMES, "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X");
})
{
@Override
public List<Attribute> createAnnotatorAttributes(AttributeFactory attributeFactory)
{
return createGoNlOutputAttributes();
}
public String postFixResource = "";
@Override
protected void processQueryResults(Entity entity, Iterable<Entity> annotationSourceEntities,
boolean updateMode)
{
if (updateMode)
{
throw new MolgenisDataException("This annotator/filter does not support updating of values");
}
String afs = null;
String gtcs = null;
List<Entity> refMatches = newArrayList();
for (Entity resourceEntity : annotationSourceEntities)
{
//situation example: input A, GoNL A
if (resourceEntity.get(vcfAttributes.getRefAttribute().getName())
.equals(vcfAttributes.getRefAttribute().getName()))
{
refMatches.add(resourceEntity);
}
//situation example: input ATC/TTC, GoNL A/T
//we then match on A (leaving TC), lengthen the GoNL ref to A+TC, and alt to T+TC
//now it has a match to ATC/TTC (as it should, but was not obvious due to notation)
else if (vcfAttributes.getRefAttribute().getName()
.indexOf(resourceEntity.getString(vcfAttributes.getRefAttribute().getName())) == 0)
{
postFixResource = entity.getString(vcfAttributes.getRefAttribute().getName()).substring(
resourceEntity.getString(vcfAttributes.getRefAttribute().getName()).length());
resourceEntity.set(vcfAttributes.getRefAttribute().getName(),
resourceEntity.getString(vcfAttributes.getRefAttribute().getName()) + postFixResource);
String newAltString = Arrays
.stream(resourceEntity.getString(vcfAttributes.getAltAttribute().getName()).split(","))
.map(alt -> alt + postFixResource).collect(Collectors.joining(","));
resourceEntity.set(vcfAttributes.getAltAttribute().getName(), newAltString);
refMatches.add(resourceEntity);
}
//situation example: input T/G, GoNL TCT/GCT
//we then match on T (leaving CT), and shorten the GoNL ref to T (-CT), and alt to G (-CT)
//now it has a match to T/G (as it should, but was not obvious due to notation)
else if (resourceEntity.getString(vcfAttributes.getRefAttribute().getName())
.indexOf(entity.getString(vcfAttributes.getRefAttribute().getName())) == 0)
{
int postFixInputLength = resourceEntity.getString(vcfAttributes.getRefAttribute().getName())
.substring(entity.getString(vcfAttributes.getRefAttribute().getName()).length())
.length();
//bugfix: matching A/G to ACT/A results in postFixInputLength=2, correctly updating ref from ACT to A,
//but then tries to substring the alt allele A to length -1 (1 minus 2) which is not allowed.
//added a check to prevent this: alt.length() > postFixInputLength ? trim the alt : change to 'n/a' because we cannot use this alt.
resourceEntity.set(vcfAttributes.getRefAttribute().getName(),
resourceEntity.getString(vcfAttributes.getRefAttribute().getName()).substring(0,
(resourceEntity.getString(vcfAttributes.getRefAttribute().getName()).length()
- postFixInputLength)));
String newAltString = Arrays
.stream(resourceEntity.getString(vcfAttributes.getAltAttribute().getName()).split(","))
.map(alt -> alt.length() > postFixInputLength ? alt
.substring(0, (alt.length() - postFixInputLength)) : "n/a")
.collect(Collectors.joining(","));
resourceEntity.set(vcfAttributes.getAltAttribute().getName(), newAltString);
refMatches.add(resourceEntity);
}
}
if (entity.getString(vcfAttributes.getAltAttribute().getName()) != null)
{
List<Entity> alleleMatches = newArrayList();
for (String alt : entity.getString(vcfAttributes.getAltAttribute().getName()).split(","))
{
alleleMatches.add(refMatches.stream().filter(gonl -> (alt)
.equals((gonl.getString(vcfAttributes.getAltAttribute().getName())))).findFirst()
.orElseGet(() -> null));
}
if (!alleleMatches.stream().allMatch(Predicates.isNull()::apply))
{
afs = alleleMatches.stream().map(gonl -> gonl == null ? "." : Double
.toString(Double.valueOf(gonl.getString(INFO_AC)) / gonl.getInt(INFO_AN)))
.collect(Collectors.joining(","));
//update GTC field to separate allele combinations by pipe instead of comma, since we use comma to separate alt allele info
gtcs = alleleMatches.stream()
.map(gonl -> gonl == null ? "." : gonl.getString(INFO_GTC).replace(",", "|"))
.collect(Collectors.joining(","));
}
}
entity.set(GONL_GENOME_AF, afs);
entity.set(GONL_GENOME_GTC, gtcs);
}
};
annotator.init(entityAnnotator);
}
private List<Attribute> createGoNlOutputAttributes()
{
List<Attribute> attributes = new ArrayList<>();
Attribute goNlAfAttribute = attributeFactory.create().setName(GONL_GENOME_AF).setDataType(STRING)
.setDescription("The allele frequency for variants seen in the population used for the GoNL project")
.setLabel(GONL_AF_LABEL);
Attribute goNlGtcAttribute = attributeFactory.create().setName(GONL_GENOME_GTC).setDataType(STRING)
.setDescription(
"GenoType Counts. For each ALT allele in the same order as listed = 0/0,0/1,1/1,0/2,1/2,2/2,0/3,1/3,2/3,3/3,etc. Phasing is ignored; hence 1/0, 0|1 and 1|0 are all counted as 0/1. When one or more alleles is not called for a genotype in a specific sample (./., ./0, ./1, ./2, etc.), that sample's genotype is completely discarded for calculating GTC.")
.setLabel(GONL_GTC_LABEL);
attributes.add(goNlGtcAttribute);
attributes.add(goNlAfAttribute);
return attributes;
}
@Bean
Resource gonlresources()
{
MultiResourceConfig goNLConfig = new MultiResourceConfigImpl(CHROMOSOMES, FILEPATTERN, ROOT_DIRECTORY,
OVERRIDE_CHROMOSOME_FILES, goNLAnnotatorSettings);
return new MultiFileResource(GONL_MULTI_FILE_RESOURCE, goNLConfig)
{
@Override
public RepositoryFactory getRepositoryFactory()
{
return new TabixVcfRepositoryFactory(GONL_MULTI_FILE_RESOURCE, vcfAttributes, entityTypeFactory,
attributeFactory);
}
};
}
}