package org.molgenis.data.annotation.core.resources.impl.tabix; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList.Builder; import org.molgenis.data.Entity; import org.molgenis.data.Query; import org.molgenis.data.QueryRule; import org.molgenis.data.QueryRule.Operator; import org.molgenis.data.RepositoryCapability; import org.molgenis.data.meta.model.AttributeFactory; import org.molgenis.data.meta.model.EntityTypeFactory; import org.molgenis.data.vcf.VcfReaderFactory; import org.molgenis.data.vcf.VcfRepository; import org.molgenis.data.vcf.model.VcfAttributes; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; import static org.elasticsearch.common.base.Preconditions.checkNotNull; /** * An indexed VCF Repository */ public class TabixVcfRepository extends VcfRepository { private static final Logger LOG = LoggerFactory.getLogger(TabixVcfRepository.class); private final TabixReader tabixReader; public TabixVcfRepository(File file, String entityName, VcfAttributes vcfAttributes, EntityTypeFactory entityTypeFactory, AttributeFactory attrMetaFactory) throws IOException { super(file, entityName, vcfAttributes, entityTypeFactory, attrMetaFactory); tabixReader = new TabixReader(file.getCanonicalPath()); } TabixVcfRepository(VcfReaderFactory readerFactory, TabixReader tabixReader, String entityName, VcfAttributes vcfAttributes, EntityTypeFactory entityTypeFactory, AttributeFactory attrMetaFactory) { super(readerFactory, entityName, vcfAttributes, entityTypeFactory, attrMetaFactory); this.tabixReader = tabixReader; } @Override public Set<RepositoryCapability> getCapabilities() { return Collections.emptySet(); } /** * Examines a {@link Query} and finds the first {@link QueryRule} with operator {@link Operator#EQUALS} whose field * matches an attributeName. It returns the value of that first matching {@link QueryRule}. * * @param attributeName the query field name to match * @param q the query to search in * @return the value from the first matching query rule */ private static Object getFirstEqualsValueFor(String attributeName, Query<Entity> q) { return q.getRules().stream() .filter(rule -> attributeName.equals(rule.getField()) && rule.getOperator() == Operator.EQUALS) .findFirst().get().getValue(); } @Override public Stream<Entity> findAll(Query<Entity> q) { Object posValue = getFirstEqualsValueFor(VcfAttributes.POS, q); Object chromValue = getFirstEqualsValueFor(VcfAttributes.CHROM, q); List<Entity> result = new ArrayList<Entity>(); // if one of both required attributes is null, skip the query and return an empty list if (posValue != null && chromValue != null) { int posIntValue = Integer.parseInt(posValue.toString()); String chromStringValue = chromValue.toString(); result = query(chromStringValue, Integer.valueOf(posIntValue), Integer.valueOf(posIntValue)); } return result.stream(); } /** * Queries the tabix reader. * * @param chrom Name of chromosome * @param posFrom position lower bound (inclusive) * @param posTo position upper bound (inclusive) * @return {@link ImmutableList} of entities found */ public synchronized List<Entity> query(String chrom, int posFrom, int posTo) { String queryString = String.format("%s:%s-%s", checkNotNull(chrom), checkNotNull(posFrom), checkNotNull(posTo)); try { Collection<String> lines = getLines(tabixReader.query(queryString)); return lines.stream().map(line -> line.split("\t")).map(vcfToEntitySupplier.get()::toEntity) .filter(entity -> positionMatches(entity, posFrom, posTo)).collect(Collectors.toList()); } catch (NullPointerException e) { //FIXME: group the occurances of this exception and log once per annotation run LOG.trace("Unable to read from tabix resource for query: " + queryString + " (Position not present in resource file?)"); LOG.debug("", e); } catch (ArrayIndexOutOfBoundsException e) { //FIXME: group the occurances of this exception and log once per annotation run LOG.trace("Unable to read from tabix resource for query: " + queryString + " (Chromosome not present in resource file?)"); LOG.debug("", e); } return Collections.emptyList(); } /** * Tabix is not always so precise. For example, the cmdline query * <p> * <pre> * tabix ExAC.r0.3.sites.vep.vcf.gz 1:1115548-1115548 * </pre> * <p> * returns 2 variants: * <ul> * <li>"1 1115547 . CG C,TG"</li> * <li>"1 1115548 rs114390380 G A"</li> * </ul> * It is therefore needed to verify the position of the elements returned. */ private boolean positionMatches(Entity entity, int posFrom, int posTo) { int entityPos = entity.getInt(VcfAttributes.POS); return entityPos >= posFrom && entityPos <= posTo; } /** * Collect the lines returned in a {@link TabixReader.Iterator}. * * @param iterator the iterator from which the lines are collected, may be null. * @return {@link Collection} of lines, is empty if the iterator was null. */ protected Collection<String> getLines( org.molgenis.data.annotation.core.resources.impl.tabix.TabixReader.Iterator iterator) { Builder<String> builder = ImmutableList.<String>builder(); if (iterator != null) { try { String line = iterator.next(); while (line != null) { builder.add(line); line = iterator.next(); } } catch (IOException e) { LOG.error("Error reading from tabix reader.", e); } } return builder.build(); } }