package org.molgenis.data.annotation.core.resources.impl.tabix; import au.com.bytecode.opencsv.CSVParser; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList.Builder; import org.molgenis.data.DataConverter; import org.molgenis.data.Entity; import org.molgenis.data.Query; import org.molgenis.data.RepositoryCapability; import org.molgenis.data.meta.model.Attribute; import org.molgenis.data.meta.model.EntityType; import org.molgenis.data.support.AbstractRepository; import org.molgenis.data.support.DynamicEntity; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.util.*; import java.util.stream.Stream; import static java.util.Objects.requireNonNull; import static org.molgenis.data.vcf.model.VcfAttributes.CHROM; import static org.molgenis.data.vcf.model.VcfAttributes.POS; public class TabixRepository extends AbstractRepository { private static final Logger LOG = LoggerFactory.getLogger(TabixRepository.class); private TabixReader reader; private EntityType entityType; private final String chromosomeAttributeName; private final String positionAttributeName; /** * Creates a new {@link TabixRepository} * * @param file the Tabix file * @param entityType {@link EntityType} for the tabix file. Attributes should be in the order of the file's columns * @throws IOException if something goes wrong creating the {@link TabixReader} for the file */ public TabixRepository(File file, EntityType entityType) throws IOException { this(file, entityType, CHROM, POS); } public TabixRepository(File file, EntityType entityType, String chromosomeAttributeName, String positionAttributeName) throws IOException { this.entityType = entityType; this.reader = new TabixReader(file.getAbsolutePath()); this.chromosomeAttributeName = requireNonNull(chromosomeAttributeName); this.positionAttributeName = requireNonNull(positionAttributeName); } TabixRepository(TabixReader reader, EntityType entityType, String chromosomeAttributeName, String positionAttributeName) { this.reader = requireNonNull(reader); this.entityType = requireNonNull(entityType); this.chromosomeAttributeName = requireNonNull(chromosomeAttributeName); this.positionAttributeName = requireNonNull(positionAttributeName); } public static CSVParser getCsvParser() { return new CSVParser('\t'); } @Override public Set<RepositoryCapability> getCapabilities() { return Collections.singleton(RepositoryCapability.QUERYABLE); } public EntityType getEntityType() { return entityType; } @Override public Stream<Entity> findAll(Query<Entity> q) { Object posValue = getFirstEqualsValueFor(positionAttributeName, q); Object chromValue = getFirstEqualsValueFor(chromosomeAttributeName, q); List<Entity> result = new ArrayList<Entity>(); // if one of both required attributes is null, skip the query and return an empty list if (posValue != null && chromValue != null) { int posIntValue = Integer.parseInt(posValue.toString()); String chromStringValue = chromValue.toString(); result = query(chromStringValue, Integer.valueOf(posIntValue)); } return result.stream(); } /** * Queries the tabix reader. * * @param chrom name of the chromosome * @param pos position * @return {@link ImmutableList} of entities found */ private synchronized ImmutableList<Entity> query(String chrom, int pos) { String queryString = String.format("%s:%s-%2$s", chrom, pos); LOG.debug("query({})", queryString); Builder<Entity> builder = ImmutableList.<Entity>builder(); try { org.molgenis.data.annotation.core.resources.impl.tabix.TabixReader.Iterator iterator = reader .query(queryString); if (iterator != null) { String line = iterator.next(); while (line != null) { Entity entity = toEntity(line); if (entity.getInt(positionAttributeName) == pos) { builder.add(entity); } else { LOG.warn("TabixReader returns entity that does not match the query!"); } line = iterator.next(); } } else { return ImmutableList.of(); // empty list } } catch (IOException e) { LOG.error("Error reading from tabix resource", e); } catch (NullPointerException e) { //FIXME: group the occurances of this exception and log once per annotation run LOG.trace("Unable to read from tabix resource for query: " + queryString + " (Position not present in resource file?)"); LOG.debug("", e); } catch (ArrayIndexOutOfBoundsException e) { //FIXME: group the occurances of this exception and log once per annotation run LOG.trace("Unable to read from tabix resource for query: " + queryString + " (Chromosome not present in resource file?)"); LOG.debug("", e); } return builder.build(); } private static Object getFirstEqualsValueFor(String attributeName, Query<Entity> q) { return q.getRules().stream().filter(rule -> attributeName.equals(rule.getField())).findFirst().get().getValue(); } protected Entity toEntity(String line) throws IOException { Entity result = new DynamicEntity(entityType); CSVParser csvParser = getCsvParser(); String[] columns = csvParser.parseLine(line); int i = 0; for (Attribute amd : entityType.getAtomicAttributes()) { if (i < columns.length) { result.set(amd.getName(), DataConverter.convert(columns[i++], amd)); } } return result; } private class TabixRepositoryIterator implements Iterator<Entity> { private String nextLine = null; @Override public boolean hasNext() { if (nextLine != null) { return true; } try { nextLine = reader.readLine(); return nextLine != null; } catch (IOException e) { return false; } } @Override public Entity next() { if (!hasNext()) { throw new NoSuchElementException(); } try { return toEntity(nextLine); } catch (IOException e) { throw new NoSuchElementException(); } finally { nextLine = null; } } } @Override public Iterator<Entity> iterator() { return new TabixRepositoryIterator(); } }