package org.molgenis.data.annotation.core.entity.impl.omim; import au.com.bytecode.opencsv.CSVReader; import com.google.common.collect.Iterables; import org.molgenis.data.Entity; import org.molgenis.data.MolgenisDataException; import org.molgenis.data.Query; import org.molgenis.data.QueryRule.Operator; import org.molgenis.data.RepositoryCapability; import org.molgenis.data.meta.model.AttributeFactory; import org.molgenis.data.meta.model.EntityType; import org.molgenis.data.meta.model.EntityTypeFactory; import org.molgenis.data.support.AbstractRepository; import org.molgenis.data.support.DynamicEntity; import java.io.*; import java.util.*; import java.util.stream.Stream; import static au.com.bytecode.opencsv.CSVParser.DEFAULT_QUOTE_CHARACTER; import static com.google.common.collect.Lists.newArrayList; import static java.nio.charset.Charset.forName; import static java.util.Arrays.asList; import static java.util.Collections.emptySet; import static org.apache.commons.lang3.StringUtils.join; import static org.molgenis.data.annotation.core.entity.impl.omim.OmimAnnotator.NAME; import static org.molgenis.data.annotation.core.entity.impl.omim.OmimAnnotator.SEPARATOR; import static org.molgenis.data.meta.model.EntityType.AttributeRole.ROLE_ID; public class OmimRepository extends AbstractRepository { public static final String OMIM_AUTO_ID_COL_NAME = "ID"; public static final String OMIM_PHENOTYPE_COL_NAME = "Phenotype"; public static final String OMIM_GENE_SYMBOLS_COL_NAME = "Gene_Name"; public static final String OMIM_MIM_NUMBER_COL_NAME = "MIMNumber"; public static final String OMIM_CYTO_LOCATION_COL_NAME = "CytoLocation"; public static final String OMIM_ENTRY_COL_NAME = "OmimEntry"; public static final String OMIM_TYPE_COL_NAME = "OmimType"; private final AttributeFactory attributeFactory; private final EntityTypeFactory entityTypeFactory; private Map<String, List<Entity>> entitiesByGeneSymbol; private final File file; public OmimRepository(File file, EntityTypeFactory entityTypeFactory, AttributeFactory attributeFactory) { this.file = file; this.entityTypeFactory = entityTypeFactory; this.attributeFactory = attributeFactory; } @Override public Set<RepositoryCapability> getCapabilities() { return emptySet(); } @Override public EntityType getEntityType() { EntityType entityType = entityTypeFactory.create().setSimpleName(NAME); entityType.addAttribute(attributeFactory.create().setName(OMIM_GENE_SYMBOLS_COL_NAME), ROLE_ID); entityType.addAttribute(attributeFactory.create().setName(OMIM_PHENOTYPE_COL_NAME)); entityType.addAttribute(attributeFactory.create().setName(OMIM_MIM_NUMBER_COL_NAME)); entityType.addAttribute(attributeFactory.create().setName(OMIM_CYTO_LOCATION_COL_NAME)); entityType.addAttribute(attributeFactory.create().setName(OMIM_ENTRY_COL_NAME)); entityType.addAttribute(attributeFactory.create().setName(OMIM_TYPE_COL_NAME)); return entityType; } @Override public Iterator<Entity> iterator() { return getEntities().iterator(); } @Override public Stream<Entity> findAll(Query<Entity> q) { if (q.getRules().isEmpty()) { return getEntities().stream(); } if ((q.getRules().size() != 1) || (q.getRules().get(0).getOperator() != Operator.EQUALS)) { throw new MolgenisDataException("The only query allowed on this Repository is gene EQUALS"); } String geneSymbol = (String) q.getRules().get(0).getValue(); List<Entity> entities = getEntitiesByGeneSymbol().get(geneSymbol); return entities != null ? entities.stream() : Stream.empty(); } @Override public long count() { return Iterables.size(this); } private List<Entity> getEntities() { List<Entity> entities = new ArrayList<>(); getEntitiesByGeneSymbol().forEach((geneSymbol, geneSymbolEntities) -> entities.addAll(geneSymbolEntities)); return entities; } private Map<String, List<Entity>> getEntitiesByGeneSymbol() { if (entitiesByGeneSymbol == null) { Map<String, List<List<String>>> omimEntriesByGeneSymbol = new HashMap<>(); entitiesByGeneSymbol = new LinkedHashMap<>(); try (CSVReader csvReader = new CSVReader(new InputStreamReader(new FileInputStream(file), forName("UTF-8")), SEPARATOR, DEFAULT_QUOTE_CHARACTER, 1)) { String[] values = csvReader.readNext(); while (values != null) { addLineToMap(omimEntriesByGeneSymbol, values); values = csvReader.readNext(); } for (String geneSymbol : omimEntriesByGeneSymbol.keySet()) { addEntityToGeneEntityList(omimEntriesByGeneSymbol, geneSymbol); } } catch (IOException e) { throw new UncheckedIOException(e); } } return entitiesByGeneSymbol; } /** * Uses the map containing the parsed OMIM map to create a list of {@link Entity} * * @param omimEntriesByGeneSymbol * @param geneSymbol */ private void addEntityToGeneEntityList(Map<String, List<List<String>>> omimEntriesByGeneSymbol, String geneSymbol) { Entity entity = new DynamicEntity(getEntityType()); entity.set(OMIM_GENE_SYMBOLS_COL_NAME, geneSymbol); entity.set(OMIM_PHENOTYPE_COL_NAME, join(omimEntriesByGeneSymbol.get(geneSymbol).get(0), ",")); entity.set(OMIM_MIM_NUMBER_COL_NAME, join(omimEntriesByGeneSymbol.get(geneSymbol).get(1), ",")); entity.set(OMIM_CYTO_LOCATION_COL_NAME, join(omimEntriesByGeneSymbol.get(geneSymbol).get(2), ",")); entity.set(OMIM_TYPE_COL_NAME, join(omimEntriesByGeneSymbol.get(geneSymbol).get(3), ",")); entity.set(OMIM_ENTRY_COL_NAME, join(omimEntriesByGeneSymbol.get(geneSymbol).get(4), ",")); List<Entity> entities = entitiesByGeneSymbol.get(geneSymbol); if (entities == null) { entities = new ArrayList<>(); entitiesByGeneSymbol.put(geneSymbol, entities); } entities.add(entity); } /* * Get and parse OMIM entries. * * Do not store entries without an OMIM identifier... e.g. this one: Leukemia, acute myelogenous (3)|KRAS, KRAS2, * RASK2, NS, CFC2|190070|12p12.1 * * But do store this one: Leukemia, acute myelogenous, 601626 (3)|GMPS|600358|3q25.31 */ private void addLineToMap(Map<String, List<List<String>>> omimEntriesByGeneSymbol, String[] values) { // trim mapping method field, example: (3) String entry = values[0]; entry = entry.substring(0, entry.length() - 3); entry = entry.trim(); // last six characters should be OMIM id entry = entry.substring(entry.length() - 6); if (entry.matches("[0-9]+")) { String disorder = values[0].substring(0, values[0].length() - 12); List<String> genes = asList(values[1].split(", ")); String causalIdentifier = values[2]; String cytogenicLocation = values[3]; String type = values[0].substring(values[0].length() - 2, values[0].length() - 1); String omimEntry = entry; for (String geneSymbol : genes) { if (omimEntriesByGeneSymbol.containsKey(geneSymbol)) { omimEntriesByGeneSymbol.get(geneSymbol).get(0).add(disorder); // first list is phenoype omimEntriesByGeneSymbol.get(geneSymbol).get(1).add(causalIdentifier); // second is mim number omimEntriesByGeneSymbol.get(geneSymbol).get(2).add(cytogenicLocation); // third is cyto location omimEntriesByGeneSymbol.get(geneSymbol).get(3).add(type); // fourth is type of syndrome omimEntriesByGeneSymbol.get(geneSymbol).get(4).add(omimEntry); // fifth is omim entry location } else { LinkedList<List<String>> mapList = new LinkedList<>(); mapList.add(newArrayList(disorder)); mapList.add(newArrayList(causalIdentifier)); mapList.add(newArrayList(cytogenicLocation)); mapList.add(newArrayList(type)); mapList.add(newArrayList(omimEntry)); omimEntriesByGeneSymbol.put(geneSymbol, mapList); } } } } }