/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.catalog.utils; import org.opencb.biodata.formats.pedigree.io.PedigreePedReader; import org.opencb.biodata.formats.pedigree.io.PedigreeReader; import org.opencb.biodata.models.pedigree.Individual; import org.opencb.biodata.models.pedigree.Pedigree; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryResult; import org.opencb.opencga.catalog.managers.CatalogFileUtils; import org.opencb.opencga.catalog.managers.CatalogManager; import org.opencb.opencga.catalog.exceptions.CatalogException; import org.opencb.opencga.catalog.models.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URI; import java.util.*; /** * Created by jacobo on 29/01/15. */ public class CatalogSampleAnnotationsLoader { private static Logger logger = LoggerFactory.getLogger(CatalogFileUtils.class); private final CatalogManager catalogManager; public CatalogSampleAnnotationsLoader(CatalogManager catalogManager) { this.catalogManager = catalogManager; } protected CatalogSampleAnnotationsLoader() { this.catalogManager = null; } public QueryResult<Sample> loadSampleAnnotations(File pedFile, Long variableSetId, String sessionId) throws CatalogException { if (!pedFile.getFormat().equals(File.Format.PED)) { throw new CatalogException(pedFile.getId() + " is not a pedigree file"); } URI fileUri = catalogManager.getFileUri(pedFile); long studyId = catalogManager.getStudyIdByFileId(pedFile.getId()); long auxTime; long startTime = System.currentTimeMillis(); //Read Pedigree file Pedigree ped = readPedigree(fileUri.getPath()); Map<String, Sample> sampleMap = new HashMap<>(); //Take or infer the VariableSet VariableSet variableSet; if (variableSetId != null) { variableSet = catalogManager.getVariableSet(variableSetId, null, sessionId).getResult().get(0); } else { variableSet = getVariableSetFromPedFile(ped); CatalogAnnotationsValidator.checkVariableSet(variableSet); } //Check VariableSet for all samples for (Individual individual : ped.getIndividuals().values()) { Map<String, Object> annotation = getAnnotation(individual, sampleMap, variableSet, ped.getFields()); HashSet<Annotation> annotationSet = new HashSet<>(annotation.size()); for (Map.Entry<String, Object> annotationEntry : annotation.entrySet()) { annotationSet.add(new Annotation(annotationEntry.getKey(), annotationEntry.getValue())); } try { CatalogAnnotationsValidator.checkAnnotationSet(variableSet, new AnnotationSet("", variableSet.getId(), annotationSet, "", null), null); } catch (CatalogException e) { String message = "Validation with the variableSet {id: " + variableSetId + "} over ped File = {id: " + pedFile.getId() + ", name: \"" + pedFile.getName() + "\"} failed"; logger.info(message); throw new CatalogException(message, e); } } /** Pedigree file validated. Add samples and VariableSet **/ //Add VariableSet (if needed) if (variableSetId == null) { auxTime = System.currentTimeMillis(); variableSet = catalogManager.createVariableSet(studyId, pedFile.getName(), true, "Auto-generated VariableSet from File = {id: " + pedFile.getId() + ", name: \"" + pedFile.getName() + "\"}", null, variableSet.getVariables(), sessionId).getResult().get(0); variableSetId = variableSet.getId(); logger.debug("Added VariableSet = {id: {}} in {}ms", variableSetId, System.currentTimeMillis() - auxTime); } //Add Samples Query samplesQuery = new Query("name", new LinkedList<>(ped.getIndividuals().keySet())); Map<String, Sample> loadedSamples = new HashMap<>(); for (Sample sample : catalogManager.getAllSamples(studyId, samplesQuery, null, sessionId).getResult()) { loadedSamples.put(sample.getName(), sample); } auxTime = System.currentTimeMillis(); for (Individual individual : ped.getIndividuals().values()) { Sample sample; if (loadedSamples.containsKey(individual.getId())) { sample = loadedSamples.get(individual.getId()); logger.info("Sample " + individual.getId() + " already loaded with id : " + sample.getId()); } else { QueryResult<Sample> sampleQueryResult = catalogManager.createSample(studyId, individual.getId(), pedFile.getName(), "Sample loaded from the pedigree File = {id: " + pedFile.getId() + ", name: \"" + pedFile.getName() + "\" }", Collections.emptyMap(), null, sessionId); sample = sampleQueryResult.getResult().get(0); } sampleMap.put(individual.getId(), sample); } logger.debug("Added {} samples in {}ms", ped.getIndividuals().size(), System.currentTimeMillis() - auxTime); //Annotate Samples auxTime = System.currentTimeMillis(); for (Map.Entry<String, Sample> entry : sampleMap.entrySet()) { Map<String, Object> annotations = getAnnotation(ped.getIndividuals().get(entry.getKey()), sampleMap, variableSet, ped .getFields()); catalogManager.getSampleManager().createAnnotationSet(Long.toString(entry.getValue().getId()), Long.toString(studyId), variableSetId, "pedigreeAnnotation", annotations, Collections.emptyMap(), sessionId); } logger.debug("Annotated {} samples in {}ms", ped.getIndividuals().size(), System.currentTimeMillis() - auxTime); //TODO: Create Cohort QueryResult<Sample> sampleQueryResult = catalogManager.getAllSamples(studyId, new Query("variableSetId", variableSetId), null, sessionId); return new QueryResult<>("loadPedigree", (int) (System.currentTimeMillis() - startTime), sampleMap.size(), sampleMap.size(), null, null, sampleQueryResult.getResult()); } /** * @param individual Individual from Pedigree file * @param sampleMap Map<String, Sample>, to relate "sampleName" with "sampleId" * @param variableSet VariableSet to annotate * @param fields fields * @return Map<String, Object> Map */ protected Map<String, Object> getAnnotation(Individual individual, Map<String, Sample> sampleMap, VariableSet variableSet, Map<String, Integer> fields) { if (sampleMap == null) { sampleMap = new HashMap<>(); } Map<String, Object> annotations = new HashMap<>(); for (Variable variable : variableSet.getVariables()) { switch (variable.getName()) { case "family": annotations.put("family", individual.getFamily()); break; case "name": annotations.put("name", individual.getId()); break; case "fatherName": annotations.put("fatherName", individual.getFatherId()); break; case "motherName": annotations.put("motherName", individual.getMotherId()); break; case "sex": annotations.put("sex", individual.getSex()); break; case "phenotype": annotations.put("phenotype", individual.getPhenotype()); break; case "id": Sample sample = sampleMap.get(individual.getId()); if (sample != null) { annotations.put("id", sample.getId()); } else { annotations.put("id", -1); } break; case "fatherId": Sample father = sampleMap.get(individual.getFatherId()); if (father != null) { annotations.put("fatherId", father.getId()); } break; case "motherId": Sample mother = sampleMap.get(individual.getMotherId()); if (mother != null) { annotations.put("motherId", mother.getId()); } break; default: Integer idx = fields.get(variable.getName()); if (idx != null) { annotations.put(variable.getName(), individual.getFields()[idx]); } break; } } return annotations; } protected VariableSet getVariableSetFromPedFile(Pedigree ped) throws CatalogException { List<Variable> variableList = new LinkedList<>(); String category = "PEDIGREE"; variableList.add(new Variable("family", category, Variable.VariableType.TEXT, null, true, false, Collections.<String>emptyList(), variableList.size(), null, "", null, null)); variableList.add(new Variable("id", category, Variable.VariableType.NUMERIC, null, true, false, Collections.<String>emptyList(), variableList.size(), null, "", null, null)); variableList.add(new Variable("name", category, Variable.VariableType.TEXT, null, true, false, Collections.<String>emptyList(), variableList.size(), null, "", null, null)); variableList.add(new Variable("fatherId", category, Variable.VariableType.NUMERIC, null, false, false, Collections.<String>emptyList(), variableList.size(), null, "", null, null)); variableList.add(new Variable("fatherName", category, Variable.VariableType.TEXT, null, false, false, Collections.<String>emptyList(), variableList.size(), null, "", null, null)); variableList.add(new Variable("motherId", category, Variable.VariableType.NUMERIC, null, false, false, Collections.<String>emptyList(), variableList.size(), null, "", null, null)); variableList.add(new Variable("motherName", category, Variable.VariableType.TEXT, null, false, false, Collections.<String>emptyList(), variableList.size(), null, "", null, null)); Set<String> allowedSexValues = new HashSet<>(); HashSet<String> allowedPhenotypeValues = new HashSet<>(); for (Individual individual : ped.getIndividuals().values()) { allowedPhenotypeValues.add(individual.getPhenotype()); allowedSexValues.add(individual.getSex()); } variableList.add(new Variable("sex", category, Variable.VariableType.CATEGORICAL, null, true, false, new LinkedList<>(allowedSexValues), variableList.size(), null, "", null, null)); variableList.add(new Variable("phenotype", category, Variable.VariableType.CATEGORICAL, null, true, false, new LinkedList<>(allowedPhenotypeValues), variableList.size(), null, "", null, null)); int categoricalThreshold = (int) (ped.getIndividuals().size() * 0.1); for (Map.Entry<String, Integer> entry : ped.getFields().entrySet()) { boolean isNumerical = true; Set<String> allowedValues = new HashSet<>(); for (Individual individual : ped.getIndividuals().values()) { String s = individual.getFields()[entry.getValue()]; if (isNumerical) { try { Double.parseDouble(s); } catch (Exception e) { isNumerical = false; } } allowedValues.add(s); } Variable.VariableType type; if (allowedValues.size() < categoricalThreshold) { float meanSize = 0; for (String value : allowedValues) { meanSize += value.length(); } meanSize /= allowedValues.size(); float deviation = 0; for (String value : allowedValues) { deviation += (value.length() - meanSize) * (value.length() - meanSize); } deviation /= allowedValues.size(); if (deviation < 10) { type = Variable.VariableType.CATEGORICAL; } else { if (isNumerical) { type = Variable.VariableType.NUMERIC; } else { type = Variable.VariableType.TEXT; } } } else { if (isNumerical) { type = Variable.VariableType.NUMERIC; } else { type = Variable.VariableType.TEXT; } } if (!type.equals(Variable.VariableType.CATEGORICAL)) { allowedValues.clear(); } variableList.add(new Variable(entry.getKey(), category, type, null, false, false, new ArrayList<>(allowedValues), variableList.size(), null, "", null, null)); } VariableSet variableSet = new VariableSet(-1, "", false, "", new HashSet(variableList), null); return variableSet; } protected Pedigree readPedigree(String fileName) { PedigreeReader reader = new PedigreePedReader(fileName); reader.open(); reader.pre(); List<Pedigree> read = reader.read(); reader.post(); reader.close(); return read.get(0); } }