/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.mongodb.variant.converters;

import org.bson.Document;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AlternateCoordinate;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;

import java.io.IOException;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Converts between the {@link Document} representation of a study inside a variant
 * and the {@link StudyEntry} data model.
 *
 * <p>Reading ({@link #convertToDataModelType(Document)}) extracts the study id, the file
 * entries (optionally restricted to a set of file ids), the secondary alternates and,
 * when a {@link DocumentToSamplesConverter} is configured, the samples data. Writing
 * ({@code convertToStorageType}) produces the inverse document.
 *
 * @author Cristina Yenyxe Gonzalez Garcia <cyenyxe@ebi.ac.uk>
 */
public class DocumentToStudyVariantEntryConverter {

    public static final String STUDYID_FIELD = "sid";
    public static final String GENOTYPES_FIELD = "gt";

    public static final String FILES_FIELD = "files";
    public static final String FILEID_FIELD = "fid";
    public static final String SAMPLE_DATA_FIELD = "sampleData";
    public static final String ATTRIBUTES_FIELD = "attrs";
    public static final String ORI_FIELD = "_ori";

    public static final String ALTERNATES_FIELD = "alts";
    public static final String ALTERNATES_CHR = "chr";
    public static final String ALTERNATES_ALT = "alt";
    public static final String ALTERNATES_REF = "ref";
    public static final String ALTERNATES_START = "start";
    public static final String ALTERNATES_END = "end";
    public static final String ALTERNATES_TYPE = "type";

    private static final Logger LOGGER = Logger.getLogger(DocumentToStudyVariantEntryConverter.class.getName());

    private boolean includeSrc;
    /** File ids to keep when reading; {@code null} means "do not filter". */
    private Set<Integer> returnedFiles;

    private DocumentToSamplesConverter samplesConverter;
    private StudyConfigurationManager studyConfigurationManager = null;
    /** Cache of study id to study name, filled by {@link #addStudyName} and {@link #getStudyName}. */
    private final Map<Integer, String> studyIds = new HashMap<>();

    /**
     * Create a converter between StudyEntry and Document entities when
     * there is no need to provide a list of samples or statistics.
     *
     * @param includeSrc If true, will include and gzip the "src" attribute in the Document
     */
    public DocumentToStudyVariantEntryConverter(boolean includeSrc) {
        this.includeSrc = includeSrc;
        this.samplesConverter = null;
        this.returnedFiles = null;
    }

    /**
     * Create a converter between StudyEntry and Document entities that also
     * processes the samples through the given samples converter.
     *
     * @param includeSrc       If true, will include and gzip the "src" attribute in the Document
     * @param samplesConverter The object used to convert the samples. If null, won't convert
     */
    public DocumentToStudyVariantEntryConverter(boolean includeSrc, DocumentToSamplesConverter samplesConverter) {
        this(includeSrc);
        this.samplesConverter = samplesConverter;
    }

    /**
     * Create a converter between StudyEntry and Document entities that only reads
     * the requested files and processes the samples through the given samples converter.
     *
     * @param includeSrc       If true, will include and gzip the "src" attribute in the Document
     * @param returnedFiles    If present, reads the information of these files from FILES_FIELD.
     *                         If null, every file is read. The collection is copied.
     * @param samplesConverter The object used to convert the samples. If null, won't convert
     */
    public DocumentToStudyVariantEntryConverter(boolean includeSrc, Collection<Integer> returnedFiles,
                                                DocumentToSamplesConverter samplesConverter) {
        this(includeSrc);
        this.returnedFiles = (returnedFiles != null) ? new HashSet<>(returnedFiles) : null;
        this.samplesConverter = samplesConverter;
    }

    /**
     * Create a converter between StudyEntry and Document entities that only reads
     * a single file.
     *
     * @param includeSrc       If true, will include and gzip the "src" attribute in the Document
     * @param returnedFile     Id of the only file to read from FILES_FIELD. If null, every file is read
     * @param samplesConverter The object used to convert the samples. If null, won't convert
     */
    public DocumentToStudyVariantEntryConverter(boolean includeSrc, Integer returnedFile,
                                                DocumentToSamplesConverter samplesConverter) {
        this(includeSrc, toFileIdCollection(returnedFile), samplesConverter);
    }

    /**
     * Wraps a single file id into a collection. A null id maps to a null collection
     * ("no filter") instead of a collection holding null, which would match no file at all.
     */
    private static Collection<Integer> toFileIdCollection(Integer returnedFile) {
        if (returnedFile == null) {
            return null;
        }
        return Collections.singletonList(returnedFile);
    }

    public void setStudyConfigurationManager(StudyConfigurationManager studyConfigurationManager) {
        this.studyConfigurationManager = studyConfigurationManager;
    }

    /**
     * Registers the name to use for a study id, so {@link #getStudyName(int)} does not need to resolve it.
     *
     * @param studyId   Numeric study id
     * @param studyName Name returned for that id
     */
    public void addStudyName(int studyId, String studyName) {
        this.studyIds.put(studyId, studyName);
    }

    /**
     * Builds a {@link StudyEntry} from the study document.
     *
     * @param document Study document. Must contain a numeric {@link #STUDYID_FIELD}
     * @return the study entry with its files, secondary alternates and, if a samples
     *         converter is configured, its samples
     */
    public StudyEntry convertToDataModelType(Document document) {
        int studyId = ((Number) document.get(STUDYID_FIELD)).intValue();
        StudyEntry study = new StudyEntry(getStudyName(studyId));

        if (document.containsKey(FILES_FIELD)) {
            @SuppressWarnings("unchecked")
            List<Document> fileDocuments = (List<Document>) document.get(FILES_FIELD);
            study.setFiles(convertFileEntries(fileDocuments));
        }

        // Alternate alleles
        @SuppressWarnings("unchecked")
        List<Document> alternateDocuments = (List<Document>) document.get(ALTERNATES_FIELD);
        if (alternateDocuments != null && !alternateDocuments.isEmpty()) {
            if (study.getSecondaryAlternates() == null) {
                study.setSecondaryAlternates(new ArrayList<>(alternateDocuments.size()));
            }
            for (Document alternateDocument : alternateDocuments) {
                study.getSecondaryAlternates().add(convertAlternateCoordinate(alternateDocument));
            }
        }

        // Samples
        if (samplesConverter != null) {
            samplesConverter.convertToDataModelType(document, study, studyId);
        }

        return study;
    }

    /** Converts the file sub-documents into file entries, skipping the files excluded by {@code returnedFiles}. */
    private List<FileEntry> convertFileEntries(List<Document> fileDocuments) {
        List<FileEntry> files = new ArrayList<>(fileDocuments.size());
        for (Document fileDocument : fileDocuments) {
            // File ids may be stored negated; the entry always exposes the absolute value.
            Integer fid = Math.abs(((Number) fileDocument.get(FILEID_FIELD)).intValue());
            if (returnedFiles != null && !returnedFiles.contains(fid)) {
                continue;
            }

            HashMap<String, String> attributes = new HashMap<>();
            FileEntry fileEntry = new FileEntry(fid.toString(), null, attributes);
            files.add(fileEntry);

            // Attributes
            if (fileDocument.containsKey(ATTRIBUTES_FIELD)) {
                readAttributes((Document) fileDocument.get(ATTRIBUTES_FIELD), attributes);
            }

            if (fileDocument.containsKey(ORI_FIELD)) {
                Document ori = (Document) fileDocument.get(ORI_FIELD);
                fileEntry.setCall(ori.get("s") + ":" + ori.get("i"));
            } else {
                fileEntry.setCall("");
            }
        }
        return files;
    }

    /** Copies the stored attributes into {@code attributes}, unzipping "src" only when {@code includeSrc} is set. */
    private void readAttributes(Map<String, Object> storedAttributes, Map<String, String> attributes) {
        for (Map.Entry<String, Object> entry : storedAttributes.entrySet()) {
            String key = entry.getKey();
            if (key.equals("src")) {
                // Unzip the "src" field, if requested
                if (includeSrc) {
                    byte[] zippedSrc = (byte[]) entry.getValue();
                    try {
                        attributes.put(key, org.opencb.commons.utils.StringUtils.gunzip(zippedSrc));
                    } catch (IOException ex) {
                        // Best effort: the entry is still returned, just without its "src" attribute.
                        LOGGER.log(Level.SEVERE, "Unable to gunzip the \"src\" attribute", ex);
                    }
                }
            } else {
                attributes.put(key.replace(DocumentToStudyConfigurationConverter.TO_REPLACE_DOTS, "."),
                        entry.getValue().toString());
            }
        }
    }

    /** Builds an alternate coordinate from one element of {@link #ALTERNATES_FIELD}. */
    private static AlternateCoordinate convertAlternateCoordinate(Document alternateDocument) {
        VariantType variantType = null;
        String type = (String) alternateDocument.get(ALTERNATES_TYPE);
        if (type != null && !type.isEmpty()) {
            variantType = VariantType.valueOf(type);
        }
        return new AlternateCoordinate(
                (String) alternateDocument.get(ALTERNATES_CHR),
                toInteger(alternateDocument.get(ALTERNATES_START)),
                toInteger(alternateDocument.get(ALTERNATES_END)),
                (String) alternateDocument.get(ALTERNATES_REF),
                (String) alternateDocument.get(ALTERNATES_ALT),
                variantType);
    }

    /**
     * Reads a numeric value as an Integer, accepting any {@link Number} (same approach used
     * for the study and file ids) instead of requiring the stored value to be exactly an Integer.
     */
    private static Integer toInteger(Object value) {
        if (value == null) {
            return null;
        }
        return ((Number) value).intValue();
    }

    /**
     * Resolves the name of a study. Names are cached; unknown ids are looked up in the
     * {@link StudyConfigurationManager} when one is set. If there is no manager, or the
     * manager returns no result, the id itself (as a String) is used as the name.
     *
     * @param studyId Numeric study id
     * @return the study name, or the id as a String when it cannot be resolved
     */
    public String getStudyName(int studyId) {
        if (!studyIds.containsKey(studyId)) {
            String studyName = Integer.toString(studyId);
            if (studyConfigurationManager != null) {
                QueryResult<StudyConfiguration> queryResult = studyConfigurationManager.getStudyConfiguration(studyId, null);
                if (!queryResult.getResult().isEmpty()) {
                    studyName = queryResult.first().getStudyName();
                }
            }
            studyIds.put(studyId, studyName);
        }
        return studyIds.get(studyId);
    }

    /**
     * Converts a study entry holding exactly one file, using the study's ordered sample names.
     *
     * @param variant    Variant owning the study entry
     * @param studyEntry Study entry to convert. Must contain exactly one file
     * @return the study document
     * @throws IllegalArgumentException if the study does not contain exactly one file
     */
    public Document convertToStorageType(Variant variant, StudyEntry studyEntry) {
        if (studyEntry.getFiles().size() != 1) {
            throw new IllegalArgumentException("Expected just one file in the study to convert");
        }
        FileEntry file = studyEntry.getFiles().get(0);
        return convertToStorageType(variant, studyEntry, file, new LinkedHashSet<>(studyEntry.getOrderedSamplesName()));
    }

    /**
     * Converts a study entry for a single file.
     *
     * @param variant     Variant owning the study entry
     * @param studyEntry  Study entry to convert
     * @param file        File to write into the document
     * @param sampleNames Samples handed to the samples converter
     * @return the study document
     */
    public Document convertToStorageType(Variant variant, StudyEntry studyEntry, FileEntry file,
                                         LinkedHashSet<String> sampleNames) {
        return convertToStorageType(variant, studyEntry, Collections.singletonList(file), sampleNames);
    }

    /**
     * Converts a study entry for the given files.
     *
     * @param variant     Variant owning the study entry; supplies the defaults for alternate
     *                    coordinates that leave a field unset
     * @param studyEntry  Study entry to convert. Its study id must be an integer
     * @param files       Files to write into the document
     * @param sampleNames Samples handed to the samples converter
     * @return the study document
     * @throws NumberFormatException if the study id or a file id is not an integer
     */
    public Document convertToStorageType(Variant variant, StudyEntry studyEntry, List<FileEntry> files,
                                         LinkedHashSet<String> sampleNames) {
        int studyId = Integer.parseInt(studyEntry.getStudyId());
        Document studyObject = new Document(STUDYID_FIELD, studyId);

        // Alternate alleles
        List<Document> alternates = convertAlternates(variant, studyEntry.getSecondaryAlternates());

        final List<Document> fileDocuments;
        if (!files.isEmpty()) {
            fileDocuments = new ArrayList<>(files.size());
            for (FileEntry file : files) {
                Document fileObject = convertFileDocument(studyEntry, file);
                fileDocuments.add(fileObject);

                if (samplesConverter != null) {
                    Document otherFields = new Document();
                    fileObject.append(SAMPLE_DATA_FIELD, otherFields);
                    studyObject.putAll(samplesConverter.convertToStorageType(studyEntry, studyId, otherFields, sampleNames));
                }
            }
        } else {
            // NOTE(review): a default-constructed FileEntry presumably has no file id, in which case
            // convertFileDocument fails in Integer.parseInt. Confirm the intended behaviour for
            // studies without files before relying on this branch.
            fileDocuments = Collections.singletonList(convertFileDocument(studyEntry, new FileEntry()));
        }

        studyObject.append(FILES_FIELD, fileDocuments);
        if (!alternates.isEmpty()) {
            studyObject.append(ALTERNATES_FIELD, alternates);
        }

        return studyObject;
    }

    /**
     * Converts the secondary alternates into documents, taking chromosome, reference, start, end
     * and type from the variant whenever the coordinate leaves them unset.
     * A null list is treated as empty. Assumes the list does not contain the primary alternate.
     */
    private static List<Document> convertAlternates(Variant variant, List<AlternateCoordinate> secondaryAlternates) {
        if (secondaryAlternates == null || secondaryAlternates.isEmpty()) {
            return Collections.emptyList();
        }
        List<Document> alternates = new ArrayList<>(secondaryAlternates.size());
        for (AlternateCoordinate coordinate : secondaryAlternates) {
            Document alt = new Document();
            alt.put(ALTERNATES_CHR, coordinate.getChromosome() != null ? coordinate.getChromosome() : variant.getChromosome());
            alt.put(ALTERNATES_REF, coordinate.getReference() != null ? coordinate.getReference() : variant.getReference());
            alt.put(ALTERNATES_ALT, coordinate.getAlternate());
            alt.put(ALTERNATES_START, coordinate.getStart() != null ? coordinate.getStart() : variant.getStart());
            alt.put(ALTERNATES_END, coordinate.getEnd() != null ? coordinate.getEnd() : variant.getEnd());
            alt.put(ALTERNATES_TYPE, coordinate.getType() != null ? coordinate.getType().toString() : variant.getType().toString());
            alternates.add(alt);
        }
        return alternates;
    }

    /**
     * Builds the document of a single file: its id, its attributes and its call.
     *
     * <p>Dots in attribute names are replaced by
     * {@code DocumentToStudyConfigurationConverter.TO_REPLACE_DOTS}. The "src" attribute is
     * gzipped when {@code includeSrc} is set and dropped otherwise. Any other attribute is
     * stored as an Integer, Long or Double when its text parses as such, else as a String.
     *
     * @param studyEntry Study entry that is looked up for the call of the file
     * @param file       File to convert. Its file id must be an integer
     * @return the file document
     * @throws NumberFormatException    if the file id, or the text after the last ':' of the call,
     *                                  is not an integer
     * @throws IllegalArgumentException if the call is not empty and contains no ':'
     */
    protected Document convertFileDocument(StudyEntry studyEntry, FileEntry file) {
        int fileId = Integer.parseInt(file.getFileId());
        Document fileObject = new Document(FILEID_FIELD, fileId);

        // Attributes
        Map<String, String> attributes = file.getAttributes();
        if (attributes != null && !attributes.isEmpty()) {
            Document attrs = new Document();
            for (Map.Entry<String, String> entry : attributes.entrySet()) {
                String key = entry.getKey().replace(".", DocumentToStudyConfigurationConverter.TO_REPLACE_DOTS);
                String stringValue = entry.getValue();
                if (key.equals("src")) {
                    if (includeSrc) {
                        attrs.append(key, gzipSrc(stringValue));
                    }
                } else {
                    attrs.append(key, parseAttributeValue(stringValue));
                }
            }
            // Every attribute may have been skipped (e.g. only "src" with includeSrc disabled).
            if (!attrs.isEmpty()) {
                fileObject.put(ATTRIBUTES_FIELD, attrs);
            }
        }

        // Prefer the entry registered in the study; fall back to the given file if the lookup yields null.
        FileEntry studyFile = studyEntry.getFile(Integer.toString(fileId));
        String call = studyFile != null ? studyFile.getCall() : file.getCall();
        if (call != null && !call.isEmpty()) {
            // The last ':' splits the call: the prefix may itself contain ':' characters.
            int separator = call.lastIndexOf(':');
            if (separator < 0) {
                throw new IllegalArgumentException("Malformed call \"" + call + "\" in file " + fileId
                        + ": expected a ':' followed by an integer");
            }
            fileObject.append(ORI_FIELD, new Document("s", call.substring(0, separator))
                    .append("i", Integer.parseInt(call.substring(separator + 1))));
        }

        return fileObject;
    }

    /** Gzips the "src" attribute; if compression fails the error is logged and the plain text is kept. */
    private static Object gzipSrc(String src) {
        try {
            return org.opencb.commons.utils.StringUtils.gzip(src);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Unable to gzip the \"src\" attribute", ex);
            return src;
        }
    }

    /** Returns the narrowest of Integer, Long or Double that parses the text, or the text itself. */
    private static Object parseAttributeValue(String stringValue) {
        try {
            return Integer.parseInt(stringValue);
        } catch (NumberFormatException notAnInt) {
            try {
                return Long.parseLong(stringValue);
            } catch (NumberFormatException notALong) {
                try {
                    return Double.parseDouble(stringValue);
                } catch (NumberFormatException notADouble) {
                    // leave it as a String
                    return stringValue;
                }
            }
        }
    }

    public DocumentToSamplesConverter getSamplesConverter() {
        return samplesConverter;
    }

    public void setIncludeSrc(boolean includeSrc) {
        this.includeSrc = includeSrc;
    }
}