/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.catalog.utils;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.opencb.biodata.formats.alignment.sam.io.AlignmentSamDataReader;
import org.opencb.biodata.models.alignment.AlignmentHeader;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.stats.VariantGlobalStats;
import org.opencb.biodata.tools.variant.VariantFileUtils;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.commons.utils.FileUtils;
import org.opencb.opencga.catalog.db.api.FileDBAdaptor;
import org.opencb.opencga.catalog.exceptions.CatalogException;
import org.opencb.opencga.catalog.exceptions.CatalogIOException;
import org.opencb.opencga.catalog.managers.CatalogFileUtils;
import org.opencb.opencga.catalog.managers.CatalogManager;
import org.opencb.opencga.catalog.models.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.file.Paths;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* @author Jacobo Coll <jacobo167@gmail.com>
*/
public class FileMetadataReader {
/** Key under which the variant global stats are stored inside the file "stats" map (see updateVariantFileStats). */
public static final String VARIANT_STATS = "variantStats";
// Projection used when fetching a study: only the study id, name and alias are included.
private static final QueryOptions STUDY_QUERY_OPTIONS =
new QueryOptions("include", Arrays.asList("projects.studies.id", "projects.studies.name", "projects.studies.alias"));
// Catalog entry point used for every study, file and sample operation performed by this reader.
private final CatalogManager catalogManager;
// Class logger.
protected static Logger logger = LoggerFactory.getLogger(FileMetadataReader.class);
/** Boolean option key read by setMetadataInformation (defaults to true) and forwarded to getFileSamples. */
public static final String CREATE_MISSING_SAMPLES = "createMissingSamples";
// Helper built from the catalog manager; used to compute the modified file attributes of a file.
private final CatalogFileUtils catalogFileUtils;
/**
 * Builds a metadata reader bound to the given catalog.
 *
 * @param catalogManager Catalog manager used to query and update catalog entries.
 *                       It is also used to build the internal {@link CatalogFileUtils}.
 */
public FileMetadataReader(CatalogManager catalogManager) {
    this.catalogFileUtils = new CatalogFileUtils(catalogManager);
    this.catalogManager = catalogManager;
}
/**
 * Creates a file entry in catalog reading metadata information from the fileUri.
 * Do not upload or sync file. Created file status will be {@link File.FileStatus#STAGE}
 *
 * If reading the metadata fails, the error is logged and the file entry is returned as it was
 * created, so the result never contains a {@code null} element.
 *
 * @param studyId     Study on where the file entry is created
 * @param fileUri     File URI to read metadata information.
 * @param path        File path, relative to the study. If it ends with "/", the file name of fileUri is appended.
 * @param description File description (optional)
 * @param parents     Create parent folders or not
 * @param options     Other options
 * @param sessionId   User sessionId
 * @return The created file with status {@link File.FileStatus#STAGE}
 * @throws CatalogException if a Catalog error occurs while creating the file entry
 */
public QueryResult<File> create(long studyId, URI fileUri, String path, String description, boolean parents,
                                QueryOptions options, String sessionId) throws CatalogException {
    // A trailing slash in the URI path marks a directory.
    File.Type type = fileUri.getPath().endsWith("/") ? File.Type.DIRECTORY : File.Type.FILE;
    File.Format format = FormatDetector.detect(fileUri);
    File.Bioformat bioformat = BioformatDetector.detect(fileUri);

    // The catalog path points to a folder: reuse the name of the source file.
    if (path.endsWith("/")) {
        path += Paths.get(fileUri.getPath()).getFileName().toString();
    }

    QueryResult<File> fileResult = catalogManager.createFile(studyId, type, format, bioformat, path, null, description,
            new File.FileStatus(File.FileStatus.STAGE), 0, -1, null, -1, null, null, parents, options, sessionId);

    // Start from the created entry so that a metadata failure still yields a valid file.
    File createdFile = fileResult.first();
    File resultFile = createdFile;
    try {
        resultFile = setMetadataInformation(createdFile, fileUri, options, sessionId, false);
    } catch (CatalogException e) {
        // Best effort: the file entry already exists, metadata can be read again later.
        logger.error("Fail at getting the metadata information", e);
    }
    fileResult.setResult(Collections.singletonList(resultFile));

    return fileResult;
}
/**
 * Reads the file and modifies the Catalog file entry with metadata information. The metadata information read is:
 * Bioformat
 * Format
 * FileHeader (for known bioformats)
 * SampleIds
 * Disk usage (size)
 * Modification date and URI (when reported by the catalog file utils)
 *
 * Directories are returned untouched. The given file object is updated in place with the detected
 * format and bioformat.
 *
 * @param file      File from which read metadata
 * @param fileUri   File location. If null, ask to Catalog.
 * @param options   Other options. {@link #CREATE_MISSING_SAMPLES} (default true) controls sample creation.
 * @param sessionId User sessionId
 * @param simulate  Simulate the metadata modifications. NOTE: the flag is only forwarded to
 *                  {@link #getFileSamples}; the update of the file entry is executed regardless of its value.
 * @return If there are no modifications, return the same input file. Else, return the updated file
 * @throws CatalogException if a Catalog error occurs
 */
public File setMetadataInformation(final File file, URI fileUri, QueryOptions options, String sessionId, boolean simulate)
        throws CatalogException {
    long studyId = catalogManager.getStudyIdByFileId(file.getId());
    if (fileUri == null) {
        fileUri = catalogManager.getFileUri(file);
    }
    options = ParamUtils.defaultObject(options, QueryOptions::new);
    ObjectMap modifyParams = new ObjectMap();

    if (file.getType() == File.Type.DIRECTORY) {
        return file;
    }

    //Get metadata information
    File.Format format = FormatDetector.detect(fileUri);
    File.Bioformat bioformat = BioformatDetector.detect(fileUri);

    // Only overwrite the stored values when the detection produced something meaningful.
    if (format != File.Format.UNKNOWN && !format.equals(file.getFormat())) {
        modifyParams.put(FileDBAdaptor.QueryParams.FORMAT.key(), format);
        file.setFormat(format);
    }
    if (bioformat != File.Bioformat.NONE && !bioformat.equals(file.getBioformat())) {
        modifyParams.put(FileDBAdaptor.QueryParams.BIOFORMAT.key(), bioformat);
        file.setBioformat(bioformat);
    }

    Study study = null;

    // Headers can only be read if the file is actually reachable.
    boolean exists = catalogManager.getCatalogIOManagerFactory().get(fileUri).exists(fileUri);
    if (exists) {
        switch (bioformat) {
            case ALIGNMENT: {
                study = catalogManager.getStudy(studyId, STUDY_QUERY_OPTIONS, sessionId).first();
                AlignmentHeader alignmentHeader = readAlignmentHeader(study, file, fileUri);
                if (alignmentHeader != null) {
                    HashMap<String, Object> attributes = new HashMap<>();
                    attributes.put("alignmentHeader", alignmentHeader);
                    modifyParams.put(FileDBAdaptor.QueryParams.ATTRIBUTES.key(), attributes);
                }
                break;
            }
            case VARIANT: {
                study = catalogManager.getStudy(studyId, STUDY_QUERY_OPTIONS, sessionId).first();
                VariantSource variantSource;
                try {
                    variantSource = readVariantSource(study, file, fileUri);
                } catch (IOException e) {
                    throw new CatalogIOException("Unable to read VariantSource", e);
                }
                if (variantSource != null) {
                    HashMap<String, Object> attributes = new HashMap<>();
                    attributes.put("variantSource", variantSource);
                    modifyParams.put(FileDBAdaptor.QueryParams.ATTRIBUTES.key(), attributes);
                }
                break;
            }
            default:
                break;
        }
    }

    // getFileSamples dereferences the study when the file already has sample ids, or when it has to read
    // a VARIANT/ALIGNMENT header. The switch above leaves the study unset when the file does not exist or
    // when the detected bioformat differs from the stored one, so resolve it here to avoid a NullPointerException.
    boolean hasSampleIds = file.getSampleIds() != null && !file.getSampleIds().isEmpty();
    boolean hasSampleHeader = file.getBioformat() == File.Bioformat.VARIANT
            || file.getBioformat() == File.Bioformat.ALIGNMENT;
    if (study == null && (hasSampleIds || hasSampleHeader)) {
        study = catalogManager.getStudy(studyId, STUDY_QUERY_OPTIONS, sessionId).first();
    }

    // The returned sample list is not needed here: getFileSamples fills modifyParams as a side effect.
    getFileSamples(study, file, fileUri, modifyParams, options.getBoolean(CREATE_MISSING_SAMPLES, true), simulate, options, sessionId);

    modifyParams.putAll(catalogFileUtils.getModifiedFileAttributes(file, fileUri, false));

    if (!modifyParams.isEmpty()) {
        // Size, modification date and URI are set through their dedicated file manager calls,
        // and removed from the generic update parameters.
        if (modifyParams.get(FileDBAdaptor.QueryParams.SIZE.key()) != null) {
            catalogManager.getFileManager()
                    .setDiskUsage(file.getId(), modifyParams.getLong(FileDBAdaptor.QueryParams.SIZE.key()), sessionId);
            modifyParams.remove(FileDBAdaptor.QueryParams.SIZE.key());
        }
        if (modifyParams.get(FileDBAdaptor.QueryParams.MODIFICATION_DATE.key()) != null) {
            catalogManager.getFileManager()
                    .setModificationDate(file.getId(), modifyParams.getString(FileDBAdaptor.QueryParams.MODIFICATION_DATE.key()),
                            sessionId);
            modifyParams.remove(FileDBAdaptor.QueryParams.MODIFICATION_DATE.key());
        }
        if (modifyParams.get(FileDBAdaptor.QueryParams.URI.key()) != null) {
            catalogManager.getFileManager()
                    .setUri(file.getId(), modifyParams.getString(FileDBAdaptor.QueryParams.URI.key()), sessionId);
            modifyParams.remove(FileDBAdaptor.QueryParams.URI.key());
        }
        // Whatever is left goes through the generic update.
        if (!modifyParams.isEmpty()) {
            catalogManager.getFileManager().update(file.getId(), modifyParams, new QueryOptions(), sessionId);
        }

        // Return the entry as stored in catalog after the modifications.
        return catalogManager.getFile(file.getId(), options, sessionId).first();
    }

    return file;
}
/**
 * Get samples from file header.
 *
 * If the file already has sample ids, the samples are fetched from catalog by id. Otherwise the sample
 * names are taken from the variantSource / alignmentHeader (from the file attributes, from the pending
 * modification attributes, or read from the file itself), matched by name against the catalog samples of
 * the study, and returned in the same order as the names were obtained.
 *
 * Files with a bioformat other than VARIANT or ALIGNMENT, or without sample names, produce an empty list
 * and leave fileModifyParams untouched. In any other case "sampleIds" (and "attributes", when not empty)
 * are written into fileModifyParams.
 *
 * @param study                Study where the file is.
 * @param file                 File from which read samples.
 * @param fileUri              File location, used to read the header when it is not already available.
 * @param fileModifyParams     ModifyParams to add sampleIds and other related information (like header).
 * @param createMissingSamples Create samples from the file that where missing. If false and there are
 *                             missing samples, a CatalogException is thrown.
 * @param simulate             Simulate the creation of samples: missing samples are returned with id -1
 *                             instead of being created in catalog.
 * @param options              Options
 * @param sessionId            User sessionId
 * @return List of samples in the given file
 * @throws CatalogException if a Catalog error occurs
 */
public List<Sample> getFileSamples(Study study, File file, URI fileUri, final ObjectMap fileModifyParams,
                                   boolean createMissingSamples, boolean simulate, QueryOptions options, String sessionId)
        throws CatalogException {
    options = ParamUtils.defaultObject(options, QueryOptions::new);

    List<Sample> sampleList;

    // Reuse the attributes already scheduled for modification, so headers read here are added to them.
    Map<String, Object> attributes;
    if (!fileModifyParams.containsKey("attributes")) {
        attributes = new HashMap<>();
    } else {
        attributes = fileModifyParams.getMap("attributes");
    }

    List<String> includeSampleNameId = Arrays.asList("projects.studies.samples.id", "projects.studies.samples.name");
    if (file.getSampleIds() == null || file.getSampleIds().isEmpty()) {
        //Read samples from file
        List<String> sortedSampleNames = null;
        // A pending bioformat modification takes precedence over the stored one.
        switch (fileModifyParams.containsKey("bioformat") ? (File.Bioformat) fileModifyParams.get("bioformat") : file.getBioformat()) {
            case VARIANT: {
                Object variantSourceObj = null;
                if (file.getAttributes().containsKey("variantSource")) {
                    variantSourceObj = file.getAttributes().get("variantSource");
                } else if (attributes.containsKey("variantSource")) {
                    variantSourceObj = attributes.get("variantSource");
                }
                if (variantSourceObj != null) {
                    // The attribute may be the model object or its generic map representation.
                    if (variantSourceObj instanceof VariantSource) {
                        sortedSampleNames = ((VariantSource) variantSourceObj).getSamples();
                    } else if (variantSourceObj instanceof Map) {
                        sortedSampleNames = new ObjectMap((Map) variantSourceObj).getAsStringList("samples");
                    } else {
                        logger.warn("Unexpected object type of variantSource ({}) in file attributes. Expected {} or {}",
                                variantSourceObj.getClass(), VariantSource.class, Map.class);
                    }
                }

                // Nothing usable in the attributes: read the header from the file.
                if (sortedSampleNames == null) {
                    VariantSource variantSource;
                    try {
                        variantSource = readVariantSource(study, file, fileUri);
                    } catch (IOException e) {
                        throw new CatalogIOException("Unable to read VariantSource", e);
                    }
                    if (variantSource != null) {
                        attributes.put("variantSource", variantSource);
                        sortedSampleNames = variantSource.getSamples();
                    } else {
                        sortedSampleNames = new LinkedList<>();
                    }
                }
                break;
            }
            case ALIGNMENT: {
                Object alignmentHeaderObj = null;
                if (file.getAttributes().containsKey("alignmentHeader")) {
                    alignmentHeaderObj = file.getAttributes().get("alignmentHeader");
                } else if (attributes.containsKey("alignmentHeader")) {
                    alignmentHeaderObj = attributes.get("alignmentHeader");
                }
                if (alignmentHeaderObj != null) {
                    // The attribute may be the model object or its generic map representation.
                    if (alignmentHeaderObj instanceof AlignmentHeader) {
                        sortedSampleNames = getSampleFromAlignmentHeader(((AlignmentHeader) alignmentHeaderObj));
                    } else if (alignmentHeaderObj instanceof Map) {
                        sortedSampleNames = getSampleFromAlignmentHeader((Map) alignmentHeaderObj);
                    } else {
                        logger.warn("Unexpected object type of AlignmentHeader ({}) in file attributes. Expected {} or {}",
                                alignmentHeaderObj.getClass(), AlignmentHeader.class, Map.class);
                    }
                }

                // Nothing usable in the attributes: read the header from the file.
                if (sortedSampleNames == null) {
                    AlignmentHeader alignmentHeader = readAlignmentHeader(study, file, fileUri);
                    if (alignmentHeader != null) {
                        attributes.put("alignmentHeader", alignmentHeader);
                        sortedSampleNames = getSampleFromAlignmentHeader(alignmentHeader);
                    } else {
                        sortedSampleNames = new LinkedList<>();
                    }
                }
                break;
            }
            default:
                // Other bioformats carry no sample information.
                return new LinkedList<>();
        }

        if (sortedSampleNames.isEmpty()) {
            return new LinkedList<>();
        }

        //Find matching samples in catalog with the sampleName from the header.
        QueryOptions sampleQueryOptions = new QueryOptions("include", includeSampleNameId);
        Query sampleQuery = new Query("name", sortedSampleNames);
        // Copy the result: the list is extended below and must not depend on the query result being mutable.
        sampleList = new ArrayList<>(
                catalogManager.getAllSamples(study.getId(), sampleQuery, sampleQueryOptions, sessionId).getResult());

        //check if all file samples exists on Catalog
        if (sampleList.size() != sortedSampleNames.size()) {   //Size does not match. Find the missing samples.
            //Use a LinkedHashSet to keep the order
            Set<String> set = new LinkedHashSet<>(sortedSampleNames);
            for (Sample sample : sampleList) {
                set.remove(sample.getName());
            }
            logger.warn("Some samples from file \"{}\" were not registered in Catalog. Registering new samples: {}",
                    file.getName(), set);
            if (createMissingSamples) {
                for (String sampleName : set) {
                    if (simulate) {
                        // Placeholder sample with id -1: nothing is written to catalog.
                        sampleList.add(new Sample(-1, sampleName, file.getName(), new Individual(), null));
                    } else {
                        try {
                            sampleList.add(catalogManager.createSample(study.getId(), sampleName, file.getName(),
                                    null, null, null, sessionId).first());
                        } catch (CatalogException e) {
                            // The creation failed: check whether the sample exists now in catalog
                            // (presumably registered meanwhile by someone else) before giving up.
                            Query query = new Query("name", sampleName);
                            QueryOptions queryOptions = new QueryOptions("include", includeSampleNameId);
                            List<Sample> existingSamples =
                                    catalogManager.getAllSamples(study.getId(), query, queryOptions, sessionId).getResult();
                            if (existingSamples.isEmpty()) {
                                throw e; //Throw exception if sample does not exist.
                            }
                            logger.debug("Do not create the sample \"" + sampleName + "\". It has magically appeared");
                            // Keep the existing sample, otherwise the ordered list built below would contain a null.
                            sampleList.add(existingSamples.get(0));
                        }
                    }
                }
            } else {
                throw new CatalogException("Can not find samples " + set + " in catalog"); //FIXME: Create missing samples??
            }
        }

        //Samples may not be sorted.
        //Sort samples as they appear in the original file.
        Map<String, Sample> sampleMap = sampleList.stream().collect(Collectors.toMap(Sample::getName, Function.identity()));
        sampleList = new ArrayList<>(sampleList.size());
        for (String sampleName : sortedSampleNames) {
            sampleList.add(sampleMap.get(sampleName));
        }

    } else {
        //Get samples from file.sampleIds
        Query query = new Query("id", file.getSampleIds());
        sampleList = catalogManager.getAllSamples(study.getId(), query, new QueryOptions(), sessionId).getResult();
    }

    // Publish the results into the modification parameters of the file.
    List<Long> sampleIdsList = sampleList.stream().map(Sample::getId).collect(Collectors.toList());
    fileModifyParams.put("sampleIds", sampleIdsList);
    if (!attributes.isEmpty()) {
        fileModifyParams.put("attributes", attributes);
    }

    return sampleList;
}
/**
 * Extracts the distinct sample names from an alignment header given as a generic map.
 * The sample name is the "SM" entry of the "attributes" map of each element in "readGroups".
 *
 * Read groups without an "attributes" map or without an "SM" entry are skipped, and a header
 * without "readGroups" yields an empty list.
 *
 * @param alignmentHeaderObj Alignment header as a map
 * @return Distinct sample names. The order is the iteration order of the intermediate set,
 *         not necessarily the order of the read groups.
 */
private List<String> getSampleFromAlignmentHeader(Map alignmentHeaderObj) {
    List<?> readGroups = new ObjectMap(alignmentHeaderObj).getList("readGroups");
    if (readGroups == null) {
        return new LinkedList<>();
    }
    Set<String> sampleSet = readGroups.stream()
            .map((rg) -> ((Map) rg).get("attributes"))
            .filter((attrs) -> attrs instanceof Map)
            .map((attrs) -> ((Map) attrs).get("SM"))
            // Filter before calling toString, so a missing SM tag is skipped instead of failing.
            .filter(Objects::nonNull)
            .map(Object::toString)
            .collect(Collectors.toSet());
    return new LinkedList<>(sampleSet);
}
/**
 * Extracts the distinct sample names from an alignment header.
 * The sample name is the "SM" attribute of each read group; read groups without it are skipped.
 *
 * @param alignmentHeader Alignment header
 * @return Distinct sample names. The order is the iteration order of the intermediate set.
 */
private List<String> getSampleFromAlignmentHeader(AlignmentHeader alignmentHeader) {
    Set<String> distinctSamples = alignmentHeader.getReadGroups()
            .stream()
            .map(readGroup -> readGroup.getAttributes().get("SM"))
            .filter(Objects::nonNull)
            .collect(Collectors.toSet());
    return new LinkedList<>(distinctSamples);
}
/**
 * Reads the variant source (file header information) of a VCF file.
 *
 * The file is treated as VCF when its stored format is VCF or when the format detected
 * from the URI is VCF.
 *
 * @param study   Study of the file. Its id and name are stored in the variant source.
 * @param file    Catalog file. Its name and id are stored in the variant source.
 * @param fileUri File location. Only the path component is used to open the file.
 * @return The variant source read from the file, or null if the file is not a VCF
 * @throws IOException if the file can not be read
 */
public static VariantSource readVariantSource(Study study, File file, URI fileUri)
        throws IOException {
    boolean isVcf = file.getFormat() == File.Format.VCF || FormatDetector.detect(fileUri) == File.Format.VCF;
    if (!isVcf) {
        return null;
    }
    //TODO: Fix aggregate and studyType
    String fileIdStr = Long.toString(file.getId());
    String studyIdStr = Long.toString(study.getId());
    VariantSource source = new VariantSource(file.getName(), fileIdStr, studyIdStr, study.getName());
    return VariantFileUtils.readVariantSource(Paths.get(fileUri.getPath()), source);
}
/**
 * Reads the alignment header of a SAM or BAM file.
 *
 * The stored file format is checked first. Only when it is neither SAM nor BAM the format is
 * detected from the URI, and it is detected a single time.
 *
 * @param study   Study of the file. Its name is given to the alignment reader.
 * @param file    Catalog file
 * @param fileUri File location
 * @return The alignment header, or null if the file is neither SAM nor BAM
 */
public static AlignmentHeader readAlignmentHeader(Study study, File file, URI fileUri) {
    File.Format format = file.getFormat();
    if (format != File.Format.SAM && format != File.Format.BAM) {
        // Stored format is not conclusive: fall back to the detected one.
        format = FormatDetector.detect(fileUri);
        if (format != File.Format.SAM && format != File.Format.BAM) {
            return null;
        }
    }

    AlignmentSamDataReader reader = new AlignmentSamDataReader(Paths.get(fileUri), study.getName());
    try {
        reader.open();
        reader.pre();
        reader.post();
        return reader.getHeader();
    } finally {
        // Always release the reader, even if opening or reading the header failed.
        reader.close();
    }
}
/**
 * Updates the file stats from a transformed variant file.
 * Reads the stats generated on the transform step.
 *
 * The input file is searched among the job inputs with VARIANT bioformat. The stats are read from
 * the job output file whose name matches the input file name followed by ".file", and stored in the
 * input file under "stats" with the key {@link #VARIANT_STATS}. If any of both files is not found,
 * nothing is done.
 *
 * @param job       Job that executed successfully the transform step
 * @param sessionId User sessionId
 * @throws CatalogException if a Catalog error occurs, or if the stats file can not be read
 */
@Deprecated
public void updateVariantFileStats(Job job, String sessionId) throws CatalogException {
    long studyId = catalogManager.getStudyIdByJobId(job.getId());

    // Locate the variant file given as input of the job.
    Query inputQuery = new Query()
            .append(FileDBAdaptor.QueryParams.ID.key(), job.getInput())
            .append(FileDBAdaptor.QueryParams.BIOFORMAT.key(), File.Bioformat.VARIANT);
    QueryResult<File> inputResult = catalogManager.getAllFiles(studyId, inputQuery, new QueryOptions(), sessionId);
    if (inputResult.getResult().isEmpty()) {
        return;
    }
    File inputFile = inputResult.first();
    if (!inputFile.getBioformat().equals(File.Bioformat.VARIANT)) {
        return;
    }

    // Locate the transformed file among the job outputs.
    Query outputQuery = new Query()
            .append(FileDBAdaptor.QueryParams.ID.key(), job.getOutput())
            .append(FileDBAdaptor.QueryParams.NAME.key(), "~" + inputFile.getName() + ".file");
    QueryResult<File> outputResult = catalogManager.getAllFiles(studyId, outputQuery, new QueryOptions(), sessionId);
    if (outputResult.getResult().isEmpty()) {
        return;
    }

    URI fileUri = catalogManager.getFileUri(outputResult.first());
    try (InputStream is = FileUtils.newInputStream(Paths.get(fileUri.getPath()))) {
        // The transformed file is parsed as a serialized VariantSource.
        VariantGlobalStats stats = new ObjectMapper().readValue(is, VariantSource.class).getStats();
        catalogManager.getFileManager().update(inputFile.getId(), new ObjectMap("stats", new ObjectMap(VARIANT_STATS, stats)),
                new QueryOptions(), sessionId);
    } catch (IOException e) {
        throw new CatalogException("Error reading file \"" + fileUri + "\"", e);
    }
}
/**
 * Static factory: creates a new reader for the given catalog manager on every call.
 *
 * @param catalogManager Catalog manager the reader operates on
 * @return A new FileMetadataReader instance
 */
public static FileMetadataReader get(CatalogManager catalogManager) {
return new FileMetadataReader(catalogManager);
}
}