/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.analysis.storage;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.opencga.analysis.AnalysisExecutionException;
import org.opencb.opencga.analysis.JobFactory;
import org.opencb.opencga.storage.core.manager.variant.operations.StorageOperation;
import org.opencb.opencga.storage.core.manager.variant.CatalogStudyConfigurationFactory;
import org.opencb.opencga.catalog.monitor.executors.old.ExecutorManager;
import org.opencb.opencga.catalog.utils.FileMetadataReader;
import org.opencb.opencga.catalog.managers.CatalogManager;
import org.opencb.opencga.catalog.db.api.CohortDBAdaptor;
import org.opencb.opencga.catalog.db.api.FileDBAdaptor;
import org.opencb.opencga.catalog.exceptions.CatalogException;
import org.opencb.opencga.catalog.models.*;
import org.opencb.opencga.core.common.Config;
import org.opencb.opencga.core.common.TimeUtils;
import org.opencb.opencga.storage.core.StorageEngineFactory;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
/**
* Created by jacobo on 16/10/14.
*
* IndexFile (fileId, backend, outDir)
* - get Samples data
* - create temporary outDir (must be a new folder)
* - Add index information to the file
* - create command line
* - create job
* - update index file
*
*
* * If only transform, do not add index information
* * If only load, take "originalFile" from the "inputFileId" jobId -> job.attributes.indexedFile
*
* UnIndexFile (fileId, backend)
* TODO: the unindex operation is not specified yet
*
*/
@Deprecated
public class AnalysisFileIndexer {
//Properties
// Property name for the storage database prefix. Not read anywhere in this class.
public static final String OPENCGA_ANALYSIS_STORAGE_DATABASE_PREFIX = "OPENCGA.ANALYSIS.STORAGE.DATABASE_PREFIX";
//Options
// Key of the option holding a list of extra parameters that index() appends verbatim to the command line.
public static final String PARAMETERS = "parameters";
// Boolean option key: request the transform step. If neither TRANSFORM nor LOAD is set, index() performs both.
public static final String TRANSFORM = "transform";
// Option key that is not read anywhere in this class.
public static final String CREATE = "create";
// Boolean option key: request the load step. If neither TRANSFORM nor LOAD is set, index() performs both.
public static final String LOAD = "load";
// Option key whose value is forwarded to the generated command line as "--log-level".
public static final String LOG_LEVEL = "logLevel";
//Other
// Name of the old storage launcher script. Only referenced from commented-out code in this class.
@Deprecated
public static final String OPENCGA_STORAGE_BIN_NAME = "opencga-storage.sh";
// Name of the analysis launcher script. It is resolved under "<OpenCGA home>/bin" when building the command line
// and is also passed to JobFactory.createJob when the job is registered.
public static final String OPENCGA_ANALYSIS_BIN_NAME = "opencga-analysis.sh";
// Catalog entry point used for every file, study, sample, cohort and job query made by this class.
private final CatalogManager catalogManager;
protected static Logger logger = LoggerFactory.getLogger(AnalysisFileIndexer.class);
/**
* Creates an indexer that works against the given Catalog.
*
* @param catalogManager Catalog manager used for all the Catalog operations. It is stored as given, without validation.
*/
public AnalysisFileIndexer(CatalogManager catalogManager) {
this.catalogManager = catalogManager;
}
/**
 * Registers in Catalog a job that transforms and/or loads a file, and returns that job.
 *
 * Steps, in order:
 * <ol>
 *   <li>Decide which steps to run: both when neither {@link #TRANSFORM} nor {@link #LOAD} is requested.</li>
 *   <li>When only loading, resolve the original file that the given (transformed) file comes from.</li>
 *   <li>Check that the index status of the original file allows the requested steps.</li>
 *   <li>Create the temporary job outdir, obtain the samples of the file and, unless simulating,
 *       add them to the default cohort.</li>
 *   <li>Build the command line, update the StudyConfiguration (best effort) and the index status of the file.</li>
 *   <li>Create the job and store its id in the index information of the original file.</li>
 * </ol>
 * When the {@code ExecutorManager.SIMULATE} option is set, Catalog is not modified by this method
 * (no outdir, cohort, file nor StudyConfiguration update).
 *
 * @param fileId    File to index
 * @param outDirId  Place where locate the temporary files
 * @param sessionId User session Id
 * @param options   Other options. May be null, in which case an empty set of options is used
 * @return Generated job for the indexation
 * @throws CatalogException           if the file can not be indexed in its current state, if the original or
 *                                    transformed file can not be resolved, or if any Catalog operation fails
 * @throws AnalysisExecutionException if the samples of the file can not be obtained
 */
public QueryResult<Job> index(long fileId, long outDirId, String sessionId, QueryOptions options)
        throws CatalogException, AnalysisExecutionException {
    if (options == null) {
        options = new QueryOptions();
    }
    final boolean execute = options.getBoolean(ExecutorManager.EXECUTE);
    final boolean simulate = options.getBoolean(ExecutorManager.SIMULATE);
    final long start = System.currentTimeMillis();

    // If neither --transform nor --load is present, do both.
    final boolean transform;
    final boolean load;
    if (!options.getBoolean(TRANSFORM, false) && !options.getBoolean(LOAD, false)) {
        transform = true;
        load = true;
    } else {
        transform = options.getBoolean(TRANSFORM, false);
        load = options.getBoolean(LOAD, false);
    }

    // Query catalog for user data.
    String userId = catalogManager.getUserIdBySessionId(sessionId);
    File inputFile = catalogManager.getFile(fileId, sessionId).first();
    File originalFile;
    File outDir = catalogManager.getFile(outDirId, sessionId).first();
    long studyIdByOutDirId = catalogManager.getStudyIdByFileId(outDirId);
    long studyIdByInputFileId = catalogManager.getStudyIdByFileId(inputFile.getId());
    Study study = catalogManager.getStudy(studyIdByOutDirId, sessionId).getResult().get(0);

    if (inputFile.getType() != File.Type.FILE) {
        throw new CatalogException("Expected file type = " + File.Type.FILE + " instead of " + inputFile.getType());
    }
    if (studyIdByInputFileId != studyIdByOutDirId) {
        logger.warn("Indexing a file from the study " + studyIdByInputFileId + " in the study " + studyIdByOutDirId);
        // TODO: Raise an exception?
    }

    // The file is already transformed, but the transformation job is not registered in Catalog.
    boolean externalTransformed = false;

    // Get the original file.
    if (!transform && load) { //Don't transform. Just load. Select the original file
        final long indexedFileId;
        // FIXME: What if the original VCF contains a jobId ?
        if (inputFile.getJob().getId() <= 0) {
            // TODO: Move this code to an external class for relating Transformed and Original files.
            // If the input file is the original file, read from the Index object
            if (inputFile.getIndex() != null) {
                long jobId = inputFile.getIndex().getJobId();
                Query query;
                if (inputFile.getBioformat().equals(File.Bioformat.VARIANT)) {
                    query = new Query(FileDBAdaptor.QueryParams.JOB_ID.key(), jobId)
                            .append(FileDBAdaptor.QueryParams.NAME.key(), "~" + inputFile.getName() + ".variants");
                } else {
                    throw new CatalogException("Error: can't load this file. Only transformed files can be loaded.");
                }
                QueryResult<File> result = catalogManager.searchFile(studyIdByOutDirId, query, sessionId);
                // Fail here with a clear message instead of carrying a null input file further down.
                if (result.getResult().isEmpty()) {
                    throw new CatalogException("Unable to find the transformed file of \"" + inputFile.getName() + "\" "
                            + "generated by the job " + jobId);
                }
                indexedFileId = inputFile.getId();
                inputFile = result.first();
            } else if (inputFile.getAttributes().containsKey(Job.INDEXED_FILE_ID)) {
                // File ids are long values: read them as long to avoid truncation.
                indexedFileId = new ObjectMap(inputFile.getAttributes()).getLong(Job.INDEXED_FILE_ID);
            } else {
                // If the input file is an externally transformed file, find the original file.
                if (inputFile.getBioformat().equals(File.Bioformat.VARIANT) && !(inputFile.getFormat() == File.Format.VCF)) {
                    VariantReaderUtils utils = new VariantReaderUtils();
                    try {
                        // Read the VariantSource to get the source file
                        VariantSource variantSource = utils.readVariantSource(catalogManager.getFileUri(inputFile));
                        Query query = new Query(FileDBAdaptor.QueryParams.NAME.key(), variantSource.getFileName());
                        QueryResult<File> result = catalogManager.searchFile(studyIdByOutDirId, query, sessionId);
                        if (result.getResult().size() == 0) {
                            // TODO: Continue with the transformed file as indexed file?
                            throw new CatalogException("Unable to find file \"" + variantSource.getFileName() + "\" "
                                    + "as source file from \"" + inputFile.getName() + "\"");
                        } else if (result.getResult().size() > 1) {
                            List<String> foundFilesSummary = result.getResult().stream().map(File::getPath).collect(Collectors.toList());
                            throw new CatalogException("Unable to find single file \"" + variantSource.getFileName() + "\" "
                                    + "as source file from \"" + inputFile.getName() + "\". Got multiple versions: " + foundFilesSummary);
                        } else {
                            externalTransformed = true;
                            indexedFileId = result.first().getId();
                        }
                    } catch (StorageEngineException e) {
                        throw new CatalogException("Unable to find source file from \"" + inputFile.getName() + "\"", e);
                    }
                } else {
                    // TODO: Solve for other bioformats
                    throw new CatalogException("Unable to find source file from \"" + inputFile.getName() + "\"");
                }
            }
        } else {
            // The input file was generated by a job: the original file is the one that job indexed.
            Job job = catalogManager.getJob(inputFile.getJob().getId(), null, sessionId).first();
            if (job.getAttributes().containsKey(Job.INDEXED_FILE_ID)) {
                indexedFileId = new ObjectMap(job.getAttributes()).getLong(Job.INDEXED_FILE_ID);
            } else {
                logger.warn("INDEXED_FILE_ID missing in job " + job.getId());
                List<Long> jobInputFiles = job.getInput();
                if (jobInputFiles.size() != 1) {
                    throw new CatalogException("Error: Job {id: " + job.getId() + "} must have exactly one input file, found "
                            + jobInputFiles.size());
                }
                indexedFileId = jobInputFiles.get(0);
            }
        }
        originalFile = catalogManager.getFile(indexedFileId, null, sessionId).first();
        if (!originalFile.getStatus().getName().equals(File.FileStatus.READY)) {
            throw new CatalogException("Error: Original file status must be \"READY\", not \"" + originalFile.getStatus().getName() + "\"");
        }
    } else {
        originalFile = inputFile;
    }

    final DataStore dataStore = StorageOperation.getDataStore(catalogManager,
            catalogManager.getStudyIdByFileId(originalFile.getId()), originalFile.getBioformat(), sessionId);

    // Check if file can be indexed
    if (originalFile.getIndex() != null) {
        switch (originalFile.getIndex().getStatus().getName()) {
            case FileIndex.IndexStatus.TRANSFORMING:
                throw new CatalogException("File '" + originalFile.getId() + "' it's being transformed");
            case FileIndex.IndexStatus.TRANSFORMED:
                if (transform) {
                    throw new CatalogException("File '" + originalFile.getId() + "' is already transformed");
                }
                break;
            case FileIndex.IndexStatus.LOADING:
                throw new CatalogException("File '" + originalFile.getId() + "' it's being loaded");
            case FileIndex.IndexStatus.INDEXING:
                throw new CatalogException("File '" + originalFile.getId() + "' it's being indexed");
            case FileIndex.IndexStatus.READY:
                throw new CatalogException("File '" + originalFile.getId() + "' is already indexed");
            case FileIndex.IndexStatus.NONE:
                break;
        }
    } else if (!transform && load && !externalTransformed) {
        // Loading a file without index information is only valid for externally transformed files.
        throw new CatalogException("File '" + originalFile.getId() + "' need to be transformed before loading");
    }

    // ObjectMap to fill with modifications over the indexed file (like new attributes or jobId)
    final ObjectMap fileModifyParams = new ObjectMap("attributes", new ObjectMap());
    final ObjectMap indexAttributes;
    if (originalFile.getIndex() == null || originalFile.getIndex().getAttributes() == null) {
        indexAttributes = new ObjectMap();
    } else {
        indexAttributes = new ObjectMap(originalFile.getIndex().getAttributes());
    }

    // Create temporary Job Outdir. The random string also identifies the job.
    final URI temporalOutDirUri;
    final String randomString = "I_" + RandomStringUtils.randomAlphanumeric(10);
    if (simulate) {
        temporalOutDirUri = createSimulatedOutDirUri(randomString);
    } else {
        temporalOutDirUri = catalogManager.createJobOutDir(studyIdByOutDirId, randomString, sessionId);
    }

    // Get file samples
    List<Sample> sampleList;
    if (originalFile.getSampleIds() == null || originalFile.getSampleIds().isEmpty()) {
        try {
            sampleList = FileMetadataReader.get(catalogManager).getFileSamples(study, originalFile,
                    catalogManager.getFileUri(originalFile), fileModifyParams,
                    options.getBoolean(FileMetadataReader.CREATE_MISSING_SAMPLES, true), simulate, options, sessionId);
        } catch (CatalogException e) {
            throw new AnalysisExecutionException(e);
        }
    } else {
        sampleList = catalogManager.getAllSamples(study.getId(), new Query("id", originalFile.getSampleIds()),
                new QueryOptions(), sessionId).getResult();
    }

    // Keep the default cohort in sync with the samples that are going to be indexed.
    if (!simulate) {
        final Cohort defaultCohort;
        QueryResult<Cohort> cohorts = catalogManager.getAllCohorts(studyIdByOutDirId,
                new Query(CohortDBAdaptor.QueryParams.NAME.key(), StudyEntry.DEFAULT_COHORT), new QueryOptions(), sessionId);
        if (cohorts.getResult().isEmpty()) {
            defaultCohort = catalogManager.createCohort(studyIdByOutDirId, StudyEntry.DEFAULT_COHORT, Study.Type.COLLECTION,
                    "Default cohort with almost all indexed samples", Collections.emptyList(), null, sessionId).first();
        } else {
            defaultCohort = cohorts.first();
        }

        ObjectMap updateParams = new ObjectMap();
        if (options.getBoolean(VariantStorageEngine.Options.CALCULATE_STATS.key()) && load) {
            updateParams.append("status.name", Cohort.CohortStatus.CALCULATING);
        }
        //Samples are the already indexed plus those that are going to be indexed
        Set<Long> samples = new HashSet<>(defaultCohort.getSamples());
        samples.addAll(sampleList.stream().map(Sample::getId).collect(Collectors.toList()));
        if (samples.size() != defaultCohort.getSamples().size()) {
            logger.debug("Updating \"{}\" cohort", StudyEntry.DEFAULT_COHORT);
            updateParams.append("samples", new ArrayList<>(samples));
        }
        if (!updateParams.isEmpty()) {
            catalogManager.modifyCohort(defaultCohort.getId(), updateParams, new QueryOptions(), sessionId);
        }
    }

    // Create commandLine
    // NOTE(review): createCommandLine returns null for unsupported files; that case is not handled here.
    String commandLine = createCommandLine(study, originalFile, inputFile, sampleList,
            outDirId, temporalOutDirUri, randomString, indexAttributes, dataStore, sessionId, options);
    if (options.containsKey(PARAMETERS)) {
        List<String> extraParams = options.getAsStringList(PARAMETERS);
        for (String extraParam : extraParams) {
            commandLine += " " + extraParam;
        }
    }

    // Update StudyConfiguration. This is a best-effort step: a failure is logged and the indexation goes on.
    if (!simulate) {
        try {
            if (inputFile.getBioformat().equals(File.Bioformat.VARIANT)) {
                StudyConfigurationManager studyConfigurationManager = StorageEngineFactory.get()
                        .getVariantStorageEngine(dataStore.getStorageEngine())
                        .getDBAdaptor(dataStore.getDbName()).getStudyConfigurationManager();
                new CatalogStudyConfigurationFactory(catalogManager)
                        .updateStudyConfigurationFromCatalog(studyIdByOutDirId, studyConfigurationManager, sessionId);
            }
        } catch (StorageEngineException | ClassNotFoundException | InstantiationException | IllegalAccessException e) {
            logger.error("Unable to update the StudyConfiguration of the study " + studyIdByOutDirId + " from Catalog", e);
        }
    }

    // Create index information
    FileIndex indexInformation = originalFile.getIndex();
    if (indexInformation == null) {
        String status = externalTransformed ? FileIndex.IndexStatus.TRANSFORMED : FileIndex.IndexStatus.NONE;
        indexInformation = new FileIndex(userId, TimeUtils.getTime(), new FileIndex.IndexStatus(status), -1, indexAttributes);
    }
    if (transform && !load) {
        indexInformation.getStatus().setName(FileIndex.IndexStatus.TRANSFORMING);
    } else if (!transform && load) {
        indexInformation.getStatus().setName(FileIndex.IndexStatus.LOADING);
    } else if (transform && load) {
        indexInformation.getStatus().setName(FileIndex.IndexStatus.INDEXING);
    }

    // Modify file with new information
    if (!simulate) {
        fileModifyParams.put("index", indexInformation);
        catalogManager.getFileManager().update(originalFile.getId(), fileModifyParams, new QueryOptions(), sessionId).getResult();
    }

    // Create job
    ObjectMap jobAttributes = new ObjectMap();
    jobAttributes.put(Job.TYPE, Job.Type.INDEX);
    jobAttributes.put(Job.INDEXED_FILE_ID, originalFile.getId());
    jobAttributes.put(VariantStorageEngine.Options.CALCULATE_STATS.key(),
            options.getBoolean(VariantStorageEngine.Options.CALCULATE_STATS.key(),
                    VariantStorageEngine.Options.CALCULATE_STATS.defaultValue()));

    String jobName;
    String jobDescription;
    switch (indexInformation.getStatus().getName()) {
        default:
            // Any other status is described as a full indexation: fall through on purpose.
        case FileIndex.IndexStatus.INDEXING:
            jobName = "index";
            jobDescription = "Indexing file " + originalFile.getName() + " (" + originalFile.getId() + ")";
            break;
        case FileIndex.IndexStatus.LOADING:
            jobName = "load";
            jobDescription = "Loading file " + originalFile.getName() + " (" + originalFile.getId() + ")";
            break;
        case FileIndex.IndexStatus.TRANSFORMING:
            jobName = "transform";
            jobDescription = "Transforming file " + originalFile.getName() + " (" + originalFile.getId() + ")";
            break;
    }

    JobFactory jobFactory = new JobFactory(catalogManager);
    final Job job = jobFactory.createJob(studyIdByOutDirId, jobName,
            OPENCGA_ANALYSIS_BIN_NAME, jobDescription, outDir, Collections.singletonList(inputFile.getId()),
            sessionId, randomString, temporalOutDirUri, commandLine, execute, simulate, jobAttributes, null).first();

    if (simulate) {
        // Nothing was stored in Catalog: return the job as built by the factory.
        return new QueryResult<>("indexFile", (int) (System.currentTimeMillis() - start), 1, 1, "", "", Collections.singletonList(job));
    }
    modifyIndexJobId(originalFile.getId(), job.getId(), transform, load, sessionId);
    return new QueryResult<>("indexFile", (int) (System.currentTimeMillis() - start), 1, 1, "", "",
            catalogManager.getJob(job.getId(), null, sessionId).getResult());
}
/**
 * Stores the id of the job in the index information of the file and updates the file in Catalog.
 *
 * Besides the main job id of the index, the id is also stored as an index attribute whose name
 * depends on the steps executed by the job: "transformJobId", "loadJobId" or "indexJobId".
 * No attribute is added if neither step is selected.
 *
 * @param fileId    File whose index information is updated
 * @param jobId     Job to be registered in the index information
 * @param transform Whether the job executes the transform step
 * @param load      Whether the job executes the load step
 * @param sessionId User session Id
 * @throws CatalogException if the file can not be read or updated
 */
private void modifyIndexJobId(long fileId, long jobId, boolean transform, boolean load, String sessionId) throws CatalogException {
    final FileIndex fileIndex = catalogManager.getFile(fileId, sessionId).first().getIndex();
    fileIndex.setJobId(jobId);

    // Name of the attribute that keeps the job id, according to the steps of the job.
    final String jobIdAttribute;
    if (transform) {
        jobIdAttribute = load ? "indexJobId" : "transformJobId";
    } else {
        jobIdAttribute = load ? "loadJobId" : null;
    }
    if (jobIdAttribute != null) {
        fileIndex.getAttributes().put(jobIdAttribute, jobId);
    }

    final ObjectMap updateParams = new ObjectMap("index", fileIndex);
    catalogManager.getFileManager().update(fileId, updateParams, new QueryOptions(), sessionId);
}
/**
 * Builds the command line of the analysis script that indexes the file.
 *
 * The kind of command depends on the original file:
 * <ul>
 *   <li>Alignments (ALIGNMENT bioformat, or ".bam"/".sam" names): "alignment index" over the original file.
 *       The chunk size used is also stored in {@code indexAttributes} as "chunkSize".</li>
 *   <li>".fasta" / ".fasta.gz" names: not supported.</li>
 *   <li>Variants (VARIANT bioformat, or a name containing ".vcf"): "variant index" over the input file.</li>
 * </ul>
 * NOTE(review): the session id and the option values are concatenated without any escaping. If the
 * resulting string is interpreted by a shell, these values must be trusted - confirm against the executor.
 *
 * @param study           Study where file is located. Currently unused
 * @param originalFile    Original file. Its name and bioformat select the kind of command line
 * @param inputFile       File given to the "variant index" command. May be the transformed file
 * @param sampleList      Samples of the file. Currently unused
 * @param outDirId        Id of the output directory
 * @param outDirUri       Index outdir. Currently unused
 * @param randomString    Identifier given to the command line as "--job-id"
 * @param indexAttributes Attributes of the index object. Modified only for alignment files
 * @param dataStore       Data store of the file. Currently unused
 * @param sessionId       User session Id given to the command line as "--session-id"
 * @param options         Options translated into command line flags
 * @return CommandLine, or null if the file is neither an alignment nor a variant file
 *
 * @throws CatalogException declared by the signature; no statement of this method throws it directly
 * @throws UnsupportedOperationException for ".fasta" and ".fasta.gz" files
 */
private String createCommandLine(Study study, File originalFile, File inputFile, List<Sample> sampleList,
        long outDirId, URI outDirUri, String randomString, final ObjectMap indexAttributes, final DataStore dataStore,
        String sessionId, QueryOptions options)
        throws CatalogException {
    String name = originalFile.getName();
    String opencgaAnalysisBin = Paths.get(Config.getOpenCGAHome(), "bin", OPENCGA_ANALYSIS_BIN_NAME).toString();

    if (originalFile.getBioformat() == File.Bioformat.ALIGNMENT || name.endsWith(".bam") || name.endsWith(".sam")) {
        int chunkSize = 200; //TODO: Read from properties.
        // NOTE(review): the chunk size is given as the value of --mean-coverage; confirm this is intended.
        StringBuilder sb = new StringBuilder(opencgaAnalysisBin)
                .append(" alignment index ")
                .append(" --file-id ").append(originalFile.getId())
                .append(" --outdir-id ").append(outDirId)
                .append(" --calculate-coverage ")
                .append(" --mean-coverage ").append(chunkSize)
                .append(" --session-id ").append(sessionId)
                .append(" --job-id ").append(randomString);
        // Same guard as the variant branch: never emit "--log-level" without a value.
        if (StringUtils.isNotEmpty(options.getString(LOG_LEVEL))) {
            sb.append(" --log-level ").append(options.getString(LOG_LEVEL));
        }
        indexAttributes.put("chunkSize", chunkSize);
        return sb.toString();
    }

    if (name.endsWith(".fasta") || name.endsWith(".fasta.gz")) {
        throw new UnsupportedOperationException();
    }

    // A name containing ".vcf" also covers ".vcf.gz".
    if (originalFile.getBioformat() == File.Bioformat.VARIANT || name.contains(".vcf")) {
        StringBuilder sb = new StringBuilder(opencgaAnalysisBin)
                .append(" variant index ")
                .append(" --file-id ").append(inputFile.getId())
                .append(" --outdir ").append(outDirId)
                .append(" --session-id ").append(sessionId)
                .append(" --job-id ").append(randomString);
        if (options.getBoolean(VariantStorageEngine.Options.ANNOTATE.key(), VariantStorageEngine.Options.ANNOTATE.defaultValue())) {
            sb.append(" --annotate ");
        }
        if (options.getBoolean(VariantStorageEngine.Options.CALCULATE_STATS.key(),
                VariantStorageEngine.Options.CALCULATE_STATS.defaultValue())) {
            sb.append(" --calculate-stats ");
        }
        if (options.getBoolean(TRANSFORM, false)) {
            sb.append(" --transform ");
        }
        if (StringUtils.isNotEmpty(options.getString(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key()))) {
            sb.append(" --include-extra-fields ").append(options.getString(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key()));
        }
        if (options.getBoolean(VariantStorageEngine.Options.EXCLUDE_GENOTYPES.key(),
                VariantStorageEngine.Options.EXCLUDE_GENOTYPES.defaultValue())) {
            sb.append(" --exclude-genotypes ");
        }
        if (options.getBoolean(LOAD, false)) {
            sb.append(" --load ");
        }
        if (StringUtils.isNotEmpty(options.getString(LOG_LEVEL))) {
            sb.append(" --log-level ").append(options.getString(LOG_LEVEL));
        }
        // Skip the flag if the key is present with an empty value: "--aggregated" needs a value.
        if (StringUtils.isNotEmpty(options.getString(VariantStorageEngine.Options.AGGREGATED_TYPE.key()))) {
            sb.append(" --aggregated ").append(options.getString(VariantStorageEngine.Options.AGGREGATED_TYPE.key()));
        }
        return sb.toString();
    }

    // Unknown kind of file: there is no command line for it.
    return null;
}
////AUX METHODS
/**
 * Creates the URI of a simulated job outdir with a random name.
 *
 * The name is "J_" followed by 10 random alphanumeric characters.
 *
 * @return URI of the simulated outdir, as built by {@link #createSimulatedOutDirUri(String)}
 */
public static URI createSimulatedOutDirUri() {
    final String simulatedJobName = "J_" + RandomStringUtils.randomAlphanumeric(10);
    return createSimulatedOutDirUri(simulatedJobName);
}
/**
 * Creates the URI of a simulated job outdir: "/tmp/simulatedJobOutdir/" followed by the given name.
 *
 * Nothing is created in the file system; only the URI is built.
 *
 * @param randomString Name of the simulated outdir
 * @return URI of the simulated outdir
 */
public static URI createSimulatedOutDirUri(String randomString) {
    final String[] subPath = {"simulatedJobOutdir", randomString};
    return Paths.get("/tmp", subPath).toUri();
}
}