/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*/
package org.opencb.opencga.storage.hadoop.variant.index;
import com.google.common.collect.BiMap;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AlternateCoordinate;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.biodata.tools.variant.merge.VariantMerger;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.models.protobuf.VariantTableStudyRowsProto;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* @author Matthias Haimel mh719+git@cam.ac.uk
*/
public class VariantTableMapper extends AbstractVariantTableMapReduce {
// Flag deciding whether per-variant work is run through parallel streams; it is set once in setup().
private final AtomicBoolean parallel = new AtomicBoolean(false);
/**
 * @return the current value of the {@code parallel} flag.
 */
private boolean isParallel() {
return this.parallel.get();
}
/**
 * Runs the parent setup and then decides whether this mapper works in parallel mode.
 * Parallel mode is switched on only when the configured number of map vcores
 * ({@link MRJobConfig#MAP_CPU_VCORES}, default 1) equals the parallelism of the common
 * {@link ForkJoinPool}; in that case the result converter is switched to parallel as well.
 *
 * @param context mapper context providing the job configuration
 * @throws IOException          propagated from the parent setup
 * @throws InterruptedException propagated from the parent setup
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    final int configuredCores = context.getConfiguration().getInt(MRJobConfig.MAP_CPU_VCORES, 1);
    // Both numbers have to match, otherwise the mapper stays sequential.
    final boolean poolMatchesCores = configuredCores == ForkJoinPool.getCommonPoolParallelism();
    this.parallel.set(poolMatchesCores);
    if (!isParallel()) {
        return;
    }
    getLog().info("Using ForkJoinPool of {} ... ", configuredCores);
    this.getResultConverter().setParallel(true);
}
/**
 * Creates a {@link ForkJoinPool} whose worker threads carry a recognisable name.
 * Each worker is produced by the default worker factory and then renamed to
 * {@code <prefix>_fjp_<pool size at creation time>}.
 *
 * @param prefix name prefix for the worker threads
 * @param vcores parallelism level of the new pool
 * @return a new pool with no uncaught-exception handler and non-async (LIFO) mode
 */
public static ForkJoinPool createForkJoinPool(final String prefix, int vcores) {
    final ForkJoinPool.ForkJoinWorkerThreadFactory namingFactory = owner -> {
        final ForkJoinWorkerThread thread = ForkJoinPool.defaultForkJoinWorkerThreadFactory.newThread(owner);
        thread.setName(prefix + "_fjp_" + owner.getPoolSize());
        return thread;
    };
    return new ForkJoinPool(vcores, namingFactory, null, false);
}
/**
 * Delegates to the parent cleanup; this mapper adds no cleanup work of its own.
 *
 * @param context mapper context
 * @throws IOException          propagated from the parent cleanup
 * @throws InterruptedException propagated from the parent cleanup
 */
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
super.cleanup(context);
}
/**
 * Variant types that are considered when archive variants are filtered for the
 * creation of new analysis variants (see the call to filterForVariant in
 * processAnalysisVariants).
 */
protected static final VariantType[] TARGET_VARIANT_TYPE = {
    VariantType.SNV, VariantType.SNP,
    VariantType.INDEL, VariantType.INSERTION, VariantType.DELETION,
    VariantType.MNV, VariantType.MNP,
};

/**
 * @return a fresh copy of {@link #TARGET_VARIANT_TYPE}, so callers cannot modify the shared array
 */
public static VariantType[] getTargetVariantType() {
    return TARGET_VARIANT_TYPE.clone();
}
/*
 * Cases distinguished when the ARCHIVE data of a slice is compared with the ANALYSIS data:
 *
 *            +---------+----------+
 *            | ARCHIVE | ANALYSIS |
 * +----------+---------+----------+
 * | 1:10:A:T |  DATA   |   ----   | <= New variant (1)
 * +----------+---------+----------+
 * | 1:20:C:G |  ----   |   DATA   | <= Missing variant (2)
 * +----------+---------+----------+
 * | 1:30:G:T |  DATA   |   DATA   | <= Same variant (3)
 * +----------+---------+----------+
 * | 1:40:T:C |  DATA   |   ----   | <= Overlapped variant (new)
 * | 1:40:T:G |  ----   |   DATA   | <= Overlapped variant (missing)
 * +----------+---------+----------+
 *
 */
/**
 * Job counters incremented at the end of getNewVariantsAsTemplates.
 */
public enum OpenCGAVariantTableCounters {
// Number of distinct target-type archive variants whose start lies inside the slice.
ARCHIVE_TABLE_VARIANTS,
// Number of distinct secondary alternates (start inside the slice) turned into new variants.
ARCHIVE_TABLE_SEC_ALT_VARIANTS,
// Number of variants parsed from the analysis columns of the slice.
ANALYSIS_TABLE_VARIANTS,
// Number of newly created variant templates.
NEW_VARIANTS,
// Known analysis variant strings minus the SAME_VARIANTS value.
MISSING_VARIANTS,
// Archive variants (main + secondary alternates) in the slice minus the newly created ones.
SAME_VARIANTS
}
/**
 * Unpacks the archive data of the current row and keeps only the variants whose
 * extent (including secondary alternates) touches the range
 * {@code [ctx.startPos, ctx.nextStartPos]}. Missing alternate coordinates are completed
 * before the extent is computed. The number of kept variants is added to the
 * {@code VARIANTS_FROM_ARCHIVE} counter.
 *
 * @param ctx map-reduce context of the current slice
 * @return the selected archive variants
 */
private List<Variant> loadArchiveVariants(VariantMapReduceContext ctx) {
    // Archive: unpack Archive data (selection only)
    getLog().info("Read Archive ...");
    final List<Variant> selected = getResultConverter().convert(ctx.value, true, candidate -> {
        completeAlternateCoordinates(candidate);
        final int lowest = toPosition(candidate, true);
        final int highest = toPosition(candidate, false);
        final boolean beginsAfterRange = lowest > ctx.nextStartPos;
        final boolean endsBeforeRange = highest < ctx.startPos;
        return !(beginsAfterRange || endsBeforeRange);
    });
    ctx.context.getCounter(COUNTER_GROUP_NAME, "VARIANTS_FROM_ARCHIVE").increment(selected.size());
    return selected;
}
/**
 * Loads and indexes the archive variants of the slice, lets
 * {@code processAnalysisVariants} update the existing analysis variants, and finally merges
 * the archive variants into the newly created variant templates.
 * Timing checkpoints are taken in the same order as the processing steps.
 *
 * @param ctx  map-reduce context of the current slice
 * @param rows collector for the rows produced while updating the output table
 * @return the new variants, merged with the overlapping archive variants of the current files
 */
private Set<Variant> processScanVariants(VariantMapReduceContext ctx, List<VariantTableStudyRow> rows) {
    startTime();
    final List<Variant> archiveVariants = loadArchiveVariants(ctx);
    endTime("1 Unpack and convert input ARCHIVE variants");

    getLog().info("Index ...");
    final NavigableMap<Integer, List<Variant>> positionIndex =
            indexAlts(archiveVariants, (int) ctx.startPos, (int) ctx.nextStartPos);
    endTime("2 Index input ARCHIVE variants");

    /* Update and submit Analysis Variants */
    final Set<Variant> newVariants = processAnalysisVariants(ctx, archiveVariants, positionIndex, rows);
    getLog().info("Merge {} new variants ", newVariants.size());

    final AtomicLong overlapNanos = new AtomicLong(0);
    final AtomicLong mergeNanos = new AtomicLong(0);
    // RESET expected set to current once only with current files of same region
    this.getVariantMerger().setExpectedSamples(this.currentIndexingSamples);
    processVariants(newVariants, template -> {
        ctx.getContext().progress(); // Call progress to avoid timeouts
        final long begin = System.nanoTime();
        final Collection<Variant> overlapping = buildOverlappingNonRedundantSet(template, positionIndex);
        final long afterOverlap = System.nanoTime();
        this.getVariantMerger().merge(template, overlapping);
        final long afterMerge = System.nanoTime();
        overlapNanos.getAndAdd(afterOverlap - begin);
        mergeNanos.getAndAdd(afterMerge - afterOverlap);
    });
    registerRuntime("8a Merge NEW variants - overlap", overlapNanos.get());
    registerRuntime("8b Merge NEW variants - merge", mergeNanos.get());
    getLog().info("Merge 1 - overlap {}; merge {}; ns", overlapNanos, mergeNanos);
    return newVariants;
}
/**
 * Handles the variants that already exist in the analysis columns of the slice:
 * parses them, merges the overlapping archive variants into each of them, writes them to the
 * output table and finally derives the templates for variants that are new in the archive.
 *
 * @param ctx            map-reduce context of the current slice
 * @param archiveVar     archive variants loaded for the slice
 * @param varPosRegister position index of {@code archiveVar}
 * @param rows           collector for the rows produced while updating the output table
 * @return templates of the variants found in the archive but not yet in the analysis data
 */
private Set<Variant> processAnalysisVariants(
        VariantMapReduceContext ctx, List<Variant> archiveVar,
        final NavigableMap<Integer, List<Variant>> varPosRegister, List<VariantTableStudyRow> rows) {
    final List<Cell> analysisCells = GenomeHelper.getVariantColumns(ctx.getValue().rawCells());
    getLog().info("Parse ...");
    final List<Variant> existing = parseCurrentVariantsRegion(analysisCells, ctx.getChromosome());
    ctx.getContext().getCounter(COUNTER_GROUP_NAME, "VARIANTS_FROM_ANALYSIS").increment(existing.size());
    endTime("3 Unpack and convert input ANALYSIS variants (" + GenomeHelper.VARIANT_COLUMN_PREFIX + ")");

    // Check if Archive covers all bases in Analysis
    // TODO switched off at the moment due to removed variant calls from gVCF files (malformed variants)
    // checkArchiveConsistency(ctx.context, ctx.startPos, ctx.nextStartPos, archiveVar, analysisVar);
    endTime("4 Check consistency -- skipped");

    final AtomicLong overlapNanos = new AtomicLong(0);
    final AtomicLong mergeNanos = new AtomicLong(0);
    final AtomicLong submit = new AtomicLong(0);
    getLog().info("Merge ...");
    // (2) and (3): Same, missing (and overlapping missing) variants
    processVariants(existing, current -> {
        final long begin = System.nanoTime();
        ctx.getContext().progress(); // Call progress to avoid timeouts
        final Collection<Variant> overlapping = buildOverlappingNonRedundantSet(current, varPosRegister);
        final long afterOverlap = System.nanoTime();
        this.getVariantMerger().merge(current, overlapping);
        final long afterMerge = System.nanoTime();
        overlapNanos.getAndAdd(afterOverlap - begin);
        mergeNanos.getAndAdd(afterMerge - afterOverlap);
    });
    registerRuntime("5a Merge same and missing - overlap", overlapNanos.get());
    registerRuntime("5b Merge same and missing - merge", mergeNanos.get());

    getLog().info("Submit ...");
    startTime();
    updateOutputTable(ctx.context, existing, rows, ctx.sampleIds);
    endTime("6 Update OUTPUT table");

    getLog().info("Filter ...");
    final List<Variant> targets = filterForVariant(archiveVar.stream(), TARGET_VARIANT_TYPE).collect(Collectors.toList());
    endTime("7a Filter archive variants by target");
    ctx.context.getCounter(COUNTER_GROUP_NAME, "VARIANTS_FROM_ARCHIVE_TARGET").increment(targets.size());
    getLog().info("Loaded current: " + existing.size()
            + "; archive: " + archiveVar.size()
            + "; target: " + targets.size());

    /* ******** Update Analysis Variants ************** */
    // Variants of target type
    final Set<Variant> created = getNewVariantsAsTemplates(ctx, existing, targets, (int) ctx.startPos, (int) ctx.nextStartPos);
    endTime("7b Create NEW variants");
    return created;
}
/**
 * Applies the consumer to every variant, through a parallel stream when the mapper is in
 * parallel mode and by plain iteration otherwise.
 *
 * @param variants        variants to process
 * @param variantConsumer action executed for each variant; must be safe for concurrent use
 *                        when parallel mode is active
 */
private void processVariants(Collection<Variant> variants, Consumer<Variant> variantConsumer) {
    if (!isParallel()) {
        variants.forEach(variantConsumer);
        return;
    }
    variants.parallelStream().forEach(variantConsumer);
}
/**
 * Checks whether the variant columns of the current row were already written with the
 * timestamp of this run. If so, the stored rows are parsed and written to the output table
 * directly and no further processing of the slice is needed.
 *
 * @param ctx map-reduce context of the current slice
 * @return {@code true} if the slice was served completely from the stored variant columns
 * @throws IOException          on parsing or output errors
 * @throws InterruptedException if writing to the output is interrupted
 */
private boolean processVColumn(VariantMapReduceContext ctx) throws IOException, InterruptedException {
    final List<Cell> storedCells = GenomeHelper.getVariantColumns(ctx.getValue().rawCells());
    if (storedCells.isEmpty()) {
        return false;
    }
    // The timestamp of the first cell decides for all cells of the row.
    final byte[] firstValue = CellUtil.cloneValue(storedCells.get(0));
    final VariantTableStudyRowsProto first = VariantTableStudyRowsProto.parseFrom(firstValue);
    getLog().info("Column _V: found " + storedCells.size()
            + " columns - check timestamp " + getTimestamp() + " with " + first.getTimestamp());
    if (!(first.getTimestamp() == getTimestamp())) {
        return false;
    }
    ctx.context.getCounter(COUNTER_GROUP_NAME, "X_ALREADY_LOADED_SLICE").increment(1);
    for (Cell stored : storedCells) {
        final VariantTableStudyRowsProto packed = VariantTableStudyRowsProto.parseFrom(CellUtil.cloneValue(stored));
        final List<VariantTableStudyRow> unpacked = parseVariantStudyRowsFromArchive(ctx.getChromosome(), packed);
        ctx.context.getCounter(COUNTER_GROUP_NAME, "X_ALREADY_LOADED_ROWS").increment(unpacked.size());
        updateOutputTable(ctx.context, unpacked);
    }
    endTime("X Unpack, convert and write ANALYSIS variants (" + GenomeHelper.VARIANT_COLUMN_PREFIX + ")");
    return true;
}
/**
 * Completes the new variants with the data of the other archived files and writes
 * them to the output table.
 *
 * @param ctx         map-reduce context of the current slice
 * @param analysisNew new variants created for this slice
 * @param rows        collector for the rows produced while updating the output table
 * @throws IOException          if loading the archive data or writing the output fails
 * @throws InterruptedException if writing to the output is interrupted
 */
private void processNewVariants(VariantMapReduceContext ctx, Collection<Variant> analysisNew, List<VariantTableStudyRow> rows)
throws IOException, InterruptedException {
// Merge with all other gVCF files of the same region; skipped when there is nothing new to fill.
if (!analysisNew.isEmpty()) {
fillNewWithIndexedSamples(ctx, analysisNew);
}
// WRITE VALUES (no sample ids are passed for new variants)
startTime();
updateOutputTable(ctx.context, analysisNew, rows, null);
endTime("10 Update OUTPUT table");
}
/**
 * Fills the new variants with the sample data of all files stored in the archive
 * other than the ones in {@code ctx.fileIds}.
 * <p>
 * The archive is read in batches (see {@code loadFromArchive}). For every batch the sample
 * names of the batch are added to the merger's expected samples, the archive variants that
 * overlap at least one position covered by a new variant are converted and indexed, and
 * every new variant is merged with the archive variants overlapping it.
 *
 * @param ctx         map-reduce context of the current slice
 * @param analysisNew new variants to be filled; modified in place by the merger
 * @throws IOException if reading from the archive table fails
 */
private void fillNewWithIndexedSamples(VariantMapReduceContext ctx, Collection<Variant> analysisNew) throws IOException {
    final AtomicLong overlap = new AtomicLong(0);
    final AtomicLong merge = new AtomicLong(0);
    // All positions covered by any new variant; sorted so that overlap can be tested with one lookup.
    final NavigableSet<Integer> coveredPositions = new ConcurrentSkipListSet<>();
    processVariants(analysisNew, newVariant -> {
        final int min = toPosition(newVariant, true);
        final int max = toPosition(newVariant, false);
        for (int position = min; position <= max; position++) {
            coveredPositions.add(position);
        }
    });
    // Reset expected set
    this.getVariantMerger().setExpectedSamples(this.currentIndexingSamples);
    final Map<Integer, LinkedHashSet<Integer>> samplesInFiles = this.getStudyConfiguration().getSamplesInFiles();
    final BiMap<Integer, String> id2name = StudyConfiguration.getIndexedSamples(this.getStudyConfiguration()).inverse();
    loadFromArchive(ctx.context, ctx.getCurrRowKey(), ctx.fileIds, (fileIds, res) -> {
        final Set<String> names = fileIds.stream().flatMap(fid -> samplesInFiles.get(fid).stream())
                .map(id -> {
                    String name = id2name.get(id);
                    if (name == null) {
                        throw new IllegalStateException("No name for sample id " + id);
                    }
                    return name;
                }).collect(Collectors.toSet());
        this.getVariantMerger().addExpectedSamples(names); // add loaded names to merger
        if (null == res || res.isEmpty()) {
            getLog().info("No variants found for {} files for {} samples...", fileIds.size(), names.size());
            return;
        }
        long startTime = System.nanoTime();
        // Uses ForkJoinPool !!!
        // only load variants which have overlap.
        final List<Variant> archiveOther = getResultConverter().convert(res, true, archived -> {
            // Complete ALTs
            completeAlternateCoordinates(archived);
            final int min = toPosition(archived, true);
            final int max = toPosition(archived, false);
            // The variant overlaps iff the smallest covered position >= min is also <= max.
            // This replaces probing every single position in [min, max].
            final Integer firstCovered = coveredPositions.ceiling(min);
            return firstCovered != null && firstCovered <= max;
        });
        registerRuntime("9b Convert to Variants", System.nanoTime() - startTime);
        getLog().info("Loaded "
                + archiveOther.size() + " variants for "
                + fileIds.size() + " files for "
                + names.size() + " samples... ");
        startTime = System.nanoTime();
        final NavigableMap<Integer, List<Variant>> varPosSortedOther =
                indexAlts(archiveOther, (int) ctx.startPos, (int) ctx.nextStartPos);
        getLog().info("Create alts index of size " + varPosSortedOther.size() + " ... ");
        ctx.context.getCounter(COUNTER_GROUP_NAME, "OTHER_VARIANTS_FROM_ARCHIVE").increment(archiveOther.size());
        ctx.context.getCounter(COUNTER_GROUP_NAME, "OTHER_VARIANTS_FROM_ARCHIVE_NUM_QUERIES").increment(1);
        processVariants(analysisNew, newVariant -> {
            ctx.getContext().progress(); // Call progress to avoid timeouts
            final long start = System.nanoTime();
            final Collection<Variant> cleanList = buildOverlappingNonRedundantSet(newVariant, varPosSortedOther);
            if (getLog().isDebugEnabled()) {
                getLog().debug(
                        "Merge 2 - merge {} variants for {} - overlap {}; merge {}; ns ... ",
                        cleanList.size(), newVariant, overlap, merge);
            }
            final long mid = System.nanoTime();
            this.getVariantMerger().merge(newVariant, cleanList);
            overlap.addAndGet(mid - start);
            merge.addAndGet(System.nanoTime() - mid);
        });
        getLog().info("Merge 2 - overlap {}; merge {}; ns", overlap, merge);
        // NOTE(review): overlap/merge accumulate over all batches, yet the running totals are
        // registered after every batch. If registerRuntime sums its inputs this over-counts when
        // more than one batch is processed - TODO confirm against registerRuntime.
        registerRuntime("9c Merge NEW with archive slice - overlap", overlap.get());
        registerRuntime("9d Merge NEW with archive slice - merge", merge.get());
        ctx.getContext().progress(); // Call progress to avoid timeouts
    });
}
/**
 * Processes one slice (row) of the archive table.
 * If the variant columns were already written with the current timestamp they are
 * re-submitted as they are; otherwise existing analysis variants are updated, new variants are
 * created and filled, and the resulting rows are written back to the archive table.
 *
 * @param ctx map-reduce context of the current slice
 * @throws IOException          on read/write errors
 * @throws InterruptedException if writing to the output is interrupted
 */
@Override
protected void doMap(VariantMapReduceContext ctx) throws IOException, InterruptedException {
// Expected samples: everything already indexed plus the samples currently being indexed.
this.getVariantMerger().setExpectedSamples(this.getIndexedSamples().keySet());
this.getVariantMerger().addExpectedSamples(this.currentIndexingSamples);
if (processVColumn(ctx)) {
return; // All stored in V column already.
}
// Thread-safe list: rows may be added while variants are processed in parallel.
final List<VariantTableStudyRow> rows = new CopyOnWriteArrayList<>();
Set<Variant> analysisNew = processScanVariants(ctx, rows);
processNewVariants(ctx, analysisNew, rows);
// Checkpoint -> update archive table!!!
startTime();
updateArchiveTable(ctx.getCurrRowKey(), ctx.context, rows);
endTime("11 Update INPUT table");
getLog().info("Done merging");
}
/**
 * Builds a position index: every position in {@code [startPos, nextStartPos)} that is covered
 * by a variant (including its secondary alternates) maps to the list of variants covering it.
 * Concurrent collections are used because the variants may be processed in parallel.
 *
 * @param variants     variants to index
 * @param startPos     first position of the slice (inclusive)
 * @param nextStartPos first position of the next slice (exclusive)
 * @return sorted map from position to the variants covering that position
 */
private NavigableMap<Integer, List<Variant>> indexAlts(List<Variant> variants, int startPos, int nextStartPos) {
    // TODO Check if Alternates need indexing as well !!!
    final ConcurrentSkipListMap<Integer, List<Variant>> byPosition = new ConcurrentSkipListMap<>();
    processVariants(variants, variant -> {
        final int first = Math.max(toPosition(variant, true), startPos);
        final int endExclusive = Math.min(toPosition(variant, false) + 1, nextStartPos);
        for (int position = first; position < endExclusive; position++) {
            byPosition.computeIfAbsent(position, unused -> new CopyOnWriteArrayList<>()).add(variant);
        }
    });
    return byPosition;
}
/**
 * Determines the outermost position of a variant, taking the secondary alternates of all
 * its studies into account.
 *
 * @param variant variant to inspect
 * @param isStart {@code true} for the smallest position, {@code false} for the largest
 * @return smallest (or largest) of all start and end coordinates of the variant and its
 *         secondary alternates
 */
private static Integer toPosition(Variant variant, boolean isStart) {
    Integer outermost = getPosition(variant.getStart(), variant.getEnd(), isStart);
    for (StudyEntry study : variant.getStudies()) {
        final List<AlternateCoordinate> secondary = study.getSecondaryAlternates();
        if (secondary == null) {
            continue;
        }
        for (AlternateCoordinate alt : secondary) {
            final Integer altOutermost = getPosition(alt.getStart(), alt.getEnd(), isStart);
            outermost = getPosition(outermost, altOutermost, isStart);
        }
    }
    return outermost;
}
/**
 * Picks one of two coordinates.
 *
 * @param start   first coordinate
 * @param end     second coordinate
 * @param isStart {@code true} to get the smaller value, {@code false} to get the larger one
 * @return the smaller or the larger of the two coordinates
 */
private static Integer getPosition(Integer start, Integer end, boolean isStart) {
    if (isStart) {
        return Math.min(start, end);
    }
    return Math.max(start, end);
}
/**
 * Fills the unset fields (chromosome, start, end, reference, alternate) of every secondary
 * alternate with the corresponding value of the variant itself. Fields that are already set
 * keep their value.
 *
 * @param variant variant whose secondary alternates are completed in place
 */
private void completeAlternateCoordinates(Variant variant) {
    for (StudyEntry study : variant.getStudies()) {
        final List<AlternateCoordinate> secondary = study.getSecondaryAlternates();
        if (secondary == null) {
            continue;
        }
        for (AlternateCoordinate alt : secondary) {
            final String chromosome = alt.getChromosome();
            alt.setChromosome(chromosome != null ? chromosome : variant.getChromosome());

            final Integer start = alt.getStart();
            alt.setStart(start != null ? start : variant.getStart());

            final Integer end = alt.getEnd();
            alt.setEnd(end != null ? end : variant.getEnd());

            final String reference = alt.getReference();
            alt.setReference(reference != null ? reference : variant.getReference());

            final String alternate = alt.getAlternate();
            alt.setAlternate(alternate != null ? alternate : variant.getAlternate());
        }
    }
}
/**
 * Completes the secondary alternates of every variant in the list, using a parallel stream
 * when the mapper is in parallel mode and a sequential stream otherwise.
 *
 * @param variants variants to complete in place
 */
private void completeAlternateCoordinates(List<Variant> variants) {
    final Stream<Variant> source = isParallel() ? variants.parallelStream() : variants.stream();
    source.forEach(single -> completeAlternateCoordinates(single));
}
/**
 * Creates empty templates for all archive variants (and their secondary alternates) that start
 * inside the slice and are not yet part of the analysis variants. The templates carry no sample
 * information; it is filled in later. The variant table counters are updated as well.
 * <p>
 * Variants without a study entry for the current study, or without secondary alternates, are
 * treated as having no secondary alternates instead of causing a NullPointerException.
 *
 * @param ctx           map-reduce context of the current slice
 * @param analysisVar   variants already present in the analysis data
 * @param archiveTarget archive variants of a target type
 * @param startPos      first position of the slice (inclusive)
 * @param nextStartPos  first position of the next slice (exclusive)
 * @return the new variant templates
 * @throws IllegalStateException if a new target variant has no study entry for the current
 *                               study, or a variant cannot be built from a secondary alternate
 */
private Set<Variant> getNewVariantsAsTemplates(
        VariantMapReduceContext ctx, List<Variant> analysisVar,
        List<Variant> archiveTarget, int startPos, int nextStartPos) {
    final String studyId = Integer.toString(getStudyConfiguration().getStudyId());
    // (1) NEW variants (only create the position, no filling yet)
    // Everything the analysis data already knows: main variants plus their secondary alternates.
    final Set<String> analysisVarSet = analysisVar.stream().map(Variant::toString).collect(Collectors.toSet());
    for (Variant known : analysisVar) {
        for (AlternateCoordinate alt : secondaryAlternatesOf(known, studyId)) {
            analysisVarSet.add(toVariantString(alt));
        }
    }
    final Set<Variant> analysisNew = new ConcurrentSkipListSet<>(); // for later parallel processing
    final Set<String> archiveTargetSet = new HashSet<>();
    final Set<String> secAltTargetSet = new HashSet<>();
    // For all main variants
    for (Variant tar : archiveTarget) {
        int minStart = Math.min(tar.getStart(), tar.getEnd());
        if (minStart < startPos || minStart >= nextStartPos) {
            continue; // Skip variants with start position in previous or next slice
        }
        // Get all the archive target variants that are not in the analysis variants.
        String tarString = tar.toString();
        archiveTargetSet.add(tarString);
        if (!analysisVarSet.contains(tarString)) {
            // Empty variant with no Sample information
            // Filled with Sample information later (see 2)
            if (null == tar.getStudy(studyId)) {
                throw new IllegalStateException(String.format(
                        "Study Entry for study %s of target variant is null: %s", studyId, tar));
            }
            analysisNew.add(this.getVariantMerger().createFromTemplate(tar));
        }
    }
    // For all SecondaryAlternate
    for (Variant tar : archiveTarget) {
        for (AlternateCoordinate coordinate : secondaryAlternatesOf(tar, studyId)) {
            int minStart = Math.min(coordinate.getStart(), coordinate.getEnd());
            if (minStart < startPos || minStart >= nextStartPos) {
                continue; // Skip variants with start position in previous or next slice
            }
            String variantString = toVariantString(coordinate);
            if (!archiveTargetSet.contains(variantString) && !secAltTargetSet.contains(variantString)
                    && !analysisVarSet.contains(variantString)) {
                secAltTargetSet.add(variantString);
                analysisNew.add(createFromSecondaryAlternate(tar, coordinate));
            }
        }
    }
    final Set<String> totalSet = new HashSet<>(archiveTargetSet);
    totalSet.addAll(secAltTargetSet);
    final int sameVariants = totalSet.size() - analysisNew.size();
    ctx.context.getCounter(OpenCGAVariantTableCounters.SAME_VARIANTS).increment(sameVariants);
    ctx.context.getCounter(OpenCGAVariantTableCounters.NEW_VARIANTS).increment(analysisNew.size());
    ctx.context.getCounter(OpenCGAVariantTableCounters.MISSING_VARIANTS).increment(analysisVarSet.size() - sameVariants);
    ctx.context.getCounter(OpenCGAVariantTableCounters.ARCHIVE_TABLE_VARIANTS).increment(archiveTargetSet.size());
    ctx.context.getCounter(OpenCGAVariantTableCounters.ARCHIVE_TABLE_SEC_ALT_VARIANTS).increment(secAltTargetSet.size());
    ctx.context.getCounter(OpenCGAVariantTableCounters.ANALYSIS_TABLE_VARIANTS).increment(analysisVar.size());
    return analysisNew;
}

/**
 * Null-safe access to the secondary alternates of a variant for one study.
 *
 * @param variant variant to read from
 * @param studyId id of the study
 * @return the secondary alternates, or an empty list if the study entry or its list is missing
 */
private static List<AlternateCoordinate> secondaryAlternatesOf(Variant variant, String studyId) {
    final StudyEntry study = variant.getStudy(studyId);
    if (study == null) {
        return Collections.emptyList();
    }
    final List<AlternateCoordinate> alternates = study.getSecondaryAlternates();
    return alternates != null ? alternates : Collections.emptyList();
}

/**
 * Creates a new, sample-free variant from a secondary alternate. Coordinates missing on the
 * alternate fall back to those of the owning variant; one empty study entry is added per study
 * of the owning variant.
 *
 * @param tar        variant owning the secondary alternate
 * @param coordinate secondary alternate to convert
 * @return the new variant
 * @throws IllegalStateException if the variant cannot be built because of a missing value
 */
private Variant createFromSecondaryAlternate(Variant tar, AlternateCoordinate coordinate) {
    final String chromosome = useUnlessNull(coordinate.getChromosome(), tar.getChromosome());
    final Integer start = useUnlessNull(coordinate.getStart(), tar.getStart());
    final Integer end = useUnlessNull(coordinate.getEnd(), tar.getEnd());
    final String reference = useUnlessNull(coordinate.getReference(), tar.getReference());
    final String alternate = coordinate.getAlternate();
    final VariantType type = coordinate.getType();
    try {
        Variant tarNew = new Variant(chromosome, start, end, reference, alternate);
        tarNew.setType(type);
        for (StudyEntry tse : tar.getStudies()) {
            StudyEntry se = new StudyEntry(tse.getStudyId());
            se.setFiles(Collections.singletonList(new FileEntry("", "", new HashMap<>())));
            se.setFormat(Arrays.asList(VariantMerger.GT_KEY, VariantMerger.GENOTYPE_FILTER_KEY));
            se.setSamplesPosition(new HashMap<>());
            se.setSamplesData(new ArrayList<>());
            tarNew.addStudyEntry(se);
        }
        return tarNew;
    } catch (NullPointerException e) {
        // Kept from the original: report the offending coordinates together with the cause.
        throw new IllegalStateException(StringUtils.join(new Object[]{
                "Chr: ", chromosome, "Start: ", start, "End: ", end, "Ref: ", reference,
                "ALT: ", alternate, }, ";"), e);
    }
}
/**
 * Returns the preferred value unless it is {@code null}.
 *
 * @param a   preferred value
 * @param b   fallback value
 * @param <T> type of both values
 * @return {@code a} if it is not {@code null}, otherwise {@code b}
 */
private <T> T useUnlessNull(T a, T b) {
    if (a == null) {
        return b;
    }
    return a;
}
/**
 * Builds the string key of a secondary alternate:
 * {@code chromosome:start:reference:alternate}, or {@code chromosome:start:alternate} when the
 * reference is {@code null}. An empty reference or alternate is written as {@code -}.
 *
 * @param alt secondary alternate
 * @return string key of the alternate
 */
protected String toVariantString(AlternateCoordinate alt) {
    final StringBuilder key = new StringBuilder();
    key.append(alt.getChromosome()).append(":").append(alt.getStart());
    if (alt.getReference() != null) {
        key.append(":").append(StringUtils.isEmpty(alt.getReference()) ? "-" : alt.getReference());
    }
    key.append(":").append(StringUtils.isEmpty(alt.getAlternate()) ? "-" : alt.getAlternate());
    return key.toString();
}
/**
 * Wrapper giving a {@link Variant} identity semantics inside hash based collections:
 * two wrappers are equal only if they wrap the very same object.
 * <p>
 * Both {@link #hashCode()} and {@link #equals(Object)} are identity based. Comparing with
 * {@code Variant.equals} while hashing by identity would break the equals/hashCode contract
 * and make de-duplication depend on hash bucket collisions.
 */
private static class VariantWrapper {
    private final Variant var;

    VariantWrapper(final Variant var) {
        this.var = var;
    }

    @Override
    public int hashCode() {
        return System.identityHashCode(this.var);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof VariantWrapper)) {
            return false;
        }
        // Reference comparison, consistent with the identity hash code.
        return this.var == ((VariantWrapper) o).var;
    }
}
/**
 * Collects every indexed variant that covers at least one position of the given variant
 * (including its secondary alternates). A variant registered at several positions is
 * returned only once.
 * <p>
 * The index is read through one range view instead of one lookup per position, so positions
 * without any entry cost nothing.
 *
 * @param var        variant whose extent is searched
 * @param archiveVar position index as created by {@code indexAlts}
 * @return the overlapping variants without duplicates
 */
private Collection<Variant> buildOverlappingNonRedundantSet(Variant var, final NavigableMap<Integer, List<Variant>> archiveVar) {
    final int min = toPosition(var, true);
    final int max = toPosition(var, false);
    final Set<VariantWrapper> vars = new HashSet<>();
    if (min <= max) { // subMap rejects an inverted range
        for (List<Variant> atPosition : archiveVar.subMap(min, true, max, true).values()) {
            if (null == atPosition) {
                continue;
            }
            for (Variant v : atPosition) {
                vars.add(new VariantWrapper(v)); // Wrap for faster 'HashCode' comparison.
            }
        }
    }
    final List<Variant> unique = new ArrayList<>(vars.size());
    for (VariantWrapper wrapper : vars) {
        unique.add(wrapper.var);
    }
    return unique;
}
/**
 * Loads the archive data of all indexed files (except the ones in currFileIds) for the
 * specified rowKey, in batches of at most {@code archiveBatchSize} files.
 * <p>
 * The callback is invoked once per batch. If no file is left to search, it is invoked once
 * with an empty id set and a {@code null} result.
 *
 * @param context     Context
 * @param rowKey      Slice to extract data for
 * @param currFileIds File ids to ignore
 * @param merge       BiConsumer accepting the file ids of a batch and the HBase {@link Result}
 *                    of that batch; the result is {@code null} if no data was found
 * @throws IOException           if reading from the archive table fails
 * @throws IllegalStateException if the configured batch size is not positive
 */
private void loadFromArchive(Context context, byte[] rowKey, Set<Integer> currFileIds,
        BiConsumer<Set<Integer>, Result> merge) throws IOException {
    // Extract File IDs to search through
    final LinkedHashSet<Integer> indexedFiles = getStudyConfiguration().getIndexedFiles();
    // Explicit HashSet: the set is drained below, so it has to be mutable.
    final Set<String> archiveFileIds = indexedFiles.stream().filter(k -> !currFileIds.contains(k)).map(s -> s.toString())
            .collect(Collectors.toCollection(HashSet::new));
    if (archiveFileIds.isEmpty()) {
        getLog().info("No files found to search for in archive table");
        merge.accept(Collections.emptySet(), null);
        return; // done
    }
    if (this.archiveBatchSize <= 0) {
        // An empty batch would never drain the pending ids: endless loop of column-less Gets.
        throw new IllegalStateException("Archive batch size must be positive, but is " + this.archiveBatchSize);
    }
    getLog().info("Search archive for " + archiveFileIds.size() + " files in total in batches of " + this.archiveBatchSize + " ... ");
    while (!archiveFileIds.isEmpty()) {
        final long startTime = System.nanoTime();
        // create batch: move ids from the pending set into the batch
        final Set<String> batch = new HashSet<>();
        final Iterator<String> pending = archiveFileIds.iterator();
        while (pending.hasNext() && batch.size() < this.archiveBatchSize) {
            batch.add(pending.next());
            pending.remove();
        }
        getLog().info("Search archive for " + batch.size() + " files with " + archiveFileIds.size() + " remaining ... ");
        if (getLog().isDebugEnabled()) {
            getLog().debug("Add files to search in archive: " + StringUtils.join(batch, ','));
        }
        final Get get = new Get(rowKey);
        final byte[] cf = getHelper().getColumnFamily();
        // One column per file id
        batch.forEach(e -> get.addColumn(cf, Bytes.toBytes(e)));
        final Set<Integer> batchIds = batch.stream().map(e -> Integer.valueOf(e)).collect(Collectors.toSet());
        final Result res = getHelper().getHBaseManager().act(getHelper().getIntputTable(), table -> table.get(get));
        registerRuntime("9a Load archive slice from hbase", System.nanoTime() - startTime);
        if (res == null || res.isEmpty()) {
            getLog().warn("No data found in archive table!!!");
            merge.accept(batchIds, null);
        } else {
            merge.accept(batchIds, res);
        }
    }
    getLog().info("Done processing archive data!");
}
/**
 * Verifies that every position covered by an analysis variant inside the slice is also covered
 * by an archive variant (including no-call objects). If positions are missing, the counter
 * {@code VCF_VARIANT-error-FIXME} is increased by one and the positions are logged as error.
 *
 * @param context      context providing the counters
 * @param startPos     first position of the slice (inclusive)
 * @param nextStartPos first position of the next slice (exclusive)
 * @param archiveVar   variants from the archive table
 * @param analysisVar  variants from the variant (analysis) table
 */
private void checkArchiveConsistency(Context context, long startPos,
        long nextStartPos, List<Variant> archiveVar, List<Variant> analysisVar) {
    // Report Missing regions in ARCHIVE table, which are seen in VAR table
    final Set<Integer> notInArchive = generateCoveredPositions(analysisVar.stream(), startPos, nextStartPos);
    final Set<Integer> coveredByArchive = generateCoveredPositions(archiveVar.stream(), startPos, nextStartPos);
    notInArchive.removeAll(coveredByArchive);
    if (notInArchive.isEmpty()) {
        return;
    }
    // should never happen - positions exist in variant table but not in archive table
    context.getCounter(COUNTER_GROUP_NAME, "VCF_VARIANT-error-FIXME").increment(1);
    final String positions = Arrays.toString(notInArchive.toArray(new Integer[0]));
    getLog().error(
            String.format("Positions found in variant table but not in Archive table: %s", positions));
}
/**
 * Collects all positions inside the slice {@code [startPos, nextStartPos - 1]} that are
 * covered by at least one of the variants. The region of each variant is limited to the slice.
 * <p>
 * Variants lying completely outside the slice are ignored. Without that check the clamped
 * region would be inverted and {@code generateRegion} would turn it into one single position,
 * which would wrongly be reported as covered.
 *
 * @param variants     variants to inspect
 * @param startPos     first position of the slice (inclusive)
 * @param nextStartPos first position of the next slice (exclusive)
 * @return set of covered positions, all within the slice
 */
protected Set<Integer> generateCoveredPositions(Stream<Variant> variants, long startPos, long nextStartPos) {
    final int sPos = (int) startPos;
    final int ePos = (int) (nextStartPos - 1);
    return variants
            // Math.max(start, end) also accepts variants whose end lies before their start.
            .filter(v -> v.getStart() <= ePos && Math.max(v.getStart(), v.getEnd()) >= sPos)
            // limit to max start position and min end position (only slice region)
            .map(v -> generateRegion(Math.max(v.getStart(), sPos), Math.min(v.getEnd(), ePos)))
            .flatMap(Set::stream)
            .collect(Collectors.toSet());
}
/**
 * Lists all positions from start to end, both inclusive. If end is smaller than start,
 * only the start position is returned.
 *
 * @param start first position
 * @param end   last position
 * @return set containing every position of the region
 */
protected Set<Integer> generateRegion(Integer start, Integer end) {
    final int last = end < start ? start : end;
    final int first = start;
    return IntStream.rangeClosed(first, last).boxed().collect(Collectors.toCollection(HashSet::new));
}
/**
 * Keeps only the variants whose type is one of the given types.
 *
 * @param variants variants to filter
 * @param types    accepted variant types
 * @return lazily filtered stream of the variants
 */
protected Stream<Variant> filterForVariant(Stream<Variant> variants, VariantType ... types) {
    final Set<VariantType> accepted = new HashSet<>();
    Collections.addAll(accepted, types);
    return variants.filter(candidate -> accepted.contains(candidate.getType()));
}
}