/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.core.alignment.tasks; import org.opencb.biodata.models.alignment.Alignment; import org.opencb.biodata.models.alignment.Alignment.AlignmentDifference; import org.opencb.biodata.models.alignment.AlignmentRegion; import org.opencb.biodata.models.alignment.stats.MeanCoverage; import org.opencb.biodata.models.alignment.stats.RegionCoverage; import org.opencb.biodata.models.core.Region; import org.opencb.commons.run.Task; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedList; import java.util.List; /** * Date: 11/21/13. * * @author Jacobo Coll Moragon <jcoll@ebi.ac.uk> * <p> * Calculates coverage and mean coverage for AlignmentRegion **/ public class AlignmentRegionCoverageCalculatorTask extends Task<AlignmentRegion> { /** * Calculates the Mean Coverage at intervals. */ private class MeanCoverageCalculator { private final int size; private final String name; private int accumulator; private long next; MeanCoverageCalculator(String name) { this.accumulator = 0; this.next = 0; this.size = MeanCoverage.nameToSizeConvert(name); this.name = name; } MeanCoverageCalculator(int size, String name) { this.accumulator = 0; this.next = 0; this.size = size; this.name = name; } public List<MeanCoverage> calculateMeanCoverage(RegionCoverage coverage) { List<MeanCoverage> list = new LinkedList<>(); final short[] all = coverage.getAll(); if (coverage.getStart() >= next) { Region region = new Region(coverage.getChromosome(), (int) next - size, (int) next - 1); list.add(new MeanCoverage(size, name, region, (float) accumulator / size)); reset(coverage.getStart()); } int i = 0; boolean lastIteration = false; while (i < all.length) { int lim = (int) (next - coverage.getStart()); if (all.length < lim) { lim = all.length; lastIteration = true; } for (; i < lim; i++) { accumulator += all[i]; } if (!lastIteration) { //The last iteration will keep the defaultValue for the next call to this function Region region = new Region(coverage.getChromosome(), (int) next - size, (int) next - 1); list.add(new MeanCoverage(size, name, region, (float) accumulator / size)); next += size; accumulator = 0; } } return list; } public void reset(long position) { this.next = ((position - 1) / size + 1) * size + 1; //Calculates the NEXT interval starting position this.accumulator = 0; //Reset the accumulator } } private final class NativeShortArrayList { private short[] array = new short[0]; private int size = 0; private int capacity = 0; private NativeShortArrayList(int capacity) { this.capacity = capacity; this.array = new short[capacity]; } private NativeShortArrayList() { this.capacity = 0; } public void resize(int newSize) { array = Arrays.copyOf(array, newSize); capacity = newSize; } public void clear() { size = 0; } public void empty() { size = 0; array = null; } public short get(int position) { return array[position]; } public void add(short elem) { if (size >= capacity) { resize(capacity * 2); } array[size++] = elem; } private int size() { return size; } private short[] getArray() { return Arrays.copyOfRange(array, 0, size); } private int getCapacity() { return capacity; } } private List<MeanCoverageCalculator> meanCoverageCalculator; private long start = 0; private long end = 0; private RegionCoverage coverage; private int regionCoverageSize; private long regionCoverageMask; private NativeShortArrayList a; private NativeShortArrayList c; private NativeShortArrayList g; private NativeShortArrayList t; private NativeShortArrayList all; private int savedSize; public AlignmentRegionCoverageCalculatorTask() { setRegionCoverageSize(4000); a = new NativeShortArrayList(); c = new NativeShortArrayList(); g = new NativeShortArrayList(); t = new NativeShortArrayList(); all = new NativeShortArrayList(); meanCoverageCalculator = new ArrayList<>(); reset(); } public void reset() { start = 0; end = 0; savedSize = 0; } /** * Uses a circular array to store intermediate values. * <p> * The start and end values represents the first and the last position stored in the intermediate values. * d e f g * |---------------------*-*-----*-----------------* end * | ^ ^ ^ ^ * 1| ACTGCAGTCGATGTCTATGCTT | | * 2| | CAGTCGATGTCGATGC | | * 3| | | GTCGATGCTTATGCTA | * 4| | | | CCTGCAGTCGATGCTA * | v v v v * |--*---*-------*---------------------------------- start * a b c d * Every time the start position represents the start of the last alignment, and the bigger end. * Information becomes final when: * -The start of the new alignment is greater than the start. * When the alignment 3 begins to be processed, the region [b,c) will be stored with this.saveCoverage(c) * -The start position of a new alignment is greater than the end * When the alignment 4 begins to be processed, the region [c.d) will be stored * -The AlignmentRegion does not overlap with the next AlignmentRegion. * When all the Alignments are processed, the region [start,end] will be stored **/ @Override public boolean apply(List<AlignmentRegion> batch) throws IOException { for (AlignmentRegion alignmentRegion : batch) { if (alignmentRegion == null) { continue; } /* Initialize */ long coverageStart = start; if (start == 0) { //Set Default defaultValue coverageStart = alignmentRegion.getStart(); start = alignmentRegion.getStart(); end = alignmentRegion.getStart(); for (MeanCoverageCalculator aux : meanCoverageCalculator) { aux.reset(start); } } int totalSize = (int) (alignmentRegion.getEnd() - alignmentRegion.getStart()); if (all.getCapacity() < totalSize) { totalSize *= 1.4; all.resize(totalSize); a.resize(totalSize); c.resize(totalSize); g.resize(totalSize); t.resize(totalSize); } savedSize = 0; /* Calculate Coverage */ for (Alignment alignment : alignmentRegion.getAlignments()) { coverage(alignment); } if (!alignmentRegion.isOverlapEnd()) { saveCoverage(alignmentRegion.getEnd() + 1); //[start-end] } /* Create Region Coverage //Todo jcoll: Profile this part */ RegionCoverage regionCoverage = new RegionCoverage(); regionCoverage.setA(a.getArray()); regionCoverage.setC(c.getArray()); regionCoverage.setG(g.getArray()); regionCoverage.setT(t.getArray()); regionCoverage.setAll(all.getArray()); regionCoverage.setStart(coverageStart); regionCoverage.setEnd(coverageStart + savedSize); regionCoverage.setChromosome(alignmentRegion.getChromosome()); // assert start-coverageStart == savedSize; //TODO jcoll: Assert this alignmentRegion.setCoverage(regionCoverage); savedSize = 0; a.clear(); c.clear(); g.clear(); t.clear(); all.clear(); /* Create Mean Coverage List */ List<MeanCoverage> meanCoverageList = new ArrayList<>(meanCoverageCalculator.size()); for (MeanCoverageCalculator aux : meanCoverageCalculator) { meanCoverageList.addAll(aux.calculateMeanCoverage(regionCoverage)); } alignmentRegion.setMeanCoverage(meanCoverageList); if (!alignmentRegion.isOverlapEnd()) { end = alignmentRegion.getEnd(); reset(); } } return true; } private void saveCoverage(long endP) { //Saves the actual coverage from start to end int pos; for (long i = start; i < endP; i++) { pos = (int) (i & regionCoverageMask); a.add(/*savedSize,*/ coverage.getA()[pos]); coverage.getA()[pos] = 0; c.add(/*savedSize, */coverage.getC()[pos]); coverage.getC()[pos] = 0; g.add(/*savedSize, */coverage.getG()[pos]); coverage.getG()[pos] = 0; t.add(/*savedSize, */coverage.getT()[pos]); coverage.getT()[pos] = 0; all.add(/*savedSize, */coverage.getAll()[pos]); coverage.getAll()[pos] = 0; savedSize++; } start = endP; } private int coverage(Alignment alignment) { if ((alignment.getFlags() & Alignment.SEGMENT_UNMAPPED) != 0) { return 0; } if (alignment.getLength() > regionCoverageSize) { setRegionCoverageSize(alignment.getLength()); } if (alignment.getStart() > end) { saveCoverage(end + 1); //Save to the end saveCoverage(alignment.getStart()); //Save zeros to the start //System.out.println(alignment.getStart()); } else { saveCoverage(alignment.getStart()); } start = alignment.getStart(); if (alignment.getEnd() > end) { end = alignment.getEnd(); } //byte[] sequence = alignment.getReadSequence(); String seq; //Iterator<AlignmentDifference> diferencesIterator = alignment.getDifferences().iterator(); //AlignmentDifference alignmentDifference = diferencesIterator.hasNext()? diferencesIterator.next():null; int offset = 0; // offset caused by insertions and deletions int clipping = 0; //int validBases = (int)(alignment.getEnd()-alignment.getStart()); int pos = 0; for (AlignmentDifference diff : alignment.getDifferences()) { for (; pos + clipping < diff.getPos(); pos++) { coverage.getAll()[(int) ((pos + start) & regionCoverageMask)]++; } switch (diff.getOp()) { case AlignmentDifference.INSERTION: //i += alignmentDifference.getLength(); offset -= diff.getLength(); break; case AlignmentDifference.DELETION: pos += diff.getLength(); offset += diff.getLength(); break; case AlignmentDifference.SOFT_CLIPPING: //pos += diff.getLength(); clipping += diff.getLength(); case AlignmentDifference.SKIPPED_REGION: case AlignmentDifference.HARD_CLIPPING: case AlignmentDifference.PADDING: case AlignmentDifference.MATCH_MISMATCH: case AlignmentDifference.MISMATCH: { seq = diff.getSeq(); if (seq != null) { for (char c : seq.toCharArray()) { switch (c) { case 'A': coverage.getA()[(int) ((pos + start) & regionCoverageMask)]++; break; case 'C': coverage.getC()[(int) ((pos + start) & regionCoverageMask)]++; break; case 'G': coverage.getG()[(int) ((pos + start) & regionCoverageMask)]++; break; case 'T': coverage.getT()[(int) ((pos + start) & regionCoverageMask)]++; break; default: break; } coverage.getAll()[(int) ((pos + start) & regionCoverageMask)]++; pos++; } } //else, in the next loop will increase the "all" coverage break; } default: break; } } for (; pos + clipping - offset < alignment.getLength(); pos++) { coverage.getAll()[(int) ((pos + start) & regionCoverageMask)]++; } //assert pos == validBases; if (pos + clipping - offset != alignment.getLength()) { System.out.println("[ERROR] assert pos == validBases"); //TODO jcoll: Assert this } //assert pos+clipping == alignment.getLength(); return 0; } /** * Set size to the nearest upper 2^n number for quick modulus operation. * * @param size Region size in bp */ public void setRegionCoverageSize(int size) { if (size < 0) { return; } int lg = (int) Math.ceil(Math.log(size) / Math.log(2)); //int lg = 31 - Integer.numberOfLeadingZeros(size); int newRegionCoverageSize = 1 << lg; int newRegionCoverageMask = newRegionCoverageSize - 1; RegionCoverage newCoverage = new RegionCoverage(newRegionCoverageSize); if (coverage != null) { for (int i = 0; i < (end - start); i++) { newCoverage.getA()[(int) ((start + i) & newRegionCoverageMask)] = coverage.getA()[(int) ((start + i) & regionCoverageMask)]; } } regionCoverageSize = newRegionCoverageSize; regionCoverageMask = newRegionCoverageMask; coverage = newCoverage; // System.out.println("Region Coverage Mask : " + regionCoverageMask); } public void addMeanCoverageCalculator(int size, String name) { this.meanCoverageCalculator.add(new MeanCoverageCalculator(size, name)); } public void addMeanCoverageCalculator(String name) { this.meanCoverageCalculator.add(new MeanCoverageCalculator(name)); } }