package com.milaboratory.core.alignment.benchmark;
import cc.redberry.pipe.OutputPort;
import com.milaboratory.core.Range;
import com.milaboratory.core.alignment.AffineGapAlignmentScoring;
import com.milaboratory.core.alignment.Aligner;
import com.milaboratory.core.alignment.Alignment;
import com.milaboratory.core.alignment.AlignmentUtils;
import com.milaboratory.core.mutations.Mutations;
import com.milaboratory.core.mutations.MutationsBuilder;
import com.milaboratory.core.mutations.generator.GenericNucleotideMutationModel;
import com.milaboratory.core.mutations.generator.MutationsGenerator;
import com.milaboratory.core.mutations.generator.NucleotideMutationModel;
import com.milaboratory.core.mutations.generator.SubstitutionModels;
import com.milaboratory.core.sequence.Alphabet;
import com.milaboratory.core.sequence.NucleotideSequence;
import com.milaboratory.core.sequence.Sequence;
import com.milaboratory.core.sequence.SequenceBuilder;
import org.apache.commons.math3.random.RandomDataGenerator;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Created by dbolotin on 27/10/15.
*/
public final class ChallengeProvider implements OutputPort<Challenge> {
public static int MAX_RERUNS = 200;
final ChallengeParameters parameters;
final RandomGenerator gen;
final RandomDataGenerator rdg;
public ChallengeProvider(ChallengeParameters parameters, long seed) {
this.parameters = parameters;
this.gen = new Well19937c(seed);
this.rdg = new RandomDataGenerator(gen);
}
@Override
public Challenge take() {
NucleotideSequence[] db = generateDB(rdg, parameters);
long seed = gen.nextLong();
final RandomGenerator rg = new Well19937c(seed);
final RandomDataGenerator generator = new RandomDataGenerator(rg);
final AtomicInteger counter = new AtomicInteger();
List<KAlignerQuery> queries = new ArrayList<>();
int consequentReruns = 0;
for (int n = 0; n < parameters.queryCount; n++) {
MutationsBuilder<NucleotideSequence> totalMutations = new MutationsBuilder<>(NucleotideSequence.ALPHABET);
NucleotideMutationModel model = parameters.mutationModel.clone();
model.reseed(generator.getRandomGenerator().nextLong());
int targetId = generator.nextInt(0, db.length - 1);
NucleotideSequence target = db[targetId];
SequenceBuilder<NucleotideSequence> queryBuilder = NucleotideSequence.ALPHABET.createBuilder();
if (generator.nextUniform(0, 1) < parameters.boundaryInsertProbability)
queryBuilder.append(randomSequence(NucleotideSequence.ALPHABET, generator,
parameters.minIndelLength, parameters.maxIndelLength, true));
List<Range> tRanges = new ArrayList<>(), qRanges = new ArrayList<>();
List<Mutations<NucleotideSequence>> muts = new ArrayList<>();
int tOffset = generator.nextInt(0, parameters.maxIndelLength), qOffset = queryBuilder.size();
Range r;
Mutations<NucleotideSequence> m;
NucleotideSequence ins;
double v;
for (int i = parameters.minClusters == parameters.maxClusters ? parameters.minClusters :
generator.nextInt(parameters.minClusters, parameters.maxClusters); i > 0; --i)
if (tRanges.isEmpty()) {
r = new Range(tOffset, tOffset += generator.nextInt(parameters.minClusterLength,
parameters.maxClusterLength));
if (r.getTo() > target.size())
break;
tRanges.add(r);
muts.add(m = MutationsGenerator.generateMutations(target, model, r));
NucleotideSequence queryPart = m.move(-r.getFrom()).mutate(target.getRange(r));
qRanges.add(new Range(qOffset, qOffset += queryPart.size()));
queryBuilder.append(queryPart);
totalMutations.append(m);
} else {
MutationsBuilder<NucleotideSequence> tempMutations =
new MutationsBuilder<>(NucleotideSequence.ALPHABET);
if ((v = generator.nextUniform(0, 1.0)) < parameters.insertionProbability) {
// Insertion into target
ins = randomSequence(NucleotideSequence.ALPHABET, generator, parameters.minIndelLength,
parameters.maxIndelLength, true);
tempMutations.appendInsertion(tOffset, ins);
} else if (v < parameters.insertionProbability + parameters.deletionProbability) {
// Deletion from target
int previousOffset = tOffset;
tOffset += generator.nextInt(parameters.minIndelLength, parameters.maxIndelLength);
ins = NucleotideSequence.EMPTY;
if (tOffset <= target.size())
tempMutations.appendDeletion(previousOffset, tOffset - previousOffset, target);
} else {
// Big mismatch
ins = randomSequence(NucleotideSequence.ALPHABET, generator, parameters.minIndelLength,
parameters.maxIndelLength, true);
int previousOffset = tOffset;
tOffset += generator.nextInt(parameters.minIndelLength, parameters.maxIndelLength);
Alignment<NucleotideSequence> result =
Aligner.alignGlobalAffine(parameters.scoring, target.getRange(previousOffset, tOffset),
ins);
if (tOffset <= target.size())
tempMutations.append(result.getAbsoluteMutations().move(previousOffset));
}
r = new Range(tOffset, tOffset += generator.nextInt(parameters.minClusterLength,
parameters.maxClusterLength));
if (r.getTo() > target.size())
break;
totalMutations.append(tempMutations);
tRanges.add(r);
muts.add(m = MutationsGenerator.generateMutations(target, model, r));
qRanges.add(new Range(qOffset += ins.size(), qOffset += r.length() + m.getLengthDelta()));
queryBuilder.append(ins).append(m.move(-r.getFrom()).mutate(target.getRange(r)));
totalMutations.append(m);
}
if (generator.nextUniform(0, 1) < parameters.boundaryInsertProbability)
queryBuilder.append(randomSequence(NucleotideSequence.ALPHABET, generator, parameters.minIndelLength,
parameters.maxIndelLength, true));
Alignment<NucleotideSequence> expectedAlignment = new Alignment<>(target,
totalMutations.createAndDestroy(),
new Range(tRanges.get(0).getFrom(), tRanges.get(tRanges.size() - 1).getTo()),
new Range(qRanges.get(0).getFrom(), qRanges.get(qRanges.size() - 1).getTo()),
parameters.scoring);
if (expectedAlignment.getScore() < parameters.minAlignmentScoring ||
expectedAlignment.getScore() > parameters.maxAlignmentScoring) {
--n;
if (++consequentReruns > MAX_RERUNS)
throw new RuntimeException("Too many reruns.");
continue;
}
consequentReruns = 0;
NucleotideSequence q = queryBuilder.createAndDestroy();
assert AlignmentUtils.getAlignedSequence2Part(expectedAlignment)
.equals(q.getRange(expectedAlignment.getSequence2Range()));
KAlignerQuery query = new KAlignerQuery(targetId, qRanges, tRanges, muts,
q, expectedAlignment);
queries.add(query);
}
for (int i = 0; i < parameters.falseCount; ++i)
queries.add(new KAlignerQuery(randomSequence(NucleotideSequence.ALPHABET, generator,
parameters.minClusters * (parameters.minClusterLength + parameters.minIndelLength) + parameters.minIndelLength,
parameters.maxClusters * (parameters.maxClusterLength + parameters.maxIndelLength) + parameters.maxIndelLength,
true
)));
Collections.shuffle(queries, new Random(gen.nextLong()));
return new Challenge(db, queries, parameters, seed);
}
public static NucleotideSequence[] generateDB(RandomDataGenerator generator, ChallengeParameters params) {
NucleotideSequence[] db = new NucleotideSequence[params.dbSize];
for (int i = 0; i < params.dbSize; i++)
db[i] = randomSequence(NucleotideSequence.ALPHABET, generator, params.dbMinSeqLength, params.dbMaxSeqLength, true);
return db;
}
public static <S extends Sequence<S>> S randomSequence(Alphabet<S> alphabet, RandomDataGenerator r,
int minLength, int maxLength, boolean basicLettersOnly) {
return randomSequence(alphabet, r.getRandomGenerator(), minLength, maxLength, basicLettersOnly);
}
public static <S extends Sequence<S>> S randomSequence(Alphabet<S> alphabet, RandomGenerator r,
int minLength, int maxLength, boolean basicLettersOnly) {
int length = minLength == maxLength ?
minLength : minLength + r.nextInt(maxLength - minLength + 1);
SequenceBuilder<S> builder = alphabet.createBuilder();
for (int i = 0; i < length; ++i)
builder.append((byte) r.nextInt(basicLettersOnly ? alphabet.basicSize() : alphabet.size()));
return builder.createAndDestroy();
}
public static ChallengeParameters getParams1NoGap(AffineGapAlignmentScoring<NucleotideSequence> scoring,
int minAlignmentScoring, int maxAlignmentScoring,
double multiplier) {
return new ChallengeParameters(100, 100, 500,
100000, 1000000,
1, 4, 15, 50, 3, 30,
0.45, 0.45, 0.5,
new GenericNucleotideMutationModel(
SubstitutionModels.getEmpiricalNucleotideSubstitutionModel(),
0, 0).multiplyProbabilities(multiplier),
minAlignmentScoring, maxAlignmentScoring,
scoring
);
}
private final static double deletionProbability = 0.000522, insertionProbability = 0.000198;
public static ChallengeParameters getParamsOneCluster(AffineGapAlignmentScoring<NucleotideSequence> scoring,
int minAlignmentScoring, int maxAlignmentScoring,
double multiplier) {
return new ChallengeParameters(100, 350, 500,
1000000, 1000000,
1, 1, 35, 80, 30, 100,
0.45, 0.45, 0.5,
new GenericNucleotideMutationModel(
SubstitutionModels.getEmpiricalNucleotideSubstitutionModel(),
deletionProbability, insertionProbability).multiplyProbabilities(multiplier),
minAlignmentScoring, maxAlignmentScoring,
scoring
);
}
public static ChallengeParameters getParamsTwoClusters(AffineGapAlignmentScoring<NucleotideSequence> scoring,
int minAlignmentScoring, int maxAlignmentScoring,
double multiplier) {
return new ChallengeParameters(100, 350, 500,
1000000, 1000000,
2, 2, 35, 100, 30, 80,
0.45, 0.45, 0.5,
new GenericNucleotideMutationModel(
SubstitutionModels.getEmpiricalNucleotideSubstitutionModel(),
deletionProbability, insertionProbability).multiplyProbabilities(multiplier),
minAlignmentScoring, maxAlignmentScoring,
scoring
);
}
}