/*
* Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br)
* Copyright (C) 2008,2009 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com)
*
* For further information check the LICENSE file.
*/
package bio.pih.genoogle.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.log4j.Logger;
import bio.pih.genoogle.index.IndexConstructionException;
import bio.pih.genoogle.index.ValueOutOfBoundsException;
import bio.pih.genoogle.io.proto.Io.StoredDatabank;
import bio.pih.genoogle.io.proto.Io.StoredSequenceInfo;
import bio.pih.genoogle.io.reader.IOTools;
import bio.pih.genoogle.io.reader.ParseException;
import bio.pih.genoogle.io.reader.RichSequenceStreamReader;
import bio.pih.genoogle.seq.Alphabet;
import bio.pih.genoogle.seq.IllegalSymbolException;
import bio.pih.genoogle.seq.RichSequence;
import com.google.common.collect.Lists;
/**
* The divided sequence databank receives 1..n different FASTA files and an integer 1..m, where m
* is a multiple of n. It creates m sub-databanks, all of which should be as similar in size as
* possible. For example: Databank alpha -> 100 million bases, Databank beta -> 200 million bases,
* Databank gamma -> 35 million bases, Databank delta -> 65 million bases, Databank zeta -> 300
* million bases.
*
* n = 5
*
* If m is 1: one databank with 700 million bases.
*
* If m is 10: ten databanks with 70 million bases each.
*
* A high value of m is good for parallelism; the recommended value is 2 *
* ManagementFactory.getOperatingSystemMXBean().getAvailableProcessors();
*
* It is also important to note that each sub sequence databank requires ((4**10) * 16) bytes,
* approx. 20 megabytes of RAM, just to store the skeleton of the index, without any data. This
* means that if you create 10 databanks to store a total of 200 million bases, you will use
* approx. 10 * (20 + 20) = 400 megabytes, while if you use 4, you will need 4 * (50 + 20) = 280
* megabytes.
*
* @author Pih
*/
public class SplittedDatabankCollection extends AbstractDatabankCollection<IndexedSequenceDataBank> {

    private static final Logger logger = Logger.getLogger(SplittedDatabankCollection.class.getName());

    /** Number of sub databanks this collection is divided into. */
    private final int qtdSubBases;

    /** Mask handed unchanged to every {@link IndexedSequenceDataBank} created by this collection. */
    private final String mask;

    /**
     * @param name
     *            name of this databank
     * @param alphabet
     *            alphabet used to read the FASTA files and passed to every sub databank
     * @param path
     *            directory where it will be
     * @param subSequenceLength
     *            sub sequence length passed to every sub databank
     * @param qtdSubBases
     *            how many parts will have this sequence databank
     * @param mask
     *            mask passed to every sub databank
     */
    public SplittedDatabankCollection(String name, Alphabet alphabet, File path, int subSequenceLength,
            int qtdSubBases, String mask) {
        super(name, alphabet, subSequenceLength, path, null);
        this.qtdSubBases = qtdSubBases;
        this.mask = mask;
    }

    /**
     * Reads the FASTA file of every databank registered in this collection and distributes the
     * sequences over the sub databanks.
     *
     * The files are scanned once to count their bases, sorted by ascending base count and then read
     * again. Each sequence is added to the current sub databank; as soon as the bases stored in it
     * exceed <code>totalBases / qtdSubBases</code> the sub databank is finalized and the next one
     * is started.
     *
     * @param forceFormatting
     *            when <code>true</code>, sequences that raise an {@link IllegalSymbolException} are
     *            skipped; when <code>false</code> the exception is propagated
     */
    @Override
    public void encodeSequences(boolean forceFormatting) throws IOException, NoSuchElementException,
            ValueOutOfBoundsException, IndexConstructionException, ParseException, IllegalSymbolException {
        List<FastaFileInfo> fastaFiles = Lists.newLinkedList();
        for (AbstractSequenceDataBank sequence : databanks.values()) {
            fastaFiles.add(new FastaFileInfo(sequence.getFullPath(true), alphabet, forceFormatting));
        }

        long totalBasesCount = 0;
        for (FastaFileInfo fastaFileInfo : fastaFiles) {
            totalBasesCount += fastaFileInfo.getQtdBases();
        }

        sortFiles(fastaFiles);

        // Size threshold after which the current sub databank is closed and a new one is started.
        long totalBasesBySubBase = totalBasesCount / qtdSubBases;
        long subCount = 0;
        long basesInCurrentSub = 0;

        IndexedSequenceDataBank actualSequenceDatank = newSubDatabank("Sub_" + subCount, subCount);
        actualSequenceDatank.beginIndexBuild();

        if (!getFilePath().exists()) {
            boolean mkdirs = getFullPath().mkdirs();
            if (!mkdirs) {
                logger.error(getFilePath() + " was not possible to create.");
            }
        }

        FileChannel dataBankFileChannel = null;
        FileChannel storedSequenceInfoChannel = null;
        try {
            dataBankFileChannel = new FileOutputStream(getDatabankFile(subCount)).getChannel();
            storedSequenceInfoChannel = new FileOutputStream(getStoredDatabakFileName(subCount), true).getChannel();
            StoredDatabank.Builder storedDatabankBuilder = StoredDatabank.newBuilder();

            for (FastaFileInfo fastaFile : fastaFiles) {
                logger.info("Adding a FASTA file from " + fastaFile.getFastaFile());
                BufferedReader is = new BufferedReader(new FileReader(fastaFile.getFastaFile()));
                try {
                    RichSequenceStreamReader readFastaDNA = IOTools.readFasta(is, alphabet);
                    while (readFastaDNA.hasNext()) {
                        RichSequence richSequence;
                        try {
                            richSequence = readFastaDNA.nextRichSequence();
                        } catch (IllegalSymbolException e) {
                            if (forceFormatting) {
                                // illegal symbols were got at the FastaFileInfo constructor.
                                continue;
                            } else {
                                throw e;
                            }
                        }

                        StoredSequenceInfo[] infos = actualSequenceDatank.addSequence(richSequence, dataBankFileChannel);
                        for (int i = 0; i < infos.length; i++) {
                            storedDatabankBuilder.addSequencesInfo(infos[i]);
                            basesInCurrentSub += infos[i].getLength();
                        }

                        if (basesInCurrentSub > totalBasesBySubBase) {
                            finalizeSubDatabankConstruction(actualSequenceDatank, dataBankFileChannel,
                                    storedSequenceInfoChannel, storedDatabankBuilder);
                            subCount++;
                            logger.info("Wrote " + subCount + " of " + qtdSubBases + " sub databanks.");

                            // Start the next sub databank with fresh files, builder and index.
                            basesInCurrentSub = 0;
                            dataBankFileChannel = new FileOutputStream(getDatabankFile(subCount)).getChannel();
                            storedSequenceInfoChannel = new FileOutputStream(getStoredDatabakFileName(subCount), true).getChannel();
                            storedDatabankBuilder = StoredDatabank.newBuilder();
                            actualSequenceDatank = newSubDatabank("Sub_" + subCount, subCount);
                            actualSequenceDatank.beginIndexBuild();
                        }
                    }
                } finally {
                    // The reader was previously never closed, leaking one file handle per FASTA file.
                    closeQuietly(is);
                }
            }

            finalizeSubDatabankConstruction(actualSequenceDatank, dataBankFileChannel, storedSequenceInfoChannel,
                    storedDatabankBuilder);
            logger.info("Wrote " + (subCount + 1) + " of " + qtdSubBases + " sub databanks.");
        } finally {
            // On the normal path both channels were already closed by
            // finalizeSubDatabankConstruction and closing again has no effect; on a failure this
            // releases the channels that would otherwise stay open.
            closeQuietly(dataBankFileChannel);
            closeQuietly(storedSequenceInfoChannel);
        }
    }

    /**
     * Creates the {@link IndexedSequenceDataBank} object for the sub databank <code>index</code>,
     * using this collection as the last constructor argument.
     *
     * @param subName
     *            name given to the sub databank object
     * @param index
     *            sub databank number, used to derive its file name
     */
    private IndexedSequenceDataBank newSubDatabank(String subName, long index) throws IOException,
            ValueOutOfBoundsException {
        return new IndexedSequenceDataBank(subName, alphabet, subSequenceLength, mask,
                new File(getSubDatabankName(index)), this);
    }

    /**
     * Closes <code>resource</code> if it is not <code>null</code>, logging instead of propagating
     * an {@link IOException} so that it can be used safely inside <code>finally</code> blocks.
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.warn("Could not close " + resource, e);
        }
    }

    /** @return the ".ssdb" file of the given sub databank, located under {@link #getFullPath()}. */
    private File getStoredDatabakFileName(long subCount) {
        return new File(getFullPath(), getStoredDatabankFileName(subCount));
    }

    /** @return the ".dsdb" file of the given sub databank, located under {@link #getFullPath()}. */
    private File getDatabankFile(long subCount) {
        return new File(getFullPath(), getDatabankFileName(subCount));
    }

    private String getStoredDatabankFileName(long subCount) {
        return getSubDatabankName(subCount) + ".ssdb";
    }

    private String getDatabankFileName(long subCount) {
        return getSubDatabankName(subCount) + ".dsdb";
    }

    /** @return the collection name followed by "_sub_" and the sub databank number. */
    private String getSubDatabankName(long subCount) {
        return this.getName() + "_sub_" + subCount;
    }

    /**
     * Ends the index build of a sub databank, writes its serialized {@link StoredDatabank} to
     * <code>storedSequenceInfoChannel</code> and closes both channels.
     */
    private void finalizeSubDatabankConstruction(IndexedSequenceDataBank actualSequenceDatank,
            FileChannel dataBankFileChannel, FileChannel storedSequenceInfoChannel,
            StoredDatabank.Builder storedDatabankBuilder) throws IOException, IndexConstructionException {
        actualSequenceDatank.endIndexBuild();
        actualSequenceDatank.setStoredDatabankInfo(storedDatabankBuilder);
        StoredDatabank storedDatabank = storedDatabankBuilder.build();
        storedSequenceInfoChannel.write(ByteBuffer.wrap(storedDatabank.toByteArray()));
        storedSequenceInfoChannel.close();
        dataBankFileChannel.close();
    }

    /** Sorts the files in place by ascending number of bases. */
    private void sortFiles(List<FastaFileInfo> fastaFiles) {
        Collections.sort(fastaFiles, new Comparator<FastaFileInfo>() {
            @Override
            public int compare(final FastaFileInfo o1, final FastaFileInfo o2) {
                // Compare directly instead of subtracting, which could overflow.
                final long first = o1.getQtdBases();
                final long second = o2.getQtdBases();
                if (first < second) {
                    return -1;
                }
                if (first > second) {
                    return 1;
                }
                return 0;
            }
        });
    }

    /**
     * Checks every sub databank.
     *
     * NOTE(review): the sub databank objects are named "Sub_" + i here and in {@link #delete()},
     * but getName() + "_sub_" + i in {@link #load()} - confirm whether the name is relevant.
     *
     * @return <code>false</code> as soon as one sub databank fails its check or raises an
     *         exception, <code>true</code> otherwise
     */
    @Override
    public boolean check() {
        for (int i = 0; i < qtdSubBases; i++) {
            try {
                IndexedSequenceDataBank actualSequenceDatank = newSubDatabank("Sub_" + i, i);
                if (!actualSequenceDatank.check()) {
                    return false;
                }
            } catch (Exception e) {
                // Pass the exception as throwable so that its stack trace is logged.
                logger.fatal("Error while checking sub databank " + i + ".", e);
                return false;
            }
        }
        return true;
    }

    /**
     * Deletes every sub databank. A failure on one sub databank is logged and does not stop the
     * deletion of the remaining ones.
     */
    @Override
    public void delete() {
        for (int i = 0; i < qtdSubBases; i++) {
            try {
                IndexedSequenceDataBank actualSequenceDatank = newSubDatabank("Sub_" + i, i);
                actualSequenceDatank.delete();
            } catch (Exception e) {
                // Pass the exception as throwable so that its stack trace is logged.
                logger.fatal("Error while deleting sub databank " + i + ".", e);
            }
        }
    }

    /**
     * Clears this collection and loads the <code>qtdSubBases</code> sub databanks into it.
     *
     * @return <code>false</code> as soon as one sub databank fails to load, <code>true</code>
     *         otherwise
     */
    @Override
    public boolean load() throws IOException, ValueOutOfBoundsException {
        logger.info("Loading internals databanks");
        long time = System.currentTimeMillis();
        this.clear();

        for (int i = 0; i < qtdSubBases; i++) {
            IndexedSequenceDataBank subDataBank = newSubDatabank(getSubDatabankName(i), i);
            if (!subDataBank.load()) {
                return false;
            }

            try {
                this.addDatabank(subDataBank);
            } catch (DuplicateDatabankException e) {
                // Was logged at INFO level although the message announces a fatal error.
                logger.fatal("Fatal error while loading sub databanks.", e);
            }
            logger.info("Loaded " + (i + 1) + " of " + qtdSubBases + " sub-databanks.");
        }

        logger.info("Databanks loaded in " + (System.currentTimeMillis() - time) + "ms.");
        return true;
    }

    /**
     * Holds a FASTA file together with the number of bases and sequences counted in it when the
     * object was created.
     */
    private static class FastaFileInfo {
        private final File fastaFile;
        private final long qtdBases;
        private final long qtdSequences;

        /**
         * Reads the whole FASTA file once, summing the length of its sequences.
         *
         * @param forceFormatting
         *            when <code>true</code>, sequences that raise an {@link IllegalSymbolException}
         *            are logged and left out of the count; otherwise the exception is propagated
         */
        public FastaFileInfo(File fastaFile, Alphabet alphabet, boolean forceFormatting)
                throws NoSuchElementException, IOException, ParseException, IllegalSymbolException {
            this.fastaFile = fastaFile;
            long bases = 0;
            long sequences = 0;

            BufferedReader is = new BufferedReader(new FileReader(fastaFile));
            try {
                RichSequenceStreamReader readFastaDNA = IOTools.readFasta(is, alphabet);
                logger.info("Reading informations from " + fastaFile);
                while (readFastaDNA.hasNext()) {
                    RichSequence sequence = null;
                    try {
                        sequence = readFastaDNA.nextRichSequence();
                    } catch (IllegalSymbolException e) {
                        if (forceFormatting) {
                            logger.info("Ignoring sequence: " + e.getMessage());
                            continue;
                        } else {
                            throw e;
                        }
                    }
                    bases += sequence.getLength();
                    sequences++;
                }
            } finally {
                // The reader was previously never closed.
                closeQuietly(is);
            }

            this.qtdBases = bases;
            this.qtdSequences = sequences;
        }

        @Override
        public String toString() {
            return this.fastaFile.toString();
        }

        public long getQtdBases() {
            return qtdBases;
        }

        public File getFastaFile() {
            return fastaFile;
        }
    }
}