/*
 * Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br)
 * Copyright (C) 2008,2009 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com)
 *
 * For further information check the LICENSE file.
 */

package bio.pih.genoogle.io;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

import bio.pih.genoogle.encoder.SequenceEncoder;
import bio.pih.genoogle.encoder.SequenceEncoderFactory;
import bio.pih.genoogle.index.IndexConstructionException;
import bio.pih.genoogle.index.ValueOutOfBoundsException;
import bio.pih.genoogle.io.proto.Io.StoredSequence;
import bio.pih.genoogle.io.proto.Io.StoredSequenceInfo;
import bio.pih.genoogle.seq.Alphabet;
import bio.pih.genoogle.seq.AminoAcidAlphabet;
import bio.pih.genoogle.seq.IllegalSymbolException;
import bio.pih.genoogle.seq.Reduced_AA_8_Alphabet;
import bio.pih.genoogle.seq.RichSequence;
import bio.pih.genoogle.seq.SymbolList;
import bio.pih.genoogle.seq.protein.Converter;
import bio.pih.genoogle.util.SymbolListWindowIteratorFactory;

import com.google.protobuf.ByteString;

/**
 * A data bank which indexes its sequences and uses a similar-subsequences index.
 *
 * <p>
 * Each added sequence is converted with {@link Converter#dnaToProtein1(RichSequence)}, stored
 * on disk encoded with {@link #aaEncoder}, and indexed in its reduced amino acid form encoded
 * with {@link #reducedEncoder}.
 *
 * @author albrecht
 */
public class RemoteSimilaritySequenceDataBank extends IndexedSequenceDataBank {

	/**
	 * Minimum length a sequence must have to be stored; shorter sequences are skipped.
	 * NOTE(review): the rationale for the value 8 is not visible in this file - confirm.
	 */
	private static final int MIN_SEQUENCE_LENGTH = 8;

	/** Non-overlapped window iterator factory. Not used inside this class. */
	static SymbolListWindowIteratorFactory factory = SymbolListWindowIteratorFactory.getNotOverlappedFactory();

	/** Encoder for the amino acid alphabet (sub-sequence length 6); used for the stored sequence data. */
	static SequenceEncoder aaEncoder = SequenceEncoderFactory.getEncoder(AminoAcidAlphabet.SINGLETON, 6);

	/** Encoder for the reduced 8-letter amino acid alphabet (sub-sequence length 9); used for indexing. */
	static SequenceEncoder reducedEncoder = SequenceEncoderFactory.getEncoder(Reduced_AA_8_Alphabet.SINGLETON, 9);

	/**
	 * Creates the data bank, passing {@link #reducedEncoder} as the encoder to the superclass.
	 *
	 * @param name              data bank name.
	 * @param alphabet          alphabet that every added sequence must have.
	 * @param subSequenceLength sub-sequence length forwarded to the superclass.
	 * @param path              data bank path forwarded to the superclass.
	 * @param parent            parent collection forwarded to the superclass.
	 * @throws ValueOutOfBoundsException propagated from the superclass constructor.
	 */
	public RemoteSimilaritySequenceDataBank(String name, Alphabet alphabet, int subSequenceLength, File path,
			AbstractDatabankCollection<? extends AbstractSimpleSequenceDataBank> parent)
			throws ValueOutOfBoundsException {
		super(name, alphabet, subSequenceLength, reducedEncoder, null, path, parent);
	}

	/**
	 * Validates the sequence and, if accepted, stores and indexes it.
	 *
	 * @param s                   sequence to add.
	 * @param dataBankFileChannel channel where the stored sequence is written.
	 * @return the information of the stored sequences, or an empty array when the sequence has a
	 *         different alphabet than this data bank or is shorter than
	 *         {@value #MIN_SEQUENCE_LENGTH}.
	 */
	synchronized StoredSequenceInfo[] addSequence(RichSequence s, FileChannel dataBankFileChannel)
			throws IOException, IndexConstructionException, IllegalSymbolException {
		// Sequences from another alphabet are rejected, not converted.
		if (!s.getAlphabet().equals(this.alphabet)) {
			logger.fatal("Invalid symbol in the sequence for sequence " + s.getName()
					+ ". This sequence will be ignored.");
			return new StoredSequenceInfo[] {};
		}

		if (s.getLength() < MIN_SEQUENCE_LENGTH) {
			logger.info(s.getName() + " is too short (" + s.getLength()
					+ ") and will not be stored in this data bank");
			return new StoredSequenceInfo[] {};
		}

		return processReads(s, dataBankFileChannel);
	}

	/**
	 * Stores the reads derived from the sequence. Currently a single read is produced, by
	 * {@link #processRead1(RichSequence, FileChannel)}.
	 *
	 * @return a one-element array with the stored sequence information.
	 */
	private StoredSequenceInfo[] processReads(RichSequence s, FileChannel dataBankFileChannel)
			throws IOException, IndexConstructionException, IllegalSymbolException {
		return new StoredSequenceInfo[] { processRead1(s, dataBankFileChannel) };
	}

	/**
	 * Converts the sequence with {@link Converter#dnaToProtein1(RichSequence)} and stores the
	 * result.
	 */
	private StoredSequenceInfo processRead1(RichSequence s, FileChannel dataBankFileChannel)
			throws IOException, IndexConstructionException, IllegalSymbolException {
		SymbolList protein = Converter.dnaToProtein1(s);
		return storeInDatabase(s, protein, dataBankFileChannel);
	}

	/**
	 * Writes the converted sequence to the channel, indexes its reduced amino acid form and
	 * updates the data bank counters.
	 *
	 * @param s                   original sequence, source of the stored metadata (gi, name, type,
	 *                            accession, description).
	 * @param converted           converted symbols that are encoded and stored.
	 * @param dataBankFileChannel channel the serialized record is written to, at its current position.
	 * @return the id, the channel offset where the record starts, and the record length in bytes.
	 */
	private StoredSequenceInfo storeInDatabase(RichSequence s, SymbolList converted, FileChannel dataBankFileChannel)
			throws IOException, IndexConstructionException, IllegalSymbolException {
		// The offset must be read before writing: it is where this record begins.
		long offset = dataBankFileChannel.position();

		final byte[] encodedBytes = intArrayToByteArray(converted);
		int id = getNextSequenceId();

		StoredSequence.Builder builder = StoredSequence.newBuilder()
				.setId(id)
				.setGi(s.getGi())
				.setName(s.getName())
				.setType(s.getType())
				.setAccession(s.getAccession())
				.setDescription(s.getDescription())
				.setEncodedSequence(ByteString.copyFrom(encodedBytes));

		StoredSequence storedSequence = builder.build();
		byte[] byteArray = storedSequence.toByteArray();

		// A single write() call is allowed to write fewer bytes than requested; loop until the
		// whole record is written so the length recorded below matches the bytes on disk.
		ByteBuffer recordBuffer = ByteBuffer.wrap(byteArray);
		while (recordBuffer.hasRemaining()) {
			dataBankFileChannel.write(recordBuffer);
		}

		// The index is built over the reduced alphabet, not over the stored encoding.
		SymbolList reducedAA = Converter.proteinToReducedAA(converted);
		int[] reducedEncoded = reducedEncoder.encodeSymbolListToIntegerArray(reducedAA);
		doSequenceProcessing(id, reducedEncoded);

		this.numberOfSequences++;
		this.dataBankSize += converted.getLength();

		return StoredSequenceInfo.newBuilder()
				.setId(id)
				.setOffset(offset)
				.setLength(byteArray.length)
				.build();
	}

	/**
	 * Encodes the symbol list with {@link #aaEncoder} and serializes each resulting int as four
	 * bytes, in the default (big-endian) order of {@link ByteBuffer}.
	 *
	 * @param s symbols to encode.
	 * @return the encoded sequence as bytes, four per encoded int.
	 */
	@Override
	protected byte[] intArrayToByteArray(SymbolList s) {
		int[] encoded = aaEncoder.encodeSymbolListToIntegerArray(s);

		ByteBuffer byteBuf = ByteBuffer.allocate(encoded.length * 4);
		for (int value : encoded) {
			byteBuf.putInt(value);
		}

		return byteBuf.array();
	}

	/**
	 * Adds the encoded sequence to the index.
	 *
	 * @param sequenceId      id of the sequence being indexed.
	 * @param encodedSequence the encoded sequence.
	 * @return the value of {@link SequenceEncoder#getSequenceLength(int[])} for the encoded sequence.
	 */
	@Override
	public int doSequenceProcessing(int sequenceId, int[] encodedSequence)
			throws IndexConstructionException, IllegalSymbolException {
		int size = SequenceEncoder.getSequenceLength(encodedSequence);
		indexBuilder.addSequence(sequenceId, encodedSequence);
		return size;
	}

	/**
	 * @return the sub-sequence length of {@link #reducedEncoder}.
	 */
	@Override
	public int getSubSequencesOffset() {
		return reducedEncoder.getSubSequenceLength();
	}

	/**
	 * @return the encoder used for the stored amino acid sequences.
	 */
	public SequenceEncoder getAaEncoder() {
		return aaEncoder;
	}

	/**
	 * @return the encoder used for the reduced amino acid sequences that are indexed.
	 */
	public SequenceEncoder getReducedEncoder() {
		return reducedEncoder;
	}
}