/* * Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br) * Copyright (C) 2008,2009 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com) * * For further information check the LICENSE file. */ package bio.pih.genoogle.io; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.NoSuchElementException; import bio.pih.genoogle.encoder.SequenceEncoder; import bio.pih.genoogle.encoder.SequenceEncoderFactory; import bio.pih.genoogle.index.IndexConstructionException; import bio.pih.genoogle.index.ValueOutOfBoundsException; import bio.pih.genoogle.io.proto.Io.StoredDatabank; import bio.pih.genoogle.io.proto.Io.StoredDatabank.SequenceType; import bio.pih.genoogle.io.reader.ParseException; import bio.pih.genoogle.seq.Alphabet; import bio.pih.genoogle.seq.DNAAlphabet; import bio.pih.genoogle.seq.IllegalSymbolException; /** * This abstract class which specifies the ways to access a sequences data banks. The methods are * divided into 3 classes: general information like name and path, add a sequence file, a single * sequence or a collection of them and sync these data and for last, and some way the most * important, do searches. * * @author albrecht */ public abstract class AbstractSequenceDataBank { protected final String name; protected final Alphabet alphabet; protected final int subSequenceLength; protected final SequenceEncoder encoder; protected int numberOfSequences; protected long dataBankSize; protected int lowComplexityFilter = -1; protected final File path; protected final AbstractDatabankCollection<? extends AbstractSimpleSequenceDataBank> parent; protected AbstractSequenceDataBank(String name, Alphabet alphabet, int subSequenceLength, File path, AbstractDatabankCollection<? extends AbstractSimpleSequenceDataBank> parent) { this.name = name; this.alphabet = alphabet; this.subSequenceLength = subSequenceLength; this.encoder = SequenceEncoderFactory.getEncoder(alphabet, subSequenceLength); this.path = path; this.parent = parent; this.numberOfSequences = 0; this.dataBankSize = 0; } /** * The name is related with the files names too. * * @return the name of this sequence bank. */ public String getName() { return name; } /** * @return the file name and directory where is this SequenceDataBank. It should used at run * time and not formating db time. */ public File getFilePath() { return getFilePath(false); } /** * @param formating * informs if it was called during formating db time. * @return the file name and directory where is this SequenceDataBank. */ public File getFilePath(boolean formating) { return path; } /** * @return the file name and directory where is this SequenceDataBank considering its parent. It * should used at run time and not formating db time. */ public File getFullPath() { return getFullPath(false); } /** * @param formating * informs if it was called during formating db time. * @return the file name and directory where is this SequenceDataBank considering its parent. */ public File getFullPath(boolean formating) { if (getParent() == null) { return getFilePath(formating); } else { return new File(getParent().getFullPath(formating), this.getFilePath(formating).getPath()); } } /** * @return the number of sequences stored in this SequenceDataBank */ abstract public int getNumberOfSequences(); /** * @return the total number of sequences stored in this SequenceDataBank and all siblings. <b>To * calculate statistics, this value should be used</b>. */ abstract public long getTotalNumberOfSequences(); /** * @return the number of nucleotides (DNA) or amino acids (Protein) stored in this * SequenceDataBank. */ abstract public long getDataBaseSize(); /** * @return the number of bases stored in this SequenceDataBank and all siblings. <b>This value * should be used to calculate statistics, </b>. */ abstract public long getTotalDataBaseSize(); /** * @return the {@link Alphabet} of the sequences of this sequence bank. */ public Alphabet getAlphabet() { return alphabet; } /** * Add a fasta formated sequence collection into the SequenceBank. * * @param fastaFile * file which contains the sequences * @param forceFormatting * <code>true</code> if it should continue the formatting process even some sequence * has invalid character. This sequences will be ignored. */ abstract public void addFastaFile(File fastaFile, boolean forceFormatting) throws FileNotFoundException, NoSuchElementException, IOException, IndexConstructionException, ParseException, IllegalSymbolException; /** * Load this sequence bank * * @return <true> if the data bank was loaded correctly, or <code>false</code> otherwise. */ abstract public boolean load() throws IOException, ValueOutOfBoundsException; /** * Encode the sequences into a computer legible mode * * @param forceFormatting * continues if some sequence had invalid character. This invalid sequence will be * ignored. */ abstract public void encodeSequences(boolean forceFormatting) throws IOException, NoSuchElementException, ValueOutOfBoundsException, IndexConstructionException, ParseException, IllegalSymbolException; /** * @return the parent of this {@link AbstractSequenceDataBank} or <code>null</code> if it do not * have parent */ protected AbstractSequenceDataBank getParent() { return parent; } public AbstractSequenceDataBank getAbsolutParent() { AbstractSequenceDataBank db = this; while (db.getParent() != null) { db = db.getParent(); } return db; } /** * @return <code>true</code> if the data bank files and its data are okay. This method do * <b>not</b> check file consistency. */ abstract public boolean check(); /** * @return {@link DNASequenceEncoderToInteger} witch is responsible to encode the sequences in * this data bank. */ public SequenceEncoder getEncoder() { return encoder; } /** * @return length of the sub sequences stored in this data bank. */ public int getSubSequenceLength() { return subSequenceLength; } /** * Delete all file informations of this data bank. */ abstract public void delete(); public void setLowComplexityFilter(int lowComplexityFilter) { this.lowComplexityFilter = lowComplexityFilter; } public int getLowComplexityFilter() { if (lowComplexityFilter == -1 && parent != null) { return parent.getLowComplexityFilter(); } return lowComplexityFilter; } protected void setStoredDatabankInfo(StoredDatabank.Builder storedDatabankBuilder) { storedDatabankBuilder.setQtdSequences(numberOfSequences); storedDatabankBuilder.setQtdBases(dataBankSize); storedDatabankBuilder.setSubSequenceLength(subSequenceLength); if (alphabet == DNAAlphabet.SINGLETON) { storedDatabankBuilder.setType(SequenceType.DNA); } else { storedDatabankBuilder.setType(SequenceType.RNA); } } }