/*******************************************************************************
* GenPlay, Einstein Genome Analyzer
* Copyright (C) 2009, 2014 Albert Einstein College of Medicine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu>
* Nicolas Fourel <nicolas.fourel@einstein.yu.edu>
* Eric Bouhassira <eric.bouhassira@einstein.yu.edu>
*
* Website: <http://genplay.einstein.yu.edu>
******************************************************************************/
package edu.yu.einstein.genplay.dataStructure.list.chromosomeWideList.nucleotideListView.twoBitListView;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import edu.yu.einstein.genplay.core.manager.project.ProjectManager;
import edu.yu.einstein.genplay.core.multiGenome.data.synchronization.MGSOffset;
import edu.yu.einstein.genplay.core.multiGenome.utils.FormattedMultiGenomeName;
import edu.yu.einstein.genplay.core.multiGenome.utils.ShiftCompute;
import edu.yu.einstein.genplay.dataStructure.chromosome.Chromosome;
import edu.yu.einstein.genplay.dataStructure.enums.AlleleType;
import edu.yu.einstein.genplay.dataStructure.enums.Nucleotide;
import edu.yu.einstein.genplay.dataStructure.list.listView.AbstractListView;
import edu.yu.einstein.genplay.dataStructure.list.listView.ListView;
/**
* This class provides the representation of a sequence from a .2bit file as described
* in the help file of the UCSC Genome Browser: http://genome.ucsc.edu/FAQ/FAQformat.html#format7
* @author Julien Lajugie
*/
public final class TwoBitListView extends AbstractListView<Nucleotide> implements ListView<Nucleotide>, Serializable {

    /** Generated Serial ID */
    private static final long serialVersionUID = -4820838292720902481L;

    /** Version number of the class */
    private static final int CLASS_VERSION_NUMBER = 0;

    /** Maximum number of nucleotides kept in the cache. Should be a multiple of 4 */
    private static final int CACHED_REGION_SIZE = 800000;

    /** Number of nucleotides packed in one byte of sequence data (2 bits each) */
    private static final int NUCLEOTIDES_PER_BYTE = 4;

    /** 2bit random access file (transient: not serialized, see {@link #reinitDataFile()}) */
    private transient RandomAccessFile raf;

    /** Path to the 2bit file (used for the serialization) */
    private final String filePath;

    /** The size in byte of the header of the sequence */
    private final int headerSize;

    /** The offset of the sequence data relative to the start of the file */
    private final int offset;

    /** Number of bases of DNA in the sequence */
    private final int dnaSize;

    /** The starting position for each block of Ns */
    private final int[] nBlockStarts;

    /** The length for each block of Ns */
    private final int[] nBlockSizes;

    /** Genome name for a multi genome project */
    private final String genomeName;

    /** Allele type for a multi genome project */
    private final AlleleType alleleType;

    /** Chromosome of the current list */
    private final Chromosome chromosome;

    /** 1-based position of the first nucleotide of the cached region (0 until a region is loaded) */
    private transient int cachedRegionStart;

    /** Nucleotides of the cached region; element 0 is the nucleotide at {@link #cachedRegionStart} */
    private transient List<Nucleotide> cachedRegion;


    /**
     * Creates an instance of {@link TwoBitListView}
     * @param filePath path to the 2bit file (used for the serialization)
     * @param headerSize the size in byte of the header of the sequence
     * @param offset the offset of the sequence data relative to the start of the file
     * @param dnaSize the number of bases of DNA in the sequence
     * @param nBlockStarts the starting position for each block of Ns
     * @param nBlockSizes the length for each block of Ns
     * @param genomeName The genome name for a multi genome project
     * @param alleleType the allele type for a multi genome project
     * @param chromosome the chromosome of the current list
     * @param raf {@link RandomAccessFile} used to read the sequence data
     */
    public TwoBitListView(
            String filePath,
            int headerSize,
            int offset,
            int dnaSize,
            int[] nBlockStarts,
            int[] nBlockSizes,
            String genomeName,
            AlleleType alleleType,
            Chromosome chromosome,
            RandomAccessFile raf
            ) {
        super();
        this.filePath = filePath;
        this.headerSize = headerSize;
        this.offset = offset;
        this.dnaSize = dnaSize;
        this.nBlockStarts = nBlockStarts;
        this.nBlockSizes = nBlockSizes;
        this.genomeName = genomeName;
        this.alleleType = alleleType;
        this.chromosome = chromosome;
        this.raf = raf;
        cachedRegion = new ArrayList<Nucleotide>(CACHED_REGION_SIZE);
    }


    /**
     * Creates a new instance of {@link TwoBitListView} similar to the
     * one in parameter but reading the specified file instead. This constructor
     * is handy when the file path has been modified.
     * @param listView list view whose sequence description is copied
     * @param file 2bit file containing the sequences
     * @throws FileNotFoundException if the specified file cannot be opened for reading
     */
    public TwoBitListView(TwoBitListView listView, File file) throws FileNotFoundException {
        // note that we can't just modify the file path of a TwoBitListView object because its fields are final
        filePath = file.getAbsolutePath();
        headerSize = listView.headerSize;
        offset = listView.offset;
        dnaSize = listView.dnaSize;
        nBlockStarts = listView.nBlockStarts;
        nBlockSizes = listView.nBlockSizes;
        genomeName = listView.genomeName;
        alleleType = listView.alleleType;
        chromosome = listView.chromosome;
        cachedRegion = new ArrayList<Nucleotide>(CACHED_REGION_SIZE);
        reinitDataFile();
    }


    /**
     * Loads in cache the region surrounding the specified position.
     * The region is at most {@link #CACHED_REGION_SIZE} nucleotides long and never
     * extends past the end of the sequence, so that only bytes belonging to this
     * sequence are read from the file.
     * If the file cannot be read the cache is left empty.
     * @param position 1-based position that needs to be covered by the cache
     */
    private void cacheRegion(int position) {
        cachedRegion.clear();

        // 0-based start of the region, roughly centered on the requested position
        int regionStart = position - (CACHED_REGION_SIZE / 2);
        regionStart -= regionStart % NUCLEOTIDES_PER_BYTE; // make sure we start on a byte boundary
        regionStart = Math.max(regionStart, 0); // cannot be negative
        // positions handled by get(int) are 1-based
        cachedRegionStart = regionStart + 1;

        // never read past the end of the sequence: the bytes that follow belong to
        // another sequence, or don't exist at all if this sequence is the last of the file
        int regionLength = Math.min(CACHED_REGION_SIZE, dnaSize - regionStart);
        if (regionLength <= 0) {
            return;
        }
        int byteCount = ((regionLength + NUCLEOTIDES_PER_BYTE) - 1) / NUCLEOTIDES_PER_BYTE;
        byte[] packedNucleotides = new byte[byteCount];
        // computed as a long so the sum cannot overflow
        long fileOffset = (long) offset + headerSize + (regionStart / NUCLEOTIDES_PER_BYTE);
        try {
            raf.seek(fileOffset);
            raf.readFully(packedNucleotides);
        } catch (IOException e) {
            // best effort: the cache stays empty and get(int) returns null for this region
            return;
        }

        for (int i = 0; i < regionLength; i++) {
            if (isInNBlock(regionStart + i)) {
                cachedRegion.add(Nucleotide.ANY);
            } else {
                // the first nucleotide of a byte is stored in its two most significant bits
                int shift = 2 * ((NUCLEOTIDES_PER_BYTE - 1) - (i % NUCLEOTIDES_PER_BYTE));
                int twoBitCode = (packedNucleotides[i / NUCLEOTIDES_PER_BYTE] >> shift) & 0x3;
                cachedRegion.add(Nucleotide.get((byte) twoBitCode));
            }
        }
    }


    /**
     * Returns the {@link Nucleotide} at the specified position.
     * In a multi genome project the position is first converted using {@link ShiftCompute}.
     * @param position 1-based position on the sequence
     * @return the nucleotide at the specified position, {@link Nucleotide#BLANK} if the position
     * is missing in a multi genome project, or null if the position is outside of the sequence
     * or if the sequence data could not be read
     */
    @Override
    public Nucleotide get(int position) {
        if (ProjectManager.getInstance().isMultiGenomeProject()) {
            position = ShiftCompute.getPosition(FormattedMultiGenomeName.META_GENOME_NAME, alleleType, position, chromosome, genomeName);
            if (position == MGSOffset.MISSING_POSITION_CODE) {
                return Nucleotide.BLANK;
            }
        }
        if ((position <= 0) || (position > dnaSize)) {
            return null;
        }
        if ((position < cachedRegionStart) || (position >= (cachedRegionStart + cachedRegion.size()))) {
            cacheRegion(position);
        }
        // both position and cachedRegionStart are 1-based
        int index = position - cachedRegionStart;
        if ((index >= 0) && (index < cachedRegion.size())) {
            return cachedRegion.get(index);
        }
        // the region could not be loaded
        return null;
    }


    /**
     * @return the dnaSize of the sequence
     */
    public final int getDnaSize() {
        return dnaSize;
    }


    /**
     * @return the headerSize of the sequence
     */
    public final int getHeaderSize() {
        return headerSize;
    }


    /**
     * @return the nBlockSizes of the sequence (the internal array, not a copy)
     */
    public final int[] getnBlockSizes() {
        return nBlockSizes;
    }


    /**
     * @return the nBlockStarts of the sequence (the internal array, not a copy)
     */
    public final int[] getnBlockStarts() {
        return nBlockStarts;
    }


    /**
     * @return the offset of the sequence
     */
    public final int getOffset() {
        return offset;
    }


    /**
     * Note: this method can be optimized.
     * The scan stops at the first block starting after the specified position.
     * @param position 0-based position on the sequence
     * @return true if the specified position is in a N block
     */
    private boolean isInNBlock(int position) {
        int i = 0;
        while ((i < nBlockStarts.length) && (nBlockStarts[i] <= position)) {
            if (position < (nBlockStarts[i] + nBlockSizes[i])) {
                return true;
            }
            i++;
        }
        return false;
    }


    /**
     * Method used for unserialization.
     * The random access file is transient and is not reopened here;
     * {@link #reinitDataFile()} opens it from the serialized file path.
     * @param in stream to read the object from
     * @throws IOException
     * @throws ClassNotFoundException
     */
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        // read the class version number
        in.readInt();
        // read the final fields
        in.defaultReadObject();
        cachedRegion = new ArrayList<Nucleotide>(CACHED_REGION_SIZE);
    }


    /**
     * Reinitialize the reader by opening the file at the stored path in read-only mode
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    public void reinitDataFile() throws FileNotFoundException {
        raf = new RandomAccessFile(new File(filePath), "r");
    }


    /**
     * Returns the number of nucleotides
     */
    @Override
    public int size() {
        return dnaSize;
    }


    /**
     * Method used for serialization
     * @param out stream to write the object to
     * @throws IOException
     */
    private void writeObject(ObjectOutputStream out) throws IOException {
        // write the class version number
        out.writeInt(CLASS_VERSION_NUMBER);
        // write the final fields
        out.defaultWriteObject();
    }
}