/******************************************************************************* * GenPlay, Einstein Genome Analyzer * Copyright (C) 2009, 2014 Albert Einstein College of Medicine * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu> * Nicolas Fourel <nicolas.fourel@einstein.yu.edu> * Eric Bouhassira <eric.bouhassira@einstein.yu.edu> * * Website: <http://genplay.einstein.yu.edu> ******************************************************************************/ package edu.yu.einstein.genplay.core.IO.extractor; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.List; import edu.yu.einstein.genplay.core.IO.dataReader.NucleotideReader; import edu.yu.einstein.genplay.core.manager.project.ProjectChromosomes; import edu.yu.einstein.genplay.core.manager.project.ProjectManager; import edu.yu.einstein.genplay.dataStructure.chromosome.Chromosome; import edu.yu.einstein.genplay.dataStructure.enums.AlleleType; import edu.yu.einstein.genplay.dataStructure.enums.Nucleotide; import edu.yu.einstein.genplay.dataStructure.list.chromosomeWideList.nucleotideListView.twoBitListView.TwoBitListView; import edu.yu.einstein.genplay.dataStructure.list.genomeWideList.GenomicListView; import edu.yu.einstein.genplay.dataStructure.list.listView.ListView; import edu.yu.einstein.genplay.exception.exceptions.InvalidFileTypeException; import edu.yu.einstein.genplay.gui.statusBar.Stoppable; /** * Extract the data from a 2bit file. * This extractor is used to create {@link GenomicListView} nucleotide * @author Julien Lajugie */ public class TwoBitExtractor extends Extractor implements Stoppable, NucleotideReader { /** Default first base position of bed files. 2bit files are 0-based */ public static final int DEFAULT_FIRST_BASE_POSITION = 0; /** Signature of a 2bit file */ private final static String TWOBIT_SIGNATURE = "1A412743"; /** Set to true if the execution of the extractor needs to be stopped */ private boolean isStopped = false; /** Position of the first base */ private int firstBasePosition = DEFAULT_FIRST_BASE_POSITION; /** Genome name for a multi genome project */ private String genomeName; /** Allele type for a multi genome project */ private AlleleType alleleType; /** Each element of this list read a chromosome in the file */ private final List<ListView<Nucleotide>> data; /** 2bit random access file */ private RandomAccessFile twoBitFile; /** Path to the 2bit file (used for the serialization) */ private String filePath; /** True if the order of the bytes of multi-bytes entities need to be reversed */ private boolean reverseBytes; /** * Creates an instance of {@link TwoBitExtractor} * @param dataFile 2Bit file */ public TwoBitExtractor(File dataFile) { super(dataFile); data = new ArrayList<ListView<Nucleotide>>(); ProjectChromosomes projectChromosomes = ProjectManager.getInstance().getProjectChromosomes(); for (int i = 0; i < projectChromosomes.size(); i++) { data.add(null); } } /** * Extracts the sequence list from a 2bit file * @param genomeName the genome name for a multi genome project * @param alleleType the allele type for a multi genome project * @throws FileNotFoundException * @throws IOException * @throws InvalidFileTypeException * @throws InterruptedException */ public void extract(String genomeName, AlleleType alleleType) throws FileNotFoundException, IOException, InvalidFileTypeException, InterruptedException { this.genomeName = genomeName; this.alleleType = alleleType; // true if the bytes of multi-byte entities need to be reversed when read reverseBytes = false; filePath = getDataFile().getAbsolutePath(); twoBitFile = new RandomAccessFile(getDataFile(), "r"); twoBitFile.seek(0); int signature = twoBitFile.readInt(); // if the signature is not equal to the signature defined in the 2bit files // it might means that the byte order need to be reversed if (!Integer.toHexString(signature).equalsIgnoreCase(TWOBIT_SIGNATURE)) { signature = Integer.reverseBytes(signature); // check if it matches with the bytes reversed if (Integer.toHexString(signature).equalsIgnoreCase(TWOBIT_SIGNATURE)) { // if it matches, turns the reverse mode on reverseBytes = true; } else { // if it doesn't the file is not correct throw new InvalidFileTypeException(); } } if (reverseBytes) { // read the version of the 2bit file Integer.reverseBytes(twoBitFile.readInt()); } else { twoBitFile.readInt(); } int sequenceCount = 0; if (reverseBytes) { sequenceCount = Integer.reverseBytes(twoBitFile.readInt()); } else { sequenceCount = twoBitFile.readInt(); } // skip 4 reserved bytes twoBitFile.skipBytes(4); String[] sequenceNames = new String[sequenceCount]; int[] offsets = new int[sequenceCount]; for (int i = 0; i < sequenceCount; i++) { // if the execution need to be stopped we generate an InterruptedException if (isStopped) { throw new InterruptedException(); } byte sequenceNameSize = twoBitFile.readByte(); byte[] sequenceNameBytes = new byte[sequenceNameSize]; twoBitFile.read(sequenceNameBytes); sequenceNames[i] = new String(sequenceNameBytes); if (reverseBytes) { offsets[i] = Integer.reverseBytes(twoBitFile.readInt()); } else { offsets[i] = twoBitFile.readInt(); } } // we add the sequence to the list if the chromosome is specified in the ChromosomeManager for (int i = 0; i < sequenceCount; i++) { short k = 0; boolean found = false; ProjectChromosomes projectChromosomes = ProjectManager.getInstance().getProjectChromosomes(); while ((k < projectChromosomes.size()) && (!found)) { if (projectChromosomes.get(k).getName().equalsIgnoreCase(sequenceNames[i])) { // if the execution need to be stopped we generate an InterruptedException if (isStopped) { throw new InterruptedException(); } long currentPosition = twoBitFile.getFilePointer(); Chromosome chromosome = projectChromosomes.get(k); data.set(k, extractChromosome(chromosome, offsets[i])); twoBitFile.seek(currentPosition); found = true; } k++; } } } /** * Extracts the information for a chromosome and create a {@link ListView} of {@link Nucleotide} objects * from the data extracted. * @param chromosome chromosome to extract * @param offset offset of the beginning of the section of the chromosome to extract in the random file * @return A {@link ListView} of {@link Nucleotide} objects from the data extracted * @throws IOException * @throws InterruptedException */ private ListView<Nucleotide> extractChromosome(Chromosome chromosome, int offset) throws IOException, InterruptedException { int dnaSize; int[] nBlockStarts; int[] nBlockSizes; twoBitFile.seek(offset); if (reverseBytes) { dnaSize = Integer.reverseBytes(twoBitFile.readInt()); } else { dnaSize = twoBitFile.readInt(); } int nBlockCount = 0; if (reverseBytes) { nBlockCount = Integer.reverseBytes(twoBitFile.readInt()); } else { nBlockCount = twoBitFile.readInt(); } nBlockStarts = new int[nBlockCount]; for (int i = 0; i < nBlockCount; i++) { // if the execution need to be stopped we generate an InterruptedException if (isStopped) { throw new InterruptedException(); } if (reverseBytes) { nBlockStarts[i] = Integer.reverseBytes(twoBitFile.readInt()); } else { nBlockStarts[i] = twoBitFile.readInt(); } } nBlockSizes = new int[nBlockCount]; for (int i = 0; i < nBlockCount; i++) { // if the execution need to be stopped we generate an InterruptedException if (isStopped) { throw new InterruptedException(); } if (reverseBytes) { nBlockSizes[i] = Integer.reverseBytes(twoBitFile.readInt()); } else { nBlockSizes[i] = twoBitFile.readInt(); } } int maskBlockCount = 0; if (reverseBytes) { maskBlockCount = Integer.reverseBytes(twoBitFile.readInt()); } else { maskBlockCount = twoBitFile.readInt(); } int headerSize = 8 * (nBlockCount + maskBlockCount + 2); ListView<Nucleotide> lv = new TwoBitListView(filePath, headerSize, offset, dnaSize, nBlockStarts, nBlockSizes, genomeName, alleleType, chromosome, twoBitFile); return lv; } /** * @return The data extracted from the 2bit file. */ public List<ListView<Nucleotide>> getExtractedData() { return data; } @Override public int getFirstBasePosition() { return firstBasePosition; } /** * @return True if the order of the bytes of multi-bytes entities need to be reversed */ public boolean needToReverseBytes() { return reverseBytes; } @Override protected String retrieveDataName(File dataFile) { return dataFile.getName(); } @Override public void setFirstBasePosition(int firstBasePosition) { this.firstBasePosition = firstBasePosition; } @Override public void stop() { isStopped = true; } }