/******************************************************************************* * GenPlay, Einstein Genome Analyzer * Copyright (C) 2009, 2014 Albert Einstein College of Medicine * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu> * Nicolas Fourel <nicolas.fourel@einstein.yu.edu> * Eric Bouhassira <eric.bouhassira@einstein.yu.edu> * * Website: <http://genplay.einstein.yu.edu> ******************************************************************************/ package edu.yu.einstein.genplay.core.IO.extractor; import java.io.File; import java.io.FileNotFoundException; import edu.yu.einstein.genplay.core.IO.dataReader.SCWReader; import edu.yu.einstein.genplay.core.IO.dataReader.StrandReader; import edu.yu.einstein.genplay.core.IO.utils.DataLineValidator; import edu.yu.einstein.genplay.core.IO.utils.Extractors; import edu.yu.einstein.genplay.core.IO.utils.StrandedExtractorOptions; import edu.yu.einstein.genplay.dataStructure.chromosome.Chromosome; import edu.yu.einstein.genplay.dataStructure.chromosomeWindow.SimpleChromosomeWindow; import edu.yu.einstein.genplay.dataStructure.enums.Strand; import edu.yu.einstein.genplay.exception.exceptions.DataLineException; import edu.yu.einstein.genplay.exception.exceptions.InvalidChromosomeException; /** * A Eland Extended file extractor * @author Julien Lajugie */ public final class ElandExtendedExtractor extends TextFileExtractor implements SCWReader, StrandReader, StrandedExtractor { /** Default first base position of bed files. Eland files are 0-based */ public static final int DEFAULT_FIRST_BASE_POSITION = 1; private int firstBasePosition = DEFAULT_FIRST_BASE_POSITION;// position of the first base private StrandedExtractorOptions strandOptions; // options on the strand and read length / shift private Chromosome chromosome; // chromosome of the last item read private Integer start; // start position of the last item read private Integer stop; // stop position of the last item read private Float score; // score of the last item read private Strand strand; // strand of the last item read private final int[][] matchTypeCount; // number of lines with 0,1,2 mistakes per chromosome private int NMCount = 0; // Non matched line count private int QCCount = 0; // quality control line count private int multiMatchCount = 0;// multi-match line count /** * Creates an instance of {@link ElandExtendedExtractor} * @param dataFile file containing the data * @throws FileNotFoundException if the specified file is not found */ public ElandExtendedExtractor(File dataFile) throws FileNotFoundException { super(dataFile); matchTypeCount = new int[getProjectChromosome().size()][3]; for(short i = 0; i < getProjectChromosome().size(); i++) { for(short j = 0; j < 3; j++) { matchTypeCount[i][j] = 0; } } } @Override protected int extractDataLine(String extractedLine) throws DataLineException { chromosome = null; start = null; stop = null; score = null; byte[] line = extractedLine.getBytes(); byte[] matchChar = new byte[4]; byte[] chromoChar = new byte[64]; byte[] positionChar = new byte[10]; short match0MNumber, match1MNumber, match2MNumber, chromoNumber; if (line[0] == '\0') { throw new DataLineException("Null character found at the beginning of the line."); } // skip first field int i = 0; while (line[i] != '\t') { i++; } // skip second field i++; while (line[i] != '\t') { i++; } // try to extract the number of match 0M i++; int j = 0; while ((line[i] != '\t') && (line[i] != ':')) { matchChar[j] = line[i]; i++; j++; } // case where we don't found a match if (line[i] == '\t') { if (matchChar[0] == 'N') { NMCount++; } else if (matchChar[0] == 'Q') { QCCount++; } throw new DataLineException("No match found for: " + matchChar[0]); } match0MNumber = Short.parseShort(new String(matchChar, 0, j)); // try to extract the number of match 1M i++; j = 0; while (line[i] != ':') { matchChar[j] = line[i]; i++; j++; } match1MNumber = Short.parseShort(new String(matchChar, 0, j)); // try to extract the number of match 2M i++; j = 0; while (line[i] != '\t') { matchChar[j] = line[i]; i++; j++; } match2MNumber = Short.parseShort(new String(matchChar, 0, j)); // we only want lines that correspond to our criteria if ((match0MNumber + match1MNumber + match2MNumber) != 1) { multiMatchCount++; throw new DataLineException("The line does not match the criteria: " + match0MNumber + " + " + match1MNumber + " + " + match2MNumber + " != 1"); } while ((i < line.length) && (line[i] != '.')) { chromoChar[j] = line[i]; i++; j++; } // if we reach the end of the line now there is no data to extract if (i == line.length) { throw new DataLineException("End of the line reached, no data to extract."); } // chromosome String chromosomeName = new String(chromoChar, 0, j).trim(); if (getChromosomeSelector() != null) { // case where last chromosome already extracted, no more data to extract if (getChromosomeSelector().isExtractionDone(chromosomeName)) { return EXTRACTION_DONE; } // chromosome was not selected for extraction if (!getChromosomeSelector().isSelected(chromosomeName)) { return LINE_SKIPPED; } } try { chromosome = getProjectChromosome().get(chromosomeName) ; } catch (InvalidChromosomeException e) { // unknown chromosome return LINE_SKIPPED; } chromoNumber = (short) getProjectChromosome().getIndex(chromosomeName); // try to extract the position number i+=4; // we want to get rid of 'fa:' j = 0; while ((line[i] != 'F') && (line[i] != 'R')) { positionChar[j] = line[i]; i++; j++; } // retrieve the strand char strandChar = (char) (line[i] & 0xFF); // because byte goes from -128 to 127 and char from 0 to 255 strand = Strand.get(strandChar); if ((strand != null) && (strandOptions != null) && (!strandOptions.isSelected(strand))) { chromosome = null; return LINE_SKIPPED; } start = Extractors.getInt(new String(positionChar, 0, j)); stop = start; String errors = DataLineValidator.getErrors(chromosome, start, stop); if (!errors.isEmpty()) { throw new DataLineException(errors); } // Stop position checking, must not be greater than the chromosome length String stopEndErrorMessage = DataLineValidator.getErrors(chromosome, stop); if (!stopEndErrorMessage.isEmpty()) { DataLineException stopEndException = new DataLineException(stopEndErrorMessage, DataLineException.SHRINK_STOP_PROCESS); // notify the listeners that the stop position needed to be shrunk notifyDataEventListeners(stopEndException, getCurrentLineNumber(), extractedLine); stop = chromosome.getLength(); } // compute the read position with specified strand shift and read length if (strandOptions != null) { SimpleChromosomeWindow resultStartStop = strandOptions.computeStartStop(chromosome, start, stop, strand); start = resultStartStop.getStart(); stop = resultStartStop.getStop(); } // if we are in a multi-genome project, we compute the position on the meta genome start = getRealGenomePosition(chromosome, start); stop = getRealGenomePosition(chromosome, stop); // add data for the statistics matchTypeCount[chromoNumber][0] += match0MNumber; matchTypeCount[chromoNumber][1] += match1MNumber; matchTypeCount[chromoNumber][2] += match2MNumber; return ITEM_EXTRACTED; } @Override public Chromosome getChromosome() { return chromosome; } @Override public int getFirstBasePosition() { return firstBasePosition; } /** * @return the number of lines with 0,1,2 mistakes per chromosome */ public int[][] getMatchTypeCount() { return matchTypeCount; } /** * @return the count of multi-match lines */ public int getMultiMatchCount() { return multiMatchCount; } /** * @return the count of non-matched lines */ public int getNMCount() { return NMCount; } /** * @return the count of quality control lines */ public int getQCCount() { return QCCount; } @Override public Float getScore() { return score; } @Override public Integer getStart() { return start; } @Override public Integer getStop() { return stop; } @Override public Strand getStrand() { return strand; } @Override public StrandedExtractorOptions getStrandedExtractorOptions() { return strandOptions; } @Override public void setFirstBasePosition(int firstBasePosition) { this.firstBasePosition = firstBasePosition; } @Override public void setStrandedExtractorOptions(StrandedExtractorOptions options) { strandOptions = options; } }