/*******************************************************************************
* GenPlay, Einstein Genome Analyzer
* Copyright (C) 2009, 2014 Albert Einstein College of Medicine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu>
* Nicolas Fourel <nicolas.fourel@einstein.yu.edu>
* Eric Bouhassira <eric.bouhassira@einstein.yu.edu>
*
* Website: <http://genplay.einstein.yu.edu>
******************************************************************************/
package edu.yu.einstein.genplay.core.IO.extractor;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Map;
import edu.yu.einstein.genplay.core.IO.dataReader.GeneReader;
import edu.yu.einstein.genplay.core.IO.dataReader.StrandReader;
import edu.yu.einstein.genplay.core.IO.utils.DataLineValidator;
import edu.yu.einstein.genplay.core.IO.utils.Extractors;
import edu.yu.einstein.genplay.core.IO.utils.StrandedExtractorOptions;
import edu.yu.einstein.genplay.dataStructure.chromosome.Chromosome;
import edu.yu.einstein.genplay.dataStructure.chromosomeWindow.SimpleChromosomeWindow;
import edu.yu.einstein.genplay.dataStructure.enums.GeneScoreType;
import edu.yu.einstein.genplay.dataStructure.enums.Strand;
import edu.yu.einstein.genplay.dataStructure.list.chromosomeWideList.SCWListView.SCWListViewBuilder;
import edu.yu.einstein.genplay.dataStructure.list.chromosomeWideList.SCWListView.generic.GenericSCWListViewBuilder;
import edu.yu.einstein.genplay.dataStructure.list.listView.ListView;
import edu.yu.einstein.genplay.dataStructure.scoredChromosomeWindow.ScoredChromosomeWindow;
import edu.yu.einstein.genplay.exception.exceptions.DataLineException;
import edu.yu.einstein.genplay.exception.exceptions.InvalidChromosomeException;
import edu.yu.einstein.genplay.util.Utils;
/**
* A GTF file extractor
* @author Julien Lajugie
*/
public class GTFExtractor extends TextFileExtractor implements GeneReader, StrandReader, StrandedExtractor {
/** Default first base position of bed files. GTF files are 1-based */
public static final int DEFAULT_FIRST_BASE_POSITION = 1;
private int firstBasePosition = DEFAULT_FIRST_BASE_POSITION;// position of the first base
private StrandedExtractorOptions strandOptions; // options on the strand and read length / shift
private Chromosome currentChromosome; // chromosome of the current item
private Chromosome previousChromosome; // chromosome of the last item read
private String currentName; // name of the current item
private String previousName; // name of the last item read
private Strand currentStrand; // strand of the current item
private Strand previousStrand; // strand of the last item read
private SCWListViewBuilder exonBuilder; // exons builders of the last item read
private ListView<ScoredChromosomeWindow> previousExons; // exons of the previous item read
/**
* Creates an instance of {@link GTFExtractor}
* @param dataFile file containing the data
* @throws FileNotFoundException if the specified file is not found
*/
public GTFExtractor(File dataFile) throws FileNotFoundException {
super(dataFile);
exonBuilder = new GenericSCWListViewBuilder();
}
@Override
protected int extractDataLine(String line) throws DataLineException {
previousChromosome = currentChromosome;
previousName = currentName;
previousStrand = currentStrand;
currentChromosome = null;
currentName = null;
currentStrand = null;
String[] splitedLine = Extractors.parseLineTabOnly(line);
if (splitedLine.length < 8) {
throw new DataLineException(DataLineException.INVALID_PARAMETER_NUMBER);
}
// chromosome
String chromosomeName = splitedLine[0];
if (getChromosomeSelector() != null) {
// case where last chromosome already extracted, no more data to extract
if (getChromosomeSelector().isExtractionDone(chromosomeName)) {
return EXTRACTION_DONE;
}
// chromosome was not selected for extraction
if (!getChromosomeSelector().isSelected(chromosomeName)) {
return LINE_SKIPPED;
}
}
try {
currentChromosome = getProjectChromosome().get(chromosomeName) ;
} catch (InvalidChromosomeException e) {
// unknown chromosome
return LINE_SKIPPED;
}
// case where we need to extract the current chromosome
// retrieve the strand
String strandStr = splitedLine[6].trim();
if (!strandStr.equals(".")) {
currentStrand = Strand.get(strandStr.charAt(0));
}
if ((currentStrand != null) && (strandOptions != null) && (!strandOptions.isSelected(currentStrand))) {
currentChromosome = null;
return LINE_SKIPPED;
}
// start and stop position
int start = Extractors.getInt(splitedLine[3].trim());
int stop = Extractors.getInt(splitedLine[4].trim());
String errors = DataLineValidator.getErrors(currentChromosome, start, stop);
if (!errors.isEmpty()) {
throw new DataLineException(errors);
}
// Stop position checking, must not be greater than the chromosome length
String stopEndErrorMessage = DataLineValidator.getErrors(currentChromosome, stop);
if (!stopEndErrorMessage.isEmpty()) {
DataLineException stopEndException = new DataLineException(stopEndErrorMessage, DataLineException.SHRINK_STOP_PROCESS);
// notify the listeners that the stop position needed to be shrunk
notifyDataEventListeners(stopEndException, getCurrentLineNumber(), line);
stop = currentChromosome.getLength();
}
// compute the read position with specified strand shift and read length
if (strandOptions != null) {
SimpleChromosomeWindow resultStartStop = strandOptions.computeStartStop(currentChromosome, start, stop, currentStrand);
start = resultStartStop.getStart();
stop = resultStartStop.getStop();
}
// if we are in a multi-genome project, we compute the position on the meta genome
start = getRealGenomePosition(currentChromosome, start);
stop = getRealGenomePosition(currentChromosome, stop);
// retrieve the score
Float score = Extractors.getFloat(splitedLine[5].trim(), null);
// if there is some attribute informations
if (splitedLine.length >= 9) {
Map<String, String> attributes = parseAttributes(splitedLine[8]);
// try to retrieve the gene name
if (attributes.containsKey("gene_id")) {
currentName = attributes.get("gene_id");
} else {
// this is a mandatory attribute for genplay
throw new DataLineException("The attribute 'gene_id' is missing.");
}
if (attributes.containsKey("RPKM")) {
// if there is a RPKM attribute we replace the score by the RPKM
score = Extractors.getFloat(attributes.get("RPKM"));
} else if (attributes.containsKey("FPKM")) {
// if there is no RPKM but there is a FPKM we replace the score by the FPKM
score = Extractors.getFloat(attributes.get("FPKM"));
} else {
score = Float.NaN;
}
if (currentName.equals(previousName)) {
exonBuilder.addElementToBuild(start, stop, score);
return LINE_EXTRACTED;
} else {
previousExons = exonBuilder.getListView();
exonBuilder = new GenericSCWListViewBuilder();
exonBuilder.addElementToBuild(start, stop, score);
return ITEM_EXTRACTED;
}
}
// this is a mandatory attribute for genplay
throw new DataLineException("The attribute 'gene_id' is missing.");
}
@Override
public Chromosome getChromosome() {
return previousChromosome;
}
@Override
public ListView<ScoredChromosomeWindow> getExons() {
return previousExons;
}
@Override
public int getFirstBasePosition() {
return firstBasePosition;
}
@Override
public String getGeneDBURL() {
return getTrackLineHeader().getGeneDBURL();
}
@Override
public GeneScoreType getGeneScoreType() {
return getTrackLineHeader().getGeneScoreType();
}
@Override
public String getName() {
return previousName;
}
@Override
public Float getScore() {
return Float.NaN;
}
@Override
public Integer getStart() {
if ((previousExons != null) && !previousExons.isEmpty()) {
return previousExons.get(0).getStart();
} else {
return null;
}
}
@Override
public Integer getStop() {
if ((previousExons != null) && !previousExons.isEmpty()) {
return previousExons.get(previousExons.size() - 1).getStart();
} else {
return null;
}
}
@Override
public Strand getStrand() {
return previousStrand;
}
@Override
public StrandedExtractorOptions getStrandedExtractorOptions() {
return strandOptions;
}
@Override
public Integer getUTR3Bound() {
if ((previousExons != null) && !previousExons.isEmpty()) {
return previousExons.get(previousExons.size() - 1).getStart();
} else {
return null;
}
}
@Override
public Integer getUTR5Bound() {
if ((previousExons != null) && !previousExons.isEmpty()) {
return previousExons.get(0).getStart();
} else {
return null;
}
}
/**
* Parses the attribute field of the GTF file
* @param attributeString attribute field
* @return a Map with the attribute names as keys and the attribute values as fields
*/
private Map<String, String> parseAttributes(String attributeString) {
Map<String, String> attributeMap = new HashMap<String, String>();
//String[] attributes = attributeString.split(";");
String[] attributes = Utils.split(attributeString, ';');
for (String currentAttribute: attributes) {
int indexFirstQuote = currentAttribute.indexOf('"');
int indexLastQuote = currentAttribute.lastIndexOf('"');
String attributeName = currentAttribute.substring(0, indexFirstQuote).trim();
String attributeValue = currentAttribute.substring(indexFirstQuote + 1, indexLastQuote).trim();
attributeMap.put(attributeName, attributeValue);
}
return attributeMap;
}
@Override
public void setFirstBasePosition(int firstBasePosition) {
this.firstBasePosition = firstBasePosition;
}
/**
* We raise a new UnsupportedOperationException because it's not possible to load
* a random fraction of a GTF file
*/
@Override
public void setRandomLineCount(Integer randomLineCount) throws UnsupportedOperationException {
throw new UnsupportedOperationException("Wiggle files need to be entirely extracted");
}
@Override
public void setStrandedExtractorOptions(StrandedExtractorOptions options) {
strandOptions = options;
}
}