/******************************************************************************* * GenPlay, Einstein Genome Analyzer * Copyright (C) 2009, 2014 Albert Einstein College of Medicine * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu> * Nicolas Fourel <nicolas.fourel@einstein.yu.edu> * Eric Bouhassira <eric.bouhassira@einstein.yu.edu> * * Website: <http://genplay.einstein.yu.edu> ******************************************************************************/ package edu.yu.einstein.genplay.core.multiGenome.VCF.VCFScanner; import java.io.IOException; import java.util.ArrayList; import java.util.List; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFLine; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFFile.VCFFile; import edu.yu.einstein.genplay.core.multiGenome.filter.MGFilter; import edu.yu.einstein.genplay.core.multiGenome.utils.VCFLineUtility; import edu.yu.einstein.genplay.dataStructure.enums.VariantType; import edu.yu.einstein.genplay.exception.exceptions.DataLineException; import edu.yu.einstein.genplay.gui.dialog.exceptionDialog.WarningReportDialog; /** * A {@link VCFScanner} provides an easy way to go through the lines of a {@link VCFFile}. * A {@link VCFScanner} must know the {@link VCFFile} to scan as well as the {@link VCFScannerReceiver} who will process the current line. * It is also possible to define which genomes, variations and filters to use while scanning in order to scan only the correct lines. * In order to scan all lines without distinction, just do not provide these information. * * @author Nicolas Fourel * @version 0.1 */ public abstract class VCFScanner { private final VCFScannerReceiver receiver; // The VCF scanner receiver. protected final VCFFile vcfFile; // The VCF file to scan. private List<String> genomes; // The list of genomes to take into account while scanning. private List<VariantType> variations; // The list of variations to take into account while scanning. private List<MGFilter> filters; // The list of filters to apply while scanning. private List<Integer> genomeIndexes; // The indexes of every genome, used as class variable to optimize the scan. /** * Constructor of {@link VCFScanner} * @param receiver * @param vcfFile * @throws IOException */ public VCFScanner (VCFScannerReceiver receiver, VCFFile vcfFile) throws IOException { this.receiver = receiver; this.vcfFile = vcfFile; genomes = null; variations = null; filters = null; } /** * Starts scanning the file. * @throws IOException */ public void compute () throws IOException { // Get genome indexes genomeIndexes = getGenomeIndexes(); // Get the first line of data VCFLine line = getFirstLine(); int lineNumber = 0; // Scan the file line by line while ((line != null) && !line.isLastLine()) { lineNumber++; if (line.isIntegrityValid()) { // Initialize the line for treatments line.processForAnalyse(); // Filters boolean pass = true; List<Integer> alternativeIndexes = getScopeDefinedVariationIndexes(line); // Get alternative indexes from all required genomes if (!genomesValidation(alternativeIndexes)) { // Genome validation filter pass = false; } if (!variationsValidation(line, alternativeIndexes)) { // Variation validation filter pass = false; } // Send the line for process if meets requirements if (pass) { receiver.processLine(line); } } else { DataLineException exception = new DataLineException("The line " + lineNumber + " seems to be missing elements.", DataLineException.SKIP_PROCESS); exception.setFile(vcfFile.getFile()); exception.setLine(line.getMergedElements()); exception.setLineNumber(lineNumber); WarningReportDialog.getInstance().addMessage(exception.getMessage()); WarningReportDialog.getInstance().showDialog(null); } // Move to the next line line = getNextLine(); } // Closes the file streams endScan(); } /** * @return the first line to process */ protected abstract VCFLine getFirstLine (); /** * @return the next line to process */ protected abstract VCFLine getNextLine (); /** * Performs last operations such as closing the file streams. */ protected abstract void endScan (); /** * Retrieves indexes of the alternatives from the scope of the project. * The scope is defined by the genomes to load, defined by the user. * @param line the {@link VCFLine} * @return the list of indexes (include no call (-2) and references (-1)) */ private List<Integer> getScopeDefinedVariationIndexes (VCFLine line) { // Get alternatives indexes define by selected genomes List<Integer> altIndexes = new ArrayList<Integer>(); for (int index: genomeIndexes) { String genotype = line.getFormatField(index, 0).toString(); genotype = genotype.replace('|', '/'); String[] currentAltIndexes = genotype.split("/"); for (String currentAlt: currentAltIndexes) { int current = VCFLineUtility.getAlleleIndex(currentAlt); if ((current != -1) && !altIndexes.contains(current)) { altIndexes.add(current); } } } // return result return altIndexes; } /** * @return the indexes of all required genomes */ private List<Integer> getGenomeIndexes () { // Get genome indexes List<Integer> genomeIndexes = new ArrayList<Integer>(); if (genomes != null) { for (String genome: genomes) { int index = vcfFile.getHeader().getIndexFromFullGenomeName(genome); if (index > -1) { genomeIndexes.add(index); } } } else { int genomeNumber = vcfFile.getHeader().getGenomeNames().size(); int index = 9; for (int i = 0; i < genomeNumber; i++) { genomeIndexes.add(index + i); } } return genomeIndexes; } /** * @param alternativeIndexes list of alternatives indexes * @return true if at least one variation is defined (no call is seen as a {@link VariantType}), false otherwise */ private boolean genomesValidation (List<Integer> alternativeIndexes) { boolean defineSomething = false; int index = 0; while ((index < alternativeIndexes.size()) && !defineSomething) { if (alternativeIndexes.get(index) != -1) { defineSomething = true; } index++; } return defineSomething; } /** * * @param line the {@link VCFLine} * @param alternativeIndexes the list of alternatives indexes * @return true if at least one of the mandatory variation is defined, false otherwise */ private boolean variationsValidation (VCFLine line, List<Integer> alternativeIndexes) { if (variations == null) { return true; } List<VariantType> foundVariations = new ArrayList<VariantType>(); VariantType[] definedVariations = line.getAlternativesTypes(); for (int index: alternativeIndexes) { VariantType currentType = null; if (index == -2) { currentType = VariantType.NO_CALL; } else if (index > -1) { currentType = definedVariations[index]; } if ((currentType != null) && !foundVariations.contains(currentType)) { foundVariations.add(currentType); } } for (VariantType currentType: foundVariations) { if (variations.contains(currentType)) { return true; } } return false; } /** * @return the genomes */ public List<String> getGenomes() { return genomes; } /** * @param genomes the genomes to set */ public void setGenomes(List<String> genomes) { this.genomes = genomes; } /** * @return the variations */ public List<VariantType> getVariations() { return variations; } /** * @param variations the variations to set */ public void setVariations(List<VariantType> variations) { this.variations = variations; } /** * @return the filters */ public List<MGFilter> getFilters() { return filters; } /** * @param filters the filters to set */ public void setFilters(List<MGFilter> filters) { this.filters = filters; } }