/* * Eoulsan development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public License version 2.1 or * later and CeCILL-C. This should be distributed with the code. * If you do not have a copy, see: * * http://www.gnu.org/licenses/lgpl-2.1.txt * http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt * * Copyright for this code is held jointly by the Genomic platform * of the Institut de Biologie de l'École normale supérieure and * the individual authors. These should be listed in @author doc * comments. * * For more information on the Eoulsan project and its aims, * or to join the Eoulsan Google group, visit the home page * at: * * http://outils.genomique.biologie.ens.fr/eoulsan * */ package fr.ens.biologie.genomique.eoulsan.checkers; import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GTF; import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.ANNOTATION_GFF; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.ATTRIBUTE_ID_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.COUNTER_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.FEATURES_FILE_FORMAT; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.GENOMIC_TYPE_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.OLD_ATTRIBUTE_ID_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.OLD_GENOMIC_TYPE_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.OLD_OVERLAP_MODE_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.OLD_REMOVE_AMBIGUOUS_CASES_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.OLD_SPLIT_ATTRIBUTE_VALUES_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.OVERLAP_MODE_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.REMOVE_AMBIGUOUS_CASES_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.SPLIT_ATTRIBUTE_VALUES_PARAMETER_NAME; import static fr.ens.biologie.genomique.eoulsan.modules.expression.AbstractExpressionModule.STRANDED_PARAMETER_NAME; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import com.google.common.collect.Sets; import fr.ens.biologie.genomique.eoulsan.EoulsanException; import fr.ens.biologie.genomique.eoulsan.bio.BadBioEntryException; import fr.ens.biologie.genomique.eoulsan.bio.GFFEntry; import fr.ens.biologie.genomique.eoulsan.bio.GenomeDescription; import fr.ens.biologie.genomique.eoulsan.bio.GenomicArray; import fr.ens.biologie.genomique.eoulsan.bio.GenomicInterval; import fr.ens.biologie.genomique.eoulsan.bio.io.GFFReader; import fr.ens.biologie.genomique.eoulsan.bio.io.GTFReader; import fr.ens.biologie.genomique.eoulsan.core.Parameter; import fr.ens.biologie.genomique.eoulsan.data.Data; import fr.ens.biologie.genomique.eoulsan.data.DataFile; import fr.ens.biologie.genomique.eoulsan.data.DataFormat; import fr.ens.biologie.genomique.eoulsan.data.DataFormats; import fr.ens.biologie.genomique.eoulsan.modules.generators.GenomeDescriptionCreator; /** * This class define a Checker on GFF annotation. * @since 1.0 * @author Laurent Jourdren */ public class GFFChecker implements Checker { private String genomicType; private String attributeId; private boolean stranded = true; private final boolean gtfFormat; @Override public String getName() { return "gff_checker"; } @Override public DataFormat getFormat() { return this.gtfFormat ? ANNOTATION_GTF : ANNOTATION_GFF; } @Override public Set<DataFormat> getCheckersRequired() { return Sets.newHashSet(DataFormats.GENOME_FASTA); } @Override public void configure(final Set<Parameter> stepParameters) throws EoulsanException { // TODO the parsing of the parameter must be shared with // AbstractExpressionStep for (Parameter p : stepParameters) { switch (p.getName()) { case OLD_GENOMIC_TYPE_PARAMETER_NAME: case GENOMIC_TYPE_PARAMETER_NAME: this.genomicType = p.getStringValue(); break; case OLD_ATTRIBUTE_ID_PARAMETER_NAME: case ATTRIBUTE_ID_PARAMETER_NAME: this.attributeId = p.getStringValue(); break; case STRANDED_PARAMETER_NAME: this.stranded = "yes".equals(p.getStringValue()) || "reverse".equals(p.getStringValue()); break; default: if (!FEATURES_FILE_FORMAT.equals(p.getName()) && !COUNTER_PARAMETER_NAME.equals(p.getName()) && !OVERLAP_MODE_PARAMETER_NAME.equals(p.getName()) && !REMOVE_AMBIGUOUS_CASES_PARAMETER_NAME.equals(p.getName()) && !SPLIT_ATTRIBUTE_VALUES_PARAMETER_NAME.equals(p.getName()) && !OLD_OVERLAP_MODE_PARAMETER_NAME.equals(p.getName()) && !OLD_REMOVE_AMBIGUOUS_CASES_PARAMETER_NAME.equals(p.getName()) && !OLD_SPLIT_ATTRIBUTE_VALUES_PARAMETER_NAME.equals(p.getName())) { throw new EoulsanException( "Unknown parameter for " + getName() + " step: " + p.getName()); } } } } @Override public boolean check(final Data data, final CheckStore checkInfo) throws EoulsanException { if (data == null) { throw new NullPointerException("The data is null"); } if (checkInfo == null) { throw new NullPointerException("The check info info is null"); } try { final DataFile featureFile = data.getDataFile(); if (!featureFile.exists()) { // Check if the protocol is deprecated if (!featureFile.getProtocol().canRead()) { // Force exception try (InputStream in = featureFile.open()) { } } return true; } if (this.genomicType == null) { return true; } final GenomeDescription desc = getGenomeDescription(featureFile, checkInfo); validationAnnotation(featureFile, this.gtfFormat, desc, this.genomicType, this.attributeId, this.stranded); } catch (IOException e) { throw new EoulsanException( "Annotation Check: Error while reading annotation file for checking: " + e.getMessage(), e); } catch (BadBioEntryException e) { throw new EoulsanException("Annotation Check: " + e.getMessage() + " in line \"" + e.getEntry() + "\"", e); } return false; } private static void validationAnnotation(final DataFile file, final boolean gtfFormat, final GenomeDescription desc, final String featureType, final String attributeId, final boolean stranded) throws IOException, BadBioEntryException, EoulsanException { final GenomicArray<String> features = new GenomicArray<>(); Map<String, long[]> sequenceRegions = null; final Map<String, Long> sequenceLengths = getSequencesLengths(desc); boolean featuresFound = false; long[] interval = null; long sequenceLength = -1; String lastSequenceName = null; try (final GFFReader gffReader = gtfFormat ? new GTFReader(file.open()) : new GFFReader(file.open())) { GFFEntry lastEntry = null; for (final GFFEntry e : gffReader) { lastEntry = e; if (!featureType.equals(e.getType())) { continue; } final String sequenceName = e.getSeqId(); final int start = e.getStart(); final int end = e.getEnd(); if (sequenceRegions != null) { if (!sequenceName.equals(lastSequenceName)) { interval = sequenceRegions.get(sequenceName); if (interval == null) { throw new BadBioEntryException( "GFF entry with id (" + sequenceName + ") not found in sequence region", formatEntry(e, gtfFormat)); } } if (Math.min(start, end) < interval[0]) { throw new BadBioEntryException("GFF entry with start position (" + Math.min(start, end) + ") lower than the start of sequence region" + sequenceName + " (" + interval[0] + ")", formatEntry(e, gtfFormat)); } if (Math.max(start, end) > interval[1]) { throw new BadBioEntryException( "GFF entry with end position (" + Math.max(start, end) + ") greater than the end of sequence region " + sequenceName + " (" + interval[1] + ")", formatEntry(e, gtfFormat)); } } if (sequenceLengths != null) { if (!sequenceName.equals(lastSequenceName)) { if (!sequenceLengths.containsKey(sequenceName)) { throw new BadBioEntryException( "GFF entry with id (" + sequenceName + ") not found in genome", formatEntry(e, gtfFormat)); } sequenceLength = sequenceLengths.get(sequenceName); } if (Math.min(start, end) < 1) { throw new BadBioEntryException("GFF entry with start position (" + Math.min(start, end) + ") lower than 1 in sequence " + sequenceName, formatEntry(e, gtfFormat)); } if (Math.max(start, end) - 1 > sequenceLength) { gffReader.close(); throw new BadBioEntryException( "GFF entry with end position (" + Math.max(start, end) + ") greater than the the length of sequence " + sequenceName + " (" + sequenceLength + ")", formatEntry(e, gtfFormat)); } } final String featureId = e.getAttributeValue(attributeId); if (attributeId != null && featureId == null) { throw new BadBioEntryException("Feature " + featureType + " does not contain a " + attributeId + " attribute", formatEntry(e, gtfFormat)); } if (featureId != null) { features.addEntry(new GenomicInterval(e, stranded), featureId); featuresFound = true; } lastSequenceName = sequenceName; } gffReader.throwException(); // Check the sequence regions described in the GFF file if (lastEntry != null) { sequenceRegions = checkSequenceRegions(lastEntry, desc); } if (featureType != null && !featuresFound) { throw new EoulsanException("No feature \"" + featureType + "\" with attribute \"" + attributeId + "\" in annotation."); } } } /** * Format a GFFEntry in GFF3 or GTF format. * @param e the entry * @param gtfFormat true if the entry is in GTF format * @return the entry in the correct format */ private static final String formatEntry(final GFFEntry e, final boolean gtfFormat) { if (gtfFormat) { return e.toGTF(); } return e.toGFF3(); } private static Map<String, Long> getSequencesLengths( final GenomeDescription desc) { if (desc == null) { return null; } final Map<String, Long> result = new HashMap<>(); for (String sequenceName : desc.getSequencesNames()) { result.put(sequenceName, desc.getSequenceLength(sequenceName)); } return result; } private static Map<String, long[]> checkSequenceRegions(final GFFEntry entry, final GenomeDescription desc) throws BadBioEntryException { if (entry == null || desc == null) { return null; } final Map<String, long[]> result = new HashMap<>(); final List<String> sequenceRegions = entry.getMetadataEntryValues("sequence-region"); if (sequenceRegions == null) { return null; } for (String sequenceRegion : sequenceRegions) { if (sequenceRegion == null) { continue; } final String[] fields = sequenceRegion.trim().split(" "); if (fields.length != 3) { throw new BadBioEntryException("Invalid GFF metadata", "##sequence-region " + sequenceRegion); } try { final String sequenceName = fields[0].trim(); final long start = Integer.parseInt(fields[1]); final long end = Integer.parseInt(fields[2]); result.put(sequenceName, new long[] {start, end}); final long len = desc.getSequenceLength(sequenceName); if (len == -1) { throw new BadBioEntryException( "Unknown sequence found in GFF metadata", "##sequence-region " + sequenceRegion); } // Don't check the start position because it can // be < 1 // TODO Why len+2 for dmel annotation ? if (end > len + 2) { throw new BadBioEntryException( "Invalid GFF metadata, the end position (" + end + ") is greater than the length of the sequence (" + (len + 2) + ")", "##sequence-region " + sequenceRegion); } } catch (NumberFormatException e) { throw new BadBioEntryException("Invalid GFF metadata", "##sequence-region " + sequenceRegion); } } return result; } private GenomeDescription getGenomeDescription(final DataFile annotationFile, final CheckStore checkInfo) throws EoulsanException, BadBioEntryException, IOException { GenomeDescription result = (GenomeDescription) checkInfo.get(GenomeChecker.GENOME_DESCRIPTION); if (result != null) { return result; } result = new GenomeDescriptionCreator() .createGenomeDescriptionFromAnnotation(annotationFile); return result; } // // Constructors // /** * Protected constructor. * @param gtfFormat true if the format the file is GTF */ protected GFFChecker(final boolean gtfFormat) { this.gtfFormat = gtfFormat; } /** * Public constructor. */ public GFFChecker() { this(false); } }