/* * Eoulsan development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public License version 2.1 or * later and CeCILL-C. This should be distributed with the code. * If you do not have a copy, see: * * http://www.gnu.org/licenses/lgpl-2.1.txt * http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt * * Copyright for this code is held jointly by the Genomic platform * of the Institut de Biologie de l'École normale supérieure and * the individual authors. These should be listed in @author doc * comments. * * For more information on the Eoulsan project and its aims, * or to join the Eoulsan Google group, visit the home page * at: * * http://outils.genomique.biologie.ens.fr/eoulsan * */ package fr.ens.biologie.genomique.eoulsan.bio; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import fr.ens.biologie.genomique.eoulsan.EoulsanException; /** * This class allow the easily get fields of Illumina reads ids. * @since 1.0 * @author Laurent Jourdren */ public final class IlluminaReadId { private static final Pattern PATTERN_1 = Pattern.compile("^([a-zA-Z0-9\\-\\_]+):(\\d+):(\\d+):(\\d+):(\\d+)$"); private static final Pattern PATTERN_2 = Pattern .compile("^([a-zA-Z0-9\\-\\_]+):(\\d+):(\\d+):(\\d+):(\\d+)/(\\d)$"); private static final Pattern PATTERN_1_4 = Pattern.compile( "^([a-zA-Z0-9\\-\\_]+):(\\d+):(\\d+):(\\d+):(\\d+)#([0ATGC]+)/(\\d)$"); private static final Pattern PATTERN_1_8 = Pattern.compile( "^([a-zA-Z0-9\\-\\_]+):(\\d+):([a-zA-Z0-9]+):(\\d+):(\\d+):(\\d+):(\\d+) " + "(\\d+):([YN]):(\\d+):([NATGC\\+]*)$"); private static final Pattern PATTERN_3 = Pattern.compile( "^([a-zA-Z0-9\\-\\_]+):(\\d+):([a-zA-Z0-9]+):(\\d+):(\\d+):(\\d+):(\\d+) " + "(\\d+):([YN]):(\\d+):(\\d)$"); private static final Pattern PATTERN_SRA = Pattern.compile( "^[a-zA-Z0-9\\.]+ ([a-zA-Z0-9\\-\\_]+):(\\d+):([a-zA-Z0-9]+):(\\d+):(\\d+):(\\d+):(\\d+) " + ".*$"); private static final Pattern SEQUENCE_INDEX_SPLITTER_PATTERN = Pattern.compile("\\+"); private static final String NO_SEQUENCE_INDEX = "0"; private final Pattern pattern; private String instrumentId; private int runId; private String flowCellId; private int flowCellLane; private int tileNumberInFlowCellLane; private int xClusterCoordinateInTile; private int yClusterCoordinateInTile; private String sequenceIndex; private int pairMember; private boolean filtered; private int controlNumber; // // Getters // /** * Get instrument id. * @return a String with the instrument id */ public final String getInstrumentId() { return this.instrumentId; } /** * Get Run id. * @return the run id or -1 if there is no run id */ public final int getRunId() { return this.runId; } /** * Get the flow cell id. * @return the flow cell id as a string or null if there is no flow cell id */ public final String getFlowCellId() { return this.flowCellId; } /** * Get the flowcell lane. * @return the flowcell lane */ public final int getFlowCellLane() { return this.flowCellLane; } /** * Get the tile number within the flowcell lane. * @return the tile number within the flowcell lane */ public final int getTileNumberInFlowCellLane() { return this.tileNumberInFlowCellLane; } /** * Get 'x'-coordinate of the cluster within the tile. * @return the 'x'-coordinate of the cluster within the tile */ public final int getXClusterCoordinateInTile() { return this.xClusterCoordinateInTile; } /** * Get 'y'-coordinate of the cluster within the tile. * @return the 'y'-coordinate of the cluster within the tile */ public final int getYClusterCoordinateInTile() { return this.yClusterCoordinateInTile; } /** * Get the sequence index for a multiplexed sample. * @return the sequence index for a multiplexed sample, "0" for no indexing */ public final String getSequenceIndex() { return this.sequenceIndex; } /** * Get the sequence index for a multiplexed sample. * @return the sequence index for a multiplexed sample, "0" for no indexing */ public final List<String> getSequenceIndexList() { if (NO_SEQUENCE_INDEX.equals(this.sequenceIndex)) { return Collections.emptyList(); } return Collections.unmodifiableList(Arrays .asList(SEQUENCE_INDEX_SPLITTER_PATTERN.split(this.sequenceIndex))); } /** * Get the member of a pair. * @return the the member of a pair, /1 or /2 (paired-end or mate-pair reads * only) */ public final int getPairMember() { return this.pairMember; } /** * Test if the read is filtered. * @return true if the read is filtered */ public final boolean isFiltered() { return this.filtered; } /** * Get the value of the control number. * @return the control number or -1 if there is no control number */ public final int getControlNumber() { return this.controlNumber; } // // Test if the fields exists // /** * Test if instrument id field exists. * @return true if instrument id field exists */ public final boolean isInstrumentIdField() { return true; } /** * Test if the Run id field exists. * @return true if the Run id field exists */ public final boolean isRunIdField() { if (this.pattern == PATTERN_1_4 || this.pattern == PATTERN_2 || this.pattern == PATTERN_1) { return false; } return true; } /** * Test if the flow cell id field exist. * @return true if the flow cell id field exist */ public final boolean isFlowCellIdField() { if (this.pattern == PATTERN_1_4 || this.pattern == PATTERN_2 || this.pattern == PATTERN_1) { return false; } return true; } /** * Test if the flowcell lane field exists. * @return true if the flowcell lane field exists */ public final boolean isFlowCellLaneField() { return true; } /** * Test if the tile number within the flowcell lane field exists. * @return true if the tile number within the flowcell lane field exists */ public final boolean isTileNumberInFlowCellLaneField() { return true; } /** * Test if 'x'-coordinate of the cluster within the tile field exists. * @return true if the the 'x'-coordinate of the cluster within the tile field * exists */ public final boolean isXClusterCoordinateInTileField() { return true; } /** * Test if the 'y'-coordinate of the cluster within the tile exists. * @return true if the 'y'-coordinate of the cluster within the tile exists */ public final boolean isYClusterCoordinateInTileField() { return true; } /** * Test if the sequence index for a multiplexed sample exist. * @return true if the sequence index for a multiplexed sample exist */ public final boolean isSequenceIndexField() { if (this.pattern == PATTERN_3 || this.pattern == PATTERN_2 || this.pattern == PATTERN_1 || this.pattern == PATTERN_SRA) { return false; } return true; } /** * Test if the member of a pair field exists. * @return true if the member of a pair field exists */ public final boolean isPairMemberField() { if (this.pattern == PATTERN_1 || this.pattern == PATTERN_SRA) { return false; } return true; } /** * Test if the read filtered field exists. * @return true if the read filtered field exist */ public final boolean isFilteredField() { if (this.pattern == PATTERN_1_4 || this.pattern == PATTERN_2 || this.pattern == PATTERN_1 || this.pattern == PATTERN_SRA) { return false; } return true; } /** * Test if the control number field exists. * @return true if the control number field exists */ public final boolean isControlNumberField() { if (this.pattern == PATTERN_1_4 || this.pattern == PATTERN_2 || this.pattern == PATTERN_1 || this.pattern == PATTERN_SRA) { return false; } return true; } // // Other method // private static Pattern findPattern(final String readId) throws EoulsanException { if (PATTERN_1_8.matcher(readId).lookingAt()) { return PATTERN_1_8; } if (PATTERN_3.matcher(readId).lookingAt()) { return PATTERN_3; } if (PATTERN_1_4.matcher(readId).lookingAt()) { return PATTERN_1_4; } if (PATTERN_2.matcher(readId).lookingAt()) { return PATTERN_2; } if (PATTERN_1.matcher(readId).lookingAt()) { return PATTERN_1; } if (PATTERN_SRA.matcher(readId).lookingAt()) { return PATTERN_SRA; } throw new EoulsanException("Invalid illumina id: " + readId); } /** * Parse an Illumina id string in a Sequence object. * @param sequence sequence witch name must be parsed * @throws EoulsanException if the id is not an Illumina id */ public final void parse(final Sequence sequence) throws EoulsanException { if (sequence == null) { throw new NullPointerException("The sequence is null"); } parse(sequence.getName()); } /** * Parse an Illumina id string. * @param readId String with the Illumina id * @throws EoulsanException if the id is not an Illumina id */ public final void parse(final String readId) throws EoulsanException { if (readId == null) { throw new NullPointerException("The string to parse is null"); } final Matcher matcher = this.pattern.matcher(readId.trim()); if (!matcher.lookingAt()) { throw new EoulsanException("Invalid illumina id: " + readId); } if (this.pattern == PATTERN_1_8) { this.instrumentId = matcher.group(1); this.runId = Integer.parseInt(matcher.group(2)); this.flowCellId = matcher.group(3); this.flowCellLane = Integer.parseInt(matcher.group(4)); this.tileNumberInFlowCellLane = Integer.parseInt(matcher.group(5)); this.xClusterCoordinateInTile = Integer.parseInt(matcher.group(6)); this.yClusterCoordinateInTile = Integer.parseInt(matcher.group(7)); this.pairMember = Integer.parseInt(matcher.group(8)); this.filtered = matcher.group(9).charAt(0) == 'Y'; this.controlNumber = Integer.parseInt(matcher.group(10)); this.sequenceIndex = matcher.group(11); return; } else if (this.pattern == PATTERN_3) { this.instrumentId = matcher.group(1); this.runId = Integer.parseInt(matcher.group(2)); this.flowCellId = matcher.group(3); this.flowCellLane = Integer.parseInt(matcher.group(4)); this.tileNumberInFlowCellLane = Integer.parseInt(matcher.group(5)); this.xClusterCoordinateInTile = Integer.parseInt(matcher.group(6)); this.yClusterCoordinateInTile = Integer.parseInt(matcher.group(7)); this.pairMember = Integer.parseInt(matcher.group(8)); this.filtered = matcher.group(9).charAt(0) == 'Y'; this.controlNumber = Integer.parseInt(matcher.group(10)); this.sequenceIndex = NO_SEQUENCE_INDEX; return; } else if (this.pattern == PATTERN_1_4) { this.instrumentId = matcher.group(1); this.runId = -1; this.flowCellId = null; this.flowCellLane = Integer.parseInt(matcher.group(2)); this.tileNumberInFlowCellLane = Integer.parseInt(matcher.group(3)); this.xClusterCoordinateInTile = Integer.parseInt(matcher.group(4)); this.yClusterCoordinateInTile = Integer.parseInt(matcher.group(5)); this.sequenceIndex = matcher.group(6); this.pairMember = Integer.parseInt(matcher.group(7)); this.filtered = false; this.controlNumber = -1; return; } else if (this.pattern == PATTERN_2) { this.instrumentId = matcher.group(1); this.runId = -1; this.flowCellId = null; this.flowCellLane = Integer.parseInt(matcher.group(2)); this.tileNumberInFlowCellLane = Integer.parseInt(matcher.group(3)); this.xClusterCoordinateInTile = Integer.parseInt(matcher.group(4)); this.yClusterCoordinateInTile = Integer.parseInt(matcher.group(5)); this.sequenceIndex = NO_SEQUENCE_INDEX; this.pairMember = Integer.parseInt(matcher.group(6)); this.filtered = false; this.controlNumber = -1; return; } else if (this.pattern == PATTERN_SRA) { this.instrumentId = matcher.group(1); this.runId = Integer.parseInt(matcher.group(2)); this.flowCellId = matcher.group(3); this.flowCellLane = Integer.parseInt(matcher.group(4)); this.tileNumberInFlowCellLane = Integer.parseInt(matcher.group(5)); this.xClusterCoordinateInTile = Integer.parseInt(matcher.group(6)); this.yClusterCoordinateInTile = Integer.parseInt(matcher.group(7)); this.sequenceIndex = NO_SEQUENCE_INDEX; this.pairMember = -1; this.filtered = false; this.controlNumber = -1; return; } // PATTERN_1 this.instrumentId = matcher.group(1); this.runId = -1; this.flowCellId = null; this.flowCellLane = Integer.parseInt(matcher.group(2)); this.tileNumberInFlowCellLane = Integer.parseInt(matcher.group(3)); this.xClusterCoordinateInTile = Integer.parseInt(matcher.group(4)); this.yClusterCoordinateInTile = Integer.parseInt(matcher.group(5)); this.sequenceIndex = NO_SEQUENCE_INDEX; this.pairMember = -1; this.filtered = false; this.controlNumber = -1; } // // Constructor // /** * Public constructor. * @param readId String with Illumina id to parse * @throws EoulsanException if the id is not an Illumina id */ public IlluminaReadId(final String readId) throws EoulsanException { if (readId == null) { throw new NullPointerException("The string to parse is null"); } this.pattern = findPattern(readId); parse(readId); } /** * Public constructor. * @param sequence sequence witch name must be parsed * @throws EoulsanException if the id is not an Illumina id */ public IlluminaReadId(final Sequence sequence) throws EoulsanException { if (sequence == null) { throw new NullPointerException("The sequence is null"); } final String readId = sequence.getName(); if (readId == null) { throw new NullPointerException("The string to parse is null"); } this.pattern = findPattern(readId); parse(readId); } }