/* * Copyright (C) 2015 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. */ package com.google.cloud.genomics.dataflow.utils; import com.google.api.client.util.Preconditions; import com.google.api.services.genomics.model.Annotation; import com.google.api.services.genomics.model.Exon; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Range; import htsjdk.samtools.util.SequenceUtil; import java.util.Map; import java.util.logging.Logger; /** * Utilities for dealing with genomics {@link Annotation}s. Provides some very * basic functions for determining the effect of a given mutation on a * transcript. */ public class AnnotationUtils { private static final Logger LOG = Logger.getLogger(AnnotationUtils.class.getName()); private static final char STOP = '*'; private static final Map<String, Character> AMINO_ACIDS = ImmutableMap.<String, Character>builder() .put("TTT", 'F').put("TTC", 'F').put("TTA", 'L').put("TTG", 'L') .put("TCT", 'S').put("TCC", 'S').put("TCA", 'S').put("TCG", 'S') .put("TAT", 'Y').put("TAC", 'Y').put("TAA", STOP).put("TAG", STOP) .put("TGT", 'C').put("TGC", 'C').put("TGA", STOP).put("TGG", 'W') .put("CTT", 'L').put("CTC", 'L').put("CTA", 'L').put("CTG", 'L') .put("CCT", 'P').put("CCC", 'P').put("CCA", 'P').put("CCG", 'P') .put("CAT", 'H').put("CAC", 'H').put("CAA", 'Q').put("CAG", 'Q') .put("CGT", 'R').put("CGC", 'R').put("CGA", 'R').put("CGG", 'R') .put("ATT", 'I').put("ATC", 'I').put("ATA", 'I').put("ATG", 'M') .put("ACT", 'T').put("ACC", 'T').put("ACA", 'T').put("ACG", 'T') .put("AAT", 'N').put("AAC", 'N').put("AAA", 'K').put("AAG", 'K') .put("AGT", 'S').put("AGC", 'S').put("AGA", 'R').put("AGG", 'R') .put("GTT", 'V').put("GTC", 'V').put("GTA", 'V').put("GTG", 'V') .put("GCT", 'A').put("GCC", 'A').put("GCA", 'A').put("GCG", 'A') .put("GAT", 'D').put("GAC", 'D').put("GAA", 'E').put("GAG", 'E') .put("GGT", 'G').put("GGC", 'G').put("GGA", 'G').put("GGG", 'G').build(); private AnnotationUtils() {} /** * Describes the effect of a given variant on a transcript. Subset of the * annotations API field {@code VariantAnnotation.effect}. */ public enum VariantEffect { SYNONYMOUS_SNP, STOP_GAIN, STOP_LOSS, NONSYNONYMOUS_SNP } /** * Determines the effect of the given allele at the given position within a * transcript. Currently, this routine is relatively primitive: it only works * with SNPs and only computes effects on coding regions of the transcript. * This utility currently does not handle any splice sites well, including * splice site disruption and codons which span two exons. * * @param variantStart 0-based absolute start for the variant. * @param allele Bases to substitute at {@code variantStart}. * @param transcript The affected transcript. * @param transcriptBases The reference bases which span the provided * {@code transcript}. Must match the exact length of the * {@code transcript.position}. * @return The effect of this variant on the given transcript, or null if * unknown. */ public static VariantEffect determineVariantTranscriptEffect( long variantStart, String allele, Annotation transcript, String transcriptBases) { long txLen = transcript.getEnd() - transcript.getStart(); Preconditions.checkArgument(transcriptBases.length() == txLen, "transcriptBases must have equal length to the transcript; got " + transcriptBases.length() + " and " + txLen + ", respectively"); if (allele.length() != 1) { LOG.fine("determineVariantTranscriptEffects() only supports SNPs, ignoring " + allele); return null; } if (transcript.getTranscript().getCodingSequence() == null) { LOG.fine("determineVariantTranscriptEffects() only supports intersection with coding " + "transcript, ignoring "); return null; } long variantEnd = variantStart + 1; Range<Long> variantRange = Range.closedOpen(variantStart, variantEnd); Range<Long> codingRange = Range.closedOpen( transcript.getTranscript().getCodingSequence().getStart(), transcript.getTranscript().getCodingSequence().getEnd()); if (Boolean.TRUE.equals(transcript.getReverseStrand())) { allele = SequenceUtil.reverseComplement(allele); } for (Exon exon : transcript.getTranscript().getExons()) { // For now, only compute effects on variants within the coding region of an exon. Range<Long> exonRange = Range.closedOpen(exon.getStart(), exon.getEnd()); if (exonRange.isConnected(codingRange) && exonRange.intersection(codingRange).isConnected(variantRange) && !exonRange.intersection(codingRange).intersection(variantRange).isEmpty()) { // Get the bases which correspond to this exon. int txOffset = transcript.getStart().intValue(); String exonBases = transcriptBases.substring( exon.getStart().intValue() - txOffset, exon.getEnd().intValue() - txOffset); int variantExonOffset = (int) (variantStart - exon.getStart()); if (Boolean.TRUE.equals(transcript.getReverseStrand())) { // Normalize the offset and bases to 5' -> 3'. exonBases = SequenceUtil.reverseComplement(exonBases); variantExonOffset = (int) (exon.getEnd() - variantEnd); } // Determine the indices for the codon which contains this variant. if (exon.getFrame() == null) { LOG.fine("exon lacks frame data, cannot determine effect"); return null; } int offsetWithinCodon = (variantExonOffset + exon.getFrame()) % 3; int codonExonOffset = variantExonOffset - offsetWithinCodon; if (codonExonOffset < 0 || exonBases.length() <= codonExonOffset+3) { LOG.fine("variant codon spans multiple exons, this case is not yet handled"); return null; } String fromCodon = exonBases.substring(codonExonOffset, codonExonOffset+3); String toCodon = fromCodon.substring(0, offsetWithinCodon) + allele + fromCodon.substring(offsetWithinCodon + 1); return codonChangeToEffect(fromCodon, toCodon); } } return null; } private static VariantEffect codonChangeToEffect(String fromCodon, String toCodon) { Preconditions.checkArgument(AMINO_ACIDS.containsKey(fromCodon), "no such codon: " + fromCodon); Preconditions.checkArgument(AMINO_ACIDS.containsKey(toCodon), "no such codon: " + toCodon); char fromAA = AMINO_ACIDS.get(fromCodon); char toAA = AMINO_ACIDS.get(toCodon); if (fromAA == toAA) { return VariantEffect.SYNONYMOUS_SNP; } else if (toAA == STOP) { return VariantEffect.STOP_GAIN; } else if (fromAA == STOP) { return VariantEffect.STOP_LOSS; } else { return VariantEffect.NONSYNONYMOUS_SNP; } } }