/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2007-2015 Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package org.broad.igv.feature.tribble;

import htsjdk.samtools.util.BlockCompressedInputStream;
import org.apache.log4j.Logger;
import org.broad.igv.data.cufflinks.FPKMTrackingCodec;
import org.broad.igv.feature.FeatureType;
import org.broad.igv.feature.dsi.DSICodec;
import org.broad.igv.feature.dsi.DSIFeature;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.gwas.EQTLCodec;
import org.broad.igv.peaks.PeakCodec;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.FeatureCodec;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.vcf.VCF3Codec;
import htsjdk.variant.vcf.VCFCodec;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

/**
 * A factory class for Tribble codecs. Implements static methods that return a codec given the
 * location of a feature file (bed, gff, vcf, etc), or report whether such a codec exists.
 * <p>
 * All file-type matching is done on a lower-cased type string using {@link Locale#ROOT}, so the
 * result does not depend on the JVM's default locale.
 */
public class CodecFactory {

    private static final Logger log = Logger.getLogger(CodecFactory.class);

    /**
     * Number of leading lines of a VCF file that are searched for the "##fileformat" directive.
     */
    private static final int MAX_VCF_HEADER_LINES = 20;

    /**
     * File-type tokens listed by this factory. The list is left mutable because it is part of the
     * public interface of this class.
     */
    public static final List<String> validExtensions = new ArrayList<String>(Arrays.asList(
            "vcf4", "vcf", "bed", "refflat", "genepred", "ensgene", "refgene", "ucscgene",
            "repmask", "gff3", "gvf", "gff", "gtf", "psl", "mut", "maf"));

    /**
     * Return a tribble codec for the file at {@code path}.
     *
     * @param path   path (file or URL) to the feature file
     * @param genome genome handed to the created codec
     * @return the codec, or null if none matches
     * @deprecated Use {@link #getCodec(org.broad.igv.util.ResourceLocator, org.broad.igv.feature.genome.Genome)}
     * This won't handle URLs with query strings properly for all codecs
     */
    @Deprecated
    public static FeatureCodec getCodec(String path, Genome genome) {
        return getCodec(new ResourceLocator(path), genome);
    }

    /**
     * Return a tribble codec to decode the supplied file, or null if not found.
     * <p>
     * The codec is selected from the locator's type string, compared case-insensitively. The
     * branches are tested in order and the first match wins. Two branches may read the resource
     * itself: plain ".vcf" (to sniff the VCF version) and the mutation-annotation check.
     *
     * @param locator the ResourceLocator (file or URL) to the feature file
     * @param genome  genome handed to the created codec
     * @return the matching codec, or null if the type is not recognized
     */
    public static FeatureCodec getCodec(ResourceLocator locator, Genome genome) {

        String path = locator.getPath();
        // Locale.ROOT: a default-locale lower-casing would map 'I' to a dotless i under Turkish
        // locales and break matches such as ".DSI" or "JUNCTIONS.BED".
        String fn = locator.getTypeString().toLowerCase(Locale.ROOT);

        if (fn.endsWith(".vcf3")) {
            return new VCFWrapperCodec(new VCF3Codec(), genome);
        } else if (fn.endsWith(".vcf4")) {
            return new VCFWrapperCodec(new VCFCodec(), genome);
        } else if (fn.endsWith(".vcf")) {
            // Version is not encoded in the extension; inspect the file header.
            return new VCFWrapperCodec(getVCFCodec(locator), genome);
        } else if (fn.endsWith(".bcf")) {
            return new BCF2WrapperCodec(new BCF2Codec(), genome);
        } else if (fn.endsWith(".bed")) {
            final IGVBEDCodec codec = new IGVBEDCodec(genome);
            if (fn.endsWith("junctions.bed")) {
                codec.setFeatureType(FeatureType.SPLICE_JUNCTION);
            }
            return codec;
        } else if (fn.endsWith(".gappedpeak")) {
            return new IGVBEDCodec(genome, FeatureType.GAPPED_PEAK);
        } else if (fn.endsWith(".dgv")) {
            return new DGVCodec(genome);
        } else if (fn.contains("refflat")) {
            return new UCSCGeneTableCodec(UCSCGeneTableCodec.Type.REFFLAT, genome);
        } else if (fn.contains("genepred") || fn.contains("ensgene") || fn.contains("refgene")) {
            return new UCSCGeneTableCodec(UCSCGeneTableCodec.Type.GENEPRED, genome);
        } else if (fn.contains("ucscgene")) {
            return new UCSCGeneTableCodec(UCSCGeneTableCodec.Type.UCSCGENE, genome);
        } else if (fn.endsWith(".rmask") || fn.endsWith(".repmask")) {
            return new REPMaskCodec(genome);
        } else if (fn.endsWith(".gff3") || fn.endsWith(".gvf")) {
            return new GFFCodec(GFFCodec.Version.GFF3, genome);
        } else if (fn.endsWith(".gff") || fn.endsWith(".gtf")) {
            return new GFFCodec(genome);
            //} else if (fn.endsWith(".sam")) {
            //return new SAMCodec();
        } else if (fn.endsWith(".psl") || fn.endsWith(".pslx")) {
            return new PSLCodec(genome);
        } else if (MUTCodec.isMutationAnnotationFile(locator)) {
            return new MUTCodec(path, genome);
        } else if (fn.endsWith(".narrowpeak") || fn.endsWith(".broadpeak")) {
            return new EncodePeakCodec(genome);
        } else if (fn.endsWith(".peak")) {
            return new PeakCodec(genome);
        } else if (fn.endsWith(".snp") || fn.endsWith(".ucscsnp")) {
            return new UCSCSnpCodec(genome);
        } else if (fn.endsWith(".eqtl")) {
            return new EQTLCodec(genome);
        } else if (fn.endsWith("fpkm_tracking")) {
            return new FPKMTrackingCodec(path);
            //} else if (fn.endsWith("gene_exp.diff") || fn.endsWith("cds_exp.diff")) {
            //  return new ExpDiffCodec(path);
        } else if (fn.endsWith(".dsi")) {
            return new DSICodec(genome);
        } else {
            return null;
        }
    }

    /**
     * Return the appropriate VCFCodec based on the version tag.
     * <p>
     * e.g. ##fileformat=VCFv4.1
     * <p>
     * Only the first {@link #MAX_VCF_HEADER_LINES} lines are examined. A version starting with
     * "VCFv3" yields a {@link VCF3Codec}; any other version, a missing directive, or an
     * {@link IOException} while reading yields a {@link VCFCodec}.
     *
     * @param locator locator of the VCF file to inspect
     * @return a codec matching the declared VCF version; never null
     */
    private static AsciiFeatureCodec getVCFCodec(ResourceLocator locator) {

        String path = locator.getPath();

        BufferedReader reader = null;

        try {
            // If the file ends with ".gz" assume it is a tabix indexed file
            if (locator.getURLPath().toLowerCase(Locale.ROOT).endsWith(".gz")) {
                // NOTE: MUST USE THE PICARD VERSION OF ParsingUtils. The IGV version will return a gzip stream.
                reader = new BufferedReader(new InputStreamReader(new BlockCompressedInputStream(
                        htsjdk.tribble.util.ParsingUtils.openInputStream(path))));
            } else {
                reader = ParsingUtils.openBufferedReader(path);
            }

            // Look for fileformat directive. This should be the first line, but just in case check
            // the first MAX_VCF_HEADER_LINES. The count is tested before reading so that no line
            // beyond the limit is consumed.
            String formatLine;
            for (int lineCount = 0;
                 lineCount < MAX_VCF_HEADER_LINES && (formatLine = reader.readLine()) != null;
                 lineCount++) {

                String lowerLine = formatLine.toLowerCase(Locale.ROOT);
                if (lowerLine.startsWith("##fileformat") || lowerLine.startsWith("##format")) {
                    String[] tmp = lowerLine.split("=");
                    if (tmp.length > 1) {
                        // Tolerate whitespace around the version token.
                        String version = tmp[1].trim();
                        if (version.startsWith("vcfv3")) {
                            return new VCF3Codec();
                        } else {
                            return new VCFCodec();
                        }
                    }
                }
            }
        } catch (IOException e) {
            // Keep the best-effort fallback below, but do not lose the cause.
            log.error("Error checking VCF Version", e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // Nothing useful can be done here; the codec decision has already been made.
                    log.warn("Error closing reader after checking VCF version: " + path, e);
                }
            }
        }
        // Should never get here, but as a last resort assume this is a VCF 4.x file.
        return new VCFCodec();
    }

    /**
     * Return true if a file represented by "path" is indexable. This method is an optimization, we could just look
     * for the index but that is expensive to do for remote resources. All tribble indexable extensions should be
     * listed here.
     * <p>
     * A trailing ".gz" on the type string is ignored, and the comparison is case-insensitive,
     * matching {@link #getCodec(ResourceLocator, Genome)}.
     *
     * @param locator locator of the feature file
     * @param genome  genome passed through to {@link #getCodec(ResourceLocator, Genome)}
     * @return true if the type is vcf/bcf or {@code getCodec} returns a non-null codec
     */
    public static boolean hasCodec(ResourceLocator locator, Genome genome) {

        // Lower-case here as getCodec does, so "X.VCF.GZ" takes the cheap path below.
        String fn = locator.getTypeString().toLowerCase(Locale.ROOT);
        if (fn.endsWith(".gz")) {
            fn = fn.substring(0, fn.length() - 3);
        }

        // The vcf extension is for performance, it doesn't matter which codec is returned all vcf files
        // are indexable.
        return fn.endsWith(".vcf") || fn.endsWith(".bcf") || getCodec(locator, genome) != null;
    }
}