/*
* The MIT License (MIT)
*
* Copyright (c) 2007-2015 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broad.igv.feature.tribble;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.FeatureCodec;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.vcf.VCF3Codec;
import htsjdk.variant.vcf.VCFCodec;
import org.apache.log4j.Logger;
import org.broad.igv.data.cufflinks.FPKMTrackingCodec;
import org.broad.igv.feature.FeatureType;
import org.broad.igv.feature.dsi.DSICodec;
import org.broad.igv.feature.dsi.DSIFeature;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.gwas.EQTLCodec;
import org.broad.igv.peaks.PeakCodec;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
/**
* A factory class for Tribble codecs. Provides static methods that return a codec given a
* path or resource locator for a feature file (bed, gff, vcf, etc).
*/
public class CodecFactory {

    private static final Logger log = Logger.getLogger(CodecFactory.class);

    /**
     * Maximum number of leading lines of a VCF file inspected for the fileformat directive.
     */
    private static final int MAX_VCF_HEADER_LINES = 20;

    /**
     * File extensions (without the leading dot) listed by this factory as valid feature-file extensions.
     * The list is left mutable because it is a public field that existing callers may modify.
     */
    public static final List<String> validExtensions = new ArrayList<String>(Arrays.asList(
            "vcf4", "vcf", "bed", "refflat", "genepred", "ensgene", "refgene", "ucscgene",
            "repmask", "gff3", "gvf", "gff", "gtf", "psl", "mut", "maf"));

    /**
     * Convenience overload that wraps {@code path} in a {@link ResourceLocator} and delegates to
     * {@link #getCodec(org.broad.igv.util.ResourceLocator, org.broad.igv.feature.genome.Genome)}.
     *
     * @param path   path (file or URL) of the feature file
     * @param genome genome handed to the codec that is created
     * @return the matching codec, or null if none matches
     * @deprecated Use {@link #getCodec(org.broad.igv.util.ResourceLocator, org.broad.igv.feature.genome.Genome)}
     *             This won't handle URLs with query strings properly for all codecs
     */
    @Deprecated
    public static FeatureCodec getCodec(String path, Genome genome) {
        return getCodec(new ResourceLocator(path), genome);
    }

    /**
     * Return a tribble codec to decode the supplied file, or null if not found.
     * <p/>
     * The codec is selected from the locator's type string, compared case-insensitively. The checks are
     * evaluated in order and the first match wins.
     *
     * @param locator the ResourceLocator (file or URL) to the feature file
     * @param genome  genome handed to the codec that is created
     * @return the matching codec, or null if no rule matches
     */
    public static FeatureCodec getCodec(ResourceLocator locator, Genome genome) {
        String path = locator.getPath();
        // Locale.ROOT keeps the comparison stable in locales with special casing rules (e.g. Turkish 'I').
        String fn = locator.getTypeString().toLowerCase(Locale.ROOT);

        if (fn.endsWith(".vcf3")) {
            return new VCFWrapperCodec(new VCF3Codec(), genome);
        } else if (fn.endsWith(".vcf4")) {
            return new VCFWrapperCodec(new VCFCodec(), genome);
        } else if (fn.endsWith(".vcf")) {
            // Plain ".vcf" does not reveal the version, so the file header is inspected.
            return new VCFWrapperCodec(getVCFCodec(locator), genome);
        } else if (fn.endsWith(".bcf")) {
            return new BCF2WrapperCodec(new BCF2Codec(), genome);
        } else if (fn.endsWith(".bed")) {
            final IGVBEDCodec codec = new IGVBEDCodec(genome);
            if (fn.endsWith("junctions.bed")) {
                codec.setFeatureType(FeatureType.SPLICE_JUNCTION);
            }
            return codec;
        } else if (fn.endsWith(".gappedpeak")) {
            return new IGVBEDCodec(genome, FeatureType.GAPPED_PEAK);
        } else if (fn.endsWith(".dgv")) {
            return new DGVCodec(genome);
        } else if (fn.contains("refflat")) {
            return new UCSCGeneTableCodec(UCSCGeneTableCodec.Type.REFFLAT, genome);
        } else if (fn.contains("genepred") || fn.contains("ensgene") || fn.contains("refgene")) {
            return new UCSCGeneTableCodec(UCSCGeneTableCodec.Type.GENEPRED, genome);
        } else if (fn.contains("ucscgene")) {
            return new UCSCGeneTableCodec(UCSCGeneTableCodec.Type.UCSCGENE, genome);
        } else if (fn.endsWith(".rmask") || fn.endsWith(".repmask")) {
            return new REPMaskCodec(genome);
        } else if (fn.endsWith(".gff3") || fn.endsWith(".gvf")) {
            return new GFFCodec(GFFCodec.Version.GFF3, genome);
        } else if (fn.endsWith(".gff") || fn.endsWith(".gtf")) {
            return new GFFCodec(genome);
        } else if (fn.endsWith(".psl") || fn.endsWith(".pslx")) {
            return new PSLCodec(genome);
        } else if (MUTCodec.isMutationAnnotationFile(locator)) {
            return new MUTCodec(path, genome);
        } else if (fn.endsWith(".narrowpeak") || fn.endsWith(".broadpeak")) {
            return new EncodePeakCodec(genome);
        } else if (fn.endsWith(".peak")) {
            return new PeakCodec(genome);
        } else if (fn.endsWith(".snp") || fn.endsWith(".ucscsnp")) {
            return new UCSCSnpCodec(genome);
        } else if (fn.endsWith(".eqtl")) {
            return new EQTLCodec(genome);
        } else if (fn.endsWith("fpkm_tracking")) {
            return new FPKMTrackingCodec(path);
        } else if (fn.endsWith(".dsi")) {
            return new DSICodec(genome);
        } else {
            return null;
        }
    }

    /**
     * Return the appropriate VCFCodec based on the version tag.
     * <p/>
     * e.g. ##fileformat=VCFv4.1
     * <p/>
     * At most the first {@value #MAX_VCF_HEADER_LINES} lines are examined. If no version directive is
     * found, or the file cannot be read, a {@link VCFCodec} (VCF 4.x) is returned.
     *
     * @param locator locator of the VCF file whose header is inspected
     * @return a {@link VCF3Codec} when the declared version starts with "vcfv3", otherwise a {@link VCFCodec}
     */
    private static AsciiFeatureCodec getVCFCodec(ResourceLocator locator) {
        String path = locator.getPath();
        BufferedReader reader = null;
        try {
            // If the file ends with ".gz" assume it is a tabix indexed file
            if (locator.getURLPath().toLowerCase(Locale.ROOT).endsWith(".gz")) {
                // NOTE: MUST USE THE PICARD VERSION OF ParsingUtils. The IGV version will return a gzip stream.
                reader = new BufferedReader(new InputStreamReader(new BlockCompressedInputStream(
                        htsjdk.tribble.util.ParsingUtils.openInputStream(path))));
            } else {
                reader = ParsingUtils.openBufferedReader(path);
            }

            // Look for fileformat directive. This should be the first line, but just in case check the first 20.
            // The limit is tested before reading so no line is fetched that would not be examined.
            for (int lineCount = 0; lineCount < MAX_VCF_HEADER_LINES; lineCount++) {
                String formatLine = reader.readLine();
                if (formatLine == null) {
                    break;
                }
                String lowered = formatLine.toLowerCase(Locale.ROOT);
                if (lowered.startsWith("##fileformat") || lowered.startsWith("##format")) {
                    String[] tmp = formatLine.split("=");
                    if (tmp.length > 1) {
                        String version = tmp[1].toLowerCase(Locale.ROOT);
                        return version.startsWith("vcfv3") ? new VCF3Codec() : new VCFCodec();
                    }
                }
            }
        } catch (IOException e) {
            // Pass the exception along so the cause and stack trace end up in the log.
            log.error("Error checking VCF Version", e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // Failing to close a read-only stream does not affect the result; record it and move on.
                    log.warn("Error closing reader after checking VCF version", e);
                }
            }
        }
        // Should never get here, but as a last resort assume this is a VCF 4.x file.
        return new VCFCodec();
    }

    /**
     * Return true if a file represented by "path" is indexable. This method is an optimization, we could just look
     * for the index but that is expensive to do for remote resources. All tribble indexable extensions should be
     * listed here.
     * <p/>
     * The type string is compared case-insensitively, consistent with
     * {@link #getCodec(org.broad.igv.util.ResourceLocator, org.broad.igv.feature.genome.Genome)}, and a trailing
     * ".gz" is ignored for the vcf/bcf shortcut.
     *
     * @param locator locator of the feature file
     * @param genome  genome passed on to {@code getCodec} when the shortcut does not apply
     * @return true if the type string ends with ".vcf" or ".bcf" (optionally followed by ".gz"), or if
     *         {@code getCodec} returns a non-null codec
     */
    public static boolean hasCodec(ResourceLocator locator, Genome genome) {
        String fn = locator.getTypeString().toLowerCase(Locale.ROOT);
        if (fn.endsWith(".gz")) {
            fn = fn.substring(0, fn.length() - 3);
        }
        // The vcf extension is for performance, it doesn't matter which codec is returned all vcf files
        // are indexable.
        return fn.endsWith(".vcf") || fn.endsWith(".bcf") || getCodec(locator, genome) != null;
    }
}