package com.compomics.util.experiment.biology.taxonomy; import com.compomics.util.Util; import com.compomics.util.experiment.biology.taxonomy.mappings.BiomartMapping; import com.compomics.util.experiment.biology.taxonomy.mappings.EnsemblGenomesSpecies; import com.compomics.util.experiment.biology.taxonomy.mappings.EnsemblGenomesSpecies.EnsemblGenomeDivision; import com.compomics.util.experiment.biology.taxonomy.mappings.EnsemblSpecies; import com.compomics.util.experiment.biology.taxonomy.mappings.UniprotTaxonomy; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; /** * Class related to the handling of species. * * @author Marc Vaudel * @author Harald Barsnes */ public class SpeciesFactory { /** * The instance of the factory. */ private static SpeciesFactory instance = null; /** * Tag for unknown species. */ public static final String UNKNOWN = "Unknown"; /** * The subfolder relative to the jar file where gene mapping files are * stored in tools. */ private final static String TOOL_SPECIES_MAPPING_SUBFOLDER = "resources/conf/taxonomy/"; /** * The name of the UniProt taxonomy file. */ public static final String UNIPROT_TAXONOMY_FILENAME = "uniprot_taxonomy"; /** * The names of the Ensembl species files. */ public static final String ENSEMBL_SPECIES = "ensembl_species"; /** * The names of the Ensembl genome species files. */ public static final String ENSEMBL_GENOME_SPECIES = "ensembl-genome_species"; /** * The name of the Ensembl BioMart datasets file. */ public static final String BIOMART_ENSEMBL_FILENAME = "ensembl_biomart"; /** * The name of the Ensembl Genome BioMart datasets file. */ public static final String BIOMART_ENSEMBL_GENOME_FILENAME = "ensembl-genome_biomart"; /** * The Ensembl species mapping. */ private EnsemblSpecies ensemblSpecies; /** * The Ensembl genome species mapping. */ private EnsemblGenomesSpecies ensemblGenomesSpecies; /** * The UniProt taxonomy. */ private UniprotTaxonomy uniprotTaxonomy; /** * The BioMart mapping. */ private BiomartMapping biomartMapping; /** * Static method returning the instance of the factory. * * @return the instance of the factory */ public static SpeciesFactory getInstance() { if (instance == null) { instance = new SpeciesFactory(); } return instance; } /** * Constructor. */ private SpeciesFactory() { } /** * Initiates the factory using the files of the static fields. * * @param jarFilePath path to the jar file * * @throws IOException Exception thrown whenever an error occurred while * reading a mapping file. */ public void initiate(String jarFilePath) throws IOException { ensemblSpecies = new EnsemblSpecies(); ensemblSpecies.loadMapping(getEnsemblSpeciesFile(jarFilePath)); ensemblGenomesSpecies = new EnsemblGenomesSpecies(); ensemblGenomesSpecies.loadMapping(getEnsemblGenomesSpeciesFile(jarFilePath)); uniprotTaxonomy = new UniprotTaxonomy(); uniprotTaxonomy.loadMapping(getUniprotTaxonomyFile(jarFilePath)); biomartMapping = new BiomartMapping(); biomartMapping.loadMapping(getBiomartEnsemblMappingFile(jarFilePath), getBiomartEnsemblGenomeMappingFile(jarFilePath)); } /** * Returns a listing of the species occurrence map provided. * * @param speciesOccurrence a map containing the occurrence of different * species * * @return a listing of the species occurrence map provided */ public static String getSpeciesDescription(HashMap<String, Integer> speciesOccurrence) { HashMap<Integer, ArrayList<String>> occurrenceToSpecies = new HashMap<Integer, ArrayList<String>>(speciesOccurrence.size()); double total = 0.0; for (String taxonomy : speciesOccurrence.keySet()) { Integer occurrence = speciesOccurrence.get(taxonomy); total += occurrence; ArrayList<String> species = occurrenceToSpecies.get(occurrence); if (species == null) { species = new ArrayList<String>(1); occurrenceToSpecies.put(occurrence, species); } species.add(taxonomy); } StringBuilder description = new StringBuilder(); ArrayList<Integer> occurrences = new ArrayList<Integer>(occurrenceToSpecies.keySet()); Collections.sort(occurrences, Collections.reverseOrder()); for (Integer occurrence : occurrences) { ArrayList<String> species = occurrenceToSpecies.get(occurrence); Collections.sort(species); for (String taxonomy : species) { double percentage = 100.0 * occurrence / total; if (description.length() > 0) { description.append(", "); } description.append(taxonomy); if (speciesOccurrence.size() > 1) { String occurrencePercentage; if (percentage > 99.9) { occurrencePercentage = ">99.9"; } else if (percentage < 0.1) { occurrencePercentage = "<0.1"; } else { double roundedDouble = Util.roundDouble(percentage, 1); occurrencePercentage = roundedDouble + ""; } description.append(" (").append(occurrence).append(", ").append(occurrencePercentage).append("%)"); } } } return description.toString(); } /** * Returns the Ensembl species file. * * @param jarFilePath the path to the jar file * * @return the Ensembl species file */ public static File getEnsemblSpeciesFile(String jarFilePath) { return new File(jarFilePath, TOOL_SPECIES_MAPPING_SUBFOLDER + ENSEMBL_SPECIES); } /** * Returns the Ensembl genome species file. * * @param jarFilePath the path to the jar file * * @return the Ensembl genome species file */ public static File getEnsemblGenomesSpeciesFile(String jarFilePath) { return new File(jarFilePath, TOOL_SPECIES_MAPPING_SUBFOLDER + ENSEMBL_GENOME_SPECIES); } /** * Returns the UniProt taxonomy file. * * @param jarFilePath the path to the jar file * * @return the UniProt taxonomy species file */ public static File getUniprotTaxonomyFile(String jarFilePath) { return new File(jarFilePath, TOOL_SPECIES_MAPPING_SUBFOLDER + UNIPROT_TAXONOMY_FILENAME); } /** * Returns the Ensembl BioMart file. * * @param jarFilePath the path to the jar file * * @return the Ensembl BioMart file */ public static File getBiomartEnsemblMappingFile(String jarFilePath) { return new File(jarFilePath, TOOL_SPECIES_MAPPING_SUBFOLDER + BIOMART_ENSEMBL_FILENAME); } /** * Returns the Ensembl Genome BioMart file. * * @param jarFilePath the path to the jar file * * @return the Ensembl Genome BioMart file */ public static File getBiomartEnsemblGenomeMappingFile(String jarFilePath) { return new File(jarFilePath, TOOL_SPECIES_MAPPING_SUBFOLDER + BIOMART_ENSEMBL_GENOME_FILENAME); } /** * Returns the Latin name of the species corresponding to the given taxon * according to the UniProt mapping. Null if not found. * * @param taxon the NCBI taxon ID * * @return the Latin name of the species */ public String getLatinName(Integer taxon) { return uniprotTaxonomy.getLatinName(taxon); } /** * Returns the name of the species corresponding to the given taxon * according to the UniProt mapping. Null if not found. For species mapping * to plants in the Ensembl genome mapping, the name is Latin name (common * name); common name (Latin Name) for the other species. If no common name * is present the Latin name is used. * * @param taxon the NCBI taxon ID * * @return the Latin name of the species */ public String getName(Integer taxon) { if (uniprotTaxonomy == null || uniprotTaxonomy.getLatinName(taxon) == null) { return null; } boolean plant = false; if (ensemblGenomesSpecies != null) { EnsemblGenomeDivision division = ensemblGenomesSpecies.getDivision(taxon); if (division != null && division == EnsemblGenomeDivision.plants) { plant = true; } } String latinName = uniprotTaxonomy.getLatinName(taxon); String commonName = uniprotTaxonomy.getCommonName(taxon); StringBuilder name = new StringBuilder(); if (plant) { name.append(latinName); if (commonName != null) { name.append(" (").append(commonName).append(")"); } } else { if (commonName != null) { name.append(commonName).append(" ("); } name.append(latinName); if (commonName != null) { name.append(")"); } } return name.toString(); } /** * Returns the Ensembl assembly to use for the given taxon. * * @param taxon the taxon number * * @return the Ensembl assembly to use */ public String getEnsemblAssembly(Integer taxon) { EnsemblGenomeDivision ensemblGenomeDivision = ensemblGenomesSpecies.getDivision(taxon); if (ensemblGenomeDivision == null) { return ensemblSpecies.getAssembly(taxon); } else { return ensemblGenomesSpecies.getAssembly(taxon); } } /** * Returns the Ensembl dataset to use for the given taxon. * * @param taxon the taxon number * * @return the Ensembl dataset to use */ public String getEnsemblDataset(Integer taxon) { String assembly = getEnsemblAssembly(taxon); if (assembly == null) { return null; } return biomartMapping.getDataset(assembly); } /** * Returns the Ensembl species mapping. * * @return the Ensembl species mapping */ public EnsemblSpecies getEnsemblSpecies() { return ensemblSpecies; } /** * Returns the Ensembl genome species mapping. * * @return the Ensembl genome species mapping */ public EnsemblGenomesSpecies getEnsemblGenomesSpecies() { return ensemblGenomesSpecies; } /** * Returns the UniProt taxonomy mapping. * * @return the UniProt taxonomy mapping */ public UniprotTaxonomy getUniprotTaxonomy() { return uniprotTaxonomy; } /** * Returns the BioMart mapping. * * @return the BioMart mapping */ public BiomartMapping getBiomartMapping() { return biomartMapping; } /** * Returns a map of the species in Ensembl. * * @return a map of the species in Ensembl */ public HashMap<String, HashSet<Integer>> getEnsembleSpecies() { HashMap<String, HashSet<Integer>> speciesMap = new HashMap<String, HashSet<Integer>>(EnsemblGenomeDivision.values().length + 1); for (Integer taxon : ensemblGenomesSpecies.getTaxons()) { String divisionName = ensemblGenomesSpecies.getDivision(taxon).ensemblType; HashSet<Integer> taxons = speciesMap.get(divisionName); if (taxons == null) { taxons = new HashSet<Integer>(); speciesMap.put(divisionName, taxons); } taxons.add(taxon); } speciesMap.put("vertebrates", ensemblSpecies.getTaxons()); return speciesMap; } }