package com.compomics.util.experiment.biology.genes; import com.compomics.util.Util; import com.compomics.util.experiment.biology.genes.ensembl.EnsemblVersion; import com.compomics.util.experiment.biology.genes.ensembl.GeneMapping; import com.compomics.util.experiment.biology.genes.go.GoMapping; import com.compomics.util.experiment.biology.taxonomy.SpeciesFactory; import com.compomics.util.experiment.biology.taxonomy.mappings.EnsemblGenomesSpecies.EnsemblGenomeDivision; import com.compomics.util.experiment.identification.protein_sequences.FastaIndex; import com.compomics.util.experiment.identification.protein_sequences.SequenceFactory; import com.compomics.util.gui.waiting.waitinghandlers.ProgressDialogX; import com.compomics.util.preferences.GenePreferences; import com.compomics.util.protein.Header; import com.compomics.util.waiting.WaitingHandler; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.HashMap; import java.util.HashSet; /** * Class used to map proteins to gene information. * * @author Marc Vaudel * @author Harald Barsnes */ public class GeneFactory { /** * The instance of the factory. */ private static GeneFactory instance = null; /** * The separator used to separate line contents. */ public final static String SEPARATOR = "\t"; /** * The folder where gene mapping files are stored. */ private static String GENE_MAPPING_FOLDER = System.getProperty("user.home") + "/.compomics/gene_mappings/"; /** * The subfolder relative to the jar file where gene mapping files are * stored in tools. */ private final static String TOOL_GENE_MAPPING_SUBFOLDER = "resources/conf/gene_mappings/"; /** * The name of the Ensembl versions file. */ private static final String ENSEMBL_VERSIONS = "ensembl_versions"; /** * The name of the GO domains file. */ private static final String GO_DOMAINS = "go_domains"; /** * The suffix to use for files containing gene mappings. */ public final static String GENE_MAPPING_FILE_SUFFIX = "_gene_mappings"; /** * The suffix to use for files containing GO mappings. */ public final static String GO_MAPPING_FILE_SUFFIX = "_go_mappings"; /** * The Ensembl versions for each species. */ private HashMap<String, String> ensemblVersionsMap; /** * The horizontal padding to use when printing to the waiting dialog. */ private final String PADDING = " "; /** * Static method returning the instance of the factory. * * @return the instance of the factory */ public static GeneFactory getInstance() { if (instance == null) { instance = new GeneFactory(); } return instance; } /** * Constructor. */ private GeneFactory() { } /** * Initializes the factory. Note: the species factory must be initialized * first. * * @param jarFilePath the path to the jar file * * @throws java.io.IOException Exception thrown if an error occurs while * reading the species mapping */ public void initialize(String jarFilePath) throws IOException { // load the previous ensembl version numbers File ensemblVersionsFile = getEnsemblVersionsFile(); if (ensemblVersionsFile.exists()) { loadEnsemblSpeciesVersions(ensemblVersionsFile); } else { ensemblVersionsMap = new HashMap<String, String>(); } createDefaultGeneMappingFiles( new File(jarFilePath, TOOL_GENE_MAPPING_SUBFOLDER + ENSEMBL_VERSIONS), new File(jarFilePath, TOOL_GENE_MAPPING_SUBFOLDER + GO_DOMAINS), new File(jarFilePath, TOOL_GENE_MAPPING_SUBFOLDER + "hsapiens_gene_ensembl_go_mappings"), new File(jarFilePath, TOOL_GENE_MAPPING_SUBFOLDER + "hsapiens_gene_ensembl_gene_mappings"), true); } /** * Returns the gene maps for the FASTA file loaded in the factory. * * @param genePreferences the gene preferences * @param waitingHandler waiting handler displaying progress for the * download and allowing canceling of the progress. * * @return the gene maps for the FASTA file loaded in the factory * * @throws java.io.IOException thrown whenever an error occurs while reading * or writing data. * @throws java.lang.InterruptedException exception thrown whenever a threading issue occurs. */ public GeneMaps getGeneMaps(GenePreferences genePreferences, WaitingHandler waitingHandler) throws IOException, InterruptedException { SpeciesFactory speciesFactory = SpeciesFactory.getInstance(); SequenceFactory sequenceFactory = SequenceFactory.getInstance(); FastaIndex fastaIndex = sequenceFactory.getCurrentFastaIndex(); HashMap<String, Integer> speciesOccurrence = fastaIndex.getSpecies(); HashMap<String, GeneMapping> geneMappings = new HashMap<String, GeneMapping>(speciesOccurrence.size()); HashMap<String, GoMapping> goMappings = new HashMap<String, GoMapping>(speciesOccurrence.size()); // download/update species mapping, put them in maps per species for (String uniprotTaxonomy : speciesOccurrence.keySet()) { if (!uniprotTaxonomy.equals(SpeciesFactory.UNKNOWN)) { try { Integer taxon = speciesFactory.getUniprotTaxonomy().getId(uniprotTaxonomy, true); if (taxon != null) { String speciesName = speciesFactory.getName(taxon); String ensemblDatasetName = speciesFactory.getEnsemblDataset(taxon); if (ensemblDatasetName != null) { File geneMappingFile = getGeneMappingFile(ensemblDatasetName); File goMappingFile = getGoMappingFile(ensemblDatasetName); if (genePreferences.getAutoUpdate()) { boolean success = true; try { if (!geneMappingFile.exists() || !goMappingFile.exists() || newVersionExists(taxon)) { success = downloadMappings(waitingHandler, taxon); } if (waitingHandler != null && waitingHandler.isRunCanceled()) { return null; } } catch (Exception e) { e.printStackTrace(); success = false; } if (!success) { waitingHandler.appendReport(PADDING + "Update of gene information for species " + speciesName + " failed. A previous version will be used if available.", true, true); } } if (geneMappingFile.exists()) { GeneMapping geneMapping = new GeneMapping(); try { geneMapping.importFromFile(geneMappingFile, waitingHandler); geneMappings.put(speciesName, geneMapping); } catch (Exception e) { waitingHandler.appendReport(PADDING + "Import of the gene mapping for " + speciesName + " failed. Gene information for this species will not be available.", true, true); } } else { waitingHandler.appendReport(PADDING + "Gene mapping for " + speciesName + " not available. Gene information for this species will not be available.", true, true); } if (goMappingFile.exists()) { GoMapping goMapping = new GoMapping(); try { goMapping.loadMappingsFromFile(goMappingFile, waitingHandler); goMappings.put(speciesName, goMapping); } catch (Exception e) { waitingHandler.appendReport(PADDING + "Import of the GO mapping for " + speciesName + " failed. GO annotation for this species will not be available.", true, true); } } else { waitingHandler.appendReport(PADDING + "GO mapping for " + speciesName + " not available. GO annotation for this species will not be available.", true, true); } } else { waitingHandler.appendReport(PADDING + speciesName + " not available in Ensembl. Gene and GO annotation for this species will not be available.", true, true); } } } catch (Exception e) { waitingHandler.appendReport(PADDING + "No taxonomy found for " + uniprotTaxonomy + ". Gene annotation for this species will not be available.", true, true); } } } // get the mappings for the proteins in the sequence factory GeneMaps geneMaps = new GeneMaps(); if (ensemblVersionsMap == null) { ensemblVersionsMap = new HashMap<String, String>(); } HashMap<String, String> ensemblVersionsUsed = new HashMap<String, String>(ensemblVersionsMap); HashMap<String, String> geneNameToEnsemblIdMap = new HashMap<String, String>(); HashMap<String, String> geneNameToChromosomeMap = new HashMap<String, String>(); HashMap<String, HashSet<String>> proteinToGoMap = new HashMap<String, HashSet<String>>(); HashMap<String, HashSet<String>> goToProteinMap = new HashMap<String, HashSet<String>>(); HashMap<String, String> goNamesMap = new HashMap<String, String>(); SequenceFactory.HeaderIterator it = sequenceFactory.getHeaderIterator(true); while (it.hasNext()) { Header header = it.getNext(); String uniprotTaxonomy = header.getTaxonomy(); if (uniprotTaxonomy != null && !uniprotTaxonomy.equals("")) { try { Integer taxon = speciesFactory.getUniprotTaxonomy().getId(uniprotTaxonomy, false); if (taxon != null) { String speciesName = speciesFactory.getName(taxon); String geneName = header.getGeneName(); if (geneName != null) { GeneMapping geneMapping = geneMappings.get(speciesName); if (geneMapping != null) { String chromosome = geneMapping.getChromosome(geneName); if (chromosome != null) { geneNameToChromosomeMap.put(geneName, chromosome); } String ensemblId = geneMapping.getEnsemblAccession(geneName); if (ensemblId != null) { geneNameToEnsemblIdMap.put(geneName, ensemblId); } } } GoMapping goMapping = goMappings.get(speciesName); if (goMapping != null) { String accession = header.getAccession(); HashSet<String> goTerms = proteinToGoMap.get(accession); if (goTerms == null) { goTerms = new HashSet<String>(); proteinToGoMap.put(accession, goTerms); } HashSet<String> newTerms = goMapping.getGoAccessions(accession); if (newTerms != null) { goTerms.addAll(newTerms); for (String goTerm : newTerms) { String goName = goMapping.getTermName(goTerm); if (goName != null) { goNamesMap.put(goTerm, goName); } HashSet<String> proteins = goToProteinMap.get(goTerm); if (proteins == null) { proteins = new HashSet<String>(); goToProteinMap.put(goTerm, proteins); } proteins.add(accession); } } } } } catch (Exception e) { // Taxon not available, ignore e.printStackTrace(); } } } geneMaps.setEnsemblVersionsMap(ensemblVersionsUsed); geneMaps.setGeneNameToEnsemblIdMap(geneNameToEnsemblIdMap); geneMaps.setGeneNameToChromosomeMap(geneNameToChromosomeMap); geneMaps.setProteinToGoMap(proteinToGoMap); geneMaps.setGoAccessionToProteinMap(goToProteinMap); geneMaps.setGoNamesMap(goNamesMap); return geneMaps; } /** * Download the gene sequences mappings. * * @param destinationFile the destination file where to save the gene * sequences * @param ensemblType the Ensembl type, e.g., default or plants * @param ensemblSchemaName the Ensembl schema name, e.g., default or * plants_mart_18 * @param ensemblDbName the Ensembl DB name of the selected species * @param waitingHandler waiting handler displaying progress and allowing * canceling the process * * @return true if downloading went OK * * @throws MalformedURLException if an MalformedURLException occurs * @throws IOException if an IOException occurs */ public boolean downloadGeneSequences(File destinationFile, String ensemblType, String ensemblSchemaName, String ensemblDbName, WaitingHandler waitingHandler) throws MalformedURLException, IOException { // construct data String requestXml = "query=<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<!DOCTYPE Query>" + "<Query virtualSchemaName = \"" + ensemblSchemaName + "\" formatter = \"FASTA\" header = \"0\" uniqueRows = \"1\" count = \"\" datasetConfigVersion = \"0.7\" >" + "<Dataset name = \"" + ensemblDbName + "\" interface = \"default\" >" + "<Attribute name = \"ensembl_gene_id\" />\n" + "<Attribute name = \"coding\" />" + "</Dataset>\n" + "</Query>" + "</Query>"; String waitingText = "Downloading gene sequences. Please Wait..."; return queryEnsembl(requestXml, waitingText, destinationFile, ensemblType, waitingHandler); } /** * Download the GO mappings. * * @param ensemblType the Ensembl type, e.g., default or plants * @param ensemblSchemaName the Ensembl schema name, e.g., default or * plants_mart_18 * @param ensemblDbName the Ensembl db name of the selected species * @param swissProtMapping if true, use the uniprot_swissprot_accession * parameter, if false use the uniprot_sptrembl parameter * @param waitingHandler waiting handler displaying progress and allowing * canceling the process * * @return true if downloading went OK * * @throws MalformedURLException if an MalformedURLException occurs * @throws IOException if an IOException occurs */ public boolean downloadGoMappings(String ensemblType, String ensemblSchemaName, String ensemblDbName, boolean swissProtMapping, WaitingHandler waitingHandler) throws MalformedURLException, IOException { String accessionMapping; if (swissProtMapping) { if (ensemblType.equalsIgnoreCase("ensembl")) { accessionMapping = "\"uniprot_swissprot\""; } else { accessionMapping = "\"uniprot_swissprot_accession\""; } } else { accessionMapping = "\"uniprot_sptrembl\""; } // construct data String requestXml = "query=<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<!DOCTYPE Query>" + "<Query virtualSchemaName = \"" + ensemblSchemaName + "\" formatter = \"TSV\" header = \"0\" uniqueRows = \"1\" count = \"\" datasetConfigVersion = \"0.7\" >" + "<Dataset name = \"" + ensemblDbName + "\" interface = \"default\" >" + "<Attribute name = " + accessionMapping + " />"; if (ensemblType.equalsIgnoreCase("ensembl")) { requestXml += "<Attribute name = \"goslim_goa_accession\" />" + "<Attribute name = \"goslim_goa_description\" />"; } else { requestXml += "<Attribute name = \"go_accession\" />" + "<Attribute name = \"go_name_1006\" />"; } requestXml += "</Dataset>" + "</Query>"; // @TODO: have to check if goslim_goa_accession and goslim_goa_description is available File tempFile = getGoMappingFile(ensemblDbName); String waitingText = "Downloading GO Mappings. Please Wait..."; return queryEnsembl(requestXml, waitingText, tempFile, ensemblType, waitingHandler); } /** * Sends an XML query to Ensembl and writes the result in a text file. * * @param requestXml the XML request * @param destinationFile the file where to save the results * @param ensemblType the Ensembl type, e.g., default or plants * * @return true if downloading went OK * * @throws MalformedURLException if an MalformedURLException occurs * @throws IOException if an IOException occurs */ public boolean queryEnsembl(String requestXml, File destinationFile, String ensemblType) throws MalformedURLException, IOException { return queryEnsembl(requestXml, destinationFile, ensemblType, null); } /** * Sends an XML query to Ensembl and writes the result in a text file. * * @param requestXml the XML request * @param destinationFile the file where to save the results * @param ensemblType the Ensembl type, e.g., default or plants * @param waitingHandler waiting handler displaying progress and allowing * canceling the process * * @return true if downloading went OK * * @throws MalformedURLException if an MalformedURLException occurs * @throws IOException if an IOException occurs */ public boolean queryEnsembl(String requestXml, File destinationFile, String ensemblType, WaitingHandler waitingHandler) throws MalformedURLException, IOException { return queryEnsembl(requestXml, null, destinationFile, ensemblType, waitingHandler); } /** * Sends an XML query to Ensembl and writes the result in a text file. * * @param requestXml the XML request * @param destinationFile the file where to save the results * @param ensemblType the Ensembl type, e.g., default or plants * @param waitingHandler waiting handler displaying progress and allowing * canceling the process * @param waitingText the text to write in case a progress dialog is used * * @return true if downloading went OK * * @throws MalformedURLException if an MalformedURLException occurs * @throws IOException if an IOException occurs */ public boolean queryEnsembl(String requestXml, String waitingText, File destinationFile, String ensemblType, WaitingHandler waitingHandler) throws MalformedURLException, IOException { if (waitingHandler != null && waitingHandler instanceof ProgressDialogX && waitingText == null) { waitingText = "Downloading from Ensembl. Please wait..."; } boolean success = true; int lastThousand = 0; if (waitingHandler == null || !waitingHandler.isRunCanceled()) { // send data URL url = getEnsemblUrl(ensemblType); URLConnection conn = url.openConnection(); conn.setDoOutput(true); OutputStreamWriter wr = new OutputStreamWriter(conn.getOutputStream()); String lineBreak = System.getProperty("line.separator"); try { wr.write(requestXml); wr.flush(); if (waitingHandler == null || !waitingHandler.isRunCanceled()) { if (waitingHandler != null) { waitingHandler.setWaitingText(waitingText); } else { System.out.println(waitingText); } int counter = 0; boolean fileCreated = destinationFile.createNewFile(); if (fileCreated || destinationFile.exists()) { // get the response BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream())); try { FileWriter w = new FileWriter(destinationFile); try { BufferedWriter bw = new BufferedWriter(w); try { String rowLine = br.readLine(); if (rowLine != null && rowLine.startsWith("Query ERROR")) { if (rowLine.lastIndexOf("Attribute goslim_goa_accession NOT FOUND") != -1) { success = false; } else if (rowLine.lastIndexOf("Attribute uniprot_swissprot_accession NOT FOUND") != -1) { success = false; } else { throw new IllegalArgumentException("Query error: " + rowLine); } } else { while (rowLine != null && success) { if (waitingHandler != null) { if (waitingHandler.isRunCanceled()) { break; } if (waitingHandler instanceof ProgressDialogX) { waitingHandler.setWaitingText(waitingText + " (" + counter++ + " rows downloaded)"); } } else { int thousand = ++counter / 10000; if (thousand > lastThousand) { System.out.println(waitingText + " (" + counter + " rows downloaded)"); lastThousand = thousand; } } bw.write(rowLine + lineBreak); rowLine = br.readLine(); } } } finally { bw.close(); } } finally { w.close(); } } finally { br.close(); } } else { if (waitingHandler != null) { waitingHandler.setRunCanceled(); } throw new IllegalArgumentException("The mapping file could not be created."); } } } finally { wr.close(); } } return success; } /** * Download the gene mappings. * * @param ensemblType the Ensembl type, e.g., default or plants * @param ensemblSchemaName the Ensembl schema name, e.g., default or * plants_mart_18 * @param ensemblDatasetName the Ensembl dataset name of the selected * species * @param ensemblVersion the Ensembl version * @param waitingHandler the waiting handler * * @throws MalformedURLException if an MalformedURLException occurs * @throws IOException if an IOException occurs * @throws IllegalArgumentException if an IllegalArgumentException occurs */ public void downloadGeneMappings(String ensemblType, String ensemblSchemaName, String ensemblDatasetName, String ensemblVersion, WaitingHandler waitingHandler) throws MalformedURLException, IOException, IllegalArgumentException { // fix needed to support both default and custom ensembl species String externalReference; if (ensemblSchemaName.equalsIgnoreCase("default")) { externalReference = "<Attribute name = \"external_gene_name\" />"; } else { externalReference = "<Attribute name = \"external_gene_id\" />"; } // construct data String requestXml = "query=<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<!DOCTYPE Query>" + "<Query virtualSchemaName = \"" + ensemblSchemaName + "\" formatter = \"TSV\" header = \"0\" uniqueRows = \"1\" count = \"\" datasetConfigVersion = \"0.7\" >" + "<Dataset name = \"" + ensemblDatasetName + "\" interface = \"default\" >" + "<Attribute name = \"ensembl_gene_id\" />" + externalReference + "<Attribute name = \"chromosome_name\" />" + "</Dataset>" + "</Query>"; // @TODO: use the queryEnsembl method here as well? if (!waitingHandler.isRunCanceled()) { // send data URL url = getEnsemblUrl(ensemblType); URLConnection conn = url.openConnection(); conn.setDoOutput(true); OutputStreamWriter wr = new OutputStreamWriter(conn.getOutputStream()); String lineBreak = System.getProperty("line.separator"); try { wr.write(requestXml); wr.flush(); if (!waitingHandler.isRunCanceled()) { waitingHandler.setWaitingText("Downloading Gene Mappings. Please Wait..."); int counter = 0; File tempFile = getGeneMappingFile(ensemblDatasetName); // get the response BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream())); try { FileWriter w = new FileWriter(tempFile); try { BufferedWriter bw = new BufferedWriter(w); try { String rowLine = br.readLine(); if (rowLine != null && rowLine.startsWith("Query ERROR")) { throw new IllegalArgumentException("Query error on line: " + rowLine); } else { while (rowLine != null && !waitingHandler.isRunCanceled()) { if (waitingHandler instanceof ProgressDialogX) { waitingHandler.setWaitingText("Downloading Gene Mappings. Please Wait... (" + counter++ + " rows downloaded)"); } bw.write(rowLine + lineBreak); rowLine = br.readLine(); } } } finally { bw.close(); } } finally { w.close(); } } finally { br.close(); } if (!waitingHandler.isRunCanceled()) { updateEnsemblVersion(ensemblDatasetName, "Ensembl " + ensemblVersion); } } } finally { wr.close(); } } } /** * Returns the path to the folder containing the gene mapping files. * * @return the gene mapping folder */ public static File getGeneMappingFolder() { return new File(GENE_MAPPING_FOLDER); } /** * Sets the folder where gene mappings are saved. * * @param geneMappingFolder the folder where gene mappings are saved */ public static void setGeneMappingFolder(String geneMappingFolder) { GENE_MAPPING_FOLDER = geneMappingFolder; } /** * Copies the given gene mapping files to the gene mappings folder. If newer * versions of the mapping exists they will be overwritten according to * updateEqualVersion. * * @param aEnsemblVersionsFile the Ensembl versions file * @param aGoDomainsFile the GO domains file * @param aDefaultSpeciesGoMappingsFile the default species GO mappings file * @param aDefaultSpeciesGeneMappingFile the default species gene mappings * file * @param updateEqualVersion if true, the version is updated with equal * version numbers, false, only update if the new version is newer */ public void createDefaultGeneMappingFiles(File aEnsemblVersionsFile, File aGoDomainsFile, File aDefaultSpeciesGoMappingsFile, File aDefaultSpeciesGeneMappingFile, boolean updateEqualVersion) { if (!getGeneMappingFolder().exists()) { boolean folderCreated = getGeneMappingFolder().mkdirs(); if (!folderCreated) { throw new IllegalArgumentException("Could not create the gene mapping folder."); } } File ensemblVersionsFile = getEnsemblVersionsFile(); File goDomainsFile = getGoDomainsFile(); File defaultSpeciesGoMappingsFile = new File(getGeneMappingFolder(), aDefaultSpeciesGoMappingsFile.getName()); File defaultSpeciesGeneMappingFile = new File(getGeneMappingFolder(), aDefaultSpeciesGeneMappingFile.getName()); boolean updateHumanEnsembl = false; try { if (!ensemblVersionsFile.exists()) { updateHumanEnsembl = true; boolean fileCreated = ensemblVersionsFile.createNewFile(); if (!fileCreated) { throw new IllegalArgumentException("Could not create the Ensembl versions file."); } Util.copyFile(aEnsemblVersionsFile, ensemblVersionsFile); } else { // file exists, just update the human ensembl version // read the "new" human Ensembl versions number Integer humanEnsemblVersionNew = getEnsemblVersionFromFile(aEnsemblVersionsFile, "hsapiens_gene_ensembl"); if (humanEnsemblVersionNew != null) { Integer humanEnsemblVersionOld = getEnsemblVersionFromFile(ensemblVersionsFile, "hsapiens_gene_ensembl"); if (humanEnsemblVersionOld == null || humanEnsemblVersionOld.equals(humanEnsemblVersionNew) && updateEqualVersion || humanEnsemblVersionOld < humanEnsemblVersionNew) { updateHumanEnsembl = true; updateEnsemblVersion("hsapiens_gene_ensembl", "Ensembl " + humanEnsemblVersionNew); } } } } catch (IOException e) { e.printStackTrace(); throw new IllegalArgumentException("Could not create or update the Ensembl versions file."); } try { if (!goDomainsFile.exists()) { boolean fileCreated = goDomainsFile.createNewFile(); if (!fileCreated) { throw new IllegalArgumentException("Could not create the GO domains file."); } } Util.copyFile(aGoDomainsFile, goDomainsFile); } catch (IOException e) { e.printStackTrace(); throw new IllegalArgumentException("Could not create the GO domains file."); } if (updateHumanEnsembl) { try { if (!defaultSpeciesGoMappingsFile.exists()) { boolean fileCreated = defaultSpeciesGoMappingsFile.createNewFile(); if (!fileCreated) { throw new IllegalArgumentException("Could not create the default species GO mapping file."); } } Util.copyFile(aDefaultSpeciesGoMappingsFile, defaultSpeciesGoMappingsFile); } catch (IOException e) { e.printStackTrace(); throw new IllegalArgumentException("Could not create the default species GO mapping file."); } try { if (!defaultSpeciesGeneMappingFile.exists()) { boolean fileCreated = defaultSpeciesGeneMappingFile.createNewFile(); if (!fileCreated) { throw new IllegalArgumentException("Could not create the default species gene mapping file."); } } Util.copyFile(aDefaultSpeciesGeneMappingFile, defaultSpeciesGeneMappingFile); } catch (IOException e) { e.printStackTrace(); throw new IllegalArgumentException("Could not create the default species gene mapping file."); } } } /** * Update the Ensembl version for the given species in the local map and in * the Ensembl versions file. * * @param ensemblDatasetName the dataset name of the species to update, * e.g., hsapiens_gene_ensembl * @param ensemblVersion the new Ensembl version * * @throws IOException if an IOException occurs */ public void updateEnsemblVersion(String ensemblDatasetName, String ensemblVersion) throws IOException { if (ensemblVersionsMap == null) { ensemblVersionsMap = new HashMap<String, String>(); } ensemblVersionsMap.put(ensemblDatasetName, ensemblVersion); FileWriter w = new FileWriter(getEnsemblVersionsFile()); try { BufferedWriter bw = new BufferedWriter(w); try { for (String key : ensemblVersionsMap.keySet()) { bw.write(key + SEPARATOR + ensemblVersionsMap.get(key)); bw.newLine(); } } finally { bw.close(); } } finally { w.close(); } } /** * Gets the Ensembl version of a given species from a file. * * @param ensemblVersionsFile the Ensembl versions file * @param species the species of interest * * @return the Ensembl version * * @throws IOException thrown whenever an error occurred while reading the * file */ public Integer getEnsemblVersionFromFile(File ensemblVersionsFile, String species) throws IOException { Integer version = null; FileReader r = new FileReader(ensemblVersionsFile); try { BufferedReader br = new BufferedReader(r); try { String line; while ((line = br.readLine()) != null) { String[] splittedLine = line.split(SEPARATOR); String speciesAtLine = splittedLine[0]; if (speciesAtLine.equals(species)) { String[] ensemblVersionSplit = splittedLine[1].split(" "); version = new Integer(ensemblVersionSplit[1]); } } } finally { br.close(); } } finally { r.close(); } return version; } /** * Loads the given Ensembl species file. * * @param ensemblVersionsFile the Ensembl species file to load * @throws FileNotFoundException if an FileNotFoundException occurs * @throws IOException if an IOException occurs */ public void loadEnsemblSpeciesVersions(File ensemblVersionsFile) throws FileNotFoundException, IOException { // load the existing ensembl version numbers FileReader r = new FileReader(ensemblVersionsFile); try { BufferedReader br = new BufferedReader(r); try { ensemblVersionsMap = new HashMap<String, String>(); String line = br.readLine(); while (line != null) { String[] elements = line.split("\\t"); ensemblVersionsMap.put(elements[0], elements[1]); line = br.readLine(); } } finally { br.close(); } } finally { r.close(); } } /** * Returns the Ensembl URL for the given Ensembl (sub-)version. * * @param ensemblType the Ensembl type, e.g., fungi or plants * @return the Ensembl URL * @throws MalformedURLException */ private URL getEnsemblUrl(String ensemblType) throws MalformedURLException { if (ensemblType.equalsIgnoreCase("fungi")) { return new URL("http://fungi.ensembl.org/biomart/martservice/result"); } else if (ensemblType.equalsIgnoreCase("plants")) { return new URL("http://plants.ensembl.org/biomart/martservice/result"); } else if (ensemblType.equalsIgnoreCase("protists")) { return new URL("http://protists.ensembl.org/biomart/martservice/result"); } else if (ensemblType.equalsIgnoreCase("metazoa")) { return new URL("http://metazoa.ensembl.org/biomart/martservice/result"); } else { return new URL("http://www.ensembl.org/biomart/martservice/result"); } } /** * Try to download the gene and GO mappings for the currently selected * species. * * @param waitingHandler the waiting handler * @param taxon the NCBI taxon of the species * * @return true if the download was successful * * @throws java.io.IOException exception thrown whenever an error occurred * while reading the mapping files */ public boolean downloadMappings(WaitingHandler waitingHandler, Integer taxon) throws IOException { SpeciesFactory speciesFactory = SpeciesFactory.getInstance(); String latinName = speciesFactory.getLatinName(taxon); if (latinName == null) { latinName = taxon.toString(); } if (waitingHandler.isReport()) { waitingHandler.appendReport(PADDING + "Downloading GO and gene mappings for species " + latinName + ".", true, true); } EnsemblGenomeDivision ensemblGenomeDivision = speciesFactory.getEnsemblGenomesSpecies().getDivision(taxon); String ensemblType = "ensembl"; if (ensemblGenomeDivision != null) { ensemblType = ensemblGenomeDivision.ensemblType; } String schemaName = EnsemblVersion.getEnsemblSchemaName(ensemblGenomeDivision); if (schemaName == null) { return false; } String ensemblDatasetName = speciesFactory.getEnsemblDataset(taxon); if (ensemblDatasetName == null) { return false; } if (!waitingHandler.isRunCanceled()) { boolean goMappingsDownloaded = downloadGoMappings(ensemblType, schemaName, ensemblDatasetName, true, waitingHandler); // swiss prot mapping not found, try trembl if (!goMappingsDownloaded) { goMappingsDownloaded = downloadGoMappings(ensemblType, schemaName, ensemblDatasetName, false, waitingHandler); } if (!goMappingsDownloaded) { waitingHandler.appendReport(PADDING + "Gene ontology mappings not available. Downloading gene mappings only.", true, true); } else if (waitingHandler.isReport()) { waitingHandler.appendReport(PADDING + "GO mappings downloaded.", true, true); } } if (!waitingHandler.isRunCanceled()) { downloadGeneMappings(ensemblType, schemaName, ensemblDatasetName, EnsemblVersion.getCurrentEnsemblVersion(ensemblGenomeDivision).toString(), waitingHandler); if (!waitingHandler.isRunCanceled()) { if (waitingHandler.isReport()) { waitingHandler.appendReport(PADDING + "Gene mappings downloaded.", true, true); } } } boolean canceled = waitingHandler.isRunCanceled(); return !canceled; } /** * Returns the gene mapping file. * * @param ensemblDatasetName the Ensembl dataset name * * @return the gene mapping file */ public static File getGeneMappingFile(String ensemblDatasetName) { return new File(getGeneMappingFolder(), ensemblDatasetName + GENE_MAPPING_FILE_SUFFIX); } /** * Returns the GO mapping file. * * @param ensemblDatasetName the Ensembl dataset name * * @return the GO mapping file */ public static File getGoMappingFile(String ensemblDatasetName) { return new File(getGeneMappingFolder(), ensemblDatasetName + GO_MAPPING_FILE_SUFFIX); } /** * Returns the Ensembl version file. * * @return the Ensembl version file */ public static File getEnsemblVersionsFile() { return new File(getGeneMappingFolder(), ENSEMBL_VERSIONS); } /** * Returns the GO domains file. * * @return the GO domains file */ public static File getGoDomainsFile() { return new File(getGeneMappingFolder(), GO_DOMAINS); } /** * Returns the Ensembl version for a given species. * * @param taxon the NCBI taxon of the species * * @return the Ensembl version for a given species. */ public String getEnsemblVersion(Integer taxon) { SpeciesFactory speciesFactory = SpeciesFactory.getInstance(); String ensemblDatasetName = speciesFactory.getEnsemblDataset(taxon); if (ensemblVersionsMap == null) { return null; } return ensemblVersionsMap.get(ensemblDatasetName); } /** * Returns true if a newer version of the species mapping exists in Ensembl. * * @param taxon the NCBI taxon of the species * * @return rue if a newer version of the species mapping exists in Ensemble */ public boolean newVersionExists(Integer taxon) { EnsemblGenomeDivision ensemblGenomeDivision = SpeciesFactory.getInstance().getEnsemblGenomesSpecies().getDivision(taxon); Integer latestEnsemblVersion = EnsemblVersion.getCurrentEnsemblVersion(ensemblGenomeDivision); String currentEnsemblVersionAsString = getEnsemblVersion(taxon); if (currentEnsemblVersionAsString != null) { currentEnsemblVersionAsString = currentEnsemblVersionAsString.substring(currentEnsemblVersionAsString.indexOf(" ") + 1); Integer currentEnsemblVersion; try { currentEnsemblVersion = new Integer(currentEnsemblVersionAsString); } catch (NumberFormatException e) { e.printStackTrace(); currentEnsemblVersion = latestEnsemblVersion; } return currentEnsemblVersion < latestEnsemblVersion; } return true; } }