/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 * Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package org.erasmusmc.dataimport.genes.ontologyBuilder;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.erasmusmc.collections.CountingSet;
import org.erasmusmc.dataimport.genes.Affymetrix;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.OntologyFileLoader;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.FamilyNameFinder;
import org.erasmusmc.ontology.ontologyutilities.GeneTermVariantGenerator;
import org.erasmusmc.ontology.ontologyutilities.HomonymAnalyzer;
import org.erasmusmc.ontology.ontologyutilities.OntologyCurator;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.ontology.ontologyutilities.evaluationScripts.DetectPlainEnglishWords;
import org.erasmusmc.ontology.ontologyutilities.evaluationScripts.OntologyFrequencyCount;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Master driver script that builds a gene thesaurus (ontology) by extracting
 * gene lists from Entrez-Gene, HUGO, OMIM and UniProt, merging them (optionally
 * collapsing homologs via Homologene), expanding terms with spelling variants,
 * filtering out family names and stop words, and finally saving the result.
 *
 * <p>All inputs and outputs are configured through the public static path
 * fields below; run via {@link #main(String[])}. Intermediate artifacts are
 * written to {@link #tempFolder} for inspection.
 */
public class GeneOntologyMasterScript {

  /** When true, genes of the allowed species are merged into homolog concepts via Homologene. */
  public static boolean mergeHomologs = true;

  /** When true, extra curation reports (english words, homonyms, frequencies) are generated. */
  public static boolean generateCurationInformation = true;

  /** NCBI taxonomy IDs of the species to include; populated in {@link #main(String[])}. */
  public static Set<Integer> allowedTaxonIDs = new HashSet<Integer>();

  public static String homologeneFile = "/home/khettne/Projects/GeneList/Homologene/homologene.xml";
  public static String entrezGeneFolder = "/home/khettne/Projects/GeneList/Entrez-Gene/";
  public static String uniprotFile = "/home/khettne/Projects/GeneList/Swiss-Prot/uniprot_sprot.dat";
  public static String omimFile = "/home/khettne/Projects/GeneList/OMIM/genemap";
  public static String hugoFile = "/home/khettne/Projects/GeneList/HUGO/alldata.txt";
  public static String affymetrixFolder = "/home/khettne/Projects/GeneList/Affymetrix/";
  public static String wordListFilename = "/home/khettne/Projects/GeneList/EnglishWords/ukwords.txt";
  public static String curationFile = "/home/khettne/Projects/GeneList/GeneThesaurusCurationFile.txt";
  public static String tempFolder = "/home/khettne/Projects/GeneList/tempDEBUG/";
  public static String normaliserCacheFile = "/home/public/Peregrine/standardNormCache2006.bin";
  public static String randomPMIDSampleFile = "/home/public/PMIDs/Random100.000.PMIDs";
  public static String outputFile = "/home/khettne/Projects/GeneList/GeneListHumanMouseRatV6_0.ontology";

  /**
   * Offset at which gene concept IDs start when the merged gene list is
   * converted to an {@link OntologyStore}. Was previously duplicated as a
   * magic number at both conversion call sites.
   */
  private static final int GENE_CONCEPT_ID_OFFSET = 3000000;

  /**
   * Entry point: configures the species set, then runs the full
   * extract → merge → expand/filter → save pipeline.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    allowedTaxonIDs.add(9606);  // H sapiens
    allowedTaxonIDs.add(10090); // M musculus
    allowedTaxonIDs.add(10116); // R norvegicus
    /*allowedTaxonIDs.add(83333); //E coli
    allowedTaxonIDs.add(4932); // S cerevisiae
    allowedTaxonIDs.add(7227); // D melanogaster
    allowedTaxonIDs.add(7955); // D rerio
    allowedTaxonIDs.add(6239); // C elegans
    allowedTaxonIDs.add(9031); // G gallus */

    GeneList geneList = extractAndMergeGeneLists();
    System.out.println("Merged:");
    geneList.printStatistics();

    OntologyStore ontology;
    if (mergeHomologs)
      ontology = mergeHomologs(geneList);
    else
      ontology = geneList.convertToOntologyStore(GENE_CONCEPT_ID_OFFSET);

    expandAndFilter(ontology);
    addExtraIdentifiers(ontology);
    saveOntology(ontology, outputFile);

    if (generateCurationInformation)
      generateCurationInformation(ontology);
  }

  /** Attaches Affymetrix probe-set identifiers to the concepts in the ontology. */
  private static void addExtraIdentifiers(OntologyStore ontology) {
    Affymetrix.libraryFolder = affymetrixFolder;
    new Affymetrix(ontology);
  }

  /** Writes the ontology to {@code filename} using the standard file loader. */
  private static void saveOntology(OntologyStore ontology, String filename) {
    OntologyFileLoader loader = new OntologyFileLoader();
    loader.save(ontology, filename);
  }

  /**
   * Produces curation reports in {@link #tempFolder}: gene terms that are
   * plain-English words, homonymous terms, and term frequency counts over a
   * random PMID sample.
   */
  private static void generateCurationInformation(OntologyStore ontology) {
    System.out.println(StringUtilities.now() + "\tDetecting plain english words");
    new DetectPlainEnglishWords(ontology, wordListFilename, tempFolder + "englishWords.txt");

    System.out.println(StringUtilities.now() + "\tAnalyzing homonyms");
    HomonymAnalyzer homcount = new HomonymAnalyzer();
    homcount.destroyOntologyDuringRelease = false;
    homcount.normaliser.loadCacheBinary(normaliserCacheFile);
    homcount.setOntology(ontology);
    homcount.countHomonyms(tempFolder + "homonyms.txt");

    System.out.println(StringUtilities.now() + "\tCounting frequencies");
    OntologyFrequencyCount.disambiguate = false;
    OntologyFrequencyCount.pmidsFile = randomPMIDSampleFile;
    OntologyFrequencyCount.outputFile = tempFolder + "frequencyCounts.txt";
    new OntologyFrequencyCount(ontology);
  }

  /**
   * Merges genes of the allowed species into homolog concepts using
   * Homologene, converts the result to an ontology, and saves an
   * intermediate copy for debugging.
   */
  private static OntologyStore mergeHomologs(GeneList geneList) {
    System.out.println(StringUtilities.now() + "\tMerging using Homologene");
    HomologeneMerger merger = new HomologeneMerger(homologeneFile, allowedTaxonIDs);
    GeneList mergedGeneList = merger.merge(geneList);
    OntologyStore ontology = mergedGeneList.convertToOntologyStore(GENE_CONCEPT_ID_OFFSET);
    saveOntology(ontology, tempFolder + "merged.psf");
    return ontology;
  }

  /**
   * Expands terms (database IDs as terms, spelling variants), then filters:
   * generic stop words, gene family names, and finally the manual curation
   * file (when {@link #curationFile} is set).
   */
  private static void expandAndFilter(OntologyStore ontology) {
    addIDsAsTerms(ontology);

    System.out.println(StringUtilities.now() + "\tGenerating spelling variations");
    GeneTermVariantGenerator.generateVariants(ontology);

    System.out.println(StringUtilities.now() + "\tApplying generic filter");
    OntologyUtilities.filterOntology(ontology, OntologyUtilities.stopwordsForFiltering);

    System.out.println(StringUtilities.now() + "\tApplying family name filter");
    Set<String> familyNames = new HashSet<String>(FamilyNameFinder.findFamilyNamesListOutput(ontology));
    OntologyUtilities.geneVocabulary = ""; // no need for voc lookup in removeterms
    OntologyUtilities.removeTerms(ontology, familyNames);

    System.out.println(StringUtilities.now() + "\tCuration");
    OntologyCurator curator;
    if (curationFile == null)
      curator = new OntologyCurator();
    else
      curator = new OntologyCurator(curationFile);
    curator.curateAndPrepare(ontology);
  }

  /**
   * Adds database identifiers that contain both a letter and a digit (e.g.
   * accession numbers) as extra terms on their concepts, and reports how many
   * were added per source database.
   */
  private static void addIDsAsTerms(OntologyStore ontology) {
    CountingSet<String> dbCounts = new CountingSet<String>();
    for (Concept concept : ontology) {
      List<DatabaseID> dbIDs = ontology.getDatabaseIDsForConcept(concept.getID());
      for (DatabaseID dbID : dbIDs) {
        String id = dbID.ID;
        // Pure numbers (e.g. Entrez IDs) or pure words would cause too many
        // false hits as terms; require both a letter and a digit.
        if (StringUtilities.containsLetter(id) && StringUtilities.containsNumber(id)) {
          TermStore term = new TermStore(id);
          concept.getTerms().add(term);
          dbCounts.add(dbID.database);
        }
      }
    }
    System.out.println("Database IDs added as terms:");
    dbCounts.printCounts();
  }

  /**
   * Extracts gene lists from Entrez-Gene (primary organism first, then every
   * other {@code .xgs} file in the folder), HUGO, OMIM and UniProt, and merges
   * them into a single gene list. Each intermediate list is saved to
   * {@link #tempFolder} for inspection.
   *
   * @throws IllegalStateException if {@link #entrezGeneFolder} cannot be listed
   */
  private static GeneList extractAndMergeGeneLists() {
    List<GeneList> geneLists = new ArrayList<GeneList>();

    System.out.println(StringUtilities.now() + "\tExtracting from Entrez-Gene");
    EntrezGeneParser entrezGeneParser = new EntrezGeneParser();
    // The primary organism is parsed first so its gene IDs take precedence
    // during merging. Alternatives kept for reconfiguration:
    String firstOrganismFile = "Homo_sapiens.xgs";
    //String firstOrganismFile = "Rattus_norvegicus.xgs";
    //String firstOrganismFile = "Mus_musculus.xgs";
    //String firstOrganismFile = "Bacteria.xgs";
    //String firstOrganismFile = "Saccharomyces_cerevisiae.xgs";
    //String firstOrganismFile = "Drosophila_melanogaster.xgs";
    //String firstOrganismFile = "Danio_rerio.xgs";
    //String firstOrganismFile = "Caenorhabditis_elegans.xgs";
    //String firstOrganismFile = "Gallus_gallus.xgs";
    System.out.println("Processing " + firstOrganismFile);
    GeneList humanGenes = entrezGeneParser.parse(entrezGeneFolder + firstOrganismFile, allowedTaxonIDs);
    humanGenes.printStatistics();
    humanGenes.saveToSimpleFile(tempFolder + firstOrganismFile.replace(".xgs", ".txt"));
    geneLists.add(humanGenes);

    File folder = new File(entrezGeneFolder);
    File[] entrezFiles = folder.listFiles();
    // listFiles() returns null when the path is not a directory or an I/O
    // error occurs; fail with a clear message instead of an opaque NPE.
    if (entrezFiles == null)
      throw new IllegalStateException("Cannot list Entrez-Gene folder: " + entrezGeneFolder);
    for (File file : entrezFiles)
      if (file.getName().endsWith(".xgs") && !file.getName().equals(firstOrganismFile)) {
        System.out.println("Processing " + file.getName());
        GeneList entrezGeneGenes = entrezGeneParser.parse(file.getAbsolutePath(), allowedTaxonIDs);
        entrezGeneGenes.printStatistics();
        entrezGeneGenes.saveToSimpleFile(tempFolder + file.getName().replace(".xgs", ".txt"));
        geneLists.add(entrezGeneGenes);
      }

    System.out.println(StringUtilities.now() + "\tExtracting from HUGO");
    HGNCParser hgncParser = new HGNCParser();
    GeneList hgncGenes = hgncParser.parse(hugoFile, allowedTaxonIDs);
    System.out.println("HUGO:");
    hgncGenes.printStatistics();
    hgncGenes.saveToSimpleFile(tempFolder + "HGNC.txt");
    geneLists.add(hgncGenes);

    System.out.println(StringUtilities.now() + "\tExtracting from OMIM");
    OMIMParser omimParser = new OMIMParser();
    GeneList omimGenes = omimParser.parse(omimFile, allowedTaxonIDs);
    System.out.println("OMIM:");
    omimGenes.printStatistics();
    omimGenes.saveToSimpleFile(tempFolder + "OMIM.txt");
    geneLists.add(omimGenes);

    System.out.println(StringUtilities.now() + "\tExtracting from UniProt");
    UniProtParser uniProtParser = new UniProtParser();
    GeneList uniProtGenes = uniProtParser.parse(uniprotFile, allowedTaxonIDs);
    System.out.println("UniProt:");
    uniProtGenes.printStatistics();
    uniProtGenes.saveToSimpleFile(tempFolder + "UniProt.txt");
    geneLists.add(uniProtGenes);

    System.out.println(StringUtilities.now() + "\tMerging databases");
    DatabaseMerger merger = new DatabaseMerger();
    for (GeneList geneList : geneLists)
      merger.merge(geneList);
    GeneList mergedGenes = merger.getMergedGeneList();
    mergedGenes.saveToSimpleFile(tempFolder + "merged.txt");
    return mergedGenes;
  }
}