/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 * Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package org.erasmusmc.dataimport.genes.ontologyBuilder;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.erasmusmc.collections.CountingSet;
import org.erasmusmc.dataimport.genes.Affymetrix;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.OntologyFileLoader;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.FamilyNameFinder;
import org.erasmusmc.ontology.ontologyutilities.GeneTermVariantGenerator;
import org.erasmusmc.ontology.ontologyutilities.HomonymAnalyzer;
import org.erasmusmc.ontology.ontologyutilities.OntologyCurator;
import org.erasmusmc.ontology.ontologyutilities.OntologyUtilities;
import org.erasmusmc.ontology.ontologyutilities.evaluationScripts.DetectPlainEnglishWords;
import org.erasmusmc.ontology.ontologyutilities.evaluationScripts.OntologyFrequencyCount;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Master driver script that builds a gene thesaurus (ontology) by extracting
 * gene lists from Entrez-Gene, HUGO, OMIM and UniProt, merging them (optionally
 * collapsing homologs via Homologene), expanding terms with spelling variants,
 * filtering out family names and stop words, and finally saving the result.
 *
 * <p>All inputs and outputs are configured through the public static path
 * fields below; run via {@link #main(String[])}. Intermediate artifacts are
 * written to {@link #tempFolder} for inspection.
 */
public class GeneOntologyMasterScript {

  /** When true, genes of the allowed species are merged into homolog concepts via Homologene. */
  public static boolean mergeHomologs = true;

  /** When true, extra curation reports (english words, homonyms, frequencies) are generated. */
  public static boolean generateCurationInformation = true;

  /** NCBI taxonomy IDs of the species to include; populated in {@link #main(String[])}. */
  public static Set<Integer> allowedTaxonIDs = new HashSet<Integer>();

  public static String homologeneFile = "/home/khettne/Projects/GeneList/Homologene/homologene.xml";
  public static String entrezGeneFolder = "/home/khettne/Projects/GeneList/Entrez-Gene/";
  public static String uniprotFile = "/home/khettne/Projects/GeneList/Swiss-Prot/uniprot_sprot.dat";
  public static String omimFile = "/home/khettne/Projects/GeneList/OMIM/genemap";
  public static String hugoFile = "/home/khettne/Projects/GeneList/HUGO/alldata.txt";
  public static String affymetrixFolder = "/home/khettne/Projects/GeneList/Affymetrix/";
  public static String wordListFilename = "/home/khettne/Projects/GeneList/EnglishWords/ukwords.txt";
  public static String curationFile = "/home/khettne/Projects/GeneList/GeneThesaurusCurationFile.txt";
  public static String tempFolder = "/home/khettne/Projects/GeneList/tempDEBUG/";
  public static String normaliserCacheFile = "/home/public/Peregrine/standardNormCache2006.bin";
  public static String randomPMIDSampleFile = "/home/public/PMIDs/Random100.000.PMIDs";
  public static String outputFile = "/home/khettne/Projects/GeneList/GeneListHumanMouseRatV6_0.ontology";

  /**
   * Offset at which gene concept IDs start when the merged gene list is
   * converted to an {@link OntologyStore}. Was previously duplicated as a
   * magic number at both conversion call sites.
   */
  private static final int GENE_CONCEPT_ID_OFFSET = 3000000;

  /**
   * Entry point: configures the species set, then runs the full
   * extract → merge → expand/filter → save pipeline.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    allowedTaxonIDs.add(9606);  // H sapiens
    allowedTaxonIDs.add(10090); // M musculus
    allowedTaxonIDs.add(10116); // R norvegicus
    /*allowedTaxonIDs.add(83333); //E coli
    allowedTaxonIDs.add(4932); // S cerevisiae
    allowedTaxonIDs.add(7227); // D melanogaster
    allowedTaxonIDs.add(7955); // D rerio
    allowedTaxonIDs.add(6239); // C elegans
    allowedTaxonIDs.add(9031); // G gallus */

    GeneList geneList = extractAndMergeGeneLists();
    System.out.println("Merged:");
    geneList.printStatistics();

    OntologyStore ontology;
    if (mergeHomologs)
      ontology = mergeHomologs(geneList);
    else
      ontology = geneList.convertToOntologyStore(GENE_CONCEPT_ID_OFFSET);

    expandAndFilter(ontology);
    addExtraIdentifiers(ontology);
    saveOntology(ontology, outputFile);

    if (generateCurationInformation)
      generateCurationInformation(ontology);
  }

  /** Attaches Affymetrix probe-set identifiers to the concepts in the ontology. */
  private static void addExtraIdentifiers(OntologyStore ontology) {
    Affymetrix.libraryFolder = affymetrixFolder;
    new Affymetrix(ontology);
  }

  /** Writes the ontology to {@code filename} using the standard file loader. */
  private static void saveOntology(OntologyStore ontology, String filename) {
    OntologyFileLoader loader = new OntologyFileLoader();
    loader.save(ontology, filename);
  }

  /**
   * Produces curation reports in {@link #tempFolder}: gene terms that are
   * plain-English words, homonymous terms, and term frequency counts over a
   * random PMID sample.
   */
  private static void generateCurationInformation(OntologyStore ontology) {
    System.out.println(StringUtilities.now() + "\tDetecting plain english words");
    new DetectPlainEnglishWords(ontology, wordListFilename, tempFolder + "englishWords.txt");

    System.out.println(StringUtilities.now() + "\tAnalyzing homonyms");
    HomonymAnalyzer homcount = new HomonymAnalyzer();
    homcount.destroyOntologyDuringRelease = false;
    homcount.normaliser.loadCacheBinary(normaliserCacheFile);
    homcount.setOntology(ontology);
    homcount.countHomonyms(tempFolder + "homonyms.txt");

    System.out.println(StringUtilities.now() + "\tCounting frequencies");
    OntologyFrequencyCount.disambiguate = false;
    OntologyFrequencyCount.pmidsFile = randomPMIDSampleFile;
    OntologyFrequencyCount.outputFile = tempFolder + "frequencyCounts.txt";
    new OntologyFrequencyCount(ontology);
  }

  /**
   * Merges genes of the allowed species into homolog concepts using
   * Homologene, converts the result to an ontology, and saves an
   * intermediate copy for debugging.
   */
  private static OntologyStore mergeHomologs(GeneList geneList) {
    System.out.println(StringUtilities.now() + "\tMerging using Homologene");
    HomologeneMerger merger = new HomologeneMerger(homologeneFile, allowedTaxonIDs);
    GeneList mergedGeneList = merger.merge(geneList);
    OntologyStore ontology = mergedGeneList.convertToOntologyStore(GENE_CONCEPT_ID_OFFSET);
    saveOntology(ontology, tempFolder + "merged.psf");
    return ontology;
  }

  /**
   * Expands terms (database IDs as terms, spelling variants), then filters:
   * generic stop words, gene family names, and finally the manual curation
   * file (when {@link #curationFile} is set).
   */
  private static void expandAndFilter(OntologyStore ontology) {
    addIDsAsTerms(ontology);

    System.out.println(StringUtilities.now() + "\tGenerating spelling variations");
    GeneTermVariantGenerator.generateVariants(ontology);

    System.out.println(StringUtilities.now() + "\tApplying generic filter");
    OntologyUtilities.filterOntology(ontology, OntologyUtilities.stopwordsForFiltering);

    System.out.println(StringUtilities.now() + "\tApplying family name filter");
    Set<String> familyNames = new HashSet<String>(FamilyNameFinder.findFamilyNamesListOutput(ontology));
    OntologyUtilities.geneVocabulary = ""; // no need for voc lookup in removeterms
    OntologyUtilities.removeTerms(ontology, familyNames);

    System.out.println(StringUtilities.now() + "\tCuration");
    OntologyCurator curator;
    if (curationFile == null)
      curator = new OntologyCurator();
    else
      curator = new OntologyCurator(curationFile);
    curator.curateAndPrepare(ontology);
  }

  /**
   * Adds database identifiers that contain both a letter and a digit (e.g.
   * accession numbers) as extra terms on their concepts, and reports how many
   * were added per source database.
   */
  private static void addIDsAsTerms(OntologyStore ontology) {
    CountingSet<String> dbCounts = new CountingSet<String>();
    for (Concept concept : ontology) {
      List<DatabaseID> dbIDs = ontology.getDatabaseIDsForConcept(concept.getID());
      for (DatabaseID dbID : dbIDs) {
        String id = dbID.ID;
        // Pure numbers (e.g. Entrez IDs) or pure words would cause too many
        // false hits as terms; require both a letter and a digit.
        if (StringUtilities.containsLetter(id) && StringUtilities.containsNumber(id)) {
          TermStore term = new TermStore(id);
          concept.getTerms().add(term);
          dbCounts.add(dbID.database);
        }
      }
    }
    System.out.println("Database IDs added as terms:");
    dbCounts.printCounts();
  }

  /**
   * Extracts gene lists from Entrez-Gene (primary organism first, then every
   * other {@code .xgs} file in the folder), HUGO, OMIM and UniProt, and merges
   * them into a single gene list. Each intermediate list is saved to
   * {@link #tempFolder} for inspection.
   *
   * @throws IllegalStateException if {@link #entrezGeneFolder} cannot be listed
   */
  private static GeneList extractAndMergeGeneLists() {
    List<GeneList> geneLists = new ArrayList<GeneList>();

    System.out.println(StringUtilities.now() + "\tExtracting from Entrez-Gene");
    EntrezGeneParser entrezGeneParser = new EntrezGeneParser();
    // The primary organism is parsed first so its gene IDs take precedence
    // during merging. Alternatives kept for reconfiguration:
    String firstOrganismFile = "Homo_sapiens.xgs";
    //String firstOrganismFile = "Rattus_norvegicus.xgs";
    //String firstOrganismFile = "Mus_musculus.xgs";
    //String firstOrganismFile = "Bacteria.xgs";
    //String firstOrganismFile = "Saccharomyces_cerevisiae.xgs";
    //String firstOrganismFile = "Drosophila_melanogaster.xgs";
    //String firstOrganismFile = "Danio_rerio.xgs";
    //String firstOrganismFile = "Caenorhabditis_elegans.xgs";
    //String firstOrganismFile = "Gallus_gallus.xgs";
    System.out.println("Processing " + firstOrganismFile);
    GeneList humanGenes = entrezGeneParser.parse(entrezGeneFolder + firstOrganismFile, allowedTaxonIDs);
    humanGenes.printStatistics();
    humanGenes.saveToSimpleFile(tempFolder + firstOrganismFile.replace(".xgs", ".txt"));
    geneLists.add(humanGenes);

    File folder = new File(entrezGeneFolder);
    File[] entrezFiles = folder.listFiles();
    // listFiles() returns null when the path is not a directory or an I/O
    // error occurs; fail with a clear message instead of an opaque NPE.
    if (entrezFiles == null)
      throw new IllegalStateException("Cannot list Entrez-Gene folder: " + entrezGeneFolder);
    for (File file : entrezFiles)
      if (file.getName().endsWith(".xgs") && !file.getName().equals(firstOrganismFile)) {
        System.out.println("Processing " + file.getName());
        GeneList entrezGeneGenes = entrezGeneParser.parse(file.getAbsolutePath(), allowedTaxonIDs);
        entrezGeneGenes.printStatistics();
        entrezGeneGenes.saveToSimpleFile(tempFolder + file.getName().replace(".xgs", ".txt"));
        geneLists.add(entrezGeneGenes);
      }

    System.out.println(StringUtilities.now() + "\tExtracting from HUGO");
    HGNCParser hgncParser = new HGNCParser();
    GeneList hgncGenes = hgncParser.parse(hugoFile, allowedTaxonIDs);
    System.out.println("HUGO:");
    hgncGenes.printStatistics();
    hgncGenes.saveToSimpleFile(tempFolder + "HGNC.txt");
    geneLists.add(hgncGenes);

    System.out.println(StringUtilities.now() + "\tExtracting from OMIM");
    OMIMParser omimParser = new OMIMParser();
    GeneList omimGenes = omimParser.parse(omimFile, allowedTaxonIDs);
    System.out.println("OMIM:");
    omimGenes.printStatistics();
    omimGenes.saveToSimpleFile(tempFolder + "OMIM.txt");
    geneLists.add(omimGenes);

    System.out.println(StringUtilities.now() + "\tExtracting from UniProt");
    UniProtParser uniProtParser = new UniProtParser();
    GeneList uniProtGenes = uniProtParser.parse(uniprotFile, allowedTaxonIDs);
    System.out.println("UniProt:");
    uniProtGenes.printStatistics();
    uniProtGenes.saveToSimpleFile(tempFolder + "UniProt.txt");
    geneLists.add(uniProtGenes);

    System.out.println(StringUtilities.now() + "\tMerging databases");
    DatabaseMerger merger = new DatabaseMerger();
    for (GeneList geneList : geneLists)
      merger.merge(geneList);
    GeneList mergedGenes = merger.getMergedGeneList();
    mergedGenes.saveToSimpleFile(tempFolder + "merged.txt");
    return mergedGenes;
  }
}