// OntologyUtilities.java example
//
// Explorer
// GeneDiseasePaper-master
// - java
/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.ontology.ontologyutilities;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.peregrine.SimpleTokenizer;
import org.erasmusmc.peregrine.Tokenizer;
import org.erasmusmc.peregrine.UMLSGeneChemTokenizer;
import org.erasmusmc.utilities.StringUtilities;

public class OntologyUtilities {
	/**
	 * Stop words handed to {@link #MartijnsFilterRule(String, Set)} by
	 * {@link #setGeneChemMatchingFlags(TermStore)}. Initialised from
	 * {@link #getDefaultStopWordsForFiltering()}.
	 */
	public static Set<String> stopwordsForFiltering = getDefaultStopWordsForFiltering();
	/**
	 * Stop words for indexing, initialised from
	 * {@code getDefaultStopWordsForIndexing()}. Not read anywhere else in this
	 * file.
	 */
	public static Set<String> stopwordsForIndexing = getDefaultStopWordsForIndexing();
  /**
   * Semantic type identifiers that make {@link #isChemical(Integer, Ontology)}
   * report a concept as a chemical. Initialised from
   * {@link #getChemicalSemanticTypes()}.
   */
  public static Set<Integer> chemicalSemanticTypes = getChemicalSemanticTypes();
  /**
   * Specifies the maximum number of characters allowed for a gene symbol.
   * {@link #isGeneSymbol(String)} accepts every string that is not longer than
   * this; longer strings are only accepted when they contain no space and
   * {@code StringUtilities.isAbbr} returns true for them.
   */
  public static int maxGeneSymbolLength = 6;
  /**
   * Vocabulary name that {@link #hasGeneVoc(Concept, Ontology)} looks for. When
   * set to the empty string, {@code hasGeneVoc} returns true for every concept.
   */
  public static String geneVocabulary = "GENE";
  /**
   * Vocabulary name that {@link #hasChemVoc(Concept, Ontology)} looks for. When
   * set to the empty string, {@code hasChemVoc} returns true for every concept.
   */
  public static String chemVocabulary = "CHEMICAL";
  /**
   * Lowest concept ID for which {@link #hasChemicalConceptID(Concept)} returns
   * true.
   */
  public static int minChemID = 4000000;
  /**
   * Single shared tokenizer instance.
   * {@link #tokenizeAndRemoveStopwordsFromString(String, Set)} and
   * {@link #MartijnsFilterRule(String, Set)} both call {@code tokenize} on it
   * and then read its {@code tokens} field.
   */
  public static Tokenizer tokenizer = new SimpleTokenizer();
  // Alternative tokenizer, currently disabled:
  //public static Tokenizer tokenizer = new UMLSGeneChemTokenizer();
  /**
   * Terms will not be tossed out by {@link #MartijnsFilterRule(String, Set)}
   * if they have at least the specified number of tokens. <br>
   * <br>
   * The default value is 7
   */
  public static int minTokenNumberForNoFilter = 7;

  /**
   * The minimum number of characters for a word to be considered non-ambiguous
   * for the filter. <br>
   * <br>
   * The default value is 2.
   */
  public static int minWordSize = 2;

  /**
   * Terms with fewer characters will be removed by
   * {@link #filterOntology(Ontology, Set)}. <br>
   * <br>
   * The default value is 3.
   */
  public static int minTermSize = 3;

  /**
   * Terms consisting of more tokens will always be matched order-sensitive
   * (default = 7). Only referenced by the commented-out heuristic in
   * {@link #setDefaultMatchingFlags(TermStore)} in this file.
   */
  public static int maxTermLengthForOrderInsensitivity = 7;
  /**
   * If one of the tokens consists of fewer than the specified number of
   * characters, the term will always be matched order-sensitive. The
   * assumption is that it is likely a systematic name such as a chemical
   * formula, for which order is important. The default value is 3. Only
   * referenced by the commented-out heuristic in
   * {@link #setDefaultMatchingFlags(TermStore)} in this file.
   */
  public static int minTokenLengthForOrderInsensitivity = 3;

  /**
   * Tells whether a concept originates from the gene vocabulary.
   * <p>
   * The concept's {@code fromVocabulary} relations are followed and the name of
   * each related concept is compared with {@link #geneVocabulary}. If
   * {@code geneVocabulary} is the empty string, the check is skipped and every
   * concept is accepted.
   *
   * @param concept
   *            the concept to inspect
   * @param ontology
   *            the ontology holding the concept and its relations
   * @return true if one of the concept's vocabularies is named
   *         {@code geneVocabulary}, or if {@code geneVocabulary} is empty
   */
  public static boolean hasGeneVoc(Concept concept, Ontology ontology) {
    if (geneVocabulary.length() == 0) {
      return true;
    }
    List<Relation> vocabularyRelations = ontology.getRelationsForConceptAsSubject(concept.getID(), DefaultTypes.fromVocabulary);
    for (Relation vocabularyRelation: vocabularyRelations) {
      String vocabularyName = ontology.getConcept(vocabularyRelation.object).getName();
      if (vocabularyName.equals(geneVocabulary)) {
        return true;
      }
    }
    return false;
  }
  
  /**
   * Tells whether a concept originates from the chemical vocabulary.
   * <p>
   * Works like {@link #hasGeneVoc(Concept, Ontology)} but compares against
   * {@link #chemVocabulary}: an empty {@code chemVocabulary} accepts every
   * concept, otherwise the names of the concepts reached through
   * {@code fromVocabulary} relations are checked.
   *
   * @param concept
   *            the concept to inspect
   * @param ontology
   *            the ontology holding the concept and its relations
   * @return true if one of the concept's vocabularies is named
   *         {@code chemVocabulary}, or if {@code chemVocabulary} is empty
   */
  public static boolean hasChemVoc(Concept concept, Ontology ontology) {
    boolean found = chemVocabulary.length() == 0;
    if (!found) {
      Iterator<Relation> vocabularyRelations = ontology.getRelationsForConceptAsSubject(concept.getID(), DefaultTypes.fromVocabulary).iterator();
      while (!found && vocabularyRelations.hasNext()) {
        Relation vocabularyRelation = vocabularyRelations.next();
        found = ontology.getConcept(vocabularyRelation.object).getName().equals(chemVocabulary);
      }
    }
    return found;
  }

  /**
   * Tells whether a concept has at least one chemical semantic type.
   *
   * @param conceptID
   *            ID of the concept to inspect
   * @param ontology
   *            the ontology providing the {@code isOfSemanticType} relations
   * @return true if the object of any such relation is contained in
   *         {@link #chemicalSemanticTypes}
   */
  public static boolean isChemical(Integer conceptID, Ontology ontology) {
    for (Relation semanticTypeRelation: ontology.getRelationsForConceptAsSubject(conceptID, DefaultTypes.isOfSemanticType)) {
      if (chemicalSemanticTypes.contains(semanticTypeRelation.object)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Assigns matching flags to every term of every concept in the ontology.
   * <p>
   * Terms of concepts for which {@link #hasGeneVoc(Concept, Ontology)} is true
   * get the flags of {@link #setGeneChemMatchingFlags(TermStore)}; all other
   * terms get the flags of {@link #setDefaultMatchingFlags(TermStore)}.
   *
   * @param ontology
   *            the ontology whose terms are updated in place
   */
  public static void setMatchingFlagsForOntology(Ontology ontology) {
    for (Iterator<Concept> concepts = ontology.getConceptIterator(); concepts.hasNext();) {
      Concept current = concepts.next();
      // Decide once per concept, then apply the choice to each of its terms.
      boolean geneConcept = hasGeneVoc(current, ontology);
      for (TermStore term: current.getTerms()) {
        if (geneConcept) {
          setGeneChemMatchingFlags(term);
        }
        else {
          setDefaultMatchingFlags(term);
        }
      }
    }
  }

  /**
   * Creates a new concept and merges two existing concepts into it.
   * <p>
   * The new concept is registered in the ontology first; {@code id1} and then
   * {@code id2} are merged into it through
   * {@link #mergeConcepts(Ontology, int, int)}, which removes each source
   * concept after merging.
   *
   * @param ontology
   *            the ontology to modify
   * @param id1
   *            ID of the first concept to merge
   * @param id2
   *            ID of the second concept to merge
   * @param newConceptID
   *            ID given to the newly created target concept
   */
  public static void mergeConceptsIntoNew(Ontology ontology, int id1, int id2, int newConceptID) {
    ontology.setConcept(new Concept(newConceptID));
    for (int sourceID: new int[] { id1, id2 }) {
      mergeConcepts(ontology, sourceID, newConceptID);
    }
  }

  /**
   * Merges one concept into another and removes the source concept afterwards.
   * Equivalent to {@code mergeConcepts(ontology, fromCUI, toCUI, true)}.
   *
   * @param ontology
   *            the ontology to modify
   * @param fromCUI
   *            ID of the concept that is merged and then removed
   * @param toCUI
   *            ID of the concept that receives the merged data
   */
  public static void mergeConcepts(Ontology ontology, int fromCUI, int toCUI) {
    mergeConcepts(ontology, fromCUI, toCUI, true);
  }

  /**
   * Merges the terms, definition, relations and database IDs of one concept
   * into another.
   * <ul>
   * <li>Terms of the source whose text is not yet present in the target are
   * appended to the target's term list. Each distinct text is added at most
   * once, also when the source itself lists the same text several times.</li>
   * <li>A non-empty source definition is appended to the target definition,
   * separated by ";" when the target already has one.</li>
   * <li>Relations in which the source is subject or object are re-pointed to
   * the target and stored again.</li>
   * <li>Database IDs of the source are registered for the target.</li>
   * </ul>
   * Nothing is changed, and a message is printed to standard output, when both
   * IDs are equal or when either concept does not exist.
   *
   * @param ontology
   *            the ontology to modify
   * @param fromCUI
   *            ID of the source concept
   * @param toCUI
   *            ID of the target concept
   * @param removeFromConcept
   *            if true, the source concept is removed from the ontology after
   *            merging
   */
  public static void mergeConcepts(Ontology ontology, int fromCUI, int toCUI, boolean removeFromConcept) {
    if (fromCUI == toCUI) {
      System.out.println("ERROR: attempted to merge " + fromCUI + " to itself!");
      return;
    }
    Concept fromConcept = ontology.getConcept(fromCUI);
    Concept toConcept = ontology.getConcept(toCUI);
    if (fromConcept == null || toConcept == null) {
      System.out.println("Attempted merge with a non existing Concept: either " + toCUI + " and/or " + fromCUI);
      return;
    }

    // Collect everything attached to the source before the ontology is modified.
    List<TermStore> termsfromConcept = fromConcept.getTerms();
    List<Relation> fromRelationsSub = ontology.getRelationsForConceptAsSubject(fromCUI);
    List<Relation> fromRelationsObj = ontology.getRelationsForConceptAsObject(fromCUI);
    List<DatabaseID> fromDbIDs = ontology.getDatabaseIDsForConcept(fromCUI);

    List<TermStore> toTerms = toConcept.getTerms();
    Set<String> toTermSet = getTermsAsSet(toTerms);
    for (TermStore term: termsfromConcept) {
      // add() returns false for a text that is already known, so the set also
      // catches duplicates inside the source concept, not just texts the
      // target had before the merge.
      if (toTermSet.add(term.text)) {
        toTerms.add(term);
      }
    }
    toConcept.setTerms(toTerms);

    if (!fromConcept.getDefinition().equals("")) {
      String def = fromConcept.getDefinition();
      if (!toConcept.getDefinition().equals("")) {
        def = toConcept.getDefinition() + ";" + def;
      }
      toConcept.setDefinition(def);
    }

    for (Relation relation: fromRelationsObj) {
      relation.object = toCUI;
      ontology.setRelation(relation);
    }
    for (Relation relation: fromRelationsSub) {
      relation.subject = toCUI;
      ontology.setRelation(relation);
    }
    for (DatabaseID databaseID: fromDbIDs) {
      ontology.setDatabaseIDForConcept(toCUI, databaseID);
    }

    if (removeFromConcept) {
      ontology.removeConcept(fromCUI);
    }
  }
  
  

  /**
   * Applies the default matching flags to a term: normalised, order-sensitive
   * and not case-sensitive.
   * <p>
   * A heuristic that used to switch off order sensitivity for terms with at
   * most {@link #maxTermLengthForOrderInsensitivity} tokens (unless a token was
   * shorter than {@link #minTokenLengthForOrderInsensitivity} or contained a
   * number) is disabled, so every term is matched order-sensitively.
   *
   * @param term
   *            the term whose flags are overwritten
   */
  public static void setDefaultMatchingFlags(TermStore term) {
    term.orderSensitive = true;
    term.normalised = true;
    term.caseSensitive = false;
  }

  /**
   * Decides whether a string is treated as a gene symbol.
   * <p>
   * A string qualifies when it contains no space and
   * {@code StringUtilities.isAbbr} returns true for it. Any other string still
   * qualifies as long as it is not longer than {@link #maxGeneSymbolLength}.
   *
   * @param string
   *            the term text to classify
   * @return true if the string is treated as a gene symbol
   */
  public static boolean isGeneSymbol(String string) {
    boolean looksLikeLongName = string.contains(" ") || !StringUtilities.isAbbr(string);
    if (!looksLikeLongName) {
      return true;
    }
    return string.length() <= maxGeneSymbolLength;
  }

  /**
   * Tokenizes a term with the shared {@link #tokenizer}, drops stop words and
   * joins the remaining tokens with single spaces.
   * <p>
   * A token is dropped when {@code StringUtilities.isAbbr} returns false for it
   * and its lower-case form is contained in {@code stopwords}.
   *
   * @param term
   *            the text to process
   * @param stopwords
   *            lower-case stop words to remove
   * @return the remaining tokens joined by a single space
   */
  public static String tokenizeAndRemoveStopwordsFromString(String term, Set<String> stopwords) {
    tokenizer.tokenize(term);
    // Walk from the last token to the first so removals do not shift the
    // indices that are still to be visited.
    int index = tokenizer.tokens.size();
    while (--index >= 0) {
      String token = tokenizer.tokens.get(index);
      boolean isStopword = !StringUtilities.isAbbr(token) && stopwords.contains(token.toLowerCase());
      if (isStopword) {
        tokenizer.removeToken(index);
      }
    }
    return StringUtilities.join(tokenizer.tokens, " ");
  }

  /**
   * Applies the matching flags used for gene terms: order-sensitive and not
   * normalised, with case sensitivity depending on the term text.
   * <p>
   * Terms that are not gene symbols according to {@link #isGeneSymbol(String)}
   * are matched case-insensitively. A gene symbol is matched case-insensitively
   * only if it contains a number and its lower-case form is not rejected by
   * {@link #MartijnsFilterRule(String, Set)}; every other gene symbol is
   * matched case-sensitively.
   *
   * @param term
   *            the term whose flags are overwritten
   */
  public static void setGeneChemMatchingFlags(TermStore term) {
    term.orderSensitive = true;
    term.caseSensitive = false;
    term.normalised = false;
    if (!isGeneSymbol(term.text)) {
      return;
    }
    // A symbol carrying a number that survives the filter rule is considered
    // distinctive enough to be matched regardless of case.
    boolean distinctiveNumberedSymbol = StringUtilities.containsNumber(term.text)
        && !MartijnsFilterRule(term.text.toLowerCase(), stopwordsForFiltering);
    term.caseSensitive = !distinctiveNumberedSymbol;
  }
	
  /**
   * Tells whether a concept's ID lies in the chemical ID range.
   *
   * @param concept
   *            the concept to inspect
   * @return true if the concept ID is at least {@link #minChemID}
   */
  public static boolean hasChemicalConceptID (Concept concept){
    return concept.getID() >= minChemID;
  }

  /**
   * Removes uninformative terms from every concept of the ontology.
   * <p>
   * A term is removed when its text repeats an earlier term of the same
   * concept, when {@link #MartijnsFilterRule(String, Set)} rejects it (it
   * consists only of ambiguous words and/or numbers), or when it has fewer than
   * {@link #minTermSize} characters.
   * 
   * @param ontology
   *            Ontology to be filtered.
   * @param stopwordsForFiltering
   *            Stop words passed on to the filter rule. This parameter hides
   *            the static field of the same name.
   */
  public static void filterOntology(Ontology ontology, Set<String> stopwordsForFiltering) {
    for (Iterator<Concept> concepts = ontology.getConceptIterator(); concepts.hasNext();) {
      Concept concept = concepts.next();
      // Texts already kept for this concept; reset for every concept.
      Set<String> keptTexts = new HashSet<String>();
      for (Iterator<TermStore> terms = concept.getTerms().iterator(); terms.hasNext();) {
        String text = terms.next().text;
        boolean discard = keptTexts.contains(text)
            || MartijnsFilterRule(text, stopwordsForFiltering)
            || text.length() < minTermSize;
        if (discard) {
          terms.remove();
        }
        else {
          keptTexts.add(text);
        }
      }
    }
  }

  /**
   * Decides whether a term should be filtered out because none of its tokens
   * is informative.
   * <p>
   * The term is tokenized with the shared {@link #tokenizer}. Terms with at
   * least {@link #minTokenNumberForNoFilter} tokens are never filtered. For
   * shorter terms, a single informative token is enough to keep the term. A
   * token is informative when it has at least {@link #minWordSize} characters,
   * is neither a number nor a Roman numeral, and is either reported as an
   * abbreviation by {@code StringUtilities.isAbbr} or not a stop word
   * (compared in lower case).
   *
   * @param term
   *            the term text to judge
   * @param stopwordsForFiltering
   *            lower-case stop words; hides the static field of the same name
   * @return true if the term should be removed, false if it should be kept
   */
  public static boolean MartijnsFilterRule(String term, Set<String> stopwordsForFiltering) {
    tokenizer.tokenize(term);
    if (tokenizer.tokens.size() >= minTokenNumberForNoFilter) {
      return false;
    }

    for (String token: tokenizer.tokens) {
      if (token.length() < minWordSize) {
        continue;
      }
      if (StringUtilities.isNumber(token) || StringUtilities.isRomanNumeral(token)) {
        continue;
      }
      if (StringUtilities.isAbbr(token) || !stopwordsForFiltering.contains(token.toLowerCase())) {
        // One informative token is enough to keep the term.
        return false;
      }
    }
    return true;
  }

  /**
   * Removes, in place, every term whose text already occurred earlier in the
   * list, so that the first occurrence of each text is kept.
   *
   * @param terms
   *            the list to de-duplicate; must support removal through its
   *            iterator
   */
  public static void removeDuplicateTerms(List<TermStore> terms) {
    Set<String> seenTexts = new HashSet<String>();
    for (Iterator<TermStore> cursor = terms.iterator(); cursor.hasNext();) {
      // add() reports false when the text was already in the set.
      if (!seenTexts.add(cursor.next().text)) {
        cursor.remove();
      }
    }
  }

  /**
   * Collects the texts of the given terms into a new set.
   *
   * @param terms
   *            the terms to read
   * @return a new {@code HashSet} holding the distinct term texts
   */
  public static Set<String> getTermsAsSet(List<TermStore> terms) {
    Set<String> texts = new HashSet<String>();
    Iterator<TermStore> cursor = terms.iterator();
    while (cursor.hasNext()) {
      texts.add(cursor.next().text);
    }
    return texts;
  }

  /**
   * Loads the chemical semantic types from the class path resource
   * "Chemicals semantic types.txt", located relative to this class.
   * <p>
   * Each non-blank line is parsed as an integer and stored negated.
   * Surrounding whitespace and blank lines are ignored. An
   * {@code IOException} while reading is printed and whatever was read so far
   * is returned.
   *
   * @return a sorted set with the negated values found in the resource
   * @throws NullPointerException
   *             if the resource cannot be found
   * @throws NumberFormatException
   *             if a non-blank line is not an integer
   */
  public static Set<Integer> getChemicalSemanticTypes() {
    final String resourceName = "Chemicals semantic types.txt";
    Set<Integer> result = new TreeSet<Integer>();
    java.io.InputStream stream = OntologyUtilities.class.getResourceAsStream(resourceName);
    if (stream == null) {
      // Same exception type as before, but now it names the missing resource.
      throw new NullPointerException("Resource not found on class path: " + resourceName);
    }
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
    try {
      String line;
      // readLine() returning null is the end-of-stream signal; ready() only
      // says whether a read would block and is not a reliable end test.
      while ((line = bufferedReader.readLine()) != null) {
        line = line.trim();
        if (line.length() == 0) {
          continue;
        }
        result.add(-Integer.parseInt(line));
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        bufferedReader.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    return result;
  }

  /**
   * Filters an ontology by semantic type(s), removing concepts in place.
   * <p>
   * The parameter {@code inclusive} indicates whether the set of semantic types
   * should be considered an inclusive or an exclusive filter: an inclusive
   * filter keeps only the selected concepts, an exclusive filter removes them.
   * The parameter {@code strict} specifies whether a concept is selected when
   * it has only (true) or any (false) of the specified semantic types.
   * 
   * @param ontology
   *            the ontology to filter
   * @param semanticTypeFilter
   *            the semantic types that define the selection
   * @param inclusive
   *            true to keep the selection, false to remove it
   * @param strict
   *            true to select with {@link #getSemanticFilter(Ontology, Set)},
   *            false to select with
   *            {@link #getSemanticSelection(Ontology, Set)}
   */
  public static void filterOntologyBySemanticTypes(Ontology ontology, Set<Integer> semanticTypeFilter, boolean inclusive, boolean strict) {
    Set<Integer> selection = strict
        ? getSemanticFilter(ontology, semanticTypeFilter)
        : getSemanticSelection(ontology, semanticTypeFilter);
    Iterator<Concept> concepts = ontology.getConceptIterator();
    while (concepts.hasNext()) {
      boolean selected = selection.contains(concepts.next().getID());
      // Inclusive: drop what is not selected. Exclusive: drop what is selected.
      if (selected != inclusive) {
        concepts.remove();
      }
    }
  }

  /**
   * Selects all concepts that are only of the given semantic types.
   * <p>
   * Starts from {@link #getSemanticSelection(Ontology, Set)} and discards every
   * concept that also has a semantic type outside {@code semanticTypes}.
   * 
   * @param ontology
   *            the ontology providing the {@code isOfSemanticType} relations
   * @param semanticTypes
   *            the allowed semantic types
   * @return the IDs of the concepts whose semantic types are all allowed
   */
  public static Set<Integer> getSemanticFilter(Ontology ontology, Set<Integer> semanticTypes) {
    Set<Integer> result = getSemanticSelection(ontology, semanticTypes);
    for (Iterator<Integer> candidates = result.iterator(); candidates.hasNext();) {
      Integer cui = candidates.next();
      for (Relation relation: ontology.getRelationsForConceptAsSubject(cui, DefaultTypes.isOfSemanticType)) {
        if (!semanticTypes.contains(relation.object)) {
          // One foreign semantic type disqualifies the concept.
          candidates.remove();
          break;
        }
      }
    }
    return result;
  }

  /**
   * Selects all concepts that are of at least one of the given semantic types.
   * 
   * @param ontology
   *            the ontology providing the {@code isOfSemanticType} relations
   * @param semanticTypes
   *            the semantic types to look for
   * @return a new set with the subject IDs of all {@code isOfSemanticType}
   *         relations that have one of the given types as object
   */
  public static Set<Integer> getSemanticSelection(Ontology ontology, Set<Integer> semanticTypes) {
    Set<Integer> selectedConcepts = new HashSet<Integer>();
    Iterator<Integer> types = semanticTypes.iterator();
    while (types.hasNext()) {
      Integer semanticType = types.next();
      for (Relation typeRelation: ontology.getRelationsForConceptAsObject(semanticType, DefaultTypes.isOfSemanticType)) {
        selectedConcepts.add(typeRelation.subject);
      }
    }
    return selectedConcepts;
  }

  /**
   * Removes the terms specified in the removeTerms collection from the
   * ontology. Only concepts for which {@link #hasGeneVoc(Concept, Ontology)}
   * is true are touched; terms of all other concepts are left alone. The number
   * of terms to remove is printed to standard output first.
   *
   * @param ontology    The ontology to be filtered.
   * @param removeTerms The term texts that will be removed.
   */
  public static void removeTerms(Ontology ontology, Collection<String> removeTerms){
    System.out.println("Removing " + removeTerms.size() + " terms");
    for (Iterator<Concept> concepts = ontology.getConceptIterator(); concepts.hasNext();) {
      Concept concept = concepts.next();
      if (!hasGeneVoc(concept, ontology)) {
        continue;
      }
      for (Iterator<TermStore> terms = concept.getTerms().iterator(); terms.hasNext();) {
        if (removeTerms.contains(terms.next().text)) {
          terms.remove();
        }
      }
    }
  }
  
  /**
   * Loads the default stop words for filtering from the class path resource
   * "DefaultStopwordsForFiltering.txt", located relative to
   * {@code OntologyCurator}.
   * <p>
   * Every line of the resource becomes one entry, unmodified. An
   * {@code IOException} while reading is printed and whatever was read so far
   * is returned.
   *
   * @return a sorted set with one entry per line of the resource
   * @throws NullPointerException
   *             if the resource cannot be found
   */
  public static Set<String> getDefaultStopWordsForFiltering() {
    final String resourceName = "DefaultStopwordsForFiltering.txt";
    Set<String> result = new TreeSet<String>();
    java.io.InputStream stream = OntologyCurator.class.getResourceAsStream(resourceName);
    if (stream == null) {
      // Same exception type as before, but now it names the missing resource.
      throw new NullPointerException("Resource not found on class path: " + resourceName);
    }
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
    try {
      String line;
      // readLine() returning null is the end-of-stream signal; ready() only
      // says whether a read would block, and a null line must never reach the
      // TreeSet.
      while ((line = bufferedReader.readLine()) != null) {
        result.add(line);
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        bufferedReader.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    return result;
  }

	/**
	 * Loads the default stop words for indexing from the class path resource
	 * "DefaultStopwordsForIndexing.txt", located relative to
	 * {@code OntologyCurator}.
	 * <p>
	 * Every line of the resource becomes one entry, unmodified. An
	 * {@code IOException} while reading is printed and whatever was read so far
	 * is returned.
	 *
	 * @return a sorted set with one entry per line of the resource
	 * @throws NullPointerException
	 *             if the resource cannot be found
	 */
	private static Set<String> getDefaultStopWordsForIndexing() {
		final String resourceName = "DefaultStopwordsForIndexing.txt";
		Set<String> result = new TreeSet<String>();
		java.io.InputStream stream = OntologyCurator.class.getResourceAsStream(resourceName);
		if (stream == null) {
			// Same exception type as before, but now it names the missing resource.
			throw new NullPointerException("Resource not found on class path: " + resourceName);
		}
		BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream));
		try {
			String line;
			// readLine() returning null is the end-of-stream signal; ready() only
			// says whether a read would block, and a null line must never reach
			// the TreeSet.
			while ((line = bufferedReader.readLine()) != null) {
				result.add(line);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			try {
				bufferedReader.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return result;
	}

}