/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.ontology.ontologyutilities;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.peregrine.SimpleTokenizer;
import org.erasmusmc.peregrine.Tokenizer;
import org.erasmusmc.peregrine.UMLSGeneChemTokenizer;
import org.erasmusmc.utilities.StringUtilities;
/**
 * Static helpers for curating an {@link Ontology}: merging concepts, setting
 * the matching flags of terms, filtering terms and concepts, and loading the
 * default stop word and chemical semantic type lists.
 * <br>
 * <br>
 * All configuration is held in public static fields, and the tokenizing
 * methods share the single static {@link #tokenizer} instance and read its
 * state after each call.
 */
public class OntologyUtilities {
  /** Stop words used by the term filter; loaded from "DefaultStopwordsForFiltering.txt". */
  public static Set<String> stopwordsForFiltering = getDefaultStopWordsForFiltering();

  /** Stop words for indexing; loaded from "DefaultStopwordsForIndexing.txt". */
  public static Set<String> stopwordsForIndexing = getDefaultStopWordsForIndexing();

  /**
   * Identifiers of the semantic types regarded as chemical, as produced by
   * {@link #getChemicalSemanticTypes()} (the values from the resource file,
   * negated).
   */
  public static Set<Integer> chemicalSemanticTypes = getChemicalSemanticTypes();

  /** Specifies the maximum amount of characters allowed for a gene symbol. */
  public static int maxGeneSymbolLength = 6;

  /** Name of the vocabulary concept that marks a concept as a gene; an empty string matches every concept. */
  public static String geneVocabulary = "GENE";

  /** Name of the vocabulary concept that marks a concept as a chemical; an empty string matches every concept. */
  public static String chemVocabulary = "CHEMICAL";

  /** Concept IDs equal to or above this value are treated as chemical concept IDs. */
  public static int minChemID = 4000000;

  /** Tokenizer shared by all tokenizing methods of this class. */
  public static Tokenizer tokenizer = new SimpleTokenizer();
  //public static Tokenizer tokenizer = new UMLSGeneChemTokenizer();

  /**
   * Terms will not be tossed out by the filter if they have at least the
   * specified number of tokens. <br>
   * <br>
   * The default value is 7
   */
  public static int minTokenNumberForNoFilter = 7;

  /**
   * The minimum number of characters for a word to be considered non-ambiguous
   * for the filter. <br>
   * <br>
   * The default value is 2.
   */
  public static int minWordSize = 2;

  /**
   * Terms with less characters will be removed by the filter. <br>
   * <br>
   * The default value is 3.
   */
  public static int minTermSize = 3;

  /**
   * Terms consisting of more tokens will always be matched order-sensitive
   * (default = 7). <br>
   * <br>
   * Only referenced by the disabled code in
   * {@link #setDefaultMatchingFlags(TermStore)}.
   */
  public static int maxTermLengthForOrderInsensitivity = 7;

  /**
   * If one of the tokens consists of fewer than the specified number of
   * characters, the term will always be matched order-sensitive. The
   * assumption is that it is likely a systematic name such as a chemical
   * formula, for which order is important. The default value is 3. <br>
   * <br>
   * Only referenced by the disabled code in
   * {@link #setDefaultMatchingFlags(TermStore)}.
   */
  public static int minTokenLengthForOrderInsensitivity = 3;

  /**
   * Tells whether the concept originates from the gene vocabulary.
   *
   * @param concept
   *            Concept to inspect.
   * @param ontology
   *            Ontology holding the concept's relations.
   * @return true when {@link #geneVocabulary} is empty, or when the concept
   *         has a fromVocabulary relation to a concept with that name.
   */
  public static boolean hasGeneVoc(Concept concept, Ontology ontology) {
    return hasVocabulary(concept, ontology, geneVocabulary);
  }

  /**
   * Tells whether the concept originates from the chemical vocabulary.
   *
   * @param concept
   *            Concept to inspect.
   * @param ontology
   *            Ontology holding the concept's relations.
   * @return true when {@link #chemVocabulary} is empty, or when the concept
   *         has a fromVocabulary relation to a concept with that name.
   */
  public static boolean hasChemVoc(Concept concept, Ontology ontology) {
    return hasVocabulary(concept, ontology, chemVocabulary);
  }

  /** Shared implementation of the vocabulary checks: an empty vocabulary name matches everything. */
  private static boolean hasVocabulary(Concept concept, Ontology ontology, String vocabulary) {
    if (vocabulary.length() == 0)
      return true;
    List<Relation> vocabularyRelations = ontology.getRelationsForConceptAsSubject(concept.getID(), DefaultTypes.fromVocabulary);
    for (Relation vocabularyRelation: vocabularyRelations) {
      if (ontology.getConcept(vocabularyRelation.object).getName().equals(vocabulary))
        return true;
    }
    return false;
  }

  /**
   * Tells whether a concept has at least one semantic type that is listed in
   * {@link #chemicalSemanticTypes}.
   *
   * @param conceptID
   *            ID of the concept to inspect.
   * @param ontology
   *            Ontology holding the concept's relations.
   */
  public static boolean isChemical(Integer conceptID, Ontology ontology) {
    for (Relation typeRelation: ontology.getRelationsForConceptAsSubject(conceptID, DefaultTypes.isOfSemanticType)) {
      if (chemicalSemanticTypes.contains(typeRelation.object))
        return true;
    }
    return false;
  }

  /**
   * Sets the matching flags of every term in the ontology: terms of concepts
   * for which {@link #hasGeneVoc(Concept, Ontology)} holds get the gene/chem
   * flags, all other terms get the default flags.
   *
   * @param ontology
   *            Ontology whose terms are updated in place.
   */
  public static void setMatchingFlagsForOntology(Ontology ontology) {
    Iterator<Concept> concepts = ontology.getConceptIterator();
    while (concepts.hasNext()) {
      Concept concept = concepts.next();
      boolean geneConcept = hasGeneVoc(concept, ontology);
      for (TermStore term: concept.getTerms()) {
        if (geneConcept)
          setGeneChemMatchingFlags(term);
        else
          setDefaultMatchingFlags(term);
      }
    }
  }

  /**
   * Creates a new concept and merges two existing concepts into it. Both
   * source concepts are removed from the ontology.
   *
   * @param ontology
   *            Ontology to modify.
   * @param id1
   *            ID of the first concept to merge.
   * @param id2
   *            ID of the second concept to merge.
   * @param newConceptID
   *            ID of the concept that is created to receive the merge.
   */
  public static void mergeConceptsIntoNew(Ontology ontology, int id1, int id2, int newConceptID) {
    ontology.setConcept(new Concept(newConceptID));
    mergeConcepts(ontology, id1, newConceptID);
    mergeConcepts(ontology, id2, newConceptID);
  }

  /**
   * Merges one concept into another and removes the source concept.
   *
   * @see #mergeConcepts(Ontology, int, int, boolean)
   */
  public static void mergeConcepts(Ontology ontology, int fromCUI, int toCUI) {
    mergeConcepts(ontology, fromCUI, toCUI, true);
  }

  /**
   * Merges the concept <code>fromCUI</code> into the concept
   * <code>toCUI</code>: terms whose text is not yet present in the target are
   * added, a non-empty definition is appended to the target's definition
   * (separated by ";"), all relations of the source are re-pointed to the
   * target and the source's database IDs are set on the target. <br>
   * <br>
   * Merging a concept with itself, or merging when either concept does not
   * exist, only prints a message to standard output and changes nothing.
   *
   * @param ontology
   *            Ontology to modify.
   * @param fromCUI
   *            ID of the concept that is merged away.
   * @param toCUI
   *            ID of the concept that receives the merge.
   * @param removeFromConcept
   *            Whether the source concept is removed from the ontology
   *            afterwards.
   */
  public static void mergeConcepts(Ontology ontology, int fromCUI, int toCUI, boolean removeFromConcept) {
    if (fromCUI == toCUI) {
      System.out.println("ERROR: attempted to merge " + fromCUI + " to itself!");
      return;
    }
    Concept fromConcept = ontology.getConcept(fromCUI);
    Concept toConcept = ontology.getConcept(toCUI);
    if (fromConcept == null || toConcept == null) {
      System.out.println("Attempted merge with a non existing Concept: either " + toCUI + " and/or " + fromCUI);
      return;
    }

    // Collect everything attached to the source before anything is modified.
    List<TermStore> fromTerms = fromConcept.getTerms();
    List<Relation> relationsAsSubject = ontology.getRelationsForConceptAsSubject(fromCUI);
    List<Relation> relationsAsObject = ontology.getRelationsForConceptAsObject(fromCUI);
    List<DatabaseID> fromDatabaseIDs = ontology.getDatabaseIDsForConcept(fromCUI);

    // Terms: only add texts the target did not have before the merge.
    List<TermStore> mergedTerms = toConcept.getTerms();
    Set<String> existingTexts = getTermsAsSet(mergedTerms);
    for (TermStore term: fromTerms) {
      if (!existingTexts.contains(term.text))
        mergedTerms.add(term);
    }
    toConcept.setTerms(mergedTerms);

    // Definition: keep the target's definition in front when it has one.
    String fromDefinition = fromConcept.getDefinition();
    if (!fromDefinition.equals("")) {
      String toDefinition = toConcept.getDefinition();
      if (toDefinition.equals(""))
        toConcept.setDefinition(fromDefinition);
      else
        toConcept.setDefinition(toDefinition + ";" + fromDefinition);
    }

    // Relations: re-point both ends to the target concept.
    for (Relation relation: relationsAsObject) {
      relation.object = toCUI;
      ontology.setRelation(relation);
    }
    for (Relation relation: relationsAsSubject) {
      relation.subject = toCUI;
      ontology.setRelation(relation);
    }

    for (DatabaseID databaseID: fromDatabaseIDs)
      ontology.setDatabaseIDForConcept(toCUI, databaseID);

    if (removeFromConcept)
      ontology.removeConcept(fromCUI);
  }

  /**
   * Sets the default matching flags on a term: case-insensitive, normalised
   * and order-sensitive.
   *
   * @param term
   *            Term that is updated in place.
   */
  public static void setDefaultMatchingFlags(TermStore term) {
    term.caseSensitive = false;
    term.normalised = true;
    term.orderSensitive = true;
    // Disabled: order-insensitive matching for short terms without very
    // short or numeric tokens. Kept for reference.
    /*
    tokenizer.tokenize(term.text);
    if (tokenizer.tokens.size() <= maxTermLengthForOrderInsensitivity) {
      term.orderSensitive = false;
      for (String token: tokenizer.tokens) {
        if (token.length() < minTokenLengthForOrderInsensitivity || StringUtilities.containsNumber(token)) {
          term.orderSensitive = true;
          break;
        }
      }
    }
    */
  }

  /**
   * Tells whether a string is treated as a gene symbol. That is the case when
   * it contains no space and {@link StringUtilities#isAbbr(String)} accepts
   * it, or when it is not longer than {@link #maxGeneSymbolLength}.
   *
   * @param string
   *            Text to inspect.
   */
  public static boolean isGeneSymbol(String string) {
    boolean singleAbbreviation = !string.contains(" ") && StringUtilities.isAbbr(string);
    return singleAbbreviation || string.length() <= maxGeneSymbolLength;
  }

  /**
   * Tokenizes a term with the shared {@link #tokenizer}, drops every token
   * that is a stop word (compared in lower case) unless
   * {@link StringUtilities#isAbbr(String)} accepts it, and joins the
   * remaining tokens with single spaces.
   *
   * @param term
   *            Text to tokenize.
   * @param stopwords
   *            Lower-case stop words to remove.
   * @return The remaining tokens joined by a space.
   */
  public static String tokenizeAndRemoveStopwordsFromString(String term, Set<String> stopwords) {
    //tokenizer = new SimpleTokenizer();
    tokenizer.tokenize(term);
    // Walk backwards so removing a token does not shift the indices still to visit.
    for (int index = tokenizer.tokens.size() - 1; index >= 0; index--) {
      String token = tokenizer.tokens.get(index);
      if (!StringUtilities.isAbbr(token) && stopwords.contains(token.toLowerCase()))
        tokenizer.removeToken(index);
    }
    return StringUtilities.join(tokenizer.tokens, " ");
  }

  /**
   * Sets the matching flags for a gene/chemical term: order-sensitive and not
   * normalised. A term that {@link #isGeneSymbol(String)} accepts is matched
   * case-sensitive, unless it contains a number and its lower-cased text is
   * not rejected by {@link #MartijnsFilterRule(String, Set)}; all other terms
   * are matched case-insensitive.
   *
   * @param term
   *            Term that is updated in place.
   */
  public static void setGeneChemMatchingFlags(TermStore term) {
    term.orderSensitive = true;
    term.caseSensitive = false;
    term.normalised = false;
    if (isGeneSymbol(term.text)) {
      boolean unambiguousNumberedSymbol = StringUtilities.containsNumber(term.text)
          && !MartijnsFilterRule(term.text.toLowerCase(), stopwordsForFiltering);
      term.caseSensitive = !unambiguousNumberedSymbol;
    }
  }

  /**
   * Tells whether the concept's ID lies in the chemical ID range, i.e. is
   * equal to or above {@link #minChemID}.
   *
   * @param concept
   *            Concept to inspect.
   */
  public static boolean hasChemicalConceptID (Concept concept){
    return concept.getID() >= minChemID;
  }

  /**
   * Removes terms that consist only of ambiguous words and/or numbers (see
   * {@link #MartijnsFilterRule(String, Set)}), terms shorter than
   * {@link #minTermSize} characters, and terms whose text was already seen
   * earlier in the same concept.
   *
   * @param ontology
   *            Ontology to be filtered.
   * @param stopwordsForFiltering
   *            Lower-case words that are regarded as ambiguous.
   */
  public static void filterOntology(Ontology ontology, Set<String> stopwordsForFiltering) {
    Iterator<Concept> concepts = ontology.getConceptIterator();
    while (concepts.hasNext()) {
      Set<String> keptTexts = new HashSet<String>();
      Iterator<TermStore> terms = concepts.next().getTerms().iterator();
      while (terms.hasNext()) {
        String text = terms.next().text;
        boolean reject = keptTexts.contains(text)
            || MartijnsFilterRule(text, stopwordsForFiltering)
            || text.length() < minTermSize;
        if (reject)
          terms.remove();
        else
          keptTexts.add(text);
      }
    }
  }

  /**
   * Decides whether a term should be filtered out. A term with at least
   * {@link #minTokenNumberForNoFilter} tokens is never filtered. Otherwise it
   * is filtered unless it contains at least one informative token: a token of
   * at least {@link #minWordSize} characters that is neither a number nor a
   * roman numeral, and that is either accepted by
   * {@link StringUtilities#isAbbr(String)} or not a stop word.
   *
   * @param term
   *            Text of the term; tokenized with the shared {@link #tokenizer}.
   * @param stopwordsForFiltering
   *            Lower-case words that are regarded as ambiguous.
   * @return true if the term should be removed.
   */
  public static boolean MartijnsFilterRule(String term, Set<String> stopwordsForFiltering) {
    tokenizer.tokenize(term);
    if (tokenizer.tokens.size() >= minTokenNumberForNoFilter)
      return false;
    for (String token: tokenizer.tokens) {
      if (token.length() < minWordSize)
        continue;
      if (StringUtilities.isNumber(token) || StringUtilities.isRomanNumeral(token))
        continue;
      if (StringUtilities.isAbbr(token) || !stopwordsForFiltering.contains(token.toLowerCase()))
        return false; // one informative token is enough to keep the term
    }
    return true;
  }

  /**
   * Removes every term whose text already occurred earlier in the list; the
   * first occurrence is kept.
   *
   * @param terms
   *            List that is modified in place.
   */
  public static void removeDuplicateTerms(List<TermStore> terms) {
    Set<String> seenTexts = new HashSet<String>();
    for (Iterator<TermStore> iterator = terms.iterator(); iterator.hasNext();) {
      // Set.add returns false when the text was already present.
      if (!seenTexts.add(iterator.next().text))
        iterator.remove();
    }
  }

  /**
   * Collects the texts of the given terms.
   *
   * @param terms
   *            Terms to read.
   * @return A new set with the text of every term.
   */
  public static Set<String> getTermsAsSet(List<TermStore> terms) {
    Set<String> texts = new HashSet<String>();
    for (TermStore term: terms)
      texts.add(term.text);
    return texts;
  }

  /**
   * Loads the chemical semantic types from the class path resource
   * "Chemicals semantic types.txt" (resolved relative to this class). Every
   * non-blank line is parsed as an integer and stored negated.
   *
   * @return The negated values, sorted.
   * @throws NumberFormatException
   *             if a non-blank line is not an integer.
   */
  public static Set<Integer> getChemicalSemanticTypes() {
    Set<Integer> result = new TreeSet<Integer>();
    for (String line: readResourceLines(OntologyUtilities.class, "Chemicals semantic types.txt")) {
      String number = line.trim();
      if (number.length() > 0)
        result.add(-Integer.parseInt(number));
    }
    return result;
  }

  /**
   * This function filters an ontology by semantictype(s). The
   * parameter inclusive indicates whether the set of semantic types should be
   * considered an inclusive or exclusive filter. The parameter strict
   * specifies whether the filter includes/excludes concepts that have only
   * (true) or any (false) of the specified semantic types.
   *
   * @param ontology
   *            Ontology whose concepts are removed in place.
   * @param semanticTypeFilter
   *            IDs of the semantic types to filter on.
   * @param inclusive
   *            true to keep only the selected concepts, false to remove them.
   * @param strict
   *            true to select concepts having only the given types, false to
   *            select concepts having any of them.
   */
  public static void filterOntologyBySemanticTypes(Ontology ontology, Set<Integer> semanticTypeFilter, boolean inclusive, boolean strict) {
    Set<Integer> selection = strict
        ? getSemanticFilter(ontology, semanticTypeFilter)
        : getSemanticSelection(ontology, semanticTypeFilter);
    Iterator<Concept> concepts = ontology.getConceptIterator();
    while (concepts.hasNext()) {
      boolean selected = selection.contains(concepts.next().getID());
      // Inclusive: drop what is not selected. Exclusive: drop what is selected.
      if (selected != inclusive)
        concepts.remove();
    }
  }

  /**
   * This function selects all concepts that are only of the given semantic
   * types.
   *
   * @param ontology
   *            Ontology to read.
   * @param semanticTypes
   *            IDs of the allowed semantic types.
   * @return IDs of the concepts that have at least one of the given types and
   *         no semantic type outside the given set.
   */
  public static Set<Integer> getSemanticFilter(Ontology ontology, Set<Integer> semanticTypes) {
    Set<Integer> result = getSemanticSelection(ontology, semanticTypes);
    Iterator<Integer> candidates = result.iterator();
    while (candidates.hasNext()) {
      Integer cui = candidates.next();
      for (Relation typeRelation: ontology.getRelationsForConceptAsSubject(cui, DefaultTypes.isOfSemanticType)) {
        if (!semanticTypes.contains(typeRelation.object)) {
          candidates.remove();
          break;
        }
      }
    }
    return result;
  }

  /**
   * This function selects all concepts that are of one of the given semantic
   * types.
   *
   * @param ontology
   *            Ontology to read.
   * @param semanticTypes
   *            IDs of the semantic types to select on.
   * @return IDs of the subjects of all isOfSemanticType relations that point
   *         to one of the given types.
   */
  public static Set<Integer> getSemanticSelection(Ontology ontology, Set<Integer> semanticTypes) {
    Set<Integer> result = new HashSet<Integer>();
    for (Integer semanticType: semanticTypes) {
      for (Relation typeRelation: ontology.getRelationsForConceptAsObject(semanticType, DefaultTypes.isOfSemanticType))
        result.add(typeRelation.subject);
    }
    return result;
  }

  /**
   * Removes the terms specified in removeTerms from the ontology. Only
   * concepts for which {@link #hasGeneVoc(Concept, Ontology)} holds are
   * touched. The number of terms to remove is printed to standard output.
   *
   * @param ontology
   *            The ontology to be filtered.
   * @param removeTerms
   *            The term texts that will be removed.
   */
  public static void removeTerms(Ontology ontology, Collection<String> removeTerms){
    System.out.println("Removing " + removeTerms.size() + " terms");
    Iterator<Concept> concepts = ontology.getConceptIterator();
    while (concepts.hasNext()){
      Concept concept = concepts.next();
      if (!hasGeneVoc(concept, ontology))
        continue;
      Iterator<TermStore> terms = concept.getTerms().iterator();
      while (terms.hasNext()){
        if (removeTerms.contains(terms.next().text))
          terms.remove();
      }
    }
  }

  /**
   * Loads the default stop words for filtering from the class path resource
   * "DefaultStopwordsForFiltering.txt" (resolved relative to
   * OntologyCurator), one stop word per line.
   *
   * @return The lines of the resource as a sorted set.
   */
  public static Set<String> getDefaultStopWordsForFiltering() {
    return readResourceLines(OntologyCurator.class, "DefaultStopwordsForFiltering.txt");
  }

  /**
   * Loads the default stop words for indexing from the class path resource
   * "DefaultStopwordsForIndexing.txt" (resolved relative to
   * OntologyCurator), one stop word per line.
   *
   * @return The lines of the resource as a sorted set.
   */
  private static Set<String> getDefaultStopWordsForIndexing() {
    return readResourceLines(OntologyCurator.class, "DefaultStopwordsForIndexing.txt");
  }

  /**
   * Reads all lines of a class path resource into a sorted set and always
   * closes the reader. Reading stops at end of file (readLine returning
   * null). An IOException is reported on standard error and the lines read
   * so far are returned.
   */
  private static Set<String> readResourceLines(Class<?> anchor, String resourceName) {
    Set<String> lines = new TreeSet<String>();
    // NOTE(review): a missing resource makes getResourceAsStream return null,
    // which fails here with a NullPointerException, as it did before.
    // The platform default charset is used, as in the original code.
    BufferedReader reader = new BufferedReader(new InputStreamReader(anchor.getResourceAsStream(resourceName)));
    try {
      String line;
      while ((line = reader.readLine()) != null)
        lines.add(line);
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        reader.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    return lines;
  }
}