/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.ontology.ontologyutilities; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.erasmusmc.collections.ListTree; import org.erasmusmc.ontology.Concept; import org.erasmusmc.ontology.DefaultTypes; import org.erasmusmc.ontology.Ontology; import org.erasmusmc.ontology.Relation; import org.erasmusmc.ontology.TermStore; import org.erasmusmc.peregrine.SimpleTokenizer; import org.erasmusmc.peregrine.Tokenizer; import org.erasmusmc.utilities.StringUtilities; public class FamilyNameFinder { public String geneVoc = "GENE"; public int minFamilyNameLength = 5; public Map<String, Integer> findFamilyNames(Ontology ontology) { Map<String, Integer> result = new HashMap<String, Integer>(); // Generate set of terms: Set<String> termset = new HashSet<String>(); // CountingSet<String> termset = new CountingSet<String>(); Iterator<Concept> conceptIterator = ontology.getConceptIterator(); while (conceptIterator.hasNext()) { Concept concept = conceptIterator.next(); if (hasGeneVoc(concept, ontology)) for (TermStore term: concept.getTerms()) { termset.add(term.text); } } // Build tree: ListTree<String, List<String>> listTree = new ListTree<String, List<String>>(); // ListTree<String, List<String>> reverseListTree = new ListTree<String, // List<String>>(); Tokenizer tokenizer = new SimpleTokenizer(); for (String term: termset) { tokenizer.tokenize(term); tokenizer.tokens = toLowercase(tokenizer.tokens); addToListTree(tokenizer.tokens, term, listTree); } for (ListTree<String, List<String>> terminator: listTree.terminatorSet()) for (Map.Entry<String, ListTree<String, List<String>>> nextNode: terminator.subTree.entrySet()) if (isMemberIndicator(nextNode.getKey())) for (String term: terminator.value) if (term.length() > minFamilyNameLength && !StringUtilities.isAbbr(term)) { Integer count = result.get(term); if (count == null) { count = 0; } result.put(term, ++count); } return result; } public static List<String> findFamilyNamesListOutput(Ontology ontology) { FamilyNameFinder finder = new FamilyNameFinder(); return new ArrayList<String>(finder.findFamilyNames(ontology).keySet()); } private static boolean isMemberIndicator(String key) { return (StringUtilities.isNumber(key) || StringUtilities.isRomanNumeral(key) || StringUtilities.isGreekLetter(key)); } private static void addToListTree(List<String> tokens, String term, ListTree<String, List<String>> listTree) { ListTree<String, List<String>> terminator = listTree.get(tokens); List<String> identicalTerms; if (terminator == null || terminator.value == null) { identicalTerms = new ArrayList<String>(); listTree.put(tokens, identicalTerms); } else identicalTerms = terminator.value; identicalTerms.add(term); } private static List<String> toLowercase(List<String> tokens) { List<String> result = new ArrayList<String>(tokens.size()); for (String token: tokens) { if (StringUtilities.isAbbr(token)) result.add(token); else result.add(token.toLowerCase()); } return result; } private int fromVocabulary = DefaultTypes.fromVocabulary; private boolean hasGeneVoc(Concept concept, Ontology ontology) { List<Relation> relations = ontology.getRelationsForConceptAsSubject(concept.getID(), fromVocabulary); if (geneVoc.equals("") && relations.size() == 0) return true; for (Relation relation: relations) { if (ontology.getConcept(relation.object).getName().equals(geneVoc)) return true; } return false; } /** * System.out.println("Counting substrings " + StringUtilities.now()); List<String> * lines = new ArrayList<String>(); Map<List<String>, Integer> term2count = * new HashMap<List<String>, Integer>(); Set<List<String>> typicalTerms = * new HashSet<List<String>>(); for (ListTree<String, List<String>> * terminator : listTree.terminatorSet()){ int superSets = * terminator.terminatorSet().size()-1; if (superSets > 0) * term2count.put(terminator.value, superSets); } for (ListTree<String, List<String>> * terminator : reverseListTree.terminatorSet()){ int superSets = * terminator.terminatorSet().size()-1; if (superSets > 0){ Integer count = * term2count.get(terminator.value); if (count == null) count = 0; for (String * term : terminator.value){ lines.add((count + superSets) + "\t" + count + * "\t" + superSets + "\t" + term); } * } } */ }