/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 * Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package org.erasmusmc.peregrine;

//Usage:
// First load an ontology in the 'ontology' field
// (optional) Load normaliser cache from disc
// (optional) Load stopwords
// Release thesaurus
// Index
// Retrieve results from 'concepts' field
// (A minimal end-to-end sketch appears at the bottom of this file.)

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.textMining.LVG.LVGNormaliser;
//import org.erasmusmc.utilities.LVGNormaliser;

/** Finds concepts that are defined in an ontology in text. */
public class ConceptPeregrine extends AbstractPeregrine {

  /**
   * Specifies the window size for finding the next word of a term.
   * A window size of 1 means that no other words are allowed between the words of a term.
   * <br><br>The default value is 1.
   */
  public int windowSize = 1;

  /**
   * Specifies whether the input text should also be normalised before matching.
   * If at least one term in the ontology has the normalisation flag set, this will
   * automatically be turned to true during release.
   * <br><br>The default value is false.
   */
  public boolean normalize = false;

  /**
   * If this parameter is set to true and several terms map to the same words, only the
   * term consisting of the most words will be selected.
   * For example, suppose 'Alzheimer's disease' maps to two terms, 'Alzheimer's disease'
   * and 'disease': only the first term will then be selected.
   * <br><br>The default value is true.
   */
  public boolean biggestMatchOnly = true;

  /**
   * If true, the entire ontology structure will be destroyed during release, thus saving memory.
   * <br><br>The default value is false.
   */
  public boolean destroyOntologyDuringRelease = false;

  /**
   * If true, statistics on the use of tokens will be collected during release that can be
   * used by some external modules.
   * <br><br>The default value is true.
   */
  public boolean countTokenUsage = true; //Token usage is used by the disambiguator
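  /* Illustration (not part of the original source) of the windowSize parameter above: with
   * the default windowSize of 1, the two-word term 'breast cancer' only matches when its
   * tokens are adjacent in the text. With windowSize = 2, one other token may occur in
   * between, so the term would also match in 'breast ductal cancer'. Note that tokens
   * unknown to the thesaurus still occupy a position and count towards the window. */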
  public ConceptPeregrine() {
    normaliser = new LVGNormaliser();
    tokenizer = new UMLSGeneChemTokenizer();
  }

  public ConceptPeregrine(String lvgPropertiesPath) {
    if (lvgPropertiesPath != null)
      normaliser = new LVGNormaliser(lvgPropertiesPath);
    tokenizer = new UMLSGeneChemTokenizer();
  }

  private boolean newTerm;

  public void release() {
    if (destroyOntologyDuringRelease && !(ontology instanceof OntologyStore)) {
      destroyOntologyDuringRelease = false;
    }
    words.clear();
    normwords.clear();
    lcwords.clear();
    terms.clear();
    token2Term.clear();
    pair2Termlinks.clear();
    current = 0;
    lastTokenID = 0;
    ontologyName = ontology.getName();
    if (countTokenUsage)
      token2count = new TreeMap<Integer, Count>();

    Set<Integer> hashCache = new HashSet<Integer>();
    List<String> tokens;
    int[] tokenIDs;
    TermStore term;
    ReleasedTerm releasedTerm;
    Map<String, Integer> wordlist;
    Iterator<Concept> values = ontology.getConceptIterator();
    while (values.hasNext()) {
      Concept concept = values.next();
      List<TermStore> terms = concept.getTerms();
      for (int j = 0; j < terms.size(); j++) {
        term = terms.get(j);
        initializeIndex(term.text); //Implies tokenization and stopword removal
        if (term.normalised) {
          tokens = normalise(tokenizer.tokens);
          wordlist = normwords;
          normalize = true; //At least one normalised term: turn normalisation on
        } else if (term.caseSensitive) {
          tokens = casesentiveCaseNorm(tokenizer.tokens);
          wordlist = words;
        } else {
          tokens = toLowercase(tokenizer.tokens);
          wordlist = lcwords;
        }
        if (tokens.size() > 127) {
          System.err.println("Error: terms longer than 127 tokens are not supported! Truncating term: " + term.text);
          tokens = tokens.subList(0, 127);
        }

        newTerm = false;
        releasedTerm = null;
        tokenIDs = tokens2NewTokenIDs(tokens, wordlist);
        int hash = tokensHash(tokens);
        if (!newTerm) { //Quick check using hashCache to see if term is new:
          if (!hashCache.contains(hash)) {
            newTerm = true;
            hashCache.add(hash);
          }
        } else
          hashCache.add(hash);

        if (!newTerm) { //Exhaustive check for homonyms:
          newTerm = true;
          checkTokens(tokens2TokenIDs(tokens, wordlist), 0, tokens.size() - 1);
          for (int t = 0; t < resultTerms.size(); t++) {
            releasedTerm = resultTerms.get(t).term;
            if (releasedTerm.length == tokens.size() && releasedTerm.ordered == term.orderSensitive) {
              newTerm = false;
              break;
            }
          }
        }

        if (newTerm) { //No homonym found: add term to thesaurus
          releasedTerm = addTerm(term, tokens.size(), concept.getID(), j);
          if (tokenIDs.length == 1) { //Single token term:
            token2Term.put(tokenIDs[0], releasedTerm);
          } else { //Multi-token term:
            if (term.orderSensitive) {
              for (int w1 = 0; w1 < tokenIDs.length - 1; w1++) {
                TokenPair tokenPair = new TokenPair(tokenIDs[w1], tokenIDs[w1 + 1]);
                addTokenPair(tokenPair, releasedTerm, w1, w1 + 1);
              }
            } else { //Order insensitive:
              for (int w1 = 0; w1 < tokenIDs.length; w1++)
                for (int w2 = 0; w2 < tokenIDs.length; w2++)
                  if (w1 != w2) {
                    TokenPair tokenPair = new TokenPair(tokenIDs[w1], tokenIDs[w2]);
                    addTokenPair(tokenPair, releasedTerm, w1, w2);
                  }
            }
          }
        }
        //If duplicate terms per concept: only accept first term:
        if (releasedTerm.conceptId[releasedTerm.conceptId.length - 1] != concept.getID()) {
          releasedTerm.addConceptAndTermID(concept.getID(), j);
        }
      }
      if (destroyOntologyDuringRelease)
        values.remove();
    }
    if (destroyOntologyDuringRelease)
      ontology = null;
    trimMemory();
  }
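  /* Illustration (not part of the original source) of the token-pair index built in
   * release() above. For an order-sensitive three-token term [A, B, C], only the
   * consecutive pairs (A,B) and (B,C) are registered. For an order-insensitive term,
   * all ordered pairs of distinct positions are registered: (A,B), (A,C), (B,A), (B,C),
   * (C,A) and (C,B), so the term can be found regardless of the word order in the text. */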
  private void addTokenPair(TokenPair tokenPair, ReleasedTerm releasedTerm, int w1, int w2) {
    List<TermLink> termlinks = pair2Termlinks.get(tokenPair);
    if (termlinks == null) {
      termlinks = new ArrayList<TermLink>();
      pair2Termlinks.put(tokenPair, termlinks);
    }
    termlinks.add(new TermLink(releasedTerm, w1, w2));
  }

  public void index(String string) {
    initializeIndex(string);
    int lineStart = 0;
    int lineEnd;
    tokenIDslist.clear();
    tokenIDslist.add(tokens2TokenIDs(casesentiveCaseNorm(tokenizer.tokens), words));
    if (normalize)
      tokenIDslist.add(tokens2TokenIDs(normalise(tokenizer.tokens), normwords));
    tokenIDslist.add(tokens2TokenIDs(toLowercase(tokenizer.tokens), lcwords));
    List<Integer> endOfSentence;
    if (tokenizer instanceof SubSentenceTokenizer)
      endOfSentence = ((SubSentenceTokenizer) tokenizer).getSubEndOfSentences();
    else
      endOfSentence = tokenizer.endOfSentence;
    for (int i = 0; i < endOfSentence.size(); i++) { //Find matches per sentence:
      lineEnd = endOfSentence.get(i) - 1;
      for (int[] tokenIDs : tokenIDslist) {
        checkTokens(tokenIDs, lineStart, lineEnd);
      }
      lineStart = lineEnd + 1;
    }
    if (biggestMatchOnly) {
      removeSmallMatches(resultTerms);
    }
    mapTerms2Concepts(resultTerms, resultConcepts);
  }

  private IndexTerm createAndAddIndexTerm(ReleasedTerm term) {
    IndexTerm indexTerm = new IndexTerm();
    indexTerm.checkedWordPos = new int[term.length];
    for (int i = 0; i < term.length; i++)
      indexTerm.checkedWordPos[i] = -1;
    term.modified = current + indexTerms.size();
    indexTerms.add(indexTerm);
    return indexTerm;
  }
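  /* Sketch (not part of the original source) of how checkTokens() below finds a match for
   * the ordered two-token term 'breast cancer' in "... breast cancer ...":
   *   1. w1 = position of 'breast': no single-token term is registered for it.
   *   2. w2 = w1+1 = position of 'cancer': the pair (breast,cancer) is found in
   *      pair2Termlinks, and a fresh IndexTerm records both word positions.
   *   3. checkedCount now equals the term length (2), so addMatch() stores the result. */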
  protected void checkTokens(int[] tokenIDs, int lineStart, int lineEnd) {
    current += indexTerms.size() + 1;
    if (current > Integer.MAX_VALUE - 10000000) {
      current = 1;
      for (ReleasedTerm term : terms) {
        term.modified = 0;
      }
    }
    indexTerms.clear();
    List<TermLink> termLinks;
    ReleasedTerm currentterm;
    for (int w1 = lineStart; w1 <= lineEnd; w1++) {
      if (tokenIDs[w1] != -1) {
        //Check single token terms:
        currentterm = token2Term.get(tokenIDs[w1]);
        if (currentterm != null) {
          createAndAddIndexTerm(currentterm).insert(w1);
          addMatch(currentterm);
        }
        //Generate token-pairs:
        TokenPair tokenPair = new TokenPair(0, 0);
        int last = Math.min(lineEnd, w1 + windowSize);
        for (int w2 = w1 + 1; w2 <= last; w2++) {
          if (tokenIDs[w2] != -1) {
            tokenPair.setTokens(tokenIDs[w1], tokenIDs[w2]);
            termLinks = pair2Termlinks.get(tokenPair);
            if (termLinks != null) {
              for (int t = 0; t < termLinks.size(); t++) {
                TermLink termlink = termLinks.get(t);
                currentterm = termlink.term;
                IndexTerm indexTerm;
                if (current > currentterm.modified) {
                  indexTerm = createAndAddIndexTerm(currentterm);
                } else {
                  //Debug check (marked for deletion in the original):
                  if (currentterm.modified - current < 0)
                    System.out.println("Strange difference: " + currentterm.modified + "-" + current);
                  indexTerm = indexTerms.get(currentterm.modified - current);
                  if (w2 - indexTerm.lastChecked > windowSize)
                    indexTerm.clear();
                }
                //Check if this word was not already used to match this term:
                if (w2 != indexTerm.lastChecked) {
                  if (currentterm.ordered) {
                    if (termlink.wordPos1 == 0) { //First pair of this term
                      if (indexTerm.checkedCount == 0)
                        indexTerm.insertFirst(termlink, w1, w2);
                      else if (!otherPairOfThisTerm(termLinks, currentterm, t)) { //checkedCount != 0
                        indexTerm.clear();
                        indexTerm.insertFirst(termlink, w1, w2);
                      }
                    } else { //Following pairs of this term
                      if (indexTerm.checkedCount == termlink.wordPos2)
                        indexTerm.insert(termlink, w1, w2);
                      //else if (!otherPairOfThisTerm(termLinks, currentterm, t))
                      //  indexTerm.clear();
                    }
                  } else { //Unordered
                    if (indexTerm.checkedCount == 0) //First pair of this term
                      indexTerm.insertFirst(termlink, w1, w2);
                    else {
                      if (indexTerm.checkedWordPos[termlink.wordPos1] == w1
                          && indexTerm.checkedWordPos[termlink.wordPos2] == -1) //Following pairs of this term
                        indexTerm.insert(termlink, w1, w2);
                      else { //Didn't fit
                        if (!otherPairOfThisTerm(termLinks, currentterm, t)) {
                          //There's not going to be another pair that will fit
                          indexTerm.clear();
                          indexTerm.insertFirst(termlink, w1, w2);
                        }
                      }
                    }
                  }
                  if (indexTerm.checkedCount == currentterm.length) {
                    addMatch(currentterm);
                  }
                }
              }
            }
          }
        }
      }
    }
  }

  private final boolean otherPairOfThisTerm(List<TermLink> termLinks, ReleasedTerm term, int t) {
    if (t == termLinks.size() - 1)
      return false;
    return termLinks.get(t + 1).term == term;
  }

  //Generate resultConcepts based on resultTerms:
  protected static void mapTerms2Concepts(List<ResultTerm> resultTerms, List<ResultConcept> resultConcepts) {
    resultConcepts.clear();
    Map<Integer, ResultConcept> id2concept = new TreeMap<Integer, ResultConcept>();
    int conceptId;
    for (ResultTerm resultterm : resultTerms) {
      for (int i = 0; i < resultterm.term.conceptId.length; i++) {
        conceptId = resultterm.term.conceptId[i];
        ResultConcept resultconcept = id2concept.get(conceptId);
        if (resultconcept == null) {
          resultconcept = new ResultConcept();
          resultconcept.conceptId = conceptId;
          id2concept.put(conceptId, resultconcept);
          resultConcepts.add(resultconcept);
        }
        resultconcept.terms.add(resultterm);
      }
    }
  }

  protected int tokensHash(List<String> tokens) {
    int hash = 0;
    for (String token : tokens) {
      hash += token.hashCode();
    }
    return hash;
  }

  protected int[] tokens2NewTokenIDs(List<String> tokens, Map<String, Integer> wordlist) {
    int[] result = new int[tokens.size()];
    Integer id;
    Count count;
    for (int i = 0; i < tokens.size(); i++) {
      id = wordlist.get(tokens.get(i));
      if (id == null) {
        if (countTokenUsage) {
          count = new Count();
          token2count.put(lastTokenID, count);
        }
        result[i] = lastTokenID;
        wordlist.put(tokens.get(i), lastTokenID);
        lastTokenID++;
        newTerm = true;
      } else {
        if (countTokenUsage)
          token2count.get(id).value++;
        result[i] = id;
      }
    }
    return result;
  }

  protected int[] tokens2TokenIDs(List<String> tokens, Map<String, Integer> wordlist) {
    int[] tokenIDs = new int[tokens.size()];
    Integer id;
    for (int i = 0; i < tokens.size(); i++) {
      id = wordlist.get(tokens.get(i));
      if (id == null)
        tokenIDs[i] = -1;
      else
        tokenIDs[i] = id;
    }
    return tokenIDs;
  }

  protected ReleasedTerm addTerm(TermStore term, int size, int cid, int termID) {
    ReleasedTerm releasedTerm = new ReleasedTerm();
    releasedTerm.length = (byte) size;
    releasedTerm.ordered = term.orderSensitive;
    releasedTerm.addConceptAndTermID(cid, termID);
    terms.add(releasedTerm);
    return releasedTerm;
  }

  protected void initializeIndex(String string) {
    resultTerms.clear();
    if (string != null)
      tokenizer.tokenize(string);
    removeStopwords();
  }

  protected static void removeSmallMatches(List<ResultTerm> resultTerms) {
    Map<Integer, List<ResultTerm>> word2term = new TreeMap<Integer, List<ResultTerm>>();
    List<ResultTerm> mappedterms;
    for (ResultTerm resultterm : resultTerms) {
      for (int word : resultterm.words) {
        mappedterms = word2term.get(word);
        if (mappedterms == null) {
          mappedterms = new ArrayList<ResultTerm>();
          mappedterms.add(resultterm);
          word2term.put(word, mappedterms);
        } else {
          for (ResultTerm otherterm : mappedterms) {
            if (otherterm.term != null) {
              if (otherterm.term.length < resultterm.term.length) { //Other term is shorter
                otherterm.term = null;
              } else if (otherterm.term.length > resultterm.term.length) { //This term is shorter
                resultterm.term = null;
                break;
              }
            }
          }
          if (resultterm.term == null)
            break;
          else
            mappedterms.add(resultterm);
        }
      }
    }
    for (int i = resultTerms.size() - 1; i >= 0; i--) {
      if (resultTerms.get(i).term == null) {
        resultTerms.remove(i);
      }
    }
  }
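  /* Illustration (not part of the original source) of removeSmallMatches() above: suppose
   * a text produced two overlapping results, one for the two-word term 'Alzheimer's disease'
   * (word positions 3 and 4) and one for the one-word term 'disease' (word position 4).
   * Both share word 4, so the shorter match is discarded and only the longer term remains. */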
  protected void addMatch(ReleasedTerm aterm) {
    ResultTerm resultterm = new ResultTerm();
    resultterm.words = new int[aterm.length];
    IndexTerm indexTerm = indexTerms.get(aterm.modified - current);
    for (int i = 0; i < aterm.length; i++) {
      resultterm.words[i] = indexTerm.checkedWordPos[i];
    }
    if (!aterm.ordered) { //Sort words in order:
      int temp;
      for (int i = 0; i < resultterm.words.length; i++)
        for (int j = i + 1; j < resultterm.words.length; j++)
          if (resultterm.words[i] > resultterm.words[j]) {
            temp = resultterm.words[i];
            resultterm.words[i] = resultterm.words[j];
            resultterm.words[j] = temp;
          }
    }
    resultterm.term = aterm;
    resultTerms.add(resultterm);
    indexTerm.clear();
  }

  protected void trimMemory() {
    if (pair2Termlinks instanceof TokenPairToTermLinksMap)
      ((TokenPairToTermLinksMap) pair2Termlinks).trimToSize();
    for (List<TermLink> termLinks : pair2Termlinks.values())
      ((ArrayList<TermLink>) termLinks).trimToSize();
    ((ArrayList<ReleasedTerm>) terms).trimToSize();
  }

  protected Map<String, Integer> words = new HashMap<String, Integer>();
  protected Map<String, Integer> normwords = new HashMap<String, Integer>();
  protected Map<String, Integer> lcwords = new HashMap<String, Integer>();
  protected List<ReleasedTerm> terms = new ArrayList<ReleasedTerm>();
  protected Map<Integer, ReleasedTerm> token2Term = new HashMap<Integer, ReleasedTerm>();
  protected Map<TokenPair, List<TermLink>> pair2Termlinks = new TokenPairToTermLinksMap();
  protected List<IndexTerm> indexTerms = new ArrayList<IndexTerm>();

  protected static class TokenPair implements Serializable, Comparable<TokenPair> {
    protected static final long serialVersionUID = -8370205486737997308L;

    int token1, token2;

    public TokenPair(int t1, int t2) {
      token1 = t1;
      token2 = t2;
    }

    public void setTokens(int t1, int t2) {
      token1 = t1;
      token2 = t2;
    }

    public int hashCode() {
      return token1 + token2;
    }

    public boolean equals(Object other) {
      TokenPair otherPair = (TokenPair) other;
      return (this.token1 == otherPair.token1) && (this.token2 == otherPair.token2);
    }

    public int compareTo(TokenPair otherPair) {
      int result = this.token1 - otherPair.token1;
      if (result == 0)
        return this.token2 - otherPair.token2;
      else
        return result;
    }
  }

  protected class Count {
    int value = 1;
  }

  protected int lastTokenID = 0;
  protected int current;
  protected String ontologyName = "";

  //Additions for disambiguator:
  protected Map<Integer, Count> token2count;
  protected List<int[]> tokenIDslist = new ArrayList<int[]>();

  protected static class TermLink implements Serializable {
    protected static final long serialVersionUID = -1147776745742497983L;

    public ReleasedTerm term;
    public int wordPos1 = 0;
    public int wordPos2 = 0;

    public TermLink(ReleasedTerm aterm, int wordPos1, int wordPos2) {
      term = aterm;
      this.wordPos1 = wordPos1;
      this.wordPos2 = wordPos2;
    }
  }

  public void setOntology(Ontology ontology) {
    super.setOntology(ontology);
  }
}
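
/* A minimal end-to-end usage sketch following the usage notes at the top of this file.
 * This class is an illustration, not part of the original tool: how the Ontology is
 * loaded depends on the rest of the suite, and the result fields (resultConcepts and
 * the conceptId/terms members of ResultConcept) are assumed to be accessible here
 * because they are used that way elsewhere in this package. */
class ConceptPeregrineUsageExample {
  static void findConcepts(Ontology ontology, String text) {
    ConceptPeregrine peregrine = new ConceptPeregrine();
    //(Optionally load the normaliser cache and stopwords first; see the usage notes above.)
    peregrine.setOntology(ontology); //1. Load an ontology
    peregrine.release();             //2. Release thesaurus: build the token(-pair) index
    peregrine.index(text);           //3. Index: match the ontology terms in the text
    for (ResultConcept concept : peregrine.resultConcepts) //4. Retrieve results
      System.out.println("Concept " + concept.conceptId + ": " + concept.terms.size() + " matching term(s)");
  }
}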