/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 * Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package org.erasmusmc.peregrine;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.erasmusmc.collections.IntList;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.textMining.LVG.LVGNormaliser;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Peregrine for looking up terms in the ontology. It only returns a match when the whole input
 * text matches a term in the ontology. Saves memory compared to a normal ConceptPeregrine.
 *
 * @author martijn
 */
public class LookupPeregrine extends AbstractPeregrine {

  /**
   * Specifies whether the input text should also be normalised before matching. If at least one
   * term in the ontology has the normalisation flag set, this is automatically turned on during
   * {@link #release()}.
   * <br><br>The default value is false.
   */
  public boolean normalize = false;

  /**
   * If true, the entire ontology structure is destroyed during release, thus saving memory.
   * <br><br>The default value is false.
   */
  public boolean destroyOntologyDuringRelease = false;
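
  // Illustrative usage sketch (not part of the original source). It assumes the 'ontology'
  // field and the result collections ('resultConcepts', 'resultTerms') are accessible and
  // inherited from AbstractPeregrine, as the code below suggests; the exact API may differ.
  //
  //   LookupPeregrine peregrine = new LookupPeregrine("/path/to/lvg.properties");
  //   peregrine.ontology = myOntologyStore;      // assumed accessible field
  //   peregrine.release();                       // builds the lookup maps from the ontology
  //   peregrine.index("myocardial infarction");  // matches only if the whole string is a term
  //   for (ResultConcept concept : peregrine.resultConcepts)
  //     System.out.println(concept.conceptId);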

  public LookupPeregrine() {
    normaliser = new LVGNormaliser();
    tokenizer = new SimpleTokenizer();
  }

  public LookupPeregrine(String lvgPropertiesPath) {
    if (lvgPropertiesPath != null)
      normaliser = new LVGNormaliser(lvgPropertiesPath);
    tokenizer = new SubSentenceTokenizer();
  }

  @Override
  public void index(String string) {
    initializeIndex(string);
    checkString(casesentiveCaseNorm(tokenizer.tokens), words);
    if (normalize)
      checkString(normalise(tokenizer.tokens), normwords);
    checkString(toLowercase(tokenizer.tokens), lcwords);
  }

  /** Looks up the joined token string in the given word list and records any matching concepts. */
  private void checkString(List<String> tokens, Map<String, IntList> wordList) {
    String neatString = StringUtilities.join(tokens, " ");
    IntList conceptIDs = wordList.get(neatString);
    if (conceptIDs != null)
      for (int conceptID : conceptIDs) {
        // The whole input matched, so the result term covers all tokens:
        ResultTerm resultTerm = new ResultTerm();
        resultTerm.words = new int[tokens.size()];
        for (int i = 0; i < tokens.size(); i++)
          resultTerm.words[i] = i;
        resultTerms.add(resultTerm);

        // Reuse the existing result concept if this concept was already matched:
        ResultConcept resultConcept = null;
        for (ResultConcept concept : resultConcepts)
          if (concept.conceptId == conceptID) {
            resultConcept = concept;
            break;
          }
        if (resultConcept == null) {
          resultConcept = new ResultConcept();
          resultConcept.conceptId = conceptID;
          resultConcepts.add(resultConcept);
        }
        resultConcept.terms.add(resultTerm);
      }
  }

  @Override
  public void release() {
    // Destroying the ontology requires removing concepts through the iterator,
    // which is only supported for an OntologyStore:
    if (destroyOntologyDuringRelease && !(ontology instanceof OntologyStore))
      destroyOntologyDuringRelease = false;

    words.clear();
    normwords.clear();
    lcwords.clear();
    TermStore term;
    List<String> tokens;
    Map<String, IntList> wordlist;
    Iterator<Concept> values = ontology.getConceptIterator();
    while (values.hasNext()) {
      Concept concept = values.next();
      List<TermStore> terms = concept.getTerms();
      for (int j = 0; j < terms.size(); j++) {
        term = terms.get(j);
        initializeIndex(term.text); // Implies tokenization and stopword removal
        if (term.normalised) {
          tokens = normalise(tokenizer.tokens);
          wordlist = normwords;
          normalize = true; // At least one normalised term: turn normalisation on
        } else if (term.caseSensitive) {
          tokens = casesentiveCaseNorm(tokenizer.tokens);
          wordlist = words;
        } else {
          tokens = toLowercase(tokenizer.tokens);
          wordlist = lcwords;
        }
        String neatTerm = StringUtilities.join(tokens, " ");
        IntList conceptIDs = wordlist.get(neatTerm);
        if (conceptIDs == null) {
          conceptIDs = new IntList(1);
          wordlist.put(neatTerm, conceptIDs);
        }
        // Avoid adding the same concept twice in a row when several of its terms
        // reduce to the same string:
        if (conceptIDs.size() == 0 || !conceptIDs.get(conceptIDs.size() - 1).equals(concept.getID()))
          conceptIDs.add(concept.getID());
      }
      if (destroyOntologyDuringRelease)
        values.remove();
    }
    if (destroyOntologyDuringRelease)
      ontology = null;
  }

  protected void initializeIndex(String string) {
    resultTerms.clear();
    resultConcepts.clear();
    if (string != null)
      tokenizer.tokenize(string);
    removeStopwords();
  }

  protected Map<String, IntList> words = new HashMap<String, IntList>();
  protected Map<String, IntList> normwords = new HashMap<String, IntList>();
  protected Map<String, IntList> lcwords = new HashMap<String, IntList>();
}