/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.peregrine;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Map.Entry;

import org.erasmusmc.peregrine.ComplexTerms.TokenInfo;
import org.erasmusmc.peregrine.ConceptPeregrine.TermLink;

/**
 * Associates concepts with "keyword" tokens taken from the terms of a
 * {@link ConceptPeregrine}, and checks whether such a keyword occurs in the
 * text most recently tokenized by that peregrine outside of the terms that
 * were already found for the concept.
 */
public class ConceptKeywords {

  /** Default value of {@link #maxKeywordUsage}. */
  public static final int DEFAULT_MAX_KEYWORD_USAGE = 1000;

  /**
   * Words that appear fewer than this number of times in the thesaurus are
   * considered keywords. <br>
   * <br>
   * The default value is 1000. <br>
   * <br>
   * The keyword tables are built inside the constructor, so assigning this
   * field after construction has no effect. Use
   * {@link #ConceptKeywords(ConceptPeregrine, int, int, int)} to build with a
   * different threshold.
   */
  public int maxKeywordUsage = DEFAULT_MAX_KEYWORD_USAGE;

  /**
   * Remove keywords that occur for more than one meaning of a term. <br>
   * <br>
   * The default value is true. The flag is read once, at the end of
   * construction.
   */
  public static boolean removeNonDistinguishing = true;

  private final int minConceptID;
  private final int maxConceptID;
  private final ConceptPeregrine peregrine;

  /** Maps a concept ID to the token IDs that act as keywords for it. */
  private final Map<Integer, TreeSet<Integer>> concept2keywords;

  /**
   * Builds the keyword tables using the default keyword usage threshold
   * ({@value #DEFAULT_MAX_KEYWORD_USAGE}). The disambiguator should be
   * initialised using a released ontology before disambiguation.
   *
   * @param peregrine
   *          Specifies the ConceptPeregrine that should be used for
   *          initialisation.
   * @param minConceptID
   *          Exclusive lower bound: only concepts with an ID strictly greater
   *          than this value receive keywords.
   * @param maxConceptID
   *          Exclusive upper bound: only concepts with an ID strictly smaller
   *          than this value receive keywords.
   */
  public ConceptKeywords(ConceptPeregrine peregrine, int minConceptID, int maxConceptID) {
    this(peregrine, minConceptID, maxConceptID, DEFAULT_MAX_KEYWORD_USAGE);
  }

  /**
   * Builds the keyword tables with an explicit keyword usage threshold. The
   * disambiguator should be initialised using a released ontology before
   * disambiguation.
   *
   * @param peregrine
   *          Specifies the ConceptPeregrine that should be used for
   *          initialisation.
   * @param minConceptID
   *          Exclusive lower bound: only concepts with an ID strictly greater
   *          than this value receive keywords.
   * @param maxConceptID
   *          Exclusive upper bound: only concepts with an ID strictly smaller
   *          than this value receive keywords.
   * @param maxKeywordUsage
   *          Tokens whose count in <code>peregrine.token2count</code> is
   *          strictly below this value are keyword candidates.
   */
  public ConceptKeywords(ConceptPeregrine peregrine, int minConceptID, int maxConceptID, int maxKeywordUsage) {
    this.minConceptID = minConceptID;
    this.maxConceptID = maxConceptID;
    this.peregrine = peregrine;
    this.maxKeywordUsage = maxKeywordUsage;

    Map<Integer, TokenInfo> token2info = new HashMap<Integer, TokenInfo>();
    ComplexTerms.analyseTokens(peregrine.words, token2info);
    ComplexTerms.analyseTokens(peregrine.lcwords, token2info);
    ComplexTerms.analyseTokens(peregrine.normwords, token2info);

    // Build set of keywords, filtering out very common words:
    Set<Integer> keywords = new TreeSet<Integer>();
    for (Entry<Integer, ConceptPeregrine.Count> entry : peregrine.token2count.entrySet()) {
      if (entry.getValue().value < maxKeywordUsage) {
        keywords.add(entry.getKey());
      }
    }

    // Remove non-complex keywords:
    removeNonComplexKeywords(keywords, token2info);

    // Add keywords to concepts:
    concept2keywords = new TreeMap<Integer, TreeSet<Integer>>();
    for (Entry<ConceptPeregrine.TokenPair, List<ConceptPeregrine.TermLink>> entry : peregrine.pair2Termlinks.entrySet()) {
      ConceptPeregrine.TokenPair pair = entry.getKey();
      if (keywords.contains(pair.token1)) {
        addKeywordToTerm(pair.token1, entry.getValue());
      }
      if (keywords.contains(pair.token2)) {
        addKeywordToTerm(pair.token2, entry.getValue());
      }
    }

    if (removeNonDistinguishing) {
      removeNondistinguishingKeywords();
    }
  }

  /**
   * For every term, removes from each of its concepts the keywords that are
   * shared by two or more concepts of that same term.
   */
  private void removeNondistinguishingKeywords() {
    Set<Integer> uniqueKeywords = new TreeSet<Integer>();
    Set<Integer> duplicateKeywords = new TreeSet<Integer>();
    for (ReleasedTerm term : peregrine.terms) {
      uniqueKeywords.clear();
      duplicateKeywords.clear();

      // First pass: a keyword seen for a second time within this term is a duplicate.
      for (int conceptID : term.conceptId) {
        Set<Integer> keywords = concept2keywords.get(conceptID);
        if (keywords != null) {
          for (int keyword : keywords) {
            if (!uniqueKeywords.add(keyword)) {
              duplicateKeywords.add(keyword);
            }
          }
        }
      }

      // Second pass: strip the duplicates from every concept of this term.
      for (int conceptID : term.conceptId) {
        Set<Integer> keywords = concept2keywords.get(conceptID);
        if (keywords != null) {
          keywords.removeAll(duplicateKeywords);
        }
      }
    }
  }

  /**
   * Removes from <code>keywords</code> every token for which
   * {@link ComplexTerms#isComplex} returns false.
   *
   * @param keywords
   *          Candidate keyword token IDs; modified in place.
   * @param token2info
   *          Token information as filled by {@link ComplexTerms#analyseTokens}.
   */
  private void removeNonComplexKeywords(Set<Integer> keywords, Map<Integer, TokenInfo> token2info) {
    // Removal goes through the iterator so the set can be modified while iterating.
    Iterator<Integer> keywordIterator = keywords.iterator();
    while (keywordIterator.hasNext()) {
      Integer keyword = keywordIterator.next();
      TokenInfo tokenInfo = token2info.get(keyword);
      if (!ComplexTerms.isComplex(tokenInfo)) {
        keywordIterator.remove();
      }
    }
  }

  /**
   * Registers <code>keyword</code> for every concept of the given term links
   * whose ID lies strictly between <code>minConceptID</code> and
   * <code>maxConceptID</code>, unless the keyword on its own is already a term
   * of that concept.
   */
  private void addKeywordToTerm(Integer keyword, List<TermLink> termlinks) {
    for (TermLink termlink : termlinks) {
      for (Integer conceptID : termlink.term.conceptId) {
        boolean inRange = conceptID > minConceptID && conceptID < maxConceptID;
        if (!inRange || isSingleWordTerm(keyword, conceptID)) {
          continue;
        }
        TreeSet<Integer> keywords = concept2keywords.get(conceptID);
        if (keywords == null) {
          keywords = new TreeSet<Integer>();
          concept2keywords.put(conceptID, keywords);
        }
        keywords.add(keyword);
      }
    }
  }

  /**
   * Tells whether <code>peregrine.token2Term</code> maps the keyword to a term
   * that belongs to the given concept.
   */
  private boolean isSingleWordTerm(Integer keyword, Integer conceptID) {
    ReleasedTerm term = peregrine.token2Term.get(keyword);
    if (term == null) {
      return false;
    }
    for (int id : term.conceptId) {
      // int vs. Integer: the Integer is unboxed, so this compares values.
      if (id == conceptID) {
        return true;
      }
    }
    return false;
  }

  /**
   * Checks whether a keyword of the concept occurs in the peregrine's current
   * token lists at a position that is not part of a term already found for
   * the concept.
   *
   * @param concept
   *          The found concept to check.
   * @return true if such a keyword occurs; false if not, or if the concept
   *         has no keywords.
   */
  public boolean hasKeyword(ResultConcept concept) {
    return findKeywordTokenId(concept) != null;
  }

  /**
   * Finds the first keyword of the concept that occurs in the peregrine's
   * current token lists and is not part of a term already found for the
   * concept.
   *
   * @param concept
   *          The found concept to check.
   * @return The keyword's string form, or null if no keyword occurs, the
   *         concept has no keywords, or the token ID cannot be mapped back to
   *         a string.
   */
  public String findKeyword(ResultConcept concept) {
    Integer tokenID = findKeywordTokenId(concept);
    if (tokenID == null) {
      return null;
    }
    return getToken(tokenID);
  }

  /**
   * Shared lookup behind {@link #hasKeyword} and {@link #findKeyword}.
   *
   * @return The token ID of the first matching keyword, or null when there is
   *         none (including when the concept has no keywords at all).
   */
  private Integer findKeywordTokenId(ResultConcept concept) {
    TreeSet<Integer> keywords = concept2keywords.get(concept.conceptId);
    if (keywords == null) {
      return null; // No keywords for this concept
    }

    // Determine words that are already part of found terms:
    Set<Integer> ignoreTokens = new TreeSet<Integer>();
    for (ResultTerm term : concept.terms) {
      for (int word : term.words) {
        for (int[] tokenIDs : peregrine.tokenIDslist) {
          ignoreTokens.add(tokenIDs[word]);
        }
      }
    }

    for (int[] tokenIDs : peregrine.tokenIDslist) {
      for (int i = 0; i < tokenIDs.length; i++) {
        int tokenID = tokenIDs[i];
        if (keywords.contains(tokenID) && !ignoreTokens.contains(tokenID)) {
          return tokenID;
        }
      }
    }
    return null;
  }

  /**
   * Maps a token ID back to its string by scanning, in this order, the
   * peregrine's <code>normwords</code>, <code>words</code> and
   * <code>lcwords</code> tables.
   *
   * @return The first string found for the ID, or null if none of the tables
   *         contains it.
   */
  private String getToken(int tokenID) {
    for (Entry<String, Integer> entry : peregrine.normwords.entrySet()) {
      if (entry.getValue() == tokenID) {
        return entry.getKey();
      }
    }
    for (Entry<String, Integer> entry : peregrine.words.entrySet()) {
      if (entry.getValue() == tokenID) {
        return entry.getKey();
      }
    }
    for (Entry<String, Integer> entry : peregrine.lcwords.entrySet()) {
      if (entry.getValue() == tokenID) {
        return entry.getKey();
      }
    }
    return null;
  }
}