/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.peregrine;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import org.erasmusmc.utilities.StringUtilities;
/**
 * Determines which terms of a {@link ConceptPeregrine} ontology are "complex".
 * <br>
 * <br>
 * On construction, the letter count, number count and length of every known
 * token are collected, aggregated per term, and every term whose aggregated
 * figures satisfy {@link #isComplex(TokenInfo)} is remembered. Only terms that
 * have at least one concept ID strictly between the given minimum and maximum
 * are considered. Afterwards {@link #isComplex(ReleasedTerm)} answers
 * membership queries against that precomputed set.
 */
public class ComplexTerms {
    /**
     * Terms are considered complex if their (combined token) length is equal to
     * or longer than this size, or if they contain both numbers and letters and
     * are longer than 2 characters.<br>
     * <br>
     * The default value is 6.
     */
    public static int minTermLength = 6;

    /** Exclusive lower bound for concept IDs of terms that are analysed. */
    private final int minConceptID;

    /** Exclusive upper bound for concept IDs of terms that are analysed. */
    private final int maxConceptID;

    /** Terms that were found to be complex during construction. */
    private final Set<ReleasedTerm> complexTerms;

    /**
     * Analyses all tokens and terms of the given Peregrine instance and records
     * which terms are complex.
     *
     * @param peregrine
     *            source of the token maps ({@code words}, {@code lcwords},
     *            {@code normwords}) and of the token-to-term structures
     *            ({@code token2Term}, {@code pair2Termlinks})
     * @param minConceptID
     *            only terms with a concept ID greater than this value are analysed
     * @param maxConceptID
     *            only terms with a concept ID smaller than this value are analysed
     */
    public ComplexTerms(ConceptPeregrine peregrine, int minConceptID, int maxConceptID) {
        this.minConceptID = minConceptID;
        this.maxConceptID = maxConceptID;

        // Per-token statistics, keyed by token ID.
        Map<Integer, TokenInfo> token2info = new HashMap<Integer, TokenInfo>();
        analyseTokens(peregrine.words, token2info);
        analyseTokens(peregrine.lcwords, token2info);
        analyseTokens(peregrine.normwords, token2info);

        // Analyse terms:
        Map<ReleasedTerm, TokenInfo> term2tempInfo = new HashMap<ReleasedTerm, TokenInfo>();

        // Single-token terms. Each term gets its OWN accumulator: storing the
        // instance from token2info directly would let the combine() calls below
        // modify the shared per-token statistics and thereby corrupt the figures
        // of every other term that contains the same token.
        for (Map.Entry<Integer, ReleasedTerm> entry : peregrine.token2Term.entrySet()) {
            ReleasedTerm term = entry.getValue();
            if (hasValidConceptID(term)) {
                TokenInfo termInfo = new TokenInfo();
                termInfo.combine(token2info.get(entry.getKey()));
                term2tempInfo.put(term, termInfo);
            }
        }

        // Multi-token terms, reachable via token pairs.
        for (Entry<ConceptPeregrine.TokenPair, List<ConceptPeregrine.TermLink>> entry : peregrine.pair2Termlinks.entrySet()) {
            ConceptPeregrine.TokenPair pair = entry.getKey();
            for (ConceptPeregrine.TermLink termLink : entry.getValue()) {
                if (!hasValidConceptID(termLink.term)) {
                    continue;
                }
                TokenInfo termInfo = term2tempInfo.get(termLink.term);
                if (termInfo == null) {
                    termInfo = new TokenInfo();
                    term2tempInfo.put(termLink.term, termInfo);
                }
                // Beware: for terms with more than 2 tokens, or with order
                // insensitivity, length will be overestimated!
                termInfo.combine(token2info.get(pair.token1));
                termInfo.combine(token2info.get(pair.token2));
            }
        }

        complexTerms = new HashSet<ReleasedTerm>();
        for (Map.Entry<ReleasedTerm, TokenInfo> entry : term2tempInfo.entrySet()) {
            if (isComplex(entry.getValue())) {
                complexTerms.add(entry.getKey());
            }
        }
    }

    /**
     * Computes the statistics of every token in {@code words} and stores them
     * in {@code token2info} under the token's ID. An existing entry for the
     * same ID is replaced.
     *
     * @param words
     *            maps token text to token ID
     * @param token2info
     *            receives one {@link TokenInfo} per token ID
     */
    protected static void analyseTokens(Map<String, Integer> words, Map<Integer, TokenInfo> token2info) {
        for (Map.Entry<String, Integer> entry : words.entrySet()) {
            String token = entry.getKey();
            TokenInfo tokenInfo = new TokenInfo();
            tokenInfo.letters = StringUtilities.countLetters(token);
            tokenInfo.numbers = StringUtilities.countNumbers(token);
            tokenInfo.length = token.length();
            token2info.put(entry.getValue(), tokenInfo);
        }
    }

    /**
     * Checks whether the term has at least one concept ID strictly between
     * {@code minConceptID} and {@code maxConceptID}.
     */
    private boolean hasValidConceptID(ReleasedTerm term) {
        for (Integer conceptID : term.conceptId) {
            if (conceptID > minConceptID && conceptID < maxConceptID) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decides whether the given statistics describe a complex term: either the
     * length is at least {@link #minTermLength}, or the term contains both
     * numbers and letters and is longer than 2 characters.
     *
     * @param tokenInfo
     *            aggregated statistics of a term; {@code null} is treated as
     *            "not complex"
     * @return {@code true} if the statistics qualify as complex
     */
    protected static boolean isComplex(TokenInfo tokenInfo) {
        if (tokenInfo == null) {
            return false;
        }
        return tokenInfo.length >= minTermLength
                || (tokenInfo.numbers > 0 && tokenInfo.letters > 0 && tokenInfo.length > 2);
    }

    /**
     * Letter count, number count and length of a token, or the sums of these
     * figures over several tokens.
     */
    protected static class TokenInfo {
        int numbers = 0;
        int letters = 0;
        int length = 0;

        /**
         * Adds the figures of {@code tokenInfo} to this instance.
         *
         * @param tokenInfo
         *            figures to add; {@code null} (e.g. a token ID without
         *            known statistics) contributes nothing
         */
        public void combine(TokenInfo tokenInfo) {
            if (tokenInfo == null) {
                return;
            }
            numbers += tokenInfo.numbers;
            letters += tokenInfo.letters;
            length += tokenInfo.length;
        }
    }

    /**
     * Tells whether the given term was classified as complex when this object
     * was constructed.
     *
     * @param term
     *            the term to look up
     * @return {@code true} if the term is in the set of complex terms
     */
    public boolean isComplex(ReleasedTerm term) {
        return complexTerms.contains(term);
    }
}