/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.peregrine;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;

import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.utilities.AbstractNormaliser;
import org.erasmusmc.utilities.StringUtilities;

/** Defines the elements that the different Peregrines have in common. */
public abstract class AbstractPeregrine {

  /** Set the ontology that is to be used for indexing. */
  public void setOntology(Ontology ontology){
    this.ontology = ontology;
  }

  /** @return The ontology currently set for indexing (may be null if none was set). */
  public Ontology getOntology(){
    return ontology;
  }

  /**
   * Defines the list of stopwords that will be used for indexation.
   * Should be specified before releasing the thesaurus.
   * Stopwords should be in lowercase.
   * Initialised with the result of {@link #getDefaultStopWordsForIndexing()}.
   */
  public Set<String> stopwords = getDefaultStopWordsForIndexing();

  /** Backing field for {@link #isDoNotNormaliseAbbreviations()}; initialised to false. */
  protected boolean doNotNormaliseAbbreviations = false;

  /**
   * Tokenizer used to tokenize both thesaurus terms and texts.
   * This class leaves the field null; the SubSentenceTokenizer named as the default
   * is presumably assigned by the concrete subclasses - TODO confirm.
   */
  public Tokenizer tokenizer = null;

  /**
   * Normalizer used during release and indexation.
   * Not initialised by this class, so it is null until someone assigns it.
   */
  public AbstractNormaliser normaliser;

  /** After indexation, this list will contain all concepts found in the text */
  public List<ResultConcept> resultConcepts = new ArrayList<ResultConcept>();

  /** After indexation, this list will contain all terms found in the text.
   * These terms can also be accessed through the resultConcepts. */
  public List<ResultTerm> resultTerms = new ArrayList<ResultTerm>();

  /** Call this method after setting the ontology, stopwords, and other parameters to prepare
   * Peregrine for indexation. */
  public abstract void release();

  /** Finds all concepts in the text. After indexation, the concepts are listed in resultConcepts,
   * the terms are listed in resultTerms, and the Tokenizer contains the tokens found in the text.
   *
   * @param string  The text to be indexed
   */
  public abstract void index(String string);

  /**
   * Fetches the default stopwords used for indexing (i.e. the stopwords used in Medline).
   * The words are read, one per line, from the class-path resource
   * {@code DefaultStopwordsForIndexing.txt} located next to this class.
   * An {@link IOException} while reading is printed and the words read so far are returned.
   *
   * @return The set of stopwords
   */
  public static Set<String> getDefaultStopWordsForIndexing(){
    Set<String> result = new TreeSet<String>();
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(AbstractPeregrine.class.getResourceAsStream("DefaultStopwordsForIndexing.txt")));
    try {
      // readLine() returning null is the only reliable end-of-stream signal; ready() merely
      // reports whether a read would block and could let a null line reach the TreeSet.
      String line;
      while ((line = bufferedReader.readLine()) != null){
        result.add(line);
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Always release the underlying resource stream.
      try {
        bufferedReader.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    return result;
  }

  /**
   * Removes every token from the tokenizer whose lowercase form is contained in
   * {@link #stopwords}. Tokens for which {@code StringUtilities.isAbbr} returns true
   * are never removed.
   */
  protected void removeStopwords() {
    String word;
    // Walk backwards so that removing a token does not shift the indices still to be visited.
    for (int i = tokenizer.tokens.size()-1; i > -1; i--){
      word = tokenizer.tokens.get(i);
      // Locale.ROOT keeps the comparison with the lowercase stopwords independent of the
      // JVM default locale.
      if (!StringUtilities.isAbbr(word) && stopwords.contains(word.toLowerCase(Locale.ROOT))) {
        tokenizer.removeToken(i);
      }
    }
  }

  /**
   * Normalises the tokens by delegating to {@link #normaliser}.
   *
   * @param tokens  The tokens to normalise
   * @return        The result of the normaliser
   */
  protected List<String> normalise(List<String> tokens){
    return normaliser.normalise(tokens);
  }

  /**
   * Creates a new list holding the lowercase form of every token, in the same order.
   * Lowercasing is locale-independent ({@code Locale.ROOT}).
   *
   * @param tokens  The tokens to convert
   * @return        A new list of lowercased tokens
   */
  protected List<String> toLowercase(List<String> tokens){
    List<String> result = new ArrayList<String>(tokens.size());
    for (String token : tokens) {
      result.add(token.toLowerCase(Locale.ROOT));
    }
    return result;
  }

  /**
   * Converts tokens to lowercase if only their first letter is a capital,
   * by applying {@code StringUtilities.firstLetterToLowerCase} to every token.
   *
   * @param tokens  The tokens to convert
   * @return        A new list with the converted tokens, in the same order
   */
  protected List<String> casesentiveCaseNorm(List<String> tokens) {
    List<String> result = new ArrayList<String>(tokens.size());
    for (String token : tokens) {
      result.add(StringUtilities.firstLetterToLowerCase(token));
    }
    return result;
  }

  /** The ontology used for indexing; see {@link #setOntology(Ontology)}. */
  protected Ontology ontology;

  /**
   * If True, abbreviations (tokens with a majority of uppercase letters) will not be normalised,
   * even if the term as a whole is set to be matched normalised.
   * <br><br>This class initialises the value to False.
   */
  public boolean isDoNotNormaliseAbbreviations() {
    return doNotNormaliseAbbreviations;
  }

  /**
   * If True, abbreviations (tokens with a majority of uppercase letters) will not be normalised,
   * even if the term as a whole is set to be matched normalised.
   * <br><br>This class initialises the value to False.
   * <br><br>The value is also copied to the current {@link #normaliser}, if one has been assigned.
   */
  public void setDoNotNormaliseAbbreviations(boolean doNotNormaliseAbbreviations) {
    this.doNotNormaliseAbbreviations = doNotNormaliseAbbreviations;
    // The normaliser is not created by this class, so it may still be null at this point.
    if (normaliser != null) {
      normaliser.doNotNormaliseAbbreviations = doNotNormaliseAbbreviations;
    }
  }
}