AbstractPeregrine.java example

Explorer
GeneDiseasePaper-master
- java
/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.peregrine;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.utilities.AbstractNormaliser;
import org.erasmusmc.utilities.StringUtilities;

/** Defines the elements that the different Peregrines have in common. */
public abstract class AbstractPeregrine {
  
  /** Set the ontology that is to be used for indexing. */  
  public void setOntology(Ontology ontology){
    this.ontology = ontology;
  }
    
  public Ontology getOntology(){
    return ontology;
  }
  

  /**
   * Defines the list of stopwords that will be used for indexation.
   * Should be specified before releasing the thesaurus.
   * Stopwords should be in lowercase.
   */
  public Set<String> stopwords = getDefaultStopWordsForIndexing();
  
    
  protected boolean doNotNormaliseAbbreviations = false;
  
  /** Tokenizer used to tokenize both thesaurus terms and texts. By default, the SubSentenceTokenizer is used. */
  public Tokenizer tokenizer = null;
  
  /** Normalizer used during release and indexation. */
  public AbstractNormaliser normaliser;
  
  /** After indexation, this list will contain all concepts found in the text */
  public List<ResultConcept> resultConcepts = new ArrayList<ResultConcept>();
  
  /** After indexation, this list will contain all terms found in the text. 
   * These terms can also be accessed through the resultConcepts. */
  public List<ResultTerm> resultTerms = new ArrayList<ResultTerm>();  

  
  /** Call this method after setting the ontology, stopwords, and other parameters to prepare 
   * Peregrine for indexation. */
  public abstract void release();
  
  /** Finds all concepts in the text. After indexation, the concepts are listed in resultConcepts, 
   * the terms are listed in resultTerms, and the Tokenizer contains the tokens found in the text.
   * 
   * @param string  The text to be indexed
   */
  public abstract void index(String string);
  
  /**
   * Fetches the default stopword used for indexing (i.e. the stopwords used in Medline)
   * @return    The set of stopwords
   */public static Set<String> getDefaultStopWordsForIndexing(){
    Set<String> result = new TreeSet<String>();
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(AbstractPeregrine.class.getResourceAsStream("DefaultStopwordsForIndexing.txt")));
    try {
      while (bufferedReader.ready()){
        result.add(bufferedReader.readLine());
      }
    } catch (IOException e) {
       e.printStackTrace();
    }
    return result;
  }
  protected void removeStopwords() {
    String word;
    for (int i = tokenizer.tokens.size()-1; i > -1; i--){
      word = tokenizer.tokens.get(i); 
      if (!StringUtilities.isAbbr(word) && stopwords.contains(word.toLowerCase())) {
        tokenizer.removeToken(i);
      }
    }
  }  
  
  protected List<String> normalise(List<String> tokens){
    return normaliser.normalise(tokens);
  }
  
  protected List<String> toLowercase(List<String> tokens){
    List<String> result = new ArrayList<String>(tokens.size());
    for (int i = 0; i < tokens.size(); i++)
    	result.add(i, tokens.get(i).toLowerCase());
        
    return result;
  }  
  
  /**
   * Converts tokens to lowercase if only their first letter is a capital
   * @param tokens
   * @return
   */
  protected List<String> casesentiveCaseNorm(List<String> tokens) {
    List<String> result = new ArrayList<String>(tokens.size());
    for (int i = 0; i < tokens.size(); i++){
      result.add(i, StringUtilities.firstLetterToLowerCase(tokens.get(i)));
    }
    return result;
  }
  
   
  
  protected Ontology ontology;

  /**
   * If True, abbreviations (tokens with a majority of uppercase letters) will not be normalised, 
   * even if the term as a whole is set to be matched normalised.
   * <br><br>The default value is True.
   */
  public boolean isDoNotNormaliseAbbreviations() {
    return doNotNormaliseAbbreviations;
  }

  /**
   * If True, abbreviations (tokens with a majority of uppercase letters) will not be normalised, 
   * even if the term as a whole is set to be matched normalised.
   * <br><br>The default value is True.
   */
  public void setDoNotNormaliseAbbreviations(boolean doNotNormaliseAbbreviations) {
    this.doNotNormaliseAbbreviations = doNotNormaliseAbbreviations;
    normaliser.doNotNormaliseAbbreviations = doNotNormaliseAbbreviations;
  }
}