/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 * Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package org.erasmusmc.peregrine;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.erasmusmc.collections.IntList;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.textMining.LVG.LVGNormaliser;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Peregrine for looking up terms in the ontology. It only returns a match when the whole input
 * text matches a term in the ontology. Saves memory compared to a normal ConceptPeregrine.
 *
 * @author martijn
 */
public class LookupPeregrine extends AbstractPeregrine {

  /**
   * Specifies whether the input text should also be normalised before matching. If at least one
   * term in the ontology has the normalisation flag set, this is automatically turned on during
   * {@link #release()}.
   * <br><br>The default value is false.
   */
  public boolean normalize = false;

  /**
   * If true, the entire ontology structure is destroyed during release, thus saving memory.
   * <br><br>The default value is false.
   */
  public boolean destroyOntologyDuringRelease = false;
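
  // Illustrative usage sketch (not part of the original source). It assumes the 'ontology'
  // field and the result collections ('resultConcepts', 'resultTerms') are accessible and
  // inherited from AbstractPeregrine, as the code below suggests; the exact API may differ.
  //
  //   LookupPeregrine peregrine = new LookupPeregrine("/path/to/lvg.properties");
  //   peregrine.ontology = myOntologyStore;      // assumed accessible field
  //   peregrine.release();                       // builds the lookup maps from the ontology
  //   peregrine.index("myocardial infarction");  // matches only if the whole string is a term
  //   for (ResultConcept concept : peregrine.resultConcepts)
  //     System.out.println(concept.conceptId);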

  public LookupPeregrine() {
    normaliser = new LVGNormaliser();
    tokenizer = new SimpleTokenizer();
  }

  public LookupPeregrine(String lvgPropertiesPath) {
    if (lvgPropertiesPath != null)
      normaliser = new LVGNormaliser(lvgPropertiesPath);
    tokenizer = new SubSentenceTokenizer();
  }

  @Override
  public void index(String string) {
    initializeIndex(string);
    checkString(casesentiveCaseNorm(tokenizer.tokens), words);
    if (normalize)
      checkString(normalise(tokenizer.tokens), normwords);
    checkString(toLowercase(tokenizer.tokens), lcwords);
  }

  /** Looks up the joined token string in the given word list and records any matching concepts. */
  private void checkString(List<String> tokens, Map<String, IntList> wordList) {
    String neatString = StringUtilities.join(tokens, " ");
    IntList conceptIDs = wordList.get(neatString);
    if (conceptIDs != null)
      for (int conceptID : conceptIDs) {
        // The whole input matched, so the result term covers all tokens:
        ResultTerm resultTerm = new ResultTerm();
        resultTerm.words = new int[tokens.size()];
        for (int i = 0; i < tokens.size(); i++)
          resultTerm.words[i] = i;
        resultTerms.add(resultTerm);

        // Reuse the existing result concept if this concept was already matched:
        ResultConcept resultConcept = null;
        for (ResultConcept concept : resultConcepts)
          if (concept.conceptId == conceptID) {
            resultConcept = concept;
            break;
          }
        if (resultConcept == null) {
          resultConcept = new ResultConcept();
          resultConcept.conceptId = conceptID;
          resultConcepts.add(resultConcept);
        }
        resultConcept.terms.add(resultTerm);
      }
  }

  @Override
  public void release() {
    // Destroying the ontology requires removing concepts through the iterator,
    // which is only supported for an OntologyStore:
    if (destroyOntologyDuringRelease && !(ontology instanceof OntologyStore))
      destroyOntologyDuringRelease = false;

    words.clear();
    normwords.clear();
    lcwords.clear();
    TermStore term;
    List<String> tokens;
    Map<String, IntList> wordlist;
    Iterator<Concept> values = ontology.getConceptIterator();
    while (values.hasNext()) {
      Concept concept = values.next();
      List<TermStore> terms = concept.getTerms();
      for (int j = 0; j < terms.size(); j++) {
        term = terms.get(j);
        initializeIndex(term.text); // Implies tokenization and stopword removal
        if (term.normalised) {
          tokens = normalise(tokenizer.tokens);
          wordlist = normwords;
          normalize = true; // At least one normalised term: turn normalisation on
        } else if (term.caseSensitive) {
          tokens = casesentiveCaseNorm(tokenizer.tokens);
          wordlist = words;
        } else {
          tokens = toLowercase(tokenizer.tokens);
          wordlist = lcwords;
        }
        String neatTerm = StringUtilities.join(tokens, " ");
        IntList conceptIDs = wordlist.get(neatTerm);
        if (conceptIDs == null) {
          conceptIDs = new IntList(1);
          wordlist.put(neatTerm, conceptIDs);
        }
        // Avoid adding the same concept twice in a row when several of its terms
        // reduce to the same string:
        if (conceptIDs.size() == 0 || !conceptIDs.get(conceptIDs.size() - 1).equals(concept.getID()))
          conceptIDs.add(concept.getID());
      }
      if (destroyOntologyDuringRelease)
        values.remove();
    }
    if (destroyOntologyDuringRelease)
      ontology = null;
  }

  protected void initializeIndex(String string) {
    resultTerms.clear();
    resultConcepts.clear();
    if (string != null)
      tokenizer.tokenize(string);
    removeStopwords();
  }

  protected Map<String, IntList> words = new HashMap<String, IntList>();
  protected Map<String, IntList> normwords = new HashMap<String, IntList>();
  protected Map<String, IntList> lcwords = new HashMap<String, IntList>();
}