/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 * Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package org.erasmusmc.peregrine;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.erasmusmc.math.CRC32;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.textMining.LVG.LVGNormaliser;
import org.erasmusmc.utilities.AbstractNormaliser;
//import org.erasmusmc.utilities.LVGNormaliser;
import org.erasmusmc.utilities.StringUtilities;

/**
 * Identifies words in the text.
 * Each word is given a unique identifier, using its CRC32 hash as code.
 */
public class WordPeregrine extends AbstractPeregrine {

  /** If true, words are first normalised before the CRC32 is calculated.
   * <br><br>The default value is true. */
  public boolean normaliseWords = true;

  /** If true, words are first converted to lowercase before the CRC32 is calculated.
   * <br><br>The default value is true. */
  public boolean lowercaseWords = true;

  /** If true, an ontology is constructed during indexation.
   * This ontology can be used to translate the CRC32 codes back to words.
   * <br><br>The default value is false. */
  public boolean buildOntology = false;

  /** If true, Peregrine will keep an internal list of created conceptIDs and will not check the ontology
   * to see whether a concept already exists. This will speed up the process.
   * <br><br>The default value is false. */
  public boolean assumeEmptyOntology = false;

  /** If true, Peregrine will always insert the concept into the ontology, without checking whether it already exists.
   * <br><br>The default value is false. */
  public boolean alwaysInsertIntoOntology = false;

  /** Defines which n-grams will be returned by Peregrine.
   * <br><br>The default value is {1}, meaning that only single words are detected. */
  public int[] ngrams = {1};

  /** For n-grams with n greater than 1, breakChars defines the characters that an n-gram cannot cross.
   * This serves to limit the number of different (spurious) n-grams.
   * <br><br>The default value is [,;():'"]. */
  public Set<Character> breakChars = new TreeSet<Character>();

  public WordPeregrine() {
    normaliser = new LVGNormaliser();
    tokenizer = new SBDtokenizer();
    breakChars.add(',');
    breakChars.add(';');
    breakChars.add('(');
    breakChars.add(')');
    breakChars.add(':');
    breakChars.add('\'');
    breakChars.add('"');
  }

  /** Constructs a WordPeregrine with a caller-supplied normaliser.
   * Note: unlike the default constructor, this constructor does not initialise the tokenizer or the breakChars set. */
  public WordPeregrine(AbstractNormaliser normaliser) {
    this.normaliser = normaliser;
  }

  public void release() {
    // Nothing to release: this Peregrine holds no external resources.
  }

  public void index(String string) {
    initializeIndex(string);
    resultConcepts.clear();
    id2concept = new TreeMap<Integer, ResultConcept>();
    id2string = new TreeMap<Integer, String>();
    int start = 0; // first token at which the current n-gram may start
    for (int i = 0; i < tokenizer.tokens.size(); i++) {
      String token = tokenizer.tokens.get(i);
      if (!token.equals("")) {
        // n-grams may not cross break characters or sentence boundaries:
        if (precededByBreakChar(i) || tokenizer.endOfSentence.contains(i))
          start = i;
        //if ((!StringUtilities.isAbbr(token) || token.length() == 1) && stopwords.contains(token.toLowerCase())) {
        if (stopwords.contains(StringUtilities.firstLetterToLowerCase(token))) {
          start = i + 1; // stopwords are skipped and also act as n-gram boundaries
        } else {
          for (int n : ngrams) {
            int firstWord = i - n + 1;
            if (firstWord >= start)
              addTerm(firstWord, i);
          }
        }
      }
    }
    if (buildOntology)
      addNewWordsToOntology();
  }

  private String text;

  /** Returns true if the token at the given position is preceded by a break character,
   * with only non-alphanumeric characters in between. */
  private boolean precededByBreakChar(int tokenpos) {
    int pos = tokenizer.startpositions.get(tokenpos) - 1;
    while (pos > 0 && !Character.isLetterOrDigit(text.charAt(pos))) {
      if (breakChars.contains(text.charAt(pos)))
        return true;
      pos--;
    }
    return false;
  }

  private Map<Integer, ResultConcept> id2concept;
  private Map<Integer, String> id2string;
  //private SortedIntListSet conceptIDs = new SortedIntListSet(20000000);
  private Set<Integer> conceptIDs = new HashSet<Integer>(20000000);

  /** Adds the n-gram spanning tokens start..end (inclusive) as a term of the concept identified by its CRC32. */
  private void addTerm(int start, int end) {
    String string = StringUtilities.join(tokenizer.tokens.subList(start, end + 1), " ");
    int hashcode = crc32.crc32(string);
    ResultTerm resultTerm = new ResultTerm();
    resultTerm.words = new int[end - start + 1];
    for (int i = start; i <= end; i++) {
      resultTerm.words[i - start] = i;
    }
    ResultConcept resultconcept = id2concept.get(hashcode);
    if (resultconcept == null) {
      resultconcept = new ResultConcept();
      resultconcept.conceptId = hashcode;
      id2concept.put(hashcode, resultconcept);
      resultConcepts.add(resultconcept);
    }
    resultconcept.terms.add(resultTerm);
    resultTerms.add(resultTerm);
    id2string.put(hashcode, string);
  }

  /** Adds a concept to the ontology for every newly found word or n-gram, so CRC32 codes can be translated back to text. */
  private void addNewWordsToOntology() {
    for (ResultConcept resultConcept : resultConcepts) {
      boolean addConcept;
      if (alwaysInsertIntoOntology)
        addConcept = true;
      else {
        if (assumeEmptyOntology) {
          if (conceptIDs.contains(resultConcept.conceptId)) {
            addConcept = false;
          } else {
            addConcept = true;
            conceptIDs.add(resultConcept.conceptId);
          }
        } else { // do not assume an empty ontology, so check whether the concept is already in the ontology:
          Concept concept = ontology.getConcept(resultConcept.conceptId);
          addConcept = (concept == null);
        }
      }
      if (addConcept) {
        Concept concept = new Concept(resultConcept.conceptId);
        concept.setName(id2string.get(resultConcept.conceptId));
        ontology.setConcept(concept);
      }
    }
  }

  /** Tokenizes the text and applies normalisation or lowercasing before indexing. */
  private void initializeIndex(String string) {
    text = string;
    resultTerms.clear();
    if (string == null)
      string = "";
    tokenizer.tokenize(string);
    if (normaliseWords)
      tokenizer.tokens = normalise(tokenizer.tokens);
    else if (lowercaseWords)
      tokenizer.tokens = toLowercase(tokenizer.tokens);
  }

  protected CRC32 crc32 = new CRC32();
}
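// ---------------------------------------------------------------------------
// Usage sketch (illustration only, not part of the original source). It shows
// one way WordPeregrine might be driven, assuming that AbstractPeregrine
// initialises its stopwords set and exposes resultConcepts (filled by index())
// as a public iterable of ResultConcept objects with the conceptId and terms
// members used above; the exact field types live in AbstractPeregrine.
//
//   WordPeregrine peregrine = new WordPeregrine();
//   peregrine.ngrams = new int[]{1, 2}; // detect single words and bigrams
//   peregrine.index("Erythropoietin stimulates red blood cell production.");
//   for (ResultConcept concept : peregrine.resultConcepts) {
//     // conceptId is the CRC32 of the (normalised) word or n-gram
//     System.out.println(concept.conceptId + " found " + concept.terms.size() + " time(s)");
//   }
// ---------------------------------------------------------------------------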