Tokenizer.java example

Explorer
GeneDiseasePaper-master
- java
/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.peregrine;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * Base class defining the common functionality for tokenizers: it holds the tokens, their
 * character offsets and the sentence boundaries. The class itself is concrete, but its
 * {@link #tokenize(String)} method has an empty body.
 */
public class Tokenizer implements Serializable {

  private static final long serialVersionUID = 1L;

  /** The tokens found in the text, in order. */
  public List<String> tokens = new ArrayList<String>();

  /**
   * Character offset at which each token of {@link #tokens} starts.
   * Offsets are zero-based: the first character of the string is at position 0.
   */
  public List<Integer> startpositions = new ArrayList<Integer>();

  /**
   * Character offset at which each token of {@link #tokens} ends.
   * Offsets are zero-based: the first character of the string is at position 0.
   */
  public List<Integer> endpositions = new ArrayList<Integer>();

  /**
   * Sentence boundaries, expressed as indices into {@link #tokens}. Each entry is the index of
   * the token that would start the next sentence, i.e. one past the last token of the sentence
   * it closes. For example, for the sentence "Malaria is transmitted by mosquitos.", the first
   * (and only) entry is 5.
   */
  public List<Integer> endOfSentence = new ArrayList<Integer>();

  /** Creates a tokenizer whose data structures are all empty. */
  public Tokenizer() {
  }

  /**
   * Creates a tokenizer holding independent copies of the four lists of another tokenizer.
   *
   * @param tokenizer   The tokenizer whose data is copied.
   */
  public Tokenizer(Tokenizer tokenizer) {
    tokens = new ArrayList<String>(tokenizer.tokens);
    startpositions = new ArrayList<Integer>(tokenizer.startpositions);
    endpositions = new ArrayList<Integer>(tokenizer.endpositions);
    endOfSentence = new ArrayList<Integer>(tokenizer.endOfSentence);
  }

  /**
   * Tokenizes the input string and stores the tokens, start- and end-positions and
   * end-of-sentences in the corresponding lists. This implementation does nothing.
   *
   * @param string  The input string.
   */
  public void tokenize(String string) {
  }

  /**
   * Returns the tokens that make up one sentence of the text.
   *
   * @param lineNumber  Zero-based number of the sentence to return.
   * @return    A view on {@link #tokens} (obtained through {@code subList}) covering the sentence.
   */
  public List<String> line(int lineNumber) {
    // The first sentence starts at token 0; every other one starts where its predecessor ended.
    int firstToken = (lineNumber == 0) ? 0 : endOfSentence.get(lineNumber - 1);
    int boundary = endOfSentence.get(lineNumber);
    return tokens.subList(firstToken, boundary);
  }

  /**
   * Removes one token together with its start- and end-position, and shifts the sentence
   * boundaries that lie after it down by one.
   *
   * @param index   The index of the token to be removed. The first token has index 0.
   */
  public void removeToken(int index) {
    tokens.remove(index);
    startpositions.remove(index);
    endpositions.remove(index);

    // Scan the boundaries from the back and stop at the first one that is not beyond the
    // removed token; earlier entries are left untouched.
    for (int pos = endOfSentence.size() - 1; pos >= 0; pos--) {
      int boundary = endOfSentence.get(pos);
      if (boundary <= index) {
        return;
      }
      endOfSentence.set(pos, boundary - 1);
    }
  }
}