/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.peregrine; import java.io.Serializable; import java.util.ArrayList; import java.util.List; /** Abstract class defining the common functionality for tokenizers */ public class Tokenizer implements Serializable { /** The list of tokens found in the text */ public List<String> tokens = new ArrayList<String>(); /** The start-positions (in characters) of the tokens in the tokens list. * The first character in the string has positions 0.*/ public List<Integer> startpositions = new ArrayList<Integer>(); //character position in the string /** The end-positions (in characters) of the tokens in the tokens list. * The first character in the string has positions 0.*/ public List<Integer> endpositions = new ArrayList<Integer>(); //character position in the string /** The indices of the tokens that are at the beginning of a new sentence. * For example, in the sentence "Malaria is transmitted by mosquitos.", the first (and only) * end-of-sentence is 5.*/ public List<Integer> endOfSentence = new ArrayList<Integer>(); //token position in the list /** Tokenizes the input string, and stores the tokens, start- and end-positions and end-of-sentences * in the appropriate data structures. * @param string The input string.*/ public void tokenize(String string){} public Tokenizer(){} /** Creates a new tokenizer and copies the data of the source tokenizer. * @param tokenizer The source tokenizer*/ public Tokenizer(Tokenizer tokenizer){ this.tokens = new ArrayList<String>(tokenizer.tokens); this.startpositions = new ArrayList<Integer>(tokenizer.startpositions); this.endpositions = new ArrayList<Integer>(tokenizer.endpositions); this.endOfSentence = new ArrayList<Integer>(tokenizer.endOfSentence); } /** Returns all the tokens belonging to one sentence in the text. * @param lineNumber Specifies which sentence should be returned. * @return Returns a list of tokens.*/ public List<String> line(int lineNumber){ if (lineNumber == 0) return tokens.subList(0,endOfSentence.get(0)); else return tokens.subList(endOfSentence.get(lineNumber-1), endOfSentence.get(lineNumber)); } /** Removes a single token and all of its accompanying data from the data structures. * @param index The index of the token to be removed. The first token has index 0.*/ public void removeToken(int index){ tokens.remove(index); startpositions.remove(index); endpositions.remove(index); int value; for (int i = endOfSentence.size()-1; i >= 0; i--){ value = endOfSentence.get(i); if (value > index){ endOfSentence.set(i, value-1); } else break; } } private static final long serialVersionUID = 1L; }