/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.peregrine;
/** Simple tokenizer: everything that is not a letter or a number is a word-separator.
* Sentences are split by detecting line-feed characters.*/
public class SimpleTokenizer extends Tokenizer{
public SimpleTokenizer(){
super();
}
public SimpleTokenizer(Tokenizer tokenizer) {
super(tokenizer);
}
public void tokenize(String string){
tokens.clear();
startpositions.clear();
endpositions.clear();
endOfSentence.clear();
int start = 0;
int i = 0;
char ch;
for (; i < string.length(); i++){
ch = string.charAt(i);
if (!Character.isLetterOrDigit(ch) &&
//!(ch == '-') &&
!(ch == '\'' && i>0 && Character.isLetter(string.charAt(i-1)) && string.length()-1 > i && string.charAt(i+1) == 's' && (string.length()-2 == i || !Character.isLetterOrDigit(string.charAt(i+2))))){ //leaves ' in possesive pattern
if (start != i) {
tokens.add(string.substring(start,i));
startpositions.add(start);
endpositions.add(i-1);
}
start = i+1;
if ((int)ch == 10){
endOfSentence.add(tokens.size());
}
}
}
if (start != i) {
tokens.add(string.substring(start,i));
startpositions.add(start);
endpositions.add(i);
}
//Add end of line at end of text if not already in list:
if (endOfSentence.size() == 0 || endOfSentence.get(endOfSentence.size()-1) != tokens.size()){
endOfSentence.add(tokens.size());
}
}
private static final long serialVersionUID = 1L;
}