package de.berlin.hu.uima.ae.tokenizer;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.u_compare.shared.syntactic.Token;
/**
 * A fine-grained tokenizer.
 *
 * <p>The document text is split into tokens according to these rules:
 * <ul>
 *   <li>a maximal run of letters forms one token;</li>
 *   <li>a maximal run of digits forms one token, so a change between letters
 *       and digits always starts a new token (e.g. {@code "ab12"} yields
 *       {@code "ab"} and {@code "12"});</li>
 *   <li>every other character forms a token of its own;</li>
 *   <li>space, tab, line feed and carriage return separate tokens and are
 *       never part of one.</li>
 * </ul>
 *
 * <p>For every token two annotations covering the same span are added to the
 * CAS indexes: an {@code org.u_compare.shared.syntactic.Token} and an
 * {@code opennlp.uima.Token}.
 */
public class FineTokenizerAE extends JCasAnnotator_ImplBase {

    /**
     * Tokenizes the document text of the given CAS and adds the token
     * annotations to its indexes.
     *
     * @param aJCas the CAS whose document text is tokenized; if it has no
     *              document text, no annotation is created
     * @throws AnalysisEngineProcessException declared by the overridden
     *         method; not thrown by this implementation itself
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        String text = aJCas.getDocumentText();
        if (text == null) {
            // No document text has been set on this CAS: nothing to tokenize.
            return;
        }

        int length = text.length();
        int start = 0; // offset of the first character of the pending token
        int i = 0;
        while (i < length) {
            // Work on code points so that a supplementary character (encoded
            // as a surrogate pair) is never cut in half by a token boundary.
            int codePoint = text.codePointAt(i);
            int next = i + Character.charCount(codePoint);

            if (isSeparator(codePoint)) {
                // Separators are skipped; the next token starts behind them.
                start = next;
            } else if (next >= length
                    || !continuesRun(codePoint, text.codePointAt(next))) {
                // The current character is the last one of its token.
                createNewTokenAnnotation(aJCas, start, next);
                start = next;
            }
            i = next;
        }
    }

    /**
     * Tells whether the character is one of the four whitespace characters
     * that separate tokens without becoming a token themselves.
     */
    private static boolean isSeparator(int codePoint) {
        return codePoint == ' ' || codePoint == '\t'
                || codePoint == '\n' || codePoint == '\r';
    }

    /**
     * Tells whether {@code following} extends the token that {@code current}
     * belongs to: digits are only continued by digits and letters only by
     * letters. Any other character is always a complete token.
     */
    private static boolean continuesRun(int current, int following) {
        if (Character.isDigit(current)) {
            return Character.isDigit(following);
        }
        if (Character.isLetter(current)) {
            return Character.isLetter(following);
        }
        return false;
    }

    /**
     * Creates the two token annotations for one token and adds them to the
     * indexes of the CAS.
     *
     * @param aJCas the CAS to add the annotations to
     * @param begin offset of the first character of the token
     * @param end   offset directly behind the last character of the token
     */
    private void createNewTokenAnnotation(JCas aJCas, int begin, int end) {
        Token token = new Token(aJCas);
        token.setBegin(begin);
        token.setEnd(end);
        token.addToIndexes();

        // tokens for OpenNLP POS tagger
        opennlp.uima.Token openNlpToken = new opennlp.uima.Token(aJCas);
        openNlpToken.setBegin(begin);
        openNlpToken.setEnd(end);
        openNlpToken.addToIndexes();
    }
}