/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cogroo.uima.ae;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import opennlp.tools.util.Span;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import cogroo.MultiCogrooSettings;
import br.usp.pcs.lta.cogroo.entity.Sentence;
import br.usp.pcs.lta.cogroo.entity.Token;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.MorphologicalTag;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.TokenCogroo;
import br.usp.pcs.lta.cogroo.tools.ProcessingEngine;
public class UimaTokenizer extends AnnotationService implements
ProcessingEngine {
private Type tokenType;
private Type sentenceType;
private static final Set<String> PRONOMES_OBLIQUOS_ATONOS;
static {
String[] arr = { "me", "te", "se", "o", "a", "lhe", "nos", "vos", "os",
"as", "lhes", "lo" };
PRONOMES_OBLIQUOS_ATONOS = Collections.unmodifiableSet(new HashSet<String>(
Arrays.asList(arr)));
}
public UimaTokenizer() throws AnnotationServiceException {
super("UIMATokenizer");
}
public void process(Sentence text) {
// ************************************
// Add text to the CAS
// ************************************
updateCas(text, cas);
// ************************************
// Analyze text
// ************************************
try {
ae.process(cas);
} catch (Exception e) {
throw new RuntimeException("Error processing a text.", e);
}
// ************************************
// Extract the result using annotated CAS
// ************************************
FSIterator<Annotation> iterator = cas.getAnnotationIndex(tokenType)
.iterator();
List<Token> tokens = new ArrayList<Token>();
boolean foundHyphen = false;
while (iterator.hasNext()) {
Annotation a = iterator.next();
String tokStr = a.getCoveredText();
Span tokSpan = new Span(a.getBegin(), a.getEnd());
/*
* if(!MultiCogrooSettings.PRE) { if(tokStr != null && foundHyphen &&
* PRONOMES_OBLIQUOS_ATONOS.contains(tokStr.toLowerCase())) { //
* System.out.println("found pronome obliquo"); foundHyphen = false;
* tokStr = '-' + tokStr; tokSpan = new Span(tokSpan.getStart() - 1,
* tokSpan.getEnd()); // System.out.println("tokStr: " + tokStr); //
* System.out.println("tokSpan: " + tokSpan);
*
* TokenCogroo lt = (TokenCogroo)tokens.get(tokens.size()-1);
* lt.setLexeme(lt.getLexeme().substring(0, lt.getLexeme().length() - 1));
* lt.setSpan(new Span(lt.getSpan().getStart(), lt.getSpan().getEnd()
* -1)); // System.out.println("replace tok: " + lt);
* tokens.set(tokens.size()-1,lt); } else if(foundHyphen) { foundHyphen =
* false; // System.out.println("reset foundHyphen"); } else
* if(tokStr.endsWith("-")) { // System.out.println("found hyphen: " +
* tokStr); foundHyphen = true; } }
*/
TokenCogroo t = new TokenCogroo(tokStr, tokSpan);
tokens.add(t);
}
//if (!MultiCogrooSettings.PRE) {
if(true) {
boolean restart = true;
int start = 1;
while (restart) {
restart = false;
for (int i = start; i < tokens.size() - 1 && !restart; i++) {
if ("-".equals(tokens.get(i).getLexeme())) {
if (!hasCharacterBetween(tokens.get(i - 1), tokens.get(i))
&& !hasCharacterBetween(tokens.get(i), tokens.get(i + 1))) {
Token a = tokens.get(i - 1);
Token b = tokens.get(i + 1);
if (PRONOMES_OBLIQUOS_ATONOS
.contains(b.getLexeme().toLowerCase())) {
// remove the "-"
b.setSpan(new Span(b.getSpan().getStart() - 1, b.getSpan()
.getEnd()));
b.setLexeme("-" + b.getLexeme());
tokens.remove(i);
restart = true;
start = i + 1;
}
}
}
}
}
}
text.setTokens(tokens);
cas.reset();
}
private boolean hasCharacterBetween(Token a, Token b) {
int aEnd = a.getSpan().getEnd();
int bStart = b.getSpan().getStart();
if (aEnd == bStart) {
return false;
}
return true;
}
@Override
protected void initTypes(TypeSystem typeSystem) {
sentenceType = cas.getTypeSystem().getType("opennlp.uima.Sentence");
tokenType = cas.getTypeSystem().getType("opennlp.uima.Token");
}
private void updateCas(Sentence sentence, JCas cas) {
cas.reset();
cas.setDocumentText(sentence.getSentence());
AnnotationFS a = cas.getCas().createAnnotation(sentenceType,
sentence.getOffset(),
sentence.getOffset() + sentence.getSentence().length());
cas.getIndexRepository().addFS(a);
}
}