/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cogroo;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import opennlp.tools.util.Span;
import br.usp.pcs.lta.cogroo.entity.Sentence;
import br.usp.pcs.lta.cogroo.entity.Token;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.MorphologicalTag;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.TokenCogroo;
import br.usp.pcs.lta.cogroo.tools.ProcessingEngine;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Class;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Gender;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Number;
import br.usp.pcs.lta.cogroo.tools.dictionary.CogrooTagDictionary;
import br.usp.pcs.lta.cogroo.tools.tagger.CogrooPOSTaggerME;
import cogroo.uima.interpreters.FlorestaTagInterpreter;
/**
 * Processing engine that runs after part-of-speech tagging.
 *
 * <p>When {@code MultiCogrooSettings.TAGGER} is enabled, it converts each
 * token's original POS tag (plus features, if any) into a
 * {@link MorphologicalTag} and asks {@link CogrooPOSTaggerME} to set the
 * primitive and generalize the token. In every case it then joins tokens that
 * were split around a hyphen (see {@link #mergeHyphenedWords(Sentence)}).
 */
public class PostPOSTagger implements ProcessingEngine {

  /**
   * Unstressed oblique (clitic) pronouns. When one of these follows a hyphen,
   * the hyphen is glued to the pronoun instead of merging the two words.
   */
  private static final Set<String> PRONOMES_OBLIQUOS_ATONOS =
      Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
          "me", "te", "se", "o", "a", "lhe", "nos", "vos", "os", "as", "lhes")));

  /**
   * Hyphenated prefixes. A word formed as {@code prefix-word} takes the
   * morphological tag of the second element unchanged.
   */
  private static final Set<String> PREFIXOS_HYPHENS =
      Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
          "ex", "sota", "soto", "vice", "pré", "pós", "pró", "extra", "contra",
          "auto", "neo", "semi", "ultra", "supra", "intra")));

  /** Parses tag strings into {@link MorphologicalTag} instances. */
  private final FlorestaTagInterpreter it = new FlorestaTagInterpreter();

  /**
   * Second interpreter instance; not used by any live code in this class.
   * Kept because it is package-visible and may be referenced elsewhere.
   */
  FlorestaTagInterpreter ti = new FlorestaTagInterpreter();

  /** Dictionary handed to {@link CogrooPOSTaggerME#setPrimitiveAndGeneralize}. */
  private final CogrooTagDictionary dict;

  /**
   * @param dict the tag dictionary passed on to
   *             {@code CogrooPOSTaggerME.setPrimitiveAndGeneralize}
   */
  public PostPOSTagger(CogrooTagDictionary dict) {
    this.dict = dict;
  }

  private MorphologicalTag toMorphologicalTag(String tag) {
    return it.parseMorphologicalTag(tag);
  }

  /**
   * Post-processes the tokens of a sentence in place.
   *
   * @param sentence the sentence whose token list is updated
   */
  public void process(Sentence sentence) {
    if (MultiCogrooSettings.TAGGER) {
      for (Token t : sentence.getTokens()) {
        String tag = t.getOriginalPOSTag();
        if (t.getOriginalFeatures() != null) {
          // the interpreter receives "<pos>=<features>" when features exist
          tag = tag + "=" + t.getOriginalFeatures();
        }
        t.setMorphologicalTag(toMorphologicalTag(tag));
        CogrooPOSTaggerME.setPrimitiveAndGeneralize(t, dict);
      }
    }
    mergeHyphenedWords(sentence);
  }

  /**
   * Scans the token list for a standalone {@code "-"} token that touches both
   * of its neighbours (no character gap between the spans) and then either
   * <ul>
   * <li>glues the hyphen onto the following token when that token is a clitic
   * pronoun (the preceding word is left untouched), or</li>
   * <li>replaces the three tokens {@code a}, {@code -}, {@code b} with a
   * single merged token.</li>
   * </ul>
   * The list returned by {@code sentence.getTokens()} is modified in place.
   */
  private void mergeHyphenedWords(Sentence sentence) {
    List<Token> tokens = sentence.getTokens();
    if (tokens == null) {
      return;
    }

    // A joining hyphen needs a neighbour on each side, so the first and the
    // last position are never candidates.
    int i = 1;
    while (i < tokens.size() - 1) {
      if (!isJoiningHyphen(tokens, i)) {
        i++;
        continue;
      }

      Token a = tokens.get(i - 1);
      Token b = tokens.get(i + 1);

      if (PRONOMES_OBLIQUOS_ATONOS.contains(b.getLexeme().toLowerCase(Locale.ROOT))) {
        // Drop the "-" token and extend the pronoun one character to the left
        // so that it covers the hyphen.
        Span bSpan = b.getSpan();
        b.setSpan(new Span(bSpan.getStart() - 1, bSpan.getEnd()));
        b.setLexeme("-" + b.getLexeme());
        tokens.remove(i);
        // The pronoun now sits at index i; continue with the token after it.
        i++;
      } else {
        // Build the replacement first so the list is untouched if it fails.
        Token merged = mergeTokens(a, b);
        tokens.remove(i + 1);
        tokens.remove(i);
        tokens.set(i - 1, merged);
        // Index i now holds whatever followed b; re-examine it without
        // advancing so chains such as "a-b-c" keep collapsing.
      }
    }
  }

  /**
   * Tells whether the token at {@code index} is a {@code "-"} whose span
   * touches both the previous and the next token. The caller guarantees that
   * both neighbours exist.
   */
  private boolean isJoiningHyphen(List<Token> tokens, int index) {
    Token candidate = tokens.get(index);
    return "-".equals(candidate.getLexeme())
        && !hasCharacterBetween(tokens.get(index - 1), candidate)
        && !hasCharacterBetween(candidate, tokens.get(index + 1));
  }

  /**
   * Creates the single token that replaces {@code a - b}: lexeme and lemma are
   * joined with a hyphen, the span runs from the start of {@code a} to the end
   * of {@code b}.
   */
  private Token mergeTokens(Token a, Token b) {
    MorphologicalTag tag;
    if (PREFIXOS_HYPHENS.contains(a.getLexeme().toLowerCase(Locale.ROOT))) {
      // prefix + word: the word alone determines the tag
      tag = b.getMorphologicalTag();
    } else {
      tag = merge(a.getMorphologicalTag(), b.getMorphologicalTag());
    }

    String lexeme = a.getLexeme() + "-" + b.getLexeme();
    String lemma = primitiveOrLexeme(a) + "-" + primitiveOrLexeme(b);
    Span span = new Span(a.getSpan().getStart(), b.getSpan().getEnd());

    Token merged = new TokenCogroo(lexeme, span);
    merged.setPrimitive(lemma);
    merged.setMorphologicalTag(tag);
    return merged;
  }

  /** Returns the token's primitive, or its lexeme when the primitive is null or empty. */
  private String primitiveOrLexeme(Token t) {
    String primitive = t.getPrimitive();
    if (primitive != null && primitive.length() > 0) {
      return primitive;
    }
    return t.getLexeme();
  }

  /**
   * Combines the tags of the two halves of a hyphenated word.
   *
   * <p>The result is always a fresh copy: neither {@code a} nor {@code b} is
   * returned directly or modified. If one of the tags is null, a copy of the
   * other one is returned (null when both are null).
   *
   * @param a tag of the element before the hyphen
   * @param b tag of the element after the hyphen
   * @return the tag for the merged token
   */
  private MorphologicalTag merge(MorphologicalTag a, MorphologicalTag b) {
    if (a == null) {
      return b == null ? null : b.clone();
    }
    if (b == null) {
      return a.clone();
    }

    Class aClass = a.getClazzE();
    Class bClass = b.getClazzE();

    if (!isVariable(aClass)) {
      // an invariable first element does not contribute: the second one wins
      return b.clone();
    }

    boolean bothNouns = Class.NOUN.equals(aClass) && Class.NOUN.equals(bClass);
    boolean bothAdjectives =
        Class.ADJECTIVE.equals(aClass) && Class.ADJECTIVE.equals(bClass);

    MorphologicalTag ret;
    if (bothNouns || bothAdjectives) {
      ret = b.clone();
      adjustGenderAndNumber(ret, a, b);
    } else if (Class.ADJECTIVE.equals(aClass) && Class.NOUN.equals(bClass)) {
      ret = b.clone();
      ret.setClazz(Class.ADJECTIVE);
      adjustGenderAndNumber(ret, a, b);
    } else if (Class.VERB.equals(aClass) || Class.PREPOSITION.equals(bClass)) {
      // NOTE(review): aClass is known to be variable here, so the VERB test
      // can never be true; kept as in the original logic - confirm intent.
      ret = b.clone();
      ret.setGender(Gender.MALE);
    } else if (Class.NOUN.equals(aClass)) {
      // prefer the noun
      ret = a.clone();
    } else if (Class.NOUN.equals(bClass)) {
      // prefer the noun
      ret = b.clone();
    } else {
      ret = a.clone();
    }

    if (isVariable(bClass)) {
      // Both halves inflect: where they disagree the result becomes neutral.
      Gender aGender = a.getGenderE();
      Gender bGender = b.getGenderE();
      if (aGender != null && bGender != null && !aGender.equals(bGender)) {
        ret.setGender(Gender.NEUTRAL);
      }
      Number aNumber = a.getNumberE();
      Number bNumber = b.getNumberE();
      if (aNumber != null && bNumber != null && !aNumber.equals(bNumber)) {
        ret.setNumber(Number.NEUTRAL);
      }
    }
    return ret;
  }

  /**
   * Sets {@code target} to neutral gender when either source tag is female,
   * and to plural when either source tag is plural.
   */
  private void adjustGenderAndNumber(MorphologicalTag target, MorphologicalTag a,
      MorphologicalTag b) {
    if (Gender.FEMALE.equals(a.getGenderE()) || Gender.FEMALE.equals(b.getGenderE())) {
      target.setGender(Gender.NEUTRAL);
    }
    if (Number.PLURAL.equals(a.getNumberE()) || Number.PLURAL.equals(b.getNumberE())) {
      target.setNumber(Number.PLURAL);
    }
  }

  /**
   * Tells whether the class is one of ADJECTIVE, NOUN, PROPER_NOUN or NUMERAL.
   * A null class is treated as not variable.
   */
  private boolean isVariable(Class a) {
    if (a == null) {
      return false;
    }
    switch (a) {
      case ADJECTIVE:
      case NOUN:
      case PROPER_NOUN:
      case NUMERAL:
        return true;
      default:
        return false;
    }
  }

  /**
   * Tells whether there is a gap between the two tokens, i.e. the end offset
   * of {@code a} differs from the start offset of {@code b}.
   */
  private boolean hasCharacterBetween(Token a, Token b) {
    return a.getSpan().getEnd() != b.getSpan().getStart();
  }
}