/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.tools.postag;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import opennlp.tools.postag.TagDictionary;
import opennlp.tools.util.SequenceValidator;
/**
 * Sequence validator used while decoding part-of-speech tags for Portuguese.
 *
 * <p>A candidate outcome for a token is accepted or rejected based on:
 * <ul>
 * <li>a punctuation rule (a punctuation token must be tagged with itself);</li>
 * <li>the consistency of multi-word-expression markers ({@code B-} /
 * {@code I-} prefixes) with the previous outcome;</li>
 * <li>the tags listed for the token in the optional {@link TagDictionary}.</li>
 * </ul>
 * Tokens that are not found in the dictionary accept any outcome.
 */
public class PortuguesePOSSequenceValidator implements
    SequenceValidator<String> {

  /** Single-character tokens treated as punctuation. */
  private static final String PUNCTUATION = ".,;:()?-";

  /** Debug switch: when true, unknown words are collected and printed on finalization. */
  private boolean storeUnknown = false;

  /** Dictionary consulted for the allowed tags of a token; may be {@code null}. */
  public TagDictionary tagDictionary;

  /** Words missing from the dictionary; only allocated when {@link #storeUnknown} is set. */
  private SortedSet<String> unknown;

  /**
   * Creates a validator.
   *
   * @param tagDictionary the dictionary used to restrict outcomes, or
   *        {@code null} to skip every dictionary based check
   */
  public PortuguesePOSSequenceValidator(TagDictionary tagDictionary) {
    if (storeUnknown) {
      unknown = new TreeSet<String>();
    }
    this.tagDictionary = tagDictionary;
  }

  /**
   * Decides whether {@code outcome} may be assigned to the token at index
   * {@code i}.
   *
   * @param i index of the token being tagged
   * @param inputSequence the tokens of the sentence
   * @param outcomesSequence the outcomes chosen so far; its last element is
   *        taken as the outcome of the previous token
   * @param outcome the candidate outcome for the token at {@code i}
   * @return {@code true} if the candidate is acceptable
   */
  public boolean validSequence(int i, String[] inputSequence,
      String[] outcomesSequence, String outcome) {
    String word = inputSequence[i];

    // An "nm" outcome is never accepted right after the token "a" tagged "artf".
    // NOTE(review): presumably a gender-agreement rule -- confirm the tag set.
    if (i > 0 && "nm".equals(outcome)
        && "a".equalsIgnoreCase(inputSequence[i - 1])
        && "artf".equals(outcomesSequence[i - 1])) {
      return false;
    }

    outcome = GenderUtil.removeGender(outcome);

    // lets start with some punctuation check
    if (isPunctuation(word)) {
      // this is only true for BOSQUE! XXX: remember this!
      return outcome.equals(word);
    }

    // we can't start a MWE here :( -- the next token is punctuation
    if (i < inputSequence.length - 1 && isPunctuation(inputSequence[i + 1])
        && outcome.startsWith("B-")) {
      return false;
    }

    // validate B- and I-
    if (!validOutcome(outcome, outcomesSequence)) {
      return false;
    }

    if (tagDictionary == null) {
      return true;
    }

    // MWE outcomes are not checked against the dictionary (unless the
    // sentence has a single token).
    boolean isMweOutcome = outcome.startsWith("B-") || outcome.startsWith("I-");
    if (isMweOutcome && inputSequence.length > 1) {
      return true;
    }

    // A token tagged with itself is always acceptable.
    boolean isValid = word.equals(outcome);

    List<String> tagList = filterMWE(queryDictionary(word, true));
    if (tagList == null || tagList.isEmpty()) {
      // Unknown token: nothing to validate against, so accept the outcome.
      if (storeUnknown) {
        this.unknown.add(word);
      }
      return true;
    }

    // Capitalized known tokens may always be "prop". The emptiness guard
    // avoids a StringIndexOutOfBoundsException on an empty token.
    if ("prop".equals(outcome) && !word.isEmpty()
        && Character.isUpperCase(word.charAt(0))) {
      return true;
    }

    return isValid || contains(tagList, outcome);
  }

  /**
   * Copies the tags that are not MWE markers.
   *
   * @param arr the tags to filter, may be {@code null}
   * @return the tags that start with neither {@code B-} nor {@code I-}, or
   *         {@code null} if {@code arr} is {@code null}
   */
  private List<String> filterMWE(String[] arr) {
    if (arr == null) {
      return null;
    }
    List<String> out = new ArrayList<String>(arr.length);
    for (String t : arr) {
      if (!(t.startsWith("B-") || t.startsWith("I-"))) {
        out.add(t);
      }
    }
    return out;
  }

  /**
   * Looks up the tags of {@code word}, falling back to its lower-cased form.
   *
   * <p>When {@code recurse} is set and the word starts with a hyphen (and is
   * longer than one character), the result is replaced by the lookup of the
   * word without the leading hyphen.
   *
   * @param word the token to look up
   * @param recurse whether the leading-hyphen fallback may be applied
   * @return the tags after {@code GenderUtil.removeGender}
   */
  private String[] queryDictionary(String word, boolean recurse) {
    String[] tags = tagDictionary.getTags(word);
    if (tags == null) {
      tags = tagDictionary.getTags(word.toLowerCase());
    }
    if (recurse && word.startsWith("-") && word.length() > 1) {
      // NOTE(review): this discards any tags found for the hyphenated form
      // itself, and the recursive result has already been passed through
      // removeGender once -- kept as is, confirm this is intended.
      tags = queryDictionary(word.substring(1), false);
    }
    return GenderUtil.removeGender(tags);
  }

  /**
   * Prints the collected unknown words to standard output when
   * {@link #storeUnknown} is enabled.
   */
  @Override
  protected void finalize() throws Throwable {
    super.finalize();
    if (storeUnknown) {
      System.out.println("... palavras desconhecidas ...");
      for (String unk : this.unknown) {
        System.out.println(unk);
      }
      System.out.println("... fim ...");
    }
  }

  /**
   * Validates the MWE markers of {@code outcome} against the last element of
   * {@code sequence}.
   *
   * @param outcome the candidate outcome
   * @param sequence the outcomes chosen so far; {@code null} or empty means
   *        there is no previous outcome
   * @return the result of {@link #validOutcome(String, String)}
   */
  static boolean validOutcome(String outcome, String[] sequence) {
    String prevOutcome = null;
    if (sequence != null && sequence.length > 0) {
      prevOutcome = sequence[sequence.length - 1];
    }
    return validOutcome(outcome, prevOutcome);
  }

  /**
   * Checks that {@code outcome} is a legal successor of {@code prevOutcome}
   * with respect to the {@code B-} / {@code I-} prefixes.
   *
   * <ul>
   * <li>An {@code I-x} outcome requires the previous outcome to be
   * {@code B-x} or {@code I-x} with the same suffix.</li>
   * <li>Any other outcome is rejected right after a {@code B-} outcome,
   * because a MWE should have at least two tokens.</li>
   * </ul>
   *
   * @param outcome the candidate outcome, may be {@code null}
   * @param prevOutcome the previous outcome, or {@code null} if there is none
   * @return {@code true} if the pair is consistent
   */
  static boolean validOutcome(String outcome, String prevOutcome) {
    boolean prevIsBoundary = prevOutcome != null && prevOutcome.startsWith("B-");
    boolean prevIsIntermediate = prevOutcome != null && prevOutcome.startsWith("I-");
    boolean isIntermediate = outcome != null && outcome.startsWith("I-");

    if (isIntermediate) {
      // Must continue the very same entity opened by the previous outcome.
      return (prevIsBoundary || prevIsIntermediate)
          && prevOutcome.substring(2).equals(outcome.substring(2));
    }

    // MWE should have at least two tokens: a B- must be followed by an I-.
    return !prevIsBoundary;
  }

  /**
   * Tells whether {@code word} is exactly one of the characters in
   * {@link #PUNCTUATION}.
   */
  private static boolean isPunctuation(String word) {
    return word.length() == 1 && PUNCTUATION.indexOf(word.charAt(0)) >= 0;
  }

  /**
   * Tells whether {@code outcome} is compatible with any tag of
   * {@code tagList}.
   *
   * <p>Besides exact membership, {@code n-adj} is compatible with {@code n}
   * or {@code adj} (and vice versa), and an outcome made of {@code =}
   * separated parts is compatible with a tag of the same class when every
   * pair of parts that contains no {@code /} is equal.
   *
   * @param tagList the tags allowed for the token
   * @param outcome the candidate outcome
   * @return {@code true} if some tag is compatible with the outcome
   */
  private boolean contains(List<String> tagList, String outcome) {
    if (tagList.contains(outcome)) {
      return true;
    }
    if (outcome.equals("n-adj")) {
      return tagList.contains("n") || tagList.contains("adj");
    }
    if (outcome.equals("n") || outcome.equals("adj")) {
      return tagList.contains("n-adj");
    }
    if (outcome.contains("=")) {
      String outcomeClass = outcome.substring(0, outcome.indexOf('='));
      String[] outcomeParts = outcome.split("=");
      boolean outcomeHasAlternatives = outcome.contains("/");
      for (String tag : tagList) {
        if (!tag.startsWith(outcomeClass)) {
          continue;
        }
        if (!(tag.contains("/") || outcomeHasAlternatives)) {
          continue;
        }
        // Keep looking at the remaining tags when this one does not match,
        // instead of rejecting the outcome on the first failed candidate.
        if (partsMatch(outcomeParts, tag.split("="))) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Compares two tags split on {@code =}, position by position. Positions
   * where either side contains a {@code /} are skipped for simplicity.
   *
   * @return {@code true} if both have the same number of parts and every
   *         compared position is equal
   */
  private static boolean partsMatch(String[] outcomeParts, String[] tagParts) {
    if (outcomeParts.length != tagParts.length) {
      return false;
    }
    for (int i = 0; i < outcomeParts.length; i++) {
      String outcomePart = outcomeParts[i];
      String tagPart = tagParts[i];
      if (!outcomePart.contains("/") && !tagPart.contains("/")
          && !outcomePart.equals(tagPart)) {
        return false;
      }
    }
    return true;
  }
}