/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.tools.featurizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.cogroo.dictionary.FeatureDictionary;
import org.cogroo.tools.chunker2.TokenTag;
import opennlp.tools.util.SequenceValidator;
/**
 * Sequence validator for the featurizer. A candidate outcome for a token is
 * checked against the feature tags that a {@link FeatureDictionary} lists for
 * the token and its POS tag.
 * <p>
 * When no dictionary is configured, or the dictionary has no usable entry for
 * the token, every outcome is accepted.
 */
public class DefaultFeaturizerSequenceValidator implements
    SequenceValidator<TokenTag> {

  /** Suffix carried by outcomes and dictionary tags of subjunctive verbs. */
  private static final String SUBJUNCTIVE_SUFFIX = "=SUBJ";

  private final FeatureDictionary tagDict;
  private final Set<String> poisonedTags;

  /**
   * @param tagDict
   *          dictionary used to look up the feature tags of a token; may be
   *          {@code null}, in which case every sequence is considered valid
   * @param poisonedTags
   *          tags that make a dictionary entry unusable: if any tag of an
   *          entry is in this set the whole entry is ignored. {@code null} is
   *          treated as an empty set
   */
  public DefaultFeaturizerSequenceValidator(FeatureDictionary tagDict,
      Set<String> poisonedTags) {
    this.tagDict = tagDict;
    this.poisonedTags = poisonedTags != null ? poisonedTags
        : Collections.<String> emptySet();
  }

  /**
   * Decides whether {@code outcome} is acceptable for the token at position
   * {@code i}.
   *
   * @param i
   *          index of the token being evaluated
   * @param sequence
   *          the tokens, with their POS tags, of the sentence
   * @param s
   *          the outcomes chosen so far; only used for diagnostics here
   * @param outcome
   *          the candidate outcome for position {@code i}
   * @return {@code true} if there is no dictionary, no POS tag, or no usable
   *         dictionary entry; otherwise {@code true} only if the outcome
   *         matches one of the dictionary tags and passes the subjunctive
   *         check
   */
  public boolean validSequence(int i, TokenTag[] sequence, String[] s,
      String outcome) {
    String word = sequence[i].getToken();
    String postag = sequence[i].getTag();

    if (tagDict == null) {
      return true;
    }

    if (postag == null) {
      System.err.println("NULL: " + Arrays.toString(sequence));
      System.err.println("NULL: " + Arrays.toString(s));
      // Without a POS tag the dictionary cannot be queried, so the outcome is
      // left unconstrained instead of failing later with a
      // NullPointerException.
      return true;
    }

    List<String> tags = filterPoisoned(expandedSearch(word, postag, true));
    if (tags == null) {
      // nothing known about this token: accept whatever the model proposes
      return true;
    }

    if (!matches(outcome, tags)) {
      return false;
    }

    // Validate subjunctive verbs: when the dictionary also offers a
    // non-subjunctive reading, the subjunctive one is only accepted if a
    // "que" occurs earlier in the sentence.
    if ("v-fin".equals(postag) && outcome.endsWith(SUBJUNCTIVE_SUFFIX)
        && hasNonSubjunctiveOption(tags)) {
      return hasPrecedingQue(sequence, i);
    }
    return true;
  }

  /**
   * Tells whether any token before position {@code i} is "que", ignoring
   * case.
   */
  private static boolean hasPrecedingQue(TokenTag[] sequence, int i) {
    for (int j = i - 1; j >= 0; j--) {
      if ("que".equalsIgnoreCase(sequence[j].getToken())) {
        return true;
      }
    }
    return false;
  }

  /**
   * Tells whether {@code tags} holds more than one tag and at least one of
   * them does not end with {@code =SUBJ}.
   */
  private static boolean hasNonSubjunctiveOption(List<String> tags) {
    if (tags.size() <= 1) {
      return false;
    }
    for (String tag : tags) {
      if (!tag.endsWith(SUBJUNCTIVE_SUFFIX)) {
        return true;
      }
    }
    return false;
  }

  /** Tells whether a dictionary lookup produced no tags at all. */
  private static boolean isEmpty(String[] tags) {
    return tags == null || tags.length == 0;
  }

  /**
   * Looks up the feature tags of {@code word}, relaxing the query step by
   * step while nothing is found: first the word as given, then its lower
   * cased form and, if {@code recurse} is set, one more attempt with a
   * related POS tag ({@code n-adj} falls back to {@code adj} and then
   * {@code n}; {@code n} and {@code adj} fall back to {@code n-adj}) or, for
   * other tags, with a leading hyphen removed from the word.
   *
   * @param postag
   *          the POS tag to query; must not be {@code null}
   * @param recurse
   *          whether the relaxed second attempt is allowed; the nested calls
   *          pass {@code false}, so the relaxation happens at most once
   * @return the tags found, or {@code null} or an empty array if every
   *         attempt failed
   */
  private String[] expandedSearch(String word, String postag, boolean recurse) {
    String[] tagsArr = tagDict.getFeatures(word, postag);
    if (isEmpty(tagsArr)) {
      tagsArr = tagDict.getFeatures(word.toLowerCase(), postag);
    }
    if (!recurse || !isEmpty(tagsArr)) {
      return tagsArr;
    }

    if ("n-adj".equals(postag)) {
      tagsArr = expandedSearch(word, "adj", false);
      if (isEmpty(tagsArr)) {
        tagsArr = expandedSearch(word, "n", false);
      }
    } else if ("n".equals(postag) || "adj".equals(postag)) {
      tagsArr = expandedSearch(word, "n-adj", false);
    } else if (word.length() > 1 && word.charAt(0) == '-') {
      tagsArr = expandedSearch(word.substring(1), postag, false);
    }
    return tagsArr;
  }

  /**
   * Turns a dictionary entry into an unmodifiable list, discarding the entry
   * entirely if it is empty or contains a poisoned tag.
   *
   * @return the tags of the entry, or {@code null} if the entry must be
   *         ignored
   */
  private List<String> filterPoisoned(String[] featureTag) {
    if (isEmpty(featureTag)) {
      return null;
    }
    for (String tag : featureTag) {
      if (poisonedTags.contains(tag)) {
        System.err.println("found poisoned tag. Will ignore all! " + tag);
        return null;
      }
    }
    return Collections.unmodifiableList(new ArrayList<String>(Arrays
        .asList(featureTag)));
  }

  /**
   * Tells whether {@code outcome} equals, or matches according to
   * {@link #matches(String, String)}, at least one of {@code tags}.
   */
  private boolean matches(String outcome, List<String> tags) {
    if (tags.contains(outcome)) {
      return true;
    }
    for (String t : tags) {
      if (matches(outcome, t)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Tells whether an outcome is compatible with a dictionary tag.
   * <p>
   * Both strings are split on {@code =} and {@code -} into sets of parts. The
   * two sets must have the same size. Parts present on both sides cancel out;
   * if that consumes every tag part the strings match. Otherwise a match is
   * only possible when the tag contains a {@code /}: the remaining outcome
   * parts are split on {@code /}, and
   * <ul>
   * <li>a tag part is consumed by a piece that does not start with a digit
   * and is a substring of it, e.g. outcome {@code M=S} matches tag
   * {@code M/F=S};</li>
   * <li>a single left over tag part starting with a digit is accepted if
   * some left over piece also starts with a digit and all of its characters
   * occur in the tag part, e.g. outcome {@code 1S} matches tag
   * {@code 1/3S}.</li>
   * </ul>
   * Empty parts, which arise from leading or doubled separators, never take
   * part in the substring or digit comparisons.
   *
   * @param outcome
   *          the candidate outcome
   * @param tag
   *          a tag from the dictionary
   * @return {@code true} if the outcome is compatible with the tag;
   *         {@code false} otherwise, including when either argument is
   *         {@code null} or is the single character {@code -}
   */
  public static boolean matches(String outcome, String tag) {
    if (outcome == null || tag == null || "-".equals(tag)
        || "-".equals(outcome)) {
      return false;
    }

    Set<String> outcomeParts = new HashSet<String>(Arrays.asList(outcome
        .split("[=-]")));
    Set<String> tagParts = new HashSet<String>(Arrays.asList(tag
        .split("[=-]")));
    if (outcomeParts.size() != tagParts.size()) {
      return false;
    }

    // parts that are literally the same on both sides cancel out
    Set<String> shared = new HashSet<String>(tagParts);
    shared.retainAll(outcomeParts);
    tagParts.removeAll(shared);
    if (tagParts.isEmpty()) {
      return true;
    }
    if (!tag.contains("/")) {
      return false;
    }
    outcomeParts.removeAll(shared);

    // split the remaining outcome parts on "/"; not every tag alternative
    // has to be consumed by the outcome
    Set<String> outcomePieces = new HashSet<String>();
    for (String part : outcomeParts) {
      outcomePieces.addAll(Arrays.asList(part.split("/")));
    }

    // Tag parts are used as they are; each one is consumed by the first
    // non-numeric outcome piece it contains. Empty pieces are skipped: every
    // string contains "", and "".charAt(0) would throw.
    List<String> consumed = new ArrayList<String>();
    for (String tagPart : tagParts) {
      for (String piece : outcomePieces) {
        if (!piece.isEmpty() && !Character.isDigit(piece.charAt(0))
            && tagPart.contains(piece)) {
          consumed.add(tagPart);
          consumed.add(piece);
          break;
        }
      }
    }
    tagParts.removeAll(consumed);
    outcomePieces.removeAll(consumed);
    if (tagParts.isEmpty() && outcomePieces.isEmpty()) {
      return true;
    }

    // a single numeric tag part may still cover a numeric outcome piece
    if (tagParts.size() == 1 && !outcomePieces.isEmpty()) {
      String tagPart = tagParts.iterator().next();
      if (!tagPart.isEmpty() && Character.isDigit(tagPart.charAt(0))) {
        for (String piece : outcomePieces) {
          if (!piece.isEmpty() && Character.isDigit(piece.charAt(0))
              && containsAllChars(tagPart, piece)) {
            return true;
          }
        }
      }
    }
    return false;
  }

  /** Tells whether every character of {@code chars} occurs in {@code text}. */
  private static boolean containsAllChars(String text, String chars) {
    for (int k = 0; k < chars.length(); k++) {
      if (text.indexOf(chars.charAt(k)) < 0) {
        return false;
      }
    }
    return true;
  }
}