package org.cogroo.tools.shallowparser;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import org.cogroo.tools.chunker2.DefaultChunkerSequenceValidator;
import org.cogroo.tools.featurizer.WordTag;
/**
 * Sequence validator for the shallow parser. On top of the outcome check
 * inherited from {@link DefaultChunkerSequenceValidator} it rejects outcomes
 * that would break a nested chunk and outcomes that would produce an
 * implausible SUBJ span.
 *
 * <p>The postag carried by each {@link WordTag} is split at its first
 * {@code '|'}: the text before it is treated as the POS tag and the text
 * after it as the chunk tag (see {@link #extractPOS(String)} and
 * {@link #extractChunk(String)}).
 */
public class ShallowParserSequenceValidator extends DefaultChunkerSequenceValidator {

  /** Character that separates the POS part from the chunk part of a postag. */
  private static final char TAG_SEPARATOR = '|';

  /** POS value that a token must carry to be considered an oblique pronoun. */
  private static final String PERSONAL_PRONOUN_POS = "pron-pers";

  /** Lower-cased word forms that are looked up by {@link #isPronObli(WordTag)}. */
  private static final Set<String> PRONOMES_OBLIQUOS;

  static {
    // "se" is deliberately left out of the list (it was disabled in the original code).
    // NOTE(review): "sí" carries an acute accent; the Portuguese pronoun is
    // usually spelled "si" -- confirm against the tagger's data before changing.
    String[] pronomesObliquos = {"me", "te", "nos", "vos", "o", "os", "a", "as",
        "lhe", "lhes", "mim", "ti", "comigo", "contigo", "conosco", "convosco", "consigo", "sí"};
    PRONOMES_OBLIQUOS = new HashSet<String>(Arrays.asList(pronomesObliquos));
  }

  /**
   * Decides whether {@code outcome} may be assigned to the token at index
   * {@code i}.
   *
   * @param i index of the token being tagged
   * @param inputSequence the tokens, each with a {@code pos|chunk} postag
   * @param outcomesSequence outcomes chosen so far; only index {@code i - 1} is read here
   * @param outcome the candidate outcome for token {@code i}
   * @return {@code true} only if the inherited outcome check, the nesting
   *         check and the subject check all accept the candidate
   */
  public boolean validSequence(int i, WordTag[] inputSequence,
      String[] outcomesSequence, String outcome) {
    // Short-circuits in the same order as before: inherited check, nesting, subject.
    return validOutcome(outcome, outcomesSequence)
        && validateNested(i, inputSequence, outcomesSequence, outcome)
        && validateSubj(i, inputSequence, outcomesSequence, outcome);
  }

  /**
   * Applies the SUBJ-specific restrictions. The first token ({@code i == 0})
   * is always accepted.
   *
   * @return {@code false} if the candidate would close a one-token subject
   *         made only of an article or an oblique pronoun, or would end a
   *         subject right after a token whose postag is {@code ","}
   */
  private boolean validateSubj(int i, WordTag[] inputSequence,
      String[] outcomesSequence, String outcome) {
    if (i <= 0) {
      return true;
    }

    String previousOutcome = outcomesSequence[i - 1];
    WordTag previousToken = inputSequence[i - 1];

    // A subject opened on the previous token and not continued now would be a
    // singleton; it may not consist of just an article or an oblique pronoun.
    boolean closesSingletonSubj =
        previousOutcome.equals("B-SUBJ") && !outcome.equals("I-SUBJ");
    if (closesSingletonSubj
        && (previousToken.getPostag().startsWith("art") || isPronObli(previousToken))) {
      return false;
    }

    // Disabled rule kept for reference:
    // if(outcomesSequence[i-1].endsWith("SUBJ") && outcome.equals("B-SUBJ")) {
    //   return false;
    // }

    // A subject may not end on a comma.
    // NOTE(review): this compares the whole postag, not only its POS part --
    // confirm that comma tokens really carry a bare "," postag.
    if (previousOutcome.endsWith("SUBJ") && !outcome.endsWith("SUBJ")
        && previousToken.getPostag().equals(",")) {
      return false;
    }

    return true;
  }

  /**
   * Tells whether the token is an oblique pronoun: its word, lower-cased, is
   * in {@link #PRONOMES_OBLIQUOS} and its POS part is {@code "pron-pers"}.
   */
  private boolean isPronObli(WordTag wordTag) {
    // Locale.ROOT keeps the lookup independent of the JVM default locale
    // (e.g. a Turkish locale would lower-case "I" to a dotless i).
    String word = wordTag.getWord().toLowerCase(Locale.ROOT);
    return PRONOMES_OBLIQUOS.contains(word)
        && PERSONAL_PRONOUN_POS.equals(extractPOS(wordTag.getPostag()));
  }

  /**
   * Prevents a new sequence from starting in the middle of a chunk. When the
   * chunk tags of tokens {@code i - 1} and {@code i} form a continuation, the
   * outcomes must either both be {@code "O"} or form a continuation as well.
   * The first token ({@code i == 0}) is always accepted.
   */
  private boolean validateNested(int i, WordTag[] inputSequence, String[] outcomesSequence,
      String outcome) {
    if (i <= 0) {
      return true;
    }

    String previousChunk = extractChunk(inputSequence[i - 1].getPostag());
    String currentChunk = extractChunk(inputSequence[i].getPostag());
    if (!isContinuation(previousChunk, currentChunk)) {
      // Not inside a chunk: nothing to enforce.
      return true;
    }

    String previousOutcome = outcomesSequence[i - 1];
    return isOther(previousOutcome, outcome) || isContinuation(previousOutcome, outcome);
  }

  /** Returns {@code true} when both tags are exactly {@code "O"}. */
  private boolean isOther(String a, String b) {
    return "O".equals(a) && "O".equals(b);
  }

  /**
   * Returns {@code true} when {@code a} starts with {@code "B-"} or
   * {@code "I-"} and {@code b} starts with {@code "I-"}.
   */
  private boolean isContinuation(String a, String b) {
    return (a.startsWith("B-") || a.startsWith("I-")) && b.startsWith("I-");
  }

  /**
   * Returns the text after the first {@code '|'} of the postag, or the whole
   * postag when it contains no {@code '|'}.
   */
  private String extractChunk(String postag) {
    // indexOf yields -1 when the separator is missing, so substring(0) returns the full tag.
    return postag.substring(postag.indexOf(TAG_SEPARATOR) + 1);
  }

  /**
   * Returns the text before the first {@code '|'} of the postag, or the whole
   * postag when it contains no {@code '|'}.
   */
  private String extractPOS(String postag) {
    int separator = postag.indexOf(TAG_SEPARATOR);
    // Without this guard substring(0, -1) would throw StringIndexOutOfBoundsException.
    return separator < 0 ? postag : postag.substring(0, separator);
  }
}