package de.berlin.hu.uima.ae.expander;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.u_compare.shared.semantic.NamedEntity;
import org.u_compare.shared.syntactic.Token;
import org.uimafit.util.JCasUtil;
import de.berlin.hu.util.Constants;
/**
 * Analysis engine that widens {@link NamedEntity} mentions to the boundaries of
 * the surrounding character chunk and then snaps them to {@link Token} boundaries.
 *
 * <p>For every entity whose source is not {@code Constants.CRF} the engine
 * <ol>
 *   <li>grows the span character-wise until whitespace, {@code '/'} or {@code '.'}
 *       (or the document edge) is reached,</li>
 *   <li>aligns the grown span with the tokens it covers, leaving out a leading
 *       token labelled {@code VB*} and a trailing token labelled {@code VB*},
 *       {@code JJ*} or {@code IN}, as well as punctuation tokens at the edges,</li>
 *   <li>strips an enclosing bracket pair and any unbalanced leading/trailing
 *       bracket, and</li>
 *   <li>writes the resulting offsets back to the entity if they differ.</li>
 * </ol>
 */
public class MentionExpander extends JCasAnnotator_ImplBase {

    /** Token text allowed at the left edge: no punctuation except opening brackets. */
    private static final Pattern LEFT_EDGE_TOKEN = Pattern.compile("([^\\p{Punct}]|[\\(\\[])+");

    /** Token text allowed at the right edge: no punctuation except closing brackets. */
    private static final Pattern RIGHT_EDGE_TOKEN = Pattern.compile("([^\\p{Punct}]|[\\)\\]])+");

    /**
     * Tells whether a character terminates the character-wise expansion.
     * Equivalent to the character NOT matching the regex class {@code [^\s/\.]}.
     *
     * @param c character to test
     * @return {@code true} for space, tab, line feed, vertical tab (0x0B), form feed,
     *         carriage return, {@code '/'} and {@code '.'}
     */
    private static boolean stopsExpansion(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f' || c == '\r'
                || c == '/' || c == '.';
    }

    /**
     * Creates a copy of the entity whose span is grown in both directions until a
     * stop character (see {@link #stopsExpansion(char)}) or the document edge is hit.
     *
     * @param ne entity to expand; it is not modified
     * @return a clone of {@code ne} carrying the expanded offsets
     */
    private static NamedEntity expandEntity(NamedEntity ne) {
        String text = ne.getCAS().getDocumentText();
        int begin = ne.getBegin();
        int end = ne.getEnd();
        // begin > 0 (not begin - 1 > 0) so that the very first document character
        // can become part of the mention as well.
        while (begin > 0 && !stopsExpansion(text.charAt(begin - 1))) {
            begin--;
        }
        while (end < text.length() && !stopsExpansion(text.charAt(end))) {
            end++;
        }
        NamedEntity result = (NamedEntity) ne.clone();
        result.setBegin(begin);
        result.setEnd(end);
        return result;
    }

    /**
     * Counts the occurrences of a character in a string.
     *
     * @param c character to look for
     * @param s string to scan
     * @return number of positions in {@code s} holding {@code c}
     */
    private static int countChar(char c, String s) {
        int result = 0;
        for (int i = 0; i < s.length(); i++) {
            if (s.charAt(i) == c) {
                result++;
            }
        }
        return result;
    }

    /**
     * Expands a single entity as described in the class comment and, if the span
     * changed, updates the entity in place.
     *
     * @param aJCas the JCas whose indexes contain {@code ne}
     * @param ne    entity to expand
     */
    private static void expandMention(JCas aJCas, NamedEntity ne) {
        NamedEntity expanded = expandEntity(ne);
        if (expanded.getBegin() == ne.getBegin() && expanded.getEnd() == ne.getEnd()) {
            return;
        }

        // Collect the tokens inside the expanded span and locate the tokens that
        // hold the original mention's begin (last candidate) and end (first candidate).
        List<Token> tokens = new ArrayList<Token>();
        int left = -1;
        int right = -1;
        for (Token token : JCasUtil.iterate(Token.class, expanded)) {
            int index = tokens.size();
            tokens.add(token);
            if (token.getBegin() <= ne.getBegin()) {
                left = index;
            }
            if (right < 0 && token.getEnd() >= ne.getEnd()) {
                right = index;
            }
        }
        if (left < 0 || right < 0) {
            // No token alignment for this entity; leave it untouched.
            return;
        }

        // Extend to the first token of the chunk, unless that token's label starts
        // with "VB" (the label is treated as a part-of-speech tag here).
        int initialLeft = left;
        if (left > 0) {
            String label = tokens.get(0).getLabel();
            boolean leadingVerb = label != null && label.startsWith("VB");
            left = leadingVerb ? 1 : 0;
        }
        // Drop punctuation tokens at the left edge, but never move past the original start token.
        while (!LEFT_EDGE_TOKEN.matcher(tokens.get(left).getCoveredText()).matches()
                && left < initialLeft) {
            left++;
        }

        // Extend to the last token of the chunk, unless its label is VB*, JJ* or IN.
        int initialRight = right;
        int last = tokens.size() - 1;
        if (right < last) {
            String label = tokens.get(last).getLabel();
            boolean excludedTail = label != null
                    && (label.startsWith("VB") || label.startsWith("JJ") || label.equals("IN"));
            right = excludedTail ? last - 1 : last;
        }
        // Drop punctuation tokens at the right edge, but never move before the original end token.
        while (!RIGHT_EDGE_TOKEN.matcher(tokens.get(right).getCoveredText()).matches()
                && right > initialRight) {
            right--;
        }

        expanded.setBegin(tokens.get(left).getBegin());
        expanded.setEnd(tokens.get(right).getEnd());
        String text = expanded.getCoveredText();

        // Strip one pair of brackets enclosing the whole span.
        if ((text.startsWith("(") && text.endsWith(")"))
                || (text.startsWith("[") && text.endsWith("]"))) {
            expanded.setBegin(expanded.getBegin() + 1);
            expanded.setEnd(expanded.getEnd() - 1);
            text = expanded.getCoveredText();
        }
        // Strip leading opening brackets that have no closing counterpart.
        while ((text.startsWith("(") && countChar('(', text) > countChar(')', text))
                || (text.startsWith("[") && countChar('[', text) > countChar(']', text))) {
            expanded.setBegin(expanded.getBegin() + 1);
            text = expanded.getCoveredText();
        }
        // Strip trailing closing brackets that have no opening counterpart.
        while ((text.endsWith(")") && countChar(')', text) > countChar('(', text))
                || (text.endsWith("]") && countChar(']', text) > countChar('[', text))) {
            expanded.setEnd(expanded.getEnd() - 1);
            text = expanded.getCoveredText();
        }

        if (expanded.getBegin() != ne.getBegin() || expanded.getEnd() != ne.getEnd()) {
            // begin/end are keys of the annotation index: take the annotation out of
            // the indexes while changing them and re-add it afterwards.
            aJCas.removeFsFromIndexes(ne);
            ne.setBegin(expanded.getBegin());
            ne.setEnd(expanded.getEnd());
            aJCas.addFsToIndexes(ne);
        }
    }

    /**
     * Expands all named entities of the given JCas that do not originate from
     * {@code Constants.CRF}.
     *
     * @param aJCas the JCas to process
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        // Snapshot the entities first: their offsets are modified below, which must
        // not happen while the annotation index is being iterated.
        List<NamedEntity> entities = new ArrayList<NamedEntity>();
        for (NamedEntity ne : JCasUtil.iterate(aJCas, NamedEntity.class)) {
            entities.add(ne);
        }
        for (NamedEntity ne : entities) {
            if (Constants.CRF.equals(ne.getSource())) {
                continue;
            }
            expandMention(aJCas, ne);
        }
    }
}