package org.cogroo.tools.shallowparser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.cogroo.tools.chunker2.ChunkerContextGenerator;
import org.cogroo.tools.chunker2.TokenTag;
import opennlp.tools.util.Span;
/**
 * Context generator that builds string features for one position of a
 * sequence from the tokens, tags, chunk labels and previously assigned
 * outcomes found in a five-element window around that position.
 *
 * <p>The {@link TokenTag} based entry points expect every tag to have the
 * form {@code tag|chunk}; the part before the first {@code '|'} is used as
 * the tag and the remainder as the chunk label.
 */
public class ShallowParserContextGenerator implements ChunkerContextGenerator {

  /**
   * Generates the features for position {@code index}.
   *
   * @param index position in {@code sequence} to generate features for
   * @param sequence tokens paired with their {@code tag|chunk} strings
   * @param priorDecisions outcomes already assigned to earlier positions
   * @param additionalContext ignored by this implementation
   * @return the feature strings for the position
   */
  @Override
  public String[] getContext(int index, TokenTag[] sequence,
      String[] priorDecisions, Object[] additionalContext) {
    return getContext(index, sequence, priorDecisions);
  }

  /**
   * Generates the features for position {@code i} from parallel token and
   * tag arrays, pairing them through {@code TokenTag.create}.
   *
   * @param i position to generate features for
   * @param toks tokens of the sequence
   * @param tags {@code tag|chunk} strings, one per token
   * @param preds outcomes already assigned to earlier positions
   * @return the feature strings for the position
   */
  @Override
  public String[] getContext(int i, String[] toks, String[] tags, String[] preds) {
    return getContext(i, TokenTag.create(toks, tags), preds);
  }

  /**
   * Splits each {@code tag|chunk} string of {@code sequence} into its two
   * parts and generates the features for position {@code index}.
   *
   * @param index position in {@code sequence} to generate features for
   * @param sequence tokens paired with their {@code tag|chunk} strings
   * @param priorDecisions outcomes already assigned to earlier positions
   * @return the feature strings for the position
   * @throws IllegalArgumentException if a tag does not contain {@code '|'}
   */
  public String[] getContext(int index, TokenTag[] sequence, String[] priorDecisions) {
    String[] toks = new String[sequence.length];
    String[] tags = new String[sequence.length];
    String[] chunks = new String[sequence.length];
    for (int i = 0; i < sequence.length; i++) {
      toks[i] = sequence[i].getToken();
      String combined = sequence[i].getTag();
      int bar = combined.indexOf('|');
      if (bar < 0) {
        // Fail with a descriptive message instead of the opaque
        // StringIndexOutOfBoundsException that substring(0, -1) would raise.
        throw new IllegalArgumentException(
            "Expected a tag of the form 'tag|chunk' at position " + i
                + " but found: " + combined);
      }
      tags[i] = combined.substring(0, bar);
      chunks[i] = combined.substring(bar + 1);
    }
    return getContext(index, toks, tags, chunks, priorDecisions);
  }

  /**
   * Generates the features for position {@code i}.
   *
   * <p>Features cover the words, tags and chunk labels at offsets -2..+2,
   * the types of the chunk spans around the span containing {@code i}, the
   * two previous outcomes, the types of the two outcome spans preceding the
   * span that contains position {@code i - 1}, and several conjunctions of
   * those. Positions before the start are rendered as {@code bos}, positions
   * past the end as {@code eos}.
   *
   * <p>NOTE(review): the {@code p_2} feature is now written with the same
   * {@code name=value} form as every other feature, and {@code cs2} now
   * reads the span two places ahead rather than repeating the span used by
   * {@code cs1}. Both change the emitted strings, so models trained with
   * differently formatted features may need retraining - confirm.
   *
   * @param i position to generate features for
   * @param toks tokens of the sequence
   * @param tags tags, one per token
   * @param chks chunk labels, one per token
   * @param preds outcomes already assigned; positions {@code i - 1} and
   *     {@code i - 2} are read when they exist
   * @return the feature strings for the position
   */
  public String[] getContext(int i, String[] toks, String[] tags, String[] chks, String[] preds) {
    // The sentence length is taken from toks for every window lookup.
    int length = toks.length;

    // Words in a 5-word window
    String w_2 = windowFeature("w_2", toks, i - 2, length);
    String w_1 = windowFeature("w_1", toks, i - 1, length);
    String w0 = "w0=" + toks[i];
    String w1 = windowFeature("w1", toks, i + 1, length);
    String w2 = windowFeature("w2", toks, i + 2, length);

    // Tags in a 5-word window
    String t_2 = windowFeature("t_2", tags, i - 2, length);
    String t_1 = windowFeature("t_1", tags, i - 1, length);
    String t0 = "t0=" + tags[i];
    String t1 = windowFeature("t1", tags, i + 1, length);
    String t2 = windowFeature("t2", tags, i + 2, length);

    // Chunks in a 5-word window
    String c_2 = windowFeature("c_2", chks, i - 2, length);
    String c_1 = windowFeature("c_1", chks, i - 1, length);
    String c0 = "c0=" + chks[i];
    String c1 = windowFeature("c1", chks, i + 1, length);
    String c2 = windowFeature("c2", chks, i + 2, length);

    // Previous predictions
    String p_2 = windowFeature("p_2", preds, i - 2, length);
    String p_1 = windowFeature("p_1", preds, i - 1, length);

    Span[] chunkSpans = phrasesAsSpanList(chks);
    Span[] predSpans = phrasesAsSpanList(preds);

    // Types of the two prediction spans that precede the span holding the
    // previous token; the span holding the previous token itself is not
    // emitted as a feature.
    int indexPreds = indexOfSpanContaining(predSpans, i - 1);
    String ps_2 = spanFeature("ps_2", predSpans, indexPreds - 2);
    String ps_1 = spanFeature("ps_1", predSpans, indexPreds - 1);

    // Types of the chunk spans in a 5-span window around the current token.
    int indexChunks = indexOfSpanContaining(chunkSpans, i);
    String cs_2 = spanFeature("cs_2", chunkSpans, indexChunks - 2);
    String cs_1 = spanFeature("cs_1", chunkSpans, indexChunks - 1);
    String cs0 = spanFeature("cs0", chunkSpans, indexChunks);
    String cs1 = spanFeature("cs1", chunkSpans, indexChunks + 1);
    String cs2 = spanFeature("cs2", chunkSpans, indexChunks + 2);

    String[] features = new String[] {
        //add word features
        w_2,
        w_1,
        w0,
        w1,
        w2,
        w_1 + w0,
        w0 + w1,

        //add tag features
        t_2,
        t_1,
        t0,
        t1,
        t2,
        t_2 + t_1,
        t_1 + t0,
        t0 + t1,
        t1 + t2,
        t_2 + t_1 + t0,
        t_1 + t0 + t1,
        t0 + t1 + t2,

        //add chks features
        c_2,
        c_1,
        c0,
        c1,
        c2,
        c_2 + c_1,
        c_1 + c0,
        c0 + c1,
        c1 + c2,
        c_2 + c_1 + c0,
        c_1 + c0 + c1,
        c0 + c1 + c2,

        //add chks span features
        cs_2,
        cs_1,
        cs0,
        cs1,
        cs2,
        cs_2 + cs_1,
        cs_1 + cs0,
        cs0 + cs1,
        cs1 + cs2,
        cs_2 + cs_1 + cs0,
        cs_1 + cs0 + cs1,
        cs0 + cs1 + cs2,

        //add pred tags
        p_2,
        p_1,
        p_2 + p_1,

        //add pred span tags
        ps_2,
        ps_1,
        ps_2 + ps_1,

        //add pred and tag
        p_1 + t_2,
        p_1 + t_1,
        p_1 + t0,
        p_1 + t1,
        p_1 + t2,
        p_1 + t_2 + t_1,
        p_1 + t_1 + t0,
        p_1 + t0 + t1,
        p_1 + t1 + t2,
        p_1 + t_2 + t_1 + t0,
        p_1 + t_1 + t0 + t1,
        p_1 + t0 + t1 + t2,

        //add pred and chunk
        p_1 + c_2,
        p_1 + c_1,
        p_1 + c0,
        p_1 + c1,
        p_1 + c2,
        p_1 + c_2 + c_1,
        p_1 + c_1 + c0,
        p_1 + c0 + c1,
        p_1 + c1 + c2,
        p_1 + c_2 + c_1 + c0,
        p_1 + c_1 + c0 + c1,
        p_1 + c0 + c1 + c2,

        //add pred and word
        p_1 + w_2,
        p_1 + w_1,
        p_1 + w0,
        p_1 + w1,
        p_1 + w2,
        p_1 + w_1 + w0,
        p_1 + w0 + w1,

        //add tag and chunk at the same offset
        t_2 + c_2,
        t_1 + c_1,
        t0 + c0,
        t1 + c1,
        t2 + c2
    };

    return features;
  }

  /**
   * Converts a sequence of chunk labels into spans. A label {@code O} is
   * treated as {@code B-O}, so every outside token forms its own span of
   * type {@code O}. A label equal to {@code "I-" + type} of the currently
   * open span extends that span; any other label closes the open span and
   * opens a new one whose type is the label without its first two
   * characters.
   *
   * @param aChunksIn chunk labels; the array is not modified
   * @return the spans in sequence order, with exclusive end indexes
   */
  public static Span[] phrasesAsSpanList(String[] aChunksIn) {
    // initialize with the list maximum size
    List<Span> phrases = new ArrayList<Span>(aChunksIn.length);

    // Normalize on a copy so the caller's array is left untouched.
    String[] aChunks = Arrays.copyOf(aChunksIn, aChunksIn.length);
    for (int i = 0; i < aChunks.length; i++) {
      if (aChunks[i].equals("O")) {
        aChunks[i] = "B-O";
      }
    }

    String startTag = "";
    int startIndex = 0;
    boolean foundPhrase = false;

    for (int ci = 0; ci < aChunks.length; ci++) {
      String pred = aChunks[ci];
      if (pred.equals("I-" + startTag)) {
        // continuation of the open span
        continue;
      }
      // Anything that is not a continuation starts a new span; no "O"
      // labels remain after normalization, so a span is never closed
      // without another one opening.
      if (foundPhrase) {
        phrases.add(new Span(startIndex, ci, startTag));
      }
      startIndex = ci;
      startTag = pred.substring(2);
      foundPhrase = true;
    }

    if (foundPhrase) { // leftover
      phrases.add(new Span(startIndex, aChunks.length, startTag));
    }

    return phrases.toArray(new Span[phrases.size()]);
  }

  /**
   * Renders {@code name=value} for {@code values[pos]}, using {@code bos}
   * when {@code pos} is negative and {@code eos} when it is at or past
   * {@code length}.
   */
  private static String windowFeature(String name, String[] values, int pos, int length) {
    if (pos < 0) {
      return name + "=bos";
    }
    if (pos >= length) {
      return name + "=eos";
    }
    return name + "=" + values[pos];
  }

  /**
   * Renders {@code name=type} for {@code spans[pos]}, using {@code bos}
   * when {@code pos} is negative and {@code eos} when it is past the last
   * span.
   */
  private static String spanFeature(String name, Span[] spans, int pos) {
    if (pos < 0) {
      return name + "=bos";
    }
    if (pos >= spans.length) {
      return name + "=eos";
    }
    return name + "=" + spans[pos].getType();
  }

  /** Returns the index of the first span containing {@code position}, or -1. */
  private static int indexOfSpanContaining(Span[] spans, int position) {
    for (int j = 0; j < spans.length; j++) {
      if (spans[j].contains(position)) {
        return j;
      }
    }
    return -1;
  }
}