package is2.tag;
import is2.data.*;
import is2.io.CONLLReader09;
import is2.parser.Parser;
import is2.tag.MFO;
import is2.tools.IPipe;
import is2.util.OptionsSuper;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map.Entry;
final public class ExtractorT2 extends PipeGen implements IPipe {
final static int _MAX = 71;
private static final String STWRD = "STWRD", STPOS = "STPOS";
private static short s_pos, s_word, s_char;
protected static short s_type;
private static int _strp, _ewrd;
static int _CEND;
public String[] types;
final public MFO mf;
final MFO.Data4 d1 = new MFO.Data4(), d2 = new MFO.Data4(), d3 = new MFO.Data4(),
dw = new MFO.Data4(), dwp = new MFO.Data4();
Cluster cl;
private OptionsSuper options;
/**
 * Creates an extractor that uses the given options and feature mapper.
 * Only stores the two references; clusters and lexicon are set up later by
 * {@code createInstances} or {@code read}.
 *
 * @param options configuration read by this extractor
 * @param mf feature mapper used for all registrations and code calculations
 * @throws IOException never thrown by this body; kept in the signature for existing callers
 */
public ExtractorT2(OptionsSuper options, MFO mf) throws IOException {
    this.options = options;
    this.mf = mf;
}
// Maps a POS id to the ids of the POS tags seen directly after it in a sentence;
// built in createInstances and persisted by write/read.
public HashMap<Integer, int[]> _pps = new HashMap<Integer, int[]>();
// Lexicon; created in createInstances or restored by read.
private Lexicon lx;
// Size of the WORD feature counter after the first corpus pass (set in createInstances, restored by read).
// Starts at the int default of 0.
public int corpusWrds;
/**
 * Creates the training instances for the whole file, skipping no sentences.
 * Delegates to {@link #createInstances(String, int, int)} with both skip bounds set to -1.
 *
 * @param file path of the corpus file to read
 * @return the filled instances container
 * @throws Exception if the delegate fails
 */
@Override
public Instances createInstances(String file) throws Exception {
    final int noSkip = -1;
    return createInstances(file, noSkip, noSkip);
}
/**
 * Reads the corpus twice. The first pass registers words, characters and gold POS tags
 * with the feature mapper and records which POS ids follow which. The second pass fills
 * an {@code InstancesTagger} with the sentences.
 *
 * Sentences whose running index lies in {@code [skipStart, skipEnd)} are read but not
 * added when {@code skipStart >= 0}. Reading stops once the running index exceeds
 * {@code options.count}.
 *
 * @param file path of the corpus file to read
 * @param skipStart first sentence index to leave out, or a negative value to skip nothing
 * @param skipEnd sentence index at which skipping ends (exclusive)
 * @return the filled instances container
 * @throws Exception propagated from the reader, cluster or lexicon setup
 */
public Instances createInstances(String file, int skipStart, int skipEnd) throws Exception {
    InstancesTagger is = new InstancesTagger();
    CONLLReader09 depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE);
    depReader.startReading(file);
    mf.register(POS, "<root-POS>");
    mf.register(WORD, "<root>");
    Parser.out.println("Registering feature parts ");

    // ---- pass 1: register vocabulary and collect POS successor sets ----
    HashMap<Integer, HashSet<Integer>> successors = new HashMap<>();
    int sentenceTotal = 0;
    for (SentenceData09 sentence = depReader.getNext(); sentence != null; sentence = depReader.getNext()) {
        sentenceTotal++;

        // The registration order below determines the ids handed out by the mapper; keep it.
        String[] forms = sentence.forms;
        for (String form : forms) {
            mf.register(WORD, form);
        }
        for (String form : forms) {
            registerChars(CHAR, form);
        }
        for (String form : forms) {
            registerChars(CHAR, form.toLowerCase());
        }

        String[] lemmas = sentence.plemmas;
        for (String lemma : lemmas) {
            mf.register(WORD, lemma);
        }
        for (String lemma : lemmas) {
            registerChars(CHAR, lemma);
        }

        String[] tags = sentence.gpos;
        for (String tag : tags) {
            mf.register(POS, tag);
        }
        for (int t = 0; t < tags.length; t++) {
            HashSet<Integer> next = successors.get(mf.getValue(POS, tags[t]));
            if (next == null) {
                next = new HashSet<>();
                successors.put(mf.getValue(POS, tags[t]), next);
            }
            if (t + 1 < tags.length) {
                next.add(mf.getValue(POS, tags[t + 1]));
            }
        }
    }

    // Flatten every successor set into an int[] for _pps.
    for (Entry<Integer, HashSet<Integer>> entry : successors.entrySet()) {
        HashSet<Integer> next = entry.getValue();
        int[] ids = new int[next.size()];
        int at = 0;
        for (Integer id : next) {
            ids[at++] = id;
        }
        _pps.put(entry.getKey(), ids);
    }

    corpusWrds = mf.getFeatureCounter().get(ExtractorT2.WORD);
    Parser.out.println("words in corpus " + corpusWrds);

    cl = options.clusterFile == null ? new Cluster() : new Cluster(options.clusterFile, mf, 6);
    lx = options.lexicon == null ? new Lexicon(new byte[0][0]) : new Lexicon(options.lexicon, mf);

    initFeatures();
    mf.calculateBits();
    initValues();
    Parser.out.println(mf.toString());

    // ---- pass 2: build the instances ----
    depReader.startReading(file);
    int sentenceIndex = 0; // counts every sentence read, skipped or not
    int stored = 0;        // counts only sentences added to 'is'
    Parser.out.print("Creating Instances: ");
    is.init(sentenceTotal, mf);
    int del = 0;
    while (true) {
        if (sentenceIndex % 100 == 0) {
            del = outValue(sentenceIndex, del);
        }
        boolean skip = skipStart >= 0 && sentenceIndex >= skipStart && sentenceIndex < skipEnd;
        if (skip) {
            // Consume the sentence without adding it to 'is'.
            if (depReader.getNext() == null) {
                break;
            }
            sentenceIndex++;
        } else {
            SentenceData09 sentence = depReader.getNext(is);
            if (sentence == null) {
                break;
            }
            is.fillChars(sentence, stored, _CEND);
            for (int k = 0; k < sentence.length(); k++) {
                // NOTE(review): String.contains matches literally, so this only fires when the tag
                // holds a backslash followed by '|'. It also indexes pposs by the running sentence
                // index although fillChars above uses the stored-instance index. Both look
                // unintended; left unchanged here because enabling the branch would change tagger
                // behaviour - confirm the intent before fixing.
                if (sentence.ppos[k].contains("\\|")) {
                    is.pposs[sentenceIndex][k] = (short) mf.getValue(FM, sentence.ppos[k].split("\\|")[1]);
                }
            }
            if (sentenceIndex > options.count) {
                break;
            }
            sentenceIndex++;
            stored++;
        }
    }
    outValue(sentenceIndex, del);
    Parser.out.println();
    types = is2.tag.MFO.reverse(mf.getFeatureSet().get(POS));
    return is;
}
/**
 * Registers every character of {@code word}, in order, as a one-character
 * string under the feature class {@code type}.
 */
private void registerChars(String type, String word) {
    for (char ch : word.toCharArray()) {
        mf.register(type, String.valueOf(ch));
    }
}
/**
 * Caches the feature-bit values of POS, WORD, TYPE and CHAR from the mapper and
 * configures the a* fields of the five Data4 templates used by addFeatures.
 */
@Override
public void initValues() {
    s_pos = mf.getFeatureBits(POS);
    s_word = mf.getFeatureBits(WORD);
    s_type = mf.getFeatureBits(TYPE);
    s_char = mf.getFeatureBits(CHAR);

    // Every template starts with the feature type followed by a POS slot.
    for (MFO.Data4 template : new MFO.Data4[] {d1, d2, d3, dw, dwp}) {
        template.a0 = s_type;
        template.a1 = s_pos;
    }

    // d1: two word slots.
    d1.a2 = s_word;
    d1.a3 = s_word;

    // d2: five POS slots.
    d2.a2 = s_pos;
    d2.a3 = s_pos;
    d2.a4 = s_pos;
    d2.a5 = s_pos;
    d2.a6 = s_pos;

    // d3: six character slots.
    d3.a2 = s_char;
    d3.a3 = s_char;
    d3.a4 = s_char;
    d3.a5 = s_char;
    d3.a6 = s_char;
    d3.a7 = s_char;

    // dw: six word slots.
    dw.a2 = s_word;
    dw.a3 = s_word;
    dw.a4 = s_word;
    dw.a5 = s_word;
    dw.a6 = s_word;
    dw.a7 = s_word;

    // dwp: word, POS, word.
    dwp.a2 = s_word;
    dwp.a3 = s_pos;
    dwp.a4 = s_word;
}
/**
 * Registers the feature-type labels F0..F66 and the special POS, WORD and CHAR
 * symbols, remembering the ids of the POS start marker, the WORD end marker and
 * the CHAR end marker.
 */
@Override
public void initFeatures() {
    // Feature-type labels "F0" .. "F66".
    int t = 0;
    while (t < 67) {
        mf.register(TYPE, "F" + t);
        t++;
    }

    // Special symbols; the call order is kept because it determines the registration sequence.
    mf.register(POS, MID);
    _strp = mf.register(POS, STR);
    mf.register(POS, END);

    mf.register(WORD, STR);
    _ewrd = mf.register(WORD, END);

    _CEND = mf.register(CHAR, END);

    mf.register(WORD, STWRD);
    mf.register(POS, STPOS);
}
/**
 * Computes the feature codes for token {@code i} of sentence {@code ic} and writes them
 * to {@code vs}, followed by an {@code Integer.MIN_VALUE} terminator.
 *
 * The codes do not contain a tag: the v1 slot of the templates is never set here, and the
 * callers in this class add {@code t << s_type} for each candidate tag {@code t}.
 *
 * The feature-type counter {@code f} is advanced by a fixed amount after every group, also
 * when the group is skipped, so each template keeps the same type id for every token.
 * Do not reorder the statements: the templates d1, d2, d3, dw and dwp are shared fields,
 * and many codes rely on values left in them by earlier statements.
 *
 * @param is instances container; {@code is.chars} and {@code is.formlc} are read
 * @param ic sentence index into {@code is}
 * @param fs surface string of the token, used only for the upper-case and digit classes
 * @param i token index within the sentence
 * @param pposs POS ids of the sentence, read at offsets i-3 .. i+2 where those exist
 * @param forms word ids of the sentence
 * @param lemmas lemma ids of the sentence
 * @param vs output buffer; the callers in this class pass an array of length {@code _MAX}
 */
final public void addFeatures(InstancesTagger is, int ic, String fs, int i, short pposs[], int[] forms, int[] lemmas, long[] vs) {
    // is.chars[ic][i][0..5] and [6..10]: presumably leading and trailing characters of the
    // token, and [11] its length (see the "length" feature below) - confirm against fillChars.
    int c0 = is.chars[ic][i][0], c1 = is.chars[ic][i][1], c2 = is.chars[ic][i][2], c3 = is.chars[ic][i][3], c4 = is.chars[ic][i][4], c5 = is.chars[ic][i][5];
    int e0 = is.chars[ic][i][6], e1 = is.chars[ic][i][7], e2 = is.chars[ic][i][8], e3 = is.chars[ic][i][9], e4 = is.chars[ic][i][10];
    int f = 1, n = 0;

    // upper: 0 = no upper-case letter, 1 = only the first character, 2 = only later characters,
    //        3 = the first character and at least one later one.
    // number: 1 = no digit, 2 = first character is a digit, 3 = a digit occurs but not first.
    short upper = 0, number = 1;
    for (int k1 = 0; k1 < fs.length(); k1++) {
        char c = fs.charAt(k1);
        if (Character.isUpperCase(c)) {
            if (k1 == 0) {
                upper = 1;
            } else {
                if (upper == 1) {
                    upper = 3;
                } else if (upper == 0) {
                    upper = 2;
                }
            }
        }
        if (Character.isDigit(c) && k1 == 0) {
            number = 2;
        } else if (Character.isDigit(c) && number == 1) {
            number = 3;
        }
    }

    // form2 is -1 for word ids at or above corpusWrds (the WORD count after the first corpus pass).
    int form = forms[i], form2 = forms[i] < corpusWrds ? forms[i] : -1;
    int len = forms.length;
    long l;

    // word form and lower-cased form
    d1.v0 = f++;
    d1.v2 = form2;
    vs[n++] = mf.calc3(d1);
    d1.v0 = f++;
    d1.v2 = is.formlc[ic][i];
    vs[n++] = mf.calc3(d1);

    // c0..c4 in growing windows
    d3.v2 = c0;
    d3.v3 = c1;
    d3.v4 = c2;
    d3.v5 = c3;
    d3.v6 = c4;
    d3.v0 = f++;
    vs[n++] = mf.calc3(d3);
    d3.v0 = f++;
    vs[n++] = mf.calc4(d3);
    d3.v0 = f++;
    vs[n++] = mf.calc5(d3);
    d3.v0 = f++;
    vs[n++] = mf.calc6(d3);
    d3.v0 = f++;
    vs[n++] = mf.calc7(d3);

    // c2..c5 combined with the cluster value of the word
    if (form != -1) {
        d3.v2 = c2;
        d3.v3 = c3;
        d3.v4 = c4;
        d3.v5 = c5;
        d3.v6 = cl.getLP(form);
        d3.v0 = f;
        vs[n++] = mf.calc6(d3);
        d3.v0 = f + 1;
        vs[n++] = mf.calc7(d3);
    }
    f += 2;

    // shorter windows, cluster value in the last used slot
    if (form > 0) {
        d3.v0 = f;
        d3.v5 = cl.getLP(form);
        vs[n++] = mf.calc6(d3);
        d3.v0 = f + 1;
        d3.v4 = cl.getLP(form);
        vs[n++] = mf.calc5(d3);
        d3.v0 = f + 2;
        d3.v3 = cl.getLP(form);
        vs[n++] = mf.calc4(d3);
    }
    f += 5;

    // e0..e4 in growing windows, each longer window also combined with the upper-case class
    d3.v2 = e0;
    d3.v3 = e1;
    d3.v4 = e2;
    d3.v5 = e3;
    d3.v6 = e4;
    d3.v0 = f++;
    vs[n++] = mf.calc3(d3);
    d3.v0 = f++;
    vs[n++] = l = mf.calc4(d3);
    vs[n++] = d3.calcs(3, upper, l);
    d3.v0 = f++;
    vs[n++] = l = mf.calc5(d3);
    vs[n++] = d3.calcs(3, upper, l);
    d3.v0 = f++;
    vs[n++] = l = mf.calc6(d3);
    vs[n++] = d3.calcs(3, upper, l);
    d3.v0 = f++;
    vs[n++] = l = mf.calc7(d3);
    vs[n++] = d3.calcs(3, upper, l);

    // e* windows with cluster value, then lexicon tag with cluster value
    if (form > 0) {
        d3.v0 = f;
        d3.v5 = cl.getLP(form);
        vs[n++] = mf.calc6(d3);
        d3.v0 = f + 1;
        d3.v4 = cl.getLP(form);
        vs[n++] = mf.calc5(d3);
        d3.v0 = f + 2;
        d3.v3 = cl.getLP(form);
        vs[n++] = mf.calc4(d3);
        d3.v2 = e0;
        d3.v3 = e1;
        d3.v4 = e2;
        d3.v0 = f + 3;
        d3.v2 = lx.getTag(form);
        vs[n++] = mf.calc3(d3);
        d3.v0 = f + 4;
        d3.v4 = cl.getLP(form);
        vs[n++] = mf.calc5(d3);
        d3.v0 = f + 5;
        d3.v3 = cl.getLP(form);
        vs[n++] = mf.calc4(d3);
    }
    f += 6;

    // sign three-grams
    d3.v0 = f++;
    d3.v2 = c1;
    d3.v3 = c2;
    d3.v4 = c3;
    vs[n++] = mf.calc5(d3);
    d3.v0 = f++;
    d3.v2 = c2;
    d3.v3 = c3;
    d3.v4 = c4;
    vs[n++] = mf.calc5(d3);
    d3.v0 = f++;
    d3.v2 = c3;
    d3.v3 = c4;
    d3.v4 = c5;
    vs[n++] = mf.calc5(d3);

    // sign quad-grams
    d3.v0 = f++;
    d3.v2 = c1;
    d3.v3 = c2;
    d3.v4 = c3;
    d3.v5 = c4;
    vs[n++] = mf.calc6(d3);
    d3.v0 = f++;
    d3.v2 = c2;
    d3.v3 = c3;
    d3.v4 = c4;
    d3.v5 = c5;
    vs[n++] = mf.calc6(d3);

    // next word together with the current word
    if (i + 1 < len && forms[i + 1] < this.corpusWrds) {
        dw.v0 = f;
        dw.v2 = forms[i + 1];
        dw.v3 = form2;
        vs[n++] = mf.calc4(dw);
    }
    f++;

    // features over the following tokens; type ids f .. f+14 are reserved for this group
    if (len > i + 1) {
        if (forms[i + 1] < corpusWrds) {
            dw.v0 = f;
            dw.v2 = forms[i + 1];
            vs[n++] = mf.calc3(dw);
        }
        d3.v0 = f + 1;
        d3.v2 = is.chars[ic][i + 1][0];
        vs[n++] = mf.calc3(d3);
        d3.v0 = f + 2;
        d3.v2 = is.chars[ic][i + 1][6];
        vs[n++] = mf.calc3(d3);
        d3.v2 = e0;
        d3.v3 = e1;
        d3.v0 = f + 3;
        d3.v4 = is.chars[ic][i + 1][0];
        vs[n++] = mf.calc5(d3);
        d3.v0 = f + 4;
        d3.v4 = is.chars[ic][i + 1][6];
        vs[n++] = mf.calc5(d3);
        if (is.chars[ic][i + 1][11] > 1) { // instance.forms[i+1].length()
            d3.v0 = f + 5;
            d3.v2 = is.chars[ic][i + 1][0];
            d3.v3 = is.chars[ic][i + 1][1];
            vs[n++] = mf.calc4(d3);
            d3.v0 = f + 6;
            d3.v2 = is.chars[ic][i + 1][6];
            d3.v3 = is.chars[ic][i + 1][7];
            vs[n++] = mf.calc4(d3);
            d3.v2 = e0;
            d3.v3 = e1;
            d3.v0 = f + 7;
            d3.v4 = is.chars[ic][i + 1][0];
            d3.v5 = is.chars[ic][i + 1][1];
            vs[n++] = mf.calc6(d3);
            d3.v0 = f + 8;
            d3.v4 = is.chars[ic][i + 1][6];
            d3.v5 = is.chars[ic][i + 1][7];
            vs[n++] = mf.calc6(d3);
            if (forms[i + 1] > 0) {
                d3.v0 = f + 9;
                d3.v2 = is.chars[ic][i + 1][0];
                d3.v3 = is.chars[ic][i + 1][1];
                d3.v4 = cl.getLP(forms[i + 1]);
                vs[n++] = mf.calc5(d3);
                d3.v0 = f + 10;
                d3.v2 = is.chars[ic][i + 1][6];
                d3.v3 = is.chars[ic][i + 1][7];
                d3.v4 = cl.getLP(forms[i + 1]);
                vs[n++] = mf.calc5(d3);
            }
        }
        if (forms[i + 1] > 0) {
            dw.v0 = f + 11;
            dw.v2 = cl.getLP(forms[i + 1]);
            dw.v3 = form2;
            vs[n++] = mf.calc4(dw);
        }
        if (len > i + 2) {
            if (forms[i + 2] < corpusWrds && forms[i + 1] < corpusWrds) {
                dw.v0 = f + 12;
                dw.v2 = forms[i + 2];
                dw.v3 = forms[i + 1];
                vs[n++] = mf.calc4(dw);
                vs[n++] = mf.calc3(dw);
            }
            d2.v0 = f + 13;
            d2.v2 = pposs[i + 1];
            d2.v3 = pposs[i + 2];
            vs[n++] = mf.calc4(d2);
        }
        if (len > i + 3) {
            if (forms[i + 3] < this.corpusWrds && forms[i + 2] < this.corpusWrds) {
                dw.v0 = f + 14;
                dw.v2 = forms[i + 3];
                dw.v3 = forms[i + 2];
                vs[n++] = mf.calc4(dw);
                vs[n++] = mf.calc3(dw);
            }
        }
    }
    f += 15;

    // length
    d2.v0 = f++;
    d2.v2 = is.chars[ic][i][11];
    vs[n++] = mf.calc3(d2);

    // contains a number
    d2.v0 = f++;
    d2.v2 = number;
    vs[n++] = mf.calc3(d2);

    // lemma of the current token
    if (lemmas[i] < corpusWrds) {
        d1.v0 = f;
        d1.v2 = lemmas[i];
        vs[n++] = mf.calc3(d1);
    }
    f++;

    // surrounding lemmas and surrounding POS ids
    if (i != 0 && len > i + 1) {
        if (lemmas[i - 1] < corpusWrds && lemmas[i + 1] < corpusWrds) {
            dw.v0 = f;
            dw.v2 = lemmas[i - 1];
            dw.v3 = lemmas[i + 1];
            vs[n++] = mf.calc4(dw);
        }
        d2.v0 = f + 1;
        d2.v2 = pposs[i - 1];
        d2.v3 = pposs[i + 1];
        vs[n++] = mf.calc4(d2);
    }
    f += 2;

    // previous POS id, or the start marker for the first token
    d2.v0 = f++;
    d2.v2 = i >= 1 ? pposs[i - 1] : _strp;
    vs[n++] = mf.calc3(d2);

    // features over the preceding tokens; i >= 1 holds throughout this block
    if (i > 0) {
        dw.v0 = f;
        dw.v2 = forms[i - 1] < corpusWrds ? forms[i - 1] : -1;
        vs[n++] = mf.calc3(dw);
        f++;
        if (lemmas[i - 1] < corpusWrds) {
            dw.v0 = f;
            dw.v2 = lemmas[i - 1];
            vs[n++] = mf.calc3(dw);
        }
        f++;
        if (i > 1) {
            d2.v0 = f++;
            d2.v2 = pposs[i - 2];
            vs[n++] = mf.calc3(d2);
            d2.v0 = f++;
            d2.v2 = pposs[i - 1];
            d2.v3 = pposs[i - 2];
            vs[n++] = mf.calc4(d2);
            if (forms[i - 2] < corpusWrds) {
                dw.v0 = f;
                dw.v2 = forms[i - 2];
                vs[n++] = mf.calc3(dw);
            }
            f++;
            if (forms[i - 1] < corpusWrds) {
                dwp.v0 = f;
                dwp.v2 = forms[i - 1];
                dwp.v3 = pposs[i - 2];
                vs[n++] = mf.calc4(dwp);
            }
            f++;
            if (forms[i - 2] < corpusWrds) {
                dwp.v0 = f;
                dwp.v2 = forms[i - 2];
                dwp.v3 = pposs[i - 1];
                vs[n++] = mf.calc4(dwp);
            }
            f++;
            if (i > 2) {
                d2.v0 = f++;
                d2.v2 = pposs[i - 3];
                vs[n++] = mf.calc3(d2);
                d2.v0 = f++;
                d2.v2 = pposs[i - 2];
                d2.v3 = pposs[i - 3];
                vs[n++] = mf.calc4(d2);
                if (forms[i - 3] < this.corpusWrds && forms[i - 2] < this.corpusWrds) {
                    dw.v0 = f;
                    dw.v2 = forms[i - 3];
                    dw.v3 = forms[i - 2];
                    vs[n++] = mf.calc4(dw);
                }
                f++;
            }
        }
    }

    // terminator read by the scoring loops
    vs[n] = Integer.MIN_VALUE;
}
/**
 * Scores every tag in {@code types} for token {@code w1} of sentence {@code n} and returns
 * the index of the highest-scoring one. Whenever a tag beats the best score so far, that
 * score is also stored in {@code score[w1]}.
 *
 * @return index into {@code types} of the best tag, or -1 when no tag scores above -1000
 */
public int fillFeatureVectorsOne(String fs, ParametersFloat params, int w1, InstancesTagger is, int n, short[] pos, Long2IntInterface li, float[] score) {
    // With lemmas disabled an all-zero lemma array of sentence length is used instead.
    int[] lemmaIds = options.noLemmas ? new int[is.length(n)] : is.plemmas[n];

    long[] codes = new long[_MAX];
    addFeatures(is, n, fs, w1, pos, is.forms[n], lemmaIds, codes);

    F2SF scorer = new F2SF(params.parameters);
    float topScore = -1000;
    int topTag = -1;
    for (int tag = 0; tag < types.length; tag++) {
        int tagOffset = tag << s_type;
        scorer.clear();
        int k = 0;
        while (codes[k] != Integer.MIN_VALUE) {
            // Only positive codes are mapped and scored.
            if (codes[k] > 0) {
                scorer.add(li.l2i(codes[k] + tagOffset));
            }
            k++;
        }
        if (scorer.score > topScore) {
            topTag = tag;
            topScore = scorer.score;
            score[w1] = topScore;
        }
    }
    return topTag;
}
/**
 * Scores every tag in {@code types} for token {@code w1} of sentence {@code n} and
 * returns all of them as {@code POS} objects, sorted with {@code Collections.sort}.
 *
 * @return one entry per tag, in the natural order defined by {@code POS}
 */
public ArrayList<POS> classify(String fs, ParametersFloat params, int w1, InstancesTagger is, int n, short[] pos, Long2IntInterface li) {
    // With lemmas disabled an all-zero lemma array of sentence length is used instead.
    int[] lemmaIds = options.noLemmas ? new int[is.length(n)] : is.plemmas[n];

    long[] codes = new long[_MAX];
    addFeatures(is, n, fs, w1, pos, is.forms[n], lemmaIds, codes);

    F2SF scorer = new F2SF(params.parameters);
    ArrayList<POS> ranked = new ArrayList<>(types.length);
    int tag = 0;
    while (tag < types.length) {
        int tagOffset = tag << s_type;
        scorer.clear();
        scorer.add(codes, li, tagOffset);
        ranked.add(new POS(tag, scorer.score));
        tag++;
    }
    Collections.sort(ranked);
    return ranked;
}
/**
 * Writes the extractor state to {@code dos}: the clusters, the lexicon, {@code corpusWrds},
 * and then the {@code _pps} map as an entry count followed, per entry, by the array
 * length, the array values and the key. {@code read} consumes the same layout.
 *
 * @param dos stream to write to; it is not closed here
 * @throws IllegalStateException if the stream reports an I/O error. Previously the error
 *         was only printed, which left a truncated model behind without the caller noticing.
 */
@Override
public void write(DataOutputStream dos) {
    try {
        this.cl.write(dos);
        this.lx.write(dos);
        dos.writeInt(this.corpusWrds);
        dos.writeInt(_pps.size());
        for (Entry<Integer, int[]> e : _pps.entrySet()) {
            int[] followers = e.getValue();
            dos.writeInt(followers.length);
            for (int k : followers) {
                dos.writeInt(k);
            }
            // The key is written after its values; read() expects this order.
            dos.writeInt(e.getKey());
        }
    } catch (IOException e) {
        throw new IllegalStateException("Failed to write ExtractorT2 state", e);
    }
}
/**
 * Restores the extractor state written by {@code write}: the clusters, the lexicon,
 * {@code corpusWrds}, and the {@code _pps} entries (array length, array values, key).
 * Entries are added to the existing {@code _pps} map.
 *
 * @param dis stream to read from; it is not closed here
 * @throws IllegalStateException if the stream reports an I/O error. Previously the error
 *         was only printed, which left the extractor partially initialised.
 */
public void read(DataInputStream dis) {
    try {
        this.cl = new Cluster(dis);
        this.lx = new Lexicon(dis);
        this.corpusWrds = dis.readInt();
        int pc = dis.readInt();
        for (int j = 0; j < pc; j++) {
            int ps[] = new int[dis.readInt()];
            for (int k = 0; k < ps.length; k++) {
                ps[k] = dis.readInt();
            }
            // The key follows its values in the stream.
            _pps.put(dis.readInt(), ps);
        }
    } catch (IOException e) {
        throw new IllegalStateException("Failed to read ExtractorT2 state", e);
    }
}
}