package is2.mtag;
import is2.data.*;
import is2.io.CONLLReader09;
import is2.parser.Parser;
import is2.tools.IPipe;
import is2.util.OptionsSuper;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map.Entry;
final public class Pipe extends PipeGen implements IPipe {
// Id returned by mf.register(CHAR, END) in initFeatures(); handed to InstancesTagger.fillChars in createInstances().
public static int _CEND;
// Marker strings registered as WORD / POS / CHAR values in initFeatures().
private static final String STWRD = "STWRD", STPOS = "STPOS", END = "END", STR = "STR";
// Filled in createInstances(): types[id] is the FEAT string registered under that id.
public String[] types;
// Created in createInstances() (empty, or loaded from options.clusterFile); queried via getLP in addCF() and serialized in write().
Cluster cl;
// Registry used for all register / getValue / calcN calls of this pipe.
final public MFO mf = new MFO();
// Converts the long feature codes to ints in fillFeatureVectorsOne(); only assigned by the two-argument constructor.
public Long2IntInterface li;
// Reusable descriptors: their a-fields are set from MFO.getFeatureBits in initValues(), their v-fields are loaded in addCF().
final MFO.Data4 d1 = new MFO.Data4(), d2 = new MFO.Data4(), d3 = new MFO.Data4(), dw = new MFO.Data4();
final MFO.Data4 dwp = new MFO.Data4(), dp = new MFO.Data4();
// Options passed to the constructor; createInstances() reads count and clusterFile from it.
private OptionsSuper options;
// Id returned by mf.register(WORD, END) in initFeatures(); used in addCF() when there is no following token.
private int _ewrd;
// _strp / _endp: ids of STR / END registered under POS in initFeatures().
// NOTE(review): _mid is never assigned in the code visible here (its registration is commented out).
static private int _mid, _strp, _endp;
/**
 * Creates a pipe that can score feature vectors.
 *
 * @param options  configuration stored for later use by {@link #createInstances(String)}
 * @param long2Int mapping applied to every feature code in
 *                 {@code fillFeatureVectorsOne} before it is added to the score
 * @throws IOException declared for callers; nothing in this body throws it
 */
public Pipe(Options options, Long2Int long2Int) throws IOException {
    this.li = long2Int;
    this.options = options;
}
/**
 * Creates a pipe without a long-to-int mapping; {@code li} stays {@code null}
 * until it is assigned from outside.
 *
 * @param options configuration stored for later use by {@link #createInstances(String)}
 */
public Pipe(OptionsSuper options) {
    super();
    this.options = options;
}
// Maps the WORD id of a lower-cased form to a FEAT id. createInstances() adds an entry for every
// lower-cased form that was seen more than 10 times and always with one and the same morphologic
// feature string; writeMap()/readMap() serialize it.
public HashMap<Integer, Integer> form2morph = new HashMap<>();
/**
 * Builds the tagger instances for a CoNLL-09 file. The file is read three times:
 * <ol>
 * <li>Register every form, lower-cased form, predicted lemma, their characters, the
 * predicted/gold POS tags and the gold morphologic features with {@link #mf}; collect,
 * per lower-cased form, its frequency and the set of feature strings it occurred with,
 * and derive {@link #form2morph} from that. Predicted features (pfeats) are not
 * registered.</li>
 * <li>Read the sentences through {@code getNext(is)} (presumably storing them in the
 * instances container - confirm in CONLLReader09), reporting progress every 100
 * sentences.</li>
 * <li>Call {@code fillChars} for every sentence.</li>
 * </ol>
 * Passes two and three stop once the running index exceeds {@code options.count}.
 * The order of the {@code mf.register} calls is kept exactly as it determines the ids.
 *
 * @param file path handed to {@code CONLLReader09.startReading}
 * @return the populated {@code InstancesTagger}
 * @throws Exception propagated from the reader or the registry
 */
@Override
public Instances createInstances(String file) throws Exception {
    final CONLLReader09 reader = new CONLLReader09(CONLLReader09.NO_NORMALIZE);
    reader.startReading(file);

    mf.register(POS, "<root-POS>");
    mf.register(FEAT, CONLLReader09.NO_TYPE);
    mf.register(FEAT, "");

    final InstancesTagger instances = new InstancesTagger();
    Parser.out.println("Registering feature parts ");

    // ---- pass 1: register symbols and gather form statistics ----
    final HashMap<String, HashSet<String>> morphsByForm = new HashMap<>();
    final HashMap<String, Integer> formCounts = new HashMap<>();
    int sentenceCount = 0;
    SentenceData09 sentence;
    while ((sentence = reader.getNext()) != null) {
        sentenceCount++;

        final String[] forms = sentence.forms;
        for (String form : forms) {
            mf.register(WORD, form);
        }
        for (String form : forms) {
            registerChars(CHAR, form);
        }
        for (int k = 0; k < forms.length; k++) {
            final String lower = forms[k].toLowerCase();
            mf.register(WORD, lower);

            final Integer seen = formCounts.get(lower);
            formCounts.put(lower, seen == null ? 1 : seen + 1);

            HashSet<String> morphs = morphsByForm.get(lower);
            if (morphs == null) {
                morphs = new HashSet<>();
                morphsByForm.put(lower, morphs);
            }
            // a missing gold feature is recorded as "_"
            final String morph = sentence.ofeats[k];
            morphs.add(morph == null ? "_" : morph);
        }
        for (String form : forms) {
            registerChars(CHAR, form.toLowerCase());
        }

        final String[] lemmas = sentence.plemmas;
        for (String lemma : lemmas) {
            mf.register(WORD, lemma);
        }
        for (String lemma : lemmas) {
            registerChars(CHAR, lemma);
        }

        for (String pos : sentence.ppos) {
            mf.register(POS, pos);
        }
        for (String pos : sentence.gpos) {
            mf.register(POS, pos);
        }
        for (String morph : sentence.ofeats) {
            if (morph != null) {
                mf.register(FEAT, morph);
            }
        }
    }

    // forms that are frequent (> 10) and unambiguous get a direct form -> feature mapping
    for (Entry<String, HashSet<String>> entry : morphsByForm.entrySet()) {
        final HashSet<String> morphs = entry.getValue();
        if (morphs.size() == 1 && formCounts.get(entry.getKey()) > 10) {
            form2morph.put(mf.getValue(Pipe.WORD, entry.getKey()), mf.getValue(FEAT, morphs.iterator().next()));
        }
    }

    initFeatures();
    mf.calculateBits();
    initValues();
    Parser.out.println("" + mf.toString());

    // ---- pass 2: read the sentences into the instances container ----
    reader.startReading(file);
    int read = 0;
    final long startMillis = System.currentTimeMillis();
    Parser.out.print("Creating Features: ");
    instances.init(sentenceCount, mf);
    int del = 0;
    for (;;) {
        if (read % 100 == 0) {
            del = outValue(read, del);
        }
        if (reader.getNext(instances) == null || read > options.count) {
            break;
        }
        read++;
    }
    final long endMillis = System.currentTimeMillis();
    System.gc();
    final Runtime runtime = Runtime.getRuntime();
    final long usedBytes = runtime.totalMemory() - runtime.freeMemory();
    Parser.out.print(" time " + (endMillis - startMillis) + " mem " + (usedBytes / 1024) + " kb");

    // invert the FEAT registry: id -> feature string
    types = new String[mf.getFeatureCounter().get(FEAT)];
    for (Entry<String, Integer> registered : is2.mtag.MFO.getFeatureSet().get(FEAT).entrySet()) {
        types[registered.getValue()] = registered.getKey();
    }

    cl = options.clusterFile == null ? new Cluster() : new Cluster(options.clusterFile, mf, 6);
    Parser.out.println("Num Features: " + types.length);

    // ---- pass 3: fill the per-token character tables ----
    reader.startReading(file);
    int filled = 0;
    for (SentenceData09 s = reader.getNext(); s != null; s = reader.getNext()) {
        instances.fillChars(s, filled, _CEND);
        if (filled > options.count) {
            break;
        }
        filled++;
    }
    return instances;
}
/**
 * Registers every UTF-16 code unit of {@code word} as a one-character string
 * under the given feature type, from left to right.
 *
 * @param type feature type to register under
 * @param word text whose characters are registered
 */
private void registerChars(String type, String word) {
    for (char unit : word.toCharArray()) {
        mf.register(type, String.valueOf(unit));
    }
}
/**
 * Reads the widths reported by {@code MFO.getFeatureBits} for each feature type into the
 * static {@code s_*} fields and lays out the a-fields of the six descriptors.
 * Slot 0 of every descriptor is the template type and slot 1 the morphologic feature;
 * the remaining slots differ per descriptor (see the comments below).
 */
@Override
public void initValues() {
    s_feat = is2.mtag.MFO.getFeatureBits(FEAT);
    s_word = is2.mtag.MFO.getFeatureBits(WORD);
    s_type = is2.mtag.MFO.getFeatureBits(TYPE);
    s_char = is2.mtag.MFO.getFeatureBits(CHAR);
    s_pos = is2.mtag.MFO.getFeatureBits(POS);

    // common prefix of all descriptors: type, feat
    d1.a0 = s_type;
    d2.a0 = s_type;
    d3.a0 = s_type;
    dp.a0 = s_type;
    dw.a0 = s_type;
    dwp.a0 = s_type;

    d1.a1 = s_feat;
    d2.a1 = s_feat;
    d3.a1 = s_feat;
    dp.a1 = s_feat;
    dw.a1 = s_feat;
    dwp.a1 = s_feat;

    // d1: one word
    d1.a2 = s_word;

    // d2: up to five further feat-sized slots
    d2.a2 = s_feat;
    d2.a3 = s_feat;
    d2.a4 = s_feat;
    d2.a5 = s_feat;
    d2.a6 = s_feat;

    // d3: up to six characters
    d3.a2 = s_char;
    d3.a3 = s_char;
    d3.a4 = s_char;
    d3.a5 = s_char;
    d3.a6 = s_char;
    d3.a7 = s_char;

    // dp: pos, pos, feat
    dp.a2 = s_pos;
    dp.a3 = s_pos;
    dp.a4 = s_feat;

    // dw: up to six words
    dw.a2 = s_word;
    dw.a3 = s_word;
    dw.a4 = s_word;
    dw.a5 = s_word;
    dw.a6 = s_word;
    dw.a7 = s_word;

    // dwp: word, feat, word
    dwp.a2 = s_word;
    dwp.a3 = s_feat;
    dwp.a4 = s_word;
}
// Per-feature-type widths as returned by MFO.getFeatureBits; assigned in initValues().
// s_type is also the shift applied to the candidate type in fillFeatureVectorsOne().
// NOTE(review): s_dir and s_dist are never assigned in the code visible here.
public static short s_feat, s_word, s_type, s_dir, s_dist, s_char, s_pos;
/**
 * Registers the fixed symbols of the model: the template types {@code F0} .. {@code F61}
 * and the start/end markers for POS, WORD and CHAR. The ids of the markers that are
 * needed later are kept in {@code _strp}, {@code _endp}, {@code _ewrd} and {@code _CEND}.
 * The registration order is significant and must not be changed.
 */
@Override
public void initFeatures() {
    final int templateCount = 62;
    int template = 0;
    while (template < templateCount) {
        mf.register(TYPE, "F" + template);
        template++;
    }

    // sentence boundary markers
    _strp = mf.register(POS, STR);
    _endp = mf.register(POS, END);
    mf.register(WORD, STR);
    _ewrd = mf.register(WORD, END);
    _CEND = mf.register(CHAR, END);

    // optional markers; their ids are not stored
    mf.register(WORD, STWRD);
    mf.register(POS, STPOS);
}
/**
 * Fills {@code vs} with the packed feature codes for token {@code i} of sentence {@code ic}
 * and terminates the list with {@code Integer.MIN_VALUE}.
 *
 * Every feature is produced by loading values into one of the reusable descriptors
 * (d1, d2, d3, dp, dw, dwp) and packing the first N slots with {@code mf.calcN}. Slot v0
 * always receives the template id taken from the running counter {@code f}; slot v1 is never
 * written here (the caller, fillFeatureVectorsOne, adds {@code t << s_type} to each code,
 * presumably to fill that slot - confirm against MFO).
 *
 * The descriptors keep their v-fields between features, and several features rely on values
 * left over from a previous one, so the statement order below must not be changed. The
 * descriptors are instance fields, so concurrent calls on the same Pipe would interfere.
 *
 * @param is     instances providing the character table {@code chars} and the lower-cased form ids {@code formlc}
 * @param ic     sentence index into {@code is.chars} / {@code is.formlc}
 * @param fs     surface form of the token; only scanned for the upper-case and digit flags
 * @param i      index of the token within the sentence
 * @param pfeat  per-token feature ids; read at positions i-3 .. i+1
 * @param ppos   per-token POS ids; read at positions i-1 and i
 * @param forms  per-token word ids; read at positions i-3 .. i+3
 * @param lemmas per-token lemma ids; read at positions i-1 .. i+1
 * @param vs     output array; no bounds check is done, so it has to be large enough for all
 *               features plus the terminator
 */
final public void addCF(InstancesTagger is, int ic, String fs, int i, int pfeat[], short ppos[], int[] forms, int[] lemmas, long[] vs) {
// c0..c5 come from slots 0-5 and e0..e4 from slots 6-10 of the character table
// (presumably the leading and trailing characters of the form - confirm in fillChars).
int c0 = is.chars[ic][i][0], c1 = is.chars[ic][i][1], c2 = is.chars[ic][i][2], c3 = is.chars[ic][i][3], c4 = is.chars[ic][i][4], c5 = is.chars[ic][i][5];
int e0 = is.chars[ic][i][6], e1 = is.chars[ic][i][7], e2 = is.chars[ic][i][8], e3 = is.chars[ic][i][9], e4 = is.chars[ic][i][10];
// f: next template id, n: next free position in vs
int f = 1, n = 0;
// upper:  0 = no upper-case char, 1 = only the first char is upper-case,
//         2 = upper-case char(s) but not the first, 3 = first char and at least one more
// number: 1 = no digit, 2 = first char is a digit, 3 = a digit occurs but not at the start
short upper = 0, number = 1;
for (int k1 = 0; k1 < fs.length(); k1++) {
char c = fs.charAt(k1);
if (Character.isUpperCase(c)) {
if (k1 == 0) {
upper = 1;
} else {
// first char + another
if (upper == 1) {
upper = 3;
} // another uppercase in the word
else if (upper == 0) {
upper = 2;
}
}
}
if (Character.isDigit(c) && k1 == 0) {
number = 2;
} else if (Character.isDigit(c) && number == 1) {
number = 3;
}
}
int form = forms[i];
int len = forms.length;
long l;
// --- the form itself and its lower-cased variant ---
d1.v0 = f++;
d1.v2 = form;
// NOTE(review): the value stored in l here is overwritten further down before it is ever read,
// and calc3(d1) is evaluated again on the next line - looks like a redundant call.
l = mf.calc3(d1);
vs[n++] = mf.calc3(d1);
d1.v0 = f++;
d1.v2 = is.formlc[ic][i];
vs[n++] = mf.calc3(d1);
// --- c0..c4: one feature per length 1..5 ---
d3.v2 = c0;
d3.v3 = c1;
d3.v4 = c2;
d3.v5 = c3;
d3.v6 = c4;
d3.v0 = f++;
vs[n++] = mf.calc3(d3);
d3.v0 = f++;
vs[n++] = mf.calc4(d3);
d3.v0 = f++;
vs[n++] = mf.calc5(d3);
d3.v0 = f++;
vs[n++] = mf.calc6(d3);
d3.v0 = f++;
vs[n++] = mf.calc7(d3);
// --- c2..c5 combined with cl.getLP(form) ---
// NOTE(review): this guard tests form != -1 while the comparable guards below test form > 0.
if (form != -1) {
d3.v2 = c2;
d3.v3 = c3;
d3.v4 = c4;
d3.v5 = c5;
d3.v6 = cl.getLP(form);
d3.v0 = f;
vs[n++] = mf.calc6(d3);
d3.v0 = f + 1;
vs[n++] = mf.calc7(d3);
}
// f advances unconditionally so the template ids below do not depend on which branches ran
f += 2;
// Replaces the last slot of progressively shorter sequences with cl.getLP(form);
// operates on whatever v2..v4 currently hold in d3 (c2..c4 if the block above ran, else c0..c2).
if (form > 0) {
d3.v0 = f;
d3.v5 = cl.getLP(form);
vs[n++] = mf.calc6(d3);
d3.v0 = f + 1;
d3.v4 = cl.getLP(form);
vs[n++] = mf.calc5(d3);
d3.v0 = f + 2;
d3.v3 = cl.getLP(form);
vs[n++] = mf.calc4(d3);
}
f += 3;
// --- e0..e4: one feature per length 1..5; lengths 2..5 are additionally emitted
// through d3.calcs(3, upper, l) (presumably appending the upper-case flag to l - TODO confirm) ---
d3.v2 = e0;
d3.v3 = e1;
d3.v4 = e2;
d3.v5 = e3;
d3.v6 = e4;
d3.v0 = f++;
vs[n++] = mf.calc3(d3);
d3.v0 = f++;
vs[n++] = l = mf.calc4(d3);
vs[n++] = d3.calcs(3, upper, l);
d3.v0 = f++;
vs[n++] = l = mf.calc5(d3);
vs[n++] = d3.calcs(3, upper, l);
d3.v0 = f++;
vs[n++] = l = mf.calc6(d3);
vs[n++] = d3.calcs(3, upper, l);
d3.v0 = f++;
vs[n++] = l = mf.calc7(d3);
vs[n++] = d3.calcs(3, upper, l);
// same replacement scheme as above, now applied to the e-sequence held in d3
if (form > 0) {
d3.v0 = f;
d3.v5 = cl.getLP(form);
vs[n++] = mf.calc6(d3);
d3.v0 = f + 1;
d3.v4 = cl.getLP(form);
vs[n++] = mf.calc5(d3);
d3.v0 = f + 2;
d3.v3 = cl.getLP(form);
vs[n++] = mf.calc4(d3);
}
f += 3;
// --- next form (or the end marker _ewrd) together with the current form ---
dw.v0 = f++;
dw.v2 = i + 1 < len ? forms[i + 1] : _ewrd;
dw.v3 = forms[i];
vs[n++] = mf.calc4(dw);
// --- features that need a following token; they use the fixed offsets f .. f+14 ---
if (len > i + 1) {
dw.v0 = f;
dw.v2 = forms[i + 1];
vs[n++] = mf.calc3(dw);
// slot 0 and slot 6 of the next token's character table, alone ...
d3.v0 = f + 1;
d3.v2 = is.chars[ic][i + 1][0];
vs[n++] = mf.calc3(d3);
d3.v0 = f + 2;
d3.v2 = is.chars[ic][i + 1][6];
vs[n++] = mf.calc3(d3);
// ... and preceded by e0, e1 of the current token
d3.v2 = e0;
d3.v3 = e1;
d3.v0 = f + 3;
d3.v4 = is.chars[ic][i + 1][0];
vs[n++] = mf.calc5(d3);
d3.v0 = f + 4;
d3.v4 = is.chars[ic][i + 1][6];
vs[n++] = mf.calc5(d3);
if (is.chars[ic][i + 1][11] > 1) { // instance.forms[i+1].length()
// two-slot variants (slots 0,1 and slots 6,7 of the next token), alone and after e0, e1
d3.v0 = f + 5;
d3.v2 = is.chars[ic][i + 1][0];
d3.v3 = is.chars[ic][i + 1][1];
vs[n++] = mf.calc4(d3);
d3.v0 = f + 6;
d3.v2 = is.chars[ic][i + 1][6];
d3.v3 = is.chars[ic][i + 1][7];
vs[n++] = mf.calc4(d3);
d3.v2 = e0;
d3.v3 = e1;
d3.v0 = f + 7;
d3.v4 = is.chars[ic][i + 1][0];
d3.v5 = is.chars[ic][i + 1][1];
vs[n++] = mf.calc6(d3);
d3.v0 = f + 8;
d3.v4 = is.chars[ic][i + 1][6];
d3.v5 = is.chars[ic][i + 1][7];
vs[n++] = mf.calc6(d3);
if (forms[i + 1] > 0) {
// the same two-slot variants followed by cl.getLP of the next form
d3.v0 = f + 9;
d3.v2 = is.chars[ic][i + 1][0];
d3.v3 = is.chars[ic][i + 1][1];
d3.v4 = cl.getLP(forms[i + 1]);
vs[n++] = mf.calc5(d3);
d3.v0 = f + 10;
d3.v2 = is.chars[ic][i + 1][6];
d3.v3 = is.chars[ic][i + 1][7];
d3.v4 = cl.getLP(forms[i + 1]);
vs[n++] = mf.calc5(d3);
}
}
if (forms[i + 1] > 0) {
// cl.getLP of the next form paired with the current form
dw.v0 = f + 11;
dw.v2 = cl.getLP(forms[i + 1]);
dw.v3 = forms[i];
vs[n++] = mf.calc4(dw);
}
if (len > i + 2) {
// forms i+2 and i+1 as a pair, then form i+2 alone (same template id for both codes)
dw.v0 = f + 12;
dw.v2 = forms[i + 2];
dw.v3 = forms[i + 1];
vs[n++] = mf.calc4(dw);
vs[n++] = mf.calc3(dw);
// d2.v0=f+13; d2.v2=pfeat[i+1]; d2.v3= pfeat[i+2]; vs[n++]=mf.calc4(d2);
// dp.v0= f+14; dp.v2=ppos[i+1]; dp.v3=ppos[i+2]; vs[n++]=mf.calc4(dp);
}
if (len > i + 3) {
// forms i+3 and i+2 as a pair, then form i+3 alone
dw.v0 = f + 14;
dw.v2 = forms[i + 3];
dw.v3 = forms[i + 2];
vs[n++] = mf.calc4(dw);
vs[n++] = mf.calc3(dw);
}
}
// skip the whole id range reserved for the look-ahead features above
f += 16;
// length
d2.v0 = f++;
d2.v2 = is.chars[ic][i][11];
vs[n++] = mf.calc3(d2);
// contains a number
d2.v0 = f++;
d2.v2 = number;
vs[n++] = mf.calc3(d2);
// lemma of the current token
d1.v0 = f++;
d1.v2 = lemmas[i];
vs[n++] = mf.calc3(d1);
// surrounding lemmas and surrounding pfeat values (needs a token on both sides)
if (i != 0 && len > i + 1) {
dw.v0 = f;
dw.v2 = lemmas[i - 1];
dw.v3 = lemmas[i + 1];
vs[n++] = mf.calc4(dw);
d2.v0 = f + 1;
d2.v2 = pfeat[i - 1];
d2.v3 = pfeat[i + 1];
vs[n++] = mf.calc4(d2);
}
f += 2;
// pfeat of the previous token
// NOTE(review): the fallback _strp is an id registered under POS but is stored in a feat-sized slot.
d2.v0 = f++;
d2.v2 = i >= 1 ? pfeat[i - 1] : _strp;
vs[n++] = mf.calc3(d2);
// POS of the current token
dp.v0 = f++;
dp.v2 = ppos[i];
vs[n++] = mf.calc3(dp);
// --- left-context features; from here on f is only advanced inside the branches ---
if (i > 0) {
// NOTE(review): inside this branch i >= 1 always holds, so the _strp fallbacks below are unreachable.
dw.v0 = f++;
dw.v2 = i >= 1 ? forms[i - 1] : _strp;
vs[n++] = mf.calc3(dw);
dw.v0 = f++;
dw.v2 = i >= 1 ? lemmas[i - 1] : _strp;
vs[n++] = mf.calc3(dw);
if (len > i + 1) {
// d2.v0=f; d2.v2= pfeat[i-1];d2.v3= pfeat[i+1]; vs[n++]=mf.calc4(d2);
// dp.v0= f+1; dp.v2=ppos[i-1]; dp.v3=ppos[i+1]; vs[n++]=mf.calc4(dp);
}
// skips one template id
f++;
// POS of the current and the previous token
dp.v0 = f++;
dp.v2 = ppos[i];
dp.v3 = ppos[i - 1];
vs[n++] = mf.calc4(dp);
if (i > 1) {
// NOTE(review): i < 2 cannot be true inside this branch, so _strp is never used here.
d2.v0 = f++;
d2.v2 = i < 2 ? _strp : pfeat[i - 2];
vs[n++] = mf.calc3(d2);
d2.v0 = f++;
d2.v2 = pfeat[i - 1];
d2.v3 = pfeat[i - 2];
vs[n++] = mf.calc4(d2);
dw.v0 = f++;
dw.v2 = forms[i - 2];
vs[n++] = mf.calc3(dw);
// word / pfeat cross combinations of the two preceding tokens
dwp.v0 = f++;
dwp.v2 = forms[i - 1];
dwp.v3 = pfeat[i - 2];
vs[n++] = mf.calc4(dwp);
dwp.v0 = f++;
dwp.v2 = forms[i - 2];
dwp.v3 = pfeat[i - 1];
vs[n++] = mf.calc4(dwp);
if (i > 2) {
d2.v0 = f++;
d2.v2 = pfeat[i - 3];
vs[n++] = mf.calc3(d2);
d2.v0 = f++;
d2.v2 = pfeat[i - 2];
d2.v3 = pfeat[i - 3];
vs[n++] = mf.calc4(d2);
dw.v0 = f++;
dw.v2 = forms[i - 3];
dw.v3 = forms[i - 2];
vs[n++] = mf.calc4(dw);
// dp.v0= f++; dp.v2=ppos[i-3]; dp.v3=ppos[i-2]; vs[n++]=mf.calc4(dp);
}
}
}
// terminator; entries of vs behind this position are left untouched
vs[n] = Integer.MIN_VALUE;
}
/**
 * Extracts the features of one token with {@code addCF} and returns the index of the
 * morphologic type with the highest score.
 *
 * For every candidate type {@code t} each non-negative code in {@code vs} is shifted into
 * the type's range by adding {@code t << s_type}, mapped through {@code li.l2i} and added
 * to the scorer. The whole array is scanned from the last position to the first; negative
 * entries (including the {@code Integer.MIN_VALUE} terminator written by {@code addCF})
 * are skipped, but non-negative entries behind the terminator are scored as well.
 *
 * @param params   weights handed to the scorer
 * @param w1       index of the token within the sentence
 * @param form     surface form of the token
 * @param is       instances container; must be an {@code InstancesTagger}
 * @param n        index of the sentence within {@code is}
 * @param features per-token feature ids passed to {@code addCF} as {@code pfeat}
 * @param vs       scratch array receiving the feature codes
 * @return index into {@code types} of the best-scoring type, or -1 if no type scores above -1
 */
public int fillFeatureVectorsOne(ParametersFloat params, int w1, String form, Instances is, int n, int[] features, long[] vs) {
    final F2SF scorer = new F2SF(params.parameters);
    addCF((InstancesTagger) is, n, form, w1, features, is.pposs[n], is.forms[n], is.plemmas[n], vs);

    double bestScore = -1;
    int winner = -1;
    for (int type = 0; type < types.length; type++) {
        scorer.clear();
        final int offset = type << Pipe.s_type;
        int k = vs.length;
        while (--k >= 0) {
            final long code = vs[k];
            if (code < 0) {
                continue;
            }
            scorer.add(li.l2i(code + offset));
        }
        // strict comparison: on equal scores the type with the lower index wins
        if (scorer.score > bestScore) {
            bestScore = scorer.score;
            winner = type;
        }
    }
    return winner;
}
//static ArrayList<T> todo = new ArrayList<T>();
// NOTE(review): not referenced anywhere in the code visible here.
static SentenceData09 instance;
// NOTE(review): not referenced in the code visible here; presumably the capacity callers use
// for the vs array passed to addCF / fillFeatureVectorsOne - confirm against the callers.
public static int _FC = 200;
/**
 * Serializes {@link #form2morph}: first the number of entries, then for each entry the
 * key followed by the value, all written as ints.
 * An {@code IOException} is not propagated; its stack trace is printed and the method
 * returns, which can leave a partially written map in the stream.
 *
 * @param dos stream to write to
 */
public void writeMap(DataOutputStream dos) {
    try {
        dos.writeInt(form2morph.size());
        for (Entry<Integer, Integer> pair : form2morph.entrySet()) {
            dos.writeInt(pair.getKey());
            dos.writeInt(pair.getValue());
        }
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
}
/**
 * Reads entries in the layout produced by {@link #writeMap(DataOutputStream)} and adds
 * them to {@link #form2morph}: an int count followed by that many key/value int pairs.
 * Existing entries are kept unless a key is read again.
 * An {@code IOException} is not propagated; its stack trace is printed and the entries
 * read so far remain in the map.
 *
 * @param dis stream to read from
 */
public void readMap(DataInputStream dis) {
    try {
        int remaining = dis.readInt();
        while (remaining-- > 0) {
            final int key = dis.readInt();
            final int value = dis.readInt();
            form2morph.put(key, value);
        }
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
}
/**
 * Writes the state of this pipe: first the cluster ({@code cl.write}), then the
 * form-to-feature map ({@link #writeMap(DataOutputStream)}).
 * An {@code IOException} is not propagated; its stack trace is printed and the map is
 * not written if the cluster could not be written.
 *
 * @param dos stream to write to
 * @see is2.tools.IPipe#write(java.io.DataOutputStream)
 */
@Override
public void write(DataOutputStream dos) {
    try {
        cl.write(dos);
        writeMap(dos);
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
}
}