package is2.parserR2;
import extractors.Extractor;
import extractors.ParallelExtract;
import is2.data.*;
import is2.io.CONLLReader09;
import is2.util.OptionsSuper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;
/**
 * Reads CoNLL-09 corpora into {@link Instances}, registers the feature values
 * found in them in the feature mapping {@link #mf}, and drives the
 * {@link Extractor}s that fill the edge score tables.
 *
 * <p>NOTE(review): the class keeps mutable static state ({@link #executerService},
 * {@link #timeExtract}) and {@link #fillVector} feeds work through the static
 * {@code ParallelExtract.add}; it looks unsafe to call {@code fillVector} from
 * several threads at once — confirm against callers.
 */
final public class Pipe extends PipeGen {

    /** Extractors used by {@link #fillVector}; must be assigned by the caller before use. */
    public Extractor[] extractor;

    /** Feature mapping that all values seen in the training corpus are registered in. */
    final public MFB mf = new MFB();

    /** Cluster information; created in {@link #createInstances}. */
    Cluster cl;

    private OptionsSuper options;

    /** Accumulated wall-clock time (nanoseconds) spent inside {@link #fillVector}. */
    public static long timeExtract;

    /**
     * @param o the options this pipe reads {@code count}, {@code clusterFile}
     *          and {@code allFeatures} from
     */
    public Pipe(OptionsSuper o) {
        options = o;
    }

    /**
     * Reads the corpus in {@code file} twice: a first pass registers the feature
     * values of every sentence in {@link #mf}, a second pass loads the sentences
     * into {@code is} and records the observed edges in {@code Edges}.
     *
     * <p>First pass: registers all of {@code instance.labels} as {@code REL},
     * {@code instance.forms} and {@code instance.plemmas} (normalized by the
     * reader) as {@code WORD}, {@code instance.ppos} and {@code instance.gpos}
     * as {@code POS}, and every non-null row of {@code instance.feats} as
     * {@code FEAT}. Reading stops at the end of the file or once
     * {@code options.count + 2} sentences have been read.
     *
     * <p>Between the passes the extractors are configured, {@link #cl} is
     * created and {@code mf.calculateBits()} is called.
     *
     * @param file path of the corpus to read
     * @param is   container that is initialised and filled with the sentences
     * @throws Exception if the reader fails
     */
    public void createInstances(String file, Instances is) throws Exception {
        CONLLReader09 depReader = new CONLLReader09(file);

        mf.register(REL, "<root-type>");
        // NOTE(review): the original comment here said to register at least one
        // predicate, since the parsing data might not contain predicates (as in
        // the Japanese corpus) while the development sets contain some. No
        // predicate is registered in this method — confirm whether that is still needed.

        is2.parser.Parser.out.print("Registering feature parts of sentence: ");
        int ic = 0;
        int del = 0;
        while (true) {
            SentenceData09 instance = depReader.getNext();
            if (instance == null) {
                break;
            }
            ic++;

            // progress output every 1000 sentences
            if (ic % 1000 == 0) {
                del = outValue(ic, del);
            }

            for (String label : instance.labels) {
                mf.register(REL, label);
            }
            for (String form : instance.forms) {
                mf.register(WORD, depReader.normalize(form));
            }
            for (String lemma : instance.plemmas) {
                mf.register(WORD, depReader.normalize(lemma));
            }
            for (String tag : instance.ppos) {
                mf.register(POS, tag);
            }
            for (String tag : instance.gpos) {
                mf.register(POS, tag);
            }
            if (instance.feats != null) {
                for (String[] tokenFeats : instance.feats) {
                    if (tokenFeats == null) {
                        continue;
                    }
                    for (String feat : tokenFeats) {
                        mf.register(FEAT, feat);
                    }
                }
            }

            // the sentence that trips the limit has already been registered above
            if ((ic - 1) > options.count) {
                break;
            }
        }
        del = outValue(ic, del);

        for (Extractor e : extractor) {
            e.setMaxForm(mf.getFeatureCounter().get(WORD));
        }

        if (options.clusterFile == null) {
            cl = new Cluster();
        } else {
            cl = new Cluster(options.clusterFile, mf, 6);
        }

        mf.calculateBits();
        is2.parser.Parser.out.println("" + mf.toString());

        for (Extractor e : extractor) {
            e.initStat();
            e.init();
        }

        // second pass over the same file
        depReader.startReading(file);
        int num1 = 0;
        Edges.init(mf.getFeatureCounter().get(POS));

        is2.parser.Parser.out.print("Creating edge filters and read corpus: ");
        del = 0;
        // sized by the number of sentences seen in the first pass
        is.init(ic, new MFB());

        while (true) {
            if (num1 % 100 == 0) {
                del = outValue(num1, del);
            }
            SentenceData09 instance1 = depReader.getNext(is);
            if (instance1 == null) {
                break;
            }

            // the sentence just read is the last one stored in 'is'
            int last = is.size() - 1;
            short[] pos = is.pposs[last];
            for (int k = 0; k < is.length(last); k++) {
                short head = is.heads[last][k];
                if (head < 0) {
                    continue;
                }
                Edges.put(pos[head], pos[k], k < head, is.labels[last][k]);
            }

            if (!options.allFeatures && num1 > options.count) {
                break;
            }
            num1++;
        }
        outValue(num1, del);
        is2.parser.Parser.out.println();
        Edges.findDefault();
    }

    /**
     * Loads the sentences of {@code file} into {@code is} without registering
     * any features.
     *
     * <p>{@code is} is initialised for {@code options.count + 2} sentences.
     * Reading stops at the end of the file or, unless
     * {@code options.allFeatures} is set, after the sentence read while
     * {@code num1 > options.count}; that sentence has already been handed to
     * {@code depReader.getNext(is)} when the loop exits.
     *
     * @param file path of the corpus to read
     * @param is   container that is initialised and filled
     * @throws Exception if the reader fails
     */
    public void getInstances(String file, Instances is) throws Exception {
        CONLLReader09 depReader = new CONLLReader09(file);
        int ic = options.count + 2;
        is.init(ic, new MFB());

        int num1 = 0;
        int del = 0;
        while (true) {
            if (num1 % 100 == 0) {
                del = outValue(num1, del);
            }
            SentenceData09 instance1 = depReader.getNext(is);
            if (instance1 == null) {
                break;
            }
            if (!options.allFeatures && num1 > options.count) {
                break;
            }
            num1++;
        }
        outValue(num1, del);
        is2.parser.Parser.out.println();
    }

    /**
     * Reads the next sentence from {@code depReader} into {@code is}.
     *
     * @param is        container passed to {@code depReader.getNext(is)}
     * @param depReader reader to take the sentence from
     * @return the sentence read, or {@code null} if the reader returned
     *         {@code null} or a sentence whose {@code forms} is {@code null}
     * @throws Exception if the reader fails
     */
    protected final SentenceData09 nextInstance(Instances is, CONLLReader09 depReader) throws Exception {
        SentenceData09 instance = depReader.getNext(is);
        if (instance == null || instance.forms == null) {
            return null;
        }
        return instance;
    }

    /**
     * Executor that runs the {@link ParallelExtract} tasks; replaced by a cached
     * thread pool in {@link #fillVector} if it has been shut down.
     */
    public static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS);

    /**
     * Fills the score table {@code d} for sentence {@code inst} by running
     * {@code threads} {@link ParallelExtract} tasks over all ordered word pairs
     * {@code (w1, w2)} with {@code w1 != w2}.
     *
     * <p>Unlike a bare {@code invokeAll}, failures inside a task are not
     * dropped: the first failed task makes this method throw.
     *
     * @param params  parameters; each task receives its own {@code clone()}
     * @param is      the instances
     * @param inst    index of the sentence in {@code is}
     * @param d       table to reuse; a new one is allocated when it is
     *                {@code null} or {@code d.len} is smaller than the sentence length
     * @param cluster cluster information handed to the tasks
     * @param threads number of tasks to create; task {@code i} uses
     *                {@code extractor[i]}, so this must not exceed {@code extractor.length}
     * @param li      feature-to-index mapping handed to the tasks
     * @return the table that was filled ({@code d} or the newly allocated one)
     * @throws InterruptedException  if the calling thread is interrupted while waiting
     * @throws IllegalStateException if one of the extraction tasks threw an exception
     */
    public DataF fillVector(F2SF params, Instances is, int inst, DataF d, Cluster cluster, int threads, Long2IntInterface li) throws InterruptedException {
        long ts = System.nanoTime();

        if (executerService.isShutdown()) {
            executerService = java.util.concurrent.Executors.newCachedThreadPool();
        }

        final int length = is.length(inst);
        if (d == null || d.len < length) {
            d = new DataF(length, mf.getFeatureCounter().get(PipeGen.REL).shortValue());
        }

        ArrayList<ParallelExtract> pe = new ArrayList<>();
        for (int i = 0; i < threads; i++) {
            pe.add(new ParallelExtract(extractor[i], is, inst, d, (F2SF) params.clone(), cluster, li));
        }

        // queue every ordered pair of distinct word positions
        for (int w1 = 0; w1 < length; w1++) {
            for (int w2 = 0; w2 < length; w2++) {
                if (w1 == w2) {
                    continue;
                }
                ParallelExtract.add(w1, w2);
            }
        }

        // invokeAll blocks until every task has completed (normally or exceptionally)
        java.util.List<? extends java.util.concurrent.Future<?>> results = executerService.invokeAll(pe);
        timeExtract += (System.nanoTime() - ts);

        // A task's exception is only stored in its Future; surface it instead of
        // returning a partially filled table as if nothing had happened.
        for (java.util.concurrent.Future<?> result : results) {
            try {
                result.get();
            } catch (java.util.concurrent.ExecutionException e) {
                Throwable cause = e.getCause() != null ? e.getCause() : e;
                throw new IllegalStateException("Feature extraction failed for instance " + inst, cause);
            }
        }
        return d;
    }

    /**
     * The loss function: the number of tokens (excluding index 0, the root)
     * minus the credit earned by the parse. A token earns 0.5 for a correct
     * head and a further 0.5 if, in addition, its label is correct.
     *
     * <p>If {@code p.heads} is {@code null}, {@code p.signature2parse(p.signature())}
     * is called first.
     *
     * @param is the instances holding the gold heads and labels
     * @param ic index of the sentence in {@code is}
     * @param p  the parse to evaluate
     * @return {@code (act.length - 1) - correct}; 0 for a fully correct parse
     */
    public double errors(Instances is, int ic, Parse p) {
        if (p.heads == null) {
            p.signature2parse(p.signature());
        }
        short[] act = is.heads[ic];
        double correct = 0;

        // do not count root
        for (int i = 1; i < act.length; i++) {
            if (p.heads[i] == act[i]) {
                correct += 0.5;
                if (p.labels[i] == is.labels[ic][i]) {
                    correct += 0.5;
                }
            }
        }
        return (double) act.length - 1 - correct;
    }
}