CONLLReader09.java example

Explorer
dependency-parsing-toolbox-master
- Source
package is2.io;

import is2.data.Instances;
import is2.data.SentenceData09;
import is2.parser.Parser;
import is2.util.DB;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;

/**
 * This class reads files in the CONLL-09 format.
 *
 * @author Bernd Bohnet
 */
public class CONLLReader09 extends CONLLReader {

    /**
     * Constant {@code true}. Not referenced inside this class; presumably meant
     * to be passed as the {@code normalize} constructor argument (confirm with callers).
     */
    public static final boolean NORMALIZE = true;
    /**
     * Constant {@code false}. Not referenced inside this class; presumably meant
     * to be passed as the {@code normalize} constructor argument (confirm with callers).
     */
    public static final boolean NO_NORMALIZE = false;
    /**
     * Global selector that makes {@link #getNextCoNLL09()} reduce each token's
     * feature string ({@code ofeats}) when it is non-empty:
     * a value starting with {@code "cz"} keeps the entries starting with
     * {@code SubPOS} followed by those starting with {@code Cas}; a value
     * containing {@code "ger"} keeps the entries {@code Nom, Acc, Dat, Gen, Sg, Pl};
     * any other value is used as a regular expression that each entry must match.
     * The empty string (default) leaves the feature string untouched.
     */
    static public String joint = "";
    /**
     * Input format selector; {@link #getNext()} compares it with {@code F_ONE_LINE}.
     * Changed through {@link #setInputFormat(int)}.
     */
    private int format = 0;

    /**
     * Creates a reader and stores the normalization flag. This constructor
     * itself does not assign an input reader.
     *
     * @param normalize value stored in {@code normalizeOn}
     */
    public CONLLReader09(boolean normalize) {
        this.normalizeOn = normalize;
    }

    /**
     * Opens {@code file} as UTF-8 text behind a 32 KiB buffered reader and
     * resets the line counter to zero.
     *
     * <p>If the file cannot be opened the stack trace is printed and
     * {@code inputReader} is left unassigned by this constructor, so the
     * failure is best-effort and not reported to the caller.
     *
     * @param file path of the input file
     */
    public CONLLReader09(String file) {
        lineNumber = 0;
        try {
            // A Charset constant (instead of the charset name) cannot fail with
            // UnsupportedEncodingException, so the opened stream can never be
            // abandoned between the two constructor calls.
            inputReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8), 32768);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens {@code file} through {@link #CONLLReader09(String)} and then stores
     * the normalization flag.
     *
     * @param file path of the input file
     * @param normalize value stored in {@code normalizeOn}
     */
    public CONLLReader09(String file, boolean normalize) {
        this(file);
        this.normalizeOn = normalize;
    }

    /**
     * Sets the input format used by {@link #getNext()}.
     *
     * <p>When the value equals {@code F_ONE_LINE}, each input line is read as
     * one sentence of space-separated tokens; every other value selects the
     * CoNLL-09 column format.
     *
     * @param format the format (see the constants starting with F_).
     */
    public void setInputFormat(int format) {
        this.format = format;
    }

    /**
     * Creates a reader without any explicit initialization; the constructor
     * body is empty, so no input file is opened here.
     */
    public CONLLReader09() {
    }

    /**
     * Opens {@code testfile} by delegating to {@link #CONLLReader09(String)}.
     *
     * @param testfile path of the input file
     * @param formatTask currently ignored by this constructor.
     *        NOTE(review): possibly intended to select the input format
     *        (see {@link #setInputFormat(int)}) - confirm against callers
     *        before wiring it up.
     */
    public CONLLReader09(String testfile, int formatTask) {
        this(testfile);
    }

    /**
     * Reads the next sentence with the reader matching the configured input format.
     *
     * @return the result of the one-line reader when the format equals
     *         {@code F_ONE_LINE}, otherwise the result of {@link #getNextCoNLL09()}
     */
    @Override
    public SentenceData09 getNext() {
        return (format == F_ONE_LINE) ? getNextOneLine() : getNextCoNLL09();
    }

    /**
     * Reads a single input line and converts it into a sentence whose tokens
     * are the pieces of the line separated by a single space.
     *
     * <p>Slot 0 of every array holds the artificial root; tokens occupy the
     * slots from 1 on. Only {@code id} and {@code forms} are filled for the
     * tokens; forms are passed through {@code normalize} when
     * {@code normalizeOn} is set. An empty line yields a sentence that
     * consists of the root only.
     *
     * <p>Any exception is reported on {@code Parser.out} and terminates the
     * JVM with exit status 0.
     *
     * @return the sentence, or {@code null} (after closing the reader) when no
     *         further line is available
     */
    private SentenceData09 getNextOneLine() {

        // declared outside the try block because the error report prints it
        int position = 0;
        try {
            final String text = inputReader.readLine();
            lineNumber++;

            if (text == null) {
                inputReader.close();
                return null;
            }

            final String[] words = text.split(" ");
            // splitting an empty line still yields one (empty) piece, hence the override
            final int count = text.isEmpty() ? 0 : words.length;
            final int size = count + 1;

            final SentenceData09 sentence = new SentenceData09();

            sentence.id = new String[size];
            sentence.forms = new String[size];
            sentence.lemmas = new String[size];
            sentence.plemmas = new String[size];
            sentence.gpos = new String[size];
            sentence.ppos = new String[size];
            sentence.feats = new String[size][];
            sentence.ofeats = new String[size];
            sentence.pfeats = new String[size];
            sentence.heads = new int[size];
            sentence.pheads = new int[size];
            sentence.labels = new String[size];
            sentence.plabels = new String[size];
            sentence.fillp = new String[size];

            // slot 0: artificial root
            sentence.id[0] = "0";
            sentence.forms[0] = ROOT;
            sentence.lemmas[0] = ROOT_LEMMA;
            sentence.plemmas[0] = ROOT_LEMMA;
            sentence.gpos[0] = ROOT_POS;
            sentence.ppos[0] = ROOT_POS;
            sentence.ofeats[0] = NO_TYPE;
            sentence.heads[0] = -1;
            sentence.pheads[0] = -1;
            sentence.labels[0] = NO_TYPE;
            sentence.plabels[0] = NO_TYPE;
            sentence.fillp[0] = "N";

            // tokens start at slot 1
            for (position = 1; position <= count; position++) {
                final String word = words[position - 1];
                sentence.id[position] = String.valueOf(position);
                sentence.forms[position] = this.normalizeOn ? normalize(word) : word;
            }

            return sentence;

        } catch (Exception e) {
            Parser.out.println("\n!!! Error in input file sentence before line: " + lineNumber + " (in sentence line " + position + " ) " + e);
            e.printStackTrace();
            System.exit(0);

            // unreachable at run time, required by the compiler
            return null;
        }
    }

    /**
     * Reads the next sentence in CoNLL-09 column format.
     *
     * <p>Leading empty lines are skipped (each one is reported on
     * {@code Parser.out}). The sentence consists of all following lines up to
     * the end of input, an empty line, or a line starting with {@code STRING}
     * or {@code REGEX}; each line is split with {@code REGEX} into columns.
     * Slot 0 of every array holds the artificial root, tokens start at slot 1.
     *
     * <p>Columns are mapped by index: 0 id, 1 form, 2 lemma, 3 predicted lemma,
     * 4 POS, 5 predicted POS, 6 features, 7 predicted features, 8 head,
     * 9 predicted head, 10 label, 11 predicted label, 12 fill-predicate flag,
     * 13 predicate, 14 and above arguments. Rows with fewer than 5 columns only
     * provide id and form; rows with fewer than 13 columns additionally provide
     * lemma, predicted lemma and POS. The remaining fields of such rows keep
     * their array defaults.
     *
     * <p>When the static {@code joint} selector is non-empty, the feature
     * string of every complete row is reduced to the selected entries.
     *
     * <p>An {@link IOException} or {@link NumberFormatException} is reported on
     * {@code Parser.out} and terminates the JVM with exit status 0.
     *
     * @return the sentence, or {@code null} (after closing the reader) when no
     *         token line could be read
     */
    public SentenceData09 getNextCoNLL09() {

        String line;
        // declared outside the try block because the error report prints it
        int i = 0;
        try {

            ArrayList<String[]> lineList = new ArrayList<>();

            line = inputReader.readLine();
            lineNumber++;

            while (line != null && line.length() == 0) {
                line = inputReader.readLine();
                lineNumber++;
                Parser.out.println("skip empty line at line " + lineNumber);
            }

            while (line != null && line.length() != 0 && !line.startsWith(STRING) && !line.startsWith(REGEX)) {
                lineList.add(line.split(REGEX));
                line = inputReader.readLine();
                lineNumber++;
            }

            int length = lineList.size();

            if (length == 0) {
                inputReader.close();
                return null;
            }

            SentenceData09 it = new SentenceData09();

            it.forms = new String[length + 1];

            it.plemmas = new String[length + 1];
            it.gpos = new String[length + 1];
            it.labels = new String[length + 1];
            it.heads = new int[length + 1];
            it.pheads = new int[length + 1];
            it.plabels = new String[length + 1];

            it.ppos = new String[length + 1];
            it.lemmas = new String[length + 1];
            it.fillp = new String[length + 1];
            it.feats = new String[length + 1][];
            it.ofeats = new String[length + 1];
            it.pfeats = new String[length + 1];
            it.id = new String[length + 1];

            it.forms[0] = ROOT;
            it.plemmas[0] = ROOT_LEMMA;
            it.fillp[0] = "N";
            it.lemmas[0] = ROOT_LEMMA;

            it.gpos[0] = ROOT_POS;
            it.ppos[0] = ROOT_POS;
            it.labels[0] = NO_TYPE;
            it.heads[0] = -1;
            it.plabels[0] = NO_TYPE;
            it.pheads[0] = -1;
            it.ofeats[0] = NO_TYPE;
            it.id[0] = "0";

            // root is 0 therefore start with 1
            for (i = 1; i <= length; i++) {

                String[] info = lineList.get(i - 1);

                it.id[i] = info[0];
                it.forms[i] = info[1];

                // columns 2..4 are read next, so at least 5 columns are required
                if (info.length < 5) {
                    continue;
                }

                it.lemmas[i] = info[2];
                it.plemmas[i] = info[3];
                it.gpos[i] = info[4];

                // columns 5..12 are read unconditionally below, so at least 13 are required
                if (info.length < 13) {
                    continue;
                }
                it.ppos[i] = info[5];

                it.ofeats[i] = info[6].equals(CONLLWriter09.DASH) ? "_" : info[6];
                if (joint.length() > 0) {
                    it.ofeats[i] = selectJointFeatures(it.ofeats[i]);
                }

                if (info[7].equals(CONLLWriter09.DASH)) {
                    it.feats[i] = null;
                } else {
                    it.feats[i] = info[7].split(PIPE);
                    it.pfeats[i] = info[7];
                }

                // an underscore marks a missing head
                it.heads[i] = info[8].equals(US) ? -1 : Integer.parseInt(info[8]);
                it.pheads[i] = info[9].equals(US) ? -1 : Integer.parseInt(info[9]);

                it.labels[i] = info[10];
                it.plabels[i] = info[11];
                it.fillp[i] = info[12];

                if (info.length > 13) {
                    if (!info[13].equals(US)) {
                        it.addPredicate(i, info[13]);
                    }
                    for (int k = 14; k < info.length; k++) {
                        it.addArgument(i, k - 14, info[k]);
                    }
                }
            }
            return it;

        } catch (IOException | NumberFormatException e) {
            Parser.out.println("\n!!! Error in input file sentence before line: " + lineNumber + " (in sentence line " + i + " ) " + e.toString());
            e.printStackTrace();
            // NOTE(review): exit status 0 signals success to the calling process
            // although parsing failed; kept unchanged for compatibility.
            System.exit(0);

            // unreachable at run time, required by the compiler
            return null;
        }
    }

    /**
     * Reduces a pipe-separated feature string to the entries selected by the
     * static {@code joint} setting; must only be called when {@code joint} is
     * non-empty.
     *
     * @param features the feature string of one token
     * @return the selected entries joined by {@code |}, or {@code "_"} when
     *         nothing was selected
     */
    private String selectJointFeatures(String features) {
        StringBuilder b = new StringBuilder();
        String[] split = features.split(PIPE);

        if (joint.startsWith("cz")) {
            // all SubPOS entries first, then all Cas entries
            for (String s : split) {
                if (s.startsWith("SubPOS")) {
                    appendJointFeature(b, s);
                }
            }
            for (String s : split) {
                if (s.startsWith("Cas")) {
                    appendJointFeature(b, s);
                }
            }
        } else if (joint.contains("ger")) {
            // case and number values, in their original order
            for (String s : split) {
                if (s.matches("Nom|Acc|Dat|Gen") || s.matches("Sg|Pl")) {
                    appendJointFeature(b, s);
                }
            }
        } else {
            for (String s : split) {
                if (s.matches(joint)) {
                    // NOTE(review): unlike the branches above, this one always
                    // prepends the separator, so the result starts with "|".
                    // Kept as is because changing it alters the produced
                    // feature strings.
                    b.append("|").append(s);
                }
            }
        }

        if (b.length() == 0) {
            b.append("_");
        }
        return b.toString();
    }

    /**
     * Appends {@code feature} to {@code target}, inserting {@code |} before it
     * unless {@code target} is still empty.
     */
    private static void appendJointFeature(StringBuilder target, String feature) {
        if (target.length() > 0) {
            target.append("|");
        }
        target.append(feature);
    }

    /**
     * Copies one sentence into a newly created instance of {@code is}.
     *
     * <p>Fallbacks applied per token: a missing or underscore predicted POS is
     * replaced by the POS, a missing or underscore predicted lemma by the
     * form, and a missing lemma by the predicted lemma. Forms and the lemma
     * passed to {@code setLemma} go through {@code normalize}. The fill flag of
     * a token is set when its {@code fillp} value starts with {@code "Y"} and
     * cleared otherwise.
     *
     * <p>Exceptions raised while copying are printed and swallowed; the method
     * still returns {@code true} in that case.
     *
     * @param is the instance store that receives the sentence
     * @param it the sentence to copy; {@code null} signals the end of the input
     * @return {@code false} when {@code it} is {@code null}, otherwise {@code true}
     * @throws IOException if closing the input reader fails after a
     *         {@code null} sentence was passed
     */
    @Override
    final public boolean insert(Instances is, SentenceData09 it) throws IOException {

        // Handled outside the catch-all below: a reader without an open input
        // (inputReader == null) must still report "no sentence" as false
        // instead of failing with a swallowed NullPointerException.
        if (it == null) {
            if (inputReader != null) {
                inputReader.close();
            }
            return false;
        }

        try {

            int i = is.createInstance09(it.length());

            for (int p = 0; p < it.length(); p++) {

                is.setForm(i, p, normalize(it.forms[p]));
                is.setGPos(i, p, it.gpos[p]);

                // predicted POS, falling back to the POS column
                if (it.ppos[p] == null || it.ppos[p].equals(US)) {
                    is.setPPoss(i, p, it.gpos[p]);
                } else {
                    is.setPPoss(i, p, it.ppos[p]);
                }

                // predicted lemma, falling back to the form
                if (it.plemmas[p] == null || it.plemmas[p].equals(US)) {
                    is.setLemma(i, p, normalize(it.forms[p]));
                } else {
                    is.setLemma(i, p, normalize(it.plemmas[p]));
                }

                if (it.lemmas != null) {
                    // only null triggers the fallback; per the original note,
                    // also treating an underscore as missing harms the lemmatizer
                    if (it.lemmas[p] == null) {
                        is.setGLemma(i, p, it.plemmas[p]);
                    } else {
                        is.setGLemma(i, p, it.lemmas[p]);
                    }
                }

                if (it.feats != null && it.feats[p] != null) {
                    is.setFeats(i, p, it.feats[p]);
                }

                if (it.ofeats != null) {
                    is.setFeature(i, p, it.ofeats[p]);
                }
                if (it.pfeats != null) {
                    is.setPFeature(i, p, it.pfeats[p]);
                }

                is.setRel(i, p, it.labels[p]);
                if (it.plabels != null) {
                    is.setPRel(i, p, it.plabels[p]);
                }

                is.setHead(i, p, it.heads[p]);
                if (it.pheads != null) {
                    is.setPHead(i, p, it.pheads[p]);
                }

                if (it.fillp != null && it.fillp[p] != null && it.fillp[p].startsWith("Y")) {
                    is.pfill[i].set(p);
                } else {
                    is.pfill[i].clear(p);
                }
            }

            if (is.createSem(i, it)) {
                DB.println("count " + i + " len " + it.length());
                DB.println(it.printSem());
            }
        } catch (Exception e) {
            DB.println("head " + it);
            e.printStackTrace();
        }
        return true;
    }
}