package optimizer; import algorithmTester.AlgorithmTester; import java.io.*; import java.util.*; /** * * @author Miguel Ballesteros * */ public class CoNLLHandler { BufferedReader br; String trainingCorpus; String head0; int numberOfTrees; boolean cposEqPos; boolean lemmaBlank; boolean featsBlank; String danglingFreq = ""; public String getDanglingFreq() { return danglingFreq; } public void setDanglingFreq(String danglingFreq) { this.danglingFreq = danglingFreq; } String messageDivision = ""; public String getMessageDivision() { return messageDivision; } public void setMessageDivision(String messageDivision) { this.messageDivision = messageDivision; } int numbSentences; public static int numSentences; public int getNumbSentences() { return numbSentences; } public void setNumbSentences(int numbSentences) { this.numbSentences = numbSentences; } public int getNumbTokens() { return numbTokens; } public void setNumbTokens(int numbTokens) { CoNLLHandler.numbTokens = numbTokens; } public static int numbTokens; public boolean isFeatsBlank() { return featsBlank; } public HashMap<String, Double> getRootlabels() { return rootlabels; } public void setRootlabels(HashMap<String, Double> rootlabels) { this.rootlabels = rootlabels; } String training80; String testing20; HashMap<String, Double> rootlabels; TreeMap<String, String> tree; TreeMap<Integer, ArrayList<String>> invtree; private Writer writer = null; public CoNLLHandler(String trainingCorpus) { this(trainingCorpus, null); } public CoNLLHandler(String trainingCorpus, Writer writer) { this.writer = writer; rootlabels = new HashMap<>(); this.trainingCorpus = trainingCorpus; if (trainingCorpus.contains("/")) { StringTokenizer st = new StringTokenizer(trainingCorpus, "/"); String relPath = ""; while (st.hasMoreTokens()) { relPath = st.nextToken("/"); } //Echo(relPath); training80 = relPath.replaceAll(".conll", ""); testing20 = relPath.replaceAll(".conll", ""); training80 += "_train80.conll"; testing20 += "_test20.conll"; } else { training80 = trainingCorpus.replaceAll(".conll", ""); testing20 = trainingCorpus.replaceAll(".conll", ""); training80 += "_train80.conll"; testing20 += "_test20.conll"; } try { br = new BufferedReader(new FileReader(trainingCorpus)); } catch (FileNotFoundException e) { e.printStackTrace(); } } public int getNumberOfTrees() { return numberOfTrees; } public void setNumberOfTrees(int numberOfTrees) { this.numberOfTrees = numberOfTrees; } public int getNumberOfSentences() { return numberOfTrees; } /** * * @return A simple Text plain random sentence from the corpus. */ public String getSamplePlainText() { Random r = new Random(); int nrand = r.nextInt(15); //int nrand=0; boolean buscando = true; int i = 0; try { br = new BufferedReader(new FileReader(trainingCorpus)); } catch (FileNotFoundException e1) { e1.printStackTrace(); } while (buscando) { String line; try { line = br.readLine(); if (line != null && line.equals("")) { i++; if (i == nrand) { buscando = false; } } } catch (IOException e) { e.printStackTrace(); } } boolean nofinfrase = true; String cat = ""; while (nofinfrase) { String line; try { line = br.readLine(); //Echo(line); StringTokenizer st = new StringTokenizer(line); if (st.hasMoreTokens()) { st.nextToken(); } if (st.hasMoreTokens()) { cat += st.nextToken() + " "; } if (line != null && line.equals("")) { nofinfrase = false; } } catch (IOException e) { e.printStackTrace(); } } //Echo(cat); return cat; } /** * * @return true if the root labels are for the word ROOT, otherwise false. * */ public boolean rootLabels() { int numberRoots = 0; boolean diferentes = false; try { br = new BufferedReader(new FileReader(trainingCorpus)); while (br.ready()) { String line; try { line = br.readLine(); //Echo(line); String head = getColumn(line, 7); if (head.equals("0")) { numberRoots++; String root = getColumn(line, 8); Double d = rootlabels.get(root); if (d == null) { d = new Double(0); } d = d + 1.0; rootlabels.put(root, d); head0 = root; if (root.equals("ROOT")) { diferentes = true; } } } catch (IOException e) { e.printStackTrace(); } } } catch (IOException e) { e.printStackTrace(); } int numb = 0; Set<String> set = rootlabels.keySet(); Iterator<String> it = set.iterator(); while (it.hasNext()) { numb++; String s = it.next(); Double nr = new Double(numberRoots); rootlabels.put(s, rootlabels.get(s) / nr); } if (numb > 1) { return true; } return false; } public String getTraining80() { return training80; } public void setTraining80(String training80) { this.training80 = training80; } public String getTesting20() { return testing20; } public void setTesting20(String testing20) { this.testing20 = testing20; } /** * * @return true if the HEAD=0 for punctuation tokens is covered by another * arc, otherwise false. * */ public boolean danglingPunctuation() { int contNonProjLocal = 0; int numberOfCovered = 0; int sentencesCount = 0; boolean dangling = false; int contCoveredRoots = 0; TreeMap<String, String> treeDangling; TreeMap<String, String> treeForms; try { br = new BufferedReader(new FileReader(trainingCorpus)); treeDangling = new TreeMap<>(); treeForms = new TreeMap<>(); while (br.ready()) { String line; try { line = br.readLine(); //Echo(line); if (line == null || line.equals("")) { sentencesCount++; treeDangling = new TreeMap<>(); treeForms = new TreeMap<>(); if (dangling) { contCoveredRoots++; } dangling = false; } if (line != null && !line.equals("")) { String id = getColumn(line, 1); String head = getColumn(line, 7); treeDangling.put(id, head); String form = getColumn(line, 2); treeForms.put(id, form); int contador = 0; //if (head.equals("0")){ int intid = Integer.parseInt(id); int inthead = Integer.parseInt(head); contNonProjLocal = 0; if (inthead < intid) { //COVERED ROOTS!! for (int i = inthead + 1; i < intid; i++) { String numb = "" + i; String formN = treeForms.get(numb); if (formN.equals(",") || formN.equals(".") || formN.equals(";") || formN.equals("-") || formN.equals("\"") || formN.equals("'")) { //Echo(numb); String headOld = treeDangling.get(numb); //Echo(headOld); if (headOld != null) { int intHeadOld = Integer.parseInt(headOld); if (intHeadOld == 0) { if (headOld != null) { int hold = Integer.parseInt(headOld); if (hold < inthead) { contador++; contNonProjLocal++; } else if (hold > intid) { contador++; contNonProjLocal++; } } } } } } } } if (contNonProjLocal > 0) { //Echo(line); dangling = true; numberOfCovered += contNonProjLocal; contNonProjLocal = 0; //return true; } } catch (IOException e) { e.printStackTrace(); } } } catch (IOException e) { e.printStackTrace(); } //Echo(contCoveredRoots); double frequency = ((double) contCoveredRoots / (double) sentencesCount) * 100; String freq = String.format(Optimizer.pattern, frequency); //Echo("Frequency of Sentences with Covered Roots (dangling punctuation): "+freq); //Echo("Number of Covered Roots (punctuation): "+numberOfCovered); danglingFreq = "" + numberOfCovered; if (contCoveredRoots > 0) { return true; } return false; } /** * * @return true if the HEAD=0 covered by another Arc, otherwise false. * */ public boolean coveredRoots() { int contNonProjLocal = 0; int numberOfCovered = 0; int sentencesCount = 0; boolean dangling = false; int contCoveredRoots = 0; TreeMap<String, String> treeDangling; try { br = new BufferedReader(new FileReader(trainingCorpus)); treeDangling = new TreeMap<>(); while (br.ready()) { String line; try { line = br.readLine(); //Echo(line); if (line == null || line.equals("")) { sentencesCount++; treeDangling = new TreeMap<>(); if (dangling) { contCoveredRoots++; } dangling = false; } if (line != null && !line.equals("")) { String id = getColumn(line, 1); String head = getColumn(line, 7); treeDangling.put(id, head); String form = getColumn(line, 2); int contador = 0; //if (head.equals("0")){ int intid = Integer.parseInt(id); int inthead = Integer.parseInt(head); contNonProjLocal = 0; if (inthead < intid) { //COVERED ROOTS!! for (int i = inthead + 1; i < intid; i++) { String numb = "" + i; //Echo(numb); String headOld = treeDangling.get(numb); //Echo(headOld); if (headOld != null) { int intHeadOld = Integer.parseInt(headOld); if (intHeadOld == 0) { if (headOld != null) { int hold = Integer.parseInt(headOld); if (hold < inthead) { contador++; contNonProjLocal++; } else if (hold > intid) { contador++; contNonProjLocal++; } } } } } } } if (contNonProjLocal > 0) { //Echo(line); dangling = true; numberOfCovered += contNonProjLocal; contNonProjLocal = 0; //return true; } } catch (IOException e) { e.printStackTrace(); } } } catch (IOException e) { e.printStackTrace(); } double frequency = (double) contCoveredRoots / (double) sentencesCount; String freq = "" + frequency; if (freq.length() > 6) { freq = freq.substring(0, 6); } println("Frequency of Sentences with Covered Roots: " + freq); println("Number of Covered Roots: " + numberOfCovered); if (contCoveredRoots > 0) { return true; } return false; } /** * * @return true if the HEAD=0 covered by another Arc, otherwise false. * */ public boolean coveredRootsWithoutChildren() { String concat = ""; int contNonProjLocal = 0; int numberOfCovered = 0; int sentencesCount = 0; boolean dangling = false; int contCoveredRoots = 0; ArrayList<String> candidates = new ArrayList<>(); TreeMap<String, String> treeDangling; try { br = new BufferedReader(new FileReader(trainingCorpus)); treeDangling = new TreeMap<>(); while (br.ready()) { String line; try { line = br.readLine(); //Echo(line); if (line == null || line.equals("")) { sentencesCount++; Iterator<String> it = candidates.iterator(); boolean covered = true; if (!it.hasNext()) { covered = false; } while (it.hasNext()) { String c = it.next(); ///int candidato=Integer.parseInt(c); boolean esCovered = true; Set<String> idSet = treeDangling.keySet(); Iterator<String> ids = idSet.iterator(); while (ids.hasNext()) { String id = ids.next(); String headN = treeDangling.get(id); if (headN.equals(c)) { covered = false; esCovered = false; } } if (esCovered) { numberOfCovered++; } } //if (covered) println(concat); //concat=""; treeDangling = new TreeMap<>(); candidates = new ArrayList<>(); if (covered) { contCoveredRoots++; } dangling = false; } if (line != null && !line.equals("")) { //concat+=line+"\n"; String id = getColumn(line, 1); String head = getColumn(line, 7); treeDangling.put(id, head); String form = getColumn(line, 2); int contador = 0; //if (head.equals("0")){ int intid = Integer.parseInt(id); int inthead = Integer.parseInt(head); contNonProjLocal = 0; if (inthead < intid) { //COVERED ROOTS!! for (int i = inthead + 1; i < intid; i++) { String numb = "" + i; //Echo(numb); String headOld = treeDangling.get(numb); //Echo(headOld); if (headOld != null) { int intHeadOld = Integer.parseInt(headOld); if (intHeadOld == 0) { if (headOld != null) { int hold = Integer.parseInt(headOld); if (hold < inthead) { contador++; contNonProjLocal++; candidates.add(numb); } else if (hold > intid) { contador++; contNonProjLocal++; candidates.add(numb); } } } } } } } if (contNonProjLocal > 0) { //Echo(line); dangling = true; contNonProjLocal = 0; //return true; } } catch (IOException e) { e.printStackTrace(); } } } catch (IOException e) { e.printStackTrace(); } //Echo(contCoveredRoots); //Echo(numbSentences); double frequency = (double) contCoveredRoots / (double) sentencesCount; //Echo(frequency); String freq = "" + frequency; if (freq.length() > 6) { freq = freq.substring(0, 6); } println("Frequency of Sentences with Covered Roots (without children): " + freq); println("Number of Covered Roots (without children): " + numberOfCovered); if (contCoveredRoots > 0) { return true; } return false; } public String extraDataCharacteristics() { String characteristics = ""; int numTokens = 0; int sentencesCount = 0; cposEqPos = true; lemmaBlank = true; featsBlank = true; try { br = new BufferedReader(new FileReader(trainingCorpus)); while (br.ready()) { String line; try { line = br.readLine(); if ((line == null) || (line.equals(""))) { sentencesCount++; } else if (line != null && (!line.equals(""))) { numTokens++; String cpos = getColumn(line, 4); String pos = getColumn(line, 5); if (!pos.equals(cpos)) { cposEqPos = false; } String lemma = getColumn(line, 3); if (!lemma.equals("_")) { lemmaBlank = false; } String feats = getColumn(line, 3); if (!feats.equals("_")) { //Echo(feats); featsBlank = false; } } } catch (IOException e) { e.printStackTrace(); } } } catch (IOException e) { e.printStackTrace(); } println("Your training set consists of " + sentencesCount + " sentences and " + numTokens + " tokens."); println("Testing Java Heap ... "); //JAVA HEAP HANDLING boolean validHeap = false; String javaHeap = ""; AlgorithmTester at = new AlgorithmTester("lang", this, this.trainingCorpus, writer); int val = numTokens + 1; int nMaxTokens = val; //Echo(nMaxTokens); while (!validHeap) { javaHeap = calculateJavaHeapValue(nMaxTokens); generateDivision8020(); /* * println("Trying with "+Optimizer.javaHeapValue); println(Optimizer.nMaxTokens); */ validHeap = at.executeCovNonProjEagerDefaultJavaHeapTesting("CovingtonNonProjective.xml"); if (!validHeap) { if (nMaxTokens > 700000) { nMaxTokens = 700000; } else if (nMaxTokens > 650000) { nMaxTokens = 650000; } else if (nMaxTokens > 500000) { nMaxTokens = 500000; } else { nMaxTokens -= 20000; } Optimizer.nMaxTokens = nMaxTokens; if (nMaxTokens < 0) { println("MaltParser cannot work."); System.exit(0); } } //Echo(nMaxTokens); } characteristics += javaHeap; if (nMaxTokens != val) { println("MaltOptimizer inferred that your system cannot allocate enough memory to run experiments with the whole corpus."); //Echo("MaltOptimizer will reduce the size of the training set."); double percentage = (double) nMaxTokens * 100 / numTokens; String perc = String.format(Optimizer.pattern, percentage); //Echo("The performance is going to be affected by this fact."); //Echo("We recommend the use of a system with higher memory allocation."); println("MaltOptimizer will reduce the size of the training set and use only " + nMaxTokens + " tokens (" + perc + "%)."); Optimizer.nMaxTokens = nMaxTokens; } //Echo(Optimizer.nMaxTokens); //calculateJavaHeapValue(numTokens); // if (cposEqPos) { characteristics += "CPOSTAG and POSTAG are identical in your training set.\n"; Optimizer.cposEqPos = true; } else { characteristics += "CPOSTAG and POSTAG are distinct in your training set.\n"; Optimizer.cposEqPos = false; } if (lemmaBlank) { characteristics += "The LEMMA column is not used in your training set.\n"; Optimizer.lemmaBlank = true; } else { characteristics += "The LEMMA column is used in your training set.\n"; Optimizer.lemmaBlank = false; } if (featsBlank) { characteristics += "The FEATS column is not used in your training set."; Optimizer.featsBlank = true; } else { characteristics += "The FEATS column is used in your training set."; Optimizer.featsBlank = false; } CoNLLHandler.numbTokens = numTokens; CoNLLHandler.numSentences = sentencesCount; return characteristics; } private String calculateJavaHeapValue(int numTokens) { if (numTokens <= 70000) { Optimizer.javaHeapValue = "-Xmx2048m"; return "MaltOptimizer has inferred that MaltParser needs at least 1.5Gb of free memory.\n"; } else if (numTokens <= 100000) { Optimizer.javaHeapValue = "-Xmx2560m"; return "MaltOptimizer has inferred that MaltParser needs at least 2Gb of free memory.\n"; } else if (numTokens <= 150000) { Optimizer.javaHeapValue = "-Xmx3072m"; return "MaltOptimizer has inferred that MaltParser needs at least 2.5Gb of free memory.\n"; } /* * else if (numTokens<=250000) { Optimizer.javaHeapValue="-Xmx3072m"; * return "MaltOptimizer has inferred that MaltParser needs at least 3Gb * of free memory.\n"; } */ else if (numTokens <= 350000) { Optimizer.javaHeapValue = "-Xmx5120m"; return "MaltOptimizer has inferred that MaltParser needs at least 4Gb of free memory.\n"; } else if (numTokens <= 400000) { Optimizer.javaHeapValue = "-Xmx6144m"; return "MaltOptimizer has inferred that MaltParser needs at least 5Gb of free memory.\n"; } else if (numTokens <= 450000) { Optimizer.javaHeapValue = "-Xmx7168m"; return "MaltOptimizer has inferred that MaltParser needs at least 6Gb of free memory.\n"; } else if (numTokens <= 500000) { Optimizer.javaHeapValue = "-Xmx8192m"; return "MaltOptimizer has inferred that MaltParser needs at least 7Gb of free memory.\n"; } else if (numTokens <= 600000) { Optimizer.javaHeapValue = "-Xmx10240m"; return "MaltOptimizer has inferred that MaltParser needs at least 8Gb of free memory.\n"; } /* * else if (numTokens<=6500000) { Optimizer.javaHeapValue="-Xmx20480m"; * return "MaltOptimizer has inferred that MaltParser needs at least 9Gb * of free memory.\n"; } */ else if (numTokens <= 700000) { Optimizer.javaHeapValue = "-Xmx16384m"; return "MaltOptimizer has inferred that MaltParser needs at least 16Gb of free memory.\n"; }/* * else if (numTokens<=1200000){ Optimizer.javaHeapValue="-Xmx20480m"; * return "MaltOptimizer has inferred that MaltParser needs at least * 16Gb of free memory.\n"; } */ else { Optimizer.javaHeapValue = "-Xmx20480m"; return "MaltOptimizer has inferred that MaltParser needs at least 20Gb of free memory.\n"; } //AlgorithmTester at=new AlgorithmTester(); } /** * * @return the percentage of non-projective trees in the training set * */ public double projectiveOrNonProjective() { //CROSSING EDGES--->contador++ int numberOfNonProjectives = 0; int treesCount = 0; int contador = 0; int anteriorHead = -1; int anteriorId = -1; int contProjectivities = 0; int numberOfArcs = 0; String cat = ""; int cont = 0; tree = new TreeMap<>(); invtree = new TreeMap<>(); try { br = new BufferedReader(new FileReader(trainingCorpus)); try { while (br.ready()) { String line; line = br.readLine(); if (line != null && line.equals("")) { cont++; treesCount++; //Echo(cat); cat = ""; boolean nonprojective = false; if (cont > 0) { nonprojective = inorder(invtree); } //if (contador>0){ if (nonprojective) { //Echo("Non Projective"); numberOfNonProjectives++; //number of non-projectivities in the previous sentence contador = 0; } tree = new TreeMap<>(); invtree = new TreeMap<>(); } if (line != null && (!line.equals(""))) { cat += getColumn(line, 2) + " "; numberOfArcs++; String head = getColumn(line, 7); String id = getColumn(line, 1); tree.put(id, head); Integer ihead = Integer.parseInt(head); ArrayList<String> children = invtree.get(ihead); if (children == null) { children = new ArrayList<>(); } children.add(id); invtree.put(ihead, children); } } /* * int intid = Integer.parseInt(id); int * inthead=Integer.parseInt(head); int contNonProjLocal=0; if * (inthead<intid) { //Se trata de encontrar una dependencia que * vaya por delante de donde depende este. //Es decir, //John * saw a dog yesterday which was a Yorkshire terrier. //Estamos * en WAS (7) //Was tiene como HEAD DOG (4) (4<7) //Pero * Yesterday (5) que está por delante de dog (4) tiene como HEAD * Saw (2) //Los arcos de was a dog y de yesterday a Saw se * CRUZAN... //Esto es lo que hay que detectar for(int * i=inthead+1;i<intid;i++){ String numb=""+i; String * headOld=tree.get(numb); if (headOld!=null) { int * intHeadOld=Integer.parseInt(headOld); * * if (headOld!=null) { int hold=Integer.parseInt(headOld); if * (hold<inthead) { contador++; contNonProjLocal++; } else if * (hold>intid) { contador++; contNonProjLocal++; } } } } //if * (contNonProjLocal==0){ for(int i=inthead+1;i<intid;i++){ * String numb=""+i; String headOld=tree.get(numb); if * (headOld!=null) { int intHeadOld=Integer.parseInt(headOld); * /*System.out.print("("+numb+","); * println(headOld+")"); if (!head.equals("0") && * (!(headOld.equals(head)))){//&& (!(headOld.equals(id)))) { * //Echo(line); boolean * out=expandNodeRoot(intHeadOld,inthead); //is I "dominated" by * head? (in a transitive way) //if i is dominated by head * return false, else return true and it is a non projective arc * //(numb is i) if (out) { //Echo("GOOD!"); * contador++; contNonProjLocal++; } } } } //} * * * * //}// * * * /*if (headOld!=null) { int hold=Integer.parseInt(headOld); if * (hold<inthead) { contador++; contNonProjLocal++; } else if * (hold>intid) { contador++; contNonProjLocal++; } /*else { * String headOldS=tree.get(headOld); //father of headOld if * (headOldS!=null){ Integer holds=Integer.parseInt(headOldS); * if (holds<inthead) { contador++; contNonProjLocal++; } /*else * if (holds>intid) { contador++; contNonProjLocal++; * * } * } * * } * } * * if (contNonProjLocal>0) { contProjectivities++; * //Echo("NonProjective Arc:"+getColumn(line,2) * +" head:"+head); } * * //} /*else if (inthead>intid) { for(int i=inthead-1;i>1;i--){ * String numb=""+i; String headOld=tree.get(numb); if * (headOld!=null) { int hold=Integer.parseInt(headOld); if * (hold<inthead) { contador++; } } * * } * } * anteriorHead=inthead; anteriorId=intid; } */ } catch (IOException e) { e.printStackTrace(); } } catch (FileNotFoundException e) { e.printStackTrace(); } double value = 0; /* * this.numberOfTrees=numberOfTrees; if (this.numberOfTrees>0){ * value=((double)numberOfNonProjectives/(double)numberOfTrees)*100; } */ double k = ((double) numberOfNonProjectives / (double) treesCount) * 100; String ntrees = String.format(Optimizer.pattern, k); //Echo(k); //Echo(ntrees+"% of the trees contain non-projective arcs"); this.numberOfTrees = treesCount; this.numbSentences = treesCount; /* * if (this.numberOfTrees>0){ * value=((double)contProjectivities/(double)numberOfArcs)*100; } */ //Echo(value); return k;//(numberOfNonProjectives/numberOfTrees)*100; } private boolean inorder(TreeMap<Integer, ArrayList<String>> tree2) { //leftmost child String cat = ""; Set<Integer> keySet = tree2.keySet(); //Echo(keySet); Integer head = 0; ArrayList<String> children = tree2.get(head); cat += getSubtree(tree2, head, children); //Echo(cat); StringTokenizer st = new StringTokenizer(cat); Integer anterior = 0; while (st.hasMoreTokens()) { String s = st.nextToken(); Integer is = Integer.parseInt(s); Integer shouldBe = anterior + 1; if ((is != (shouldBe))) { if (cat.contains(shouldBe.toString())) { return true; } //else // anterior=shouldBe; } anterior = is; } //Echo(cat); return false; } private String getSubtree(TreeMap<Integer, ArrayList<String>> tree2, Integer head, ArrayList<String> children) { String cat = ""; //Echo("("+head+","+children+")"); Iterator<String> itch = children.iterator(); if (!itch.hasNext()) { cat += " " + head; } else { while (itch.hasNext()) { String child = itch.next(); Integer intChild = Integer.parseInt(child); ArrayList<String> nietos = tree2.get(intChild); if (head < intChild && (!cat.contains(head.toString())) && (head != 0)) { cat += " " + head; } if (nietos == null) { cat += " " + child; //if ((intChild==head-1)) if ((intChild == head - 1) && (!cat.contains(head.toString()))) { cat += " " + head; } } else { /* * if (head!=0) cat+=getSubtree(tree2,intChild,nietos) + * head.toString(); else */ cat += getSubtree(tree2, intChild, nietos); } } } if (!cat.contains(head.toString())) { cat += " " + head; } return cat; } private boolean expandNodeRoot(int head, int originalHead) { if (head == 0) { return true; //si llegas a ROOT antes que a originalHead el arco original es non-projective } if (head == originalHead) { return false; } /* * String strHead=""+head; String nhead=tree.get(strHead); */ int nIntHead = head; while (nIntHead != 0) { String strHeadN = "" + nIntHead; String nheadN = tree.get(strHeadN); //Echo("("+originalHead+","+nheadN+")"); if (nheadN == null) { return true; } nIntHead = Integer.parseInt(nheadN); if (nIntHead == 0) { return true; //si llegas a ROOT antes que a originalHead el arco original es non-projective } if (nIntHead == originalHead) { return false; } } /* * if (nhead!=null) { //Echo(nhead); int * nIntHead=Integer.parseInt(nhead); return expandNodeRoot(nIntHead, * originalHead); } */ return true; } private String getColumn(String line, int columna) { StringTokenizer st = new StringTokenizer(line, "\t"); String ret = ""; for (int i = 0; i < columna; i++) { if (st.hasMoreTokens()) { ret = st.nextToken(); } } return ret; } public String getHead0() { return head0; } public void generateDivision8020() { //CROSSING EDGES--->contador++ //Echo("Generating training and test corpus"); double percent80 = 0.8 * numberOfTrees; int numbLinesTrain = 0; int numbSentencesTest = 0; int numbSentencesTrain = 0; double percent20 = 0.2 * numberOfTrees; String concatTrain = ""; String concatTest = ""; int trainTimes = 0; String lastTest = ""; String line; boolean trainTurn = false; int ntokens = 0; int ntokensTrain = 0; int ntokensTest = 0; //4 for train, 1 for test try { br = new BufferedReader(new FileReader(trainingCorpus)); BufferedWriter bwTrain = new BufferedWriter(new FileWriter(training80)); BufferedWriter bwTest = new BufferedWriter(new FileWriter(testing20)); boolean metido = false; try { while (br.ready()) { line = br.readLine(); ntokens++; if (line != null && line.equals("")) { ntokens++; if (trainTimes < 4) { trainTimes++; trainTurn = true; numbSentencesTrain++; } else { trainTimes = 0; trainTurn = false; numbSentencesTest++; } int total = numbSentencesTrain + numbSentencesTest; /* * if (total%150==0) System.out.print("."); */ } if (ntokens < Optimizer.nMaxTokens - 10000) { if (trainTurn) { if (numbLinesTrain > 0) { //concatTrain+=line+"\n"; //Echo(line+"\n"); bwTrain.write(line + "\n"); ntokensTrain++; } numbLinesTrain++; } else { bwTest.write(line + "\n"); ntokensTest++; lastTest = line; //concatTest+=line+"\n"; } } else { if (!metido && !line.equals("")) { if (trainTurn) { if (numbLinesTrain > 0) { //concatTrain+=line+"\n"; //Echo(line+"\n"); bwTrain.write(line + "\n"); ntokensTrain++; } numbLinesTrain++; } else { bwTest.write(line + "\n"); ntokensTrain++; lastTest = line; //concatTest+=line+"\n"; } } else { if (line.equals("")) { metido = true; } } } } //Echo(""); //bwTrain.write(concatTrain); bwTrain.close(); //bwTest.write(concatTest); //bwTest.write("\n"); //bwTest.write("\n"); if (!lastTest.equals("")) { bwTest.write("\n"); } bwTest.close(); numbSentencesTest++; numbSentencesTrain--; /* * println("Testing Set of "+numbSentencesTest +" * sentences generated"); println("Training Set of * "+numbSentencesTrain +" sentences generated"); */ messageDivision = "Generated training set (" + ntokensTrain + " tokens) and devtest set (" + ntokensTest + " tokens)."; } catch (IOException e) { e.printStackTrace(); } } catch (FileNotFoundException e) { e.printStackTrace(); } //Echo("\nCorpora generated"); catch (IOException e) { e.printStackTrace(); } } public void generate5FoldCrossCorpora() { if (!Optimizer.pseudoRandomizeSelection) { this.generate5FoldCrossCorporaNoPseudo(); } else { this.generate5FoldCrossCorporaPseudo(); } } public void generate5FoldCrossCorporaNoPseudo() { //Generate 5 Small Folds and 5 corresponding big folds. Save the names as follows. //fold_train1 //fold_test1 //fold_train2 //fold_test2 //fold_train3 //... //CROSSING EDGES--->contador++ //Echo("Generating training and test corpus"); int numbLinesTrain = 0; int numbSentencesTest = 0; int numbSentencesTrain = 0; //this.n int numberOfLines = Optimizer.numbTokens + Optimizer.numbSentences; /* * println(Optimizer.numbTokens); * println(Optimizer.numbSentences); * println(numberOfLines); * /*println(numberOfLines); println(numSentences); */ double percent20 = 0.2 * (numberOfLines); double percent40 = 0.4 * (numberOfLines); double percent60 = 0.6 * (numberOfLines); double percent80 = 0.8 * (numberOfLines); /* * println(percent20); println(percent40); * println(percent60); println(percent80); println(numberOfLines); */ //1 empieza en 0 para test //2 empieza en 20% para test //3 empieza en 40% para test //4 empieza en 60% para test //5 empieza en 60% para test double limitInfTest = 0; double limitSupTest = percent20; for (int i = 1; i < 6; i++) { String training = "fold_train_" + i + ".conll"; String test = "fold_test_" + i + ".conll"; int trainTimes = 0; String lastTest = ""; String line; boolean trainTurn = false; int ntokens = 0; //4 for train, 1 for test try { br = new BufferedReader(new FileReader(trainingCorpus)); BufferedWriter bwTrain = new BufferedWriter(new FileWriter(training)); BufferedWriter bwTest = new BufferedWriter(new FileWriter(test)); try { boolean testTurn = false; if (i == 1) { limitInfTest = 0; limitSupTest = percent20; testTurn = true; } if (i == 2) { limitInfTest = percent20; limitSupTest = percent40; } if (i == 3) { limitInfTest = percent40; limitSupTest = percent60; } if (i == 4) { limitInfTest = percent60; limitSupTest = percent80; } if (i == 5) { limitInfTest = percent80; limitSupTest = numberOfLines; } int nSentencesTest = 0; String concat = ""; while (br.ready()) { line = br.readLine(); ntokens++; if (!line.equals("")) { concat += line + "\n"; } else { if (ntokens > limitInfTest && ntokens <= limitSupTest) { bwTest.write(concat + "\n"); concat = ""; } else { bwTrain.write(concat + "\n"); concat = ""; } } } //Echo(""); //bwTrain.write(concatTrain); bwTrain.close(); //bwTest.write(concatTest); //bwTest.write("\n"); //bwTest.write("\n"); if (!lastTest.equals("")) { bwTest.write("\n"); } bwTest.close(); numbSentencesTest++; numbSentencesTrain--; //Echo("Test Fold "+i +"; Train Fold "+i+" generated"); messageDivision = "Test Fold " + i + "; Train Fold " + i + " generated"; } catch (IOException e) { e.printStackTrace(); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } public void generate5FoldCrossCorporaPseudo() { //CROSSING EDGES--->contador++ int numbLinesTrain = 0; int numbSentencesTest = 0; int numbSentencesTrain = 0; int trainTimes = 0; String lastTest = ""; String line; boolean trainTurn = false; boolean firstTime = true; int ntokens = 0; //4 for train, 1 for test for (int i = 1; i < 6; i++) { String training = "fold_train_" + i + ".conll"; String test = "fold_test_" + i + ".conll"; int contLinesTest = 0; try { br = new BufferedReader(new FileReader(trainingCorpus)); BufferedWriter bwTrain = new BufferedWriter(new FileWriter(training)); BufferedWriter bwTest = new BufferedWriter(new FileWriter(test)); try { while (br.ready()) { line = br.readLine(); ntokens++; if (line != null && line.equals("")) { ntokens++; if ((trainTimes < i) || (trainTimes > i)) { trainTurn = true; numbSentencesTrain++; if (trainTimes == 5) { trainTimes = 0; } else { trainTimes++; } } else { //trainTimes==i trainTurn = false; firstTime = true; numbSentencesTest++; if (trainTimes == 5) { trainTimes = 0; } else { trainTimes++; } } } //if (ntokens<Optimizer.nMaxTokens) { if (trainTurn) { if (numbLinesTrain > 0) { //concatTrain+=line+"\n"; bwTrain.write(line + "\n"); } numbLinesTrain++; } else { if (contLinesTest == 0 && line.equals("")) { firstTime = false; } else { bwTest.write(line + "\n"); lastTest = line; contLinesTest++; } //concatTest+=line+"\n"; } //} } //Echo(""); //bwTrain.write(concatTrain); bwTrain.close(); if (!lastTest.equals("")) { bwTest.write("\n"); } bwTest.close(); numbSentencesTest++; numbSentencesTrain--; /* * println("Testing Set of "+numbSentencesTest +" * sentences generated"); println("Training Set * of "+numbSentencesTrain +" sentences generated"); */ //Echo("FOLD: "+i+" generated."); //messageDivision="Generated training set ("+numbSentencesTrain +" sentences) and test set ("+numbSentencesTest +" sentences)."; } catch (IOException e) { e.printStackTrace(); } } catch (FileNotFoundException e) { e.printStackTrace(); } //Echo("\nCorpora generated"); catch (IOException e) { e.printStackTrace(); } } println("Five cross-validation folds generated."); } public double evaluator(String testCorpus, String goldStandard) { switch (Optimizer.evaluationMeasure) { case "LAS": return evalLAS(testCorpus, goldStandard); case "UAS": return evalUAS(testCorpus, goldStandard); case "LCM": return evalLCM(testCorpus, goldStandard); case "UCM": return evalUCM(testCorpus, goldStandard); } return evalLAS(testCorpus, goldStandard); } public static double evalLAS(String testCorpus, String goldStandard) { return evalX(testCorpus, goldStandard, EvalType.LAS); } public double evalUAS(String testCorpus, String goldStandard) { return evalX(testCorpus, goldStandard, EvalType.UAS); } private enum EvalType { LAS, UAS } private static double evalX(String testCorpus, String goldStandard, EvalType type) { int total = 0; int ucorrect = 0; int lcorrect = 0; try { BufferedReader ss; try (BufferedReader gs = new BufferedReader(new FileReader(goldStandard))) { ss = new BufferedReader(new FileReader(testCorpus)); String gl, sl; int lineCount = 0; while ((gl = gs.readLine()) != null) { lineCount++; gl = gl.trim(); sl = ss.readLine(); sl = sl.trim(); if (gl.length() == 0 && sl.length() != 0) { throw new RuntimeException("GOLD sentence ended before Test at line " + lineCount); } if (gl.length() != 0 && sl.length() == 0) { throw new RuntimeException("Test sentence ended before GOLD at line " + lineCount); } if (gl.length() == 0) { continue; // EOS } String[] gtoks = gl.split("[\t]+"); String[] stoks = sl.split("[\t]+"); if (gtoks.length <= Math.min(6, 7)) { gs.close(); ss.close(); return -1f; } int ghead = Integer.parseInt(gtoks[6]); String glabel = normLabel(gtoks[7]); int shead = Integer.parseInt(stoks[6]); String slabel = normLabel(stoks[7]); total++; if (ghead == shead) { ucorrect++; if (glabel.equalsIgnoreCase(slabel)) { lcorrect++; } } } } ss.close(); } catch(IOException | RuntimeException e) { return -1f; } switch(type) { case LAS: return 100.0 * (double) lcorrect / (double) total; case UAS: return 100.0 * (double) ucorrect / (double) total; } return -1f; } public static String normLabel(String l) { if (l.equalsIgnoreCase("null")) { return "root"; } return l.toLowerCase(); } public double evalLCM(String testCorpus, String goldStandard) { BufferedReader tc; BufferedReader gs; double correctNodes = 0; double totNodes = 0; double totSentences = 0; double correctSentences = 0; //4 for train, 1 for test try { tc = new BufferedReader(new FileReader(testCorpus)); gs = new BufferedReader(new FileReader(goldStandard)); try { String lineTc; String lineGs; while (tc.ready() && gs.ready()) { lineTc = tc.readLine(); lineGs = gs.readLine(); if ((lineTc != null && lineGs != null) && (!lineTc.equals("") && !lineGs.equals(""))) { String tok = getColumn(lineTc, 2); String headTc = getColumn(lineTc, 7); String headGs = getColumn(lineGs, 7); String deprelTc = getColumn(lineTc, 8); String deprelGs = getColumn(lineGs, 8); //if (!((tok.equals(","))||(tok.equals("."))||(tok.equals(":"))||(tok.equals("-"))||(tok.equals(";"))||(tok.equals("'"))||(tok.equals('"'))||(tok.equals("^"))||(tok.equals("-"))||(tok.equals("..."))||(tok.equals("_")))) { if (!Optimizer.includePunctuation) { if (tok.length() == 1) { if (Character.getType(tok.charAt(0)) != 20 && Character.getType(tok.charAt(0)) != 21 && Character.getType(tok.charAt(0)) != 22 && Character.getType(tok.charAt(0)) != 23 && Character.getType(tok.charAt(0)) != 24 && Character.getType(tok.charAt(0)) != 29 && Character.getType(tok.charAt(0)) != 30) { /* * totNodes=totNodes+1.0; if * (headTc.equals(headGs) && * deprelTc.equals(deprelGs)) { * correctNodes=correctNodes+1.0; } */ } else { totNodes = totNodes + 1.0; if (headTc.equals(headGs) && deprelTc.equals(deprelGs)) { correctNodes = correctNodes + 1.0; } } } else { totNodes = totNodes + 1.0; if (headTc.equals(headGs) && deprelTc.equals(deprelGs)) { correctNodes = correctNodes + 1.0; } } } else { totNodes = totNodes + 1.0; if (headTc.equals(headGs) && deprelTc.equals(deprelGs)) { correctNodes = correctNodes + 1.0; } } //} } else { totSentences++; if (totNodes == correctNodes) { correctSentences++; } totNodes = 0; correctNodes = 0; } } } catch (IOException e) { e.printStackTrace(); } } catch (FileNotFoundException e) { e.printStackTrace(); } /* * println(correctNodes); println(totNodes); */ println("\tLAS:" + evalLAS(testCorpus, goldStandard)); return (correctSentences / totSentences) * 100; } public double evalUCM(String testCorpus, String goldStandard) { BufferedReader tc; BufferedReader gs; double correctNodes = 0; double totNodes = 0; double totSentences = 0; double correctSentences = 0; //4 for train, 1 for test try { tc = new BufferedReader(new FileReader(testCorpus)); gs = new BufferedReader(new FileReader(goldStandard)); try { String lineTc; String lineGs; while (tc.ready() && gs.ready()) { lineTc = tc.readLine(); lineGs = gs.readLine(); if ((lineTc != null && lineGs != null) && (!lineTc.equals("") && !lineGs.equals(""))) { String tok = getColumn(lineTc, 2); String headTc = getColumn(lineTc, 7); String headGs = getColumn(lineGs, 7); String deprelTc = getColumn(lineTc, 8); String deprelGs = getColumn(lineGs, 8); //if (!((tok.equals(","))||(tok.equals("."))||(tok.equals(":"))||(tok.equals("-"))||(tok.equals(";"))||(tok.equals("'"))||(tok.equals('"'))||(tok.equals("^"))||(tok.equals("-"))||(tok.equals("..."))||(tok.equals("_")))) { if (!Optimizer.includePunctuation) { if (tok.length() == 1) { if (Character.getType(tok.charAt(0)) != 20 && Character.getType(tok.charAt(0)) != 21 && Character.getType(tok.charAt(0)) != 22 && Character.getType(tok.charAt(0)) != 23 && Character.getType(tok.charAt(0)) != 24 && Character.getType(tok.charAt(0)) != 29 && Character.getType(tok.charAt(0)) != 30) { /* * totNodes=totNodes+1.0; if * (headTc.equals(headGs)) { * correctNodes=correctNodes+1.0; } */ } else { totNodes = totNodes + 1.0; if (headTc.equals(headGs)) { correctNodes = correctNodes + 1.0; } } } else { totNodes = totNodes + 1.0; if (headTc.equals(headGs)) { correctNodes = correctNodes + 1.0; } } } else { totNodes = totNodes + 1.0; if (headTc.equals(headGs)) { correctNodes = correctNodes + 1.0; } } } else { totSentences++; if (totNodes == correctNodes) { correctSentences++; } totNodes = 0; correctNodes = 0; } } } catch (IOException e) { e.printStackTrace(); } } catch (FileNotFoundException e) { e.printStackTrace(); } /* * println(correctNodes); println(totNodes); */ println("\tLAS:" + evalLAS(testCorpus, goldStandard)); return (correctSentences / totSentences) * 100; } public static void main(String[] args) { CoNLLHandler ch = new CoNLLHandler("goldtags.train_renumber_root_0"); //ch.extraDataCharacteristics(); //ch.generate5FoldCrossCorporaPseudo(); System.out.println(ch.projectiveOrNonProjective()); //Optimizer.includePunctuation=false; //double eval=ch.evaluator("spanish_cast3lb_train_test20.conll","outNivreEager.conll"); //System.out.println(eval); } public void print(String text) { Optimizer.out.print(text); if (writer != null) { try { writer.write(text); } catch (Exception ex) {} } } public void println(String text) { Optimizer.out.println(text); if (writer != null) { try { writer.write(text + "\n"); } catch (Exception ex) {} } } }