package is2.data; import is2.parser.Parser; import is2.util.DB; import java.util.ArrayList; import java.util.Collections; /** * @author Dr. Bernd Bohnet, 17.01.2011 * * */ public class PSTree { int wordCount = 0; public String entries[]; public String lemmas[]; public int head[]; public String pos[]; public int[] ok; public int non; public int terminalCount; public String[] morph; public int[] forms; public int[] phrases; public int[][] psfeats; public int[] ppos; /** * @param d */ public PSTree(SentenceData09 d) { create(d.length() - 1, d.length() * 20); for (int i = 1; i < d.length(); i++) { entries[i - 1] = d.forms[i]; pos[i - 1] = d.ppos[i]; } } /** * Create an undefined phrase tree */ public PSTree() { } /** * @param terminals * @param nonTerminals */ public void create(int terminals, int nonTerminals) { entries = new String[terminals + nonTerminals]; pos = new String[terminals + nonTerminals]; head = new int[terminals + nonTerminals]; lemmas = new String[terminals + nonTerminals]; morph = new String[terminals + nonTerminals]; non = terminals; wordCount = terminals; for (int i = terminals + 1; i < head.length; i++) { head[i] = -1; } } @Override public String toString() { StringBuilder s = new StringBuilder(); for (int i = 0; i < entries.length; i++) { if (head[i] == -1 && entries[i] == null) { break; } s.append(i).append("\t").append(pos[i]).append("\t").append(entries[i]).append("\t").append(head[i]).append((ok == null ? "" : ("\t" + (ok[i] == 1)))).append(" \n"); } // DB.println("entries "+entries.length); return s.toString(); } /** * @return */ public boolean containsNull() { for (int k = 0; k < wordCount - 1; k++) { if (entries[k] == null) { return true; } } return false; } public int equals(SentenceData09 s) { int j = 1; // starts with root for (int i = 0; i < terminalCount - 1; i++) { // if (s.forms[j].equals("erschrekkend")) s.forms[j]="erschreckend"; if (s.forms.length < j) { DB.println("" + s + " " + this.toString()); return i; } if (!entries[i].equals(s.forms[j])) { // Parser.out.println("ps "+entries[i]+" != ds "+s.forms[j]); // Rolls-Royce if (entries[i].startsWith(s.forms[j]) && s.forms.length > i + 2 && s.forms[j + 1].equals("-")) { j += 2; if (entries[i].contains(s.forms[j - 1]) && s.forms.length > i + 3 && s.forms[j + 1].equals("-")) { j += 2; // && // Parser.out.println("s.forms[j] "+s.forms[j]+" s.forms[j-1] "+s.forms[j-1]+" "+entries[i]); if (entries[i].contains(s.forms[j - 1]) && s.forms.length > i + 3 && s.forms[j + 1].equals("-")) { j += 2; // && // Parser.out.println("s.forms[j] "+s.forms[j]+" s.forms[j-1] "+s.forms[j-1]+" "+entries[i]); } } //Interstate\/Johnson } else if (entries[i].startsWith(s.forms[j]) && s.forms.length > i + 2 && s.forms[j + 1].equals("/")) { j += 2; if (entries[i].contains(s.forms[j - 1]) && s.forms.length > i + 3 && s.forms[j + 1].equals("/")) { j += 2; // && // Parser.out.println("s.forms[j] "+s.forms[j]+" s.forms[j-1] "+s.forms[j-1]+" "+entries[i]); } // U.S.-Japan -> U . S . - Japan } else if (entries[i].startsWith(s.forms[j]) && s.forms.length > i + 2 && s.forms[j + 1].equals(".")) { j += 2; if (entries[i].contains(s.forms[j - 1]) && s.forms.length > i + 3 && s.forms[j + 1].equals(".")) { j += 2; // && // Parser.out.println("s.forms[j] "+s.forms[j]+" s.forms[j-1] "+s.forms[j-1]+" "+entries[i]); } } else if (entries[i].startsWith(s.forms[j]) && s.forms.length > i + 1 && s.forms[j + 1].equals("'S")) { j += 1; } else { // chech those !!! // Parser.out.print("entry "+entries[i]+" form "+s.forms[j]+" "); return j; } } j++; } // without root return s.length(); //return j; } /** * @param dn * @return */ public int getPS(int dn) { return this.head[dn - 1]; } /** * @param dn * @param n * @param commonHead the common head in the phrase structure * @return */ public String getChain(int dn, int n, int commonHead) { int pdn = dn - 1, pdh = n - 1; // int phraseHead =head[pdh]; // Parser.out.println("phrase head "+phraseHead+" common head "+commonHead); int[] ch = new int[20]; int head = this.head[pdn]; int i = 0; ch[i++] = head; while (commonHead != head && head != 0) { head = this.head[head]; ch[i++] = head; } StringBuilder chain = new StringBuilder(); for (int k = 0; k < i; k++) { chain.append(entries[ch[k]]).append(" "); } return chain.toString(); } /** * @param dn * @param n * @return */ public int getCommonHead(int d, int dh) { int pdh = this.getPS(dh), pd = this.getPS(d); ArrayList<Integer> path2root = getPath2Root(pdh); //Parser.out.println("path 2 root "+path2root+" pdh "+pdh); for (int n : path2root) { int candidateHead = pd; while (candidateHead != 0 && candidateHead != -1) { if (n == candidateHead) { return n; } candidateHead = this.head[candidateHead]; } } return -1; } /** * @param pdh */ private ArrayList<Integer> getPath2Root(int pdh) { ArrayList<Integer> path = new ArrayList<>(); // restrict the number in case its a cycle which should never be for (int k = 0; k < 100; k++) { if (pdh == -1) { break; } path.add(pdh); pdh = this.head[pdh]; if (pdh == 0) { break; } } return path; } /** * Get operations to create root see operation in method getOperation * * @param pr */ public String getOperationRoot(int pr) { StringBuilder o = new StringBuilder(); int h = pr; int[] path = new int[10]; // Parser.out.println(" start node "+pr); int k = 0; for (; k < 10; k++) { h = head[h]; if (h == -1) { break; } path[k] = h; if (h == 0) { break; } } k -= 2; boolean first = true; for (; k >= 0; k--) { // create phrase if (first) { o.append("c:").append(entries[path[k]]); first = false; } // insert and create phrase else { o.append(":ci:").append(entries[path[k]]); } } // insert dependent node //if (o.length()>0) o.append(":in:d"); //else o.append("in:d"); // insert root into nothing return o.toString(); } /** * Create operation to include dependency edges in phrase structure * Operations: c - create ; i - insert ; in - insert (dependent) node ; up:X * go the (phrase) X up ci create and insert ... * * @param dn * @param n * @param commonHead * @return */ public String getOperation(int dn, int n, int commonHead) { StringBuilder o = new StringBuilder(); // from n move up to common head, if needed int ph = n - 1, pd = dn - 1; int[] path = new int[20]; int i = 0; int h = ph; boolean nth = false; for (int k = 0; k < 10; k++) { h = head[h]; path[k] = h; if (nth) { o.append(':'); } o.append("up:").append(entries[h]); nth = true; if (h == commonHead) { break; } } // from common head to the node int k = 0; h = pd; for (; k < 10; k++) { h = head[h]; path[k] = h; if (h == commonHead) { break; } } k -= 1; // boolean first=true; for (; k >= 0; k--) { // create phrase if (!nth) { o.append("ci:").append(entries[path[k]]); nth = true; } // insert and create phrase else { o.append(":ci:").append(entries[path[k]]); } } // insert dependent node o.append(":in:d"); return o.toString(); } /** * @param ph node in the phrase structure corresponding to the head in the * dependency structure * @param pt node in the prhase structure corresponding to the dependent in * the ds. * @param check * @return rules was applicable */ public boolean exec(String r, int ph, int pt, boolean check) { String o[] = r.split(":"); int last = -1, headP = -1; // create root node // Parser.out.println("operation "+r+" "+ph+" "+pt); boolean done = true; for (int i = 0; i < o.length; i++) { if (o[i].equals("c")) { if (check) { return true; } if (ph < 0) { last = non++; } entries[non] = o[++i]; // create head[pt] = non; head[non] = last; // insert into root last = non++; } else if (o[i].equals("ci")) { if (check) { return true; } entries[non] = o[++i]; // create head[non] = last; // insert last = non; non++; } else if (o[i].equals("in") && o[i + 1].equals("d")) { if (check) { return true; } head[pt] = last; // insert i++; // move forward because of 'd' } else if (o[i].equals("up")) { if (ph == -1) { // Parser.out.println("ph is -1 please check this "+ph+" there is a bug "); return false; } if (headP == -1) { headP = head[ph]; } else { headP = head[headP]; } try { if (headP == -1 || entries[headP] == null || !entries[headP].equals(o[i + 1])) { return false; } } catch (Exception e) { e.printStackTrace(); Parser.out.println("" + entries[headP] + " o[i+1] " + o[i + 1] + " " + headP + " " + this.terminalCount); // Parser.out.println(""+ this.toString()); System.exit(0); } i++; last = headP; } else { done = false; } } return done; } /** * More tolerant mapping * * @param ph node in the phrase structure corresponding to the head in the * dependency structure * @param pt node in the prhase structure corresponding to the dependent in * the ds. * @param check * @return rules was applicable */ public boolean execT(String r, int ph, int pt, boolean check) { String o[] = r.split(":"); int last = -1, headP = -1; int up = 0; boolean done = true; for (int i = 0; i < o.length; i++) { if (o[i].equals("c")) { if (check) { return true; } // create root node if (ph < 0) { last = non++; } entries[non] = o[++i]; // create head[pt] = non; head[non] = last; // insert into root last = non++; } else if (o[i].equals("ci")) { if (check) { return true; } entries[non] = o[++i]; // create head[non] = last; // insert last = non; non++; } else if (o[i].equals("in") && o[i + 1].equals("d")) { if (check) { return true; } // DB.println("hallo"); if (last != -1) { head[pt] = last; // insert } // i am not sure if this does much good? // if (last ==-1) // done=true; i++; // move forward because of 'd' } else if (o[i].equals("up")) { up++; if (ph == -1) { return false; } if (headP == -1) { headP = head[ph]; } else { headP = head[headP]; } try { // tolerant mapping if (headP == -1 || entries[headP] == null || ((!entries[headP].equals(o[i + 1])) && up > 1)) { return false; //>1 }// && entries[headP].charAt(0)!=o[i+1].charAt(0) } catch (Exception e) { e.printStackTrace(); Parser.out.println("" + entries[headP] + " o[i+1] " + o[i + 1] + " " + headP + " " + this.terminalCount); } i++; last = headP; } else { done = false; } } return done; } public final static boolean INSERT_NEWLINE = true; /** * Convert to bracket format * * @param newLine * @return */ public String toPennBracket(boolean newLine) { StringBuilder b = new StringBuilder(); ArrayList<Integer> current = null;// = new ArrayList<Integer>(); int open = 0; for (int i = 0; i < terminalCount; i++) { ArrayList<Integer> path = getPathToRoot(i); ArrayList<Integer> diff = getDiffPath(path, current); boolean spaces = false; ArrayList<Integer> common = this.getDiffCommon(path, current); if (current != null && (current.size() > common.size())) { // close brackets for (int bc = 0; bc < current.size() - common.size(); bc++) { b.append(")"); open--; } if (diff.isEmpty() && newLine) { b.append("\n"); } spaces = true; } if (i != 0 && diff.size() > 0 && newLine) { b.append("\n").append(createSpaces(open)); } for (int k = diff.size() - 1; k >= 0; k--) { open++; b.append("(").append((entries[path.get(k)] == null ? " " : entries[path.get(k)])); if (k != 0 && path.size() - 1 != k && newLine) { b.append("\n").append(createSpaces(open)); } spaces = false; } if (spaces) { b.append(createSpaces(open)); } else { b.append(" "); } String term = entries[i]; if (term.equals("(")) { term = "-LRB-"; } if (term.equals(")")) { term = "-RRB-"; } if (term.equals("{")) { term = "-LCB-"; } if (term.equals("}")) { term = "-RCB-"; } String ps = pos[i]; if (ps.equals("(")) { ps = "-LRB-"; } if (ps.equals("$(")) { ps = "-LRB-"; } if (ps.equals(")")) { ps = "-RRB-"; } if (ps.equals("{")) { ps = "-LCB-"; } if (ps.equals("}")) { ps = "-RCB-"; } b.append("(").append(ps).append(" ").append(term).append(')'); current = path; // break; } for (; open > 0; open--) { b.append(")"); } // b.append("\n"); return b.toString(); } static int cnt = 0; /** * @param path * @param current * @return */ private ArrayList<Integer> getDiffPath(ArrayList<Integer> path, ArrayList<Integer> current) { if (current == null) { return path; } ArrayList<Integer> common = new ArrayList<>(); int pindex = path.size() - 1; int cindex = current.size() - 1; while (cindex >= 0 && pindex >= 0) { if (path.get(pindex) == current.get(cindex)) { cindex--; pindex--; } else { break; } } for (int k = 0; k <= pindex; k++) { common.add(path.get(k)); } return common; } private ArrayList<Integer> getDiffCommon(ArrayList<Integer> path, ArrayList<Integer> current) { if (current == null) { return path; } ArrayList<Integer> common = new ArrayList<>(); int pindex = path.size() - 1; int cindex = current.size() - 1; while (cindex >= 0 && pindex >= 0) { if (path.get(pindex) == current.get(cindex)) { common.add(path.get(pindex)); cindex--; pindex--; } else { break; } } Collections.reverse(common); // Parser.out.println("common "+pindex+" "+common); return common; } /** * @param i * @return */ private StringBuffer createSpaces(int i) { StringBuffer s = new StringBuffer(); for (int k = 0; k < i; k++) { s.append(" "); } return s; } /** * @param i * @return */ private ArrayList<Integer> getPathToRoot(int i) { ArrayList<Integer> path = new ArrayList<>(); int h = i; while (true) { h = this.head[h]; if (h < this.terminalCount || path.contains(h)) { break; } path.add(h); } // Collections.reverse(list) return path; } public String conll09() { StringBuilder s = new StringBuilder(); for (int i = 0; i < this.terminalCount; i++) { if (head[i] == -1 && entries[i] == null) { break; } s.append((i + 1)).append('\t').append(entries[i]).append("\t_\t_\t").append(pos[i]).append("\t_\t_\t_\t_\t_\t_\t_\t_\n"); } return s.toString(); } /** * @param phead * @return */ public int[] getChilds(int head) { int count = 0; for (int i = 0; i < this.entries.length; i++) { if (this.head[i] == head) { count++; } } int[] clds = new int[count]; count = 0; for (int i = 0; i < this.entries.length; i++) { if (this.head[i] == head) { clds[count++] = i; } } return clds; } }