/////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008 Carnegie Mellon University and // (C) 2007 University of Texas at Austin and (C) 2005 // University of Pennsylvania and Copyright (C) 2002, 2003 University // of Massachusetts Amherst, Department of Computer Science. // // This software is licensed under the terms of the Common Public // License, Version 1.0 or (at your option) any subsequent version. // // The license is approved by the Open Source Initiative, and is // available from their website at http://www.opensource.org. /////////////////////////////////////////////////////////////////////////////// package mstparser.mallet; import java.io.BufferedReader; import java.io.IOException; /** * @author Dipanjan Das 6/4/08 dipanjan@cs.cmu.edu * * Adapted from code by Ryan McDonald (ryanmcd@google.com) * */ public class Convert { public static String[] convert(BufferedReader in, BufferedReader[] aux_in) throws IOException { String line = in.readLine(); if (line == null) { return null; } String[] tokstmp = line.split("\t"); String[] postmp = in.readLine().split("\t"); in.readLine(); String[] labtmp = in.readLine().split("\t"); String[] depstmp = in.readLine().split("\t"); String[][] aux_lines = new String[aux_in.length][]; for (int i = 0; i < aux_lines.length; i++) { aux_lines[i] = aux_in[i].readLine().split(" "); } String[] res = convert(tokstmp, postmp, labtmp, depstmp, aux_lines); return res; } public static String[] convert(String[] tokstmp, String[] postmp, String[] labtmp, String[] depstmp, String[][] aux_lines) { String[] toks = new String[tokstmp.length + 1]; String[] pos = new String[postmp.length + 1]; String[] lab = new String[labtmp.length + 1]; int[] par = new int[depstmp.length + 1]; toks[0] = "<root>"; pos[0] = "<root-POS>"; lab[0] = "<root-LAB>"; par[0] = -1; for (int i = 0; i < depstmp.length; i++) { par[i + 1] = Integer.parseInt(depstmp[i]); toks[i + 1] = tokstmp[i]; pos[i + 1] = postmp[i]; lab[i + 1] = labtmp[i]; } String[] res = new String[toks.length - 1]; for (int i = 1; i < par.length; i++) { res[i - 1] = lab[i] + " " + getFeats(toks, pos, aux_lines, lab, par, i).trim(); } return res; } // Change this method to add new features // Add features for child ch, in sentence toks, with POS tags pos and parent function par public static String getFeats(String[] toks, String[] pos, String[][] aux_lines, String[] labs, int[] par, int ch) { String[] toks_low = new String[toks.length]; for (int i = 0; i < toks.length; i++) { toks_low[i] = toks[i].toLowerCase(); } toks = toks_low; String att = ch < par[ch] ? "LFT" : "RGT"; int pa = par[ch]; String res = att; for (int a = 0; a < 2; a++) { String suff = a == 0 ? "" : "_" + att; // auxiliary files for (int i = 0; i < aux_lines.length; i++) { int ach = Integer.parseInt(aux_lines[i][ch - 1].split("[\\|:]")[1]); int apa = Integer.parseInt(aux_lines[i][ch - 1].split("[\\|:]")[0]); String aux_lab = aux_lines[i][ch - 1].split(":")[1]; res += " " + "EXT" + i + "=" + aux_lab + suff; res += " " + "POSEXT" + i + "=" + aux_lab + "__" + pos[pa] + "_" + pos[ch] + suff; res += " " + "EXT" + i + "=" + aux_lab + suff + "_" + (ach == ch && apa == par[ch]); res += " " + "POSEXT" + i + "=" + aux_lab + "__" + pos[pa] + "_" + pos[ch] + suff + "_" + (ach == ch && apa == par[ch]); } // standard word/pos features res += " " + "POSCH=" + pos[ch] + suff; res += " " + "POSPA=" + pos[par[ch]] + suff; res += " " + "WRDCH=" + toks[ch] + suff; res += " " + "WRDPA=" + toks[par[ch]] + suff; res += " " + "POSP=" + pos[pa] + "_" + pos[ch] + suff; res += " " + "WRDP=" + toks[pa] + "_" + toks[ch] + suff; res += " " + "WRDPOS=" + toks[pa] + "_" + pos[ch] + suff; res += " " + "POSWRD=" + pos[pa] + "_" + toks[ch] + suff; if (ch > 0) { res += " " + "POSCH-1=" + pos[ch - 1] + suff; res += " " + "APOSCH-1=" + pos[ch - 1] + "_" + pos[ch] + suff; res += " " + "WRDCH-1=" + toks[ch - 1] + suff; } if (ch > 1) { res += " " + "POSCH-2=" + pos[ch - 2] + suff; res += " " + "APOSCH-2=" + pos[ch - 2] + "_" + pos[ch] + suff; res += " " + "WRDCH-2=" + toks[ch - 2] + suff; } if (ch < toks.length - 2) { res += " " + "POSCH+2=" + pos[ch + 2] + suff; res += " " + "APOSCH+1=" + pos[ch + 2] + "_" + pos[ch] + suff; res += " " + "WRDCH+2=" + toks[ch + 2] + suff; } if (ch < toks.length - 1) { res += " " + "POSCH+1=" + pos[ch + 1] + suff; res += " " + "APOSCH+1=" + pos[ch + 1] + "_" + pos[ch] + suff; res += " " + "WRDCH+1=" + toks[ch + 1] + suff; } if (ch > 0 && ch < toks.length - 1) { res += " " + "APOSCH+1-1=" + pos[ch - 1] + "_" + pos[ch] + "_" + pos[ch + 1] + suff; } if (pa > 0) { res += " " + "POSPA-1=" + pos[pa - 1] + suff; res += " " + "APOSPA-1=" + pos[pa - 1] + "_" + pos[pa] + suff; res += " " + "WRDPA-1=" + toks[pa - 1] + suff; } if (pa > 1) { res += " " + "POSPA-2=" + pos[pa - 2] + suff; res += " " + "APOSPA-2=" + pos[pa - 2] + "_" + pos[pa] + suff; res += " " + "WRDPA-2=" + toks[pa - 2] + suff; } if (pa < toks.length - 2) { res += " " + "POSPA+2=" + pos[pa + 2] + suff; res += " " + "APOSPA+2=" + pos[pa + 2] + "_" + pos[pa] + suff; res += " " + "WRDPA+2=" + toks[pa + 2] + suff; } if (pa < toks.length - 1) { res += " " + "POSPA+1=" + pos[pa + 1] + suff; res += " " + "APOSPA+1=" + pos[pa + 1] + "_" + pos[pa] + suff; res += " " + "WRDPA+1=" + toks[pa + 1] + suff; } if (pa > 0 && pa < toks.length - 1) { res += " " + "APOSPA+1-1=" + pos[pa - 1] + "_" + pos[pa] + "_" + pos[pa + 1] + suff; } // POS in-between for (int i = Math.min(ch, pa) + 1; i < Math.max(ch, pa); i++) { res += " " + "POST=" + pos[pa] + "_" + pos[ch] + "_" + pos[i] + suff; res += " " + "APOST=" + pos[ch] + "_" + pos[i] + suff; res += " " + "BPOST=" + pos[pa] + "_" + pos[i] + suff; res += " " + "CPOST=" + pos[i] + suff; } } return res; } private static int getSibL(int ch, int[] par) { for (int i = ch; i >= 0; i--) { if (par[i] == par[ch]) { return i; } } return -1; } private static int getSibR(int ch, int[] par) { for (int i = ch; i < par.length; i++) { if (par[i] == par[ch]) { return i; } } return -1; } }