///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2008 Carnegie Mellon University and
// (C) 2007 University of Texas at Austin and (C) 2005
// University of Pennsylvania and Copyright (C) 2002, 2003 University
// of Massachusetts Amherst, Department of Computer Science.
//
// This software is licensed under the terms of the Common Public
// License, Version 1.0 or (at your option) any subsequent version.
//
// The license is approved by the Open Source Initiative, and is
// available from their website at http://www.opensource.org.
///////////////////////////////////////////////////////////////////////////////
package mstparser.mallet;
/**
* @author Dipanjan Das 6/4/08 dipanjan@cs.cmu.edu
*
* Adapted from code by Ryan McDonald (ryanmcd@google.com)
*
*/
public class MalletFeatures {

    /**
     * Builds the space-separated feature string for the attachment of child
     * {@code ch} to its head {@code par[ch]}. Change this method to add new features.
     *
     * <p>The string starts with the attachment direction ({@code "LFT"} when
     * {@code ch < par[ch]}, otherwise {@code "RGT"}). Every feature is then emitted
     * twice: first plain, then with {@code "_" + direction} appended.
     *
     * <p>Tokens are lower-cased (locale-independently) before being used in
     * features; the caller's {@code toks} array is not modified.
     *
     * @param toks     tokens of the sentence
     * @param pos      POS tags, indexed like {@code toks}
     * @param labs     not read by this method; kept for interface compatibility
     * @param par      head index for each token; {@code par[ch]} must be a valid
     *                 index into {@code toks}/{@code pos}
     * @param depPred  auxiliary predicted labels, or {@code null}
     * @param headPred auxiliary predicted head indices, or {@code null}
     * @param ch       index of the child token
     * @return the direction followed by all features, separated by single spaces
     */
    public static String getFeats(String[] toks, String[] pos, String[] labs, int[] par,
            String[] depPred, int[] headPred, int ch) {
        // Lower-case into a fresh array; Locale.ROOT keeps the features identical
        // regardless of the JVM's default locale.
        String[] words = new String[toks.length];
        for (int i = 0; i < toks.length; i++) {
            words[i] = toks[i].toLowerCase(java.util.Locale.ROOT);
        }

        int pa = par[ch];
        String att = ch < pa ? "LFT" : "RGT";

        // The auxiliary features read both arrays, so both must be present.
        boolean hasAux = depPred != null && headPred != null;

        StringBuilder res = new StringBuilder(att);
        for (int a = 0; a < 2; a++) {
            // First pass: plain features. Second pass: direction-suffixed copies.
            String suff = a == 0 ? "" : "_" + att;

            if (hasAux) {
                String auxLab = depPred[ch];
                // True when the auxiliary prediction agrees with the head in par.
                boolean sameHead = headPred[ch] == pa;
                String posExt = "POSEXT=" + auxLab + "__" + pos[pa] + "_" + pos[ch] + suff;
                add(res, "EXT=" + auxLab + suff);
                add(res, posExt);
                add(res, "EXT=" + auxLab + suff + "_" + sameHead);
                add(res, posExt + "_" + sameHead);
            }

            // standard word/pos features
            add(res, "POSCH=" + pos[ch] + suff);
            add(res, "POSPA=" + pos[pa] + suff);
            add(res, "WRDCH=" + words[ch] + suff);
            add(res, "WRDPA=" + words[pa] + suff);
            add(res, "POSP=" + pos[pa] + "_" + pos[ch] + suff);
            add(res, "WRDP=" + words[pa] + "_" + words[ch] + suff);
            add(res, "WRDPOS=" + words[pa] + "_" + pos[ch] + suff);
            add(res, "POSWRD=" + pos[pa] + "_" + words[ch] + suff);

            // context windows around the child, then around the parent
            addWindow(res, "CH", ch, words, pos, suff);
            addWindow(res, "PA", pa, words, pos, suff);

            // POS in-between
            int hi = Math.max(ch, pa);
            for (int i = Math.min(ch, pa) + 1; i < hi; i++) {
                add(res, "POST=" + pos[pa] + "_" + pos[ch] + "_" + pos[i] + suff);
                add(res, "APOST=" + pos[ch] + "_" + pos[i] + suff);
                add(res, "BPOST=" + pos[pa] + "_" + pos[i] + suff);
                add(res, "CPOST=" + pos[i] + suff);
            }
        }
        return res.toString();
    }

    /** Appends one feature preceded by the single-space separator. */
    private static void add(StringBuilder out, String feat) {
        out.append(' ').append(feat);
    }

    /**
     * Appends the context features for the token at {@code idx}: neighbours at
     * offsets -1, -2, +2, +1 (in that order, when they exist) and the
     * previous/current/next POS trigram. {@code tag} is "CH" or "PA".
     */
    private static void addWindow(StringBuilder out, String tag, int idx,
            String[] words, String[] pos, String suff) {
        int n = words.length;
        if (idx > 0) {
            addNeighbour(out, tag, "-1", idx - 1, idx, words, pos, suff);
        }
        if (idx > 1) {
            addNeighbour(out, tag, "-2", idx - 2, idx, words, pos, suff);
        }
        if (idx < n - 2) {
            addNeighbour(out, tag, "+2", idx + 2, idx, words, pos, suff);
        }
        if (idx < n - 1) {
            addNeighbour(out, tag, "+1", idx + 1, idx, words, pos, suff);
        }
        if (idx > 0 && idx < n - 1) {
            add(out, "APOS" + tag + "+1-1=" + pos[idx - 1] + "_" + pos[idx] + "_" + pos[idx + 1] + suff);
        }
    }

    /**
     * Appends the three features for one neighbour {@code nb} of token {@code idx}:
     * the neighbour's POS, the neighbour/token POS pair, and the neighbour's word.
     * {@code offset} is the label used in the feature name (e.g. "-1", "+2").
     */
    private static void addNeighbour(StringBuilder out, String tag, String offset, int nb, int idx,
            String[] words, String[] pos, String suff) {
        add(out, "POS" + tag + offset + "=" + pos[nb] + suff);
        add(out, "APOS" + tag + offset + "=" + pos[nb] + "_" + pos[idx] + suff);
        add(out, "WRD" + tag + offset + "=" + words[nb] + suff);
    }
}