package edu.fudan.nlp.pipe.templet; import edu.fudan.ml.types.Instance; import edu.fudan.ml.types.alphabet.IFeatureAlphabet; /** * 基于模板的文本序列特征抽取 * 处理数据格式为:String[][] * 例如: * x1 x2 x3 * y1 y2 y3 * z1 z2 z3 */ public class BaseTemplet implements Templet { private static final long serialVersionUID = -4019640352729137328L; String templet; int id; /** * 特征标记,两维数组,第二维大小为2 */ int[][] dims; /** * 构造函数 * @param id * @param dims */ public BaseTemplet(int id, int[][] dims) { this.id = id; this.dims = dims; } /** * @see Templet#generateAt(Instance, IFeatureAlphabet, int...) */ public int[] generateAt(Instance instance, IFeatureAlphabet features, int numLabels) throws Exception { String[][] data = (String[][]) instance.getData(); int len = data[0].length; int[] index = new int[len]; for(int pos = 0;pos<len;pos++){ StringBuffer sb = new StringBuffer(); sb.append(id); sb.append(':'); for (int i = 0; i < dims.length; i++) { String rp = ""; int k = dims[i][0]; //行号 int j = dims[i][1]; //列号 if (pos + j < 0 || pos + j >= len) { if (pos + j < 0) rp = "B_" + String.valueOf(-(pos + j) - 1); if (pos + j >= len) rp = "E_" + String.valueOf(pos + j - len); } else { rp = data[k][pos + j]; } if (-1 != rp.indexOf('$')) rp = rp.replaceAll("\\$", "\\\\\\$"); sb.append(rp); sb.append("//"); } // System.out.println(sb.toString()); index[pos] = features.lookupIndex(sb.toString(),numLabels); } return index; } public String toString() { return this.templet; } }