package edu.fudan.nlp.cn.tag.format; import java.util.ArrayList; import java.util.List; import edu.fudan.ml.types.Instance; import edu.fudan.ml.types.InstanceSet; /** * 将序列标注结果转换成List输出 * 将BMES标签转为成词的序列 * @author Administrator * */ public class FormatCWS { public static ArrayList<String> toList(Instance inst, String[] labels) { String[][] data = (String[][]) inst.getSource(); int len = data[0].length; ArrayList<String> res = new ArrayList<String>(len); StringBuilder sb = new StringBuilder(); for (int j = 0; j < len; j++) { String label = labels[j]; String w = data[0][j]; if(data[1][j].equals("B")){//空格特殊处理 if(sb.length()>0){ res.add(sb.toString()); sb = new StringBuilder(); } continue; } sb.append(w); if (label.equals("E") || label.equals("S")) { res.add(sb.toString()); sb = new StringBuilder(); // }else if(j<len-1&&data[1][j].equals("C")&&(data[1][j+1].endsWith("L")||data[1][j+1].endsWith("D"))){ // res.add(sb.toString()); // sb = new StringBuilder(); // } // else if(j<len-1&&data[1][j+1].equals("C")&&(data[1][j].endsWith("L"))){ // res.add(sb.toString()); // sb = new StringBuilder(); } } if(sb.length()>0){ res.add(sb.toString()); } return res; } /** * 将BMES标签转为#delim#隔开的字符串 * @param instSet 样本集 * @param labelsSet 标签集 * @param delim 字之间的间隔符 * @return */ public static String toString(InstanceSet instSet, String[][] labelsSet, String delim) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < instSet.size(); i++) { Instance inst = instSet.getInstance(i); String[] labels = labelsSet[i]; sb.append(toString(inst, labels,delim)); sb.append("\n"); } return sb.toString(); } /** * 将BMES标签转为#delim#隔开的字符串 * @param inst 样本 * @param labels 标签 * @param delim 字之间的间隔符 * @return */ public static String toString(Instance inst, String[] labels,String delim) { String[][] data = (String[][]) inst.getSource(); int len = data[0].length; StringBuilder sb = new StringBuilder(); for (int j = 0; j < len-1; j++) { String label = labels[j]; String w = data[0][j]; sb.append(w); if(data[1][j].equals("B")||data[1][j+1].equals("B")) continue; else if (label.equals("E") || label.equals("S")) { sb.append(delim); // }else if(data[1][j].equals("C")&&(data[1][j+1].endsWith("L")||data[1][j+1].endsWith("D"))){ // sb.append(delim); // }else if(data[1][j+1].equals("C")&&(data[1][j].endsWith("L"))){//||data[1][j].endsWith("D") // sb.append(delim); } } sb.append(data[0][len-1]); return sb.toString(); } }