package edu.fudan.nlp.pipe.seq; import java.io.Serializable; import java.util.ArrayList; import edu.fudan.ml.types.Instance; import edu.fudan.nlp.cn.Chars; import edu.fudan.nlp.cn.Chars.CharType; import edu.fudan.nlp.cn.Chars.StringType; import edu.fudan.nlp.pipe.Pipe; /** * 将字符串直接转换成待标注的序列 * 例子: * "我abc"转换成{"我","a","b","c"};如果预处理英文,则转换成{"我","abc"} * * @author xpqiu * @version 2.0 * */ public class String2Sequence extends Pipe implements Serializable { /** * 是否预处理英文,默认为真 */ boolean isEnFilter = true; private static final long serialVersionUID = 5699154494725645936L; /** * 构造函数 * @param b 是否带标签 */ public String2Sequence(boolean b){ isEnFilter = b; } /** * 将一个字符串转换成按标注序列 * 每列一个字或连续英文token的信息 * @param inst 样本 */ @Override public void addThruPipe(Instance inst) { String str = (String) inst.getData(); String[][] data; if(isEnFilter){ data = genSequence(str); }else{ data = new String[2][str.length()]; for(int i = 0; i < str.length(); i++){ data[0][i] = str.substring(i,i+1); data[1][i] = Chars.getStringType(data[0][i]).toString(); } } inst.setData(data); } /** * 预处理连续的英文和数字 * @param sent * @return */ public static String[][] genSequence(String sent){ CharType[] tags = Chars.getType(sent); int len = sent.length(); ArrayList<String> words = new ArrayList<String>(); ArrayList<String> types = new ArrayList<String>(); int begin =0; for(int j=0; j<len; j++) { if(j<len-1 && tags[j]==CharType.L && tags[j+1]==CharType.L){//当前是连续英文 continue; }else if(j<len-1 &&tags[j]==CharType.D && tags[j+1]==CharType.D){//当前是连续数字 continue; } StringType st = Chars.char2StringType(tags[j]); String w = sent.substring(begin,j+1); words.add(w); types.add(st.toString()); begin = j+1; } String[][] data = new String[2][]; data[0] = words.toArray(new String[words.size()]); data[1] = types.toArray(new String[types.size()]); return data; } }