package edu.fudan.nlp.pipe.seq;

import java.io.Serializable;
import java.util.Arrays;

import edu.fudan.ml.types.Instance;
import edu.fudan.ml.types.alphabet.LabelAlphabet;
import edu.fudan.nlp.cn.Chars;
import edu.fudan.nlp.pipe.Pipe;

/**
 * Pipe that builds a per-token integer mask over the label alphabet and
 * stores it as the instance's temp data.
 *
 * <p>For every token whose first character satisfies
 * {@link Chars#isLetterOrDigitOrPunc}, the token's row is filled with
 * {@code 1} except for the column of the label {@code "S"}, which is set to
 * {@code 0}. All other rows stay at their default of all zeros.
 *
 * <p>NOTE(review): presumably a {@code 1} marks a label as disallowed for
 * that position, so that such tokens are forced to the label {@code "S"} --
 * confirm against the code that consumes the temp data.
 */
public class TokenNormalize extends Pipe implements Serializable {

	private static final long serialVersionUID = 8129957080708134793L;

	/** Label alphabet that defines the width of each mask row. */
	private LabelAlphabet labels;

	/**
	 * Creates the pipe.
	 *
	 * @param labels label alphabet used to size the mask rows and to look up
	 *               the index of the label {@code "S"}
	 */
	public TokenNormalize(LabelAlphabet labels) {
		this.labels = labels;
	}

	/**
	 * Hard-labels English letters, digits and punctuation as {@code "S"}.
	 * The original author marked this behaviour as currently deprecated.
	 *
	 * <p>The instance data is read as a {@code String[][]} whose first row is
	 * the token sequence. The computed mask, one row per token and one column
	 * per label, is written with {@code instance.setTempData}.
	 *
	 * <p>Null or empty tokens have no first character to inspect; they are
	 * skipped and their row is left all zeros instead of failing the whole
	 * instance.
	 *
	 * @param instance instance whose data is a {@code String[][]}; its temp
	 *                 data is replaced by the mask
	 * @throws Exception declared by the pipe contract; this implementation
	 *                   does not throw checked exceptions itself
	 */
	public void addThruPipe(Instance instance) throws Exception {
		String[][] data = (String[][]) instance.getData();
		String[] tokens = data[0];
		int[][] tempData = new int[tokens.length][labels.size()];

		for (int i = 0; i < tokens.length; i++) {
			String token = tokens[i];
			// charAt(0) would throw on an empty token; leave its row unconstrained.
			if (token == null || token.length() == 0) {
				continue;
			}
			char first = token.charAt(0);
			if (Chars.isLetterOrDigitOrPunc(first)) {
				Arrays.fill(tempData[i], 1);
				// The lookup stays inside the branch so it only runs for matching
				// tokens, exactly as before.
				tempData[i][labels.lookupIndex("S")] = 0;
			}
		}

		instance.setTempData(tempData);
	}
}