package edu.fudan.nlp.pipe; import java.io.Serializable; import java.util.Iterator; import java.util.List; import edu.fudan.ml.types.Instance; import edu.fudan.ml.types.alphabet.AlphabetFactory; import edu.fudan.ml.types.alphabet.IFeatureAlphabet; import edu.fudan.ml.types.alphabet.LabelAlphabet; /** * 将字符数组类型的数据转换成特征索引 * 数据类型:List\<String\> -\> int[] * @author xpqiu */ public class StringArray2IndexArray extends StringArray2SV{ private static final long serialVersionUID = 358834035189351765L; public StringArray2IndexArray(AlphabetFactory af) { init(af); } public StringArray2IndexArray(AlphabetFactory af,boolean b){ init(af); isSorted = b; } @Override public void addThruPipe(Instance inst) throws Exception { List<String> data = (List<String>) inst.getData(); int size = data.size(); int[] newdata = new int[data.size()+1]; Iterator<String> it = data.iterator(); for(int i=0;i<size;i++){ String token = it.next(); if(isSorted){ token+="@"+i; } int id = features.lookupIndex(token,label.size()); if(id==-1) continue; newdata[i] = id; } newdata[size]=constIndex; inst.setData(newdata); } }