package edu.fudan.nlp.pipe.seq.templet;
import edu.fudan.ml.types.Instance;
import edu.fudan.ml.types.alphabet.IFeatureAlphabet;
import edu.fudan.nlp.cn.ChineseTrans;
import edu.fudan.nlp.similarity.train.KMeansWordCluster;
/**
 * Feature templet that encodes the semantic class of the character at one or
 * more positions relative to the current position.
 *
 * <p>For every configured offset the character at {@code pos + offset} is mapped
 * to a class id: fullwidth uppercase Latin letters get a fixed id, every other
 * character is classified by the supplied {@link KMeansWordCluster} from a
 * three-token context (previous token, the token itself, next token). The
 * resulting ids are joined into a feature string that is looked up in the
 * feature alphabet.
 *
 * @author xpqiu
 *
 */
public class CharClassTemplet3 implements Templet {

    private static final long serialVersionUID = 3572735523891704313L;

    /** Code-point distance between an ASCII character and its fullwidth form (U+FF21 - U+0041). */
    private static final int FULLWIDTH_OFFSET = 65248;

    /**
     * Class id emitted for fullwidth uppercase Latin letters instead of asking the cluster.
     * NOTE(review): the value is hard-coded by the original author; presumably it is chosen
     * to lie outside the range of ids the cluster returns -- confirm.
     */
    private static final int FULLWIDTH_UPPER_CLASS_ID = 1039;

    /** Placeholder used as the previous token when the inspected position is the first one. */
    private static final String BEGIN_MARK = "Begin0";

    /** Placeholder used as the next token when the inspected position is the last one. */
    private static final String END_MARK = "End0";

    /** Templet id; it prefixes every feature string produced by this templet. */
    private final int id;

    /** Classifier that maps a context string to a class id. */
    private final KMeansWordCluster cluster;

    /** Offsets, relative to the current position, of the characters to classify. */
    private final int[] idxs;

    /**
     * Creates the templet.
     *
     * @param id   templet id used as the prefix of generated feature strings
     * @param kmwc cluster used to classify character contexts
     * @param dim  offsets relative to the current position whose characters are classified
     */
    public CharClassTemplet3(int id, KMeansWordCluster kmwc, int... dim) {
        this.id = id;
        this.cluster = kmwc;
        this.idxs = dim;
    }

    /**
     * {@inheritDoc}
     *
     * <p>Builds the feature string {@code id + ":" + (":" + classId)*} with one class id per
     * configured offset and looks it up in {@code features} using {@code numLabels[0]}.
     *
     * @return the index returned by the feature alphabet, or {@code -1} when any
     *         {@code pos + offset} falls outside the sequence
     */
    @Override
    public int generateAt(Instance instance, IFeatureAlphabet features, int pos,
            int... numLabels) {
        String[][] data = (String[][]) instance.getData();
        // Only data[0] is read here. Per the original author's note, rows and columns
        // of the data are swapped relative to the row/column convention of the templet.
        String[] tokens = data[0];
        int len = tokens.length;

        StringBuilder sb = new StringBuilder();
        sb.append(id);
        sb.append(':');
        for (int idx : idxs) {
            // Each offset is applied to the ORIGINAL position. The previous code overwrote
            // 'pos' on every iteration, so offsets accumulated across the loop.
            int cur = pos + idx;
            if (cur < 0 || cur >= len) {
                return -1;
            }
            sb.append(":");
            sb.append(classAt(tokens, cur));
        }
        return features.lookupIndex(sb.toString(), numLabels[0]);
    }

    /**
     * Returns the class id of the token at {@code at}.
     *
     * @param tokens the token sequence
     * @param at     a valid index into {@code tokens}
     */
    private int classAt(String[] tokens, int at) {
        String token = tokens[at];
        // Guard against an empty token, which previously made charAt(0) throw.
        if (!token.isEmpty() && isFullwidthUpper(token.charAt(0))) {
            return FULLWIDTH_UPPER_CLASS_ID;
        }
        // The previous token exists for every index > 0; the earlier 'pos > 1' test
        // wrongly substituted the begin marker at index 1 as well.
        String prev = at > 0 ? tokens[at - 1] : BEGIN_MARK;
        String next = at < tokens.length - 1 ? tokens[at + 1] : END_MARK;
        return cluster.classifier(prev + token + next);
    }

    /** Tells whether {@code c} is a fullwidth uppercase Latin letter (U+FF21..U+FF3A). */
    private static boolean isFullwidthUpper(char c) {
        return c >= 'A' + FULLWIDTH_OFFSET && c <= 'Z' + FULLWIDTH_OFFSET;
    }

    /**
     * @return always {@code 0}
     */
    @Override
    public int getOrder() {
        return 0;
    }

    /**
     * @return a new single-element array containing {@code 0}
     */
    public int[] getVars() {
        return new int[] { 0 };
    }

    /**
     * @param curs ignored by this templet
     * @return always {@code 0}
     */
    public int offset(int... curs) {
        return 0;
    }
}