package edu.fudan.nlp.pipe.seq;
import java.util.TreeMap;
import java.util.TreeSet;
import edu.fudan.ml.types.Dictionary;
import edu.fudan.ml.types.Instance;
import edu.fudan.ml.types.alphabet.LabelAlphabet;
import edu.fudan.nlp.pipe.Pipe;
import edu.fudan.util.MultiValueMap;
import edu.fudan.util.exception.LoadModelException;
/**
* 词性字典预处理
*
* @author xpqiu
*
*/
public class DictPOSLabel extends Pipe{
private static final long serialVersionUID = 5457382370544508743L;
protected Dictionary dict;
protected LabelAlphabet labels;
public DictPOSLabel(Dictionary dict, LabelAlphabet labels) {
this.dict = dict;
this.labels = labels;
checkLabels();
}
private void checkLabels(){
MultiValueMap<String, String> pos = dict.getPOSDict();
for(TreeSet<String> pp: pos.valueSets()){
if(pp==null)
continue;
for(String p : pp){
if(labels.lookupIndex(p)==-1){
System.err.println("Warning: 自定义词性: " +p+
"\n标签最好在下面列表中:\n" +labels.toString());
labels.setStopIncrement(false);
labels.lookupIndex(p);
labels.setStopIncrement(true);
}
}
}
}
public void addThruPipe(Instance instance) throws Exception {
String[] data = (String[]) instance.getData();
int length = data.length;
int[][] dicData = new int[length][labels.size()];
for(int i = 0; i < data.length; i++) {
// System.out.println(data[i]);
TreeSet<String> posset = dict.getPOS(data[i]);
if(posset != null &&posset.size()>0){
for(String pos:posset)
dicData[i][labels.lookupIndex(pos)] = -1;
}
}
for (int i = 0; i < length; i++)
if (hasWay(dicData[i]))
for(int j = 0; j < dicData[i].length; j++)
dicData[i][j]++;
// for(int i = 0; i < dicData.length; i++) {
// for(int j = 0; j < dicData[i].length; j++)
// System.out.print(dicData[i][j]);
// System.out.println();
// }
instance.setDicData(dicData);
}
private boolean hasWay(int[] ia) {
for(int i = 0; i < ia.length; i++) {
if(ia[i] == -1)
return true;
}
return false;
}
}