package edu.fudan.nlp.cn.tag; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.OutputStreamWriter; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import edu.fudan.ml.classifier.LabelParser.Type; import edu.fudan.ml.classifier.Predict; import edu.fudan.ml.classifier.TPredict; import edu.fudan.ml.classifier.linear.Linear; import edu.fudan.ml.types.Instance; import edu.fudan.ml.types.alphabet.AlphabetFactory; import edu.fudan.ml.types.alphabet.IFeatureAlphabet; import edu.fudan.ml.types.alphabet.LabelAlphabet; import edu.fudan.nlp.pipe.Pipe; import edu.fudan.nlp.pipe.seq.Sequence2FeatureSequence; import edu.fudan.nlp.pipe.seq.templet.TempletGroup; import edu.fudan.util.exception.LoadModelException; /** * 分词训练 * */ public abstract class AbstractTagger { private Linear cl; protected Pipe prePipe=null; protected Pipe featurePipe; public AlphabetFactory factory; protected TempletGroup templets; protected LabelAlphabet labels; /** * 词之间间隔标记,缺省为空格。 */ protected String delim = " "; /** * 抽象标注器构造函数 * @param file 模型文件 * @throws LoadModelException */ public AbstractTagger(String file) throws LoadModelException { loadFrom(file); if(getClassifier()==null){ throw new LoadModelException("模型为空"); } factory = getClassifier().getAlphabetFactory(); labels = factory.DefaultLabelAlphabet(); IFeatureAlphabet features = factory.DefaultFeatureAlphabet(); featurePipe = new Sequence2FeatureSequence(templets, features, labels); } public AbstractTagger() { } /** * 序列标注方法 * @param src 输入句子 * @return */ public abstract Object tag(String src); protected String[] _tag(Instance inst) { doProcess(inst); TPredict pred = getClassifier().classify(inst,Type.SEQ); if (pred == null) return new String[0]; return (String[]) pred.getLabel(0); } /** * 序列标注方法,输入输出为文件 * @param input 输入文件 UTF8编码 * @param output 输出文件 UTF8编码 */ public void tagFile(String input,String output,String sep){ String s = tagFile(input,"\n"); try { OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream( output), "utf-8"); BufferedWriter bw = new BufferedWriter(writer); bw.write(s); bw.close(); } catch (Exception e) { System.out.println("写输出文件错误"); e.printStackTrace(); } } /** * 序列标注方法,输入为文件 * @param input 输入文件 UTF8编码 * @return 标注结果 */ public String tagFile(String input) { return tagFile(input," "); } /** * 序列标注方法,输入为文件 * @param input 输入文件 UTF8编码 * @return 标注结果 */ public String tagFile(String input,String sep) { StringBuilder res = new StringBuilder(); try { InputStreamReader read = new InputStreamReader(new FileInputStream( input), "utf-8"); BufferedReader lbin = new BufferedReader(read); String str = lbin.readLine(); while (str != null) { String s = (String) tag(str); res.append(s); res.append("\n"); str = lbin.readLine(); } lbin.close(); return res.toString(); } catch (IOException e) { System.out.println("读输入文件错误"); e.printStackTrace(); } return ""; } /** * 数据处理方法,将数据从字符串的形式转化成向量形式 * @param carrier 样本实例 */ public void doProcess(Instance carrier) { try { if(prePipe!=null) prePipe.addThruPipe(carrier); carrier.setSource(carrier.getData()); featurePipe.addThruPipe(carrier); } catch (Exception e) { e.printStackTrace(); } } public void saveTo(String modelfile) throws IOException { ObjectOutputStream out = new ObjectOutputStream( new BufferedOutputStream(new GZIPOutputStream( new FileOutputStream(modelfile)))); out.writeObject(templets); out.writeObject(getClassifier()); out.close(); } public void loadFrom(String modelfile) throws LoadModelException{ try { ObjectInputStream in = new ObjectInputStream(new BufferedInputStream( new GZIPInputStream(new FileInputStream(modelfile)))); templets = (TempletGroup) in.readObject(); setClassifier((Linear) in.readObject()); in.close(); } catch (Exception e) { throw new LoadModelException(e,modelfile); } } public Linear getClassifier() { return cl; } public void setClassifier(Linear cl) { this.cl = cl; } }