package org.ansj.app.crf.model; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.util.List; import org.ansj.app.crf.Check; import org.ansj.app.crf.Model; import org.ansj.app.crf.SplitWord; import org.junit.Before; import org.junit.Test; import org.nlpcn.commons.lang.util.StringUtil; public class CRFppTxtModelTest { private String modelPath = "src/test/resources/crf_txt.model"; private String testPath = "src/test/resources/corpus.txt"; private Model model = new CRFppTxtModel(); @Before public void before() throws Exception { if (!Check.checkFileExit(modelPath)) { return; } model.loadModel(modelPath); } @Test public void savePathTest() throws FileNotFoundException, IOException { if (!Check.checkFileExit(modelPath)) { return; } model.writeModel("crf.model"); new File("crf.model").delete() ; } @Test public void cute() throws Exception { if (!Check.checkFileExit(modelPath)){ return ; } SplitWord sw = new SplitWord(model); System.out.println(sw.cut("瓦西里斯的船只中有40%驶向远东,每个月几乎都有两三条船停靠中国港口。")); model.writeModel("crf.model"); new File("crf.model").delete() ; } @Test public void test() throws Exception { if (!Check.checkFileExit(modelPath)){ return ; } BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(testPath))); SplitWord sw = new SplitWord(model); String temp_str = null; int line_number = 0;// 记录行数 int ansj_term_number = 0;// 记录ansj中分出的term数量 int result_num = 0; double P = 0.0; double R = 0.0; double F = 0.0; int allError = 0; int allSuccess = 0; String[] had_words_array = null;// 按split分开后的数组 String body = null; while ((temp_str = br.readLine()) != null) { if (StringUtil.isBlank(temp_str)) { continue; } int error = 0; int success = 0; temp_str = temp_str.trim(); body = temp_str.replaceAll("\t", ""); had_words_array = new String[body.length()]; int offe = 0; List<String> paser = sw.cut(body); // 填充result String[] result = temp_str.split("\t"); for (int i = 0; i < result.length; i++) { had_words_array[offe] = result[i]; offe += result[i].length(); } offe = 0; for (String word : paser) { if (had_words_array[offe] == null) { error++; } else if (had_words_array[offe].equalsIgnoreCase(word)) { success++; } else { success++; } offe += word.length(); } // ansj分出的个数 ansj_term_number += paser.size(); // 词语的个数 result_num += result.length; // 累计错误数 allError += error; // 累计正确数 allSuccess += success; if (error > 0) { System.out.println("example:" + temp_str); System.out.println(" result:" + paser.toString().replace("[", "").replace("]", "").replace(", ", "\t")); } System.out.println("[" + line_number + "]---准确率P:--" + ((double) success / paser.size())); line_number++; } // 正确数/总词数 P = allSuccess / (double) ansj_term_number; // 正确数/标注文本中的词数 R = allSuccess / (double) result_num; F = (2 * P * R) / (P + R); System.out.println("P:" + P); System.out.println("R:" + R); System.out.println("全文平均准确率--" + F); br.close(); } }