package org.ansj.app.crf.model; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.util.List; import org.ansj.CorpusTest; import org.ansj.app.crf.Model; import org.ansj.app.crf.SplitWord; import org.junit.Before; import org.junit.Test; import org.nlpcn.commons.lang.util.StringUtil; import junit.framework.Assert; public class CRFModelTest extends CorpusTest { private String modelPath = "src/main/resources/crf.model"; private String testModelPath = "src/main/resources/test.model"; private String testPath = "src/test/resources/corpus.txt"; private Model model = new CRFModel(); @Before public void before() throws Exception { model.loadModel(modelPath); } @Test public void savePathTest() throws FileNotFoundException, IOException { model.writeModel(testModelPath); Assert.assertEquals(true, new File(testModelPath).delete()); } @Test public void cute() throws Exception { SplitWord sw = new SplitWord(model); for (String line : lines) { System.out.println(sw.cut(line)); } } @Test public void test() throws Exception { try (FileInputStream fis = new FileInputStream(testPath)) { BufferedReader br = new BufferedReader(new InputStreamReader(fis)); SplitWord sw = new SplitWord(model); String temp_str = null; int line_number = 0;// 记录行数 int ansj_term_number = 0;// 记录ansj中分出的term数量 int result_num = 0; double P = 0.0; double R = 0.0; double F = 0.0; int allError = 0; int allSuccess = 0; String[] had_words_array = null;// 按split分开后的数组 String body = null; while ((temp_str = br.readLine()) != null) { if (StringUtil.isBlank(temp_str)) { continue; } int error = 0; int success = 0; temp_str = temp_str.trim(); body = temp_str.replaceAll("\t", ""); had_words_array = new String[body.length()]; int offe = 0; List<String> paser = sw.cut(body); // 填充result String[] result = temp_str.split("\t"); for (int i = 0; i < result.length; i++) { had_words_array[offe] = result[i]; offe += result[i].length(); } offe = 0; for (String word : paser) { if (had_words_array[offe] == null) { error++; } else if (had_words_array[offe].equalsIgnoreCase(word)) { success++; } else { success++; } offe += word.length(); } // ansj分出的个数 ansj_term_number += paser.size(); // 词语的个数 result_num += result.length; // 累计错误数 allError += error; // 累计正确数 allSuccess += success; if (error > 0) { System.out.println("example:" + temp_str); System.out.println( " result:" + paser.toString().replace("[", "").replace("]", "").replace(", ", "\t")); } System.out.println("[" + line_number + "]---准确率P:--" + ((double) success / paser.size())); line_number++; } // 正确数/总词数 P = allSuccess / (double) ansj_term_number; // 正确数/标注文本中的词数 R = allSuccess / (double) result_num; F = (2 * P * R) / (P + R); System.out.println("P:" + P); System.out.println("R:" + R); System.out.println("全文平均准确率--" + F); } } }