package org.ansj.app.crf; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.List; import org.ansj.app.crf.pojo.Element; import org.nlpcn.commons.lang.util.IOUtil; import org.nlpcn.commons.lang.util.StringUtil; import org.nlpcn.commons.lang.util.logging.Log; import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 生成crf 或者是 wapiti的训练语聊工具. * * 执行:java org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath] * * @author Ansj * */ public class MakeTrainFile { private static final Log logger = LogFactory.getLog(); public static void main(String[] args) { String inputPath = "corpus.txt"; String outputPath = "train.txt"; if (args != null && args.length == 2) { inputPath = args[0]; outputPath = args[1]; } if (StringUtil.isBlank(inputPath) || StringUtil.isBlank(outputPath)) { logger.info("org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath]"); return; } try (BufferedReader reader = IOUtil.getReader(inputPath, "utf-8"); FileOutputStream fos = new FileOutputStream(outputPath)) { String temp = null; int i = 0; while ((temp = reader.readLine()) != null) { StringBuilder sb = new StringBuilder("\n"); if (StringUtil.isBlank(temp)) { continue; } if (i == 0) { temp = StringUtil.trim(temp); } List<Element> list = Config.makeToElementList(temp, "\\s+"); for (Element element : list) { sb.append(element.nameStr() + " " + Config.getTagName(element.getTag())); sb.append("\n"); } fos.write(sb.toString().getBytes(IOUtil.UTF8)); System.out.println(++i); } } catch (FileNotFoundException e) { logger.warn("文件没有找到", e); } catch (IOException e) { logger.warn("IO异常", e); } } }