package net.paoding.analysis.analyzer.estimate; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.URL; import net.paoding.analysis.analyzer.PaodingAnalyzer; import net.paoding.analysis.knife.PaodingMaker; import org.apache.lucene.analysis.Analyzer; public class TryPaodingAnalyzer { private static final String ARGS_TIP = ":"; static String input = null; static String file = null; static Reader reader = null; static String charset = null; static String mode = null; static String analyzerName = null; static String print = null; static String properties = PaodingMaker.DEFAULT_PROPERTIES_PATH; public static void main(String[] args) { try { resetArgs(); int inInput = 0; for (int i = 0; i < args.length; i++) { if (args[i] == null || (args[i] = args[i].trim()).length() == 0) { continue; } if (args[i].equals("--file") || args[i].equals("-f")) { file = args[++i]; } else if (args[i].equals("--charset") || args[i].equals("-c")) { charset = args[++i]; } else if (args[i].equals("--mode") || args[i].equals("-m")) { mode = args[++i]; } else if (args[i].equals("--properties") || args[i].equals("-p")) { properties = args[++i]; } else if (args[i].equals("--analyzer") || args[i].equals("-a")) { analyzerName = args[++i]; } else if (args[i].equals("--print") || args[i].equals("-P")) { print = args[++i]; } else if (args[i].equals("--input") || args[i].equals("-i")) { inInput++; } else if (args[i].equals("--help") || args[i].equals("-h") || args[i].equals("?")) { printHelp(); return; } else { // 非选项的参数数组视为input if (!args[i].startsWith("-") && (i == 0 || args[i - 1].equals("-i") || args[i - 1].equals("--input") || !args[i - 1].startsWith("-"))) { if (input == null) { input = args[i];// !!没有++i } else { input = input + ' ' + args[i];// !!没有++i } inInput++; } } } if (file != null) { input = null; reader = getReader(file, charset); } // analysing(); } catch (Exception e1) { resetArgs(); e1.printStackTrace(); } } private static void resetArgs() { input = null; file = null; reader = null; charset = null; mode = null; print = null; analyzerName = null; properties = PaodingMaker.DEFAULT_PROPERTIES_PATH; } private static void analysing() throws Exception { Analyzer analyzer; if (analyzerName == null || analyzerName.length() == 0 || analyzerName.equalsIgnoreCase("paoding")) { //properties==null等同于new new PaodingAnalyzer(); analyzer = new PaodingAnalyzer(properties); if (mode != null) { ((PaodingAnalyzer) analyzer).setMode(mode); } } else { Class clz; if (analyzerName.equalsIgnoreCase("standard")) { analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer"; } else if (analyzerName.equalsIgnoreCase("cjk")) { analyzerName = "org.apache.lucene.analysis.cjk.CJKAnalyzer"; } else if (analyzerName.equalsIgnoreCase("cn") || analyzerName.equalsIgnoreCase("chinese")) { analyzerName = "org.apache.lucene.analysis.cn.ChineseAnalyzer"; } else if (analyzerName.equalsIgnoreCase("st") || analyzerName.equalsIgnoreCase("standard")) { analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer"; } clz = Class.forName(analyzerName); analyzer = (Analyzer) clz.newInstance(); } boolean readInputFromConsle = false; Estimate estimate = new Estimate(analyzer); if (print != null) { estimate.setPrint(print); } while (true) { if (reader == null) { if (input == null || input.length() == 0 || readInputFromConsle) { input = getInputFromConsole(); readInputFromConsle = true; } if (input == null || input.length() == 0) { System.out.println("Warn: none charactors you input!!"); continue; } else if (input.startsWith(ARGS_TIP)) { String argsStr = input.substring(ARGS_TIP.length()); main(argsStr.split(" ")); continue; } } if (reader != null) { estimate.test(System.out, reader); reader = null; } else { estimate.test(System.out, input); input = null; } System.out.println("--------------------------------------------------"); if (false == readInputFromConsle) { return; } } } private static String getInputFromConsole() throws IOException { printTitleIfNotPrinted(""); String input = null; BufferedReader reader = new BufferedReader(new InputStreamReader( System.in)); String line; do { System.out.print("paoding> "); line = reader.readLine(); if (line == null || line.length() == 0) { continue; } if (line.equals(ARGS_TIP + "clear") || line.equals(ARGS_TIP + "c")) { input = null; System.out.println("paoding> Cleared"); return getInputFromConsole(); } else if (line.equals(ARGS_TIP + "exit") || line.equals(ARGS_TIP + "quit") || line.equals(ARGS_TIP + "e") || line.equals(ARGS_TIP + "q") ) { System.out.println("Bye!"); System.exit(0); } else if (input == null && line.startsWith(ARGS_TIP)) { input = line; break; } else { if (line.endsWith(";")) { if (line.length() > ";".length()) { input = line.substring(0, line.length() - ";".length()); } break; } else { if (input == null) { input = line; } else { input = input + "\n" + line; } } } } while (true); return input == null ? null : input.trim(); } private static void printHelp() { String app = System.getProperty("paoding.try.app", "TryPaodingAnalyzer"); String cmd = System.getProperty("paoding.try.cmd", "java " + TryPaodingAnalyzer.class.getName()); System.out.println(app + "的用法:"); System.out.println("\t" + cmd + " [OPTIONS] [text_content]"); System.out.println("\nOPTIONS:"); System.out.println("\t--file, -f:\n\t\t文章以文件的形式输入,在前缀加上\"classpath:\"表示从类路径中寻找该文件。"); System.out.println("\t--charset, -c:\n\t\t文章的字符集编码,比如gbk,utf-8等。如果没有设置该选项,则使用Java环境默认的字符集编码。"); System.out.println("\t--properties, -p:\n\t\t不读取默认的类路径下的庖丁分词属性文件,而使用指定的文件,在前缀加上\"classpath:\"表示从类路径中寻找该文件。"); System.out.println("\t--mode, -m:\n\t\t强制使用给定的mode的分词器;可以设定为default,most-words,max-word-length或指定类名的其他mode(指定类名的,需要加前缀\"class:\")。"); System.out.println("\t--input, -i:\n\t\t要被分词的文章内容;当没有通过-f或--file指定文章输入文件时可选择这个选项指定要被分词的内容。"); System.out.println("\t--analyzer, -a:\n\t\t测试其他分词器,通过--analyzer或-a指定其完整类名。特别地,paoding、cjk、chinese、st分别对应PaodingAnalyzer、CJKAnalyzer、ChineseAnalyzer、StandardAnalyzer"); System.out.println("\t--print, -P:\n\t\t 是否打印分词结果。默认打印前50行。规则:no表示不打印;50等价于1-50行;1-50表示打印1至50行;可以以逗号组合使用,如20,40-50表示打印1-20以及40-50行"); System.out.println("\n示例:"); System.out.println("\t" + cmd); System.out.println("\t" + cmd + " ?"); System.out.println("\t" + cmd + " 中华人民共和国"); System.out.println("\t" + cmd + " -m max 中华人民共和国"); System.out.println("\t" + cmd + " -f e:/content.txt -c utf8"); System.out.println("\t" + cmd + " -f e:/content.txt -c utf8 -m max-word-length"); System.out.println("\t" + cmd + " -f e:/content.txt -c utf8 -a cjk"); System.out.println("\n若是控制台进入\"paoding>\"后:"); titlePrinted = false; printTitleIfNotPrinted("\t"); } private static boolean titlePrinted = false; private static boolean welcomePrinted = false; private static void printTitleIfNotPrinted(String prefix) { if (!titlePrinted) { System.out.println(); if (!welcomePrinted) { System.out.println("Welcome to Paoding Analyser(2.0.4-alpha2)"); System.out.println(); welcomePrinted = true; } System.out.println(prefix + "直接输入或粘贴要被分词的内容,以分号;结束,回车后开始分词。"); System.out.println(prefix + "另起一行输入:clear或:c,使此次输入无效,用以重新输入。"); System.out.println(prefix + "要使用命令行参数读入文件内容或其他参数请以冒号:开始,然后输入参数选项。"); System.out.println(prefix + "退出,请输入:quit或:q、:exit、:e"); System.out.println(prefix + "需要帮助,请输入:?"); System.out.println(prefix + "注意:指定对文件分词之前要了解该文件的编码,如果系统编码和文件编码不一致,要通过-c指定文件的编码。"); System.out.println(); titlePrinted = true; } } static String getContent(String path, String encoding) throws IOException { return (String) read(path, encoding, true); } static Reader getReader(String path, String encoding) throws IOException { return (Reader) read(path, encoding, false); } static Object read(String path, String encoding, boolean return_string) throws IOException { InputStream in; if (path.startsWith("classpath:")) { path = path.substring("classpath:".length()); URL url = Estimate.class.getClassLoader().getResource(path); if (url == null) { throw new IllegalArgumentException("Not found " + path + " in classpath."); } System.out.println("read content from:" + url.getFile()); in = url.openStream(); } else { File f = new File(path); if (!f.exists()) { throw new IllegalArgumentException("Not found " + path + " in system."); } System.out.println("read content from:" + f.getAbsolutePath()); in = new FileInputStream(f); } Reader re; if (encoding != null) { re = new InputStreamReader(in, encoding); } else { re = new InputStreamReader(in); } if (!return_string) { return re; } char[] chs = new char[1024]; int count; // 为兼容低版本的JDK,使用StringBuffer而不是StringBuilder StringBuffer content = new StringBuffer(); while ((count = re.read(chs)) != -1) { content.append(chs, 0, count); } re.close(); return content.toString(); } }