package edu.fudan.nlp.cn.tag; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.HashMap; import java.util.regex.Pattern; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import edu.fudan.ml.types.Instance; import edu.fudan.nlp.cn.Tags; import edu.fudan.util.MyCollection; /** * 实体名标注器 * 通过词性标注实现。 * @author 邱锡鹏 * */ public class NERTagger { private static POSTagger pos; public NERTagger(CWSTagger cws, String str) throws Exception { pos = new POSTagger(cws, str); } public NERTagger(String segmodel, String posmodel) throws Exception { pos = new POSTagger(segmodel,posmodel); } public NERTagger(POSTagger posmodel){ pos = posmodel; } public HashMap<String, String> tag(String src) { HashMap<String, String> map = new HashMap<String, String>(); tag(src,map); return map; } public void tag(String src,HashMap<String, String> map) { String[] sents = src.split("\\n+"); try { for (int i = 0; i < sents.length; i++) { String[][] res = pos.tag2Array(sents[i]); if(res!=null){ for(int j=0;j<res[0].length;j++){ if(Tags.isEntiry(res[1][j])){ map.put(res[0][j], res[1][j]); } } } } } catch (Exception e) { e.printStackTrace(); } } public HashMap<String, String> tagFile(String input) { try { InputStreamReader read = new InputStreamReader(new FileInputStream( input), "utf-8"); BufferedReader lbin = new BufferedReader(read); String str = lbin.readLine(); HashMap<String, String> map = new HashMap<String, String>(); while (str != null) { tag(str,map); str = lbin.readLine(); } lbin.close(); return map; } catch (IOException e) { e.printStackTrace(); } return null; } public void tagFile(String input,String output) { HashMap<String, String> map = tagFile(input); MyCollection.write(map.keySet(), output); } public static void main(String[] args) throws Exception { Options opt = new Options(); opt.addOption("h", false, "Print help for this application"); opt.addOption("f", false, "segment file. Default string mode."); opt.addOption("s", false, "segment string"); BasicParser parser = new BasicParser(); CommandLine cl = parser.parse(opt, args); if (args.length == 0 || cl.hasOption('h')) { HelpFormatter f = new HelpFormatter(); f.printHelp( "Tagger:\n" + "java edu.fudan.nlp.tag.NERTagger -f segmodel posmodel input_file output_file;\n" + "java edu.fudan.nlp.tag.NERTagger -s segmodel posmodel string_to_segement", opt); return; } String[] arg = cl.getArgs(); String segmodel; String posmodel; String input; String output = null; if (cl.hasOption("f") && arg.length == 4) { segmodel = arg[0]; posmodel = arg[1]; input = arg[2]; output = arg[3]; } else if (arg.length == 3) { segmodel = arg[0]; posmodel = arg[1]; input = arg[2]; } else { System.err.println("paramenters format error!"); System.err.println("Print option \"-h\" for help."); return; } NERTagger ner = new NERTagger(segmodel,posmodel); if (cl.hasOption("f")) { ner.tagFile(input,output); } else { HashMap<String, String> map = ner.tag(input); System.out.println(map); } } }