package edu.fudan.nlp.corpus; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.util.Scanner; public class CreateNounTrainFile { //ctb_v6/processed/postagged.train -> ctb_v6/processed/nountagged.train //目前postagged.train和nountagged.train还有训练noun的template在example-data/ctb_v6下 public static void main(String[] args) throws IOException { FileInputStream in = new FileInputStream("ctb_v6/processed/postagged.train"); FileOutputStream out = new FileOutputStream("ctb_v6/processed/nountagged.train"); Scanner scanner = new Scanner(in); PrintWriter pw = new PrintWriter(out); while(scanner.hasNext()) { String line = scanner.nextLine(); if(line.trim().equals("")) { pw.println(); continue; } String[] sa = line.split("\t"); //System.out.println(sa[0] + " " + sa[1]); if(sa[1].indexOf("-N") != -1) { if(sa[1].indexOf("B-N") != -1) pw.println(sa[0] + "\t" + "B"); else if(sa[1].indexOf("S-N") != -1) pw.println(sa[0] + "\t" + "S"); else if(sa[1].indexOf("M-N") != -1) pw.println(sa[0] + "\t" + "M"); else if(sa[1].indexOf("E-N") != -1) pw.println(sa[0] + "\t" + "E"); else { System.out.println("error!"); System.exit(1); } } else pw.println(sa[0] + "\t" + "O"); } scanner.close(); in.close(); pw.close(); out.close(); } }