package edu.fudan.nlp.corpus.fnlp; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Date; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.concurrent.atomic.AtomicInteger; import edu.fudan.util.MyFiles; import edu.fudan.util.UnicodeReader; /** * FudanNLP标准数据格式 * @since FudanNLP 1.5 */ public class FNLPDoc { private static AtomicInteger num = new AtomicInteger(); /** * 文档类别 */ private String classes; /** * 文档文件名 */ public String name; public LinkedList<FNLPSent> sentences = new LinkedList<FNLPSent>(); private Map<Integer,Integer> anaphora; public FNLPDoc(){ name = String.valueOf(num.addAndGet(1)); } public FNLPDoc(List<String> carrier) { parse(carrier); } public void read(String file){ File f = new File(file); if(f.exists()) read(f); } public void read(File file){ name = file.getName(); List<String> carrier = new ArrayList<String>(); BufferedReader bfr =null; try { FileInputStream in = new FileInputStream(file); bfr = new BufferedReader(new InputStreamReader(in,Charset.forName("UTF-8"))); } catch (FileNotFoundException e) { e.printStackTrace(); } String str = null; try { while((str=bfr.readLine())!=null){ carrier.add(str); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } parse(carrier); } private void parse(List<String> carrier) { Iterator<String> it = carrier.iterator(); int bi=0; int ti = 0; while(it.hasNext()){ String line = it.next(); ti++; if(line.equals("<doc>")){ continue; }else if(line.startsWith("<c>")){ int idx1 = line.indexOf("</c>"); this.classes = line.substring(3,idx1); continue; }else if(line.startsWith("<name>")){ int idx1 = line.indexOf("</name>"); this.name = line.substring(6,idx1); continue; }else if(line.equals("</txt>")){//正文结束 List<String> list = carrier.subList(bi, ti-1); if(list.size()>0){ FNLPSent sent = new FNLPSent(list); sentences.add(sent); } continue; }else if(line.equals("<txt>")){ bi=ti; continue; }else if(line.equals("</doc>")){ break; }else if(line.length()==0){//空行,标志一个句子的结束 List<String> list = carrier.subList(bi, ti-1); if(list.size()>0){ FNLPSent sent = new FNLPSent(list); sentences.add(sent); } bi=ti; continue; }else{ } } } public String toString(){ StringBuffer sb = new StringBuffer(); sb.append("<doc>"+"\n"); if(classes!=null) sb.append("<c>"+classes+"</c>\n"); if(name!=null) sb.append("<name>"+name+"</name>\n"); sb.append("<txt>"+"\n"); Iterator<FNLPSent> it = sentences.iterator(); while(it.hasNext()){ FNLPSent sent = it.next(); sb.append(sent.toString()); if(it.hasNext()) sb.append("\n"); } sb.append("</txt>"+"\n"); sb.append("</doc>"); return sb.toString(); } public void write(String path){ Writer out = null; String file = path + "/" + name; try { out = new OutputStreamWriter(new FileOutputStream(file),"utf8"); out.write(this.toString()); out.close(); } catch(Exception e) { e.printStackTrace(); } } public void add(FNLPSent sent) { sentences.add(sent); } public void clear() { sentences.clear(); } public LinkedList<FNLPSent> getSent() { return this.sentences; } public FNLPSent getSent(int idx) { if(idx<sentences.size()) return sentences.get(idx); else return null; } }