package edu.fudan.nlp.corpus.fnlp;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.TreeSet;
import edu.fudan.nlp.cn.ChineseTrans;
import edu.fudan.util.MyCollection;
import edu.fudan.util.MyFiles;
import edu.fudan.util.UnicodeReader;
import edu.fudan.util.ValueComparator;
/**
 * FudanNLP standard data format: an in-memory corpus holding a list of
 * {@link FNLPDoc} documents, with readers for several on-disk layouts and
 * writers / statistics over the loaded documents.
 *
 * @since FudanNLP 1.5
 */
public class FNLPCorpus {

    /** Documents of this corpus, in insertion order. */
    public LinkedList<FNLPDoc> docs = new LinkedList<FNLPDoc>();

    public FNLPCorpus() {
    }

    /**
     * Returns the document list backing this corpus (the live list, not a copy).
     *
     * @return the internal document list
     */
    public LinkedList<FNLPDoc> getDocumenList() {
        return this.docs;
    }

    /**
     * Appends a document to the corpus.
     *
     * @param doc document to append
     */
    public void add(FNLPDoc doc) {
        docs.add(doc);
    }

    /**
     * Writes the corpus to multiple files, one file per document, by calling
     * {@code doc.write(path)} on every document.
     *
     * @param path output directory; created (including parents) if it does not exist
     */
    public void write(String path) {
        File dir = new File(path);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        for (FNLPDoc doc : docs) {
            doc.write(path);
        }
    }

    /**
     * Writes the whole corpus to a single UTF-8 file: each document's
     * {@code toString()} followed by a newline.
     * <p>
     * Failures are reported with {@code printStackTrace()} and not rethrown;
     * the output stream is closed in every case.
     *
     * @param path output file; its parent directory is created if missing
     */
    public void writeOne(String path) {
        File parent = new File(path).getParentFile();
        // getParentFile() returns null for a bare file name such as "corpus.txt".
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        Writer out = null;
        try {
            out = new OutputStreamWriter(new FileOutputStream(path), "utf8");
            for (FNLPDoc doc : docs) {
                out.write(doc.toString());
                out.write("\n");
            }
            // Close inside the try so that a failing final flush is reported too.
            out.close();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // No-op after a successful close; releases the handle after a failure.
            closeQuietly(out);
        }
    }

    /**
     * Collects word statistics and writes them below {@code path}:
     * word frequencies ({@code wc.txt}), tag frequencies ({@code pos.txt}),
     * relation frequencies ({@code relations.txt}), word-length frequencies
     * ({@code lc.txt}) and the set of tags seen per word ({@code wordpos.txt}).
     * The total token and sentence counts are printed to standard output.
     *
     * @param path output directory prefix
     * @param b whether to output the frequencies (passed through to
     *          {@code MyCollection.write})
     * @throws IOException
     */
    public void count(String path, boolean b) throws IOException {
        HashMap<String, Integer> wordsFreq = new HashMap<String, Integer>();
        HashMap<Integer, Integer> lensFreq = new HashMap<Integer, Integer>();
        HashMap<String, Integer> posFreq = new HashMap<String, Integer>();
        HashMap<String, Integer> relsFreq = new HashMap<String, Integer>();
        HashMap<String, HashSet<String>> wordsPOS = new HashMap<String, HashSet<String>>();
        int total = 0;
        int totalsent = 0;
        for (FNLPDoc doc : docs) {
            for (FNLPSent sent : doc.sentences) {
                totalsent++;
                for (int i = 0; i < sent.words.length; i++) {
                    total++;
                    String w = sent.words[i];
                    int len = w.length();
                    String pos = sent.tags[i];
                    // NOTE(review): tokens whose tag differs from the literal below are
                    // skipped, so only that single tag feeds the frequency tables while
                    // `total` still counts every token. This looks like a leftover
                    // debugging filter -- confirm before removing it.
                    if (!"专有名".equals(pos)) {
                        continue;
                    }
                    String rels = sent.relations[i];

                    increment(posFreq, pos);
                    increment(lensFreq, len);
                    increment(wordsFreq, w);
                    increment(relsFreq, rels);

                    HashSet<String> posset = wordsPOS.get(w);
                    if (posset == null) {
                        posset = new HashSet<String>();
                        wordsPOS.put(w, posset);
                    }
                    posset.add(pos);
                }
            }
        }
        System.out.println("总字数:" + total);
        System.out.println("总句数:" + totalsent);

        List<Entry> sortedwordsFreq = MyCollection.sort(wordsFreq);
        MyCollection.write(sortedwordsFreq, path + "/wc.txt", b);

        List<Entry> sortedposFreq = MyCollection.sort(posFreq);
        MyCollection.write(sortedposFreq, path + "/pos.txt", b);

        List<Entry> sortedlrelsFreq = MyCollection.sort(relsFreq);
        MyCollection.write(sortedlrelsFreq, path + "/relations.txt", b);

        List<Entry> sortedlensFreq = MyCollection.sort(lensFreq);
        MyCollection.write(sortedlensFreq, path + "/lc.txt", b);

        MyCollection.write(wordsPOS, path + "/wordpos.txt");
    }

    /**
     * Reads documents in the tagged format from every file returned by
     * {@code MyFiles.getAllFiles(path, suffix)}. Files are decoded as UTF-8.
     * A line equal to {@code <doc>} starts a new document, a line equal to
     * {@code </doc>} ends it (both compared case-insensitively after trimming);
     * every other line is collected and handed to the {@link FNLPDoc} constructor.
     *
     * @param path file or directory to read from
     * @param suffix file-name filter passed to {@code MyFiles.getAllFiles}
     * @throws IOException if a file cannot be opened or read
     */
    public void read(String path, String suffix) throws IOException {
        List<File> files = MyFiles.getAllFiles(path, suffix);
        List<String> carrier = new ArrayList<String>();
        for (File file : files) {
            FileInputStream in = new FileInputStream(file);
            BufferedReader bfr = null;
            try {
                bfr = new BufferedReader(new InputStreamReader(in, "utf8"));
                String line;
                while ((line = bfr.readLine()) != null) {
                    line = line.trim();
                    if (line.equalsIgnoreCase("<doc>")) {
                        carrier.clear();
                    } else if (line.equalsIgnoreCase("</doc>")) {
                        // FNLPDoc's constructor is not visible here; hand it a copy so
                        // that reusing `carrier` for the next document cannot alias it.
                        docs.add(new FNLPDoc(new ArrayList<String>(carrier)));
                    } else {
                        carrier.add(line);
                    }
                }
            } finally {
                closeQuietly(bfr);
                closeQuietly(in);
            }
        }
    }

    public static void main(String[] args) throws IOException {
        FNLPCorpus corpus = new FNLPCorpus();
        // corpus.read("./data/FNLPDATA/ctb7.dat",null);
        corpus.readOurCorpus("./data/ourdata", null, "UTF8");
        corpus.count("./tmp/", false);
        System.out.println(new Date().toString());
        System.out.println("Done!");
    }

    /**
     * Reads files that contain word segmentation only. Each file becomes one
     * document named after the file; each non-empty (trimmed) line becomes one
     * sentence via {@code FNLPSent.put(line)}.
     *
     * @param path file or directory to read from
     * @param suffix file-name filter passed to {@code MyFiles.getAllFiles}
     * @param charset charset handed to {@code UnicodeReader}
     * @throws IOException if a file cannot be opened or read
     */
    public void readCWS(String path, String suffix, String charset) throws IOException {
        List<File> files = MyFiles.getAllFiles(path, suffix);//".txt"
        for (File file : files) {
            FileInputStream in = new FileInputStream(file);
            BufferedReader bfr = null;
            try {
                bfr = new BufferedReader(new UnicodeReader(in, charset));
                FNLPDoc doc = new FNLPDoc();
                doc.name = file.getName();
                String line;
                while ((line = bfr.readLine()) != null) {
                    line = line.trim();
                    if (line.length() == 0) {
                        continue;
                    }
                    FNLPSent sent = new FNLPSent();
                    sent.put(line);
                    doc.add(sent);
                }
                add(doc);
            } finally {
                closeQuietly(bfr);
                closeQuietly(in);
            }
        }
    }

    /**
     * Reads files annotated in the project's own format. Each file becomes one
     * document named after the file; each non-empty (trimmed) line is one
     * sentence, split on whitespace into consecutive groups of five tokens:
     * position index, word, tag, head index and relation. The position index
     * selects the slot of the sentence arrays that the group is stored in.
     * <p>
     * The "multiple of five" requirement is only checked by an {@code assert},
     * i.e. only when the JVM runs with assertions enabled; trailing tokens of an
     * incomplete group are otherwise ignored.
     *
     * @param path file or directory to read from
     * @param suffix file-name filter passed to {@code MyFiles.getAllFiles}
     * @param charset charset handed to {@code UnicodeReader}
     * @throws IOException if a file cannot be opened or read
     * @throws NumberFormatException if an index or head token is not an integer
     */
    public void readOurCorpus(String path, String suffix, String charset) throws IOException {
        List<File> files = MyFiles.getAllFiles(path, suffix);//".txt"
        for (File file : files) {
            FileInputStream in = new FileInputStream(file);
            BufferedReader bfr = null;
            try {
                bfr = new BufferedReader(new UnicodeReader(in, charset));
                FNLPDoc doc = new FNLPDoc();
                doc.name = file.getName();
                String line;
                while ((line = bfr.readLine()) != null) {
                    line = line.trim();
                    if (line.length() == 0) {
                        continue;
                    }
                    String[] toks = line.split("\\s+");
                    assert toks.length % 5 == 0
                            : "token count " + toks.length + " is not a multiple of 5 in "
                            + file + ": " + line;
                    int len = toks.length / 5;
                    FNLPSent sent = new FNLPSent(len);
                    for (int i = 0, base = 0; i < len; i++, base += 5) {
                        int idx = Integer.parseInt(toks[base]);
                        sent.words[idx] = toks[base + 1];
                        sent.tags[idx] = toks[base + 2];
                        sent.heads[idx] = Integer.parseInt(toks[base + 3]);
                        sent.relations[idx] = toks[base + 4];
                    }
                    doc.add(sent);
                }
                add(doc);
            } finally {
                closeQuietly(bfr);
                closeQuietly(in);
            }
        }
    }

    /**
     * @return number of documents in the corpus
     */
    public int getDocumenSize() {
        return docs.size();
    }

    /**
     * Returns the document at the given position.
     *
     * @param idx zero-based document index
     * @return the document, or {@code null} if {@code idx} is out of range
     *         (negative or not smaller than the number of documents)
     */
    public FNLPDoc getDoc(int idx) {
        if (idx < 0 || idx >= docs.size()) {
            return null;
        }
        return docs.get(idx);
    }

    /**
     * Collects every part-of-speech tag used in the corpus. Sentences for which
     * {@code hasTag()} is false are skipped.
     *
     * @return sorted set of all tags
     */
    public TreeSet<String> getAllPOS() {
        TreeSet<String> set = new TreeSet<String>();
        for (FNLPDoc doc : docs) {
            for (FNLPSent sent : doc.sentences) {
                if (!sent.hasTag()) {
                    continue;
                }
                for (int i = 0; i < sent.size(); i++) {
                    set.add(sent.tags[i]);
                }
            }
        }
        return set;
    }

    /** Adds one to the counter stored under {@code key}, starting at 1 for a new key. */
    private static <K> void increment(HashMap<K, Integer> freq, K key) {
        Integer old = freq.get(key);
        freq.put(key, old == null ? 1 : old + 1);
    }

    /** Closes {@code c} if it is non-null, ignoring any failure of the close itself. */
    private static void closeQuietly(java.io.Closeable c) {
        if (c == null) {
            return;
        }
        try {
            c.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close on a cleanup path.
        }
    }
}