package edu.fudan.nlp.corpus;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import edu.fudan.util.MyCollection;
import edu.fudan.util.MyFiles;
import gnu.trove.map.hash.TCharIntHashMap;
/**
 * Counts character frequencies over a corpus file or a directory of corpus files.
 *
 * <p>Only lines that start with {@code <content} are counted. The
 * {@code <contenttitle>}, {@code </contenttitle>}, {@code <content>} and
 * {@code </content>} tags are stripped from those lines first, so that markup
 * characters do not end up in the statistics.
 */
public class CorpusCount {
    /** Corpus location used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_PATH = "D:/wordcluster/SogouCA";
    /** Character encoding used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_ENCODING = "GBK";
    /** A progress line is printed each time this many lines have been counted in a file. */
    private static final int PROGRESS_INTERVAL = 10000;

    /** Number of occurrences of each distinct character counted so far. */
    TCharIntHashMap charfreq = new TCharIntHashMap();
    /** Total number of characters counted so far; an int, so it wraps past Integer.MAX_VALUE. */
    int charnum = 0;

    /**
     * Counts the characters of a corpus and prints the summary to standard output.
     *
     * @param args optional: {@code args[0]} is the corpus file or directory,
     *             {@code args[1]} is the charset name; missing values fall back
     *             to the built-in defaults
     * @throws Exception if the corpus cannot be read
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        String enc = (args != null && args.length > 1) ? args[1] : DEFAULT_ENCODING;
        CorpusCount wc = new CorpusCount();
        wc.countChar(path, enc);
        // toString() already ends with a newline, so print() rather than println().
        System.out.print(wc);
    }

    /**
     * Adds the characters of {@code ifile} to the running statistics. If
     * {@code ifile} is a directory, every file returned by
     * {@code MyFiles.getAllFiles} for it is processed in turn.
     *
     * @param ifile path of a corpus file or of a directory containing corpus files
     * @param enc   charset name used to decode the file(s)
     * @throws IOException if a file cannot be opened or read, or {@code enc} is not supported
     */
    private void countChar(String ifile, String enc) throws IOException {
        if (new File(ifile).isDirectory()) {
            List<File> files = MyFiles.getAllFiles(ifile, null);
            for (File f : files) {
                countChar(f.toString(), enc);
            }
            return;
        }

        FileInputStream in = new FileInputStream(ifile);
        try {
            BufferedReader bfr = new BufferedReader(new InputStreamReader(in, enc));
            String line;
            int count = 0;
            while ((line = bfr.readLine()) != null) {
                // Empty lines fail this check too, so no separate length test is needed.
                if (!line.startsWith("<content")) {
                    continue;
                }
                if (count % PROGRESS_INTERVAL == 0) {
                    System.out.println(count);
                }
                count++;

                String text = stripTags(line);
                for (int i = 0; i < text.length(); i++) {
                    charfreq.adjustOrPutValue(text.charAt(i), 1, 1);
                    charnum++;
                }
            }
        } finally {
            // Closing the underlying stream releases the file handle even when
            // decoding or reading fails part-way through.
            in.close();
        }
    }

    /**
     * Removes the content and content-title tags from a line.
     *
     * <p>{@link String#replace(CharSequence, CharSequence)} returns a new string,
     * so each result must be kept; discarding it leaves the line unchanged.
     */
    private static String stripTags(String line) {
        String text = line;
        text = text.replace("<contenttitle>", "");
        text = text.replace("</contenttitle>", "");
        text = text.replace("<content>", "");
        text = text.replace("</content>", "");
        return text;
    }

    /**
     * Returns a two-line summary: the number of distinct characters and the
     * total number of characters counted, each line terminated by a newline.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("char type number:\t").append(charfreq.size()).append("\n");
        sb.append("char number:\t").append(charnum).append("\n");
        return sb.toString();
    }
}