package edu.fudan.nlp.similarity.train;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import edu.fudan.data.reader.Reader;
import edu.fudan.ml.types.Instance;
import edu.fudan.nlp.cn.ChineseTrans;
/**
 * Sequential reader for a tagged news corpus file, decoded as GBK.
 *
 * <p>Each record is expected to look like this, one tag per line (the content
 * may span several lines, which are joined without a separator):
 * <pre>
 * &lt;doc&gt;
 * &lt;url&gt;...&lt;/url&gt;
 * &lt;docno&gt;...&lt;/docno&gt;
 * &lt;contenttitle&gt;...&lt;/contenttitle&gt;
 * &lt;content&gt;...&lt;/content&gt;
 * </pre>
 * Lines outside a record (for example a closing tag after the content) are
 * skipped.
 *
 * <p>Usage pattern: call {@link #hasNext()} to load the next record, then
 * {@link #next()} to obtain its content. Note that {@code hasNext()} itself
 * advances the reader on every call.
 */
public class SougouCA extends Reader {
    /** Line that opens a record. */
    private static final String DOC_OPEN = "<doc>";

    private static final ChineseTrans tc = new ChineseTrans();

    File file = null;
    BufferedReader reader = null;
    /** Fields of the most recently loaded record; {@code null} until one is loaded. */
    String url = null;
    String docno = null;
    String contenttitle = null;
    String content = null;

    /**
     * Opens the corpus file for reading with GBK decoding.
     *
     * <p>If the file does not exist or cannot be opened, no exception is
     * thrown; the reader is left unset and {@link #hasNext()} returns
     * {@code false}.
     *
     * @param strfile path of the corpus file
     */
    public SougouCA(String strfile) {
        file = new File(strfile);
        if (!file.exists()) {
            file = null;
            reader = null;
            return;
        }
        FileInputStream in = null;
        try {
            in = new FileInputStream(file);
            reader = new BufferedReader(new InputStreamReader(in, "GBK"));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            // The stream was opened before the charset lookup failed; do not leak it.
            try {
                in.close();
            } catch (IOException closeError) {
                closeError.printStackTrace();
            }
        }
    }

    /**
     * Reads forward to the next record and loads its fields.
     *
     * <p>Every call advances the underlying reader, so calling this method
     * twice in a row skips a record. Lines that are not a record opener are
     * skipped. A record that is cut off by the end of the file is discarded
     * and treated as end of input. The reader is closed once the end of input
     * is reached or an I/O error occurs.
     *
     * @return {@code true} if a complete record was loaded, {@code false} if
     *         there is no reader, no further complete record, or reading failed
     */
    public boolean hasNext() {
        if (reader == null) {
            return false;
        }
        try {
            // Advance to the next record opener, ignoring anything in between.
            String line = reader.readLine();
            while (line != null && !line.equals(DOC_OPEN)) {
                line = reader.readLine();
            }
            if (line == null) {
                closeReader();
                return false;
            }

            String urlLine = reader.readLine();
            String docnoLine = reader.readLine();
            String titleLine = reader.readLine();
            String rawContent = readRawContent();
            if (urlLine == null || docnoLine == null || titleLine == null
                    || rawContent == null) {
                // Truncated record: the file ended in the middle of it.
                closeReader();
                return false;
            }

            // Only publish the fields once the whole record has been read.
            url = stripTag(urlLine, "url");
            docno = stripTag(docnoLine, "docno");
            contenttitle = stripTag(titleLine, "contenttitle");
            content = stripTag(rawContent, "content");
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            closeReader();
        }
        return false;
    }

    /**
     * Returns the content of the record loaded by the last successful
     * {@link #hasNext()} call.
     *
     * @return a new instance wrapping the current content string
     */
    public Instance next() {
        return new Instance(content);
    }

    /**
     * Reads content lines up to and including the one that ends with the
     * closing content tag, joining them without any separator.
     *
     * @return the joined raw content (tags still attached), or {@code null}
     *         if the end of the file is reached before the closing tag
     * @throws IOException if reading fails
     */
    private String readRawContent() throws IOException {
        String line = reader.readLine();
        if (line == null) {
            return null;
        }
        StringBuilder joined = new StringBuilder(line);
        while (!line.endsWith("</content>")) {
            line = reader.readLine();
            if (line == null) {
                return null;
            }
            joined.append(line);
        }
        return joined.toString();
    }

    /**
     * Removes a leading {@code <tag>} and then a trailing {@code </tag>} from
     * the text, if present.
     *
     * @param text the tagged line
     * @param tag the tag name without angle brackets
     * @return the text without the surrounding tags
     */
    private static String stripTag(String text, String tag) {
        String open = "<" + tag + ">";
        String close = "</" + tag + ">";
        String result = text;
        if (result.startsWith(open)) {
            result = result.substring(open.length());
        }
        if (result.endsWith(close)) {
            result = result.substring(0, result.length() - close.length());
        }
        return result;
    }

    /** Closes the underlying reader, if any, and clears the reference. */
    private void closeReader() {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        reader = null;
    }

    /**
     * Reads every record of a fixed corpus file, normalizes its content and
     * writes the non-empty results, one per line, to a UTF-8 text file.
     *
     * @param args unused
     * @throws IOException if the output file cannot be opened or written
     */
    public static void main(String[] args) throws IOException {
        BufferedWriter bout = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream("./tmpdata/trad.txt"), "UTF-8"));
        try {
            SougouCA sca = new SougouCA("./tmpdata/SogouCa/news.allsites.010805.txt");
            try {
                while (sca.hasNext()) {
                    String s = (String) sca.next().getData();
                    s = tc.normalize(s);
                    if (s.length() == 0) {
                        continue;
                    }
                    bout.write(s);
                    bout.write("\n");
                }
            } finally {
                // Release the input even if normalization or writing fails.
                sca.closeReader();
            }
        } finally {
            bout.close();
        }
        System.out.println("Done!");
    }
}