package edu.fudan.data.reader;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;

import edu.fudan.ml.types.Instance;
import edu.fudan.util.MyFiles;

/**
 * Reads documents from a directory, producing one {@link Instance} per file.
 *
 * <p>Expected layout (per the original author's notes): the input is the path
 * where the data is stored, and files of different categories are placed in
 * different sub-folders, the sub-folder name being the category.
 *
 * <p>For each file, the instance data is the complete text of the file (every
 * line terminated with {@code '\n'}), and the second constructor argument of
 * the instance is the file's path as returned by {@link File#getPath()}.
 * NOTE(review): the category itself is not extracted here; presumably a later
 * processing stage derives it from the path -- confirm against callers.
 *
 * @author xpqiu
 * @version 1.0
 */
public class DocumentReader extends Reader {

	/** Files still to be read; consumed from the end of the list. */
	List<File> files;
	/** The most recently loaded instance. */
	Instance cur;
	/** Character set used to decode file contents. */
	Charset charset;
	/** True when {@link #cur} has been loaded but not yet handed out by {@link #next()}. */
	private boolean pending = false;

	/**
	 * Creates a reader over the files found under {@code path}, decoding them as UTF-8.
	 *
	 * @param path location of the data
	 */
	public DocumentReader(String path) {
		this(path, "UTF-8");
	}

	/**
	 * Creates a reader over the files found under {@code path}.
	 *
	 * @param path        location of the data; the file list is obtained from
	 *                    {@code MyFiles.getAllFiles(path, null)}
	 * @param charsetName name of the charset used to decode the files
	 * @throws java.nio.charset.IllegalCharsetNameException if the charset name is illegal
	 * @throws java.nio.charset.UnsupportedCharsetException if the charset is not supported
	 */
	public DocumentReader(String path, String charsetName) {
		files = MyFiles.getAllFiles(path, null);
		charset = Charset.forName(charsetName);
	}

	/**
	 * Reports whether another document is available.
	 *
	 * <p>The next document is loaded ahead of time on the first call; repeated
	 * calls without an intervening {@link #next()} do not consume further files.
	 *
	 * @return {@code true} if {@link #next()} will return a document
	 */
	public boolean hasNext() {
		if (pending) {
			// A document is already loaded and waiting; do not skip it.
			return true;
		}
		if (files.isEmpty()) {
			return false;
		}
		nextDocument();
		pending = true;
		return true;
	}

	/**
	 * Returns the next document.
	 *
	 * <p>Works whether or not {@link #hasNext()} was called first.
	 *
	 * @return the instance built from the next file
	 * @throws NoSuchElementException if no documents remain
	 */
	public Instance next() {
		if (!pending) {
			if (files.isEmpty()) {
				throw new NoSuchElementException("no more documents");
			}
			nextDocument();
		}
		pending = false;
		return cur;
	}

	/**
	 * Removes the last file from the pending list, reads it completely and
	 * stores the resulting instance in {@link #cur}.
	 *
	 * <p>Reading is best-effort: if the file cannot be opened or read, the
	 * stack trace is printed and the instance is built from whatever text was
	 * read up to that point (possibly empty).
	 */
	private void nextDocument() {
		File f = files.remove(files.size() - 1);
		StringBuilder text = new StringBuilder();
		BufferedReader in = null;
		try {
			in = new BufferedReader(new InputStreamReader(
					new FileInputStream(f), charset));
			String line;
			while ((line = in.readLine()) != null) {
				// readLine() strips the terminator; normalise every line end to '\n'.
				text.append(line).append('\n');
			}
		} catch (IOException e) {
			// Also covers FileNotFoundException, which is a subclass of IOException.
			e.printStackTrace();
		} finally {
			// Always release the file handle, even when reading failed midway.
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		cur = new Instance(text.toString(), f.getPath());
	}
}