package edu.fudan.data.reader; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.util.Arrays; import java.util.LinkedList; import java.util.List; import edu.fudan.ml.types.Instance; /** * @author xpqiu * @version 1.0 * 文档数据读取如下: * 输入为数据存放路径(子文件夹不处理) * 不同类别存放在各自文件中 * 类别:文件名 * 数据:文件内的一行字符 * package edu.fudan.ml.data */ public class FileReader extends Reader { LinkedList<File> files; Instance cur; Charset charset; String content = null; BufferedReader reader; int line; File currentFile; private String filter; public FileReader(String path) { this(path, "UTF-8",null); } /** * * @param path 路径名 * @param charsetName 字符编码 * @param filter 文件类型过滤 */ public FileReader(String path, String charsetName, String filter) { files = new LinkedList<File>(); this.filter = filter; File fpath = new File(path); if(fpath.isDirectory()) { File[] flist = fpath.listFiles(); for(int i=0;i<flist.length;i++){ if(flist[i].isFile()){ if(filter==null) files.push(flist[i]); else if(flist[i].getName().endsWith(filter)) files.push(flist[i]); } } }else{ System.err.println("输入必须为目录"); } if(files.size()==0) System.err.println("找不到合法文件"); charset = Charset.forName(charsetName); getFile(); } private boolean getFile() { currentFile = files.poll(); if(currentFile==null) return false; try { FileInputStream in = new FileInputStream(currentFile); reader = new BufferedReader(new InputStreamReader(in, "UTF-8")); } catch (FileNotFoundException e) { System.err.println("文件不存在"); return false; } catch (UnsupportedEncodingException e) { System.err.println("文件编码错误"); return false; } line=0; return true; } public boolean hasNext() { while(true){ try{ content = reader.readLine(); line++; if(content==null){ reader.close(); if(!getFile()){ return false; } continue; } if(content.length()==0) continue; else return true; }catch (IOException e) { System.err.println("读文件错误。文件名:"+currentFile.getName()+"行数:"+(line-1)); return false; } } } public Instance next() { int idx = currentFile.getName().indexOf("."); return new Instance (content,currentFile.getName().substring(0, idx)); } }