package org.fastcatsearch.datasource.reader; import org.fastcatsearch.datasource.SourceModifier; import org.fastcatsearch.ir.common.IRException; import org.fastcatsearch.ir.config.SingleSourceConfig; import java.io.*; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.zip.GZIPInputStream; /** * 파일기반의 소스데이터를 읽어들이는 Abstract Reader이다. * GZip으로 압축되어 있다면 자동으로 풀면서 읽어들이고, 압축되지 있지 않은 데이터는 그대로 읽어들인다. * 하위 클래스에서는 parse()를 구현하여 어떻게 문서를 읽어들이는지를 정의하도록 한다. * */ public abstract class AbstractFileReader extends SingleSourceReader<Map<String,Object>> implements FileFilter { private LinkedList<Map<String, Object>> items; protected String encoding; protected int bufferSize; protected int limitSize; protected List<String> filePaths; protected BufferedReader reader; private static final int DEFAULT_BUFFER_SIZE = 100; private int readCount; public AbstractFileReader() { super(); } public AbstractFileReader(String collectionId, File filePath, SingleSourceConfig singleSourceConfig , SourceModifier sourceModifier, String lastIndexTime) throws IRException { super(collectionId, filePath, singleSourceConfig, sourceModifier, lastIndexTime); } @Override public void init() throws IRException { String filePathStr = getConfigString("filePath"); encoding = getConfigString("encoding", "utf-8"); bufferSize = getConfigInt("bufferSize", DEFAULT_BUFFER_SIZE); limitSize = getConfigInt("limitSize"); if(bufferSize < DEFAULT_BUFFER_SIZE) { bufferSize = DEFAULT_BUFFER_SIZE; } items = new LinkedList<Map<String, Object>>(); filePaths = new LinkedList<String>(); String[] pathList = filePathStr.split(","); for(String path : pathList) { String rootPath = filePath.makePath(path).file().getAbsolutePath(); File rootFile = new File(rootPath); if(rootFile.isDirectory()) { rootFile.listFiles(this); } else { filePaths.add(rootFile.getAbsolutePath()); } } readCount = 0; } @Override protected void initParameters() { registerParameter(new SourceReaderParameter("filePath", "File or Dir Path", "File path for reading source file. Absolute path or relative path for collection home directory. Multiple paths are allowed with commas." , SourceReaderParameter.TYPE_STRING_LONG, true, null)); registerParameter(new SourceReaderParameter("encoding", "Encoding", "File encoding" , SourceReaderParameter.TYPE_STRING, true, "utf-8")); registerParameter(new SourceReaderParameter("bufferSize", "Buffer Size", "Read Buffer Size" , SourceReaderParameter.TYPE_NUMBER, true, String.valueOf(DEFAULT_BUFFER_SIZE))); registerParameter(new SourceReaderParameter("limitSize", "Limit Size", "Read documents within limit size." , SourceReaderParameter.TYPE_NUMBER, false, "")); } @Override public boolean hasNext() throws IRException { if(items.size() == 0) { fill(); } return items.size() > 0; } @Override protected Map<String, Object> next() throws IRException { if(items.size() == 0) { fill(); } if(items.size() > 0) { return items.removeFirst(); } return null; } private void fill() throws IRException { while (true) { if(reader != null) { try { if(items.size() >= bufferSize) { return; } if(limitSize > 0 && readCount >= limitSize) { return; } Map<String, Object> record = parse(reader); items.addLast(record); readCount++; } catch(IOException e) { //get next reader.. try { reader.close(); } catch (IOException ignore) { } reader = null; } } else { while (filePaths.size() > 0) { String path = filePaths.remove(0); File f = new File(path); if(!f.exists()) { //파일이 없으면 continue logger.error(String.format("File not exists : %s", f.getAbsolutePath())); continue; } try { if(isGZipped(f)) { reader = new BufferedReader((new InputStreamReader(new GZIPInputStream(new FileInputStream(f)), encoding))); } else { reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding)); } initReader(reader); break; } catch (IOException ex) { logger.error("", ex); if(reader != null) { try { reader.close(); } catch (IOException ignore) { } reader = null; } } } //파일이 더 이상 없으면 끝낸다. if(reader == null) { break; } } } } /* * 읽은 데이터를 리턴한다. * reader가 EOF등에 다다르면 IOException을 던져서 다음 reader를 준비하도록 한다. * */ protected abstract Map<String, Object> parse(BufferedReader reader) throws IRException, IOException; protected abstract void initReader(BufferedReader reader) throws IRException, IOException; @Override public void close() throws IRException { super.close(); if(reader != null) { try { reader.close(); } catch (IOException ignore) { } } } @Override public boolean accept(File file) { if(file.isDirectory()) { logger.trace("dir:{}", file.getAbsolutePath()); file.listFiles(this); } else if(file.isFile()) { logger.trace("file : {}", file.getAbsolutePath()); if(! file.isHidden()) { filePaths.add(file.getAbsolutePath()); } } return true; } private boolean isGZipped(File file) { int magic = 0; RandomAccessFile raf = null; try { raf = new RandomAccessFile(file, "r"); magic = raf.read() & 0xff | ((raf.read() << 8) & 0xff00); } catch (Throwable t) { logger.error("error while inspect file header.", t); } finally { if(raf != null) { try { raf.close(); } catch (IOException ignore) { } } } return magic == GZIPInputStream.GZIP_MAGIC; } }