/* * Copyright 2013 Websquared, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fastcatsearch.ir.document; import java.io.File; import java.io.IOException; import java.util.List; import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; import org.apache.commons.io.input.BoundedInputStream; import org.apache.lucene.util.BytesRef; import org.fastcatsearch.ir.common.IndexFileNames; import org.fastcatsearch.ir.field.Field; import org.fastcatsearch.ir.field.FieldDataParseException; import org.fastcatsearch.ir.io.BufferedFileInput; import org.fastcatsearch.ir.io.ByteRefArrayOutputStream; import org.fastcatsearch.ir.io.BytesDataInput; import org.fastcatsearch.ir.io.DataInput; import org.fastcatsearch.ir.io.IOUtil; import org.fastcatsearch.ir.io.IndexInput; import org.fastcatsearch.ir.settings.FieldSetting; import org.fastcatsearch.ir.settings.SchemaSetting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * 문서번호는 세그먼트마다 0부터 시작하는 번호로 read한다. baseNo와는 상관없는 내부문서번호. * */ public class DocumentReader implements Cloneable { private static Logger logger = LoggerFactory.getLogger(DocumentReader.class); private static final int INFLATE_BUFFER_INIT_SIZE = 20 * 1024; private List<FieldSetting> fields; private IndexInput docInput; private IndexInput positionInput; private ByteRefArrayOutputStream inflaterOutput; private byte[] workingBuffer; private int baseDocNo; private int documentCount; private int lastDocNo = -1; private DataInput lastBai; private long positionLimit; public DocumentReader() { } public DocumentReader(SchemaSetting schemaSetting, File dir) throws IOException { this(schemaSetting, dir, 0); } public DocumentReader(SchemaSetting schemaSetting, File dir, int baseDocNo) throws IOException { this.baseDocNo = baseDocNo; fields = schemaSetting.getFieldSettingList(); docInput = new BufferedFileInput(dir, IndexFileNames.docStored); positionInput = new BufferedFileInput(dir, IndexFileNames.docPosition); positionLimit = positionInput.length(); documentCount = docInput.readInt(); logger.info("DocumentCount = {}", documentCount); inflaterOutput = new ByteRefArrayOutputStream(INFLATE_BUFFER_INIT_SIZE); // 자동 증가됨. 초기 20KB으로 내림. 예전에는 3MB였음. workingBuffer = new byte[1024]; } public int getDocumentCount() { return documentCount; } public int getBaseNumber() { return baseDocNo; } // 내부 문서번호로 호출한다. public Document readDocument(int docNo) throws IOException { return readDocument(docNo, null); } public Document readIndexableDocument(int docNo) throws IOException { return readDocument(docNo, null, true); } public Document readDocument(int docNo, boolean[] fieldSelectOption) throws IOException { return readDocument(docNo, fieldSelectOption, false); } public Document readDocument(int docNo, boolean[] fieldSelectOption, boolean indexable) throws IOException { // if(docNo < baseDocNo) throw new // IOException("Request docNo cannot less than baseDocNo! docNo = "+docNo+", baseDocNo = "+baseDocNo); // baseDocNo만큼 빼서 세그먼트별 내부문서번호를 만든다. // docNo -= baseDocNo; DataInput bai = null; if (docNo != lastDocNo) { long positionOffset = docNo * IOUtil.SIZE_OF_LONG; if(positionOffset >= positionLimit){ //없는문서. return null; } positionInput.seek(positionOffset); long pos = positionInput.readLong(); // find a document block docInput.seek(pos); int len = docInput.readInt(); //2014-11-26 검색요청이 많아서 working 버퍼가 너무 빠르게 많이 생길경우 GC 되기전에 OOM 발생할수 있음. // Stream으로 바꾸어 해결. InflaterInputStream decompressInputStream = null; inflaterOutput.reset(); int count = -1; try { BoundedInputStream boundedInputStream = new BoundedInputStream(docInput, len); boundedInputStream.setPropagateClose(false);//하위 docInput 를 닫지않는다. decompressInputStream = new InflaterInputStream(boundedInputStream, new Inflater(), 512); while ((count = decompressInputStream.read(workingBuffer)) != -1) { inflaterOutput.write(workingBuffer, 0, count); } } finally { decompressInputStream.close(); } BytesRef bytesRef = inflaterOutput.getBytesRef(); bai = new BytesDataInput(bytesRef.bytes, 0, bytesRef.length); lastDocNo = docNo; lastBai = bai; } else { lastBai.reset(); bai = lastBai; } Document document = new Document(fields.size()); for (int i = 0; i < fields.size(); i++) { FieldSetting fs = fields.get(i); Field f = null; boolean hasValue = bai.readBoolean(); // logger.debug("read hasValue={}, select={}, fs={} ", hasValue, fieldSelectOption, fs); if (hasValue) { //1. fieldSelectOption 옵션이 없으면 모두 읽음. //2. 옵션이 존재한다면, true인 필드만을 읽는다. if(fieldSelectOption == null || (fieldSelectOption != null && fieldSelectOption[i])){ f = fs.createEmptyField(); f.readRawFrom(bai); }else{ bai.skipVIntData(); } // logger.debug("fill {} >> {}", i, f); }else{ //값이 없는 필드도 빈 필드를 추가해준다. f = fs.createEmptyField(); // logger.debug("fill {} >> empty", i); } if(f != null && indexable){ String multiValueDelimiter = fs.getMultiValueDelimiter(); try { f.parseIndexable(multiValueDelimiter); } catch (FieldDataParseException e) { throw new IOException(e); } } document.set(i, f); } document.setDocId(docNo + baseDocNo); return document; } @Override public DocumentReader clone() { DocumentReader reader = new DocumentReader(); reader.fields = fields; reader.docInput = docInput.clone(); reader.positionInput = positionInput.clone(); reader.baseDocNo = baseDocNo; reader.documentCount = documentCount; reader.inflaterOutput = new ByteRefArrayOutputStream(INFLATE_BUFFER_INIT_SIZE); // 자동 증가됨. reader.workingBuffer = new byte[1024]; reader.positionLimit = positionLimit; return reader; } public void close() throws IOException { docInput.close(); positionInput.close(); } }