// DocumentReader.java example (Explorer: fastcatsearch-master)
/*
 * Copyright 2013 Websquared, Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.fastcatsearch.ir.document;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

import org.apache.commons.io.input.BoundedInputStream;
import org.apache.lucene.util.BytesRef;
import org.fastcatsearch.ir.common.IndexFileNames;
import org.fastcatsearch.ir.field.Field;
import org.fastcatsearch.ir.field.FieldDataParseException;
import org.fastcatsearch.ir.io.BufferedFileInput;
import org.fastcatsearch.ir.io.ByteRefArrayOutputStream;
import org.fastcatsearch.ir.io.BytesDataInput;
import org.fastcatsearch.ir.io.DataInput;
import org.fastcatsearch.ir.io.IOUtil;
import org.fastcatsearch.ir.io.IndexInput;
import org.fastcatsearch.ir.settings.FieldSetting;
import org.fastcatsearch.ir.settings.SchemaSetting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads stored documents of a single segment.
 * <p>
 * Documents are addressed by an internal document number that starts at 0 in every segment; it is
 * independent of the segment's base number. The base number is only added when the id of the
 * returned {@link Document} is set.
 * <p>
 * As read by this class, the position file holds one {@code long} per document (the offset of the
 * document block inside the stored-document file), and each document block is an {@code int} length
 * followed by that many deflate-compressed bytes.
 * <p>
 * An instance keeps mutable read state (file positions, reusable buffers and a one-document cache).
 * Use {@link #clone()} to obtain an independent reader over the same files.
 */
public class DocumentReader implements Cloneable {
	private static final Logger logger = LoggerFactory.getLogger(DocumentReader.class);

	/** Initial capacity of the decompression output buffer. */
	private static final int INFLATE_BUFFER_INIT_SIZE = 20 * 1024;
	/** Size of the scratch buffer used to move decompressed bytes into the output buffer. */
	private static final int WORKING_BUFFER_SIZE = 1024;
	/** Internal buffer size handed to {@link InflaterInputStream}. */
	private static final int INFLATER_STREAM_BUFFER_SIZE = 512;
	/** Marker for "no document is cached"; valid document numbers are never negative. */
	private static final int NO_CACHED_DOC = -1;

	private List<FieldSetting> fields;
	private IndexInput docInput;
	private IndexInput positionInput;
	private ByteRefArrayOutputStream inflaterOutput;
	private byte[] workingBuffer;

	private int baseDocNo;
	private int documentCount;
	/** Number of the document whose decompressed bytes are currently held by {@link #lastBai}. */
	private int lastDocNo = NO_CACHED_DOC;
	/** Decompressed bytes of {@link #lastDocNo}; backed by the buffer of {@link #inflaterOutput}. */
	private DataInput lastBai;
	private long positionLimit;

	/**
	 * Creates an uninitialized reader. Used by {@link #clone()}, which fills in every field itself.
	 */
	public DocumentReader() {
	}

	/**
	 * Opens the stored documents in {@code dir} with a base document number of 0.
	 *
	 * @param schemaSetting schema that supplies the field settings used to decode documents
	 * @param dir           directory that contains the stored-document and position files
	 * @throws IOException if the files cannot be opened or read
	 */
	public DocumentReader(SchemaSetting schemaSetting, File dir) throws IOException {
		this(schemaSetting, dir, 0);
	}

	/**
	 * Opens the stored documents in {@code dir}.
	 *
	 * @param schemaSetting schema that supplies the field settings used to decode documents
	 * @param dir           directory that contains the stored-document and position files
	 * @param baseDocNo     value added to the internal document number when the document id is set
	 * @throws IOException if the files cannot be opened or read; any file that was already opened
	 *                     is closed before the exception propagates
	 */
	public DocumentReader(SchemaSetting schemaSetting, File dir, int baseDocNo) throws IOException {
		this.baseDocNo = baseDocNo;
		fields = schemaSetting.getFieldSettingList();

		boolean opened = false;
		try {
			docInput = new BufferedFileInput(dir, IndexFileNames.docStored);
			positionInput = new BufferedFileInput(dir, IndexFileNames.docPosition);
			positionLimit = positionInput.length();
			documentCount = docInput.readInt();
			opened = true;
		} finally {
			if (!opened) {
				// Do not leak file handles when construction fails half-way.
				closeQuietly(docInput);
				closeQuietly(positionInput);
			}
		}
		logger.info("DocumentCount = {}", documentCount);

		// Grows automatically. The initial size was lowered to 20KB; it used to be 3MB.
		inflaterOutput = new ByteRefArrayOutputStream(INFLATE_BUFFER_INIT_SIZE);
		workingBuffer = new byte[WORKING_BUFFER_SIZE];
	}

	/**
	 * @return the document count read from the head of the stored-document file
	 */
	public int getDocumentCount() {
		return documentCount;
	}

	/**
	 * @return the base document number this reader was created with
	 */
	public int getBaseNumber() {
		return baseDocNo;
	}

	/**
	 * Reads every field of a document.
	 *
	 * @param docNo internal (segment-local) document number
	 * @return the document, or {@code null} if no such document exists
	 * @throws IOException if reading or decompressing fails
	 */
	public Document readDocument(int docNo) throws IOException {
		return readDocument(docNo, null);
	}

	/**
	 * Reads every field of a document and parses each field into its indexable form.
	 *
	 * @param docNo internal (segment-local) document number
	 * @return the document, or {@code null} if no such document exists
	 * @throws IOException if reading or decompressing fails, or if a field value cannot be parsed
	 */
	public Document readIndexableDocument(int docNo) throws IOException {
		return readDocument(docNo, null, true);
	}

	/**
	 * Reads the selected fields of a document.
	 *
	 * @param docNo             internal (segment-local) document number
	 * @param fieldSelectOption per-field flags, indexed like the schema's field list; {@code null}
	 *                          selects every field
	 * @return the document, or {@code null} if no such document exists
	 * @throws IOException if reading or decompressing fails
	 */
	public Document readDocument(int docNo, boolean[] fieldSelectOption) throws IOException {
		return readDocument(docNo, fieldSelectOption, false);
	}

	/**
	 * Reads the selected fields of a document, optionally parsing them into indexable form.
	 * <p>
	 * Fields that have a stored value but are not selected are skipped and left {@code null} in the
	 * returned document. Fields without a stored value are always filled with an empty field.
	 * The document id is set to {@code docNo + baseDocNo}.
	 *
	 * @param docNo             internal (segment-local) document number
	 * @param fieldSelectOption per-field flags, indexed like the schema's field list; {@code null}
	 *                          selects every field
	 * @param indexable         when {@code true}, every non-null field is parsed with the field's
	 *                          multi-value delimiter
	 * @return the document, or {@code null} if {@code docNo} is negative or lies beyond the
	 *         position file
	 * @throws IOException if reading or decompressing fails, or if a field value cannot be parsed
	 */
	public Document readDocument(int docNo, boolean[] fieldSelectOption, boolean indexable) throws IOException {
		// docNo is already the segment-internal number; baseDocNo is not subtracted here.
		if (docNo < 0) {
			// A negative number can never address a document. It must also never match the
			// "nothing cached" marker, which would dereference a null cache entry.
			return null;
		}

		DataInput bai;
		if (docNo != lastDocNo || lastBai == null) {
			bai = loadDocumentData(docNo);
			if (bai == null) {
				// Nonexistent document.
				return null;
			}
		} else {
			// Same document as the previous call: rewind and reuse the decompressed bytes.
			lastBai.reset();
			bai = lastBai;
		}

		Document document = new Document(fields.size());
		for (int i = 0; i < fields.size(); i++) {
			FieldSetting fs = fields.get(i);
			Field f = null;
			boolean hasValue = bai.readBoolean();
			if (hasValue) {
				// 1. Without a fieldSelectOption every field is read.
				// 2. With an option only the fields flagged true are read; the others are skipped.
				if (fieldSelectOption == null || fieldSelectOption[i]) {
					f = fs.createEmptyField();
					f.readRawFrom(bai);
				} else {
					bai.skipVIntData();
				}
			} else {
				// A field without a stored value is still represented by an empty field.
				f = fs.createEmptyField();
			}
			if (f != null && indexable) {
				String multiValueDelimiter = fs.getMultiValueDelimiter();
				try {
					f.parseIndexable(multiValueDelimiter);
				} catch (FieldDataParseException e) {
					throw new IOException(e);
				}
			}
			document.set(i, f);
		}

		document.setDocId(docNo + baseDocNo);

		return document;
	}

	/**
	 * Locates, decompresses and caches the raw bytes of one document.
	 *
	 * @param docNo non-negative internal document number
	 * @return an input over the decompressed document bytes, or {@code null} if the position file
	 *         has no entry for {@code docNo}
	 * @throws IOException if reading or decompressing fails; the one-document cache is left empty
	 *                     in that case
	 */
	private DataInput loadDocumentData(int docNo) throws IOException {
		// Widen before multiplying so a large docNo cannot overflow int arithmetic.
		long positionOffset = (long) docNo * IOUtil.SIZE_OF_LONG;
		if (positionOffset >= positionLimit) {
			return null;
		}
		positionInput.seek(positionOffset);
		long pos = positionInput.readLong();
		// Find the document block.
		docInput.seek(pos);
		int len = docInput.readInt();

		// The cached input shares the buffer that is about to be overwritten. Drop the cache first
		// so a failed read cannot leave it pointing at corrupted bytes.
		lastDocNo = NO_CACHED_DOC;
		lastBai = null;
		inflaterOutput.reset();

		// 2014-11-26: under heavy search load, working buffers were created faster than the GC
		// could reclaim them, which could end in an OOM. Solved by decompressing through a stream.
		Inflater inflater = new Inflater();
		try {
			BoundedInputStream boundedInputStream = new BoundedInputStream(docInput, len);
			boundedInputStream.setPropagateClose(false); // Do not close the underlying docInput.
			InflaterInputStream decompressInputStream =
					new InflaterInputStream(boundedInputStream, inflater, INFLATER_STREAM_BUFFER_SIZE);
			try {
				int count;
				while ((count = decompressInputStream.read(workingBuffer)) != -1) {
					inflaterOutput.write(workingBuffer, 0, count);
				}
			} finally {
				decompressInputStream.close();
			}
		} finally {
			// InflaterInputStream only ends an Inflater it created itself; one passed in by the
			// caller must be ended explicitly or its native memory lingers until finalization.
			inflater.end();
		}

		BytesRef bytesRef = inflaterOutput.getBytesRef();
		DataInput bai = new BytesDataInput(bytesRef.bytes, 0, bytesRef.length);

		lastDocNo = docNo;
		lastBai = bai;
		return bai;
	}

	/**
	 * Creates an independent reader over the same files.
	 * <p>
	 * The clone shares the field settings, gets cloned inputs and fresh buffers, and starts with an
	 * empty one-document cache.
	 *
	 * @return a new reader
	 */
	@Override
	public DocumentReader clone() {
		DocumentReader reader = new DocumentReader();
		reader.fields = fields;
		reader.docInput = docInput.clone();
		reader.positionInput = positionInput.clone();
		reader.baseDocNo = baseDocNo;
		reader.documentCount = documentCount;

		// Grows automatically.
		reader.inflaterOutput = new ByteRefArrayOutputStream(INFLATE_BUFFER_INIT_SIZE);
		reader.workingBuffer = new byte[WORKING_BUFFER_SIZE];
		reader.positionLimit = positionLimit;
		return reader;
	}

	/**
	 * Closes both underlying inputs. The position input is closed even if closing the document
	 * input fails. Does nothing for inputs that were never opened.
	 *
	 * @throws IOException if closing either input fails
	 */
	public void close() throws IOException {
		try {
			if (docInput != null) {
				docInput.close();
			}
		} finally {
			if (positionInput != null) {
				positionInput.close();
			}
		}
	}

	/** Closes {@code input} if it is non-null, logging instead of throwing on failure. */
	private static void closeQuietly(IndexInput input) {
		if (input == null) {
			return;
		}
		try {
			input.close();
		} catch (IOException e) {
			logger.warn("Failed to close index input while cleaning up", e);
		}
	}
}