package org.apache.lucene.chunk;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
/**
* Reads and caches chunks from an index.
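*
* <p>A minimal usage sketch (the reader, docNumMap, and analyzer are
* assumed to be supplied by the surrounding application, and "text" is
* a hypothetical field name):</p>
*
* <pre>
*   ChunkSource source = new ChunkSource(reader, docNumMap, mainDocNum,
*                                        "text", analyzer);
*   int first = docNumMap.getFirstChunk(mainDocNum);
*   int last = docNumMap.getLastChunk(mainDocNum);
*   for (int i = first; i &lt;= last; i++) {
*     if (!source.inMainDoc(i))
*       continue; // out of range or deleted
*     Chunk chunk = source.loadChunk(i);
*     // ... use chunk.tokens / chunk.text ...
*   }
* </pre>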
*/
public class ChunkSource
{
/** Reader to load chunk text from */
protected IndexReader reader;
/** Map of document to chunk numbers */
protected DocNumMap docNumMap;
/** The main document number */
protected int mainDocNum;
/** Max number of words per chunk */
protected int chunkSize;
/** Number of words one chunk overlaps with the next */
protected int chunkOverlap;
/** Number of words per chunk minus the overlap */
protected int chunkBump;
/** First chunk in the document */
protected int firstChunk;
/** Last chunk in the document */
protected int lastChunk;
/** Field to read from the chunks */
protected String field;
/** Analyzer to use for tokenizing the text */
protected Analyzer analyzer;
/** Cache of recently loaded chunks */
protected LinkedList chunkCache = new LinkedList();
/** Max # of chunks to cache */
protected int chunkCacheSize = 10;
/**
* Construct a chunk source for the given main document. Chunk text is
* read and tokenized lazily, by loadChunk().
*
* @param reader where to read the chunks from
* @param docNumMap provides a mapping from the main document number
* to chunk numbers.
* @param mainDocNum is the document ID of the main doc
* @param field is the name of the field to read in
* @param analyzer will be used to tokenize the stored field contents
*/
public ChunkSource(IndexReader reader, DocNumMap docNumMap, int mainDocNum,
String field, Analyzer analyzer)
{
this.reader = reader;
this.docNumMap = docNumMap;
this.mainDocNum = mainDocNum;
this.field = field;
this.analyzer = analyzer;
chunkSize = docNumMap.getChunkSize();
chunkOverlap = docNumMap.getChunkOverlap();
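// Each chunk shares its last chunkOverlap words with the next chunk,
// so consecutive chunks begin chunkBump words apart (e.g. chunkSize
// 100 with chunkOverlap 50 gives chunkBump 50).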
chunkBump = chunkSize - chunkOverlap;
firstChunk = docNumMap.getFirstChunk(mainDocNum);
lastChunk = docNumMap.getLastChunk(mainDocNum);
}
/**
* Create a new storage place for chunk tokens (derived classes may
* wish to override).
*/
protected Chunk createChunkTokens(int chunkNum) {
return new Chunk(this, chunkNum);
}
/**
* Check whether the given chunk is contained within the main document for
* this chunk source. A chunk is outside the main document if its number
* falls before the first chunk or after the last chunk, or if the chunk
* has been deleted.
*/
public boolean inMainDoc(int chunkNum) {
if (chunkNum < firstChunk)
return false;
if (chunkNum > lastChunk)
return false;
if (reader.isDeleted(chunkNum))
return false;
return true;
}
/**
* Read the text for the given chunk (derived classes may
* wish to override).
*/
protected void loadText(int chunkNum, Chunk chunk)
throws IOException
{
chunk.text = reader.document(chunkNum).get(field);
}
/**
* Read in and tokenize a chunk. Maintains a cache of recently loaded
* chunks for speed; any IOException is rethrown as a RuntimeException.
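*
* <p>A minimal sketch of consuming the result (termText() is the
* token-text accessor in Lucene releases of this vintage):</p>
*
* <pre>
*   Chunk chunk = source.loadChunk(chunkNum);
*   for (int i = 0; i &lt; chunk.tokens.length; i++)
*     System.out.println(chunk.tokens[i].termText());
* </pre>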
*/
public Chunk loadChunk(int chunkNum)
{
Token t;
try
{
// Is the requested chunk already cached? If so, just return it.
for (Iterator i = chunkCache.iterator(); i.hasNext();) {
Chunk c = (Chunk)i.next();
if (c.chunkNum == chunkNum)
return c;
}
// Make a new chunk to store things in.
Chunk chunk = createChunkTokens(chunkNum);
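// Word positions are global to the main document: chunk N begins
// (N - firstChunk) * chunkBump words in. maxWordPos starts one below
// minWordPos and is advanced as tokens are accepted below.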
chunk.minWordPos = (chunkNum - firstChunk) * chunkBump;
chunk.maxWordPos = chunk.minWordPos - 1;
// Load in the text of the chunk.
loadText(chunkNum, chunk);
// Make a token stream out of it.
TokenStream stream = analyzer.tokenStream(field,
new StringReader(chunk.text));
// Pull out all the tokens and make them into a list. Stop at the
// first token that overlaps the next chunk (unless this is the
// very last chunk).
//
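// (For example, with chunkSize 10 and chunkOverlap 3, a non-final
// chunk keeps tokens in its first chunkBump = 7 word positions; the
// last 3 positions reappear at the start of the next chunk.)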
ArrayList tokenList = new ArrayList(10);
int wordPos = chunk.maxWordPos;
while ((t = stream.next()) != null)
{
wordPos += t.getPositionIncrement();
if (chunkNum < lastChunk && wordPos >= chunk.minWordPos + chunkBump) {
chunk.text = chunk.text.substring(0, t.startOffset());
break;
}
tokenList.add(t);
chunk.maxWordPos = wordPos;
}
stream.close();
// Convert the token list into a handy array.
chunk.tokens = (Token[])tokenList.toArray(new Token[tokenList.size()]);
// Make room in the chunk cache if necessary (the oldest chunk is evicted).
if (chunkCache.size() == chunkCacheSize)
chunkCache.removeFirst();
chunkCache.add(chunk);
// All done!
return chunk;
}
catch (IOException e) {
throw new RuntimeException(e);
}
}
/** Retrieve the max number of words per chunk */
public int getChunkSize() {
return chunkSize;
}
/** Retrieve the number of words one chunk overlaps with the next */
public int getChunkOverlap() {
return chunkOverlap;
}
} // class ChunkSource