package org.apache.lucene.chunk;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.mark.BasicWordIter;
import org.apache.lucene.mark.MarkPos;
import org.apache.lucene.mark.WordIter;
/**
 * Iterates over words in a large document that has been broken up into
 * many overlapping {@link Chunk}s. Applies section limits at empty chunks
 * (section limits can be overcome in any method to which they apply by
 * simply setting the 'force' parameter.)
 *
 * The iterator's position consists of the inherited <code>tokens</code>,
 * <code>tokNum</code> and <code>wordPos</code> fields plus the {@link #chunk}
 * field declared here. These are only ever changed together, through
 * {@link #reseek(Chunk)}, so that a failed move leaves the iterator where
 * it was.
 */
public class ChunkedWordIter extends BasicWordIter implements Cloneable
{
  /** Source for fetching chunks */
  protected ChunkSource chunkSource;

  /** Chunk whose tokens are currently being traversed */
  protected Chunk chunk;

  /**
   * Construct the iterator to access text from the given chunk source.
   * No chunk is loaded until the iterator is first moved.
   *
   * @param chunkSource   Source to read chunks from.
   */
  public ChunkedWordIter(ChunkSource chunkSource) {
    this.chunkSource = chunkSource;
  }

  /**
   * Advance to the next word, loading the following chunk if the current
   * one has been exhausted.
   *
   * @param force   true to cross section boundaries (empty chunks, or a
   *                leading position gap at least as large as the chunk
   *                overlap); false to stop at them.
   * @return        true if the iterator moved, false if it hit the end of
   *                the main document or an unforced section boundary.
   */
  public boolean next(boolean force)
  {
    if (tokens == null)
      reseek(0);
    else if (tokNum == tokens.length - 1)
    {
      // Track the candidate chunk number locally. The 'chunk' field is
      // only changed by reseek(), so that bailing out part way through
      // the scan leaves 'chunk' consistent with 'tokens' and 'wordPos'.
      int candidateNum = chunk.chunkNum + 1;
      while (true)
      {
        // If we're at the very end, don't go further.
        if (!chunkSource.inMainDoc(candidateNum))
          return false;

        // Don't skip past a section boundary unless requested to.
        Chunk next = chunkSource.loadChunk(candidateNum);
        if (next.tokens.length == 0) {
          if (!force)
            return false;
          candidateNum = next.chunkNum + 1;
          continue;
        }

        // A large gap before the first token also counts as a boundary.
        if (!force) {
          int initialGap = next.tokens[0].getPositionIncrement() - 1;
          if (initialGap >= chunkSource.getChunkOverlap())
            return false;
        }

        // Go to the next chunk.
        reseek(next);
        return true;
      }
    }

    // Now do the normal work next() always does.
    return super.next(force);
  }

  /**
   * Back up to the previous word, loading the preceding chunk if the
   * iterator is on the first token of the current one.
   *
   * @param force   true to cross section boundaries; false to stop at them.
   * @return        true if the iterator moved, false if nothing has been
   *                loaded yet, or it hit the start of the main document or
   *                an unforced section boundary.
   */
  public boolean prev(boolean force)
  {
    if (tokens == null)
      return false;
    else if (tokNum == 0)
    {
      // As in next(), leave the 'chunk' field alone until reseek().
      int candidateNum = chunk.chunkNum - 1;
      while (true)
      {
        // If we're at the very beginning, don't go further.
        if (!chunkSource.inMainDoc(candidateNum))
          return false;

        // Don't back over a section boundary unless requested to. This
        // only runs on the first pass: later passes require force, since
        // an unforced scan returns as soon as it meets an empty chunk.
        // NOTE(review): chunk.source is presumably the same object as
        // chunkSource (which next() uses) -- confirm.
        if (!force) {
          int initialGap = wordPos - chunk.minWordPos;
          if (initialGap >= chunk.source.getChunkOverlap())
            return false;
        }

        Chunk prev = chunkSource.loadChunk(candidateNum);
        if (prev.tokens.length == 0) {
          if (!force)
            return false;
          candidateNum = prev.chunkNum - 1;
          continue;
        }

        // Go to the previous chunk.
        reseek(prev);

        // Skip to the end of it. Stop if the base iterator cannot advance
        // any further, rather than spinning forever.
        while (wordPos < chunk.maxWordPos) {
          if (!super.next(true))
            break;
        }

        // All done.
        return true;
      }
    }

    // Now do the normal work prev() always does.
    return super.prev(force);
  }

  /**
   * Make sure the chunk covering the given word position is loaded. Does
   * nothing if the current chunk already covers it.
   *
   * @param targetPos   Word position to seek to.
   */
  protected void reseek(int targetPos)
  {
    if (chunk != null &&
        targetPos >= chunk.minWordPos &&
        targetPos < chunk.maxWordPos)
    {
      return;
    }

    int targetChunk = (targetPos / chunkSource.chunkBump) +
                      chunkSource.firstChunk;

    // A position that maps one past the last chunk is served by the last.
    if (targetChunk == chunkSource.lastChunk + 1)
      targetChunk = chunkSource.lastChunk;

    // Load the chunk once; reseek(Chunk) records it in the 'chunk' field.
    reseek(chunkSource.loadChunk(targetChunk));

    assert chunk.tokens.length > 0 : "reseek should never hit empty chunk";
    assert targetPos - wordPos < chunkSource.docNumMap.getChunkSize() : "Incorrect calculation";
  }

  /**
   * Position the iterator on the first token of the given chunk. If the
   * chunk has no tokens, the word position is set to the chunk's minimum
   * word position.
   *
   * @param toChunk   Chunk to make current.
   */
  protected void reseek(Chunk toChunk) {
    chunk  = toChunk;
    tokens = chunk.tokens;
    tokNum = 0;
    if (chunk.tokens.length > 0)
      wordPos = chunk.minWordPos - 1 + chunk.tokens[0].getPositionIncrement();
    else
      wordPos = chunk.minWordPos;
  } // reseek()

  /**
   * When forced, loads the chunk covering the target position; in either
   * case the seek itself is then delegated to the superclass.
   *
   * @param targetPos   Word position to seek to.
   * @param force       true to jump directly to the target's chunk.
   */
  public void seekFirst(int targetPos, boolean force)
  {
    if (force)
      reseek(targetPos);
    super.seekFirst(targetPos, force);
  }

  /**
   * When forced, loads the chunk covering the target position; in either
   * case the seek itself is then delegated to the superclass.
   *
   * @param targetPos   Word position to seek to.
   * @param force       true to jump directly to the target's chunk.
   */
  public void seekLast(int targetPos, boolean force)
  {
    if (force)
      reseek(targetPos);
    super.seekLast(targetPos, force);
  }

  /**
   * Create a position object suitable for use with {@link #getPos}.
   *
   * @return   A new {@link ChunkMarkPos}.
   */
  public MarkPos createPos() {
    return new ChunkMarkPos();
  }

  /**
   * Record the current position in the given position object.
   *
   * @param pos          Position to fill in; must be a {@link ChunkMarkPos}
   *                     (as returned by {@link #createPos()}).
   * @param startOrEnd   One of the WordIter FIELD_START, FIELD_END,
   *                     TERM_START, TERM_END or TERM_END_PLUS constants.
   */
  public void getPos(MarkPos pos, int startOrEnd)
  {
    ChunkMarkPos cm = (ChunkMarkPos)pos;

    switch (startOrEnd)
    {
      // FIELD_START and FIELD_END don't make sense for chunked access.
      case WordIter.FIELD_START:
      case WordIter.FIELD_END:
        cm.wordPos = cm.charPos = -1;
        cm.chunk = null;
        break;

      // First character of the current word
      case WordIter.TERM_START:
        cm.wordPos = wordPos;
        cm.charPos = tokens[tokNum].startOffset();
        cm.chunk = chunk;
        break;

      // Last character (plus one) of the current word
      case WordIter.TERM_END:
        cm.wordPos = wordPos;
        cm.charPos = tokens[tokNum].endOffset();
        cm.chunk = chunk;
        break;

      // End of word plus spaces and punctuation: runs up to the start of
      // the following token, or to the end of the chunk text if this is
      // the chunk's last token.
      case WordIter.TERM_END_PLUS:
        cm.wordPos = wordPos;
        if (tokNum == tokens.length - 1)
          cm.charPos = chunk.text.length();
        else
          cm.charPos = tokens[tokNum + 1].startOffset();
        cm.chunk = chunk;
        break;

      default:
        assert false : "Unknown start/end mode";
    }
  }
}