package org.cdlib.xtf.textEngine;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.chunk.Chunk;
import org.apache.lucene.chunk.ChunkSource;
import org.apache.lucene.chunk.DocNumMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.cdlib.xtf.util.Trace;

/*
 * Copyright (c) 2004, Regents of the University of California
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * - Neither the name of the University of California nor the names of its
 *   contributors may be used to endorse or promote products derived from this
 *   software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file created on Jan 15, 2005 by Martin Haye
 */

/** Performs special loading duties for our XTF chunks */
class XtfChunkSource extends ChunkSource 
{
  private static final char bumpMarker = Constants.BUMP_MARKER;
  private static final char nodeMarker = Constants.NODE_MARKER;

  /** Construct a chunk source */
  XtfChunkSource(IndexReader reader, DocNumMap docNumMap, int mainDocNum,
                 String field, Analyzer analyzer) 
  {
    super(reader, docNumMap, mainDocNum, field, analyzer);
  } // constructor

  /**
   * Create a new storage place for chunk tokens (derived classes may
   * wish to override)
   */
  protected Chunk createChunkTokens(int chunkNum) 
  {
    return new XtfChunk(this, chunkNum);
  } // createChunkTokens()

  /**
   * Read the text for the given chunk (derived classes may
   * wish to override)
   */
  protected void loadText(int chunkNum, Chunk ct)
    throws IOException 
  {
    XtfChunk chunk = (XtfChunk)ct;
    Document doc = reader.document(chunkNum);
    chunk.text = doc.get(field);

    // Record the starting node number and word offset.
    try {
      chunk.startNodeNumber = Integer.parseInt(doc.get("node"));
      chunk.startWordOffset = Integer.parseInt(doc.get("wordOffset"));
      chunk.sectionType = doc.get("sectionType");
    }
    catch (NumberFormatException e) {
      throw new RuntimeException(e);
    }
  } // loadText()

  /**
   * Read in and tokenize a chunk. Maintains a cache of recently loaded
   * chunks for speed.
   */
  public Chunk loadChunk(int chunkNum) 
  {
    assert chunkNum >= firstChunk && chunkNum <= lastChunk;
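
    // Overview: the base class loads and tokenizes the raw chunk text, which
    // still contains XTF's control characters. A node marker signals the
    // start of a new source node (the node number is bumped and the per-node
    // word offset reset); a pair of bump markers brackets a decimal count
    // that advances the overall word position. The post-processing below
    // strips these markers, records a node number and word offset for every
    // token, and re-bases each token's character offsets against the
    // cleaned-up text.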

    // First, do the normal loading/tokenizing work.
    XtfChunk chunk = (XtfChunk)super.loadChunk(chunkNum);

    // If we already post-processed this chunk, don't do it again.
    if (chunk.nodeNumbers != null)
      return chunk;

    // Now figure out the word offset and node number for each token. Along
    // the way, we also want to delete all the markers and create new tokens
    // that reference the modified text.
    //
    chunk.nodeNumbers = new int[chunk.tokens.length];
    chunk.wordOffsets = new int[chunk.tokens.length];

    int nodeNumber = chunk.startNodeNumber;
    int wordOffset = chunk.startWordOffset;
    int totalWordOffset = chunk.minWordPos - 1;
    int tokenWordOffset = chunk.minWordPos - 1;
    int prevCharPos = 0;

    StringBuffer buf = new StringBuffer(chunk.text.length());
    for (int i = 0; i <= chunk.tokens.length; i++) {
      int charPos = (i < chunk.tokens.length) ? chunk.tokens[i].startOffset()
                                              : chunk.text.length();
      String textBetween = chunk.text.substring(prevCharPos, charPos);

      if (i < chunk.tokens.length) {
        totalWordOffset++;
        tokenWordOffset += chunk.tokens[i].getPositionIncrement();
      }

      // Process any node or bump markers between the previous token and
      // this one.
      //
      int pos = 0;
      while (true) {
        int nodeMarkerPos = textBetween.indexOf(nodeMarker, pos);
        int bumpMarkerPos = textBetween.indexOf(bumpMarker, pos);

        if (nodeMarkerPos >= 0 &&
            (bumpMarkerPos < 0 || nodeMarkerPos < bumpMarkerPos)) {
          buf.append(textBetween.substring(pos, nodeMarkerPos));
          nodeNumber++;
          wordOffset = 0;
          pos = nodeMarkerPos + 1;
        }
        else if (bumpMarkerPos >= 0) {
          buf.append(textBetween.substring(pos, bumpMarkerPos));
          int bumpEnd = textBetween.indexOf(bumpMarker, bumpMarkerPos + 1);
          assert bumpEnd >= 0;
          String bumpText = textBetween.substring(bumpMarkerPos + 1, bumpEnd);
          try {
            int bump = Integer.parseInt(bumpText);
            //WRONG: wordOffset += bump;
            totalWordOffset += bump;
          }
          catch (NumberFormatException e) {
            throw new RuntimeException(e);
          }
          pos = bumpEnd + 1;
        }
        else
          break;
      }

      // Trim whitespace at the start of the chunk.
      if (i > 0)
        buf.append(textBetween.substring(pos));

      if (i == chunk.tokens.length)
        break;

      assert totalWordOffset == tokenWordOffset;

      int startPos = buf.length();
      buf.append(chunk.text.substring(chunk.tokens[i].startOffset(),
                                      chunk.tokens[i].endOffset()));
      int endPos = buf.length();

      Token oldToken = chunk.tokens[i];
      chunk.tokens[i] = new Token(oldToken.termText(), startPos, endPos);
      chunk.tokens[i].setPositionIncrement(oldToken.getPositionIncrement());

      chunk.nodeNumbers[i] = nodeNumber;
      chunk.wordOffsets[i] = wordOffset;
      wordOffset++;

      chunk.maxWordPos = totalWordOffset;
      prevCharPos = oldToken.endOffset();
    } // for i

    // Replace the old text with the new (which has the markers removed).
    chunk.text = buf.toString();

    // All done!
    return chunk;
  } // loadChunk()

  /**
   * <p><b>DEBUGGING ONLY:</b></p>
   *
   * Print out debugging info for the current chunk, including all of
   * its tokens.
   */
  @SuppressWarnings("unused")
  private void debugChunk(XtfChunk chunk) 
  {
    StringBuffer buf1 = new StringBuffer();
    StringBuffer buf2 = new StringBuffer();
    String spaces = "                                                  " +
                    "                                                  "; // column padding

    Trace.debug("*** CHUNK " + chunk.chunkNum + " ***");
    if (chunk.tokens.length == 0) {
      Trace.debug(" [[chunk has no tokens]]");
      return;
    }

    buf1.append(spaces.substring(0, chunk.tokens[0].startOffset()));
    buf2.append(chunk.text.substring(0, chunk.tokens[0].startOffset()));
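
    // Build two aligned rows of output: buf1 collects the word position of
    // each token, buf2 the corresponding slice of the chunk text. Both are
    // padded to the same column width and flushed together whenever a row
    // would exceed 80 characters.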
    int pos = chunk.minWordPos - 1;
    for (int i = 0; i < chunk.tokens.length; i++) {
      int tokLen = (i < chunk.tokens.length - 1)
          ? (chunk.tokens[i + 1].startOffset() - chunk.tokens[i].startOffset())
          : (chunk.text.length() - chunk.tokens[i].startOffset());

      if (buf1.length() + tokLen > 80) {
        Trace.debug(" " + buf1.toString());
        Trace.debug("\"" + buf2.toString() + "\"\n");
        buf1.setLength(0);
        buf2.setLength(0);
      }

      pos += chunk.tokens[i].getPositionIncrement();

      String num = Integer.toString(pos) + " ";
      if (num.length() < tokLen)
        num += spaces.substring(0, tokLen - num.length());

      String tokText = chunk.text.substring(chunk.tokens[i].startOffset(),
                                            chunk.tokens[i].startOffset() + tokLen);
      if (tokText.length() < num.length())
        tokText += spaces.substring(0, num.length() - tokText.length());

      buf1.append(num);
      buf2.append(tokText);
    }

    Trace.debug(" " + buf1.toString());
    Trace.debug("\"" + buf2.toString() + "\"\n");
  } // debugChunk()
} // class XtfChunkSource