package org.cdlib.xtf.textEngine;

/**
 * Copyright (c) 2004, Regents of the University of California
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * - Neither the name of the University of California nor the names of its
 *   contributors may be used to endorse or promote products derived from this
 *   software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.chunk.DocNumMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.mark.ContextMarker;
import org.apache.lucene.mark.MarkCollector;
import org.apache.lucene.mark.MarkPos;
import org.apache.lucene.mark.WordIter;
import org.apache.lucene.search.spans.FieldSpans;
import org.apache.lucene.search.spans.Span;
import org.cdlib.xtf.textIndexer.XTFTextAnalyzer;
import org.cdlib.xtf.util.CharMap;
import org.cdlib.xtf.util.WordMap;

/**
 * Does the heavy lifting of interpreting span hits using the actual document
 * text stored in the index. Marks the hit and any matching terms, and
 * includes a configurable amount of context words.
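 *
 * <p>A rough usage sketch (variable names and parameter values below are
 * illustrative only, not taken from a real XTF configuration):</p>
 *
 * <pre>
 *   SnippetMaker sm = new SnippetMaker(indexReader, docNumMap, stopSet,
 *                                      pluralMap, accentMap, tokFields,
 *                                      80,        // target hit+context chars
 *                                      termMode,  // e.g. ContextMarker.MARK_NO_TERMS
 *                                      "title,subject");
 *   Snippet[] snippets = sm.makeSnippets(fieldSpans, docNum, "text", true);
 * </pre>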
 *
 * @author Martin Haye
 */
public class SnippetMaker 
{
  /** Lucene index reader used to fetch text data */
  public IndexReader reader;

  /** Lucene analyzer used for tokenizing text */
  private Analyzer analyzer;

  /**
   * Keeps track of which chunks belong to which source document in the
   * index.
   */
  private DocNumMap docNumMap;

  /** Max # of words in an index chunk */
  @SuppressWarnings("unused")
  private int chunkSize;

  /** Amount of overlap between adjacent index chunks */
  private int chunkOverlap;

  /** Set of stop-words removed (e.g. "the", "a", "and", etc.) */
  private Set stopSet;

  /** Plural words to convert to singular */
  private WordMap pluralMap;

  /** Accented chars to remove diacritics from */
  private CharMap accentMap;

  /**
   * The fields that were specified as tokenized at index time. Not exactly
   * the same as field.isTokenized(), because facet values, while tokenized
   * from Lucene's point of view, are not tokenized from the XTF point of
   * view.
   */
  private Set tokFields;

  /** Target # of characters to include in the snippet. */
  private int maxContext;

  /** Where to mark terms (all, only in spans, etc.) */
  private int termMode;

  /** List of metadata fields to return in the doc hits, or null for all */
  private Set<String> returnMetaFields;

  // Precompiled patterns for quickly matching common chars special to XML
  private static final Pattern ampPattern = Pattern.compile("&");
  private static final Pattern ltPattern = Pattern.compile("<");
  private static final Pattern gtPattern = Pattern.compile(">");

  /**
   * Constructs a SnippetMaker, ready to make snippets using the given
   * index reader to load text data.
   *
   * @param reader     Index reader to fetch text data from
   * @param docNumMap  Maps chunk numbers to document numbers
   * @param stopSet    Stop words removed (e.g. "the", "a", "and", etc.)
   * @param pluralMap  Plural words to convert to singular
   * @param accentMap  Accented chars to remove diacritics from
   * @param tokFields  Fields that were tokenized at index time
   * @param maxContext Target # chars for hit + context
   * @param termMode   Where to mark terms (all, only in spans, etc.)
   * @param returnMetaFields
   *                   Comma- or space-delimited list of metadata fields to
   *                   return in the doc hits, or null for all fields
   */
  public SnippetMaker(IndexReader reader, DocNumMap docNumMap, Set stopSet,
                      WordMap pluralMap, CharMap accentMap, Set tokFields,
                      int maxContext, int termMode, String returnMetaFields) 
  {
    this.reader = reader;
    this.docNumMap = docNumMap;
    this.chunkSize = docNumMap.getChunkSize();
    this.chunkOverlap = docNumMap.getChunkOverlap();
    this.stopSet = stopSet;
    this.pluralMap = pluralMap;
    this.accentMap = accentMap;
    this.tokFields = tokFields;
    this.maxContext = maxContext;
    this.termMode = termMode;
    if (returnMetaFields != null)
      this.returnMetaFields = new HashSet<String>(
        Arrays.asList(returnMetaFields.split("[, ]+")));
    else
      this.returnMetaFields = null;

    // Use the indexer's actual analyzer, so that our results always
    // agree (especially the positions, which are critical).
    //
    analyzer = new XTFTextAnalyzer(null, pluralMap, accentMap);
  } // constructor

  /**
   * Obtain a list of stop-words in the index (e.g. "the", "a", "and",
   * etc.)
   */
  public Set stopSet() {
    return stopSet;
  }

  /**
   * Obtain the set of plural words to convert to singular form.
   */
  public WordMap pluralMap() {
    return pluralMap;
  }

  /**
   * Obtain the set of accented chars to remove diacritics from.
   */
  public CharMap accentMap() {
    return accentMap;
  }

  /** Obtain the document number map used to make snippets */
  public DocNumMap docNumMap() {
    return docNumMap;
  }

  /** Obtain the set of tokenized fields */
  public Set tokFields() {
    return tokFields;
  }

  /** Obtain the set of fields that should be returned in doc hits (null for all) */
  public Set returnMetaFields() {
    return returnMetaFields;
  }
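  //
  // For orientation: when getText is true, the snippet text assembled by
  // makeSnippets() below wraps each matching span in <hit>...</hit> and each
  // matching term in <term>...</term>, surrounded by context words. An
  // illustrative (made-up) snippet text:
  //
  //   ...the <term>gold</term> fields of <hit><term>gold</term>
  //   <term>rush</term></hit> California were...
  //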
  /**
   * Full-blown snippet formation process.
   *
   * @param fieldSpans record of the matching spans, and all search terms
   * @param mainDocNum document ID of the main doc
   * @param fieldName  name of the field we're making snippets of
   * @param getText    true to get the full text of the snippet, false
   *                   if we only want the start/end offsets.
   */
  public Snippet[] makeSnippets(FieldSpans fieldSpans, int mainDocNum,
                                String fieldName, final boolean getText) 
  {
    // Make a chunked iterator to use for traversing the token stream.
    WordIter wordIter = new XtfChunkedWordIter(reader, docNumMap, mainDocNum,
                                               fieldName, analyzer);

    // Make an array to hold the snippets.
    int nSnippets = fieldSpans.getSpanCount(fieldName);
    final Snippet[] snippets = new Snippet[nSnippets];

    // Process all the marks as they come.
    ContextMarker.markField(fieldSpans,
                            fieldName,
                            wordIter,
                            getText ? maxContext : 0,
                            getText ? termMode : ContextMarker.MARK_NO_TERMS,
                            stopSet,
                            new MarkCollector() 
    {
        private Snippet curSnippet;
        private MarkPos prevPos = null;
        private StringBuffer buf = getText ? new StringBuffer() : null;

        private void copyUpTo(MarkPos pos) {
          if (prevPos != null)
            buf.append(mapXMLChars(prevPos.getTextTo(pos)));
          prevPos = pos;
        }

        public void beginField(MarkPos pos) {
        }

        public void beginContext(MarkPos pos, Span span) {
          if (getText)
            buf.setLength(0);
          prevPos = pos;
        }

        public void term(MarkPos startPos, MarkPos endPos, String term) {
          if (getText) {
            copyUpTo(startPos);
            buf.append("<term>");
            buf.append(startPos.getTextTo(endPos));
            buf.append("</term>");
          }
          prevPos = endPos;
        }

        public void beginSpan(MarkPos pos, Span span) {
          if (getText) {
            if (maxContext > 0)
              copyUpTo(pos);
            else
              prevPos = pos;
            buf.append("<hit>");
          }
          curSnippet = snippets[span.rank] = new Snippet();
          XtfChunkMarkPos xp = (XtfChunkMarkPos)pos;
          curSnippet.startNode = xp.nodeNumber;
          curSnippet.startOffset = xp.wordOffset;
          curSnippet.sectionType = xp.sectionType;
          curSnippet.rank = span.rank;
          curSnippet.score = span.score;
        }

        public void endSpan(MarkPos pos) {
          if (getText) {
            copyUpTo(pos);
            buf.append("</hit>");
          }
          XtfChunkMarkPos xp = (XtfChunkMarkPos)pos;
          curSnippet.endNode = xp.nodeNumber;
          curSnippet.endOffset = xp.wordOffset;
        }

        public void endContext(MarkPos pos) {
          if (getText) {
            copyUpTo(pos);
            curSnippet.text = buf.toString();
          }
        }

        public void endField(MarkPos pos) {
        }
      });

    // Make sure all the snippets got marked.
    for (int i = 0; i < nSnippets; i++)
      assert snippets[i] != null;

    // And we're done.
    return snippets;
  } // makeSnippets()
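  //
  // For orientation: markField() below wraps each context block in a
  // <snippet> element carrying a 1-based rank and a score scaled to 0-100,
  // and each matching span in <hit>. An illustrative (made-up) result:
  //
  //   <snippet rank="1" score="87">Selected poems of <term>Emily</term>
  //   <hit><term>Dickinson</term></hit></snippet>
  //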
  /**
   * Marks all the terms within the given text. Typically used to mark
   * terms within a meta-data field.
   *
   * @param doc        document to get matching spans from
   * @param fieldSpans record of the matching spans, and all search terms
   * @param fieldName  name of the field to mark.
   * @param value      value of the field to mark
   *
   * @return Marked up text value.
   */
  public String markField(Document doc, FieldSpans fieldSpans,
                          final String fieldName, final String value) 
  {
    try 
    {
      // Get the text, and allocate a buffer for the marked up version.
      final StringBuffer buf = new StringBuffer(value.length() * 2);

      // Now make a word iterator to use for traversing the token stream.
      // While we're at it, strip start/end markers from the tokens.
      //
      TokenStream stream = analyzer.tokenStream(fieldName,
                                                new StringReader(value));
      stream = new StartEndStripper(stream);
      final WordIter wordIter = new BoundedWordIter(value, stream,
                                                    chunkOverlap);

      // Process all the marks as they come.
      ContextMarker.markField(fieldSpans,
                              fieldName,
                              wordIter,
                              maxContext,
                              termMode,
                              stopSet,
                              new MarkCollector() 
      {
          private MarkPos prevPos = null;
          private boolean inContext = false;
          private boolean inSpan = false;
          private int contextSize;
          private MarkPos contextStart;

          private void copyUpTo(MarkPos pos) {
            if (prevPos != null) {
              String toAdd = ((BoundedMarkPos)prevPos).getTextTo(pos,
                inContext || inSpan);

              // Don't map XML chars here, since the text indexer did it
              // for us.
              //
              buf.append(toAdd);
              if (inContext)
                contextSize += toAdd.length();
            }
            prevPos = pos;
          }

          public void beginField(MarkPos pos) {
            prevPos = pos;
          }

          public void beginContext(MarkPos pos, Span span) {
            copyUpTo(pos);
            buf.append("<snippet rank=\"");
            buf.append(Integer.toString(span.rank + 1));
            buf.append("\" score=\"");
            buf.append(Integer.toString((int)(span.score * 100)));
            buf.append("\">");
            inContext = true;
            contextSize = 0;
            contextStart = pos;
          }

          public void term(MarkPos startPos, MarkPos endPos, String term) {
            copyUpTo(startPos);
            String toAdd = startPos.getTextTo(endPos);
            buf.append("<term>");
            buf.append(toAdd);
            buf.append("</term>");
            if (inContext)
              contextSize += toAdd.length();
            prevPos = endPos;
          }

          public void beginSpan(MarkPos pos, Span span) {
            copyUpTo(pos);
            buf.append("<hit");
            if (!inContext) {
              buf.append(" rank=\"");
              buf.append(Integer.toString(span.rank + 1));
              buf.append("\" score=\"");
              buf.append(Integer.toString((int)(span.score * 100)));
              buf.append("\"");
            }
            buf.append(">");
            inSpan = true;
          }

          public void endSpan(MarkPos pos) {
            copyUpTo(pos);
            buf.append("</hit>");
            inSpan = false;
          }

          public void endContext(MarkPos pos) {
            copyUpTo(pos);
            buf.append("</snippet>");
            if (contextSize > maxContext) {
              @SuppressWarnings("unused")
              int posDiff = contextStart.countTextTo(pos);

              //
              // NOTE: Do NOT re-enable the assert below. Why? Consider
              //       the situation where the matching search terms are
              //       simply very far apart, and there's no way to
              //       make a snippet that contains all of them within
              //       the specified maxContext. I think you still want
              //       the whole hit in this case.
              //
              //assert false : "ContextMarker made snippet too big";
            }
            inContext = false;
          }

          public void endField(MarkPos pos) {
            copyUpTo(pos);
          }
        });

      String strVal = buf.toString();
      return strVal;
    }
    catch (IOException e) {
      throw new RuntimeException("How could StringReader throw an exception?");
    }
    catch (BoundedMarkPos.UnmarkableException e) {
      // XML structure was found where we can't (yet) figure out how to
      // insert our <snippet> or <hit> elements and still produce a valid
      // XML document. So just return the value unmarked.
      //
      return value;
    }
  } // markField()

  /**
   * Replaces 'special' characters in the given string with their XML
   * equivalent.
   */
  String mapXMLChars(String s) {
    if (s.indexOf('&') >= 0)
      s = ampPattern.matcher(s).replaceAll("&amp;");
    if (s.indexOf('<') >= 0)
      s = ltPattern.matcher(s).replaceAll("&lt;");
    if (s.indexOf('>') >= 0)
      s = gtPattern.matcher(s).replaceAll("&gt;");
    return s;
  } // mapXMLChars()

  /**
   * Strips the special start-of-field/end-of-field markers from tokens.
   */
  public class StartEndStripper extends TokenFilter 
  {
    public StartEndStripper(TokenStream input) {
      super(input);
    }

    /** Retrieve the next token in the stream. */
    public Token next() throws IOException 
    {
      Token t = input.next();
      if (t == null)
        return t;

      // If it starts or ends with the special token character, toss it and
      // attach the position increment to the next token.
      //
      String term = t.termText();
      boolean isStartToken = (term.charAt(0) == Constants.FIELD_START_MARKER);
      boolean isEndToken = (term.charAt(term.length() - 1) == Constants.FIELD_END_MARKER);
      if (isStartToken || isEndToken) {
        Token nextTok = input.next();
        assert term.indexOf(nextTok.termText()) >= 0;
        int start = t.startOffset();
        if (isStartToken)
          start++;
        int end = t.endOffset();
        if (isEndToken)
          end--;
        Token newTok = new Token(nextTok.termText(), start, end,
                                 nextTok.type());
        newTok.setPositionIncrement(t.getPositionIncrement());
        return newTok;
      }

      return t;
    } // next()
  } // class StartEndStripper
} // class SnippetMaker
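//
// Illustrative behavior of StartEndStripper, assuming (as the assert in
// next() implies) that the analyzer emits each marked token followed by its
// unmarked twin. Markers are shown symbolically as ^ and $; the real values
// are Constants.FIELD_START_MARKER and Constants.FIELD_END_MARKER:
//
//   raw stream:     [^poems, poems, of, dickinson$, dickinson]
//   after stripper: [poems, of, dickinson]
//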