package lux.search.highlight;
import java.io.IOException;
import java.io.Reader;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import lux.exception.LuxException;
import lux.index.IndexConfiguration;
import lux.index.analysis.DefaultAnalyzer;
import lux.index.analysis.XmlTextTokenStream;
import lux.xml.QName;
import lux.xml.SaxonDocBuilder;
import lux.xml.XmlReader;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.tree.tiny.TinyDocumentImpl;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TextFragment;
/**
 * Rebuilds a document from a stream of XML events (via {@link SaxonDocBuilder}) while
 * passing each text node through the configured analyzer and a Lucene {@link QueryScorer}.
 * Runs of tokens that score above zero are handed to the {@link HighlightFormatter};
 * all other text is copied to the output unchanged.
 *
 * <p>Per-document state (the scorer, the cached stream reader, the running offsets) is kept
 * in instance fields, so a single instance should not be used for concurrent
 * {@link #highlight} calls.
 */
public class XmlHighlighter extends SaxonDocBuilder {

    private final HighlightFormatter highlighter;
    private QueryScorer scorer;
    private final XmlStreamTextReader textReader;
    // cached on START_DOCUMENT so that text can be pulled from the current event
    private XMLStreamReader xmlStreamReader;
    private StreamingElementTokens xmlStreamTokens;
    // the token stream returned by scorer.init(); null when the scorer did not wrap the stream
    private TokenStream scorerTokens;
    private OffsetAttribute offsetAtt;
    private TokenGroup tokenGroup;
    // end offset of the last text written for the current text node
    private int lastEndOffset = 0;
    private int maxDocCharsToAnalyze = Integer.MAX_VALUE;
    private final String textFieldName;
    private final Analyzer analyzer;
    private final Processor processor;

    /**
     * @param processor the Saxon processor used to build the output document
     * @param indexConfig supplies the text field name and the field analyzers
     * @param highlighter receives each matching run of text, together with the output writer
     * @throws LuxException if the analyzer fails to create the initial token stream
     */
    public XmlHighlighter(Processor processor, IndexConfiguration indexConfig, HighlightFormatter highlighter) {
        super(processor);
        this.processor = processor;
        textFieldName = indexConfig.getTextFieldName();
        analyzer = indexConfig.getFieldAnalyzers();
        this.highlighter = highlighter;
        textReader = new XmlStreamTextReader();
        try {
            // In order to handle highlighting element-text query terms, we need to
            // arrange for element-text tokens to appear in this stream.
            // The other place we do that is in ElementTokenStream, but that isn't
            // really usable in a simple way in this context.
            // What we do instead is create yet another TokenStream class,
            // StreamingElementTokens, which wraps the analyzer's token stream.
            xmlStreamTokens = new StreamingElementTokens(analyzer.tokenStream(textFieldName, textReader));
            offsetAtt = xmlStreamTokens.addAttribute(OffsetAttribute.class);
            xmlStreamTokens.addAttribute(PositionIncrementAttribute.class);
        } catch (IOException e) {
            throw new LuxException(e);
        }
        tokenGroup = new TokenGroup(xmlStreamTokens);
    }

    /**
     * Produces a copy of the given node in which text matching the query has been passed
     * through the {@link HighlightFormatter}.
     *
     * @param query the query whose matching terms are to be highlighted
     * @param node the node to read and highlight
     * @return the document built from the highlighted event stream
     * @throws XMLStreamException if reading or writing the XML event stream fails
     * @throws SaxonApiException declared for callers; propagated from the Saxon API
     * @throws LuxException if a token stream cannot be created or initialized
     */
    public XdmNode highlight(Query query, NodeInfo node) throws XMLStreamException, SaxonApiException {
        if (needsPositions(query)) {
            // A partial workaround for highlighting element text queries with phrases
            query = replaceFields(query, textFieldName);
        }
        scorer = new QueryScorer(query);
        // Grab all the text at once so Lucene's highlighter can figure out if there are any
        // phrases in it...
        // TODO: is this the Analyzer we're looking for??? OR ... reimplement using different HL
        Analyzer defaultAnalyzer = new DefaultAnalyzer();
        TokenStream textTokens;
        try {
            textTokens = defaultAnalyzer.tokenStream("xml_text", new CharSequenceReader(""));
        } catch (IOException e) {
            // Do not carry on with a null token stream; report the failure the same way
            // the constructor and init() do.
            throw new LuxException(e);
        }
        init(new XmlTextTokenStream("xml_text", defaultAnalyzer, textTokens, new XdmNode(node), null, processor));
        XmlReader xmlReader = new XmlReader();
        xmlReader.addHandler(this);
        xmlReader.read(node);
        // setBaseURI (URI.create(node.getBaseURI()));
        if (getDocument().getUnderlyingNode() instanceof TinyDocumentImpl) {
            ((TinyDocumentImpl) getDocument().getUnderlyingNode()).setBaseURI(node.getSystemId());
        }
        return getDocument();
    }

    /**
     * Rewrites a query so that its terms refer to the given field.
     *
     * <p>A {@link PhraseQuery} is rebuilt with the same slop and term positions; if any of
     * its terms already uses {@code fieldName} the original query is returned untouched.
     * A {@link TermQuery} on another field is replaced. Any other query type is returned
     * as is.
     *
     * <p>NOTE: the clauses of a {@link BooleanQuery} are rewritten in place, so the
     * caller's query object is modified.
     *
     * @param query the query to rewrite
     * @param fieldName the field the rewritten terms should refer to
     * @return the rewritten query, or {@code query} itself when nothing had to be replaced
     */
    private Query replaceFields(Query query, String fieldName) {
        if (query instanceof PhraseQuery) {
            PhraseQuery original = (PhraseQuery) query;
            Term[] terms = original.getTerms();
            int[] positions = original.getPositions();
            PhraseQuery pq = new PhraseQuery();
            // keep the phrase's slop and position gaps so that it matches the same text
            pq.setSlop(original.getSlop());
            for (int i = 0; i < terms.length; i++) {
                if (terms[i].field().equals(fieldName)) {
                    return query;
                }
                pq.add(replaceField(fieldName, terms[i]), positions[i]);
            }
            return pq;
        }
        if (query instanceof BooleanQuery) {
            for (BooleanClause clause : ((BooleanQuery) query).getClauses()) {
                clause.setQuery(replaceFields(clause.getQuery(), fieldName));
            }
            return query;
        }
        if (query instanceof TermQuery) {
            Term term = ((TermQuery) query).getTerm();
            if (!term.field().equals(fieldName)) {
                // replaceField copes with term text that contains no ':' separator
                return new TermQuery(replaceField(fieldName, term));
            }
        }
        // MultiTermQuery ?
        return query;
    }

    /**
     * Creates a term in the given field from a term whose text has the form
     * {@code prefix:text}; the second colon-separated part becomes the new term text.
     * When the text contains no second part, the whole text is used.
     */
    private Term replaceField(String fieldName, Term t) {
        String[] parts = t.text().split(":");
        if (parts.length > 1) {
            return new Term(fieldName, parts[1]);
        } else {
            return new Term(fieldName, t.text()); // just in case?
        }
    }

    /**
     * @return true if the query is a {@link PhraseQuery}, or a {@link BooleanQuery}
     * containing one at any depth
     */
    private boolean needsPositions(Query query) {
        if (query instanceof PhraseQuery) {
            return true;
        }
        if (query instanceof BooleanQuery) {
            for (BooleanClause clause : ((BooleanQuery) query).getClauses()) {
                if (needsPositions(clause.getQuery())) {
                    return true;
                }
            }
        }
        return false;
    }

    @Override
    public void reset() {
        super.reset();
    }

    /**
     * Prepares the scorer for a new document.
     *
     * @param tokenStream a token stream over the whole document text, offered to the scorer
     * @throws LuxException if resetting the stream or initializing the scorer fails
     */
    private void init(TokenStream tokenStream) {
        try {
            tokenStream.reset();
            scorer.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
            scorerTokens = scorer.init(tokenStream);
            if (scorerTokens == null) {
                // The scorer didn't consume any tokens (it does that for PhraseQuery),
                // in which case we must give it the live token stream
                scorer.init(xmlStreamTokens);
            }
            // we score the entire document as a single fragment
            scorer.startFragment(new TextFragment("", 0, 0));
        } catch (IOException e) {
            throw new LuxException(e);
        }
    }

    /**
     * Handles one XML event. Most events are forwarded to the superclass; element
     * start/end events additionally maintain the element stack of the token stream, and
     * character events are written by {@link #highlightTextNode()} instead of being
     * forwarded.
     *
     * @throws XMLStreamException on a CDATA or entity-reference event, or if tokenizing
     * or writing fails
     */
    @Override
    public void handleEvent(XMLStreamReader reader, int eventType) throws XMLStreamException {
        switch (eventType) {
        case XMLStreamConstants.START_DOCUMENT:
            xmlStreamReader = reader; // cache the reader so we can pull events
            super.handleEvent(reader, eventType);
            break;
        case XMLStreamConstants.START_ELEMENT:
            super.handleEvent(reader, eventType);
            xmlStreamTokens.pushElement(new QName(reader.getNamespaceURI(), reader.getLocalName(), reader.getPrefix()));
            break;
        case XMLStreamConstants.END_ELEMENT:
            super.handleEvent(reader, eventType);
            xmlStreamTokens.popElement();
            break;
        case XMLStreamConstants.COMMENT:
        case XMLStreamConstants.PROCESSING_INSTRUCTION:
            super.handleEvent(reader, eventType);
            break;
        case XMLStreamConstants.CDATA:
            throw new XMLStreamException("unexpected CDATA event");
        case XMLStreamConstants.SPACE:
            super.handleEvent(reader, eventType);
            break;
        case XMLStreamConstants.CHARACTERS:
            // point the text reader at this event, then tokenize and write it ourselves
            textReader.text();
            try {
                highlightTextNode();
            } catch (IOException e) {
                throw new XMLStreamException(e);
            }
            break;
        case XMLStreamConstants.ENTITY_REFERENCE:
            throw new XMLStreamException("unexpected entity reference event");
        default:
            super.handleEvent(reader, eventType);
        }
    }

    /**
     * Tokenizes the current text event and writes it to the output, sending each group
     * of scoring tokens through the highlighter and copying everything else verbatim.
     * Inspired by org.apache.lucene.search.highlight.Highlighter.
     *
     * @throws IOException if the token stream fails
     * @throws XMLStreamException if writing to the output fails
     */
    private void highlightTextNode() throws IOException, XMLStreamException {
        TokenStream tokenStream = analyzer.tokenStream(textFieldName, textReader);
        try {
            xmlStreamTokens.reset(tokenStream);
            lastEndOffset = 0;
            for (boolean next = xmlStreamTokens.incrementToken();
                    next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
                    next = xmlStreamTokens.incrementToken()) {
                if (scorerTokens != null && xmlStreamTokens.isPlainToken()) {
                    // keep the scorer's own stream in step with the plain text tokens
                    scorerTokens.incrementToken();
                }
                if (tokenGroup.isDistinct()) {
                    // write out any accumulated tokens
                    handleTokenGroup();
                    tokenGroup.clear();
                }
                if (scorerTokens == null || xmlStreamTokens.isPlainToken()) {
                    tokenGroup.addToken(scorer.getTokenScore());
                }
            }
            handleTokenGroup();
            tokenGroup.clear();
            writeTrailingText();
            tokenStream.end();
        } finally {
            // always release the stream, even when tokenizing or writing failed
            tokenStream.close();
        }
    }

    /**
     * Writes whatever remains of the current text event after the last token group.
     */
    private void writeTrailingText() throws XMLStreamException {
        // Test what remains of the original text beyond the point where we stopped analyzing
        int totalTextLength = xmlStreamReader.getTextStart() + xmlStreamReader.getTextLength();
        if (lastEndOffset < totalTextLength) {
            // append it to the output stream
            writeText(lastEndOffset, totalTextLength);
        }
    }

    /**
     * Flushes the accumulated token group: first any text lying between the previous
     * group and this one, then the group itself, highlighted if its total score is
     * greater than zero. Does nothing when the group is empty.
     */
    private void handleTokenGroup() throws XMLStreamException {
        if (tokenGroup.numTokens > 0) {
            int startOffset = tokenGroup.matchStartOffset;
            int endOffset = tokenGroup.matchEndOffset;
            // write any whitespace etc from between this and last group
            if (startOffset > lastEndOffset) {
                writeText(lastEndOffset, startOffset);
            }
            if (tokenGroup.getTotalScore() > 0) {
                // TODO allocate and re-use a single buffer here
                char[] tokenText = new char[endOffset - startOffset];
                xmlStreamReader.getTextCharacters(startOffset, tokenText, 0, endOffset - startOffset);
                highlighter.highlightTerm(writer, new String(tokenText));
            } else {
                writeText(startOffset, endOffset);
            }
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }
    }

    /**
     * Copies the characters in [start, end) of the stream reader's text buffer to the output.
     * start and end are absolute numbers, not relative to the current text node.
     * NOTE(review): callers pass token offsets, which are counted from the start of the
     * text event; this presumably relies on getTextStart() being 0 - confirm.
     */
    private void writeText(int start, int end) throws XMLStreamException {
        // Note: Saxon's StAX "bridge" copies a lot of characters here; we would do better
        // grabbing the entire buffer when the text event hits and then parceling it out,
        // or else rewriting to use native Saxon API for iterating over XML events...
        int length = end - start;
        writer.writeCharacters(xmlStreamReader.getTextCharacters(), start, length);
    }

    /**
     * A Reader over the text of the current text event of the enclosing highlighter's
     * XMLStreamReader. It reports end-of-stream at the end of that single event.
     */
    final class XmlStreamTextReader extends Reader {

        private int offset; // the offset of a text event in the XMLStreamReader
        private int len; // the length of the text event
        private int pos; // the number of characters read from this text event

        /**
         * call this method whenever the XMLStreamReader generates a text event
         */
        void text() {
            pos = 0;
            offset = xmlStreamReader.getTextStart();
            len = xmlStreamReader.getTextLength();
        }

        @Override
        public void close() {
        }

        /**
         * Copies up to {@code count} characters of the current text event into
         * {@code target}, starting at index {@code off}.
         *
         * @return the number of characters copied, or -1 once the text event is exhausted
         * @throws IOException wrapping any XMLStreamException raised by the stream reader
         */
        @Override
        public int read(char[] target, int off, int count) throws IOException {
            if (remaining() <= 0) {
                return -1; // end of text node
            }
            int nread = Math.min(remaining(), count);
            try {
                xmlStreamReader.getTextCharacters(offset + pos, target, off, nread);
            } catch (XMLStreamException e) {
                throw new IOException(e);
            }
            pos += nread;
            return nread;
        }

        private int remaining() {
            return len - pos;
        }

        /**
         * @return the length of the current text event
         */
        public int length() {
            return len;
        }
    }
}
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */