/*
* eXist Open Source Native XML Database
* Copyright (C) 2001-2015 The eXist Project
* http://exist-db.org
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
package org.exist.indexing.lucene;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.AttributeSource.State;

import org.exist.Namespaces;
import org.exist.dom.memtree.MemTreeBuilder;
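
/**
 * Highlights the terms of a Lucene query within plain text content by
 * wrapping every matching region in an {@code exist:match} element.
 *
 * <p>A minimal usage sketch (query, reader, analyzer, builder and text are
 * assumed to be supplied by the caller):</p>
 * <pre>
 * PlainTextHighlighter highlighter = new PlainTextHighlighter(query, reader);
 * List&lt;PlainTextHighlighter.Offset&gt; offsets = highlighter.getOffsets(text, analyzer);
 * highlighter.highlight(text, offsets, builder);
 * </pre>
 */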
public class PlainTextHighlighter {
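    // Query terms keyed by term text, populated by LuceneUtil.extractTerms(),
    // so that content tokens can be looked up while scanning the stream.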
private final TreeMap<Object, Query> termMap;
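
    /**
     * Extracts the terms of the given query; they are later matched against
     * the tokenized content in {@link #getOffsets(String, Analyzer)}.
     */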
public PlainTextHighlighter(Query query, IndexReader reader) throws IOException {
this.termMap = new TreeMap<>();
LuceneUtil.extractTerms(query, termMap, reader, false);
}
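
    /**
     * Writes the content to the builder, wrapping each region listed in
     * offsets in an {@code exist:match} element. If offsets is null or empty,
     * the content is written out unchanged.
     */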
public void highlight(String content, List<Offset> offsets, MemTreeBuilder builder) {
if (offsets == null || offsets.isEmpty()) {
builder.characters(content);
} else {
int lastOffset = 0;
for (Offset offset : offsets) {
if (offset.startOffset() > lastOffset)
builder.characters(content.substring(lastOffset, offset.startOffset()));
builder.startElement(Namespaces.EXIST_NS, "match", "exist:match", null);
builder.characters(content.substring(offset.startOffset(), offset.endOffset()));
builder.endElement();
lastOffset = offset.endOffset();
}
if (lastOffset < content.length())
builder.characters(content.substring(lastOffset));
}
}
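
    /**
     * Tokenizes the content with the given analyzer and computes the character
     * offsets of all regions matching a term or phrase of the query.
     *
     * @return the list of matching offsets, or null if nothing matched
     */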
public List<Offset> getOffsets(String content, Analyzer analyzer) throws IOException {
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(content));
tokenStream.reset();
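        // wrap the stream so a position can be marked and later rewound to
        // while checking whether a candidate phrase match completes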
MarkableTokenFilter stream = new MarkableTokenFilter(tokenStream);
List<Offset> offsets = null;
try {
int lastOffset = 0;
while (stream.incrementToken()) {
String text = stream.getAttribute(CharTermAttribute.class).toString();
Query termQuery = termMap.get(text);
if (termQuery != null) {
// phrase queries need to be handled differently to filter
// out wrong matches: only the phrase should be marked, not single
// words which may also occur elsewhere in the document
if (termQuery instanceof PhraseQuery) {
PhraseQuery phraseQuery = (PhraseQuery) termQuery;
Term[] terms = phraseQuery.getTerms();
if (text.equals(terms[0].text())) {
// scan the following text and collect tokens to see if
// they are part of the phrase
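                            // remember the current position so the stream can be
                            // rewound if the phrase does not complete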
stream.mark();
int t = 1;
List<State> stateList = new ArrayList<>(terms.length);
stateList.add(stream.captureState());
                            // check t first so that no token is consumed beyond the phrase
                            while (t < terms.length && stream.incrementToken()) {
                                // read the next token and check whether it continues the phrase
                                text = stream.getAttribute(CharTermAttribute.class).toString();
if (text.equals(terms[t].text())) {
stateList.add(stream.captureState());
if (++t == terms.length) {
break;
}
} else {
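                                    // the token does not continue the phrase:
                                    // rewind to the marked position and keep scanning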
stream.reset();
break;
}
}
                            if (stateList.size() == terms.length) {
                                // all tokens of the phrase matched in sequence:
                                // record a single offset spanning the whole phrase
                                if (offsets == null)
                                    offsets = new ArrayList<>();
                                stream.restoreState(stateList.get(0));
                                int start = stream.getAttribute(OffsetAttribute.class).startOffset();
                                stream.restoreState(stateList.get(terms.length - 1));
                                int end = stream.getAttribute(OffsetAttribute.class).endOffset();
                                offsets.add(new Offset(start, end));
                                // the attributes are left at the last token of the
                                // phrase, which is where the stream currently stands
                            }
}
} else {
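                        // simple term match: record its start and end offsets directly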
if (offsets == null)
offsets = new ArrayList<>();
OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
offsets.add(new Offset(offsetAttr.startOffset(), offsetAttr.endOffset()));
}
}
}
        } finally {
            stream.close();
        }
return offsets;
}
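
    /**
     * A pair of character offsets marking the start and end of a match
     * within the content string.
     */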
public static class Offset {
        protected final int startOffset;
        protected final int endOffset;
Offset(int start, int end) {
this.startOffset = start;
this.endOffset = end;
}
public int startOffset() { return startOffset; }
public int endOffset() { return endOffset; }
}
}