/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.qparser.keyword.processors;
import java.io.IOException;
import java.io.StringReader;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.messages.QueryParserMessages;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.FuzzyQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.GroupQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.NoTokenFoundQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.OrQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.RangeQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.TextableQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.TokenizedPhraseQueryNode;
import org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorImpl;
import org.apache.lucene.queryparser.flexible.messages.MessageImpl;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys;
import org.apache.lucene.queryparser.flexible.standard.nodes.RegexpQueryNode;
import org.apache.lucene.queryparser.flexible.standard.nodes.WildcardQueryNode;
import org.apache.lucene.search.MultiPhraseQuery;
import org.sindice.siren.qparser.keyword.config.KeywordQueryConfigHandler.KeywordConfigurationKeys;
import org.sindice.siren.qparser.keyword.nodes.DatatypeQueryNode;
import org.sindice.siren.qparser.keyword.nodes.TwigQueryNode;
import org.sindice.siren.qparser.keyword.nodes.WildcardNodeQueryNode;
/**
* This processor analyzes query terms based on their datatype.
*
* <p>
*
* This processor retrieves the {@link Analyzer} associated with the TAG
* {@link DatatypeQueryNode#DATATYPE_TAGID} in the key
* {@link KeywordConfigurationKeys#DATATYPES_ANALYZERS}, and uses it on the
* {@link FieldQueryNode} text, which is not a {@link WildcardQueryNode},
* a {@link FuzzyQueryNode}, a {@link RegexpQueryNode} or the bound of a
* {@link RangeQueryNode}.
*
* <p>
*
* If no term is returned by the analyzer, a {@link NoTokenFoundQueryNode} object
* is returned. A {@link WildcardNodeQueryNode} is returned instead if an ancestor
* is a {@link TwigQueryNode}.
*
* <p>
*
* If the analyzer returns only one term, the
* returned term is set to the {@link FieldQueryNode} and it is returned.
*
* <p>
*
* If the analyzer returns more than one term at different positions, a
* {@link TokenizedPhraseQueryNode} is created. If they are all at the same
* position, an {@link OrQueryNode} wrapped in a {@link GroupQueryNode} is
* created and returned.
*
* <p>
*
* If the analyzer returns multiple terms and the parent node is a
* {@link TokenizedPhraseQueryNode}, a {@link QueryNodeException} is thrown
* because {@link MultiPhraseQuery} is not supported in SIREn.
*/
public class DatatypeAnalyzerProcessor
extends QueryNodeProcessorImpl {

  /**
   * Value of {@link ConfigurationKeys#ENABLE_POSITION_INCREMENTS}, re-read from
   * the query configuration each time a node is analyzed; {@code false} when
   * the key is not set.
   */
  private boolean positionIncrementsEnabled;

  /**
   * Counter incremented in {@link #preProcessNode(QueryNode)} and decremented
   * in {@link #postProcessNode(QueryNode)} for every {@link TwigQueryNode}. A
   * non-zero value while analyzing a node selects the twig special case.
   */
  private int nbTwigs = 0;

  /**
   * Records that a {@link TwigQueryNode} has been encountered.
   *
   * @param node the node about to be processed
   * @return the same node, unchanged
   */
  @Override
  protected QueryNode preProcessNode(final QueryNode node)
  throws QueryNodeException {
    if (node instanceof TwigQueryNode) {
      nbTwigs++;
    }
    return node;
  }

  /**
   * Analyzes the text of a {@link TextableQueryNode} with the analyzer
   * registered for the datatype tagged on the node.
   *
   * <p>
   *
   * Wildcard, fuzzy and regexp nodes, bounds of a {@link RangeQueryNode} and
   * nodes without a datatype tag are returned untouched.
   *
   * @param node the node that has just been traversed
   * @return the original node; or the node with its text replaced by the
   *         single analyzed term; or a {@link NoTokenFoundQueryNode} /
   *         {@link WildcardNodeQueryNode} when the analyzer produced no token;
   *         or a {@link GroupQueryNode} / {@link TokenizedPhraseQueryNode}
   *         holding one {@link FieldQueryNode} per token
   * @throws QueryNodeException if no analyzer is registered for the datatype,
   *         or if several tokens are produced while the parent is a
   *         {@link TokenizedPhraseQueryNode}
   */
  @Override
  protected QueryNode postProcessNode(final QueryNode node)
  throws QueryNodeException {
    if (node instanceof TextableQueryNode
        && !(node instanceof WildcardQueryNode)
        && !(node instanceof FuzzyQueryNode)
        && !(node instanceof RegexpQueryNode)
        && !(node.getParent() instanceof RangeQueryNode)) {

      this.positionIncrementsEnabled = false;
      final Boolean positionIncrementsEnabled = this.getQueryConfigHandler().get(ConfigurationKeys.ENABLE_POSITION_INCREMENTS);
      if (positionIncrementsEnabled != null) {
        this.positionIncrementsEnabled = positionIncrementsEnabled;
      }

      final FieldQueryNode fieldNode = ((FieldQueryNode) node);
      final String text = fieldNode.getTextAsString();
      final String field = fieldNode.getFieldAsString();
      final String datatype = (String) fieldNode.getTag(DatatypeQueryNode.DATATYPE_TAGID);

      // Nodes without a datatype tag are not analyzed by this processor
      if (datatype == null) {
        return node;
      }

      final Analyzer analyzer = this.getQueryConfigHandler()
                                    .get(KeywordConfigurationKeys.DATATYPES_ANALYZERS)
                                    .get(datatype);
      if (analyzer == null) {
        throw new QueryNodeException(new MessageImpl(
          QueryParserMessages.INVALID_SYNTAX, "No analyzer associated with " + datatype));
      }

      PositionIncrementAttribute posIncrAtt = null;
      int numTokens = 0;
      int positionCount = 0;
      boolean severalTokensAtSamePosition = false;

      final TokenStream source;
      try {
        source = analyzer.tokenStream(field, new StringReader(text));
        source.reset();
      } catch (final IOException e1) {
        throw new RuntimeException(e1);
      }

      // Buffer the tokens so that they can be counted first and replayed later
      final CachingTokenFilter buffer = new CachingTokenFilter(source);
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }

      // First pass: count the tokens and the distinct positions
      try {
        while (buffer.incrementToken()) {
          numTokens++;
          final int positionIncrement = (posIncrAtt != null) ? posIncrAtt
              .getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
        }
      } catch (final IOException e) {
        // ignore: counting stops at the failure, the tokens counted so far
        // are still used below
      }

      try {
        // rewind the buffer stream
        buffer.reset();
        // close original stream - all tokens buffered
        source.close();
      } catch (final IOException e) {
        // ignore
      }

      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }
      final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

      if (numTokens == 0) {
        if (nbTwigs != 0) { // Twig special case
          return new WildcardNodeQueryNode();
        }
        return new NoTokenFoundQueryNode();
      }

      if (numTokens == 1) {
        String term = null;
        try {
          final boolean hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.toString();
        } catch (final IOException e) {
          // safe to ignore, because we know the number of tokens
        }
        fieldNode.setText(term);
        return fieldNode;
      }

      // Several tokens below a phrase would require a MultiPhraseQuery:
      // reject it before building any node
      if (node.getParent() instanceof TokenizedPhraseQueryNode) {
        throw new QueryNodeException(new MessageImpl("Cannot build a MultiPhraseQuery"));
      }

      // Second pass: replay the buffered tokens, one FieldQueryNode per token
      final List<QueryNode> children = new LinkedList<QueryNode>();
      int position = -1;

      for (int i = 0; i < numTokens; i++) {
        String term = null;
        // The increment is a constant 1, so positions are consecutive
        // (0, 1, 2, ...) whether or not position increments are enabled
        final int positionIncrement = 1;

        try {
          final boolean hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.toString();
        } catch (final IOException e) {
          // safe to ignore, because we know the number of tokens
        }

        final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
        if (this.positionIncrementsEnabled) {
          position += positionIncrement;
          newFieldNode.setPositionIncrement(position);
        } else {
          newFieldNode.setPositionIncrement(i);
        }
        // Add the node that carries the position, not a fresh copy without it
        children.add(newFieldNode);
      }

      // If multiple terms at one single position, this must be a query
      // expansion. Perform a OR between the terms.
      if (severalTokensAtSamePosition && positionCount == 1) {
        return new GroupQueryNode(new OrQueryNode(children));
      }

      // if several tokens at same position && position count > 1, then
      // results can be unexpected
      final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
      for (final QueryNode child : children) {
        pq.add(child);
      }
      return pq;
    } else if (node instanceof TwigQueryNode) {
      nbTwigs--;
      assert nbTwigs >= 0;
    }
    return node;
  }

  /**
   * Leaves the order of the children untouched.
   *
   * @param children the children of the node being processed
   * @return the same list, unchanged
   */
  @Override
  protected List<QueryNode> setChildrenOrder(final List<QueryNode> children)
  throws QueryNodeException {
    return children;
  }

}