/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sindice.siren.analysis.filter;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.Version;
import org.sindice.siren.analysis.JsonTokenizer;
import org.sindice.siren.analysis.attributes.DatatypeAttribute;
import org.sindice.siren.analysis.attributes.NodeAttribute;
import org.sindice.siren.util.JSONDatatype;
import org.sindice.siren.util.ReusableCharArrayReader;
import org.sindice.siren.util.XSDDatatype;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class performs post-processing operations on the tokens extracted by
 * {@link JsonTokenizer}, based on the {@link DatatypeAttribute}.
 * <p>
 * This filter provides a {@link #register(char[], Analyzer)} method which
 * allows one to register an {@link Analyzer} for a specific datatype.
 */
public class DatatypeAnalyzerFilter extends TokenFilter {

  private final static Logger logger = LoggerFactory.getLogger(DatatypeAnalyzerFilter.class);

  private final CharArrayMap<Analyzer> dtsAnalyzer;

  // attributes of the main (outer) stream
  private CharTermAttribute termAtt;
  private OffsetAttribute offsetAtt;
  private PositionIncrementAttribute posIncrAtt;
  private TypeAttribute typeAtt;
  private DatatypeAttribute dtypeAtt;
  private NodeAttribute nodeAtt;

  // attributes of the inner stream produced by the registered analyzer
  private CharTermAttribute tokenTermAtt;
  private OffsetAttribute tokenOffsetAtt;
  private PositionIncrementAttribute tokenPosIncrAtt;
  private TypeAttribute tokenTypeAtt;

  private boolean isConsumingToken = false;
  private TokenStream currentStream;

  private ReusableCharArrayReader reusableCharArray;

  public DatatypeAnalyzerFilter(final Version version, final TokenStream input) {
    super(input);
    dtsAnalyzer = new CharArrayMap<Analyzer>(version, 64, false);
    this.initAttributes();
  }
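  // Usage sketch (illustrative, not part of the original source): with the
  // constructor above, no analyzer is registered yet, so the caller must
  // register one for every datatype that can appear in the stream before
  // consuming it; otherwise incrementToken throws an IOException. The
  // fieldAnalyzer and valueAnalyzer variables below are placeholders.
  //
  //   final DatatypeAnalyzerFilter f = new DatatypeAnalyzerFilter(version, input);
  //   f.register(JSONDatatype.JSON_FIELD.toCharArray(), fieldAnalyzer);
  //   f.register(XSDDatatype.XSD_STRING.toCharArray(), valueAnalyzer);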
  /**
   * Create a {@link DatatypeAnalyzerFilter} with the given default
   * {@link Analyzer}s for the {@link JSONDatatype#JSON_FIELD} and
   * {@link XSDDatatype#XSD_STRING} datatypes.
   *
   * @param version the Lucene version to use
   * @param input the input token stream
   * @param fieldAnalyzer the default field name {@link Analyzer}
   * @param valueAnalyzer the default value {@link Analyzer}
   */
  public DatatypeAnalyzerFilter(final Version version,
                                final TokenStream input,
                                final Analyzer fieldAnalyzer,
                                final Analyzer valueAnalyzer) {
    this(version, input);
    // register the default analyzers
    this.register(XSDDatatype.XSD_STRING.toCharArray(), valueAnalyzer);
    this.register(JSONDatatype.JSON_FIELD.toCharArray(), fieldAnalyzer);
  }

  /**
   * Initialise the attributes of the main stream.
   */
  private void initAttributes() {
    termAtt = input.getAttribute(CharTermAttribute.class);
    offsetAtt = input.getAttribute(OffsetAttribute.class);
    posIncrAtt = input.getAttribute(PositionIncrementAttribute.class);
    typeAtt = input.getAttribute(TypeAttribute.class);
    dtypeAtt = input.getAttribute(DatatypeAttribute.class);
    nodeAtt = this.addAttribute(NodeAttribute.class);
  }

  /**
   * Initialise the attributes of the inner stream used to tokenize the
   * incoming token.
   */
  private void initTokenAttributes() {
    tokenTermAtt = currentStream.addAttribute(CharTermAttribute.class);
    tokenOffsetAtt = currentStream.addAttribute(OffsetAttribute.class);
    tokenPosIncrAtt = currentStream.addAttribute(PositionIncrementAttribute.class);
    tokenTypeAtt = currentStream.addAttribute(TypeAttribute.class);
  }

  /**
   * Map the given analyzer to the given datatype URI. If an analyzer is
   * already registered for this datatype, the call has no effect.
   */
  public void register(final char[] dataTypeURI, final Analyzer analyzer) {
    if (!dtsAnalyzer.containsKey(dataTypeURI)) {
      dtsAnalyzer.put(dataTypeURI, analyzer);
    }
  }

  @Override
  public final boolean incrementToken() throws IOException {
    /*
     * The loop is necessary for the case where a token was being consumed
     * but its token stream reached the end, making incrementToken return
     * false. The loop ensures that the next incoming token is processed.
     */
    do {
      if (!isConsumingToken) {
        if (!input.incrementToken()) {
          return false;
        }

        final char[] dt = dtypeAtt.datatypeURI();
        if (dt == null || dt.length == 0) { // empty datatype, e.g., a bnode
          // TODO GH-164
          logger.warn("Empty datatype for the token [{}]", termAtt);
          return true;
        }

        // the datatype is not registered: this is an error
        if (!dtsAnalyzer.containsKey(dt)) {
          throw new IOException("Unregistered datatype [" + new String(dt) +
            "]. Use the #register method.");
        }

        final Analyzer analyzer = dtsAnalyzer.get(dt);
        if (reusableCharArray == null) {
          reusableCharArray = new ReusableCharArrayReader(termAtt.buffer(), 0, termAtt.length());
        } else {
          reusableCharArray.reset(termAtt.buffer(), 0, termAtt.length());
        }
        currentStream = analyzer.tokenStream("", reusableCharArray);
        currentStream.reset(); // reset to prepare the stream for consumption
        this.initTokenAttributes();
      }
      // consume the token with the registered analyzer
      isConsumingToken = currentStream.incrementToken();
    } while (!isConsumingToken);

    this.copyInnerStreamAttributes();
    return true;
  }
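  // Behavioural sketch (assumed example values, for illustration only):
  // given an incoming token whose term is "hello world" and whose datatype
  // is xsd:string, and a whitespace-based analyzer registered for
  // xsd:string, this filter emits the two tokens "hello" and "world", each
  // carrying the node path and datatype of the original token (see
  // copyInnerStreamAttributes below).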
  /**
   * Copy the inner stream's attribute values to the main stream's ones. This
   * filter uses an inner stream, which therefore needs to be cleared so that
   * other filters see clean attribute data. Because of that, the datatypeURI
   * and node attributes have to be saved beforehand in order to be restored
   * afterwards.
   */
  private void copyInnerStreamAttributes() {
    // backup datatype and node path
    final IntsRef nodePath = IntsRef.deepCopyOf(nodeAtt.node());
    final char[] dt = dtypeAtt.datatypeURI();

    // clear attributes
    input.clearAttributes();

    // copy inner attributes
    final int len = tokenTermAtt.length();
    termAtt.copyBuffer(tokenTermAtt.buffer(), 0, len);
    offsetAtt.setOffset(tokenOffsetAtt.startOffset(), tokenOffsetAtt.endOffset());
    posIncrAtt.setPositionIncrement(tokenPosIncrAtt.getPositionIncrement());
    typeAtt.setType(tokenTypeAtt.type());
    // TupleTokenizer handles the setting of tuple/cell values and the
    // datatype URI

    // restore datatype and node
    nodeAtt.copyNode(nodePath);
    dtypeAtt.setDatatypeURI(dt);
  }

  @Override
  public void close() throws IOException {
    try {
      if (currentStream != null) {
        currentStream.close();
      }
    } finally {
      super.close();
    }
  }

}
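// Wiring sketch (assumptions: the JsonTokenizer(Reader) constructor form and
// the Lucene analyzers chosen here are illustrative, not mandated by this
// class): a typical analysis chain places this filter directly after the
// JsonTokenizer that produces the DatatypeAttribute.
//
//   final JsonTokenizer source = new JsonTokenizer(reader);
//   final TokenStream sink = new DatatypeAnalyzerFilter(Version.LUCENE_40, source,
//       new WhitespaceAnalyzer(Version.LUCENE_40),   // JSON field names
//       new StandardAnalyzer(Version.LUCENE_40));    // string values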