/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sindice.siren.analysis.filter;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.Version;
import org.sindice.siren.analysis.JsonTokenizer;
import org.sindice.siren.analysis.attributes.DatatypeAttribute;
import org.sindice.siren.analysis.attributes.NodeAttribute;
import org.sindice.siren.util.JSONDatatype;
import org.sindice.siren.util.ReusableCharArrayReader;
import org.sindice.siren.util.XSDDatatype;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class performs post-processing operations on the tokens extracted by
 * {@link JsonTokenizer}, based on the {@link DatatypeAttribute}.
 * <p>
 * This filter provides a {@link #register(char[], Analyzer)} method which
 * allows one to register an {@link Analyzer} for a specific datatype.
 */
public class DatatypeAnalyzerFilter extends TokenFilter {

  private final static Logger logger = LoggerFactory.getLogger(DatatypeAnalyzerFilter.class);

  private final CharArrayMap<Analyzer> dtsAnalyzer;

  // attributes of the main (outer) stream
  private CharTermAttribute termAtt;
  private OffsetAttribute offsetAtt;
  private PositionIncrementAttribute posIncrAtt;
  private TypeAttribute typeAtt;
  private DatatypeAttribute dtypeAtt;
  private NodeAttribute nodeAtt;

  // attributes of the inner stream produced by the registered analyzer
  private CharTermAttribute tokenTermAtt;
  private OffsetAttribute tokenOffsetAtt;
  private PositionIncrementAttribute tokenPosIncrAtt;
  private TypeAttribute tokenTypeAtt;

  private boolean isConsumingToken = false;
  private TokenStream currentStream;

  private ReusableCharArrayReader reusableCharArray;

  public DatatypeAnalyzerFilter(final Version version, final TokenStream input) {
    super(input);
    dtsAnalyzer = new CharArrayMap<Analyzer>(version, 64, false);
    this.initAttributes();
  }
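  // Usage sketch (illustrative, not part of the original source): with the
  // constructor above, no analyzer is registered yet, so the caller must
  // register one for every datatype that can appear in the stream before
  // consuming it; otherwise incrementToken throws an IOException. The
  // fieldAnalyzer and valueAnalyzer variables below are placeholders.
  //
  //   final DatatypeAnalyzerFilter f = new DatatypeAnalyzerFilter(version, input);
  //   f.register(JSONDatatype.JSON_FIELD.toCharArray(), fieldAnalyzer);
  //   f.register(XSDDatatype.XSD_STRING.toCharArray(), valueAnalyzer);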
  /**
   * Create a {@link DatatypeAnalyzerFilter} with the given default
   * {@link Analyzer}s for the {@link JSONDatatype#JSON_FIELD} and
   * {@link XSDDatatype#XSD_STRING} datatypes.
   *
   * @param version the Lucene version to use
   * @param input the input token stream
   * @param fieldAnalyzer the default field name {@link Analyzer}
   * @param valueAnalyzer the default value {@link Analyzer}
   */
  public DatatypeAnalyzerFilter(final Version version,
                                final TokenStream input,
                                final Analyzer fieldAnalyzer,
                                final Analyzer valueAnalyzer) {
    this(version, input);
    // register the default analyzers
    this.register(XSDDatatype.XSD_STRING.toCharArray(), valueAnalyzer);
    this.register(JSONDatatype.JSON_FIELD.toCharArray(), fieldAnalyzer);
  }

  /**
   * Initialise the attributes of the main stream.
   */
  private void initAttributes() {
    termAtt = input.getAttribute(CharTermAttribute.class);
    offsetAtt = input.getAttribute(OffsetAttribute.class);
    posIncrAtt = input.getAttribute(PositionIncrementAttribute.class);
    typeAtt = input.getAttribute(TypeAttribute.class);
    dtypeAtt = input.getAttribute(DatatypeAttribute.class);
    nodeAtt = this.addAttribute(NodeAttribute.class);
  }

  /**
   * Initialise the attributes of the inner stream used to tokenize the
   * incoming token.
   */
  private void initTokenAttributes() {
    tokenTermAtt = currentStream.addAttribute(CharTermAttribute.class);
    tokenOffsetAtt = currentStream.addAttribute(OffsetAttribute.class);
    tokenPosIncrAtt = currentStream.addAttribute(PositionIncrementAttribute.class);
    tokenTypeAtt = currentStream.addAttribute(TypeAttribute.class);
  }

  /**
   * Map the given analyzer to the given datatype URI. If an analyzer is
   * already registered for this datatype, the call has no effect.
   */
  public void register(final char[] dataTypeURI, final Analyzer analyzer) {
    if (!dtsAnalyzer.containsKey(dataTypeURI)) {
      dtsAnalyzer.put(dataTypeURI, analyzer);
    }
  }

  @Override
  public final boolean incrementToken() throws IOException {
    /*
     * The loop is necessary for the case where a token was being consumed
     * but its token stream reached the end, making incrementToken return
     * false. The loop ensures that the next incoming token is processed.
     */
    do {
      if (!isConsumingToken) {
        if (!input.incrementToken()) {
          return false;
        }

        final char[] dt = dtypeAtt.datatypeURI();
        if (dt == null || dt.length == 0) { // empty datatype, e.g., a bnode
          // TODO GH-164
          logger.warn("Empty datatype for the token [{}]", termAtt);
          return true;
        }

        // the datatype is not registered: this is an error
        if (!dtsAnalyzer.containsKey(dt)) {
          throw new IOException("Unregistered datatype [" + new String(dt) +
            "]. Use the #register method.");
        }

        final Analyzer analyzer = dtsAnalyzer.get(dt);
        if (reusableCharArray == null) {
          reusableCharArray = new ReusableCharArrayReader(termAtt.buffer(), 0, termAtt.length());
        } else {
          reusableCharArray.reset(termAtt.buffer(), 0, termAtt.length());
        }
        currentStream = analyzer.tokenStream("", reusableCharArray);
        currentStream.reset(); // reset to prepare the stream for consumption
        this.initTokenAttributes();
      }
      // consume the token with the registered analyzer
      isConsumingToken = currentStream.incrementToken();
    } while (!isConsumingToken);

    this.copyInnerStreamAttributes();
    return true;
  }
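  // Behavioural sketch (assumed example values, for illustration only):
  // given an incoming token whose term is "hello world" and whose datatype
  // is xsd:string, and a whitespace-based analyzer registered for
  // xsd:string, this filter emits the two tokens "hello" and "world", each
  // carrying the node path and datatype of the original token (see
  // copyInnerStreamAttributes below).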
  /**
   * Copy the inner stream's attribute values to the main stream's ones. This
   * filter uses an inner stream, which therefore needs to be cleared so that
   * other filters see clean attribute data. Because of that, the datatypeURI
   * and node attributes have to be saved beforehand in order to be restored
   * afterwards.
   */
  private void copyInnerStreamAttributes() {
    // backup datatype and node path
    final IntsRef nodePath = IntsRef.deepCopyOf(nodeAtt.node());
    final char[] dt = dtypeAtt.datatypeURI();

    // clear attributes
    input.clearAttributes();

    // copy inner attributes
    final int len = tokenTermAtt.length();
    termAtt.copyBuffer(tokenTermAtt.buffer(), 0, len);
    offsetAtt.setOffset(tokenOffsetAtt.startOffset(), tokenOffsetAtt.endOffset());
    posIncrAtt.setPositionIncrement(tokenPosIncrAtt.getPositionIncrement());
    typeAtt.setType(tokenTypeAtt.type());
    // TupleTokenizer handles the setting of tuple/cell values and the
    // datatype URI

    // restore datatype and node
    nodeAtt.copyNode(nodePath);
    dtypeAtt.setDatatypeURI(dt);
  }

  @Override
  public void close() throws IOException {
    try {
      if (currentStream != null) {
        currentStream.close();
      }
    } finally {
      super.close();
    }
  }

}
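// Wiring sketch (assumptions: the JsonTokenizer(Reader) constructor form and
// the Lucene analyzers chosen here are illustrative, not mandated by this
// class): a typical analysis chain places this filter directly after the
// JsonTokenizer that produces the DatatypeAttribute.
//
//   final JsonTokenizer source = new JsonTokenizer(reader);
//   final TokenStream sink = new DatatypeAnalyzerFilter(Version.LUCENE_40, source,
//       new WhitespaceAnalyzer(Version.LUCENE_40),   // JSON field names
//       new StandardAnalyzer(Version.LUCENE_40));    // string values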