JsonTokenizer.java example

Explorer
siren-master
/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sindice.siren.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.sindice.siren.analysis.attributes.DatatypeAttribute;
import org.sindice.siren.analysis.attributes.JsonNodeAttributeImpl;
import org.sindice.siren.analysis.attributes.NodeAttribute;
import org.sindice.siren.util.JSONDatatype;
import org.sindice.siren.util.XSDDatatype;

import java.io.IOException;
import java.io.Reader;

/**
 * A tokenizer for data following the JSON syntax.
 * <p>
 * The tokenizer parses JSON data and generates a token for each field name
 * and each value. The tokenizer attaches a node label and a datatype to each
 * token.
 *
 * <p>
 *
 * Regarding datatype, the convention is the following:
 * <ul>
 *   <li> If a field name is parsed, the datatype
 *        {@link JSONDatatype#JSON_FIELD} is assigned;
 *   <li> If a value string is parsed, the datatype
 *        {@link XSDDatatype#XSD_STRING} is assigned;
 *   <li> If a boolean value is parsed, the datatype
 *        {@link XSDDatatype#XSD_BOOLEAN} is assigned;
 *   <li> If a numerical value is parsed, the datatype
 *        {@link XSDDatatype#XSD_LONG} is assigned;
 *   <li> If a numerical value with a fraction is parsed, the datatype
 *        {@link XSDDatatype#XSD_DOUBLE} is assigned;
 * </ul>
 *
 * <p>
 * The actual scanning is delegated to {@link JsonTokenizerImpl}; this class
 * copies the scanner state (term text, datatype, node path, offsets) into the
 * Lucene attributes of the token stream.
 */
public class JsonTokenizer extends Tokenizer {

  /** The underlying scanner that produces the raw JSON tokens. */
  private final JsonTokenizerImpl scanner;

  /** Token type identifier for the JSON {@code null} value. */
  public static final int         NULL      = 0;
  /** Token type identifier for the JSON {@code true} value. */
  public static final int         TRUE      = 1;
  /** Token type identifier for the JSON {@code false} value. */
  public static final int         FALSE     = 2;
  /** Token type identifier for a JSON number. */
  public static final int         NUMBER    = 3;
  /** Token type identifier for a literal (text obtained from the scanner). */
  public static final int         LITERAL   = 4;

  /** Datatype JSON schema: field for the datatype label */
  public static final String      DATATYPE_LABEL  = "_datatype_";
  /** Datatype JSON schema: field for the datatype value */
  public static final String      DATATYPE_VALUES = "_value_";

  /**
   * Create a tokenizer reading JSON data from the given reader.
   *
   * @param input The reader providing the JSON data
   */
  public JsonTokenizer(final Reader input) {
    super(input);
    scanner = new JsonTokenizerImpl(input);
    this.initAttributes();
  }

  /**
   * Labels of the token types, indexed by the token type identifiers
   * ({@link #NULL}, {@link #TRUE}, {@link #FALSE}, {@link #NUMBER},
   * {@link #LITERAL}).
   */
  protected static String[] TOKEN_TYPES = getTokenTypes();

  /**
   * Return the labels of the token types, indexed by token type identifier.
   * <p>
   * The table is created on first access. The shared array itself is
   * returned, not a copy, so callers must not modify it.
   *
   * @return the array of token type labels
   */
  public static String[] getTokenTypes() {
    if (TOKEN_TYPES == null) {
      TOKEN_TYPES = createTokenTypes();
    }
    return TOKEN_TYPES;
  }

  /** Build the table mapping each token type identifier to its label. */
  private static String[] createTokenTypes() {
    final String[] labels = new String[5];
    labels[NULL] = "<NULL>";
    labels[TRUE] = "<TRUE>";
    labels[FALSE] = "<FALSE>";
    labels[NUMBER] = "<NUMBER>";
    labels[LITERAL] = "<LITERAL>";
    return labels;
  }

  // the JsonTokenizer generates 6 attributes:
  // term, offset, positionIncrement, type, datatype, node
  private CharTermAttribute          termAtt;
  private OffsetAttribute            offsetAtt;
  private PositionIncrementAttribute posIncrAtt;
  private TypeAttribute              typeAtt;
  private DatatypeAttribute          dtypeAtt;
  private NodeAttribute              nodeAtt;

  /**
   * Register the attributes produced by this tokenizer and keep a reference
   * to each of them.
   */
  private void initAttributes() {
    termAtt = this.addAttribute(CharTermAttribute.class);
    offsetAtt = this.addAttribute(OffsetAttribute.class);
    posIncrAtt = this.addAttribute(PositionIncrementAttribute.class);
    typeAtt = this.addAttribute(TypeAttribute.class);
    dtypeAtt = this.addAttribute(DatatypeAttribute.class);
    // Install the JSON-specific implementation before requesting the
    // attribute, unless a NodeAttribute has already been registered.
    if (!this.hasAttribute(NodeAttribute.class)) {
      this.addAttributeImpl(new JsonNodeAttributeImpl());
    }
    nodeAtt = this.addAttribute(NodeAttribute.class);
  }

  /**
   * Advance to the next token: clear all attributes, set the position
   * increment to 1 and fill the attributes from the scanner.
   *
   * @return {@code true} if a token has been produced, {@code false} if the
   *         scanner has no more token
   * @throws IOException if the scanner fails to read the input
   */
  @Override
  public final boolean incrementToken() throws IOException {
    this.clearAttributes();
    posIncrAtt.setPositionIncrement(1);
    return this.nextToken();
  }

  /**
   * Fetch the next token from the scanner and copy its term, type, datatype,
   * offsets and node path into the attributes.
   *
   * @return {@code false} when the scanner reports
   *         {@link JsonTokenizerImpl#YYEOF} or a token type unknown to this
   *         class, {@code true} otherwise
   * @throws IOException if the scanner fails to read the input
   */
  private boolean nextToken() throws IOException {
    final int tokenType = scanner.getNextToken();

    // 1. Fill the term attribute
    switch (tokenType) {
      case FALSE:
        termAtt.append("false");
        break;

      case TRUE:
        termAtt.append("true");
        break;

      case NULL:
        termAtt.append("null");
        break;

      case NUMBER:
      case LITERAL:
        scanner.getLiteralText(termAtt);
        break;

      case JsonTokenizerImpl.YYEOF:
      default:
        // end of input, or a token type this tokenizer does not emit
        return false;
    }

    // 2. Fill the remaining attributes. The term must be set beforehand,
    // since the end offset is derived from the term length.
    final char[] datatypeURI = scanner.getDatatypeURI();
    int startOffset = scanner.yychar();
    if (tokenType == LITERAL) {
      // A literal starts one character after the position reported by the
      // scanner (presumably to skip the opening quote; confirm against
      // JsonTokenizerImpl).
      startOffset += 1;
    }
    this.updateToken(tokenType, datatypeURI, startOffset);
    return true;
  }

  /**
   * Update type, datatype, offset and node path of the token.
   * <p>
   * The end offset is computed as the start offset plus the current length of
   * the term attribute, so the term must be set before calling this method.
   *
   * @param tokenType The type of the generated token
   * @param datatypeURI The datatype of the generated token
   * @param startOffset The starting offset of the token
   */
  private void updateToken(final int tokenType, final char[] datatypeURI, final int startOffset) {
    // Update offset
    offsetAtt.setOffset(this.correctOffset(startOffset),
      this.correctOffset(startOffset + termAtt.length()));
    // update token type
    typeAtt.setType(TOKEN_TYPES[tokenType]);
    // update datatype
    dtypeAtt.setDatatypeURI(datatypeURI);
    // Update structural information
    nodeAtt.copyNode(scanner.getNodePath());
  }

  /**
   * Reset this tokenizer and point the scanner at the current input reader.
   *
   * @throws IOException if resetting the input reader fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    // NOTE(review): some readers reporting markSupported() (e.g.
    // java.io.BufferedReader) throw an IOException on reset() when mark() has
    // never been called - confirm which reader types are passed in.
    if (input.markSupported()) {
      input.reset();
    }
    scanner.yyreset(input);
  }

  /**
   * Close the scanner, then let the parent tokenizer close and release its
   * input reader.
   *
   * @throws IOException if closing the scanner or the input reader fails
   */
  @Override
  public void close() throws IOException {
    try {
      scanner.yyclose();
    } finally {
      // Always run the parent clean-up, even if closing the scanner failed.
      super.close();
    }
  }

}