/** * Copyright 2014 National University of Ireland, Galway. * * This file is part of the SIREn project. Project and contact information: * * https://github.com/rdelbru/SIREn * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.sindice.siren.analysis; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.sindice.siren.analysis.attributes.DatatypeAttribute; import org.sindice.siren.analysis.attributes.JsonNodeAttributeImpl; import org.sindice.siren.analysis.attributes.NodeAttribute; import org.sindice.siren.util.JSONDatatype; import org.sindice.siren.util.XSDDatatype; import java.io.IOException; import java.io.Reader; /** * A tokenizer for data following the JSON syntax. * <p> * The tokenizer parses JSON data and generates a token for each field name * and each value. The tokenizer attaches a node label and a datatype to each * token. * * <p> * * Regarding datatype, the convention is the following: * <ul> * <li> If a field name is parsed, the datatype * {@link JSONDatatype#JSON_FIELD} is assigned; * <li> If a value string is parsed, the datatype * {@link XSDDatatype#XSD_STRING} is assigned; * <li> If a boolean value is parsed, the datatype * {@link XSDDatatype#XSD_BOOLEAN} is assigned; * <li> If a numerical value is parsed, the datatype * {@link XSDDatatype#XSD_LONG} is assigned; * <li> If a numerical value with a fraction is parsed, the datatype * {@link XSDDatatype#XSD_DOUBLE} is assigned; * </ul> */ public class JsonTokenizer extends Tokenizer { private final JsonTokenizerImpl scanner; /** Token Definition */ public static final int NULL = 0; public static final int TRUE = 1; public static final int FALSE = 2; public static final int NUMBER = 3; public static final int LITERAL = 4; /** Datatype JSON schema: field for the datatype label */ public static final String DATATYPE_LABEL = "_datatype_"; /** Datatype JSON schema: field for the datatype value */ public static final String DATATYPE_VALUES = "_value_"; public JsonTokenizer(final Reader input) { super(input); scanner = new JsonTokenizerImpl(input); this.initAttributes(); } protected static String[] TOKEN_TYPES = getTokenTypes(); public static String[] getTokenTypes() { if (TOKEN_TYPES == null) { TOKEN_TYPES = new String[5]; TOKEN_TYPES[NULL] = "<NULL>"; TOKEN_TYPES[TRUE] = "<TRUE>"; TOKEN_TYPES[FALSE] = "<FALSE>"; TOKEN_TYPES[NUMBER] = "<NUMBER>"; TOKEN_TYPES[LITERAL] = "<LITERAL>"; } return TOKEN_TYPES; } // the TupleTokenizer generates 6 attributes: // term, offset, positionIncrement, type, datatype, node private CharTermAttribute termAtt; private OffsetAttribute offsetAtt; private PositionIncrementAttribute posIncrAtt; private TypeAttribute typeAtt; private DatatypeAttribute dtypeAtt; private NodeAttribute nodeAtt; private void initAttributes() { termAtt = this.addAttribute(CharTermAttribute.class); offsetAtt = this.addAttribute(OffsetAttribute.class); posIncrAtt = this.addAttribute(PositionIncrementAttribute.class); typeAtt = this.addAttribute(TypeAttribute.class); dtypeAtt = this.addAttribute(DatatypeAttribute.class); if (!this.hasAttribute(NodeAttribute.class)) { this.addAttributeImpl(new JsonNodeAttributeImpl()); } nodeAtt = this.addAttribute(NodeAttribute.class); } @Override public final boolean incrementToken() throws IOException { this.clearAttributes(); posIncrAtt.setPositionIncrement(1); return this.nextToken(); } private boolean nextToken() throws IOException { final int tokenType = scanner.getNextToken(); switch (tokenType) { case FALSE: termAtt.append("false"); this.updateToken(tokenType, scanner.getDatatypeURI(), scanner.yychar()); break; case TRUE: termAtt.append("true"); this.updateToken(tokenType, scanner.getDatatypeURI(), scanner.yychar()); break; case NULL: termAtt.append("null"); this.updateToken(tokenType, scanner.getDatatypeURI(), scanner.yychar()); break; case NUMBER: scanner.getLiteralText(termAtt); this.updateToken(tokenType, scanner.getDatatypeURI(), scanner.yychar()); break; case LITERAL: scanner.getLiteralText(termAtt); this.updateToken(tokenType, scanner.getDatatypeURI(), scanner.yychar() + 1); break; case JsonTokenizerImpl.YYEOF: return false; default: return false; } return true; } /** * Update type, datatype, offset, tuple id and cell id of the token * * @param tokenType The type of the generated token * @param datatypeURI The datatype of the generated token * @param startOffset The starting offset of the token */ private void updateToken(final int tokenType, final char[] datatypeURI, final int startOffset) { // Update offset offsetAtt.setOffset(this.correctOffset(startOffset), this.correctOffset(startOffset + termAtt.length())); // update token type typeAtt.setType(TOKEN_TYPES[tokenType]); // update datatype dtypeAtt.setDatatypeURI(datatypeURI); // Update structural information nodeAtt.copyNode(scanner.getNodePath()); } @Override public void reset() throws IOException { super.reset(); if (input.markSupported()) { input.reset(); } scanner.yyreset(input); } @Override public void close() throws IOException { scanner.yyclose(); } }