/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sindice.siren.analysis;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.sindice.siren.analysis.attributes.DatatypeAttribute;
import org.sindice.siren.analysis.attributes.NodeAttribute;
import org.sindice.siren.analysis.attributes.TupleNodeAttributeImpl;

/**
 * A grammar-based tokenizer constructed with JFlex for N-Tuples. Splits a
 * N-Tuple into BNode, URI, Literal and Dot tokens.
 *
 * <p>Every emitted token carries six attributes: term, offset, position
 * increment, type, datatype and node. The node attribute receives the current
 * tuple id followed by the current cell id; the cell id is incremented after
 * each BNode, URI or Literal token, and a Dot token increments the tuple id
 * and resets the cell id to 0.
 *
 * @deprecated Use {@link JsonTokenizer} instead
 */
@Deprecated
public class TupleTokenizer extends Tokenizer {

  /** A private instance of the JFlex-constructed scanner */
  private final TupleTokenizerImpl _scanner;

  /** Structural node counters: current tuple id and current cell id. */
  private int _tid = 0;
  private int _cid = 0;

  /** Token definition: these values double as indexes into the token type table. */
  public static final int BNODE = 0;
  public static final int URI = 1;
  public static final int LITERAL = 2;
  public static final int DOT = 3;

  /**
   * Token type labels, indexed by {@link #BNODE}, {@link #URI},
   * {@link #LITERAL} and {@link #DOT}.
   *
   * <p>Initialised eagerly so that tokenizing never observes a
   * <code>null</code> table, even when {@link #getTokenTypes()} has not been
   * called beforehand.
   */
  protected static String[] TOKEN_TYPES = buildTokenTypes();

  /**
   * Returns the table of token type labels, indexed by token id.
   *
   * @return the shared label array (not a copy)
   */
  public static String[] getTokenTypes() {
    // Defensive: the field is protected and non-final, so it may have been
    // cleared from outside this class. Rebuild rather than return null.
    if (TOKEN_TYPES == null) {
      TOKEN_TYPES = buildTokenTypes();
    }
    return TOKEN_TYPES;
  }

  /**
   * Builds a fully populated token type table. The array is filled before it
   * is handed out, so a partially initialised table is never assigned to
   * {@link #TOKEN_TYPES}.
   */
  private static String[] buildTokenTypes() {
    final String[] types = new String[4];
    types[BNODE] = "<BNODE>";
    types[URI] = "<URI>";
    types[LITERAL] = "<LITERAL>";
    types[DOT] = "<DOT>";
    return types;
  }

  /**
   * Creates a new instance of the {@link TupleTokenizer}. Attaches the
   * <code>input</code> to a newly created JFlex scanner.
   *
   * @param input the reader providing the N-Tuple text to tokenize
   */
  public TupleTokenizer(final Reader input) {
    super(input);
    this._scanner = new TupleTokenizerImpl(input);
    this.initAttributes();
  }

  // the TupleTokenizer generates 6 attributes:
  // term, offset, positionIncrement, type, datatype, node
  private CharTermAttribute termAtt;
  private OffsetAttribute offsetAtt;
  private PositionIncrementAttribute posIncrAtt;
  private TypeAttribute typeAtt;
  private DatatypeAttribute dtypeAtt;
  private NodeAttribute nodeAtt;

  /**
   * Registers the six attributes on this stream and caches their references.
   */
  private void initAttributes() {
    termAtt = this.addAttribute(CharTermAttribute.class);
    offsetAtt = this.addAttribute(OffsetAttribute.class);
    posIncrAtt = this.addAttribute(PositionIncrementAttribute.class);
    typeAtt = this.addAttribute(TypeAttribute.class);
    dtypeAtt = this.addAttribute(DatatypeAttribute.class);
    // Register the tuple-specific implementation first, unless a node
    // attribute is already present on this stream.
    if (!this.hasAttribute(NodeAttribute.class)) {
      this.addAttributeImpl(new TupleNodeAttributeImpl());
    }
    nodeAtt = this.addAttribute(NodeAttribute.class);
  }

  /**
   * Clears all attributes, sets the position increment to 1 and advances to
   * the next tuple token.
   *
   * @return <code>true</code> if a token was produced, <code>false</code>
   *         otherwise
   * @throws IOException if the underlying scanner fails to read
   */
  @Override
  public final boolean incrementToken() throws IOException {
    this.clearAttributes();
    posIncrAtt.setPositionIncrement(1);
    return this.nextTupleToken();
  }

  /**
   * Pulls the next token from the scanner, fills the attributes and updates
   * the tuple / cell counters.
   *
   * @return <code>false</code> on end of input or on an unrecognised token
   *         id, <code>true</code> otherwise
   * @throws IOException if the underlying scanner fails to read
   */
  private boolean nextTupleToken() throws IOException {
    final int tokenType = _scanner.getNextToken();

    // NOTE(review): the +2 / +1 shifts applied to yychar() below presumably
    // skip the leading delimiter characters that the scanner leaves out of the
    // term text - confirm against the JFlex grammar.
    switch (tokenType) {
      case TupleTokenizer.BNODE:
        _scanner.getBNodeText(termAtt);
        this.updateToken(tokenType, null, _scanner.yychar() + 2);
        // Increment tuple cell ID counter
        _cid++;
        break;

      case TupleTokenizer.URI:
        _scanner.getURIText(termAtt);
        this.updateToken(tokenType, _scanner.getDatatypeURI(), _scanner.yychar() + 1);
        // Increment tuple cell ID counter
        _cid++;
        break;

      case TupleTokenizer.LITERAL:
        _scanner.getLiteralText(termAtt);
        this.updateToken(tokenType, _scanner.getDatatypeURI(), _scanner.yychar() + 1);
        // Increment tuple cell ID counter
        _cid++;
        break;

      case TupleTokenizer.DOT:
        _scanner.getText(termAtt);
        this.updateToken(tokenType, null, _scanner.yychar());
        // Increment tuple ID counter, reset tuple cell ID counter
        _tid++;
        _cid = 0;
        break;

      case TupleTokenizerImpl.YYEOF:
      default:
        // End of input, or a token id this tokenizer does not know about.
        return false;
    }
    return true;
  }

  /**
   * Update type, datatype, offset, tuple id and cell id of the token
   *
   * @param tokenType The type of the generated token
   * @param datatypeURI The datatype of the generated token
   * @param startOffset The starting offset of the token
   */
  private void updateToken(final int tokenType, final char[] datatypeURI, final int startOffset) {
    // Update offset: the end offset is the start plus the current term length
    offsetAtt.setOffset(this.correctOffset(startOffset),
                        this.correctOffset(startOffset + termAtt.length()));
    // update token type; go through the accessor so the table is never null
    typeAtt.setType(getTokenTypes()[tokenType]);
    // update datatype
    dtypeAtt.setDatatypeURI(datatypeURI);
    // Update structural information: tuple id first, then cell id
    nodeAtt.append(_tid);
    nodeAtt.append(_cid);
  }

  /**
   * Resets the stream, re-attaches the scanner to the current input and sets
   * the tuple and cell counters back to 0.
   *
   * @throws IOException if resetting the parent stream or the input fails
   * @see org.apache.lucene.analysis.TokenStream#reset()
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    // NOTE(review): reset() is invoked on any reader that supports marking,
    // although this class never calls mark(). Some readers (for example
    // java.io.BufferedReader) throw an IOException when reset without a prior
    // mark - confirm which reader types callers pass in.
    if (input.markSupported()) {
      input.reset();
    }
    _scanner.yyreset(input);
    _tid = _cid = 0;
  }

  /**
   * Closes the scanner and then the parent tokenizer.
   *
   * <p>The parent <code>close()</code> is always invoked, even when closing
   * the scanner fails, so that the tokenizer releases its input.
   *
   * @throws IOException if closing the scanner or the input fails
   */
  @Override
  public void close() throws IOException {
    try {
      _scanner.yyclose();
    } finally {
      super.close();
    }
  }

}