/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.sindice.siren.analysis.attributes.DatatypeAttribute;
import org.sindice.siren.analysis.attributes.NodeAttribute;
import org.sindice.siren.analysis.attributes.TupleNodeAttributeImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A {@link Tokenizer} that replays the tokens of a {@link MockSirenDocument}.
 *
 * <p>The document is obtained from the {@link MockSirenReader} given as input.
 * It is iterated node by node, where each node is a list of
 * {@link MockSirenToken}. Every call to {@link #incrementToken()} copies the
 * fields of the next token into the stream attributes.
 */
public class MockSirenTokenizer extends Tokenizer {

  /** The document currently being replayed. */
  MockSirenDocument doc;

  // this tokenizer populates 6 attributes:
  // term, offset, positionIncrement, type, datatype, node
  private final CharTermAttribute termAtt;
  private final OffsetAttribute offsetAtt;
  private final PositionIncrementAttribute posIncrAtt;
  private final TypeAttribute typeAtt;
  private final DatatypeAttribute dtypeAtt;
  private final NodeAttribute nodeAtt;

  /** Iterator over the nodes (lists of tokens) of {@link #doc}. */
  Iterator<ArrayList<MockSirenToken>> nodeIt = null;

  /** Iterator over the tokens of the current node; {@code null} before the first node. */
  Iterator<MockSirenToken> tokenIt = null;

  protected static final Logger logger = LoggerFactory.getLogger(MockSirenTokenizer.class);

  /**
   * Creates a tokenizer replaying the document held by the given reader.
   *
   * @param reader the reader providing the {@link MockSirenDocument}
   */
  public MockSirenTokenizer(final MockSirenReader reader) {
    super(reader);

    this.doc = reader.getDocument();
    nodeIt = doc.iterator();

    termAtt = this.addAttribute(CharTermAttribute.class);
    offsetAtt = this.addAttribute(OffsetAttribute.class);
    posIncrAtt = this.addAttribute(PositionIncrementAttribute.class);
    typeAtt = this.addAttribute(TypeAttribute.class);
    dtypeAtt = this.addAttribute(DatatypeAttribute.class);
    // Register the tuple-specific implementation first, so that the
    // addAttribute call below resolves to it when none is registered yet.
    if (!this.hasAttribute(NodeAttribute.class)) {
      this.addAttributeImpl(new TupleNodeAttributeImpl());
    }
    nodeAtt = this.addAttribute(NodeAttribute.class);
  }

  /**
   * Advances to the next token of the document and copies its term, offsets,
   * type, position increment, datatype and node path into the attributes.
   *
   * <p>Nodes that contain no token are skipped.
   *
   * @return {@code true} if a token was emitted, {@code false} once every
   *         node of the document has been consumed
   * @throws IOException declared by the overridden method; not thrown here
   */
  @Override
  public final boolean incrementToken() throws IOException {
    this.clearAttributes();

    // Move to the next node that still has tokens. Looping (rather than
    // advancing once) lets empty nodes be skipped instead of failing on
    // tokenIt.next().
    while (tokenIt == null || !tokenIt.hasNext()) {
      if (!nodeIt.hasNext()) {
        return false; // document exhausted
      }
      tokenIt = nodeIt.next().iterator();
    }

    final MockSirenToken token = tokenIt.next();

    termAtt.copyBuffer(token.term, 0, token.term.length);
    offsetAtt.setOffset(token.startOffset, token.endOffset);
    typeAtt.setType(TupleTokenizer.getTokenTypes()[token.tokenType]);
    posIncrAtt.setPositionIncrement(token.posInc);
    dtypeAtt.setDatatypeURI(token.datatype);
    for (int i = 0; i < token.nodePath.length; i++) {
      nodeAtt.append(token.nodePath.ints[i]);
    }
    return true;
  }

  /**
   * Restarts the tokenizer at the beginning of the document held by the
   * current input reader.
   */
  // NOTE(review): super.reset() is not invoked, as in the original code;
  // confirm whether the Lucene version in use requires it.
  @Override
  public void reset() {
    final MockSirenReader reader = (MockSirenReader) this.input;
    this.doc = reader.getDocument();
    nodeIt = doc.iterator();
    // Drop the iterator of the previous pass, otherwise leftover tokens of a
    // partially consumed node would be emitted before the new document.
    tokenIt = null;
    this.clearAttributes();
  }

}