/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.analysis.filter;
import java.nio.CharBuffer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* Extract the localname of an URI, and break it into smaller components based
* on delimiters, such as uppercase or integers.
* <p>
* This filter returns the complete URI, the full localname of the URIs as well
* as the localname tokens.
* <p>
* This filter is less demanding than the {@link URINormalisationFilter}
* in term of CPU. In addition, it is also less costly in term of index size
* since it creates less tokens per URI.
* <p>
* Before tokenisation, check the length of the localname. If the localname is
* too large, it is not tokenised. By default, the maximum localname length is
* set to 64.
*/
public class URILocalnameFilter
extends TokenFilter {
public static final int DEFAULT_MAX_LENGTH = 64;
private int maxLength = DEFAULT_MAX_LENGTH;
protected boolean _isNormalising = false;
protected boolean _shouldReturnLocalname = false;
protected int _nTokens = 0;
private int startLocalname;
private int start;
private int end;
private int termLength;
private CharBuffer termBuffer;
private final CharTermAttribute termAtt;
private final PositionIncrementAttribute posIncrAtt;
public URILocalnameFilter(final TokenStream input) {
super(input);
termAtt = this.addAttribute(CharTermAttribute.class);
posIncrAtt = this.addAttribute(PositionIncrementAttribute.class);
termBuffer = CharBuffer.allocate(256);
}
/**
* Set the maximum length for a localname to be tokenised
*/
public void setMaxLength(final int maxLength) {
this.maxLength = maxLength;
}
@Override
public final boolean incrementToken() throws java.io.IOException {
// While we are normalising the URI
if (_isNormalising) {
this.posIncrAtt.setPositionIncrement(1); // reset the position increment
this.nextToken();
return true;
}
// Otherwise, get next URI token and start normalisation
if (input.incrementToken()) {
termLength = termAtt.length();
this.updateBuffer();
_isNormalising = true;
_shouldReturnLocalname = false; // we return the full localname only if a breakpoint is found
_nTokens = 0;
startLocalname = start = end = 0;
startLocalname = start = this.findLocalname();
this.nextToken();
return true;
}
return false;
}
protected void updateBuffer() {
if (termBuffer.capacity() > termLength) {
termBuffer.clear();
termBuffer.put(termAtt.buffer(), 0, termLength);
}
else {
termBuffer = CharBuffer.allocate(termLength);
termBuffer.put(termAtt.buffer(), 0, termLength);
}
}
/**
* Find the offset of the localname delimiter. If no localname delimiter is
* found, return last offset, i.e., {@code termLength}.
*/
protected int findLocalname() {
int ptr = termLength - 1;
while (ptr > 0) {
if (this.isLocalnameDelim(termBuffer.get(ptr))) {
return ptr;
}
ptr--;
}
return termLength;
}
protected void nextToken() {
// There is still delimiters
while (this.findNextToken()) {
// SRN-66 & SRN-79: skip tokens with less than 3 characters
if (end - start < 3) {
start = end;
continue;
}
this.updateToken();
_nTokens++;
return;
}
if (_shouldReturnLocalname && startLocalname < termLength) { // return the full localname
this.updateLocalnameToken();
_shouldReturnLocalname = false;
return;
}
// No more delimiters, we have to return the full URI as last step
this.updateFinalToken();
_isNormalising = false;
}
protected boolean findNextToken() {
// If localname is too large, do not tokenise it
if (termLength - start > maxLength) {
start++; // increment start pointer since it points to a delimiter
end = termLength;
return true;
}
while (start < termLength) {
if (this.isDelim(termBuffer.get(start))) {
start++; continue;
}
else {
end = start;
do {
end++;
} while (end < termLength && !this.isBreakPoint(termBuffer.get(end)));
if (end < termLength) { // we found a breakpoint, we should return the fulle localname
_shouldReturnLocalname = true;
}
return true;
}
}
return false;
}
protected void updateToken() {
termAtt.copyBuffer(termBuffer.array(), start, end - start);
start = end;
}
protected void updateLocalnameToken() {
termAtt.copyBuffer(termBuffer.array(), startLocalname + 1, termLength - (startLocalname + 1));
posIncrAtt.setPositionIncrement(0);
}
protected void updateFinalToken() {
termAtt.copyBuffer(termBuffer.array(), 0, termLength);
// SRN-80: wrong position increment if no previous tokens
final int posInc = _nTokens == 0 ? 1 : 0;
posIncrAtt.setPositionIncrement(posInc);
}
protected boolean isLocalnameDelim(final char c) {
return c == '#' || c == '/';
}
protected boolean isBreakPoint(final int c) {
return this.isDelim(c) || this.isUppercase(c);
}
protected boolean isDelim(final int c) {
return Character.isLetterOrDigit(c) ? false : true;
}
protected boolean isUppercase(final int c) {
return Character.isUpperCase(c) ? true : false;
}
}