/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.analysis.filter;
import java.nio.CharBuffer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* Breaks a URI into smaller components based on delimiters (any character that
* is not a letter or a digit, such as ':' or '/') and on uppercase letters.
* The full URI is emitted as the last token.
* <p>
* This filter is very demanding in terms of CPU. In general, when this filter
* is used, the parsing time for a set of tuples doubles. If you don't need it,
* remove it from your token stream.
*/
public class URINormalisationFilter
extends TokenFilter {

  /** True while the sub-tokens of the current URI are still being emitted. */
  protected boolean _isNormalising = false;

  /** Start offset (inclusive) of the current sub-token in {@link #termBuffer}. */
  private int start;

  /** End offset (exclusive) of the current sub-token in {@link #termBuffer}. */
  private int end;

  /** Number of valid characters of the current URI in {@link #termBuffer}. */
  private int termLength;

  /**
   * Private copy of the URI term. A copy is required because the term
   * attribute is overwritten each time a sub-token is emitted.
   */
  private CharBuffer termBuffer;

  /** Number of sub-tokens emitted so far for the current URI. */
  protected int _nTokens = 0;

  private final CharTermAttribute termAtt;
  private final PositionIncrementAttribute posIncrAtt;

  /**
   * Creates a filter that normalises the URI terms produced by the given
   * stream.
   *
   * @param input the token stream providing the URI terms
   */
  public URINormalisationFilter(final TokenStream input) {
    super(input);
    termAtt = this.addAttribute(CharTermAttribute.class);
    posIncrAtt = this.addAttribute(PositionIncrementAttribute.class);
    termBuffer = CharBuffer.allocate(256);
  }

  /**
   * Emits, for each URI read from the input, its sub-tokens followed by the
   * full URI.
   * <p>
   * The first token emitted for a URI keeps the position increment set by the
   * input stream, unless no sub-token exists, in which case the full URI is
   * emitted with an increment of 1. Each following sub-token has an increment
   * of 1, and a full URI emitted after at least one sub-token has an increment
   * of 0.
   *
   * @return {@code true} if a token has been produced, {@code false} when the
   *         input is exhausted
   * @throws java.io.IOException if the input stream fails
   */
  @Override
  public final boolean incrementToken() throws java.io.IOException {
    // While we are normalising the URI
    if (_isNormalising) {
      this.posIncrAtt.setPositionIncrement(1); // reset the position increment
      this.nextToken();
      return true;
    }
    // Otherwise, get next URI token and start normalisation
    if (input.incrementToken()) {
      termLength = termAtt.length();
      this.updateBuffer();
      _isNormalising = true;
      start = end = 0;
      _nTokens = 0;
      this.skipScheme();
      this.nextToken();
      return true;
    }
    return false;
  }

  /**
   * Resets the input and discards any partially normalised URI, so that a
   * reused stream does not keep emitting sub-tokens of a previous URI.
   *
   * @throws java.io.IOException if resetting the input stream fails
   */
  @Override
  public void reset() throws java.io.IOException {
    super.reset();
    _isNormalising = false;
    start = end = 0;
    termLength = 0;
    _nTokens = 0;
  }

  /**
   * Copies the first {@code termLength} characters of the term attribute into
   * {@link #termBuffer}, allocating a larger buffer only when the current one
   * is too small.
   */
  protected void updateBuffer() {
    if (termBuffer.capacity() < termLength) {
      termBuffer = CharBuffer.allocate(termLength);
    }
    else {
      termBuffer.clear();
    }
    termBuffer.put(termAtt.buffer(), 0, termLength);
  }

  /**
   * Skip the scheme part. Added for SRN-66 in order to make the URI
   * normalisation less aggressive.
   * <p>
   * Moves {@code start} just after the first ':' and, when that ':' is
   * followed by "//", one character further. If the term contains no ':',
   * {@code start} ends at {@code termLength} and no sub-token will be found.
   */
  protected void skipScheme() {
    while (start < termLength) {
      if (termBuffer.get(start++) == ':') {
        // Look ahead only inside the current term: beyond termLength the
        // buffer either ends (IndexOutOfBoundsException) or still holds
        // characters of a previously processed term.
        if (start + 1 < termLength
            && termBuffer.get(start) == '/'
            && termBuffer.get(start + 1) == '/') {
          start += 1;
        }
        return;
      }
    }
  }

  /**
   * Writes the next token into the term attribute: either the next sub-token
   * of at least 4 characters or, when none is left, the full URI, which ends
   * the normalisation of the current term.
   */
  protected void nextToken() {
    // There is still delimiters
    while (this.findNextToken()) {
      // SRN-66: skip tokens with less than 4 characters
      if (end - start < 4) {
        start = end;
        continue;
      }
      this.updateToken();
      _nTokens++;
      return;
    }
    // No more delimiters, we have to return the full URI as last step
    this.updateFinalToken();
    _isNormalising = false;
  }

  /**
   * Locates the next sub-token: skips delimiters from {@code start}, then
   * extends {@code end} up to the next break point or the end of the term.
   *
   * @return {@code true} if {@code [start, end)} delimits a sub-token,
   *         {@code false} if the end of the term has been reached
   */
  protected boolean findNextToken() {
    while (start < termLength) {
      if (this.isDelim(termBuffer.get(start))) {
        start++;
        continue;
      }
      end = start;
      // The first character always belongs to the token, even if it is
      // uppercase; the token stops before the next break point.
      do {
        end++;
      } while (end < termLength && !this.isBreakPoint(termBuffer.get(end)));
      return true;
    }
    return false;
  }

  /**
   * Copies the sub-token {@code [start, end)} into the term attribute and
   * moves {@code start} to the end of that sub-token.
   */
  protected void updateToken() {
    termAtt.copyBuffer(termBuffer.array(), start, end - start);
    start = end;
  }

  /**
   * Copies the full URI into the term attribute. Its position increment is 0
   * when at least one sub-token has been emitted for this URI, 1 otherwise.
   */
  protected void updateFinalToken() {
    termAtt.copyBuffer(termBuffer.array(), 0, termLength);
    final int posInc = _nTokens == 0 ? 1 : 0;
    posIncrAtt.setPositionIncrement(posInc);
  }

  /**
   * Tells whether a sub-token must end before the given character.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} is a delimiter or an uppercase letter
   */
  protected boolean isBreakPoint(final int c) {
    return this.isDelim(c) || this.isUppercase(c);
  }

  /**
   * Tells whether the given character is a delimiter.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} is neither a letter nor a digit
   */
  protected boolean isDelim(final int c) {
    return !Character.isLetterOrDigit(c);
  }

  /**
   * Tells whether the given character is an uppercase letter.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} is uppercase
   */
  protected boolean isUppercase(final int c) {
    return Character.isUpperCase(c);
  }

}