/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sindice.siren.analysis.filter;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.UnsupportedCharsetException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Decode the URI encoding format of special characters such as '?' or '<'.
 *
 * <p>
 *
 * An encoded special character begins with a '%' and is followed by two
 * characters in hexadecimal format. A '%' that is not followed by two
 * hexadecimal characters is copied unchanged and the decoding process
 * continues with the next character. The encoded SPACE ('%20') is replaced by
 * '+', and a literal '+' is left untouched.
 *
 * <p>
 *
 * When a URI contains a '%', two versions of the URI are produced
 * (both tokens have the same position):
 * <ul>
 * <li> the original URI </li>
 * <li> the decoded URI </li>
 * </ul>
 */
public class URIDecodingFilter extends TokenFilter {

  /**
   * Sentinel returned for a character that is not a hexadecimal digit. Its
   * magnitude is larger than the biggest valid contribution (240), so the sum
   * of the two digit values stays negative as soon as one digit is invalid.
   */
  private static final int INVALID_HEX = -241;

  private final CharsetDecoder charsetDecoder;
  /** Bytes obtained from consecutive %xy escapes, waiting to be decoded. */
  private final ByteBuffer decoded = ByteBuffer.allocate(32);

  /** True when a decoded version of the last token is pending in termBuffer. */
  private boolean modifiedURI = false;
  private CharBuffer termBuffer;
  private int termLength;

  private final CharTermAttribute termAtt;
  private final PositionIncrementAttribute posIncrAtt;

  /**
   * Create a new URI decoding filter configured for the specified charset.
   *
   * @param input The input token stream
   * @param charsetEncoding The name of a supported character encoding.
   * @throws UnsupportedCharsetException if the character encoding is not
   * supported or recognised.
   */
  public URIDecodingFilter(final TokenStream input, final String charsetEncoding)
  throws UnsupportedCharsetException {
    super(input);
    final Charset charset = this.lookupCharset(charsetEncoding);
    // Malformed or unmappable byte sequences are replaced, never thrown.
    charsetDecoder = charset.newDecoder()
                            .onMalformedInput(CodingErrorAction.REPLACE)
                            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    termAtt = this.addAttribute(CharTermAttribute.class);
    posIncrAtt = this.addAttribute(PositionIncrementAttribute.class);
    termBuffer = CharBuffer.allocate(256);
  }

  /**
   * Emit the next token. A token containing a '%' is emitted twice: first
   * unchanged, then in its decoded form with a position increment of 0.
   *
   * @return false when the input stream is exhausted
   * @throws IOException if the underlying stream fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (modifiedURI) {
      // Return the previously decoded URI, stacked on the original token
      modifiedURI = false;
      termAtt.setEmpty();
      termAtt.copyBuffer(termBuffer.array(), 0, termBuffer.position());
      posIncrAtt.setPositionIncrement(0);
      return true;
    }

    if (input.incrementToken()) {
      termLength = termAtt.length();
      this.updateBuffer();
      this.decode();
      return true;
    }
    return false;
  }

  /**
   * Reset the filter so that no decoded token left over from a previous,
   * partially consumed stream is emitted after the stream is reused.
   *
   * @throws IOException if resetting the underlying stream fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    modifiedURI = false;
    decoded.clear();
    charsetDecoder.reset();
  }

  /**
   * Make sure the term buffer can hold the current term, then empty it. The
   * decoded form never needs more characters than the original term.
   */
  private void updateBuffer() {
    if (termBuffer.capacity() < termLength) {
      termBuffer = CharBuffer.allocate(termLength);
    }
    termBuffer.clear();
  }

  /**
   * Look up the charset with the given name.
   *
   * @param csn The charset name
   * @return the matching charset
   * @throws UnsupportedCharsetException if the charset is not supported
   */
  private Charset lookupCharset(final String csn)
  throws UnsupportedCharsetException {
    if (Charset.isSupported(csn)) {
      return Charset.forName(csn);
    }
    throw new UnsupportedCharsetException(csn);
  }

  /**
   * Return the decimal value of a hexadecimal digit. If the character is not
   * an ASCII hexadecimal digit, a negative value is returned.
   *
   * @param c The character to convert
   * @return a value in [0, 15], or {@link #INVALID_HEX}
   */
  private int hexaToInt(final char c) {
    if (c >= '0' && c <= '9') {
      return c - '0';
    }
    if (c >= 'a' && c <= 'f') {
      return c - 'a' + 10;
    }
    if (c >= 'A' && c <= 'F') {
      return c - 'A' + 10;
    }
    return INVALID_HEX;
  }

  /**
   * Return the decimal value of a hexadecimal digit, multiplied by 16.
   * If the character is not an ASCII hexadecimal digit, a negative value is
   * returned.
   *
   * @param c The character to convert
   * @return a multiple of 16 in [0, 240], or {@link #INVALID_HEX}
   */
  private int hexaToInt2(final char c) {
    final int digit = this.hexaToInt(c);
    return digit < 0 ? INVALID_HEX : digit << 4;
  }

  /**
   * Partial decoding of URI encoded characters; the result is written into
   * the term buffer.
   * <br>
   * Ignore the '+' (SPACE) cases, as it does not
   * make sense to index URIs with a space. Nobody will be able to search them
   * as a space will be considered as a character delimitation.
   * <br>
   * Replace %20 by +, so that the URI can be tokenised easily (%20 causes
   * problem during tokenisation, while + does not).
   */
  private void decode() {
    int i = 0;

    while (i < termLength) {
      char c = termAtt.charAt(i);

      if (c != '%') { // ordinary character, copied as is
        termBuffer.put(c);
        i++;
        continue;
      }

      /*
       * Starting with this instance of %, process all consecutive substrings
       * of the form %xy. Each substring %xy will yield a byte. Convert all
       * consecutive bytes obtained this way to whatever character(s) they
       * represent in the provided encoding.
       *
       * xy is a hexadecimal number.
       */
      modifiedURI = true;
      while (i + 2 < termLength && c == '%') {
        // The next two characters converted from a hex to a decimal value
        final int value = this.hexaToInt2(termAtt.charAt(i + 1))
                        + this.hexaToInt(termAtt.charAt(i + 2));

        if (value == 32) {
          // replace the SPACE character, encoded by %20, by +
          this.decodeChars();
          termBuffer.put('+');
          i += 3;
        }
        else if (value >= 0) {
          if (!decoded.hasRemaining()) {
            // No more place in the buffer, output what is already complete.
            this.drainDecodedBytes();
          }
          decoded.put((byte) value);
          i += 3;
        }
        else {
          /*
           * Not a valid escape: keep the '%' and resume right after it, so
           * that a valid escape overlapping the next two characters (as in
           * "%%41") is still recognised.
           */
          this.decodeChars();
          termBuffer.put('%');
          i++;
        }

        if (i < termLength) {
          c = termAtt.charAt(i);
        }
      }
      // decode the chain of special characters
      this.decodeChars();

      // incomplete byte encoding at the end of the term (e.g., %x): keep the %
      if (i < termLength && c == '%') {
        termBuffer.put('%');
        i++;
      }
    }
  }

  /**
   * Decode the complete part of a full byte buffer to make room for more
   * bytes. The chain of escapes is not finished, so a trailing incomplete
   * multi-byte sequence is kept in the buffer instead of being reported as
   * malformed.
   */
  private void drainDecodedBytes() {
    decoded.flip();
    charsetDecoder.decode(decoded, termBuffer, false);
    decoded.compact();
    if (!decoded.hasRemaining()) {
      // Defensive: nothing was consumed, force a final decoding of the bytes.
      this.decodeChars();
    }
  }

  /**
   * Decode all the pending bytes as the end of a chain of escapes, append the
   * characters to the term buffer and leave both the byte buffer and the
   * decoder ready for the next chain.
   */
  private void decodeChars() {
    decoded.flip();
    charsetDecoder.decode(decoded, termBuffer, true);
    charsetDecoder.flush(termBuffer);
    charsetDecoder.reset();
    decoded.clear();
  }

}