/** * Copyright 2014 National University of Ireland, Galway. * * This file is part of the SIREn project. Project and contact information: * * https://github.com/rdelbru/SIREn * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.sindice.siren.analysis.filter; import java.io.IOException; import java.io.StringReader; import java.nio.charset.UnsupportedCharsetException; import java.util.Random; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.LuceneTestCase; import org.junit.Test; import org.sindice.siren.analysis.TupleTokenizer; public class TestURIEncodingFilter extends LuceneTestCase { private final String uritype = TupleTokenizer.getTokenTypes()[TupleTokenizer.URI]; private final Tokenizer _t = new TupleTokenizer(new StringReader("")); /** * Check if an URI with no URL encoded characters is left as it is * @throws Exception */ @Test public void testNoURLEncodedCharacters() throws Exception { this.assertURLDecodedTo(_t, "<http://stephane.net>", new String[] { "http://stephane.net" }); } /** * Check if special characters are correctly decoded and if the filters produces the two stems of the URI * @throws Exception */ @Test public void testSpecialcharacters() throws Exception { this.assertURLDecodedTo(_t, "<http://stephane.net/%32%21Space%21space>", new String[] { "http://stephane.net/%32%21Space%21space", "http://stephane.net/2!Space!space" }); this.assertURLDecodedTo(_t, "<http://stephane.net/%57%68%4F%61%72%65%79%6f%75%3F>", new String[] { "http://stephane.net/%57%68%4F%61%72%65%79%6f%75%3F", "http://stephane.net/WhOareyou?" }); // We does not decode space this.assertURLDecodedTo(_t, "<http://stephane.net/%57%68%4F+%61%72%65+%79%6f%75%20%3F>", new String[] { "http://stephane.net/%57%68%4F+%61%72%65+%79%6f%75%20%3F", "http://stephane.net/WhO+are+you+?" }); } /** * Check if the boundaries of the internal buffers are correct. * @throws Exception */ @Test public void testLongURLEncodedChain() throws Exception { this.assertURLDecodedTo(_t, "<deus%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40>", new String[] { "deus%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40", "deus@@@@@@@@@@@@@@@@@@@@" }); this.assertURLDecodedTo(_t, "<deus%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40+%3f>", new String[] { "deus%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40+%3f", "deus@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@+?" }); final String looong = "%20%21%22%23%24%25%26%27%28%29%2a%2b%2c%2d%2e%2f%30" + "%31%32%33%34%35%36%37%38%39%3a%3b%3c%3d%3e%3f%40%41%42%43%44%45%46%47" + "%48%49%4a%4b%4c%4d%4e%4f%50%51%52%53%54%55%56%57%58%59%5a%5b%5c%5d%5e" + "%5f%60%61%62%63%64%65%66%67%68%69%6a%6b%6c%6d%6e%6f%70%71%72%73%74%75" + "%76%77%78%79%7a%7b%7c%7d%7e"; final String decLooong = "+!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOP" + "QRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; this.assertURLDecodedTo(_t, "<" + looong + looong + looong + looong + ">", new String[]{ looong + looong + looong + looong, decLooong + decLooong + decLooong + decLooong }); this.assertURLDecodedTo(_t, "<" + looong + looong + looong + looong + looong + looong + looong + looong + ">", new String[]{ looong + looong + looong + looong + looong + looong + looong + looong, decLooong + decLooong + decLooong + decLooong + decLooong + decLooong + decLooong + decLooong }); } /** * Check that badly encoded URL characters are left as they are. The decoding continue * nonetheless on the rest of the URI * @throws Exception */ @Test public void testWronglyEncodedCharacters() throws Exception { this.assertURLDecodedTo(_t, "<http://stephane.net/%>", new String[] { "http://stephane.net/%", "http://stephane.net/%" }); this.assertURLDecodedTo(_t, "<http://stephane.net/%8>", new String[] { "http://stephane.net/%8", "http://stephane.net/%8" }); this.assertURLDecodedTo(_t, "<http://stephane.net/%%3f>", new String[] { "http://stephane.net/%%3f", "http://stephane.net/%%3f" }); this.assertURLDecodedTo(_t, "<http://stephane.net/%GGporco>", new String[] { "http://stephane.net/%GGporco", "http://stephane.net/%GGporco" }); this.assertURLDecodedTo(_t, "<http://stephane.net/%G3porco%2erosso>", new String[] { "http://stephane.net/%G3porco%2erosso", "http://stephane.net/%G3porco.rosso" }); } /** * Test bad Charset name * @throws Exception */ @Test(expected=UnsupportedCharsetException.class) public void testUnsupportedCharset() throws Exception { this.assertURLDecodedTo(_t, "FTU_8", "", new String[] {}); } /** * Test where {@link URIDecodingFilter#hexaToInt} return a negative value * @throws Exception */ @Test public void testBadHexadecimalNumber() throws Exception { this.assertURLDecodedTo(_t, "<http://stephane%3f%FGnet/>", new String[] { "http://stephane%3f%FGnet/", "http://stephane?%FGnet/" }); } /** * Test a sequence of tokens with different types. * @throws Exception */ @Test public void testDifferentTypes() throws Exception { this.assertURLDecodedTo(_t, "<stephane%3Fnet/> \"A literal !!!!\" <porco%2erosso>", new String[] { "stephane%3Fnet/", "stephane?net/", "A literal !!!!", "porco%2erosso", "porco.rosso" }, new String[] { uritype, uritype, TupleTokenizer.getTokenTypes()[TupleTokenizer.LITERAL], uritype, uritype }, new int[] { 1, 0, 1, 1, 0 }); } @Test public void testSpaces() throws Exception { this.assertURLDecodedTo(_t, "<http://s+t+e%20%20p+h%20+%20ane/>", new String[] { "http://s+t+e%20%20p+h%20+%20ane/", "http://s+t+e++p+h+++ane/" }); } @Test public void testBufferOverflow() throws Exception { final StringBuilder sb = new StringBuilder(); final Random r = LuceneTestCase.random(); for (int i = 0; i < 300; i++) { sb.append((char) 65 + r.nextInt(26)); } this.assertURLDecodedTo(_t, "<" + sb.toString() + ">", new String[] { sb.toString() }); } /* * Helpers */ private void assertURLDecodedTo(final Tokenizer t, final String uri, final String[] expectedStems) throws IOException { this.assertURLDecodedTo(t, "UTF-8", uri, expectedStems, null, null); } private void assertURLDecodedTo(final Tokenizer t, final String encoding, final String uri, final String[] expectedStems) throws IOException { this.assertURLDecodedTo(t, encoding, uri, expectedStems, null, null); } private void assertURLDecodedTo(final Tokenizer t, final String uri, final String[] expectedStems, final String[] expectedTypes, final int[] expectedPosIncr) throws IOException { this.assertURLDecodedTo(t, "UTF-8", uri, expectedStems, expectedTypes, expectedPosIncr); } private void assertURLDecodedTo(final Tokenizer t, final String encoding, final String uri, final String[] expectedStems, final String[] expectedTypes, final int[] expectedPosIncr) throws IOException { assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class)); final TypeAttribute typeAtt = t.getAttribute(TypeAttribute.class); assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class)); final PositionIncrementAttribute posIncrAtt = t.getAttribute(PositionIncrementAttribute.class); t.setReader(new StringReader(uri)); t.reset(); final URIDecodingFilter filter = new URIDecodingFilter(t, encoding); for (int i = 0; i < expectedStems.length; i++) { assertTrue("token " + i + " exists", filter.incrementToken()); assertEquals(expectedStems[i], termAtt.toString()); if (expectedTypes == null) assertEquals(uritype, typeAtt.type()); else assertEquals(expectedTypes[i], typeAtt.type()); if (expectedPosIncr != null) assertEquals(expectedPosIncr[i], posIncrAtt.getPositionIncrement()); } filter.end(); filter.close(); } }