/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.analysis.filter;
import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Random;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
import org.sindice.siren.analysis.TupleTokenizer;
public class TestURIEncodingFilter extends LuceneTestCase {
private final String uritype = TupleTokenizer.getTokenTypes()[TupleTokenizer.URI];
private final Tokenizer _t = new TupleTokenizer(new StringReader(""));
/**
* Check if an URI with no URL encoded characters is left as it is
* @throws Exception
*/
@Test
public void testNoURLEncodedCharacters()
throws Exception {
this.assertURLDecodedTo(_t, "<http://stephane.net>", new String[] { "http://stephane.net" });
}
/**
* Check if special characters are correctly decoded and if the filters produces the two stems of the URI
* @throws Exception
*/
@Test
public void testSpecialcharacters()
throws Exception {
this.assertURLDecodedTo(_t, "<http://stephane.net/%32%21Space%21space>",
new String[] { "http://stephane.net/%32%21Space%21space", "http://stephane.net/2!Space!space" });
this.assertURLDecodedTo(_t, "<http://stephane.net/%57%68%4F%61%72%65%79%6f%75%3F>",
new String[] { "http://stephane.net/%57%68%4F%61%72%65%79%6f%75%3F", "http://stephane.net/WhOareyou?" });
// We does not decode space
this.assertURLDecodedTo(_t, "<http://stephane.net/%57%68%4F+%61%72%65+%79%6f%75%20%3F>",
new String[] { "http://stephane.net/%57%68%4F+%61%72%65+%79%6f%75%20%3F", "http://stephane.net/WhO+are+you+?" });
}
/**
* Check if the boundaries of the internal buffers are correct.
* @throws Exception
*/
@Test
public void testLongURLEncodedChain()
throws Exception {
this.assertURLDecodedTo(_t, "<deus%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40>",
new String[] { "deus%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40",
"deus@@@@@@@@@@@@@@@@@@@@" });
this.assertURLDecodedTo(_t, "<deus%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40+%3f>",
new String[] { "deus%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40%40+%3f",
"deus@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@+?" });
final String looong = "%20%21%22%23%24%25%26%27%28%29%2a%2b%2c%2d%2e%2f%30" +
"%31%32%33%34%35%36%37%38%39%3a%3b%3c%3d%3e%3f%40%41%42%43%44%45%46%47" +
"%48%49%4a%4b%4c%4d%4e%4f%50%51%52%53%54%55%56%57%58%59%5a%5b%5c%5d%5e" +
"%5f%60%61%62%63%64%65%66%67%68%69%6a%6b%6c%6d%6e%6f%70%71%72%73%74%75" +
"%76%77%78%79%7a%7b%7c%7d%7e";
final String decLooong = "+!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOP" +
"QRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
this.assertURLDecodedTo(_t, "<" + looong + looong + looong + looong + ">",
new String[]{ looong + looong + looong + looong,
decLooong + decLooong + decLooong + decLooong });
this.assertURLDecodedTo(_t, "<" + looong + looong + looong + looong + looong + looong + looong + looong + ">",
new String[]{ looong + looong + looong + looong + looong + looong + looong + looong,
decLooong + decLooong + decLooong + decLooong + decLooong + decLooong + decLooong + decLooong });
}
/**
* Check that badly encoded URL characters are left as they are. The decoding continue
* nonetheless on the rest of the URI
* @throws Exception
*/
@Test
public void testWronglyEncodedCharacters()
throws Exception {
this.assertURLDecodedTo(_t, "<http://stephane.net/%>",
new String[] { "http://stephane.net/%", "http://stephane.net/%" });
this.assertURLDecodedTo(_t, "<http://stephane.net/%8>",
new String[] { "http://stephane.net/%8", "http://stephane.net/%8" });
this.assertURLDecodedTo(_t, "<http://stephane.net/%%3f>",
new String[] { "http://stephane.net/%%3f", "http://stephane.net/%%3f" });
this.assertURLDecodedTo(_t, "<http://stephane.net/%GGporco>",
new String[] { "http://stephane.net/%GGporco", "http://stephane.net/%GGporco" });
this.assertURLDecodedTo(_t, "<http://stephane.net/%G3porco%2erosso>",
new String[] { "http://stephane.net/%G3porco%2erosso", "http://stephane.net/%G3porco.rosso" });
}
/**
* Test bad Charset name
* @throws Exception
*/
@Test(expected=UnsupportedCharsetException.class)
public void testUnsupportedCharset()
throws Exception {
this.assertURLDecodedTo(_t, "FTU_8", "", new String[] {});
}
/**
* Test where {@link URIDecodingFilter#hexaToInt} return a negative value
* @throws Exception
*/
@Test
public void testBadHexadecimalNumber()
throws Exception {
this.assertURLDecodedTo(_t, "<http://stephane%3f%FGnet/>", new String[] { "http://stephane%3f%FGnet/", "http://stephane?%FGnet/" });
}
/**
* Test a sequence of tokens with different types.
* @throws Exception
*/
@Test
public void testDifferentTypes()
throws Exception {
this.assertURLDecodedTo(_t, "<stephane%3Fnet/> \"A literal !!!!\" <porco%2erosso>",
new String[] { "stephane%3Fnet/", "stephane?net/", "A literal !!!!", "porco%2erosso", "porco.rosso" },
new String[] { uritype, uritype, TupleTokenizer.getTokenTypes()[TupleTokenizer.LITERAL], uritype, uritype },
new int[] { 1, 0, 1, 1, 0 });
}
@Test
public void testSpaces()
throws Exception {
this.assertURLDecodedTo(_t, "<http://s+t+e%20%20p+h%20+%20ane/>", new String[] { "http://s+t+e%20%20p+h%20+%20ane/",
"http://s+t+e++p+h+++ane/" });
}
@Test
public void testBufferOverflow()
throws Exception {
final StringBuilder sb = new StringBuilder();
final Random r = LuceneTestCase.random();
for (int i = 0; i < 300; i++) {
sb.append((char) 65 + r.nextInt(26));
}
this.assertURLDecodedTo(_t, "<" + sb.toString() + ">", new String[] { sb.toString() });
}
/*
* Helpers
*/
private void assertURLDecodedTo(final Tokenizer t, final String uri, final String[] expectedStems)
throws IOException {
this.assertURLDecodedTo(t, "UTF-8", uri, expectedStems, null, null);
}
private void assertURLDecodedTo(final Tokenizer t, final String encoding, final String uri, final String[] expectedStems)
throws IOException {
this.assertURLDecodedTo(t, encoding, uri, expectedStems, null, null);
}
private void assertURLDecodedTo(final Tokenizer t, final String uri, final String[] expectedStems, final String[] expectedTypes, final int[] expectedPosIncr)
throws IOException {
this.assertURLDecodedTo(t, "UTF-8", uri, expectedStems, expectedTypes, expectedPosIncr);
}
private void assertURLDecodedTo(final Tokenizer t, final String encoding, final String uri, final String[] expectedStems, final String[] expectedTypes, final int[] expectedPosIncr)
throws IOException {
assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
final TypeAttribute typeAtt = t.getAttribute(TypeAttribute.class);
assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
final PositionIncrementAttribute posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
t.setReader(new StringReader(uri));
t.reset();
final URIDecodingFilter filter = new URIDecodingFilter(t, encoding);
for (int i = 0; i < expectedStems.length; i++) {
assertTrue("token " + i + " exists", filter.incrementToken());
assertEquals(expectedStems[i], termAtt.toString());
if (expectedTypes == null)
assertEquals(uritype, typeAtt.type());
else
assertEquals(expectedTypes[i], typeAtt.type());
if (expectedPosIncr != null)
assertEquals(expectedPosIncr[i], posIncrAtt.getPositionIncrement());
}
filter.end();
filter.close();
}
}