/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.analysis.filter;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.junit.Test;
import org.sindice.siren.analysis.TupleTokenizer;
public class TestURILocalnameFilter {
private int MAX_LENGTH = URILocalnameFilter.DEFAULT_MAX_LENGTH;
private final Tokenizer _t = new TupleTokenizer(new StringReader(""));
public void assertNormalisesTo(final Tokenizer t, final String input,
final String[] expected)
throws Exception {
this.assertNormalisesTo(t, input, expected, null);
}
public void assertNormalisesTo(final Tokenizer t, final String input,
final String[] expectedImages,
final String[] expectedTypes)
throws Exception {
this.assertNormalisesTo(t, input, expectedImages, expectedTypes, null);
}
public void assertNormalisesTo(final Tokenizer t, final String input,
final String[] expectedImages,
final String[] expectedTypes,
final int[] expectedPosIncrs)
throws Exception {
assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = null;
if (expectedTypes != null) {
assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
typeAtt = t.getAttribute(TypeAttribute.class);
}
PositionIncrementAttribute posIncrAtt = null;
if (expectedPosIncrs != null) {
assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
}
t.setReader(new StringReader(input));
t.reset();
final URILocalnameFilter filter = new URILocalnameFilter(t);
filter.setMaxLength(MAX_LENGTH);
for (int i = 0; i < expectedImages.length; i++) {
assertTrue("token "+i+" exists", filter.incrementToken());
assertEquals(expectedImages[i], termAtt.toString());
if (expectedTypes != null) {
assertEquals(expectedTypes[i], typeAtt.type());
}
if (expectedPosIncrs != null) {
assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
}
}
assertFalse("end of stream", filter.incrementToken());
filter.end();
filter.close();
}
@Test
public void testURI()
throws Exception {
this.assertNormalisesTo(_t, "<http://renaud.delbru.fr/>",
new String[] { "http://renaud.delbru.fr/" },
new String[] { "<URI>" });
this.assertNormalisesTo(_t, "<http://renaud.delbru.fr/rdf/foaf#renaud>",
new String[] { "renaud", "http://renaud.delbru.fr/rdf/foaf#renaud" },
new String[] { "<URI>", "<URI>" });
// too short localname, filtered out
this.assertNormalisesTo(_t, "<http://renaud.delbru.fr/rdf/foaf#me>",
new String[] { "http://renaud.delbru.fr/rdf/foaf#me" },
new String[] { "<URI>" });
// Tokenise on upper case
this.assertNormalisesTo(_t, "<http://renaud.delbru.fr/rdf/uppercaseShouldBeTokenised>",
new String[] { "uppercase", "Should", "Tokenised", "uppercaseShouldBeTokenised", "http://renaud.delbru.fr/rdf/uppercaseShouldBeTokenised" });
this.assertNormalisesTo(_t, "<http://renaud.delbru.fr/rdf/AVeryLongLocalnameWithMoreThan64CharactersThatShouldNotBeTokenised>",
new String[] { "AVeryLongLocalnameWithMoreThan64CharactersThatShouldNotBeTokenised", "http://renaud.delbru.fr/rdf/AVeryLongLocalnameWithMoreThan64CharactersThatShouldNotBeTokenised" });
final String triple = "<http://dbpedia.org/resource/The_Kingston_Trio> " +
"<http://purl.org/dc/terms/subject> " +
"<http://dbpedia.org/resource/Category:Decca_Records_artists>";
this.assertNormalisesTo(_t, triple,
new String[] { "The", "Kingston", "Trio", "The_Kingston_Trio", "http://dbpedia.org/resource/The_Kingston_Trio",
"subject", "http://purl.org/dc/terms/subject",
"Category", "Decca", "Records", "artists", "Category:Decca_Records_artists", "http://dbpedia.org/resource/Category:Decca_Records_artists" },
new String[] { "<URI>", "<URI>", "<URI>", "<URI>", "<URI>",
"<URI>", "<URI>",
"<URI>", "<URI>", "<URI>", "<URI>", "<URI>", "<URI>" },
new int[] { 1, 1, 1, 0, 0,
1, 0,
1, 1, 1, 1, 0, 0 });
}
@Test
public void testOpenCycURI()
throws Exception {
this.assertNormalisesTo(_t, "<http://sw.opencyc.org/concept/Mx4ri_sbFDVGEdaAAACgydogAg>",
new String[] { "Mx4ri", "Eda", "Cgydog", "Mx4ri_sbFDVGEdaAAACgydogAg", "http://sw.opencyc.org/concept/Mx4ri_sbFDVGEdaAAACgydogAg" },
new String[] { "<URI>", "<URI>", "<URI>", "<URI>", "<URI>" });
this.assertNormalisesTo(_t, "<http://sw.opencyc.org/concept/Mx4rpZ2oIm5SEdqAAAACs71DGQ>",
new String[] { "Mx4rp", "Z2o", "Im5", "Edq", "Cs71", "Mx4rpZ2oIm5SEdqAAAACs71DGQ", "http://sw.opencyc.org/concept/Mx4rpZ2oIm5SEdqAAAACs71DGQ" },
new String[] { "<URI>", "<URI>", "<URI>", "<URI>", "<URI>", "<URI>", "<URI>" });
this.assertNormalisesTo(_t, "<http://sw.opencyc.org/concept/Mx4r7FpweNCOQdiMucbWDv61HQ>",
new String[] { "Mx4r7", "Fpwe", "Qdi", "Mucb", "Dv61", "Mx4r7FpweNCOQdiMucbWDv61HQ", "http://sw.opencyc.org/concept/Mx4r7FpweNCOQdiMucbWDv61HQ" },
new String[] { "<URI>", "<URI>", "<URI>", "<URI>", "<URI>", "<URI>", "<URI>" });
}
@Test
public void testOpenCycURIWithMaxLength()
throws Exception {
MAX_LENGTH = 20;
this.assertNormalisesTo(_t, "<http://sw.opencyc.org/concept/Mx4ri_sbFDVGEdaAAACgydogAg>",
new String[] { "Mx4ri_sbFDVGEdaAAACgydogAg", "http://sw.opencyc.org/concept/Mx4ri_sbFDVGEdaAAACgydogAg" });
this.assertNormalisesTo(_t, "<http://sw.opencyc.org/concept/Mx4rpZ2oIm5SEdqAAAACs71DGQ>",
new String[] { "Mx4rpZ2oIm5SEdqAAAACs71DGQ", "http://sw.opencyc.org/concept/Mx4rpZ2oIm5SEdqAAAACs71DGQ" });
this.assertNormalisesTo(_t, "<http://sw.opencyc.org/concept/Mx4r7FpweNCOQdiMucbWDv61HQ>",
new String[] { "Mx4r7FpweNCOQdiMucbWDv61HQ", "http://sw.opencyc.org/concept/Mx4r7FpweNCOQdiMucbWDv61HQ" });
}
@Test
public void testPosInc()
throws Exception {
this.assertNormalisesTo(_t, "<http://example.org/schema/age>",
new String[] { "age", "http://example.org/schema/age" },
new String[] { "<URI>", "<URI>" },
new int[] { 1,0 });
this.assertNormalisesTo(_t, "<http://example.org/schema/me>",
new String[] { "http://example.org/schema/me" },
new String[] { "<URI>" },
new int[] { 1 });
this.assertNormalisesTo(_t, "<http://rdf.data-vocabulary.org/#startDate>",
new String[] { "start", "Date", "startDate", "http://rdf.data-vocabulary.org/#startDate" },
new String[] { "<URI>", "<URI>", "<URI>", "<URI>" },
new int[] { 1, 1, 0, 0 });
}
}