/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sindice.siren.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.junit.Test;
import org.sindice.siren.analysis.AnyURIAnalyzer.URINormalisation;
import org.sindice.siren.analysis.filter.DatatypeAnalyzerFilter;
import org.sindice.siren.analysis.filter.URILocalnameFilter;
import org.sindice.siren.analysis.filter.URINormalisationFilter;
import org.sindice.siren.util.XSDDatatype;

public class TestTupleAnalyzer extends NodeAnalyzerTestCase<TupleAnalyzer> {

  @Override
  protected TupleAnalyzer getNodeAnalyzer() {
    final AnyURIAnalyzer uriAnalyzer = new AnyURIAnalyzer(TEST_VERSION_CURRENT);
    uriAnalyzer.setUriNormalisation(URINormalisation.FULL);
    final TupleAnalyzer tupleAnalyzer = new TupleAnalyzer(TEST_VERSION_CURRENT,
      new StandardAnalyzer(TEST_VERSION_CURRENT), uriAnalyzer);
    tupleAnalyzer.registerDatatype(XSDDatatype.XSD_ANY_URI.toCharArray(), uriAnalyzer);
    return tupleAnalyzer;
  }

  /**
   * Tests LOCALNAME URI normalisation: the word "the" is a stop word and is
   * therefore filtered out. The position increment is updated accordingly,
   * but it is not reset for subsequent calls. Regression test for issue
   * SRN-117.
   * @throws Exception
   */
  @Test
  public void testURINormalisation() throws Exception {
    final AnyURIAnalyzer uriAnalyzer = new AnyURIAnalyzer(TEST_VERSION_CURRENT);
    uriAnalyzer.setUriNormalisation(URINormalisation.LOCALNAME);
    _a = new TupleAnalyzer(TEST_VERSION_CURRENT,
      new StandardAnalyzer(TEST_VERSION_CURRENT), uriAnalyzer);
    _a.registerDatatype(XSDDatatype.XSD_ANY_URI.toCharArray(), uriAnalyzer);
    this.assertAnalyzesTo(_a, "<http://dbpedia.org/resource/The_Kingston_Trio>",
      new String[] { "kingston", "trio", "the_kingston_trio",
                     "http://dbpedia.org/resource/the_kingston_trio" },
      new String[] { "word", "word", "word", "word" },
      new int[] { 2, 1, 0, 0 });
  }
  /**
   * The same test with FULL normalisation: the stop word is now "their"
   * because {@link URINormalisationFilter} filters out words shorter than 4
   * characters (the threshold is 3 in {@link URILocalnameFilter}).
   * @throws Exception
   */
  @Test
  public void testURINormalisation2() throws Exception {
    final AnyURIAnalyzer uriAnalyzer = new AnyURIAnalyzer(TEST_VERSION_CURRENT);
    uriAnalyzer.setUriNormalisation(URINormalisation.FULL);
    _a = new TupleAnalyzer(TEST_VERSION_CURRENT,
      new StandardAnalyzer(TEST_VERSION_CURRENT), uriAnalyzer);
    _a.registerDatatype(XSDDatatype.XSD_ANY_URI.toCharArray(), uriAnalyzer);
    this.assertAnalyzesTo(_a, "<http://dbpedia.org/resource/their_Kingston_Trio>",
      new String[] { "dbpedia", "resource", "kingston", "trio",
                     "http://dbpedia.org/resource/their_kingston_trio" },
      new String[] { "word", "word", "word", "word", "word" },
      new int[] { 1, 1, 2, 1, 0 });
  }

  @Test
  public void testURI() throws Exception {
    this.assertAnalyzesTo(_a, "<http://renaud.delbru.fr/>",
      new String[] { "renaud", "delbru", "http://renaud.delbru.fr" },
      new String[] { "word", "word", "word" });
    this.assertAnalyzesTo(_a, "<http://Renaud.Delbru.fr/>",
      new String[] { "renaud", "delbru", "http://renaud.delbru.fr" },
      new String[] { "word", "word", "word" });
    this.assertAnalyzesTo(_a,
      "<http://renaud.delbru.fr/page.html?query=a+query&hl=en&start=20&sa=N>",
      new String[] { "renaud", "delbru", "page", "html", "query", "query", "start",
                     "http://renaud.delbru.fr/page.html?query=a+query&hl=en&start=20&sa=n" },
      new String[] { "word", "word", "word", "word", "word", "word", "word",
                     "word" });
    this.assertAnalyzesTo(_a, "<mailto:renaud@delbru.fr>",
      new String[] { "renaud", "delbru", "renaud@delbru.fr",
                     "mailto:renaud@delbru.fr" },
      new String[] { "word", "word", "word", "word" });
    this.assertAnalyzesTo(_a, "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>",
      new String[] { "1999", "syntax", "type",
                     "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" },
      new String[] { "word", "word", "word", "word" });
  }

  @Test
  public void testLiteral() throws Exception {
    this.assertAnalyzesTo(_a, "\"foo bar FOO BAR\"",
      new String[] { "foo", "bar", "foo", "bar" },
      new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
    this.assertAnalyzesTo(_a, "\"ABC\\u0061\\u0062\\u0063\\u00E9\\u00e9ABC\"",
      new String[] { "abcabcééabc" },
      new String[] { "<ALPHANUM>" });
  }

  @Test
  public void testLiteral2() throws Exception {
    this.assertAnalyzesTo(_a, "\"Renaud\"",
      new String[] { "renaud" },
      new String[] { "<ALPHANUM>" });
    this.assertAnalyzesTo(_a, "\"1 and 2\"",
      new String[] { "1", "2" },
      new String[] { "<NUM>", "<NUM>" });
    this.assertAnalyzesTo(_a, "\"renaud http://test/ \"",
      new String[] { "renaud", "http", "test" },
      new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
    this.assertAnalyzesTo(_a, "\"foo bar FOO BAR\"",
      new String[] { "foo", "bar", "foo", "bar" },
      new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
    this.assertAnalyzesTo(_a, "\"ABC\\u0061\\u0062\\u0063\\u00E9\\u00e9ABC\"",
      new String[] { "abcabcééabc" },
      new String[] { "<ALPHANUM>" });
  }
  /**
   * The datatype "en" is not registered, so {@link DatatypeAnalyzerFilter}
   * throws an {@link IOException}.
   */
  @Test(expected=IOException.class)
  public void testLanguage() throws Exception {
    this.assertAnalyzesTo(_a, "\"test test2\"@en",
      new String[] { "test test2" },
      new String[] { TupleTokenizer.getTokenTypes()[TupleTokenizer.LITERAL] });
  }

  /**
   * Registers analyzers for the "en" and "fr" datatypes.
   */
  @Test
  public void testLanguage2() throws Exception {
    _a.registerDatatype("en".toCharArray(), new StandardAnalyzer(TEST_VERSION_CURRENT));
    _a.registerDatatype("fr".toCharArray(), new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
    this.assertAnalyzesTo(_a, "\"Test Test2\"@en <aaa> \"Test Test2\"@fr",
      new String[] { "test", "test2", "aaa", "Test", "Test2" },
      new String[] { "<ALPHANUM>", "<ALPHANUM>", "word", "word", "word" });
    _a.clearDatatypes();
  }

  @Test
  public void testAlreadyRegisteredAnalyzer() throws Exception {
    _a.registerDatatype("en".toCharArray(), new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
    // this analyzer is not used, as the datatype "en" is already mapped to an analyzer
    _a.registerDatatype("en".toCharArray(), new StandardAnalyzer(TEST_VERSION_CURRENT));
    this.assertAnalyzesTo(_a, "\"Test tesT2\"@en",
      new String[] { "Test", "tesT2" },
      new String[] { "word", "word" });
    _a.clearDatatypes();
  }

  @Test
  public void testBNodeFiltering() throws Exception {
    this.assertAnalyzesTo(_a, "_:b123 <aaa> <bbb> _:b212",
      new String[] { "aaa", "bbb" },
      new String[] { "word", "word" });
  }

  /**
   * Tests that tokenization resumes after a token has been filtered out.
   * @throws Exception
   */
  @Test
  public void testBNodeFiltering2() throws Exception {
    this.assertAnalyzesTo(_a, "_:b123 <http://renaud.delbru.fr/> _:b212 \"bbb rrr\"",
      new String[] { "renaud", "delbru", "http://renaud.delbru.fr", "bbb", "rrr" },
      new String[] { "word", "word", "word", "<ALPHANUM>", "<ALPHANUM>" });
  }

  /**
   * In Lucene 4.0, the position increment behaviour changed: the first token
   * is no longer allowed to have a position increment of 0.
   * @throws Exception
   */
  @Test
  public void testFirstPosInc() throws Exception {
    this.assertAnalyzesTo(_a, "<aaa>",
      new String[] { "aaa" },
      new String[] { "word" },
      new int[] { 1 });
  }

}
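
/*
 * A minimal usage sketch, not part of the original test suite: it shows how a
 * TupleAnalyzer configured like getNodeAnalyzer() above might be handed to a
 * plain Lucene 4.0 IndexWriter. The directory, field name ("tuple"), example
 * URI and literal are assumptions made for illustration only; a real SIREn
 * application would index through the project's own field and codec setup
 * rather than a bare TextField.
 */
class TupleAnalyzerUsageSketch {

  static void index() throws IOException {
    final org.apache.lucene.util.Version version =
      org.apache.lucene.util.Version.LUCENE_40;

    // Configure the analyzer exactly as getNodeAnalyzer() does above.
    final AnyURIAnalyzer uriAnalyzer = new AnyURIAnalyzer(version);
    uriAnalyzer.setUriNormalisation(URINormalisation.FULL);
    final TupleAnalyzer analyzer = new TupleAnalyzer(version,
      new StandardAnalyzer(version), uriAnalyzer);
    analyzer.registerDatatype(XSDDatatype.XSD_ANY_URI.toCharArray(), uriAnalyzer);

    // Feed one tuple to an in-memory index; the analyzer splits it into the
    // URI and literal tokens exercised by the tests above.
    final org.apache.lucene.store.Directory dir =
      new org.apache.lucene.store.RAMDirectory();
    final org.apache.lucene.index.IndexWriter writer =
      new org.apache.lucene.index.IndexWriter(dir,
        new org.apache.lucene.index.IndexWriterConfig(version, analyzer));
    try {
      final org.apache.lucene.document.Document doc =
        new org.apache.lucene.document.Document();
      doc.add(new org.apache.lucene.document.TextField("tuple",
        "<http://example.org/s> \"a literal\"",
        org.apache.lucene.document.Field.Store.NO));
      writer.addDocument(doc);
    } finally {
      writer.close();
    }
  }

}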