package org.xbib.elasticsearch.index.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.inject.Injector; import org.elasticsearch.common.inject.ModulesBuilder; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.SettingsModule; import org.elasticsearch.env.Environment; import org.elasticsearch.env.EnvironmentModule; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexNameModule; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.analysis.AnalysisModule; import org.elasticsearch.index.analysis.AnalysisService; import org.elasticsearch.index.settings.IndexSettingsModule; import org.elasticsearch.indices.analysis.IndicesAnalysisService; import org.junit.Test; import org.xbib.elasticsearch.plugin.analysis.standardnumber.StandardnumberPlugin; import java.io.IOException; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; public class StandardNumberAnalysisTests { @Test public void testStandardNumberAnalyzer() throws IOException { NamedAnalyzer namedAnalyzer = createAnalysisService().analyzer("standardnumber"); String[] expected = new String[]{ "Die", "ISBN", "von", "Elasticsearch", "in", "Action", "lautet", "9781617291623", "EAN 9781617291623", "GTIN 9781617291623", "978-1-61729-162-3" }; assertSimpleTSOutput(namedAnalyzer.tokenStream("content", "Die ISBN von Elasticsearch in Action lautet 9781617291623"), expected); } @Test public void testPunctuation() throws IOException { NamedAnalyzer namedAnalyzer = createAnalysisService().analyzer("standardnumber"); String[] expected = new String[]{ "ISBN:", "978-3-12-606004-2.", "GTIN 9783126060042", "978-3-12-606004-2", "9783126060042" }; assertSimpleTSOutput(namedAnalyzer.tokenStream("content", "ISBN: 978-3-12-606004-2."), expected); } /** * Avoid nested ZDB-ID in ISBN like this * 3826628225 * 978-3-8266-2822-1 * 9783826628221 * ZDB 826-6 * ZDB 8266 * by boundary matching for ZDBID. * * @throws IOException */ @Test public void testISBNWithEmbeddedZDB() throws IOException { NamedAnalyzer namedAnalyzer = createAnalysisService().analyzer("standardnumber"); String[] expected = new String[]{ "ISBN", "3-8266-2822-5", "GTIN 3826628225", "3826628225", "978-3-8266-2822-1", "9783826628221" }; assertSimpleTSOutput(namedAnalyzer.tokenStream("content", "ISBN 3-8266-2822-5"), expected); } /** * "linux" is not an ISSN * * @throws IOException */ @Test public void testNonISSN() throws IOException { NamedAnalyzer namedAnalyzer = createAnalysisService().analyzer("standardnumber"); String[] expected = new String[]{ "linux" }; assertSimpleTSOutput(namedAnalyzer.tokenStream("content", "linux"), expected); } private AnalysisService createAnalysisService() { Settings settings = Settings.settingsBuilder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put("path.home", System.getProperty("path.home")) .build(); Index index = new Index("test"); Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings), new EnvironmentModule(new Environment(settings))) .createInjector(); AnalysisModule analysisModule = new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class)); new StandardnumberPlugin(settings).onModule(analysisModule); Injector injector = new ModulesBuilder().add( new IndexSettingsModule(index, settings), new IndexNameModule(index), analysisModule) .createChildInjector(parentInjector); return injector.getInstance(AnalysisService.class); } private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException { stream.reset(); CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); assertNotNull(termAttr); int i = 0; while (stream.incrementToken()) { assertTrue(i < expected.length); assertEquals(expected[i++], termAttr.toString()); } assertEquals(expected.length, i); stream.close(); } }