/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on May 7, 2014 */ package com.bigdata.search; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.analysis.de.GermanAnalyzer; import org.apache.lucene.analysis.ru.RussianAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions; /** * Unit tests for {@link ConfigurableAnalyzerFactory}. * We use the same setup, as defined in {@link #getExtraProperties()} * for all the tests. Some of the tests check whether bad combinations * of options are detected and reported correctly. * Others check that some input, in a particular language is * tokenized as expected. * @author jeremycarroll * */ public class TestConfigurableAnalyzerFactory extends AbstractAnalyzerFactoryTest { public TestConfigurableAnalyzerFactory() { } public TestConfigurableAnalyzerFactory(String arg0) { super(arg0); } @Override String[] getExtraProperties() { String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER; return new String[]{ FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(), analyzer+"_."+AnalyzerOptions.LIKE, "x-empty", analyzer+"x-empty."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(), analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+", analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(), analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE, analyzer+"x-splits."+AnalyzerOptions.WORD_BOUNDARY, " ", analyzer+"x-splits."+AnalyzerOptions.SUB_WORD_BOUNDARY, "(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", analyzer+"x-hyphen."+AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]", analyzer+"x-hyphen."+AnalyzerOptions.SOFT_HYPHENS, "-", analyzer+"x-hyphen."+AnalyzerOptions.WORD_BOUNDARY, " ", analyzer+"x-hyphen."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "false", analyzer+"x-hyphen2."+AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]", analyzer+"x-hyphen2."+AnalyzerOptions.SOFT_HYPHENS, "-", analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ", analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true", analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(), analyzer+"en-x-de."+AnalyzerOptions.ANALYZER_CLASS, StandardAnalyzer.class.getName(), analyzer+"en-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(), }; } private void badCombo(String errorMessage, String ... props) { // Check that some combination of properties on a language create an error String myProps[] = new String[props.length+4]; int i=0; for (; i<props.length;i+=2) { myProps[i] = ConfigurableAnalyzerFactory.Options.ANALYZER + "x-testme." + props[i]; myProps[i+1] = props[i+1]; } myProps[i] = ConfigurableAnalyzerFactory.Options.ANALYZER + "_." + AnalyzerOptions.ANALYZER_CLASS; myProps[i+1] = EmptyAnalyzer.class.getName(); myProps[i+2] = FullTextIndex.Options.ANALYZER_FACTORY_CLASS; myProps[i+3] = ConfigurableAnalyzerFactory.class.getName(); try { this.createFullTextIndex("test-in-error"+getName(), myProps).getAnalyzer("en",true); } catch (RuntimeException e) { Throwable t = e; while (t.getCause() != null) { t = t.getCause(); } assertTrue(t.getMessage(),t.getMessage().contains(errorMessage)); return; } fail("No error detected"); } public void testBadLike() { badCombo("en-us-x-banana",AnalyzerOptions.LIKE,"en-us-x-banana"); } public void testMissingClass() { badCombo("exactly one",AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT); } public void testLikeAndClass() { badCombo("exactly one",AnalyzerOptions.LIKE,"*", AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName()); } public void testLikeAndStopwords() { badCombo("stopwords",AnalyzerOptions.LIKE,"*", AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT); } public void testCantAlwaysHaveStopWords() { badCombo("not supported", AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(), AnalyzerOptions.STOPWORDS,StandardAnalyzer.class.getName() ); } public void testCantAlwaysHaveDefaultStopWords() { badCombo("not supported", AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(), AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT ); } public static class NoStopWordsAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName) { throw new UnsupportedOperationException(); } } public void testCantFindStopWords() { badCombo("find", AnalyzerOptions.ANALYZER_CLASS, GermanAnalyzer.class.getName(), AnalyzerOptions.STOPWORDS, NoStopWordsAnalyzer.class.getName() ); } public void testEmptyAnalyzer() throws IOException { comparisonTest("en", false, "The fast car arrived slowly.", "" ); } public void testStopWordSwitch() throws IOException { // en-x-de is an English Analyzer using german stopwords! comparisonTest("en-x-de", true, "The fast car arrived slowly.", "the fast car arrived slowly" ); comparisonTest("en-x-de", true, "The fast car die arrived slowly.", "the fast car arrived slowly" ); comparisonTest("en-x-de", false, "The fast car die arrived slowly.", "the fast car die arrived slowly" ); } public void testSyapseExample1() throws IOException { comparisonTest("x-splits", true, "ADENOCARCINOMA OF LUNG, SOMATIC [ERBB2, INS/DUP, NT2322]", "ADENOCARCINOMA OF LUNG, SOMATIC [ERBB2, ERBB2, INS/DUP, DUP, NT2322]" ); } public void testSyapseExample2() throws IOException { comparisonTest("x-splits", true, "\u2265\u2265\u22653-11.13-11.1", "\u2265\u2265\u22653-11.13-11.1 3-11.13-11.1 11.13-11.1 13-11.1 11.1 1" ); } public void testSyapseExample4() throws IOException { comparisonTest("x-splits", true, "\u00b1-ACE3.1.1", "\u00b1-ACE3.1.1 ACE3.1.1 1.1 1" ); } public void testSyapseExample3() throws IOException { comparisonTest("x-splits", true, "2,2,3-trimethylbutane", "2,2,3-trimethylbutane 2,3-trimethylbutane 3-trimethylbutane trimethylbutane" ); } public void testSyapseExample5() throws IOException { comparisonTest("x-splits", true, "CD8_alpha-low Langerhans cell", "CD8_alpha-low alpha-low low Langerhans cell" ); } public void testSyapseExample6() throws IOException { comparisonTest("x-splits", true, "6-Monoacetylmorphine:Mass Content:Point in time:Meconium:Quantitative", "6-Monoacetylmorphine:Mass Monoacetylmorphine:Mass Mass Content:Point Point in time:Meconium:Quantitative Meconium:Quantitative Quantitative" ); } public void testSyapseExample7() throws IOException { comparisonTest("x-splits", true, "N,N-dimethyl", "N,N-dimethyl N-dimethyl dimethyl" ); } public void testSyapseExample8() throws IOException { comparisonTest("x-hyphen", true, "\u00b1-ACE3.1.1 ab-bc.cd-de", "\u00b1ACE3.1.1 \u00b1-ACE3.1.1 ACE3.1.1 1.1 1 abbc.cdde ab-bc.cd-de bc.cdde bc.cd-de cdde cd-de de" ); } public void testSyapseExample9() throws IOException { comparisonTest("x-hyphen2", true, "\u00b1-ACE3.1.1 ab-bc.cd-de", "\u00b1ACE3.1.1 ACE3.1.1 1.1 1 abbc.cdde bc.cdde cdde de" ); } }