/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.analysis; import monty.solr.util.MontySolrQueryTestCase; import monty.solr.util.MontySolrSetup; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; import java.io.File; import java.io.IOException; import org.junit.BeforeClass; /** * Tests that the fulltext is parsed properly, the ads_text type * is not as simple as it seems * * The ads_text has several tasks to do: * * 1) normalize the input text, ie. token -foo becomes token-foo * this is done through a series of pattern replace filters * 2) use WordDelimiterFilterFactory to split words (ie. all-sky) * 3) discover synonyms (and we have several families of synonyms) * - multi-token: search case insensitively * - acronyms: search case sensitively * - single-token: search case insensitively * Each of the newly discovered tokens is *inserted* into the * document, we take care to preserve also the original token * Synonyms have prefix 'syn::' and acronyms 'acr::' * 4) remove stopwords * 5) normalization (lowercase etc) * * * * The difficult part with this token type is the presence of synonyms (besides other things) * So, for example in the sentence: * * Mirrors of the hubble space telescope * * We must do different things during indexing and querying * * indexing: mirrors,hubble|hubble space telescope|hst,space,telescope * querying: +mirrors +(hubble space telescope | hst) * * * During the indexing we want to output BOTH the original tokens, as well as their * synonyms. But in the search phase, we only want the synonyms. HOWEVER, we need * the original tokens for the proximity queries, if we indexed 'hubble space telescope' * as one token, we cannot search for 'hubble NEAR telescope' * * The default solr synonym filter is configured for indexing, but it has the ability * to do what we want. Unfortunately, the public API does not allow us to configure * its behaviour (so I made a custom factory, hopefully that can go away). * * * ACRONYMS: * Acronyms are identified IFF they were all UPPERCASE and were present in the * source text. Acronym is indexed in the original form, as well as with prefix 'acr::' * * Example: MIT * Indexed: mit|acr::mit * * But if the source text contains: * * Massachusets Institute of Technology * * It is expanded into: * 0: massachusets|mit|massachusets institute of technology * 1: institute * 2: (null, removed by the stop filter) * 3: technology * * Because the synonym filters IGNORE case, the synonym MIT is emitted as 'mit' * Therefore it cannot be recognized by the Acronym filter (even it it sits after the * synonym filter) * * This has the effect that 'acr::*' will find only documents where the acronym * was in the source (as opposed to synonym expansion) * * * TODO: maybe we can make the FST search with ignoreCase=true, but emit UpperCase * TODO: the analyzer for the synonyms must use the same StopFilters as the query chain * */ public class TestAdsabsTypeFulltextParsing extends MontySolrQueryTestCase { @BeforeClass public static void beforeClass() throws Exception { makeResourcesVisible(Thread.currentThread().getContextClassLoader(), new String[] {MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1", MontySolrSetup.getSolrHome() + "/example/solr/collection1" }); System.setProperty("solr.allow.unsafe.resourceloading", "true"); schemaString = getSchemaFile(); configString = MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1/solrconfig.xml"; initCore(configString, schemaString, MontySolrSetup.getSolrHome() + "/example/solr"); } public static String getSchemaFile() { /* * For purposes of the test, we make a copy of the schema.xml, * and create our own synonym files */ String configFile = MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1/schema.xml"; File newConfig; try { newConfig = duplicateFile(new File(configFile)); // notice 'mond' is a synonym in both synonym files // notice two rows point into 'lunar' - they should be merged, which means // if you searched for 'mond' or 'space', it resolves to 'syn:lunar' // but if you search for lunar, you WILL NOT find 'mond' File simpleTokenSynonymsFile = createTempFile(new String[]{ "moon,moons,luna,lune,mond=>lunar\n" + "stetoscope=>glass\n" + "pace=> lunar\n" + "mhz, khz, terahertz, hertz, gigahertz, kilohertz, megahertz, hertzian, millihertz, microhz, microhertz, submegahertz, millihz, gigahertzs, microherz => mhz\n" + "survey, surveys, surveyed, surveyor, surveying, durchmusterung, surveyors, resurveyed, resurvey, minisurvey, survery, durchmusterungen, nonsurvey, surveyable, relevamientos, surveyof, serveying, unsurveyable, surfey, servey => survey\n" + "source, sources, multisource, sourcing, sourceless, quellen, souce, subsources, radioquellen, souces, soruce, circumsource, soruces, sourse, sourses, subsource, pseudosource, surces, cources, intersource, sourcers, intrasource, sourcefile, scource, souarce, sourceat => source\n" + "faint, fainter, faintest, faintness, faintly, faintward, faintwards, faintening, fiant => faint\n" + "gamma, gammas, amma, gam, gama, gamm, gammar, gammma, gramma, gammaisation => gamma\n" + "radio, radios, nonradio, radioed, radiobereich, adio, miniradio, radido => radio\n" + "pulsars, pulsar, psr, pulser, psrs, pulsare, pulsares, pulars, pulsary, puslsar, interpulsars, pusar, nonpulsar, psro, rontgenpulsare, pulsarlike, pulsarpsr => pulsars\n" + "millisecond, milliseconds, submillisecond, millisec, milliseconde, millesecond, millisekunden, milliseond, millisecnd => millisecond\n" + "fermi, fermilab => fermi\n" }); File multiTokenSynonymsFile = createTempFile(new String[]{ "dynamics\0hubble,dyhu\n" + "hubble\0space\0telescope,HST\n" + "Massachusets\0Institute\0of\0Technology, MIT\n" + "Hubble\0Space\0Microscope, HSM\n" + "ABC,Astrophysics\0Business\0Center\n" + "Astrophysics\0Business\0Commons, ABC\n" + "MOND,modified\0newtonian\0dynamics\n" + "bubble\0pace\0telescope,BPT\n" + "GBT,Green\0bank\0telescope\n" + "gamma\0ray,gammaray,gamma\0rays,gammarays\n" }); replaceInFile(newConfig, "synonyms=\"ads_text_multi.synonyms\"", "synonyms=\"" + multiTokenSynonymsFile.getAbsolutePath() + "\""); replaceInFile(newConfig, "synonyms=\"ads_text_simple.synonyms\"", "synonyms=\"" + simpleTokenSynonymsFile.getAbsolutePath() + "\""); } catch (IOException e) { e.printStackTrace(); throw new IllegalStateException(e.getMessage()); } return newConfig.getAbsolutePath(); } @Override public void setUp() throws Exception { super.setUp(); assertU(adoc("id", "1", "bibcode", "xxxxxxxxxxxx1", "title", "Bílá kobyla skočila přes čtyřista")); assertU(adoc("id", "2", "bibcode", "xxxxxxxxxxxx2", "title", "třicet-tři stříbrných střech")); assertU(adoc("id", "3", "bibcode", "xxxxxxxxxxxx3", "title", "A ještě TřistaTřicetTři stříbrných křepeliček")); assertU(adoc("id", "4", "bibcode", "xxxxxxxxxxxx4", "title", "Mirrors of the hubble space telescope goes home")); assertU(adoc("id", "5", "bibcode", "xxxxxxxxxxxx5", "title", "Mirrors of the HST second")); assertU(adoc("id", "6", "bibcode", "xxxxxxxxxxxx6", "title", "Mirrors of the Hst third")); assertU(adoc("id", "7", "bibcode", "xxxxxxxxxxxx7", "title", "Mirrors of the HubbleSpaceTelescope fourth")); assertU(adoc("id", "8", "bibcode", "xxxxxxxxxxxx8", "title", "Take Massachusets Institute of Technology (MIT)")); assertU(adoc("id", "9", "bibcode", "xxxxxxxxxxxx9", "title", "MIT developed new network protocols")); assertU(adoc("id", "10", "bibcode", "xxxxxxxxxxx10", "title", "No-sky data survey")); assertU(adoc("id", "11", "bibcode", "xxxxxxxxxxx11", "title", "All-sky data survey")); assertU(adoc("id", "12", "bibcode", "xxxxxxxxxxx12", "title", "NoSky data survey")); assertU(adoc("id", "13", "bibcode", "xxxxxxxxxxx13", "title", "AllSky data survey")); assertU(adoc("id", "14", "bibcode", "xxxxxxxxxxx14", "title", "Modified Newtonian Dynamics (MOND): Observational Phenomenology and Relativistic Extensions")); assertU(adoc("id", "15", "bibcode", "xxxxxxxxxxx15", "title", "MOND test")); assertU(adoc("id", "16", "bibcode", "xxxxxxxxxxx16", "title", "mond test")); assertU(adoc("id", "17", "bibcode", "xxxxxxxxxxx17", "title", "bubble pace telescope multi-pace foobar")); assertU(adoc("id", "18", "bibcode", "xxxxxxxxxxx18", "title", "Mirrors of the Hubble fooox Space Telescope")); assertU(adoc("id", "19", "bibcode", "xxxxxxxxxxx19", "title", "BPT MIT")); assertU(adoc("id", "20", "bibcode", "xxxxxxxxxxx20", "title", "bubble pace telescope multi-foo")); assertU(adoc("id", "21", "bibcode", "xxxxxxxxxxx21", "title", "BPT multi-foo")); assertU(adoc("id", "147", "bibcode", "xxxxxxxxxx147", "title", "NAG5-ABCD")); assertU(adoc("id", "148", "bibcode", "xxxxxxxxxx148", "title", "NAG5ABCD")); assertU(adoc("id", "149", "bibcode", "xxxxxxxxxx149", "title", "NAG5 ABCD")); assertU(adoc("id", "150", "bibcode", "xxxxxxxxxx150", "title", "nag5-abcd")); assertU(adoc("id", "151", "bibcode", "xxxxxxxxxx151", "title", "nag5abcd")); assertU(adoc("id", "152", "bibcode", "xxxxxxxxxx152", "title", "nag5 abcd")); assertU(adoc("id", "318", "bibcode", "xxxxxxxxxx318", "title", "creation of a thesaurus", "pub", "creation of a thesaurus")); assertU(adoc("id", "382", "bibcode", "xxxxxxxxxx382", "title", "xhtml <tags> should be <SUB>fooxx</SUB> <xremoved>")); // greek letter should not be a problem, #604 assertU(adoc("id", "400", "bibcode", "xxxxxxxxxx400", "title", "A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$-ray Sources for Radio Millisecond Pulsars")); assertU(adoc("id", "401", "bibcode", "xxxxxxxxxx401", "title", "A 350-MHz GBT Survey of 50 Faint Fermi γ-ray Sources for Radio Millisecond Pulsars")); assertU(adoc("id", "402", "bibcode", "xxxxxxxxxx402", "title", "A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$ ray Sources for Radio Millisecond Pulsars")); assertU(adoc("id", "403", "bibcode", "xxxxxxxxxx403", "title", "A 350-MHz GBT Survey of 50 Faint Fermi γ ray Sources for Radio Millisecond Pulsars")); assertU(commit()); } public void testMultiTokens() throws Exception { //dumpDoc(null, "id", "title"); assertQueryEquals(req("q", "title:\"bubble pace telescope multi-pace foobar\"", "defType", "aqp"), "title:\"bubble (pace syn::lunar) telescope multi (pace syn::lunar) foobar\" " + "title:\"bubble (pace syn::lunar) telescope ? multipace foobar\" " + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? multi (pace syn::lunar) foobar\"~2 " + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? ? multipace foobar\"~3", BooleanQuery.class); assertQ(req("q", "title" + ":\"bubble pace telescope multi-pace foobar\""), "//*[@numFound='1']", "//doc/str[@name='id'][.='17']"); //assertQueryEquals(req("q", "\"NASA grant\"~3 NEAR N*", "defType", "aqp", "qf", "author^1.5 title^1.4 abstract^1.3 all"), // "(((spanNear([abstract:acr::nag5, abstract:5269], 5, true) abstract:acr::nag55269)^1.3) | ((author:nag5 5269, author:nag5 5269, * author:nag5 5 author:nag5 5 * author:nag5)^1.5) | ((spanNear([title:acr::nag5, title:5269], 5, true) title:acr::nag55269)^1.4) | (spanNear([all:acr::nag5, all:5269], 5, true) all:acr::nag55269))", // DisjunctionMaxQuery.class); // UPPER-CASE vs lower-case assertQueryEquals(req("q", "NAG5-ABCD", "defType", "aqp", "df", "title"), "(+title:acr::nag5 +title:acr::abcd) title:acr::nag5abcd", BooleanQuery.class); assertQ(req("q", "NAG5-ABCD", "df", "title"), "//*[@numFound='3']", "//doc/str[@name='id'][.='147']", "//doc/str[@name='id'][.='148']", "//doc/str[@name='id'][.='149']" ); assertQueryEquals(req("q", "nag5-abcd", "defType", "aqp", "df", "title"), "(+title:nag5 +title:abcd) title:nag5abcd", BooleanQuery.class); assertQ(req("q", "nag5-abcd", "df", "title"), "//*[@numFound='6']", "//doc/str[@name='id'][.='147']", "//doc/str[@name='id'][.='148']", "//doc/str[@name='id'][.='149']", "//doc/str[@name='id'][.='150']", "//doc/str[@name='id'][.='151']", "//doc/str[@name='id'][.='152']" ); // ticket #318 assertQueryEquals(req("q", "creation of a thesaurus", "defType", "aqp", "qf", "all title^1.4 pub"), "+(all:creation | pub:creation | (title:creation)^1.4) +pub:of +pub:a +(all:thesaurus | pub:thesaurus | (title:thesaurus)^1.4)", BooleanQuery.class); assertQ(req("q", "pub:of AND pub:a"), "//*[@numFound='1']", "//doc/str[@name='id'][.='318']" ); assertQ(req("q", "creation of a thesaurus", "defType", "aqp", "qf", "title^1.4 all pub"), "//*[@numFound='1']", "//doc/str[@name='id'][.='318']" ); // ticket #320 // in natural language: when searching for MOND, we'll first find the multi-token synonyms // ie. MOND, modified newtonina dynamics // then search for simple synonymes: <find nothing, ie. ignore 'mond'> // MOND is caught by acronym filter, which is configured to eat the original // and the result is made of acronym + synonym + multi-token-synonym // test with a field assertQueryEquals(req("q", "title:MOND", "defType", "aqp"), "title:acr::mond title:syn::acr::mond title:syn::modified newtonian dynamics", BooleanQuery.class); assertQueryEquals(req("q", "title:mond", "defType", "aqp"), "title:mond title:syn::lunar", BooleanQuery.class); assertQueryEquals(req("q", "title:Mond", "defType", "aqp"), "title:mond title:syn::lunar", BooleanQuery.class); // unfielded simple token assertQueryEquals(req("q", "MOND", "defType", "aqp"), "(all:acr::mond all:syn::acr::mond all:syn::modified newtonian dynamics)", BooleanQuery.class); assertQ(req("q", "title" + ":MOND"), "//*[@numFound='2']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']"); assertQueryEquals(req("q", "mond", "defType", "aqp"), "(all:mond all:syn::lunar)", BooleanQuery.class); assertQ(req("q", "title" + ":mond"), "//*[@numFound='5']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']", "//doc/str[@name='id'][.='16']", "//doc/str[@name='id'][.='17']", "//doc/str[@name='id'][.='20']"); assertQueryEquals(req("q", "Mond", "defType", "aqp"), "(all:mond all:syn::lunar)", BooleanQuery.class); assertQ(req("q", "title" + ":Mond"), "//*[@numFound='5']", "//doc/str[@name='id'][.='17']", // orig 'space' -> syn:lunar; look at the synonym file to understand "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']", "//doc/str[@name='id'][.='16']", "//doc/str[@name='id'][.='20']"); // search for 'pace' and find 'mond' (there is intentional error/duplication // in our synonym files - look above) assertQueryEquals(req("q", "title:pace", "defType", "aqp"), "title:pace title:syn::lunar", BooleanQuery.class); assertQ(req("q", "title" + ":pace"), "//*[@numFound='3']", "//doc/str[@name='id'][.='17']", "//doc/str[@name='id'][.='16']", "//doc/str[@name='id'][.='20']"); // search for 'lunar' MUST NOT return 'mond' (because synonyms are explicit =>) // and 'lunar' is not on the left hand side assertQueryEquals(req("q", "title:lunar", "defType", "aqp"), "title:lunar", TermQuery.class); assertQ(req("q", "title" + ":lunar"), "//*[@numFound='0']"); // but 'luna' is a synonym (syn::lunar) assertQueryEquals(req("q", "title:luna", "defType", "aqp"), "title:luna title:syn::lunar", BooleanQuery.class); assertQ(req("q", "title" + ":luna"), "//*[@numFound='3']", "//doc/str[@name='id'][.='17']", "//doc/str[@name='id'][.='16']", "//doc/str[@name='id'][.='20']"); // now the multi-token version assertQueryEquals(req("q", "title:\"modified newtonian dynamics\"", "defType", "aqp"), "title:\"modified newtonian dynamics\"" + " title:syn::acr::mond title:syn::modified newtonian dynamics", BooleanQuery.class); assertQ(req("q", "title" + ":\"modified newtonian dynamics\""), "//*[@numFound='2']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']"); // multi-token. this is truly crazy (several synonyms overlap) // 'bubble pace telescope' is a synonym // 'pace' is a synonym // multi-pace is split by WDFF and expanded with a synonym assertQueryEquals(req("q", "title:\"bubble pace telescope multi-pace foobar\"", "defType", "aqp"), "title:\"bubble (pace syn::lunar) telescope multi (pace syn::lunar) foobar\"" + " title:\"bubble (pace syn::lunar) telescope ? multipace foobar\"" + " title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? multi (pace syn::lunar) foobar\"~2" + " title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? ? multipace foobar\"~3", BooleanQuery.class); assertQ(req("q", "title" + ":\"bubble pace telescope multi-pace foobar\""), "//*[@numFound='1']", "//doc/str[@name='id'][.='17']"); // now the same thing, but not using phrases assertQueryEquals(req("q", "title:modified\\ newtonian\\ dynamics", "defType", "aqp"), "(+title:modified +title:newtonian +title:dynamics) (title:syn::acr::mond title:syn::modified newtonian dynamics)", BooleanQuery.class); assertQ(req("q", "title" + ":modified\\ newtonian\\ dynamics"), "//*[@numFound='2']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']"); // and even unfielded! assertQueryEquals(req("q", "modified\\ newtonian\\ dynamics", "defType", "aqp", "df", "title"), "(+title:modified +title:newtonian +title:dynamics) (title:syn::acr::mond title:syn::modified newtonian dynamics)", BooleanQuery.class); assertQ(req("q", "modified\\ newtonian\\ dynamics", "defType", "aqp", "df", "title"), "//*[@numFound='2']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']"); // lastly - unfielded phrase assertQueryEquals(req("q", "\"modified newtonian dynamics\"", "defType", "aqp", "qf", "title^2.0 all^1.5"), "(((all:\"modified newtonian dynamics\" all:syn::acr::mond all:syn::modified newtonian dynamics))^1.5 | ((title:\"modified newtonian dynamics\" title:syn::acr::mond title:syn::modified newtonian dynamics))^2.0)", DisjunctionMaxQuery.class); assertQ(req("q", "\"modified newtonian dynamics\"", "qf", "title^2.0 all^1.5"), "//*[@numFound='2']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']"); // test of the multi-synonym replacement, phrase handling etc //dumpDoc(null, "title", "recid"); assertQueryEquals(req("q", "title:\"bubble pace telescope multi-foo\"", "defType", "aqp", "df", "title"), "title:\"bubble (pace syn::lunar) telescope multi foo\" " + "title:\"bubble (pace syn::lunar) telescope ? multifoo\" " + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? multi foo\"~2 " + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? ? multifoo\"~3", BooleanQuery.class); assertQ(req("q", "title:\"bubble pace telescope multi-foo\"", "defType", "aqp", "df", "title"), "//*[@numFound='2']", "//doc/str[@name='id'][.='20']", "//doc/str[@name='id'][.='21']"); // wow! this works correctly assertQueryEquals(req("q", "bubble\\ pace\\ telescope\\ and\\ MIT", "defType", "aqp", "df", "title"), "(+title:bubble +(title:pace title:syn::lunar) +title:telescope +(title:acr::mit title:syn::massachusets institute of technology title:syn::acr::mit)) (+(title:syn::bubble pace telescope title:syn::acr::bpt) +(title:acr::mit title:syn::massachusets institute of technology title:syn::acr::mit))", BooleanQuery.class); assertQ(req("q", "bubble\\ pace\\ telescope\\ and\\ MIT", "defType", "aqp", "df", "title"), "//*[@numFound='1']", "//doc/str[@name='id'][.='19']" ); } public void unfieldedSearch() throws Exception { // non-phrase: by default do span search //setDebug(true); assertQueryEquals(req("q", "hubble space telescope", "defType", "aqp", "aqp.unfielded.tokens.strategy", "join", "df", "all"), "all:hubble all:syn::hubble space telescope all:syn::acr::hst all:space all:telescope", BooleanQuery.class); assertQ(req("q", "hubble space telescope"), "//*[@numFound='4']", "//doc/str[@name='id'][.='4']", "//doc/str[@name='id'][.='5']", "//doc/str[@name='id'][.='17']", // to go away after #147 "//doc/str[@name='id'][.='18']" ); // make sure the unfielded search is expanded properly (by edismax) - we use it just here // HOWEVER: maybe it should do expansion inside each clause? now it favors docs with matches in all fields (which is fine) assertQueryEquals(req("q", "hubble space telescope", "defType", "aqp", "qf", "title^2.0 keyword^1.5"), "(((title:hubble title:syn::hubble space telescope title:syn::acr::hst title:space title:telescope)^2.0) " + "| ((keyword:hubble keyword:space keyword:telescope)^1.5))", DisjunctionMaxQuery.class); assertQueryEquals(req("q", "title:(hubble space telescope goes home)", "defType", "aqp", "fl", "recid,title"), //"spanNear([all:hubble, all:space, all:telescope, all:goes, all:home], 5, true) spanNear([spanOr([all:syn::hubble space telescope, all:syn::acr::hst]), all:goes, all:home], 5, true)", "(+title:hubble +title:space +title:telescope +title:goes +title:home) (+(title:syn::hubble space telescope title:syn::acr::hst) +title:goes +title:home)", BooleanQuery.class); assertQ(req("q", "title:(hubble space telescope goes home)"), "//*[@numFound='1']", "//doc/str[@name='id'][.='4']" ); // surrounded by stop words assertQueryEquals(req("q", "title:(mirrors of the hubble space telescope the goes home)", "defType", "aqp"), //"spanNear([all:mirrors, all:hubble, all:space, all:telescope, all:goes, all:home], 5, true)" + //" spanNear([all:mirrors, spanOr([all:syn::hubble space telescope, all:syn::acr::hst]), all:goes, all:home], 5, true)", "(+title:mirrors +title:hubble +title:space +title:telescope +title:goes +title:home) " + "(+title:mirrors +(title:syn::hubble space telescope title:syn::acr::hst) +title:goes +title:home)", BooleanQuery.class); assertQ(req("q", "title:(mirrors of the hubble space telescope goes home)"), "//*[@numFound='1']", "//doc/str[@name='id'][.='4']" ); // surrounded - change default operator (many matches) // TODO: #147 assertQueryEquals(req("q", "title:(mirrors of the hubble space telescope start home)", "defType", "aqp", "q.op", "OR"), "(title:mirrors title:hubble title:space title:telescope title:start title:home) " + "(title:mirrors (title:syn::hubble space telescope title:syn::acr::hst) title:start title:home)", BooleanQuery.class); assertQ(req("q", "title:(mirrors of the hubble space telescope start home)", "q.op", "OR"), "//*[@numFound='6']", "//doc[1]/str[@name='id'][.='4']", // this one is the best match "//doc/str[@name='id'][.='18']", "//doc/str[@name='id'][.='5']", "//doc/str[@name='id'][.='6']", "//doc/str[@name='id'][.='7']", "//doc/str[@name='id'][.='17']" ); assertQueryEquals(req("q", "title:(mirrors of the hubble space telescope goes home)", "defType", "aqp"), //"spanNear([all:mirrors, all:hubble, all:space, all:telescope, all:goes, all:home], 5, true)" + //" spanNear([all:mirrors, spanOr([all:syn::hubble space telescope, all:syn::acr::hst]), all:goes, all:home], 5, true)", "(+title:mirrors +title:hubble +title:space +title:telescope +title:goes +title:home) " + "(+title:mirrors +(title:syn::hubble space telescope title:syn::acr::hst) +title:goes +title:home)", BooleanQuery.class); assertQ(req("q", "title:(mirrors of the hubble space telescope goes home)"), "//*[@numFound='1']", "//doc/str[@name='id'][.='4']" ); // different modifier (synonym must not be found) assertQueryEquals(req("q", "hubble space -telescope", "defType", "aqp"), "+(all:hubble all:space) -all:telescope", BooleanQuery.class); // different field assertQueryEquals(req("q", "hubble space title:telescope", "defType", "aqp"), "+(all:hubble all:space) +title:telescope", BooleanQuery.class); assertQueryEquals(req("q", "hubble space telescope +star", "defType", "aqp"), //"+(all:hubble space telescope all:acr::hst) +all:star", "+(all:hubble all:syn::hubble space telescope all:syn::acr::hst all:space all:telescope) +all:star", BooleanQuery.class); } public void testNoSynChain() throws Exception { // simple case: synonyms deactivated assertQueryEquals(req("q", "=title:\"Hubble Space Telescope\"", "defType", "aqp"), "title:\"hubble space telescope\"", PhraseQuery.class); assertQ(req("q", "=title:\"Hubble Space Telescope\""), "//*[@numFound='1']", "//doc/str[@name='id'][.='4']" ); //setDebug(true); assertQueryEquals(req("q", "=\"Hubble Space Telescope\"", "defType", "aqp", "qf", "body title"), "(body:\"hubble space telescope\" | title:\"hubble space telescope\")", DisjunctionMaxQuery.class); } public void testSynonyms() throws Exception { /* * Test multi-token translation, the chain is set to recognize * synonyms. So even if the query string is split into 3 tokens, * we are able to join them and find their synonym (HST) * */ // simple case assertQueryEquals(req("q", "title:\"hubble space telescope\"", "defType", "aqp"), "title:\"hubble space telescope\" title:syn::hubble space telescope title:syn::acr::hst", BooleanQuery.class); assertQ(req("q", "title:\"hubble space telescope\""), "//*[@numFound='2']", "//doc/str[@name='id'][.='4']", "//doc/str[@name='id'][.='5']"); // preceded by something // TODO: remove 'title:' after #147 is solved assertQueryEquals(req("q", "title:\"mirrors of the hubble space telescope\"", "defType", "aqp"), "title:\"mirrors hubble space telescope\"" + " title:\"mirrors (syn::hubble space telescope syn::acr::hst)\"", BooleanQuery.class); assertQ(req("q", "title:\"mirrors hubble space telescope\""), "//*[@numFound='2']", "//doc/str[@name='id'][.='4']", "//doc/str[@name='id'][.='5']" ); assertQ(req("q", "title:\"mirrors of the hubble space telescope\""), "//*[@numFound='2']", "//doc/str[@name='id'][.='4']", "//doc/str[@name='id'][.='5']" ); assertQ(req("q", "title:\"mirrors of the hubble space scope\""), "//*[@numFound='0']" ); // query followed by something assertQueryEquals(req("q", "title:\"hubble space telescope goes home\"", "defType", "aqp"), "title:\"hubble space telescope goes home\"" + " title:\"(syn::hubble space telescope syn::acr::hst) ? ? goes home\"~2", BooleanQuery.class); assertQ(req("q", "title:\"hubble space telescope goes home\""), "//*[@numFound='1']", "//doc/str[@name='id'][.='4']" ); // surrounded by something assertQueryEquals(req("q", "title:\"mirrors of the hubble space telescope goes home\"", "defType", "aqp"), "title:\"mirrors hubble space telescope goes home\"" + " title:\"mirrors (syn::hubble space telescope syn::acr::hst) ? ? goes home\"~2", BooleanQuery.class); assertQ(req("q", "title:\"mirrors of the hubble space telescope goes home\""), "//*[@numFound='1']", "//doc/str[@name='id'][.='4']" ); /* * Synonym expansion 1token->many */ assertQueryEquals(req("q", "title:HST", "defType", "aqp"), "title:acr::hst title:syn::hubble space telescope title:syn::acr::hst", BooleanQuery.class); assertQueryEquals(req("q", "HST goes home", "defType", "aqp"), "+((all:acr::hst all:syn::hubble space telescope all:syn::acr::hst)) +all:goes +all:home", BooleanQuery.class); /* * many token -> 1 */ assertQueryEquals(req("q", "\"Massachusets Institute of Technology\"", "defType", "aqp"), "(all:syn::massachusets institute of technology all:syn::acr::mit)", BooleanQuery.class); assertQueryEquals(req("q", "\"massachusets institute of technology\"", "defType", "aqp"), "(all:syn::massachusets institute of technology all:syn::acr::mit)", BooleanQuery.class); //TODO: this doesn't work because stop filter is at the end of the chain, move it up? // assertQueryEquals(req("q", "\"Massachusets Institute of the Technology\"", "defType", "aqp"), // "(all:syn::massachusets institute of technology all:syn::acr::mit)", // BooleanQuery.class); // assertQueryEquals(req("q", "\"Massachusets Institute Technology\"", "defType", "aqp"), // "(all:syn::massachusets institute of technology all:syn::acr::mit)", // BooleanQuery.class); /* * Case (In)Sensitivity * * It shoulb be ase sensitive for single tokens, and case-insensitive * for multi-tokens */ assertQueryEquals(req("q", "hst", "defType", "aqp"), "all:hst", TermQuery.class); assertQueryEquals(req("q", "HSt", "defType", "aqp"), "all:hst", TermQuery.class); /* * alternation of synonym groups: * ============================= */ //synonym at extremities (end-end): //one-token stopword one-token assertQueryEquals(req("q", "HST at MIT", "defType", "aqp"), "+((all:acr::hst all:syn::hubble space telescope all:syn::acr::hst)) +((all:acr::mit all:syn::massachusets institute of technology all:syn::acr::mit))", BooleanQuery.class); //one-token word one-token assertQueryEquals(req("q", "HST bum MIT", "defType", "aqp"), "+((all:acr::hst all:syn::hubble space telescope all:syn::acr::hst)) +all:bum +((all:acr::mit all:syn::massachusets institute of technology all:syn::acr::mit))", BooleanQuery.class); //one-token stopword multi-token assertQueryEquals(req("q", "\"HST at Massachusets Institute of Technology\"", "defType", "aqp"), "all:\"(acr::hst syn::hubble space telescope syn::acr::hst) (syn::massachusets institute of technology syn::acr::mit)\"", MultiPhraseQuery.class); //one-token word multi-token assertQueryEquals(req("q", "\"HST bum Massachusets Institute of Technology\"", "defType", "aqp"), "all:\"(acr::hst syn::hubble space telescope syn::acr::hst) bum (syn::massachusets institute of technology syn::acr::mit)\"", MultiPhraseQuery.class); //multi-token stopword single-token assertQueryEquals(req("q", "\"hubble space telescope at MIT\"", "defType", "aqp"), "(all:\"hubble space telescope (acr::mit syn::massachusets institute of technology syn::acr::mit)\" all:\"(syn::hubble space telescope syn::acr::hst) ? ? (acr::mit syn::massachusets institute of technology syn::acr::mit)\"~2)", BooleanQuery.class); //multi-token word single-token assertQueryEquals(req("q", "\"hubble space telescope bum MIT\"", "defType", "aqp"), "(all:\"hubble space telescope bum (acr::mit syn::massachusets institute of technology syn::acr::mit)\" all:\"(syn::hubble space telescope syn::acr::hst) ? ? bum (acr::mit syn::massachusets institute of technology syn::acr::mit)\"~2)", BooleanQuery.class); // synonyms hidden inside other words: //word one-token stopword one-token word assertQueryEquals(req("q", "\"foo HST at MIT bar\"", "defType", "aqp"), //"+all:foo +(all:hubble space telescope all:acr::hst) +(all:massachusets institute of technology all:acr::mit) +all:bar", "all:\"foo (acr::hst syn::hubble space telescope syn::acr::hst) (acr::mit syn::massachusets institute of technology syn::acr::mit) bar\"", MultiPhraseQuery.class); //word one-token word one-token word assertQueryEquals(req("q", "\"foo HST bum MIT bar\"", "defType", "aqp"), "all:\"foo (acr::hst syn::hubble space telescope syn::acr::hst) bum (acr::mit syn::massachusets institute of technology syn::acr::mit) bar\"", MultiPhraseQuery.class); //word one-token stopword multi-token word assertQueryEquals(req("q", "\"foo HST at Massachusets Institute of Technology bar\"", "defType", "aqp"), "all:\"foo (acr::hst syn::hubble space telescope syn::acr::hst) (syn::massachusets institute of technology syn::acr::mit) ? ? bar\"~2", MultiPhraseQuery.class); //word one-token word multi-token word assertQueryEquals(req("q", "\"foo HST bum Massachusets Institute of Technology bar\"", "defType", "aqp"), "all:\"foo (acr::hst syn::hubble space telescope syn::acr::hst) bum (syn::massachusets institute of technology syn::acr::mit) ? ? bar\"~2", MultiPhraseQuery.class); //word multi-token stopword single-token word assertQueryEquals(req("q", "\"foo hubble space telescope at MIT bar\"", "defType", "aqp"), "(all:\"foo hubble space telescope (acr::mit syn::massachusets institute of technology syn::acr::mit) bar\" all:\"foo (syn::hubble space telescope syn::acr::hst) ? ? (acr::mit syn::massachusets institute of technology syn::acr::mit) bar\"~2)", BooleanQuery.class); //word multi-token word single-token word assertQueryEquals(req("q", "\"foo hubble space telescope bum MIT bar\"", "defType", "aqp"), "(all:\"foo hubble space telescope bum (acr::mit syn::massachusets institute of technology syn::acr::mit) bar\" all:\"foo (syn::hubble space telescope syn::acr::hst) ? ? bum (acr::mit syn::massachusets institute of technology syn::acr::mit) bar\"~2)", BooleanQuery.class); /** * WordDelimiterFactory + synonym expansion craziness */ /* * Example of the CamelCase ignored, but other WordDelimiterFactory matched. * Because WDFF is before the synonym filter these token are first split * and then matched. HOWEVER, the case is important!! * * So, Hubble.Space.Microscope is split into: Hubble, Space, Microscope * * Which will be found only if the synonym file contains the same case (OR: if we enable the * case insensitive search, which is on my TODO list) * */ assertQueryEquals(req("q", "HubbleSpaceMicroscope bum MIT BX", "defType", "aqp"), "+all:hubblespacemicroscope +all:bum +((all:acr::mit all:syn::massachusets institute of technology all:syn::acr::mit)) +all:acr::bx", BooleanQuery.class); assertQueryEquals(req("q", "Hubble.Space.Microscope -bum MIT BX", "defType", "aqp"), "+(((+all:hubble +all:space +all:microscope) (all:syn::hubble space microscope all:syn::acr::hsm all:hubblespacemicroscope))) -all:bum +((all:acr::mit all:syn::massachusets institute of technology all:syn::acr::mit)) +all:acr::bx", BooleanQuery.class); assertQueryEquals(req("q", "Hubble.Space.Microscope -bum MIT BX", "defType", "aqp"), "+(((+all:hubble +all:space +all:microscope) (all:syn::hubble space microscope all:syn::acr::hsm all:hubblespacemicroscope))) -all:bum +((all:acr::mit all:syn::massachusets institute of technology all:syn::acr::mit)) +all:acr::bx", BooleanQuery.class); assertQueryEquals(req("q", "Hubble-Space-Microscope bum MIT BX", "defType", "aqp"), "+(((+all:hubble +all:space +all:microscope) (all:syn::hubble space microscope all:syn::acr::hsm all:hubblespacemicroscope))) +all:bum +((all:acr::mit all:syn::massachusets institute of technology all:syn::acr::mit)) +all:acr::bx", BooleanQuery.class); /* * *QUERY* synonym expansion is case sensitive for single tokens, * but case-insensitive for multi-tokens (yes, your developer went through some extreme pain ;))) */ assertQueryEquals(req("q", "Hst", "defType", "aqp"), "all:hst", TermQuery.class); assertQueryEquals(req("q", "hst", "defType", "aqp"), "all:hst", TermQuery.class); assertQueryEquals(req("q", "HST OR Hst", "defType", "aqp"), "((all:acr::hst all:syn::hubble space telescope all:syn::acr::hst)) all:hst", BooleanQuery.class); //TODO: add the corresponding searches, but this shows we are indexing properly //dumpDoc(null, "id", F.ADS_TEXT_TYPE); } public void testOtherCases() throws Exception { // #147 - parsing of WDDF tokens // analyzer operation. eg. XXX-YYYY => (XXX AND YYY) OR XXXYYY assertQueryEquals(req("q", "NAG5-ABCD", "defType", "aqp"), "((+all:acr::nag5 +all:acr::abcd) all:acr::nag5abcd)", BooleanQuery.class); // the ascii folding filter emits both unicode and the ascii version assertQ(req("q", "title" + ":Bílá"), "//*[@numFound='1']", "//doc[1]/str[@name='id'][.='1']"); assertQ(req("q", "title" + ":Bila"), "//*[@numFound='1']", "//doc[1]/str[@name='id'][.='1']"); assertQ(req("q", "title" + ":bila"), "//*[@numFound='1']", "//doc[1]/str[@name='id'][.='1']"); // test that the two lines in the synonym file get merged and produce correct synonym expansion assertQueryEquals(req("q", "ABC", "defType", "aqp"), "(all:acr::abc all:syn::acr::abc all:syn::astrophysics business center all:syn::astrophysics business commons)", BooleanQuery.class); // "all-sky" is indexed as "all", "sky", "all-sky" // we could achieve higher precision if WDDF generateWordParts=0 // but that would cause "some-other-hyphenated" tokens to be missed assertQ(req("q", "title" + ":no-sky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='10']", "//doc/str[@name='id'][.='12']"); assertQ(req("q", "title" + ":nosky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='10']", "//doc/str[@name='id'][.='12']"); assertQ(req("q", "title" + ":all-sky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='11']", "//doc/str[@name='id'][.='13']"); assertQ(req("q", "title" + ":allsky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='11']", "//doc/str[@name='id'][.='13']"); assertQ(req("q", "title" + ":sky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='10']", "//doc/str[@name='id'][.='11']" ); assertQ(req("q", "title" + ":*sky"), "//*[@numFound='4']", "//doc/str[@name='id'][.='10']", "//doc/str[@name='id'][.='11']", "//doc/str[@name='id'][.='12']", "//doc/str[@name='id'][.='13']"); /* * Html tags should be removed */ assertQ(req("q", "title" + ":xremoved"), "//*[@numFound='0']"); assertQ(req("q", "title" + ":xhtml"), "//*[@numFound='1']", "//doc/str[@name='id'][.='382']"); /** * Latex symbols should simply be converted to ascii */ assertQ(req("q", "title:\"$\\gamma$-ray\""), "//*[@numFound='4']", "//doc/str[@name='id'][.='400']", "//doc/str[@name='id'][.='401']", "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']" ); assertQ(req("q", "title:\"$\\gamma$ ray\""), "//*[@numFound='4']", "//doc/str[@name='id'][.='400']", "//doc/str[@name='id'][.='401']", "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']" ); assertQ(req("q", "title:\"γ-ray\""), "//*[@numFound='4']", "//doc/str[@name='id'][.='400']", "//doc/str[@name='id'][.='401']", "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']" ); assertQ(req("q", "title:\"γ ray\""), "//*[@numFound='4']", "//doc/str[@name='id'][.='400']", "//doc/str[@name='id'][.='401']", "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']" ); assertQueryEquals(req( "q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$ ray Sources for Radio Millisecond Pulsars\"", "defType", "aqp"), "title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma) ray (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + "title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) ? (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma) ray (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) ? (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\"~2", BooleanQuery.class); assertQueryEquals(req( "q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi γ-ray Sources for Radio Millisecond Pulsars\"", "defType", "aqp"), "title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (ray gammaray syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (ray gammaray syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\"", BooleanQuery.class); //dumpDoc(null, "title"); assertQ(req("q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$ ray Sources for Radio Millisecond Pulsars\""), "//*[@numFound='4']", "//doc/str[@name='id'][.='400']", "//doc/str[@name='id'][.='401']", "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']"); assertQ(req("q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$-ray Sources for Radio Millisecond Pulsars\""), "//*[@numFound='4']", "//doc/str[@name='id'][.='400']", "//doc/str[@name='id'][.='401']", "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']"); assertQ(req("q", "title:\"Survey\""), "//*[@numFound>='4']"); assertQ(req("q", "title:\"Faint Fermi\""), "//*[@numFound>='4']"); assertQ(req("q", "title:\"GBT Survey\""), "//*[@numFound>='4']"); assertQ(req("q", "title:\"GBT Survey of 50 Faint Fermi\"~2"), "//*[@numFound>='4']"); assertQ(req("q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi γ-ray Sources for Radio Millisecond Pulsars\""), "//*[@numFound='4']", "//doc/str[@name='id'][.='400']", "//doc/str[@name='id'][.='401']", "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']"); assertQ(req("q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi γ ray Sources for Radio Millisecond Pulsars\""), "//*[@numFound='4']", "//doc/str[@name='id'][.='400']", "//doc/str[@name='id'][.='401']", "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']"); } // Uniquely for Junit 3 public static junit.framework.Test suite() { return new junit.framework.JUnit4TestAdapter(TestAdsabsTypeFulltextParsing.class); } }