/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.analysis.author; import monty.solr.util.MontySolrQueryTestCase; import monty.solr.util.MontySolrSetup; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; import org.apache.solr.common.params.CommonParams; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.search.QParser; import org.junit.BeforeClass; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Formatter; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.adsabs.solr.AdsConfig.F; /** * * Tests for all the author_ types defined in schema.xml * See: * http://labs.adsabs.harvard.edu/trac/ads-invenio/ticket/131 * http://labs.adsabs.harvard.edu/trac/ads-invenio/ticket/156 * * I would like to see a token processing which is crazier... * * IMPORTANT: this unittest was reviewed on 11-12-2012 by Alberto * and he found 1 (in words "ONE") problem, everything else was * fine. 
The problem is easily fixable, right now the * "synonym-upgrade" considers only names with initials for * expansion. Ie. * * "jones, c" => jones, christine; forman, c; forman, christine * * But Alberto wants that any short form produces the same effect, * ie. * * "jones," => jones, christine; forman, c; forman, christine * "jones, c" => jones, christine; forman, c; forman, christine * * 12-12-2012: Finished (I told Alberto, but we didn't review it again) * */ public class TestAdsabsTypeAuthorParsing extends MontySolrQueryTestCase { private String author_field = "author"; @BeforeClass public static void beforeClass() throws Exception { makeResourcesVisible(Thread.currentThread().getContextClassLoader(), new String[] { MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1", MontySolrSetup.getSolrHome() + "/example/solr/collection1" }); System.setProperty("solr.allow.unsafe.resourceloading", "true"); schemaString = getSchemaFile(); configString = MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1/solrconfig.xml"; initCore(configString, schemaString, MontySolrSetup.getSolrHome() + "/example/solr"); } public static String getSchemaFile() { /* * Make a copy of the schema.xml, and create our own synonym translation rules */ String schemaConfig = MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1/schema.xml"; File newConfig; try { // hand-curated synonyms File curatedSynonyms = createTempFile(new String[]{ "ABBOT, CHARLES GREELEY;ABBOTT, CHARLES GREELEY", "ABDEL AZIZ BAKRY, A;BAKRY, A", "ACHUTBHAN, P;ACHUTHAN, P", "ADAMUT, I A;ADAMUTI, I A", "ADJABSCHIRZADEH, A;ADJABSHIRZADEH, A", "AGARWAL, S;AGGARWAL, S", "AGUILAR CHIU, L A;AGUILAR, L A", "AITMUHAMBETOV, A A;AITMUKHAMBETOV, A A", "AL MLEAKY, Y M; ALMLEAKY, Y M", "ALEXEENKO, V V;ALEXEYENKO, V V", "ALFONSO, JULIA;ALFONSO-GARZON, JULIA", "ALLEN, LYNNE;ALLEN, R LYNNE;JONES, LYNNE;JONES, R LYNNE", // until here copied from: 
/proj/ads/abstracts/config/author.syn.new "ARAGON SALAMANCA, A;ARAGON-SALAMANCA, A;ARAGON, A;SALAMANCA, A", // copied from: /proj/ads/abstracts/config/author.syn "ADAMŠuk, m; ADAMGuk, m;ADAMČuk, m", // hand-made additions "MÜLLER, A WILLIAM;MÜLLER, A BILL", "MÜLLER, WILLIAM;MÜLLER, BILL", "JONES, CHRISTINE;FORMAN, CHRISTINE", // the famous post-synonym expansion "DE ZEEUW, TIM=>DE ZEEUW, P TIM", "DE ZEEUW, P TIM=>DE ZEEUW, TIM;DE ZEEUW,", "grant, carolyn s; stern grant, carolyn; stern, carolyn p" }); // automatically harvested variations of author names (collected during indexing) // it will be enriched by the indexing File generatedTransliterations = createTempFile(formatSynonyms(new String[]{ "ADAMCHuk, m => ADAMČuk, m", "ADAMCuk, m => ADAMČuk, m", "ADAMCZuk, m => ADAMČuk, m", //"ADAMCHuk, m K=> ADAMČuk, m K", => deactivated for test purposes, see <surname>, <1> <2> use case //"ADAMCuk, m K=> ADAMČuk, m K", => deactivated for test purposes, see <surname>, <1> <2> use case "ADAMCUK, A B=> ADAMČUK, A B", "ADAMCHUK, A B=> ADAMČUK, A B", "ADAMCZUK, A B=> ADAMČUK, A B", "ADAMCHuk, mOLJA => ADAMČuk, mOLJA", "ADAMCuk, mOLJA => ADAMČuk, mOLJA", "ADAMCZuk, mOLJA => ADAMČuk, mOLJA", "ADAMCHuk, mOLJA K=> ADAMČuk, mOLJA K", "ADAMCuk, mOLJA K=> ADAMČuk, mOLJA K", "ADAMCZuk, mOLJA K=> ADAMČuk, mOLJA K", "ADAMCHUK, => ADAMČUK,", "ADAMCUK,=> ADAMČUK,", "ADAMCZUK, => ADAMČUK,", // this one is added by hand (no automated transliteration) "MULLER, WILLIAM => MÜLLER, WILLIAM", "MUELLER, WILLIAM => MÜLLER, WILLIAM", "Boser,=>Böser,", "Boser, S=>Böser, S" } )); File newSchema = duplicateModify(new File(schemaConfig), "synonyms=\"author_curated.synonyms\"", "synonyms=\"" + curatedSynonyms.getAbsolutePath().replace('\\', '/') + "\"", "synonyms=\"author_generated.translit\"", "synonyms=\"" + generatedTransliterations.getAbsolutePath().replace('\\', '/') + "\"" ); return newSchema.getAbsolutePath(); } catch (IOException e) { e.printStackTrace(); throw new 
IllegalStateException(e.getMessage()); } } @Override public void setUp() throws Exception { super.setUp(); assertU(adoc(F.ID, "1", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk,")); assertU(adoc(F.ID, "2", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, M.")); assertU(adoc(F.ID, "3", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Marel")); assertU(adoc(F.ID, "4", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Molja")); assertU(adoc(F.ID, "5", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Molja Karel")); assertU(adoc(F.ID, "6", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, M Karel")); assertU(adoc(F.ID, "7", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Molja K")); assertU(adoc(F.ID, "8", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, M K")); assertU(adoc(F.ID, "9", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Karel Molja")); assertU(adoc(F.ID, "10", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, Karel M")); assertU(adoc(F.ID, "11", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamčuk, K Molja")); assertU(adoc(F.ID, "20", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk,")); assertU(adoc(F.ID, "21", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, M.")); assertU(adoc(F.ID, "22", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Marel")); assertU(adoc(F.ID, "23", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Molja")); assertU(adoc(F.ID, "24", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Molja Karel")); assertU(adoc(F.ID, "25", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, M Karel")); assertU(adoc(F.ID, "26", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Molja K")); assertU(adoc(F.ID, "27", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, M K")); assertU(adoc(F.ID, "28", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Karel Molja")); assertU(adoc(F.ID, "29", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, Karel M")); assertU(adoc(F.ID, "30", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamcuk, K Molja")); assertU(adoc(F.ID, "40", F.BIBCODE, "xxxxxxxxxxxxx", 
F.AUTHOR, "Adamchuk,")); assertU(adoc(F.ID, "41", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, M.")); assertU(adoc(F.ID, "42", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Marel")); assertU(adoc(F.ID, "43", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Molja")); assertU(adoc(F.ID, "44", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Molja Karel")); assertU(adoc(F.ID, "45", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, M Karel")); assertU(adoc(F.ID, "46", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Molja K")); assertU(adoc(F.ID, "47", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, M K")); assertU(adoc(F.ID, "48", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Karel Molja")); assertU(adoc(F.ID, "49", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, Karel M")); assertU(adoc(F.ID, "50", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamchuk, K Molja")); assertU(adoc(F.ID, "60", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk,")); assertU(adoc(F.ID, "61", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, M.")); assertU(adoc(F.ID, "62", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Marel")); assertU(adoc(F.ID, "63", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Molja")); assertU(adoc(F.ID, "64", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Molja Karel")); assertU(adoc(F.ID, "65", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, M Karel")); assertU(adoc(F.ID, "66", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Molja K")); assertU(adoc(F.ID, "67", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, M K")); assertU(adoc(F.ID, "68", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Karel Molja")); assertU(adoc(F.ID, "69", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, Karel M")); assertU(adoc(F.ID, "70", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamguk, K Molja")); assertU(adoc(F.ID, "80", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk,")); assertU(adoc(F.ID, "81", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, M.")); assertU(adoc(F.ID, "82", F.BIBCODE, 
"xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Marel")); assertU(adoc(F.ID, "83", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Molja")); assertU(adoc(F.ID, "84", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Molja Karel")); assertU(adoc(F.ID, "85", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, M Karel")); assertU(adoc(F.ID, "86", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Molja K")); assertU(adoc(F.ID, "87", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, M K")); assertU(adoc(F.ID, "88", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Karel Molja")); assertU(adoc(F.ID, "89", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, Karel M")); assertU(adoc(F.ID, "90", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Adamshuk, K Molja")); assertU(adoc(F.ID, "100", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Müller, William")); assertU(adoc(F.ID, "101", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Mueller, William")); assertU(adoc(F.ID, "110", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, Christine")); assertU(adoc(F.ID, "111", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, C")); assertU(adoc(F.ID, "112", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Forman, Christine")); assertU(adoc(F.ID, "113", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Forman, C")); assertU(adoc(F.ID, "114", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, Christopher")); assertU(adoc(F.ID, "115", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, C")); assertU(adoc(F.ID, "116", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Forman, Christopher")); assertU(adoc(F.ID, "117", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Forman, C")); //"ALLEN, LYNNE;ALLEN, R LYNNE;JONES, LYNNE;JONES, R LYNNE" assertU(adoc(F.ID, "120", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Allen, Lynne")); assertU(adoc(F.ID, "121", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Allen, L")); assertU(adoc(F.ID, "122", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Allen, R Lynne")); assertU(adoc(F.ID, "123", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Allen, R L")); assertU(adoc(F.ID, "124", F.BIBCODE, 
"xxxxxxxxxxxxx", F.AUTHOR, "Jones, Lynne")); assertU(adoc(F.ID, "125", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, L")); assertU(adoc(F.ID, "126", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, R Lynne")); assertU(adoc(F.ID, "127", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Jones, R L")); assertU(adoc(F.ID, "130", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Author, A", F.AUTHOR, "Author, B", F.AUTHOR, "Author, C" )); assertU(adoc(F.ID, "200", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Lee, H C")); assertU(adoc(F.ID, "201", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Lee, H-C")); assertU(adoc(F.ID, "202", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Lee, Harwin-C")); assertU(adoc(F.ID, "203", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Lee, Harwin-Costa")); assertU(adoc(F.ID, "210", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso")); // just surname assertU(adoc(F.ID, "211", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso,")); assertU(adoc(F.ID, "212", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso, B")); assertU(adoc(F.ID, "213", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso, Brava")); assertU(adoc(F.ID, "214", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso, Borat")); assertU(adoc(F.ID, "215", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Pinilla-Alonso, Amer")); assertU(adoc(F.ID, "220", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum")); assertU(adoc(F.ID, "221", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum,")); assertU(adoc(F.ID, "222", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum, H")); assertU(adoc(F.ID, "223", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum, Hector")); assertU(adoc(F.ID, "224", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum, Hiatus")); assertU(adoc(F.ID, "225", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "van Dokkum, Romulus")); assertU(adoc(F.ID, "230", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Böser", "first_author", "Böser, S")); assertU(adoc(F.ID, "231", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Böser, S")); assertU(adoc(F.ID, "232", 
F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Boser, S")); assertU(adoc(F.ID, "233", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Boser,")); assertU(adoc(F.ID, "300", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Gopal-Krishna,")); assertU(adoc(F.ID, "301", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Gopal-Krishna, Jewell")); assertU(adoc(F.ID, "302", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Gopal-Krishna, J")); assertU(adoc(F.ID, "400", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, Dae-Sik")); assertU(adoc(F.ID, "401", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, Dae- Sik")); assertU(adoc(F.ID, "402", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, D. -S.")); assertU(adoc(F.ID, "403", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, D.")); assertU(adoc(F.ID, "404", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, D")); assertU(adoc(F.ID, "405", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, D. S.")); assertU(adoc(F.ID, "406", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, Dae S.")); assertU(adoc(F.ID, "407", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, Dae S")); assertU(adoc(F.ID, "408", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, D Sik")); assertU(adoc(F.ID, "409", F.BIBCODE, "xxxxxxxxxxxxx", F.AUTHOR, "Moon, D-Sik")); assertU(commit()); // persist the transliteration map after new docs were indexed // and reload synonym chain harvested during indexing Analyzer iAnalyzer = h.getCore().getLatestSchema().getIndexAnalyzer(); Analyzer qAnalyzer = h.getCore().getLatestSchema().getQueryAnalyzer(); TokenStream iAuthor = iAnalyzer.tokenStream("author", new StringReader("")); TokenStream qAuthor = qAnalyzer.tokenStream("author", new StringReader("")); iAuthor.close(); qAuthor.close(); // TODO: force reload of the synonym map //h.getCoreContainer().reload("collection1"); } public void xtestX() throws Exception { String expected = "author:adamčuk, molja k author:adamčuk, molja k* " + "author:adamčuk, m k author:adamčuk, m k* " + "author:adamčuk, molja " + // ! author:adamčuk, molja * "author:adamčuk, m " + // ! 
author:adamčuk, m* "author:adamčuk, " + "author:adamcuk, molja k author:adamcuk, molja k* " + "author:adamcuk, m k author:adamcuk, m k* " + "author:adamcuk, molja " + // ! author:adamcuk, molja * "author:adamcuk, m " + // ! author:adamcuk, m* "author:adamcuk, " + "author:adamchuk, molja k author:adamchuk, molja k* " + "author:adamchuk, m k author:adamchuk, m k* " + "author:adamchuk, molja " + // ! author:adamchuk, molja * "author:adamchuk, m " + // ! author:adamchuk, m* "author:adamchuk,"; testAuthorQuery("\"adamczuk, molja k\"", expected + " author:adamczuk, molja k author:adamczuk, molja k* author:adamczuk, m k author:adamczuk, m k* author:adamczuk, molja author:adamczuk, m author:adamczuk,", "//*[@numFound='21']"); } public void testAuthorParsingUseCases() throws Exception { // issue #57: https://github.com/romanchyla/montysolr/issues/57 testAuthorQuery("\"Moon, Dae-Sik\"", "author:moon, dae sik author:moon, dae sik * author:moon, d sik author:moon, d sik * author:moon, dae s author:moon, dae s * author:moon, d s author:moon, d s * author:moon, dae author:moon, d author:moon,", "//*[@numFound='10']"); /** * will miss: Moon, Dae-Sik; Moon, Dae -Sik * * ie where both parts are fully spelled; but it will find 'dae, s' and 'd sik' * this logic seems defficient * */ testAuthorQuery("\"Moon, D. 
-S.\"", //"author:moon, d s author:moon, d s* author:/moon, d[^\\s]+ s/ author:/moon, d[^\\s]+ s .*/ author:moon, d author:moon,", "author:moon, d s author:moon, d s* author:/moon, d[^\\s]+ s.*/ author:moon, d author:moon,", "//*[@numFound='10']"); // test the definition that is in the live synonym file // we use this for blackbox - to verify deployment is using // synonym translation testAuthorQuery( "\"grant, carolyn s\"", "author:grant, carolyn s " + "author:grant, carolyn s* " + "author:grant, c s " + "author:grant, c s* " + "author:grant, carolyn " + "author:grant, c " + "author:grant, " + "author:stern grant, carolyn " + "author:stern grant, c " + "author:stern grant, " + "author:stern, carolyn p " + "author:stern, carolyn p* " + "author:stern, c p " + "author:stern, c p* " + "author:stern, carolyn " + "author:stern, c " + "author:stern,", "//*[@numFound='0']" ); testAuthorQuery( "Gopal-Krishna", "author:gopal krishna, author:gopal krishna,*", "//*[@numFound='3']", "\"Gopal Krishna,\"", "author:gopal krishna, author:gopal krishna,*", "//*[@numFound='3']", "\"Gopal Krishna\"", "gopal krishna, author:gopal krishna,* author:krishna, gopal author:krishna, gopal * author:krishna, gopal * author:krishna, g author:krishna, g * author:krishna, g * author:krishna, author:krishna,*", "//*[@numFound='3']" ); //#487 - these author names should parse the same; Maestro, V was // picked by the python name parser (V removed); Boyjian had problems // with expansion (python name parser was not applied there) testAuthorQuery( "Maestro\\,\\ V", "author:maestro, v author:maestro, v* author:maestro,", "//*[@numFound='0']", "V\\ Maestro", "author:v maestro, author:v maestro,* author:maestro, v author:maestro, v* author:maestro, v * author:maestro, author:maestro,*", //"author:maestro, v author:maestro, v* author:maestro,", "//*[@numFound='0']" ); testAuthorQuery( "Boyajian\\,\\ T", "author:boyajian, t author:boyajian, t* author:boyajian,", "//*[@numFound='0']", "T\\ Boyajian", 
"author:t boyajian, author:t boyajian,* author:boyajian, t author:boyajian, t* author:boyajian, t * author:boyajian, author:boyajian,*", "//*[@numFound='0']" ); // first is considered a title (but when the only thing we have, it will be searched as surname) testAuthorQuery( "first", "author:first, author:first,*", "//*[@numFound='0']" ); testAuthorQuery( "goodman", "author:goodman, author:goodman,*", "//*[@numFound='0']" ); // 'xxx' will be removed from the author (at least in the modified version) assertQueryEquals(req("defType", "aqp", "q", "author:\"accomazzi, alberto, xxx.\""), "author:accomazzi, alberto, xxx author:accomazzi, alberto, xxx * author:accomazzi, alberto author:accomazzi, alberto * author:accomazzi, alberto, xxx * author:accomazzi, a xxx author:accomazzi, a xxx * author:accomazzi, a xxx * author:accomazzi, alberto, x author:accomazzi, alberto, x * author:accomazzi, alberto, x * author:accomazzi, a x author:accomazzi, a x * author:accomazzi, a x * author:accomazzi, alberto, author:accomazzi, alberto, * author:accomazzi, a author:accomazzi, a * author:accomazzi, author:accomazzi, alberto author:accomazzi, alberto *", BooleanQuery.class); // #362 - smartly handle o' sulliva (done in the Pythonic name parser) // I'm not sure whether we should index the apostrophe, maybe it should // be replaced by space ? 
testAuthorQuery( "\"o' sullivan\"", "author:o sullivan, author:o sullivan,*", "//*[@numFound='0']", "\"o'sullivan\"", "author:o sullivan, author:o sullivan,*", "//*[@numFound='0']", "\"o' sullivan, ji\"", "author:o sullivan, ji author:o sullivan, ji * author:o sullivan, j author:o sullivan, j * author:o sullivan,", "//*[@numFound='0']" ); // funny author names testAuthorQuery( "\"o'sullivan\"", "author:o sullivan, author:o sullivan,*", "//*[@numFound='0']", "\"o' sullivan\"", "author:o sullivan, author:o sullivan,*", "//*[@numFound='0']" ); testAuthorQuery( "Dall\\'oglio", "author:dall oglio, author:dall oglio,*", "//*[@numFound='0']", "Antonella\\ Dall\\'Oglio", "author:antonella dall oglio, author:antonella dall oglio,* author:dall oglio, antonella author:dall oglio, antonella * author:dall oglio, antonella * author:dall oglio, a author:dall oglio, a * author:dall oglio, a * author:dall oglio, author:dall oglio,*", "//*[@numFound='0']" ); testAuthorQuery( "\"t' Hooft, Sullivan\"", "author:t hooft, sullivan author:t hooft, sullivan * author:t hooft, s author:t hooft, s * author:t hooft,", "//*[@numFound='0']" ); // hmmm.. 
these regexes must be slow; we should not generate them // also, before #487, the first query would generate: //"author:kao, p ing tzu author:kao, p ing tzu * author:kao, p i tzu author:kao, p i tzu * author:kao, p ing t author:kao, p ing t * author:kao, p i t author:kao, p i t * author:kao, p author:kao,", testAuthorQuery( "\"P'ING-TZU KAO\"", "author:p ing tzu kao, author:p ing tzu kao,* author:kao, p ing tzu " + "author:kao, p ing tzu * " + "author:/kao, p[^\\s]+ ing tzu/ author:/kao, p[^\\s]+ ing tzu .*/ " + "author:kao, p ing tzu * author:kao, p i tzu author:kao, p i tzu * " + "author:/kao, p[^\\s]+ i tzu/ author:/kao, p[^\\s]+ i tzu .*/ " + "author:kao, p i tzu * author:kao, p ing t author:kao, p ing t * " + "author:/kao, p[^\\s]+ ing t/ author:/kao, p[^\\s]+ ing t .*/ " + "author:kao, p ing t * author:kao, p i t author:kao, p i t * " + "author:/kao, p[^\\s]+ i t/ author:/kao, p[^\\s]+ i t .*/ " + "author:kao, p i t * author:kao, p author:kao, p * author:kao, author:kao,*", "//*[@numFound='0']" ); testAuthorQuery( "\"Kao, P'ing-Tzu\"", "author:kao, p ing tzu author:kao, p ing tzu * " + "author:/kao, p[^\\s]+ ing tzu/ author:/kao, p[^\\s]+ ing tzu .*/ " + "author:kao, p i tzu author:kao, p i tzu * " + "author:/kao, p[^\\s]+ i tzu/ author:/kao, p[^\\s]+ i tzu .*/ " + "author:kao, p ing t author:kao, p ing t * " + "author:/kao, p[^\\s]+ ing t/ author:/kao, p[^\\s]+ ing t .*/ " + "author:kao, p i t author:kao, p i t * " + "author:/kao, p[^\\s]+ i t/ author:/kao, p[^\\s]+ i t .*/ " + "author:kao, p author:kao,", "//*[@numFound='0']" ); // what happens we receive very long string (non-author thing) testAuthorQuery( "\"purpose of this review is to bridge the gap between\"", "MatchNoDocsQuery(\"\")", "//*[@numFound='0']" ); // making sure also other fields are being parsed properly author_field = "first_author"; testAuthorQuery( "\"Boser, S\"", "first_author:boser, s first_author:boser, s* first_author:boser, first_author:böser, s first_author:böser, s* 
first_author:böser, first_author:boeser, s first_author:boeser, s* first_author:boeser,", "//*[@numFound='1']", "\"Böser, S\"", "first_author:böser, s first_author:böser, s* first_author:böser, first_author:boser, s first_author:boser, s* first_author:boser, first_author:boeser, s first_author:boeser, s* first_author:boeser,", "//*[@numFound='1']" ); // back to the standard: author author_field = "author"; testAuthorQuery( "\"Boser, S\"", "author:böser, s author:böser, s* author:böser, author:boeser, s author:boeser, s* author:boeser, author:boser, s author:boser, s* author:boser,", "//*[@numFound='4']", "\"Böser, S\"", "author:böser, s author:böser, s* author:böser, author:boeser, s author:boeser, s* author:boeser, author:boser, s author:boser, s* author:boser,", "//*[@numFound='4']" ); // reported by Alex // [author:"van Dokkum" bibstem:"Natur" author:"Conroy" ] // doesn't return any results, even though it should yield 2010Natur.468..940V. testAuthorQuery( "\"van Dokkum\"", "author:van dokkum, author:van dokkum,*", "//*[@numFound='6']", // "van Dokkum" numFound=6 // 220 van Dokkum 221 van Dokkum, 222 van Dokkum, H // 223 van Dokkum, Hector 224 van Dokkum, Hiatus 225 van Dokkum, Romulus "\"van Dokkum,\"", "author:van dokkum, author:van dokkum,*", "//*[@numFound='6']", // "van Dokkum," numFound=6 // 220 van Dokkum 221 van Dokkum, 222 van Dokkum, H // 223 van Dokkum, Hector 224 van Dokkum, Hiatus 225 van Dokkum, Romulus "\"van Dokkum, H\"", "author:van dokkum, h author:van dokkum, h* author:van dokkum,", "//*[@numFound='5']", // "van Dokkum, H" numFound=5 // 220 van Dokkum 221 van Dokkum, 222 van Dokkum, H // 223 van Dokkum, Hector 224 van Dokkum, Hiatus "\"van Dokkum, H.\"", "author:van dokkum, h author:van dokkum, h* author:van dokkum,", "//*[@numFound='5']", // "van Dokkum, H." 
numFound=5 // 220 van Dokkum 221 van Dokkum, 222 van Dokkum, H // 223 van Dokkum, Hector 224 van Dokkum, Hiatus "\"van Dokkum, Romulus\"", "author:van dokkum, romulus author:van dokkum, romulus * author:van dokkum, r author:van dokkum, r * author:van dokkum,", "//*[@numFound='3']" // "van Dokkum, Romulus" numFound=3 // 220 van Dokkum 221 van Dokkum, 225 van Dokkum, Romulus ); //bug #324 testAuthorQuery( "Pinilla-Alonso", "author:pinilla alonso, author:pinilla alonso,*", "//*[@numFound='6']", // Pinilla-Alonso numFound=6 // 210 Pinilla-Alonso 211 Pinilla-Alonso, 212 Pinilla-Alonso, B // 213 Pinilla-Alonso, Brava 214 Pinilla-Alonso, Borat 215 Pinilla-Alonso, Amer "\"Pinilla Alonso\"", "author:pinilla alonso, author:pinilla alonso,* author:alonso, pinilla author:alonso, pinilla * author:alonso, pinilla * author:alonso, p author:alonso, p * author:alonso, p * author:alonso, author:alonso,*", "//*[@numFound='6']", // Pinilla-Alonso numFound=6 // 210 Pinilla-Alonso 211 Pinilla-Alonso, 212 Pinilla-Alonso, B // 213 Pinilla-Alonso, Brava 214 Pinilla-Alonso, Borat 215 Pinilla-Alonso, Amer "\"Pinilla Alonso,\"", "author:pinilla alonso, author:pinilla alonso,*", "//*[@numFound='6']", // Pinilla-Alonso numFound=6 // 210 Pinilla-Alonso 211 Pinilla-Alonso, 212 Pinilla-Alonso, B // 213 Pinilla-Alonso, Brava 214 Pinilla-Alonso, Borat 215 Pinilla-Alonso, Amer "\"Pinilla-Alonso, B\"", "author:pinilla alonso, b author:pinilla alonso, b* author:pinilla alonso,", "//*[@numFound='5']", // Pinilla-Alonso numFound=6 // 210 Pinilla-Alonso 211 Pinilla-Alonso, 212 Pinilla-Alonso, B // 213 Pinilla-Alonso, Brava 214 Pinilla-Alonso, Borat "\"Pinilla Alonso, B.\"", "author:pinilla alonso, b author:pinilla alonso, b* author:pinilla alonso,", "//*[@numFound='5']", // Pinilla-Alonso numFound=6 // 210 Pinilla-Alonso 211 Pinilla-Alonso, 212 Pinilla-Alonso, B // 213 Pinilla-Alonso, Brava 214 Pinilla-Alonso, Borat "\"Pinilla-Alonso, Brava\"", "author:pinilla alonso, brava author:pinilla alonso, brava * 
author:pinilla alonso, b author:pinilla alonso, b * author:pinilla alonso,", "//*[@numFound='4']" // Pinilla-Alonso, Brava numFound=4 // 210 Pinilla-Alonso 211 Pinilla-Alonso, 212 Pinilla-Alonso, B // 213 Pinilla-Alonso, Brava ); // bug: #255 testAuthorQuery( "\"Lee, H-C\"", "author:lee, h c author:lee, h c* author:/lee, h[^\\s]+ c.*/ author:lee, h author:lee,", "//*[@numFound='4']", // Lee, H-C numFound=4 // 200 Lee, H C 201 Lee, H-C 202 Lee, Harwin-C // 203 Lee, Harwin-Costa "\"Lee, H C\"", "author:lee, h c author:lee, h c* author:/lee, h[^\\s]+ c.*/ author:lee, h author:lee,", "//*[@numFound='4']", // "Lee, H-C" numFound=4 // 200 Lee, H C 201 Lee, H-C 202 Lee, Harwin-C // 203 Lee, Harwin-Costa "\"Lee, Harwin C\"", "author:lee, harwin c author:lee, harwin c* author:lee, h c author:lee, h c* author:lee, harwin author:lee, h author:lee,", "//*[@numFound='4']", // Lee, Harwin C numFound=4 // 200 Lee, H C 201 Lee, H-C 202 Lee, Harwin-C // 203 Lee, Harwin-Costa "\"Lee, Harwin-*\"", "author:lee, harwin-*", "//*[@numFound='0']", // Lee, Harwin-* numFound=0 "\"Lee, Harwin*\"", "author:lee, harwin*", "//*[@numFound='2']", // Lee, Harwin* numFound=2 // 202 Lee, Harwin-C 203 Lee, Harwin-Costa "\"Lee, H*\"", "author:lee, h author:lee, h* author:lee,", "//*[@numFound='4']" // Lee, Harwin-C numFound=4 // 200 Lee, H C 201 Lee, H-C 202 Lee, Harwin-C // 203 Lee, Harwin-Costa ); // test proper order of authors - ticket: #98 //System.out.println(h.query(req("q", String.format("%s:130", F.ID)))); assertQ(req("q", String.format("%s:130", F.ID)), "//*[@numFound='1']"); assert h.query(req("q", String.format("%s:130", F.ID))) .contains("<arr name=\"author\"><str>Author, A</str><str>Author, B</str><str>Author, C</str></arr>"); } public void testAuthorParsingMainLogic() throws Exception { /** * For ADS there are these rules: * What gets indexed: Normalized author name (always lowercase!) * What gets searched: By default, the author name is * * example: Štaufčik, Piotr * * 1. 
normalized (sztaufczik, piotr) * 2. enriched with name variants (sztaufczik, pjotr) * 3. enriched with synonyms (konrad, pjotr) * * The different tokenizer chains serve for situations, when we want * to search for the author name but DE-activate some of the steps * above. The NORMALIZATION happens ALWAYS (because we index things * that way). Combinations are: * * author_exact = 1 + 3 * author_nosyn = 1 + 2 * author_exact_nosyn = 1 * * * As a general rule, the ADS is trying to get more rather than less. * Here are the examples: * * <pre> * query: expanded into: * =============================================================== * * kurtz, michael julian -> kurtz, michael julian * kurtz, michael julian * * kurtz, michael j * kurtz, michael j * * kurtz, m j * kurtz, m j * * kurtz, m julian * kurtz, m julian * * kurtz, michael (<- libation to gods of recall) * kurtz, m (<- dtto) * kurtz, (<- libation #2) * * kurtz, michael j -> kurtz, michael j* * kurtz, michael j * * kurtz, m j* * kurtz, * kurtz, michael * kurtz, m * * kurtz, m julian -> kurtz, m julian * kurtz, m julian * * kurtz, m j * * kurtz, m j * kurtz, m * kurtz, * kurtz, m\w* julian (<- happens only for one-letter initials) * kurtz, m\w* julian .* (dtto) * kurtz, m\w* j (dtto) * kurtz, m\w* j .* (dtto) * * kurtz, michael -> kurtz, michael * kurtz, michael * * kurtz, m * kurtz, m * * kurtz, * * kurtz, m -> kurtz, m * kurtz, m* (in fact, these two can become just: kurtz, m*) * kurtz, * * kurtz, mi* -> kurtz, mi* * kurtz, * * * * </pre> */ /* * ============================================================ * Here comes the bloodiest part of the author parsing unittest * ============================================================ * * Each test case has two branches, one representing the full utf-8 form (with ascii chars), the other the ascii downgraded form. 
No matter which, the query must be expanded in both cases equally for each testcase Test-cases: <surname> <surname>, <surname>, <1> <surname>, <1name> <surname>, <1name> <2> <surname>, <1name> <2name> <surname>, <1> <2name> <surname>, <1> <2> <surname>, <2> <surname>, <2name> <surname>, <2name> <1> <surname>, <2name> <1name> <surname>, <2> <1name> <surname>, <2> <1> <surname>, <1n*> <surname>, <1*> <surname>, <2n*> <surname>, <2*> - transliteration: adamčuk, m --> adamchuk, m;adamcuk, m - synonym expansion for: ADAMŠuk, m;ADAMGuk, m;ADAMČuk, m */ //testAuthorQuery("\"allen, lynne\"", "xxx", "//*[@numFound='']"); String expected; String expected0; expected = "author:adamčuk, author:adamčuk,* " + // query variants added by parser "author:adamchuk, author:adamchuk,* " + "author:adamcuk, author:adamcuk,*"; /** * <surname> * * upgraded && transliterated * synonym adamšuk IS NOT FOUND because there is no entry for "adam(č|c|ch)uk" the syn list */ testAuthorQuery( //"adAMčuk" "adAM\u010duk", expected + " author:adamguk, m author:adamčuk, m author:adamšuk, m", "//*[@numFound='34']", // adamčuk numFound=34 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 9 Adamčuk, Karel Molja // 10 Adamčuk, Karel M 11 Adamčuk, K Molja 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja // 61 Adamguk, M. "adAMcuk", expected, "//*[@numFound='33']", // adamcuk numFound=33 // 1 Adamčuk, 2 Adamčuk, M. 
3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 9 Adamčuk, Karel Molja // 10 Adamčuk, Karel M 11 Adamčuk, K Molja 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "adAMchuk", expected, "//*[@numFound='33']", // adamchuk numFound=33 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 9 Adamčuk, Karel Molja // 10 Adamčuk, Karel M 11 Adamčuk, K Molja 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "adAMczuk", expected + "author:adamczuk, author:adamczuk,*", "//*[@numFound='33']", // adamczuk numFound=33 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 9 Adamčuk, Karel Molja // 10 Adamčuk, Karel M 11 Adamčuk, K Molja 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja 40 Adamchuk, 41 Adamchuk, M. 
// 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja //"adAMšuk" "adAM\u0161uk", "author:adamšuk, author:adamšuk,* " + "author:adamshuk, author:adamshuk,* " + "author:adamsuk, author:adamsuk,* " + "author:adamguk, m author:adamčuk, m author:adamšuk, m", "//*[@numFound='13']", // adamšuk numFound=13 // 2 Adamčuk, M. 61 Adamguk, M. 80 Adamshuk, // 81 Adamshuk, M. 82 Adamshuk, Marel 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K 88 Adamshuk, Karel Molja 89 Adamshuk, Karel M // 90 Adamshuk, K Molja "adAMguk", "author:adamguk, author:adamguk,* " + "author:adamguk, m author:adamčuk, m author:adamšuk, m", "//*[@numFound='12']" // adamguk numFound=12 // 2 Adamčuk, M. 60 Adamguk, 61 Adamguk, M. // 62 Adamguk, Marel 63 Adamguk, Molja 64 Adamguk, Molja Karel // 65 Adamguk, M Karel 66 Adamguk, Molja K 67 Adamguk, M K // 68 Adamguk, Karel Molja 69 Adamguk, Karel M 70 Adamguk, K Molja ); /** * <surname>, * * upgraded && transliterated * synonym adamšuk IS NOT FOUND because there is no entry for "adam(č|c|ch)uk" the syn list */ testAuthorQuery( "\"adamčuk,\"", expected + " author:adamguk, m author:adamčuk, m author:adamšuk, m", "//*[@numFound='34']", // adamčuk numFound=34 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 9 Adamčuk, Karel Molja // 10 Adamčuk, Karel M 11 Adamčuk, K Molja 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja 40 Adamchuk, 41 Adamchuk, M. 
// 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja // 61 Adamguk, M. "\"adamcuk,\"", expected, "//*[@numFound='33']", // adamcuk numFound=33 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 9 Adamčuk, Karel Molja // 10 Adamčuk, Karel M 11 Adamčuk, K Molja 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "\"adamchuk,\"", expected, "//*[@numFound='33']", // adamchuk numFound=33 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 9 Adamčuk, Karel Molja // 10 Adamčuk, Karel M 11 Adamčuk, K Molja 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "\"adamczuk,\"", expected + "author:adamczuk, author:adamczuk,*", "//*[@numFound='33']", // adamczuk numFound=33 // 1 Adamčuk, 2 Adamčuk, M. 
3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 9 Adamčuk, Karel Molja // 10 Adamčuk, Karel M 11 Adamčuk, K Molja 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "\"adamšuk,\"", "author:adamšuk, author:adamšuk,* " + "author:adamshuk, author:adamshuk,* " + "author:adamsuk, author:adamsuk,* " + "author:adamguk, m author:adamčuk, m author:adamšuk, m", "//*[@numFound='13']", // adamšuk numFound=13 // 2 Adamčuk, M. 61 Adamguk, M. 80 Adamshuk, // 81 Adamshuk, M. 82 Adamshuk, Marel 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K 88 Adamshuk, Karel Molja 89 Adamshuk, Karel M // 90 Adamshuk, K Molja "\"adamguk,\"", "author:adamguk, author:adamguk,* " + "author:adamguk, m author:adamčuk, m author:adamšuk, m", "//*[@numFound='12']" // adamguk numFound=12 // 2 Adamčuk, M. 60 Adamguk, 61 Adamguk, M. // 62 Adamguk, Marel 63 Adamguk, Molja 64 Adamguk, Molja Karel // 65 Adamguk, M Karel 66 Adamguk, Molja K 67 Adamguk, M K // 68 Adamguk, Karel Molja 69 Adamguk, Karel M 70 Adamguk, K Molja ); /** * <surname>, <1> * * expanded && upgraded && transliterated && expanded * synonym "adamšuk, m" IS FOUND because there is entry for "adamčuk, m" the syn list, notice * this works even if we type "adamchuk, m" or "adamcuk, m" * * question: the chain correctly finds the synonym "adamšuk, m", and this synonym is * then transliterated: adamshuk, m;adamsuk, m (is this desirable?) I think yes. 
*/ expected = "author:adamšuk, m author:adamšuk, m* author:adamšuk, " + "author:adamsuk, m author:adamsuk, m* author:adamsuk, " + "author:adamshuk, m author:adamshuk, m* author:adamshuk, " + "author:adamguk, m author:adamguk, m* author:adamguk, " + "author:adamčuk, m author:adamčuk, m* author:adamčuk, " + "author:adamchuk, m author:adamchuk, m* author:adamchuk, " + "author:adamcuk, m author:adamcuk, m* author:adamcuk,"; testAuthorQuery( "\"adamčuk, m\"", expected + " author:adamguk, m author:adamšuk, m", "//*[@numFound='40']", // "adamčuk, m" numFound=40 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 62 Adamguk, Marel // 63 Adamguk, Molja 64 Adamguk, Molja Karel 65 Adamguk, M Karel // 66 Adamguk, Molja K 67 Adamguk, M K 80 Adamshuk, // 81 Adamshuk, M. 82 Adamshuk, Marel 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K "\"adamcuk, m\"", expected, "//*[@numFound='40']", // "adamcuk, m" numFound=40 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 
62 Adamguk, Marel // 63 Adamguk, Molja 64 Adamguk, Molja Karel 65 Adamguk, M Karel // 66 Adamguk, Molja K 67 Adamguk, M K 80 Adamshuk, // 81 Adamshuk, M. 82 Adamshuk, Marel 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K "\"adamchuk, m\"", expected, "//*[@numFound='40']", // "adamchuk, m" numFound=40 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 62 Adamguk, Marel // 63 Adamguk, Molja 64 Adamguk, Molja Karel 65 Adamguk, M Karel // 66 Adamguk, Molja K 67 Adamguk, M K 80 Adamshuk, // 81 Adamshuk, M. 82 Adamshuk, Marel 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K "\"adamczuk, m\"", expected + "author:adamczuk, m author:adamczuk, m* author:adamczuk,", "//*[@numFound='40']", // "adamczuk, m" numFound=40 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 62 Adamguk, Marel // 63 Adamguk, Molja 64 Adamguk, Molja Karel 65 Adamguk, M Karel // 66 Adamguk, Molja K 67 Adamguk, M K 80 Adamshuk, // 81 Adamshuk, M. 
82 Adamshuk, Marel 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K "\"adamšuk, m\"", expected + " author:adamguk, m author:adamčuk, m", "//*[@numFound='40']", // "adamšuk, m" numFound=40 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 62 Adamguk, Marel // 63 Adamguk, Molja 64 Adamguk, Molja Karel 65 Adamguk, M Karel // 66 Adamguk, Molja K 67 Adamguk, M K 80 Adamshuk, // 81 Adamshuk, M. 82 Adamshuk, Marel 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K "\"adamguk, m\"", expected + " author:adamčuk, m author:adamšuk, m", "//*[@numFound='40']", // "adamguk, m" numFound=40 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 62 Adamguk, Marel // 63 Adamguk, Molja 64 Adamguk, Molja Karel 65 Adamguk, M Karel // 66 Adamguk, Molja K 67 Adamguk, M K 80 Adamshuk, // 81 Adamshuk, M. 
82 Adamshuk, Marel 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K "\"AdAmČuk, m\"", expected + " author:adamguk, m author:adamšuk, m", "//*[@numFound='40']", // just for fun "\"ADAMŠuk, m\"", expected + " author:adamguk, m author:adamčuk, m", "//*[@numFound='40']", "\"AdAmGuk, M\"", expected + " author:adamšuk, m author:adamčuk, m", "//*[@numFound='40']" ); /** * <surname>, <1name> * * upgraded && transliterated && expanded * synonym "adamšuk, m" IS FOUND because of the query variation for "adamčuk, m" the syn list */ // base part, must be present in all expected0 = "author:adamčuk, m author:adamčuk, m * author:adamčuk, " + "author:adamcuk, m author:adamcuk, m * author:adamcuk, " + "author:adamchuk, m author:adamchuk, m * author:adamchuk, " + "author:adamšuk, m author:adamšuk, m * author:adamšuk, " + "author:adamsuk, m author:adamsuk, m * author:adamsuk, " + "author:adamshuk, m author:adamshuk, m * author:adamshuk, " + "author:adamguk, m author:adamguk, m * author:adamguk, "; expected = expected0 + "author:adamčuk, molja author:adamčuk, molja * " + "author:adamchuk, molja author:adamchuk, molja * " + "author:adamcuk, molja author:adamcuk, molja *" ; testAuthorQuery( "\"adamčuk, molja\"", expected, "//*[@numFound='29']", // "adamčuk, molja" numFound=29 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel 7 Adamčuk, Molja K // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel // 26 Adamcuk, Molja K 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 65 Adamguk, M Karel // 67 Adamguk, M K 80 Adamshuk, 81 Adamshuk, M. // 85 Adamshuk, M Karel 87 Adamshuk, M K "\"adamcuk, molja\"", expected, "//*[@numFound='29']", // "adamcuk, molja" numFound=29 // 1 Adamčuk, 2 Adamčuk, M. 
4 Adamčuk, Molja // 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel 7 Adamčuk, Molja K // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel // 26 Adamcuk, Molja K 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 65 Adamguk, M Karel // 67 Adamguk, M K 80 Adamshuk, 81 Adamshuk, M. // 85 Adamshuk, M Karel 87 Adamshuk, M K "\"adamchuk, molja\"", expected, "//*[@numFound='29']", // "adamchuk, molja" numFound=29 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel 7 Adamčuk, Molja K // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel // 26 Adamcuk, Molja K 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 65 Adamguk, M Karel // 67 Adamguk, M K 80 Adamshuk, 81 Adamshuk, M. // 85 Adamshuk, M Karel 87 Adamshuk, M K "\"adamczuk, molja\"", expected + "author:adamczuk, molja author:adamczuk, molja * author:adamczuk, m author:adamczuk, m * author:adamczuk,", "//*[@numFound='29']", // "adamczuk, molja" numFound=29 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel 7 Adamčuk, Molja K // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel // 26 Adamcuk, Molja K 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 65 Adamguk, M Karel // 67 Adamguk, M K 80 Adamshuk, 81 Adamshuk, M. 
// 85 Adamshuk, M Karel 87 Adamshuk, M K // "adamčuk, molja" is not there (and cannot be, because it is not in // synonym map, but synonym "adamšuk, m" is found correctly) "\"adamšuk, molja\"", expected0 + "author:adamšuk, molja author:adamšuk, molja * " + "author:adamshuk, molja author:adamshuk, molja * " + "author:adamsuk, molja author:adamsuk, molja *", "//*[@numFound='23']", // shorter by two variants, because "adamguk, molja" is already ascii form // it doesn't generate: "author:adamshuk, molja author:adamsuk, molja" // that is correct, because "adamšuk, m" is found and transliterated // "adamšuk, molja" simply isn't in any synonym list and we therefore cannot have it // "adamšuk, molja" numFound=23 // 1 Adamčuk, 2 Adamčuk, M. 6 Adamčuk, M Karel // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 25 Adamcuk, M Karel 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 45 Adamchuk, M Karel 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 65 Adamguk, M Karel // 67 Adamguk, M K 80 Adamshuk, 81 Adamshuk, M. // 83 Adamshuk, Molja 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel // 86 Adamshuk, Molja K 87 Adamshuk, M K "\"adamguk, molja\"", expected0 + "author:adamguk, molja author:adamguk, molja *", "//*[@numFound='23']" // "adamguk, molja" numFound=23 // 1 Adamčuk, 2 Adamčuk, M. 6 Adamčuk, M Karel // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 25 Adamcuk, M Karel 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 45 Adamchuk, M Karel 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 63 Adamguk, Molja // 64 Adamguk, Molja Karel 65 Adamguk, M Karel 66 Adamguk, Molja K // 67 Adamguk, M K 80 Adamshuk, 81 Adamshuk, M.
// 85 Adamshuk, M Karel 87 Adamshuk, M K ); /** * <surname>, <1name> <2> * * upgraded && transliterated && expanded * synonym adamšuk IS NOT FOUND because there is no entry for "adamčuk, molja k" nor * there is any "adamčuk, m k" in the syn list * * NOTE: if you think that "adamšuk" should be found in our model, then you are wrong * because "adamcuk, m k" is a different name than "adamcuk, m" * We are not going to do any magic to find the surname mapping, in other words: * we are not going to replace a deficient synonym file. Because the correct translation * CAN WORK if "adamcuk, m k" and "adamcuk, m" are named as synonymous (see the example * case of "adamczuk, m k k") */ expected = "author:adamčuk, molja k author:adamčuk, molja k* " + "author:adamčuk, m k author:adamčuk, m k* " + "author:adamčuk, molja " + // ! author:adamčuk, molja * "author:adamčuk, m " + // ! author:adamčuk, m* "author:adamčuk, " + "author:adamcuk, molja k author:adamcuk, molja k* " + "author:adamcuk, m k author:adamcuk, m k* " + "author:adamcuk, molja " + // ! author:adamcuk, molja * "author:adamcuk, m " + // ! author:adamcuk, m* "author:adamcuk, " + "author:adamchuk, molja k author:adamchuk, molja k* " + "author:adamchuk, m k author:adamchuk, m k* " + "author:adamchuk, molja " + // ! author:adamchuk, molja * "author:adamchuk, m " + // ! author:adamchuk, m* "author:adamchuk,"; testAuthorQuery( "\"adamčuk, molja k\"", expected, "//*[@numFound='21']", // "adamčuk, molja k" numFound=21 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel 7 Adamčuk, Molja K // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel // 26 Adamcuk, Molja K 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K "\"adamcuk, molja k\"", expected, "//*[@numFound='21']", // "adamcuk, molja k" numFound=21 // 1 Adamčuk, 2 Adamčuk, M.
4 Adamčuk, Molja // 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel 7 Adamčuk, Molja K // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel // 26 Adamcuk, Molja K 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K "\"adamchuk, molja k\"", expected, "//*[@numFound='21']", // this contains 4 more entries because by default, the // transliteration produces only adam(c|ch)uk // "adamchuk, molja k" numFound=21 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel 7 Adamčuk, Molja K // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel // 26 Adamcuk, Molja K 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K "\"adamczuk, molja k\"", expected + " author:adamczuk, molja k author:adamczuk, molja k* author:adamczuk, m k author:adamczuk, m k* author:adamczuk, molja author:adamczuk, m author:adamczuk,", "//*[@numFound='21']", // "adamczuk, molja k" numFound=21 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel 7 Adamčuk, Molja K // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel // 26 Adamcuk, Molja K 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M. 
43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K "\"adamšuk, molja k\"", "author:adamšuk, molja k author:adamšuk, molja k* " + "author:adamšuk, m k author:adamšuk, m k* " + "author:adamšuk, molja " + "author:adamšuk, m " + "author:adamšuk, " + "author:adamsuk, molja k author:adamsuk, molja k* " + "author:adamsuk, m k author:adamsuk, m k* " + "author:adamsuk, molja " + "author:adamsuk, m " + "author:adamsuk, " + "author:adamshuk, molja k author:adamshuk, molja k* " + "author:adamshuk, m k author:adamshuk, m k* " + "author:adamshuk, molja " + "author:adamshuk, m " + "author:adamshuk,", "//*[@numFound='7']", // "adamšuk, molja k" numFound=7 // 80 Adamshuk, 81 Adamshuk, M. 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K "\"adamguk, molja k\"", "author:adamguk, molja k author:adamguk, molja k* " + "author:adamguk, m k author:adamguk, m k* " + "author:adamguk, molja " + "author:adamguk, m " + "author:adamguk,", "//*[@numFound='7']" // "adamguk, molja k" numFound=7 // 60 Adamguk, 61 Adamguk, M. 63 Adamguk, Molja // 64 Adamguk, Molja Karel 65 Adamguk, M Karel 66 Adamguk, Molja K // 67 Adamguk, M K ); /** * <surname>, <1name> <2name> * * It works as above with the addition that the VARIATIONS of the initials/full names * are produced, ie. Aaaa B Ccccc will produce * Aaaa B C * A B C, A B Cccc * Aaaa B Cccc * * And through these variations, we find the upgraded form "adamčuk, molja k" * * This has the benefit of us finding the combination of name/initials even if * we didn't encounter them during indexing. 
HOWEVER, to avoid false hits these * combinations are found only for names that have a certain number of parts, * default >= 3 */ // we expect the same results as above (the difference is in the "..., k k *") // plus whatever comes out of the original input transliteration/combination expected0 = "author:adamčuk, molja k author:adamčuk, molja k * " + "author:adamčuk, m k author:adamčuk, m k * " + "author:adamčuk, molja " + // <- in my opinion this is wrong (too much recall), but it was requested "author:adamčuk, m " + "author:adamčuk, " + "author:adamcuk, molja k author:adamcuk, molja k * " + "author:adamcuk, m k author:adamcuk, m k * " + "author:adamcuk, molja " + // ditto "author:adamcuk, m " + "author:adamcuk, " + "author:adamchuk, molja k author:adamchuk, molja k * " + "author:adamchuk, m k author:adamchuk, m k * " + "author:adamchuk, molja " + //ditto "author:adamchuk, m " + "author:adamchuk,"; //dumpDoc(null, "id", "author"); testAuthorQuery( "\"adamčuk, molja karel\"", expected0 + " " + "author:adamčuk, molja karel author:adamčuk, molja karel * " + "author:adamčuk, m karel author:adamčuk, m karel * " + "author:adamchuk, molja karel author:adamchuk, molja karel * " + "author:adamchuk, m karel author:adamchuk, m karel * " + "author:adamcuk, molja karel author:adamcuk, molja karel * " + "author:adamcuk, m karel author:adamcuk, m karel *", "//*[@numFound='21']", // "adamčuk, molja karel" numFound=21 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel 7 Adamčuk, Molja K // 8 Adamčuk, M K 20 Adamcuk, 21 Adamcuk, M. // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel // 26 Adamcuk, Molja K 27 Adamcuk, M K 40 Adamchuk, // 41 Adamchuk, M.
43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K "\"adamcuk, molja karel\"", expected0 + " " + "author:adamcuk, molja karel author:adamcuk, molja karel * " + "author:adamcuk, m karel author:adamcuk, m karel *", "//*[@numFound='17']", // because adamcuk, m\w* k\w* is not searched // "adamcuk, molja karel" numFound=17 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 23 Adamcuk, Molja 24 Adamcuk, Molja Karel // 25 Adamcuk, M Karel 26 Adamcuk, Molja K 27 Adamcuk, M K // 40 Adamchuk, 41 Adamchuk, M. 43 Adamchuk, Molja // 46 Adamchuk, Molja K 47 Adamchuk, M K "\"adamchuk, molja karel\"", expected0 + " " + "author:adamchuk, molja karel author:adamchuk, molja karel * " + "author:adamchuk, m karel author:adamchuk, m karel *", "//*[@numFound='17']", // "adamchuk, molja karel" numFound=17 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 23 Adamcuk, Molja 26 Adamcuk, Molja K // 27 Adamcuk, M K 40 Adamchuk, 41 Adamchuk, M. // 43 Adamchuk, Molja 44 Adamchuk, Molja Karel 45 Adamchuk, M Karel // 46 Adamchuk, Molja K 47 Adamchuk, M K "\"adamczuk, molja karel\"", expected0 + " " + "author:adamczuk, molja karel author:adamczuk, molja karel * " + "author:adamczuk, m karel author:adamczuk, m karel * " + "author:adamczuk, molja k author:adamczuk, molja k * " + "author:adamczuk, m k author:adamczuk, m k * " + "author:adamczuk, molja author:adamczuk, m " + "author:adamczuk,", "//*[@numFound='15']",//-3 because "č"->"cz" normally doesn't exist // "adamczuk, molja karel" numFound=15 // 1 Adamčuk, 2 Adamčuk, M. 4 Adamčuk, Molja // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 23 Adamcuk, Molja 26 Adamcuk, Molja K // 27 Adamcuk, M K 40 Adamchuk, 41 Adamchuk, M. 
// 43 Adamchuk, Molja 46 Adamchuk, Molja K 47 Adamchuk, M K // almost exactly the same as above, the only difference must be the space before * "\"adamšuk, molja karel\"", "author:adamšuk, molja k author:adamšuk, molja k * " + "author:adamšuk, m k author:adamšuk, m k * " + "author:adamšuk, molja " + "author:adamšuk, m " + "author:adamšuk, " + "author:adamsuk, molja k author:adamsuk, molja k * " + "author:adamsuk, m k author:adamsuk, m k * " + "author:adamsuk, molja " + "author:adamsuk, m " + "author:adamsuk, " + "author:adamshuk, molja k author:adamshuk, molja k * " + "author:adamshuk, m k author:adamshuk, m k * " + "author:adamshuk, molja " + "author:adamshuk, m " + "author:adamshuk, " + // plus variants with karel "author:adamšuk, molja karel author:adamšuk, molja karel * " + "author:adamšuk, m karel author:adamšuk, m karel * " + "author:adamshuk, molja karel author:adamshuk, molja karel * " + "author:adamshuk, m karel author:adamshuk, m karel * " + "author:adamsuk, molja karel author:adamsuk, molja karel * " + "author:adamsuk, m karel author:adamsuk, m karel *", "//*[@numFound='7']", // "adamšuk, molja karel" numFound=7 // 80 Adamshuk, 81 Adamshuk, M. 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K "\"adamguk, molja karel\"", "author:adamguk, molja k author:adamguk, molja k * " + "author:adamguk, m k author:adamguk, m k * " + "author:adamguk, molja " + "author:adamguk, m " + "author:adamguk, " + // plus variants with karel "author:adamguk, molja karel author:adamguk, molja karel * " + "author:adamguk, m karel author:adamguk, m karel *", "//*[@numFound='7']" // "adamguk, molja karel" numFound=7 // 60 Adamguk, 61 Adamguk, M. 63 Adamguk, Molja // 64 Adamguk, Molja Karel 65 Adamguk, M Karel 66 Adamguk, Molja K // 67 Adamguk, M K ); /* * TODO: * * Also make sure we test that the expanding algorithm doesn't have unwanted consequences * and doesn't include too much, ie. 
that search for "adamčuk, mos" doesn't get * transformed into "adam(c|ch)uk, m" */ //TODO: show that the translation works properly when the synonym is in the synonym list // ie "adamčuk, m k;adamšuk, m k" /** * <surname>, <1> <2name> * * Speciality of this patter is that we want to search for regular * expression * * <surname>, <1>\w* <2> * <surname>, <1>\w* <2name> * * The following expansion will not find the synonyms and will not find * the upgrade. I am listing this example here specifically to show what * happens when the synonym list is missing some values (in real life, * the correct mapping will be generated IFF we encounter one of these * during indexing: * * adamčuk, m karel * adamčuk, mxxxx karel * * */ //dumpDoc(null, "id", "author"); testAuthorQuery( "\"adamčuk, m karel\"", "author:adamčuk, m karel author:adamčuk, m karel * " + "author:adamčuk, m k author:adamčuk, m k * " + "author:adamčuk, m " + "author:adamčuk, " + "author:adamcuk, m karel author:adamcuk, m karel * " + "author:adamcuk, m k author:adamcuk, m k * " + "author:adamcuk, m " + "author:adamcuk, " + "author:adamchuk, m karel author:adamchuk, m karel * " + "author:adamchuk, m k author:adamchuk, m k * " + "author:adamchuk, m " + "author:adamchuk, " + "author:/adamčuk, m[^\\s]+ karel/ " + "author:/adamčuk, m[^\\s]+ karel .*/ " + "author:/adamčuk, m[^\\s]+ k/ " + "author:/adamčuk, m[^\\s]+ k .*/ " + "author:/adamcuk, m[^\\s]+ karel/ " + "author:/adamcuk, m[^\\s]+ karel .*/ " + "author:/adamcuk, m[^\\s]+ k/ " + "author:/adamcuk, m[^\\s]+ k .*/ " + "author:/adamchuk, m[^\\s]+ karel/ " + "author:/adamchuk, m[^\\s]+ karel .*/ " + "author:/adamchuk, m[^\\s]+ k/ " + "author:/adamchuk, m[^\\s]+ k .*/" , "//*[@numFound='18']" , // "adamčuk, m karel" numFound=18 // 1 Adamčuk, 2 Adamčuk, M. 5 Adamčuk, Molja Karel // 6 Adamčuk, M Karel 7 Adamčuk, Molja K 8 Adamčuk, M K // 20 Adamcuk, 21 Adamcuk, M. 
24 Adamcuk, Molja Karel // 25 Adamcuk, M Karel 26 Adamcuk, Molja K 27 Adamcuk, M K // 40 Adamchuk, 41 Adamchuk, M. 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K "\"adamcuk, m karel\"", "author:adamcuk, m karel author:adamcuk, m karel * " + "author:/adamcuk, m[^\\s]+ karel/ author:/adamcuk, m[^\\s]+ karel .*/ " + "author:adamcuk, m k author:adamcuk, m k * " + "author:/adamcuk, m[^\\s]+ k/ author:/adamcuk, m[^\\s]+ k .*/ " + "author:adamcuk, m author:adamcuk," , "//*[@numFound='6']", // If you wonder why it is not the same as above, then know it is because of the // special setup - we are testing various situations (study the synonym and ascii // upgrade setup to understand details) // "adamcuk, m karel" numFound=6 // 20 Adamcuk, 21 Adamcuk, M. 24 Adamcuk, Molja Karel // 25 Adamcuk, M Karel 26 Adamcuk, Molja K 27 Adamcuk, M K "\"adamchuk, m karel\"", "author:adamchuk, m karel author:adamchuk, m karel * " + "author:/adamchuk, m[^\\s]+ karel/ author:/adamchuk, m[^\\s]+ karel .*/ " + "author:adamchuk, m k author:adamchuk, m k * " + "author:/adamchuk, m[^\\s]+ k/ author:/adamchuk, m[^\\s]+ k .*/ " + "author:adamchuk, m author:adamchuk," , "//*[@numFound='6']", // "adamchuk, m karel" numFound=6 // 40 Adamchuk, 41 Adamchuk, M. 
44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K "\"adamczuk, m karel\"", "author:adamczuk, m karel author:adamczuk, m karel * " + "author:/adamczuk, m[^\\s]+ karel/ author:/adamczuk, m[^\\s]+ karel .*/ " + "author:adamczuk, m k author:adamczuk, m k * " + "author:/adamczuk, m[^\\s]+ k/ author:/adamczuk, m[^\\s]+ k .*/ " + "author:adamczuk, m author:adamczuk," , "//*[@numFound='0']", "\"adamšuk, m karel\"", "author:adamšuk, m karel author:adamšuk, m karel * " + "author:/adamšuk, m[^\\s]+ karel/ author:/adamšuk, m[^\\s]+ karel .*/ " + "author:adamšuk, m k author:adamšuk, m k * " + "author:/adamšuk, m[^\\s]+ k/ author:/adamšuk, m[^\\s]+ k .*/ " + "author:adamšuk, m " + "author:adamšuk, " + "author:adamsuk, m karel author:adamsuk, m karel * " + "author:/adamsuk, m[^\\s]+ karel/ author:/adamsuk, m[^\\s]+ karel .*/ " + "author:adamsuk, m k author:adamsuk, m k * " + "author:/adamsuk, m[^\\s]+ k/ author:/adamsuk, m[^\\s]+ k .*/ " + "author:adamsuk, m " + "author:adamsuk, " + "author:adamshuk, m karel author:adamshuk, m karel * " + "author:/adamshuk, m[^\\s]+ karel/ author:/adamshuk, m[^\\s]+ karel .*/ " + "author:adamshuk, m k author:adamshuk, m k * " + "author:/adamshuk, m[^\\s]+ k/ author:/adamshuk, m[^\\s]+ k .*/ " + "author:adamshuk, m " + "author:adamshuk,", "//*[@numFound='6']", // "adamšuk, m karel" numFound=6 // 80 Adamshuk, 81 Adamshuk, M. 84 Adamshuk, Molja Karel // 85 Adamshuk, M Karel 86 Adamshuk, Molja K 87 Adamshuk, M K "\"adamguk, m karel\"", "author:adamguk, m karel author:adamguk, m karel * " + "author:/adamguk, m[^\\s]+ karel/ author:/adamguk, m[^\\s]+ karel .*/ " + "author:adamguk, m k author:adamguk, m k * " + "author:/adamguk, m[^\\s]+ k/ author:/adamguk, m[^\\s]+ k .*/ " + "author:adamguk, m author:adamguk," , "//*[@numFound='6']" // "adamguk, m karel" numFound=6 // 60 Adamguk, 61 Adamguk, M. 
64 Adamguk, Molja Karel // 65 Adamguk, M Karel 66 Adamguk, Molja K 67 Adamguk, M K ); /** * <surname>, <1> <2> * * Speciality of this patter is that we want to search for regular * expression * * <surname>, <1>\w* <2> * * The following expansion will not find the synonyms and will not find * the upgrade. I am listing this example here specifically to show what * happens when the synonym list is missing some values (in real life, * the correct mapping will be generated IFF we encounter one of these * during indexing: * * adamčuk, m karel * adamčuk, mxxxx karel * * */ expected = "author:adamčuk, a b author:adamčuk, a b* " + "author:/adamčuk, a[^\\s]+ b.*/ " + "author:adamčuk, a " + "author:adamčuk, " + "author:adamchuk, a b author:adamchuk, a b* " + "author:/adamchuk, a[^\\s]+ b.*/ " + "author:adamchuk, a " + "author:adamchuk, " + "author:adamcuk, a b author:adamcuk, a b* " + "author:/adamcuk, a[^\\s]+ b.*/ " + "author:adamcuk, a " + "author:adamcuk," ; testAuthorQuery( "\"adamčuk, a b\"", expected , "//*[@numFound='3']", // "adamčuk, a b" numFound=3 // 1 Adamčuk, 20 Adamcuk, 40 Adamchuk, "\"adamcuk, a b\"", expected , "//*[@numFound='3']", // "adamcuk, a b" numFound=3 // 1 Adamčuk, 20 Adamcuk, 40 Adamchuk, "\"adamchuk, a b\"", expected , "//*[@numFound='3']", // "adamchuk, a b" numFound=3 // 1 Adamčuk, 20 Adamcuk, 40 Adamchuk, "\"adamczuk, a b\"", expected + "author:adamczuk, a b author:adamczuk, a b* author:/adamczuk, a[^\\s]+ b.*/ author:adamczuk, a author:adamczuk,", "//*[@numFound='3']", // "adamczuk, a b" numFound=3 // 1 Adamčuk, 20 Adamcuk, 40 Adamchuk, "\"adamšuk, m k\"", "author:adam\u0161uk, m k author:adam\u0161uk, m k* " + "author:/adam\u0161uk, m[^\\s]+ k.*/ " + "author:adam\u0161uk, m " + "author:adam\u0161uk, " + "author:adamsuk, m k author:adamsuk, m k* " + "author:/adamsuk, m[^\\s]+ k.*/ " + "author:adamsuk, m " + "author:adamsuk, " + "author:adamshuk, m k author:adamshuk, m k* " + "author:/adamshuk, m[^\\s]+ k.*/ " + "author:adamshuk, m " + 
"author:adamshuk,", "//*[@numFound='6']", // "adamšuk, m k" numFound=5 // 80 Adamshuk, 81 Adamshuk, M. 85 Adamshuk, M Karel // 86 Adamshuk, Molja K 87 Adamshuk, M K 84 Adamshuk, Molja Karel "\"adamguk, m k\"", "author:adamguk, m k author:adamguk, m k* " + "author:/adamguk, m[^\\s]+ k.*/ " + "author:adamguk, m " + "author:adamguk," , "//*[@numFound='6']" // "adamguk, m k" numFound=5 // 60 Adamguk, 61 Adamguk, M. 65 Adamguk, M Karel // 66 Adamguk, Molja K 67 Adamguk, M K 64 Adamguk, Molja Karel ); /** * <surname>, <2> * * No expansion, because of the gap. Only transliteration * */ testAuthorQuery( "\"adamčuk, k\"", "author:adamčuk, k author:adamčuk, k* author:adamčuk, " + "author:adamchuk, k author:adamchuk, k* author:adamchuk, " + "author:adamcuk, k author:adamcuk, k* author:adamcuk,", "//*[@numFound='12']", // "adamčuk, k" numFound=12 // 1 Adamčuk, 9 Adamčuk, Karel Molja 10 Adamčuk, Karel M // 11 Adamčuk, K Molja 20 Adamcuk, 28 Adamcuk, Karel Molja // 29 Adamcuk, Karel M 30 Adamcuk, K Molja 40 Adamchuk, // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "\"adamcuk, k\"", "author:adamcuk, k author:adamcuk, k* author:adamcuk,", "//*[@numFound='4']", // "adamcuk, k" numFound=4 // 20 Adamcuk, 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja "\"adamchuk, k\"", "author:adamchuk, k author:adamchuk, k* author:adamchuk,", "//*[@numFound='4']", // "adamchuk, k" numFound=4 // 40 Adamchuk, 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M // 50 Adamchuk, K Molja "\"adamczuk, k\"", "author:adamczuk, k author:adamczuk, k* author:adamczuk,", "//*[@numFound='0']", // "adamczuk, k" numFound=0 "\"adamšuk, k\"", "author:adamšuk, k author:adamšuk, k* author:adamšuk, " + "author:adamsuk, k author:adamsuk, k* author:adamsuk, " + "author:adamshuk, k author:adamshuk, k* author:adamshuk,", "//*[@numFound='4']" // "adamšuk, k" numFound=4 // 80 Adamshuk, 88 Adamshuk, Karel Molja 89 Adamshuk, Karel M // 90 Adamshuk, K Molja , "\"adamguk, k\"", "author:adamguk, 
k author:adamguk, k* author:adamguk,", "//*[@numFound='4']" // "adamguk, k" numFound=4 // 60 Adamguk, 68 Adamguk, Karel Molja 69 Adamguk, Karel M // 70 Adamguk, K Molja ); /** * <surname>, <2name> * * No expansion, because of the gap. Only transliteration * */ testAuthorQuery( "\"adamčuk, karel\"", "author:adamčuk, karel author:adamčuk, karel * " + "author:adamčuk, k author:adamčuk, k * author:adamčuk, " + "author:adamcuk, karel author:adamcuk, karel * " + "author:adamcuk, k author:adamcuk, k * author:adamcuk, " + "author:adamchuk, karel author:adamchuk, karel * " + "author:adamchuk, k author:adamchuk, k * author:adamchuk,", "//*[@numFound='12']", // "adamčuk, karel" numFound=12 // 1 Adamčuk, 9 Adamčuk, Karel Molja 10 Adamčuk, Karel M // 11 Adamčuk, K Molja 20 Adamcuk, 28 Adamcuk, Karel Molja // 29 Adamcuk, Karel M 30 Adamcuk, K Molja 40 Adamchuk, // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "\"adamcuk, karel\"", "author:adamcuk, karel author:adamcuk, karel * " + "author:adamcuk, k author:adamcuk, k * author:adamcuk,", "//*[@numFound='4']", // "adamcuk, karel" numFound=4 // 20 Adamcuk, 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja "\"adamchuk, karel\"", "author:adamchuk, karel author:adamchuk, karel * " + "author:adamchuk, k author:adamchuk, k * author:adamchuk,", "//*[@numFound='4']", // "adamchuk, karel" numFound=4 // 40 Adamchuk, 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M // 50 Adamchuk, K Molja "\"adamczuk, karel\"", "author:adamczuk, karel author:adamczuk, karel * " + "author:adamczuk, k author:adamczuk, k * author:adamczuk,", "//*[@numFound='0']", // "adamczuk, karel" numFound=0 "\"adamšuk, karel\"", "author:adamšuk, karel author:adamšuk, karel * " + "author:adamšuk, k author:adamšuk, k * author:adamšuk, " + "author:adamshuk, karel author:adamshuk, karel * " + "author:adamshuk, k author:adamshuk, k * author:adamshuk, " + "author:adamsuk, karel author:adamsuk, karel * " + "author:adamsuk, k author:adamsuk, k * 
author:adamsuk,", "//*[@numFound='4']", // "adamšuk, karel" numFound=4 // 80 Adamshuk, 88 Adamshuk, Karel Molja 89 Adamshuk, Karel M // 90 Adamshuk, K Molja "\"adamguk, karel\"", "author:adamguk, karel author:adamguk, karel * " + "author:adamguk, k author:adamguk, k * author:adamguk,", "//*[@numFound='4']" // "adamguk, karel" numFound=4 // 60 Adamguk, 68 Adamguk, Karel Molja 69 Adamguk, Karel M // 70 Adamguk, K Molja ); /** * <surname>, <2name> <1> * * The order is not correct, therefore no expansion. Only transliteration * */ testAuthorQuery( "\"adamčuk, karel m\"", "author:adamčuk, karel m author:adamčuk, karel m* " + "author:adamčuk, k m author:adamčuk, k m* author:adamčuk, karel " + "author:adamčuk, k author:adamčuk, author:adamchuk, karel m " + "author:adamchuk, karel m* author:adamchuk, k m " + "author:adamchuk, k m* author:adamchuk, karel author:adamchuk, k " + "author:adamchuk, author:adamcuk, karel m author:adamcuk, karel m* " + "author:adamcuk, k m author:adamcuk, k m* author:adamcuk, karel " + "author:adamcuk, k author:adamcuk,", "//*[@numFound='12']", // "adamčuk, karel m" numFound=12 // 1 Adamčuk, 9 Adamčuk, Karel Molja 10 Adamčuk, Karel M // 11 Adamčuk, K Molja 20 Adamcuk, 28 Adamcuk, Karel Molja // 29 Adamcuk, Karel M 30 Adamcuk, K Molja 40 Adamchuk, // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "\"adamcuk, karel m\"", "author:adamcuk, karel m author:adamcuk, karel m* author:adamcuk, k m " + "author:adamcuk, k m* author:adamcuk, karel author:adamcuk, k author:adamcuk,", "//*[@numFound='4']", // "adamcuk, karel m" numFound=4 // 20 Adamcuk, 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja "\"adamchuk, karel m\"", "author:adamchuk, karel m author:adamchuk, karel m* author:adamchuk, k m " + "author:adamchuk, k m* author:adamchuk, karel author:adamchuk, k author:adamchuk,", "//*[@numFound='4']", // "adamchuk, karel m" numFound=4 // 40 Adamchuk, 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M // 50 Adamchuk, K Molja 
"\"adamczuk, karel m\"", "author:adamczuk, karel m author:adamczuk, karel m* author:adamczuk, k m " + "author:adamczuk, k m* author:adamczuk, karel author:adamczuk, k author:adamczuk,", "//*[@numFound='0']", // "adamczuk, karel m" numFound=0 "\"adamšuk, karel m\"", "author:adamšuk, karel m author:adamšuk, karel m* author:adamšuk, k m " + "author:adamšuk, k m* author:adamšuk, karel author:adamšuk, k author:adamšuk, " + "author:adamsuk, karel m author:adamsuk, karel m* author:adamsuk, k m " + "author:adamsuk, k m* author:adamsuk, karel author:adamsuk, k author:adamsuk, " + "author:adamshuk, karel m author:adamshuk, karel m* author:adamshuk, k m " + "author:adamshuk, k m* author:adamshuk, karel author:adamshuk, k " + "author:adamshuk,", "//*[@numFound='4']", // "adamšuk, karel m" numFound=4 // 80 Adamshuk, 88 Adamshuk, Karel Molja 89 Adamshuk, Karel M // 90 Adamshuk, K Molja "\"adamguk, karel m\"", "author:adamguk, karel m author:adamguk, karel m* author:adamguk, k m " + "author:adamguk, k m* author:adamguk, karel author:adamguk, k author:adamguk,", "//*[@numFound='4']" // "adamguk, karel m" numFound=4 // 60 Adamguk, 68 Adamguk, Karel Molja 69 Adamguk, Karel M // 70 Adamguk, K Molja ); /** * <surname>, <2name> <1name> * * The order is not correct. 
Only transliteration * */ testAuthorQuery( "\"adamčuk, karel molja\"", "author:adamčuk, karel molja author:adamčuk, karel molja * " + "author:adamčuk, k molja author:adamčuk, k molja * author:adamčuk, karel m " + "author:adamčuk, karel m * author:adamčuk, k m author:adamčuk, k m * " + "author:adamčuk, karel author:adamčuk, k author:adamčuk, " + "author:adamcuk, karel molja author:adamcuk, karel molja * " + "author:adamcuk, k molja author:adamcuk, k molja * author:adamcuk, karel m " + "author:adamcuk, karel m * author:adamcuk, k m author:adamcuk, k m * " + "author:adamcuk, karel author:adamcuk, k author:adamcuk, " + "author:adamchuk, karel molja author:adamchuk, karel molja * " + "author:adamchuk, k molja author:adamchuk, k molja * " + "author:adamchuk, karel m author:adamchuk, karel m * " + "author:adamchuk, k m author:adamchuk, k m * " + "author:adamchuk, karel author:adamchuk, k author:adamchuk,", "//*[@numFound='12']", // "adamčuk, karel molja" numFound=12 // 1 Adamčuk, 9 Adamčuk, Karel Molja 10 Adamčuk, Karel M // 11 Adamčuk, K Molja 20 Adamcuk, 28 Adamcuk, Karel Molja // 29 Adamcuk, Karel M 30 Adamcuk, K Molja 40 Adamchuk, // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "\"adamcuk, karel molja\"", "author:adamcuk, karel molja author:adamcuk, karel molja * " + "author:adamcuk, k molja author:adamcuk, k molja * " + "author:adamcuk, karel m author:adamcuk, karel m * " + "author:adamcuk, k m author:adamcuk, k m * author:adamcuk, karel " + "author:adamcuk, k author:adamcuk,", "//*[@numFound='4']", // "adamcuk, karel molja" numFound=4 // 20 Adamcuk, 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja "\"adamchuk, karel molja\"", "author:adamchuk, karel molja author:adamchuk, karel molja * " + "author:adamchuk, k molja author:adamchuk, k molja * " + "author:adamchuk, karel m author:adamchuk, karel m * " + "author:adamchuk, k m author:adamchuk, k m * author:adamchuk, karel " + "author:adamchuk, k author:adamchuk,", 
"//*[@numFound='4']", // "adamchuk, karel molja" numFound=4 // 40 Adamchuk, 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M // 50 Adamchuk, K Molja "\"adamczuk, karel molja\"", "author:adamczuk, karel molja author:adamczuk, karel molja * " + "author:adamczuk, k molja author:adamczuk, k molja * " + "author:adamczuk, karel m author:adamczuk, karel m * " + "author:adamczuk, k m author:adamczuk, k m * " + "author:adamczuk, karel author:adamczuk, k author:adamczuk,", "//*[@numFound='0']", "\"adamšuk, karel molja\"", "author:adamšuk, karel molja author:adamšuk, karel molja * " + "author:adamšuk, k molja author:adamšuk, k molja * " + "author:adamšuk, karel m author:adamšuk, karel m * " + "author:adamšuk, k m author:adamšuk, k m * " + "author:adamšuk, karel author:adamšuk, k author:adamšuk, " + "author:adamsuk, karel molja author:adamsuk, karel molja * " + "author:adamsuk, k molja author:adamsuk, k molja * " + "author:adamsuk, karel m author:adamsuk, karel m * " + "author:adamsuk, k m author:adamsuk, k m * author:adamsuk, karel " + "author:adamsuk, k author:adamsuk, author:adamshuk, karel molja " + "author:adamshuk, karel molja * author:adamshuk, k molja " + "author:adamshuk, k molja * author:adamshuk, karel m " + "author:adamshuk, karel m * author:adamshuk, k m " + "author:adamshuk, k m * author:adamshuk, karel " + "author:adamshuk, k author:adamshuk,", "//*[@numFound='4']", // "adamšuk, karel molja" numFound=4 // 80 Adamshuk, 88 Adamshuk, Karel Molja 89 Adamshuk, Karel M // 90 Adamshuk, K Molja "\"adamguk, karel molja\"", "author:adamguk, karel molja author:adamguk, karel molja * " + "author:adamguk, k molja author:adamguk, k molja * " + "author:adamguk, karel m author:adamguk, karel m * " + "author:adamguk, k m author:adamguk, k m * " + "author:adamguk, karel author:adamguk, k " + "author:adamguk,", "//*[@numFound='4']" // "adamguk, karel molja" numFound=4 // 60 Adamguk, 68 Adamguk, Karel Molja 69 Adamguk, Karel M // 70 Adamguk, K Molja ); /** * <surname>, <2> <1> * * The 
order is not correct, therefore no expansion. Only transliteration * */ testAuthorQuery( "\"adamčuk, k m\"", "author:adamčuk, k m author:adamčuk, k m* " + "author:/adamčuk, k[^\\s]+ m.*/ " + "author:adamčuk, k author:adamčuk, " + "author:adamchuk, k m author:adamchuk, k m* " + "author:/adamchuk, k[^\\s]+ m.*/ " + "author:adamchuk, k author:adamchuk, " + "author:adamcuk, k m author:adamcuk, k m* " + "author:/adamcuk, k[^\\s]+ m.*/ " + "author:adamcuk, k author:adamcuk,", "//*[@numFound='12']" // "adamčuk, k m" numFound=12 // 1 Adamčuk, 9 Adamčuk, Karel Molja 10 Adamčuk, Karel M // 11 Adamčuk, K Molja 20 Adamcuk, 28 Adamcuk, Karel Molja // 29 Adamcuk, Karel M 30 Adamcuk, K Molja 40 Adamchuk, // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja ); testAuthorQuery( "\"adamcuk, k m\"", "author:adamcuk, k m author:adamcuk, k m* " + "author:/adamcuk, k[^\\s]+ m.*/ " + "author:adamcuk, k author:adamcuk,", "//*[@numFound='4']" // "adamcuk, k m" numFound=4 // 20 Adamcuk, 29 Adamcuk, Karel M 30 Adamcuk, K Molja // 28 Adamcuk, Karel Molja ); testAuthorQuery( "\"adamchuk, k m\"", "author:adamchuk, k m author:adamchuk, k m* " + "author:/adamchuk, k[^\\s]+ m.*/ " + "author:adamchuk, k author:adamchuk,", "//*[@numFound='4']" // "adamchuk, k m" numFound=4 // 40 Adamchuk, 49 Adamchuk, Karel M 50 Adamchuk, K Molja // xx Adamchuk, Karel Molja ); testAuthorQuery( "\"adamczuk, k m\"", "author:adamczuk, k m author:adamczuk, k m* " + "author:/adamczuk, k[^\\s]+ m.*/ " + "author:adamczuk, k author:adamczuk,", "//*[@numFound='0']" // "adamczuk, k m" numFound=0 ); testAuthorQuery( "\"adamšuk, k m\"", "author:adamšuk, k m author:adamšuk, k m* " + "author:/adamšuk, k[^\\s]+ m.*/ " + "author:adamšuk, k author:adamšuk, " + "author:adamshuk, k m author:adamshuk, k m* " + "author:/adamshuk, k[^\\s]+ m.*/ " + "author:adamshuk, k author:adamshuk, " + "author:adamsuk, k m author:adamsuk, k m* " + "author:/adamsuk, k[^\\s]+ m.*/ " + "author:adamsuk, k author:adamsuk,", 
"//*[@numFound='4']" // "adamšuk, k m" numFound=4 // 80 Adamshuk, 89 Adamshuk, Karel M 90 Adamshuk, K Molja // xx Adamshuk, Karel Molja ); testAuthorQuery( "\"adamguk, k m\"", "author:adamguk, k m author:adamguk, k m* " + "author:/adamguk, k[^\\s]+ m.*/ " + "author:adamguk, k author:adamguk,", "//*[@numFound='4']" // "adamguk, k m" numFound=4 // 60 Adamguk, 69 Adamguk, Karel M 70 Adamguk, K Molja // xx Adamguk, Karel Molja ); /** * <surname>, <2> <1name> * * The order is not correct, therefore no expansion. Only transliteration * */ testAuthorQuery( "\"adamčuk, k molja\"", "author:adamčuk, k molja author:adamčuk, k molja * " + "author:/adamčuk, k[^\\s]+ molja/ author:/adamčuk, k[^\\s]+ molja .*/ " + "author:adamčuk, k m author:adamčuk, k m * " + "author:/adamčuk, k[^\\s]+ m/ author:/adamčuk, k[^\\s]+ m .*/ " + "author:adamčuk, k author:adamčuk, " + "author:adamcuk, k molja author:adamcuk, k molja * " + "author:/adamcuk, k[^\\s]+ molja/ author:/adamcuk, k[^\\s]+ molja .*/ " + "author:adamcuk, k m author:adamcuk, k m * " + "author:/adamcuk, k[^\\s]+ m/ author:/adamcuk, k[^\\s]+ m .*/ " + "author:adamcuk, k author:adamcuk, " + "author:adamchuk, k molja author:adamchuk, k molja * " + "author:/adamchuk, k[^\\s]+ molja/ author:/adamchuk, k[^\\s]+ molja .*/ " + "author:adamchuk, k m author:adamchuk, k m * " + "author:/adamchuk, k[^\\s]+ m/ author:/adamchuk, k[^\\s]+ m .*/ " + "author:adamchuk, k author:adamchuk,", "//*[@numFound='12']", // "adamčuk, k molja" numFound=12 // 1 Adamčuk, 9 Adamčuk, Karel Molja 10 Adamčuk, Karel M // 11 Adamčuk, K Molja 20 Adamcuk, 28 Adamcuk, Karel Molja // 29 Adamcuk, Karel M 30 Adamcuk, K Molja 40 Adamchuk, // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "\"adamcuk, k molja\"", "author:adamcuk, k molja author:adamcuk, k molja * " + "author:/adamcuk, k[^\\s]+ molja/ author:/adamcuk, k[^\\s]+ molja .*/ " + "author:adamcuk, k m author:adamcuk, k m * " + "author:/adamcuk, k[^\\s]+ m/ author:/adamcuk, k[^\\s]+ m .*/ " + 
"author:adamcuk, k author:adamcuk,", "//*[@numFound='4']", // "adamcuk, k molja" numFound=4 // 20 Adamcuk, 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja "\"adamchuk, k molja\"", "author:adamchuk, k molja author:adamchuk, k molja * " + "author:/adamchuk, k[^\\s]+ molja/ author:/adamchuk, k[^\\s]+ molja .*/ " + "author:adamchuk, k m author:adamchuk, k m * " + "author:/adamchuk, k[^\\s]+ m/ author:/adamchuk, k[^\\s]+ m .*/ " + "author:adamchuk, k author:adamchuk,", "//*[@numFound='4']", // "adamchuk, k molja" numFound=4 // 40 Adamchuk, 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M // 50 Adamchuk, K Molja "\"adamczuk, k molja\"", "author:adamczuk, k molja author:adamczuk, k molja * " + "author:/adamczuk, k[^\\s]+ molja/ author:/adamczuk, k[^\\s]+ molja .*/ " + "author:adamczuk, k m author:adamczuk, k m * " + "author:/adamczuk, k[^\\s]+ m/ author:/adamczuk, k[^\\s]+ m .*/ " + "author:adamczuk, k author:adamczuk,", "//*[@numFound='0']", // "adamczuk, k molja" numFound=0 "\"adamšuk, k molja\"", "author:adamšuk, k molja author:adamšuk, k molja * " + "author:/adamšuk, k[^\\s]+ molja/ author:/adamšuk, k[^\\s]+ molja .*/ " + "author:adamšuk, k m author:adamšuk, k m * author:/adamšuk, k[^\\s]+ m/ " + "author:/adamšuk, k[^\\s]+ m .*/ author:adamšuk, k author:adamšuk, " + "author:adamsuk, k molja author:adamsuk, k molja * " + "author:/adamsuk, k[^\\s]+ molja/ author:/adamsuk, k[^\\s]+ molja .*/ " + "author:adamsuk, k m author:adamsuk, k m * author:/adamsuk, k[^\\s]+ m/ " + "author:/adamsuk, k[^\\s]+ m .*/ author:adamsuk, k author:adamsuk, " + "author:adamshuk, k molja author:adamshuk, k molja * " + "author:/adamshuk, k[^\\s]+ molja/ author:/adamshuk, k[^\\s]+ molja .*/ " + "author:adamshuk, k m author:adamshuk, k m * author:/adamshuk, k[^\\s]+ m/ " + "author:/adamshuk, k[^\\s]+ m .*/ author:adamshuk, k author:adamshuk,", "//*[@numFound='4']", // "adamšuk, k molja" numFound=4 // 80 Adamshuk, 88 Adamshuk, Karel Molja 89 Adamshuk, Karel M // 90 Adamshuk, K 
Molja "\"adamguk, k molja\"", "author:adamguk, k molja author:adamguk, k molja * " + "author:/adamguk, k[^\\s]+ molja/ author:/adamguk, k[^\\s]+ molja .*/ " + "author:adamguk, k m author:adamguk, k m * " + "author:/adamguk, k[^\\s]+ m/ author:/adamguk, k[^\\s]+ m .*/ " + "author:adamguk, k author:adamguk,", "//*[@numFound='4']" // "adamguk, k molja" numFound=4 // 60 Adamguk, 68 Adamguk, Karel Molja 69 Adamguk, Karel M // 70 Adamguk, K Molja ); /** * <surname>, <1*> * <surname>, <1n*> * * No expansion should happen if the <part*> has more than 2 characters, otherwise * it should work as if <surname>, <1> was specified * */ expected = "author:adamšuk, m author:adamšuk, m* author:adamšuk, " + "author:adamsuk, m author:adamsuk, m* author:adamsuk, " + "author:adamshuk, m author:adamshuk, m* author:adamshuk, " + "author:adamguk, m author:adamguk, m* author:adamguk, " + "author:adamčuk, m author:adamčuk, m* author:adamčuk, " + "author:adamchuk, m author:adamchuk, m* author:adamchuk, " + "author:adamcuk, m author:adamcuk, m* author:adamcuk,"; testAuthorQuery( "\"adamčuk, m*\"", expected + " author:adamguk, m author:adamšuk, m", "//*[@numFound='40']", // "adamčuk, m*" numFound=40 // 1 Adamčuk, 2 Adamčuk, M. 3 Adamčuk, Marel // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 6 Adamčuk, M Karel // 7 Adamčuk, Molja K 8 Adamčuk, M K 20 Adamcuk, // 21 Adamcuk, M. 22 Adamcuk, Marel 23 Adamcuk, Molja // 24 Adamcuk, Molja Karel 25 Adamcuk, M Karel 26 Adamcuk, Molja K // 27 Adamcuk, M K 40 Adamchuk, 41 Adamchuk, M. // 42 Adamchuk, Marel 43 Adamchuk, Molja 44 Adamchuk, Molja Karel // 45 Adamchuk, M Karel 46 Adamchuk, Molja K 47 Adamchuk, M K // 60 Adamguk, 61 Adamguk, M. 62 Adamguk, Marel // 63 Adamguk, Molja 64 Adamguk, Molja Karel 65 Adamguk, M Karel // 66 Adamguk, Molja K 67 Adamguk, M K 80 Adamshuk, // 81 Adamshuk, M. 
82 Adamshuk, Marel 83 Adamshuk, Molja // 84 Adamshuk, Molja Karel 85 Adamshuk, M Karel 86 Adamshuk, Molja K // 87 Adamshuk, M K "\"adamcuk, m*\"", expected, "//*[@numFound='40']", "\"adamchuk, m*\"", expected, "//*[@numFound='40']", "\"adamczuk, m*\"", expected + " author:adamczuk, m author:adamczuk, m* author:adamczuk,", "//*[@numFound='40']", "\"adamšuk, m*\"", expected + " author:adamguk, m author:adamčuk, m", "//*[@numFound='40']", "\"adamguk, m*\"", expected + " author:adamčuk, m author:adamšuk, m", "//*[@numFound='40']", "\"adamčuk, mo*\"", "author:adamčuk, mo*", "//*[@numFound='3']", // "adamčuk, mo*" numFound=3 // 4 Adamčuk, Molja 5 Adamčuk, Molja Karel 7 Adamčuk, Molja K "\"adamcuk, mo*\"", "author:adamcuk, mo*", "//*[@numFound='3']", // "adamcuk, mo*" numFound=3 // 23 Adamcuk, Molja 24 Adamcuk, Molja Karel 26 Adamcuk, Molja K "\"adamchuk, mo*\"", "author:adamchuk, mo*", "//*[@numFound='3']", // "adamchuk, mo*" numFound=3 // 43 Adamchuk, Molja 44 Adamchuk, Molja Karel 46 Adamchuk, Molja K "\"adamczuk, mo*\"", "author:adamczuk, mo*", "//*[@numFound='0']", // "adamczuk, mo*" numFound=0 "\"adamšuk, mo*\"", "author:adamšuk, mo*", "//*[@numFound='0']", // "adamšuk, mo*" numFound=0 "\"adamguk, mo*\"", "author:adamguk, mo*", "//*[@numFound='3']" // "adamguk, mo*" numFound=3 // 63 Adamguk, Molja 64 Adamguk, Molja Karel 66 Adamguk, Molja K ); /** * <surname>, <2*> * <surname>, <2n*> * * No expansion should happen if the <part*> has more than 2 characters, otherwise * it should work only if such a patter is in the synonym list (and there is none) * */ testAuthorQuery( "\"adamčuk, k*\"", "author:adamčuk, k author:adamčuk, k* author:adamčuk, " + "author:adamchuk, k author:adamchuk, k* author:adamchuk, " + "author:adamcuk, k author:adamcuk, k* author:adamcuk,", "//*[@numFound='12']", // "adamčuk, k*" numFound=12 // 1 Adamčuk, 9 Adamčuk, Karel Molja 10 Adamčuk, Karel M // 11 Adamčuk, K Molja 20 Adamcuk, 28 Adamcuk, Karel Molja // 29 Adamcuk, Karel M 30 Adamcuk, K Molja 
40 Adamchuk, // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M 50 Adamchuk, K Molja "\"adamcuk, k*\"", "author:adamcuk, k author:adamcuk, k* author:adamcuk,", "//*[@numFound='4']", // because there is no synonym mapping for "a, k" (but there is one for "a, m"!) // "adamcuk, k*" numFound=4 // 20 Adamcuk, 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M // 30 Adamcuk, K Molja "\"adamchuk, k*\"", "author:adamchuk, k author:adamchuk, k* author:adamchuk,", "//*[@numFound='4']", // "adamchuk, k*" numFound=4 // 40 Adamchuk, 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M // 50 Adamchuk, K Molja "\"adamczuk, k*\"", "author:adamczuk, k author:adamczuk, k* author:adamczuk,", "//*[@numFound='0']", "\"adamšuk, k*\"", "author:adamšuk, k author:adamšuk, k* author:adamšuk, " + "author:adamsuk, k author:adamsuk, k* author:adamsuk, " + "author:adamshuk, k author:adamshuk, k* author:adamshuk,", "//*[@numFound='4']", // "adamšuk, k*" numFound=4 // 80 Adamshuk, 88 Adamshuk, Karel Molja 89 Adamshuk, Karel M // 90 Adamshuk, K Molja "\"adamguk, k*\"", "author:adamguk, k author:adamguk, k* author:adamguk,", "//*[@numFound='4']", // "adamguk, k*" numFound=4 // 60 Adamguk, 68 Adamguk, Karel Molja 69 Adamguk, Karel M // 70 Adamguk, K Molja "\"adamčuk, ka*\"", "author:adamčuk, ka*", "//*[@numFound='2']", // 9 Adamčuk, Karel Molja 10 Adamčuk, Karel M "\"adamcuk, ka*\"", "author:adamcuk, ka*", "//*[@numFound='2']", // "adamcuk, ka*" numFound=2 // 28 Adamcuk, Karel Molja 29 Adamcuk, Karel M "\"adamchuk, ka*\"", "author:adamchuk, ka*", "//*[@numFound='2']", // "adamchuk, ka*" numFound=2 // 48 Adamchuk, Karel Molja 49 Adamchuk, Karel M "\"adamczuk, ka*\"", "author:adamczuk, ka*", "//*[@numFound='0']", "\"adamšuk, ka*\"", "author:adamšuk, ka*", "//*[@numFound='0']", // "adamšuk, ka*" numFound=0 "\"adamguk, ka*\"", "author:adamguk, ka*", "//*[@numFound='2']" // "adamguk, ka*" numFound=2 // 28 Adamguk, Karel Molja 29 Adamguk, Karel M ); /** * * The special case of synonym expansion called "semantic upgrade" 
* Basically, if the user input is too short - eg. "jones, c" * and our synonym file contains only these entries * "jones, christine; forman,christine" * * Then we want to be able to find that "jones, c" corresponds to * "jones, christine" and add the "forman, christine" and * "forman, c" to the expanded synonyms. However, WE DO NOT want * "forman, c*" search, but we want "jones, c*" search * */ testAuthorQuery( //must NOT have "jones*", must have "jones, c;jones, christine" "forman", "author:forman, author:forman, c author:jones, christine author:jones, c " + "author:forman, christine author:forman,*", "//*[@numFound='7']", // forman numFound=7 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 115 Jones, C 116 Forman, Christopher // 117 Forman, C //must NOT have "forman*", must have "forman, c;forman, christine" // PLUS - must have other jones's and allen's "jones", "author:jones, author:jones, l author:allen, l author:allen, r l " + "author:allen, lynne author:jones, r l author:jones, r lynne author:jones, lynne " + "author:allen, r lynne author:forman, c author:jones, christine author:jones, c " + "author:forman, christine author:jones,*", "//*[@numFound='15']", // jones numFound=15 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 114 Jones, Christopher 115 Jones, C // 117 Forman, C 120 Allen, Lynne 121 Allen, L // 122 Allen, R Lynne 123 Allen, R L 124 Jones, Lynne // 125 Jones, L 126 Jones, R Lynne 127 Jones, R L //must NOT have "jones, c*", must have "jones, christine" "\"forman, c\"", "author:forman, c author:forman, christine author:forman, c* author:forman," + "author:jones, christine author:jones, c", "//*[@numFound='7']", // "forman, c" numFound=7 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 115 Jones, C 116 Forman, Christopher // 117 Forman, C //must NOT have "forman, c*", must have "forman, christine" "\"jones, c\"", "author:jones, c author:jones, christine author:jones, 
c* author:jones," + "author:forman, christine author:forman, c", "//*[@numFound='7']", // "jones, c" numFound=7 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 114 Jones, Christopher 115 Jones, C // 117 Forman, C "\"jones, christine\"", "author:jones, christine author:jones, christine * author:jones, c " + "author:jones, c * author:jones, author:forman, christine " + "author:forman, christine * author:forman, c author:forman, c * " + "author:forman,", "//*[@numFound='6']", // "jones, christine" numFound=6 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 115 Jones, C 117 Forman, C "\"forman, christine\"", "author:jones, christine author:jones, christine * author:jones, c " + "author:jones, c * author:jones, author:forman, christine author:forman, christine * " + "author:forman, c author:forman, c * author:forman,", "//*[@numFound='6']" ); /** * THE OLD STYLE, SO THAT I CAN COMPARE assertQueryEquals(req("qt", "aqp", "q", "author:\"Adamčuk, m\""), //"author:adamčuk, m author:adamcuk, m author:adamchuk, m author:adamčuk, author:adamčuk, m* author:adamchuk, marel author:adamčuk, marel author:adamcuk, molja author:adamcuk, marel author:adamčuk, molja author:adamchuk, molja author:adamchuk, m* author:adamchuk, author:adamcuk, author:adamcuk, m*", "author:adamčuk, m author:adamcuk, m author:adamchuk, m author:adamčuk, author:adamčuk, m* author:adamchuk, m* author:adamchuk, author:adamcuk, author:adamcuk, m*", BooleanQuery.class); assertQueryEquals(req("qt", "aqp", "q", "author:\"ADAMČuk, m\""), //"author:adamčuk, m author:adamcuk, m author:adamchuk, m author:adamčuk, author:adamčuk, m* author:adamchuk, marel author:adamčuk, marel author:adamcuk, molja author:adamcuk, marel author:adamčuk, molja author:adamchuk, molja author:adamchuk, m* author:adamchuk, author:adamcuk, author:adamcuk, m*", "author:adamčuk, m author:adamcuk, m author:adamchuk, m author:adamčuk, author:adamčuk, m* author:adamchuk, m* author:adamchuk, 
author:adamcuk, author:adamcuk, m*", BooleanQuery.class); assertQueryEquals(req("qt", "aqp", "q", "author:\"adamchuk, m\""), //"author:adamchuk, m author:adamcuk, m author:adamčuk, m author:adamchuk, m* author:adamchuk, marel author:adamčuk, marel author:adamcuk, molja author:adamcuk, marel author:adamchuk, molja author:adamčuk, molja author:adamchuk,", "author:adamchuk, m author:adamcuk, m author:adamčuk, m author:adamchuk, m* author:adamchuk,", BooleanQuery.class); **/ assertQueryEquals(req("defType", "aqp", "q", "author:\"Muller, William\""), // this was the old-style result, note "muller, w*" //"author:muller, w author:muller, w* author:muller, william author:müller, william author:mueller, william author:muller,", "author:müller, william author:müller, william * " + "author:müller, w author:müller, w * " + "author:müller, " + "author:muller, william author:muller, william * " + "author:muller, w author:muller, w * " + "author:muller, " + "author:mueller, william author:mueller, william * " + "author:mueller, w author:mueller, w * " + "author:mueller, " + "author:müller, bill author:müller, bill * " + "author:müller, b author:müller, b * " + "author:mueller, bill author:mueller, bill * " + "author:mueller, b author:mueller, b * " + "author:muller, bill author:muller, bill * " + "author:muller, b author:muller, b *", BooleanQuery.class); /* * TODO: assertQ(req("q", "author:\"Albert, R\""), "//*[@numFound='1']"); assertQ(req("q", "author:\"Albert, Reeka\""), "//*[@numFound='1']"); assertQ(req("q", "author:\"Barabási, A\""), "//*[@numFound='1']"); assertQ(req("q", "author:\"Barabaesi, A\""), "//*[@numFound='1']"); assertQ(req("q", "author:\"Barabási, Albert-László\""), "//*[@numFound='1']"); assertQ(req("q", "author:\"Barabasi, Albert-Laszlo\""), "//*[@numFound='1']"); assertQ(req("q", "author:Sellgren"), "//*[@numFound='1']"); assertQ(req("q", "author:\"Dwek, E P\""), "//*[@numFound='1']"); assertQ(req("q", "author:\"Dwek, E.\""), "//*[@numFound='1']"); 
assertQ(req("q", "author:\"Dwek, Edgar\""), "//*[@numFound='1']"); assertQ(req("q", "author:\"Dwek, E. P.\""), "//*[@numFound='1']"); assertQ(req("q", "author:\"Rentzsch Holm, Inga\""), "//*[@numFound='1']"); */ /* * Test we are not mixing/concatenating fields - Ticket #346 */ testAuthorQuery( "\"obama,\" boooo", "+(author:obama, author:obama,*) +all:boooo", "//*[@numFound='0']" ); } private void testAuthorQuery(String...vals) throws Exception { assert vals.length%3==0; for (int i=0;i<vals.length;i=i+3) { if (tp.debugParser) { System.out.println(escapeUnicode(vals[i])); System.out.println("Running test for " + author_field + ":" + vals[i]); String response = h.query(req("fl", "id,author", "rows", "100", "defType", "aqp", "q", String.format("%s:%s", author_field, vals[i]))); ArrayList<String> out = new ArrayList<String>(); Matcher m = Pattern.compile("numFound=\\\"(\\d+)").matcher(response); Matcher m2 = Pattern.compile("<doc><str name=\\\"id\\\">(\\d+)</str><arr name=\\\"" + author_field + "\\\"><str>([^<]*)</str></arr></doc>").matcher(response); m.find(); String numFound = m.group(1); while (m2.find()) { out.add(String.format("%0$3s\t%2$-23s", m2.group(1), m2.group(2))); } Collections.sort(out); System.out.print(" // " + vals[i] + " numFound=" + numFound); int j=0; for (String s: out) { if (j%3==0) { System.out.print("\n // "); } System.out.print(s); j++; } System.out.println(); } boolean failed = true; try { assertQueryEquals(req("defType", "aqp", "q", author_field + ":" + vals[i]), vals[i+1], null); assertQ(req("fl", "id," + author_field, "rows", "100", "q", author_field + ":" + vals[i]), vals[i+2].split(";")); failed = false; } finally { if (failed) { QParser qParser = getParser(req("fl", "id," + author_field, "rows", "100", "q", author_field + ":" + vals[i])); Query q = qParser.parse(); String actual = q.toString("field"); System.out.println("Offending test case: " + escapeUnicode(vals[i]) + "\nexpected vs actual: \n" + escapeUnicode(vals[i+1]) + "\n" + 
escapeUnicode(actual)); } } } } // Uniquely for Junit 3 public static junit.framework.Test suite() { return new junit.framework.JUnit4TestAdapter(TestAdsabsTypeAuthorParsing.class); } /* XXX:rca - it was not used, to remove? * * public void assertQ(String message, SolrQueryRequest req, String... tests) { try { String m = (null == message) ? "" : message + " "; String response = h.query(req); String results = h.validateXPath(response, tests); if (null != results) { tp.debugFail(m + "query failed XPath: " + results + "\n xml response was: " + response + "\n request was: " + req.getParamString()); } } catch (XPathExpressionException e1) { throw new RuntimeException("XPath is invalid", e1); } catch (Exception e2) { throw new RuntimeException("Exception during query", e2); } } */ public Query assertQueryEquals(SolrQueryRequest req, String expected, Class<?> clazz) throws Exception { QParser qParser = getParser(req); String query = req.getParams().get(CommonParams.Q); Query q = qParser.parse(); String actual = q.toString("field"); String[] ex = expected.split("\\s*[a-z]+\\:"); Arrays.sort(ex); String[] ac = actual.split("\\s*[a-z]+\\:"); Arrays.sort(ac); StringBuffer exs = new StringBuffer(); for (String s: ex) { if (s.trim().equals("")) continue; if (exs.length() > 0) exs.append(" "); exs.append(s.trim()); } StringBuffer acs = new StringBuffer(); for (String s: ac) { if (s.trim().equals("")) continue; if (acs.length() > 0) acs.append(" "); acs.append(s.trim()); } if (!acs.toString().equals(exs.toString())) { //assertArrayEquals(ac, ex); tp.debugFail(query, expected, actual); } if (clazz != null) { if (!q.getClass().isAssignableFrom(clazz)) { tp.debugFail("Query is not: " + clazz + " but: " + q.getClass(), expected, "-->" + q.toString()); } } return q; } public String escapeUnicode(String input) { StringBuilder b = new StringBuilder(input.length()); Formatter f = new Formatter(b); for (char c : input.toCharArray()) { if (c < 128) { b.append(c); } else { 
f.format("\\u%04x", (int) c); } } return b.toString(); } }