package org.apache.solr.analysis.author; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import monty.solr.util.MontySolrAbstractLuceneTestCase; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.TokenStream; public class TestPythonicAuthorNormalizeFilter extends MontySolrAbstractLuceneTestCase { public void test() throws Exception { compare("Ringo Starr", "Ringo Starr,", "Starr, Ringo"); compare("W K H Panofsky", "W K H Panofsky,", "Panofsky, W K H"); compare("Ibanez y Gracia, Maria Luisa, II, (ed.)", "Ibanez y Gracia, Maria Luisa, II, (ed.)", "Ibanez y Gracia, Maria Luisa"); compare("Ibanez y Gracia, Maria Luisa, II., ed.", "Ibanez y Gracia, Maria Luisa, II., ed.", "Ibanez y Gracia, Maria Luisa"); compare("Epstein, Brian, The Fifth Beatle", "Epstein, Brian, The Fifth Beatle", "Epstein, Brian"); compare("Michael Edward Peskin", "Michael Edward Peskin,", "Peskin, Michael Edward"); compare("M.E. Peskin", "M.E. Peskin,", "Peskin, M. E."); compare("M.E.Peskin", "M.E.Peskin,", "Peskin., M. E."); // to me this seems a bug of the python parser (?) compare("Ronaldo", "Ronaldo,"); compare("Cantina Octavia Jones-Smith", "Cantina Octavia Jones-Smith,", "Jones-Smith, Cantina Octavia"); compare("Jean-Luc Picard", "Jean-Luc Picard,", "Picard, Jean-Luc"); compare("Jean Luc Picard", "Jean Luc Picard,", "Picard, Jean Luc"); compare("Jean Luc de Picard", "Jean Luc de Picard,", "de Picard, Jean Luc"); compare("Juan Q. Xavier Velasquez y Garcia, Jr.", "Juan Q. Xavier Velasquez y Garcia, Jr.", "Velasquez y Garcia, Juan Q. Xavier"); compare("Jean-Luc Picard;Jean Luc de Picardie", "Jean-Luc Picard,", "Picard, Jean-Luc", "Jean Luc de Picardie,", "de Picardie, Jean Luc"); compare("Pinilla-Alonso", "Pinilla-Alonso,"); compare("Pinilla Alonso,", "Pinilla Alonso,"); compare("Pinilla-Alonso, Brava", "Pinilla-Alonso, Brava"); compare("purpose of this review is to bridge the gap between", "purpose of this review is to bridge the gap between,", "between, purpose of this review is to bridge the gap"); compare("o' sullivan", "o'sullivan,"); compare("o'sullivan", "o'sullivan,"); compare("o' john o'sullivan", "o' john o'sullivan,", "o'sullivan, o'john"); compare("o' john, o'sullivan", "o'john, o'sullivan"); compare("Joachim von Lubow", "Joachim von Lubow,", "von Lubow, Joachim"); compare("Gerard 't Hooft", "Gerard 't Hooft,", "'t Hooft, Gerard"); compare("Pieter J. in 't Veld", "Pieter J. in 't Veld,", "'t Veld, Pieter J. in"); compare("first", "first,"); compare("first;james", "first,", "james,"); compare("james; first; foo", "james,", "first,", "foo,"); compare("V Maestro", "V Maestro,", "Maestro, V"); compare("Maestro, V", "Maestro, V"); compare("Maestro, J", "Maestro, J"); compare("J Maestro", "J Maestro,", "Maestro, J"); compare("Alves de Oliveira", "Alves de Oliveira,", "de Oliveira, Alves"); // #16 github: honorifics, when the only thing we have, should be treated as surnames compare("goodman", "goodman,"); compare("alissa goodman", "alissa goodman,", "goodman, alissa"); } public void compare(String input, String... expected) throws Exception { Reader reader = new StringReader(input); Tokenizer tokenizer = new KeywordTokenizer(); tokenizer.setReader(reader); PythonicAuthorNormalizeFilterFactory factory = new PythonicAuthorNormalizeFilterFactory(new HashMap<String,String>()); TokenStream stream = factory.create(tokenizer); stream.reset(); CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); ArrayList<String> data = new ArrayList<String>(); while (stream.incrementToken()) { data.add(termAtt.toString()); } String[] actuals = new String[data.size()]; int i = 0; for (String s: data) { actuals[i] = s; i++; } assertArrayEquals("Tokenization differs for: " + input, expected, actuals); } }