package org.apache.solr.analysis.author;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
 * Tests which tokens come out of a filter built by {@link AuthorCollectorFactory}
 * for different combinations of its {@code tokenTypes} and {@code emitTokens}
 * arguments.
 *
 * <p>Every scenario feeds the same two ';'-separated author names through the chain
 * normalize -> transliterate -> collect and checks the term text and token type of
 * each emitted token, in order.
 */
public class TestAuthorCollectorFactory extends BaseTokenStreamTestCase {

  /** Raw test input: two author names separated by ';'. */
  private static final String INPUT = "MÜLLER, BILL;MÜller, Bill";

  /** Splits {@link #INPUT} on ';' (group -1 makes the pattern act as a delimiter). */
  private static final Pattern SEPARATOR = Pattern.compile(";");

  /**
   * Runs four collector configurations against the same input and verifies the
   * exact token sequence produced by each one.
   *
   * @throws IOException if the analysis chain fails while reading the input
   * @throws InterruptedException never thrown here; kept so the signature is unchanged
   */
  public void testCollector() throws IOException, InterruptedException {
    AuthorNormalizeFilterFactory normFactory =
        new AuthorNormalizeFilterFactory(new HashMap<String,String>());
    AuthorTransliterationFactory transliteratorFactory =
        new AuthorTransliterationFactory(new HashMap<String,String>());

    // Full sequence: each input name followed by its two transliterations.
    String[] allTerms = {
        "MÜLLER, BILL", "MUELLER, BILL", "MULLER, BILL",
        "MÜller, Bill", "MUEller, Bill", "MUller, Bill"
    };
    String[] allTypes = {
        AuthorUtils.AUTHOR_INPUT, AuthorUtils.AUTHOR_TRANSLITERATED, AuthorUtils.AUTHOR_TRANSLITERATED,
        AuthorUtils.AUTHOR_INPUT, AuthorUtils.AUTHOR_TRANSLITERATED, AuthorUtils.AUTHOR_TRANSLITERATED
    };
    // Reduced sequence: only the two names as they were typed.
    String[] inputTerms = {"MÜLLER, BILL", "MÜller, Bill"};
    String[] inputTypes = {AuthorUtils.AUTHOR_INPUT, AuthorUtils.AUTHOR_INPUT};

    // Scenario 1: emitTokens=true, tokenTypes=input+transliterated -> all six tokens.
    AuthorCollectorFactory factory = newCollectorFactory(
        String.format("%s,%s", AuthorUtils.AUTHOR_INPUT, AuthorUtils.AUTHOR_TRANSLITERATED),
        "true");
    // The explicit-match-version flag is set on this first factory instance only.
    factory.setExplicitLuceneMatchVersion(true);
    TokenStream ts = buildChain(factory, transliteratorFactory, normFactory);
    assertTrue("collector factory must create an AuthorCollectorFilter",
        ts instanceof AuthorCollectorFilter);
    assertTokens(ts, allTerms, allTypes);

    // Scenario 2: emitTokens=true, tokenTypes=input -> only the input tokens.
    factory = newCollectorFactory(AuthorUtils.AUTHOR_INPUT, "true");
    assertTokens(buildChain(factory, transliteratorFactory, normFactory), inputTerms, inputTypes);

    // Scenario 3: emitTokens=false, tokenTypes=transliterated -> only the input tokens.
    factory = newCollectorFactory(AuthorUtils.AUTHOR_TRANSLITERATED, "false");
    assertTokens(buildChain(factory, transliteratorFactory, normFactory), inputTerms, inputTypes);

    // Scenario 4: emitTokens=false, tokenTypes="foo" (matches neither type) -> all six tokens.
    factory = newCollectorFactory("foo", "false");
    assertTokens(buildChain(factory, transliteratorFactory, normFactory), allTerms, allTypes);
  }

  /**
   * Creates a collector factory from a fresh argument map, so that no state is
   * shared between scenarios through the map.
   *
   * @param tokenTypes value of the {@code tokenTypes} argument
   * @param emitTokens value of the {@code emitTokens} argument ("true" or "false")
   * @return the configured factory
   */
  private static AuthorCollectorFactory newCollectorFactory(String tokenTypes, String emitTokens) {
    Map<String,String> args = new HashMap<String,String>();
    args.put("tokenTypes", tokenTypes);
    args.put("emitTokens", emitTokens);
    return new AuthorCollectorFactory(args);
  }

  /**
   * Builds the chain tokenizer -> normalizer -> transliterator -> collector over
   * {@link #INPUT}.
   *
   * @param collector factory for the outermost filter (the one under test)
   * @param transliterator factory for the transliteration filter
   * @param normalizer factory for the normalization filter
   * @return the outermost token stream; not yet reset
   * @throws IOException if the reader cannot be attached to the tokenizer
   */
  private static TokenStream buildChain(AuthorCollectorFactory collector,
      AuthorTransliterationFactory transliterator,
      AuthorNormalizeFilterFactory normalizer) throws IOException {
    Tokenizer tokenizer = new PatternTokenizer(SEPARATOR, -1);
    tokenizer.setReader(new StringReader(INPUT));
    return collector.create(transliterator.create(normalizer.create(tokenizer)));
  }

  /**
   * Consumes {@code ts} completely and asserts that it yields exactly the expected
   * terms and types, in order, and nothing more. The stream is always closed, even
   * when an assertion fails.
   *
   * @param ts stream to consume; must not have been reset yet
   * @param expectedTerms expected term text per token
   * @param expectedTypes expected token type per token, parallel to {@code expectedTerms}
   * @throws IOException if the stream fails while producing tokens
   */
  private static void assertTokens(TokenStream ts, String[] expectedTerms, String[] expectedTypes)
      throws IOException {
    assertEquals("expected terms and types must be parallel arrays",
        expectedTerms.length, expectedTypes.length);
    try {
      ts.reset();
      TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
      for (int i = 0; i < expectedTerms.length; i++) {
        // Check the return value: attribute contents are meaningless once the stream is exhausted.
        assertTrue("stream ended before token #" + i + " (" + expectedTerms[i] + ")",
            ts.incrementToken());
        assertEquals("term of token #" + i, expectedTerms[i], termAtt.toString());
        assertEquals("type of token #" + i, expectedTypes[i], typeAtt.type());
      }
      assertFalse("stream produced more than " + expectedTerms.length + " tokens",
          ts.incrementToken());
      // TokenStream consumer workflow: end() after the last incrementToken(), then close().
      ts.end();
    } finally {
      ts.close();
    }
  }
}