/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.clustering.carrot2;

import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An implementation of Carrot2's {@link ITokenizerFactory} based on Lucene's
 * Smart Chinese tokenizer. If the Smart Chinese tokenizer is not available on
 * the classpath at runtime, Carrot2's default tokenizer is used instead.
 * Should the Lucene APIs change, the necessary adjustments can be made in
 * this class.
 *
 * @lucene.experimental
 */
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
  private static final Logger logger = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  @Override
  public ITokenizer getTokenizer(LanguageCode language) {
    switch (language) {
      case CHINESE_SIMPLIFIED:
        return ChineseTokenizerFactory.createTokenizer();

      /*
       * We use our own analyzer for Arabic. Lucene's version has special
       * support for Nonspacing-Mark characters (see
       * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
       * have them included as letters in the parser.
       */
      case ARABIC:
        // Intentional fall-through.

      default:
        return new ExtendedWhitespaceTokenizer();
    }
  }

  /**
   * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to
   * Carrot2's {@link ITokenizer}. If Smart Chinese is not available on the
   * classpath, the factory falls back to the default whitespace tokenizer.
   */
  private static final class ChineseTokenizerFactory {
    static {
      try {
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
      } catch (Throwable e) {
        logger.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
            + "of Chinese content may be degraded. For best quality clusters, "
            + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
        if (e instanceof Error) {
          throw (Error) e;
        }
      }
    }

    static ITokenizer createTokenizer() {
      try {
        return new ChineseTokenizer();
      } catch (Throwable e) {
        if (e instanceof OutOfMemoryError) {
          throw (OutOfMemoryError) e;
        }
        return new ExtendedWhitespaceTokenizer();
      }
    }

    private final static class ChineseTokenizer implements ITokenizer {
      private final static Pattern numeric = Pattern
          .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");

      private Tokenizer sentenceTokenizer;
      private TokenStream wordTokenFilter;
      private CharTermAttribute term = null;

      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;

      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);

        // As Smart Chinese is not available at compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
        this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
            Reader.class).newInstance((Reader) null);
        this.tokenFilterClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
      }

      @Override
      public short nextToken() throws IOException {
        final boolean hasNextToken = wordTokenFilter.incrementToken();
        if (hasNextToken) {
          short flags = 0;
          final char[] image = term.buffer();
          final int length = term.length();
          tempCharSequence.reset(image, 0, length);
          if (length == 1 && image[0] == ',') {
            // ChineseTokenizer seems to convert all punctuation to ','
            // characters.
            flags = ITokenizer.TT_PUNCTUATION;
          } else if (numeric.matcher(tempCharSequence).matches()) {
            flags = ITokenizer.TT_NUMERIC;
          } else {
            flags = ITokenizer.TT_TERM;
          }
          return flags;
        }
        return ITokenizer.TT_EOF;
      }

      @Override
      public void setTermBuffer(MutableCharArray array) {
        array.reset(term.buffer(), 0, term.length());
      }

      @Override
      public void reset(Reader input) {
        try {
          sentenceTokenizer.setReader(input);
          wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
              TokenStream.class).newInstance(sentenceTokenizer);
          term = wordTokenFilter.addAttribute(CharTermAttribute.class);
        } catch (Exception e) {
          throw ExceptionUtils.wrapAsRuntimeException(e);
        }
      }
    }
  }
}