/* * Copyright (C) 2014 Jörg Prante * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program; if not, see http://www.gnu.org/licenses * or write to the Free Software Foundation, Inc., 51 Franklin Street, * Fifth Floor, Boston, MA 02110-1301 USA. * * The interactive user interfaces in modified source and object code * versions of this program must display Appropriate Legal Notices, * as required under Section 5 of the GNU Affero General Public License. * */ package org.xbib.elasticsearch.index.analysis.icu; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig; import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig; import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; import org.elasticsearch.index.analysis.AbstractTokenizerFactory; import org.elasticsearch.index.settings.IndexSettings; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.util.Map; import static org.elasticsearch.common.collect.Maps.newHashMap; /** * ICU-based tokenizer, optionally using ICU rbbi rules files. */ public class IcuTokenizerFactory extends AbstractTokenizerFactory { private final ICUTokenizerConfig config; @Inject public IcuTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); boolean cjkAsWords = settings.getAsBoolean("cjk_as_words", true); Map<Integer, String> tailored = newHashMap(); String[] scriptAndResourcePaths = settings.getAsArray("rulefiles"); if (scriptAndResourcePaths != null) { for (String scriptAndResourcePath : scriptAndResourcePaths) { // "rulefiles" : "Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi" int colonPos = scriptAndResourcePath.indexOf(":"); String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim(); String resourcePath = scriptAndResourcePath.substring(colonPos+1).trim(); tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath); } } if (tailored.isEmpty()) { this.config = new DefaultICUTokenizerConfig(cjkAsWords); } else { final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT]; for (Map.Entry<Integer,String> entry : tailored.entrySet()) { int code = entry.getKey(); String resourcePath = entry.getValue(); StringBuilder rules = new StringBuilder(); String line; try { InputStream rulesStream = getClass().getResourceAsStream("/" + resourcePath); if (rulesStream == null) { throw new ElasticsearchIllegalArgumentException("rules stream not found: " + resourcePath); } BufferedReader reader = new BufferedReader(new InputStreamReader(rulesStream, Charset.forName("UTF-8"))); while ((line = reader.readLine()) != null) { if (!line.startsWith("#")) { rules.append(line); } rules.append('\n'); } reader.close(); } catch (IOException e) { logger.error("unable to parse rules", e); } breakers[code] = new RuleBasedBreakIterator(rules.toString()); } this.config = new DefaultICUTokenizerConfig(cjkAsWords) { @Override public BreakIterator getBreakIterator(int script) { if (breakers[script] != null) { return (BreakIterator) breakers[script].clone(); } else { return super.getBreakIterator(script); } } }; } } @Override public Tokenizer create(Reader reader) { return new ICUTokenizer(reader, config); } }