package org.xbib.elasticsearch.index.analysis.icu.segmentation;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
/**
 * ICU-based tokenizer, optionally using ICU rbbi rules files.
 *
 * <p>Settings read by the constructor:
 * <ul>
 * <li>{@code cjk_as_words} (boolean, default {@code true}) and {@code myanmar_as_words}
 * (boolean, default {@code true}): passed through to {@link DefaultIcuTokenizerConfig}.</li>
 * <li>{@code rulefiles}: entries of the form {@code script:resource}, e.g.
 * {@code Latn:my.Latin.rules.rbbi}. Each resource is loaded from the classpath and compiled
 * into a {@link RuleBasedBreakIterator} that is used for the given script instead of the
 * default break iterator.</li>
 * </ul>
 */
public class IcuTokenizerFactory extends AbstractTokenizerFactory {

    /** Tokenizer configuration handed to every tokenizer created by this factory. */
    protected final IcuTokenizerConfig config;

    /**
     * Builds the tokenizer configuration from the given settings.
     *
     * @param indexSettings index settings, forwarded to the superclass
     * @param environment   node environment (not used by this constructor)
     * @param name          tokenizer name, forwarded to the superclass
     * @param settings      tokenizer settings; see the class documentation for the keys read
     * @throws IllegalArgumentException if a {@code rulefiles} entry has no {@code ':'} separator
     * @throws ElasticsearchException   if a rules resource cannot be found or read
     */
    public IcuTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        boolean cjkAsWords = settings.getAsBoolean("cjk_as_words", true);
        boolean myanmarAsWords = settings.getAsBoolean("myanmar_as_words", true);
        Map<Integer, String> tailored = new HashMap<>();
        String[] scriptAndResourcePaths = settings.getAsArray("rulefiles");
        if (scriptAndResourcePaths != null) {
            for (String scriptAndResourcePath : scriptAndResourcePaths) {
                // "rulefiles" : "Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"
                int colonPos = scriptAndResourcePath.indexOf(':');
                if (colonPos < 0) {
                    // Without this check substring(0, -1) fails with an opaque index error.
                    throw new IllegalArgumentException(
                            "invalid rulefiles entry (expected script:resource): " + scriptAndResourcePath);
                }
                String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim();
                String resourcePath = scriptAndResourcePath.substring(colonPos + 1).trim();
                tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
            }
        }
        if (tailored.isEmpty()) {
            this.config = new DefaultIcuTokenizerConfig(cjkAsWords, myanmarAsWords);
        } else {
            // Script codes range from 0 to the maximum value inclusive, hence the "1 +".
            final BreakIterator[] breakers =
                    new BreakIterator[1 + UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT)];
            for (Map.Entry<Integer, String> entry : tailored.entrySet()) {
                breakers[entry.getKey()] = loadBreakIterator(entry.getValue());
            }
            this.config = new DefaultIcuTokenizerConfig(cjkAsWords, myanmarAsWords) {
                @Override
                public BreakIterator getBreakIterator(int script) {
                    if (breakers[script] != null) {
                        // Hand out a clone so the shared prototype is never used directly.
                        return (BreakIterator) breakers[script].clone();
                    } else {
                        return super.getBreakIterator(script);
                    }
                }
            };
        }
    }

    /**
     * Loads an rbbi rules resource from the classpath and compiles it into a break iterator.
     *
     * <p>Lines starting with {@code '#'} are replaced by empty lines, so the line count of the
     * rules text matches the line count of the resource.
     *
     * @param resourcePath classpath resource name, resolved relative to the classpath root
     * @return a break iterator built from the rules in the resource
     * @throws ElasticsearchException if the resource does not exist or cannot be read
     */
    private BreakIterator loadBreakIterator(String resourcePath) {
        InputStream rulesStream = getClass().getResourceAsStream("/" + resourcePath);
        if (rulesStream == null) {
            throw new ElasticsearchException("rules stream not found: " + resourcePath);
        }
        StringBuilder rules = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even if reading fails.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(rulesStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.startsWith("#")) {
                    rules.append(line);
                }
                rules.append('\n');
            }
        } catch (IOException e) {
            // Fail fast: compiling a partially read rules file would silently change tokenization.
            throw new ElasticsearchException("unable to read rules: " + resourcePath, e);
        }
        return new RuleBasedBreakIterator(rules.toString());
    }

    /**
     * Creates a new tokenizer using the configuration built in the constructor.
     *
     * @return a new {@link IcuTokenizer}
     */
    @Override
    public Tokenizer create() {
        return new IcuTokenizer(config);
    }
}