package org.xbib.elasticsearch.index.mapper.langdetect;

import com.fasterxml.jackson.core.JsonParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.TextFieldMapper;
import org.xbib.elasticsearch.common.langdetect.LangdetectService;
import org.xbib.elasticsearch.common.langdetect.Language;
import org.xbib.elasticsearch.common.langdetect.LanguageDetectionException;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.common.xcontent.support.XContentMapValues.lenientNodeBooleanValue;

/**
 * Field mapper registered under the mapping type {@value #MAPPER_TYPE}.
 *
 * <p>While a document is parsed, the text of the field is handed to a
 * {@link LangdetectService}; every language the service reports is added to the
 * document as a field carrying the language code. Optionally, the current value
 * is additionally parsed into other fields selected per detected language
 * (see {@link LanguageTo}).
 */
public class LangdetectMapper extends TextFieldMapper {

    private static final Logger logger = LogManager.getLogger(LangdetectMapper.class.getName());

    /** Content type name of this mapper. */
    public static final String MAPPER_TYPE = "langdetect";

    private final LangdetectService langdetectService;

    private final LanguageTo languageTo;

    private final int positionIncrementGap;

    /**
     * Creates the mapper.
     *
     * @param simpleName           passed through to {@link TextFieldMapper}
     * @param fieldType            passed through to {@link TextFieldMapper}
     * @param defaultFieldType     passed through to {@link TextFieldMapper}
     * @param positionIncrementGap position increment gap; {@code -1} is treated as "not set"
     *                             when the mapping is serialized
     * @param indexSettings        passed through to {@link TextFieldMapper}
     * @param multiFields          passed through to {@link TextFieldMapper}
     * @param copyTo               passed through to {@link TextFieldMapper}
     * @param languageTo           per-language target fields
     * @param langdetectService    service used to detect languages
     */
    public LangdetectMapper(String simpleName,
                            MappedFieldType fieldType,
                            MappedFieldType defaultFieldType,
                            int positionIncrementGap,
                            Settings indexSettings,
                            MultiFields multiFields,
                            CopyTo copyTo,
                            LanguageTo languageTo,
                            LangdetectService langdetectService) {
        super(simpleName, fieldType, defaultFieldType, positionIncrementGap, false,
                indexSettings, multiFields, copyTo);
        this.langdetectService = langdetectService;
        this.languageTo = languageTo;
        this.positionIncrementGap = positionIncrementGap;
    }

    /**
     * @return {@link #MAPPER_TYPE}
     */
    @Override
    protected String contentType() {
        return MAPPER_TYPE;
    }

    /**
     * Reads the text of the current field, detects its languages and adds one field per
     * detected language to {@code fields}.
     *
     * <p>The text is taken from the current token, or, if the current token starts an
     * object, from that object's {@code value} or {@code _value} entry. If the service
     * setting {@code binary} is true, the parser's binary value (decoded as UTF-8)
     * replaces the text when it is non-empty. Nothing is added when an external value is
     * set, when the current token is a JSON null, or when no text could be obtained.
     *
     * @param context the parse context of the current document
     * @param fields  receives one field per detected language
     * @throws IOException if reading from the parser fails
     */
    @Override
    protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
        if (context.externalValueSet()) {
            return;
        }
        XContentParser parser = context.parser();
        if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
            return;
        }
        String value = fieldType().nullValueAsString();
        if (parser.currentToken() == XContentParser.Token.START_OBJECT) {
            XContentParser.Token token;
            String currentFieldName = null;
            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
                if (token == XContentParser.Token.FIELD_NAME) {
                    currentFieldName = parser.currentName();
                } else {
                    if ("value".equals(currentFieldName) || "_value".equals(currentFieldName)) {
                        value = parser.textOrNull();
                    }
                }
            }
        } else {
            value = parser.textOrNull();
        }
        if (langdetectService.getSettings().getAsBoolean("binary", false)) {
            try {
                byte[] b = parser.binaryValue();
                if (b != null && b.length > 0) {
                    value = new String(b, StandardCharsets.UTF_8);
                }
            } catch (JsonParseException e) {
                // the text value read above is kept
                logger.trace(e.getMessage(), e);
            } catch (Exception e) {
                // best effort: the text value read above is kept
                logger.error(e.getMessage(), e);
            }
        }
        if (value == null) {
            // no text available, so there is nothing to run detection on
            return;
        }
        try {
            List<Language> langs = langdetectService.detectAll(value);
            Map<String, Object> targetsByLanguage = languageTo.languageToFields();
            for (Language lang : langs) {
                Field field = new Field(fieldType().name(), lang.getLanguage(), fieldType());
                fields.add(field);
                // single lookup; an entry mapped to null is treated like a missing entry
                Object targets = targetsByLanguage.get(lang.getLanguage());
                if (targets != null) {
                    parseLanguageToFields(context, targets);
                }
            }
        } catch (LanguageDetectionException e) {
            logger.trace(e.getMessage(), e);
            // NOTE(review): the context returned by this call is not used here - confirm intent
            context.createExternalValueContext("unknown");
        }
    }

    /**
     * Writes the mapping of this field: the inherited body, then {@code null_value},
     * {@code position_increment_gap}, {@code search_quote_analyzer}, every entry of the
     * detection service settings, and finally the {@code language_to} object.
     *
     * @param builder         the builder to write to
     * @param includeDefaults whether values that are at their default are written too
     * @param params          passed through to the superclass and to {@link LanguageTo}
     * @throws IOException if writing to the builder fails
     */
    @Override
    protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params)
            throws IOException {
        super.doXContentBody(builder, includeDefaults, params);
        if (includeDefaults || fieldType().nullValue() != null) {
            builder.field("null_value", fieldType().nullValue());
        }
        if (includeDefaults || positionIncrementGap != -1) {
            builder.field("position_increment_gap", positionIncrementGap);
        }
        NamedAnalyzer searchQuoteAnalyzer = fieldType().searchQuoteAnalyzer();
        if (searchQuoteAnalyzer != null
                && !searchQuoteAnalyzer.name().equals(fieldType().searchAnalyzer().name())) {
            builder.field("search_quote_analyzer", searchQuoteAnalyzer.name());
        } else if (includeDefaults) {
            if (searchQuoteAnalyzer == null) {
                builder.field("search_quote_analyzer", "default");
            } else {
                builder.field("search_quote_analyzer", searchQuoteAnalyzer.name());
            }
        }
        Map<String, Object> map = langdetectService.getSettings().getAsStructuredMap();
        for (Map.Entry<String, Object> entry : map.entrySet()) {
            builder.field(entry.getKey(), entry.getValue());
        }
        languageTo.toXContent(builder, params);
    }

    /**
     * Parses the current value into each of the given target fields.
     *
     * @param originalContext  context of the field being parsed; a copy-to context is
     *                         derived from it
     * @param languageToFields a single field name or a {@link List} of field names
     * @throws IOException              if parsing into a target field fails
     * @throws IllegalArgumentException if no document in the parent chain has a prefix
     *                                  the field name starts with
     * @throws MapperParsingException   if a target field has no mapper
     */
    @SuppressWarnings("unchecked")
    private static void parseLanguageToFields(ParseContext originalContext, Object languageToFields)
            throws IOException {
        List<Object> fieldList = languageToFields instanceof List
                ? (List<Object>) languageToFields
                : Collections.singletonList(languageToFields);
        ParseContext context = originalContext.createCopyToContext();
        for (Object field : fieldList) {
            String fieldName = field.toString();
            // walk up the parent chain to the first document whose prefix matches the field
            ParseContext.Document targetDoc = null;
            for (ParseContext.Document doc = context.doc(); doc != null; doc = doc.getParent()) {
                if (fieldName.startsWith(doc.getPrefix())) {
                    targetDoc = doc;
                    break;
                }
            }
            if (targetDoc == null) {
                throw new IllegalArgumentException("target doc is null");
            }
            final ParseContext copyToContext;
            if (targetDoc == context.doc()) {
                copyToContext = context;
            } else {
                copyToContext = context.switchDoc(targetDoc);
            }
            FieldMapper fieldMapper = copyToContext.docMapper().mappers().getMapper(fieldName);
            if (fieldMapper != null) {
                fieldMapper.parse(copyToContext);
            } else {
                throw new MapperParsingException("attempt to copy value to non-existing field [" + field + "]");
            }
        }
    }

    /**
     * Default field type of the mapper.
     */
    public static class Defaults {

        /**
         * Frozen field type: stored, norms omitted, keyword analyzer for index and search,
         * named {@link #MAPPER_TYPE}.
         */
        public static final TextFieldType LANG_FIELD_TYPE = new TextFieldType();

        static {
            LANG_FIELD_TYPE.setStored(true);
            LANG_FIELD_TYPE.setOmitNorms(true);
            LANG_FIELD_TYPE.setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
            LANG_FIELD_TYPE.setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
            LANG_FIELD_TYPE.setName(MAPPER_TYPE);
            LANG_FIELD_TYPE.freeze();
        }
    }

    /**
     * Builder for {@link LangdetectMapper}. The detection options are collected into a
     * {@link Settings} object that is handed to a new {@link LangdetectService} in
     * {@link #build(BuilderContext)}.
     */
    public static class Builder extends FieldMapper.Builder<Builder, TextFieldMapper> {

        /** Position increment gap; {@code -1} means "not set". */
        protected int positionIncrementGap = -1;

        /** Per-language target fields; empty unless {@link #languageTo(LanguageTo)} is called. */
        protected LanguageTo languageTo = LanguageTo.builder().build();

        /** Collects the settings for the detection service. */
        protected Settings.Builder settingsBuilder = Settings.builder();

        /**
         * @param name name of the field
         */
        public Builder(String name) {
            super(name, Defaults.LANG_FIELD_TYPE, Defaults.LANG_FIELD_TYPE);
            this.builder = this;
        }

        @Override
        public Builder searchAnalyzer(NamedAnalyzer searchAnalyzer) {
            super.searchAnalyzer(searchAnalyzer);
            return this;
        }

        /**
         * @param positionIncrementGap the position increment gap
         * @return this builder
         */
        public Builder positionIncrementGap(int positionIncrementGap) {
            this.positionIncrementGap = positionIncrementGap;
            return this;
        }

        /**
         * @param analyzer the search quote analyzer to set on the field type
         * @return this builder
         */
        public Builder searchQuotedAnalyzer(NamedAnalyzer analyzer) {
            this.fieldType.setSearchQuoteAnalyzer(analyzer);
            return builder;
        }

        /**
         * @param trials value stored under the setting {@code number_of_trials}
         * @return this builder
         */
        public Builder ntrials(int trials) {
            settingsBuilder.put("number_of_trials", trials);
            return this;
        }

        /**
         * @param alpha value stored under the setting {@code alpha}
         * @return this builder
         */
        public Builder alpha(double alpha) {
            settingsBuilder.put("alpha", alpha);
            return this;
        }

        /**
         * @param alphaWidth value stored under the setting {@code alpha_width}
         * @return this builder
         */
        public Builder alphaWidth(double alphaWidth) {
            settingsBuilder.put("alpha_width", alphaWidth);
            return this;
        }

        /**
         * @param iterationLimit value stored under the setting {@code iteration_limit}
         * @return this builder
         */
        public Builder iterationLimit(int iterationLimit) {
            settingsBuilder.put("iteration_limit", iterationLimit);
            return this;
        }

        /**
         * @param probThreshold value stored under the setting {@code prob_threshold}
         * @return this builder
         */
        public Builder probThreshold(double probThreshold) {
            settingsBuilder.put("prob_threshold", probThreshold);
            return this;
        }

        /**
         * @param convThreshold value stored under the setting {@code conv_threshold}
         * @return this builder
         */
        public Builder convThreshold(double convThreshold) {
            settingsBuilder.put("conv_threshold", convThreshold);
            return this;
        }

        /**
         * @param baseFreq value stored under the setting {@code base_freq}
         * @return this builder
         */
        public Builder baseFreq(int baseFreq) {
            settingsBuilder.put("base_freq", baseFreq);
            return this;
        }

        /**
         * @param pattern value stored under the setting {@code pattern}
         * @return this builder
         */
        public Builder pattern(String pattern) {
            settingsBuilder.put("pattern", pattern);
            return this;
        }

        /**
         * @param max value stored under the setting {@code max}
         * @return this builder
         */
        public Builder max(int max) {
            settingsBuilder.put("max", max);
            return this;
        }

        /**
         * @param binary value stored under the setting {@code binary}; when true, the
         *               mapper tries to read the field value as binary content
         * @return this builder
         */
        public Builder binary(boolean binary) {
            settingsBuilder.put("binary", binary);
            return this;
        }

        /**
         * Stores every entry of {@code map} as a setting named {@code map.<key>}.
         *
         * @param map the entries to store
         * @return this builder
         */
        public Builder map(Map<String, Object> map) {
            for (Map.Entry<String, Object> entry : map.entrySet()) {
                settingsBuilder.put("map." + entry.getKey(), entry.getValue());
            }
            return this;
        }

        /**
         * @param languages values stored as the array setting {@code languages}
         * @return this builder
         */
        public Builder languages(String[] languages) {
            settingsBuilder.putArray("languages", languages);
            return this;
        }

        /**
         * @param profile value stored under the setting {@code profile}
         * @return this builder
         */
        public Builder profile(String profile) {
            settingsBuilder.put("profile", profile);
            return this;
        }

        /**
         * @param languageTo the per-language target fields
         * @return this builder
         */
        public Builder languageTo(LanguageTo languageTo) {
            this.languageTo = languageTo;
            return this;
        }

        /**
         * Finishes the field type, creates the detection service from the collected
         * settings and builds the mapper.
         *
         * @param context the builder context
         * @return the new mapper
         */
        @Override
        public LangdetectMapper build(BuilderContext context) {
            if (positionIncrementGap != -1) {
                // re-wrap all three analyzers so that they carry the configured gap
                fieldType.setIndexAnalyzer(
                        new NamedAnalyzer(fieldType.indexAnalyzer(), positionIncrementGap));
                fieldType.setSearchAnalyzer(
                        new NamedAnalyzer(fieldType.searchAnalyzer(), positionIncrementGap));
                fieldType.setSearchQuoteAnalyzer(
                        new NamedAnalyzer(fieldType.searchQuoteAnalyzer(), positionIncrementGap));
            }
            if (fieldType.indexOptions() != IndexOptions.NONE && !fieldType.tokenized()) {
                defaultFieldType.setOmitNorms(true);
                defaultFieldType.setIndexOptions(IndexOptions.DOCS);
                if (!omitNormsSet && Float.compare(fieldType.boost(), 1.0f) == 0) {
                    fieldType.setOmitNorms(true);
                }
                if (!indexOptionsSet) {
                    fieldType.setIndexOptions(IndexOptions.DOCS);
                }
            }
            setupFieldType(context);
            LangdetectService service = new LangdetectService(settingsBuilder.build());
            return new LangdetectMapper(name, fieldType(), defaultFieldType, positionIncrementGap,
                    context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo,
                    languageTo, service);
        }
    }

    /**
     * Turns a mapping definition into a {@link Builder}.
     */
    public static class TypeParser implements Mapper.TypeParser {

        /**
         * Consumes the entries of {@code mapping} this mapper understands and removes them
         * from the map; {@code analyzer} and {@code include_in_all} are removed without
         * being applied. Unknown entries are left in the map.
         *
         * @param name          name of the field
         * @param mapping       the mapping definition; recognized entries are removed
         * @param parserContext source of the index analyzers
         * @return the configured builder
         * @throws MapperParsingException if the search quote analyzer is unknown or the
         *                                position increment gap is negative
         */
        @Override
        public Mapper.Builder<?, ?> parse(String name, Map<String, Object> mapping,
                                          ParserContext parserContext) {
            Builder builder = new Builder(name);
            Iterator<Map.Entry<String, Object>> iterator = mapping.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry<String, Object> entry = iterator.next();
                String fieldName = entry.getKey();
                Object fieldNode = entry.getValue();
                switch (fieldName) {
                    case "analyzer":
                    case "include_in_all":
                        iterator.remove();
                        break;
                    case "search_quote_analyzer":
                        NamedAnalyzer analyzer = parserContext.getIndexAnalyzers().get(fieldNode.toString());
                        if (analyzer == null) {
                            throw new MapperParsingException("Analyzer [" + fieldNode.toString()
                                    + "] not found for field [" + name + "]");
                        }
                        builder.searchQuotedAnalyzer(analyzer);
                        iterator.remove();
                        break;
                    case "position_increment_gap":
                        int newPositionIncrementGap = XContentMapValues.nodeIntegerValue(fieldNode, -1);
                        if (newPositionIncrementGap < 0) {
                            throw new MapperParsingException("position_increment_gap less than 0 aren't allowed.");
                        }
                        builder.positionIncrementGap(newPositionIncrementGap);
                        // make sure there are analyzers that can carry the gap
                        if (builder.fieldType().indexAnalyzer() == null) {
                            builder.fieldType().setIndexAnalyzer(
                                    parserContext.getIndexAnalyzers().getDefaultIndexAnalyzer());
                        }
                        if (builder.fieldType().searchAnalyzer() == null) {
                            builder.fieldType().setSearchAnalyzer(
                                    parserContext.getIndexAnalyzers().getDefaultSearchAnalyzer());
                        }
                        if (builder.fieldType().searchQuoteAnalyzer() == null) {
                            builder.fieldType().setSearchQuoteAnalyzer(
                                    parserContext.getIndexAnalyzers().getDefaultSearchQuoteAnalyzer());
                        }
                        iterator.remove();
                        break;
                    case "store":
                        builder.store(parseStore(fieldNode.toString()));
                        iterator.remove();
                        break;
                    case "number_of_trials":
                        builder.ntrials(XContentMapValues.nodeIntegerValue(fieldNode));
                        iterator.remove();
                        break;
                    case "alpha":
                        builder.alpha(XContentMapValues.nodeDoubleValue(fieldNode));
                        iterator.remove();
                        break;
                    case "alpha_width":
                        builder.alphaWidth(XContentMapValues.nodeDoubleValue(fieldNode));
                        iterator.remove();
                        break;
                    case "iteration_limit":
                        builder.iterationLimit(XContentMapValues.nodeIntegerValue(fieldNode));
                        iterator.remove();
                        break;
                    case "prob_threshold":
                        builder.probThreshold(XContentMapValues.nodeDoubleValue(fieldNode));
                        iterator.remove();
                        break;
                    case "conv_threshold":
                        builder.convThreshold(XContentMapValues.nodeDoubleValue(fieldNode));
                        iterator.remove();
                        break;
                    case "base_freq":
                        builder.baseFreq(XContentMapValues.nodeIntegerValue(fieldNode));
                        iterator.remove();
                        break;
                    case "pattern":
                        builder.pattern(XContentMapValues.nodeStringValue(fieldNode, null));
                        iterator.remove();
                        break;
                    case "max":
                        builder.max(XContentMapValues.nodeIntegerValue(fieldNode));
                        iterator.remove();
                        break;
                    case "binary":
                        boolean b = XContentMapValues.nodeBooleanValue(fieldNode);
                        builder.binary(b);
                        iterator.remove();
                        break;
                    case "map":
                        builder.map(XContentMapValues.nodeMapValue(fieldNode, "map"));
                        iterator.remove();
                        break;
                    case "languages":
                        builder.languages(XContentMapValues.nodeStringArrayValue(fieldNode));
                        iterator.remove();
                        break;
                    case "profile":
                        builder.profile(XContentMapValues.nodeStringValue(fieldNode, null));
                        iterator.remove();
                        break;
                    case "language_to":
                        Map<String, Object> map = XContentMapValues.nodeMapValue(fieldNode, null);
                        LanguageTo.Builder languageToBuilder = LanguageTo.builder();
                        languageToBuilder.add(map);
                        builder.languageTo(languageToBuilder.build());
                        iterator.remove();
                        break;
                    default:
                        break;
                }
            }
            return builder;
        }

        /**
         * Interprets the {@code store} option: {@code "no"} is false, {@code "yes"} is
         * true, anything else is evaluated as a lenient boolean defaulting to false.
         */
        private static boolean parseStore(String store) throws MapperParsingException {
            return !"no".equals(store) && ("yes".equals(store) || lenientNodeBooleanValue(null, store, false));
        }
    }

    /**
     * Maps a language code to the field(s) the current value is additionally parsed into
     * when that language is detected. A map value is either a single field name or a list
     * of field names.
     */
    public static class LanguageTo {

        private final Map<String, Object> languageToFields;

        private LanguageTo(Map<String, Object> languageToFields) {
            this.languageToFields = languageToFields;
        }

        /**
         * Writes the entries as a {@code language_to} object; writes nothing when there
         * are no entries.
         *
         * @param builder the builder to write to
         * @param params  not used by this method
         * @return {@code builder}
         * @throws IOException if writing to the builder fails
         */
        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
            if (!languageToFields.isEmpty()) {
                builder.startObject("language_to");
                for (Map.Entry<String, Object> field : languageToFields.entrySet()) {
                    builder.field(field.getKey(), field.getValue());
                }
                builder.endObject();
            }
            return builder;
        }

        /**
         * @return a new, empty builder
         */
        public static Builder builder() {
            return new Builder();
        }

        /**
         * Collects language-to-field entries in insertion order.
         */
        public static class Builder {

            private final Map<String, Object> languageToBuilders = new LinkedHashMap<>();

            /**
             * @param language the language code
             * @param field    the target field for that language
             * @return this builder
             */
            public LanguageTo.Builder add(String language, String field) {
                languageToBuilders.put(language, field);
                return this;
            }

            /**
             * @param map entries to add, keyed by language code
             * @return this builder
             */
            public LanguageTo.Builder add(Map<String, Object> map) {
                languageToBuilders.putAll(map);
                return this;
            }

            /**
             * @return a {@link LanguageTo} backed by an unmodifiable view of the collected entries
             */
            public LanguageTo build() {
                return new LanguageTo(Collections.unmodifiableMap(languageToBuilders));
            }
        }

        /**
         * @return the entries, keyed by language code
         */
        public Map<String, Object> languageToFields() {
            return languageToFields;
        }
    }
}