/*
 * JBoss, Home of Professional Open Source
 * Copyright 2012 Red Hat Inc. and/or its affiliates and other contributors
 * as indicated by the @authors tag. All rights reserved.
 */
package org.jboss.elasticsearch.tools.content;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.elasticsearch.common.settings.SettingsException;
import org.elasticsearch.common.xcontent.support.XContentMapValues;

/**
 * Content preprocessor which applies a Regular Expression (see {@link Pattern}) to the source String value, and
 * copies the content of defined
 * <a href="http://docs.oracle.com/javase/tutorial/essential/regex/groups.html">Capturing Groups</a> to defined
 * target fields. Example of configuration for this preprocessor:
 *
 * <pre>
 * {
 *     "name"     : "Name extractor",
 *     "class"    : "org.jboss.elasticsearch.tools.content.RegExpCapturingGroupPreprocessor",
 *     "settings" : {
 *         "source_field"   : "fields.updated",
 *         "pattern"        : "my name is (.*)",
 *         "result_mapping" : {
 *             "0" : "target_field_group_0",
 *             "1" : "target_field_group_1"
 *         }
 *     }
 * }
 * </pre>
 *
 * Options are:
 * <ul>
 * <li><code>source_field</code> - source field in input data. Dot notation for nested values can be used here (see
 * {@link XContentMapValues#extractValue(String, Map)}).
 * <li><code>pattern</code> - regular expression pattern to be used. Should contain some Capturing Groups! The
 * whole source value must match the pattern (see {@link Matcher#matches()}).
 * <li><code>result_mapping</code> - mapping of values of Capturing Groups found in input value into target fields.
 * Keys are Capturing Group indexes (a number, or a String containing an integer), values are target field names.
 * Target field can be same as input field. Dot notation can be used here for structure nesting. Target fields are
 * not rewritten if pattern doesn't match (but warning is generated in this case). Indexes which are negative or
 * higher than the number of Capturing Groups in the pattern are silently skipped.
 * <li><code>source_bases</code> - list of fields in source data which are used as bases for extraction. If defined
 * then extraction is performed for each of these fields, <code>source_field</code> and
 * <code>target_field_xx</code> are resolved relatively against this base. Base must provide object or list of
 * objects.
 * </ul>
 *
 * @author Vlastimil Elias (velias at redhat dot com)
 * @see StructuredContentPreprocessorFactory
 * @see Pattern
 * @see Matcher#matches()
 * @see Matcher#group(int)
 */
public class RegExpCapturingGroupPreprocessor extends StructuredContentPreprocessorWithSourceBasesBase<Object> {

  protected static final String CFG_SOURCE_FIELD = "source_field";
  protected static final String CFG_PATTERN = "pattern";
  protected static final String CFG_RESULT_MAPPING = "result_mapping";

  /** Name of the field the input value is read from (dot notation allowed). */
  protected String fieldSource;

  /** Compiled form of the configured <code>pattern</code>. */
  protected Pattern patternCompiled;

  /** Capturing Group index (Number or numeric String) mapped to the name of the target field. */
  protected Map<Object, String> resultMapping;

  /**
   * Read and validate preprocessor configuration: source field, pattern and result mapping.
   *
   * @param settings preprocessor settings structure
   * @throws SettingsException if a mandatory value is missing, the pattern can't be compiled or the result mapping
   *           is malformed
   */
  @SuppressWarnings("unchecked")
  @Override
  public void init(Map<String, Object> settings) throws SettingsException {
    super.init(settings);

    fieldSource = XContentMapValues.nodeStringValue(settings.get(CFG_SOURCE_FIELD), null);
    validateConfigurationStringNotEmpty(fieldSource, CFG_SOURCE_FIELD);

    String pattern = XContentMapValues.nodeStringValue(settings.get(CFG_PATTERN), null);
    validateConfigurationStringNotEmpty(pattern, CFG_PATTERN);
    try {
      patternCompiled = Pattern.compile(pattern);
    } catch (PatternSyntaxException e) {
      // keep the original exception as cause so the stack trace is not lost
      throw new SettingsException("'settings/" + CFG_PATTERN + "' configuration value for '" + name
          + "' preprocessor is invalid: " + e.getMessage(), e);
    }

    try {
      resultMapping = (Map<Object, String>) settings.get(CFG_RESULT_MAPPING);
      validateResultMappingConfiguration(resultMapping, CFG_RESULT_MAPPING);
    } catch (ClassCastException e) {
      // configured value is not a Map at all
      throw new SettingsException("'settings/" + CFG_RESULT_MAPPING + "' configuration value for '" + name
          + "' preprocessor is invalid", e);
    }
  }

  /**
   * Validate result mapping configuration part. Each key must be a non-empty numeric index (a {@link Number} or a
   * String parseable as an integer) and each value must be a non-empty String.
   *
   * @param value to check
   * @param configFieldName name of field in preprocessor settings structure. Used for error message.
   * @throws SettingsException thrown if value is not valid
   */
  protected void validateResultMappingConfiguration(Map<Object, String> value, String configFieldName)
      throws SettingsException {
    if (value == null || value.isEmpty()) {
      throw new SettingsException("Missing or empty 'settings/" + configFieldName + "' configuration object for '"
          + name + "' preprocessor");
    }
    // Entries are read through wildcard types because the map comes from an unchecked cast of parsed
    // configuration, so values are not guaranteed to really be Strings.
    for (Map.Entry<?, ?> entry : value.entrySet()) {
      Object index = entry.getKey();
      if (ValueUtils.isEmpty(index)) {
        throw new SettingsException("Missing or empty index in 'settings/" + configFieldName
            + "' configuration for '" + name + "' preprocessor");
      }
      if (!isNumericGroupIndex(index)) {
        throw new SettingsException("Index must be a number in 'settings/" + configFieldName
            + "' configuration for '" + name + "' preprocessor");
      }
      Object target = entry.getValue();
      if (target != null && !(target instanceof String)) {
        throw new SettingsException("Value for 'settings/" + configFieldName + "/" + index
            + "' configuration for '" + name + "' preprocessor must be String");
      }
      if (ValueUtils.isEmpty((String) target)) {
        throw new SettingsException("Missing or empty value in 'settings/" + configFieldName + "/" + index
            + "' configuration for '" + name + "' preprocessor");
      }
    }
  }

  /**
   * Check whether a result mapping key can be used as Capturing Group index.
   *
   * @param index key from result mapping configuration
   * @return true if index is a {@link Number}, or a String parseable by {@link Integer#parseInt(String)}
   */
  private static boolean isNumericGroupIndex(Object index) {
    if (index instanceof Number) {
      return true;
    }
    if (index instanceof String) {
      try {
        Integer.parseInt((String) index);
        return true;
      } catch (NumberFormatException ignored) {
        // not an integer, reported by caller as invalid index
        return false;
      }
    }
    return false;
  }

  /**
   * Convert a validated result mapping key into Capturing Group index.
   *
   * @param index key from result mapping, a {@link Number} or an object whose String form is an integer
   * @return group index
   * @throws NumberFormatException if a non-Number key is not parseable as integer
   */
  private static int toGroupIndex(Object index) {
    if (index instanceof Number) {
      return ((Number) index).intValue();
    }
    return Integer.parseInt(index.toString());
  }

  /**
   * Store data warning into chain context and write it into log on debug level.
   *
   * @param chainContext to add warning into
   * @param warningMessage text of the warning
   */
  private void addDataWarningAndLog(PreprocessChainContext chainContext, String warningMessage) {
    addDataWarning(chainContext, warningMessage);
    logger.debug(warningMessage);
  }

  /**
   * Read value of source field from data, match it against the pattern, and copy mapped Capturing Groups into
   * target fields of the same data. Nothing is done if source value is null. A data warning is generated if the
   * value is not a String or doesn't match the pattern; data are left untouched in these cases.
   *
   * @param data to read source value from and write target values into
   * @param context not used by this preprocessor
   * @param base not used by this preprocessor
   * @param chainContext used to collect data warnings
   */
  @Override
  protected void processOneSourceValue(Map<String, Object> data, Object context, String base,
      PreprocessChainContext chainContext) {
    Object v = fieldSource.contains(".") ? XContentMapValues.extractValue(fieldSource, data) : data
        .get(fieldSource);
    if (v == null) {
      return;
    }
    if (!(v instanceof String)) {
      addDataWarningAndLog(chainContext, "value for field '" + fieldSource + "' is not String but is "
          + v.getClass().getName() + ", so can't be processed");
      return;
    }

    String vs = (String) v;
    Matcher m = patternCompiled.matcher(vs);
    if (!m.matches()) {
      addDataWarningAndLog(chainContext, "value '" + vs + "' for field '" + fieldSource
          + "' do not match pattern, so can't be processed");
      return;
    }

    for (Map.Entry<Object, String> mapping : resultMapping.entrySet()) {
      int i = toGroupIndex(mapping.getKey());
      if (i < 0 || i > m.groupCount()) {
        // index doesn't address any group of the pattern, nothing to copy
        continue;
      }
      try {
        // NOTE(review): after successful matches() call Matcher#group(int) returns null for a group which didn't
        // participate in the match instead of throwing, so this catch presumably guards the StructureUtils call
        // - confirm.
        StructureUtils.putValueIntoMapOfMaps(data, mapping.getValue(), m.group(i));
      } catch (IllegalStateException e) {
        addDataWarningAndLog(chainContext, "No match found for Capturing group " + i + " in value '" + vs
            + "' from field '" + fieldSource + "'");
      }
    }
  }

  /**
   * This preprocessor needs no per-call context.
   *
   * @param data processed data, not used
   * @return always null
   */
  @Override
  protected Object createContext(Map<String, Object> data) {
    return null;
  }

  /**
   * @return configured name of the source field
   */
  public String getFieldSource() {
    return fieldSource;
  }

  /**
   * @return configured mapping of Capturing Group indexes to target field names
   */
  public Map<Object, String> getResultMapping() {
    return resultMapping;
  }

  /**
   * @return configured regular expression as String, null if no pattern is compiled yet
   */
  public String getPattern() {
    return patternCompiled != null ? patternCompiled.pattern() : null;
  }
}