StripHtmlPreprocessor.java example

Explorer

structured-content-tools-master
- src
  - main
    - java
      - org
        jboss
        elasticsearch
        tools
        content
        AddCurrentTimestampPreprocessor.java
        AddMultipleValuesPreprocessor.java
        AddValuePreprocessor.java
        ESLookupValuePreprocessor.java
        InvalidDataException.java
        IsDateInRangePreprocessor.java
        LongToTimestampValuePreprocessor.java
        MaxTimestampPreprocessor.java
        PreprocessChainContext.java
        PreprocessChainContextImpl.java
        RegExpCapturingGroupPreprocessor.java
        RemoveMultipleFieldsPreprocessor.java
        RequiredValidatorPreprocessor.java
        ScriptingPreprocessor.java
        SimpleValueMapMapperPreprocessor.java
        StripHtmlPreprocessor.java
        StructureUtils.java
        StructuredContentPreprocessor.java
        StructuredContentPreprocessorBase.java
        StructuredContentPreprocessorFactory.java
        StructuredContentPreprocessorWithSourceBasesBase.java
        TrimStringValuePreprocessor.java
        ValueUtils.java
        ValuesCollectingPreprocessor.java
  - test
    - java
      - org
        jboss
        elasticsearch
        tools
        content
        AddCurrentTimestampPreprocessorTest.java
        AddMultipleValuesPreprocessorTest.java
        AddValuePreprocessorTest.java
        ESLookupValuePreprocessorTest.java
        IsDateInRangePreprocessorTest.java
        LongToTimestampValuePreprocessorTest.java
        MaxTimestampPreprocessorTest.java
        PreprocessChainContextImplTest.java
        RegExpCapturingGroupPreprocessorTest.java
        RemoveMultipleFieldsPreprocessorTest.java
        RequiredValidatorPreprocessorTest.java
        ScriptingPreprocessorTest.java
        SimpleValueMapMapperPreprocessorTest.java
        StripHtmlPreprocessorTest.java
        StructureUtilsTest.java
        StructuredContentPreprocessorBaseTest.java
        StructuredContentPreprocessorFactoryTest.java
        StructuredContentPreprocessorMock.java
        StructuredContentPreprocessorWithSourceBasesBaseTest.java
        TrimStringValuePreprocessorTest.java
        ValueUtilsTest.java
        ValuesCollectingPreprocessorTest.java
        testtools
        ESRealClientTestBase.java
        TestUtils.java

/*
 * JBoss, Home of Professional Open Source
 * Copyright 2012 Red Hat Inc. and/or its affiliates and other contributors
 * as indicated by the @authors tag. All rights reserved.
 */
package org.jboss.elasticsearch.tools.content;

import java.util.List;
import java.util.Map;

import org.elasticsearch.common.settings.SettingsException;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

/**
 * Content preprocessor which takes a String value from the source field, strips HTML tags from it, unescapes HTML
 * entities (<code>&lt;</code>, <code>&gt;</code>, <code>&amp;</code> etc.) and stores the result into another or the
 * same target field. Example of configuration for this preprocessor:
 * 
 * <pre>
 * { 
 *     "name"     : "HTML content to text description convertor",
 *     "class"    : "org.jboss.elasticsearch.tools.content.StripHtmlPreprocessor",
 *     "settings" : {
 *         "source_field"  : "content",
 *         "target_field"  : "description"
 *     } 
 * }
 * </pre>
 * 
 * Options are:
 * <ul>
 * <li><code>source_field</code> - source field in input data. Dot notation for nested values can be used here (see
 * {@link XContentMapValues#extractValue(String, Map)}).
 * <li><code>target_field</code> - target field in data to store mapped value into. Can be same as input field. Dot
 * notation can be used here for structure nesting.
 * <li><code>source_bases</code> - list of fields in source data which are used as bases for stripping. If defined then
 * stripping is performed for each of these fields, and <code>source_field</code> and <code>target_field</code> are
 * resolved relative to the base. Each base must provide an object or a list of objects.
 * </ul>
 * 
 * @author Vlastimil Elias (velias at redhat dot com)
 * @see StructuredContentPreprocessorFactory
 */
public class StripHtmlPreprocessor extends StructuredContentPreprocessorWithSourceBasesBase<Object> {

	/** Name of the configuration setting which holds the source field. */
	protected static final String CFG_SOURCE_FIELD = "source_field";
	/** Name of the configuration setting which holds the target field. */
	protected static final String CFG_TARGET_FIELD = "target_field";

	/** Field the HTML value is read from, as configured by {@link #CFG_SOURCE_FIELD}. */
	protected String fieldSource;
	/** Field the stripped text is written to, as configured by {@link #CFG_TARGET_FIELD}. */
	protected String fieldTarget;

	/**
	 * Initializes the parent class and then reads and validates both field settings. The source field is read and
	 * validated first, the target field second.
	 * 
	 * @param settings configuration of this preprocessor
	 * @throws SettingsException declared by the overridden method
	 */
	@Override
	public void init(Map<String, Object> settings) throws SettingsException {
		super.init(settings);

		Object configuredSource = settings.get(CFG_SOURCE_FIELD);
		fieldSource = XContentMapValues.nodeStringValue(configuredSource, null);
		validateConfigurationStringNotEmpty(fieldSource, CFG_SOURCE_FIELD);

		Object configuredTarget = settings.get(CFG_TARGET_FIELD);
		fieldTarget = XContentMapValues.nodeStringValue(configuredTarget, null);
		validateConfigurationStringNotEmpty(fieldTarget, CFG_TARGET_FIELD);
	}

	/**
	 * This preprocessor uses no context object.
	 * 
	 * @param data processed data, not used here
	 * @return always <code>null</code>
	 */
	@Override
	protected Object createContext(Map<String, Object> data) {
		return null;
	}

	/**
	 * Reads the source field from the passed data, strips HTML from it and stores the result into the target field of
	 * the same data. Nothing happens if the source value is <code>null</code>. If the source value is not a String, a
	 * data warning is added, the same message is logged on debug level, and no value is stored.
	 * 
	 * @param data structure to read the source value from and to write the result into
	 * @param context not used by this preprocessor
	 * @param base used only to build the full field name in the warning message
	 * @param chainContext context the data warning is added into
	 */
	@Override
	protected void processOneSourceValue(Map<String, Object> data, Object context, String base,
			PreprocessChainContext chainContext) {
		// dot notation means nested value, plain name is looked up directly
		Object sourceValue = fieldSource.contains(".") ? XContentMapValues.extractValue(fieldSource, data)
				: data.get(fieldSource);

		if (sourceValue == null) {
			return;
		}

		if (sourceValue instanceof String) {
			String stripped = stripHtml((String) sourceValue);
			StructureUtils.putValueIntoMapOfMaps(data, fieldTarget, stripped);
			return;
		}

		String warning = "Value for field '" + getFullFieldName(base, fieldSource)
				+ "' is not String, so can't be processed";
		addDataWarning(chainContext, warning);
		logger.debug(warning);
	}

	/**
	 * Converts HTML into plain text. The value is cleaned by jsoup with the relaxed whitelist first, then parsed, and
	 * the text of the parsed document body is collected by {@link #convertNodeToText(Element)}.
	 * 
	 * @param value to strip HTML from
	 * @return plain text; the passed value itself if it is <code>null</code> or contains only whitespace
	 */
	protected String stripHtml(String value) {
		boolean nothingToStrip = value == null || value.trim().isEmpty();
		if (nothingToStrip) {
			return value;
		}
		String cleanedHtml = Jsoup.clean(value, Whitelist.relaxed());
		Document parsed = Jsoup.parse(cleanedHtml);
		return convertNodeToText(parsed.body());
	}

	/**
	 * Collects the text of all text nodes found while traversing the passed element. Each text has non breaking
	 * spaces replaced by common spaces and is trimmed; texts which are empty after that are skipped. The remaining
	 * texts are joined by one space.
	 * 
	 * @param element to collect text from
	 * @return collected text, empty string if the element is <code>null</code>
	 */
	protected String convertNodeToText(Element element) {
		if (element == null) {
			return "";
		}

		final StringBuilder collected = new StringBuilder();
		NodeVisitor textCollector = new NodeVisitor() {
			@Override
			public void head(Node node, int depth) {
				if (!(node instanceof TextNode)) {
					return;
				}
				// non breaking space is handled as common space, so it is trimmed too
				String chunk = ((TextNode) node).text().replace('\u00A0', ' ').trim();
				if (chunk.isEmpty()) {
					return;
				}
				// trimmed chunk never ends with space, so the separator is always appended; the one after the last
				// chunk is removed once traversal is finished
				collected.append(chunk).append(" ");
			}

			@Override
			public void tail(Node node, int depth) {
				// nothing to do when leaving the node
			}
		};
		new NodeTraversor(textCollector).traverse(element);

		// removal of the last extra space
		int length = collected.length();
		if (length > 0 && collected.charAt(length - 1) == ' ') {
			collected.setLength(length - 1);
		}
		return collected.toString();
	}

	/**
	 * @return configured source field
	 */
	public String getFieldSource() {
		return fieldSource;
	}

	/**
	 * @return configured target field
	 */
	public String getFieldTarget() {
		return fieldTarget;
	}

	/**
	 * @return the <code>sourceBases</code> list held by this preprocessor
	 */
	public List<String> getSourceBases() {
		return sourceBases;
	}

}