/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nifi; import org.apache.nifi.components.PropertyDescriptor; import org.apache.nifi.components.ValidationContext; import org.apache.nifi.components.ValidationResult; import org.apache.nifi.components.Validator; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.processor.AbstractProcessor; import org.apache.nifi.processor.ProcessContext; import org.apache.nifi.processor.ProcessSession; import org.apache.nifi.processor.Relationship; import org.apache.nifi.processor.io.InputStreamCallback; import org.apache.nifi.processor.util.StandardValidators; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Selector; import java.io.IOException; import java.io.InputStream; import java.util.concurrent.atomic.AtomicReference; public abstract class AbstractHTMLProcessor extends AbstractProcessor { protected static final String ELEMENT_HTML = "HTML"; protected static final String ELEMENT_TEXT = "Text"; protected static final String ELEMENT_DATA = "Data"; protected static final String ELEMENT_ATTRIBUTE = "Attribute"; protected static final Validator CSS_SELECTOR_VALIDATOR = new Validator() { @Override public ValidationResult validate(final String subject, final String value, final ValidationContext context) { if (context.isExpressionLanguageSupported(subject) && context.isExpressionLanguagePresent(value)) { return new ValidationResult.Builder().subject(subject).input(value).explanation("Expression Language Present").valid(true).build(); } String reason = null; try { Document doc = Jsoup.parse("<html></html>"); doc.select(value); } catch (final Selector.SelectorParseException e) { reason = "\"" + value + "\" is an invalid CSS selector"; } return new ValidationResult.Builder().subject(subject).input(value).explanation(reason).valid(reason == null).build(); } }; public static final PropertyDescriptor URL = new PropertyDescriptor .Builder().name("URL") .description("Base URL for the HTML page being parsed." + " This URL will be used to resolve an absolute URL" + " when an attribute value is extracted from a HTML element.") .required(true) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .expressionLanguageSupported(true) .build(); public static final PropertyDescriptor CSS_SELECTOR = new PropertyDescriptor .Builder().name("CSS Selector") .description("CSS selector syntax string used to extract the desired HTML element(s).") .required(true) .addValidator(CSS_SELECTOR_VALIDATOR) .expressionLanguageSupported(true) .build(); public static final PropertyDescriptor HTML_CHARSET = new PropertyDescriptor .Builder().name("HTML Character Encoding") .description("Character encoding of the input HTML") .defaultValue("UTF-8") .required(true) .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR) .build(); public static final Relationship REL_ORIGINAL = new Relationship.Builder() .name("original") .description("The original HTML input") .build(); public static final Relationship REL_SUCCESS = new Relationship.Builder() .name("success") .description("Successfully parsed HTML element") .build(); public static final Relationship REL_INVALID_HTML = new Relationship.Builder() .name("invalid html") .description("The input HTML syntax is invalid") .build(); public static final Relationship REL_NOT_FOUND = new Relationship.Builder() .name("element not found") .description("Element could not be found in the HTML document. The original HTML input will remain " + "in the FlowFile content unchanged. Relationship '" + REL_ORIGINAL + "' will not be invoked " + "in this scenario.") .build(); /** * Parses the Jsoup HTML document from the FlowFile input content. * * @param inputFlowFile Input FlowFile containing the HTML * @param context ProcessContext * @param session ProcessSession * * @return Jsoup Document */ protected Document parseHTMLDocumentFromFlowfile(final FlowFile inputFlowFile, final ProcessContext context, final ProcessSession session) { final AtomicReference<Document> doc = new AtomicReference<>(); session.read(inputFlowFile, new InputStreamCallback() { @Override public void process(InputStream inputStream) throws IOException { final String baseUrl = getBaseUrl(inputFlowFile, context); if (baseUrl == null || baseUrl.isEmpty()) { throw new RuntimeException("Base URL was empty."); } doc.set(Jsoup.parse(inputStream, context.getProperty(HTML_CHARSET).getValue(), baseUrl)); } }); return doc.get(); } protected String getBaseUrl(final FlowFile inputFlowFile, final ProcessContext context) { return "http://localhost/"; } }