/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nifi; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.nifi.util.MockFlowFile; import org.apache.nifi.util.TestRunner; import org.apache.nifi.util.TestRunners; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Selector; import org.junit.Before; import org.junit.Test; public class TestGetHTMLElement extends AbstractHTMLTest { private TestRunner testRunner; @Before public void init() { testRunner = TestRunners.newTestRunner(GetHTMLElement.class); testRunner.setProperty(GetHTMLElement.URL, "http://localhost"); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.HTML_CHARSET, "UTF-8"); } @Test(expected = Selector.SelectorParseException.class) public void testCSSSelectorSyntaxValidator() throws IOException { Document doc = Jsoup.parse(new File("src/test/resources/Weather.html"), StandardCharsets.UTF_8.name()); doc.select("---invalidCssSelector"); } @Test public void testNoElementFound() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); //Bold element is not present in sample HTML testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); } @Test public void testInvalidSelector() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "InvalidCSSSelectorSyntax"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); } @Test public void testSingleElementFound() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "head"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); } @Test public void testMultipleElementFound() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "a"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 3); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); } @Test public void testElementFoundWriteToAttribute() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertAttributeEquals(GetHTMLElement.HTML_ELEMENT_ATTRIBUTE_NAME, ATL_WEATHER_LINK); } @Test public void testElementFoundWriteToContent() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertContentEquals(ATL_WEATHER_LINK); } @Test public void testValidPrependValueToFoundElement() throws Exception { final String PREPEND_VALUE = "TestPrepend"; testRunner.setProperty(GetHTMLElement.PREPEND_ELEMENT_VALUE, PREPEND_VALUE); testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertContentEquals(PREPEND_VALUE + ATL_WEATHER_LINK); } @Test public void testValidPrependValueToNotFoundElement() throws Exception { final String PREPEND_VALUE = "TestPrepend"; testRunner.setProperty(GetHTMLElement.PREPEND_ELEMENT_VALUE, PREPEND_VALUE); testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); } @Test public void testValidAppendValueToFoundElement() throws Exception { final String APPEND_VALUE = "TestAppend"; testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, APPEND_VALUE); testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "href"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertContentEquals(ATL_WEATHER_LINK + APPEND_VALUE); } @Test public void testValidAppendValueToNotFoundElement() throws Exception { final String APPEND_VALUE = "TestAppend"; testRunner.setProperty(GetHTMLElement.APPEND_ELEMENT_VALUE, APPEND_VALUE); testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "b"); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 1); } @Test public void testExtractAttributeFromElement() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "meta[name=author]"); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "Content"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertContentEquals(AUTHOR_NAME); } @Test public void testExtractAttributeFromElementRelativeUrl() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script"); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "src"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertContentEquals("js/scripts.js"); } @Test public void testExtractAttributeFromElementAbsoluteUrl() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script"); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertContentEquals("http://localhost/js/scripts.js"); } @Test public void testExtractAttributeFromElementAbsoluteUrlWithEL() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script"); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src"); testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}"); final Map<String, String> attributes = new HashMap<>(); attributes.put("contentUrl", "https://example.com/a/b/c/Weather.html"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath(), attributes); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertContentEquals("https://example.com/a/b/c/js/scripts.js"); } @Test public void testExtractAttributeFromElementAbsoluteUrlWithEmptyElResult() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "script"); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(GetHTMLElement.ATTRIBUTE_KEY, "abs:src"); // Expression Language returns empty string because flow-file doesn't have contentUrl attribute. testRunner.setProperty(GetHTMLElement.URL, "${contentUrl}"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 0); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 1); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 0); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); } @Test public void testExtractTextFromElement() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + ATL_ID); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_TEXT); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertContentEquals(ATL_WEATHER_TEXT); } @Test public void testExtractHTMLFromElement() throws Exception { testRunner.setProperty(GetHTMLElement.CSS_SELECTOR, "#" + GDR_ID); testRunner.setProperty(GetHTMLElement.DESTINATION, GetHTMLElement.DESTINATION_CONTENT); testRunner.setProperty(GetHTMLElement.OUTPUT_TYPE, GetHTMLElement.ELEMENT_HTML); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run(); testRunner.assertTransferCount(GetHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(GetHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(GetHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(GetHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(GetHTMLElement.REL_SUCCESS); ffs.get(0).assertContentEquals(GDR_WEATHER_TEXT); } }