/**
* Copyright (C) 2010-2017 Structr GmbH
*
* This file is part of Structr <http://structr.org>.
*
* Structr is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* Structr is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with Structr. If not, see <http://www.gnu.org/licenses/>.
*/
package org.structr.crawler;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.structr.common.View;
import org.structr.common.error.FrameworkException;
import org.structr.core.Export;
import org.structr.core.app.App;
import org.structr.core.app.StructrApp;
import org.structr.core.converter.PropertyConverter;
import org.structr.core.entity.AbstractNode;
import org.structr.core.entity.Principal;
import org.structr.core.graph.NodeInterface;
import org.structr.core.property.*;
import org.structr.rest.common.HttpHelper;
import org.structr.schema.ConfigurationProvider;
public class SourcePattern extends AbstractNode {
// SLF4J logger named after this class.
private static final Logger logger = LoggerFactory.getLogger(SourcePattern.class.getName());
// Relationship properties. "subPatterns" and "parentPattern" are the end-node and start-node
// side of the same relation class (SourcePatternSUBSourcePattern); "subPage" and "sourcePage"
// link this pattern to SourcePage nodes via their own relation classes.
public static final Property<List<SourcePattern>> subPatternsProperty = new EndNodes<>("subPatterns", SourcePatternSUBSourcePattern.class);
public static final Property<SourcePage> subPageProperty = new EndNode<>("subPage", SourcePatternSUBPAGESourcePage.class);
public static final Property<SourcePage> sourcePageProperty = new StartNode<>("sourcePage", SourcePageUSESourcePattern.class);
public static final Property<SourcePattern> parentPatternProperty = new StartNode<>("parentPattern", SourcePatternSUBSourcePattern.class);
// Optional loop bounds used by extract(): "from" is the first index (1 when unset),
// "to" is the last index (the number of matched elements when unset).
public static final Property<Long> fromProperty = new LongProperty("from");
public static final Property<Long> toProperty = new LongProperty("to");
// Selector string passed to jsoup's Document.select() in this class.
public static final Property<String> selectorProperty = new StringProperty("selector").indexed();
// Name of the node type that is instantiated by extract() and used to look up property keys.
public static final Property<String> mappedTypeProperty = new StringProperty("mappedType").indexed();
// JSON name of the property on the mapped type that receives the extracted text.
public static final Property<String> mappedAttributeProperty = new StringProperty("mappedAttribute").indexed();
// Pattern handed to java.text.DecimalFormat when the target key is a DoubleProperty and no locale is set.
public static final Property<String> mappedAttributeFormatProperty = new StringProperty("mappedAttributeFormat");
// String handed to new Locale(...) when the target key is a DoubleProperty; takes precedence over the format.
public static final Property<String> mappedAttributeLocaleProperty = new StringProperty("mappedAttributeLocale");
// Not read by any method in this class.
public static final Property<String> inputValue = new StringProperty("inputValue").indexed();
// The "ui" view lists every property declared above.
public static final View uiView = new View(SourcePattern.class, "ui",
subPatternsProperty, subPageProperty, sourcePageProperty, parentPatternProperty, fromProperty, toProperty, selectorProperty, mappedTypeProperty, mappedAttributeProperty, mappedAttributeFormatProperty, mappedAttributeLocaleProperty, inputValue
);
/**
 * Looks up the node entity class for a type name in the Structr configuration.
 *
 * @param typeString name of the node type; may be {@code null}
 * @return the class returned by the configuration for {@code typeString}
 * @throws FrameworkException with status 422 if {@code typeString} is {@code null}
 *         or the configuration lookup yields {@code null}
 */
private Class type(final String typeString) throws FrameworkException {

    final ConfigurationProvider provider = StructrApp.getConfiguration();
    final Class resolved = (typeString == null) ? null : provider.getNodeEntityClass(typeString);

    if (resolved != null) {
        return resolved;
    }

    throw new FrameworkException(422, "Unknown type '" + typeString + "'");
}
/**
 * Creates a new node of the named type through an {@link App} obtained for
 * this node's security context.
 *
 * @param typeString name of the node type to instantiate
 * @return the node returned by {@code App.create}
 * @throws FrameworkException if the type name cannot be resolved or creation fails
 */
private NodeInterface create(final String typeString) throws FrameworkException {

    final App structrApp = StructrApp.getInstance(securityContext);
    final Class nodeType = type(typeString);

    return structrApp.create(nodeType);
}
/**
 * Finds the site this pattern belongs to.
 *
 * Starts at this pattern and follows the {@code parentPattern} relationship upwards
 * until a pattern with a {@code sourcePage} is found, then returns the value of that
 * page's {@code SourcePage.site} property.
 *
 * @return the value of the page's site property (not checked for {@code null} here)
 * @throws FrameworkException with status 422 if neither this pattern nor any of its
 *         ancestors is attached to a source page
 */
private SourceSite getSite() throws FrameworkException {

    SourcePattern pattern = this;
    SourcePage page = pattern.getProperty(sourcePageProperty);

    while (page == null) {

        pattern = pattern.getProperty(parentPatternProperty);

        if (pattern == null) {
            // End of the parent chain reached without finding a page. Without this check the
            // loop would run again and dereference the null pattern.
            throw new FrameworkException(422, "Pattern is not attached to a source page, neither directly nor via a parent pattern.");
        }

        page = pattern.getProperty(sourcePageProperty);
    }

    return page.getProperty(SourcePage.site);
}
/**
 * Fetches the content of the given URL via {@link HttpHelper#get}.
 *
 * Proxy settings and the cookie are read from the site returned by {@code getSite()}.
 * If the site has no proxy URL and the security context has a cached user, the user's
 * proxy URL, username and password are used instead. Every literal {@code <head>} in
 * the response is followed by an inserted {@code <base href="...">} element pointing
 * at {@code urlString}.
 *
 * @param urlString the URL to fetch
 * @return the fetched content with the {@code <base>} element inserted
 * @throws FrameworkException with status 422 if no site can be determined for this
 *         pattern, or whatever the called helpers throw
 */
private String getContent(final String urlString) throws FrameworkException {

    final SourceSite site = getSite();
    if (site == null) {
        // Fail with a descriptive error instead of a NullPointerException below.
        throw new FrameworkException(422, "Unable to determine the source site of this pattern, exiting.");
    }

    String proxyUrl = site.getProperty(SourceSite.proxyUrl);
    String proxyUsername = site.getProperty(SourceSite.proxyUsername);
    String proxyPassword = site.getProperty(SourceSite.proxyPassword);

    final Principal user = securityContext.getCachedUser();

    // Logical AND (&&) instead of the bitwise/non-short-circuit '&'.
    if (user != null && StringUtils.isBlank(proxyUrl)) {
        proxyUrl = user.getProperty(Principal.proxyUrl);
        proxyUsername = user.getProperty(Principal.proxyUsername);
        proxyPassword = user.getProperty(Principal.proxyPassword);
    }

    final String cookie = site.getProperty(SourceSite.cookie);

    return HttpHelper.get(urlString, proxyUrl, proxyUsername, proxyPassword, cookie, Collections.EMPTY_MAP)
        .replace("<head>", "<head>\n <base href=\"" + urlString + "\">");
}
/**
 * Applies one pattern to a document and stores the result on {@code obj}.
 *
 * If {@code mappedAttribute} is non-empty, the text of the elements matched by
 * {@code selector} is converted and written to the property of {@code mappedType}
 * with that JSON name. Otherwise, if {@code subPage} is given, the {@code href} found
 * at {@code selector} is resolved against the sub page's URL, the linked page is
 * fetched and parsed, and every pattern of the sub page is run against it with
 * {@code obj} as target object.
 *
 * @param obj                   node that receives the extracted value(s)
 * @param doc                   parsed document to select from
 * @param selector              selector passed to {@code doc.select()}
 * @param mappedType            name of the type used to look up the property key
 * @param mappedAttribute       JSON name of the target property; may be empty
 * @param mappedAttributeFormat optional {@link DecimalFormat} pattern for double values
 * @param subPage               optional page to follow when no attribute is mapped
 * @throws FrameworkException with status 422 if the type is unknown, a numeric value
 *         cannot be parsed, or the sub page URL is missing or malformed
 */
private void extractAndSetValue(final NodeInterface obj, final Document doc, final String selector, final String mappedType, final String mappedAttribute, final String mappedAttributeFormat, final SourcePage subPage) throws FrameworkException {

    if (StringUtils.isNotEmpty(mappedAttribute)) {

        // Text of everything the selector matches
        final String extracted = doc.select(selector).text();
        final PropertyKey key = StructrApp.getConfiguration().getPropertyKeyForJSONName(type(mappedType), mappedAttribute);

        if (key != null) {
            obj.setProperty(key, convertExtractedValue(key, extracted, mappedAttributeFormat));
        }

    } else if (subPage != null) {

        final String pageUrl = subPage.getProperty(SourcePage.url);
        if (pageUrl == null) {
            throw new FrameworkException(422, "Sub page has no URL, exiting.");
        }

        final URI pageUri;
        try {
            pageUri = new URI(pageUrl);
        } catch (URISyntaxException ex) {
            throw new FrameworkException(422, "Unable to parse sub page url: " + pageUrl);
        }

        final String href = doc.select(selector).attr("href");
        if (StringUtils.isBlank(href)) {
            // Nothing to follow; fetching a URL built from an empty link would only
            // run the sub page patterns against an unrelated document.
            logger.warn("No href found for selector '{}', skipping sub page", selector);
            return;
        }

        // Fetch and parse the linked page
        final Document subDoc = Jsoup.parse(getContent(resolveLink(pageUri, href)));

        final List<SourcePattern> subPagePatterns = subPage.getProperty(SourcePage.patterns);
        for (final SourcePattern subPagePattern : subPagePatterns) {

            final Map<String, Object> params = new HashMap<>();
            params.put("document", subDoc);
            params.put("object", obj);

            subPagePattern.extract(params);
        }
    }
}

/**
 * Converts extracted text into the value to store under {@code key}.
 *
 * Without an input converter the text is returned unchanged. For a {@link DoubleProperty}
 * with a locale or format configured, the text is parsed with the matching
 * {@link NumberFormat}; in all other cases the key's input converter is applied.
 *
 * NOTE(review): the locale is read from this pattern, while the format is supplied by the
 * caller (extract() passes the sub pattern's format) - confirm whether the sub pattern's
 * locale was intended.
 */
private Object convertExtractedValue(final PropertyKey key, final String extracted, final String mappedAttributeFormat) throws FrameworkException {

    final PropertyConverter inputConverter = key.inputConverter(securityContext);
    if (inputConverter == null) {
        return extracted;
    }

    if (key instanceof DoubleProperty) {

        final String locale = getProperty(mappedAttributeLocaleProperty);

        // NumberFormat instead of a DecimalFormat cast: getNumberInstance() is not
        // guaranteed to return a DecimalFormat for every locale.
        NumberFormat numberFormat = null;
        if (StringUtils.isNotBlank(locale)) {
            numberFormat = NumberFormat.getNumberInstance(new Locale(locale));
        } else if (StringUtils.isNotBlank(mappedAttributeFormat)) {
            numberFormat = new DecimalFormat(mappedAttributeFormat);
        }

        if (numberFormat != null) {
            try {
                // parse() turns text into a number; format() would reject a String argument.
                return numberFormat.parse(extracted.trim()).doubleValue();
            } catch (ParseException pex) {
                logger.warn("Unable to parse extracted value '{}' as a number", extracted, pex);
                throw new FrameworkException(422, "Unable to parse extracted value '" + extracted + "' as a number");
            }
        }
    }

    // Also covers double keys without locale/format, which previously skipped conversion.
    return inputConverter.convert(extracted);
}

/**
 * Resolves a link found on a page against that page's URI, so that absolute URLs,
 * absolute paths and relative paths are all handled. If {@code href} is not a valid
 * URI reference, falls back to scheme + "://" + authority + href.
 */
private static String resolveLink(final URI pageUri, final String href) {

    try {
        return pageUri.resolve(href).toString();
    } catch (IllegalArgumentException iae) {
        return pageUri.getScheme() + "://" + pageUri.getAuthority() + href;
    }
}
/**
 * Runs this pattern and writes the extracted values to nodes.
 *
 * Recognised entries in {@code parameters}:
 * "document" - an already parsed {@link Document}; if absent, the URL of this pattern's
 * source page is fetched and parsed;
 * "object" - a node that receives all values; if absent, a new node of the mapped type
 * is created for every index.
 *
 * The index runs from "from" (default 1) to "to" (default: number of elements matched
 * by the selector). With sub patterns, each sub pattern is applied with the selector
 * {@code <selector>:nth-child(<index>) > <sub selector>}; without sub patterns, this
 * pattern's own selector and mapped attribute are applied.
 *
 * @param parameters optional "document" and "object" entries as described above
 * @throws FrameworkException with status 422 if the source page, selector, page URL or
 *         mapped type is missing, or if extraction fails
 */
@Export
public void extract(final Map<String, Object> parameters) throws FrameworkException {

    final SourcePage page = getProperty(sourcePageProperty);
    if (page == null) {
        throw new FrameworkException(422, "Pattern has no source page, exiting.");
    }

    final String selector = getProperty(selectorProperty);
    if (selector == null) {
        throw new FrameworkException(422, "Pattern has no selector, exiting.");
    }

    final Long from = getProperty(fromProperty);
    final Long to = getProperty(toProperty);
    final List<SourcePattern> subPatterns = getProperty(subPatternsProperty);

    // Target object handed down from a higher-level pattern, if any
    final NodeInterface parentObj = parameters.containsKey("object")
        ? (NodeInterface) parameters.get("object")
        : null;

    final Document doc;
    if (parameters.containsKey("document")) {

        doc = (Document) parameters.get("document");

    } else {

        final String url = page.getProperty(SourcePage.url);
        if (url == null) {
            throw new FrameworkException(422, "This pattern's source page has no URL, exiting.");
        }

        // Fetch the page and parse it with jsoup
        doc = Jsoup.parse(getContent(url));
    }

    final String mappedType = getProperty(mappedTypeProperty);
    if (mappedType == null) {
        throw new FrameworkException(422, "No mapped type given, exiting.");
    }

    final Elements parts = doc.select(selector);

    // Inclusive index range; the upper bound does not change while iterating
    final long lastIndex = (to == null) ? parts.size() : to;
    int index = (from == null) ? 1 : from.intValue();

    while (index <= lastIndex) {

        // Reuse the object passed in, otherwise create a fresh one per index
        final NodeInterface target = (parentObj != null) ? parentObj : create(mappedType);

        if (subPatterns.isEmpty()) {

            final String ownAttribute = getProperty(mappedAttributeProperty);
            final String ownFormat = getProperty(mappedAttributeFormatProperty);

            extractAndSetValue(target, doc, selector, mappedType, ownAttribute, ownFormat, null);

        } else {

            final String childPrefix = selector + ":nth-child(" + index + ") > ";

            for (final SourcePattern child : subPatterns) {

                final String childSelector = childPrefix + child.getProperty(SourcePattern.selectorProperty);
                final String childAttribute = child.getProperty(SourcePattern.mappedAttributeProperty);
                final String childFormat = child.getProperty(SourcePattern.mappedAttributeFormatProperty);
                final SourcePage childSubPage = child.getProperty(SourcePattern.subPageProperty);

                extractAndSetValue(target, doc, childSelector, mappedType, childAttribute, childFormat, childSubPage);
            }
        }

        index++;
    }
}
}