/*
* Seldon -- open source prediction engine
* =======================================
* Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/)
*
**********************************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************************************
*/
package io.seldon.importer.articles;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonGenerationException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class AttributesImporterUtils {
public static Set<String> getTags(Document articleDoc, String tagsCssSelector, String title) {
Set<String> tagSet = new HashSet<String>();
if (StringUtils.isNotBlank(tagsCssSelector)) {
Elements tagsElements = articleDoc.select(tagsCssSelector);
Element tagsElement = tagsElements.first();
List<String> tagsParts;
if ((tagsElement != null) && (tagsElement.attr("content") != null)
&& (StringUtils.isNotBlank(tagsElement.attr("content")))) {
tagsParts = AttributesImporterUtils.getTagsPartsFromSingleElement(tagsElement);
} else {
tagsParts = AttributesImporterUtils.getTagsPartsFromMultipleElement(tagsElements);
}
List<String> extraTagsParts = AttributesImporterUtils.createExtraTagsPartsFromTitle(title, tagsParts);
tagSet.addAll(tagsParts);
tagSet.addAll(extraTagsParts);
}
return tagSet;
}
// for like -tagsSelector "head > meta[name=keywords]"
public static List<String> getTagsPartsFromSingleElement(Element tagsElement) {
String tagsRaw = tagsElement.attr("content");
String[] parts = tagsRaw.split(",");
for (int i = 0; i < parts.length; i++)
parts[i] = parts[i].trim().toLowerCase();
List<String> tagsParts = (parts != null) ? new ArrayList<String>(Arrays.asList(parts))
: new ArrayList<String>();
return tagsParts;
}
// for like -tagsSelector "section[id=tags] > a"
public static List<String> getTagsPartsFromMultipleElement(Elements tagsElements) {
List<String> tagsParts = new ArrayList<String>();
for (Element e : tagsElements) {
String tag = e.text();
tag = StringUtils.strip(tag);
tag = tag.toLowerCase();
tagsParts.add(tag);
}
return tagsParts;
}
public static List<String> createExtraTagsPartsFromTitle(String title, List<String> tagsParts) {
List<String> tileParts = new ArrayList<String>();
boolean haveTitle = StringUtils.isNotBlank(title);
String titleLower = title.toLowerCase();
String last_part = null;
for (String part : tagsParts) {
if (StringUtils.isNotBlank(part)) {
if (haveTitle && (last_part != null) && StringUtils.isNotBlank(last_part)) {
String phrase = last_part + " " + part;
if (titleLower.contains(phrase))
tileParts.add(phrase.toLowerCase());
}
}
last_part = part;
}
return tileParts;
}
public static void logResult(Logger logger, ItemProcessResult itemProcessResult) {
String v = "none";
ObjectMapper mapper = new ObjectMapper();
try {
v = mapper.writeValueAsString( itemProcessResult );
} catch (JsonGenerationException e) {
e.printStackTrace();
} catch (JsonMappingException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
logger.info("ItemProcessResult: "+v);
}
public static String getBaseUrl(String url) throws Exception {
URL aURL = new URL(url);
String protocol = aURL.getProtocol();
String host = aURL.getHost();
String path = aURL.getPath();
String baseUrl = String.format("%s://%s%s", protocol,host,path);
return baseUrl;
}
}