/*
* Seldon -- open source prediction engine
* =======================================
* Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/)
*
**********************************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************************************
*/
package io.seldon.importer.articles;
import io.seldon.client.DefaultApiClient;
import io.seldon.client.beans.ItemBean;
import io.seldon.client.exception.ApiException;
import io.seldon.importer.articles.category.CategoryExtractor;
import io.seldon.utils.CollectionTools;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.sampullara.cli.Args;
import com.sampullara.cli.Argument;
public class FileItemAttributesImporter {
private static Logger logger = Logger.getLogger(FileItemAttributesImporter.class.getName());
@Argument(alias = "n", description = "How many items to import", required = false)
static Integer numItems = 500;
@Argument(alias = "i", description = "Interval in secs between runs", required = false)
static Integer intervalSecs = 600;
@Argument(alias = "gt", description = "Timeout on article http GET", required = false)
static Integer httpGetTimeout = 2000;
@Argument(alias = "api-url", description = "API Endpoint", required = true)
static String apiUrl;
@Argument(alias = "consumer-key", description = "Consumer Key", required = true)
static String consumerKey;
@Argument(alias = "consumer-secret", description = "Consumer Secret", required = true)
static String consumerSecret;
@Argument(alias = "it", description = "Item type", required = false, delimiter = ",")
static Integer[] itemTypes = new Integer[] {0,1};
/*
* CSS Selector arguments
*/
@Argument(alias = "imageSelector", description = "Image CSS Selector", required = false)
static String imageCssSelector = "head > meta[property=og:image]";
@Argument(alias = "titleSelector", description = "Title CSS Selector", required = false)
static String titleCssSelector = "head > meta[property=og:title]";
@Argument(alias = "leadTextSelector", description = "Lead Text CSS Selector", required = false)
static String leadTextCssSelector = "";
@Argument(alias = "textSelector", description = "Article text CSS Selector", required = false)
static String textCssSelector;
@Argument(alias = "tagsSelector", description = "Tags CSS Selector", required = false)
static String tagsCssSelector = "head > meta[name=keywords]";
@Argument(alias = "categorySelector", description = "Category CSS Selector", required = false)
static String categoryCssSelector;
@Argument(alias = "subCategorySelector", description = "Sub Category CSS Selector", required = false)
static String subCategoryCssSelector;
@Argument(alias = "linkSelector", description = "URL link CSS Selector", required = false)
static String linkCssSelector;
@Argument(alias = "publishDateSelector", description = "publish date CSS Selector", required = false)
static String publishDateCssSelector;
@Argument(alias = "urls", description = "File containing list of URLs", required = true)
static String urlFile;
@Argument(alias = "minFetch", description = "Min time between url requests in msecs", required = false)
static Integer minFetchGapMsecs = 500;
/*
* Arguments to allow success even if we don't find some elements
*/
@Argument(alias = "noimage", description = "is it ok not to find an image", required = false)
static boolean imageNotNeeded = false;
@Argument(alias = "nocategory", description = "is it ok not to find a category", required = false)
static boolean categoryNotNeeded = false;
/*
* Argument for using Domain
*/
@Argument(alias = "needdomain", description = "Should doamin be used as an attrinute", required = false)
static boolean domainIsNeeded = false;
/*
* Defaults
*/
@Argument(alias = "defImage", description = "Default image url", required = false)
static String defImageUrl;
@Argument(alias = "categoryPrefix", description = "The prefix for the supplied category extractor - will be io.seldon.importer.articles.category.<Prefix>CategoryExtractor", required = false)
static String categoryClassPrefix = "GeneralFirst";
@Argument(alias = "subCategoryPrefix", description = "The prefix for the supplied sub category extractor - will be io.seldon.importer.articles.category.<Prefix>SubCategoryExtractor", required = false)
static String subCategoryClassPrefix = ""; //"GeneralAll";
@Argument(alias = "t", description = "For testing, will not update", required = false)
static boolean testmode = false;
static int API_TIMEOUT = 10000;
static String ATTR_IMG_NAME = "img_url";
static final String ATTR_CATEGORIES = "categories";
static final String ATTR_TITLE = "title";
static final String CONTENT_TYPE_ARTICLE_VALID = "article";
static final String CONTENT_TYPE_ARTICLE_INVALID = "old_article";
static final int TYPE_NOT_VALID = 2;
static final int TYPE_NOT_SET = 0;
static final int TYPE_VALID = 1;
static final String UNVERIFIED_CONTENT_TYPE = "unverified_article";
static final String VERIFIED_CONTENT_TYPE = "article";
//field
static String CONTENT_TYPE = "content_type";
static String TITLE = "title";
static String CATEGORY = "category";
static String SUBCATEGORY = "subcategory";
static String IMG_URL = "img_url";
static String DESCRIPTION = "description";
static String TAGS = "tags";
static String LEAD_TEXT = "leadtext";
static String LINK = "link";
static String PUBLISH_DATE = "published_date";
static String DOMAIN = "domain";
private int total_item_processed_count = 0;
private int total_item_succeded_count = 0;
private FailFast failFast = null;
DefaultApiClient client;
static long lastUrlFetchTime = 0;
public FileItemAttributesImporter(DefaultApiClient client) {
this.client = client;
}
public void run() throws InterruptedException
{
logger.info("Starting...");
logger.info("Processing recent urls...");
int updates = process();
logger.info("Processed with "+updates+" updates");
logger.info("Processed urls...Finished");
if (failFast != null) {
failFast.stopChecking(); // We are exiting normally so no need to check the main thread is going to die
}
}
public static String getUrlEncodedString(String input)
{
URL url = null;
try
{
url = new URL(input);
URI uri = new URI(
url.getProtocol(),
url.getHost(),
url.getPath(),
url.getQuery(),
null);
String encoded = uri.toASCIIString();
return encoded;
}
catch(MalformedURLException mue)
{
logger.error("Malformed url "+input);
return null;
}
catch (URISyntaxException e)
{
logger.error("Failed to tranform url into uri ",e);
return null;
}
}
public int process()
{
int updates = 0;
try
{
Map<String,ItemBean> itemMap = new HashMap<String,ItemBean>();
List<ItemBean> items = new ArrayList<ItemBean>();
for(int i=0;i<itemTypes.length;i++)
{
List<ItemBean> itemsForType = client.getItems(numItems, itemTypes[i], true, "last_action");
logger.info("Adding "+itemsForType.size()+" items for item type "+itemTypes[i]);
items.addAll(itemsForType);
}
logger.info("Got "+items.size()+" items from API");
for(ItemBean item : items)
itemMap.put(item.getId(), item);
BufferedReader reader = new BufferedReader(new FileReader(urlFile));
String url;
int count = 0;
while ((url = reader.readLine()) != null)
{
count++;
ItemBean item = itemMap.get(url);
String contentType = null;
if (item == null)
item = new ItemBean(url, "", 1);
else
contentType = item.getAttributesName().get(ItemAttributesImporter.CONTENT_TYPE);
if(item.getType() == ItemAttributesImporter.TYPE_NOT_SET || (contentType == null || ItemAttributesImporter.UNVERIFIED_CONTENT_TYPE.equals(contentType)))
{
total_item_processed_count++;
logger.info("Looking at item "+count);
System.out.println("Item => "+item.toString());
boolean imported = false;
try
{
String category = null;
if (item.getAttributesName() != null)
category = item.getAttributesName().get(ItemAttributesImporter.CATEGORY);
Map<String,String> attributes = getAttributes(item.getId(),category);
if(attributes != null)
{
updates++;
total_item_succeded_count++;
item.setName(attributes.get(ItemAttributesImporter.ATTR_TITLE));
item.setAttributesName(attributes);
item.setType(ItemAttributesImporter.TYPE_VALID);
item.setFirst_action(new Date());
item.setLast_action(new Date());
if (!testmode) {
client.updateItem(item);
} else {
logger.info("TESTMODE skipping update");
}
imported = true;
}
}
catch(Exception e) {
logger.warn("Article:" + item.getId() + " error.",e);
}
String updated_amount_string = String.format("[%d/%d %.0f%%]", total_item_succeded_count,total_item_processed_count, ((((double)total_item_succeded_count/(double)total_item_processed_count))*100));
if(imported) {
logger.info("Article : " + item.getId() + " import - OK "+updated_amount_string);
logger.info("Item : " + item);
}
else {
logger.info("Article : " + item.getId() + " import - NOT OK "+updated_amount_string);
}
}
else {
logger.info("Article : " + item.getId() + " SKIPPED");
}
}
reader.close();
}
catch (ApiException e)
{
logger.error("Failed api call",e);
} catch (IOException e) {
// TODO Auto-generated catch block
logger.error("io exception",e);
}
return updates;
}
public static Map<String, String> getAttributes(String url,String existingCategory) {
ItemProcessResult itemProcessResult = new ItemProcessResult();
itemProcessResult.client_item_id = url;
itemProcessResult.extraction_status = "EXTRACTION_FAILED";
logger.info("Trying to get attributes for "+url);
Map<String,String> attributes = null;
String title="";
String category="";
String subCategory = "";
String img_url="";
String description="";
String tags = "";
String leadtext = "";
String link = "";
String publishDate = "";
String domain = "";
try {
long now = System.currentTimeMillis();
long timeSinceLastRequest = now - lastUrlFetchTime;
if (timeSinceLastRequest < minFetchGapMsecs)
{
long timeToSleep = minFetchGapMsecs - timeSinceLastRequest;
logger.info("Sleeping "+timeToSleep+"msecs as time since last fetch is "+timeSinceLastRequest);
Thread.sleep(timeToSleep);
}
Document articleDoc = Jsoup.connect(url).userAgent("SeldonBot/1.0").timeout(httpGetTimeout).get();
lastUrlFetchTime = System.currentTimeMillis();
//get IMAGE URL
if (StringUtils.isNotBlank(imageCssSelector))
{
Element imageElement = articleDoc.select(imageCssSelector).first();
if (imageElement != null)
{
if (imageElement.attr("content") != null) {
img_url = imageElement.attr("content");
}
if (StringUtils.isBlank(img_url) && imageElement.attr("src") != null) {
img_url = imageElement.attr("src");
}
if (StringUtils.isBlank(img_url) && imageElement.attr("href") != null) {
img_url = imageElement.attr("href");
}
}
}
if (StringUtils.isBlank(img_url) && StringUtils.isNotBlank(defImageUrl))
{
logger.info("Setting image to default: "+defImageUrl);
img_url = defImageUrl;
}
img_url = StringUtils.strip(img_url);
//get TITLE
if (StringUtils.isNotBlank(titleCssSelector))
{
Element titleElement = articleDoc.select(titleCssSelector).first();
if (titleElement != null && titleElement.attr("content") != null) {
title = titleElement.attr("content");
}
}
//get Lead Text
if (StringUtils.isNotBlank(leadTextCssSelector))
{
Element leadElement = articleDoc.select(leadTextCssSelector).first();
if (leadElement != null && leadElement.attr("content") != null) {
leadtext = leadElement.attr("content");
}
}
//get publish date
if (StringUtils.isNotBlank(publishDateCssSelector))
{
//2013-01-21T10:40:55Z
Element pubElement = articleDoc.select(publishDateCssSelector).first();
if (pubElement != null && pubElement.attr("content") != null) {
String pubtext = pubElement.attr("content");
SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
Date result = null;
try{
result = df.parse(pubtext);
}
catch (ParseException e)
{
logger.info("Failed to parse date withUTC format "+pubtext);
}
//try a simpler format
df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);
try{
result = df.parse(pubtext);
}
catch (ParseException e)
{
logger.info("Failed to parse date "+pubtext);
}
if (result != null)
publishDate = dateFormatter.format(result);
else
logger.error("Failed to parse date "+pubtext);
}
}
//get Link
if (StringUtils.isNotBlank(linkCssSelector))
{
Element linkElement = articleDoc.select(linkCssSelector).first();
if (linkElement != null && linkElement.attr("content") != null) {
link = linkElement.attr("content");
}
}
//get CONTENT
if (StringUtils.isNotBlank(textCssSelector))
{
Element descriptionElement = articleDoc.select(textCssSelector).first();
if (descriptionElement != null)
description = Jsoup.parse(descriptionElement.html()).text();
}
//get TAGS
Set<String> tagSet = AttributesImporterUtils.getTags(articleDoc, tagsCssSelector, title);
if (tagSet.size() > 0)
tags = CollectionTools.join(tagSet, ",");
//get CATEGORY - client specific
if (StringUtils.isNotBlank(categoryCssSelector))
{
Element categoryElement = articleDoc.select(categoryCssSelector).first();
if (categoryElement != null && categoryElement.attr("content") != null) {
category = categoryElement.attr("content");
if (StringUtils.isNotBlank(category))
category = category.toUpperCase();
}
}
else if (StringUtils.isNotBlank(categoryClassPrefix))
{
String className = "io.seldon.importer.articles.category."+categoryClassPrefix+"CategoryExtractor";
Class<?> clazz = Class.forName(className);
Constructor<?> ctor = clazz.getConstructor();
CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance();
category = extractor.getCategory(url, articleDoc);
}
//get Sub CATEGORY - client specific
if (StringUtils.isNotBlank(subCategoryCssSelector))
{
Element subCategoryElement = articleDoc.select(subCategoryCssSelector).first();
if (subCategoryElement != null && subCategoryElement.attr("content") != null) {
subCategory = subCategoryElement.attr("content");
if (StringUtils.isNotBlank(subCategory))
subCategory = category.toUpperCase();
}
}
else if (StringUtils.isNotBlank(subCategoryClassPrefix))
{
String className = "io.seldon.importer.articles.category."+subCategoryClassPrefix+"SubCategoryExtractor";
Class<?> clazz = Class.forName(className);
Constructor<?> ctor = clazz.getConstructor();
CategoryExtractor extractor = (CategoryExtractor) ctor.newInstance();
subCategory = extractor.getCategory(url, articleDoc);
}
// Get domain
if (domainIsNeeded) {
domain = getDomain(url);
}
if(StringUtils.isNotBlank(title)
&& (imageNotNeeded || StringUtils.isNotBlank(img_url))
&& (categoryNotNeeded || StringUtils.isNotBlank(category))
&& (!domainIsNeeded || StringUtils.isNotBlank(domain))
)
{
attributes = new HashMap<String,String>();
attributes.put(TITLE,title);
if (StringUtils.isNotBlank(category))
attributes.put(CATEGORY, category);
if (StringUtils.isNotBlank(subCategory))
attributes.put(SUBCATEGORY, subCategory);
if (StringUtils.isNotBlank(link))
attributes.put(LINK, link);
if (StringUtils.isNotBlank(leadtext))
attributes.put(LEAD_TEXT, leadtext);
if (StringUtils.isNotBlank(img_url))
attributes.put(IMG_URL, img_url);
if (StringUtils.isNotBlank(tags))
attributes.put(TAGS, tags);
attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE);
if (StringUtils.isNotBlank(description))
attributes.put(DESCRIPTION,description);
if (StringUtils.isNotBlank(publishDate))
attributes.put(PUBLISH_DATE, publishDate);
if (StringUtils.isNotBlank(domain))
attributes.put(DOMAIN, domain);
System.out.println("Item: "+url+"; Category: "+category);
itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED";
}
else {
logger.warn("Failed to get title for article "+url);
logger.warn("[title="+title+", img_url="+img_url+", category="+category+", domain="+domain+"]");
}
{ // check for failures for the log result
if (StringUtils.isBlank(title)) {
itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list))?"":",")+ "title";
}
if (!imageNotNeeded && StringUtils.isBlank(img_url)) {
itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list))?"":",")+ "img_url";
}
if (!categoryNotNeeded && StringUtils.isBlank(category)) {
itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list))?"":",")+ "category";
}
}
}
catch(Exception e) {
logger.warn("Article: " + url + ". Attributes import FAILED",e);
itemProcessResult.error = e.toString();
}
AttributesImporterUtils.logResult(logger, itemProcessResult);
return attributes;
}
/**
* @param args
* @throws InterruptedException
* @throws FileNotFoundException
*/
public static void main(String[] args) throws InterruptedException, FileNotFoundException {
FailFast failFast = new FailFast(Thread.currentThread());
{ // Fail Fast thread
Thread fail_fast_thread = new Thread(failFast);
fail_fast_thread.setName("fail_fast_thread");
fail_fast_thread.start();
}
try
{
Args.parse(FileItemAttributesImporter.class, args);
DefaultApiClient client = new DefaultApiClient(apiUrl,consumerKey,consumerSecret,API_TIMEOUT);
FileItemAttributesImporter fixer = new FileItemAttributesImporter(client);
fixer.setFailFast(failFast);
fixer.run();
}
catch (IllegalArgumentException e)
{
e.printStackTrace();
Args.usage(FileItemAttributesImporter.class);
}
}
/**
*
* @param url The domain to extract the domain from.
* @return The domain or UNKOWN_DOMAN if unable to use url.
*/
private static String getDomain(String url) {
String retVal = "UNKOWN_DOMAN";
if (!url.startsWith("http") && !url.startsWith("https")) {
url = "http://" + url;
}
URL netUrl = null;
try {
netUrl = new URL(url);
} catch (MalformedURLException e) {
logger.warn("Failed to get domain for "+ url);
}
if (netUrl != null) {
String host = netUrl.getHost();
retVal = host;
}
return retVal;
}
private void setFailFast(FailFast failFast) {
this.failFast = failFast;
}
}