/* * Seldon -- open source prediction engine * ======================================= * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/) * ********************************************************************************************** * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ********************************************************************************************** */ package io.seldon.importer.articles; import io.seldon.client.DefaultApiClient; import io.seldon.client.beans.ItemBean; import io.seldon.client.exception.ApiException; import io.seldon.importer.articles.dynamicextractors.AttributeDetail; import io.seldon.importer.articles.dynamicextractors.AttributeDetailList; import io.seldon.importer.articles.dynamicextractors.DynamicExtractor; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.Date; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Appender; import org.apache.log4j.FileAppender; import org.apache.log4j.Logger; import org.codehaus.jackson.map.ObjectMapper; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import com.sampullara.cli.Args; import com.sampullara.cli.Argument; public class GeneralItemAttributesImporter { private static Logger logger = Logger.getLogger(GeneralItemAttributesImporter.class.getName()); private static enum OperationMode { OPERATION_MODE_ITEM_IMPORTER, OPERATION_MODE_FILE_IMPORTER } private static OperationMode opMode = OperationMode.OPERATION_MODE_ITEM_IMPORTER; // set a default mode @Argument(alias = "n", description = "How many items to import", required = false) static Integer numItems = 500; @Argument(alias = "i", description = "Interval in secs between runs", required = false) static Integer intervalSecs = 600; @Argument(alias = "gt", description = "Timeout on article http GET", required = false) static Integer httpGetTimeout = 10000; @Argument(alias = "minFetch", description = "Min time between url requests in msecs", required = false) static Integer minFetchGapMsecs = 500; @Argument(alias = "api-url", description = "API Endpoint", required = false) static String apiUrl; @Argument(alias = "consumer-key", description = "Consumer Key", required = false) static String consumerKey; @Argument(alias = "consumer-secret", description = "Consumer Secret", required = false) static String consumerSecret; @Argument(alias = "cf", description = "Client Id Filter - only process client ids starting with this string", required = false) static String clientIdFilter; @Argument(alias = "urls", description = "File containing list of URLs", required = false) static String urlFile = null; @Argument(alias = "banner-id-name", description = "This will modify the client id to 'banner-XXXX", required = false) static String bannerIdName = null; @Argument(alias = "banner-valid-type", description = "The type(int) to use then an item is valid", required = false) static String bannerValidType = null; @Argument(alias = "banner-invalid-type", description = "The type(int) to use then an item is invalid", required = false) static String bannerInValidType = null; @Argument(alias = "banner-item-log-file", description = "Log imported banner items (id and baseurl) to file", required = false) static String bannerItemLogFile; @Argument(alias = "add-base-url-attribute", description = "Add an additional attribute for the baseurl", required = false) static boolean addBaseUrlAttribute = false; @Argument(alias = "content-type-override", description = "Override the default content_type", required = false) static String contentTypeOverride = null; @Argument(alias = "invalid-content-type-override", description = "Override the default content_type", required = false) static String invalidContentTypeOverride = null; @Argument(alias = "it", description = "Item type", required = false, delimiter = ",") static Integer[] itemTypes = new Integer[] {0,1}; @Argument(alias = "t", description = "For testing, will not update", required = false) static boolean testmode = false; @Argument(alias = "j", description = "Is javascript in page supported", required = false) static boolean jsSupport = false; @Argument(alias = "attributes_config_file", description = "Attributes details", required = false) static String attributesConfigFile; @Argument(alias = "test_url", description = "Check url only", required = false) static String testUrl = null; static int API_TIMEOUT = 10000; static String ATTR_IMG_NAME = "img_url"; static final String ATTR_CATEGORIES = "categories"; static final String ATTR_LINK = "link"; static final String ATTR_TITLE = "title"; static final String CONTENT_TYPE_ARTICLE_VALID = "article"; static final String CONTENT_TYPE_ARTICLE_INVALID = "old_article"; public static final int TYPE_NOT_VALID = 2; public static final int TYPE_NOT_SET = 0; public static final int TYPE_VALID = 1; static final String UNVERIFIED_CONTENT_TYPE = "unverified_article"; static final String VERIFIED_CONTENT_TYPE = "article"; //field public static String CONTENT_TYPE = "content_type"; static String CATEGORY = "category"; private int total_item_processed_count = 0; private int total_item_succeded_count = 0; private FailFast failFast = null; DefaultApiClient client; static long lastUrlFetchTime = 0; private static List<AttributeDetail> attribute_detail_list = null; public GeneralItemAttributesImporter(DefaultApiClient client) { this.client = client; } public void run(UrlFetcher urlFetcher) throws InterruptedException { boolean keepGoing = true; logger.info("Starting..."); logger.info("opMode: "+opMode.toString()); while(keepGoing) { logger.info("Processing recent urls..."); int updates = 0; if (opMode == OperationMode.OPERATION_MODE_ITEM_IMPORTER ) { updates = process_as_item_importer(urlFetcher); } else { updates = process_as_file_importer(urlFetcher); } logger.info("Processed with "+updates+" updates"); if (opMode == OperationMode.OPERATION_MODE_FILE_IMPORTER) { keepGoing = false; logger.info("Processed urls...Finished"); if (failFast != null) { failFast.stopChecking(); // We are exiting normally so no need to check the main thread is going to die } } else { logger.info("Processed urls..sleeping..."); Thread.sleep(intervalSecs * 1000); } } { // If we get here then we should terminate. Force the termination as sometimes when as file importer it hangs due to open connections logger.info("End of processing so terminating process."); { // Flush the file log appenders Enumeration<?> appenders = logger.getAllAppenders(); while (appenders.hasMoreElements()) { Appender appender = (Appender)appenders.nextElement(); if (appender instanceof FileAppender) { FileAppender fileAppender = (FileAppender)appender; fileAppender.setImmediateFlush(true); } } } System.exit(0); // We've finished processing } } public static String getUrlEncodedString(String input) { URL url = null; try { url = new URL(input); URI uri = new URI( url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null); String encoded = uri.toASCIIString(); return encoded; } catch(MalformedURLException mue) { logger.error("Malformed url "+input); return null; } catch (URISyntaxException e) { logger.error("Failed to tranform url into uri ",e); return null; } } public int process_as_item_importer(UrlFetcher urlFetcher) { int updates = 0; try { int count = 0; int foundItems = 0; List<ItemBean> items = new ArrayList<ItemBean>(); for(int i=0;i<itemTypes.length;i++) { List<ItemBean> itemsForType = client.getItems(numItems, itemTypes[i], true, "last_action"); logger.info("Adding "+itemsForType.size()+" items for item type "+itemTypes[i]); items.addAll(itemsForType); } foundItems = items.size(); for(ItemBean item : items) { count++; String contentType = item.getAttributesName().get(CONTENT_TYPE); if(item.getType() == TYPE_NOT_SET || (contentType == null || UNVERIFIED_CONTENT_TYPE.equals(contentType))) { total_item_processed_count++; logger.info("Looking at item "+count+"/"+foundItems); System.out.println("Item => "+item.toString()); boolean imported = false; if (clientIdFilter != null && !item.getId().startsWith(clientIdFilter)) { logger.info("Skipping as does not match client id filter "+item.getId()); continue; } else if (item.getId().startsWith("file://")) { logger.warn("Ignoring bad url: "+item.getId()); continue; } try { Map<String,String> attributes = getAttributes(urlFetcher, item.getId(),item.getAttributesName().get(CATEGORY)); if(attributes != null) { updates++; total_item_succeded_count++; item.setName(attributes.get(GeneralItemAttributesImporter.ATTR_TITLE)); item.setAttributesName(attributes); item.setType(TYPE_VALID); mofidyItemId(item); // check the -bannerIdName flag if (!testmode) { client.updateItem(item); } else { logger.info("TESTMODE skipping update"); logger.info("Item Details: " + item); } imported = true; } } catch(Exception e) { logger.warn("Article:" + item.getId() + " error.",e); } String updated_amount_string = String.format("[%d/%d %.0f%%]", total_item_succeded_count,total_item_processed_count, ((((double)total_item_succeded_count/(double)total_item_processed_count))*100)); if(imported) { logger.info("Article : " + item.getId() + " import - OK "+updated_amount_string); logger.info("Item : " + item); } else { logger.info("Article : " + item.getId() + " import - NOT OK "+updated_amount_string); } } else { logger.info("Article : " + item.getId() + " SKIPPED"); } } } catch (ApiException e) { logger.error("Failed api call",e); } return updates; } public int process_as_file_importer(UrlFetcher urlFetcher) { int updates = 0; try { BufferedReader reader = new BufferedReader(new FileReader(urlFile)); String url; int count = 0; while ((url = reader.readLine()) != null) { count++; ItemBean item = new ItemBean(url, "", 1); total_item_processed_count++; logger.info("Looking at item "+count); System.out.println("Item => "+item.toString()); boolean imported = false; try { String category = null; if (item.getAttributesName() != null) category = item.getAttributesName().get(GeneralItemAttributesImporter.CATEGORY); Map<String,String> attributes = getAttributes(urlFetcher,item.getId(),category); if(attributes != null) { updates++; total_item_succeded_count++; item.setName(attributes.get(GeneralItemAttributesImporter.ATTR_TITLE)); item.setAttributesName(attributes); item.setType(GeneralItemAttributesImporter.TYPE_VALID); item.setFirst_action(new Date()); item.setLast_action(new Date()); mofidyItemId(item); // check the -bannerIdName flag invalidateUsingBannerItemLogFile(item); // check the -bannerItemLogFile flag if (!testmode) { client.updateItem(item); } else { logger.info("TESTMODE skipping update"); logger.info("Item Details: " + item); } imported = true; } else { // item failed import but still need to invalidate if banner item if (addBaseUrlAttribute) { Map<String,String> attributes_override = new HashMap<String,String>(); String baseUrl = AttributesImporterUtils.getBaseUrl(url); attributes_override.put("baseurl", baseUrl); item.setAttributesName(attributes_override); invalidateUsingBannerItemLogFile(item); // check the -bannerItemLogFile flag } } } catch(Exception e) { logger.warn("Article:" + item.getId() + " error.",e); } String updated_amount_string = String.format("[%d/%d %.0f%%]", total_item_succeded_count,total_item_processed_count, ((((double)total_item_succeded_count/(double)total_item_processed_count))*100)); if(imported) { logger.info("Article : " + item.getId() + " import - OK "+updated_amount_string); logger.info("Item : " + item); } else { logger.info("Article : " + item.getId() + " import - NOT OK "+updated_amount_string); } } reader.close(); } catch (IOException e) { // TODO Auto-generated catch block logger.error("io exception",e); } return updates; } public static Map<String, String> getAttributes(UrlFetcher urlFetcher, String url,String existingCategory) { ItemProcessResult itemProcessResult = new ItemProcessResult(); itemProcessResult.client_item_id = url; itemProcessResult.extraction_status = "EXTRACTION_FAILED"; logger.info("Trying to get attributes for "+url); Map<String,String> attributes = null; try { long now = System.currentTimeMillis(); long timeSinceLastRequest = now - lastUrlFetchTime; if (timeSinceLastRequest < minFetchGapMsecs) { long timeToSleep = minFetchGapMsecs - timeSinceLastRequest; logger.info("Sleeping "+timeToSleep+"msecs as time since last fetch is "+timeSinceLastRequest); Thread.sleep(timeToSleep); } String page = urlFetcher.getUrl(url); // fetch the article page Document articleDoc = Jsoup.parse(page); lastUrlFetchTime = System.currentTimeMillis(); attributes = new HashMap<String,String>(); if (attribute_detail_list!=null) { for (AttributeDetail attributeDetail: attribute_detail_list) { String attrib_name = attributeDetail.name; String attrib_value = null; String attrib_default_value = attributeDetail.default_value; String extractorType = attributeDetail.extractor_type; if (StringUtils.isNotBlank(extractorType)) { DynamicExtractor extractor = DynamicExtractor.build(extractorType); attrib_value = extractor.extract(attributeDetail, url, articleDoc); } attrib_name = (attrib_name!=null) ? StringUtils.strip( attrib_name ) : ""; attrib_value = (attrib_value!=null) ? StringUtils.strip( attrib_value ) : ""; attrib_default_value = (attrib_default_value!=null) ? StringUtils.strip( attrib_default_value ) : ""; // Use the default value if necessary if ( StringUtils.isBlank(attrib_value) && StringUtils.isNotBlank(attrib_default_value)) { attrib_value = attrib_default_value; } if (StringUtils.isNotBlank(attrib_name) && StringUtils.isNotBlank(attrib_value)) { attributes.put(attrib_name, attrib_value); } } // check if the import was ok boolean required_import_ok = true; for (AttributeDetail attributeDetail: attribute_detail_list) { if (attributeDetail.is_required && (!attributes.containsKey(attributeDetail.name))) { required_import_ok = false; itemProcessResult.attrib_failure_list = itemProcessResult.attrib_failure_list + ((StringUtils.isBlank(itemProcessResult.attrib_failure_list))?"":",")+ attributeDetail.name; } } if(required_import_ok) { if (contentTypeOverride == null) { attributes.put(CONTENT_TYPE, VERIFIED_CONTENT_TYPE); } else { attributes.put(CONTENT_TYPE, contentTypeOverride); } if (addBaseUrlAttribute) { String baseUrl = AttributesImporterUtils.getBaseUrl(url); attributes.put("baseurl", baseUrl); } itemProcessResult.extraction_status = "EXTRACTION_SUCCEEDED"; } else { attributes = null; logger.warn("Failed to get needed attributes for article "+url); } } } catch(Exception e) { logger.error("Article: " + url + ". Attributes import FAILED",e); itemProcessResult.error = e.toString(); } AttributesImporterUtils.logResult(logger, itemProcessResult); return attributes; } /** * @param args * @throws InterruptedException * @throws FileNotFoundException */ public static void main(String[] args) throws InterruptedException, FileNotFoundException { FailFast failFast = new FailFast(Thread.currentThread()); { // Fail Fast thread Thread fail_fast_thread = new Thread(failFast); fail_fast_thread.setName("fail_fast_thread"); fail_fast_thread.start(); } try { Args.parse(GeneralItemAttributesImporter.class, args); UrlFetcher urlFetcher = null; { // setup the correct UrlFetcher if (jsSupport) { urlFetcher = new JsSupporedUrlFetcher(httpGetTimeout); } else { urlFetcher = new SimpleUrlFetcher(httpGetTimeout); } } { // Determine opMode by checking for urlFile if (urlFile !=null) { opMode = OperationMode.OPERATION_MODE_FILE_IMPORTER; } } { // setup the attribute_detail_list attribute_detail_list = getAttributeDetailList(attributesConfigFile); } if (testUrl!=null){ test_url_and_exit(urlFetcher, testUrl); } DefaultApiClient client = new DefaultApiClient(apiUrl,consumerKey,consumerSecret,API_TIMEOUT); GeneralItemAttributesImporter fixer = new GeneralItemAttributesImporter(client); fixer.setFailFast(failFast); fixer.run(urlFetcher); } catch (IllegalArgumentException e) { e.printStackTrace(); Args.usage(GeneralItemAttributesImporter.class); } catch (Exception e) { e.printStackTrace(); Args.usage(GeneralItemAttributesImporter.class); } } /** * * @param url The domain to extract the domain from. * @return The domain or UNKOWN_DOMAN if unable to use url. */ private static String getDomain(String url) { String retVal = "UNKOWN_DOMAN"; if (!url.startsWith("http") && !url.startsWith("https")) { url = "http://" + url; } URL netUrl = null; try { netUrl = new URL(url); } catch (MalformedURLException e) { logger.warn("Failed to get domain for "+ url); } if (netUrl != null) { String host = netUrl.getHost(); retVal = host; } return retVal; } private void setFailFast(FailFast failFast) { this.failFast = failFast; } private void mofidyItemId(ItemBean item) throws Exception { if (bannerIdName != null) { if (bannerValidType == null) { throw new Exception("using -bannerIdName but bannerValidType is null"); } String item_id_current = item.getId(); String id_to_use = item.getAttributesName().get( bannerIdName ); String item_id_updated = "banner-"+id_to_use; item.setId( item_id_updated ); item.setType(Integer.parseInt(bannerValidType)); logger.info(String.format("updated item id [%s] -> [%s]", item_id_current, item_id_updated)); } } private void invalidateUsingBannerItemLogFile(ItemBean item) throws Exception { if (bannerItemLogFile != null) { if (bannerInValidType == null) { throw new Exception("using -bannerItemLogFile but bannerInValidType is null"); } final String current_item_id = item.getId(); final String current_item_baseurl = item.getAttributesName().get( "baseurl" ); { // make sure we log the current item if it was modified if (!current_item_id.equals(current_item_baseurl)) { // if the currrent's id and baseurl are the same - means it wasnt imported and it set with an id, so dont write it out PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter( bannerItemLogFile, true))); out.println( current_item_id+"|"+current_item_baseurl); out.close(); } } File f = new File(bannerItemLogFile); if(f.exists() && !f.isDirectory()) { Set<String> itemList = new HashSet<String>(); BufferedReader br = new BufferedReader(new FileReader(bannerItemLogFile)); String line; while ((line = br.readLine()) != null) { line = StringUtils.trim(line); if (line.length()>0) { itemList.add(line); } } br.close(); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter( bannerItemLogFile, false))); for (String loggedItem: itemList) { String logged_item_id = null; String logged_item_baseurl = null; { String[] line_items = StringUtils.split(loggedItem, '|'); if (line_items.length == 2) { logged_item_id = line_items[0]; logged_item_baseurl = line_items[1]; } } if ((logged_item_id!=null) &&(logged_item_baseurl!=null)) { if (logged_item_baseurl.equalsIgnoreCase(current_item_baseurl) && !logged_item_id.equalsIgnoreCase(current_item_id)) { ItemBean itemToInvalidate = new ItemBean(logged_item_id, "", Integer.parseInt(bannerInValidType)); if (invalidContentTypeOverride != null) { Map<String,String> attributes_override = new HashMap<String,String>(); attributes_override.put("content_type", invalidContentTypeOverride); itemToInvalidate.setAttributesName(attributes_override); } logger.info("Invalidating Item: " + itemToInvalidate); if (!testmode) { try { client.updateItem(itemToInvalidate); } catch (ApiException e) { logger.error("ERROR trying to invalidate item ["+itemToInvalidate.getId()+"] :"+e.toString()); out.println( logged_item_id+"|"+logged_item_baseurl); // keep as we have an error } } else { logger.info("TESTMODE skipping Invalidate update"); } } else { out.println( logged_item_id+"|"+logged_item_baseurl); } } } out.close(); } } } public static List<AttributeDetail> getAttributeDetailList(String confFile) throws Exception { List<AttributeDetail> retVal = null; ObjectMapper mapper = new ObjectMapper(); AttributeDetailList attributeDetailList = mapper.readValue(new File(confFile), AttributeDetailList.class); if (attributeDetailList!=null) { retVal = attributeDetailList.attribute_detail_list; } return retVal ; } private static String readFileAsString(String fpath) throws IOException { StringBuilder sb = new StringBuilder(); BufferedReader reader = new BufferedReader(new FileReader(new File(fpath))); char[] buff = new char[1024]; int numRead = 0; while ((numRead=reader.read(buff)) != -1) { String readData = String.valueOf(buff, 0, numRead); sb.append( readData ); } return sb.toString(); } private static void test_url_and_exit(UrlFetcher urlFetcher, String url) { System.out.println("-------- Attempting Extraction --------"); System.out.println("url: "+url); System.out.println("---------------------------------------"); Map<String, String> extracted_attributes = getAttributes(urlFetcher,url, ""); if (extracted_attributes!=null) { Set<String> keys = extracted_attributes.keySet(); for (String key: keys) { String s = String.format("%s: %s", key, extracted_attributes.get(key)); System.out.println(s); } } else { System.out.println("Extraction failed!"); } System.out.println("---------------------------------------"); System.exit(0); } }