/* Copyright (C) 2003-2011 Know Gate S.L. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. The end-user documentation included with the redistribution, if any, must include the following acknowledgment: "This product includes software parts from hipergate (http://www.hipergate.org/)." Alternately, this acknowledgment may appear in the software itself, if and wherever such third-party acknowledgments normally appear. 3. The name hipergate must not be used to endorse or promote products derived from this software without prior written permission. Products derived from this software may not be called hipergate, nor may hipergate appear in their name, without prior written permission. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. You should have received a copy of hipergate License with this code; if not, visit http://www.hipergate.org or mail to info@hipergate.org */ package com.knowgate.syndication.fetcher; import java.io.File; import java.util.Map; import java.util.ArrayList; import java.util.Properties; import java.util.HashMap; import com.knowgate.dfs.FileSystem; import com.knowgate.misc.Gadgets; import com.knowgate.dataobjs.DB; import com.knowgate.debug.DebugFile; import com.knowgate.storage.DataSource; import com.knowgate.storage.Engine; import com.knowgate.syndication.FeedEntry; import com.knowgate.syndication.fetcher.BDBFeedInfoCache; import com.sun.syndication.fetcher.impl.FeedFetcherCache; /** * A batch of parallel web searches */ public class EntriesBatch { private FeedFetcherCache oCache; private String sDir; // Path of directory for caching RSS feeds private Properties oPrp; // Environment properties private DataSource oDts; // DataSource where indexes will be stored // All entries retrieved from the fetcher threads are written here // if two fetchers find the same result only the first one is taken // into account and the second one is ignored. private HashMap<String,FeedEntry> mEntries; // Array of fetcher threads private ArrayList<AbstractEntriesFetcher> aFetchers; public EntriesBatch(DataSource oDataSrc, Properties oEnvProps) { oPrp = oEnvProps; oDts = oDataSrc; if (oPrp.getProperty("storage")==null) { sDir = null; } else { sDir = Gadgets.chomp(oPrp.getProperty("storage"),File.separator)+"syndication"; if (!new File(sDir).exists()) { try { new FileSystem().mkdirs("file://"+sDir); } catch (Exception xcpt) { if (DebugFile.trace) DebugFile.writeln("Could not create directory "+sDir+" "+xcpt.getClass().getName()+" "+xcpt.getMessage()); } } } oCache = null; mEntries = new HashMap<String,FeedEntry>(500); aFetchers = new ArrayList<AbstractEntriesFetcher>(10); } public void close() { if (oCache instanceof BDBFeedInfoCache) ((BDBFeedInfoCache) oCache).close(); oCache=null; mEntries.clear(); mEntries=null; aFetchers.clear(); aFetchers=null; } public Properties properties() { return oPrp; } /** * Register a fetcher thread into this batch * @param oFetcher AbstractEntriesFetcher */ public void registerFetcher(AbstractEntriesFetcher oFetcher) { aFetchers.add(oFetcher); } /** * Register a set of fetcher threads into this batch * @param vFetcher Variable number of AbstractEntriesFetcher */ public void registerFetchers(AbstractEntriesFetcher... vFetchers) { for (AbstractEntriesFetcher oFetcher : vFetchers) registerFetcher(oFetcher); } /** * Get array of fetcher threads * @return ArrayList<AbstractEntriesFetcher> */ public ArrayList<AbstractEntriesFetcher> fetchers() { return aFetchers; } /** * Check whther or not the list of URIs found by * fetcher threads contains a given URI * @return <b>true</b> if any fetcher thread as already found the given URI */ public boolean contains(String sUri) { boolean bContains; if (sUri==null) { bContains = false; } else { if (sUri.startsWith("http://") || sUri.startsWith("https://")) { bContains = mEntries.containsKey(sUri); if (!bContains) { if (sUri.endsWith("/")) bContains = mEntries.containsKey(sUri.substring(0, sUri.length()-2)); else bContains = mEntries.containsKey(sUri+"/"); } } else { bContains = mEntries.containsKey(sUri); } } // fi return bContains; } // contains public FeedFetcherCache getFeedsCache() { if (null==oCache && sDir!=null) { Engine oEng = oDts.getEngine(); if (oEng.equals(Engine.BERKELYDB)) oCache = new BDBFeedInfoCache(sDir); else if (oEng.equals(Engine.JDBCRDBMS)) oCache = new DBFeedInfoCache(oDts); } return oCache; } /** * Get DataSource where indexes are written */ public DataSource getDataSource() { return oDts; } /** * Add a entry to the common list of them shared by all fetcher threads * @param sUri String entry unique identifier * @param oEntry FeedEntry */ public void addEntry(String sUri, FeedEntry oEntry) { mEntries.put(sUri, oEntry); } /** * Get entry by its URI * @param URI String * @return FeedEntry */ public FeedEntry getEntry(String sUri) { FeedEntry oFentry = mEntries.get(sUri); if (oFentry==null) { if (sUri.endsWith("/")) oFentry = mEntries.get(sUri.substring(0,sUri.length()-2)); else oFentry = mEntries.get(sUri+"/"); } return oFentry; } // getEntry /** * Get entries already retrieved by fetcher threads * @return ConcurrentHashMap<String,FeedEntry> */ public Map<String,FeedEntry> entries() { return mEntries; } /** * Execute all registered fetcher threads * Wait until all of them have finished before returning */ public void mapReduce() { if (DebugFile.trace) { DebugFile.writeln("Begin EntriesBatch.mapReduce()"); DebugFile.incIdent(); } mEntries.clear(); // ********* // Map stage for (AbstractEntriesFetcher f : aFetchers) { f.start(); } // next if (DebugFile.trace) { DebugFile.writeln(String.valueOf(aFetchers.size())+" fetcher threads started"); } for (int t=0; t<aFetchers.size(); t++) { try { aFetchers.get(t).join(); } catch (InterruptedException e) { if (DebugFile.trace) DebugFile.writeln("join("+String.valueOf(t)+") interrupted"); } } // next if (oCache instanceof BDBFeedInfoCache) ((BDBFeedInfoCache) oCache).close(); oCache=null; if (DebugFile.trace) { int nMapping = 0; for (AbstractEntriesFetcher f : aFetchers) { nMapping += f.entries().size(); } // next DebugFile.writeln(String.valueOf(nMapping)+" URLs found"); } // ************ // Reduce stage for (AbstractEntriesFetcher f : aFetchers) { for (FeedEntry e : f.entries()) { String sUrl = e.getURL(); if (sUrl.length()>0) { if (!mEntries.containsKey(sUrl)) mEntries.put(e.getURL(), e); } else { if (DebugFile.trace) { DebugFile.writeln("EntriesBatch.mapReduce() NullPointerException URL of "+f.getSourceType()+" entry "+e.getString(DB.tl_entry,"")+" is null"); } } } } // next if (DebugFile.trace) { DebugFile.decIdent(); DebugFile.writeln("End EntriesBatch.mapReduce() : "+String.valueOf(mEntries.size())); } } // mapReduce }