/* Copyright (C) 2011 Know Gate S.L. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. The end-user documentation included with the redistribution, if any, must include the following acknowledgment: "This product includes software parts from hipergate (http://www.hipergate.org/)." Alternately, this acknowledgment may appear in the software itself, if and wherever such third-party acknowledgments normally appear. 3. The name hipergate must not be used to endorse or promote products derived from this software without prior written permission. Products derived from this software may not be called hipergate, nor may hipergate appear in their name, without prior written permission. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. You should have received a copy of hipergate License with this code; if not, visit http://www.hipergate.org or mail to info@hipergate.org */ package com.knowgate.syndication.crawler; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.Date; import java.util.Map; import java.util.ArrayList; import java.util.Comparator; import java.util.Properties; import java.util.Collections; import java.util.ListIterator; import java.text.SimpleDateFormat; import org.xml.sax.Attributes; import org.xml.sax.Parser; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.InputSource; import org.xml.sax.helpers.XMLReaderFactory; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.ParserAdapter; import org.xml.sax.helpers.ParserFactory; import com.knowgate.dataobjs.DB; import com.knowgate.debug.DebugFile; import com.knowgate.debug.StackTraceUtil; import com.knowgate.misc.Gadgets; import com.knowgate.misc.NameValuePair; import com.knowgate.storage.Table; import com.knowgate.storage.Record; import com.knowgate.storage.RecordSet; import com.knowgate.storage.DataSource; import com.knowgate.storage.StorageException; import com.knowgate.storage.RecordColumnValueComparatorAsc; import com.knowgate.clocial.UserAccountAlias; import com.knowgate.syndication.FeedEntry; import com.knowgate.syndication.SyndSearch; import com.knowgate.syndication.SyndReferer; import com.knowgate.syndication.SyndSearchRun; import com.knowgate.syndication.fetcher.BingFetcher; import com.knowgate.syndication.fetcher.EntriesBatch; import com.knowgate.syndication.fetcher.MeneameFetcher; import com.knowgate.syndication.fetcher.YahooBossFetcher; import com.knowgate.syndication.fetcher.BitacorasFetcher; import com.knowgate.syndication.fetcher.GenericFeedFetcher; import com.knowgate.syndication.fetcher.TwitterJsonFetcher; import com.knowgate.syndication.fetcher.FacebookJsonFetcher; import org.apache.oro.text.regex.MalformedPatternException; import com.sun.syndication.io.FeedException; import com.sun.syndication.feed.synd.SyndContent; import com.sun.syndication.feed.synd.SyndEntryImpl; import com.sun.syndication.fetcher.FetcherException; public class SearchRunner extends DefaultHandler { private final static int MAX_RECENT = 100; private final static int DEFAULT_SEARCH_REFRESH = 1200; // 20 minutes private final static int FASTEST_SEARCH_REFRESH = 60; // 1 minute private final static int SLOWEST_SEARCH_REFRESH = 1296000; // 15 days private final static int DELAYED_SEARCH_REFRESH = 600; // 10 minutes private final static int NEVER_SEARCH_REFRESH = 2147483647; private EntriesBatch oBatch; private String sQry, sDomain; private String sCurrentTag, sFetcherId, sFetcherUri; private boolean bFetcherIsEnabled; private Properties oEnvProps; private ArrayList<NameValuePair> aFetchers; public SearchRunner(String sTxSought, Properties oProps) { oEnvProps = oProps; aFetchers = new ArrayList<NameValuePair>(20); oBatch = null; init(); setQueryString(sTxSought); } private void init() { if (DebugFile.trace) DebugFile.writeln("Begin SearchRunner.init()"); try { XMLReader oParser; Parser oSax1Parser; try { oParser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); } catch (Exception e) { oSax1Parser = ParserFactory.makeParser("org.apache.xerces.parsers.SAXParser"); oParser = new ParserAdapter(oSax1Parser); } oParser.setContentHandler(this); InputStream oInStm = getClass().getResourceAsStream("SearchRunner.xml"); InputSource oInSrc = new InputSource(oInStm); oParser.parse(oInSrc); oInStm.close(); } catch (Exception e) { try { if (DebugFile.trace) DebugFile.writeln(e.getClass().getName()+" "+e.getMessage()+"\n"+StackTraceUtil.getStackTrace(e)); } catch (IOException ignore) {} } if (DebugFile.trace) DebugFile.writeln("End SearchRunner.init()"); } // public void startElement(String uri, String local, String raw, Attributes attrs) throws SAXException { sCurrentTag = local; bFetcherIsEnabled = true; if (local.equals("fetcher")) { sFetcherId = attrs.getValue("id"); if (attrs.getValue("enabled")!=null) bFetcherIsEnabled = attrs.getValue("enabled").equals("1") || attrs.getValue("enabled").equalsIgnoreCase("true") || attrs.getValue("enabled").equalsIgnoreCase("yes") || attrs.getValue("enabled").equalsIgnoreCase("on"); else bFetcherIsEnabled = true; } else if (local.equals("uri")) { sFetcherUri = ""; } } // startElement public void characters(char[] ch, int start, int length) throws SAXException { if (sCurrentTag.equals("uri")) { sFetcherUri += new String(ch,start,length); } } public void endElement(String uri, String local, String name) throws SAXException { if (bFetcherIsEnabled) aFetchers.add(new NameValuePair(sFetcherId,sFetcherUri)); } public void setQueryString(String sTxSought) { sQry = sTxSought; try { URL oFind = new URL(sQry.startsWith("http://") || sQry.startsWith("https://") ? sQry : "http://"+sQry); if (oFind.getFile().length()==0) { sDomain = oFind.getHost(); String[] aDomain = Gadgets.split(sDomain,'.'); if (aDomain.length>1) sDomain = aDomain[aDomain.length-2] + "." + aDomain[aDomain.length-1]; } else { sDomain = null; } } catch (Exception ignore) { sDomain = null; } } private String setURLParam1(String sUrl, String sParamValue) { String sRetVal = sUrl; try { sRetVal = Gadgets.replace(sUrl,"\\x241",Gadgets.URLEncode(sParamValue)); } catch (MalformedPatternException neverthrown) { } return sRetVal; } public void run(DataSource oDts) throws IOException,StorageException,InstantiationException { if (DebugFile.trace) { DebugFile.writeln("Begin SearchRunner.run("+sQry+")"); DebugFile.incIdent(); } Table oTbl = null; int nNew = 0; RecordColumnValueComparatorAsc oRcvc = new RecordColumnValueComparatorAsc("uri_entry"); long lStartMilis = new Date().getTime(), lEndMilis; oBatch = new EntriesBatch(oDts, oEnvProps); for (NameValuePair oNvp : aFetchers) { if (oNvp.getName().equals("twittersearch")) { oBatch.registerFetcher(new TwitterJsonFetcher (oDts, setURLParam1(oNvp.getValue(), sQry), sQry)); } else if (oNvp.getName().equals("bitacoras")) { oBatch.registerFetcher(new BitacorasFetcher(oDts, setURLParam1(oNvp.getValue(), sDomain==null ? sQry : sDomain), sQry, oBatch.getFeedsCache())); } else if (oNvp.getName().equals("facebookgraph")) { oBatch.registerFetcher(new FacebookJsonFetcher(oDts, setURLParam1(oNvp.getValue(), sQry), sQry)); } else if (oNvp.getName().equals("yahooboss")) { oBatch.registerFetcher(new YahooBossFetcher(oDts, sQry, oEnvProps)); } else if (oNvp.getName().startsWith("meneame")) { oBatch.registerFetcher(new MeneameFetcher(oDts, setURLParam1(oNvp.getValue(), sQry), sQry, oBatch.getFeedsCache())); } else if (oNvp.getName().startsWith("bingsearch")) { oBatch.registerFetcher(new BingFetcher(oDts, sQry, oEnvProps)); } else { oBatch.registerFetcher(new GenericFeedFetcher (oDts, setURLParam1(oNvp.getValue(), sQry), oNvp.getName(), sQry, oBatch.getFeedsCache(), oEnvProps)); } } // next oBatch.mapReduce(); ArrayList<SyndReferer> aReferences = new ArrayList<SyndReferer>(); try { // ********************************************************* // Store a singleton for query string at k_syndsearches table SyndSearch oSs = new SyndSearch(oDts,Gadgets.left(sQry, 254), new Date(), 0, null, 0, 0); oTbl = oDts.openTable(oSs); if (!oTbl.exists(sQry)) oSs.store(oTbl); oTbl.close(); oTbl=null; // ******************************************************** // Get all entries found for current query at previous runs FeedEntry oFe = new FeedEntry(oDts); oTbl = oDts.openTable(oFe); RecordSet oRst = oTbl.fetch("tx_sought", sQry); oTbl.close(); oTbl=null; if (DebugFile.trace) DebugFile.writeln(String.valueOf(oRst.size())+" previous entries exists"); oRst.sort("uri_entry"); Map<String,FeedEntry> oCache = oBatch.entries(); // ********************************************************************* // For each fetched entry check if it was already at k_syndentries table for (Object oOntr : oCache.values()) { FeedEntry oFntr = (FeedEntry) oOntr; int iRec = Collections.binarySearch(oRst, oFntr, (Comparator) oRcvc); if (iRec>=0) { // If this entry was already indexed then get its former primary key oFntr.put("id_syndentry", oRst.get(iRec).get("id_syndentry")); } else { // If this is a completely new entry, increment the new entries counter // and add the reference to the target URL for given query string nNew++; if (!oFntr.isNull(DB.url_domain)) aReferences.add(new SyndReferer(oDts, sQry, oFntr.getString(DB.url_domain))); } } // next (feed) if (DebugFile.trace) DebugFile.writeln(String.valueOf(nNew)+" new entries found"); Date dtNow = new Date(); // ************************************** // Add new entries to k_syndentries table oTbl = oDts.openTable(oFe); for (Object oOntr : oCache.values()) { FeedEntry oFntr = (FeedEntry) oOntr; // Always try to associate a user account to each feed entry if (oFntr.isEmpty("nm_service") || oFntr.isEmpty("nm_alias") || Gadgets.search (new String[] {"admin","anonymous","anonimo","editorial"}, oFntr.getString("nm_alias","").toLowerCase())>=0) { oFntr.remove("id_acalias"); oFntr.remove("nm_alias"); } if (oFntr.isEmpty(DB.gu_account) && !oFntr.isEmpty("nm_service") && !oFntr.isEmpty("nm_alias")) { String sAccId = UserAccountAlias.getUserAccountId(oDts, oFntr.getString("nm_service"), oFntr.getString("nm_alias")); oFntr.put(DB.gu_account, sAccId); } oFntr.store(oTbl); } // next oRst = oTbl.fetch("tx_sought_by_date", sQry+"%"); oTbl.close(); oTbl=null; oBatch.close(); oBatch=null; final int nResults = oRst.size(); if (DebugFile.trace) DebugFile.writeln(String.valueOf(nResults)+" total entries after update"); // ************************************ // Add referers to k_syndreferers table oTbl = oDts.openTable(new SyndReferer(oDts,null,null)); for (SyndReferer r : aReferences) { Record s = oTbl.load(r.getString(DB.id_syndref)); if (null!=s) r.put(DB.nu_entries, s.getInt(DB.nu_entries)+1); r.store(oTbl); } // next oTbl.close(); oTbl=null; // ***************************************************** // Update total results and times run for query string & // kept latests result formatted into an cached XML CLOB SyndSearch oSyS = new SyndSearch(oDts, sQry, dtNow, 1, null, 0, nResults); oTbl = oDts.openTable(oSyS); Record oRyS = oTbl.load(sQry); int nReRunAfter; if (oRyS==null) { nReRunAfter = DEFAULT_SEARCH_REFRESH; } else { oSyS.put("nu_runs", oRyS.getInt("nu_runs")+1); if (oRyS.isNull("nu_rerun_after_secs")) nReRunAfter = DEFAULT_SEARCH_REFRESH; else nReRunAfter = oRyS.getInt("nu_rerun_after_secs"); } // fi if (nReRunAfter==NEVER_SEARCH_REFRESH) { oSyS.remove("dt_next_run"); } else { if (nNew==0) { if (nReRunAfter<SLOWEST_SEARCH_REFRESH) nReRunAfter += DELAYED_SEARCH_REFRESH; } else { if (nReRunAfter>FASTEST_SEARCH_REFRESH) nReRunAfter /= 2; if (nReRunAfter>FASTEST_SEARCH_REFRESH) nReRunAfter = FASTEST_SEARCH_REFRESH; } // fi oSyS.put("dt_next_run", new Date(dtNow.getTime()+(long) nReRunAfter)); } // fi oSyS.put("dt_last_run", dtNow); oSyS.put("nu_rerun_after_secs", nReRunAfter); oSyS.put("nu_results", nResults); oSyS.put("xml_recent", recordSetToXML(oRst, oEnvProps.getProperty("shortdate", "yyyy-MM-dd"), MAX_RECENT, 0)); oSyS.store(oTbl); oTbl.close(); oTbl=null; lEndMilis = new Date().getTime(); if (DebugFile.trace) DebugFile.writeln("Batch took "+String.valueOf(lEndMilis-lStartMilis)+" to execute"); // ****************************** // Write an audit log of this run SyndSearchRun oRun = new SyndSearchRun(oDts, sQry, dtNow, (int) (lEndMilis-lStartMilis),nNew); oTbl = oDts.openTable(oRun); oRun.store(oTbl); oTbl.close(); } catch (Exception xcpt) { if (DebugFile.trace) { DebugFile.writeln(xcpt.getClass().getName()+" "+xcpt.getMessage()); DebugFile.writeln(StackTraceUtil.getStackTrace(xcpt)); } if (oTbl!=null) { try { oTbl.close(); } catch (Exception ignore) { } throw new StorageException(xcpt.getMessage(), xcpt); } } if (DebugFile.trace) { DebugFile.decIdent(); DebugFile.writeln("End SearchRunner.run()"); } } // run private static String hlink(final String sIn) { String sHIn = sIn; try { String sUrl = Gadgets.getFirstMatchSubStr(sIn, "(http|https)://[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?"); String sHref = Gadgets.getFirstMatchSubStr(sIn, "(href|HREF)\\s*=\\s*(\"|')(http|https)://[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?(\"|')"); if (sUrl!=null && sHref==null) { sHIn = Gadgets.replace(sIn, Gadgets.escapeChars(sUrl, "*?()[]-+\\",'\\'), "<a href=\""+sUrl+"\"></a>"); } String sTwit = Gadgets.getFirstMatchSubStr(sHIn, "( |RT)@\\w+( |:)"); if (sTwit!=null) { sHIn = Gadgets.replace(sHIn, sTwit, "<a href=\"http://twitter.com/"+sTwit.trim()+"\"></a>"); } } catch (Exception xcpt) { if (DebugFile.trace) DebugFile.writeln(xcpt.getClass().getName()+" at hlink("+sIn+")"+xcpt.getMessage()); } return sHIn; } public static String recordToXML(Record r, SimpleDateFormat oFmt) { StringBuffer oBuffer = new StringBuffer(8000); try { oBuffer.append("<syndentry id=\""+r.get("id_syndentry")+"\">"); oBuffer.append("<uri_entry>"); oBuffer.append(Gadgets.replace(Gadgets.XMLEncode(r.getString("uri_entry")),"\"","%22").replace('\n',' ')); oBuffer.append("</uri_entry>"); oBuffer.append("<id_type>"); oBuffer.append(r.getString("id_type","")); oBuffer.append("</id_type>"); oBuffer.append("<id_country>"); String sCountryId = r.getString("id_country",""); oBuffer.append(sCountryId.equals("xx") ? "" : sCountryId); oBuffer.append("</id_country>"); oBuffer.append("<dt_run>"); if (!r.isNull("dt_run")) oBuffer.append(oFmt.format(r.getDate("dt_run")).replace(' ','T')); oBuffer.append("</dt_run>"); oBuffer.append("<dt_published>"); if (!r.isNull("dt_published")) oBuffer.append(oFmt.format(r.getDate("dt_published")).replace(' ','T')); oBuffer.append("</dt_published>"); oBuffer.append("<dt_modified>"); if (!r.isNull("dt_modified")) oBuffer.append(oFmt.format(r.getDate("dt_modified")).replace(' ','T')); oBuffer.append("</dt_modified>"); oBuffer.append("<gu_contact>"); oBuffer.append(r.getString("gu_contact","")); oBuffer.append("</gu_contact>"); oBuffer.append("<nm_author><![CDATA["); if (Gadgets.hasXssSignature(r.getString("nm_author",""))) oBuffer.append(Gadgets.XMLEncode(Gadgets.HTMLDencode(r.getString("nm_author",""))).replace('\n',' ')); else oBuffer.append(Gadgets.HTMLDencode(r.getString("nm_author","")).replace('\n',' ')); oBuffer.append("]]></nm_author>"); oBuffer.append("<url_author>"); oBuffer.append(Gadgets.replace(Gadgets.XMLEncode(r.getString("url_author","")),"\"","%22").replace('\n',' ')); oBuffer.append("</url_author>"); oBuffer.append("<nu_influence>"); if (!r.isNull("nu_influence")) oBuffer.append(r.getInteger("nu_influence").toString()); oBuffer.append("</nu_influence>"); oBuffer.append("<nu_relevance>"); if (!r.isNull("nu_relevance")) oBuffer.append(r.get("nu_relevance").toString()); oBuffer.append("</nu_relevance>"); oBuffer.append("<tl_entry><![CDATA["); if (Gadgets.hasXssSignature(r.getString("tl_entry",""))) oBuffer.append(Gadgets.XMLEncode(Gadgets.HTMLDencode(r.getString("tl_entry",""))).replace('\n',' ')); else oBuffer.append(Gadgets.HTMLDencode(hlink(r.getString("tl_entry","")).replace('\n',' '))); oBuffer.append("]]></tl_entry>"); oBuffer.append("<de_entry><![CDATA["); if (Gadgets.hasXssSignature(r.getString("de_entry",""))) oBuffer.append(Gadgets.XMLEncode(Gadgets.HTMLDencode(r.getString("de_entry",""))).replace('\n',' ')); else oBuffer.append(Gadgets.HTMLDencode(hlink(r.getString("de_entry","")).replace('\n',' '))); oBuffer.append("]]></de_entry>"); oBuffer.append("<url_addr>"); oBuffer.append(Gadgets.replace(Gadgets.XMLEncode(r.getString("url_addr","")),"\"","%22").replace('\n',' ')); oBuffer.append("</url_addr>"); oBuffer.append("<url_domain>"); oBuffer.append(Gadgets.replace(Gadgets.XMLEncode(r.getString("url_domain","")),"\"","%22").replace('\n',' ')); oBuffer.append("</url_domain>"); oBuffer.append("<tx_content><![CDATA["); try { SyndEntryImpl oEntry = (SyndEntryImpl) r.get(DB.bin_entry); if (oEntry.getContents().size()>0) oBuffer.append(((SyndContent)oEntry.getContents().get(0)).getValue()); } catch (Exception xcpt) { if (DebugFile.trace) DebugFile.writeln(xcpt.getClass().getName()+" "+xcpt.getMessage()+" at entry "+r.get("id_syndentry")); } oBuffer.append("]]></tx_content>"); oBuffer.append("</syndentry>"); } catch (org.apache.oro.text.regex.MalformedPatternException neverthrown) { } return oBuffer.toString(); } // recordToXML public static String recordSetToXML(RecordSet oRst, String sDateFormat, int iMaxResults, int iOffset) throws StorageException,InstantiationException,FeedException,FetcherException,IOException { if (DebugFile.trace) { DebugFile.writeln("Begin SearchRunner.recordSetToXML([RecordSet], "+sDateFormat+", "+String.valueOf(iMaxResults)+", "+String.valueOf(iOffset)+")"); DebugFile.incIdent(); } int iWritten,iSkipped; StringBuffer oBuffer = new StringBuffer(64000); SimpleDateFormat oFmt = new SimpleDateFormat(sDateFormat==null ? "yyyy-MM-dd" : sDateFormat); oBuffer.append("<syndentries count=\""+String.valueOf(oRst.size()<iMaxResults ? oRst.size() : iMaxResults)+"\">"); ListIterator<Record> oIter = oRst.listIterator(oRst.size()); for (iSkipped=0; oIter.hasPrevious() && iSkipped<iOffset; iSkipped++) oIter.previous(); for (iWritten=0; oIter.hasPrevious() && iWritten<iMaxResults; iWritten++) { oBuffer.append(SearchRunner.recordToXML(oIter.previous(), oFmt)); } // next oBuffer.append("</syndentries>"); if (DebugFile.trace) { DebugFile.decIdent(); DebugFile.writeln("End SearchRunner.recordSetToXML() : "+String.valueOf(iWritten)); } return oBuffer.toString(); } }