/* $Id: RSSConnector.java 994959 2010-09-08 10:04:42Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.rss;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
import org.apache.manifoldcf.connectorcommon.interfaces.*;
import org.apache.manifoldcf.connectorcommon.fuzzyml.*;
import org.apache.manifoldcf.core.common.DateParser;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.client.RedirectException;
import org.apache.http.client.CircularRedirectException;
import org.apache.http.NoHttpResponseException;
import org.apache.http.HttpException;
import java.io.*;
import java.util.*;
import java.net.*;
import java.text.*;
import java.util.regex.*;
/** This is the RSS implementation of the IRepositoryConnector interface.
* This connector basically looks at an RSS document in order to seed the
* document queue. The document is always fetched from the same URL (it's
* specified in the configuration parameters). The documents subsequently
* crawled are not scraped for additional links; only the primary document is
* ingested. On the other hand, redirections ARE honored, so that various
* sites that use this trick can be supported (e.g. the BBC)
*
*/
public class RSSConnector extends org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
{
/** Revision-control identification string for this source file. */
public static final String _rcsid = "@(#)$Id: RSSConnector.java 994959 2010-09-08 10:04:42Z kwright $";
/** Throttle group type under which this connector creates or updates its throttle group (see getSession()). */
protected final static String rssThrottleGroupType = "_RSS_";
// Usage flag values
// These are the values held by robotsUsage. getSession() maps the configured robots-usage
// parameter onto one of them, and the fetch path compares robotsUsage against ROBOTS_DATA
// before consulting robots.
protected static final int ROBOTS_NONE = 0;
protected static final int ROBOTS_DATA = 1;
protected static final int ROBOTS_ALL = 2;
/** Dechromed content mode - none */
public static final int DECHROMED_NONE = 0;
/** Dechromed content mode - description field */
public static final int DECHROMED_DESCRIPTION = 1;
/** Dechromed content mode - content field */
public static final int DECHROMED_CONTENT = 2;
/** Chromed suppression mode - use chromed content if dechromed content not available */
public static final int CHROMED_USE = 0;
/** Chromed suppression mode - skip documents if dechromed content not available */
public static final int CHROMED_SKIP = 1;
/** Chromed suppression mode - index metadata only if dechromed content not available */
public static final int CHROMED_METADATA_ONLY = 2;
/** Robots usage flag; one of the ROBOTS_* values. Defaults to ROBOTS_ALL. */
protected int robotsUsage = ROBOTS_ALL;
/** The user-agent for this connector instance; built in getSession() from the configured email address. */
protected String userAgent = null;
/** The email address for this connector instance */
protected String from = null;
/** The minimum milliseconds between fetches; -1 until getSession() computes it. */
protected long minimumMillisecondsPerFetchPerServer = -1L;
/** The maximum open connections */
protected int maxOpenConnectionsPerServer = 0;
/** The minimum milliseconds between bytes */
protected double minimumMillisecondsPerBytePerServer = 0.0;
/** The throttle group name; set in connect() (empty string when not configured) and cleared in disconnect(). */
protected String throttleGroupName = null;
/** The proxy host */
protected String proxyHost = null;
/** The proxy port; -1 when no proxy port is configured. */
protected int proxyPort = -1;
/** Proxy auth domain */
protected String proxyAuthDomain = null;
/** Proxy auth username */
protected String proxyAuthUsername = null;
/** Proxy auth password */
protected String proxyAuthPassword = null;
/** The throttled fetcher used by this instance; obtained in connect(). */
protected ThrottledFetcher fetcher = null;
/** The robots object used by this instance; obtained in connect() from the fetcher. */
protected Robots robots = null;
/** Storage for fetcher objects */
protected static Map<String,ThrottledFetcher> fetcherMap = new HashMap<String,ThrottledFetcher>();
/** Storage for robots objects */
protected static Map robotsMap = new HashMap();
/** Flag indicating whether session data is initialized; set by getSession() and cleared by disconnect(). */
protected boolean isInitialized = false;
// A couple of very important points.
// The canonical document identifier is simply a URL.
// Versions of the document are calculated using a checksum technique
/** Shared data cache; its addData() call supplies the checksum that is appended to a document's version string. */
protected static DataCache cache = new DataCache();
/** URL schemes this connector accepts; makeDocumentIdentifier() rejects any URL whose scheme is not a key here. */
protected static final Map understoodProtocols = new HashMap();
static
{
understoodProtocols.put("http","http");
understoodProtocols.put("https","https");
}
// Activity types
// These are the activity names returned by getActivitiesList().
public final static String ACTIVITY_FETCH = "fetch";
public final static String ACTIVITY_ROBOTSPARSE = "robots parse";
public final static String ACTIVITY_PROCESS = "process";
/** Deny access token for default authority */
private final static String defaultAuthorityDenyToken = "DEAD_AUTHORITY";
/** Create an unconnected connector instance.
* No configuration is read here; connection parameters arrive later through connect().
*/
public RSSConnector()
{
  super();
}
/** Lazily establish the session state for this connector instance.
* The first call after connect() reads the identification, robots, proxy and throttling
* parameters from the connection configuration into instance fields, then creates or
* updates the throttle group for this connection. Later calls return immediately until
* disconnect() clears the initialized flag.
*@throws ManifoldCFException if the email address is missing or a numeric parameter cannot be parsed.
*/
protected void getSession()
  throws ManifoldCFException
{
  // Nothing to do once the session data has been set up.
  if (isInitialized)
    return;

  // --- Identification ---
  final String emailAddress = params.getParameter(RSSConfig.PARAMETER_EMAIL);
  if (emailAddress == null)
    throw new ManifoldCFException("Missing email address");
  userAgent = "Mozilla/5.0 (ApacheManifoldCFRSSFeedReader; " + emailAddress + ")";
  from = emailAddress;

  // --- Robots usage: anything missing, empty or unrecognized means "all" ---
  final String robotsSetting = params.getParameter(RSSConfig.PARAMETER_ROBOTSUSAGE);
  int selectedUsage = ROBOTS_ALL;
  if (robotsSetting != null && robotsSetting.length() != 0 && !robotsSetting.equals(RSSConfig.VALUE_ALL))
  {
    if (robotsSetting.equals(RSSConfig.VALUE_NONE))
      selectedUsage = ROBOTS_NONE;
    else if (robotsSetting.equals(RSSConfig.VALUE_DATA))
      selectedUsage = ROBOTS_DATA;
  }
  robotsUsage = selectedUsage;

  // --- Proxy settings ---
  proxyHost = params.getParameter(RSSConfig.PARAMETER_PROXYHOST);
  final String proxyPortSetting = params.getParameter(RSSConfig.PARAMETER_PROXYPORT);
  proxyAuthDomain = params.getParameter(RSSConfig.PARAMETER_PROXYAUTHDOMAIN);
  proxyAuthUsername = params.getParameter(RSSConfig.PARAMETER_PROXYAUTHUSERNAME);
  proxyAuthPassword = params.getObfuscatedParameter(RSSConfig.PARAMETER_PROXYAUTHPASSWORD);
  proxyPort = -1;
  if (proxyPortSetting != null && proxyPortSetting.length() > 0)
  {
    try
    {
      proxyPort = Integer.parseInt(proxyPortSetting);
    }
    catch (NumberFormatException e)
    {
      throw new ManifoldCFException(e.getMessage(),e);
    }
  }

  // --- Throttling: start from the defaults, then apply whatever is configured ---
  minimumMillisecondsPerBytePerServer = 0.0;
  maxOpenConnectionsPerServer = 10;
  minimumMillisecondsPerFetchPerServer = 0L;

  // Bandwidth is configured in KB/sec; a positive value becomes a per-byte delay.
  final String bandwidthSetting = params.getParameter(RSSConfig.PARAMETER_BANDWIDTH);
  if (bandwidthSetting != null && bandwidthSetting.length() > 0)
  {
    try
    {
      final int kBytesPerSecond = Integer.parseInt(bandwidthSetting);
      if (kBytesPerSecond > 0)
        minimumMillisecondsPerBytePerServer = 1.0/(double)kBytesPerSecond;
    }
    catch (NumberFormatException e)
    {
      throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
    }
  }

  final String maxOpenSetting = params.getParameter(RSSConfig.PARAMETER_MAXOPEN);
  if (maxOpenSetting != null && maxOpenSetting.length() > 0)
  {
    try
    {
      maxOpenConnectionsPerServer = Integer.parseInt(maxOpenSetting);
    }
    catch (NumberFormatException e)
    {
      throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
    }
  }

  // Fetches are configured per minute; zero is treated as one to avoid dividing by zero.
  final String maxFetchesSetting = params.getParameter(RSSConfig.PARAMETER_MAXFETCHES);
  if (maxFetchesSetting != null && maxFetchesSetting.length() > 0)
  {
    try
    {
      final int configuredFetches = Integer.parseInt(maxFetchesSetting);
      final long fetchesPerMinute = (configuredFetches == 0) ? 1L : (long)configuredFetches;
      minimumMillisecondsPerFetchPerServer = 60000L/fetchesPerMinute;
    }
    catch (NumberFormatException e)
    {
      throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
    }
  }

  // Create (or update) the throttle group with the limits computed above.
  final ThrottleSpec limits = new ThrottleSpec(maxOpenConnectionsPerServer,
    minimumMillisecondsPerFetchPerServer, minimumMillisecondsPerBytePerServer);
  final IThrottleGroups throttleGroups = ThrottleGroupsFactory.make(currentContext);
  throttleGroups.createOrUpdateThrottleGroup(rssThrottleGroupType, throttleGroupName, limits);

  isInitialized = true;
}
/** List the activity names this connector may write into the activity log.
*@return a new array holding the fetch, robots-parse and process activity names, in that order.
*/
@Override
public String[] getActivitiesList()
{
  final String[] activityNames = new String[3];
  activityNames[0] = ACTIVITY_FETCH;
  activityNames[1] = ACTIVITY_ROBOTSPARSE;
  activityNames[2] = ACTIVITY_PROCESS;
  return activityNames;
}
/** Report the seeding model this connector uses for addSeedDocuments().
*@return the model type value; always MODEL_ALL for this connector.
*/
@Override
public int getConnectorModel()
{
  // The RSS feeds themselves are the seeds, and every one of them is supplied on each seeding pass.
  final int model = MODEL_ALL;
  return model;
}
// All methods below this line will ONLY be called if a connect() call succeeded
// on this instance!
/** Connect this instance using the supplied configuration parameters.
* Records the throttle group name (empty string when none is configured), obtains the
* fetcher and robots objects, and notes on both that a connection has been established.
* No exceptions are allowed from this call, since it is expected mainly to establish connection parameters.
*@param configParams are the configuration parameters for this connection.
*/
@Override
public void connect(ConfigParams configParams)
{
  super.connect(configParams);

  // An unset throttle group is represented by the empty string.
  final String configuredGroup = params.getParameter(RSSConfig.PARAMETER_THROTTLEGROUP);
  throttleGroupName = (configuredGroup == null) ? "" : configuredGroup;

  // Obtain the fetcher and robots objects for this instance.
  fetcher = getFetcher();
  robots = getRobots(fetcher);

  // Connection counting: tell both objects that a connection now exists.
  fetcher.noteConnectionEstablished();
  robots.noteConnectionEstablished();
}
/** Periodic housekeeping hook, called for connectors that are connected but not in active use.
* Forwards the poll to the fetcher first and then to the robots object.
*@throws ManifoldCFException if either delegate reports a problem.
*/
@Override
public void poll()
  throws ManifoldCFException
{
  // Fetcher first, then robots.
  this.fetcher.poll();
  this.robots.poll();
}
/** Check the status of the connection.
* Session setup is forced first, so a configuration problem surfaces as an exception here;
* after that the result of the superclass check is returned unchanged.
*@return the status string produced by the superclass check.
*@throws ManifoldCFException if the session cannot be established.
*/
@Override
public String check()
  throws ManifoldCFException
{
  getSession();
  final String status = super.check();
  return status;
}
/** Close the connection. Call this before discarding the repository connector.
* Marks the session uninitialized, notes the released connection on the robots and fetcher
* objects, resets all per-connection settings to their unconnected defaults, and then lets
* the superclass finish the disconnect.
*@throws ManifoldCFException if the superclass disconnect fails.
*/
@Override
public void disconnect()
  throws ManifoldCFException
{
  // The next getSession() call after a reconnect must re-read the configuration.
  isInitialized = false;

  // Connection counting: robots first, then the fetcher.
  robots.noteConnectionReleased();
  fetcher.noteConnectionReleased();

  // Identification
  from = null;
  userAgent = null;

  // Throttling
  throttleGroupName = null;
  minimumMillisecondsPerBytePerServer = 0.0;
  minimumMillisecondsPerFetchPerServer = -1L;
  maxOpenConnectionsPerServer = 0;

  // Proxy
  proxyPort = -1;
  proxyHost = null;
  proxyAuthPassword = null;
  proxyAuthUsername = null;
  proxyAuthDomain = null;

  super.disconnect();
}
/** Get the bin names for a document identifier. A bin name identifies the queue to which the
* document is assigned for throttling purposes. Here the document identifier is parsed as a URL
* and its host is used as the single bin name, so that throttling applies per server.
*@param documentIdentifier is the document identifier (a URL).
*@return a one-element array holding the URL's host, or the empty string when the identifier
* cannot be parsed as a URL.
*/
@Override
public String[] getBinNames(String documentIdentifier)
{
  String binName;
  try
  {
    binName = new WebURL(documentIdentifier).getHost();
  }
  catch (URISyntaxException e)
  {
    // Unparseable identifiers all share the anonymous bin.
    binName = "";
  }
  return new String[]{binName};
}
/** Queue "seed" documents. Seed documents are the starting places for crawling activity; each
* one is handed to the framework through the supplied ISeedingActivity object.
*
* This connector seeds every feed URL found in the job's document specification on every call.
* The time range and previous seeding version string are ignored, and the empty string is always
* returned as the new seeding version.
*
* The connector will be connected before this method can be called.
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
*@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
*@param seedTime is the end of the time range of documents to consider, exclusive.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@return an updated seeding version string, to be stored with the job.
*/
@Override
public String addSeedDocuments(ISeedingActivity activities, Specification spec,
  String lastSeedVersion, long seedTime, int jobMode)
  throws ManifoldCFException, ServiceInterruption
{
  getSession();

  // Build the filter in seeding mode and hand every seed URL it yields to the framework.
  final Filter seedFilter = new Filter(spec,true);
  for (Iterator<String> seeds = seedFilter.getSeeds(); seeds.hasNext(); )
  {
    activities.addSeedDocument(seeds.next());
  }
  return "";
}
/** Convert an absolute or relative URL to a document identifier.
* The steps are: look up the canonicalization policy for the raw URL, drop every character
* outside the printable 7-bit range, resolve the result against the parent identifier (if any),
* require a host and an understood scheme, canonicalize, and finally verify that the canonical
* form contains no illegal characters.
* The host name is NOT converted to a canonical host name here. (Doing so would destroy the
* ability of virtually hosted sites to do the right thing, since the original host name would
* be lost.)
*@param policies are the canonicalization policies in effect.
*@param parentIdentifier the identifier of the document in which the raw url was found, or null if none.
*@param rawURL is the raw, un-normalized and un-canonicalized url.
*@return the canonical URL (the document identifier), or null if the url was illegal.
*/
protected static String makeDocumentIdentifier(CanonicalizationPolicies policies, String parentIdentifier, String rawURL)
  throws ManifoldCFException
{
  try
  {
    // Policy lookup uses the URL exactly as it was handed to us.
    final CanonicalizationPolicy policy = policies.findMatch(rawURL);

    // Keep only 7-bit ascii from space upward; control characters and anything above 127 are dropped.
    final StringBuilder filtered = new StringBuilder();
    for (int pos = 0; pos < rawURL.length(); pos++)
    {
      final char c = rawURL.charAt(pos);
      if (c < ' ' || c >= 128)
        continue;
      filtered.append(c);
    }
    // From here on (including the log messages below) the filtered form is what we talk about.
    rawURL = filtered.toString();

    // Relative URLs are resolved against the parent, when there is one.
    final WebURL resolved = (parentIdentifier != null)
      ? new WebURL(parentIdentifier).resolve(rawURL)
      : new WebURL(rawURL);

    final String scheme = resolved.getScheme();
    final String hostName = resolved.getHost();

    // A usable URL must carry both a scheme and a host...
    if (scheme == null || hostName == null)
    {
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it has no protocol or host");
      return null;
    }
    // ...and the scheme must be one we know how to fetch (http or https).
    if (understoodProtocols.get(scheme) == null)
    {
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it has an unsupported protocol '"+scheme+"'");
      return null;
    }

    // Canonicalize: strips the fragment, removes known session arguments, and (policy permitting)
    // puts the query arguments into a stable order so equivalent URLs compare equal.
    final String canonical = doCanonicalization(policy,resolved);
    if (canonical == null)
    {
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it could not be canonicalized");
      return null;
    }

    // Last legality check: the canonical form must itself be free of control and non-ascii characters.
    for (int pos = 0; pos < canonical.length(); pos++)
    {
      final char c = canonical.charAt(pos);
      if (c < ' ' || c > 127)
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it has illegal characters in it");
        return null;
      }
    }
    return canonical;
  }
  catch (java.net.URISyntaxException e)
  {
    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it is badly formed: "+e.getMessage());
    return null;
  }
  catch (java.lang.IllegalArgumentException e)
  {
    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because there was an argument error: "+e.getMessage(),e);
    return null;
  }
  catch (java.lang.NullPointerException e)
  {
    // Deliberately caught: url.toAsciiString() has been seen to throw this on some malformed URLs.
    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("RSS: Can't use url '"+rawURL+"' because it is missing fields: "+e.getMessage(),e);
    return null;
  }
}
/** Canonicalize a URL so that effectively identical URLs map to the same string.
* The result drops the fragment (everything after '#'), removes session arguments of the kinds
* we know about, and, when the policy permits, puts the remaining query arguments in sorted order.
*
* Query handling works on the RAW query string rather than going through java.net.URI, because
* that class (1) decodes the whole query without regard to argument boundaries, so escaped
* ampersands etc. become unescaped ones, and (2) on reassembly fails to escape ":", "/" and other
* characters its own description says it will escape. The query is therefore parsed and
* reassembled here.
*
* When arguments are reordered:
* (1) arguments appear sorted by name, minus any that are interpreted as session arguments;
* (2) the relative order of arguments sharing a name is preserved.
* When arguments are NOT reordered, each is examined in turn and session arguments are skipped.
*
* Canonicalization note: Broadvision urls look like
* http://blah/path/path?arg|arg|arg|BVSession@@@@=xxxx&more stuff
* where BVSession@@@@ is the session identifier. Many Broadvision sites require the session even
* for basic navigation, which is why its removal is controlled by the policy.
*@param p is the canonicalization policy to apply, or null to apply every canonicalization step.
*@param url is the URL to canonicalize.
*@return the canonical URL in ASCII string form.
*/
protected static String doCanonicalization(CanonicalizationPolicy p, WebURL url)
  throws ManifoldCFException, java.net.URISyntaxException
{
  String queryString = url.getRawQuery();
  if (queryString != null)
  {
    // A null policy means "do everything".
    final boolean reorderArguments = (p == null || p.canReorder());
    final boolean dropPhpSession = (p == null || p.canRemovePhpSession());
    final boolean dropBvSession = (p == null || p.canRemoveBvSession());

    // Arguments to emit, in output order. When reordering, arguments are first grouped by key in
    // a sorted map (each group keeping its encounter order) and flattened afterwards.
    final List<String> outputArguments = new ArrayList<String>();
    final Map<String,List<String>> argumentsByKey = new TreeMap<String,List<String>>();

    // Hand-rolled scan for '&' and '='; 'split' is more machinery than this simple task needs.
    final int queryLength = queryString.length();
    int index = 0;
    while (index < queryLength)
    {
      int argumentEnd = queryString.indexOf('&',index);
      if (argumentEnd == -1)
        argumentEnd = queryLength;
      final String argument = queryString.substring(index,argumentEnd);
      final int valueIndex = argument.indexOf('=');
      final String key = (valueIndex == -1) ? argument : argument.substring(0,valueIndex);

      // Session arguments that the policy lets us remove are simply left out.
      final boolean isSessionArgument =
        (dropPhpSession && key.equals("PHPSESSID")) ||
        (dropBvSession && key.indexOf("BVSession@@@@") != -1);
      if (!isSessionArgument)
      {
        if (reorderArguments)
        {
          List<String> sameKey = argumentsByKey.get(key);
          if (sameKey == null)
          {
            sameKey = new ArrayList<String>();
            argumentsByKey.put(key,sameKey);
          }
          sameKey.add(argument);
        }
        else
          outputArguments.add(argument);
      }

      // Step past the separator, if there was one.
      index = (argumentEnd < queryLength) ? argumentEnd + 1 : argumentEnd;
    }

    if (reorderArguments)
    {
      // TreeMap iterates in natural String order of the keys, i.e. the canonical sort order.
      for (List<String> sameKey : argumentsByKey.values())
        outputArguments.addAll(sameKey);
    }

    // Reassemble the query string with '&' separators.
    final StringBuilder rebuilt = new StringBuilder();
    boolean isFirst = true;
    for (String argument : outputArguments)
    {
      if (isFirst)
        isFirst = false;
      else
        rebuilt.append("&");
      rebuilt.append(argument);
    }
    queryString = rebuilt.toString();
  }

  // Now, rewrite path to get rid of jsessionid etc.
  String pathString = url.getPath();
  if (pathString != null)
  {
    int index = pathString.indexOf(";jsessionid=");
    if ((p == null || p.canRemoveJavaSession()) && index != -1)
    {
      // Strip the java session id, which runs from ";jsessionid=" to the end of the path.
      pathString = pathString.substring(0,index);
    }
    if ((p == null || p.canRemoveAspSession()) && pathString.startsWith("/s("))
    {
      // asp.net embeds the session as a leading "/s(...)" segment; drop it through the ")".
      index = pathString.indexOf(")");
      if (index != -1)
        pathString = pathString.substring(index+1);
    }
  }

  // Put it back into a URL without the ref, and with the modified query and path parts.
  final WebURL canonicalURL = new WebURL(url.getScheme(),url.getHost(),url.getPort(),pathString,queryString);
  return canonicalURL.toASCIIString();
}
/** Content-type strings registered as XML content types. */
protected static Set<String> xmlContentTypes;
static
{
  // Entries are stored verbatim; "text/XML" and "XML" are separate entries from "text/xml".
  final String[] knownXmlTypes = new String[]{
    "text/xml",
    "application/rss+xml",
    "application/xml",
    "application/atom+xml",
    "application/xhtml+xml",
    "text/XML",
    "application/rdf+xml",
    "text/application",
    "XML"
  };
  xmlContentTypes = new HashSet<String>(Arrays.asList(knownXmlTypes));
}
/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
* The document specification allows this class to filter what is done based on the job.
* The connector will be connected before this method can be called.
*@param documentIdentifiers is the set of document identifiers to process.
*@param statuses are the currently-stored document versions for each document in the set of document identifiers
* passed in above.
*@param activities is the interface this method should use to queue up new document references
* and ingest documents.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
*/
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
throws ManifoldCFException, ServiceInterruption
{
getSession();
// The connection limit is designed to permit this connector to coexist with potentially other connectors, such as the web connector.
// There is currently no good way to enforce connection limits across all installed connectors - this will require considerably more
// thought to set up properly.
int connectionLimit = 200;
String[] fixedList = new String[2];
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: In getDocumentVersions for "+Integer.toString(documentIdentifiers.length)+" documents");
Filter f = new Filter(spec,false);
String[] acls = f.getAcls();
// Sort it,
java.util.Arrays.sort(acls);
// NOTE: There are two kinds of documents in here; documents that are RSS feeds (that presumably have a content-type
// of text/xml), and documents that need to be indexed.
//
// For the latter, the metadata etc is part of the version string. For the former, the only thing that is part of the version string is the
// document's checksum.
//
// The need to exclude documents from fetch based on whether they match an expression causes some difficulties, because we really
// DON'T want this to apply to the feeds themselves. Since the distinguishing characteristic of a feed is that it is in the seed list,
// and that its content-type is text/xml, we could use either of these characteristics to treat feeds differently from
// fetchable urls. But the latter approach requires a fetch, which is forbidden. So - the spec will be used to characterize the url.
// However, the spec might change, and the url might be dropped from the list - and then what??
//
// The final solution is to simply not queue what cannot be mapped.
int feedTimeout = f.getFeedTimeoutValue();
// The document specification has already been used to trim out documents that are not
// allowed from appearing in the queue. So, even that has already been done.
for (String documentIdentifier : documentIdentifiers)
{
// If it is in this list, we presume that it has been vetted against the map etc., so we don't do that again. We just fetch it.
// And, if the content type is xml, we calculate the version as if it is a feed rather than a document.
// Get the url
String urlValue = documentIdentifier;
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Getting version string for '"+urlValue+"'");
String versionString;
String ingestURL = null;
String[] pubDates = null;
String[] sources = null;
String[] titles = null;
String[] authorNames = null;
String[] authorEmails = null;
String[] categories = null;
String[] descriptions = null;
try
{
// If there's a carrydown "data" value for this url, we use that value rather than actually fetching the document. This also means we don't need to
// do a robots check, because we aren't actually crawling anything. So, ALWAYS do this first...
CharacterInput[] dechromedData = activities.retrieveParentDataAsFiles(urlValue,"data");
try
{
if (dechromedData.length > 0)
{
// Data already available. The fetch cycle can be entirely avoided, as can the robots check.
ingestURL = f.mapDocumentURL(urlValue);
if (ingestURL != null)
{
// Open up an input stream corresponding to the carrydown data. The stream will be encoded as utf-8.
try
{
InputStream is = dechromedData[0].getUtf8Stream();
try
{
StringBuilder sb = new StringBuilder();
long checkSum = cache.addData(activities,urlValue,"text/html",is);
// Grab what we need from the passed-down data for the document. These will all become part
// of the version string.
pubDates = activities.retrieveParentData(urlValue,"pubdate");
sources = activities.retrieveParentData(urlValue,"source");
titles = activities.retrieveParentData(urlValue,"title");
authorNames = activities.retrieveParentData(urlValue,"authorname");
authorEmails = activities.retrieveParentData(urlValue,"authoremail");
categories = activities.retrieveParentData(urlValue,"category");
descriptions = activities.retrieveParentData(urlValue,"description");
java.util.Arrays.sort(pubDates);
java.util.Arrays.sort(sources);
java.util.Arrays.sort(titles);
java.util.Arrays.sort(authorNames);
java.util.Arrays.sort(authorEmails);
java.util.Arrays.sort(categories);
java.util.Arrays.sort(descriptions);
if (sources.length == 0)
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Warning; URL '"+ingestURL+"' doesn't seem to have any RSS feed source!");
}
sb.append('+');
packList(sb,acls,'+');
if (acls.length > 0)
{
sb.append('+');
pack(sb,defaultAuthorityDenyToken,'+');
}
else
sb.append('-');
// The ingestion URL
pack(sb,ingestURL,'+');
// The pub dates
packList(sb,pubDates,'+');
// The titles
packList(sb,titles,'+');
// The sources
packList(sb,sources,'+');
// The categories
packList(sb,categories,'+');
// The descriptions
packList(sb,descriptions,'+');
// The author names
packList(sb,authorNames,'+');
// The author emails
packList(sb,authorEmails,'+');
// Do the checksum part, which does not need to be parseable.
sb.append(new Long(checkSum).toString());
versionString = sb.toString();
}
finally
{
is.close();
}
}
catch (java.net.SocketTimeoutException e)
{
throw new ManifoldCFException("IO exception reading data from string: "+e.getMessage(),e);
}
catch (InterruptedIOException e)
{
throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
}
catch (IOException e)
{
throw new ManifoldCFException("IO exception reading data from string: "+e.getMessage(),e);
}
}
else
{
// Document a seed or unmappable; just skip
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Skipping carry-down document '"+urlValue+"' because it is unmappable or is a seed.");
}
}
else
{
// Get the old version string
String oldVersionString = statuses.getIndexedVersionString(documentIdentifier);
// Unpack the old version as much as possible.
// We are interested in what the ETag and Last-Modified headers were last time.
String lastETagValue = null;
String lastModifiedValue = null;
// Note well: Non-continuous jobs cannot use etag because the rss document MUST be fetched each time for such jobs,
// or the documents it points at would get deleted.
//
// NOTE: I disabled this code because we really need the feed's TTL value in order to reschedule properly. I can't get the
// TTL value without refetching the document - therefore ETag and Last-Modified cannot be used :-(
if (false && jobMode == JOBMODE_CONTINUOUS && oldVersionString != null && oldVersionString.startsWith("-"))
{
// It's a feed, so the last etag and last-modified fields should be encoded in this version string.
StringBuilder lastETagBuffer = new StringBuilder();
int unpackPos = unpack(lastETagBuffer,oldVersionString,1,'+');
StringBuilder lastModifiedBuffer = new StringBuilder();
unpackPos = unpack(lastModifiedBuffer,oldVersionString,unpackPos,'+');
if (lastETagBuffer.length() > 0)
lastETagValue = lastETagBuffer.toString();
if (lastModifiedBuffer.length() > 0)
lastModifiedValue = lastModifiedBuffer.toString();
}
if (Logging.connectors.isDebugEnabled() && (lastETagValue != null || lastModifiedValue != null))
Logging.connectors.debug("RSS: Document '"+urlValue+"' was found to have a previous ETag value of '"+((lastETagValue==null)?"null":lastETagValue)+
"' and a previous Last-Modified value of '"+((lastModifiedValue==null)?"null":lastModifiedValue)+"'");
// Robots check. First, we need to separate the url into its components
URL url;
try
{
url = new URL(urlValue);
}
catch (MalformedURLException e)
{
Logging.connectors.debug("RSS: URL '"+urlValue+"' is malformed; skipping",e);
activities.deleteDocument(documentIdentifier);
continue;
}
String protocol = url.getProtocol();
int port = url.getPort();
String hostName = url.getHost();
String pathPart = url.getFile();
// Check with robots to see if it's allowed
if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext,throttleGroupName,
protocol,port,hostName,url.getPath(),
userAgent,from,
proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword,
activities, connectionLimit))
{
activities.recordActivity(null,ACTIVITY_FETCH,
null,urlValue,Integer.toString(-2),"Robots exclusion",null);
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Skipping url '"+urlValue+"' because robots.txt says to");
activities.deleteDocument(documentIdentifier);
continue;
}
// Now, use the fetcher, and get the file.
IThrottledConnection connection = fetcher.createConnection(currentContext,
throttleGroupName,
hostName,
connectionLimit,
feedTimeout,
proxyHost,
proxyPort,
proxyAuthDomain,
proxyAuthUsername,
proxyAuthPassword,
activities);
try
{
// Begin the fetch
connection.beginFetch("Data");
try
{
// Execute the request.
// Use the connect timeout from the document specification!
int status = connection.executeFetch(protocol,port,pathPart,userAgent,from,
lastETagValue,lastModifiedValue);
switch (status)
{
case IThrottledConnection.STATUS_NOCHANGE:
versionString = oldVersionString;
break;
case IThrottledConnection.STATUS_OK:
try
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Successfully fetched "+urlValue);
// Document successfully fetched!
// If its content is xml, presume it's a feed...
String contentType = connection.getResponseHeader("Content-Type");
// Some sites have multiple content types. We just look at the LAST one in that case.
if (contentType != null)
{
String[] contentTypes = contentType.split(",");
if (contentTypes.length > 0)
contentType = contentTypes[contentTypes.length-1].trim();
else
contentType = null;
}
String strippedContentType = contentType;
if (strippedContentType != null)
{
int pos = strippedContentType.indexOf(";");
if (pos != -1)
strippedContentType = strippedContentType.substring(0,pos).trim();
}
boolean isXML = (strippedContentType != null && xmlContentTypes.contains(strippedContentType));
ingestURL = null;
if (!isXML)
{
// If the chromed content mode is set to "skip", and we got here, it means
// we should not include the content.
if (f.getChromedContentMode() == CHROMED_SKIP)
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Removing url '"+urlValue+"' because it no longer has dechromed content available");
versionString = null;
break;
}
// Decide whether to exclude this document based on what we see here.
// Basically, we want to get rid of everything that we don't know what
// to do with in the ingestion system.
if (!activities.checkMimeTypeIndexable(contentType))
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Removing url '"+urlValue+"' because it had the wrong content type: "+((contentType==null)?"null":"'"+contentType+"'"));
versionString = null;
break;
}
ingestURL = f.mapDocumentURL(urlValue);
}
else
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: The url '"+urlValue+"' is a feed");
if (!f.isSeed(urlValue))
{
// Remove the feed from consideration, since it has left the list of seeds
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Removing feed url '"+urlValue+"' because it is not a seed.");
versionString = null;
break;
}
}
InputStream is = connection.getResponseBodyStream();
try
{
long checkSum = cache.addData(activities,urlValue,contentType,is);
StringBuilder sb = new StringBuilder();
if (ingestURL != null)
{
// We think it is ingestable. The version string accordingly starts with a "+".
// Grab what we need from the passed-down data for the document. These will all become part
// of the version string.
pubDates = activities.retrieveParentData(urlValue,"pubdate");
sources = activities.retrieveParentData(urlValue,"source");
titles = activities.retrieveParentData(urlValue,"title");
authorNames = activities.retrieveParentData(urlValue,"authorname");
authorEmails = activities.retrieveParentData(urlValue,"authoremail");
categories = activities.retrieveParentData(urlValue,"category");
descriptions = activities.retrieveParentData(urlValue,"description");
java.util.Arrays.sort(pubDates);
java.util.Arrays.sort(sources);
java.util.Arrays.sort(titles);
java.util.Arrays.sort(authorNames);
java.util.Arrays.sort(authorEmails);
java.util.Arrays.sort(categories);
java.util.Arrays.sort(descriptions);
if (sources.length == 0)
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Warning; URL '"+ingestURL+"' doesn't seem to have any RSS feed source!");
}
sb.append('+');
packList(sb,acls,'+');
if (acls.length > 0)
{
sb.append('+');
pack(sb,defaultAuthorityDenyToken,'+');
}
else
sb.append('-');
// The ingestion URL
pack(sb,ingestURL,'+');
// The pub dates
packList(sb,pubDates,'+');
// The titles
packList(sb,titles,'+');
// The sources
packList(sb,sources,'+');
// The categories
packList(sb,categories,'+');
// The descriptions
packList(sb,descriptions,'+');
// The author names
packList(sb,authorNames,'+');
// The author emails
packList(sb,authorEmails,'+');
}
else
{
sb.append('-');
String etag = connection.getResponseHeader("ETag");
if (etag == null)
pack(sb,"",'+');
else
pack(sb,etag,'+');
String lastModified = connection.getResponseHeader("Last-Modified");
if (lastModified == null)
pack(sb,"",'+');
else
pack(sb,lastModified,'+');
}
// Do the checksum part, which does not need to be parseable.
sb.append(new Long(checkSum).toString());
versionString = sb.toString();
}
finally
{
is.close();
}
}
catch (java.net.SocketTimeoutException e)
{
Logging.connectors.warn("RSS: Socket timeout exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e);
versionString = null;
}
catch (ConnectTimeoutException e)
{
Logging.connectors.warn("RSS: Connecto timeout exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e);
versionString = null;
}
catch (InterruptedIOException e)
{
throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
}
catch (IOException e)
{
Logging.connectors.warn("RSS: IO exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e);
versionString = null;
}
break;
case IThrottledConnection.STATUS_SITEERROR:
case IThrottledConnection.STATUS_PAGEERROR:
default:
// Record an *empty* version.
// This signals the processDocuments() method that we really don't want to ingest this document, but we also don't
// want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times.
versionString = "";
break;
}
}
finally
{
connection.doneFetch(activities);
}
}
finally
{
connection.close();
}
if (versionString == null)
{
activities.deleteDocument(documentIdentifier);
continue;
}
if (!(versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier,versionString)))
continue;
// Process document!
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Processing '"+urlValue+"'");
// The only links we extract come from documents that we think are RSS feeds.
// When we think that's the case, we attempt to parse it as RSS XML.
if (ingestURL == null)
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Interpreting document '"+urlValue+"' as a feed");
// We think it is a feed.
// If this is a continuous job, AND scanonly is true, it means that the document was either identical to the
// previous fetch, or was not fetched at all. In that case, it may not even be there, and we *certainly* don't
// want to attempt to process it in any case.
//
// NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost. If the
// TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds.
if (true || jobMode != JOBMODE_CONTINUOUS)
{
handleRSSFeedSAX(urlValue,activities,f);
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Extraction of feed '"+urlValue+"' complete");
// Record the feed's version string, so we won't refetch unless needed.
// This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to
// keep track of the adaptive parameters.
activities.recordDocument(documentIdentifier,versionString);
}
else
{
// The problem here is that we really do need to set the rescan time to something reasonable.
// But we might not even have read the feed! So what to do??
// One answer is to build a connector-specific table that carries the last value of every feed around.
// Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified).
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Feed '"+urlValue+"' does not appear to differ from previous fetch for a continuous job; not extracting!");
long currentTime = System.currentTimeMillis();
Long defaultRescanTime = f.getDefaultRescanTime(currentTime);
if (defaultRescanTime != null)
{
Long minimumTime = f.getMinimumRescanTime(currentTime);
if (minimumTime != null)
{
if (defaultRescanTime.longValue() < minimumTime.longValue())
defaultRescanTime = minimumTime;
}
}
activities.setDocumentScheduleBounds(urlValue,defaultRescanTime,defaultRescanTime,null,null);
}
}
else
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Interpreting '"+urlValue+"' as a document");
String errorCode = null;
String errorDesc = null;
long startTime = System.currentTimeMillis();
Long fileLengthLong = null;
try
{
long documentLength = cache.getDataLength(documentIdentifier);
if (!activities.checkLengthIndexable(documentLength))
{
activities.noDocument(documentIdentifier,versionString);
errorCode = activities.EXCLUDED_LENGTH;
errorDesc = "Document rejected because of length ("+documentLength+")";
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because its length was rejected ("+documentLength+")");
continue;
}
if (!activities.checkURLIndexable(documentIdentifier))
{
activities.noDocument(documentIdentifier,versionString);
errorCode = activities.EXCLUDED_URL;
errorDesc = "Document rejected because of URL ('"+documentIdentifier+"')";
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because its URL was rejected ('"+documentIdentifier+"')");
continue;
}
// Check if it's a recognized content type
String contentType = cache.getContentType(documentIdentifier);
// Some sites have multiple content types. We just look at the LAST one in that case.
if (contentType != null)
{
String[] contentTypes = contentType.split(",");
if (contentTypes.length > 0)
contentType = contentTypes[contentTypes.length-1].trim();
else
contentType = null;
}
if (!activities.checkMimeTypeIndexable(contentType))
{
activities.noDocument(documentIdentifier,versionString);
errorCode = activities.EXCLUDED_MIMETYPE;
errorDesc = "Document rejected because of mime type ("+contentType+")";
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because its mime type was rejected ('"+contentType+"')");
continue;
}
// Treat it as an ingestable document.
long dataSize = cache.getDataLength(urlValue);
RepositoryDocument rd = new RepositoryDocument();
// Set content type
if (contentType != null)
rd.setMimeType(contentType);
// Turn into acls and add into description
String[] denyAcls;
if (acls == null)
denyAcls = null;
else if (acls.length == 0)
denyAcls = new String[0];
else
denyAcls = new String[]{defaultAuthorityDenyToken};
if (acls != null && denyAcls != null)
rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,acls,denyAcls);
if (titles != null && titles.length > 0)
rd.addField("title",titles);
if (authorNames != null && authorNames.length > 0)
rd.addField("authorname",authorNames);
if (authorEmails != null && authorEmails.length > 0)
rd.addField("authoremail",authorEmails);
if (descriptions != null && descriptions.length > 0)
rd.addField("summary",descriptions);
if (sources != null && sources.length > 0)
rd.addField("source",sources);
if (categories != null && categories.length > 0)
rd.addField("category",categories);
// The pubdates are a ms since epoch value; we want the minimum one for the origination time.
Long minimumOrigTime = null;
if (pubDates != null && pubDates.length > 0)
{
String[] pubDateValuesISO = new String[pubDates.length];
TimeZone tz = TimeZone.getTimeZone("UTC");
DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'", Locale.ROOT);
df.setTimeZone(tz);
for (int k = 0; k < pubDates.length; k++)
{
String pubDate = pubDates[k];
try
{
Long pubDateLong = new Long(pubDate);
if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue())
minimumOrigTime = pubDateLong;
pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue()));
}
catch (NumberFormatException e)
{
// Do nothing; the version string seems to not mean anything
pubDateValuesISO[k] = "";
}
}
rd.addField("pubdate",pubDates);
rd.addField("pubdateiso",pubDateValuesISO);
}
if (minimumOrigTime != null)
activities.setDocumentOriginationTime(urlValue,minimumOrigTime);
InputStream is = cache.getData(urlValue);
if (is != null)
{
try
{
rd.setBinary(is,dataSize);
try
{
activities.ingestDocumentWithException(documentIdentifier,versionString,ingestURL,rd);
errorCode = "OK";
fileLengthLong = new Long(dataSize);
}
catch (IOException e)
{
errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
errorDesc = e.getMessage();
handleIOException(e,"reading data");
}
}
finally
{
try
{
is.close();
}
catch (IOException e)
{
errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
errorDesc = e.getMessage();
handleIOException(e,"closing stream");
}
}
}
}
catch (ManifoldCFException e)
{
if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
errorCode = null;
throw e;
}
finally
{
if (errorCode != null)
activities.recordActivity(new Long(startTime),ACTIVITY_PROCESS,
null,urlValue,errorCode,errorDesc,null);
}
}
}
}
finally
{
for (CharacterInput ci : dechromedData)
{
if (ci != null)
ci.discard();
}
}
}
finally
{
// Remove any fetched documents.
cache.deleteData(documentIdentifier);
}
}
}
/** Translate an IOException into a ManifoldCFException; this method always throws.
* A plain thread interruption is reported with the INTERRUPTED error code, while socket timeouts and every
* other IO failure are reported as an ordinary "IO error" that names the supplied context.
*@param e is the IO exception that was caught.
*@param context is a short description of what was being done, included in the error message.
*/
protected static void handleIOException(IOException e, String context)
  throws ManifoldCFException, ServiceInterruption
{
  // java.net.SocketTimeoutException is a subclass of InterruptedIOException, so it has to be ruled out
  // before an exception can be classified as a genuine interruption.
  final boolean socketTimedOut = e instanceof java.net.SocketTimeoutException;
  final boolean interrupted = !socketTimedOut && (e instanceof InterruptedIOException);
  if (interrupted)
  {
    throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
  }
  // Timeouts and all remaining IO failures share the same generic report.
  final String description = "IO error "+context+": "+e.getMessage();
  throw new ManifoldCFException(description,e);
}
// UI support methods.
//
// These support methods come in two varieties. The first bunch is involved in setting up connection configuration information. The second bunch
// is involved in presenting and editing document specification information for a job. The two kinds of methods are accordingly treated differently,
// in that the first bunch cannot assume that the current connector object is connected, while the second bunch can. That is why the first bunch
// receives a thread context argument for all UI methods, while the second bunch does not need one (since it has already been applied via the connect()
// method, above).
/** Output the configuration header section.
* Called while the head section of the connection configuration page is being generated. It registers this connector's
* four tabs (Email, Robots, Bandwidth, Proxy) and writes a script block defining the checkConfig() and
* checkConfigForSave() javascript functions, which validate fields of the "editconnection" form.
*@param threadContext is the local thread context.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale used to look up the tab names and the alert messages.
*@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
*@param tabsArray is the list of tab names; the connector-specific tab names are appended to it.
*/
@Override
public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out,
  Locale locale, ConfigParams parameters, List<String> tabsArray)
  throws ManifoldCFException, IOException
{
  // Register the connector-specific tabs, in display order.
  final String[] tabKeys = {"RSSConnector.Email","RSSConnector.Robots","RSSConnector.Bandwidth","RSSConnector.Proxy"};
  for (String tabKey : tabKeys)
  {
    tabsArray.add(Messages.getString(locale,tabKey));
  }

  // Look up the localized alert texts in the order in which they appear in the script.
  final String invalidEmailAlert = Messages.getBodyJavascriptString(locale,"RSSConnector.NeedAValidEmailAddress");
  final String invalidBandwidthAlert = Messages.getBodyJavascriptString(locale,"RSSConnector.EnterAValidNumberOrBlankForNoLimit");
  final String invalidConnectionsAlert = Messages.getBodyJavascriptString(locale,"RSSConnector.EnterAValidNumberForTheMaxNumberOfOpenConnectionsPerServer");
  final String invalidFetchesAlert = Messages.getBodyJavascriptString(locale,"RSSConnector.EnterAValidNumberOrBlankForNoLimit");
  final String missingEmailAlert = Messages.getBodyJavascriptString(locale,"RSSConnector.EmailAddressRequiredToBeIncludedInAllRequestHeaders");
  final String emailTabName = Messages.getBodyJavascriptString(locale,"RSSConnector.Email");

  // checkConfig() validates individual field formats; checkConfigForSave() additionally insists on an email address.
  final String script =
    "<script type=\"text/javascript\">\n"+
    "<!--\n"+
    "function checkConfig()\n"+
    "{\n"+
    " if (editconnection.email.value != \"\" && editconnection.email.value.indexOf(\"@\") == -1)\n"+
    " {\n"+
    " alert(\""+invalidEmailAlert+"\");\n"+
    " editconnection.email.focus();\n"+
    " return false;\n"+
    " }\n"+
    " if (editconnection.bandwidth.value != \"\" && !isInteger(editconnection.bandwidth.value))\n"+
    " {\n"+
    " alert(\""+invalidBandwidthAlert+"\");\n"+
    " editconnection.bandwidth.focus();\n"+
    " return false;\n"+
    " }\n"+
    " if (editconnection.connections.value == \"\" || !isInteger(editconnection.connections.value))\n"+
    " {\n"+
    " alert(\""+invalidConnectionsAlert+"\");\n"+
    " editconnection.connections.focus();\n"+
    " return false;\n"+
    " }\n"+
    " if (editconnection.fetches.value != \"\" && !isInteger(editconnection.fetches.value))\n"+
    " {\n"+
    " alert(\""+invalidFetchesAlert+"\");\n"+
    " editconnection.fetches.focus();\n"+
    " return false;\n"+
    " }\n"+
    " return true;\n"+
    "}\n"+
    "\n"+
    "function checkConfigForSave()\n"+
    "{\n"+
    " if (editconnection.email.value == \"\")\n"+
    " {\n"+
    " alert(\""+missingEmailAlert+"\");\n"+
    " SelectTab(\""+emailTabName+"\");\n"+
    " editconnection.email.focus();\n"+
    " return false;\n"+
    " }\n"+
    " return true;\n"+
    "}\n"+
    "\n"+
    "//-->\n"+
    "</script>\n";
  out.print(script);
}
/** Output the configuration body section.
* This method is called in the body section of the connector's configuration page. Its purpose is to present the required form elements for editing.
* The coder can presume that the HTML that is output from this configuration will be within appropriate &lt;html&gt;, &lt;body&gt;, and &lt;form&gt; tags. The name of the
* form is "editconnection".
* For the tab named by tabName the editable controls are written; for every other tab the same values are written
* as hidden inputs, so that all values are present in the form whichever tab is showing. Every value placed into an HTML attribute is
* attribute-escaped.
*@param threadContext is the local thread context.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale used to look up tab names and field labels.
*@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
*@param tabName is the current tab name.
*/
@Override
public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out,
  Locale locale, ConfigParams parameters, String tabName)
  throws ManifoldCFException, IOException
{
  // Read the current values, substituting the form defaults for anything not yet configured.
  String email = parameters.getParameter(RSSConfig.PARAMETER_EMAIL);
  if (email == null)
    email = "";
  String robotsUsage = parameters.getParameter(RSSConfig.PARAMETER_ROBOTSUSAGE);
  if (robotsUsage == null)
    robotsUsage = RSSConfig.VALUE_ALL;
  String bandwidth = parameters.getParameter(RSSConfig.PARAMETER_BANDWIDTH);
  if (bandwidth == null)
    bandwidth = "64";
  String connections = parameters.getParameter(RSSConfig.PARAMETER_MAXOPEN);
  if (connections == null)
    connections = "2";
  String fetches = parameters.getParameter(RSSConfig.PARAMETER_MAXFETCHES);
  if (fetches == null)
    fetches = "12";
  String throttleGroup = parameters.getParameter(RSSConfig.PARAMETER_THROTTLEGROUP);
  if (throttleGroup == null)
    throttleGroup = "";
  String proxyHost = parameters.getParameter(RSSConfig.PARAMETER_PROXYHOST);
  if (proxyHost == null)
    proxyHost = "";
  String proxyPort = parameters.getParameter(RSSConfig.PARAMETER_PROXYPORT);
  if (proxyPort == null)
    proxyPort = "";
  String proxyAuthDomain = parameters.getParameter(RSSConfig.PARAMETER_PROXYAUTHDOMAIN);
  if (proxyAuthDomain == null)
    proxyAuthDomain = "";
  String proxyAuthUsername = parameters.getParameter(RSSConfig.PARAMETER_PROXYAUTHUSERNAME);
  if (proxyAuthUsername == null)
    proxyAuthUsername = "";
  // The stored password is run through mapPasswordToKey() before it is placed in the form
  // (presumably so the real password is not sent to the browser); processConfigurationPost() maps it back.
  String proxyAuthPassword = parameters.getObfuscatedParameter(RSSConfig.PARAMETER_PROXYAUTHPASSWORD);
  if (proxyAuthPassword == null)
    proxyAuthPassword = "";
  else
    proxyAuthPassword = out.mapPasswordToKey(proxyAuthPassword);

  // Email tab
  if (tabName.equals(Messages.getString(locale,"RSSConnector.Email")))
  {
    out.print(
      "<table class=\"displaytable\">\n"+
      " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.EmailAddressToContactColon") + "</nobr></td><td class=\"value\"><input type=\"text\" size=\"32\" name=\"email\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(email)+"\"/></td>\n"+
      " </tr>\n"+
      "</table>\n"
    );
  }
  else
  {
    out.print(
      "<input type=\"hidden\" name=\"email\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(email)+"\"/>\n"
    );
  }

  // Robots tab
  if (tabName.equals(Messages.getString(locale,"RSSConnector.Robots")))
  {
    out.print(
      "<table class=\"displaytable\">\n"+
      " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.RobotsTxtUsageColon") + "</nobr></td>\n"+
      " <td class=\"value\">\n"+
      " <select name=\"robotsusage\" size=\"3\">\n"+
      " <option value=\"none\" "+(robotsUsage.equals(RSSConfig.VALUE_NONE)?"selected=\"selected\"":"")+">" + Messages.getBodyString(locale,"RSSConnector.DontLookAtRobotsTxt") + "</option>\n"+
      " <option value=\"data\" "+(robotsUsage.equals(RSSConfig.VALUE_DATA)?"selected=\"selected\"":"")+">" + Messages.getBodyString(locale,"RSSConnector.ObeyRobotsTxtForDataFetchesOnly") + "</option>\n"+
      " <option value=\"all\" "+(robotsUsage.equals(RSSConfig.VALUE_ALL)?"selected=\"selected\"":"")+">" + Messages.getBodyString(locale,"RSSConnector.ObeyRobotsTxtForAllFetches") + "</option>\n"+
      " </select>\n"+
      " </td>\n"+
      " </tr>\n"+
      "</table>\n"
    );
  }
  else
  {
    // The stored value originates from a posted form field, so it is escaped like every other hidden value.
    out.print(
      "<input type=\"hidden\" name=\"robotsusage\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(robotsUsage)+"\"/>\n"
    );
  }

  // Bandwidth tab
  if (tabName.equals(Messages.getString(locale,"RSSConnector.Bandwidth")))
  {
    out.print(
      "<table class=\"displaytable\">\n"+
      " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.MaxKBytesPerSecondPerServerColon") + "</nobr></td>\n"+
      " <td class=\"value\"><input type=\"text\" size=\"6\" name=\"bandwidth\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(bandwidth)+"\"/></td>\n"+
      " </tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.MaxConnectionsPerServerColon") + "</nobr></td>\n"+
      " <td class=\"value\"><input type=\"text\" size=\"4\" name=\"connections\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(connections)+"\"/></td>\n"+
      " </tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.MaxFetchesPerMinutePerServerColon") + "</nobr></td>\n"+
      " <td class=\"value\"><input type=\"text\" size=\"4\" name=\"fetches\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(fetches)+"\"/></td>\n"+
      " </tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ThrottleGroupNameColon") + "</nobr></td>\n"+
      " <td class=\"value\"><input type=\"text\" size=\"32\" name=\"throttlegroup\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(throttleGroup)+"\"/></td>\n"+
      " </tr>\n"+
      "</table>\n"
    );
  }
  else
  {
    out.print(
      "<input type=\"hidden\" name=\"bandwidth\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(bandwidth)+"\"/>\n"+
      "<input type=\"hidden\" name=\"connections\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(connections)+"\"/>\n"+
      "<input type=\"hidden\" name=\"fetches\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(fetches)+"\"/>\n"+
      "<input type=\"hidden\" name=\"throttlegroup\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(throttleGroup)+"\"/>\n"
    );
  }

  // Proxy tab
  if (tabName.equals(Messages.getString(locale,"RSSConnector.Proxy")))
  {
    out.print(
      "<table class=\"displaytable\">\n"+
      " <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyHostColon") + "</nobr></td>\n"+
      " <td class=\"value\"><input type=\"text\" size=\"40\" name=\"proxyhost\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyHost)+"\"/></td>\n"+
      " </tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyPortColon") + "</nobr></td>\n"+
      " <td class=\"value\"><input type=\"text\" size=\"5\" name=\"proxyport\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyPort)+"\"/></td>\n"+
      " </tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyAuthenticationDomainColon") + "</nobr></td>\n"+
      " <td class=\"value\"><input type=\"text\" size=\"32\" name=\"proxyauthdomain\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthDomain)+"\"/></td>\n"+
      " </tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyAuthenticationUserNameColon") + "</nobr></td>\n"+
      " <td class=\"value\"><input type=\"text\" size=\"32\" name=\"proxyauthusername\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthUsername)+"\"/></td>\n"+
      " </tr>\n"+
      " <tr>\n"+
      " <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ProxyAuthenticationPasswordColon") + "</nobr></td>\n"+
      " <td class=\"value\"><input type=\"password\" size=\"16\" name=\"proxyauthpassword\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthPassword)+"\"/></td>\n"+
      " </tr>\n"+
      "</table>\n"
    );
  }
  else
  {
    out.print(
      "<input type=\"hidden\" name=\"proxyhost\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyHost)+"\"/>\n"+
      "<input type=\"hidden\" name=\"proxyport\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyPort)+"\"/>\n"+
      "<input type=\"hidden\" name=\"proxyauthusername\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthUsername)+"\"/>\n"+
      "<input type=\"hidden\" name=\"proxyauthdomain\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthDomain)+"\"/>\n"+
      "<input type=\"hidden\" name=\"proxyauthpassword\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(proxyAuthPassword)+"\"/>\n"
    );
  }
}
/** Process a configuration post.
* Called at the start of the connector's configuration page whenever form data for a connection may have been posted.
* Each known field of the "editconnection" form that is present in the post is copied into the configuration parameters;
* fields absent from the post leave the corresponding parameter untouched.
*@param threadContext is the local thread context.
*@param variableContext is the set of variables available from the post, including binary file post information.
*@param locale is the locale of the request; it is not consulted by this implementation.
*@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
*@return null if all is well, or a string error message if there is an error that should prevent saving of the connection (and cause a redirection to an error page).
* This implementation always returns null.
*/
@Override
public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext,
  Locale locale, ConfigParams parameters)
  throws ManifoldCFException
{
  // Pairs of { posted form field name, configuration parameter name }, handled in this order.
  final String[][] plainFields = {
    {"email", RSSConfig.PARAMETER_EMAIL},
    {"robotsusage", RSSConfig.PARAMETER_ROBOTSUSAGE},
    {"bandwidth", RSSConfig.PARAMETER_BANDWIDTH},
    {"connections", RSSConfig.PARAMETER_MAXOPEN},
    {"fetches", RSSConfig.PARAMETER_MAXFETCHES},
    {"throttlegroup", RSSConfig.PARAMETER_THROTTLEGROUP},
    {"proxyhost", RSSConfig.PARAMETER_PROXYHOST},
    {"proxyport", RSSConfig.PARAMETER_PROXYPORT},
    {"proxyauthdomain", RSSConfig.PARAMETER_PROXYAUTHDOMAIN},
    {"proxyauthusername", RSSConfig.PARAMETER_PROXYAUTHUSERNAME}
  };
  for (String[] field : plainFields)
  {
    final String postedValue = variableContext.getParameter(field[0]);
    if (postedValue == null)
      continue;
    parameters.setParameter(field[1],postedValue);
  }

  // The password field carries a key rather than the password itself; map it back and store it obfuscated.
  final String postedPasswordKey = variableContext.getParameter("proxyauthpassword");
  if (postedPasswordKey != null)
  {
    final String password = variableContext.mapKeyToPassword(postedPasswordKey);
    parameters.setObfuscatedParameter(RSSConfig.PARAMETER_PROXYAUTHPASSWORD,password);
  }
  return null;
}
/** View configuration.
* This method is called in the body section of the connector's view configuration page. Its purpose is to present the connection information to the user.
* The coder can presume that the HTML that is output from this configuration will be within appropriate &lt;html&gt; and &lt;body&gt; tags.
* Every configuration parameter is listed as a name=value line. Parameters whose name ends in "password" (in any letter case)
* are shown masked, and parameters whose name ends in "keystore" are shown as a certificate count rather than their raw value.
*@param threadContext is the local thread context.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale used to look up the displayed labels.
*@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
*/
@Override
public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out,
  Locale locale, ConfigParams parameters)
  throws ManifoldCFException, IOException
{
  final String passwordSuffix = "password";
  final String keystoreSuffix = "keystore";

  out.print(
    "<table class=\"displaytable\">\n"+
    " <tr>\n"+
    " <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.ParametersColon") + "</nobr></td>\n"+
    " <td class=\"value\" colspan=\"3\">\n"
  );
  Iterator iter = parameters.listParameters();
  while (iter.hasNext())
  {
    String param = (String)iter.next();
    String value = parameters.getParameter(param);
    // regionMatches() returns false when the start offset is negative, so names shorter than the suffix simply do not match.
    if (param.regionMatches(true,param.length()-passwordSuffix.length(),passwordSuffix,0,passwordSuffix.length()))
    {
      // Never display password values.
      out.print(
        " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param)+"=********</nobr><br/>\n"
      );
    }
    else if (param.regionMatches(true,param.length()-keystoreSuffix.length(),keystoreSuffix,0,keystoreSuffix.length()))
    {
      IKeystoreManager kmanager = KeystoreManagerFactory.make("",value);
      // The angle brackets are literal text in the page, so they are written as HTML entities.
      out.print(
        " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param)+"=&lt;"+Integer.toString(kmanager.getContents().length)+Messages.getBodyString(locale,"RSSConnector.certificates")+"&gt;</nobr><br/>\n"
      );
    }
    else
    {
      out.print(
        " <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param)+"="+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(value)+"</nobr><br/>\n"
      );
    }
  }
  out.print(
    " </td>\n"+
    " </tr>\n"+
    "</table>\n"
  );
}
/** Output the specification header section.
* This method is called in the head section of a job page which has selected a repository connection of the
* current type.  Its purpose is to add the required tabs to the list, and to output any javascript methods
* that might be needed by the job editing HTML.
* The connector will be connected before this method can be called.
*
* Seven tabs are registered (URLs, Canonicalization, URL Mappings, Exclusions, Time Values, Security and
* Dechromed Content).  All emitted javascript function names and all referenced form field names are
* prefixed with {@code "s"+connectionSequenceNumber+"_"} so that several connections can share one
* "editjob" form without clashing.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale the output is preferred to be in.
*@param ds is the current document specification for this job.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*@param tabsArray is an array of tab names.  Add to this array any tab names that are specific to the connector.
*@throws ManifoldCFException as declared by the ManifoldCF calls made here.
*@throws IOException as declared by the output object's print method.
*/
@Override
public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification ds,
  int connectionSequenceNumber, List<String> tabsArray)
  throws ManifoldCFException, IOException
{
  tabsArray.add(Messages.getString(locale,"RSSConnector.URLs"));
  tabsArray.add(Messages.getString(locale,"RSSConnector.Canonicalization"));
  tabsArray.add(Messages.getString(locale,"RSSConnector.URLMappings"));
  tabsArray.add(Messages.getString(locale,"RSSConnector.Exclusions"));
  tabsArray.add(Messages.getString(locale,"RSSConnector.TimeValues"));
  tabsArray.add(Messages.getString(locale,"RSSConnector.Security"));
  tabsArray.add(Messages.getString(locale,"RSSConnector.DechromedContent"));

  String seqPrefix = "s"+connectionSequenceNumber+"_";

  out.print(
"<script type=\"text/javascript\">\n"+
"<!--\n"+
// Generic helper: store an operation value in the named form field, then post the form.
// Bracket property access is used instead of eval() so the value is assigned verbatim,
// whatever characters it contains.
"function "+seqPrefix+"SpecOp(n, opValue, anchorvalue)\n"+
"{\n"+
" editjob[n].value = opValue;\n"+
" postFormSetAnchor(anchorvalue);\n"+
"}\n"+
"\n"+
// URL mapping operations
"function "+seqPrefix+"AddRegexp(anchorvalue)\n"+
"{\n"+
" if (editjob."+seqPrefix+"rssmatch.value == \"\")\n"+
" {\n"+
" alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.MatchMustHaveARegexpValue")+"\");\n"+
" editjob."+seqPrefix+"rssmatch.focus();\n"+
" return;\n"+
" }\n"+
"\n"+
" "+seqPrefix+"SpecOp(\""+seqPrefix+"rssop\",\"Add\",anchorvalue);\n"+
"}\n"+
"\n"+
"function "+seqPrefix+"RemoveRegexp(index, anchorvalue)\n"+
"{\n"+
" editjob."+seqPrefix+"rssindex.value = index;\n"+
" "+seqPrefix+"SpecOp(\""+seqPrefix+"rssop\",\"Delete\",anchorvalue);\n"+
"}\n"+
"\n"+
// Access token operations
"function "+seqPrefix+"SpecAddToken(anchorvalue)\n"+
"{\n"+
" if (editjob."+seqPrefix+"spectoken.value == \"\")\n"+
" {\n"+
" alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.TypeInAnAccessToken")+"\");\n"+
" editjob."+seqPrefix+"spectoken.focus();\n"+
" return;\n"+
" }\n"+
" "+seqPrefix+"SpecOp(\""+seqPrefix+"accessop\",\"Add\",anchorvalue);\n"+
"}\n"+
"\n"+
// Canonicalization rule operations
"function "+seqPrefix+"URLRegexpDelete(index, anchorvalue)\n"+
"{\n"+
" editjob."+seqPrefix+"urlregexpnumber.value = index;\n"+
" "+seqPrefix+"SpecOp(\""+seqPrefix+"urlregexpop\",\"Delete\",anchorvalue);\n"+
"}\n"+
"\n"+
"function "+seqPrefix+"URLRegexpAdd(anchorvalue)\n"+
"{\n"+
" "+seqPrefix+"SpecOp(\""+seqPrefix+"urlregexpop\",\"Add\",anchorvalue);\n"+
"}\n"+
"\n"+
// Form validation: the first three time values must be integers; the bad-feed refetch
// interval may be left empty but must be an integer when present.
"function "+seqPrefix+"checkSpecification()\n"+
"{\n"+
" if (editjob."+seqPrefix+"feedtimeout.value == \"\" || !isInteger(editjob."+seqPrefix+"feedtimeout.value))\n"+
" {\n"+
" alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.ATimeoutValueInSecondsIsRequired")+"\");\n"+
" editjob."+seqPrefix+"feedtimeout.focus();\n"+
" return false;\n"+
" }\n"+
" if (editjob."+seqPrefix+"feedrefetch.value == \"\" || !isInteger(editjob."+seqPrefix+"feedrefetch.value))\n"+
" {\n"+
" alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.ARefetchIntervalInMinutesIsRequired")+"\");\n"+
" editjob."+seqPrefix+"feedrefetch.focus();\n"+
" return false;\n"+
" }\n"+
" if (editjob."+seqPrefix+"minfeedrefetch.value == \"\" || !isInteger(editjob."+seqPrefix+"minfeedrefetch.value))\n"+
" {\n"+
" alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.AMinimumRefetchIntervalInMinutesIsRequire")+"\");\n"+
" editjob."+seqPrefix+"minfeedrefetch.focus();\n"+
" return false;\n"+
" }\n"+
" if (editjob."+seqPrefix+"badfeedrefetch.value != \"\" && !isInteger(editjob."+seqPrefix+"badfeedrefetch.value))\n"+
" {\n"+
" alert(\""+Messages.getBodyJavascriptString(locale,"RSSConnector.ABadFeedRefetchIntervalInMinutesIsRequired")+"\");\n"+
" editjob."+seqPrefix+"badfeedrefetch.focus();\n"+
" return false;\n"+
" }\n"+
"\n"+
" return true;\n"+
"}\n"+
"\n"+
"//-->\n"+
"</script>\n"
  );
}
/** Output the specification body section.
* This method is called in the body section of a job page which has selected a repository connection of the
* current type.  Its purpose is to present the required form elements for editing.
* The coder can presume that the HTML that is output from this configuration will be within appropriate
* {@code <html>}, {@code <body>}, and {@code <form>} tags.  The name of the form is always "editjob".
* The connector will be connected before this method can be called.
*
* For each of the connector's tabs (URLs, Exclusions, Canonicalization, URL Mappings, Time Values,
* Dechromed Content, Security) the visible editing widgets are emitted only when that tab is the current
* one AND this connection is the selected one; otherwise the same values are emitted as hidden form fields
* under the same names, so that they are still part of the form when it is posted.
* All form field names are prefixed with {@code "s"+connectionSequenceNumber+"_"}.
*
* Timing nodes without a value attribute, and mode nodes without a mode attribute, leave the corresponding
* default in place.  A timing value that is present but not an integer raises NumberFormatException.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale the output is preferred to be in.
*@param ds is the current document specification for this job.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*@param actualSequenceNumber is the connection within the job that has currently been selected.
*@param tabName is the current tab name.  (actualSequenceNumber, tabName) form a unique tuple within
*  the job.
*@throws ManifoldCFException as declared by the ManifoldCF calls made here.
*@throws IOException as declared by the output object's print method.
*/
@Override
public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification ds,
  int connectionSequenceNumber, int actualSequenceNumber, String tabName)
  throws ManifoldCFException, IOException
{
  String seqPrefix = "s"+connectionSequenceNumber+"_";
  boolean isSelectedConnection = (connectionSequenceNumber == actualSequenceNumber);

  int i;
  int k;

  // Build the url seed string, and the url regexp match and map
  StringBuilder sb = new StringBuilder();
  List<String> regexp = new ArrayList<String>();
  List<String> matchStrings = new ArrayList<String>();
  // Defaults used when the specification has no corresponding node
  int feedTimeoutValue = 60;
  int feedRefetchValue = 60;
  int minFeedRefetchValue = 15;
  Integer badFeedRefetchValue = null;
  String exclusions = "";

  // Now, loop through paths
  i = 0;
  while (i < ds.getChildCount())
  {
    SpecificationNode sn = ds.getChild(i++);
    String nodeType = sn.getType();
    if (nodeType.equals(RSSConfig.NODE_FEED))
    {
      String rssURL = sn.getAttributeValue(RSSConfig.ATTR_URL);
      if (rssURL != null)
      {
        sb.append(rssURL).append("\n");
      }
    }
    else if (nodeType.equals(RSSConfig.NODE_EXCLUDES))
    {
      exclusions = sn.getValue();
      if (exclusions == null)
        exclusions = "";
    }
    else if (nodeType.equals(RSSConfig.NODE_MAP))
    {
      String match = sn.getAttributeValue(RSSConfig.ATTR_MATCH);
      String map = sn.getAttributeValue(RSSConfig.ATTR_MAP);
      if (match != null)
      {
        regexp.add(match);
        if (map == null)
          map = "";
        matchStrings.add(map);
      }
    }
    else if (nodeType.equals(RSSConfig.NODE_FEEDTIMEOUT))
    {
      // A node without a value attribute keeps the default rather than failing the whole page
      String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE);
      if (value != null)
        feedTimeoutValue = Integer.parseInt(value);
    }
    else if (nodeType.equals(RSSConfig.NODE_FEEDRESCAN))
    {
      String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE);
      if (value != null)
        feedRefetchValue = Integer.parseInt(value);
    }
    else if (nodeType.equals(RSSConfig.NODE_MINFEEDRESCAN))
    {
      String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE);
      if (value != null)
        minFeedRefetchValue = Integer.parseInt(value);
    }
    else if (nodeType.equals(RSSConfig.NODE_BADFEEDRESCAN))
    {
      String value = sn.getAttributeValue(RSSConfig.ATTR_VALUE);
      if (value != null)
        badFeedRefetchValue = Integer.valueOf(value);
    }
  }

  // URLs tab
  if (tabName.equals(Messages.getString(locale,"RSSConnector.URLs")) && isSelectedConnection)
  {
    out.print(
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"value\" colspan=\"2\">\n"+
" <textarea rows=\"25\" cols=\"80\" name=\""+seqPrefix+"rssurls\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(sb.toString())+"</textarea>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }
  else
  {
    out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"rssurls\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(sb.toString())+"\"/>\n"
    );
  }

  // Exclusions tab
  if (tabName.equals(Messages.getString(locale,"RSSConnector.Exclusions")) && isSelectedConnection)
  {
    out.print(
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.Exclude") + "</nobr></td>\n"+
" <td class=\"value\" colspan=\"1\">\n"+
" <textarea rows=\"25\" cols=\"60\" name=\""+seqPrefix+"exclusions\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(exclusions)+"</textarea>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }
  else
  {
    out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"exclusions\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(exclusions)+"\"/>\n"
    );
  }

  // Canonicalization tab
  if (tabName.equals(Messages.getString(locale,"RSSConnector.Canonicalization")) && isSelectedConnection)
  {
    out.print(
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"boxcell\" colspan=\"2\">\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexpop\" value=\"Continue\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexpnumber\" value=\"\"/>\n"+
" <table class=\"formtable\">\n"+
" <tr class=\"formheaderrow\">\n"+
" <td class=\"formcolumnheader\"></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.URLRegularExpression")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.Description")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.Reorder")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveJSPSessions")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveASPSessions")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemovePHPSessions")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveBVSessions")+"</nobr></td>\n"+
" </tr>\n"
    );
    int childIndex = 0;
    // Number of canonicalization rules emitted so far; doubles as the index of the next rule
    int ruleCount = 0;
    while (childIndex < ds.getChildCount())
    {
      SpecificationNode specNode = ds.getChild(childIndex++);
      if (specNode.getType().equals(RSSConfig.NODE_URLSPEC))
      {
        // Ok, this node matters to us
        String regexpString = specNode.getAttributeValue(RSSConfig.ATTR_REGEXP);
        String description = specNode.getAttributeValue(RSSConfig.ATTR_DESCRIPTION);
        if (description == null)
          description = "";
        String allowReorder = canonicalizationFlag(specNode,RSSConfig.ATTR_REORDER);
        String allowJavaSessionRemoval = canonicalizationFlag(specNode,RSSConfig.ATTR_JAVASESSIONREMOVAL);
        String allowASPSessionRemoval = canonicalizationFlag(specNode,RSSConfig.ATTR_ASPSESSIONREMOVAL);
        String allowPHPSessionRemoval = canonicalizationFlag(specNode,RSSConfig.ATTR_PHPSESSIONREMOVAL);
        String allowBVSessionRemoval = canonicalizationFlag(specNode,RSSConfig.ATTR_BVSESSIONREMOVAL);
        String ruleIndex = Integer.toString(ruleCount);
        out.print(
" <tr class=\""+(((ruleCount % 2)==0)?"evenformrow":"oddformrow")+"\">\n"+
" <td class=\"formcolumncell\">\n"+
" <a name=\""+seqPrefix+"urlregexp_"+ruleIndex+"\">\n"+
" <input type=\"button\" value=\"Delete\" alt=\""+Messages.getAttributeString(locale,"RSSConnector.DeleteUrlRegexp")+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexpString)+"\" onclick='javascript:"+seqPrefix+"URLRegexpDelete("+ruleIndex+",\""+seqPrefix+"urlregexp_"+ruleIndex+"\");'/>\n"+
" </a>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexp_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexpString)+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexpdesc_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(description)+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexpreorder_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowReorder)+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexpjava_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowJavaSessionRemoval)+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexpasp_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowASPSessionRemoval)+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexpphp_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowPHPSessionRemoval)+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexpbv_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowBVSessionRemoval)+"\"/>\n"+
" <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(regexpString)+"</nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(description)+"</td>\n"+
" <td class=\"formcolumncell\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowReorder)+"</td>\n"+
" <td class=\"formcolumncell\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowJavaSessionRemoval)+"</td>\n"+
" <td class=\"formcolumncell\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowASPSessionRemoval)+"</td>\n"+
" <td class=\"formcolumncell\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowPHPSessionRemoval)+"</td>\n"+
" <td class=\"formcolumncell\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowBVSessionRemoval)+"</td>\n"+
" </tr>\n"
        );
        ruleCount++;
      }
    }
    if (ruleCount == 0)
    {
      out.print(
" <tr class=\"formrow\"><td colspan=\"8\" class=\"formcolumnmessage\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.NoCanonicalizationSpecified")+"</nobr></td></tr>\n"
      );
    }
    // The "Add" row: its anchor is the slot after the last rule, and after a successful add the
    // page is positioned on the slot following the newly added rule.
    out.print(
" <tr class=\"formrow\"><td colspan=\"8\" class=\"formseparator\"><hr/></td></tr>\n"+
" <tr class=\"formrow\">\n"+
" <td class=\"formcolumncell\">\n"+
" <a name=\""+seqPrefix+"urlregexp_"+Integer.toString(ruleCount)+"\">\n"+
" <input type=\"button\" value=\"Add\" alt=\""+Messages.getAttributeString(locale,"RSSConnector.AddUlRegexp")+"\" onclick='javascript:"+seqPrefix+"URLRegexpAdd(\""+seqPrefix+"urlregexp_"+Integer.toString(ruleCount+1)+"\");'/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"urlregexpcount\" value=\""+Integer.toString(ruleCount)+"\"/>\n"+
" </a>\n"+
" </td>\n"+
" <td class=\"formcolumncell\"><input type=\"text\" name=\""+seqPrefix+"urlregexp\" size=\"30\" value=\"\"/></td>\n"+
" <td class=\"formcolumncell\"><input type=\"text\" name=\""+seqPrefix+"urlregexpdesc\" size=\"30\" value=\"\"/></td>\n"+
" <td class=\"formcolumncell\"><input type=\"checkbox\" name=\""+seqPrefix+"urlregexpreorder\" value=\"yes\"/></td>\n"+
" <td class=\"formcolumncell\"><input type=\"checkbox\" name=\""+seqPrefix+"urlregexpjava\" value=\"yes\" checked=\"true\"/></td>\n"+
" <td class=\"formcolumncell\"><input type=\"checkbox\" name=\""+seqPrefix+"urlregexpasp\" value=\"yes\" checked=\"true\"/></td>\n"+
" <td class=\"formcolumncell\"><input type=\"checkbox\" name=\""+seqPrefix+"urlregexpphp\" value=\"yes\" checked=\"true\"/></td>\n"+
" <td class=\"formcolumncell\"><input type=\"checkbox\" name=\""+seqPrefix+"urlregexpbv\" value=\"yes\" checked=\"true\"/></td>\n"+
" </tr>\n"+
" </table>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }
  else
  {
    // Post the canonicalization specification
    int childIndex = 0;
    int ruleCount = 0;
    while (childIndex < ds.getChildCount())
    {
      SpecificationNode specNode = ds.getChild(childIndex++);
      if (specNode.getType().equals(RSSConfig.NODE_URLSPEC))
      {
        // Ok, this node matters to us
        String regexpString = specNode.getAttributeValue(RSSConfig.ATTR_REGEXP);
        String description = specNode.getAttributeValue(RSSConfig.ATTR_DESCRIPTION);
        if (description == null)
          description = "";
        String allowReorder = canonicalizationFlag(specNode,RSSConfig.ATTR_REORDER);
        String allowJavaSessionRemoval = canonicalizationFlag(specNode,RSSConfig.ATTR_JAVASESSIONREMOVAL);
        String allowASPSessionRemoval = canonicalizationFlag(specNode,RSSConfig.ATTR_ASPSESSIONREMOVAL);
        String allowPHPSessionRemoval = canonicalizationFlag(specNode,RSSConfig.ATTR_PHPSESSIONREMOVAL);
        String allowBVSessionRemoval = canonicalizationFlag(specNode,RSSConfig.ATTR_BVSESSIONREMOVAL);
        String ruleIndex = Integer.toString(ruleCount);
        out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"urlregexp_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexpString)+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"urlregexpdesc_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(description)+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"urlregexpreorder_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowReorder)+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"urlregexpjava_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowJavaSessionRemoval)+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"urlregexpasp_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowASPSessionRemoval)+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"urlregexpphp_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowPHPSessionRemoval)+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"urlregexpbv_"+ruleIndex+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(allowBVSessionRemoval)+"\"/>\n"
        );
        ruleCount++;
      }
    }
    out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"urlregexpcount\" value=\""+Integer.toString(ruleCount)+"\"/>\n"
    );
  }

  // Mappings tab
  if (tabName.equals(Messages.getString(locale,"RSSConnector.URLMappings")) && isSelectedConnection)
  {
    out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"rssop\" value=\"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"rssindex\" value=\"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"rssmapcount\" value=\""+Integer.toString(regexp.size())+"\"/>\n"+
"\n"+
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"4\"><hr/></td></tr>\n"
    );
    i = 0;
    while (i < regexp.size())
    {
      String prefix = seqPrefix+"rssregexp_"+Integer.toString(i)+"_";
      String matchRegexp = regexp.get(i);
      out.print(
" <tr>\n"+
" <td class=\"value\">\n"+
" <a name=\""+seqPrefix+"regexp_"+Integer.toString(i)+"\">\n"+
" <input type=\"button\" value=\""+Messages.getAttributeString(locale,"RSSConnector.Remove")+"\" onclick='javascript:"+seqPrefix+"RemoveRegexp("+Integer.toString(i)+",\""+seqPrefix+"regexp_"+Integer.toString(i)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.RemoveRegexp")+Integer.toString(i)+"\"/>\n"+
" </a>\n"+
" </td>\n"+
" <td class=\"value\"><input type=\"hidden\" name=\""+prefix+"match"+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(matchRegexp)+"\"/>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(matchRegexp)+"</td>\n"+
" <td class=\"value\">--&gt;</td>\n"+
" <td class=\"value\">\n"
      );
      String match = matchStrings.get(i);
      out.print(
" <input type=\"hidden\" name=\""+prefix+"map"+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(match)+"\"/>\n"
      );
      if (match.length() == 0)
      {
        // An empty mapping means the URL is kept unchanged.  The placeholder is body text, so its
        // angle brackets must be entities or the browser would parse it as an (invisible) element.
        out.print(
" &lt;as is&gt;\n"
        );
      }
      else
      {
        out.print(
" "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(match)+"\n"
        );
      }
      out.print(
" </td>\n"+
" </tr>\n"
      );
      i++;
    }
    out.print(
" <tr>\n"+
" <td class=\"value\"><a name=\""+seqPrefix+"regexp_"+Integer.toString(i)+"\"><input type=\"button\" value=\""+Messages.getAttributeString(locale,"RSSConnector.Add")+"\" onclick='javascript:"+seqPrefix+"AddRegexp(\""+seqPrefix+"regexp_"+Integer.toString(i+1)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.AddRegexp")+"\"/></a></td>\n"+
" <td class=\"value\"><input type=\"text\" name=\""+seqPrefix+"rssmatch\" size=\"16\" value=\"\"/></td>\n"+
" <td class=\"value\">--&gt;</td>\n"+
" <td class=\"value\"><input type=\"text\" name=\""+seqPrefix+"rssmap\" size=\"16\" value=\"\"/></td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }
  else
  {
    out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"rssmapcount\" value=\""+Integer.toString(regexp.size())+"\"/>\n"
    );
    i = 0;
    while (i < regexp.size())
    {
      String prefix = seqPrefix+"rssregexp_"+Integer.toString(i)+"_";
      String match = matchStrings.get(i);
      out.print(
"<input type=\"hidden\" name=\""+prefix+"match"+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(regexp.get(i))+"\"/>\n"+
"<input type=\"hidden\" name=\""+prefix+"map"+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(match)+"\"/>\n"
      );
      i++;
    }
  }

  // Timeout Value tab
  if (tabName.equals(Messages.getString(locale,"RSSConnector.TimeValues")) && isSelectedConnection)
  {
    out.print(
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.FeedConnectTimeout")+"</nobr></td>\n"+
" <td class=\"value\"><input type=\"text\" size=\"5\" name=\""+seqPrefix+"feedtimeout\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedTimeoutValue))+"\"/></td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.DefaultFeedRefetchTime")+"</nobr></td>\n"+
" <td class=\"value\"><input type=\"text\" size=\"5\" name=\""+seqPrefix+"feedrefetch\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedRefetchValue))+"\"/></td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.MinimumFeedRefetchTime")+"</nobr></td>\n"+
" <td class=\"value\"><input type=\"text\" size=\"5\" name=\""+seqPrefix+"minfeedrefetch\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(minFeedRefetchValue))+"\"/></td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.BadFeedRefetchTime")+"</nobr></td>\n"+
" <td class=\"value\">\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"badfeedrefetch_present\" value=\"true\"/>\n"+
" <input type=\"text\" size=\"5\" name=\""+seqPrefix+"badfeedrefetch\" value=\""+((badFeedRefetchValue==null)?"":org.apache.manifoldcf.ui.util.Encoder.attributeEscape(badFeedRefetchValue.toString()))+"\"/>\n"+
" </td>\n"+
" </tr>\n"+
"\n"+
"</table>\n"
    );
  }
  else
  {
    out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"feedtimeout\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedTimeoutValue))+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"feedrefetch\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(feedRefetchValue))+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"minfeedrefetch\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(Integer.toString(minFeedRefetchValue))+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"badfeedrefetch_present\" value=\"true\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"badfeedrefetch\" value=\""+((badFeedRefetchValue==null)?"":org.apache.manifoldcf.ui.util.Encoder.attributeEscape(badFeedRefetchValue.toString()))+"\"/>\n"
    );
  }

  // Dechromed content tab
  String dechromedMode = RSSConfig.VALUE_NONE;
  String chromedMode = RSSConfig.VALUE_USE;
  i = 0;
  while (i < ds.getChildCount())
  {
    SpecificationNode sn = ds.getChild(i++);
    if (sn.getType().equals(RSSConfig.NODE_DECHROMEDMODE))
    {
      // Keep the default when the node carries no mode attribute; the equals() calls below
      // would otherwise be made on null.
      String mode = sn.getAttributeValue(RSSConfig.ATTR_MODE);
      if (mode != null)
        dechromedMode = mode;
    }
    else if (sn.getType().equals(RSSConfig.NODE_CHROMEDMODE))
    {
      String mode = sn.getAttributeValue(RSSConfig.ATTR_MODE);
      if (mode != null)
        chromedMode = mode;
    }
  }
  if (tabName.equals(Messages.getString(locale,"RSSConnector.DechromedContent")) && isSelectedConnection)
  {
    out.print(
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"1\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"value\"><nobr><input type=\"radio\" name=\""+seqPrefix+"dechromedmode\" value=\"none\" "+(dechromedMode.equals(RSSConfig.VALUE_NONE)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.NoDechromedContent")+"</nobr></td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"value\"><nobr><input type=\"radio\" name=\""+seqPrefix+"dechromedmode\" value=\"description\" "+(dechromedMode.equals(RSSConfig.VALUE_DESCRIPTION)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.DechromedContentIfPresentInDescriptionField")+"</nobr></td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"value\"><nobr><input type=\"radio\" name=\""+seqPrefix+"dechromedmode\" value=\"content\" "+(dechromedMode.equals(RSSConfig.VALUE_CONTENT)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.DechromedContentIfPresentInContentField")+"</nobr></td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"separator\"><hr/></td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"value\"><nobr><input type=\"radio\" name=\""+seqPrefix+"chromedmode\" value=\"use\" "+(chromedMode.equals(RSSConfig.VALUE_USE)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.UseChromedContentIfNoDechromedContentFound")+"</nobr></td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"value\"><nobr><input type=\"radio\" name=\""+seqPrefix+"chromedmode\" value=\"skip\" "+(chromedMode.equals(RSSConfig.VALUE_SKIP)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.NeverUseChromedContent")+"</nobr></td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"value\"><nobr><input type=\"radio\" name=\""+seqPrefix+"chromedmode\" value=\"metadata\" "+(chromedMode.equals(RSSConfig.VALUE_METADATA)?"checked=\"true\"":"")+"/>"+Messages.getBodyString(locale,"RSSConnector.NoContentMetadataOnly")+"</nobr></td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }
  else
  {
    out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"dechromedmode\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(dechromedMode)+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"chromedmode\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(chromedMode)+"\"/>\n"
    );
  }

  // Security tab
  // There is no native security, so all we care about are the tokens.
  if (tabName.equals(Messages.getString(locale,"RSSConnector.Security")) && isSelectedConnection)
  {
    out.print(
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
    );
    // Go through forced ACL
    i = 0;
    k = 0;
    while (i < ds.getChildCount())
    {
      SpecificationNode sn = ds.getChild(i++);
      if (sn.getType().equals(RSSConfig.NODE_ACCESS))
      {
        String accessDescription = "_"+Integer.toString(k);
        String accessOpName = seqPrefix+"accessop"+accessDescription;
        String token = sn.getAttributeValue(RSSConfig.ATTR_TOKEN);
        out.print(
" <tr>\n"+
" <td class=\"description\">\n"+
" <input type=\"hidden\" name=\""+accessOpName+"\" value=\"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"spectoken"+accessDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(token)+"\"/>\n"+
" <a name=\""+seqPrefix+"token_"+Integer.toString(k)+"\">\n"+
" <input type=\"button\" value=\"Delete\" onClick='Javascript:"+seqPrefix+"SpecOp(\""+accessOpName+"\",\"Delete\",\""+seqPrefix+"token_"+Integer.toString(k)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.DeleteToken")+Integer.toString(k)+"\"/>\n"+
" </a> \n"+
" </td>\n"+
" <td class=\"value\">\n"+
" "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(token)+"\n"+
" </td>\n"+
" </tr>\n"
        );
        k++;
      }
    }
    if (k == 0)
    {
      out.print(
" <tr>\n"+
" <td class=\"message\" colspan=\"2\">" + Messages.getBodyString(locale,"RSSConnector.NoAccessTokensPresent") + "</td>\n"+
" </tr>\n"
      );
    }
    out.print(
" <tr><td class=\"lightseparator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"description\">\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"tokencount\" value=\""+Integer.toString(k)+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"accessop\" value=\"\"/>\n"+
" <a name=\""+seqPrefix+"token_"+Integer.toString(k)+"\">\n"+
" <input type=\"button\" value=\"Add\" onClick='Javascript:"+seqPrefix+"SpecAddToken(\""+seqPrefix+"token_"+Integer.toString(k+1)+"\")' alt=\""+Messages.getAttributeString(locale,"RSSConnector.AddAccessToken")+"\"/>\n"+
" </a> \n"+
" </td>\n"+
" <td class=\"value\">\n"+
" <input type=\"text\" size=\"30\" name=\""+seqPrefix+"spectoken\" value=\"\"/>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }
  else
  {
    // Finally, go through forced ACL
    i = 0;
    k = 0;
    while (i < ds.getChildCount())
    {
      SpecificationNode sn = ds.getChild(i++);
      if (sn.getType().equals(RSSConfig.NODE_ACCESS))
      {
        String accessDescription = "_"+Integer.toString(k);
        String token = sn.getAttributeValue(RSSConfig.ATTR_TOKEN);
        out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"spectoken"+accessDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(token)+"\"/>\n"
        );
        k++;
      }
    }
    out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"tokencount\" value=\""+Integer.toString(k)+"\"/>\n"
    );
  }
}

/** Read a yes/no canonicalization attribute from a specification node.
*@param node is the specification node to read from.
*@param attributeName is the name of the attribute to read.
*@return the attribute's value, or RSSConfig.VALUE_NO when the attribute is missing or empty.
*/
private static String canonicalizationFlag(SpecificationNode node, String attributeName)
{
  String flag = node.getAttributeValue(attributeName);
  if (flag == null || flag.length() == 0)
    return RSSConfig.VALUE_NO;
  return flag;
}
/** Process a specification post.
* This method is called at the start of job's edit or view page, whenever there is a possibility that form
* data for a connection has been posted. Its purpose is to gather form information and modify the
* document specification accordingly. The name of the posted form is always "editjob".
* The connector will be connected before this method can be called.
* Each section of the form is handled independently: a section is only rewritten in the specification
* when its controlling form parameter is present in the post.
*@param variableContext contains the post data, including binary file-upload information.
*@param locale is the locale the output is preferred to be in.
*@param ds is the current document specification for this job.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*@return null if all is well, or a string error message if there is an error that should prevent saving of
* the job (and cause a redirection to an error page).
*/
@Override
public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification ds,
  int connectionSequenceNumber)
  throws ManifoldCFException
{
  String seqPrefix = "s"+connectionSequenceNumber+"_";

  // URL mappings: rebuild the whole list of map nodes from the posted rows
  String value = variableContext.getParameter(seqPrefix+"rssmapcount");
  if (value != null)
  {
    int mapsize = Integer.parseInt(value);
    // Clear it first
    removeSpecNodesOfType(ds,RSSConfig.NODE_MAP);
    // Grab the map values
    for (int j = 0; j < mapsize; j++)
    {
      String prefix = seqPrefix+"rssregexp_"+Integer.toString(j)+"_";
      String match = variableContext.getParameter(prefix+"match");
      String map = variableContext.getParameter(prefix+"map");
      if (map == null)
        map = "";
      // Add to the document specification
      SpecificationNode node = new SpecificationNode(RSSConfig.NODE_MAP);
      node.setAttribute(RSSConfig.ATTR_MATCH,match);
      node.setAttribute(RSSConfig.ATTR_MAP,map);
      ds.addChild(ds.getChildCount(),node);
    }
  }

  // Feed URLs: one URL per non-empty line of the posted text
  String rssURLSequence = variableContext.getParameter(seqPrefix+"rssurls");
  if (rssURLSequence != null)
  {
    // Delete all url specs first
    removeSpecNodesOfType(ds,RSSConfig.NODE_FEED);
    try
    {
      // Closing the BufferedReader also closes the StringReader it wraps
      java.io.BufferedReader is = new java.io.BufferedReader(new java.io.StringReader(rssURLSequence));
      try
      {
        String nextString;
        while ((nextString = is.readLine()) != null)
        {
          if (nextString.length() == 0)
            continue;
          SpecificationNode node = new SpecificationNode(RSSConfig.NODE_FEED);
          node.setAttribute(RSSConfig.ATTR_URL,nextString);
          ds.addChild(ds.getChildCount(),node);
        }
      }
      finally
      {
        is.close();
      }
    }
    catch (java.io.IOException e)
    {
      throw new ManifoldCFException("IO error: "+e.getMessage(),e);
    }
  }

  // Read the url specs (canonicalization rules)
  String urlRegexpCount = variableContext.getParameter(seqPrefix+"urlregexpcount");
  if (hasPostedValue(urlRegexpCount))
  {
    int regexpCount = Integer.parseInt(urlRegexpCount);
    removeSpecNodesOfType(ds,RSSConfig.NODE_URLSPEC);
    // Grab the operation and the index (if any)
    String operation = variableContext.getParameter(seqPrefix+"urlregexpop");
    if (operation == null)
      operation = "Continue";
    // opIndex stays -1 (matching no row) unless a delete was requested
    int opIndex = -1;
    if (operation.equals("Delete"))
      opIndex = Integer.parseInt(variableContext.getParameter(seqPrefix+"urlregexpnumber"));
    // Reconstruct urlspec nodes, skipping the row being deleted (if any)
    for (int j = 0; j < regexpCount; j++)
    {
      if (j == opIndex)
        continue;
      ds.addChild(ds.getChildCount(),readUrlSpecNode(variableContext,seqPrefix,"_"+Integer.toString(j)));
    }
    if (operation.equals("Add"))
    {
      // Add a new node at the end, built from the un-indexed "new row" fields
      ds.addChild(ds.getChildCount(),readUrlSpecNode(variableContext,seqPrefix,""));
    }
  }

  // Get the exclusions
  String exclusions = variableContext.getParameter(seqPrefix+"exclusions");
  if (exclusions != null)
  {
    // Delete existing exclusions record first
    removeSpecNodesOfType(ds,RSSConfig.NODE_EXCLUDES);
    SpecificationNode cn = new SpecificationNode(RSSConfig.NODE_EXCLUDES);
    cn.setValue(exclusions);
    ds.addChild(ds.getChildCount(),cn);
  }

  // Read the feed timeout, if present
  replaceSingleAttributeNode(ds,RSSConfig.NODE_FEEDTIMEOUT,RSSConfig.ATTR_VALUE,
    variableContext.getParameter(seqPrefix+"feedtimeout"));
  // Read the feed refetch interval, if present
  replaceSingleAttributeNode(ds,RSSConfig.NODE_FEEDRESCAN,RSSConfig.ATTR_VALUE,
    variableContext.getParameter(seqPrefix+"feedrefetch"));
  // Read the minimum feed refetch interval, if present
  replaceSingleAttributeNode(ds,RSSConfig.NODE_MINFEEDRESCAN,RSSConfig.ATTR_VALUE,
    variableContext.getParameter(seqPrefix+"minfeedrefetch"));

  // Read the bad feed refetch interval (which is allowed to be null).  The "_present" marker tells us
  // the field was on the form, so an empty value means "remove the setting" rather than "not posted".
  String badFeedRefetchValuePresent = variableContext.getParameter(seqPrefix+"badfeedrefetch_present");
  if (hasPostedValue(badFeedRefetchValuePresent))
  {
    String badFeedRefetchValue = variableContext.getParameter(seqPrefix+"badfeedrefetch");
    removeSpecNodesOfType(ds,RSSConfig.NODE_BADFEEDRESCAN);
    if (hasPostedValue(badFeedRefetchValue))
    {
      SpecificationNode node = new SpecificationNode(RSSConfig.NODE_BADFEEDRESCAN);
      node.setAttribute(RSSConfig.ATTR_VALUE,badFeedRefetchValue);
      ds.addChild(ds.getChildCount(),node);
    }
  }

  // Read the dechromed mode
  replaceSingleAttributeNode(ds,RSSConfig.NODE_DECHROMEDMODE,RSSConfig.ATTR_MODE,
    variableContext.getParameter(seqPrefix+"dechromedmode"));
  // Read the chromed mode
  replaceSingleAttributeNode(ds,RSSConfig.NODE_CHROMEDMODE,RSSConfig.ATTR_MODE,
    variableContext.getParameter(seqPrefix+"chromedmode"));

  // Now, do whatever mapping action we were told to do.
  String rssop = variableContext.getParameter(seqPrefix+"rssop");
  if (rssop != null && rssop.equals("Add"))
  {
    // Add a match to the end
    String match = variableContext.getParameter(seqPrefix+"rssmatch");
    String map = variableContext.getParameter(seqPrefix+"rssmap");
    SpecificationNode node = new SpecificationNode(RSSConfig.NODE_MAP);
    node.setAttribute(RSSConfig.ATTR_MATCH,match);
    node.setAttribute(RSSConfig.ATTR_MAP,map);
    ds.addChild(ds.getChildCount(),node);
  }
  else if (rssop != null && rssop.equals("Delete"))
  {
    // "rssindex" counts map nodes only, so walk the children and count down over map nodes
    int index = Integer.parseInt(variableContext.getParameter(seqPrefix+"rssindex"));
    for (int j = 0; j < ds.getChildCount(); j++)
    {
      SpecificationNode sn = ds.getChild(j);
      if (sn.getType().equals(RSSConfig.NODE_MAP))
      {
        if (index == 0)
        {
          ds.removeChild(j);
          break;
        }
        index--;
      }
    }
  }

  // Access tokens
  String tokenCount = variableContext.getParameter(seqPrefix+"tokencount");
  if (tokenCount != null)
  {
    // Delete all tokens first
    removeSpecNodesOfType(ds,RSSConfig.NODE_ACCESS);
    int accessCount = Integer.parseInt(tokenCount);
    for (int i = 0; i < accessCount; i++)
    {
      String accessDescription = "_"+Integer.toString(i);
      String rowOp = variableContext.getParameter(seqPrefix+"accessop"+accessDescription);
      if (rowOp != null && rowOp.equals("Delete"))
      {
        // This row is being deleted; do not re-add it
        continue;
      }
      // Get the stuff we need
      String accessSpec = variableContext.getParameter(seqPrefix+"spectoken"+accessDescription);
      SpecificationNode node = new SpecificationNode(RSSConfig.NODE_ACCESS);
      node.setAttribute(RSSConfig.ATTR_TOKEN,accessSpec);
      ds.addChild(ds.getChildCount(),node);
    }
    String op = variableContext.getParameter(seqPrefix+"accessop");
    if (op != null && op.equals("Add"))
    {
      String accessspec = variableContext.getParameter(seqPrefix+"spectoken");
      SpecificationNode node = new SpecificationNode(RSSConfig.NODE_ACCESS);
      node.setAttribute(RSSConfig.ATTR_TOKEN,accessspec);
      ds.addChild(ds.getChildCount(),node);
    }
  }
  return null;
}

/** Return true if a posted form value is present and non-empty. */
private static boolean hasPostedValue(String value)
{
  return value != null && value.length() > 0;
}

/** Remove every top-level child of the specification that has the given node type. */
private static void removeSpecNodesOfType(Specification ds, String nodeType)
  throws ManifoldCFException
{
  int i = 0;
  while (i < ds.getChildCount())
  {
    // Only advance when nothing was removed, since removal shifts later children down
    if (ds.getChild(i).getType().equals(nodeType))
      ds.removeChild(i);
    else
      i++;
  }
}

/** If the posted value is non-empty, replace all nodes of the given type with one node carrying it as an attribute. */
private static void replaceSingleAttributeNode(Specification ds, String nodeType, String attributeName, String postedValue)
  throws ManifoldCFException
{
  if (!hasPostedValue(postedValue))
    return;
  removeSpecNodesOfType(ds,nodeType);
  SpecificationNode node = new SpecificationNode(nodeType);
  node.setAttribute(attributeName,postedValue);
  ds.addChild(ds.getChildCount(),node);
}

/** Set an attribute on a node only when the posted value is non-empty. */
private static void setAttributeIfPosted(SpecificationNode node, String attributeName, String postedValue)
{
  if (hasPostedValue(postedValue))
    node.setAttribute(attributeName,postedValue);
}

/** Build a urlspec node from the posted "urlregexp*" fields whose names end with the given suffix ("" for the new-row fields). */
private static SpecificationNode readUrlSpecNode(IPostParameters variableContext, String seqPrefix, String suffix)
  throws ManifoldCFException
{
  String regexp = variableContext.getParameter(seqPrefix+"urlregexp"+suffix);
  String regexpDescription = variableContext.getParameter(seqPrefix+"urlregexpdesc"+suffix);
  String reorder = variableContext.getParameter(seqPrefix+"urlregexpreorder"+suffix);
  String javaSession = variableContext.getParameter(seqPrefix+"urlregexpjava"+suffix);
  String aspSession = variableContext.getParameter(seqPrefix+"urlregexpasp"+suffix);
  String phpSession = variableContext.getParameter(seqPrefix+"urlregexpphp"+suffix);
  String bvSession = variableContext.getParameter(seqPrefix+"urlregexpbv"+suffix);
  SpecificationNode newSn = new SpecificationNode(RSSConfig.NODE_URLSPEC);
  newSn.setAttribute(RSSConfig.ATTR_REGEXP,regexp);
  // The description is stored under ATTR_DESCRIPTION, which is the attribute viewSpecification reads back
  setAttributeIfPosted(newSn,RSSConfig.ATTR_DESCRIPTION,regexpDescription);
  setAttributeIfPosted(newSn,RSSConfig.ATTR_REORDER,reorder);
  setAttributeIfPosted(newSn,RSSConfig.ATTR_JAVASESSIONREMOVAL,javaSession);
  setAttributeIfPosted(newSn,RSSConfig.ATTR_ASPSESSIONREMOVAL,aspSession);
  setAttributeIfPosted(newSn,RSSConfig.ATTR_PHPSESSIONREMOVAL,phpSession);
  setAttributeIfPosted(newSn,RSSConfig.ATTR_BVSESSIONREMOVAL,bvSession);
  return newSn;
}
/** View specification.
* This method is called in the body section of a job's view page. Its purpose is to present the document
* specification information to the user. The coder can presume that the HTML that is output from
* this configuration will be within appropriate <html> and <body> tags.
* The connector will be connected before this method can be called.
* The output is a single display table with one section each for: feed URLs, URL canonicalization rules,
* URL mappings, exclusions, timing and content-mode settings, and access tokens.
* Every value taken from the specification is HTML-escaped before being written.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale the output is preferred to be in.
*@param ds is the current document specification for this job.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*/
@Override
public void viewSpecification(IHTTPOutput out, Locale locale, Specification ds,
  int connectionSequenceNumber)
  throws ManifoldCFException, IOException
{
  String exclusions = "";
  out.print(
"<table class=\"displaytable\">\n"
  );

  // Section: feed URLs.  The exclusions node is picked up in the same pass and displayed later.
  int i = 0;
  boolean seenAny = false;
  while (i < ds.getChildCount())
  {
    SpecificationNode sn = ds.getChild(i++);
    if (sn.getType().equals(RSSConfig.NODE_FEED))
    {
      if (seenAny == false)
      {
        out.print(
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RSSUrls")+"</nobr></td>\n"+
" <td class=\"value\">\n"
        );
        seenAny = true;
      }
      out.print(
" <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(sn.getAttributeValue(RSSConfig.ATTR_URL))+"</nobr><br/>\n"
      );
    }
    else if (sn.getType().equals(RSSConfig.NODE_EXCLUDES))
    {
      exclusions = sn.getValue();
      if (exclusions == null)
        exclusions = "";
    }
  }
  if (seenAny)
  {
    out.print(
" </td>\n"+
" </tr>\n"
    );
  }
  else
  {
    out.print(
" <tr><td class=\"message\" colspan=\"2\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.NoRSSUrlsSpecified")+"</nobr></td></tr>\n"
    );
  }
  out.print(
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
  );

  // Section: URL canonicalization rules.  l counts the rows written so far.
  i = 0;
  int l = 0;
  while (i < ds.getChildCount())
  {
    SpecificationNode sn = ds.getChild(i++);
    if (sn.getType().equals(RSSConfig.NODE_URLSPEC))
    {
      if (l == 0)
      {
        // First rule seen: open the nested table and write its header row
        out.print(
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.URLCanonicalization")+"</nobr></td>\n"+
" <td class=\"value\">\n"+
" <table class=\"formtable\">\n"+
" <tr class=\"formheaderrow\">\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.URLRegexp")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.Description")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.Reorder")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveJSPSessions")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveASPSessions")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemovePHPSessions")+"</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.RemoveBVSessions")+"</nobr></td>\n"+
" </tr>\n"
        );
      }
      String regexpString = sn.getAttributeValue(RSSConfig.ATTR_REGEXP);
      String description = sn.getAttributeValue(RSSConfig.ATTR_DESCRIPTION);
      if (description == null)
        description = "";
      // Missing or empty flags are displayed as VALUE_NO
      String allowReorder = sn.getAttributeValue(RSSConfig.ATTR_REORDER);
      if (allowReorder == null || allowReorder.length() == 0)
        allowReorder = RSSConfig.VALUE_NO;
      String allowJavaSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL);
      if (allowJavaSessionRemoval == null || allowJavaSessionRemoval.length() == 0)
        allowJavaSessionRemoval = RSSConfig.VALUE_NO;
      String allowASPSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL);
      if (allowASPSessionRemoval == null || allowASPSessionRemoval.length() == 0)
        allowASPSessionRemoval = RSSConfig.VALUE_NO;
      String allowPHPSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL);
      if (allowPHPSessionRemoval == null || allowPHPSessionRemoval.length() == 0)
        allowPHPSessionRemoval = RSSConfig.VALUE_NO;
      String allowBVSessionRemoval = sn.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL);
      if (allowBVSessionRemoval == null || allowBVSessionRemoval.length() == 0)
        allowBVSessionRemoval = RSSConfig.VALUE_NO;
      // The flag values are stored as posted, so they are escaped like every other specification value
      out.print(
" <tr class=\""+(((l % 2)==0)?"evenformrow":"oddformrow")+"\">\n"+
" <td class=\"formcolumncell\"><nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(regexpString)+"</nobr></td>\n"+
" <td class=\"formcolumncell\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(description)+"</td>\n"+
" <td class=\"formcolumncell\"><nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowReorder)+"</nobr></td>\n"+
" <td class=\"formcolumncell\"><nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowJavaSessionRemoval)+"</nobr></td>\n"+
" <td class=\"formcolumncell\"><nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowASPSessionRemoval)+"</nobr></td>\n"+
" <td class=\"formcolumncell\"><nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowPHPSessionRemoval)+"</nobr></td>\n"+
" <td class=\"formcolumncell\"><nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(allowBVSessionRemoval)+"</nobr></td>\n"+
" </tr>\n"
      );
      l++;
    }
  }
  if (l > 0)
  {
    out.print(
" </table>\n"+
" </td>\n"+
" </tr>\n"
    );
  }
  else
  {
    out.print(
" <tr><td class=\"message\" colspan=\"2\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.NoCanonicalizationSpecified")+"</nobr></td></tr>\n"
    );
  }
  out.print(
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
  );

  // Section: URL mappings
  i = 0;
  seenAny = false;
  while (i < ds.getChildCount())
  {
    SpecificationNode sn = ds.getChild(i++);
    if (sn.getType().equals(RSSConfig.NODE_MAP))
    {
      if (seenAny == false)
      {
        out.print(
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.URLMappingsColon")+"</nobr></td>\n"+
" <td class=\"value\">\n"
        );
        seenAny = true;
      }
      String match = sn.getAttributeValue(RSSConfig.ATTR_MATCH);
      String map = sn.getAttributeValue(RSSConfig.ATTR_MAP);
      out.print(
" <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(match)+"</nobr>\n"
      );
      // The mapping target is only shown when one is present
      if (map != null && map.length() > 0)
      {
        out.print(
" --> <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(map)+"</nobr>\n"
        );
      }
      out.print(
" <br/>\n"
      );
    }
  }
  if (seenAny)
  {
    out.print(
" </td>\n"+
" </tr>\n"
    );
  }
  else
  {
    out.print(
" <tr><td class=\"message\" colspan=\"2\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.NoMappingsSpecifiedWillAcceptAllUrls")+"</nobr></td></tr>\n"
    );
  }

  // Section: exclusions, one per non-empty line of the stored text
  out.print(
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.Exclude") + "</nobr></td>\n"+
" <td class=\"value\">\n"
  );
  try
  {
    // Closing the BufferedReader also closes the StringReader it wraps
    java.io.BufferedReader is = new java.io.BufferedReader(new java.io.StringReader(exclusions));
    try
    {
      while (true)
      {
        String nextString = is.readLine();
        if (nextString == null)
          break;
        if (nextString.length() == 0)
          continue;
        out.print(
" <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(nextString)+"</nobr><br/>\n"
        );
      }
    }
    finally
    {
      is.close();
    }
  }
  catch (java.io.IOException e)
  {
    throw new ManifoldCFException("IO error: "+e.getMessage(),e);
  }
  out.print(
" </td>\n"+
" </tr>\n"
  );
  out.print(
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
  );

  // Section: timing and content-mode settings.  These are the values shown when the
  // specification has no corresponding node.
  String feedTimeoutValue = "60";
  String feedRefetchValue = "60";
  String minFeedRefetchValue = "15";
  String badFeedRefetchValue = null;
  String dechromedMode = RSSConfig.VALUE_NONE;
  String chromedMode = RSSConfig.VALUE_USE;
  i = 0;
  while (i < ds.getChildCount())
  {
    SpecificationNode sn = ds.getChild(i++);
    if (sn.getType().equals(RSSConfig.NODE_FEEDTIMEOUT))
    {
      feedTimeoutValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE);
    }
    else if (sn.getType().equals(RSSConfig.NODE_FEEDRESCAN))
    {
      feedRefetchValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE);
    }
    else if (sn.getType().equals(RSSConfig.NODE_MINFEEDRESCAN))
    {
      minFeedRefetchValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE);
    }
    else if (sn.getType().equals(RSSConfig.NODE_BADFEEDRESCAN))
    {
      badFeedRefetchValue = sn.getAttributeValue(RSSConfig.ATTR_VALUE);
    }
    else if (sn.getType().equals(RSSConfig.NODE_DECHROMEDMODE))
    {
      dechromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE);
    }
    else if (sn.getType().equals(RSSConfig.NODE_CHROMEDMODE))
    {
      chromedMode = sn.getAttributeValue(RSSConfig.ATTR_MODE);
    }
  }
  out.print(
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.FeedConnectionTimeout")+"</nobr></td>\n"+
" <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(feedTimeoutValue)+"</td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.DefaultFeedRescanInterval")+"</nobr></td>\n"+
" <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(feedRefetchValue)+"</td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.MinimumFeedRescanInterval")+"</nobr></td>\n"+
" <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(minFeedRefetchValue)+"</td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.BadFeedRescanInterval")+"</nobr></td>\n"+
" <td class=\"value\">"+((badFeedRefetchValue==null)?"(Default feed rescan value)":org.apache.manifoldcf.ui.util.Encoder.bodyEscape(badFeedRefetchValue))+"</td>\n"+
" </tr>\n"+
" \n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
"\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.DechromedContentSource")+"</nobr></td>\n"+
" <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(dechromedMode)+"</td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.ChromedContent")+"</nobr></td>\n"+
" <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(chromedMode)+"</td>\n"+
" </tr>\n"+
"\n"
  );
  out.print(
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
  );

  // Section: access tokens
  seenAny = false;
  i = 0;
  while (i < ds.getChildCount())
  {
    SpecificationNode sn = ds.getChild(i++);
    if (sn.getType().equals(RSSConfig.NODE_ACCESS))
    {
      if (seenAny == false)
      {
        out.print(
" <tr><td class=\"description\"><nobr>"+Messages.getBodyString(locale,"RSSConnector.AccessTokens")+"</nobr></td>\n"+
" <td class=\"value\">\n"
        );
        seenAny = true;
      }
      String token = sn.getAttributeValue(RSSConfig.ATTR_TOKEN);
      out.print(
" "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(token)+"<br/>\n"
      );
    }
  }
  if (seenAny)
  {
    out.print(
" </td>\n"+
" </tr>\n"
    );
  }
  else
  {
    out.print(
" <tr><td class=\"message\" colspan=\"2\"><nobr>" + Messages.getBodyString(locale,"RSSConnector.NoAccessTokensSpecified") + "</nobr></td></tr>\n"
    );
  }
  out.print(
"</table>\n"
  );
}
/** Handle an RSS feed document, using SAX to limit the memory impact.
* The document is read from the cache and pushed through an event-driven parse; the parsing
* contexts installed on the parse state do the actual work as tags are encountered.
* If the document is not in the cache, an error is logged and nothing else happens.
*@param documentIdentifier is the identifier of the feed document to parse.
*@param activities is the activities interface handed to the parsing contexts.
*@param filter is the filter handed to the parsing contexts.
*/
protected void handleRSSFeedSAX(String documentIdentifier, IProcessActivity activities, Filter filter)
  throws ManifoldCFException, ServiceInterruption
{
  // Event-driven parsing keeps memory use under control, which matters when a feed carries
  // dechromed content in addition to links.  All flavors of IO exception are mapped below.
  try
  {
    InputStream feedStream = cache.getData(documentIdentifier);
    if (feedStream != null)
    {
      try
      {
        Parser feedParser = new Parser();
        // The parse state drives the context classes; the outer context recognizes the feed type.
        XMLFuzzyHierarchicalParseState parseState = new XMLFuzzyHierarchicalParseState();
        OuterContextClass outerContext = new OuterContextClass(parseState,documentIdentifier,activities,filter);
        parseState.setContext(outerContext);
        try
        {
          // The fuzzy parser does not report parsing errors back to us.
          feedParser.parseWithCharsetDetection(null,feedStream,parseState);
          outerContext.checkIfValidFeed();
          outerContext.setDefaultRescanTimeIfNeeded();
        }
        finally
        {
          parseState.cleanup();
        }
      }
      finally
      {
        feedStream.close();
      }
    }
    else
    {
      Logging.connectors.error("RSS: Document '"+documentIdentifier+"' should be in cache but isn't");
    }
  }
  catch (java.net.SocketTimeoutException e)
  {
    throw new ManifoldCFException("Socket timeout error: "+e.getMessage(),e);
  }
  catch (ConnectTimeoutException e)
  {
    throw new ManifoldCFException("Socket connect timeout error: "+e.getMessage(),e);
  }
  catch (InterruptedIOException e)
  {
    throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
  }
  catch (IOException e)
  {
    throw new ManifoldCFException("IO error: "+e.getMessage(),e);
  }
}
/** This class handles the outermost XML context for the feed document.
* It recognizes the feed's root element (rss, rdf, feed, urlset or sitemapindex) and installs the
* matching second-level context; when that element ends, it records whether the second-level
* context reported that a rescan time was set.
*/
protected class OuterContextClass extends XMLParsingContext
{
  /** Keep track of the number of valid feed signals we saw */
  protected int outerTagCount = 0;
  /** The document identifier */
  protected String documentIdentifier;
  /** Activities interface */
  protected IProcessActivity activities;
  /** Filter */
  protected Filter filter;
  /** Flag indicating that the rescan time was set for this feed */
  protected boolean rescanTimeSet = false;

  /** Constructor.
  *@param theStream is the parse state this context belongs to.
  *@param documentIdentifier is the identifier of the feed document.
  *@param activities is the activities interface.
  *@param filter is the filter in effect.
  */
  public OuterContextClass(XMLFuzzyHierarchicalParseState theStream, String documentIdentifier, IProcessActivity activities, Filter filter)
  {
    super(theStream);
    this.documentIdentifier = documentIdentifier;
    this.activities = activities;
    this.filter = filter;
  }

  /** Check if feed was valid; logs a debug message when no recognized root tag was seen. */
  public void checkIfValidFeed()
  {
    if (outerTagCount == 0 && Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("RSS: RSS document '"+documentIdentifier+"' does not have rss, feed, or rdf:RDF tag - not valid feed");
  }

  /** Check if the rescan flag was set or not, and if not, make sure it gets set properly */
  public void setDefaultRescanTimeIfNeeded()
    throws ManifoldCFException
  {
    if (rescanTimeSet)
      return;
    // The requeue parameters must be set explicitly, since otherwise the feed reverts to
    // default document rescan or expire behavior.
    final long now = System.currentTimeMillis();
    Long nextScan = filter.getBadFeedRescanTime(now);
    if (nextScan == null)
      nextScan = filter.getDefaultRescanTime(now);
    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"' setting default rescan time to "+nextScan);
    activities.setDocumentScheduleBounds(documentIdentifier,nextScan,nextScan,null,null);
    rescanTimeSet = true;
  }

  /** Handle the tag beginning to set the correct second-level parsing context */
  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    if (localName.equals("rss"))
    {
      // RSS feed detected
      outerTagCount++;
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: Parsed bottom-level XML for RSS document '"+documentIdentifier+"'");
      return new RSSContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
    }
    if (localName.toLowerCase(Locale.ROOT).equals("rdf"))
    {
      // RDF/Atom feed detected
      outerTagCount++;
      return new RDFContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
    }
    if (localName.equals("feed"))
    {
      // Basic feed detected
      outerTagCount++;
      return new FeedContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
    }
    if (localName.equals("urlset") || localName.equals("sitemapindex"))
    {
      // Sitemap detected
      outerTagCount++;
      return new UrlsetContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
    }
    // Anything else gets the default context.
    return super.beginTag(namespace,localName,qName,atts);
  }

  /** Handle the tag ending */
  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    final XMLParsingContext finished = theStream.getContext();
    final String finishedTag = finished.getLocalname();
    // The casts mirror the context types that beginTag() installs for the same tag names.
    if (finishedTag.equals("rss"))
    {
      rescanTimeSet = ((RSSContextClass)finished).process();
      return;
    }
    if (finishedTag.toLowerCase(Locale.ROOT).equals("rdf"))
    {
      rescanTimeSet = ((RDFContextClass)finished).process();
      return;
    }
    if (finishedTag.equals("feed"))
    {
      rescanTimeSet = ((FeedContextClass)finished).process();
      return;
    }
    if (finishedTag.equals("urlset") || finishedTag.equals("sitemapindex"))
    {
      rescanTimeSet = ((UrlsetContextClass)finished).process();
      return;
    }
    super.endTag();
  }
}
/** Parsing context for the contents of an "rss" element.
* It installs a channel context for each "channel" child and remembers what the most recently
* finished channel reported about the rescan time.
*/
protected class RSSContextClass extends XMLParsingContext
{
  /** The document identifier */
  protected String documentIdentifier;
  /** Activities interface */
  protected IProcessActivity activities;
  /** Filter */
  protected Filter filter;
  /** Rescan time set flag */
  protected boolean rescanTimeSet = false;

  /** Constructor.
  * The tag arguments are passed through to the base context; the rest are kept for the channel contexts.
  */
  public RSSContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
  {
    super(theStream,namespace,localName,qName,atts);
    this.documentIdentifier = documentIdentifier;
    this.activities = activities;
    this.filter = filter;
  }

  /** Install a channel context for "channel" tags; every other tag gets the default handling. */
  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    if (!localName.equals("channel"))
    {
      // Skip everything that is not a channel.
      return super.beginTag(namespace,localName,qName,atts);
    }
    // Channel detected
    return new RSSChannelContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
  }

  /** When a channel ends, process its global channel information and record the result. */
  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    final XMLParsingContext finished = theStream.getContext();
    final String finishedTag = finished.getLocalname();
    if (!finishedTag.equals("channel"))
    {
      super.endTag();
      return;
    }
    rescanTimeSet = ((RSSChannelContextClass)finished).process();
  }

  /** Process this data.
  *@return the rescan-time-set flag recorded from the most recently finished channel.
  */
  protected boolean process()
    throws ManifoldCFException
  {
    return rescanTimeSet;
  }
}
/** Parsing context for the contents of an RSS "channel" element.
* It captures the channel's "ttl" value, processes each "item" as it completes, and, when the
* channel itself completes, sets the feed document's schedule bounds.
*/
protected class RSSChannelContextClass extends XMLParsingContext
{
  /** The document identifier */
  protected String documentIdentifier;
  /** Activities interface */
  protected IProcessActivity activities;
  /** Filter */
  protected Filter filter;
  /** TTL value is set on a per-channel basis */
  protected String ttlValue = null;

  /** Constructor.
  * The tag arguments are passed through to the base context; the rest are kept for item processing
  * and for setting the rescan time.
  */
  public RSSChannelContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
  {
    super(theStream,namespace,localName,qName,atts);
    this.documentIdentifier = documentIdentifier;
    this.activities = activities;
    this.filter = filter;
  }

  /** Install the right child context for "ttl" and "item" tags; everything else gets the default handling. */
  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    // The tags we care about are "ttl" and "item", nothing else.
    if (localName.equals("ttl"))
    {
      // TTL value seen.  Prepare to record it, as a string.
      return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
    }
    else if (localName.equals("item"))
    {
      // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
      return new RSSItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
    }
    // Skip everything else.
    return super.beginTag(namespace,localName,qName,atts);
  }

  /** Record the ttl value or process a completed item, depending on which child tag just ended. */
  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    XMLParsingContext theContext = theStream.getContext();
    String theTag = theContext.getLocalname();
    if (theTag.equals("ttl"))
      // The current context is the TTL string context installed by beginTag(); record its data value.
      ttlValue = ((XMLStringParsingContext)theContext).getValue();
    else if (theTag.equals("item"))
    {
      // It's an item.
      RSSItemContextClass itemContext = (RSSItemContextClass)theContext;
      // Presumably, since we are done parsing, we've recorded all the information we need in the context object, including:
      // (1) File name (if any), containing dechromed content
      // (2) Link name(s)
      // (3) Pubdate
      // (4) Title
      // The job now is to pull this info out and call the activities interface appropriately.
      // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context.  This should clean up
      // all dangling files etc. that need to be removed.
      // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup()
      // method will be called regardless.
      itemContext.process(documentIdentifier,activities,filter);
    }
    else
      super.endTag();
  }

  /** Process this data, return true if rescan time was set.
  * The rescan time starts as the filter's default, is replaced by "now + ttl minutes" when a
  * parseable ttl was found, and is then raised to the filter's minimum rescan time if it is earlier.
  *@return true always, since the schedule bounds are set unconditionally.
  */
  protected boolean process()
    throws ManifoldCFException
  {
    // Deal with the ttlvalue, if it was found
    // Use the ttl value as a signal for when we ought to look at this feed again.  If not present, use the default.
    long currentTime = System.currentTimeMillis();
    Long rescanTime = filter.getDefaultRescanTime(currentTime);
    if (ttlValue != null)
    {
      try
      {
        // Element text may carry surrounding whitespace or line breaks; strip it so that a
        // value such as " 60 " is honored rather than rejected as illegal.
        int minutes = Integer.parseInt(ttlValue.trim());
        long nextTime = currentTime + minutes * 60000L;
        rescanTime = Long.valueOf(nextTime);
        // Set the upper bound time; we want to scan the feeds aggressively.
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly");
      }
      catch (NumberFormatException e)
      {
        // Keep the default rescan time when the ttl is not a number
        Logging.connectors.warn("RSS: RSS document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'");
      }
    }

    if (rescanTime != null)
    {
      // Never rescan sooner than the configured minimum
      Long minimumTime = filter.getMinimumRescanTime(currentTime);
      if (minimumTime != null)
      {
        if (rescanTime.longValue() < minimumTime.longValue())
          rescanTime = minimumTime;
      }
    }

    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"' setting rescan time to "+((rescanTime==null)?"null":rescanTime.toString()));
    activities.setDocumentScheduleBounds(documentIdentifier,rescanTime,rescanTime,null,null);
    return true;
  }
}
/** Parsing context for one RSS "item" element.
* While the item is parsed, the text of the child elements of interest is captured into fields;
* process() then turns every link of the item into a document reference that carries the
* captured values down as named data.
*/
protected class RSSItemContextClass extends XMLParsingContext
{
  /** Dechromed content mode supplied to the constructor; compared against the DECHROMED_* constants. */
  protected int dechromedContentMode;
  /** Text of the "guid" element; used as the link when no usable "link" element was seen. */
  protected String guidField = null;
  /** Text of the "link" element; may hold several URLs separated by ", ". */
  protected String linkField = null;
  /** Raw text of the "pubdate" element. */
  protected String pubDateField = null;
  /** Text of the "title" element. */
  protected String titleField = null;
  /** Text of the "description" element, when it is captured as a string rather than as primary content. */
  protected String descriptionField = null;
  /** Text of the "author" element (carried down as "authoremail"). */
  protected String authorEmailField = null;
  /** Text of the "creator" element (carried down as "authorname"). */
  protected String authorNameField = null;
  /** Text of every "category" element seen, in the order encountered. */
  protected List<String> categoryField = new ArrayList<String>();
  /** Temporary file holding the primary (dechromed) content, or null when there is none. */
  protected File contentsFile = null;

  public RSSItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
  {
    super(theStream,namespace,localName,qName,atts);
    this.dechromedContentMode = dechromedContentMode;
  }

  /** Choose the parsing context for a child element of the item.
  * Short text fields are captured as strings.  "description" and "content" may be large, so whichever
  * one is selected as primary content by the dechromed mode is spooled to a temporary file; "description"
  * is still captured as a string when it is not the primary content.
  */
  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    if (isShortTextTag(localName))
      return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);

    switch (dechromedContentMode)
    {
    case DECHROMED_NONE:
      if (localName.equals("description"))
        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
      break;
    case DECHROMED_DESCRIPTION:
      if (localName.equals("description"))
        return newTempFileContext(namespace,localName,qName,atts);
      break;
    case DECHROMED_CONTENT:
      if (localName.equals("content"))
        return newTempFileContext(namespace,localName,qName,atts);
      if (localName.equals("description"))
        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
      break;
    default:
      break;
    }
    // Skip everything else.
    return super.beginTag(namespace,localName,qName,atts);
  }

  /** Convert the individual sub-fields of the item context into their final forms */
  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    XMLParsingContext theContext = theStream.getContext();
    String theTag = theContext.getLocalname();
    if (isShortTextTag(theTag))
    {
      // All of these were opened as string contexts in beginTag().
      String value = ((XMLStringParsingContext)theContext).getValue();
      if (theTag.equals("link"))
        linkField = value;
      else if (theTag.equals("guid"))
        guidField = value;
      else if (theTag.equals("pubdate"))
        pubDateField = value;
      else if (theTag.equals("title"))
        titleField = value;
      else if (theTag.equals("category"))
        categoryField.add(value);
      else if (theTag.equals("author"))
        authorEmailField = value;
      else
        authorNameField = value;   // "creator"
      return;
    }

    // (a) mode NONE: description goes to the description field;
    // (b) mode "description": description becomes the primary content file;
    // (c) mode "content": content becomes the primary content file, description goes to the description field.
    switch (dechromedContentMode)
    {
    case DECHROMED_NONE:
      if (theTag.equals("description"))
        descriptionField = ((XMLStringParsingContext)theContext).getValue();
      break;
    case DECHROMED_DESCRIPTION:
      if (theTag.equals("description"))
      {
        // Drop any content file already held so it is not leaked, then take the new one.
        tagCleanup();
        contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
        return;
      }
      break;
    case DECHROMED_CONTENT:
      if (theTag.equals("content"))
      {
        tagCleanup();
        contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
        return;
      }
      if (theTag.equals("description"))
        descriptionField = ((XMLStringParsingContext)theContext).getValue();
      break;
    default:
      break;
    }
    super.endTag();
  }

  protected void tagCleanup()
    throws ManifoldCFException
  {
    // Delete the contents file if it is there.
    if (contentsFile != null)
    {
      contentsFile.delete();
      contentsFile = null;
    }
  }

  /** Process the data accumulated for this item.
  * Each URL found in the link (or, failing that, the guid) is converted to a document identifier,
  * checked against the filter, and added as a document reference with the item's values as carry-down data.
  *@param documentIdentifier is the identifier of the feed document; it is passed to addDocumentReference
  *  and carried down under the name "source".
  *@param activities is the interface used to add the document references.
  *@param filter supplies the canonicalization policies, URL legality check and chromed content mode.
  */
  public void process(String documentIdentifier, IProcessActivity activities, Filter filter)
    throws ManifoldCFException
  {
    // Fall back to the guid when the item has no usable link.
    if (linkField == null || linkField.length() == 0)
      linkField = guidField;
    if (linkField == null || linkField.length() == 0)
      return;

    Long origDate = parseOriginationTime();
    for (String link : linkField.split(", "))
    {
      String rawURL = link.trim();
      // Process the link
      String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
      if (newIdentifier == null)
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
        continue;
      }
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: In RSS document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
          ((origDate==null)?"null":origDate.toString()));
      if (!filter.isLegalURL(newIdentifier))
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
        continue;
      }

      if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY)
        addPlainReference(newIdentifier,documentIdentifier,activities,origDate);
      else
        addReferenceWithData(newIdentifier,documentIdentifier,activities,origDate);
    }
  }

  /** True for the child elements whose text is always captured as a simple string. */
  private boolean isShortTextTag(String localName)
  {
    return localName.equals("link") || localName.equals("guid") || localName.equals("pubdate") ||
      localName.equals("title") || localName.equals("category") || localName.equals("author") ||
      localName.equals("creator");
  }

  /** Create a temporary file and a file-backed parsing context that writes the element into it. */
  private XMLParsingContext newTempFileContext(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    try
    {
      File tempFile = File.createTempFile("_rssdata_","tmp");
      return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
    }
    catch (java.net.SocketTimeoutException e)
    {
      // Caught ahead of InterruptedIOException (its superclass) so it is not reported as an interruption.
      throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e);
    }
    catch (InterruptedIOException e)
    {
      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
    }
    catch (IOException e)
    {
      throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e);
    }
  }

  /** Parse the pubdate text, trying RFC822, then the "China" format, then ISO8601.
  *@return the time in milliseconds, or null if there is no date or no format matched.
  */
  private Long parseOriginationTime()
    throws ManifoldCFException
  {
    if (pubDateField == null || pubDateField.length() == 0)
      return null;
    Date parsed = DateParser.parseRFC822Date(pubDateField);
    // Special for China Daily News
    if (parsed == null)
      parsed = DateParser.parseChinaDate(pubDateField);
    // Special for LL
    if (parsed == null)
      parsed = DateParser.parseISO8601Date(pubDateField);
    return (parsed == null) ? null : Long.valueOf(parsed.getTime());
  }

  /** Fill slots 0..5 (pubdate, title, source, authoremail, authorname, category), which are laid out
  * identically in both carry-down arrays.  Only String[] rows are stored, so this is also safe when the
  * array passed in is really a String[][].
  */
  private void fillCommonCarrydown(Object[][] dataValues, String documentIdentifier, Long origDate)
  {
    if (origDate != null)
      dataValues[0] = new String[]{origDate.toString()};
    if (titleField != null)
      dataValues[1] = new String[]{titleField};
    dataValues[2] = new String[]{documentIdentifier};
    if (authorEmailField != null)
      dataValues[3] = new String[]{authorEmailField};
    if (authorNameField != null)
      dataValues[4] = new String[]{authorNameField};
    dataValues[5] = categoryField.toArray(new String[categoryField.size()]);
  }

  /** Add a reference that carries metadata (including the description) but no content data. */
  private void addPlainReference(String newIdentifier, String documentIdentifier, IProcessActivity activities, Long origDate)
    throws ManifoldCFException
  {
    // It's a reference! Add it.
    String[] dataNames = new String[]{"pubdate","title","source","authoremail","authorname","category","description"};
    String[][] dataValues = new String[dataNames.length][];
    fillCommonCarrydown(dataValues,documentIdentifier,origDate);
    if (descriptionField != null)
      dataValues[6] = new String[]{descriptionField};
    activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
  }

  /** Add a reference that also carries a "data" value: the dechromed content file if one was captured,
  * otherwise an empty character input.
  */
  private void addReferenceWithData(String newIdentifier, String documentIdentifier, IProcessActivity activities, Long origDate)
    throws ManifoldCFException
  {
    // The issue here is that if a document is ingested without a jobqueue entry, the document will not
    // be cleaned up if the job is deleted; nor is there any expiration possibility.  So a jobqueue entry
    // must get created somehow, which is why the document is not simply ingested right here; the
    // dechromed data is passed along as carry-down information instead.
    String[] dataNames = new String[]{"pubdate","title","source","authoremail","authorname","category","data","description"};
    Object[][] dataValues = new Object[dataNames.length][];
    fillCommonCarrydown(dataValues,documentIdentifier,origDate);
    if (descriptionField != null)
      dataValues[7] = new String[]{descriptionField};

    CharacterInput ci;
    if (contentsFile == null)
      ci = new NullCharacterInput();
    else
    {
      ci = new TempFileCharacterInput(contentsFile);
      // The character input now holds the file; clear our reference so tagCleanup() does not delete it.
      contentsFile = null;
    }
    try
    {
      dataValues[6] = new Object[]{ci};
      // Add document reference, including the data to pass down, and the dechromed content too
      activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
    }
    finally
    {
      ci.discard();
    }
  }
}
/** Parsing context used for RDF documents (per its name and log messages).
* It records the "ttl" value, hands each "item" element to an RDFItemContextClass, and in process()
* sets the document's schedule bounds.
*/
protected class RDFContextClass extends XMLParsingContext
{
  /** The document identifier */
  protected String documentIdentifier;
  /** Activities interface */
  protected IProcessActivity activities;
  /** Filter */
  protected Filter filter;
  /** ttl value */
  protected String ttlValue = null;

  public RDFContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
  {
    super(theStream,namespace,localName,qName,atts);
    this.filter = filter;
    this.activities = activities;
    this.documentIdentifier = documentIdentifier;
  }

  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    // Only "ttl" and "item" get a dedicated context; the rest goes to the superclass.
    if (localName.equals("ttl"))
    {
      // Capture the ttl text as a string; endTag() reads it back.
      return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
    }
    if (localName.equals("item"))
    {
      // The item's attributes are not needed; a new item context collects its child fields.
      final int dechromedMode = filter.getDechromedContentMode();
      return new RDFItemContextClass(theStream,namespace,localName,qName,atts,dechromedMode);
    }
    return super.beginTag(namespace,localName,qName,atts);
  }

  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    final XMLParsingContext closing = theStream.getContext();
    final String closingName = closing.getLocalname();

    if (closingName.equals("ttl"))
    {
      // Keep the raw ttl text; process() interprets it later.
      ttlValue = ((XMLStringParsingContext)closing).getValue();
      return;
    }

    if (closingName.equals("item"))
    {
      // The item context has recorded whatever it found (content file, links, date, title);
      // let it emit the document references now.
      // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context, which
      // should remove any dangling files.  If an exception or error is thrown during the parse, this endTag()
      // method will NOT be called, but the tagCleanup() method will be called regardless.
      final RDFItemContextClass item = (RDFItemContextClass)closing;
      item.process(documentIdentifier,activities,filter);
      return;
    }

    super.endTag();
  }

  /** Work out when this document should be scanned again and record it as the schedule bounds.
  *@return true; the schedule bounds are always set before returning.
  */
  protected boolean process()
    throws ManifoldCFException
  {
    final long now = System.currentTimeMillis();
    // Start from the filter's default; a parseable ttl (in minutes) replaces it.
    Long nextScan = filter.getDefaultRescanTime(now);
    if (ttlValue != null)
    {
      try
      {
        nextScan = Long.valueOf(now + Integer.parseInt(ttlValue) * 60000L);
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: In RDF document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly");
      }
      catch (NumberFormatException e)
      {
        // Unparseable ttl: keep the default and just report it.
        Logging.connectors.warn("RSS: RDF document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'");
      }
    }

    // The minimum is only consulted when there is a rescan time to clamp.
    final Long floor = (nextScan == null) ? null : filter.getMinimumRescanTime(now);
    if (floor != null && nextScan.longValue() < floor.longValue())
      nextScan = floor;

    if (Logging.connectors.isDebugEnabled())
    {
      final String shown = (nextScan == null) ? "null" : nextScan.toString();
      Logging.connectors.debug("RSS: In RDF document '"+documentIdentifier+"' setting rescan time to "+shown);
    }
    activities.setDocumentScheduleBounds(documentIdentifier,nextScan,nextScan,null,null);
    return true;
  }
}
/** Parsing context for one "item" element of an RDF document.
* Captures the link, date, title, creator and description of the item (plus, depending on the
* dechromed content mode, a temporary file holding the primary content); process() then adds a
* document reference for each link, carrying the captured values down as named data.
*/
protected class RDFItemContextClass extends XMLParsingContext
{
  /** Dechromed content mode supplied to the constructor; compared against the DECHROMED_* constants. */
  protected int dechromedContentMode;
  /** Text of the "link" element; may hold several URLs separated by ", ". */
  protected String linkField = null;
  /** Raw text of the "date" element (e.g. "dc:date"). */
  protected String pubDateField = null;
  /** Text of the "title" element. */
  protected String titleField = null;
  /** Text of the "creator" element (e.g. "dc:creator"). */
  protected String authorNameField = null;
  /** Text of the "description" element, when it is captured as a string rather than as primary content. */
  protected String descriptionField = null;
  /** Temporary file holding the primary (dechromed) content, or null when there is none. */
  protected File contentsFile = null;

  public RDFItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
  {
    super(theStream,namespace,localName,qName,atts);
    this.dechromedContentMode = dechromedContentMode;
  }

  /** Choose the parsing context for a child element of the item.
  * "link", "date", "title" and "creator" are captured as strings.  Whichever of "description" and
  * "content" the dechromed mode selects as primary content is spooled to a temporary file;
  * "description" is still captured as a string when it is not the primary content.
  */
  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    if (localName.equals("link") || localName.equals("date") || localName.equals("title") || localName.equals("creator"))
      return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);

    switch (dechromedContentMode)
    {
    case DECHROMED_NONE:
      if (localName.equals("description"))
        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
      break;
    case DECHROMED_DESCRIPTION:
      if (localName.equals("description"))
        return newTempFileContext(namespace,localName,qName,atts);
      break;
    case DECHROMED_CONTENT:
      if (localName.equals("content"))
        return newTempFileContext(namespace,localName,qName,atts);
      if (localName.equals("description"))
        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
      break;
    default:
      break;
    }
    // Skip everything else.
    return super.beginTag(namespace,localName,qName,atts);
  }

  /** Convert the individual sub-fields of the item context into their final forms */
  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    XMLParsingContext theContext = theStream.getContext();
    String theTag = theContext.getLocalname();
    if (theTag.equals("link"))
    {
      linkField = ((XMLStringParsingContext)theContext).getValue();
      return;
    }
    if (theTag.equals("date"))
    {
      pubDateField = ((XMLStringParsingContext)theContext).getValue();
      return;
    }
    if (theTag.equals("title"))
    {
      titleField = ((XMLStringParsingContext)theContext).getValue();
      return;
    }
    if (theTag.equals("creator"))
    {
      authorNameField = ((XMLStringParsingContext)theContext).getValue();
      return;
    }

    switch (dechromedContentMode)
    {
    case DECHROMED_NONE:
      if (theTag.equals("description"))
        descriptionField = ((XMLStringParsingContext)theContext).getValue();
      break;
    case DECHROMED_DESCRIPTION:
      if (theTag.equals("description"))
      {
        // Drop any content file already held so it is not leaked, then take the new one.
        tagCleanup();
        contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
        return;
      }
      break;
    case DECHROMED_CONTENT:
      // This must test the same local name that beginTag() used when it opened the file context
      // ("content"); comparing against a prefixed name here would never match and the content
      // file would never be picked up.
      if (theTag.equals("content"))
      {
        tagCleanup();
        contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
        return;
      }
      if (theTag.equals("description"))
        descriptionField = ((XMLStringParsingContext)theContext).getValue();
      break;
    default:
      break;
    }
    super.endTag();
  }

  protected void tagCleanup()
    throws ManifoldCFException
  {
    // Delete the contents file if it is there.
    if (contentsFile != null)
    {
      contentsFile.delete();
      contentsFile = null;
    }
  }

  /** Process the data accumulated for this item.
  * Each URL found in the link is converted to a document identifier, checked against the filter,
  * and added as a document reference with the item's values as carry-down data.
  *@param documentIdentifier is the identifier of the RDF document; it is passed to addDocumentReference
  *  and carried down under the name "source".
  *@param activities is the interface used to add the document references.
  *@param filter supplies the canonicalization policies, URL legality check and chromed content mode.
  */
  public void process(String documentIdentifier, IProcessActivity activities, Filter filter)
    throws ManifoldCFException
  {
    if (linkField == null || linkField.length() == 0)
      return;

    Long origDate = null;
    if (pubDateField != null && pubDateField.length() > 0)
    {
      Date parsed = DateParser.parseISO8601Date(pubDateField);
      if (parsed != null)
        origDate = Long.valueOf(parsed.getTime());
    }

    for (String link : linkField.split(", "))
    {
      String rawURL = link.trim();
      // Process the link
      String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
      if (newIdentifier == null)
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: In RDF document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
        continue;
      }
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: In RDF document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
          ((origDate==null)?"null":origDate.toString()));
      if (!filter.isLegalURL(newIdentifier))
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
        continue;
      }

      if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY)
        addPlainReference(newIdentifier,documentIdentifier,activities,origDate);
      else
        addReferenceWithData(newIdentifier,documentIdentifier,activities,origDate);
    }
  }

  /** Create a temporary file and a file-backed parsing context that writes the element into it. */
  private XMLParsingContext newTempFileContext(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    try
    {
      File tempFile = File.createTempFile("_rssdata_","tmp");
      return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
    }
    catch (java.net.SocketTimeoutException e)
    {
      // Caught ahead of InterruptedIOException (its superclass) so it is not reported as an interruption.
      throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e);
    }
    catch (InterruptedIOException e)
    {
      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
    }
    catch (IOException e)
    {
      throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e);
    }
  }

  /** Fill slots 0..3 (pubdate, title, source, authorname), which are laid out identically in both
  * carry-down arrays.  Only String[] rows are stored, so this is also safe when the array passed in
  * is really a String[][].
  */
  private void fillCommonCarrydown(Object[][] dataValues, String documentIdentifier, Long origDate)
  {
    if (origDate != null)
      dataValues[0] = new String[]{origDate.toString()};
    if (titleField != null)
      dataValues[1] = new String[]{titleField};
    dataValues[2] = new String[]{documentIdentifier};
    if (authorNameField != null)
      dataValues[3] = new String[]{authorNameField};
  }

  /** Add a reference that carries metadata (including the description) but no content data. */
  private void addPlainReference(String newIdentifier, String documentIdentifier, IProcessActivity activities, Long origDate)
    throws ManifoldCFException
  {
    // It's a reference! Add it.
    String[] dataNames = new String[]{"pubdate","title","source","authorname","description"};
    String[][] dataValues = new String[dataNames.length][];
    fillCommonCarrydown(dataValues,documentIdentifier,origDate);
    if (descriptionField != null)
      dataValues[4] = new String[]{descriptionField};
    // Add document reference, including the data to pass down
    activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
  }

  /** Add a reference that also carries a "data" value: the dechromed content file if one was captured,
  * otherwise an empty character input.
  */
  private void addReferenceWithData(String newIdentifier, String documentIdentifier, IProcessActivity activities, Long origDate)
    throws ManifoldCFException
  {
    // The issue here is that if a document is ingested without a jobqueue entry, the document will not
    // be cleaned up if the job is deleted; nor is there any expiration possibility.  So a jobqueue entry
    // must get created somehow, which is why the document is not simply ingested right here.
    String[] dataNames = new String[]{"pubdate","title","source","authorname","data","description"};
    Object[][] dataValues = new Object[dataNames.length][];
    fillCommonCarrydown(dataValues,documentIdentifier,origDate);
    if (descriptionField != null)
      dataValues[5] = new String[]{descriptionField};

    CharacterInput ci;
    if (contentsFile == null)
      ci = new NullCharacterInput();
    else
    {
      ci = new TempFileCharacterInput(contentsFile);
      // The character input now holds the file; clear our reference so tagCleanup() does not delete it.
      contentsFile = null;
    }
    try
    {
      dataValues[4] = new Object[]{ci};
      // Add document reference, including the data to pass down, and the dechromed content too
      activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
    }
    finally
    {
      ci.discard();
    }
  }
}
/** Parsing context used for Atom documents (per its log messages).
* It records the "ttl" value, hands each "entry" element to a FeedItemContextClass, and in process()
* sets the document's schedule bounds.
*/
protected class FeedContextClass extends XMLParsingContext
{
  /** The document identifier */
  protected String documentIdentifier;
  /** Activities interface */
  protected IProcessActivity activities;
  /** Filter */
  protected Filter filter;
  /** ttl value */
  protected String ttlValue = null;

  public FeedContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
  {
    super(theStream,namespace,localName,qName,atts);
    this.filter = filter;
    this.activities = activities;
    this.documentIdentifier = documentIdentifier;
  }

  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    // Only "ttl" and "entry" get a dedicated context; the rest goes to the superclass.
    if (localName.equals("ttl"))
    {
      // Capture the ttl text as a string; endTag() reads it back.
      return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
    }
    if (localName.equals("entry"))
    {
      // The entry's attributes are not needed; a new item context collects its child fields.
      final int dechromedMode = filter.getDechromedContentMode();
      return new FeedItemContextClass(theStream,namespace,localName,qName,atts,dechromedMode);
    }
    return super.beginTag(namespace,localName,qName,atts);
  }

  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    final XMLParsingContext closing = theStream.getContext();
    final String closingName = closing.getLocalname();

    if (closingName.equals("ttl"))
    {
      // Keep the raw ttl text; process() interprets it later.
      ttlValue = ((XMLStringParsingContext)closing).getValue();
      return;
    }

    if (closingName.equals("entry"))
    {
      // The entry context has recorded whatever it found (content file, links, date, title);
      // let it emit the document references now.
      // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context, which
      // should remove any dangling files.  If an exception or error is thrown during the parse, this endTag()
      // method will NOT be called, but the tagCleanup() method will be called regardless.
      final FeedItemContextClass entry = (FeedItemContextClass)closing;
      entry.process(documentIdentifier,activities,filter);
      return;
    }

    super.endTag();
  }

  /** Work out when this document should be scanned again and record it as the schedule bounds.
  *@return true; the schedule bounds are always set before returning.
  */
  protected boolean process()
    throws ManifoldCFException
  {
    final long now = System.currentTimeMillis();
    // Start from the filter's default; a parseable ttl (in minutes) replaces it.
    Long nextScan = filter.getDefaultRescanTime(now);
    if (ttlValue != null)
    {
      try
      {
        nextScan = Long.valueOf(now + Integer.parseInt(ttlValue) * 60000L);
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly");
      }
      catch (NumberFormatException e)
      {
        // Unparseable ttl: keep the default and just report it.
        Logging.connectors.warn("RSS: Atom document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'");
      }
    }

    // The minimum is only consulted when there is a rescan time to clamp.
    final Long floor = (nextScan == null) ? null : filter.getMinimumRescanTime(now);
    if (floor != null && nextScan.longValue() < floor.longValue())
      nextScan = floor;

    if (Logging.connectors.isDebugEnabled())
    {
      final String shown = (nextScan == null) ? "null" : nextScan.toString();
      Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"' setting rescan time to "+shown);
    }
    activities.setDocumentScheduleBounds(documentIdentifier,nextScan,nextScan,null,null);
    return true;
  }
}
protected class FeedItemContextClass extends XMLParsingContext
{
protected int dechromedContentMode;
protected List<String> linkField = new ArrayList<String>();
protected String pubDateField = null;
protected String titleField = null;
protected String authorNameField = null;
protected String authorEmailField = null;
protected ArrayList categoryField = new ArrayList();
protected File contentsFile = null;
protected String descriptionField = null;
/** Create the context for one feed entry.
* The first five arguments are passed straight to the superclass constructor.
*@param dechromedContentMode is stored and later compared against the DECHROMED_* constants
*  by beginTag() and endTag().
*/
public FeedItemContextClass(XMLFuzzyHierarchicalParseState theStream,
  String namespace, String localName, String qName, Map<String,String> atts,
  int dechromedContentMode)
{
  super(theStream,namespace,localName,qName,atts);
  this.dechromedContentMode = dechromedContentMode;
}
@Override
protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
  throws ManifoldCFException
{
  if (localName.equals("link"))
  {
    // "link" tag: the URL is taken from the href attribute, not from the element text.
    String ref = atts.get("href");
    if (ref != null && ref.length() > 0)
      linkField.add(ref);
    return super.beginTag(namespace,localName,qName,atts);
  }
  if (localName.equals("published") || localName.equals("updated") || localName.equals("title"))
  {
    // "published" or "updated" tag, or "title" tag: captured as plain strings.
    return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
  }
  if (localName.equals("author"))
  {
    // The author element gets its own context; endTag() reads name and email from it.
    return new FeedAuthorContextClass(theStream,namespace,localName,qName,atts);
  }
  if (localName.equals("category"))
  {
    // "category" tag: the value is taken from the term attribute.
    String category = atts.get("term");
    if (category != null && category.length() > 0)
      categoryField.add(category);
    return super.beginTag(namespace,localName,qName,atts);
  }

  // "subtitle" and "content": whichever one the dechromed mode selects as primary content is spooled
  // to a temporary file; "subtitle" is otherwise captured as a string.
  switch (dechromedContentMode)
  {
  case DECHROMED_NONE:
    if (localName.equals("subtitle"))
      return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
    break;
  case DECHROMED_DESCRIPTION:
    if (localName.equals("subtitle"))
      return openTempFileContext(namespace,localName,qName,atts);
    break;
  case DECHROMED_CONTENT:
    if (localName.equals("content"))
      return openTempFileContext(namespace,localName,qName,atts);
    if (localName.equals("subtitle"))
      return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
    break;
  default:
    break;
  }
  // Skip everything else.
  return super.beginTag(namespace,localName,qName,atts);
}

/** Create a temporary file and a file-backed parsing context that writes the element into it. */
private XMLParsingContext openTempFileContext(String namespace, String localName, String qName, Map<String,String> atts)
  throws ManifoldCFException
{
  try
  {
    File tempFile = File.createTempFile("_rssdata_","tmp");
    return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
  }
  catch (java.net.SocketTimeoutException e)
  {
    // Caught ahead of InterruptedIOException (its superclass) so it is not reported as an interruption.
    throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e);
  }
  catch (InterruptedIOException e)
  {
    throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
  }
  catch (IOException e)
  {
    throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e);
  }
}
/** Convert the individual sub-fields of the item context into their final forms.
* "published"/"updated" set the publication date, "title" sets the title, and "author" supplies
* the author name and email.  "subtitle" and "content" are handled according to the dechromed
* content mode: the one selected as primary content becomes the contents file, and a "subtitle"
* that was captured as a string is stored in the description field.
*/
@Override
protected void endTag()
  throws ManifoldCFException
{
  XMLParsingContext theContext = theStream.getContext();
  String theTag = theContext.getLocalname();
  if (theTag.equals("published") || theTag.equals("updated"))
  {
    // Whichever of the two closes last wins.
    pubDateField = ((XMLStringParsingContext)theContext).getValue();
  }
  else if (theTag.equals("title"))
  {
    titleField = ((XMLStringParsingContext)theContext).getValue();
  }
  else if (theTag.equals("author"))
  {
    FeedAuthorContextClass authorContext = (FeedAuthorContextClass)theContext;
    authorEmailField = authorContext.getAuthorEmail();
    authorNameField = authorContext.getAuthorName();
  }
  else
  {
    switch (dechromedContentMode)
    {
    case DECHROMED_NONE:
      if (theTag.equals("subtitle"))
      {
        // The subtitle is the entry's description; it must not overwrite the title.
        descriptionField = ((XMLStringParsingContext)theContext).getValue();
      }
      break;
    case DECHROMED_DESCRIPTION:
      if (theTag.equals("subtitle"))
      {
        // Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
        tagCleanup();
        contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
        return;
      }
      break;
    case DECHROMED_CONTENT:
      if (theTag.equals("content"))
      {
        // Retrieve content file
        tagCleanup();
        contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
        return;
      }
      else if (theTag.equals("subtitle"))
      {
        // Content is the primary data in this mode, so the subtitle is kept as the description.
        descriptionField = ((XMLStringParsingContext)theContext).getValue();
      }
      break;
    default:
      break;
    }
    super.endTag();
  }
}
/** Delete the content file currently held by this context, if any, and forget it. */
protected void tagCleanup()
  throws ManifoldCFException
{
  if (contentsFile == null)
    return;
  contentsFile.delete();
  contentsFile = null;
}
/** Turn the data gathered for this feed entry into document references.
* Every link recorded for the entry is split on ", ", canonicalized, checked against the filter,
* and then added through the activities interface together with carry-down data
* (publication date, title, source feed, categories, description and, where applicable, the
* dechromed content).
*@param documentIdentifier is the identifier of the feed document being parsed.
*@param activities is the interface used to add the discovered references.
*@param filter supplies the canonicalization policies, legality checks and chromed content mode.
*/
public void process(String documentIdentifier, IProcessActivity activities, Filter filter)
  throws ManifoldCFException
{
  if (linkField.size() == 0)
    return;

  // Origination time in milliseconds, or null when no usable date was found.
  Long origDate = null;
  if (pubDateField != null && pubDateField.length() > 0)
  {
    final Date parsedDate = DateParser.parseISO8601Date(pubDateField);
    if (parsedDate != null)
      origDate = Long.valueOf(parsedDate.getTime());
  }

  for (String linkValue : linkField)
  {
    for (String candidate : linkValue.split(", "))
    {
      final String rawURL = candidate.trim();
      // Process the link
      final String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
      if (newIdentifier == null)
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
        continue;
      }

      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
          ((origDate==null)?"null":origDate.toString()));

      if (!filter.isLegalURL(newIdentifier))
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
        continue;
      }

      // Fresh copy of the category values for this reference.
      final String[] categories = new String[categoryField.size()];
      for (int c = 0; c < categories.length; c++)
      {
        categories[c] = (String)categoryField.get(c);
      }

      if (contentsFile == null && filter.getChromedContentMode() != CHROMED_METADATA_ONLY)
      {
        // It's a plain reference: no dechromed content to pass down.
        final String[] dataNames = new String[]{"pubdate","title","source","category","description"};
        final String[][] dataValues = new String[dataNames.length][];
        if (origDate != null)
          dataValues[0] = new String[]{origDate.toString()};
        if (titleField != null)
          dataValues[1] = new String[]{titleField};
        dataValues[2] = new String[]{documentIdentifier};
        dataValues[3] = categories;
        if (descriptionField != null)
          dataValues[4] = new String[]{descriptionField};
        // Add document reference, including the data to pass down
        activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
      }
      else
      {
        // A document ingested without a jobqueue entry would never be cleaned up when the job is deleted,
        // and could never expire.  So instead of ingesting right here, the content is passed down as
        // carry-down data so that a jobqueue entry is created for it.
        final String[] dataNames = new String[]{"pubdate","title","source","category","data","description"};
        final Object[][] dataValues = new Object[dataNames.length][];
        if (origDate != null)
          dataValues[0] = new String[]{origDate.toString()};
        if (titleField != null)
          dataValues[1] = new String[]{titleField};
        dataValues[2] = new String[]{documentIdentifier};
        dataValues[3] = categories;
        if (descriptionField != null)
          dataValues[5] = new String[]{descriptionField};

        final CharacterInput ci;
        if (contentsFile == null)
        {
          ci = new NullCharacterInput();
        }
        else
        {
          ci = new TempFileCharacterInput(contentsFile);
          // The file now belongs to the character input; stop tracking it here.
          contentsFile = null;
        }
        try
        {
          dataValues[4] = new Object[]{ci};
          // Add document reference, including the data to pass down, and the dechromed content too
          activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
        }
        finally
        {
          ci.discard();
        }
      }
    }
  }
}
}
/** Parsing context for a feed "author" element.
* Captures the text of its "name" and "email" children; all other children are left to the superclass.
*/
protected class FeedAuthorContextClass extends XMLParsingContext
{
  /** Text of the "name" child, or null if none was seen */
  protected String authorNameField = null;
  /** Text of the "email" child, or null if none was seen */
  protected String authorEmailField = null;

  public FeedAuthorContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
  {
    super(theStream,namespace,localName,qName,atts);
  }

  /** Start a string-collecting context for "name" and "email"; skip everything else. */
  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    final boolean wanted = localName.equals("name") || localName.equals("email");
    if (!wanted)
    {
      // Skip everything else.
      return super.beginTag(namespace,localName,qName,atts);
    }
    return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
  }

  /** Store the collected text of a closing "name" or "email" child. */
  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    final XMLParsingContext closing = theStream.getContext();
    final String tagName = closing.getLocalname();
    if (tagName.equals("name"))
    {
      authorNameField = ((XMLStringParsingContext)closing).getValue();
      return;
    }
    if (tagName.equals("email"))
    {
      authorEmailField = ((XMLStringParsingContext)closing).getValue();
      return;
    }
    super.endTag();
  }

  /** @return the author name, or null if no "name" tag was seen. */
  public String getAuthorName()
  {
    return authorNameField;
  }

  /** @return the author email, or null if no "email" tag was seen. */
  public String getAuthorEmail()
  {
    return authorEmailField;
  }
}
/** Parsing context that handles the "url" and "sitemap" children of a SiteMap document.
* Each such child is parsed by a UrlsetItemContextClass; when the child closes, it is asked to
* add its references through the activities interface.
*/
protected class UrlsetContextClass extends XMLParsingContext
{
  /** The document identifier */
  protected String documentIdentifier;
  /** Activities interface */
  protected IProcessActivity activities;
  /** Filter */
  protected Filter filter;
  /** ttl value; null unless something assigns it (this class itself never does) */
  protected String ttlValue = null;

  public UrlsetContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
  {
    super(theStream,namespace,localName,qName,atts);
    this.documentIdentifier = documentIdentifier;
    this.activities = activities;
    this.filter = filter;
  }

  /** Open an item context for "url" and "sitemap" tags; skip everything else. */
  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    final boolean isItem = localName.equals("url") || localName.equals("sitemap");
    if (!isItem)
    {
      // Skip everything else.
      return super.beginTag(namespace,localName,qName,atts);
    }
    // Item seen.  None of the attributes are needed, but a new context must be started.
    return new UrlsetItemContextClass(theStream,namespace,localName,qName,atts);
  }

  /** When an item closes, have it add the references it collected. */
  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    final XMLParsingContext closing = theStream.getContext();
    final String tagName = closing.getLocalname();
    if (!tagName.equals("url") && !tagName.equals("sitemap"))
    {
      super.endTag();
      return;
    }
    // The item context has finished parsing, so it should now hold the link(s) and the last-modified
    // date for the item.  Pull that information out and pass it to the activities interface.
    // NOTE: per the original author's notes, tagCleanup() is called for the item context after this
    // method returns, and is also called when a parse error prevents this method from running at all.
    final UrlsetItemContextClass itemContext = (UrlsetItemContextClass)closing;
    itemContext.process(documentIdentifier,activities,filter);
  }

  /** Set the rescan schedule for the SiteMap document itself.
  * The default rescan time from the filter is used unless a parseable ttl value (in minutes) is present.
  * The result is then raised to the filter's minimum rescan time if it falls below it.
  *@return true in all cases.
  */
  protected boolean process()
    throws ManifoldCFException
  {
    final long currentTime = System.currentTimeMillis();
    Long rescanTime = filter.getDefaultRescanTime(currentTime);

    // A ttl value, if found, signals when this feed ought to be looked at again.
    if (ttlValue != null)
    {
      try
      {
        final int minutes = Integer.parseInt(ttlValue);
        rescanTime = Long.valueOf(currentTime + minutes * 60000L);
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly");
      }
      catch (NumberFormatException e)
      {
        // Keep the default rescan time when the ttl cannot be parsed.
        Logging.connectors.warn("RSS: SiteMap document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'");
      }
    }

    if (rescanTime != null)
    {
      final Long minimumTime = filter.getMinimumRescanTime(currentTime);
      if (minimumTime != null && rescanTime.longValue() < minimumTime.longValue())
        rescanTime = minimumTime;
    }

    if (Logging.connectors.isDebugEnabled())
      Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"' setting rescan time to "+((rescanTime==null)?"null":rescanTime.toString()));

    // The same time is passed for both of the first two bounds.
    activities.setDocumentScheduleBounds(documentIdentifier,rescanTime,rescanTime,null,null);
    return true;
  }
}
/** Parsing context for a single "url" or "sitemap" item of a SiteMap document.
* Collects the text of the "loc" and "lastmod" children and, on request, converts them into
* document references.
*/
protected class UrlsetItemContextClass extends XMLParsingContext
{
  /** Text of the "loc" child, or null if none was seen */
  protected String linkField = null;
  /** Text of the "lastmod" child, or null if none was seen */
  protected String pubDateField = null;

  public UrlsetItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
  {
    super(theStream,namespace,localName,qName,atts);
  }

  /** Start a string-collecting context for "loc" and "lastmod"; skip everything else. */
  @Override
  protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
    throws ManifoldCFException
  {
    final boolean wanted = localName.equals("loc") || localName.equals("lastmod");
    if (!wanted)
    {
      // Skip everything else.
      return super.beginTag(namespace,localName,qName,atts);
    }
    return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
  }

  /** Store the collected text of a closing "loc" or "lastmod" child. */
  @Override
  protected void endTag()
    throws ManifoldCFException
  {
    final XMLParsingContext closing = theStream.getContext();
    final String tagName = closing.getLocalname();
    if (tagName.equals("loc"))
    {
      linkField = ((XMLStringParsingContext)closing).getValue();
      return;
    }
    if (tagName.equals("lastmod"))
    {
      pubDateField = ((XMLStringParsingContext)closing).getValue();
      return;
    }
    super.endTag();
  }

  /** Nothing to clean up: this context holds no files. */
  protected void tagCleanup()
    throws ManifoldCFException
  {
  }

  /** Convert the collected link(s) into document references.
  * The "loc" text is split on ", "; each piece is canonicalized, checked against the filter, and
  * added with "pubdate" and "source" carry-down data.
  *@param documentIdentifier is the identifier of the SiteMap document being parsed.
  *@param activities is the interface used to add the discovered references.
  *@param filter supplies the canonicalization policies and legality checks.
  */
  public void process(String documentIdentifier, IProcessActivity activities, Filter filter)
    throws ManifoldCFException
  {
    if (linkField == null || linkField.length() == 0)
      return;

    // Origination time in milliseconds, or null when no usable date was found.
    Long origDate = null;
    if (pubDateField != null && pubDateField.length() > 0)
    {
      final Date parsedDate = DateParser.parseISO8601Date(pubDateField);
      if (parsedDate != null)
        origDate = Long.valueOf(parsedDate.getTime());
    }

    for (String candidate : linkField.split(", "))
    {
      final String rawURL = candidate.trim();
      // Process the link
      final String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
      if (newIdentifier == null)
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
        continue;
      }

      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
          ((origDate==null)?"null":origDate.toString()));

      if (!filter.isLegalURL(newIdentifier))
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
        continue;
      }

      // It's a reference!  Add it.
      final String[] dataNames = new String[]{"pubdate","source"};
      final String[][] dataValues = new String[dataNames.length][];
      if (origDate != null)
        dataValues[0] = new String[]{origDate.toString()};
      dataValues[1] = new String[]{documentIdentifier};
      // Add document reference, including the data to pass down
      activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
    }
  }
}
/** Report how many documents may be amalgamated into one batch for this connector.
*@return the maximum number (0 would indicate "unlimited"); this connector always answers 1.
*/
public int getMaxDocumentRequest()
{
  // RSS and the web in general do not batch well: multiple chunks have no advantage over
  // one-at-a-time requests, so documents are requested singly.
  final int documentsPerRequest = 1;
  return documentsPerRequest;
}
// Protected methods and classes
/** Look up the throttled fetcher registered for the current throttle group name,
* creating and registering a new one if none exists yet.
* The lookup and the registration happen under the fetcher map's lock.
*@return the fetcher for the current throttle group; never null.
*/
protected ThrottledFetcher getFetcher()
{
  synchronized (fetcherMap)
  {
    final ThrottledFetcher existing = fetcherMap.get(throttleGroupName);
    if (existing != null)
      return existing;
    final ThrottledFetcher created = new ThrottledFetcher();
    fetcherMap.put(throttleGroupName,created);
    return created;
  }
}
/** Read a string as a sequence of individual expressions, urls, etc., one per line.
* Each line is trimmed of leading and trailing whitespace.  Lines that are empty after trimming,
* and lines whose first non-whitespace character is '#', are skipped.
*@param input is the multi-line text to split; null is treated as empty.
*@return the remaining lines, trimmed, in their original order; never null.
*/
protected static List<String> stringToArray(String input)
{
  List<String> list = new ArrayList<String>();
  if (input == null)
    return list;
  try
  {
    java.io.BufferedReader is = new java.io.BufferedReader(new java.io.StringReader(input));
    try
    {
      while (true)
      {
        String nextString = is.readLine();
        if (nextString == null)
          break;
        // String.trim() returns a new string; the result must be kept for the checks below to see it.
        nextString = nextString.trim();
        if (nextString.length() == 0)
          continue;
        if (nextString.startsWith("#"))
          continue;
        list.add(nextString);
      }
    }
    finally
    {
      // Closing the BufferedReader also closes the underlying StringReader.
      is.close();
    }
  }
  catch (java.io.IOException e)
  {
    // Not expected when reading from an in-memory string; return whatever was gathered.
  }
  return list;
}
/** Compile every regular expression in the input list and append the compiled patterns,
* in order, to the output list.
*@param output receives the compiled patterns.
*@param input is the list of regular expression strings.
*@throws ManifoldCFException if any entry is not a legal regular expression; patterns compiled
* before the bad entry have already been appended to output.
*/
protected static void compileList(List<Pattern> output, List<String> input)
  throws ManifoldCFException
{
  final Iterator<String> expressions = input.iterator();
  while (expressions.hasNext())
  {
    final String expression = expressions.next();
    final Pattern compiled;
    try
    {
      compiled = Pattern.compile(expression);
    }
    catch (PatternSyntaxException e)
    {
      throw new ManifoldCFException("Mapping regular expression '"+expression+"' is illegal: "+e.getMessage(),e);
    }
    output.add(compiled);
  }
}
/** Look up the robots object registered for the current throttle group name,
* creating and registering a new one (bound to the given fetcher) if none exists yet.
* The lookup and the registration happen under the robots map's lock.
*@param fetcher is the fetcher handed to a newly created robots object.
*@return the robots object for the current throttle group; never null.
*/
protected Robots getRobots(ThrottledFetcher fetcher)
{
  synchronized (robotsMap)
  {
    final Robots existing = (Robots)robotsMap.get(throttleGroupName);
    if (existing != null)
      return existing;
    final Robots created = new Robots(fetcher);
    robotsMap.put(throttleGroupName,created);
    return created;
  }
}
// Protected classes
/** The throttle specification class.  Each server name is a different bin in this model,
* and every bin gets the same three limits, fixed at construction time.
*/
protected static class ThrottleSpec implements IThrottleSpec
{
  /** Connection limit reported for every bin */
  protected final int maxOpenConnectionsPerServer;
  /** Minimum milliseconds per fetch reported for every bin */
  protected final long minimumMillisecondsPerFetchPerServer;
  /** Minimum milliseconds per byte reported for every bin */
  protected final double minimumMillisecondsPerBytePerServer;

  /** Constructor.
  *@param maxOpenConnectionsPerServer is the value returned by getMaxOpenConnections().
  *@param minimumMillisecondsPerFetchPerServer is the value returned by getMinimumMillisecondsPerFetch().
  *@param minimumMillisecondsPerBytePerServer is the value returned by getMinimumMillisecondsPerByte().
  */
  public ThrottleSpec(int maxOpenConnectionsPerServer, long minimumMillisecondsPerFetchPerServer,
    double minimumMillisecondsPerBytePerServer)
  {
    this.maxOpenConnectionsPerServer = maxOpenConnectionsPerServer;
    this.minimumMillisecondsPerFetchPerServer = minimumMillisecondsPerFetchPerServer;
    this.minimumMillisecondsPerBytePerServer = minimumMillisecondsPerBytePerServer;
  }

  /** Given a bin name, find the max open connections to use for that bin.
  *@param binName is ignored; all bins share one limit.
  *@return the configured per-server connection limit.
  */
  public int getMaxOpenConnections(String binName)
  {
    return this.maxOpenConnectionsPerServer;
  }

  /** Look up minimum milliseconds per byte for a bin.
  *@param binName is ignored; all bins share one limit.
  *@return the configured per-server minimum milliseconds per byte.
  */
  public double getMinimumMillisecondsPerByte(String binName)
  {
    return this.minimumMillisecondsPerBytePerServer;
  }

  /** Look up minimum milliseconds for a fetch for a bin.
  *@param binName is ignored; all bins share one limit.
  *@return the configured per-server minimum milliseconds per fetch.
  */
  public long getMinimumMillisecondsPerFetch(String binName)
  {
    return this.minimumMillisecondsPerFetchPerServer;
  }
}
/** Simple holder pairing a name with a value.  Neither part is validated; nulls are stored as given. */
protected static class NameValue
{
  /** The name part of the pair */
  protected String name;
  /** The value part of the pair */
  protected String value;

  /** Constructor.
  *@param name is the name to hold.
  *@param value is the value to hold.
  */
  public NameValue(String name, String value)
  {
    this.value = value;
    this.name = name;
  }

  /** @return the name given at construction. */
  public String getName()
  {
    return this.name;
  }

  /** @return the value given at construction. */
  public String getValue()
  {
    return this.value;
  }
}
/** Evaluator token.
* A token is one of three kinds: a comma separator, a literal piece of text, or a reference to a
* regular expression group together with the case style to apply to that group's value.
*/
protected static class EvaluatorToken
{
  // Token kinds
  public final static int TYPE_GROUP = 0;
  public final static int TYPE_TEXT = 1;
  public final static int TYPE_COMMA = 2;

  // Case styles for group tokens
  public final static int GROUPSTYLE_NONE = 0;
  public final static int GROUPSTYLE_LOWER = 1;
  public final static int GROUPSTYLE_UPPER = 2;
  public final static int GROUPSTYLE_MIXED = 3;

  /** One of the TYPE_ constants */
  protected int type;
  /** Group number; stays -1 unless this is a group token */
  protected int groupNumber = -1;
  /** One of the GROUPSTYLE_ constants; stays GROUPSTYLE_NONE unless set by the group constructor */
  protected int groupStyle = GROUPSTYLE_NONE;
  /** Literal text; stays null unless this is a text token */
  protected String textValue = null;

  /** Construct a comma token. */
  public EvaluatorToken()
  {
    this.type = TYPE_COMMA;
  }

  /** Construct a group token.
  *@param groupNumber is the number of the referenced group.
  *@param groupStyle is one of the GROUPSTYLE_ constants.
  */
  public EvaluatorToken(int groupNumber, int groupStyle)
  {
    this.type = TYPE_GROUP;
    this.groupStyle = groupStyle;
    this.groupNumber = groupNumber;
  }

  /** Construct a text token.
  *@param text is the literal text.
  */
  public EvaluatorToken(String text)
  {
    this.type = TYPE_TEXT;
    this.textValue = text;
  }

  /** @return one of the TYPE_ constants. */
  public int getType()
  {
    return this.type;
  }

  /** @return the group number, or -1 for tokens that are not group tokens. */
  public int getGroupNumber()
  {
    return this.groupNumber;
  }

  /** @return one of the GROUPSTYLE_ constants. */
  public int getGroupStyle()
  {
    return this.groupStyle;
  }

  /** @return the literal text, or null for tokens that are not text tokens. */
  public String getTextValue()
  {
    return this.textValue;
  }
}
/** Token stream.
* Splits an evaluation expression into EvaluatorToken objects on demand.  The expression grammar is:
* double-quoted text (a backslash escapes the next character), commas, and group references made of
* an optional decimal group number followed by an optional style word starting with "u", "l" or "m".
* Characters at or below the space character are skipped between tokens.
*/
protected static class EvaluatorTokenStream
{
  /** The expression being tokenized */
  protected String text;
  /** Index of the next unread character in text */
  protected int pos;
  /** The token most recently read and not yet consumed, or null */
  protected EvaluatorToken token = null;

  /** Constructor.
  *@param text is the expression to tokenize.
  */
  public EvaluatorTokenStream(String text)
  {
    this.text = text;
    this.pos = 0;
  }

  /** Get current token.
  *@return the current token, reading it from the text if necessary; null when the text is exhausted.
  */
  public EvaluatorToken peek()
    throws ManifoldCFException
  {
    if (token != null)
      return token;
    token = nextToken();
    return token;
  }

  /** Go on to next token.  The next peek() will read a fresh token.
  */
  public void advance()
  {
    token = null;
  }

  /** Read one token starting at the current position.
  *@return the token, or null if only skippable characters remain.
  *@throws ManifoldCFException if a style word does not start with "u", "l" or "m".
  */
  protected EvaluatorToken nextToken()
    throws ManifoldCFException
  {
    final int length = text.length();

    // Skip anything at or below the space character.
    while (pos != length && text.charAt(pos) <= ' ')
      pos++;
    if (pos == length)
      return null;

    char x = text.charAt(pos);

    if (x == '"')
    {
      // Quoted text.  An unterminated quote simply ends at the end of the expression.
      pos++;
      final StringBuilder literal = new StringBuilder();
      while (pos != length)
      {
        x = text.charAt(pos++);
        if (x == '"')
          break;
        if (x == '\\')
        {
          // A trailing backslash with nothing after it is dropped.
          if (pos == length)
            break;
          x = text.charAt(pos++);
        }
        literal.append(x);
      }
      return new EvaluatorToken(literal.toString());
    }

    if (x == ',')
    {
      pos++;
      return new EvaluatorToken();
    }

    // Leading decimal digits form the group number; no digits means group 0.
    final int digitsStart = pos;
    while (pos != length)
    {
      final char c = text.charAt(pos);
      if (c < '0' || c > '9')
        break;
      pos++;
    }
    final String numberValue = text.substring(digitsStart,pos);
    int groupNumber = 0;
    if (numberValue.length() > 0)
      groupNumber = Integer.parseInt(numberValue);

    // The style word runs up to the next comma, digit, or skippable character.
    final int modifierStart = pos;
    while (pos != length)
    {
      final char c = text.charAt(pos);
      if (c == ',' || (c >= '0' && c <= '9') || c <= ' ')
        break;
      pos++;
    }

    if (modifierStart == pos)
      return new EvaluatorToken(groupNumber,EvaluatorToken.GROUPSTYLE_NONE);

    final String modifier = text.substring(modifierStart,pos);
    final int style;
    if (modifier.startsWith("u"))
      style = EvaluatorToken.GROUPSTYLE_UPPER;
    else if (modifier.startsWith("l"))
      style = EvaluatorToken.GROUPSTYLE_LOWER;
    else if (modifier.startsWith("m"))
      style = EvaluatorToken.GROUPSTYLE_MIXED;
    else
      throw new ManifoldCFException("Unknown style: "+modifier);
    return new EvaluatorToken(groupNumber,style);
  }
}
/** Class representing a URL regular expression match, for the purposes of determining canonicalization policy.
* It pairs a pattern with the set of canonicalization operations permitted for URLs the pattern matches.
*/
protected static class CanonicalizationPolicy
{
  protected final Pattern matchPattern;
  protected final boolean reorder;
  protected final boolean removeJavaSession;
  protected final boolean removeAspSession;
  protected final boolean removePhpSession;
  protected final boolean removeBVSession;

  /** Constructor.
  *@param matchPattern is the pattern a URL must contain a match for.
  *@param reorder is the value reported by canReorder().
  *@param removeJavaSession is the value reported by canRemoveJavaSession().
  *@param removeAspSession is the value reported by canRemoveAspSession().
  *@param removePhpSession is the value reported by canRemovePhpSession().
  *@param removeBVSession is the value reported by canRemoveBvSession().
  */
  public CanonicalizationPolicy(Pattern matchPattern, boolean reorder, boolean removeJavaSession, boolean removeAspSession,
    boolean removePhpSession, boolean removeBVSession)
  {
    this.matchPattern = matchPattern;
    this.reorder = reorder;
    this.removeJavaSession = removeJavaSession;
    this.removeAspSession = removeAspSession;
    this.removePhpSession = removePhpSession;
    this.removeBVSession = removeBVSession;
  }

  /** Check whether this policy applies to a URL.
  *@return true if the pattern is found anywhere within the url (a full match is not required).
  */
  public boolean checkMatch(String url)
  {
    return matchPattern.matcher(url).find();
  }

  /** @return the reorder flag given at construction. */
  public boolean canReorder()
  {
    return this.reorder;
  }

  /** @return the Java session removal flag given at construction. */
  public boolean canRemoveJavaSession()
  {
    return this.removeJavaSession;
  }

  /** @return the ASP session removal flag given at construction. */
  public boolean canRemoveAspSession()
  {
    return this.removeAspSession;
  }

  /** @return the PHP session removal flag given at construction. */
  public boolean canRemovePhpSession()
  {
    return this.removePhpSession;
  }

  /** @return the BV session removal flag given at construction. */
  public boolean canRemoveBvSession()
  {
    return this.removeBVSession;
  }
}
/** Class representing a list of canonicalization rules, consulted in the order they were added. */
protected static class CanonicalizationPolicies
{
  protected final List<CanonicalizationPolicy> rules = new ArrayList<CanonicalizationPolicy>();

  public CanonicalizationPolicies()
  {
  }

  /** Append a rule to the end of the list. */
  public void addRule(CanonicalizationPolicy rule)
  {
    rules.add(rule);
  }

  /** Find the first rule, in insertion order, that applies to a URL.
  *@return the first matching rule, or null if no rule matches.
  */
  public CanonicalizationPolicy findMatch(String url)
  {
    final Iterator<CanonicalizationPolicy> candidates = rules.iterator();
    while (candidates.hasNext())
    {
      final CanonicalizationPolicy candidate = candidates.next();
      if (candidate.checkMatch(url))
        return candidate;
    }
    return null;
  }
}
/** Class representing a mapping rule.
* A rule pairs a match pattern, which must match the ENTIRE url, with an evaluation expression
* describing how to build the mapped url from literal text and the pattern's capture groups.
*/
protected static class MappingRule
{
  protected final Pattern matchPattern;
  protected final String evalExpression;

  /** Constructor.
  *@param matchPattern is the pattern a url must match in full.
  *@param evalExpression is the output expression; null or empty means "return the url unchanged".
  */
  public MappingRule(Pattern matchPattern, String evalExpression)
  {
    this.matchPattern = matchPattern;
    this.evalExpression = evalExpression;
  }

  /** Check whether this rule applies to a url.
  *@return true if the pattern matches the whole url.
  */
  public boolean checkMatch(String url)
  {
    Matcher matcher = matchPattern.matcher(url);
    return matcher.matches();
  }

  /** Map a url through this rule.
  *@param url is the url to map.
  *@return null if the url does not match the pattern; the url itself if there is no evaluation
  * expression; otherwise the result of evaluating the expression against the match.
  *@throws ManifoldCFException if the expression is malformed or refers to a group the pattern
  * does not define.
  */
  public String map(String url)
    throws ManifoldCFException
  {
    // Create a matcher, and attempt to do a match
    Matcher matcher = matchPattern.matcher(url);
    if (!matcher.matches())
    {
      return null;
    }
    // A match!  Now, interpret the output expression
    if (evalExpression == null || evalExpression.length() == 0)
      return url;
    StringBuilder sb = new StringBuilder();
    EvaluatorTokenStream et = new EvaluatorTokenStream(evalExpression);
    while (true)
    {
      EvaluatorToken t = et.peek();
      if (t == null)
        break;
      switch (t.getType())
      {
      case EvaluatorToken.TYPE_COMMA:
        // Commas only separate terms; they contribute nothing to the output.
        et.advance();
        break;
      case EvaluatorToken.TYPE_GROUP:
        et.advance();
        appendGroup(sb,matcher,t);
        break;
      case EvaluatorToken.TYPE_TEXT:
        et.advance();
        sb.append(t.getTextValue());
        break;
      default:
        throw new ManifoldCFException("Illegal token type");
      }
    }
    return sb.toString();
  }

  /** Append the value of the group referenced by a token, applying the token's case style. */
  private void appendGroup(StringBuilder sb, Matcher matcher, EvaluatorToken t)
    throws ManifoldCFException
  {
    int groupNumber = t.getGroupNumber();
    // Report a bad group reference as a configuration error rather than letting
    // Matcher.group() throw an unchecked IndexOutOfBoundsException.
    if (groupNumber < 0 || groupNumber > matcher.groupCount())
      throw new ManifoldCFException("Mapping expression refers to group "+groupNumber+
        ", but regular expression '"+matchPattern.pattern()+"' only defines "+matcher.groupCount()+" group(s)");
    String groupValue = matcher.group(groupNumber);
    // Matcher.group() returns null for a group that did not take part in the match (for example
    // an unused alternative).  Treat that as empty instead of emitting "null" or hitting an NPE.
    if (groupValue == null)
      groupValue = "";
    switch (t.getGroupStyle())
    {
    case EvaluatorToken.GROUPSTYLE_NONE:
      sb.append(groupValue);
      break;
    case EvaluatorToken.GROUPSTYLE_LOWER:
      sb.append(groupValue.toLowerCase(Locale.ROOT));
      break;
    case EvaluatorToken.GROUPSTYLE_UPPER:
      sb.append(groupValue.toUpperCase(Locale.ROOT));
      break;
    case EvaluatorToken.GROUPSTYLE_MIXED:
      // First character upper case, the rest lower case.
      if (groupValue.length() > 0)
      {
        sb.append(groupValue.substring(0,1).toUpperCase(Locale.ROOT));
        sb.append(groupValue.substring(1).toLowerCase(Locale.ROOT));
      }
      break;
    default:
      throw new ManifoldCFException("Illegal group style");
    }
  }
}
/** Class that represents all mappings, consulted in the order they were added.
* An empty set of mappings accepts every url and maps each url to itself.
*/
protected static class MappingRules
{
  protected final List<MappingRule> mappings = new ArrayList<MappingRule>();

  public MappingRules()
  {
  }

  /** Append a rule to the end of the list. */
  public void add(MappingRule rule)
  {
    mappings.add(rule);
  }

  /** Check whether a url is accepted by the mappings.
  *@return true if there are no rules at all, or if at least one rule matches the url.
  */
  public boolean isMatch(String url)
  {
    if (mappings.isEmpty())
      return true;
    final Iterator<MappingRule> candidates = mappings.iterator();
    while (candidates.hasNext())
    {
      if (candidates.next().checkMatch(url))
        return true;
    }
    return false;
  }

  /** Map a url through the first rule that produces a result.
  *@return the url itself if there are no rules; otherwise the first non-null mapping result,
  * or null if no rule yields one.
  */
  public String map(String url)
    throws ManifoldCFException
  {
    if (mappings.isEmpty())
      return url;
    final Iterator<MappingRule> candidates = mappings.iterator();
    while (candidates.hasNext())
    {
      final String mapped = candidates.next().map(url);
      if (mapped != null)
        return mapped;
    }
    return null;
  }
}
/** Class that handles parsing and interpretation of the document specification.
* The specification is scanned once, at construction, and everything it describes (mappings, seeds,
* rescan intervals, content modes, timeout, access tokens, canonicalization policies and exclude
* patterns) is kept in this object, on the belief that this is faster than rescanning the
* specification for each question.
*/
protected static class Filter
{
  protected final MappingRules mappings = new MappingRules();
  protected final Set<String> seeds;
  /** Rescan intervals as read from the specification; null when not specified */
  protected Integer defaultRescanInterval = null;
  protected Integer minimumRescanInterval = null;
  protected Integer badFeedRescanInterval = null;
  protected int dechromedContentMode = DECHROMED_NONE;
  protected int chromedContentMode = CHROMED_USE;
  /** Feed timeout; the specification value is multiplied by 1000 before being stored */
  protected int feedTimeoutValue = 60000;
  protected final Set<String> acls = new HashSet<String>();
  protected final CanonicalizationPolicies canonicalizationPolicies = new CanonicalizationPolicies();
  /** The arraylist of exclude patterns */
  protected final List<Pattern> excludePatterns = new ArrayList<Pattern>();

  /** Constructor.
  *@param spec is the document specification to interpret.
  *@param warnOnBadSeed is true if a warning should be logged for each seed url that cannot be canonicalized.
  *@throws ManifoldCFException if a regular expression or a number in the specification is malformed.
  */
  public Filter(Specification spec, boolean warnOnBadSeed)
    throws ManifoldCFException
  {
    String excludes = "";

    // To save allocation, preallocate the seeds set at 1.5x the number of nodes in the spec
    int initialSize = spec.getChildCount();
    if (initialSize == 0)
      initialSize = 1;
    seeds = new HashSet<String>((initialSize * 3) >> 1);

    // First pass.  Find all of the rules (which are necessary to canonicalize the seeds, etc.)
    for (int i = 0; i < spec.getChildCount(); i++)
    {
      final SpecificationNode n = spec.getChild(i);
      final String nodeType = n.getType();
      if (nodeType.equals(RSSConfig.NODE_MAP))
      {
        final String match = n.getAttributeValue(RSSConfig.ATTR_MATCH);
        final String map = n.getAttributeValue(RSSConfig.ATTR_MAP);
        if (match != null && match.length() > 0)
        {
          final Pattern p;
          try
          {
            p = Pattern.compile(match);
          }
          catch (java.util.regex.PatternSyntaxException e)
          {
            throw new ManifoldCFException("Regular expression '"+match+"' is illegal: "+e.getMessage(),e);
          }
          mappings.add(new MappingRule(p,(map == null) ? "" : map));
        }
      }
      else if (nodeType.equals(RSSConfig.NODE_EXCLUDES))
      {
        // If there are several excludes nodes, the last one wins.
        final String value = n.getValue();
        excludes = (value == null) ? "" : value;
      }
      else if (nodeType.equals(RSSConfig.NODE_URLSPEC))
      {
        String urlRegexp = n.getAttributeValue(RSSConfig.ATTR_REGEXP);
        if (urlRegexp == null)
          urlRegexp = "";
        final boolean reorderValue = isYes(n.getAttributeValue(RSSConfig.ATTR_REORDER));
        final boolean javaSessionValue = isYes(n.getAttributeValue(RSSConfig.ATTR_JAVASESSIONREMOVAL));
        final boolean aspSessionValue = isYes(n.getAttributeValue(RSSConfig.ATTR_ASPSESSIONREMOVAL));
        final boolean phpSessionValue = isYes(n.getAttributeValue(RSSConfig.ATTR_PHPSESSIONREMOVAL));
        final boolean bvSessionValue = isYes(n.getAttributeValue(RSSConfig.ATTR_BVSESSIONREMOVAL));
        try
        {
          canonicalizationPolicies.addRule(new CanonicalizationPolicy(Pattern.compile(urlRegexp),reorderValue,javaSessionValue,aspSessionValue,
            phpSessionValue, bvSessionValue));
        }
        catch (java.util.regex.PatternSyntaxException e)
        {
          throw new ManifoldCFException("Canonicalization regular expression '"+urlRegexp+"' is illegal: "+e.getMessage(),e);
        }
      }
    }

    compileList(excludePatterns,stringToArray(excludes));

    // Second pass.  Do the rest of the work,
    for (int i = 0; i < spec.getChildCount(); i++)
    {
      final SpecificationNode n = spec.getChild(i);
      final String nodeType = n.getType();
      if (nodeType.equals(RSSConfig.NODE_FEED))
      {
        final String rssURL = n.getAttributeValue(RSSConfig.ATTR_URL);
        if (rssURL != null && rssURL.length() > 0)
        {
          final String canonicalURL = makeDocumentIdentifier(canonicalizationPolicies,null,rssURL);
          if (canonicalURL != null)
          {
            seeds.add(canonicalURL);
          }
          else if (warnOnBadSeed)
          {
            Logging.connectors.warn("RSS: Illegal seed feed '"+rssURL+"'");
          }
        }
      }
      else if (nodeType.equals(RSSConfig.NODE_ACCESS))
      {
        final String token = n.getAttributeValue(RSSConfig.ATTR_TOKEN);
        acls.add(token);
      }
      else if (nodeType.equals(RSSConfig.NODE_FEEDRESCAN))
      {
        final Integer parsed = parseOptionalInteger(n.getAttributeValue(RSSConfig.ATTR_VALUE));
        if (parsed != null)
          defaultRescanInterval = parsed;
      }
      else if (nodeType.equals(RSSConfig.NODE_MINFEEDRESCAN))
      {
        final Integer parsed = parseOptionalInteger(n.getAttributeValue(RSSConfig.ATTR_VALUE));
        if (parsed != null)
          minimumRescanInterval = parsed;
      }
      else if (nodeType.equals(RSSConfig.NODE_BADFEEDRESCAN))
      {
        final Integer parsed = parseOptionalInteger(n.getAttributeValue(RSSConfig.ATTR_VALUE));
        if (parsed != null)
          badFeedRescanInterval = parsed;
      }
      else if (nodeType.equals(RSSConfig.NODE_FEEDTIMEOUT))
      {
        final Integer parsed = parseOptionalInteger(n.getAttributeValue(RSSConfig.ATTR_VALUE));
        if (parsed != null)
          feedTimeoutValue = parsed.intValue() * 1000;
      }
      else if (nodeType.equals(RSSConfig.NODE_DECHROMEDMODE))
      {
        final String mode = n.getAttributeValue(RSSConfig.ATTR_MODE);
        if (mode != null && mode.length() > 0)
        {
          // Unrecognized mode values leave the current setting untouched.
          if (mode.equals(RSSConfig.VALUE_NONE))
            dechromedContentMode = DECHROMED_NONE;
          else if (mode.equals(RSSConfig.VALUE_DESCRIPTION))
            dechromedContentMode = DECHROMED_DESCRIPTION;
          else if (mode.equals(RSSConfig.VALUE_CONTENT))
            dechromedContentMode = DECHROMED_CONTENT;
        }
      }
      else if (nodeType.equals(RSSConfig.NODE_CHROMEDMODE))
      {
        final String mode = n.getAttributeValue(RSSConfig.ATTR_MODE);
        if (mode != null && mode.length() > 0)
        {
          // Unrecognized mode values leave the current setting untouched.
          if (mode.equals(RSSConfig.VALUE_USE))
            chromedContentMode = CHROMED_USE;
          else if (mode.equals(RSSConfig.VALUE_SKIP))
            chromedContentMode = CHROMED_SKIP;
          else if (mode.equals(RSSConfig.VALUE_METADATA))
            chromedContentMode = CHROMED_METADATA_ONLY;
        }
      }
    }
  }

  /** Interpret an attribute as a yes/no flag: true only when it is present and equals the "yes" value. */
  private static boolean isYes(String attributeValue)
  {
    return attributeValue != null && attributeValue.equals(RSSConfig.VALUE_YES);
  }

  /** Parse an optional integer attribute.
  *@return null when the attribute is missing or empty, otherwise its integer value.
  *@throws ManifoldCFException if the attribute is present but is not a legal integer.
  */
  private static Integer parseOptionalInteger(String attributeValue)
    throws ManifoldCFException
  {
    if (attributeValue == null || attributeValue.length() == 0)
      return null;
    try
    {
      return Integer.valueOf(attributeValue);
    }
    catch (NumberFormatException e)
    {
      throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
    }
  }

  /** Add an interval, multiplied by 60000, to the current time; null interval gives null. */
  private static Long offsetFrom(Integer interval, long currentTime)
  {
    if (interval == null)
      return null;
    return Long.valueOf(interval.intValue() * 60000L + currentTime);
  }

  /** Check if document is a seed */
  public boolean isSeed(String canonicalUrl)
  {
    return seeds.contains(canonicalUrl);
  }

  /** Iterate over all canonicalized seeds */
  public Iterator<String> getSeeds()
  {
    return seeds.iterator();
  }

  /** Get the acls
  *@return a new array holding every access token read from the specification.
  */
  public String[] getAcls()
  {
    return acls.toArray(new String[acls.size()]);
  }

  /** Get the feed timeout value */
  public int getFeedTimeoutValue()
  {
    return feedTimeoutValue;
  }

  /** Get the dechromed content mode */
  public int getDechromedContentMode()
  {
    return dechromedContentMode;
  }

  /** Get the chromed content mode */
  public int getChromedContentMode()
  {
    return chromedContentMode;
  }

  /** Get the next time (by default) a feed should be scanned
  *@return currentTime plus the default rescan interval times 60000, or null if no default interval was specified.
  */
  public Long getDefaultRescanTime(long currentTime)
  {
    return offsetFrom(defaultRescanInterval,currentTime);
  }

  /** Get the minimum next time a feed should be scanned
  *@return currentTime plus the minimum rescan interval times 60000, or null if no minimum interval was specified.
  */
  public Long getMinimumRescanTime(long currentTime)
  {
    return offsetFrom(minimumRescanInterval,currentTime);
  }

  /** Get the next time a "bad feed" should be rescanned
  *@return currentTime plus the bad-feed rescan interval times 60000, or null if no such interval was specified.
  */
  public Long getBadFeedRescanTime(long currentTime)
  {
    return offsetFrom(badFeedRescanInterval,currentTime);
  }

  /** Check for legality of a url.
  * @return true if the passed-in url is either a seed, or a legal url, according to this
  * filter.  A non-seed url is legal when the mappings accept it and no exclude pattern is
  * found within it.
  */
  public boolean isLegalURL(String url)
  {
    if (seeds.contains(url))
      return true;

    if (!mappings.isMatch(url))
    {
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("RSS: Url '"+url+"' is illegal because it did not match a mapping rule");
      return false;
    }

    // Now make sure it's not in the exclude list.
    final Iterator<Pattern> excluders = excludePatterns.iterator();
    while (excluders.hasNext())
    {
      final Pattern p = excluders.next();
      if (p.matcher(url).find())
      {
        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("RSS: Url '"+url+"' is illegal because exclude pattern '"+p.toString()+"' matched it");
        return false;
      }
    }
    return true;
  }

  /** Scan patterns and return the one that matches first.
  * @return null if the url is a seed, doesn't match, or should not be ingested; otherwise the mapped url.
  */
  public String mapDocumentURL(String url)
    throws ManifoldCFException
  {
    return seeds.contains(url) ? null : mappings.map(url);
  }

  /** Get canonicalization policies */
  public CanonicalizationPolicies getCanonicalizationPolicies()
  {
    return canonicalizationPolicies;
  }
}
}