/* $Id: DNSManager.java 988245 2010-08-23 18:39:35Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.webcrawler;
import java.util.*;
import java.io.*;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.authorities.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.CacheKeyFactory;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
import org.apache.manifoldcf.crawler.system.Logging;
/** This class manages the database table into which we write DNS entries for hosts. The data resides in the database,
* as well as in cache (up to a certain point). The result is that there is a memory limited, database-backed repository
* of DNS entries that we can draw on.
* Note that this code is also responsible for efficiently caching the mapping of IP address to a canonical host name.
*
* <br><br>
* <b>dnsdata</b>
* <table border="1" cellpadding="3" cellspacing="0">
* <tr class="TableHeadingColor">
* <th>Field</th><th>Type</th><th>Description</th></tr>
* <tr><td>hostname</td><td>VARCHAR(255)</td><td>Primary Key</td></tr>
* <tr><td>canonicalhostname</td><td>VARCHAR(255)</td><td></td></tr>
* <tr><td>ipaddress</td><td>VARCHAR(16)</td><td></td></tr>
* <tr><td>expirationtime</td><td>BIGINT</td><td></td></tr>
* </table>
* <br><br>
*
*/
public class DNSManager extends org.apache.manifoldcf.core.database.BaseTable
{
public static final String _rcsid = "@(#)$Id: DNSManager.java 988245 2010-08-23 18:39:35Z kwright $";

// DNS cache class descriptor. One shared instance is all that is needed.
protected static DNSCacheClass dnsCacheClass = new DNSCacheClass();

// Names of the columns of the table
protected static final String hostField = "hostname";
protected static final String fqdnField = "canonicalhostname";
protected static final String ipaddressField = "ipaddress";
protected static final String expirationField = "expirationtime";

// Cache manager handle; assigned in the constructor.
ICacheManager cacheManager;
/** Constructor. A DNSManager handle is only useful within a specific thread context,
* so the calling connector logic must create a new handle whenever the thread context changes.
*@param tc is the thread context; it is used to obtain the cache manager handle.
*@param database is the database handle, passed to the base table along with the table name "dnsdata".
*@throws ManifoldCFException propagated from the base table constructor or the cache manager factory.
*/
public DNSManager(IThreadContext tc, IDBInterface database)
  throws ManifoldCFException
{
  super(database,"dnsdata");
  this.cacheManager = CacheManagerFactory.make(tc);
}
/** Install the manager.
* The table is created when no schema is reported for it; an existing table is left as it is.
*@throws ManifoldCFException propagated from the schema lookup or the table creation.
*/
public void install()
  throws ManifoldCFException
{
  // A single pass is all that is needed, and no transaction is opened around it.
  Map currentSchema = getTableSchema(null,null);
  if (currentSchema == null)
  {
    // No table yet: describe every column, then create the table.
    HashMap columns = new HashMap();
    columns.put(hostField,new ColumnDescription("VARCHAR(255)",true,false,null,null,false));
    columns.put(fqdnField,new ColumnDescription("VARCHAR(255)",false,true,null,null,false));
    columns.put(ipaddressField,new ColumnDescription("VARCHAR(16)",false,true,null,null,false));
    columns.put(expirationField,new ColumnDescription("BIGINT",false,false,null,null,false));
    performCreate(columns,null);
  }
  // When the table already exists there is nothing to do; upgrade code, if ever needed, goes here.

  // No indexes are added. An index on the ip address column was considered at one point,
  // but it does not seem necessary after all.
}
/** Uninstall the manager.
* This simply delegates to the base table's drop operation.
*@throws ManifoldCFException propagated from the drop operation.
*/
public void deinstall()
  throws ManifoldCFException
{
  // Nothing but the drop itself is needed; a null argument is passed along.
  performDrop(null);
}
/** Given a host name, look up the ip address and fqdn.
* The request goes through the cache manager, which hands back a cached record or has one
* read from the database by the executor.
*@param hostName is the host to look up.
*@param currentTime is the time the record's expiration time is compared against.
*@return the DNS info, or null if no record is available or the record's expiration time
* is not later than currentTime.
*/
public DNSInfo lookup(String hostName, long currentTime)
  throws ManifoldCFException
{
  // The description carries the host name plus the single cache key for that host.
  StringSetBuffer keyBuffer = new StringSetBuffer();
  keyBuffer.add(getDNSKey(hostName));
  HostDescription description = new HostDescription(hostName,new StringSet(keyBuffer));

  // The executor captures the record for exactly this description.
  HostExecutor executor = new HostExecutor(this,description);
  cacheManager.findObjectsAndExecute(new HostDescription[]{description},null,executor,getTransactionID());

  // Only hand back a record that exists and has not yet expired.
  DNSInfo info = executor.getResults();
  if (info != null && info.getExpirationTime() > currentTime)
  {
    return info;
  }
  return null;
}
/** Write DNS data, replacing any existing row.
* The write happens inside a transaction while the host's cache key is held, and the key is
* invalidated once the row has been written. Any ManifoldCFException, RuntimeException or Error
* raised inside the transaction signals a rollback before it is rethrown.
*@param hostName is the host.
*@param fqdn is the canonical host name; null is stored as the empty string.
*@param ipaddress is the host ip address, in standard form; null is stored as the empty string.
*@param expirationTime is the time this data should expire.
*@throws ManifoldCFException propagated from the database or cache manager operations.
*/
public void writeDNSData(String hostName, String fqdn, String ipaddress, long expirationTime)
  throws ManifoldCFException
{
  StringSetBuffer ssb = new StringSetBuffer();
  ssb.add(getDNSKey(hostName));
  StringSet cacheKeys = new StringSet(ssb);

  ICacheHandle ch = cacheManager.enterCache(null,cacheKeys,getTransactionID());
  try
  {
    beginTransaction();
    try
    {
      // See whether the instance exists. Only the row count is looked at, so selecting
      // the key column is sufficient.
      ArrayList params = new ArrayList();
      params.add(hostName);
      IResultSet set = performQuery("SELECT "+hostField+" FROM "+getTableName()+" WHERE "+
        hostField+"=?",params,null,null);

      // Column values shared by the update and the insert. readDNSInfo() maps the empty
      // strings written here back to null.
      HashMap values = new HashMap();
      values.put(expirationField,Long.valueOf(expirationTime));
      values.put(fqdnField,(fqdn == null)?"":fqdn);
      values.put(ipaddressField,(ipaddress == null)?"":ipaddress);

      if (set.getRowCount() > 0)
      {
        // Update
        params.clear();
        params.add(hostName);
        performUpdate(values," WHERE "+hostField+"=?",params,null);
      }
      else
      {
        // Insert
        values.put(hostField,hostName);
        performInsert(values,null);
      }

      cacheManager.invalidateKeys(ch);
    }
    catch (ManifoldCFException e)
    {
      signalRollback();
      throw e;
    }
    catch (RuntimeException e)
    {
      // Unchecked failures must not reach endTransaction() without a rollback signal either.
      signalRollback();
      throw e;
    }
    catch (Error e)
    {
      signalRollback();
      throw e;
    }
    finally
    {
      endTransaction();
    }
  }
  finally
  {
    cacheManager.leaveCache(ch);
  }
}
// Protected methods and classes
/** Prefix placed in front of a host name to form its cache key. */
private static final String DNS_KEY_PREFIX = "DNS_";

/** Construct a key which represents an individual host name.
*@param hostName is the name of the host.
*@return the cache key, which is the host name with "DNS_" placed in front of it.
*/
protected static String getDNSKey(String hostName)
{
  return DNS_KEY_PREFIX + hostName;
}
/** Read DNS data for a host from the database table, if it exists.
*@param hostName is the host whose row should be read.
*@return null if the data doesn't exist at all. Return DNS data if it does; an ip address or
* fqdn stored as the empty string comes back as null.
*/
protected DNSInfo readDNSInfo(String hostName)
  throws ManifoldCFException
{
  ArrayList queryParams = new ArrayList();
  queryParams.add(hostName);
  String query = "SELECT "+ipaddressField+","+fqdnField+","+expirationField+" FROM "+getTableName()+
    " WHERE "+hostField+"=?";
  IResultSet results = performQuery(query,queryParams,null,null);
  if (results.getRowCount() == 0)
  {
    return null;
  }

  // Only the first row is consulted.
  IResultRow row = results.getRow(0);
  long expiration = ((Long)row.getValue(expirationField)).longValue();

  // An empty string is treated as "no value" for both text columns.
  String ipaddress = (String)row.getValue(ipaddressField);
  if ("".equals(ipaddress))
  {
    ipaddress = null;
  }
  String fqdn = (String)row.getValue(fqdnField);
  if ("".equals(fqdn))
  {
    fqdn = null;
  }

  return new DNSInfo(ipaddress,fqdn,expiration,hostName);
}
/** This is a cached data item: the DNS information recorded for one host name.
*/
protected static class DNSInfo
{
  /** The host name this record belongs to. */
  protected String hostName;
  /** The ip address, as handed to the constructor. */
  protected String ipaddress;
  /** The fqdn, as handed to the constructor. */
  protected String fqdn;
  /** The time at which this record expires. */
  protected long expiration;

  /** Constructor.
  *@param ipaddress is the ip address.
  *@param fqdn is the fqdn.
  *@param expiration is the expiration time.
  *@param hostName is the host name.
  */
  public DNSInfo(String ipaddress, String fqdn, long expiration, String hostName)
  {
    this.hostName = hostName;
    this.ipaddress = ipaddress;
    this.fqdn = fqdn;
    this.expiration = expiration;
  }

  /** Get the host name */
  public String getHostName()
  {
    return this.hostName;
  }

  /** Get the ipaddress */
  public String getIPAddress()
  {
    return this.ipaddress;
  }

  /** Get the fqdn */
  public String getFQDN()
  {
    return this.fqdn;
  }

  /** Get the expiration time. */
  public long getExpirationTime()
  {
    return this.expiration;
  }
}
/** This is the object description for a robots host object.
* This is the key that is used to look up cached data.
*/
protected static class HostDescription extends org.apache.manifoldcf.core.cachemanager.BaseDescription
{
protected String hostName;
protected String criticalSectionName;
protected StringSet cacheKeys;
public HostDescription(String hostName, StringSet invKeys)
{
super("dnscache");
this.hostName = hostName;
criticalSectionName = getClass().getName()+"-"+hostName;
cacheKeys = invKeys;
}
public String getHostName()
{
return hostName;
}
public int hashCode()
{
return hostName.hashCode();
}
public boolean equals(Object o)
{
if (!(o instanceof HostDescription))
return false;
HostDescription d = (HostDescription)o;
return d.hostName.equals(hostName);
}
public String getCriticalSectionName()
{
return criticalSectionName;
}
/** Get the cache keys for an object (which may or may not exist yet in
* the cache). This method is called in order for cache manager to throw the correct locks.
* @return the object's cache keys, or null if the object should not
* be cached.
*/
public StringSet getObjectKeys()
{
return cacheKeys;
}
/** Get the object class for an object. The object class is used to determine
* the group of objects treated in the same LRU manner.
* @return the newly created object's object class, or null if there is no
* such class, and LRU behavior is not desired.
*/
public ICacheClass getObjectClass()
{
return dnsCacheClass;
}
}
/** Cache class for DNS data.
* An instance of this class describes the cache class for DNS data caching. There's
* only ever a need for one, so that will be created statically.
*/
protected static class DNSCacheClass implements ICacheClass
{
  // All DNS data is counted together, so the class name is a constant string.
  private static final String CLASS_NAME = "DNSCLASS";
  // Hardwired for the moment; this many dns data records will be cached, and no more.
  private static final int MAX_LRU_COUNT = 2000;

  /** Get the name of the object class.
  * This determines the set of objects that are treated in the same
  * LRU pool.
  *@return the class name.
  */
  public String getClassName()
  {
    return CLASS_NAME;
  }

  /** Get the maximum LRU count of the object class.
  *@return the maximum number of the objects of the particular class
  * allowed.
  */
  public int getMaxLRUCount()
  {
    return MAX_LRU_COUNT;
  }
}
/** This is the executor object for locating DNS host objects.
* This object furnishes the operations the cache manager needs to rebuild objects that it needs that are
* not in the cache at the moment.
*/
protected static class HostExecutor extends org.apache.manifoldcf.core.cachemanager.ExecutorBase
{
  // The manager used to read rows that are not cached
  protected DNSManager thisManager;
  // The record captured for thisHost, or null if none has been seen
  protected DNSInfo returnValue;
  // The description whose record is wanted
  protected HostDescription thisHost;

  /** Constructor.
  *@param manager is the DNSManager class instance.
  *@param objectDescription is the desired object description.
  */
  public HostExecutor(DNSManager manager, HostDescription objectDescription)
  {
    super();
    this.thisManager = manager;
    this.thisHost = objectDescription;
    this.returnValue = null;
  }

  /** Get the result.
  *@return the looked-up or read cached instance.
  */
  public DNSInfo getResults()
  {
    return returnValue;
  }

  /** Create a set of new objects to operate on and cache. This method is called only
  * if the specified object(s) are NOT available in the cache. Each object is read from
  * the database through the manager, one query per description.
  * @param objectDescriptions is the set of unique identifier of the object.
  * @return the newly read objects, in the same order as the object descriptions; an entry
  * is null when the manager found no row for that host.
  */
  public Object[] create(ICacheDescription[] objectDescriptions) throws ManifoldCFException
  {
    // Multiple values are not expected to be requested at once, so reading them one at a
    // time is OK.
    int count = objectDescriptions.length;
    DNSInfo[] created = new DNSInfo[count];
    for (int index = 0; index < count; index++)
    {
      HostDescription description = (HostDescription)objectDescriptions[index];
      // The record read here carries its own expiration time, so what gets cached does not
      // depend on the time of the request.
      created[index] = thisManager.readDNSInfo(description.getHostName());
    }
    return created;
  }

  /** Notify the implementing class of the existence of a cached version of the
  * object. The object is passed to this method so that the execute() method below
  * will have it available to operate on. This method is also called for all objects
  * that are freshly created as well.
  * @param objectDescription is the unique identifier of the object.
  * @param cachedObject is the cached object.
  */
  public void exists(ICacheDescription objectDescription, Object cachedObject) throws ManifoldCFException
  {
    // Cast what came in as what it really is
    HostDescription description = (HostDescription)objectDescription;
    DNSInfo info = (DNSInfo)cachedObject;
    // Only the record for the host this executor was built for is kept.
    if (!description.equals(thisHost))
    {
      return;
    }
    returnValue = info;
  }

  /** Perform the desired operation. This method is called after either create()
  * or exists() is called for every requested object.
  */
  public void execute() throws ManifoldCFException
  {
    // Does nothing; we only want to fetch objects in this cacher.
  }
}
}