/* $Id: DataCache.java 988245 2010-08-23 18:39:35Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.webcrawler;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
import java.util.*;
import java.io.*;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.client.RedirectException;
import org.apache.http.client.CircularRedirectException;
import org.apache.http.NoHttpResponseException;
import org.apache.http.HttpException;
/** This class is a cache of a specific URL's data. It's fetched early and kept,
* so that (1) an accurate data length can be found, and (2) we can compute a version
* checksum.
*/
public class DataCache
{
public static final String _rcsid = "@(#)$Id: DataCache.java 988245 2010-08-23 18:39:35Z kwright $";
// Hashmap containing the cache of files.
// This is keyed by document identifier, and contains DocumentData objects.
protected Map<String,DocumentData> cacheData = new HashMap<String,DocumentData>();
/** Constructor.
*/
public DataCache()
{
}
/** Add a data entry into the cache.
* This method is called whenever the data from a fetch is considered interesting or useful, and will
* be thus passed on from getDocumentVersions() to the processDocuments() phase. At the moment that's
* usually a 200 or a 302 response.
*@param documentIdentifier is the document identifier (url).
*@param connection is the connection, upon which a fetch has been done that needs to be
* cached.
*@return a "checksum" value, to use as a version string.
*/
public String addData(IProcessActivity activities, String documentIdentifier, IThrottledConnection connection)
throws ManifoldCFException, ServiceInterruption
{
// Grab the response code, and the content-type header
int responseCode = connection.getResponseCode();
String contentType = connection.getResponseHeader("Content-Type");
String referralURI = connection.getResponseHeader("Location");
// Create a temporary file; that's what we will cache
try
{
// First, get the stream.
InputStream dataStream = connection.getResponseBodyStream();
if (dataStream == null)
return null;
try
{
File tempFile = File.createTempFile("_webcache_","tmp");
try
{
// Causes memory leaks if left around; there's no way to release
// the record specifying that the file should be deleted, even
// after it's removed. So disable this and live with the occasional
// dangling file left as a result of shutdown or error. :-(
// tempFile.deleteOnExit();
ManifoldCF.addFile(tempFile);
// Transfer data to temporary file
long checkSum = 0L;
OutputStream os = new FileOutputStream(tempFile);
try
{
byte[] byteArray = new byte[65536];
while (true)
{
int amt;
try
{
amt = dataStream.read(byteArray,0,byteArray.length);
}
catch (java.net.SocketTimeoutException e)
{
Logging.connectors.warn("Socket timeout exception reading socket stream: "+e.getMessage(),e);
long currentTime = System.currentTimeMillis();
throw new ServiceInterruption("Socket timeout: "+e.getMessage(),e,currentTime + 300000L,
currentTime + 12 * 60 * 60000L,-1,false);
}
catch (ConnectTimeoutException e)
{
Logging.connectors.warn("Socket connect timeout exception reading socket stream: "+e.getMessage(),e);
long currentTime = System.currentTimeMillis();
throw new ServiceInterruption("Socket timeout: "+e.getMessage(),e,currentTime + 300000L,
currentTime + 12 * 60 * 60000L,-1,false);
}
catch (InterruptedIOException e)
{
//Logging.connectors.warn("IO interruption seen",e);
throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED);
}
catch (IOException e)
{
Logging.connectors.warn("IO exception reading socket stream: "+e.getMessage(),e);
long currentTime = System.currentTimeMillis();
throw new ServiceInterruption("Read timeout: "+e.getMessage(),e,currentTime + 300000L,
currentTime + 12 * 60 * 60000L,-1,false);
}
if (amt == -1)
break;
int i = 0;
while (i < amt)
{
byte x = byteArray[i++];
long bytevalue = (long)x;
checkSum = (checkSum << 5) ^ (checkSum >> 3) ^ (bytevalue << 2) ^ (bytevalue >> 3);
}
os.write(byteArray,0,amt);
// Check if job is alive before looping
activities.checkJobStillActive();
}
}
finally
{
os.close();
}
synchronized(this)
{
deleteData(documentIdentifier);
cacheData.put(documentIdentifier,new DocumentData(tempFile,responseCode,contentType,referralURI));
return new Long(checkSum).toString();
}
}
catch (IOException e)
{
ManifoldCF.deleteFile(tempFile);
throw e;
}
catch (ManifoldCFException e)
{
ManifoldCF.deleteFile(tempFile);
throw e;
}
catch (ServiceInterruption e)
{
ManifoldCF.deleteFile(tempFile);
throw e;
}
catch (Error e)
{
ManifoldCF.deleteFile(tempFile);
throw e;
}
}
finally
{
try
{
dataStream.close();
}
catch (java.net.SocketTimeoutException e)
{
Logging.connectors.warn("WEB: Socket timeout exception closing data stream, ignoring: "+e.getMessage(),e);
}
catch (ConnectTimeoutException e)
{
Logging.connectors.warn("WEB: Socket connect timeout exception closing data stream, ignoring: "+e.getMessage(),e);
}
catch (InterruptedIOException e)
{
throw e;
}
catch (IOException e)
{
// We can get this if the socket was unexpectedly closed by the server; treat this
// as a Service Interruption. Generally, this is ok - warn but don't do anything else.
Logging.connectors.warn("WEB: IO exception closing data stream, ignoring: "+e.getMessage(),e);
}
}
}
catch (java.net.SocketTimeoutException e)
{
throw new ManifoldCFException("Socket timeout exception creating temporary file: "+e.getMessage(),e);
}
catch (ConnectTimeoutException e)
{
throw new ManifoldCFException("Socket connect timeout exception creating temporary file: "+e.getMessage(),e);
}
catch (InterruptedIOException e)
{
//Logging.connectors.warn("IO interruption seen",e);
throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED);
}
catch (IOException e)
{
throw new ManifoldCFException("IO exception creating temporary file: "+e.getMessage(),e);
}
}
/** Get the response code.
*@param documentIdentifier is the document identifier.
*@return the code.
*/
public synchronized int getResponseCode(String documentIdentifier)
{
DocumentData dd = cacheData.get(documentIdentifier);
if (dd == null)
return IThrottledConnection.FETCH_NOT_TRIED;
return dd.getResponseCode();
}
/** Get the content type.
*@param documentIdentifier is the document identifier.
*@return the content type, or null if there is none.
*/
public synchronized String getContentType(String documentIdentifier)
{
DocumentData dd = cacheData.get(documentIdentifier);
if (dd == null)
return null;
return dd.getContentType();
}
/** Get the referral URI.
*@param documentIdentifier is the document identifier.
*@return the referral URI, or null if none.
*/
public synchronized String getReferralURI(String documentIdentifier)
{
DocumentData dd = cacheData.get(documentIdentifier);
if (dd == null)
return null;
return dd.getReferralURI();
}
/** Fetch binary data length.
*@param documentIdentifier is the document identifier.
*@return the length.
*/
public synchronized long getDataLength(String documentIdentifier)
{
DocumentData dd = cacheData.get(documentIdentifier);
if (dd == null)
return 0L;
return dd.getData().length();
}
/** Fetch binary data entry from the cache.
*@param documentIdentifier is the document identifier (url).
*@return a binary data stream.
*/
public synchronized InputStream getData(String documentIdentifier)
throws ManifoldCFException
{
DocumentData dd = cacheData.get(documentIdentifier);
if (dd == null)
return null;
try
{
return new FileInputStream(dd.getData());
}
catch (FileNotFoundException e)
{
throw new ManifoldCFException("File not found exception opening data: "+e.getMessage(),e);
}
}
/** Delete specified item of data.
*@param documentIdentifier is the document identifier (url).
*/
public synchronized void deleteData(String documentIdentifier)
{
DocumentData dd = cacheData.remove(documentIdentifier);
if (dd != null)
{
ManifoldCF.deleteFile(dd.getData());
}
}
// Protected classes
/** This class represents everything we need to know about a document that's getting passed from the
* getDocumentVersions() phase to the processDocuments() phase.
*/
protected static class DocumentData
{
/** The cache file for the data */
protected File data;
/** The response code */
protected int responseCode;
/** The content-type header value */
protected String contentType;
/** The referral URI */
protected String referralURI;
// More will probably go here later, but I can't think of much else at the moment.
/** Constructor. */
public DocumentData(File data, int responseCode, String contentType, String referralURI)
{
this.data = data;
this.responseCode = responseCode;
this.contentType = contentType;
this.referralURI = referralURI;
}
/** Get the data */
public File getData()
{
return data;
}
/** Get the response code */
public int getResponseCode()
{
return responseCode;
}
/** Get the contentType */
public String getContentType()
{
return contentType;
}
/** Get the referral URI */
public String getReferralURI()
{
return referralURI;
}
}
}