// Copyright 2011 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.connector.servlet;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.google.enterprise.connector.logging.NDC;
import com.google.enterprise.connector.manager.ConnectorManagerException;
import com.google.enterprise.connector.manager.Context;
import com.google.enterprise.connector.manager.Manager;
import com.google.enterprise.connector.persist.ConnectorNotFoundException;
import com.google.enterprise.connector.pusher.FeedConnection;
import com.google.enterprise.connector.pusher.XmlFeed;
import com.google.enterprise.connector.spi.Document;
import com.google.enterprise.connector.spi.DocumentAccessException;
import com.google.enterprise.connector.spi.DocumentNotFoundException;
import com.google.enterprise.connector.spi.Property;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.enterprise.connector.spi.SkippedDocumentException;
import com.google.enterprise.connector.spi.SpiConstants;
import com.google.enterprise.connector.spi.Value;
import com.google.enterprise.connector.spiimpl.DateValue;
import com.google.enterprise.connector.spiimpl.ValueImpl;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.ParseException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
public class GetDocumentContent extends HttpServlet {
private static Logger LOGGER =
Logger.getLogger(GetDocumentContent.class.getName());
private static final String HDR_IF_MODIFIED = "If-Modified-Since";
/**
* Attribute name on the ServletRequest containing a cache of the parsed
* query parameters.
*/
private static final String PARAMETER_CACHE_NAME =
GetDocumentContent.class.getName() + ".parameters";
/**
* Attribute name on the ServletRequest containing a cache of the metadata
* of the requested document.
*/
private static final String METADATA_CACHE_NAME =
GetDocumentContent.class.getName() + ".document";
/**
* Distinguished attribute value denoting that the metadata is already known
* to be unavailable.
*/
private static final Object NEGATIVE_METADATA_CACHE_VALUE = new Object();
/** HTTP header that contains the Document metadata. */
private static final String EXTERNAL_METADATA_HEADER =
"X-Gsa-External-Metadata";
private static boolean useCompression = false;
private static FeedConnection feedConnection;
public static void setUseCompression(boolean doCompression) {
useCompression = doCompression;
}
/**
* Set the feed connection to use to discover if the security header is
* supported. This must be set during startup to take effect.
*/
public static void setFeedConnection(FeedConnection fc) {
feedConnection = fc;
}
/**
* GSA 7.0 introduces the ability to provide a HTTP header that specifies
* whether the document is secure. In previous GSAs we are required to use
* HTTP basic, which has to be configured correctly on the GSA.
*/
private synchronized static boolean isSecurityHeaderSupported() {
if (feedConnection == null) {
// FeedConnection is unavailable, so choose the pessimistic choice.
return false;
} else {
// The newer ACL format was added in same GSA version as security header,
// so we abuse the ACL feature detection logic.
return feedConnection.supportsInheritedAcls();
}
}
/**
* Retrieves the content of a document from a connector instance.
*
* @param req
* @param res
* @throws IOException
*/
@Override
protected void doPost(HttpServletRequest req, HttpServletResponse res)
throws IOException {
doGet(req, res);
}
/**
* Retrieves the content of a document from a connector instance.
*
* @param req
* @param res
* @throws IOException
*/
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse res)
throws IOException {
doGet(req, res, Context.getInstance().getManager());
}
/**
* Fetches the last modified date for the document, in milliseconds since
* the epoch; or -1 if the last modified date is not known or unavailable.
*
* @param req
* @return a long integer specifying the time the document was last modified,
* in milliseconds since midnight, January 1, 1970 GMT, or -1 if the time is
* not known.
*/
@Override
protected long getLastModified(HttpServletRequest req) {
Map<String, List<String>> params = getQueryParams(req);
String connectorName = ServletUtil.getFirstParameter(
params, ServletUtil.XMLTAG_CONNECTOR_NAME);
String docid = ServletUtil.getFirstParameter(
params, ServletUtil.QUERY_PARAM_DOCID);
if (Strings.isNullOrEmpty(connectorName) || Strings.isNullOrEmpty(docid)) {
return -1L;
}
return handleGetLastModified(getDocumentMetaDataNoThrow(req,
Context.getInstance().getManager(), connectorName, docid));
}
/**
* Returns a map of query parameters extracted from the request.
*/
private static Map<String, List<String>> getQueryParams(HttpServletRequest req) {
@SuppressWarnings("unchecked") Map<String, List<String>> params =
(Map<String, List<String>>)(req.getAttribute(PARAMETER_CACHE_NAME));
if (params == null) {
params = ServletUtil.parseQueryString(req.getQueryString());
req.setAttribute(PARAMETER_CACHE_NAME, params);
}
return params;
}
/**
* Retrieves the content of a document from a connector instance.
*
* @param req
* @param res
* @param manager manager to use for retrieving document information
* @throws IOException
*/
@VisibleForTesting
static void doGet(HttpServletRequest req, HttpServletResponse res,
Manager manager) throws IOException {
// The servlet relies on proper security to be handled by a filter.
if ("SecMgr".equals(req.getHeader("User-Agent")) ||
req.getHeader("Range") != null ||
"HEAD".equals(req.getMethod())) {
// GSA does a GET with Range:0-0 to simulate head request.
// Assume that a "HEAD" request to check authz is being performed
// due to presence of Range header.
// We don't support authz by hr so we always issue deny.
// TODO(ejona): Remove checking for Range header and HEAD once
// Legacy Authz is removed from supported GSA versions.
LOGGER.finest("RETRIEVER: Head request denied");
res.sendError(HttpServletResponse.SC_FORBIDDEN);
return;
}
Map<String, List<String>> params = getQueryParams(req);
String connectorName = ServletUtil.getFirstParameter(
params, ServletUtil.XMLTAG_CONNECTOR_NAME);
String docid = ServletUtil.getFirstParameter(
params, ServletUtil.QUERY_PARAM_DOCID);
if (Strings.isNullOrEmpty(connectorName) || Strings.isNullOrEmpty(docid)) {
res.sendError(HttpServletResponse.SC_BAD_REQUEST);
return;
}
NDC.pushAppend("Retrieve " + connectorName + " "
+ docid.substring(docid.lastIndexOf('/') + 1));
Document metadata;
try {
metadata = getDocumentMetaData(req, manager, connectorName, docid);
} catch (Exception e) {
res.sendError(handleException("metadata", e));
return;
}
int securityCode = handleMarkingDocumentSecurity(req, res, metadata);
if (securityCode != HttpServletResponse.SC_OK) {
res.sendError(securityCode);
return;
}
// Set the Content-Type.
String mimeType = handleGetContentType(metadata);
LOGGER.log(Level.FINEST, "Document Content-Type {0}", mimeType);
res.setContentType(mimeType);
Integer contentLength = handleGetContentLength(metadata);
if (contentLength != null) {
LOGGER.log(Level.FINEST, "Document Content-Length {0}", contentLength);
res.setContentLength(contentLength);
}
// Supply the document metadata in an X-Gsa-External-Metadata header.
if (metadata != null) {
res.setHeader(EXTERNAL_METADATA_HEADER, getMetadataHeader(metadata));
}
OutputStream out = res.getOutputStream();
if (useCompression) {
// Select Content-Encoding based on the client's Accept-Encoding header.
// Choose GZIP if the header includes "gzip", otherwise no compression.
String encodings = req.getHeader("Accept-Encoding");
if (encodings != null && encodings.matches(".*\\bgzip\\b.*")) {
res.setHeader("Content-Encoding", "gzip");
out = new GZIPOutputStream(out, 64 * 1024);
}
res.setHeader("Vary", "Accept-Encoding");
}
// TODO: Configure chunked output?
try {
int code = handleDoGet(manager, connectorName, docid, out);
if (code != HttpServletResponse.SC_OK) {
res.sendError(code);
} else {
res.setStatus(code);
}
} finally {
out.close();
NDC.pop();
}
}
/**
* Builds the GSA-specific metadata header value for crawl-time metadata,
* based upon the Document's supplied metadata.
*/
// Warning: See XmlFeed.wrapMetaData() if you make changes here.
@VisibleForTesting
static String getMetadataHeader(Document metadata) {
StringBuilder sb = new StringBuilder();
Set<String> propertyNames = null;
try {
propertyNames = metadata.getPropertyNames();
} catch (RepositoryException e) {
LOGGER.log(Level.WARNING, "Failed to retrieve property names", e);
}
if (propertyNames != null && !propertyNames.isEmpty()) {
// Sort property names so that metadata is written in a canonical form.
// The GSA's metadata change detection logic depends on the metadata to
// be in the same order each time to prevent reindexing.
propertyNames = new TreeSet<String>(propertyNames);
for (String name : propertyNames) {
if (XmlFeed.propertySkipSet.contains(name)) {
continue;
}
try {
Property property = metadata.findProperty(name);
if (property != null) {
encodeOneProperty(sb, name, property);
}
} catch (RepositoryException e) {
LOGGER.log(Level.WARNING, "Failed to retrieve property " + name, e);
}
}
}
return (sb.length() == 0) ? "" : sb.substring(0, sb.length() - 1);
}
/**
* Adds one Property's values to the metadata header under contruction.
*/
private static void encodeOneProperty(StringBuilder sb, String name,
Property property) throws RepositoryException {
ValueImpl value;
while ((value = (ValueImpl) property.nextValue()) != null) {
LOGGER.log(Level.FINEST, "PROPERTY: {0} = \"{1}\"",
new Object[] { name, value.toString() });
String valString = value.toFeedXml();
if (!Strings.isNullOrEmpty(valString)) {
ServletUtil.percentEncode(sb, name, valString);
sb.append(',');
}
}
}
/**
* Retrieves the content of a document from a connector instance.
*
* @param manager a Manager
* @param connectorName the name of the connector instance that
* can access the document
* @param docId the document identifer
* @param out OutputStream to which to write the content
* @return an HTTP Status Code
* @throws IOException
*/
@VisibleForTesting
static int handleDoGet(Manager manager, String connectorName, String docid,
OutputStream out) throws IOException {
InputStream in = null;
try {
in = manager.getDocumentContent(connectorName, docid);
if (in == null) {
// This is unlikely to happen, since Production Manager
// will return an AlternateContent InputStream.
in = new ByteArrayInputStream(new byte[0]);
}
byte[] buffer = new byte[1024 * 1024];
int bytes;
do {
bytes = in.read(buffer);
if (bytes > 0) {
out.write(buffer, 0, bytes);
}
} while (bytes != -1);
return HttpServletResponse.SC_OK;
} catch (Exception e) {
return handleException("content", e);
} finally {
if (in != null) {
in.close();
}
}
}
/**
* Retrieve and cache the metadata of the currently requested document.
* The metadata is cached for the life of the servlet request.
* No checked exceptions are thrown.
* If a problem occurs {@code null} is returned.
*
* @param req Request to use for caching return value
* @param manager a Manager
* @param connectorName the name of the connector instance that
* can access the document
* @param docId the document identifer
* @return document's metadata or {@code null} if it is unavailable
*/
private static Document getDocumentMetaDataNoThrow(HttpServletRequest req,
Manager manager, String connectorName, String docid) {
try {
return getDocumentMetaData(req, manager, connectorName, docid);
} catch (ConnectorManagerException e) {
return null;
} catch (RepositoryException e) {
return null;
}
}
/**
* Retrieve and cache the metadata of the currently requested document.
* The metadata is cached for the life of the servlet request.
*
* @param req Request to use for caching return value
* @param manager a Manager
* @param connectorName the name of the connector instance that
* can access the document
* @param docId the document identifer
* @return document's metadata or {@code null} if it is unavailable
*/
@VisibleForTesting
static Document getDocumentMetaData(HttpServletRequest req,
Manager manager, String connectorName, String docid)
throws ConnectorManagerException, RepositoryException {
Object cache = req.getAttribute(METADATA_CACHE_NAME);
if (cache != null) {
return cache == NEGATIVE_METADATA_CACHE_VALUE ? null : (Document) cache;
}
Document metadata = manager.getDocumentMetaData(connectorName, docid);
req.setAttribute(METADATA_CACHE_NAME,
(metadata == null) ? NEGATIVE_METADATA_CACHE_VALUE : metadata);
return metadata;
}
/**
* Retrieves the last modified date of a document from a connector instance.
*
* @param metadata the Document metadata
* @return a long integer specifying the time the document was last modified,
* in milliseconds since midnight, January 1, 1970 GMT, or -1L if the
* time is not known
*/
@VisibleForTesting
static long handleGetLastModified(Document metadata) {
if (metadata == null) {
return -1L;
}
try {
// TODO: Value and DateValue Calendar methods are too weak to try to get
// last modified from non-DateValues.
ValueImpl value = (ValueImpl)
Value.getSingleValue(metadata, SpiConstants.PROPNAME_LASTMODIFIED);
if (value == null) {
LOGGER.log(Level.FINEST, "Document does not contain {0}",
SpiConstants.PROPNAME_LASTMODIFIED);
} else if (value instanceof DateValue) {
// DateValues don't give direct access to their Calendar object, but
// I can get the Calendar back out by parsing the stringized version.
// This method also applies the FeedTimeZone, if needed.
// TODO: Add a DateValue.getTimeMillis() or getCalendar() method to
// directly access the wrapped value.
String lastModified = ((DateValue) value).toIso8601();
LOGGER.log(Level.FINEST, "Document last modified {0}", lastModified);
return Value.iso8601ToCalendar(lastModified).getTimeInMillis();
}
} catch (RepositoryException e) {
LOGGER.log(Level.WARNING, "Failed to retrieve last-modified date", e);
} catch (ParseException e) {
LOGGER.log(Level.WARNING, "Failed to parse last-modified date", e);
}
return -1L;
}
/**
* Retrieves the content type of a document from a connector instance.
*
* @param metadata the Document metadata
* @return the content-type of the document, as a string, or
* {@link SpiConstants.DEFAULT_MIMETYPE} if the content type
* is not supplied.
*/
@VisibleForTesting
static String handleGetContentType(Document metadata) {
// NOTE: To maintain consistency with the XmlFeed, this code returns
// SpiConstants.DEFAULT_MIMETYPE ("text/html") if the Document supplies
// no mime type property. However, the GSA would really rather receive
// MimeTypeDetector.UKNOWN_MIMETYPE ("application/octet-stream").
if (metadata != null) {
try {
String mimeType = Value.getSingleValueString(metadata,
SpiConstants.PROPNAME_MIMETYPE);
if (!Strings.isNullOrEmpty(mimeType)) {
return mimeType;
}
} catch (RepositoryException e) {
LOGGER.log(Level.WARNING, "Failed to retrieve content-type", e);
}
}
return SpiConstants.DEFAULT_MIMETYPE;
}
/**
* Retrieves the content length of a document from a connector instance.
*
* @param metadata the Document metadata
* @return the content-length of the document, as an Integer, or {@code null}
* if the content length is not known, less than or equal to zero,
* or the value does not fit in an Integer. Note that if the
* content-length returned by the connector is zero, this returns
* null, since the GSA does not support empty documents, so the
* empty content will be replaced by ProductionManager with alternate
* non-empty content.
*/
@VisibleForTesting
static Integer handleGetContentLength(Document metadata) {
if (metadata != null) {
try {
String lengthStr = Value.getSingleValueString(metadata,
SpiConstants.PROPNAME_CONTENT_LENGTH);
if (!Strings.isNullOrEmpty(lengthStr)) {
Integer length = Integer.valueOf(lengthStr);
return (length > 0) ? length : null;
}
} catch (NumberFormatException e) {
LOGGER.log(Level.WARNING, "Failed to retrieve content-length", e);
} catch (RepositoryException e) {
LOGGER.log(Level.WARNING, "Failed to retrieve content-length", e);
}
}
return null;
}
@VisibleForTesting
static int handleMarkingDocumentSecurity(HttpServletRequest req,
HttpServletResponse res, Document metadata) throws IOException {
if (req.getHeader("Authorization") != null) {
// GSA logged in; it is aware of the access restrictions on the document.
return HttpServletResponse.SC_OK;
}
if (metadata == null) {
return HttpServletResponse.SC_SERVICE_UNAVAILABLE;
}
ValueImpl isPublicVal;
try {
isPublicVal = (ValueImpl) Value.getSingleValue(metadata,
SpiConstants.PROPNAME_ISPUBLIC);
} catch (RepositoryException ex) {
LOGGER.log(Level.WARNING, "Failed retrieving isPublic property", ex);
return HttpServletResponse.SC_SERVICE_UNAVAILABLE;
}
boolean isPublic = isPublicVal == null || isPublicVal.toBoolean();
if (isSecurityHeaderSupported()) {
res.setHeader("X-Gsa-Serve-Security", isPublic ? "public" : "secure");
return HttpServletResponse.SC_OK;
} else {
if (isPublic) {
return HttpServletResponse.SC_OK;
} else {
res.setHeader("WWW-Authenticate", "Basic realm=\"Retriever\"");
return HttpServletResponse.SC_UNAUTHORIZED;
}
}
}
/** Logs an Exception and returns an appropriate HTTP status code. */
private static int handleException(String context, Exception e)
throws IOException {
if (e instanceof DocumentNotFoundException) {
LOGGER.log(Level.FINE, "Failed to retrieve document {0}: {1}",
new Object[] {context, e.toString()});
return HttpServletResponse.SC_NOT_FOUND;
} else if (e instanceof SkippedDocumentException) {
LOGGER.log(Level.FINE, "Failed to retrieve document {0}: {1}",
new Object[] {context, e.toString()});
return HttpServletResponse.SC_NOT_FOUND;
} else if (e instanceof DocumentAccessException) {
LOGGER.log(Level.FINE, "Failed to retrieve document {0}: {1}",
new Object[] {context, e.toString()});
return HttpServletResponse.SC_FORBIDDEN;
} else if (e instanceof ConnectorNotFoundException) {
LOGGER.log(Level.FINE, "Failed to retrieve document {0}: {1}",
new Object[] {context, e.toString()});
return HttpServletResponse.SC_SERVICE_UNAVAILABLE;
} else if (e instanceof RepositoryException) {
LOGGER.log(Level.WARNING, "Failed to retrieve document " + context, e);
return HttpServletResponse.SC_SERVICE_UNAVAILABLE;
} else if (e instanceof IOException) {
LOGGER.log(Level.WARNING, "Failed to retrieve document " + context, e);
throw (IOException) e;
} else if (e instanceof RuntimeException) {
LOGGER.log(Level.WARNING, "Failed to retrieve document " + context, e);
throw (RuntimeException) e;
} else { // ConnectorManagerException
LOGGER.log(Level.SEVERE, "Failed to retrieve document " + context, e);
return HttpServletResponse.SC_INTERNAL_SERVER_ERROR;
}
}
}