GetDocumentContent.java example

Explorer
manager.v3-master
- projects
// Copyright 2011 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.enterprise.connector.servlet;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.google.enterprise.connector.logging.NDC;
import com.google.enterprise.connector.manager.ConnectorManagerException;
import com.google.enterprise.connector.manager.Context;
import com.google.enterprise.connector.manager.Manager;
import com.google.enterprise.connector.persist.ConnectorNotFoundException;
import com.google.enterprise.connector.pusher.FeedConnection;
import com.google.enterprise.connector.pusher.XmlFeed;
import com.google.enterprise.connector.spi.Document;
import com.google.enterprise.connector.spi.DocumentAccessException;
import com.google.enterprise.connector.spi.DocumentNotFoundException;
import com.google.enterprise.connector.spi.Property;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.enterprise.connector.spi.SkippedDocumentException;
import com.google.enterprise.connector.spi.SpiConstants;
import com.google.enterprise.connector.spi.Value;
import com.google.enterprise.connector.spiimpl.DateValue;
import com.google.enterprise.connector.spiimpl.ValueImpl;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.ParseException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;

import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

public class GetDocumentContent extends HttpServlet {

  /** Logger shared by all requests handled by this servlet. */
  private static final Logger LOGGER =
      Logger.getLogger(GetDocumentContent.class.getName());

  /** Name of the HTTP {@code If-Modified-Since} request header. */
  private static final String HDR_IF_MODIFIED = "If-Modified-Since";

  /**
   * Attribute name on the ServletRequest containing a cache of the parsed
   * query parameters.
   */
  private static final String PARAMETER_CACHE_NAME =
      GetDocumentContent.class.getName() + ".parameters";

  /**
   * Attribute name on the ServletRequest containing a cache of the metadata
   * of the requested document.
   */
  private static final String METADATA_CACHE_NAME =
      GetDocumentContent.class.getName() + ".document";

  /**
   * Distinguished attribute value denoting that the metadata is already known
   * to be unavailable. It is compared by identity, never by equality.
   */
  private static final Object NEGATIVE_METADATA_CACHE_VALUE = new Object();

  /** HTTP header that contains the Document metadata. */
  private static final String EXTERNAL_METADATA_HEADER =
      "X-Gsa-External-Metadata";

  /**
   * Whether document content may be gzip-compressed when the client accepts
   * it. Disabled by default; changed via {@code setUseCompression}.
   */
  private static boolean useCompression = false;

  /**
   * Feed connection consulted to detect whether the security header is
   * supported; {@code null} until supplied via {@code setFeedConnection}.
   */
  private static FeedConnection feedConnection;

  /**
   * Enables or disables gzip compression of retrieved document content for
   * all requests served by this servlet class.
   *
   * @param doCompression {@code true} to allow compressed responses,
   *        {@code false} to always send content uncompressed
   */
  public static void setUseCompression(boolean doCompression) {
    GetDocumentContent.useCompression = doCompression;
  }

  /**
   * Supplies the feed connection that is queried to discover whether the
   * security header is supported. This must be set during startup to take
   * effect.
   *
   * @param fc the feed connection to consult; may be {@code null}, in which
   *        case the security header is treated as unsupported
   */
  public static void setFeedConnection(FeedConnection fc) {
    GetDocumentContent.feedConnection = fc;
  }

  /**
   * Reports whether the GSA understands the serve-security HTTP header.
   *
   * <p>GSA 7.0 introduces the ability to provide a HTTP header that specifies
   * whether the document is secure. In previous GSAs we are required to use
   * HTTP basic, which has to be configured correctly on the GSA.
   *
   * @return {@code true} if a feed connection is available and it reports
   *         support for inherited ACLs; {@code false} otherwise
   */
  private static synchronized boolean isSecurityHeaderSupported() {
    // With no FeedConnection available we make the pessimistic choice.
    // Otherwise we abuse the ACL feature detection logic: the newer ACL
    // format was added in the same GSA version as the security header.
    return feedConnection != null && feedConnection.supportsInheritedAcls();
  }

  /**
   * Handles a POST request by treating it exactly like a GET request for
   * the document content.
   *
   * @param req the servlet request
   * @param res the servlet response
   * @throws IOException if the GET handling fails with an I/O error
   */
  @Override
  protected void doPost(HttpServletRequest req, HttpServletResponse res)
      throws IOException {
    this.doGet(req, res);
  }

  /**
   * Handles a GET request for document content, using the Manager obtained
   * from the global {@link Context}.
   *
   * @param req the servlet request
   * @param res the servlet response
   * @throws IOException if retrieving or writing the content fails
   */
  @Override
  protected void doGet(HttpServletRequest req, HttpServletResponse res)
      throws IOException {
    Manager manager = Context.getInstance().getManager();
    doGet(req, res, manager);
  }

  /**
   * Looks up when the requested document was last modified.
   *
   * <p>The connector name and document id are taken from the request's query
   * parameters. If either is missing or empty, the time is reported as
   * unknown without consulting the Manager.
   *
   * @param req the servlet request identifying the document
   * @return the time the document was last modified, in milliseconds since
   *         midnight, January 1, 1970 GMT, or -1 if the time is not known
   *         or unavailable
   */
  @Override
  protected long getLastModified(HttpServletRequest req) {
    Map<String, List<String>> queryParams = getQueryParams(req);
    String connectorName = ServletUtil.getFirstParameter(
        queryParams, ServletUtil.XMLTAG_CONNECTOR_NAME);
    String docid = ServletUtil.getFirstParameter(
        queryParams, ServletUtil.QUERY_PARAM_DOCID);

    boolean identified = !Strings.isNullOrEmpty(connectorName)
        && !Strings.isNullOrEmpty(docid);
    if (!identified) {
      return -1L;
    }

    Manager manager = Context.getInstance().getManager();
    Document metadata =
        getDocumentMetaDataNoThrow(req, manager, connectorName, docid);
    return handleGetLastModified(metadata);
  }

  /**
   * Returns the query parameters of the request as a map from parameter
   * name to its values.
   *
   * <p>The query string is parsed at most once per request: the parsed map
   * is stored as a request attribute and reused on later calls.
   *
   * @param req the servlet request
   * @return the parsed (possibly cached) query parameters
   */
  private static Map<String, List<String>> getQueryParams(HttpServletRequest req) {
    Object cached = req.getAttribute(PARAMETER_CACHE_NAME);
    if (cached != null) {
      @SuppressWarnings("unchecked")
      Map<String, List<String>> cachedParams =
          (Map<String, List<String>>) cached;
      return cachedParams;
    }

    Map<String, List<String>> parsed =
        ServletUtil.parseQueryString(req.getQueryString());
    req.setAttribute(PARAMETER_CACHE_NAME, parsed);
    return parsed;
  }

  /**
   * Retrieves the content of a document from a connector instance and
   * writes it to the response.
   *
   * <p>Requests that look like authorization checks (User-Agent "SecMgr",
   * any Range header, or the HEAD method) are rejected with 403. Requests
   * lacking a connector name or document id are rejected with 400.
   * Otherwise the document's metadata is used to mark its security, set the
   * Content-Type, Content-Length and external metadata headers, and the
   * content is then streamed to the client, gzip-compressed if compression
   * is enabled and the client accepts it.
   *
   * <p>The logging context pushed for the request is always popped again,
   * regardless of which path is taken out of this method.
   *
   * @param req the servlet request
   * @param res the servlet response
   * @param manager manager to use for retrieving document information
   * @throws IOException if reading the content or writing the response fails
   */
  @VisibleForTesting
  static void doGet(HttpServletRequest req, HttpServletResponse res,
      Manager manager) throws IOException {
    // The servlet relies on proper security to be handled by a filter.

    if ("SecMgr".equals(req.getHeader("User-Agent")) ||
        req.getHeader("Range") != null ||
        "HEAD".equals(req.getMethod())) {
      // GSA does a GET with Range:0-0 to simulate head request.
      // Assume that a "HEAD" request to check authz is being performed
      // due to presence of Range header.
      // We don't support authz by hr so we always issue deny.
      // TODO(ejona): Remove checking for Range header and HEAD once
      // Legacy Authz is removed from supported GSA versions.
      LOGGER.finest("RETRIEVER: Head request denied");
      res.sendError(HttpServletResponse.SC_FORBIDDEN);
      return;
    }

    Map<String, List<String>> params = getQueryParams(req);
    String connectorName = ServletUtil.getFirstParameter(
        params, ServletUtil.XMLTAG_CONNECTOR_NAME);
    String docid = ServletUtil.getFirstParameter(
        params, ServletUtil.QUERY_PARAM_DOCID);
    if (Strings.isNullOrEmpty(connectorName) || Strings.isNullOrEmpty(docid)) {
      res.sendError(HttpServletResponse.SC_BAD_REQUEST);
      return;
    }

    // Tag log messages with the connector and the last path segment of the
    // docid. Everything after this push runs inside try/finally so that the
    // context is popped on early returns and on exceptions alike.
    NDC.pushAppend("Retrieve " + connectorName + " "
                   + docid.substring(docid.lastIndexOf('/') + 1));
    try {
      Document metadata;
      try {
        metadata = getDocumentMetaData(req, manager, connectorName, docid);
      } catch (Exception e) {
        res.sendError(handleException("metadata", e));
        return;
      }

      int securityCode = handleMarkingDocumentSecurity(req, res, metadata);
      if (securityCode != HttpServletResponse.SC_OK) {
        res.sendError(securityCode);
        return;
      }

      // Set the Content-Type.
      String mimeType = handleGetContentType(metadata);
      LOGGER.log(Level.FINEST, "Document Content-Type {0}", mimeType);
      res.setContentType(mimeType);

      Integer contentLength = handleGetContentLength(metadata);
      if (contentLength != null) {
        LOGGER.log(Level.FINEST, "Document Content-Length {0}", contentLength);
        res.setContentLength(contentLength);
      }

      // Supply the document metadata in an X-Gsa-External-Metadata header.
      if (metadata != null) {
        res.setHeader(EXTERNAL_METADATA_HEADER, getMetadataHeader(metadata));
      }

      OutputStream out = res.getOutputStream();
      if (useCompression) {
        // Select Content-Encoding based on the client's Accept-Encoding header.
        // Choose GZIP if the header includes "gzip", otherwise no compression.
        String encodings = req.getHeader("Accept-Encoding");
        if (encodings != null && encodings.matches(".*\\bgzip\\b.*")) {
          res.setHeader("Content-Encoding", "gzip");
          out = new GZIPOutputStream(out, 64 * 1024);
        }
        res.setHeader("Vary", "Accept-Encoding");
      }

      // TODO: Configure chunked output?

      try {
        int code = handleDoGet(manager, connectorName, docid, out);
        if (code != HttpServletResponse.SC_OK) {
          res.sendError(code);
        } else {
          res.setStatus(code);
        }
      } finally {
        out.close();
      }
    } finally {
      // Balance the pushAppend above, even if closing the stream failed.
      NDC.pop();
    }
  }

  /**
   * Builds the GSA-specific metadata header value for crawl-time metadata,
   * based upon the Document's supplied metadata.
   *
   * <p>Properties are emitted in sorted name order, skipping any name found
   * in {@code XmlFeed.propertySkipSet}. A property that cannot be read is
   * logged and omitted; failure to read the property names yields an empty
   * header value.
   *
   * @param metadata the Document metadata; must not be {@code null}
   * @return the comma-separated, encoded metadata, or an empty string if
   *         there is nothing to report
   */
  // Warning: See XmlFeed.wrapMetaData() if you make changes here.
  @VisibleForTesting
  static String getMetadataHeader(Document metadata) {
    Set<String> names;
    try {
      names = metadata.getPropertyNames();
    } catch (RepositoryException e) {
      LOGGER.log(Level.WARNING, "Failed to retrieve property names", e);
      return "";
    }
    if (names == null || names.isEmpty()) {
      return "";
    }

    StringBuilder header = new StringBuilder();
    // Sort property names so that metadata is written in a canonical form.
    // The GSA's metadata change detection logic depends on the metadata to
    // be in the same order each time to prevent reindexing.
    for (String name : new TreeSet<String>(names)) {
      if (!XmlFeed.propertySkipSet.contains(name)) {
        try {
          Property property = metadata.findProperty(name);
          if (property != null) {
            encodeOneProperty(header, name, property);
          }
        } catch (RepositoryException e) {
          LOGGER.log(Level.WARNING, "Failed to retrieve property " + name, e);
        }
      }
    }

    // Every encoded value is followed by a comma; drop the trailing one.
    int length = header.length();
    return (length == 0) ? "" : header.substring(0, length - 1);
  }
  
  /**
   * Adds one Property's values to the metadata header under construction.
   * Each non-empty value is percent-encoded together with the property name
   * and followed by a comma.
   *
   * @param sb the header value being built
   * @param name the property name
   * @param property the property whose values are consumed
   * @throws RepositoryException if a value cannot be read from the property
   */
  private static void encodeOneProperty(StringBuilder sb, String name,
      Property property) throws RepositoryException {
    for (ValueImpl value = (ValueImpl) property.nextValue();
         value != null;
         value = (ValueImpl) property.nextValue()) {
      LOGGER.log(Level.FINEST, "PROPERTY: {0} = \"{1}\"",
                 new Object[] { name, value.toString() });
      String encoded = value.toFeedXml();
      if (Strings.isNullOrEmpty(encoded)) {
        continue;
      }
      ServletUtil.percentEncode(sb, name, encoded);
      sb.append(',');
    }
  }

  /**
   * Copies the content of a document from a connector instance to the
   * supplied stream.
   *
   * @param manager a Manager
   * @param connectorName the name of the connector instance that
   *        can access the document
   * @param docid the document identifier
   * @param out OutputStream to which to write the content; it is not closed
   *        by this method
   * @return an HTTP Status Code: 200 on success, otherwise the code chosen
   *         for the failure
   * @throws IOException if reading or writing the content fails, or if the
   *         content stream cannot be closed
   */
  @VisibleForTesting
  static int handleDoGet(Manager manager, String connectorName, String docid,
      OutputStream out) throws IOException {
    InputStream content = null;
    try {
      content = manager.getDocumentContent(connectorName, docid);
      if (content == null) {
        // This is unlikely to happen, since Production Manager
        // will return an AlternateContent InputStream.
        content = new ByteArrayInputStream(new byte[0]);
      }
      byte[] chunk = new byte[1024 * 1024];
      int count;
      while ((count = content.read(chunk)) != -1) {
        if (count > 0) {
          out.write(chunk, 0, count);
        }
      }
      return HttpServletResponse.SC_OK;
    } catch (Exception e) {
      return handleException("content", e);
    } finally {
      if (content != null) {
        content.close();
      }
    }
  }

  /**
   * Retrieves and caches the metadata of the currently requested document,
   * without throwing checked exceptions.
   * The metadata is cached for the life of the servlet request.
   * If a problem occurs, {@code null} is returned.
   *
   * @param req Request to use for caching return value
   * @param manager a Manager
   * @param connectorName the name of the connector instance that
   *        can access the document
   * @param docid the document identifier
   * @return document's metadata or {@code null} if it is unavailable
   */
  private static Document getDocumentMetaDataNoThrow(HttpServletRequest req,
      Manager manager, String connectorName, String docid) {
    Document metadata = null;
    try {
      metadata = getDocumentMetaData(req, manager, connectorName, docid);
    } catch (ConnectorManagerException ignored) {
      // Best effort only: the metadata is reported as unavailable.
    } catch (RepositoryException ignored) {
      // Best effort only: the metadata is reported as unavailable.
    }
    return metadata;
  }

  /**
   * Retrieves and caches the metadata of the currently requested document.
   * The metadata is cached for the life of the servlet request; a
   * {@code null} result from the Manager is cached as well, so the Manager
   * is not asked again for the same request. Nothing is cached if the
   * Manager throws.
   *
   * @param req Request to use for caching return value
   * @param manager a Manager
   * @param connectorName the name of the connector instance that
   *        can access the document
   * @param docid the document identifier
   * @return document's metadata or {@code null} if it is unavailable
   * @throws ConnectorManagerException if thrown by the Manager
   * @throws RepositoryException if thrown by the Manager
   */
  @VisibleForTesting
  static Document getDocumentMetaData(HttpServletRequest req,
      Manager manager, String connectorName, String docid)
      throws ConnectorManagerException, RepositoryException {
    Object cached = req.getAttribute(METADATA_CACHE_NAME);
    if (cached == NEGATIVE_METADATA_CACHE_VALUE) {
      // A previous lookup already established there is no metadata.
      return null;
    }
    if (cached != null) {
      return (Document) cached;
    }

    Document metadata = manager.getDocumentMetaData(connectorName, docid);
    Object toCache =
        (metadata != null) ? metadata : NEGATIVE_METADATA_CACHE_VALUE;
    req.setAttribute(METADATA_CACHE_NAME, toCache);
    return metadata;
  }

  /**
   * Extracts the last modified date of a document from its metadata.
   * Only a last-modified property holding a {@link DateValue} is understood;
   * any other kind of value is treated as unknown.
   *
   * @param metadata the Document metadata, possibly {@code null}
   * @return a long integer specifying the time the document was last modified,
   *         in milliseconds since midnight, January 1, 1970 GMT, or -1L if the
   *         time is not known
   */
  @VisibleForTesting
  static long handleGetLastModified(Document metadata) {
    if (metadata == null) {
      return -1L;
    }

    try {
      // TODO: Value and DateValue Calendar methods are too weak to try to get
      // last modified from non-DateValues.
      ValueImpl value = (ValueImpl)
          Value.getSingleValue(metadata, SpiConstants.PROPNAME_LASTMODIFIED);
      if (value == null) {
        LOGGER.log(Level.FINEST, "Document does not contain {0}",
                   SpiConstants.PROPNAME_LASTMODIFIED);
        return -1L;
      }
      if (!(value instanceof DateValue)) {
        return -1L;
      }
      // DateValues don't give direct access to their Calendar object, but
      // the Calendar can be recovered by parsing the stringized version.
      // This method also applies the FeedTimeZone, if needed.
      // TODO: Add a DateValue.getTimeMillis() or getCalendar() method to
      // directly access the wrapped value.
      String iso8601 = ((DateValue) value).toIso8601();
      LOGGER.log(Level.FINEST, "Document last modified {0}", iso8601);
      return Value.iso8601ToCalendar(iso8601).getTimeInMillis();
    } catch (RepositoryException e) {
      LOGGER.log(Level.WARNING, "Failed to retrieve last-modified date", e);
    } catch (ParseException e) {
      LOGGER.log(Level.WARNING, "Failed to parse last-modified date", e);
    }
    return -1L;
  }

  /**
   * Determines the content type of a document from its metadata.
   *
   * @param metadata the Document metadata, possibly {@code null}
   * @return the content-type of the document, as a string, or
   *         {@link SpiConstants#DEFAULT_MIMETYPE} if the content type
   *         is not supplied or cannot be read
   */
  @VisibleForTesting
  static String handleGetContentType(Document metadata) {
    // NOTE: To maintain consistency with the XmlFeed, this code returns
    // SpiConstants.DEFAULT_MIMETYPE ("text/html") if the Document supplies
    // no mime type property. However, the GSA would really rather receive
    // MimeTypeDetector.UKNOWN_MIMETYPE ("application/octet-stream").
    if (metadata == null) {
      return SpiConstants.DEFAULT_MIMETYPE;
    }

    String mimeType = null;
    try {
      mimeType = Value.getSingleValueString(metadata,
          SpiConstants.PROPNAME_MIMETYPE);
    } catch (RepositoryException e) {
      LOGGER.log(Level.WARNING, "Failed to retrieve content-type", e);
    }
    return Strings.isNullOrEmpty(mimeType)
        ? SpiConstants.DEFAULT_MIMETYPE
        : mimeType;
  }
  
  /**
   * Determines the content length of a document from its metadata.
   *
   * @param metadata the Document metadata, possibly {@code null}
   * @return the content-length of the document, as an Integer, or {@code null}
   *         if the content length is not known, less than or equal to zero,
   *         or the value does not fit in an Integer.  Note that if the
   *         content-length returned by the connector is zero, this returns
   *         null, since the GSA does not support empty documents, so the
   *         empty content will be replaced by ProductionManager with alternate
   *         non-empty content.
   */
  @VisibleForTesting
  static Integer handleGetContentLength(Document metadata) {
    if (metadata == null) {
      return null;
    }

    try {
      String text = Value.getSingleValueString(metadata,
          SpiConstants.PROPNAME_CONTENT_LENGTH);
      if (Strings.isNullOrEmpty(text)) {
        return null;
      }
      int parsed = Integer.parseInt(text);
      if (parsed > 0) {
        return Integer.valueOf(parsed);
      }
    } catch (NumberFormatException e) {
      LOGGER.log(Level.WARNING, "Failed to retrieve content-length", e);
    } catch (RepositoryException e) {
      LOGGER.log(Level.WARNING, "Failed to retrieve content-length", e);
    }
    return null;
  }
  
  /**
   * Decides how the response should convey whether the document is public
   * or secure, and sets the corresponding response header if one is needed.
   *
   * <p>A request carrying an Authorization header is accepted as is. For
   * other requests the document's isPublic property is consulted (a missing
   * property counts as public). If the security header is supported, the
   * document is marked via {@code X-Gsa-Serve-Security}; otherwise a
   * non-public document is answered with an HTTP Basic challenge.
   *
   * @param req the servlet request
   * @param res the servlet response, on which headers may be set
   * @param metadata the Document metadata, possibly {@code null}
   * @return 200 if content may be served, 401 if authentication is
   *         required, or 503 if the metadata is missing or unreadable
   * @throws IOException declared for callers; not thrown by the visible code
   */
  @VisibleForTesting
  static int handleMarkingDocumentSecurity(HttpServletRequest req,
      HttpServletResponse res, Document metadata) throws IOException {
    if (req.getHeader("Authorization") != null) {
      // GSA logged in; it is aware of the access restrictions on the document.
      return HttpServletResponse.SC_OK;
    }
    if (metadata == null) {
      return HttpServletResponse.SC_SERVICE_UNAVAILABLE;
    }

    ValueImpl publicFlag;
    try {
      publicFlag = (ValueImpl) Value.getSingleValue(metadata,
          SpiConstants.PROPNAME_ISPUBLIC);
    } catch (RepositoryException ex) {
      LOGGER.log(Level.WARNING, "Failed retrieving isPublic property", ex);
      return HttpServletResponse.SC_SERVICE_UNAVAILABLE;
    }
    // A document without an isPublic property is treated as public.
    boolean isPublic = (publicFlag == null) || publicFlag.toBoolean();

    if (isSecurityHeaderSupported()) {
      String marking = isPublic ? "public" : "secure";
      res.setHeader("X-Gsa-Serve-Security", marking);
      return HttpServletResponse.SC_OK;
    }
    if (isPublic) {
      return HttpServletResponse.SC_OK;
    }
    // No security header available, so fall back to an HTTP Basic challenge.
    res.setHeader("WWW-Authenticate", "Basic realm=\"Retriever\"");
    return HttpServletResponse.SC_UNAUTHORIZED;
  }

  /**
   * Logs an Exception and returns an appropriate HTTP status code.
   *
   * <p>Document-not-found, skipped-document, access and connector-not-found
   * failures are logged briefly at FINE level. Other repository failures
   * are logged with their stack trace at WARNING level. I/O and runtime
   * exceptions are logged and then rethrown rather than mapped to a status.
   *
   * @param context short description of what was being retrieved, used in
   *        the log message
   * @param e the exception to classify
   * @return 404, 403, 503 or 500, depending on the type of {@code e}
   * @throws IOException if {@code e} is itself an IOException
   */
  private static int handleException(String context, Exception e)
      throws IOException {
    // Failures that get a terse FINE-level message and a specific status.
    int quietStatus = -1;
    if (e instanceof DocumentNotFoundException
        || e instanceof SkippedDocumentException) {
      quietStatus = HttpServletResponse.SC_NOT_FOUND;
    } else if (e instanceof DocumentAccessException) {
      quietStatus = HttpServletResponse.SC_FORBIDDEN;
    } else if (e instanceof ConnectorNotFoundException) {
      quietStatus = HttpServletResponse.SC_SERVICE_UNAVAILABLE;
    }
    if (quietStatus != -1) {
      LOGGER.log(Level.FINE, "Failed to retrieve document {0}: {1}",
                 new Object[] {context, e.toString()});
      return quietStatus;
    }

    // Everything else is logged together with its stack trace.
    String message = "Failed to retrieve document " + context;
    if (e instanceof RepositoryException) {
      LOGGER.log(Level.WARNING, message, e);
      return HttpServletResponse.SC_SERVICE_UNAVAILABLE;
    }
    if (e instanceof IOException) {
      LOGGER.log(Level.WARNING, message, e);
      throw (IOException) e;
    }
    if (e instanceof RuntimeException) {
      LOGGER.log(Level.WARNING, message, e);
      throw (RuntimeException) e;
    }
    // Whatever remains is expected to be a ConnectorManagerException.
    LOGGER.log(Level.SEVERE, message, e);
    return HttpServletResponse.SC_INTERNAL_SERVER_ERROR;
  }
}