AlternateContentFilterInputStream.java example

Explorer
manager.v3-master
- projects
// Copyright 2011 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.enterprise.connector.common;

import com.google.common.base.Charsets;
import com.google.enterprise.connector.pusher.XmlFeed;
import com.google.enterprise.connector.spi.XmlUtils;

import java.io.ByteArrayInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Logger;

/**
 * A {@code FilterInputStream} that protects against large documents and empty
 * documents.
 * If we have read more than {@link FileSizeLimitInfo#maxDocumentSize}
 * bytes from the input, we reset the feed to before we started reading
 * content, then provide the alternate content.  Similarly, if we get EOF
 * after reading zero bytes, we provide the alternate content.
 * <p>
 * This filter assumes that a {@link BigEmptyDocumentFilterInputStream}
 * is somewhere up-stream, in that {@link AlternateContentFilterInputStream}
 * looks for the {@link BigDocumentException} and {@link EmptyDocumentException}
 * that would be thrown.
 */
// TODO: WARNING: This will not work for BigDocumentException if using
// chunked HTTP transfer.
public class AlternateContentFilterInputStream extends FilterInputStream {
  private static final Logger LOGGER =
      Logger.getLogger(AlternateContentFilterInputStream.class.getName());

  /** Alternate content used when no title is available: a single space. */
  private static final String SPACE = " ";

  /** Sentinel for {@code resetPoint}: the feed position is not yet recorded. */
  private static final int RESET_POINT_UNSET = -1;

  /** Logged when the primary stream reports an oversized document. */
  private static final String BIG_DOCUMENT_MESSAGE =
      "Document content exceeds the maximum configured "
      + "document size, discarding content.";

  /**
   * Construct the alternate content data for a feed item.  If the feed item
   * has null or empty content, or if the feed item has excessively large
   * content, substitute this data which will ensure that the feed item gets
   * indexed by the GSA. The alternate content consists of the item's title,
   * or a single space, if it lacks a title.
   *
   * @param title from the feed item; {@code null} or blank means "no title"
   * @param mimeType MIME type of the feed item; may be {@code null}
   * @return an InputStream containing the alternate content, UTF-8 encoded
   */
  public static InputStream getAlternateContent(String title, String mimeType) {
    // Treat a blank title the same as a missing one.
    if (title != null && title.trim().length() == 0) {
      title = null;
    }

    String content;
    if ("application/pdf".equalsIgnoreCase(mimeType)) {
      // Alternate content for PDF must still be a PDF,
      // or the GSA drops the document with a "Conversion Error".
      content = PdfUtil.titledEmptyPdf(title);
    } else if (title != null) {
      // If the feed item supplied a title property, we build an
      // HTML5 fragment containing that title.  This provides better
      // looking search result entries.
      content = titledEmptyHtml(title);
    } else {
      // If no title is available, supply a single space as the content.
      content = SPACE;
    }

    return new ByteArrayInputStream(content.getBytes(Charsets.UTF_8));
  }

  /**
   * Builds an HTML5 document with just a title, no body.
   *
   * @param title the title text; it is escaped before being embedded
   * @return the HTML document as a string
   */
  private static String titledEmptyHtml(String title) {
    StringBuilder buf = new StringBuilder();
    try {
      buf.append("<!DOCTYPE html><html><head><meta charset=\"utf-8\"/>");
      buf.append("<title>");
      XmlUtils.xmlAppendAttrValue(title, buf);
      buf.append("</title></html>");
    } catch (IOException e) {
      // Should not happen with StringBuilder.
      throw new AssertionError(e);
    }
    return buf.toString();
  }

  /** True once reads must be served from {@code alternate}. */
  private boolean useAlternate;

  /** Stream supplying the substitute content; never {@code null}. */
  private final InputStream alternate;

  /** Feed to rewind when switching to the alternate; may be {@code null}. */
  private final XmlFeed feed;

  /**
   * Feed size captured on the first read, or {@code RESET_POINT_UNSET} until
   * then.  Always 0 when there is no feed.
   */
  private int resetPoint;

  /**
   * @param in InputStream containing raw document content.
   *        May be {@code null}.
   * @param alternate InputStream containing alternate content to provide
   *        If {@code null}, a default alternate content of a single space
   *        character is used.
   */
  public AlternateContentFilterInputStream(InputStream in,
      InputStream alternate) {
    this(in, alternate, null);
  }

  /**
   * @param in InputStream containing raw document content.
   *        May be {@code null}, in which case only the alternate content
   *        is ever returned.
   * @param alternate InputStream containing alternate content to provide
   *        If {@code null}, a default alternate content of a single space
   *        character is used.
   * @param feed XmlFeed under construction (used for resetting size).
   *        May be {@code null}.
   */
  public AlternateContentFilterInputStream(InputStream in,
      InputStream alternate, XmlFeed feed) {
    super(in);
    // Fall back to the default alternate content: a single space.
    this.alternate =
        (alternate == null) ? getAlternateContent(null, null) : alternate;
    // With no primary stream there is nothing to try first.
    this.useAlternate = (in == null);
    this.feed = feed;
    this.resetPoint = (feed == null) ? 0 : RESET_POINT_UNSET;
  }

  /**
   * Records the current feed size the first time any read is attempted.
   * The sentinel is only ever set when a feed is present, so {@code feed}
   * is non-null whenever the size is queried.
   */
  private void rememberResetPoint() {
    if (resetPoint == RESET_POINT_UNSET) {
      resetPoint = feed.size();
    }
  }

  /**
   * Resets the feed to its position when we started reading this stream,
   * and starts reading from the alternate input.
   */
  private void switchToAlternate() {
    if (feed != null) {
      feed.reset(resetPoint);
    }
    useAlternate = true;
  }

  /**
   * Reads one byte from the primary stream.  If the primary stream signals an
   * empty or oversized document, switches to the alternate content and
   * returns its first byte instead.
   *
   * @return the next byte, or -1 at end of stream
   * @throws IOException if the stream being read fails
   */
  @Override
  public int read() throws IOException {
    rememberResetPoint();
    if (!useAlternate) {
      try {
        return super.read();
      } catch (EmptyDocumentException e) {
        switchToAlternate();
      } catch (BigDocumentException e) {
        LOGGER.finer(BIG_DOCUMENT_MESSAGE);
        switchToAlternate();
      }
    }
    return alternate.read();
  }

  /**
   * Reads up to {@code len} bytes from the primary stream.  If the primary
   * stream signals an empty or oversized document, switches to the alternate
   * content and returns 0; the alternate content is delivered by the next
   * call.
   *
   * @param b destination buffer
   * @param off offset in {@code b} at which to store data
   * @param len maximum number of bytes to read
   * @return the number of bytes read, 0 on the call that switches to the
   *         alternate content, or -1 at end of stream
   * @throws IOException if the stream being read fails
   */
  @Override
  public int read(byte b[], int off, int len) throws IOException {
    rememberResetPoint();
    if (!useAlternate) {
      try {
        return super.read(b, off, len);
      } catch (EmptyDocumentException e) {
        switchToAlternate();
        // NOTE(review): presumably 0 is returned, rather than filling b now,
        // because the caller's offset may be stale after the feed reset.
        return 0; // Return alternate content on subsequent call to read().
      } catch (BigDocumentException e) {
        LOGGER.finer(BIG_DOCUMENT_MESSAGE);
        switchToAlternate();
        return 0; // Return alternate content on subsequent call to read().
      }
    }
    return alternate.read(b, off, len);
  }

  /** Mark/reset is not supported by this stream. */
  @Override
  public boolean markSupported() {
    return false;
  }

  /**
   * Closes both the primary stream (if there is one) and the alternate
   * stream.  The alternate stream is closed even if closing the primary
   * stream fails.
   *
   * @throws IOException if closing either stream fails
   */
  @Override
  public void close() throws IOException {
    try {
      // The constructor permits a null primary stream; FilterInputStream's
      // close() would dereference it unconditionally.
      if (in != null) {
        super.close();
      }
    } finally {
      alternate.close();
    }
  }
}