// Copyright 2011 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.enterprise.connector.common;

import com.google.common.base.Charsets;
import com.google.enterprise.connector.pusher.XmlFeed;
import com.google.enterprise.connector.spi.XmlUtils;

import java.io.ByteArrayInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Logger;

/**
 * A {@code FilterInputStream} that protects against large documents and empty
 * documents.
 * <p>
 * If we have read more than {@code FileSizeLimitInfo.maxDocumentSize}
 * bytes from the input, we reset the feed to before we started reading
 * content, then provide the alternate content.  Similarly, if we get EOF
 * after reading zero bytes, we provide the alternate content.
 * </p>
 * <p>
 * This filter assumes that a {@link BigEmptyDocumentFilterInputStream}
 * is somewhere up-stream, in that {@link AlternateContentFilterInputStream}
 * looks for the {@link BigDocumentException} and {@link EmptyDocumentException}
 * that would be thrown.
 * </p>
 */
// TODO: WARNING: This will not work for BigDocumentException if using
// chunked HTTP transfer.
public class AlternateContentFilterInputStream extends FilterInputStream {
  private static final Logger LOGGER =
      Logger.getLogger(AlternateContentFilterInputStream.class.getName());

  private static final String SPACE = " ";

  /**
   * Construct the alternate content data for a feed item.  If the feed item
   * has null or empty content, or if the feed item has excessively large
   * content, substitute this data which will ensure that the feed item gets
   * indexed by the GSA.  The alternate content consists of the item's title,
   * or a single space, if it lacks a title.
   *
   * @param title from the feed item; a {@code null} or blank title is
   *        treated as "no title"
   * @param mimeType MIME type of the feed item; may be {@code null}
   * @return an InputStream containing the alternate content, UTF-8 encoded
   */
  public static InputStream getAlternateContent(String title,
      String mimeType) {
    // Normalize a whitespace-only title to "no title".
    if (title != null && title.trim().length() == 0) {
      title = null;
    }

    String content;
    if ("application/pdf".equalsIgnoreCase(mimeType)) {
      // Alternate content for PDF must still be a PDF,
      // or the GSA drops the document with a "Conversion Error".
      content = PdfUtil.titledEmptyPdf(title);
    } else if (title != null) {
      // If the feed item supplied a title property, we build an
      // HTML5 fragment containing that title. This provides better
      // looking search result entries.
      content = titledEmptyHtml(title);
    } else {
      // If no title is available, supply a single space as the content.
      content = SPACE;
    }
    return new ByteArrayInputStream(content.getBytes(Charsets.UTF_8));
  }

  /**
   * Builds an HTML5 document with just a title, no body.
   *
   * @param title the title text; it is escaped via
   *        {@code XmlUtils.xmlAppendAttrValue} before being embedded
   * @return the HTML document as a string
   */
  private static String titledEmptyHtml(String title) {
    StringBuilder buf = new StringBuilder();
    try {
      buf.append("<!DOCTYPE html><html><head><meta charset=\"utf-8\"/>");
      buf.append("<title>");
      XmlUtils.xmlAppendAttrValue(title, buf);
      buf.append("</title></html>");
    } catch (IOException e) {
      // Should not happen with StringBuilder.
      throw new AssertionError(e);
    }
    return buf.toString();
  }

  /** True once reads should be served from {@link #alternate}. */
  private boolean useAlternate;

  /** The alternate content; never {@code null} after construction. */
  private final InputStream alternate;

  /** The feed under construction, or {@code null} if there is none. */
  private final XmlFeed feed;

  /**
   * Size of the feed when the first read was attempted.  The value -1 means
   * "not recorded yet"; it is 0 (and never consulted) when there is no feed.
   */
  private int resetPoint;

  /**
   * @param in InputStream containing raw document content.
   *        May be {@code null}.
   * @param alternate InputStream containing alternate content to provide.
   *        If {@code null}, a default alternate content of a single space
   *        character is used.
   */
  public AlternateContentFilterInputStream(InputStream in,
      InputStream alternate) {
    this(in, alternate, null);
  }

  /**
   * @param in InputStream containing raw document content.
   *        May be {@code null}, in which case only the alternate content
   *        is ever returned.
   * @param alternate InputStream containing alternate content to provide.
   *        If {@code null}, a default alternate content of a single space
   *        character is used.
   * @param feed XmlFeed under construction (used for resetting size).
   *        May be {@code null}.
   */
  public AlternateContentFilterInputStream(InputStream in,
      InputStream alternate, XmlFeed feed) {
    super(in);
    if (alternate == null) {
      // Use the default Alternate content: a single space.
      alternate = getAlternateContent(null, null);
    }
    this.useAlternate = (in == null);
    this.alternate = alternate;
    this.feed = feed;
    // With a feed, defer capturing the reset point until the first read.
    this.resetPoint = (feed == null) ? 0 : -1;
  }

  /**
   * If nothing has been read yet, remembers the current size of the feed
   * as the point to reset to.  The sentinel -1 is only ever assigned when
   * {@code feed} is non-null, so the dereference here is safe.
   */
  private void rememberResetPoint() {
    if (resetPoint == -1) {
      resetPoint = feed.size();
    }
  }

  /**
   * Resets the feed to its position when we started reading this stream,
   * and starts reading from the alternate input.
   */
  private void switchToAlternate() {
    if (feed != null) {
      feed.reset(resetPoint);
    }
    useAlternate = true;
  }

  /**
   * Reads a single byte from the document content, or from the alternate
   * content once the document was found to be empty or too big.
   *
   * @return the next byte, or -1 at end of stream
   * @throws IOException if the underlying read fails for any other reason
   */
  @Override
  public int read() throws IOException {
    rememberResetPoint();
    if (!useAlternate) {
      try {
        return super.read();
      } catch (EmptyDocumentException e) {
        switchToAlternate();
      } catch (BigDocumentException e) {
        LOGGER.finer("Document content exceeds the maximum configured "
                     + "document size, discarding content.");
        switchToAlternate();
      }
    }
    return alternate.read();
  }

  /**
   * Reads up to {@code len} bytes from the document content, or from the
   * alternate content once the document was found to be empty or too big.
   * <p>
   * On the call that triggers the switch to the alternate content, this
   * method returns 0 without storing any bytes; the alternate content is
   * returned starting with the next call.  Callers must therefore not treat
   * a return value of 0 as end of stream.
   * </p>
   *
   * @param b the buffer into which the data is read
   * @param off the start offset in {@code b}
   * @param len the maximum number of bytes to read
   * @return the number of bytes read (possibly 0), or -1 at end of stream
   * @throws IOException if the underlying read fails for any other reason
   */
  @Override
  public int read(byte[] b, int off, int len) throws IOException {
    rememberResetPoint();
    if (!useAlternate) {
      try {
        return super.read(b, off, len);
      } catch (EmptyDocumentException e) {
        switchToAlternate();
        return 0; // Return alternate content on subsequent call to read().
      } catch (BigDocumentException e) {
        LOGGER.finer("Document content exceeds the maximum configured "
                     + "document size, discarding content.");
        switchToAlternate();
        return 0; // Return alternate content on subsequent call to read().
      }
    }
    return alternate.read(b, off, len);
  }

  /** Mark and reset are not supported by this stream. */
  @Override
  public boolean markSupported() {
    return false;
  }

  /**
   * Closes both the wrapped document stream (if any) and the alternate
   * content stream.
   *
   * @throws IOException if closing either stream fails
   */
  @Override
  public void close() throws IOException {
    try {
      // The constructor permits a null document stream; super.close()
      // would dereference it unconditionally.
      if (in != null) {
        in.close();
      }
    } finally {
      // Always release the alternate stream, even if closing the
      // document stream failed.
      alternate.close();
    }
  }
}