// Copyright 2012 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.connector.common;
import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.enterprise.connector.util.Base16;
import java.text.MessageFormat;
/**
* Bare minimum PDF utilities.
*/
public class PdfUtil {
private PdfUtil() { // Prevent instantiation.
throw new AssertionError("Do not instantiate PdfUtil");
}
/**
* Minimal PDF boilerplate, according to Adobe PDF Reference Manual.
*/
private static String PDF_OBJS = "%PDF-1.1\n"
+ "1 0 obj\n<</Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n"
+ "2 0 obj\n<</Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>\nendobj\n"
+ "3 0 obj\n<</Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 72 72]\n>>\n"
+ "endobj\n4 0 obj\n<<{0}\n>>\nendobj\n";
private static String PDF_XREF = "xref\n0 5\n0000000000 65535 f\r\n"
+ "0000000009 00000 n\r\n0000000057 00000 n\r\n"
+ "0000000113 00000 n\r\n0000000181 00000 n\r\n"
+ "trailer\n<</Size 5\n/Root 1 0 R\n/Info 4 0 R\n>>\nstartxref\n{0}\n"
+ "%%EOF\n";
/**
* Creates a tiny, empty PDF document.
*
* @return a String containing a PDF encoding of an empty document
*/
public static String emptyPdf() {
return titledEmptyPdf(null);
}
/**
* Creates a tiny, empty PDF document with a Title entry in the
* Document Information Dictionary.
*
* Note that the GSA PDF text extraction process only pulls out
* the first 128 characters of the Title.
*
* @param title the title of the document
* @return a String containing a PDF encoding of a titled document
*/
public static String titledEmptyPdf(String title) {
StringBuilder buf = new StringBuilder();
// If title is null or empty, do not insert a /Title element.
// For some reason, the GSA pdf titles can not have periods.
buf.append(MessageFormat.format(PDF_OBJS, Strings.isNullOrEmpty(title)
? "" : "/Title " + toBinaryString(title.replace('.', ' '))));
buf.append(MessageFormat.format(PDF_XREF, buf.length()));
return buf.toString();
}
/**
* PDF literal strings are limited to 8-bit characters.
* Unicode characters need to be encoded as UTF-16BE in
* a stream of hexadecimal characters.
*
* @param text some plain text
* @return a PDF hexadecimal string encoding the text
*/
public static String toBinaryString(String text) {
// Leading FEFF indicates big-endian 16-bit Unicode text.
StringBuilder buf = new StringBuilder("<FEFF");
byte[] bytes = text.getBytes(Charsets.UTF_16BE);
Base16.upperCase().encode(bytes, buf);
buf.append('>');
return buf.toString();
}
}