/*
 * Created on Jul 2, 2009
 * (c) 2009 Trumpet, Inc.
 *
 */
package com.itextpdf.text.pdf.parser;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;

import junit.framework.Assert;

import org.junit.BeforeClass;
import org.junit.Test;

import com.itextpdf.text.Document;
import com.itextpdf.text.Font;
import com.itextpdf.text.FontFactory;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.BaseFont;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfWriter;

/**
 * Round-trip tests for {@link PdfTextExtractor}: each test writes a two-page
 * PDF in memory with a particular font/encoding combination and then checks
 * that the text extracted from each page equals the text that was written.
 *
 * @author kevin
 */
public class PdfTextExtractorEncodingsTest {

    /** Basic Latin characters, with Unicode values less than 128 */
    private static final String TEXT1 = "AZaz09*!";

    /** Latin-1 characters */
    private static final String TEXT2 = "\u0027\u0060\u00a4\u00a6";
    // the following will cause failures
    // private static final String TEXT2 = "\u0027\u0060\u00a4\u00a6\00b5\u2019";
    // NOTE(review): the fifth escape in the disabled literal above has no 'u'
    // after the backslash, so it is not a Unicode escape - confirm whether the
    // reported failures come from that typo or from the extractor itself.

    /**
     * Registers the font directories known to {@link FontFactory} once, before
     * any test runs, so that fonts can afterwards be looked up by name.
     */
    @BeforeClass
    public static void initializeFontFactory() {
        FontFactory.registerDirectories();
    }

    /**
     * Looks up a TrueType font to test with, trying each name in a fixed
     * candidate list in order.
     *
     * @param encoding the encoding to request from the font factory
     * @param embedded whether the font should be embedded in the PDF
     * @param size the font size
     * @return the first candidate whose base font type is
     *         {@link BaseFont#FONT_TYPE_TT} or {@link BaseFont#FONT_TYPE_TTUNI}
     * @throws IllegalArgumentException if none of the candidates resolves to a
     *         TrueType font
     */
    protected static Font getSomeTTFont(String encoding, boolean embedded, float size) {
        String[] fontNames = {"arial"};

        for (String name : fontNames) {
            Font foundFont = FontFactory.getFont(name, encoding, embedded, size);
            if (foundFont == null) {
                continue;
            }

            // Guard against a font object that carries no BaseFont (presumably
            // what the factory hands back for a name it cannot resolve), so we
            // fall through to the descriptive exception below rather than
            // failing with a NullPointerException.
            BaseFont baseFont = foundFont.getBaseFont();
            if (baseFont == null) {
                continue;
            }

            switch (baseFont.getFontType()) {
                case BaseFont.FONT_TYPE_TT:
                case BaseFont.FONT_TYPE_TTUNI:
                    return foundFont; // SUCCESS
                default:
                    break; // not a TrueType font - try the next candidate
            }
        }

        throw new IllegalArgumentException("Unable to find TrueType font to test with - add the name of a TT font on the system to the fontNames array in this method");
    }

    /**
     * Builds a two-page PDF in memory: page 1 holds {@link #TEXT1} and page 2
     * holds {@link #TEXT2}, both rendered with the given font.
     *
     * @param font the font used for both paragraphs
     * @return the bytes of the finished PDF
     * @throws Exception if the document cannot be created or written
     */
    private static byte[] createPdf(final Font font) throws Exception {
        final ByteArrayOutputStream byteStream = new ByteArrayOutputStream();

        final Document document = new Document();
        PdfWriter.getInstance(document, byteStream);
        document.open();
        document.add(new Paragraph(TEXT1, font));
        document.newPage();
        document.add(new Paragraph(TEXT2, font));
        document.close();

        return byteStream.toByteArray();
    }

    /**
     * Used for testing only if we need to open the PDF itself. Writes the
     * bytes to the given file and prints the file's absolute path to standard
     * output.
     *
     * @param bytes the PDF content to write
     * @param file the destination file
     * @throws Exception if the file cannot be opened, written or closed
     */
    private void saveBytesToFile(byte[] bytes, File file) throws Exception {
        final FileOutputStream outputStream = new FileOutputStream(file);
        try {
            outputStream.write(bytes);
        } finally {
            // Close even when the write fails so the file handle is not leaked.
            outputStream.close();
        }
        System.out.println("PDF dumped to " + file.getAbsolutePath());
    }

    /**
     * Test parsing a document which uses a standard non-embedded font.
     *
     * @throws Exception any exception will cause the test to fail
     */
    @Test
    public void testStandardFont() throws Exception {
        Font font = new Font(Font.TIMES_ROMAN, 12);
        byte[] pdfBytes = createPdf(font);

        // Debugging hook: flip the condition to true to dump the generated PDF
        // to disk for manual inspection.
        if (false) {
            saveBytesToFile(pdfBytes, new File("test.pdf"));
        }

        checkPdf(pdfBytes);
    }

    /**
     * Test parsing a document which uses a font encoding which creates a /Differences
     * PdfArray in the PDF.
     *
     * @throws Exception any exception will cause the test to fail
     */
    @Test
    public void testEncodedFont() throws Exception {
        Font font = getSomeTTFont("ISO-8859-1", BaseFont.EMBEDDED, 12);
        byte[] pdfBytes = createPdf(font);
        checkPdf(pdfBytes);
    }

    /**
     * Test parsing a document which uses a Unicode font encoding which creates a /ToUnicode
     * entry in the PDF.
     *
     * @throws Exception any exception will cause the test to fail
     */
    @Test
    public void testUnicodeFont() throws Exception {
        Font font = getSomeTTFont(BaseFont.IDENTITY_H, BaseFont.EMBEDDED, 12);
        byte[] pdfBytes = createPdf(font);
        checkPdf(pdfBytes);
    }

    /**
     * Parses the given PDF and asserts that the text extracted from page 1
     * equals {@link #TEXT1} and the text extracted from page 2 equals
     * {@link #TEXT2}.
     *
     * @param pdfBytes the PDF to verify, as produced by {@link #createPdf(Font)}
     * @throws Exception if the PDF cannot be read or parsed
     */
    private void checkPdf(final byte[] pdfBytes) throws Exception {
        final PdfReader pdfReader = new PdfReader(pdfBytes);
        final PdfTextExtractor textExtractor = new PdfTextExtractor(pdfReader);

        // Characters from http://unicode.org/charts/PDF/U0000.pdf
        Assert.assertEquals(TEXT1, textExtractor.getTextFromPage(1));

        // Characters from http://unicode.org/charts/PDF/U0080.pdf
        Assert.assertEquals(TEXT2, textExtractor.getTextFromPage(2));
    }
}