// Copyright 2012-01-10 PlanBase Inc. & Glen Peterson // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.planbase.pdf.layoutmanager; import org.apache.pdfbox.exceptions.COSVisitorException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.edit.PDPageContentStream; import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.pdmodel.graphics.xobject.PDJpeg; import org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap; import java.awt.Color; import java.awt.image.BufferedImage; import java.io.IOException; import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; /** <p>The main class in this package; it handles page and line breaks.</p> <h3>Usage (the unit test is a much better example):</h3> <pre><code>// Create a new manager PdfLayoutMgr pageMgr = PdfLayoutMgr.newRgbPageMgr(); LogicalPage lp = pageMgr.logicalPageStart(); // defaults to Landscape orientation // call various lp.tableBuilder() or lp.put...() methods here. // They will page-break and create extra physical pages as needed. // ... lp.commit(); lp = pageMgr.logicalPageStart(LogicalPage.Orientation.PORTRAIT); // These pages will be in Portrait orientation // call various lp methods to put things on the next page grouping // ... lp.commit(); // The file to write to OutputStream os = new FileOutputStream("test.pdf"); // Commit all pages to output stream. pageMgr.save(os);</code></pre> <br> <h3>Note:</h3> <p>Because this class buffers and writes to an underlying stream, it is mutable, has side effects, and is NOT thread-safe!</p> */ public class PdfLayoutMgr { // private Logger logger = Logger.getLogger(PdfLayoutMgr.class); // logger.info("Ascent: " + PDType1Font.HELVETICA.getFontDescriptor().getAscent()); // logger.info("StemH: " + PDType1Font.HELVETICA.getFontDescriptor().getStemH()); // logger.info("CapHeight: " + PDType1Font.HELVETICA.getFontDescriptor().getCapHeight()); // logger.info("XHeight: " + PDType1Font.HELVETICA.getFontDescriptor().getXHeight()); // logger.info("Descent: " + PDType1Font.HELVETICA.getFontDescriptor().getDescent()); // logger.info("Leading: " + PDType1Font.HELVETICA.getFontDescriptor().getLeading()); // // logger.info("Height: " + PDType1Font.HELVETICA.getFontDescriptor().getFontBoundingBox().getHeight()); // // Ascent: 718.0 // StemH: 0.0 // CapHeight: 718.0 // XHeight: 523.0 // Descent: -207.0 // Leading: 0.0 // Height: 1156.0 // CapHeight - descent = 925 // 925 - descent = 1132 which is still less than 1156. // I'm going to make line-height = // Java FontMetrics says getHeight() = getAscent() + getDescent() + getLeading(). // I think ascent and descent are compatible with this. I'm going to make Leading be // -descent/2 /** If you use no scaling when printing the output PDF, PDFBox shows approximately 72 Document-Units Per Inch. This makes one pixel on an average desktop monitor correspond to roughly one document unit. This is a useful constant for page layout math. */ public static final float DOC_UNITS_PER_INCH = 72f; // TODO: add Sensible defaults, such as textStyle? // private TextStyle textStyle; // private PDRectangle pageDimensions; // private Padding pageMargins; // private PDRectangle printableArea; // // public TextStyle textStyle() { return textStyle; } // public PDRectangle pageDimensions() { return pageDimensions; } // public Padding pageMargins() { return pageMargins; } // public PDRectangle printableArea() { return printableArea; } // You can have many DrawJpegs backed by only a few images - it is a flyweight, and this // hash map keeps track of the few underlying images, even as intances of DrawJpeg // represent all the places where these images are used. // CRITICAL: This means that the the set of jpgs must be thrown out and created anew for each // document! Thus, a private final field on the PdfLayoutMgr instead of DrawJpeg, and DrawJpeg // must be an inner class (or this would have to be package scoped). private final Map<BufferedImage,PDJpeg> jpegMap = new HashMap<BufferedImage,PDJpeg>(); private PDJpeg ensureCached(final ScaledJpeg sj) { BufferedImage bufferedImage = sj.bufferedImage(); PDJpeg temp = jpegMap.get(bufferedImage); if (temp == null) { try { temp = new PDJpeg(doc, bufferedImage); } catch (IOException ioe) { // can there ever be an exception here? Doesn't it get written later? throw new IllegalStateException("Caught exception creating a PDJpeg from a bufferedImage", ioe); } jpegMap.put(bufferedImage, temp); } return temp; } // You can have many DrawPngs backed by only a few images - it is a flyweight, and this // hash map keeps track of the few underlying images, even as intances of DrawPng // represent all the places where these images are used. // CRITICAL: This means that the the set of jpgs must be thrown out and created anew for each // document! Thus, a private final field on the PdfLayoutMgr instead of DrawPng, and DrawPng // must be an inner class (or this would have to be package scoped). private final Map<BufferedImage,PDPixelMap> pngMap = new HashMap<BufferedImage,PDPixelMap>(); private PDPixelMap ensureCached(final ScaledPng sj) { BufferedImage bufferedImage = sj.bufferedImage(); PDPixelMap temp = pngMap.get(bufferedImage); if (temp == null) { try { temp = new PDPixelMap(doc, bufferedImage); } catch (IOException ioe) { // can there ever be an exception here? Doesn't it get written later? throw new IllegalStateException("Caught exception creating a PDPixelMap from a bufferedImage", ioe); } pngMap.put(bufferedImage, temp); } return temp; } /** * Please don't access this class directly if you don't have to. It's a little bit like a model for stuff that * needs to be drawn on a page, but much more like a heap of random functionality that sort of landed in an * inner class. This will probably be refactored away in future releases. */ static class PageBuffer { public final int pageNum; private long lastOrd = 0; private final Set<PdfItem> items = new TreeSet<PdfItem>(); private PageBuffer(int pn) { pageNum = pn; } void fillRect(final float xVal, final float yVal, final float w, final float h, final Color c, final float z) { items.add(FillRect.of(xVal, yVal, w, h, c, lastOrd++, z)); } // public void fillRect(final float xVal, final float yVal, final float w, final Color c, // final float h) { // fillRect(xVal, yVal, w, h, c, PdfItem.DEFAULT_Z_INDEX); // } // // public void drawJpeg(final float xVal, final float yVal, final BufferedImage bi, // final PdfLayoutMgr mgr, final float z) { // items.add(DrawJpeg.of(xVal, yVal, bi, mgr, lastOrd++, z)); // } void drawJpeg(final float xVal, final float yVal, final ScaledJpeg sj, final PdfLayoutMgr mgr) { items.add(DrawJpeg.of(xVal, yVal, sj, mgr, lastOrd++, PdfItem.DEFAULT_Z_INDEX)); } void drawPng(final float xVal, final float yVal, final ScaledPng sj, final PdfLayoutMgr mgr) { items.add(DrawPng.of(xVal, yVal, sj, mgr, lastOrd++, PdfItem.DEFAULT_Z_INDEX)); } private void drawLine(final float xa, final float ya, final float xb, final float yb, final LineStyle ls, final float z) { items.add(DrawLine.of(xa, ya, xb, yb, ls, lastOrd++, z)); } void drawLine(final float xa, final float ya, final float xb, final float yb, final LineStyle ls) { drawLine(xa, ya, xb, yb, ls, PdfItem.DEFAULT_Z_INDEX); } private void drawStyledText(final float xCoord, final float yCoord, final String text, TextStyle s, final float z) { items.add(Text.of(xCoord, yCoord, text, s, lastOrd++, z)); } void drawStyledText(final float xCoord, final float yCoord, final String text, TextStyle s) { drawStyledText(xCoord, yCoord, text, s, PdfItem.DEFAULT_Z_INDEX); } private void commit(PDPageContentStream stream) throws IOException { // Since items are z-ordered, then sub-ordered by entry-order, we will draw // everything in the correct order. for (PdfItem item : items) { item.commit(stream); } } private static class DrawLine extends PdfItem { private final float x1, y1, x2, y2; private final LineStyle style; private DrawLine(final float xa, final float ya, final float xb, final float yb, LineStyle s, final long ord, final float z) { super(ord, z); x1 = xa; y1 = ya; x2 = xb; y2 = yb; style = s; } public static DrawLine of(final float xa, final float ya, final float xb, final float yb, LineStyle s, final long ord, final float z) { return new DrawLine(xa, ya, xb, yb, s, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { stream.setStrokingColor(style.color()); stream.setLineWidth(style.width()); stream.drawLine(x1, y1, x2, y2); } } private static class FillRect extends PdfItem { private final float x, y, width, height; private final Color color; private FillRect(final float xVal, final float yVal, final float w, final float h, final Color c, final long ord, final float z) { super(ord, z); x = xVal; y = yVal; width = w; height = h; color = c; } public static FillRect of(final float xVal, final float yVal, final float w, final float h, final Color c, final long ord, final float z) { return new FillRect(xVal, yVal, w, h, c, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { stream.setNonStrokingColor(color); stream.fillRect(x, y, width, height); } } static class Text extends PdfItem { public final float x, y; public final String t; public final TextStyle style; private Text(final float xCoord, final float yCoord, final String text, TextStyle s, final long ord, final float z) { super(ord, z); x = xCoord; y = yCoord; t = text; style = s; } public static Text of(final float xCoord, final float yCoord, final String text, TextStyle s, final long ord, final float z) { return new Text(xCoord, yCoord, text, s, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { stream.beginText(); stream.setNonStrokingColor(style.textColor()); stream.setFont(style.font(), style.fontSize()); stream.moveTextPositionByAmount(x, y); stream.drawString(t); stream.endText(); } } private static class DrawPng extends PdfItem { private final float x, y; private final PDPixelMap png; private final ScaledPng scaledPng; // private Log logger = LogFactory.getLog(DrawPng.class); private DrawPng(final float xVal, final float yVal, final ScaledPng sj, final PdfLayoutMgr mgr, final long ord, final float z) { super(ord, z); x = xVal; y = yVal; png = mgr.ensureCached(sj); scaledPng = sj; } public static DrawPng of(final float xVal, final float yVal, final ScaledPng sj, final PdfLayoutMgr mgr, final long ord, final float z) { return new DrawPng(xVal, yVal, sj, mgr, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { // stream.drawImage(png, x, y); XyDim dim = scaledPng.dimensions(); stream.drawXObject(png, x, y, dim.x(), dim.y()); } } private static class DrawJpeg extends PdfItem { private final float x, y; private final PDJpeg jpeg; private final ScaledJpeg scaledJpeg; // private Log logger = LogFactory.getLog(DrawJpeg.class); private DrawJpeg(final float xVal, final float yVal, final ScaledJpeg sj, final PdfLayoutMgr mgr, final long ord, final float z) { super(ord, z); x = xVal; y = yVal; jpeg = mgr.ensureCached(sj); scaledJpeg = sj; } public static DrawJpeg of(final float xVal, final float yVal, final ScaledJpeg sj, final PdfLayoutMgr mgr, final long ord, final float z) { return new DrawJpeg(xVal, yVal, sj, mgr, ord, z); } @Override public void commit(PDPageContentStream stream) throws IOException { // stream.drawImage(jpeg, x, y); XyDim dim = scaledJpeg.dimensions(); stream.drawXObject(jpeg, x, y, dim.x(), dim.y()); } } } private final List<PageBuffer> pages = new ArrayList<PageBuffer>(); private final PDDocument doc; // pages.size() counts the first page as 1, so 0 is the appropriate sentinel value private int unCommittedPageIdx = 0; private final PDColorSpace colorSpace; private final PDRectangle pageSize; List<PageBuffer> pages() { return Collections.unmodifiableList(pages); } private PdfLayoutMgr(PDColorSpace cs, PDRectangle mb) throws IOException { doc = new PDDocument(); colorSpace = cs; pageSize = (mb == null) ? PDPage.PAGE_SIZE_LETTER : mb; } /** Returns a new PdfLayoutMgr with the given color space. @param cs the color-space. @return a new PdfLayoutMgr @throws IOException */ public static PdfLayoutMgr of(PDColorSpace cs) throws IOException { return new PdfLayoutMgr(cs, null); } /** Returns a new PdfLayoutMgr with the given color space and page size. @param cs the color-space. @param pageSize the page size. There are a bunch of presets in org.apache.pdfbox.pdmodel.PDPage like PAGE_SIZE_LETTER, PAGE_SIZE_A1, and PAGE_SIZE_A4. @return a new PdfLayoutMgr @throws IOException */ public static PdfLayoutMgr of(PDColorSpace cs, PDRectangle pageSize) throws IOException { return new PdfLayoutMgr(cs, pageSize); } /** Creates a new PdfLayoutMgr with the PDDeviceRGB color space. @return a new Page Manager with an RGB color space @throws IOException */ @SuppressWarnings("UnusedDeclaration") // Part of end-user public interface public static PdfLayoutMgr newRgbPageMgr() throws IOException { return new PdfLayoutMgr(PDDeviceRGB.INSTANCE, null); } /** Returns the page width given the defined PDRectangle pageSize */ public float pageWidth() { return pageSize.getWidth(); } /** Returns the page height given the defined PDRectangle pageSize */ public float pageHeight() { return pageSize.getHeight(); } /** Returns the correct page for the given value of y. This lets the user use any Y value and we continue extending their canvas downward (negative) by adding extra pages. @param y the un-adjusted y value. @return the proper page and adjusted y value for that page. */ LogicalPage.PageBufferAndY appropriatePage(LogicalPage lp, float y) { if (pages.size() < 1) { throw new IllegalStateException("Cannot work with the any pages until one has been created by calling newPage()."); } int idx = unCommittedPageIdx; // Get the first possible page while (y < lp.yPageBottom()) { // logger.info("Adjusting y. Was: " + y + " about to add " + printAreaHeight); y += lp.printAreaHeight(); // y could even be negative. Just keep moving to the top of the next // page until it's in the printable area. idx++; if (pages.size() <= idx) { pages.add(new PageBuffer(pages.size() + 1)); } } PageBuffer ps = pages.get(idx); return new LogicalPage.PageBufferAndY(ps, y); } /** Call this to commit the PDF information to the underlying stream after it is completely built. */ public void save(OutputStream os) throws IOException, COSVisitorException { doc.save(os); doc.close(); } // TODO: Add logicalPage() method and call pages.add() lazily for the first item actually shown on a page, and logicalPageEnd called before a save. // TODO: Add feature for different paper size or orientation for each group of logical pages. /** Tells this PdfLayoutMgr that you want to start a new logical page (which may be broken across two or more physical pages) in the requested page orientation. */ @SuppressWarnings("UnusedDeclaration") // Part of end-user public interface public LogicalPage logicalPageStart(LogicalPage.Orientation o) { PageBuffer pb = new PageBuffer(pages.size() + 1); pages.add(pb); return LogicalPage.of(this, o); } /** Get a new logical page (which may be broken across two or more physical pages) in Landscape orientation. */ public LogicalPage logicalPageStart() { return logicalPageStart(LogicalPage.Orientation.LANDSCAPE); } // void addLogicalPage(PageBuffer pb) { // pages.add(pb); // } /** Call this when you are through with your current set of pages to commit all pending text and drawing operations. This is the only method that throws an IOException because the purpose of PdfLayoutMgr is to buffer all operations until a page is complete so that it can safely be written to the underlying stream. This method turns the potential pages into real output. Call when you need a page break, or your document is done and you need to write it out. @throws IOException - if there is a failure writing to the underlying stream. */ @SuppressWarnings("UnusedDeclaration") // Part of end-user public interface void logicalPageEnd(LogicalPage lp) throws IOException { // Write out all uncommitted pages. while (unCommittedPageIdx < pages.size()) { PDPage pdPage = new PDPage(pageSize); if (lp.orientation() == LogicalPage.Orientation.LANDSCAPE) { pdPage.setRotation(90); } PDPageContentStream stream = null; try { stream = new PDPageContentStream(doc, pdPage); doc.addPage(pdPage); if (lp.orientation() == LogicalPage.Orientation.LANDSCAPE) { stream.concatenate2CTM(0, 1, -1, 0, lp.pageWidth(), 0); } stream.setStrokingColorSpace(colorSpace); stream.setNonStrokingColorSpace(colorSpace); PageBuffer pb = pages.get(unCommittedPageIdx); pb.commit(stream); lp.commitBorderItems(stream); stream.close(); // Set to null to show that no exception was thrown and no need to close again. stream = null; } finally { // Let it throw an exception if the closing doesn't work. if (stream != null) { stream.close(); } } unCommittedPageIdx++; } } @Override public boolean equals(Object other) { // First, the obvious... if (this == other) { return true; } if (other == null) { return false; } if (!(other instanceof PdfLayoutMgr)) { return false; } // Details... final PdfLayoutMgr that = (PdfLayoutMgr) other; return this.doc.equals(that.doc) && (this.pages.equals(that.pages)); } @Override public int hashCode() { return doc.hashCode() + pages.hashCode(); } // public XyOffset putRect(XyOffset outerTopLeft, XyDim outerDimensions, final Color c) { //// System.out.println("putRect(" + outerTopLeft + " " + outerDimensions + " " + //// Utils.toString(c) + ")"); // putRect(outerTopLeft.x(), outerTopLeft.y(), outerDimensions.x(), outerDimensions.y(), c); // return XyOffset.of(outerTopLeft.x() + outerDimensions.x(), // outerTopLeft.y() - outerDimensions.y()); // } // /** // Puts text on the page. // @param x the x-value of the top-left corner. // @param origY the logical-page Y-value of the top-left corner. // @param cell the cell containing the styling and text to render. // @return the bottom Y-value (logical-page) of the rendered cell. // */ // public float putCell(final float x, float origY, final Cell cell) { // return cell.processRows(x, origY, false, this); // } private static final String ISO_8859_1 = "ISO_8859_1"; private static final String UNICODE_BULLET = "\u2022"; // PDFBox uses an encoding that the PDF spec calls WinAnsiEncoding. The spec says this is // Windows Code Page 1252. // http://en.wikipedia.org/wiki/Windows-1252 // It has a lot in common with ISO-8859-1, but it defines some additional characters such as // the Euro symbol. private static final Map<String,String> utf16ToWinAnsi; static { Map<String,String> tempMap = new HashMap<String,String>(); try { // 129, 141, 143, 144, and 157 are undefined in WinAnsi. // I had mapped A0-FF to 160-255 without noticing that that maps each character to // itself, meaning that Unicode and WinAnsii are the same in that range. // Unicode characters with exact WinAnsi equivalents tempMap.put("\u0152", new String(new byte[]{0,(byte)140},ISO_8859_1)); // OE tempMap.put("\u0153", new String(new byte[]{0,(byte)156},ISO_8859_1)); // oe tempMap.put("\u0160", new String(new byte[]{0,(byte)138},ISO_8859_1)); // S Acron tempMap.put("\u0161", new String(new byte[]{0,(byte)154},ISO_8859_1)); // s acron tempMap.put("\u0178", new String(new byte[]{0,(byte)159},ISO_8859_1)); // Y Diaeresis tempMap.put("\u017D", new String(new byte[]{0,(byte)142},ISO_8859_1)); // Capital Z-caron tempMap.put("\u017E", new String(new byte[]{0,(byte)158},ISO_8859_1)); // Lower-case Z-caron tempMap.put("\u0192", new String(new byte[]{0,(byte)131},ISO_8859_1)); // F with a hook (like jf put together) tempMap.put("\u02C6", new String(new byte[]{0,(byte)136},ISO_8859_1)); // circumflex (up-caret) tempMap.put("\u02DC", new String(new byte[]{0,(byte)152},ISO_8859_1)); // Tilde // Cyrillic letters map to their closest Romanizations according to ISO 9:1995 // http://en.wikipedia.org/wiki/ISO_9 // http://en.wikipedia.org/wiki/A_(Cyrillic) // Cyrillic extensions // 0400 Ѐ Cyrillic capital letter IE WITH GRAVE // ≡ 0415 Е 0300 (left-accent) tempMap.put("\u0400", new String(new byte[]{0,(byte)200},ISO_8859_1)); // 0401 Ё Cyrillic capital letter IO // ≡ 0415 Е 0308 (diuresis) tempMap.put("\u0401", new String(new byte[]{0,(byte)203},ISO_8859_1)); // 0402 Ђ Cyrillic capital letter DJE tempMap.put("\u0402", new String(new byte[]{0,(byte)208},ISO_8859_1)); // 0403 Ѓ Cyrillic capital letter GJE // ≡ 0413 Г 0301 (accent) // Ghe only maps to G-acute, which is not in our charset. // 0404 Є Cyrillic capital letter UKRAINIAN IE tempMap.put("\u0404", new String(new byte[]{0,(byte)202},ISO_8859_1)); // 0405 Ѕ Cyrillic capital letter DZE tempMap.put("\u0405", "S"); // // 0406 І Cyrillic capital letter BYELORUSSIAN- // UKRAINIAN I // → 0049 I latin capital letter i // → 0456 і cyrillic small letter byelorussian- // ukrainian i // → 04C0 Ӏ cyrillic letter palochka tempMap.put("\u0406", new String(new byte[]{0,(byte)204},ISO_8859_1)); // 0407 Ї Cyrillic capital letter YI // ≡ 0406 І 0308 (diuresis) tempMap.put("\u0407", new String(new byte[]{0,(byte)207},ISO_8859_1)); // 0408 Ј Cyrillic capital letter JE // 0409 Љ Cyrillic capital letter LJE // 040A Њ Cyrillic capital letter NJE // 040B Ћ Cyrillic capital letter TSHE // 040C Ќ Cyrillic capital letter KJE // ≡ 041A К 0301 (accent) // 040D Ѝ Cyrillic capital letter I WITH GRAVE // ≡ 0418 И 0300 (accent) // 040E Ў Cyrillic capital letter SHORT U // ≡ 0423 У 0306 (accent) // 040F Џ Cyrillic capital letter DZHE // Basic Russian alphabet // See: http://www.unicode.org/charts/PDF/U0400.pdf // 0410 А Cyrillic capital letter A => Latin A tempMap.put("\u0410", "A"); // 0411 Б Cyrillic capital letter BE => Latin B // → 0183 ƃ latin small letter b with topbar tempMap.put("\u0411", "B"); // 0412 В Cyrillic capital letter VE => Latin V tempMap.put("\u0412", "V"); // 0413 Г Cyrillic capital letter GHE => Latin G tempMap.put("\u0413", "G"); // 0414 Д Cyrillic capital letter DE => Latin D tempMap.put("\u0414", "D"); // 0415 Е Cyrillic capital letter IE => Latin E tempMap.put("\u0415", "E"); // 0416 Ж Cyrillic capital letter ZHE => Z-caron tempMap.put("\u0416", new String(new byte[]{0,(byte)142},ISO_8859_1)); // 0417 З Cyrillic capital letter ZE => Latin Z tempMap.put("\u0417", "Z"); // 0418 И Cyrillic capital letter I => Latin I tempMap.put("\u0418", "I"); // 0419 Й Cyrillic capital letter SHORT I => Latin J // ≡ 0418 И 0306 (a little mark) // The two-character form (reversed N plus the mark) is not supported. tempMap.put("\u0419", "J"); // 041A К Cyrillic capital letter KA => Latin K tempMap.put("\u041A", "K"); // 041B Л Cyrillic capital letter EL => Latin L tempMap.put("\u041B", "L"); // 041C М Cyrillic capital letter EM => Latin M tempMap.put("\u041C", "M"); // 041D Н Cyrillic capital letter EN => Latin N tempMap.put("\u041D", "N"); // 041E О Cyrillic capital letter O => Latin O tempMap.put("\u041E", "O"); // 041F П Cyrillic capital letter PE => Latin P tempMap.put("\u041F", "P"); // 0420 Р Cyrillic capital letter ER => Latin R tempMap.put("\u0420", "R"); // 0421 С Cyrillic capital letter ES => Latin S tempMap.put("\u0421", "S"); // 0422 Т Cyrillic capital letter TE => Latin T tempMap.put("\u0422", "T"); // 0423 У Cyrillic capital letter U => Latin U // → 0478 Ѹ cyrillic capital letter uk // → 04AF ү cyrillic small letter straight u // → A64A Ꙋ cyrillic capital letter monograph uk tempMap.put("\u0423", "U"); tempMap.put("\u0478", "U"); // Is this right? tempMap.put("\u04AF", "U"); // Is this right? tempMap.put("\uA64A", "U"); // Is this right? // 0424 Ф Cyrillic capital letter EF => Latin F tempMap.put("\u0424", "F"); // 0425 Х Cyrillic capital letter HA => Latin H tempMap.put("\u0425", "H"); // 0426 Ц Cyrillic capital letter TSE => Latin C tempMap.put("\u0426", "C"); // 0427 Ч Cyrillic capital letter CHE => Mapping to "Ch" because there is no // C-caron - hope this is the best choice! A also had this as "CH" but some make it // Tch as in Tchaikovsky, really didn't know what to do here. tempMap.put("\u0427", "Ch"); // 0428 Ш Cyrillic capital letter SHA => S-caron tempMap.put("\u0428", new String(new byte[]{0,(byte)138},ISO_8859_1)); // 0429 Щ Cyrillic capital letter SHCHA => Latin "Shch" because there is no // S-circumflex to map it to. Should it go to S-caron like SHA? tempMap.put("\u0429", "Shch"); // 042A Ъ Cyrillic capital letter HARD SIGN => Latin double prime, or in this case, // right double-quote. tempMap.put("\u042A", new String(new byte[]{0,(byte)148},ISO_8859_1)); // 042B Ы Cyrillic capital letter YERU => Latin Y tempMap.put("\u042B", "Y"); // 042C Ь Cyrillic capital letter SOFT SIGN => Latin prime, or in this case, // the right-single-quote. tempMap.put("\u042C", new String(new byte[]{0,(byte)146},ISO_8859_1)); // 042D Э Cyrillic capital letter E => Latin E-grave tempMap.put("\u042D", new String(new byte[]{0,(byte)200},ISO_8859_1)); // 042E Ю Cyrillic capital letter YU => Latin U-circumflex tempMap.put("\u042E", new String(new byte[]{0,(byte)219},ISO_8859_1)); // 042F Я Cyrillic capital letter YA => A-circumflex tempMap.put("\u042F", new String(new byte[]{0,(byte)194},ISO_8859_1)); // 0430 а Cyrillic small letter A tempMap.put("\u0430", "a"); // 0431 б Cyrillic small letter BE tempMap.put("\u0431", "b"); // 0432 в Cyrillic small letter VE tempMap.put("\u0432", "v"); // 0433 г Cyrillic small letter GHE tempMap.put("\u0433", "g"); // 0434 д Cyrillic small letter DE tempMap.put("\u0434", "d"); // 0435 е Cyrillic small letter IE tempMap.put("\u0435", "e"); // 0436 ж Cyrillic small letter ZHE tempMap.put("\u0436", new String(new byte[]{0,(byte)158},ISO_8859_1)); // 0437 з Cyrillic small letter ZE tempMap.put("\u0437", "z"); // 0438 и Cyrillic small letter I tempMap.put("\u0438", "i"); // 0439 й Cyrillic small letter SHORT I // ≡ 0438 и 0306 (accent) tempMap.put("\u0439", "j"); // 043A к Cyrillic small letter KA tempMap.put("\u043A", "k"); // 043B л Cyrillic small letter EL tempMap.put("\u043B", "l"); // 043C м Cyrillic small letter EM tempMap.put("\u043C", "m"); // 043D н Cyrillic small letter EN tempMap.put("\u043D", "n"); // 043E о Cyrillic small letter O tempMap.put("\u043E", "o"); // 043F п Cyrillic small letter PE tempMap.put("\u043F", "p"); // 0440 р Cyrillic small letter ER tempMap.put("\u0440", "r"); // 0441 с Cyrillic small letter ES tempMap.put("\u0441", "s"); // 0442 т Cyrillic small letter TE tempMap.put("\u0442", "t"); // 0443 у Cyrillic small letter U tempMap.put("\u0443", "u"); // 0444 ф Cyrillic small letter EF tempMap.put("\u0444", "f"); // 0445 х Cyrillic small letter HA tempMap.put("\u0445", "h"); // 0446 ц Cyrillic small letter TSE tempMap.put("\u0446", "c"); // 0447 ч Cyrillic small letter CHE - see notes on capital letter. tempMap.put("\u0447", "ch"); // 0448 ш Cyrillic small letter SHA tempMap.put("\u0448", new String(new byte[]{0,(byte)154},ISO_8859_1)); // 0449 щ Cyrillic small letter SHCHA tempMap.put("\u0449", "shch"); // 044A ъ Cyrillic small letter HARD SIGN tempMap.put("\u044A", new String(new byte[]{0,(byte)148},ISO_8859_1)); // 044B ы Cyrillic small letter YERU // → A651 ꙑ cyrillic small letter yeru with back yer tempMap.put("\u044B", "y"); // 044C ь Cyrillic small letter SOFT SIGN // → 0185 ƅ latin small letter tone six // → A64F ꙏ cyrillic small letter neutral yer tempMap.put("\u044C", new String(new byte[]{0,(byte)146},ISO_8859_1)); // 044D э Cyrillic small letter E tempMap.put("\u044D", new String(new byte[]{0,(byte)232},ISO_8859_1)); // 044E ю Cyrillic small letter YU // → A655 ꙕ cyrillic small letter reversed yu tempMap.put("\u044E", new String(new byte[]{0,(byte)251},ISO_8859_1)); tempMap.put("\uA655", new String(new byte[]{0,(byte)251},ISO_8859_1)); // is this right? // 044F я Cyrillic small letter YA => a-circumflex tempMap.put("\u044F", new String(new byte[]{0,(byte)226},ISO_8859_1)); // Cyrillic extensions // 0450 ѐ CYRILLIC SMALL LETTER IE WITH GRAVE // • Macedonian // ≡ 0435 е 0300 $̀ tempMap.put("\u0450", new String(new byte[]{0,(byte)232},ISO_8859_1)); // e-grave => e-grave // 0451 ё CYRILLIC SMALL LETTER IO // • Russian, ... // ≡ 0435 е 0308 $̈ tempMap.put("\u0451", new String(new byte[]{0,(byte)235},ISO_8859_1)); // 0452 ђ CYRILLIC SMALL LETTER DJE // • Serbian // → 0111 đ latin small letter d with stroke tempMap.put("\u0452", new String(new byte[]{0,(byte)240},ISO_8859_1)); // 0453 ѓ CYRILLIC SMALL LETTER GJE - only maps to g-acute, which is not in our charset. // • Macedonian // ≡ 0433 г 0301 $́ // 0454 є CYRILLIC SMALL LETTER UKRAINIAN IE // = Old Cyrillic yest tempMap.put("\u0454", new String(new byte[]{0,(byte)234},ISO_8859_1)); // 0455 ѕ CYRILLIC SMALL LETTER DZE // • Macedonian // → A643 ꙃ cyrillic small letter dzelo tempMap.put("\u0455", "s"); // 0456 CYRILLIC SMALL LETTER BYELORUSSIAN- // UKRAINIAN I // = Old Cyrillic i tempMap.put("\u0456", new String(new byte[]{0,(byte)236},ISO_8859_1)); // 0457 ї CYRILLIC SMALL LETTER YI // • Ukrainian // ≡ 0456 і 0308 $̈ tempMap.put("\u0457", new String(new byte[]{0,(byte)239},ISO_8859_1)); // 0458 ј CYRILLIC SMALL LETTER JE // • Serbian, Azerbaijani, Altay // 0459 љ CYRILLIC SMALL LETTER LJE // • Serbian, Macedonian // → 01C9 lj latin small letter lj // 045A њ CYRILLIC SMALL LETTER NJE // • Serbian, Macedonian // → 01CC nj latin small letter nj // 045B ћ CYRILLIC SMALL LETTER TSHE // • Serbian // → 0107 ć latin small letter c with acute // → 0127 ħ latin small letter h with stroke // → 040B Ћ cyrillic capital letter tshe // → 210F ħ planck constant over two pi // → A649 ꙉ cyrillic small letter djerv // 045C ќ CYRILLIC SMALL LETTER KJE // • Macedonian // ≡ 043A к 0301 $́ // 045D ѝ CYRILLIC SMALL LETTER I WITH GRAVE // • Macedonian, Bulgarian // ≡ 0438 и 0300 $̀ // 045E ў CYRILLIC SMALL LETTER SHORT U // • Byelorussian, Uzbek // ≡ 0443 у 0306 $̆ // 045F џ CYRILLIC SMALL LETTER DZHE // • Serbian, Macedonian, Abkhasian // → 01C6 dž latin small letter dz with caron // Extended Cyrillic // ... // 0490 Ґ CYRILLIC CAPITAL LETTER GHE WITH UPTURN => G ? tempMap.put("\u0490", "G"); // Ghe with upturn // 0491 ґ CYRILLIC SMALL LETTER GHE WITH UPTURN // • Ukrainian tempMap.put("\u0491", "g"); // Other commonly-used unicode characters with exact WinAnsi equivalents tempMap.put("\u2013", new String(new byte[]{0,(byte)150},ISO_8859_1)); // En-dash tempMap.put("\u2014", new String(new byte[]{0,(byte)151},ISO_8859_1)); // Em-dash tempMap.put("\u2018", new String(new byte[]{0,(byte)145},ISO_8859_1)); // Curved single open quote tempMap.put("\u2019", new String(new byte[]{0,(byte)146},ISO_8859_1)); // Curved single close-quote tempMap.put("\u201A", new String(new byte[]{0,(byte)130},ISO_8859_1)); // Low single curved-quote tempMap.put("\u201C", new String(new byte[]{0,(byte)147},ISO_8859_1)); // Curved double open quote tempMap.put("\u201D", new String(new byte[]{0,(byte)148},ISO_8859_1)); // Curved double close-quote tempMap.put("\u201E", new String(new byte[]{0,(byte)132},ISO_8859_1)); // Low right double quote. tempMap.put("\u2020", new String(new byte[]{0,(byte)134},ISO_8859_1)); // Dagger tempMap.put("\u2021", new String(new byte[]{0,(byte)135},ISO_8859_1)); // Double dagger tempMap.put(UNICODE_BULLET, new String(new byte[]{0,(byte)149},ISO_8859_1)); // Bullet - use this as replacement character. tempMap.put("\u2026", new String(new byte[]{0,(byte)133},ISO_8859_1)); // Ellipsis tempMap.put("\u2030", new String(new byte[]{0,(byte)137},ISO_8859_1)); // Permille tempMap.put("\u2039", new String(new byte[]{0,(byte)139},ISO_8859_1)); // Left angle-quote tempMap.put("\u203A", new String(new byte[]{0,(byte)155},ISO_8859_1)); // Right angle-quote tempMap.put("\u20ac", new String(new byte[]{0,(byte)128},ISO_8859_1)); // Euro symbol tempMap.put("\u2122", new String(new byte[]{0,(byte)153},ISO_8859_1)); // Trademark symbol } catch (UnsupportedEncodingException uee) { throw new IllegalStateException("Problem creating translation table due to Unsupported Encoding (coding error)", uee); } utf16ToWinAnsi = Collections.unmodifiableMap(tempMap); } // private static final Pattern whitespacePattern = Pattern.compile("\\p{Z}+"); // What about \u00ba?? // \u00a0-\u00a9 \u00ab-\u00b9 \u00bb-\u00bf \u00d7 \u00f7 private static final Pattern nonAsciiPattern = Pattern.compile("[^\u0000-\u00ff]"); /** <p>PDF files are limited to the 217 characters of Windows-1252 which the PDF spec calls WinAnsi and Java calls ISO-8859-1. This method transliterates the standard Java UTF-16 character representations to their Windows-1252 equivalents where such translation is possible. Any character (e.g. Kanji) which does not have an appropriate substitute in Windows-1252 will be mapped to the bullet character (a round dot).</p> <p>This transliteration covers the modern alphabets of the following languages:<br> Afrikaans (af), Albanian (sq), Basque (eu), Catalan (ca), Danish (da), Dutch (nl), English (en), Faroese (fo), Finnish (fi), French (fr), Galician (gl), German (de), Icelandic (is), Irish (ga), Italian (it), Norwegian (no), Portuguese (pt), Scottish (gd), Spanish (es), Swedish (sv).</p> <p>Romanized substitutions are used for the Cyrillic characters of the modern Russian (ru) alphabet according to ISO 9:1995 with the following phonetic substitutions: 'Ch' for Ч and 'Shch' for Щ.</p> <p>The PdfLayoutMgr calls this method internally whenever it renders text (transliteration has to happen before line breaking), but is available externally in case you wish to use it directly with PDFBox.</p> @param in a string in the standard Java UTF-16 encoding @return a string in Windows-1252 (informally called ISO-8859-1 or WinAnsi) */ public static String convertJavaStringToWinAnsi(String in) { // ByteBuffer bb = StandardCharsets.UTF_16.encode(CharBuffer.wrap(in)); // // then decode those bytes as US-ASCII // return StandardCharsets.ISO_8859_1.decode(bb).toString(); // return java.nio.charset.StandardCharsets.ISO_8859_1.encode(in); Matcher m = nonAsciiPattern.matcher(in); StringBuilder sB = new StringBuilder(); int idx = 0; while (m.find()) { int start = m.start(); // first character of match. if (idx < start) { // Append everything from the last match up to this one. sB.append(in.subSequence(idx, start)); } String s = utf16ToWinAnsi.get(m.group()); // "In WinAnsiEncoding, all unused codes greater than 40 map to the bullet character." // source: PDF spec, Annex D.3 PDFDocEncoding Character Set p. 656 footnote about // WinAnsiEncoding. // // I think the bullet is the closest thing to a "replacement character" in the // WinAnsi character set, so that's what I'll use it for. It looks tons better than // nullnullnull... if (s == null) { s = utf16ToWinAnsi.get(UNICODE_BULLET); } sB.append(s); idx = m.end(); // m.end() is exclusive } if (idx < in.length()) { sB.append(in.subSequence(idx, in.length())); } return sB.toString(); } }