/* * Copyright (c) 2013 Allogy Interactive. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.hsl.txtreader; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import com.sun.pdfview.PDFCMap; import com.sun.pdfview.PDFGlyph; import com.sun.pdfview.PDFObject; /** * The PDFFont encoding encapsulates the mapping from character codes * in the PDF document to glyphs of the font. * * Encodings take two basic forms. For Type1, TrueType, and Type3 fonts, * the encoding maps from character codes to Strings, which represent the * glyphs of the font. For Type0 fonts, the mapping is a CMap which maps * character codes to characters in one of many descendant fonts. * * Note that the data in the PDF might be ASCII characters (bytes) or it might * be a multi-byte format such as unicode. For now we will assume all * glyph ids fit into at most the two bytes of a character. */ public class PDFFontEncoding { /** Encoding types */ private static final int TYPE_ENCODING = 0; private static final int TYPE_CMAP = 1; /** * the base encoding (an array of integers which can be mapped to names * using the methods on FontSupport */ private int[] baseEncoding; /** any differences from the base encoding */ private Map<Character,String> differences; /** * a CMap for fonts encoded by CMap */ private PDFCMap cmap; /** * the type of this encoding (encoding or CMap) */ private int type; /** Creates a new instance of PDFFontEncoding */ public PDFFontEncoding(String fontType, PDFObject encoding) throws IOException { if (encoding.getType() == PDFObject.NAME) { // if the encoding is a String, it is the name of an encoding // or the name of a CMap, depending on the type of the font if (fontType.equals("Type0")) { type = TYPE_CMAP; cmap = PDFCMap.getCMap(encoding.getStringValue()); } else { type = TYPE_ENCODING; differences = new HashMap<Character,String>(); baseEncoding = this.getBaseEncoding(encoding.getStringValue()); } } else { // loook at the "Type" entry of the encoding to determine the type String typeStr = encoding.getDictRef("Type").getStringValue(); if (typeStr.equals("Encoding")) { // it is an encoding type = TYPE_ENCODING; parseEncoding(encoding); } else if (typeStr.equals("CMap")) { // it is a CMap type = TYPE_CMAP; cmap = PDFCMap.getCMap(encoding); } else { throw new IllegalArgumentException("Uknown encoding type: " + type); } } } /** Get the glyphs associated with a given String */ public List<PDFGlyph> getGlyphs(PDFFont font, String text) { List<PDFGlyph> outList = new ArrayList<PDFGlyph>(text.length()); // go character by character through the text char[] arry = text.toCharArray(); for (int i = 0; i < arry.length; i++) { switch (type) { case TYPE_ENCODING: outList.add(getGlyphFromEncoding(font, arry[i])); break; case TYPE_CMAP: // 2 bytes -> 1 character in a CMap char c = (char)((arry[i] & 0xff) << 8); if (i < arry.length - 1) { c |= (char)(arry[++i] & 0xff); } outList.add(getGlyphFromCMap(font, c)); break; } } return outList; } // txtReader.PDF Port public String translateString(String text) { StringBuffer unicodeStr = new StringBuffer(); // go character by character through the text char[] arry = text.toCharArray(); switch (type) { case TYPE_ENCODING: for (char ch : arry) { unicodeStr.append(getDecodeChar(ch)); } break; case TYPE_CMAP: for (int i = 0; i < arry.length; i++) { // 2 bytes -> 1 character in a CMap char c = (char)((arry[i] & 0xff) << 8); if (i < arry.length - 1) { c |= (char)(arry[++i] & 0xff); } unicodeStr.append(getDecodeChar(c)); } break; } return unicodeStr.toString(); } // txtReader.PDF Port public char getDecodeChar(char src) { // see if this character is in the differences list try { String charName = differences.get(src); int idx = FontSupport.findName(charName, FontSupport.stdNames); if (idx != -1) { return (char) idx; } } catch (Exception ex) { if (baseEncoding != null) { // only deal with one byte of source src &= 0xff; // get the character name from the base encoding int charID = baseEncoding[src]; return (char) charID; } } return src; } /** * Get a glyph from an encoding, given a font and character */ private PDFGlyph getGlyphFromEncoding(PDFFont font, char src) { String charName = null; // only deal with one byte of source src &= 0xff; // see if this character is in the differences list if (differences.containsKey(new Character(src))) { charName = (String) differences.get(new Character(src)); } else if (baseEncoding != null) { // get the character name from the base encoding int charID = baseEncoding[src]; charName = FontSupport.getName(charID); } return font.getCachedGlyph(src, charName); } /** * Get a glyph from a CMap, given a Type0 font and a character */ private PDFGlyph getGlyphFromCMap(PDFFont font, char src) { int fontID = cmap.getFontID(src); char charID = cmap.map(src); /* if (font instanceof Type0Font) { font = ((Type0Font) font).getDescendantFont(fontID); } */ return font.getCachedGlyph(charID, null); } /** * Parse a PDF encoding object for the actual encoding */ public void parseEncoding(PDFObject encoding) throws IOException { differences = new HashMap<Character,String>(); // figure out the base encoding, if one exists PDFObject baseEncObj = encoding.getDictRef("BaseEncoding"); if (baseEncObj != null) { baseEncoding = getBaseEncoding(baseEncObj.getStringValue()); } // parse the differences array PDFObject diffArrayObj = encoding.getDictRef("Differences"); if (diffArrayObj != null) { PDFObject[] diffArray = diffArrayObj.getArray(); int curPosition = -1; for (int i = 0; i < diffArray.length; i++) { if (diffArray[i].getType() == PDFObject.NUMBER) { curPosition = diffArray[i].getIntValue(); } else if (diffArray[i].getType() == PDFObject.NAME) { Character key = new Character((char) curPosition); differences.put(key, diffArray[i].getStringValue()); curPosition++; } else { throw new IllegalArgumentException("Unexpected type in diff array: " + diffArray[i]); } } } } /** Get the base encoding for a given name */ private int[] getBaseEncoding(String encodingName) { if (encodingName.equals("MacRomanEncoding")) { return FontSupport.macRomanEncoding; } else if (encodingName.equals("MacExpertEncoding")) { return FontSupport.type1CExpertCharset; } else if (encodingName.equals("WinAnsiEncoding")) { return FontSupport.winAnsiEncoding; } else { throw new IllegalArgumentException("Unknown encoding: " + encodingName); } } }