package com.tom_roush.pdfbox.pdmodel.font; import com.tom_roush.fontbox.FontBoxFont; import com.tom_roush.fontbox.cff.CFFFont; import com.tom_roush.fontbox.cff.CFFType1Font; import com.tom_roush.fontbox.ttf.OpenTypeFont; import com.tom_roush.fontbox.ttf.TTFParser; import com.tom_roush.fontbox.ttf.TrueTypeFont; import com.tom_roush.fontbox.type1.Type1Font; import com.tom_roush.pdfbox.util.PDFBoxResourceLoader; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.PriorityQueue; import java.util.Set; /** * Font mapper, locates non-embedded fonts via a pluggable FontProvider. * * @author John Hewson */ final class FontMapper { private FontMapper() {} private static final FontCache fontCache = new FontCache(); // todo: static cache isn't ideal private static FontProvider fontProvider; private static Map<String, FontInfo> fontInfoByName; private static final TrueTypeFont lastResortFont; static { try { String ttfName = "com/tom_roush/pdfbox/resources/ttf/LiberationSans-Regular.ttf"; InputStream ttfStream; if(PDFBoxResourceLoader.isReady()) { ttfStream = PDFBoxResourceLoader.getStream(ttfName); if (ttfStream == null) { throw new IOException("Error loading resource: " + ttfStream); } } else { // Fallback URL url = FontMapper.class.getClassLoader().getResource(ttfName); if (url == null) { throw new IOException("Error loading resource: " + ttfName); } ttfStream = url.openStream(); } TTFParser ttfParser = new TTFParser(); lastResortFont = ttfParser.parse(ttfStream); } catch (IOException e) { throw new RuntimeException(e); } } // lazy thread safe singleton private static class DefaultFontProvider { private static final FontProvider INSTANCE = new FileSystemFontProvider(fontCache); } /** * Sets the font service provider. */ public synchronized static void setProvider(FontProvider fontProvider) { FontMapper.fontProvider = fontProvider; fontInfoByName = createFontInfoByName(fontProvider.getFontInfo()); } /** * Returns the font service provider. Defaults to using FileSystemFontProvider. */ public synchronized static FontProvider getProvider() { if (fontProvider == null) { setProvider(DefaultFontProvider.INSTANCE); } return fontProvider; } /** * Returns the font cache associated with this FontMapper. This method is needed by * FontProvider subclasses. */ public static FontCache getFontCache() { return fontCache; } private static Map<String, FontInfo> createFontInfoByName(List<? extends FontInfo> fontInfoList) { Map<String, FontInfo> map = new LinkedHashMap<String, FontInfo>(); for (FontInfo info : fontInfoList) { for (String name : getPostScriptNames(info.getPostScriptName())) { map.put(name, info); } } return map; } /** * Gets alternative names, as seen in some PDFs, e.g. PDFBOX-142. */ private static Set<String> getPostScriptNames(String postScriptName) { Set<String> names = new HashSet<String>(); // built-in PostScript name names.add(postScriptName); // remove hyphens (e.g. Arial-Black -> ArialBlack) names.add(postScriptName.replaceAll("-", "")); return names; } /** * Map of PostScript name substitutes, in priority order. */ private static final Map<String, List<String>> substitutes = new HashMap<String, List<String>>(); static { // substitutes for standard 14 fonts substitutes.put("Courier", Arrays.asList("CourierNew", "CourierNewPSMT", "LiberationMono", "NimbusMonL-Regu", "DroidSansMono")); substitutes.put("Courier-Bold", Arrays.asList("CourierNewPS-BoldMT", "CourierNew-Bold", "LiberationMono-Bold", "NimbusMonL-Bold","DroidSansMono")); substitutes.put("Courier-Oblique", Arrays.asList("CourierNewPS-ItalicMT","CourierNew-Italic", "LiberationMono-Italic", "NimbusMonL-ReguObli","DroidSansMono")); substitutes.put("Courier-BoldOblique", Arrays.asList("CourierNewPS-BoldItalicMT","CourierNew-BoldItalic", "LiberationMono-BoldItalic", "NimbusMonL-BoldObli","DroidSansMono")); substitutes.put("Helvetica", Arrays.asList("ArialMT", "Arial", "LiberationSans", "NimbusSanL-Regu","Roboto-Regular")); substitutes.put("Helvetica-Bold", Arrays.asList("Arial-BoldMT", "Arial-Bold", "LiberationSans-Bold", "NimbusSanL-Bold","Roboto-Bold")); substitutes.put("Helvetica-Oblique", Arrays.asList("Arial-ItalicMT", "Arial-Italic", "Helvetica-Italic", "LiberationSans-Italic", "NimbusSanL-ReguItal", "Roboto-Italic")); substitutes.put("Helvetica-BoldOblique", Arrays.asList("Arial-BoldItalicMT", "Helvetica-BoldItalic", "LiberationSans-BoldItalic", "NimbusSanL-BoldItal","Roboto-BoldItalic")); substitutes.put("Times-Roman", Arrays.asList("TimesNewRomanPSMT", "TimesNewRoman", "TimesNewRomanPS", "LiberationSerif", "NimbusRomNo9L-Regu","DroidSerif-Regular", "Roboto-Regular")); substitutes.put("Times-Bold", Arrays.asList("TimesNewRomanPS-BoldMT", "TimesNewRomanPS-Bold", "TimesNewRoman-Bold", "LiberationSerif-Bold", "NimbusRomNo9L-Medi", "DroidSerif-Bold", "Roboto-Bold")); substitutes.put("Times-Italic", Arrays.asList("TimesNewRomanPS-ItalicMT", "TimesNewRomanPS-Italic", "TimesNewRoman-Italic", "LiberationSerif-Italic", "NimbusRomNo9L-ReguItal","DroidSerif-Italic", "Roboto-Italic")); substitutes.put("Times-BoldItalic", Arrays.asList("TimesNewRomanPS-BoldItalicMT", "TimesNewRomanPS-BoldItalic", "TimesNewRoman-BoldItalic", "LiberationSerif-BoldItalic", "NimbusRomNo9L-MediItal","DroidSerif-BoldItalic", "Roboto-BoldItalic")); substitutes.put("Symbol", Arrays.asList("Symbol", "SymbolMT", "StandardSymL")); substitutes.put("ZapfDingbats", Arrays.asList("ZapfDingbatsITC", "Dingbats", "MS-Gothic")); // TODO: PdfBox-Android load extra fonts? (DroidSerif for times and a symbol font) // Acrobat also uses alternative names for Standard 14 fonts, which we map to those above // these include names such as "Arial" and "TimesNewRoman" for (String baseName : Standard14Fonts.getNames()) { if (!substitutes.containsKey(baseName)) { String mappedName = Standard14Fonts.getMappedFontName(baseName); substitutes.put(baseName, copySubstitutes(mappedName)); } } } /** * Copies a list of font substitutes, adding the original font at the start of the list. */ private static List<String> copySubstitutes(String postScriptName) { return new ArrayList<String>(substitutes.get(postScriptName)); } /** * Adds a top-priority substitute for the given font. * * @param match PostScript name of the font to match * @param replace PostScript name of the font to use as a replacement */ public static void addSubstitute(String match, String replace) { if (!substitutes.containsKey(match)) { substitutes.put(match, new ArrayList<String>()); } substitutes.get(match).add(replace); } /** * Returns the substitutes for a given font. */ private static List<String> getSubstitutes(String postScriptName) { List<String> subs = substitutes.get(postScriptName.replaceAll(" ", "")); if (subs != null) { return subs; } else { return Collections.emptyList(); } } /** * Attempts to find a good fallback based on the font descriptor. */ private static String getFallbackFontName(PDFontDescriptor fontDescriptor) { String fontName; if (fontDescriptor != null) { // heuristic detection of bold boolean isBold = false; String name = fontDescriptor.getFontName(); if (name != null) { String lower = fontDescriptor.getFontName().toLowerCase(); isBold = lower.contains("bold") || lower.contains("black") || lower.contains("heavy"); } // font descriptor flags should describe the style if (fontDescriptor.isFixedPitch()) { fontName = "Courier"; if (isBold && fontDescriptor.isItalic()) { fontName += "-BoldOblique"; } else if (isBold) { fontName += "-Bold"; } else if (fontDescriptor.isItalic()) { fontName += "-Oblique"; } } else if (fontDescriptor.isSerif()) { fontName = "Times"; if (isBold && fontDescriptor.isItalic()) { fontName += "-BoldItalic"; } else if (isBold) { fontName += "-Bold"; } else if (fontDescriptor.isItalic()) { fontName += "-Italic"; } else { fontName += "-Roman"; } } else { fontName = "Helvetica"; if (isBold && fontDescriptor.isItalic()) { fontName += "-BoldOblique"; } else if (isBold) { fontName += "-Bold"; } else if (fontDescriptor.isItalic()) { fontName += "-Oblique"; } } } else { // if there is no FontDescriptor then we just fall back to Times Roman fontName = "Times-Roman"; } return fontName; } /** * Finds a TrueType font with the given PostScript name, or a suitable substitute, or null. * * @param fontDescriptor FontDescriptor */ public static FontMapping<TrueTypeFont> getTrueTypeFont(String baseFont, PDFontDescriptor fontDescriptor) { TrueTypeFont ttf = (TrueTypeFont) findFont(FontFormat.TTF, baseFont); if (ttf != null) { return new FontMapping<TrueTypeFont>(ttf, false); } else { // fallback - todo: i.e. fuzzy match String fontName = getFallbackFontName(fontDescriptor); ttf = (TrueTypeFont) findFont(FontFormat.TTF, fontName); if (ttf == null) { // we have to return something here as TTFs aren't strictly required on the system ttf = lastResortFont; } return new FontMapping<TrueTypeFont>(ttf, true); } } /** * Finds a font with the given PostScript name, or a suitable substitute, or null. This allows * any font to be substituted with a PFB, TTF or OTF. * * @param fontDescriptor the FontDescriptor of the font to find */ public static FontMapping<FontBoxFont> getFontBoxFont(String baseFont, PDFontDescriptor fontDescriptor) { FontBoxFont font = findFontBoxFont(baseFont); if (font != null) { return new FontMapping<FontBoxFont>(font, false); } else { // fallback - todo: i.e. fuzzy match String fallbackName = getFallbackFontName(fontDescriptor); font = findFontBoxFont(fallbackName); if (font == null) { // we have to return something here as TTFs aren't strictly required on the system font = lastResortFont; } return new FontMapping<FontBoxFont>(font, true); } } /** * Finds a font with the given PostScript name, or a suitable substitute, or null. * * @param postScriptName PostScript font name */ private static FontBoxFont findFontBoxFont(String postScriptName) { Type1Font t1 = (Type1Font) findFont(FontFormat.PFB, postScriptName); if (t1 != null) { return t1; } CFFFont cff = (CFFFont) findFont(FontFormat.OTF, postScriptName); if (cff instanceof CFFType1Font) { return cff; } TrueTypeFont ttf = (TrueTypeFont) findFont(FontFormat.TTF, postScriptName); if (ttf != null) { return ttf; } return null; } /** * Finds a font with the given PostScript name, or a suitable substitute, or null. * * @param postScriptName PostScript font name */ private static FontBoxFont findFont(FontFormat format, String postScriptName) { // handle damaged PDFs, see PDFBOX-2884 if (postScriptName == null) { return null; } // make sure the font provider is initialized if (fontProvider == null) { getProvider(); } // first try to match the PostScript name FontInfo info = getFont(format, postScriptName); if (info != null) { return info.getFont(); } // remove hyphens (e.g. Arial-Black -> ArialBlack) info = getFont(format, postScriptName.replaceAll("-", "")); if (info != null) { return info.getFont(); } // then try named substitutes for (String substituteName : getSubstitutes(postScriptName)) { info = getFont(format, substituteName); if (info != null) { return info.getFont(); } } // then try converting Windows names e.g. (ArialNarrow,Bold) -> (ArialNarrow-Bold) info = getFont(format, postScriptName.replaceAll(",", "-")); if (info != null) { return info.getFont(); } // no matches return null; } /** * Finds the named font with the given format. */ private static FontInfo getFont(FontFormat format, String postScriptName) { // strip subset tag (happens when we substitute a corrupt embedded font, see PDFBOX-2642) if (postScriptName.contains("+")) { postScriptName = postScriptName.substring(postScriptName.indexOf('+') + 1); } // look up the PostScript name FontInfo info = fontInfoByName.get(postScriptName); if (info != null && info.getFormat() == format) { return info; } return null; } /** * Finds a CFF CID-Keyed font with the given PostScript name, or a suitable substitute, or null. * This method can also map CJK fonts via their CIDSystemInfo (ROS). * * @param fontDescriptor FontDescriptor * @param cidSystemInfo the CID system info, e.g. "Adobe-Japan1", if any. */ public static CIDFontMapping getCIDFont(String baseFont, PDFontDescriptor fontDescriptor, PDCIDSystemInfo cidSystemInfo) { // try name match or substitute with OTF OpenTypeFont otf1 = (OpenTypeFont) findFont(FontFormat.OTF, baseFont); if (otf1 != null) { return new CIDFontMapping(otf1, null, false); } // try name match or substitute with TTF TrueTypeFont ttf = (TrueTypeFont) findFont(FontFormat.TTF, baseFont); if (ttf != null) { return new CIDFontMapping(null, ttf, false); } if (cidSystemInfo != null) { // "In Acrobat 3.0.1 and later, Type 0 fonts that use a CMap whose CIDSystemInfo // dictionary defines the Adobe-GB1, Adobe-CNS1 Adobe-Japan1, or Adobe-Korea1 character // collection can also be substituted." - Adobe Supplement to the ISO 32000 String collection = cidSystemInfo.getRegistry() + "-" + cidSystemInfo.getOrdering(); if (collection.equals("Adobe-GB1") || collection.equals("Adobe-CNS1") || collection.equals("Adobe-Japan1") || collection.equals("Adobe-Korea1")) { // try automatic substitutes via character collection PriorityQueue<FontMatch> queue = getFontMatches(fontDescriptor, cidSystemInfo); FontMatch bestMatch = queue.poll(); if (bestMatch != null) { FontBoxFont font = bestMatch.info.getFont(); if (font instanceof OpenTypeFont) { return new CIDFontMapping((OpenTypeFont) font, null, true); } else { return new CIDFontMapping(null, font, true); } } } } // last-resort fallback return new CIDFontMapping(null, lastResortFont, true); } /** * Returns a list of matching fonts, scored by suitability. Positive scores indicate matches * for certain attributes, while negative scores indicate mismatches. Zero scores are neutral. * * @param fontDescriptor FontDescriptor, always present. * @param cidSystemInfo Font's CIDSystemInfo, may be null. */ private static PriorityQueue<FontMatch> getFontMatches(PDFontDescriptor fontDescriptor, PDCIDSystemInfo cidSystemInfo) { PriorityQueue<FontMatch> queue = new PriorityQueue<FontMatch>(20); for (FontInfo info : fontInfoByName.values()) { // filter by CIDSystemInfo, if given if (cidSystemInfo != null && !isCharSetMatch(cidSystemInfo, info)) { continue; } FontMatch match = new FontMatch(info); // Panose is the most reliable if (fontDescriptor.getPanose() != null && info.getPanose() != null) { PDPanoseClassification panose = fontDescriptor.getPanose().getPanose(); if (panose.getFamilyKind() == info.getPanose().getFamilyKind()) { // serifs if (panose.getSerifStyle() == info.getPanose().getSerifStyle()) { // exact match match.score += 2; } else if (panose.getSerifStyle() >= 2 && panose.getSerifStyle() <= 5 && info.getPanose().getSerifStyle() >= 2 && info.getPanose().getSerifStyle() <= 5) { // cove (serif) match.score += 1; } else if (panose.getSerifStyle() >= 11 && panose.getSerifStyle() <= 13 && info.getPanose().getSerifStyle() >= 11 && info.getPanose().getSerifStyle() <= 13) { // sans-serif match.score += 1; } else if (panose.getSerifStyle() != 0 && info.getPanose().getSerifStyle() != 0) { // mismatch match.score -= 1; } // weight int weight = info.getPanose().getWeight(); int weightClass = info.getWeightClassAsPanose(); if (Math.abs(weight - weightClass) > 2) { // inconsistent data in system font, usWeightClass wins weight = weightClass; } if (panose.getWeight() == weight) { // exact match match.score += 2; } else if (panose.getWeight() > 1 && weight > 1) { float dist = Math.abs(panose.getWeight() - weight); match.score += 1 - dist * 0.5; } // todo: italic // ... } } else if (fontDescriptor.getFontWeight() > 0 && info.getWeightClass() > 0) { // usWeightClass is pretty reliable float dist = Math.abs(fontDescriptor.getFontWeight() - info.getWeightClass()); match.score += 1 - (dist / 100) * 0.5; } // todo: italic // ... queue.add(match); } return queue; } /** * Returns true if the character set described by CIDSystemInfo is present in the given font. * Only applies to Adobe-GB1, Adobe-CNS1, Adobe-Japan1, Adobe-Korea1, as per the PDF spec. */ private static boolean isCharSetMatch(PDCIDSystemInfo cidSystemInfo, FontInfo info) { if (info.getCIDSystemInfo() != null) { return info.getCIDSystemInfo().getRegistry().equals(cidSystemInfo.getRegistry()) && info.getCIDSystemInfo().getOrdering().equals(cidSystemInfo.getOrdering()); } else { long codePageRange = info.getCodePageRange(); long JIS_JAPAN = 1 << 17; long CHINESE_SIMPLIFIED = 1 << 18; long KOREAN_WANSUNG = 1 << 19; long CHINESE_TRADITIONAL = 1 << 20; long KOREAN_JOHAB = 1 << 21; if (cidSystemInfo.getOrdering().equals("GB1") && (codePageRange & CHINESE_SIMPLIFIED) == CHINESE_SIMPLIFIED) { return true; } else if (cidSystemInfo.getOrdering().equals("CNS1") && (codePageRange & CHINESE_TRADITIONAL) == CHINESE_TRADITIONAL) { return true; } else if (cidSystemInfo.getOrdering().equals("Japan1") && (codePageRange & JIS_JAPAN) == JIS_JAPAN) { return true; } else { return cidSystemInfo.getOrdering().equals("Korea1") && (codePageRange & KOREAN_WANSUNG) == KOREAN_WANSUNG || (codePageRange & KOREAN_JOHAB) == KOREAN_JOHAB; } } } /** * A potential match for a font substitution. */ private static class FontMatch implements Comparable<FontMatch> { double score; final FontInfo info; FontMatch(FontInfo info) { this.info = info; } @Override public int compareTo(FontMatch match) { return Double.compare(match.score, this.score); } } /** * For debugging. Prints all matches and returns the best match. */ private static FontMatch printMatches(PriorityQueue<FontMatch> queue) { FontMatch bestMatch = queue.peek(); System.out.println("-------"); while (!queue.isEmpty()) { FontMatch match = queue.poll(); FontInfo info = match.info; System.out.println(match.score + " | " + info.getMacStyle() + " " + info.getFamilyClass() + " " + info.getPanose() + " " + info.getCIDSystemInfo() + " " + info.getPostScriptName() + " " + info.getFormat()); } System.out.println("-------"); return bestMatch; } }