/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.encoding.conversion; import java.util.HashMap; import java.util.Iterator; /** * This class represents PDF encoding name to Java charset name mapping. * * @author Pin Xue (http://www.pinxue.net), Holly Lee (holly.lee (at) gmail.com) * @version $Revision: 1.0 $ */ class CJKEncodings { // Mapping: PDF encoding name -> Java (IANA) charset name private static HashMap charsetMapping = new HashMap(); private CJKEncodings() { } static { // Chinese (Simplified) // Microsoft Code Page 936 (lfCharSet 0x86), GB 2312-80 character set, EUC-CN encoding charsetMapping.put("GB-EUC-H", "GB2312"); // Vertical version of GB-EUC-H charsetMapping.put("GB-EUC-V", "GB2312"); // Mac OS, GB 2312-80 character set, EUC-CN encoding, Script Manager code 19 charsetMapping.put("GBpc-EUC-H", "GB2312"); // Vertical version of GBpc-EUC-H charsetMapping.put("GBpc-EUC-V", "GB2312"); // Microsoft Code Page 936 (lfCharSet 0x86), GBK character set, GBK encoding charsetMapping.put("GBK-EUC-H", "GBK"); // Vertical version of GBK-EUC-H charsetMapping.put("GBK-EUC-V", "GBK"); // Same as GBK-EUC-H but replaces half-width Latin characters with proportional // forms and maps character code 0x24 to a dollar sign ($) instead of a yuan symbol (♀∴) charsetMapping.put("GBKp-EUC-H", "GBK"); // Vertical version of GBKp-EUC-H charsetMapping.put("GBKp-EUC-V", "GBK"); // GB 18030-2000 character set, mixed 1-, 2-, and 4-byte encoding charsetMapping.put("GBK2K-H", "GB18030"); // Vertical version of GBK2K-H charsetMapping.put("GBK2K-V", "GB18030"); // Unicode (UCS-2) encoding for the Adobe-GB1 character collection charsetMapping.put("UniGB-UCS2-H", "ISO-10646-UCS-2"); // Vertical version of UniGB-UCS2-H charsetMapping.put("UniGB-UCS2-V", "ISO-10646-UCS-2"); // Unicode (UTF-16BE) encoding for the Adobe-GB1 character collection; contains mappings // for all characters in the GB18030-2000 character set charsetMapping.put("UniGB-UTF16-H", "UTF-16BE"); // Vertical version of UniGB-UTF16-H charsetMapping.put("UniGB-UTF16-V", "UTF-16BE"); // Chinese (Traditional) // Mac OS, Big Five character set, Big Five encoding, Script Manager code 2 charsetMapping.put("B5pc-H", "BIG5"); // Vertical version of B5pc-H charsetMapping.put("B5pc-V", "BIG5"); // Hong Kong SCS, an extension to the Big Five character set and encoding charsetMapping.put("HKscs-B5-H", "Big5-HKSCS"); // Vertical version of HKscs-B5-H charsetMapping.put("HKscs-B5-V", "Big5-HKSCS"); // Microsoft Code Page 950 (lfCharSet 0x88), Big Five character set with ETen extensions charsetMapping.put("ETen-B5-H", "BIG5"); // Vertical version of ETen-B5-H charsetMapping.put("ETen-B5-V", "BIG5"); // Same as ETen-B5-H but replaces half-width Latin characters with proportional forms charsetMapping.put("ETenms-B5-H", "BIG5"); // Vertical version of ETenms-B5-H charsetMapping.put("ETenms-B5-V", "BIG5"); // CNS 11643-1992 character set, EUC-TW encoding charsetMapping.put("CNS-EUC-H", "HZ"); // Vertical version of CNS-EUC-H charsetMapping.put("CNS-EUC-V", "HZ"); // Unicode (UCS-2) encoding for the Adobe-CNS1 character collection charsetMapping.put("UniCNS-UCS2-H", "ISO-10646-UCS-2"); // Vertical version of UniCNS-UCS2-H charsetMapping.put("UniCNS-UCS2-V", "ISO-10646-UCS-2"); // Unicode (UTF-16BE) encoding for the Adobe-CNS1 character collection; // contains mappings for all the characters in the HKSCS-2001 character set and // contains both 2- and 4- byte character codes charsetMapping.put("UniCNS-UTF16-H", "UTF-16BE"); // Vertical version of UniCNS-UTF16-H charsetMapping.put("UniCNS-UTF16-V", "UTF-16BE"); //Japanese // Mac OS, JIS X 0208 character set with KanjiTalk6 extensions, Shift-JIS encoding, Script Manager code 1 charsetMapping.put("83pv-RKSJ-H", "JIS"); // Microsoft Code Page 932 (lfCharSet 0x80), JIS X 0208 character set with NEC and IBM- extensions charsetMapping.put("90ms-RKSJ-H", "JIS"); // Vertical version of 90ms-RKSJ-H charsetMapping.put("90ms-RKSJ-V", "JIS"); // Same as 90ms-RKSJ-H but replaces half-width Latin characters with proportional forms charsetMapping.put("90msp-RKSJ-H", "JIS"); // Vertical version of 90msp-RKSJ-H charsetMapping.put("90msp-RKSJ-V", "JIS"); // Mac OS, JIS X 0208 character set with KanjiTalk7 extensions, Shift-JIS encoding, Script Manager code 1 charsetMapping.put("90pv-RKSJ-H", "JIS"); // JIS X 0208 character set with Fujitsu FMR extensions, Shift-JIS encoding charsetMapping.put("Add-RKSJ-H", "JIS"); // Vertical version of Add-RKSJ-H charsetMapping.put("Add-RKSJ-V", "JIS"); // JIS X 0208 character set, EUC-JP encoding charsetMapping.put("EUC-H", "JIS"); // Vertical version of EUC-H charsetMapping.put("EUC-V", "JIS"); // JIS C 6226 (JIS78) character set with NEC extensions, Shift-JIS encoding charsetMapping.put("Ext-RKSJ-H", "JIS"); // Vertical version of Ext-RKSJ-H charsetMapping.put("Ext-RKSJ-V", "JIS"); // JIS X 0208 character set, ISO-2022-JP encoding charsetMapping.put("H", "JIS"); // Vertical version of H charsetMapping.put("V", "JIS"); // Unicode (UCS-2) encoding for the Adobe-Japan1 character collection charsetMapping.put("UniJIS-UCS2-H", "ISO-10646-UCS-2"); // Vertical version of UniJIS-UCS2-H charsetMapping.put("UniJIS-UCS2-V", "ISO-10646-UCS-2"); // Same as UniJIS-UCS2-H but replaces proportional Latin characters with half-width forms charsetMapping.put("UniJIS-UCS2-HW-H", "ISO-10646-UCS-2"); // Vertical version of UniJIS-UCS2-HW-H charsetMapping.put("UniJIS-UCS2-HW-V", "ISO-10646-UCS-2"); // Unicode (UTF-16BE) encoding for the Adobe-Japan1 character collection; // contains mappings for all characters in the JIS X 0213:1000 character set charsetMapping.put("UniJIS-UTF16-H", "UTF-16BE"); // Vertical version of UniJIS-UTF16-H charsetMapping.put("UniJIS-UTF16-V", "UTF-16BE"); // JIS X 0208 character set, ISO-2022-JP encoding charsetMapping.put("Identity-H", "JIS"); // Vertical version of H charsetMapping.put("Identity-V", "JIS"); //Korean // KS X 1001:1992 character set, EUC-KR encoding charsetMapping.put("KSC-EUC-H", "KSC"); // Vertical version of KSC-EUC-H charsetMapping.put("KSC-EUC-V", "KSC"); // Microsoft Code Page 949 (lfCharSet 0x81), KS X 1001:1992 character set // plus 8822.putitional hangul, Unified Hangul Code (UHC) encoding charsetMapping.put("KSCms-UHC-H", "KSC"); // Vertical version of KSCms-UHC-H charsetMapping.put("KSCms-UHC-V", "KSC"); // Same as KSCms-UHC-H but replaces proportional Latin characters with half-width forms charsetMapping.put("KSCms-UHC-HW-H", "KSC"); // Vertical version of KSCms-UHC-HW-H charsetMapping.put("KSCms-UHC-HW-V", "KSC"); // Mac OS, KS X 1001:1992 character set with Mac OS KH extensions, Script Manager Code 3 charsetMapping.put("KSCpc-EUC-H", "KSC"); // Unicode (UCS-2) encoding for the Adobe-Korea1 character collection charsetMapping.put("UniKS-UCS2-H", "ISO-10646-UCS-2"); // Vertical version of UniKS-UCS2-H charsetMapping.put("UniKS-UCS2-V", "ISO-10646-UCS-2"); // Unicode (UTF-16BE) encoding for the Adobe-Korea1 character collection charsetMapping.put("UniKS-UTF16-H", "UTF-16BE"); // Vertical version of UniKS-UTF16-H charsetMapping.put("UniKS-UTF16-V", "UTF-16BE"); } /** * Get respective Java charset name from given PDF encoding name. * * @param encoding PDF encoding name * @return Java charset name, or null if not found */ public static final String getCharset( String encoding ) { if ( encoding.startsWith("COSName")) { encoding = encoding.substring(8, encoding.length()-1); } return (String)(charsetMapping.get(encoding)); } /** * Return an iterator to iterate through all encodings. */ public static final Iterator getEncodingIterator() { return charsetMapping.keySet().iterator(); } }