/* * Copyright (c) 2013 Allogy Interactive. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.hsl.txtreader; import java.io.IOException; import java.nio.ByteBuffer; import java.util.HashMap; import android.util.Log; import com.sun.pdfview.PDFCMap; import com.sun.pdfview.PDFObject; public class PDFToUnicodeCMap extends PDFCMap { private static final String TAG_CODESPACE_RANGE = "begincodespacerange"; private static final String TAG_BASE_FONT_CHAR = "beginbfchar"; private static final String TAG_BASE_FONT_RANGE = "beginbfrange"; private HashMap<Character, Character> mUnicodeCMap; private int mCharByteNo; private int mCharIndex; private ByteBuffer mCharByteBuffer; private Object mPreToken; private Object mToken; private ByteBuffer mByteBuf; public PDFToUnicodeCMap(PDFObject mapObj) { super(); mUnicodeCMap = new HashMap<Character, Character>(); mCharByteNo = 1; mCharIndex = 0; mCharByteBuffer = ByteBuffer.allocate(4); mPreToken = null; mToken = null; //parse the CMap try { mByteBuf = ByteBuffer.wrap(mapObj.getStream()); //Log.i("PDFToUnicodeCMap", new String(mapObj.getStream())); } catch (IOException e) { e.printStackTrace(); return; } while ((mToken = nextToken(mByteBuf)) != null) { if (mToken instanceof String) { if (TAG_CODESPACE_RANGE.equals(mToken)) { codespaceRange((Integer)mPreToken); } else if (TAG_BASE_FONT_RANGE.equals(mToken)) { baseFontRange((Integer)mPreToken); } else if (TAG_BASE_FONT_CHAR.equals(mToken)) { baseFontChar((Integer)mPreToken); } } mPreToken = mToken; } } private void codespaceRange(int range) { for (int i=0; i<range; i++) { int startRange = (Integer)nextToken(mByteBuf); int endRange = (Integer)nextToken(mByteBuf); if (endRange>255) { mCharByteNo = 2; /* Log.i("PDFToUnicodeCMap", "startRange:"+Integer.toHexString(startRange)+ " endRange:"+Integer.toHexString(endRange)); */ } } } private void baseFontRange(int range) { for (int i=0; i<range; i++) { int startCode = (Integer) nextToken(mByteBuf); int endCode = (Integer) nextToken(mByteBuf); int mappedCode = (Integer) nextToken(mByteBuf); for (int j=startCode; j<=endCode; j++) { int orgCode = translateUnsignedChar(j); mUnicodeCMap.put((char) orgCode, (char) mappedCode); mappedCode++; //Log.i("PDFToUnicodeCMap", "mUnicodeCMap:"+Integer.toHexString(orgCode)+ // ", "+Integer.toHexString(mappedCode)); } } } private void baseFontChar(int no) { for (int i=0; i<no; i++) { int orgCode = translateUnsignedChar((Integer) nextToken(mByteBuf)); int mappedCode = (Integer) nextToken(mByteBuf); mUnicodeCMap.put((char) orgCode , (char) mappedCode); //Log.i("PDFToUnicodeCMap", "mUnicodeCMap:"+Integer.toHexString(orgCode)+ // ", "+Integer.toHexString(mappedCode)); } } // workaround for unsigned Char turns into negative signed short problem private int translateUnsignedChar(int orgCode) { if (mCharByteNo==1 && orgCode > 127) { mCharByteBuffer.position(0); mCharByteBuffer.putInt(orgCode); mCharByteBuffer.put(2, (byte) 0xff); mCharByteBuffer.position(0); orgCode = mCharByteBuffer.asCharBuffer().get(1); } return orgCode; } //to parse the next token, return null when reaching the end private Object nextToken(ByteBuffer bBuf) { Object retObj = null; byte nextByte = 0; try { nextByte = nextByte(bBuf); } catch (Exception ex) { return null; } if (nextByte == '%') { StringBuffer buffer = new StringBuffer(); readLine(bBuf, buffer); retObj = buffer.toString(); } else if (nextByte == '<') { int theNextByte = nextByte(bBuf); //Don't need to deal with << stuff if (theNextByte == '<') { retObj = "<<"; } else { bBuf.position(bBuf.position()-1); retObj = readHexNumber(bBuf); theNextByte = nextByte(bBuf); if (theNextByte != '>') { Log.e("toUnicode", "Error: expected the end of a dictionary."); } } } else if (nextByte >= '0' && nextByte <= '9') { StringBuffer buffer = new StringBuffer(); buffer.append((char)nextByte); nextByte = bBuf.get(); while (nextByte >= '0' && nextByte <= '9') { buffer.append((char)nextByte); nextByte = bBuf.get(); } retObj = new Integer(buffer.toString()); } else if (nextByte == -1) { retObj = null; } else { StringBuffer strBuf = new StringBuffer(); strBuf.append((char) nextByte); try { readString(bBuf, strBuf); } catch (Exception ex) { retObj = null; } retObj = strBuf.toString(); } //Log.i("PDFToUnicodeCMap", "nextToken: "+retObj); return retObj; } // read nextByte without spacing chars private byte nextByte(ByteBuffer bBuf) { byte nextByte; do nextByte = bBuf.get(); while (nextByte == 0x09 || nextByte == 0x0A || nextByte == 0x0D || nextByte == 0x20); return nextByte; } private int readHexNumber(ByteBuffer bBuf) { int retval = 0; byte nextByte = 0; nextByte = nextByte(bBuf); StringBuffer buffer = new StringBuffer(); while ((nextByte >= '0' && nextByte <= '9') || (nextByte >= 'a' && nextByte <= 'f') || (nextByte >= 'A' && nextByte <= 'F')) { buffer.append((char)nextByte); nextByte = bBuf.get(); } bBuf.position(bBuf.position()-1); retval = Integer.parseInt(buffer.toString(),16); return retval; } private void readLine(ByteBuffer bBuf, StringBuffer strBuf) { int nextByte; while (true) { nextByte = bBuf.get(); if (nextByte != 0x0A && nextByte != 0x0D && nextByte != -1) { strBuf.append((char)nextByte); } else { break; } } } private void readString(ByteBuffer bBuf, StringBuffer strBuf) { int nextByte; while (true) { nextByte = bBuf.get(); if (nextByte != 0x0A && nextByte != 0x0D && nextByte != -1 && nextByte != 0x20) { strBuf.append((char)nextByte); } else { break; } } } @Override public char map(char src) { if (mCharByteNo == 2) { if (mCharIndex == 0) { mCharByteBuffer.put(0, (byte) src); mCharIndex = 1; return PDFTextFormat.NULL_CHAR; } else { mCharIndex = 0; mCharByteBuffer.put(1, (byte) src); return mUnicodeCMap.get(mCharByteBuffer.asCharBuffer().get(0)); } } else { if (mUnicodeCMap.size() > 0) { return mUnicodeCMap.get(src); } } return src; } }