package com.justwayward.reader.view.chmview; import com.justwayward.reader.utils.LogUtils; import java.io.ByteArrayInputStream; import java.io.Closeable; import java.io.EOFException; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.TreeMap; /** * CHM 文件结构 * <p> * 1. Header * - 1.1 ITSF, 4字节 * - 1.2 版本信息, 4字节 * - 1.3 文件头总长度, 4字节 * - 1.4 固定为1, 4字节 * - 1.5 时间记录, 4字节 * - 1.6 windows语言ID标识, 4字节 * - 1.7 两个GUID, 16字节, 固定为{7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC},{7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC} * - 1.8 两项header section, 每项16字节, 记录着从文件头开始的偏移量和section的长度,各占8个字节 * - - 1.8.1 header section 0 * - - - 1.8.1.1 第一双字:0x01fe * - - - 1.8.1.2 第三双字为文件大小 * - - - 1.8.1.3 共占5个双字,其余均为0 * - - 1.8.2 header section 1 * - - - 1.8.2.1 第一双字为ITSP * - - - 1.8.2.2 第二双字为版本号 * - - - 1.8.2.3 第三双字为本section长度 * - - - 1.8.2.4 第四双字为0x0a * - - - 1.8.2.5 第五双字值为0x1000,是目录块的大小 * - - - 1.8.2.6 第六双字是quickref section的“密度”,一般是2 * - - - 1.8.2.7 第七双字是索引树的深度,1表示没有索引,2表示有一层的PMGI数据块 * - - - 1.8.2.8 第八双字表示根索引的块号,如果没有索引为-1 * - - - 1.8.2.9 第九双字是第一个PMGL(listing)的块号 * - - - 1.8.2.A 第十双字是最后一个PMGL的块号 * - - - 1.8.2.B 第十一双字是-1 * - - - 1.8.2.C 第十二双字是目录块的块数 * - - - 1.8.2.D 第十三双字是windows语言ID标识 * - 1.9 8个字节的信息,这些在版本2里是没有的 * <p> * 2. */ public class CHMFile implements Closeable { public static final int CHM_HEADER_LENGTH = 0x60; public static final int CHM_DIRECTORY_HEADER_LENGTH = 0x54; // header info private int version; // 3, 2 private int timestamp; private int lang; // Windows Language ID private long contentOffset; private long fileLength; private int chunkSize; private int quickRef; private int rootIndexChunkNo; private int firstPMGLChunkNo; private int lastPMGLChunkNo; private int totalChunks; private long chunkOffset; RandomAccessFile fileAccess; private Map<String, ListingEntry> entryCache = new TreeMap<String, ListingEntry>(); // level 1 index, <filename, level 2 chunkNo> private List<Map<String, Integer>> indexTree = new ArrayList<Map<String, Integer>>(); private List<String> resources; private String siteMap; private Section[] sections = new Section[]{new Section()}; private String filepath; public CHMFile(String filepath) throws IOException, DataFormatException { int iTemp; fileAccess = new RandomAccessFile(this.filepath = filepath, "r"); /** * Step 1. CHM header */ LEInputStream in = new LEInputStream(createInputStream(0, CHM_HEADER_LENGTH)); // ITSF if (!in.readUTF8(4).equals("ITSF")) { throw new DataFormatException("CHM file should start with \"ITSF\""); } // version if ((version = in.read32()) > 3) { LogUtils.w("CHM header version unexpected value " + version); } // header length int length = in.read32(); // value iTemp = in.read32(); // -1 // timestamp timestamp = in.read32(); // big-endian DWORD? LogUtils.i("CHM timestamp: " + timestamp); // windows language id lang = in.read32(); LogUtils.i("CHM ITSF language: " + WindowsLanguageID.getLocale(lang)); // GUID String strTmp = in.readGUID(); //.equals("7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC"); strTmp = in.readGUID(); //.equals("7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC"); // header section long off0 = in.read64(); long len0 = in.read64(); long off1 = in.read64(); long len1 = in.read64(); // if the header length is really 0x60, read the final QWORD, or the content should be immediate after header section 1 contentOffset = (length >= CHM_HEADER_LENGTH) ? in.read64() : (off1 + len1); LogUtils.i("CHM content offset " + contentOffset); /* Step 1.1 (Optional) CHM header section 0 */ in = new LEInputStream(createInputStream(off0, (int) len0)); // len0 can't exceed 32-bit iTemp = in.read32(); // 0x01FE; iTemp = in.read32(); // 0; if ((fileLength = in.read64()) != fileAccess.length()) { LogUtils.w("CHM file may be corrupted, expect file length " + fileLength); } iTemp = in.read32(); // 0; iTemp = in.read32(); // 0; /** * Step 1.2 CHM header section 1: directory index header */ in = new LEInputStream(createInputStream(off1, CHM_DIRECTORY_HEADER_LENGTH)); if (!in.readUTF8(4).equals("ITSP")) { throw new DataFormatException("CHM directory header should start with \"ITSP\""); } iTemp = in.read32(); // version chunkOffset = off1 + in.read32(); // = 0x54 iTemp = in.read32(); // = 0x0a chunkSize = in.read32(); // 0x1000 quickRef = 1 + (1 << in.read32()); // = 1 + (1 << quickRefDensity ) for (int i = in.read32(); i > 1; i--) { // depth of index tree, 1: no index, 2: one level of PMGI chunks indexTree.add(new TreeMap<String, Integer>()); } rootIndexChunkNo = in.read32(); // chunk number of root, -1: none firstPMGLChunkNo = in.read32(); // chunk number of first PMGL lastPMGLChunkNo = in.read32(); // chunk number of last PMGL iTemp = in.read32(); // = -1 totalChunks = in.read32(); // chunk counts int lang2 = in.read32(); // language code LogUtils.i("CHM ITSP language " + WindowsLanguageID.getLocale(lang2)); strTmp = in.readGUID(); //.equals("5D02926A-212E-11D0-9DF9-00A0-C922-E6EC")) iTemp = in.read32(); // = x54 iTemp = in.read32(); // = -1 iTemp = in.read32(); // = -1 iTemp = in.read32(); // = -1 if (chunkSize * totalChunks + CHM_DIRECTORY_HEADER_LENGTH != len1) { throw new DataFormatException("CHM directory list chunks size mismatch"); } /** * Step 2. CHM name list: content sections */ in = new LEInputStream( getResourceAsStream("::DataSpace/NameList")); iTemp = in.read16(); // length in 16-bit-word, = in.length() / 2 sections = new Section[in.read16()]; for (int i = 0; i < sections.length; i++) { String name = in.readUTF16(in.read16() << 1); if ("Uncompressed".equals(name)) { sections[i] = new Section(); } else if ("MSCompressed".equals(name)) { sections[i] = new LZXCSection(); } else { throw new DataFormatException("Unknown content section " + name); } iTemp = in.read16(); // = null } } /** * Read len bytes from file beginning from offset. Since it's really a * ByteArrayInputStream, close() operation is optional */ private synchronized InputStream createInputStream(long offset, int len) throws IOException { fileAccess.seek(offset); byte[] b = new byte[len]; // TODO performance? fileAccess.readFully(b); return new ByteArrayInputStream(b); } /** * Resovle entry by name, using cache and index */ private ListingEntry resolveEntry(String name) throws IOException { // if (rootIndexChunkNo < 0 && resources == null) // no index // { list(); // force cache fill //} ListingEntry entry = entryCache.get(name); if (entry != null) { return entry; } //error // if (rootIndexChunkNo >= 0 && resources == null) { // entry = resolveIndexedEntry(name, rootIndexChunkNo, 0); // } // // if (entry == null) {// ugly // entry = resolveIndexedEntry(name.toLowerCase(), rootIndexChunkNo, 0); // LogUtils.w("Resolved using lowercase name " + name); // } if (entry == null) { throw new FileNotFoundException(filepath + "#" + name); } return entry; } /** * listing chunks have filename/offset entries sorted by filename * alphabetically index chunks have filename/listingchunk# entries, * specifying the first filename of each listing chunk. NOTE: this code will * crack when there is no index at all (rootIndexChunkNo == -1), so at * processDirectoryIndex() method, we have already cached all resource * names. however, this code will still crack, when resolving a not-at-all * existing resource. */ private synchronized ListingEntry resolveIndexedEntry(String name, int chunkNo, int level) throws IOException { if (chunkNo < 0) { throw new IllegalArgumentException("chunkNo < 0"); } if (level < indexTree.size()) { // no more than indexTreeDepth // process the index chunk Map<String, Integer> index = indexTree.get(level); if (index.isEmpty()) { // load it from the file LEInputStream in = new LEInputStream( createInputStream(chunkOffset + rootIndexChunkNo * chunkSize, chunkSize)); if (!in.readUTF8(4).equals("PMGI")) { throw new DataFormatException("Index Chunk magic mismatch, should be 'PMGI'"); } int freeSpace = in.read32(); // Length of free space and/or quickref area at end of directory chunk // directory index entries, sorted by filename (case insensitive) while (in.available() > freeSpace) { index.put(in.readUTF8(in.readENC()), in.readENC()); } LogUtils.i("Index L" + level + indexTree); } chunkNo = -1; String lastKey = ""; for (Entry<String, Integer> item : index.entrySet()) { if (name.compareTo(item.getKey()) < 0) { if (level + 1 == indexTree.size() // it's the last index && entryCache.containsKey(lastKey)) // if the first entry is cached { return entryCache.get(name); // it should be in the cache, too } break; // we found its chunk, break anyway } lastKey = item.getKey(); chunkNo = item.getValue(); } return resolveIndexedEntry(name, chunkNo, level + 1); } else { // process the listing chunk, and cache entries in the whole chunk LEInputStream in = new LEInputStream( createInputStream(chunkOffset + chunkNo * chunkSize, chunkSize)); if (!in.readUTF8(4).equals("PMGL")) { throw new DataFormatException("Listing Chunk magic mismatch, should be 'PMGL'"); } int freeSpace = in.read32(); // Length of free space and/or quickref area at end of directory chunk in.read32(); // = 0; in.read32(); // previousChunk # in.read32(); // nextChunk # while (in.available() > freeSpace) { ListingEntry entry = new ListingEntry(in); entryCache.put(entry.name, entry); } /* The quickref area is written backwards from the end of the chunk. One quickref entry * exists for every n entries in the file, * where n is calculated as 1 + (1 << quickref density). So for density = 2, n = 5. chunkSize-0002: WORD Number of entries in the chunk chunkSize-0004: WORD Offset of entry n from entry 0 chunkSize-0008: WORD Offset of entry 2n from entry 0 chunkSize-000C: WORD Offset of entry 3n from entry 0 LogUtils.i("resources.size() = " + resources.size()); if ( (in.available() & 1) >0 ) // align to word in.skip(1); while (in.available() > 0) LogUtils.i("chunk " + i + ": " + in.read16()); */ return entryCache.get(name); } } /** * Get an InputStream object for the named resource in the CHM. */ public InputStream getResourceAsStream(String name) throws IOException { name = name.toLowerCase(); if (name == null || name.length() == 0) { name = getSiteMap(); if (name == null) return null; } ListingEntry entry = resolveEntry(name); if (entry == null) { throw new FileNotFoundException(filepath + "#" + name); } Section section = sections[entry.section]; return section.resolveInputStream(entry.offset, entry.length); } /** * Get the name of the resources in the CHM. Caches perform better when * iterate the CHM using order of this returned list. * * @see #resolveIndexedEntry * chunk will be read twice, one in resolveIndexEntry, one here, fix it! */ public synchronized List<String> list() throws IOException { if (resources == null) { // find resources in all listing chunks resources = new ArrayList<String>(); for (int i = firstPMGLChunkNo; i < totalChunks; i++) { LEInputStream in = new LEInputStream( createInputStream(chunkOffset + i * chunkSize, chunkSize)); if (!in.readUTF8(4).equals("PMGL")) { continue; //throw new DataFormatException("Listing Chunk magic mismatch, should be 'PMGL'"); } int freeSpace = in.read32(); // Length of free space and/or quickref area at end of directory chunk in.read32(); // = 0; in.read32(); // previousChunk # in.read32(); // nextChunk # while (in.available() > freeSpace) { ListingEntry entry = new ListingEntry(in); entryCache.put(entry.name, entry); if (entry.name.charAt(0) == '/') { resources.add(entry.name); if (entry.name.endsWith(".hhc")) { // .hhc entry is the navigation file siteMap = entry.name; LogUtils.i("CHM sitemap " + siteMap); } } } } resources = Collections.unmodifiableList(resources); // protect the list, since the reference will be } return resources; } /** * The sitemap file, usually the .hhc file. */ public String getSiteMap() throws IOException { if (resources == null) { list(); } return siteMap; } /** * After close, the object can not be used any more. */ public void close() throws IOException { entryCache = null; sections = null; resources = null; if (fileAccess != null) { fileAccess.close(); fileAccess = null; } } protected void finalize() throws IOException { close(); } class Section { public InputStream resolveInputStream(long off, int len) throws IOException { return createInputStream(contentOffset + off, len); } } class LZXCSection extends Section { long compressedLength; long uncompressedLength; int blockSize; int resetInterval; long[] addressTable; int windowSize; long sectionOffset; LRUCache<Integer, byte[][]> cachedBlocks; public LZXCSection() throws IOException, DataFormatException { // control data LEInputStream in = new LEInputStream( getResourceAsStream("::DataSpace/Storage/MSCompressed/ControlData")); in.read32(); // words following LZXC if (!in.readUTF8(4).equals("LZXC")) { throw new DataFormatException("Must be in LZX Compression"); } in.read32(); // <=2, version resetInterval = in.read32(); // huffman reset interval for blocks windowSize = in.read32() * 0x8000; // usu. 0x10, windows size in 0x8000-byte blocks int cacheSize = in.read32(); // unknown, 0, 1, 2 LogUtils.i("LZX cache size " + cacheSize); cachedBlocks = new LRUCache<Integer, byte[][]>((1 + cacheSize) << 2); in.read32(); // = 0 // reset table in = new LEInputStream( getResourceAsStream("::DataSpace/Storage/MSCompressed/Transform/" + "{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable")); int version = in.read32(); if (version != 2) { LogUtils.w("LZXC version unknown " + version); } addressTable = new long[in.read32()]; in.read32(); // = 8; size of table entry in.read32(); // = 0x28, header length uncompressedLength = in.read64(); compressedLength = in.read64(); blockSize = (int) in.read64(); // 0x8000, do not support blockSize larger than 32-bit integer for (int i = 0; i < addressTable.length; i++) { addressTable[i] = in.read64(); } // init cache // cachedBlocks = new byte[resetInterval][blockSize]; // cachedResetBlockNo = -1; ListingEntry entry = entryCache.get("::DataSpace/Storage/MSCompressed/Content".toLowerCase()); if (entry == null) { throw new DataFormatException("LZXC missing content"); } if (compressedLength != entry.length) { throw new DataFormatException("LZXC content corrupted"); } sectionOffset = contentOffset + entry.offset; } @Override public InputStream resolveInputStream(final long off, final int len) throws IOException { // the input stream ! return new InputStream() { int startBlockNo = (int) (off / blockSize); int startOffset = (int) (off % blockSize); int endBlockNo = (int) ((off + len) / blockSize); int endOffset = (int) ((off + len) % blockSize); // actually start at reset intervals int blockNo = startBlockNo - startBlockNo % resetInterval; Inflater inflater = new Inflater(windowSize); byte[] buf; int pos; int bytesLeft; @Override public int available() throws IOException { return bytesLeft; // not non-blocking available } @Override public void close() throws IOException { inflater = null; } /** * Read the blockNo block, called when bytesLeft == 0 */ private void readBlock() throws IOException { if (blockNo > endBlockNo) { throw new EOFException(); } int cachedNo = blockNo / resetInterval; synchronized (cachedBlocks) { byte[][] cache = cachedBlocks.get(cachedNo); if (cache == null) { if ((cache = cachedBlocks.prune()) == null) // try reuse old caches { cache = new byte[resetInterval][blockSize]; } int resetBlockNo = blockNo - blockNo % resetInterval; for (int i = 0; i < cache.length && resetBlockNo + i < addressTable.length; i++) { int blockNo = resetBlockNo + i; int len = (int) ((blockNo + 1 < addressTable.length) ? (addressTable[blockNo + 1] - addressTable[blockNo]) : (compressedLength - addressTable[blockNo])); LogUtils.i("readBlock " + blockNo + ": " + (sectionOffset + addressTable[blockNo]) + "+ " + len); inflater.inflate(i == 0, // reset flag createInputStream(sectionOffset + addressTable[blockNo], len), cache[i]); // here is the heart } cachedBlocks.put(cachedNo, cache); } if (buf == null) // allocate the buffer { buf = new byte[blockSize]; } System.arraycopy(cache[blockNo % cache.length], 0, buf, 0, buf.length); } // the start block has special pos value pos = (blockNo == startBlockNo) ? startOffset : 0; // the end block has special length bytesLeft = (blockNo < startBlockNo) ? 0 : ((blockNo < endBlockNo) ? blockSize : endOffset); bytesLeft -= pos; blockNo++; } @Override public int read(byte[] b, int off, int len) throws IOException, DataFormatException { if ((bytesLeft <= 0) && (blockNo > endBlockNo)) { return -1; // no more data } while (bytesLeft <= 0) { readBlock(); // re-charge } int togo = Math.min(bytesLeft, len); System.arraycopy(buf, pos, b, off, togo); pos += togo; bytesLeft -= togo; return togo; } @Override public int read() throws IOException { byte[] b = new byte[1]; return (read(b) == 1) ? b[0] & 0xff : -1; } @Override public long skip(long n) throws IOException { LogUtils.w("LZX skip happens: " + pos + "+ " + n); pos += n; // TODO n chould be negative, so do boundary checks! return n; } }; } } class ListingEntry { String name; int section; long offset; int length; public ListingEntry(LEInputStream in) throws IOException { name = in.readUTF8(in.readENC()).toLowerCase(); section = in.readENC(); offset = in.readENC(); length = in.readENC(); } public String toString() { return name + " @" + section + ": " + offset + " + " + length; } } public static void main(String[] argv) throws Exception { if (argv.length == 0) { System.err.println("usage: java " + CHMFile.class.getName() + " <chm file name> (file)*"); System.exit(1); } CHMFile chm = new CHMFile(argv[0]); if (argv.length == 1) { for (String file : chm.list()) { System.out.println(file); } } else { byte[] buf = new byte[1024]; for (int i = 1; i < argv.length; i++) { InputStream in = chm.getResourceAsStream(argv[i]); int c = 0; while ((c = in.read(buf)) >= 0) { System.out.print(new String(buf, 0, c)); } } } chm.close(); } }