/**
 * Copyright (C) 2007 Rui Shen (rui.shen@gmail.com) All Right Reserved
 * File    : CHMFile.java
 * Created : 2007-3-1
 * ****************************************************************************
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307, USA.
 * ****************************************************************************
 */
package org.geometerplus.fbreader.formats.chm;

import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.logging.Logger;

/**
 * CHMFile - random-access reader for Microsoft Compiled HTML Help (.chm) files.
 *
 * @author Rui Shen
 * @see http://www.kyz.uklinux.net/libmspack/doc/structmschmd__header.html
 * @see http://www.nongnu.org/chmspec/latest/Internal.html
 */
public class CHMFile implements Closeable {

	public static final int CHM_HEADER_LENGTH = 0x60;
	public static final int CHM_DIRECTORY_HEADER_LENGTH = 0x54;

	// private static Logger log = Logger.getLogger(CHMFile.class.getName());

	// header info
	private int version; // 3, 2
	private int timestamp;
	public int lang; // Windows Language ID
	private long contentOffset;
	private long fileLength;
	private int chunkSize;
	private int quickRef;
	private int rootIndexChunkNo;
	private int firstPMGLChunkNo;
	private int lastPMGLChunkNo;
	private int totalChunks;
	private long chunkOffset;

	RandomAccessFile fileAccess;
	private Map<String, ListingEntry> entryCache = new TreeMap<String, ListingEntry>();
	// level 1 index, <filename, level 2 chunkNo>
	private List<Map<String, Integer>> indexTree = new ArrayList<Map<String, Integer>>();
	private List<String> resources;
	public hhctree contents = new hhctree();
	private String siteMap;
	private Section[] sections = new Section[] { new Section() }; // for section 0
	private String filepath;
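
	/*
	 * Typical usage (an illustrative sketch only; "sample.chm" and the entry
	 * name below are placeholders, not files shipped with this code):
	 *
	 *   CHMFile chm = new CHMFile("sample.chm");
	 *   for (String name : chm.list())
	 *       System.out.println(name);
	 *   InputStream in = chm.getResourceAsStream("/index.html");
	 *   // ... read the entry ...
	 *   chm.close();
	 */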
	/**
	 * We need random access to the source file.
	 */
	public CHMFile(String filepath) throws IOException, DataFormatException {
		fileAccess = new RandomAccessFile(this.filepath = filepath, "r");

		/* Step 1. CHM header */
		// The header length is 0x60 (96)
		LEInputStream in = new LEInputStream(createInputStream(0, CHM_HEADER_LENGTH));
		if (!in.readUTF8(4).equals("ITSF"))
			throw new DataFormatException("CHM file should start with \"ITSF\"");
		if ((version = in.read32()) > 3)
			System.out.println("CHM header version unexpected value " + version);
		int length = in.read32();
		in.read32(); // -1
		timestamp = in.read32(); // big-endian DWORD?
		// log.info("CHM timestamp " + new Date(timestamp));
		lang = in.read32();
		System.out.println("CHM ITSF language " + WindowsLanguageID.getLocale(lang));
		in.readGUID(); // .equals("7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC");
		in.readGUID(); // .equals("7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC");
		long off0 = in.read64();
		long len0 = in.read64();
		long off1 = in.read64();
		long len1 = in.read64();
		// if the header length is really 0x60, read the final QWORD,
		// otherwise the content starts immediately after header section 1
		contentOffset = (length >= CHM_HEADER_LENGTH) ? in.read64() : (off1 + len1);
		// log.fine("CHM content offset " + contentOffset);

		/* Step 1.1 (Optional) CHM header section 0 */
		in = new LEInputStream(createInputStream(off0, (int) len0)); // len0 can't exceed 32-bit
		in.read32(); // 0x01FE;
		in.read32(); // 0;
		if ((fileLength = in.read64()) != fileAccess.length())
			System.out.println("CHM file may be corrupted, expected file length " + fileLength);
		in.read32(); // 0;
		in.read32(); // 0;

		/* Step 1.2 CHM header section 1: directory index header */
		in = new LEInputStream(createInputStream(off1, CHM_DIRECTORY_HEADER_LENGTH));
		if (!in.readUTF8(4).equals("ITSP"))
			throw new DataFormatException("CHM directory header should start with \"ITSP\"");
		in.read32(); // version
		chunkOffset = off1 + in.read32(); // = 0x54
		in.read32(); // = 0x0a
		chunkSize = in.read32(); // 0x1000
		quickRef = 1 + (1 << in.read32()); // = 1 + (1 << quickRefDensity)
		for (int i = in.read32(); i > 1; i--) // depth of index tree, 1: no index, 2: one level of PMGI chunks
			indexTree.add(new TreeMap<String, Integer>());
		rootIndexChunkNo = in.read32(); // chunk number of root, -1: none
		firstPMGLChunkNo = in.read32();
		lastPMGLChunkNo = in.read32();
		in.read32(); // = -1
		totalChunks = in.read32();
		int lang2 = in.read32(); // language code
		// log.info("CHM ITSP language " + WindowsLanguageID.getLocale(lang2));
		in.readGUID(); // .equals("5D02926A-212E-11D0-9DF9-00A0-C922-E6EC"))
		in.read32(); // = 0x54
		in.read32(); // = -1
		in.read32(); // = -1
		in.read32(); // = -1
		if (chunkSize * totalChunks + CHM_DIRECTORY_HEADER_LENGTH != len1)
			throw new DataFormatException("CHM directory list chunks size mismatch");
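
		/*
		 * At this point the directory geometry is known: the directory consists of
		 * totalChunks chunks of chunkSize bytes each, starting at chunkOffset.
		 * PMGL listing chunks hold the actual name -> (section, offset, length)
		 * entries; PMGI index chunks (present when rootIndexChunkNo >= 0) index the
		 * listing chunks by their first name. list() scans the PMGL chunks
		 * sequentially, while resolveIndexedEntry() walks the PMGI index instead.
		 */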
		/* Step 2. CHM name list: content sections */
		in = new LEInputStream(getResourceAsStream("::DataSpace/NameList"));
		if (in == null)
			throw new DataFormatException("Missing ::DataSpace/NameList entry");
		in.read16(); // length in 16-bit words, = in.length() / 2
		sections = new Section[in.read16()];
		for (int i = 0; i < sections.length; i++) {
			String name = in.readUTF16(in.read16() << 1);
			if ("Uncompressed".equals(name)) {
				sections[i] = new Section();
			} else if ("MSCompressed".equals(name)) {
				sections[i] = new LZXCSection();
			} else
				throw new DataFormatException("Unknown content section " + name);
			in.read16(); // = null
		}
	}

	/**
	 * Build the table of contents from the .hhc sitemap markup.
	 */
	private void createContents(String tmp, hhctree root) {
		hhctree proot = null;
		String tmpstr = tmp;
		// System.out.println("---hym--hhc-:"+tmp);
		if (tmpstr.indexOf("<UL>") != -1) {
			tmpstr = tmpstr.substring(tmpstr.indexOf("<UL>") + 4);
			if (tmpstr.lastIndexOf("</UL>") != -1)
				tmpstr = tmpstr.substring(0, tmpstr.lastIndexOf("</UL>")); // a malformed .hhc will break this
			while (tmpstr.indexOf("<LI>") != -1) {
				if (tmpstr.indexOf("<UL>") > tmpstr.indexOf("<LI>") || tmpstr.indexOf("<UL>") == -1) {
					hhctree nowtree = new hhctree();
					proot = nowtree;
					String tttstr = tmpstr.substring(tmpstr.indexOf("<LI>") + 4);
					tmpstr = tttstr;
					if (tmpstr.indexOf("<LI>") != -1) {
						tttstr = tmpstr.substring(0, tmpstr.indexOf("<LI>"));
					} else {
						tttstr = tmpstr; // the last item, no more <LI>
					}
					if (tttstr.indexOf("text/sitemap") == -1) {
						continue;
					} else {
						nowtree.parent = root;
						root.child.add(nowtree);
						String name = tttstr.substring(tttstr.indexOf("Name\"") + 13);
						name = name.substring(0, name.indexOf("\""));
						nowtree.name = name;
						String filename = tttstr.substring(tttstr.indexOf("Local") + 14);
						filename = filename.substring(0, filename.indexOf("\""));
						nowtree.filename = filename;
						nowtree.filename = nowtree.filename.replaceAll("%20", " ");
						// System.out.println("---hym---:"+filename);
					}
				} else {
					String ttstr = tmpstr.substring(tmpstr.indexOf("<UL>"));
					// find the next matching </UL> here, not the last one
					String onestr = "";
					while (ttstr.lastIndexOf("</UL>") != -1) {
						String tttstr = ttstr.substring(0, ttstr.indexOf("</UL>") + 5);
						onestr += tttstr;
						ttstr = ttstr.substring(ttstr.indexOf("</UL>") + 5);
						if (jUL(onestr)) { // tags are balanced, we found the matching </UL>
							break;
						}
					}
					if (onestr.length() == 0) { // no more <UL> below
						onestr = ttstr;
					}
					// recurse into the nested list
					createContents(onestr, proot);
					tmpstr = tmpstr.substring(tmpstr.indexOf("<UL>") + onestr.length() - 1);
				}
			}
		}
	}

	/**
	 * Returns true if the given fragment contains equally many opening and closing UL tags.
	 */
	public boolean jUL(String tmpstr) {
		int ulnum = 0;
		int nulnum = 0;
		String tstr = tmpstr;
		while (tstr.indexOf("<UL>") != -1) {
			tstr = tstr.substring(tstr.indexOf("<UL>") + 4);
			ulnum++;
		}
		while (tmpstr.indexOf("</UL>") != -1) {
			tmpstr = tmpstr.substring(tmpstr.indexOf("</UL>") + 5);
			nulnum++;
		}
		if (ulnum == nulnum) {
			return true;
		}
		return false;
	}
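
	/*
	 * For reference, createContents()/jUL() above expect .hhc sitemap markup of
	 * roughly this shape (an illustrative sketch, not taken from a real file;
	 * the parser is purely string-based and assumes upper-case <UL>/<LI> tags):
	 *
	 *   <UL>
	 *     <LI><OBJECT type="text/sitemap">
	 *           <param name="Name" value="Chapter 1">
	 *           <param name="Local" value="ch1.html">
	 *         </OBJECT>
	 *     <UL> ... nested entries ... </UL>
	 *   </UL>
	 */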
	/**
	 * Read len bytes from the file beginning at offset.
	 * Since it's really a ByteArrayInputStream, the close() operation is optional.
	 */
	private synchronized InputStream createInputStream(long offset, int len) throws IOException {
		fileAccess.seek(offset);
		byte[] b = new byte[len]; // TODO performance?
		fileAccess.readFully(b);
		return new ByteArrayInputStream(b);
	}

	/**
	 * Resolve an entry by name, using the cache and the index.
	 */
	private ListingEntry resolveEntry(String name) throws IOException {
		if (rootIndexChunkNo < 0 && resources == null) // no index
			list(); // force cache fill
		ListingEntry entry = entryCache.get(name);
		if (entry != null)
			return entry;
		if (rootIndexChunkNo >= 0 && resources == null)
			entry = resolveIndexedEntry(name, rootIndexChunkNo, 0);
		if (entry == null && rootIndexChunkNo >= 0) { // ugly: retry with the lowercase name
			entry = resolveIndexedEntry(name.toLowerCase(), rootIndexChunkNo, 0);
			// log.warning("Resolved using lowercase name " + name);
		}
		if (entry == null)
			throw new FileNotFoundException(filepath + "#" + name);
		return entry;
	}

	/**
	 * Listing chunks have filename/offset entries sorted by filename alphabetically.
	 * Index chunks have filename/listingChunk# entries, specifying the first filename of each listing chunk.
	 * NOTE: this code would break when there is no index at all (rootIndexChunkNo == -1),
	 * but in that case resolveEntry() has already called list() and cached all resource names.
	 * However, it can still return null when resolving a resource that does not exist at all.
	 */
	private synchronized ListingEntry resolveIndexedEntry(String name, int chunkNo, int level) throws IOException {
		if (chunkNo < 0)
			throw new IllegalArgumentException("chunkNo < 0");
		if (level < indexTree.size()) { // no more than indexTreeDepth
			// process the index chunk
			Map<String, Integer> index = indexTree.get(level);
			if (index.isEmpty()) { // load it from the file (only one chunk is cached per level)
				LEInputStream in = new LEInputStream(
						createInputStream(chunkOffset + chunkNo * chunkSize, chunkSize));
				if (!in.readUTF8(4).equals("PMGI"))
					throw new DataFormatException("Index Chunk magic mismatch, should be 'PMGI'");
				int freeSpace = in.read32(); // length of free space and/or quickref area at end of directory chunk
				// directory index entries, sorted by filename (case insensitive)
				while (in.available() > freeSpace) {
					index.put(in.readUTF8(in.readENC()), in.readENC());
				}
				// log.fine("Index L" + level + indexTree);
			}
			chunkNo = -1;
			String lastKey = "";
			for (Entry<String, Integer> item : index.entrySet()) {
				if (name.compareTo(item.getKey()) < 0) {
					if (level + 1 == indexTree.size() // it's the last index level
							&& entryCache.containsKey(lastKey)) // and the first entry of its chunk is cached
						return entryCache.get(name); // then the name should be in the cache, too
					break; // we found its chunk, break anyway
				}
				lastKey = item.getKey();
				chunkNo = item.getValue();
			}
			return resolveIndexedEntry(name, chunkNo, level + 1);
		} else {
			// process the listing chunk, and cache all entries of the whole chunk
			LEInputStream in = new LEInputStream(
					createInputStream(chunkOffset + chunkNo * chunkSize, chunkSize));
			if (!in.readUTF8(4).equals("PMGL"))
				throw new DataFormatException("Listing Chunk magic mismatch, should be 'PMGL'");
			int freeSpace = in.read32(); // length of free space and/or quickref area at end of directory chunk
			in.read32(); // = 0;
			in.read32(); // previousChunk #
			in.read32(); // nextChunk #
			while (in.available() > freeSpace) {
				ListingEntry entry = new ListingEntry(in);
				entryCache.put(entry.name, entry);
			}
			/* The quickref area is written backwards from the end of the chunk. One quickref entry
			 * exists for every n entries in the file, where n is calculated as
			 * 1 + (1 << quickref density). So for density = 2, n = 5.
			 *
			 * chunkSize-0002: WORD  Number of entries in the chunk
			 * chunkSize-0004: WORD  Offset of entry n from entry 0
			 * chunkSize-0008: WORD  Offset of entry 2n from entry 0
			 * chunkSize-000C: WORD  Offset of entry 3n from entry 0
			 *
			 * log.info("resources.size() = " + resources.size());
			 * if ((in.available() & 1) > 0) // align to word
			 *     in.skip(1);
			 * while (in.available() > 0)
			 *     log.info("chunk " + i + ": " + in.read16());
			 */
			return entryCache.get(name);
		}
	}
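
	/*
	 * Illustration of the lookup above (hypothetical names and chunk numbers):
	 * if the root PMGI chunk holds { "/a.html" -> 0, "/m.html" -> 1 }, resolving
	 * "/k.html" keeps chunk 0 as the last key not greater than the name, loads
	 * PMGL chunk 0, caches every entry found in it, and answers from entryCache.
	 */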
	/**
	 * Get an InputStream object for the named resource in the CHM.
	 */
	public InputStream getResourceAsStream(String name) throws IOException {
		if (name == null || name.length() == 0)
			name = getSiteMap();
		ListingEntry entry = resolveEntry(name);
		if (entry == null)
			throw new FileNotFoundException(filepath + "#" + name);
		Section section = sections[entry.section];
		return section.resolveInputStream(entry.offset, entry.length);
	}

	public List<Map<String, Integer>> getIndexTree() {
		return indexTree;
	}

	/**
	 * Get the names of the resources in the CHM.
	 * Caches perform better when the CHM is iterated in the order of this returned list.
	 * @see #resolveIndexedEntry(String, int, int)
	 * TODO: some chunks will be read twice, once in resolveIndexedEntry and once here; fix it!
	 */
	public synchronized List<String> list() throws IOException {
		if (resources == null) {
			// find resources in all listing chunks
			resources = new ArrayList<String>();
			for (int i = firstPMGLChunkNo; i <= lastPMGLChunkNo; i++) {
				LEInputStream in = new LEInputStream(
						createInputStream(chunkOffset + i * chunkSize, chunkSize));
				if (!in.readUTF8(4).equals("PMGL"))
					throw new DataFormatException("Listing Chunk magic mismatch, should be 'PMGL'");
				int freeSpace = in.read32(); // length of free space and/or quickref area at end of directory chunk
				in.read32(); // = 0;
				in.read32(); // previousChunk #
				in.read32(); // nextChunk #
				while (in.available() > freeSpace) {
					ListingEntry entry = new ListingEntry(in);
					entryCache.put(entry.name, entry);
					if (entry.name.charAt(0) == '/') {
						resources.add(entry.name);
						// System.out.println("----hym list--"+entry.name);
						if (entry.name.toLowerCase().endsWith(".hhc")) { // the .hhc entry is the navigation file
							siteMap = entry.name;
							System.out.println("CHM sitemap " + siteMap);
						}
					}
				}
			}
			resources = Collections.unmodifiableList(resources); // protect the list, since the reference is handed out to callers
		}
		return resources;
	}

	/**
	 * The sitemap file, usually the .hhc file.
	 * @see http://www.nongnu.org/chmspec/latest/Sitemap.html#HHC
	 */
	public String getSiteMap() throws IOException {
		if (resources == null)
			list();
		return siteMap;
	}

	/**
	 * After close, the object can not be used any more.
	 */
	public void close() throws IOException {
		entryCache = null;
		sections = null;
		resources = null;
		contents = new hhctree();
		siteMap = "";
		if (fileAccess != null) {
			fileAccess.close();
			fileAccess = null;
		}
	}

	protected void finalize() throws IOException {
		close();
	}

	class Section {
		public InputStream resolveInputStream(long off, int len) throws IOException {
			return createInputStream(contentOffset + off, len);
		}
	}
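
	/*
	 * The LZX-compressed content section. The compressed stream is cut into
	 * blockSize (usually 0x8000 byte) blocks; the decompressor state is reset
	 * every resetInterval blocks, so a read always starts decompressing at the
	 * nearest reset point. addressTable[i] gives the compressed offset of block i,
	 * so block i's compressed length is addressTable[i + 1] - addressTable[i]
	 * (or compressedLength - addressTable[i] for the last block), exactly as
	 * readBlock() computes below. Decompressed runs are kept in an LRU cache.
	 */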
	class LZXCSection extends Section {
		long compressedLength;
		long uncompressedLength;
		int blockSize;
		int resetInterval;
		long[] addressTable;
		int windowSize;
		long sectionOffset;
		LRUCache<Integer, byte[][]> cachedBlocks;

		public LZXCSection() throws IOException, DataFormatException {
			// control data
			LEInputStream in = new LEInputStream(
					getResourceAsStream("::DataSpace/Storage/MSCompressed/ControlData"));
			in.read32(); // words following LZXC
			if (!in.readUTF8(4).equals("LZXC"))
				throw new DataFormatException("Must be in LZX Compression");
			in.read32(); // <= 2, version
			resetInterval = in.read32(); // huffman reset interval for blocks
			windowSize = in.read32() * 0x8000; // usu. 0x10, window size in 0x8000-byte blocks
			int cacheSize = in.read32(); // unknown, 0, 1, 2
			// log.info("LZX cache size " + cacheSize);
			cachedBlocks = new LRUCache<Integer, byte[][]>((1 + cacheSize) << 2);
			in.read32(); // = 0

			// reset table
			in = new LEInputStream(getResourceAsStream("::DataSpace/Storage/MSCompressed/Transform/"
					+ "{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"));
			if (in == null)
				throw new DataFormatException("LZXC missing reset table");
			int version = in.read32();
			if (version != 2)
				System.out.println("LZXC version unknown " + version);
			addressTable = new long[in.read32()];
			in.read32(); // = 8; size of table entry
			in.read32(); // = 0x28, header length
			uncompressedLength = in.read64();
			compressedLength = in.read64();
			blockSize = (int) in.read64(); // 0x8000; blockSize larger than a 32-bit integer is not supported
			for (int i = 0; i < addressTable.length; i++) {
				addressTable[i] = in.read64();
			}
			// init cache
			// cachedBlocks = new byte[resetInterval][blockSize];
			// cachedResetBlockNo = -1;

			ListingEntry entry = entryCache.get("::DataSpace/Storage/MSCompressed/Content");
			if (entry == null)
				throw new DataFormatException("LZXC missing content");
			if (compressedLength != entry.length)
				throw new DataFormatException("LZXC content corrupted");
			sectionOffset = contentOffset + entry.offset;
		}

		@Override
		public InputStream resolveInputStream(final long off, final int len) throws IOException {
			// the input stream!
			return new InputStream() {
				int startBlockNo = (int) (off / blockSize);
				int startOffset = (int) (off % blockSize);
				int endBlockNo = (int) ((off + len) / blockSize);
				int endOffset = (int) ((off + len) % blockSize);
				// actually start at a reset-interval boundary
				int blockNo = startBlockNo - startBlockNo % resetInterval;
				Inflater inflater = new Inflater(windowSize);
				byte[] buf;
				int pos;
				int bytesLeft;

				@Override
				public int available() throws IOException {
					return bytesLeft; // not a non-blocking available
				}

				@Override
				public void close() throws IOException {
					inflater = null;
				}

				/**
				 * Read block number blockNo, called when bytesLeft == 0.
				 */
				private void readBlock() throws IOException {
					if (blockNo > endBlockNo)
						// return;
						throw new EOFException();
					int cachedNo = blockNo / resetInterval;
					synchronized (cachedBlocks) {
						byte[][] cache = cachedBlocks.get(cachedNo);
						if (cache == null) {
							if ((cache = cachedBlocks.prune()) == null) // try to reuse an old buffer
								cache = new byte[resetInterval][blockSize];
							int resetBlockNo = blockNo - blockNo % resetInterval;
							for (int i = 0; i < cache.length && resetBlockNo + i < addressTable.length; i++) {
								int blockNo = resetBlockNo + i;
								int len = (int) ((blockNo + 1 < addressTable.length)
										? (addressTable[blockNo + 1] - addressTable[blockNo])
										: (compressedLength - addressTable[blockNo]));
								// log.fine("readBlock " + blockNo + ": " + (sectionOffset + addressTable[blockNo]) + "+ " + len);
								inflater.inflate(i == 0, // reset flag
										createInputStream(sectionOffset + addressTable[blockNo], len),
										cache[i]); // here is the heart
							}
							cachedBlocks.put(cachedNo, cache);
						}
						if (buf == null) // allocate the buffer
							buf = new byte[blockSize];
						System.arraycopy(cache[blockNo % cache.length], 0, buf, 0, buf.length);
					}
					// the start block has a special pos value
					pos = (blockNo == startBlockNo) ? startOffset : 0;
					// the end block has a special length
					bytesLeft = (blockNo < startBlockNo) ? 0
							: ((blockNo < endBlockNo) ? blockSize : endOffset);
					bytesLeft -= pos;
					blockNo++;
				}
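
				/*
				 * Worked example (hypothetical numbers): with blockSize = 0x8000 and
				 * resetInterval = 2, an entry at off = 0x1A000 with len = 0x3000 gives
				 * startBlockNo = 3, startOffset = 0x2000, endBlockNo = 3, endOffset = 0x5000;
				 * decompression starts at block 2 (the nearest reset point) and the
				 * readBlock() call that delivers data returns bytes 0x2000..0x4FFF of block 3.
				 */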
				@Override
				public int read(byte[] b, int off, int len) throws IOException {
					if ((bytesLeft <= 0) && (blockNo > endBlockNo)) {
						return -1; // no more data
					}
					while (bytesLeft <= 0) {
						try {
							readBlock(); // re-charge
						} catch (Exception e) {
							return -1;
						}
					}
					// while (bytesLeft <= 0)
					//     readBlock(); // re-charge
					int togo = Math.min(bytesLeft, len);
					System.arraycopy(buf, pos, b, off, togo);
					pos += togo;
					bytesLeft -= togo;
					return togo;
				}

				@Override
				public int read() throws IOException {
					byte[] b = new byte[1];
					return (read(b) == 1) ? b[0] & 0xff : -1;
				}

				@Override
				public long skip(long n) throws IOException {
					// log.warning("LZX skip happens: " + pos + "+ " + n);
					pos += n; // TODO n could be negative, so do boundary checks!
					return n;
				}
			};
		}
	}

	class ListingEntry {
		String name;
		int section;
		long offset;
		int length;

		public ListingEntry(LEInputStream in) throws IOException {
			name = in.readUTF8(in.readENC());
			section = in.readENC();
			offset = in.readENC();
			length = in.readENC();
		}

		public String toString() {
			return name + " @" + section + ": " + offset + " + " + length;
		}
	}

	/**
	 * Parse the sitemap (.hhc) with the given encoding and populate the contents tree.
	 */
	public void chhctree(String encoding) {
		if (siteMap != null && siteMap.length() != 0) {
			StringBuilder hhcstr = new StringBuilder();
			try {
				InputStream in = getResourceAsStream(null);
				InputStreamReader stream = new InputStreamReader(in, encoding);
				char[] buffertmp = new char[4096 * 2];
				int count = 0;
				while ((count = stream.read(buffertmp)) > 0) {
					hhcstr.append(buffertmp, 0, count);
				}
				createContents(hhcstr.toString(), contents);
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	public static void main(String[] argv) throws Exception {
		if (argv.length == 0) {
			System.err.println("usage: java " + CHMFile.class.getName() + " <chm file name> (file)*");
			System.exit(1);
		}
		CHMFile chm = new CHMFile(argv[0]);
		for (String file : chm.list()) {
			System.out.println(file);
		}
		if (argv.length == 1) {
			// no entries requested, just list the resources above
		} else {
			byte[] buf = new byte[1024];
			for (int i = 1; i < argv.length; i++) {
				InputStream in = chm.getResourceAsStream(argv[i]);
				int c = 0;
				while ((c = in.read(buf)) >= 0) {
					System.out.print(new String(buf, 0, c));
				}
			}
		}
		chm.close();
	}
}