package org.trie4j.lz;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.trie4j.test.LapTimer;

/**
 * A small LZSS-style compressor / decompressor working on {@code char} sequences.
 *
 * <p>The compressed form is a sequence of <em>elements</em>. Each element is either
 * a literal (one char in {@code dest}, flag bit cleared) or a back reference
 * (two chars in {@code dest}: distance back from the current output position,
 * then the number of chars to copy; flag bit set).
 */
public class LZSS {
    /** Largest distance or length that fits into the single {@code char} used to store it. */
    private static final int MAX_CODE = Character.MAX_VALUE;

    /**
     * Demo driver: reads a sample file, compresses and decompresses it, then
     * prints timings, sizes and the first differing position (if any).
     */
    public static void main(String[] args) throws Exception{
        LapTimer lt = new LapTimer();
        String src = "abcabdrz";
        src = read("data/jawiki-20120220-tail");
        int windowSize = 8192;

        lt.reset();
        LZSSData ret = compress(src, windowSize);
        // match.length() is the index of the highest set match flag + 1.
        lt.lapMillis("compress done. %d elements, %d chars",
                ret.match.length(), ret.dest.length());
        dump(ret);

        StringBuilder b = new StringBuilder();
        lt.reset();
        decompress(ret, b);
        lt.lapMillis("decompress done.");

        StringBuilder dest = ret.dest;
        int sz = dest.length();
        // number of bytes needed to hold one flag bit per element (rounded up)
        int bsz = ret.size / 8 + (((ret.size) % 8 == 0) ? 0 : 1);
        boolean eq = src.equals(b.toString());
        System.out.println(String.format(
                "src: %d, comp: %d(%02.1f%%) + %dbytes, decomp: %d, %b",
                src.length(), sz, 1.0 * sz / src.length() * 100, bsz, b.length(), eq));

        // Only compare the common prefix so that a length mismatch cannot
        // make this diagnostic loop itself throw.
        int cmpLen = Math.min(src.length(), b.length());
        for(int i = 0; i < cmpLen; i++){
            if(src.charAt(i) != b.charAt(i)){
                System.out.println(String.format(
                        "%dth char different [%c:%c]",
                        i, src.charAt(i), b.charAt(i)));
                int s = Math.max(i - 5, 0);
                int e = Math.min(i + 5, cmpLen);
                System.out.println("src: " + src.substring(s, e));
                System.out.println("dec: " + b.substring(s, e));
                break;
            }
        }
    }

    /**
     * Result of {@link LZSS#compress(CharSequence, int)}.
     */
    public static class LZSSData{
        /**
         * @param match one flag per element; a set bit marks a back reference
         * @param dest  literal chars and (distance, length) char pairs, in element order
         * @param size  number of elements
         */
        public LZSSData(BitSet match, StringBuilder dest, int size) {
            this.match = match;
            this.dest = dest;
            this.size = size;
        }
        private final BitSet match;
        private final StringBuilder dest;
        private final int size;
    }

    /**
     * Compresses {@code src}.
     *
     * <p>For every position the previously seen positions of the same char are
     * tried as match candidates; the first longest candidate wins. A match is
     * only emitted when it covers more than one char, otherwise the char is
     * written as a literal.
     *
     * @param src        text to compress
     * @param windowSize maximum distance a back reference may point back; values
     *                   above {@code Character.MAX_VALUE} are treated as
     *                   {@code Character.MAX_VALUE} because the distance is stored
     *                   in a single char
     * @return the flag bits, the encoded chars and the element count
     * @throws IOException never thrown by this implementation; kept in the
     *                     signature for existing callers
     */
    public static LZSSData compress(CharSequence src, int windowSize)
    throws IOException{
        BitSet match = new BitSet();
        StringBuilder out = new StringBuilder();
        int size = 0;
        // Distance and length are each written as one char, so neither may exceed MAX_CODE.
        int window = Math.min(windowSize, MAX_CODE);
        // char -> positions (ascending) at which that char was seen inside the window
        Map<Character, List<Integer>> startPoss = new HashMap<Character, List<Integer>>();
        int n = src.length();
        for(int i = 0; i < n; i++){
            char target = src.charAt(i);
            // find longest match
            boolean found = false;
            int start = 0;
            int matchLen = 0;
            List<Integer> poss = startPoss.get(target);
            if(poss != null){
                Iterator<Integer> it = poss.iterator();
                while(it.hasNext()){
                    int s = it.next();
                    if((i - s) > window){
                        // candidate fell out of the window; drop it for good
                        it.remove();
                        continue;
                    }
                    // +1 accounts for the char at s / i itself, which is known to match
                    int len = Math.min(getMatchedLen(src, s + 1, i + 1, n) + 1, MAX_CODE);
                    if(len > matchLen){
                        start = i - s;
                        matchLen = len;
                    }
                    found = true;
                }
                poss.add(i);
                // register the positions that the match is about to skip over
                int jn = Math.min(i + matchLen, n);
                for(int j = i + 1; j < jn; j++){
                    List<Integer> p = startPoss.get(src.charAt(j));
                    if(p == null){
                        p = new LinkedList<Integer>();
                        startPoss.put(src.charAt(j), p);
                    }
                    p.add(j);
                }
            } else{
                poss = new LinkedList<Integer>();
                poss.add(i);
                startPoss.put(target, poss);
            }
            if(found && matchLen > 1){
                match.set(size);
                out.append((char)start)
                    .append((char)matchLen);
                i += matchLen - 1;
            } else{
                match.set(size, false);
                out.append(target);
            }
            size++;
        }
        return new LZSSData(match, out, size);
    }

    /**
     * Expands {@code src} and appends the result to {@code out}.
     *
     * <p>Back references are resolved against the current content of
     * {@code out}, so {@code out} is normally passed in empty. The copy is done
     * char by char, which allows a reference to overlap the chars it produces.
     *
     * @param src compressed data as produced by {@link #compress(CharSequence, int)}
     * @param out receives the decompressed chars
     */
    public static void decompress(LZSSData src, StringBuilder out){
        int index = 0;
        int n = src.size;
        for(int i = 0; i < n; i++){
            if(src.match.get(i)){
                int start = src.dest.charAt(index++);
                int matchedLen = src.dest.charAt(index++);
                int s = out.length() - start;
                int e = s + matchedLen;
                for(; s < e; s++){
                    out.append(out.charAt(s));
                }
            } else{
                out.append(src.dest.charAt(index++));
            }
        }
    }

    /**
     * Renders a char for {@link #dump(LZSSData)}: chars below 0x20 are shown
     * as a hex code, everything else as the char itself.
     */
    private static String toString(char c){
        if(c < 0x20){
            return String.format("(0x%02x)", (int)c);
        } else{
            return "" + c;
        }
    }

    /**
     * Reads a whole file as a sequence of 2-byte chars using
     * {@link DataInputStream#readChar()}.
     *
     * @param filename path of the file to read
     * @return the file content as a string
     * @throws IOException if the file cannot be opened or read
     */
    private static String read(String filename)
    throws IOException{
        InputStream is = new FileInputStream(filename);
        try{
            DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
            // the buffer is sized from the number of bytes reported as available
            char[] buff = new char[is.available() / 2];
            int i = 0;
            while(dis.available() > 0){
                buff[i++] = dis.readChar();
            }
            return new String(buff);
        } finally{
            is.close();
        }
    }

    /**
     * Prints the first elements (at most 42) of the compressed data, one per
     * line: {@code distance:length} for back references, the char for literals.
     */
    private static void dump(LZSSData src){
        int index = 0;
        // Use the element count: BitSet.size() reports allocated bits and would
        // run past the end of dest for short inputs.
        int n = Math.min(src.size, 42);
        for(int i = 0; i < n; i++){
            if(src.match.get(i)){
                System.out.println(String.format(
                        "%02d %02d:%02d",
                        i,
                        (int)src.dest.charAt(index++),
                        (int)src.dest.charAt(index++)
                        ));
            } else{
                System.out.println(String.format(
                        "%02d %s",
                        i, toString(src.dest.charAt(index++))
                        ));
            }
        }
    }

    /**
     * Counts how many chars starting at {@code i1} equal the chars starting at
     * {@code i2}.
     *
     * <p>At most {@code min(i2 - i1, end - i2)} chars are compared, so the first
     * run never reads at or beyond the original {@code i2} and the second run
     * never reads at or beyond {@code end}.
     *
     * @return the number of equal leading chars, which is the full comparison
     *         limit when no mismatch is found
     */
    private static int getMatchedLen(CharSequence src, int i1, int i2, int end){
        int n = Math.min(i2 - i1, end - i2);
        for(int i = 0; i < n; i++){
            if(src.charAt(i1++) != src.charAt(i2++)) return i;
        }
        // every compared char matched
        return Math.max(n, 0);
    }
}