package org.trie4j.lz;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.trie4j.test.LapTimer;
/**
 * Experimental LZ77-style compressor/decompressor working on Java {@code char}s.
 *
 * <p>The compressed form is a sequence of three-char tokens
 * {@code (distance, length, next)}:
 * <ul>
 * <li>{@code distance} - how far back from the current output end the match starts;
 *     {@code 0} means "no match" (a literal token),</li>
 * <li>{@code length} - number of chars to copy from that position,</li>
 * <li>{@code next} - the char that follows the copied run.</li>
 * </ul>
 * When a match runs up to the very end of the input there is no following char; the
 * compressors then store U+00FF in the {@code next} slot. That marker is only ever
 * produced in the final token and only when the token is a match.
 */
public class LZ77 {
    /** Value stored in the {@code next} slot of the final token when a match reaches the end of input. */
    private static final char END_MARKER = 0xff;

    /**
     * Largest usable window. A match can be one char longer than its distance, and both
     * values are stored in a single {@code char}, so the distance must stay below 0xFFFF.
     */
    private static final int MAX_WINDOW = Character.MAX_VALUE - 1;

    /**
     * Program entry point; delegates to {@link #main1(String[])}.
     *
     * @param args command line arguments (passed through unchanged)
     * @throws Exception whatever {@link #main1(String[])} throws
     */
    public static void main(String[] args) throws Exception {
        main1(args);
    }

    /**
     * Round-trip experiment: reads the sample file, compresses it with
     * {@code compress2}, prints statistics and a token dump, decompresses the result and
     * reports the first position (if any) where the decoded text differs from the input.
     *
     * @param args unused
     * @throws Exception if the sample file cannot be read
     */
    public static void main1(String[] args) throws Exception {
        LapTimer lt = new LapTimer();
        String src = read("data/jawiki-20120220-tail");
        int windowSize = 8192;

        StringBuilder dest = new StringBuilder();
        lt.reset();
        compress2(src, dest, windowSize);
        lt.lapMillis("compress done.");

        // The second char of every token is the match length.
        int l = 0;
        for (int i = 0; i < dest.length() / 3; i++) {
            l = Math.max(dest.charAt(i * 3 + 1), l);
        }
        System.out.println("max matched length: " + l);
        dump(dest);

        StringBuilder b = new StringBuilder();
        lt.reset();
        decompress(dest, b);
        lt.lapMillis("decompress done.");

        boolean eq = src.equals(b.toString());
        System.out.println(String.format(
                "src: %d, comp: %d(%02.1f%%), decomp: %d, %b",
                src.length(), dest.length(), 1.0 * dest.length() / src.length() * 100, b.length(), eq));

        // Only compare the range both strings have; a length difference is already
        // visible in the summary line above and must not cause an index error here.
        int common = Math.min(src.length(), b.length());
        for (int i = 0; i < common; i++) {
            if (src.charAt(i) != b.charAt(i)) {
                System.out.println(String.format(
                        "%dth char different [%c:%c]",
                        i, src.charAt(i), b.charAt(i)));
                int s = Math.max(i - 5, 0);
                int e = Math.min(i + 5, common);
                System.out.println("src: " + src.substring(s, e));
                System.out.println("dec: " + b.substring(s, e));
                break;
            }
        }
    }

    /**
     * Comparison experiment: compresses the sample file with both {@code compress1} and
     * {@code compress2}, prints timing and ratio for each, and dumps both token streams
     * if they differ anywhere in their common prefix.
     *
     * @param args unused
     * @throws Exception if the sample file cannot be read
     */
    public static void main2(String[] args) throws Exception {
        LapTimer lt = new LapTimer();
        String src = read("data/jawiki-20120220-tail");
        int windowSize = 8192;
        System.out.println("total " + src.length() + " chars. windowSize: " + windowSize);

        StringBuilder dest1 = new StringBuilder();
        lt.reset();
        compress1(src, dest1, windowSize);
        lt.lapMillis("compress1 done.");

        StringBuilder dest2 = new StringBuilder();
        lt.reset();
        compress2(src, dest2, windowSize);
        lt.lapMillis("compress2 done.");

        System.out.println(String.format(
                "src: %d, comp1: %d(%02.1f%%)",
                src.length(), dest1.length(), 1.0 * dest1.length() / src.length() * 100));
        System.out.println(String.format(
                "src: %d, comp2: %d(%02.1f%%)",
                src.length(), dest2.length(), 1.0 * dest2.length() / src.length() * 100));

        int common = Math.min(dest1.length(), dest2.length());
        for (int i = 0; i < common; i++) {
            if (dest1.charAt(i) != dest2.charAt(i)) {
                System.out.println(String.format(
                        "%dth char different [%s:%s]",
                        i, toString(dest1.charAt(i)), toString(dest2.charAt(i))));
                dump(dest1, dest2);
                break;
            }
        }
    }

    /**
     * Renders a char for diagnostic output.
     *
     * @param c the char to render
     * @return {@code "(0xNN)"} for chars below 0x20, otherwise the char itself as a string
     */
    private static String toString(char c) {
        if (c < 0x20) {
            return String.format("(0x%02x)", (int) c);
        }
        return "" + c;
    }

    /**
     * Reads a whole file as a sequence of big-endian 16-bit chars (the same layout
     * {@code DataInputStream.readChar} consumes).
     *
     * <p>The file is read until the stream actually reports end-of-file instead of
     * trusting {@code available()}, which is only an estimate.
     *
     * @param filename path of the file to read
     * @return the decoded text
     * @throws IOException if the file cannot be read or ends in the middle of a char
     */
    private static String read(String filename)
            throws IOException {
        InputStream is = new BufferedInputStream(new FileInputStream(filename));
        try {
            StringBuilder text = new StringBuilder();
            int high;
            while ((high = is.read()) >= 0) {
                int low = is.read();
                if (low < 0) {
                    throw new IOException("unexpected end of file in the middle of a char: " + filename);
                }
                text.append((char) ((high << 8) | low));
            }
            return text.toString();
        } finally {
            is.close();
        }
    }

    /**
     * Prints the leading tokens (at most 42) of one or more token streams side by side,
     * one output line per token index. Only as many tokens as the shortest stream holds
     * are printed.
     *
     * <p>Each column shows: token index, token index plus the running sum of the match
     * lengths of the preceding tokens, distance, match length, and the next char.
     *
     * @param src the token streams to print
     */
    private static void dump(CharSequence... src) {
        // Limit is measured in chars: 42 tokens of 3 chars each.
        int limit = 3 * 42;
        for (CharSequence s : src) {
            limit = Math.min(limit, s.length());
        }
        limit -= limit % 3; // never read a partial trailing token

        int ns = src.length;
        int[] sumchars = new int[ns];
        for (int i = 0; i < limit; i += 3) {
            for (int j = 0; j < ns; j++) {
                int start = src[j].charAt(i);
                int count = src[j].charAt(i + 1);
                char stopchar = src[j].charAt(i + 2);
                System.out.print(String.format(
                        "%02d:%02d %02d:%02d:%-6s ",
                        i / 3, i / 3 + sumchars[j], start, count, toString(stopchar)));
                sumchars[j] += count;
            }
            System.out.println();
        }
    }

    /**
     * Compresses {@code src} by scanning, for every position, all earlier positions
     * inside the window for the longest match (brute force).
     *
     * @param src        text to compress
     * @param out        receives the token stream
     * @param windowSize maximum distance a match may reach back; values above 65534 are
     *                   clamped because distance and length are stored in one char each
     * @throws IOException if {@code out} fails
     */
    private static void compress1(CharSequence src, Appendable out, int windowSize)
            throws IOException {
        int window = Math.min(windowSize, MAX_WINDOW);
        int n = src.length();
        int i = 0;
        while (i < n) {
            char target = src.charAt(i);
            int distance = 0;
            int matchLen = 0;
            for (int s = Math.max(0, i - window); s < i; s++) {
                if (target != src.charAt(s)) {
                    continue;
                }
                int len = getMatchedLen(src, s + 1, i + 1, n) + 1;
                if (len > matchLen) {
                    distance = i - s;
                    matchLen = len;
                }
            }
            i += emitToken(src, out, i, distance, matchLen);
        }
    }

    /**
     * Compresses {@code src} like {@code compress1}, but finds candidate match positions
     * through a per-char index of earlier positions instead of scanning the whole window.
     * Index entries that have fallen out of the window are dropped while iterating.
     *
     * @param src        text to compress
     * @param out        receives the token stream
     * @param windowSize maximum distance a match may reach back; values above 65534 are
     *                   clamped because distance and length are stored in one char each
     * @throws IOException if {@code out} fails
     */
    private static void compress2(CharSequence src, Appendable out, int windowSize)
            throws IOException {
        int window = Math.min(windowSize, MAX_WINDOW);
        Map<Character, List<Integer>> startPoss = new HashMap<Character, List<Integer>>();
        int n = src.length();
        int i = 0;
        while (i < n) {
            char target = src.charAt(i);
            int distance = 0;
            int matchLen = 0;
            List<Integer> poss = startPoss.get(target);
            if (poss != null) {
                Iterator<Integer> it = poss.iterator();
                while (it.hasNext()) {
                    int s = it.next();
                    if ((i - s) > window) {
                        it.remove(); // outside the window for this and every later position
                        continue;
                    }
                    int len = getMatchedLen(src, s + 1, i + 1, n) + 1;
                    if (len > matchLen) {
                        distance = i - s;
                        matchLen = len;
                    }
                }
            }
            int consumed = emitToken(src, out, i, distance, matchLen);
            // Index every position covered by this token, but only after the search so
            // that a position never matches against itself.
            for (int j = i; j < i + consumed; j++) {
                remember(startPoss, src.charAt(j), j);
            }
            i += consumed;
        }
    }

    /**
     * Records {@code pos} as an occurrence of {@code c} in the position index.
     */
    private static void remember(Map<Character, List<Integer>> index, char c, int pos) {
        List<Integer> positions = index.get(c);
        if (positions == null) {
            positions = new LinkedList<Integer>();
            index.put(c, positions);
        }
        positions.add(pos);
    }

    /**
     * Writes the token for source position {@code i} and reports how many source chars
     * it covers.
     *
     * @param src      text being compressed
     * @param out      receives the token
     * @param i        current source position
     * @param distance distance back to the match start (ignored when {@code matchLen} is 0)
     * @param matchLen length of the best match found, 0 if none
     * @return number of source chars consumed by the emitted token (always at least 1)
     * @throws IOException if {@code out} fails
     */
    private static int emitToken(CharSequence src, Appendable out, int i, int distance, int matchLen)
            throws IOException {
        int n = src.length();
        // A match followed by a real U+00FF as the very last input char would look
        // exactly like the end marker. Shorten the match by one so that the final
        // U+00FF is encoded by a separate token instead.
        if (matchLen > 0 && i + matchLen == n - 1 && src.charAt(n - 1) == END_MARKER) {
            matchLen--;
        }
        if (matchLen == 0) {
            out.append((char) 0x00).append((char) 0x00).append(src.charAt(i));
            return 1;
        }
        boolean reachesEnd = (i + matchLen) >= n;
        char nonMatchChar = reachesEnd ? END_MARKER : src.charAt(i + matchLen);
        out.append((char) distance)
                .append((char) matchLen)
                .append(nonMatchChar);
        return reachesEnd ? matchLen : matchLen + 1;
    }

    /**
     * Counts how many chars starting at {@code i1} equal the chars starting at
     * {@code i2}. The comparison stops at the first difference, after
     * {@code i2 - i1} chars, or when {@code i2} reaches {@code end}, whichever
     * comes first.
     *
     * @param src text to compare within
     * @param i1  start of the earlier run
     * @param i2  start of the later run
     * @param end exclusive upper bound for the later run
     * @return number of equal chars (the full compared length when no difference is found)
     */
    private static int getMatchedLen(CharSequence src, int i1, int i2, int end) {
        int limit = Math.min(i2 - i1, end - i2);
        int matched = 0;
        while (matched < limit && src.charAt(i1 + matched) == src.charAt(i2 + matched)) {
            matched++;
        }
        return matched;
    }

    /**
     * Expands a token stream produced by the compressors in this class and appends the
     * result to {@code out}.
     *
     * <p>U+00FF in the {@code next} slot is treated as "no next char" only for the final
     * token and only if that token is a match; everywhere else it is ordinary data.
     *
     * @param src token stream; its length is expected to be a multiple of 3
     * @param out receives the decoded text; match distances are measured from its
     *            current end
     */
    public static void decompress(CharSequence src, StringBuilder out) {
        int n = src.length();
        for (int i = 0; i < n; i += 3) {
            int start = src.charAt(i);
            int matchedLen = src.charAt(i + 1);
            char nonMatchChar = src.charAt(i + 2);
            if (start != 0) {
                // Copy char by char: the run may overlap the chars being appended.
                int from = out.length() - start;
                for (int k = 0; k < matchedLen; k++) {
                    out.append(out.charAt(from + k));
                }
            }
            boolean lastToken = i + 3 >= n;
            boolean endMarker = lastToken && start != 0 && nonMatchChar == END_MARKER;
            if (!endMarker) {
                out.append(nonMatchChar);
            }
        }
    }
}