/*
* Copyright (C) 2014.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 or
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
package main;
import java.nio.charset.Charset;
import java.text.CollationKey;
import java.text.Collator;
import java.text.ParseException;
import java.text.RuleBasedCollator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import uk.me.parabola.imgfmt.app.srt.Sort;
import uk.me.parabola.imgfmt.app.srt.SortKey;
import uk.me.parabola.mkgmap.srt.SrtTextReader;
/**
* Test to compare sorting results and timings between sort keys and collator.
*
* It has also been tested against the Java 7 RuleBasedCollator and the ICU one.
*
* In general our implementation is fastest by a long way; key based sort 3 times faster, collation
* based sort even more so. The java collator does not result in the same sort as using sort keys.
*
* I also tried out the ICU collation, with mixed results: I could not get the desired results with
* it, and it was not faster than our implementation for a code page 1252 sort.
*/
public class SortTest {

    /** Number of random strings generated for each test list. */
    private static final int LIST_SIZE = 500000;

    /** Divisor used to convert {@link System#nanoTime()} differences to milliseconds. */
    private static final long NANOS_PER_MILLI = 1_000_000L;

    /** The sort under test; assigned at the start of {@link #test()}. */
    private Sort sort;

    /** Set by {@code --time}: run warm-up passes and the extra java keys/collator comparison. */
    private boolean time;

    /** Set by {@code --full}: print every compared row, not only the mismatches. */
    private boolean fullOutput;

    /** Set by {@code --quiet}: do not print mismatching rows (unless {@code --full} is given). */
    private boolean quiet;

    /** Set by {@code --unicode}: use code page 65001 and generate char based strings. */
    private boolean unicode;

    /**
     * Runs the comparisons: our key sort against our collator sort, then (for the non-unicode
     * case) our key sort against the java key sort, and finally (when timing) the java key sort
     * against the java collator sort.
     *
     * @throws Exception if the sort cannot be loaded or the java collation rules cannot be parsed.
     */
    private void test() throws Exception {
        sort = SrtTextReader.sortForCodepage(unicode ? 65001 : 1252);

        //testPairs();

        Charset charset = sort.getCharset();

        // Fixed seed so that every run works on the same data.
        Random rand = new Random(21909278L);

        List<String> list = createList(rand, charset);

        if (time) {
            // Run each sort a few times first, to warm up
            compareLists(sortWithKeys(list), sortWithKeys(list));
            compareLists(sortWithCollator(list), sortWithCollator(list));
            compareLists(sortWithJavaKeys(list), sortWithJavaKeys(list));
            compareLists(sortWithJavaCollator(list), sortWithJavaCollator(list));

            // re-create the list to make sure it wasn't too optimised to the data
            list = createList(rand, charset);
        }

        System.out.println("Compare key sort and collator sort");
        int n = compareLists(sortWithKeys(list), sortWithCollator(list));
        System.out.println("N errors " + n);

        if (!unicode) {
            System.out.println("Compare our sort with java sort");
            n = compareLists(sortWithKeys(list), sortWithJavaKeys(list));
            System.out.println("N errors " + n);
        }

        if (time) {
            System.out.println("Compare java keys with java collator");
            n = compareLists(sortWithJavaKeys(list), sortWithJavaCollator(list));
            System.out.println("N errors " + n);
        }
    }

    /**
     * Creates an unmodifiable list of {@link #LIST_SIZE} random strings, each between two and
     * six characters long.
     *
     * @param rand source of randomness; the order of draws determines the generated data.
     * @param charset used to decode the random bytes in the non-unicode case.
     * @return the generated strings.
     */
    private List<String> createList(Random rand, Charset charset) {
        List<String> list = new ArrayList<>(LIST_SIZE);

        for (int n = 0; n < LIST_SIZE; n++) {
            int len = rand.nextInt(6) + 1;
            if (len < 2) {
                len = rand.nextInt(5) + 2;
            }

            if (unicode) {
                list.add(randomUnicodeString(rand, len));
            } else {
                list.add(randomCodepageString(rand, len, charset));
            }
        }

        return Collections.unmodifiableList(list);
    }

    /**
     * Builds a random string of {@code len} chars. Roughly three draws in ten pick from the
     * range 0..1535, the rest from 0..255; characters refused by {@link #reject} are redrawn.
     */
    private String randomUnicodeString(Random rand, int len) {
        char[] chars = new char[len];
        for (int i = 0; i < len; i++) {
            int ch;
            do {
                if (rand.nextInt(10) > 6) {
                    ch = rand.nextInt(6 * 256);
                } else {
                    ch = rand.nextInt(256);
                }
            } while (reject(rand, ch));
            chars[i] = (char) ch;
        }
        return new String(chars);
    }

    /**
     * Builds a random string by drawing {@code len} byte values that pass {@link #reject} and
     * decoding them with the given charset.
     */
    private String randomCodepageString(Random rand, int len, Charset charset) {
        byte[] bytes = new byte[len];
        for (int i = 0; i < len; i++) {
            int ch;
            do {
                ch = rand.nextInt(256);
                // reject unassigned. Also low chars most of the time
            } while (reject(rand, ch));
            bytes[i] = (byte) ch;
        }
        return new String(bytes, charset);
    }

    /**
     * Compares two sorted lists position by position and prints the differing rows (or all rows
     * when full output is requested).
     *
     * @param r1 the first sorted list.
     * @param r2 the second sorted list.
     * @return the number of positions at which the lists differ; positions that exist in only
     * one of the lists are counted as differences.
     */
    private int compareLists(List<String> r1, List<String> r2) {
        // Use the real sizes rather than LIST_SIZE so a short list cannot cause an
        // IndexOutOfBoundsException and extra entries are not silently ignored.
        int common = Math.min(r1.size(), r2.size());
        int count = Math.abs(r1.size() - r2.size());

        for (int i = 0; i < common; i++) {
            String s1 = r1.get(i);
            String s2 = r2.get(i);

            String mark = "";
            if (!s1.equals(s2)) {
                mark = "*";
                count++;
            }

            if (fullOutput || (!mark.isEmpty() && !quiet)) {
                System.out.printf("%6d |%-10s |%-10s %s\n", i, s1, s2, mark);
            }
        }
        return count;
    }

    /**
     * Decides whether a randomly drawn character value should be discarded and drawn again.
     *
     * @param rand source of randomness for the probabilistic checks.
     * @param ch the candidate character value.
     * @return true if the character must not be used in a test string.
     */
    private boolean reject(Random rand, int ch) {
        switch (ch) {
        case 0:
        case ' ':
        case '\n': case '\r':
        case 0x81: case 0x8d: case 0x8f:
        case 0x90: case 0x9d:
            return true;
        }

        switch (Character.getType(ch)) {
        case Character.UNASSIGNED:
            return true;
        case Character.CONTROL:
            return true;
        }

        // Reject low characters most of the time
        // NOTE(review): values below 0x20 appear to be classified as CONTROL by
        // Character.getType and so would already have been rejected above - confirm whether
        // low characters were meant to get through occasionally.
        if (ch < 0x20 && rand.nextInt(100) < 95) {
            return true;
        }

        // NOTE(review): nextInt(100) yields 0..99, so this comparison can never be true and no
        // character above 255 is rejected here. The draw is kept so that the sequence of random
        // numbers, and therefore the generated test data, stays the same.
        if (ch > 255 && rand.nextInt(100) > 99) {
            return true;
        }

        return false;
    }

    /**
     * Sorts a copy of the list using sort keys created by our {@link Sort} and prints the time
     * taken to create and sort the keys.
     *
     * @param list the strings to sort; not modified.
     * @return a new list holding the strings in sorted order.
     */
    private List<String> sortWithKeys(List<String> list) {
        long start = System.nanoTime();
        List<SortKey<String>> keys = new ArrayList<>();
        for (String s : list) {
            keys.add(sort.createSortKey(s, s));
        }
        Collections.sort(keys);
        long elapsed = elapsedMillis(start);

        // Unwrapping the keys is not part of the timed section.
        List<String> ret = new ArrayList<>(keys.size());
        for (SortKey<String> key : keys) {
            ret.add(key.getObject());
        }
        System.out.println("time keys: " + elapsed + "ms");
        return ret;
    }

    /**
     * Sorts a copy of the list using the collator supplied by our {@link Sort} and prints the
     * time taken.
     *
     * @param list the strings to sort; not modified.
     * @return a new list holding the strings in sorted order.
     */
    private List<String> sortWithCollator(List<String> list) {
        long start = System.nanoTime();
        List<String> ret = new ArrayList<>(list);
        Collections.sort(ret, sort.getCollator());
        System.out.println("time coll: " + elapsedMillis(start) + "ms");
        return ret;
    }

    /**
     * Sorts a copy of the list using collation keys from the java {@link RuleBasedCollator} and
     * prints the time taken to build the collator, create the keys and sort them.
     *
     * @param list the strings to sort; not modified.
     * @return a new list holding the strings in sorted order.
     * @throws ParseException if the collation rules are not valid.
     */
    private List<String> sortWithJavaKeys(List<String> list) throws ParseException {
        long start = System.nanoTime();
        List<CollationKey> keys = new ArrayList<>();
        Collator jcol = createJavaCollator();
        for (String s : list) {
            keys.add(jcol.getCollationKey(s));
        }
        Collections.sort(keys);
        long elapsed = elapsedMillis(start);

        // Unwrapping the keys is not part of the timed section.
        List<String> ret = new ArrayList<>(keys.size());
        for (CollationKey key : keys) {
            ret.add(key.getSourceString());
        }
        System.out.println("time J keys: " + elapsed + "ms");
        return ret;
    }

    /**
     * Sorts a copy of the list directly with the java {@link RuleBasedCollator} and prints the
     * time taken to build the collator and sort.
     *
     * @param list the strings to sort; not modified.
     * @return a new list holding the strings in sorted order.
     * @throws ParseException if the collation rules are not valid.
     */
    private List<String> sortWithJavaCollator(List<String> list) throws ParseException {
        long start = System.nanoTime();
        List<String> out = new ArrayList<>(list);
        Collator jcol = createJavaCollator();
        Collections.sort(out, jcol);
        System.out.println("time J collator: " + elapsedMillis(start) + "ms");
        return out;
    }

    /**
     * Creates the java collator used by both java based sorts.
     *
     * A rule error is propagated to the caller: carrying on with a missing result would only
     * fail later, when the lists are compared.
     *
     * @return a collator built from {@link #getRules(boolean)}.
     * @throws ParseException if the collation rules are not valid.
     */
    private Collator createJavaCollator() throws ParseException {
        Collator jcol = new RuleBasedCollator(getRules(false));
        // Set explicitly so that both java based sorts are configured identically.
        jcol.setStrength(Collator.TERTIARY);
        return jcol;
    }

    /**
     * Returns the number of whole milliseconds that have passed since the given
     * {@link System#nanoTime()} reading.
     */
    private static long elapsedMillis(long startNanos) {
        return (System.nanoTime() - startNanos) / NANOS_PER_MILLI;
    }

    /**
     * Returns the collation rules used to build the rule based collators.
     *
     * @param forICU if true the apostrophe is written with a backslash escape, otherwise it is
     * written as a quoted apostrophe.
     * @return the rule string.
     */
    private String getRules(boolean forICU) {
        return "='\u0008'='\u000e'='\u000f'='\u0010'='\u0011'='\u0012'='\u0013'='\u0014'='\u0015'='\u0016'"
                + "='\u0017' ='\u0018' = '\u0019' ='\u001a' ='\u001b'= '\u001c' ='\u001d'= '\u001e'= '\u001f' "
                + "='\u007f' ='\u00ad'"
                + ", '\u0001', '\u0002', '\u0003', '\u0004' ,'\u0005' ,'\u0006', '\u0007'"
                + "< '\u0009' < '\n' < '\u000b' < '\u000c' < '\r' < '\u0020','\u00a0'"
                + "< '_' < '-' < '–' < '—' < '\u002c' < '\u003b' < ':' < '!' < '¡' < '?' < '¿'"
                + "< '.' < '·' "
                + ((forICU)? "< \\' ": "< ''' ")
                + "< '‘' < '’' < '‚' < '‹' < '›' < '“' < '”' < '„' < '«' < '»' "
                + " < '\"' "
                + "< '“' < '”' < '„' < '«'< '»' < '(' < ')' "
                + "< '[' < ']' < '{' < '}' < '§' < '¶' < '@' < '*' < '/' < '\\' < '&' < '#' < '%'"
                + "< '‰' < '†' < '‡' < '•' < '`' < '´' < '^' < '¯' < '¨' < '¸' < 'ˆ' < '°' < '©' < '®'"
                + "< '+' < '±' < '÷' < '×' < '\u003c' < '\u003d' < '>' < '¬' < '|' < '¦' < '~' ; '˜' < '¤'"
                + "< '¢' < '$' < '£' < '¥' < '€' < 0 < 1,¹ < 2,² < 3,³ < 4 < 5 < 6 < 7 < 8 < 9"
                // Every accented letter needs both cases; an entry with no characters is a rule error.
                + "< a,ª,A ; á,Á ; à,À ; â,Â ; å,Å ; ä,Ä ; ã,Ã"
                + "< b,B"
                + "< c,C ; ç,Ç"
                + "< d,D ; ð,Ð"
                + "< e,E ; é,É ; è,È ; ê,Ê ; ë,Ë"
                + "< f,F"
                + "< ƒ"
                + "< g,G"
                + "< h,H"
                + "< i,I ; í,Í ; ì,Ì ; î,Î ; ï,Ï"
                + "< j,J"
                + "< k,K"
                + "< l,L"
                + "< m,M"
                + "< n,N ; ñ,Ñ"
                + "< o,º,O ; ó,Ó ; ò,Ò ; ô,Ô ; ö,Ö ; õ,Õ ; ø,Ø"
                + "< p,P"
                + "< q,Q"
                + "< r,R"
                + "< s,S ; š,Š"
                + "< t,T"
                + "< u,U ; ú,Ú ; ù,Ù ; û,Û ; ü,Ü"
                + "< v,V"
                + "< w,W"
                + "< x,X"
                + "< y,Y ; ý,Ý ; ÿ,Ÿ"
                + "< z,Z ; ž,Ž"
                + "< þ,Þ"
                + "< µ"
                + "&'1/4'=¼ &'1/2'=½ &'3/4'=¾"
                + "&ae = æ &AE = Æ &ss = ß &OE= Œ &oe= œ &TM = ™ &'...' = … "
                ;
    }

    /**
     * Command line entry point.
     *
     * Recognised options are {@code --time}, {@code --full}, {@code --quiet} and
     * {@code --unicode}; anything else is reported on standard error and ignored.
     *
     * @param args the command line options.
     * @throws Exception if the test run fails.
     */
    public static void main(String[] args) throws Exception {
        SortTest sortTest = new SortTest();
        for (String arg : args) {
            switch (arg) {
            case "--time":
                sortTest.time = true;
                break;
            case "--full":
                sortTest.fullOutput = true;
                break;
            case "--quiet":
                sortTest.quiet = true;
                break;
            case "--unicode":
                sortTest.unicode = true;
                break;
            default:
                System.err.println("Ignoring unknown option: " + arg);
                break;
            }
        }

        sortTest.test();
    }
}