/* * Copyright (C) 2010, 2011. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 3 or * version 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ package uk.me.parabola.imgfmt.app.srt; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.nio.charset.CodingErrorAction; import java.text.CollationKey; import java.text.Collator; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import uk.me.parabola.imgfmt.ExitException; import uk.me.parabola.imgfmt.app.Label; /** * Represents the sorting positions for all the characters in a codepage. * * A map contains a file that determines how the characters are to be sorted. So we * have to have to be able to create such a file and sort with exactly the same rules * as is contained in it. * * What about the java {@link java.text.RuleBasedCollator}? It turns out that it is possible to * make it work in the way we need it to, although it doesn't help with creating the srt file. * Also it is significantly slower than this implementation, so this one is staying. I also * found that sorting with the sort keys and the collator gave different results in some * cases. This implementation does not. * * Be careful when benchmarking. With small lists (< 10000 entries) repeated runs cause some * pretty aggressive optimisation to kick in. This tends to favour this implementation which has * much tighter loops that the java7 or ICU implementations, but this may not be realised with * real workloads. * * @author Steve Ratcliffe */ public class Sort { private static final byte[] ZERO_KEY = new byte[4]; private static final Integer NO_ORDER = 0; private int codepage; private int id1; // Unknown - identifies the sort private int id2; // Unknown - identifies the sort private String description; private Charset charset; private final Page[] pages = new Page[256]; private final List<CodePosition> expansions = new ArrayList<>(); private int maxExpSize = 1; private CharsetEncoder encoder; private boolean multi; private int maxPage; public Sort() { pages[0] = new Page(); } public void add(int ch, int primary, int secondary, int tertiary, int flags) { ensurePage(ch >>> 8); if (getPrimary(ch) != 0) throw new ExitException(String.format("Repeated primary index 0x%x", ch & 0xff)); setPrimary (ch, primary); setSecondary(ch, secondary); setTertiary( ch, tertiary); setFlags(ch, flags); } /** * Run after all sorting order points have been added. * * Make sure that all tertiary values of secondary ignorable are greater * than any normal tertiary value. * * And the same for secondaries on primary ignorable. */ public void finish() { int maxSecondary = 0; int maxTertiary = 0; for (Page p : pages) { if (p == null) continue; for (int i = 0; i < 256; i++) { if (((p.flags[i] >>> 4) & 0x3) == 0) { if (p.getPrimary(i) != 0) { byte second = p.getSecondary(i); maxSecondary = Math.max(maxSecondary, second); if (second != 0) { maxTertiary = Math.max(maxTertiary, p.getTertiary(i)); } } } } } for (Page p : pages) { if (p == null) continue; for (int i = 0; i < 256; i++) { if (((p.flags[i] >>> 4) & 0x3) != 0) continue; if (p.getPrimary(i) == 0) { if (p.getSecondary(i) == 0) { if (p.getTertiary(i) != 0) { p.setTertiary(i, p.getTertiary(i) + maxTertiary); } } else { p.setSecondary(i, p.getSecondary(i) + maxSecondary); } } } } } /** * Return a table indexed by a character value in the target codepage, that gives the complete sort * position of the character. * * This is only used for testing. * * @return A table of sort positions. */ public char[] getSortPositions() { char[] tab = new char[256]; for (int i = 1; i < 256; i++) { tab[i] = (char) (((getPrimary(i) << 8) & 0xff00) | ((getSecondary(i) << 4) & 0xf0) | (getTertiary(i) & 0xf)); } return tab; } /** * Create a sort key for a given unicode string. The sort key can be compared instead of the original strings * and will compare based on the sorting represented by this Sort class. * * Using a sort key is more efficient if many comparisons are being done (for example if you are sorting a * list of strings). * * @param object This is saved in the sort key for later retrieval and plays no part in the sorting. * @param s The string for which the sort key is to be created. * @param second Secondary sort key. * @param cache A cache for the created keys. This is for saving memory so it is essential that this * is managed by the caller. * @return A sort key. */ public <T> SortKey<T> createSortKey(T object, String s, int second, Map<String, byte[]> cache) { // If there is a cache then look up and return the key. // This is primarily for memory management, not for speed. byte[] key; if (cache != null) { key = cache.get(s); if (key != null) return new SrtSortKey<>(object, key, second); } try { char[] chars; if (isMulti()) { chars = s.toCharArray(); } else { ByteBuffer out = encoder.encode(CharBuffer.wrap(s)); byte[] bval = out.array(); chars = new char[bval.length]; for (int i = 0; i < bval.length; i++) chars[i] = (char) (bval[i] & 0xff); } // In theory you could have a string where every character expands into maxExpSize separate characters // in the key. However if we allocate enough space to deal with the worst case, then we waste a // vast amount of memory. So allocate a minimal amount of space, try it and if it fails reallocate the // maximum amount. // // We need +1 for the null bytes, we also +2 for a couple of expanded characters. For a complete // german map this was always enough in tests. key = new byte[(chars.length + 1 + 2) * 4]; try { fillCompleteKey(chars, key); } catch (ArrayIndexOutOfBoundsException e) { // Ok try again with the max possible key size allocated. key = new byte[(chars.length+1) * 4 * maxExpSize]; fillCompleteKey(chars, key); } if (cache != null) cache.put(s, key); return new SrtSortKey<>(object, key, second); } catch (CharacterCodingException e) { return new SrtSortKey<>(object, ZERO_KEY); } } /** * Create a sort key based on a Label. * * The label will contain the actual characters (after transliteration for example) * @param object This is saved in the sort key for later retrieval and plays no part in the sorting. * @param label The label, the actual written bytes/chars will be used as input to the sort. * @param second Secondary sort key. * @param cache A cache for the created keys. This is for saving memory so it is essential that this * is managed by the caller. * @return A sort key. */ public <T> SortKey<T> createSortKey(T object, Label label, int second, Map<Label, byte[]> cache) { byte[] key; if (cache != null) { key = cache.get(label); if (key != null) return new SrtSortKey<>(object, key, second); } char[] encText = label.getEncText(); // In theory you could have a string where every character expands into maxExpSize separate characters // in the key. However if we allocate enough space to deal with the worst case, then we waste a // vast amount of memory. So allocate a minimal amount of space, try it and if it fails reallocate the // maximum amount. // // We need +1 for the null bytes, we also +2 for a couple of expanded characters. For a complete // german map this was always enough in tests. key = new byte[(encText.length + 1 + 2) * 4]; try { fillCompleteKey(encText, key); } catch (ArrayIndexOutOfBoundsException e) { // Ok try again with the max possible key size allocated. key = new byte[encText.length * 4 * maxExpSize + 4]; fillCompleteKey(encText, key); } if (cache != null) cache.put(label, key); return new SrtSortKey<>(object, key, second); } /** * Convenient version of create sort key method. * @see #createSortKey(Object, String, int, Map) */ public <T> SortKey<T> createSortKey(T object, String s, int second) { return createSortKey(object, s, second, null); } /** * Convenient version of create sort key method. * * @see #createSortKey(Object, String, int, Map) */ public <T> SortKey<T> createSortKey(T object, String s) { return createSortKey(object, s, 0, null); } public <T> SortKey<T> createSortKey(T object, Label label) { return createSortKey(object, label, 0, null); } public <T> SortKey<T> createSortKey(T object, Label label, int second) { return createSortKey(object, label, second, null); } /** * Fill in the key from the given byte string. * * @param bVal The string for which we are creating the sort key. * @param key The sort key. This will be filled in. */ private void fillCompleteKey(char[] bVal, byte[] key) { int start = fillKey(Collator.PRIMARY, bVal, key, 0); start = fillKey(Collator.SECONDARY, bVal, key, start); fillKey(Collator.TERTIARY, bVal, key, start); } /** * Fill in the output key for a given strength. * * @param input The input string in a particular 8 bit codepage. * @param outKey The output sort key. * @param start The index into the output key to start at. * @return The next position in the output key. */ private int fillKey(int type, char[] input, byte[] outKey, int start) { int index = start; for (char c : input) { if (!hasPage(c >>> 8)) continue; int exp = (getFlags(c) >> 4) & 0x3; if (exp == 0) { index = writePos(type, c, outKey, index); } else { // now have to redirect to a list of input chars, get the list via the primary value always. int idx = getPrimary(c); for (int i = idx - 1; i < idx + exp; i++) { int pos = expansions.get(i).getPosition(type); if (pos != 0) { if (type == Collator.PRIMARY) outKey[index++] = (byte) ((pos >>> 8) & 0xff); outKey[index++] = (byte) pos; } } } } if (type == Collator.PRIMARY) outKey[index++] = '\0'; outKey[index++] = '\0'; return index; } public int getPrimary(int ch) { return this.pages[ch >>> 8].getPrimary(ch); } public int getSecondary(int ch) { return this.pages[ch >>> 8].getSecondary(ch); } public int getTertiary(int ch) { return this.pages[ch >>> 8].getTertiary(ch); } public byte getFlags(int ch) { assert ch >= 0; return this.pages[ch >>> 8].flags[ch & 0xff]; } public int getCodepage() { return codepage; } public Charset getCharset() { return charset; } public int getId1() { return id1; } public void setId1(int id1) { this.id1 = id1; } public int getId2() { return id2; } public void setId2(int id2) { this.id2 = id2 & 0x7fff; } /** * Get the sort order as a single integer. * A combination of id1 and id2. I think that they are arbitrary so may as well treat them as one. * * @return id1 and id2 as if they were a little endian 2 byte integer. */ public int getSortOrderId() { return (this.id2 << 16) + (this.id1 & 0xffff); } /** * Set the sort order as a single integer. * @param id The sort order id. */ public void setSortOrderId(int id) { id1 = id & 0xffff; id2 = (id >>> 16) & 0x7fff; } public void setCodepage(int codepage) { this.codepage = codepage; charset = charsetFromCodepage(codepage); encoder = charset.newEncoder(); encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); } public String getDescription() { return description; } public void setDescription(String description) { this.description = description; } /** * Add an expansion to the sort. * An expansion is a letter that sorts as if it were two separate letters. * * The case were two letters sort as if the were just one (and more complex cases) are * not supported or are unknown to us. * * @param ch The code point of this letter in the code page. * @param inFlags The initial flags, eg if it is a letter or not. * @param expansionList The letters that this letter sorts as, as code points in the codepage. */ public void addExpansion(int ch, int inFlags, List<Integer> expansionList) { ensurePage(ch >>> 8); setFlags(ch, (byte) ((inFlags & 0xf) | (((expansionList.size()-1) << 4) & 0xf0))); // Check for repeated definitions if (getPrimary(ch) != 0) throw new ExitException(String.format("repeated code point %x", ch)); setPrimary(ch, (expansions.size() + 1)); setSecondary(ch, 0); setTertiary(ch, 0); maxExpSize = Math.max(maxExpSize, expansionList.size()); for (Integer b : expansionList) { CodePosition cp = new CodePosition(); cp.setPrimary((char) getPrimary(b & 0xff)); // Currently sort without secondary or tertiary differences to the base letters. cp.setSecondary((byte) getSecondary(b & 0xff)); cp.setTertiary((byte) getTertiary(b & 0xff)); expansions.add(cp); } } /** * Get the expansion with the given index, one based. * @param val The one-based index number of the extension. */ public CodePosition getExpansion(int val) { return expansions.get(val - 1); } public Collator getCollator() { return new SrtCollator(codepage); } public int getExpansionSize() { return expansions.size(); } public String toString() { return String.format("sort cp=%d order=%08x", codepage, getSortOrderId()); } private void setPrimary(int ch, int val) { this.pages[ch >>> 8].setPrimary(ch, val); } private void setSecondary(int ch, int val) { this.pages[ch >>> 8].setSecondary(ch, val); } private void setTertiary(int ch, int val) { this.pages[ch >>> 8].setTertiary(ch, val); } private void setFlags(int ch, int val) { this.pages[ch >>> 8].flags[ch & 0xff] = (byte) val; } public static Charset charsetFromCodepage(int codepage) { Charset charset; switch (codepage) { case 0: charset = Charset.forName("ascii"); break; case 65001: charset = Charset.forName("UTF-8"); break; case 932: // Java uses "ms932" for code page 932 // (Windows-31J, Shift-JIS + MS extensions) charset = Charset.forName("ms932"); break; default: charset = Charset.forName("cp" + codepage); break; } return charset; } public void setMulti(boolean multi) { this.multi = multi; } public boolean isMulti() { return multi; } public int getPos(int type, int ch) { return pages[ch >>> 8].getPos(type, ch); } public int writePos(int type, int ch, byte[] outkey, int start) { return pages[ch >>> 8].writePos(type, ch, outkey, start); } /** * Ensure that the given page exists in the page array. * * @param n The page index. */ private void ensurePage(int n) { assert n == 0 || isMulti(); if (this.pages[n] == null) { this.pages[n] = new Page(); if (n > maxPage) maxPage = n; } } /** * The max page, top 8+ bits of the character that we have information on. */ public int getMaxPage() { return maxPage; } /** * @return True if there is at least one character with the given page/block number. */ public boolean hasPage(int p) { return pages[p] != null; } /** * Holds the sort positions of a 256 character block. */ private static class Page { private final char[] primary = new char[256]; private final byte[] secondary = new byte[256]; private final byte[] tertiary = new byte[256]; private final byte[] flags = new byte[256]; char getPrimary(int ch) { return primary[ch & 0xff]; } void setPrimary(int ch, int val) { primary[ch & 0xff] = (char) val; } byte getSecondary(int ch) { return secondary[ch & 0xff]; } void setSecondary(int ch, int val) { secondary[ch & 0xff] = (byte) val; } byte getTertiary(int ch) { return tertiary[ch & 0xff]; } void setTertiary(int ch, int val) { tertiary[ch & 0xff] = (byte) val; } /** * Get the sort position data for a given strength for a character. * @param type The collation strength PRIMARY, SECONDARY etc. * @param ch The character. * @return The sorting weight for the given character. */ public int getPos(int type, int ch) { switch (type) { case Collator.PRIMARY: return getPrimary(ch) & 0xffff; case Collator.SECONDARY: return getSecondary(ch) & 0xff; case Collator.TERTIARY: return getTertiary(ch) & 0xff; default: assert false : "bad collation type passed"; return 0; } } /** * Write a sort position for a given character to a sort key. * @param strength The sort strength type. * @param ch The character. * @param outKey The output key. * @param start The offset into outKey, the new position is written here. * @return The new start offset, after the key information has been written. */ public int writePos(int strength, int ch, byte[] outKey, int start) { int pos = getPos(strength, ch); if (pos != 0) { if (strength == Collator.PRIMARY) outKey[start++] = (byte) ((pos >> 8) & 0xff); // for 2 byte charsets outKey[start++] = (byte) (pos & 0xff); } return start; } } /** * A collator that works with this sort. This should be used if you just need to compare two * strings against each other once. * * The sort key is better when the comparison must be done several times as in a sort operation. * * This implementation has the same effect when used for sorting as the sort keys. */ private class SrtCollator extends Collator { private final int codepage; private SrtCollator(int codepage) { this.codepage = codepage; } public int compare(String source, String target) { char[] chars1; char[] chars2; if (isMulti()) { chars1 = source.toCharArray(); chars2 = target.toCharArray(); } else { CharBuffer in1 = CharBuffer.wrap(source); CharBuffer in2 = CharBuffer.wrap(target); try { byte[] bytes1 = encoder.encode(in1).array(); byte[] bytes2 = encoder.encode(in2).array(); chars1 = new char[bytes1.length]; for (int i = 0; i < bytes1.length; i++) chars1[i] = (char) (bytes1[i] & 0xff); chars2 = new char[bytes2.length]; for (int i = 0; i < bytes2.length; i++) chars2[i] = (char) (bytes2[i] & 0xff); } catch (CharacterCodingException e) { throw new ExitException("character encoding failed unexpectedly", e); } } int strength = getStrength(); int res = compareOneStrength(chars1, chars2, Collator.PRIMARY); if (res == 0 && strength != PRIMARY) { res = compareOneStrength(chars1, chars2, Collator.SECONDARY); if (res == 0 && strength != SECONDARY) { res = compareOneStrength(chars1, chars2, Collator.TERTIARY); } } return res; } /** * Compare the bytes against primary, secondary or tertiary arrays. * @param char1 Bytes for the first string in the codepage encoding. * @param char2 Bytes for the second string in the codepage encoding. * @return Comparison result -1, 0 or 1. */ private int compareOneStrength(char[] char1, char[] char2, int type) { int res = 0; PositionIterator it1 = new PositionIterator(char1, type); PositionIterator it2 = new PositionIterator(char2, type); while (it1.hasNext() || it2.hasNext()) { int p1 = it1.next(); int p2 = it2.next(); if (p1 < p2) { res = -1; break; } else if (p1 > p2) { res = 1; break; } } return res; } public CollationKey getCollationKey(String source) { throw new UnsupportedOperationException("use Sort.createSortKey() instead"); } public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; SrtCollator that = (SrtCollator) o; if (codepage != that.codepage) return false; return true; } public int hashCode() { return codepage; } class PositionIterator implements Iterator<Integer> { private final char[] chars; private final int len; private final int type; private int pos; private int expStart; private int expEnd; private int expPos; PositionIterator(char[] chars, int type) { this.chars = chars; this.len = chars.length; this.type = type; } public boolean hasNext() { return pos < len || expPos != 0; } /** * Get the next sort order value for the input string. Does not ever return values * that are ignorable. Returns NO_ORDER at (and beyond) the end of the string, this * value sorts less than any other and so makes shorter strings sort first. * @return The next non-ignored sort position. At the end of the string it returns * NO_ORDER. */ public Integer next() { int next; if (expPos == 0) { do { if (pos >= len) { next = NO_ORDER; break; } // Get the first non-ignorable at this level int c = chars[(pos++ & 0xff)]; if (!hasPage(c >>> 8)) { next = 0; continue; } int nExpand = (getFlags(c) >> 4) & 0x3; // Check if this is an expansion. if (nExpand > 0) { expStart = getPrimary(c) - 1; expEnd = expStart + nExpand; expPos = expStart; next = expansions.get(expPos).getPosition(type); if (++expPos > expEnd) expPos = 0; } else { next = getPos(type, c); } } while (next == 0); } else { next = expansions.get(expPos).getPosition(type); if (++expPos > expEnd) expPos = 0; } return next; } public void remove() { throw new UnsupportedOperationException("remove not supported"); } } } }