package org.wikibrain.core.lang; import gnu.trove.set.TByteSet; import gnu.trove.set.hash.TByteHashSet; import org.apache.commons.lang3.ArrayUtils; import org.junit.Test; import org.wikibrain.conf.Configuration; import java.util.*; /** * @author Ari Weiland */ public class TestLangStorage { // @Test // public void buildStuff() throws IOException { // File file = new File("src/main/resources/tmp.txt"); // file.createNewFile(); // List<String> lines = new ArrayList<String>(); // for (int i=0; i<LanguageSet.ALL.size(); i++) { // String lang = "LANG_" + i; // lines.add("Tables.UNIVERSAL_SKELETAL_LINK." + lang + ","); // } // FileUtils.writeLines(file, lines, "\n"); // // file = new File("src/main/resources/db/universal-skeletal-link-create-tables.sql"); // lines = new ArrayList<String>(); // for (int i=0; i<LanguageSet.ALL.size(); i++) { // String lang = "lang_" + i; // lines.add(lang + " BOOLEAN NOT NULL,"); // } // FileUtils.writeLines(file, lines, "\n", true); // } private static final int TOTAL_LANGUAGES = LanguageSet.ALL.size(); @Test public void test() { LanguageSet languages = new LanguageSet(new Configuration().get().getStringList("languages.big-economies.langCodes")); byte[] bits = toByteArray(languages); LanguageSet output = getLanguageSet(bits); assert languages.equals(output); System.out.println(); for (int j=0; j<285; j++) { List<Language> langs = new ArrayList<Language>(); for (int i=0; i <= j; i++) { langs.add(Language.getById(new Random().nextInt(TOTAL_LANGUAGES) + 1)); } languages = new LanguageSet(langs); bits = toByteArray(languages); output = getLanguageSet(bits); assert languages.equals(output); } } public static byte[] toByteArray(LanguageSet langs) { TByteSet byteSet = new TByteHashSet(); Set<byte[]> extras = new HashSet<byte[]>(); for (Language l : langs) { short id = l.getId(); if (id < 256) { // id-1 because id ranges from 1 to >256 but byte ranges from -128 to 127 byteSet.add((byte) (id-128)); } else { byte[] temp = new byte[2]; temp[0] = (byte) -128; temp[1] = (byte) (id-255-128); extras.add(temp); } } byte[] output = byteSet.toArray(); for (byte[] b : extras) { output = ArrayUtils.addAll(output, b); } return output; } public static LanguageSet getLanguageSet(byte[] truncated) { Set<Language> languages = new HashSet<Language>(); boolean extra = false; for (byte b : truncated) { if (extra) { languages.add(Language.getById(b+128+255)); extra = false; } else if (b == -128) { extra = true; } else { languages.add(Language.getById(b+128)); } } return new LanguageSet(languages); } public static byte[] toByteBits(LanguageSet languages) { int index = 0; // 8 is the number of bits per byte byte[] langBits = new byte[TOTAL_LANGUAGES/8 + 1]; Arrays.fill(langBits, (byte) 0x0); for (int i=1; i <= TOTAL_LANGUAGES; i++) { byte temp = langBits[index]; temp = (byte) (temp << 1); if (languages.containsLanguage(Language.getById(i))) { temp = (byte) (temp | (byte) 0x1); } langBits[index] = temp; if (i%8 == 0) { index++; } } return langBits; } // public static LanguageSet getLanguageSet(byte[] langBits) { // // 8 is the number of bits per int // if (langBits.length != TOTAL_LANGUAGES/8 + 1) { // throw new IllegalArgumentException(); // } // byte[] copy = Arrays.copyOf(langBits, langBits.length); // List<Language> languages = new ArrayList<Language>(); // int index = copy.length - 1; // for (int i=TOTAL_LANGUAGES; i > 0; i--) { // if (i%8 == 0) { // index--; // } // byte temp = copy[index]; // if ((temp & 0x1) == 1) { // languages.add(Language.getById(i)); // } // temp = (byte) (temp >> 1); // copy[index] = temp; // } // return new LanguageSet(languages); // } public static short[] toShortBits(LanguageSet languages) { int index = 0; // 16 is the number of bits per byte short[] langBits = new short[TOTAL_LANGUAGES/16 + 1]; Arrays.fill(langBits, (short) 0x0); for (int i=1; i <= TOTAL_LANGUAGES; i++) { short temp = langBits[index]; temp = (short) (temp << 1); if (languages.containsLanguage(Language.getById(i))) { temp = (short) (temp | (short) 0x1); } langBits[index] = temp; if (i%16 == 0) { index++; } } return langBits; } public static LanguageSet getLanguageSet(short[] langBits) { // 16 is the number of bits per int if (langBits.length != TOTAL_LANGUAGES/16 + 1) { throw new IllegalArgumentException(); } short[] copy = Arrays.copyOf(langBits, langBits.length); List<Language> languages = new ArrayList<Language>(); int index = copy.length - 1; for (int i=TOTAL_LANGUAGES; i > 0; i--) { if (i%16 == 0) { index--; } short temp = copy[index]; if ((temp & 0x1) == 1) { languages.add(Language.getById(i)); } temp = (short) (temp >> 1); copy[index] = temp; } return new LanguageSet(languages); } public static int[] toIntBits(LanguageSet languages) { int index = 0; // 32 is the number of bits per byte int[] langBits = new int[TOTAL_LANGUAGES/32 + 1]; Arrays.fill(langBits, 0x0); for (int i=1; i <= TOTAL_LANGUAGES; i++) { int temp = langBits[index]; temp = temp << 1; if (languages.containsLanguage(Language.getById(i))) { temp = temp | 0x1; } langBits[index] = temp; if (i%32 == 0) { index++; } } return langBits; } public static LanguageSet getLanguageSet(int[] langBits) { // 32 is the number of bits per int if (langBits.length != TOTAL_LANGUAGES/32 + 1) { throw new IllegalArgumentException(); } int[] copy = Arrays.copyOf(langBits, langBits.length); List<Language> languages = new ArrayList<Language>(); int index = copy.length - 1; for (int i=TOTAL_LANGUAGES; i > 0; i--) { if (i%32 == 0) { index--; } int temp = copy[index]; if ((temp & 0x1) == 1) { languages.add(Language.getById(i)); } temp = temp >> 1; copy[index] = temp; } return new LanguageSet(languages); } public static long[] toLongBits(LanguageSet languages) { int index = 0; // 64 is the number of bits per byte long[] langBits = new long[TOTAL_LANGUAGES/64 + 1]; Arrays.fill(langBits, 0x0L); for (int i=1; i <= TOTAL_LANGUAGES; i++) { long temp = langBits[index]; temp = temp << 1; if (languages.containsLanguage(Language.getById(i))) { temp = temp | 0x1L; } langBits[index] = temp; if (i%64 == 0) { index++; } } return langBits; } public static LanguageSet getLanguageSet(long[] langBits) { // 64 is the number of bits per int if (langBits.length != TOTAL_LANGUAGES/64 + 1) { throw new IllegalArgumentException(); } long[] copy = Arrays.copyOf(langBits, langBits.length); List<Language> languages = new ArrayList<Language>(); int index = copy.length - 1; for (int i=TOTAL_LANGUAGES; i > 0; i--) { if (i%64 == 0) { index--; } long temp = copy[index]; if ((temp & 0x1L) == 1) { languages.add(Language.getById(i)); } temp = temp >> 1; copy[index] = temp; } return new LanguageSet(languages); } }