/* * Copyright (C) 2009 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.android.tools.dict; import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; /** * Compresses a list of words and frequencies into a tree structured binary dictionary. */ public class MakeBinaryDictionary { public static final int ALPHA_SIZE = 256; public static final String TAG_WORD = "w"; public static final String ATTR_FREQ = "f"; private static final int FLAG_ADDRESS_MASK = 0x400000; private static final int FLAG_TERMINAL_MASK = 0x800000; private static final int ADDRESS_MASK = 0x3FFFFF; public static final CharNode EMPTY_NODE = new CharNode(); List<CharNode> roots; Map<String, Integer> mDictionary; int mWordCount; static class CharNode { char data; int freq; boolean terminal; List<CharNode> children; static int sNodes; public CharNode() { sNodes++; } } public static void usage() { System.err.println("Usage: makedict <src.xml> <dest.dict>"); System.exit(-1); } public static void main(String[] args) { if (args.length < 2) { usage(); } else { new MakeBinaryDictionary(args[0], args[1]); } } public MakeBinaryDictionary(String srcFilename, String destFilename) { populateDictionary(srcFilename); writeToDict(destFilename); // Enable the code below to verify that the generated tree is traversable. if (false) { traverseDict(0, new char[32], 0); } } private void populateDictionary(String filename) { roots = new ArrayList<CharNode>(); try { SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); parser.parse(new File(filename), new DefaultHandler() { boolean inWord; int freq; @Override public void startElement(String uri, String localName, String qName, Attributes attributes) { if (qName.equals("w")) { inWord = true; freq = Integer.parseInt(attributes.getValue(0)); } } @Override public void characters(char[] data, int offset, int length) { // Ignore other whitespace if (!inWord) return; // Ignore one letter words if (length < 2) return; mWordCount++; String word = new String(data, offset, length); addWordTop(word, freq); } @Override public void endElement(String uri, String localName, String qName) { if (qName.equals("w")) inWord = false; } }); } catch (Exception ioe) { System.err.println("Exception in parsing\n" + ioe); ioe.printStackTrace(); } System.out.println("Nodes = " + CharNode.sNodes); } private int indexOf(List<CharNode> children, char c) { if (children == null) { return -1; } for (int i = 0; i < children.size(); i++) { if (children.get(i).data == c) { return i; } } return -1; } private void addWordTop(String word, int occur) { if (occur > 255) occur = 255; char firstChar = word.charAt(0); int index = indexOf(roots, firstChar); if (index == -1) { CharNode newNode = new CharNode(); newNode.data = firstChar; newNode.freq = occur; index = roots.size(); roots.add(newNode); } else { roots.get(index).freq += occur; } if (word.length() > 1) { addWordRec(roots.get(index), word, 1, occur); } else { roots.get(index).terminal = true; } } private void addWordRec(CharNode parent, String word, int charAt, int occur) { CharNode child = null; char data = word.charAt(charAt); if (parent.children == null) { parent.children = new ArrayList<CharNode>(); } else { for (int i = 0; i < parent.children.size(); i++) { CharNode node = parent.children.get(i); if (node.data == data) { child = node; break; } } } if (child == null) { child = new CharNode(); parent.children.add(child); } child.data = data; if (child.freq == 0) child.freq = occur; if (word.length() > charAt + 1) { addWordRec(child, word, charAt + 1, occur); } else { child.terminal = true; child.freq = occur; } } byte[] dict; int dictSize; static final int CHAR_WIDTH = 8; static final int FLAGS_WIDTH = 1; // Terminal flag (word end) static final int ADDR_WIDTH = 23; // Offset to children static final int FREQ_WIDTH_BYTES = 1; static final int COUNT_WIDTH_BYTES = 1; private void addCount(int count) { dict[dictSize++] = (byte) (0xFF & count); } private void addNode(CharNode node) { int charData = 0xFFFF & node.data; if (charData > 254) { dict[dictSize++] = (byte) 255; dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF); dict[dictSize++] = (byte) (node.data & 0xFF); } else { dict[dictSize++] = (byte) (0xFF & node.data); } if (node.children != null) { dictSize += 3; // Space for children address } else { dictSize += 1; // Space for just the terminal/address flags } if ((0xFFFFFF & node.freq) > 255) { node.freq = 255; } if (node.terminal) { byte freq = (byte) (0xFF & node.freq); dict[dictSize++] = freq; } } int nullChildrenCount = 0; int notTerminalCount = 0; private void updateNodeAddress(int nodeAddress, CharNode node, int childrenAddress) { if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character nodeAddress += 2; } childrenAddress = ADDRESS_MASK & childrenAddress; if (childrenAddress == 0) { nullChildrenCount++; } else { childrenAddress |= FLAG_ADDRESS_MASK; } if (node.terminal) { childrenAddress |= FLAG_TERMINAL_MASK; } else { notTerminalCount++; } dict[nodeAddress + 1] = (byte) (childrenAddress >> 16); if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) { dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8); dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF)); } } void writeWordsRec(List<CharNode> children) { if (children == null || children.size() == 0) { return; } final int childCount = children.size(); addCount(childCount); //int childrenStart = dictSize; int[] childrenAddresses = new int[childCount]; for (int j = 0; j < childCount; j++) { CharNode node = children.get(j); childrenAddresses[j] = dictSize; addNode(node); } for (int j = 0; j < childCount; j++) { CharNode node = children.get(j); int nodeAddress = childrenAddresses[j]; int cacheDictSize = dictSize; writeWordsRec(node.children); updateNodeAddress(nodeAddress, node, node.children != null ? cacheDictSize : 0); } } void writeToDict(String dictFilename) { // 4MB max, 22-bit offsets dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably // < 1MB in most cases, as there is a limit in the // resource size in apks. dictSize = 0; writeWordsRec(roots); System.out.println("Dict Size = " + dictSize); try { FileOutputStream fos = new FileOutputStream(dictFilename); fos.write(dict, 0, dictSize); fos.close(); } catch (IOException ioe) { System.err.println("Error writing dict file:" + ioe); } } void traverseDict(int pos, char[] word, int depth) { int count = dict[pos++] & 0xFF; for (int i = 0; i < count; i++) { char c = (char) (dict[pos++] & 0xFF); if (c == 0xFF) { c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF)); pos += 2; } word[depth] = c; boolean terminal = (dict[pos] & 0x80) > 0; int address = 0; if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { address = ((dict[pos + 0] & (FLAG_ADDRESS_MASK >> 16)) << 16) | ((dict[pos + 1] & 0xFF) << 8) | ((dict[pos + 2] & 0xFF)); pos += 2; } pos++; if (terminal) { showWord(word, depth + 1, dict[pos] & 0xFF); pos++; } if (address != 0) { traverseDict(address, word, depth + 1); } } } void showWord(char[] word, int size, int freq) { System.out.print(new String(word, 0, size) + " " + freq + "\n"); } }