// Copyright 2013 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package experimental.morfessor; import marmot.util.Counter; import marmot.util.FileUtils; import java.io.BufferedReader; import java.io.IOException; import java.io.Serializable; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.Map.Entry; public class CharEncoder implements Serializable { private static final long serialVersionUID = 1L; private static final char UNKNOWN_ = '?'; private Map<Character, Character> encode_map_; private Map<Character, Character> decode_map_; public CharEncoder(Map<Character, Character> encode_map) { encode_map_ = encode_map; decode_map_ = new HashMap<Character, Character>(); for (Map.Entry<Character, Character> entry : encode_map.entrySet()) { decode_map_.put(entry.getValue(), entry.getKey()); } } public static CharEncoder loadFromFile(String filename) { Map<Character, Character> encode_map = new HashMap<Character, Character>(); try { BufferedReader reader = FileUtils.openFile(filename); while (reader.ready()) { String line = reader.readLine().trim(); if (line.isEmpty()) { continue; } char key = line.charAt(0); char value = line.charAt(2); encode_map.put(key, value); } } catch (IOException e) { throw new RuntimeException(e); } return new CharEncoder(encode_map); } public static CharEncoder fromVocab(Vocab vocab) { int length = ('z' - 'a') + ('Z' - 'A') + ('9' - '0') + 3; char[] alphabet = new char[length]; int current = 0; for (char index = 'a'; index <= 'z'; index++) alphabet[current++] = index; for (char index = 'A'; index <= 'Z'; index++) alphabet[current++] = index; for (char index = '0'; index <= '9'; index++) alphabet[current++] = index; assert length == current; Counter<Character> counter = new Counter<Character>(); for (Entry<String, Double> entry : vocab.entrySet()) { for (int index = 0; index < entry.getKey().length(); index++) { counter.increment(entry.getKey().charAt(index), entry.getValue()); } } Map<Character, Character> encode_map = new HashMap<>(); current = length - 1; Set<Character> alphabet_set = new HashSet<>(); for (char c : alphabet) { alphabet_set.add(c); } for (Entry<Character, Double> entry : counter.sortedEntries()) { if (alphabet_set.isEmpty()) { break; } char c = entry.getKey(); while (!alphabet_set.contains(c)) { c = alphabet[current--]; } alphabet_set.remove(c); encode_map.put(entry.getKey(), c); } return new CharEncoder(encode_map); } public String encode(String word) { StringBuilder sb = new StringBuilder(); for (int index = 0; index < word.length(); index++) { Character c = encode_map_.get(word.charAt(index)); if (c == null) { sb.append(UNKNOWN_); } else { sb.append(c); } } return sb.toString(); } public String decode(String word) { StringBuilder sb = new StringBuilder(); for (int index = 0; index < word.length(); index++) { Character c = decode_map_.get(word.charAt(index)); if (c == null) { sb.append(UNKNOWN_); } else { sb.append(c); } } return sb.toString(); } }