// Copyright 2015 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.util;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import marmot.util.StringUtils.Mode;
import marmot.util.StringUtils.Shape;
/**
 * A {@link Lexicon} backed by a hash map from normalized word forms to count
 * arrays.
 *
 * <p>Each entry is an {@code int[]} of length {@code ARRAY_LENGTH} (declared in
 * {@link Lexicon}). The slot at a word's {@link Shape} ordinal accumulates the
 * counts seen for that shape, and the last slot accumulates the total over all
 * shapes. NOTE(review): this presumes {@code ARRAY_LENGTH} is larger than the
 * number of {@code Shape} constants; {@code Lexicon} is not visible here, so
 * confirm against its declaration.
 *
 * <p>The map is {@code transient} and is (de)serialized by hand in a compact
 * form that stores only the non-zero counts, each in a 16-bit field.
 */
public class HashLexicon implements Lexicon {
    private static final long serialVersionUID = 1L;

    /** Largest count that fits the unsigned 16-bit field of the serialized form. */
    private static final int MAX_SERIALIZED_COUNT = 0xFFFF;

    private transient Map<String, int[]> map_;
    private final Mode mode_;

    /**
     * Creates an empty lexicon.
     *
     * @param mode normalization mode applied to every word before it is used
     *             as a map key
     */
    public HashLexicon(Mode mode) {
        mode_ = mode;
        map_ = new HashMap<>();
    }

    /**
     * Builds the exception reported for a malformed unigram line.
     *
     * @param unigram_file path of the file being read, used in the message
     * @param line         the offending line as returned by the iterator
     * @param cause        underlying exception, or {@code null} if there is none
     */
    private static RuntimeException unigramFormatError(String unigram_file,
            List<String> line, Throwable cause) {
        return new RuntimeException(
                String.format(
                        "Line in file %s should be of format <WORD> [<COUNT>], but is \"%s\"",
                        unigram_file, line), cause);
    }

    /**
     * Throws a {@link RuntimeException} describing the expected line format
     * unless {@code condition} holds.
     */
    private static void checkUnigramLine(boolean condition,
            String unigram_file, List<String> line) {
        if (!condition) {
            throw unigramFormatError(unigram_file, line, null);
        }
    }

    /**
     * Reads a unigram file into a new lexicon that uses {@code Mode.lower}.
     *
     * <p>Every non-empty line must consist of one or two fields: the word and
     * an optional integer count, which defaults to 1. Entries whose count is
     * below {@code min_count} are skipped.
     *
     * @param path      file to read; it is handed to {@code LineIterator}
     * @param min_count minimal count a line needs in order to be added
     * @return the populated lexicon
     * @throws RuntimeException if a line has the wrong number of fields or its
     *                          count is not a valid integer
     */
    public static HashLexicon readFromFile(String path, int min_count) {
        LineIterator iterator = new LineIterator(path);
        HashLexicon unigram_lexicon = new HashLexicon(Mode.lower);

        while (iterator.hasNext()) {
            List<String> line = iterator.next();

            if (line.isEmpty()) {
                continue;
            }

            checkUnigramLine(line.size() == 1 || line.size() == 2, path, line);

            String word = line.get(0);
            int count = 1;
            if (line.size() > 1) {
                try {
                    count = Integer.parseInt(line.get(1));
                } catch (NumberFormatException e) {
                    // Same message as any other format error, but keep the
                    // parse failure attached as the cause.
                    throw unigramFormatError(path, line, e);
                }
            }

            if (count >= min_count) {
                unigram_lexicon.addEntry(word, count);
            }
        }

        return unigram_lexicon;
    }

    /**
     * Adds {@code value} to the counts of {@code word}.
     *
     * <p>The entry is looked up under the normalized form of the word, while
     * the shape is computed from the word as given. Both the slot of that
     * shape and the last (total) slot are incremented.
     *
     * @param word  the word form as observed
     * @param value amount to add; must not be {@code null}
     */
    public void addEntry(String word, Integer value) {
        String key = StringUtils.normalize(word, mode_);
        int[] current_value = map_.get(key);
        if (current_value == null) {
            current_value = new int[ARRAY_LENGTH];
            map_.put(key, current_value);
        }

        int amount = value;
        Shape shape = StringUtils.getShape(word);
        current_value[shape.ordinal()] += amount;
        current_value[ARRAY_LENGTH - 1] += amount;
    }

    /**
     * Looks up the count array stored for the normalized form of {@code word}.
     *
     * @return the internal array (not a copy), or {@code null} if the
     *         normalized word is not in the lexicon
     */
    public int[] getCount(String word) {
        String key = StringUtils.normalize(word, mode_);
        return map_.get(key);
    }

    /** Returns the number of distinct normalized words in the lexicon. */
    public int size() {
        return map_.size();
    }

    /**
     * Serializes the non-transient fields followed by the map in a sparse
     * layout: entry count, then per entry the key, the number of positive
     * counts and an (index, count) pair for each of them.
     *
     * <p>Counts are stored in 16 bits. Values above
     * {@link #MAX_SERIALIZED_COUNT} are saturated rather than silently wrapped
     * around, so a frequent word stays frequent after a round trip.
     */
    private void writeObject(ObjectOutputStream oos) throws IOException {
        oos.defaultWriteObject();
        oos.writeInt(map_.size());

        for (Map.Entry<String, int[]> entry : map_.entrySet()) {
            oos.writeUTF(entry.getKey());

            int[] counts = entry.getValue();
            int non_zeroes = 0;
            for (int count : counts) {
                if (count > 0) {
                    non_zeroes++;
                }
            }

            oos.writeShort(non_zeroes);
            for (int i = 0; i < counts.length; i++) {
                int count = counts[i];
                if (count > 0) {
                    oos.writeShort(i);
                    oos.writeShort(Math.min(count, MAX_SERIALIZED_COUNT));
                }
            }
        }
    }

    /**
     * Restores the non-transient fields and rebuilds the map from the layout
     * produced by {@link #writeObject(ObjectOutputStream)}.
     *
     * <p>Counts are read as unsigned 16-bit values; reading them as signed
     * would turn every count above 32767 into a negative number.
     *
     * @throws IOException if the stream cannot be read or contains a count
     *                     index outside the count array
     */
    private void readObject(ObjectInputStream ois)
            throws ClassNotFoundException, IOException {
        ois.defaultReadObject();

        int map_size = ois.readInt();
        map_ = new HashMap<>(map_size);

        for (int number = 0; number < map_size; number++) {
            String string = ois.readUTF();
            int[] counts = new int[Lexicon.ARRAY_LENGTH];

            int non_zeroes = ois.readShort();
            for (int non_zero = 0; non_zero < non_zeroes; non_zero++) {
                int index = ois.readShort();
                if (index < 0 || index >= counts.length) {
                    throw new IOException(String.format(
                            "Invalid count index %d for lexicon entry \"%s\"",
                            index, string));
                }
                counts[index] = ois.readUnsignedShort();
            }

            map_.put(string, counts);
        }
    }

    /**
     * Returns the backing map itself, not a copy; changes made through it are
     * visible to this lexicon.
     */
    public Map<String, int[]> getMap() {
        return map_;
    }
}