// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.

package experimental.morfessor;

import java.io.IOException;
import java.io.Writer;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import marmot.util.Counter;
import marmot.util.FileUtils;
import marmot.util.LineIterator;
import marmot.util.StringUtils;
import marmot.util.StringUtils.Mode;

/**
 * Weighted vocabulary built from a count file.
 *
 * <p>Each input record supplies a numeric count (field 0) and a piece of text
 * (field 1). The text is split into tokens by {@link #tokenize(String)},
 * single-character non-alphanumeric tokens are dropped, the remaining tokens
 * are normalized with {@code Mode.lower}, and the record's count is added to
 * every resulting token.
 */
public class Vocab {

    /** Accumulated weight per normalized token. */
    private final Counter<String> counter_;

    /**
     * Reads the whole count file and reports the resulting vocabulary size on
     * standard error.
     *
     * @param filename path of the count file to read
     */
    public Vocab(String filename) {
        counter_ = new Counter<String>();
        init(filename, -1);
        System.err.println("|Vocab| = " + counter_.size());
    }

    /**
     * Splits {@code word} into maximal runs of letters/digits and individual
     * non-alphanumeric characters. Whitespace separates tokens and is never
     * emitted itself.
     *
     * <p>The scan works on UTF-16 code units ({@code char}). Surrogate code
     * units are neither letters, digits nor whitespace, so a supplementary
     * character is emitted as two separate one-{@code char} tokens.
     *
     * @param word text to split; must not be {@code null}
     * @return the tokens in order of appearance (possibly empty)
     */
    public static List<String> tokenize(String word) {
        List<String> list = new LinkedList<String>();
        StringBuilder sb = new StringBuilder(word.length());

        for (int index = 0; index < word.length(); index++) {
            char c = word.charAt(index);

            if (Character.isDigit(c) || Character.isLetter(c)) {
                sb.append(c);
                continue;
            }

            // A non-alphanumeric character ends the current run, if any.
            if (sb.length() > 0) {
                list.add(sb.toString());
                sb.setLength(0);
            }

            // Punctuation and symbols become tokens of their own.
            if (!Character.isWhitespace(c)) {
                list.add(Character.toString(c));
            }
        }

        // Flush a run that extends to the end of the input.
        if (sb.length() > 0) {
            list.add(sb.toString());
        }

        return list;
    }

    /**
     * Fills {@link #counter_} from the count file.
     *
     * @param filename path of the count file
     * @param limit maximum number of records to read; a negative value means
     *        "no limit"
     */
    private void init(String filename, int limit) {
        int lines = 0;
        // NOTE(review): how LineIterator splits a line into fields is not
        // visible here; field 0 must parse as a double, field 1 is the text.
        LineIterator iterator = new LineIterator(filename);

        while (iterator.hasNext() && (limit < 0 || lines < limit)) {
            List<String> line = iterator.next();
            double count = Double.parseDouble(line.get(0));

            for (String word : tokenize(line.get(1))) {
                if (isSpecial(word)) {
                    continue;
                }
                word = StringUtils.normalize(word, Mode.lower);
                counter_.increment(word, count);
            }

            lines++;
        }
    }

    /**
     * Writes the vocabulary in encoded form.
     *
     * <p>Two files are produced: {@code filename + ".map"} holds the
     * {@link CharEncoder} built from this vocabulary, and
     * {@code filename + ".ascii"} holds one {@code "<count> <form>"} line per
     * encoded form. Counts of words that encode to the same form are
     * accumulated in one counter entry, and each count is truncated to an
     * {@code int} when written.
     *
     * @param filename common prefix of the two output files
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *         writing the {@code .ascii} file
     */
    public void saveToAsciiFile(String filename) {
        CharEncoder encoder = CharEncoder.fromVocab(this);
        FileUtils.saveToFile(encoder, filename + ".map");

        Counter<String> counter = new Counter<>();
        for (Entry<String, Double> entry : counter_.entrySet()) {
            String form = encoder.encode(entry.getKey());
            counter.increment(form, entry.getValue());
        }

        // try-with-resources closes the writer even when a write fails.
        try (Writer writer = FileUtils.openFileWriter(filename + ".ascii")) {
            for (Entry<String, Double> entry : counter.entrySet()) {
                // Plain concatenation keeps the digits locale-independent;
                // String.format("%d") would localize them.
                writer.write(entry.getValue().intValue() + " " + entry.getKey() + "\n");
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Exposes the entries of the underlying counter.
     *
     * @return the (token, weight) entries as returned by the counter
     */
    public Set<Map.Entry<String, Double>> entrySet() {
        return counter_.entrySet();
    }

    /**
     * Command-line entry point: reads the count file {@code args[0]} and
     * writes the encoded vocabulary using the output prefix {@code args[1]}.
     *
     * @param args input count file and output file prefix
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: Vocab <count-file> <output-prefix>");
            System.exit(1);
        }
        Vocab vocab = new Vocab(args[0]);
        vocab.saveToAsciiFile(args[1]);
    }

    /**
     * Tells whether {@code word} is a single {@code char} that is neither a
     * digit nor a letter, i.e. a punctuation/symbol token as produced by
     * {@link #tokenize(String)}.
     *
     * @param word token to test; must not be {@code null}
     * @return {@code true} for a one-character non-alphanumeric token,
     *         {@code false} otherwise (including the empty string)
     */
    public static boolean isSpecial(String word) {
        if (word.length() != 1) {
            return false;
        }
        char c = word.charAt(0);
        return !(Character.isDigit(c) || Character.isLetter(c));
    }
}