package edu.berkeley.nlp.lm.phrasetable;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import edu.berkeley.nlp.lm.StringWordIndexer;
import edu.berkeley.nlp.lm.WordIndexer;
import edu.berkeley.nlp.lm.map.HashNgramMap;
import edu.berkeley.nlp.lm.phrasetable.PhraseTableValueContainer.FeaturePhraseTableValues;
import edu.berkeley.nlp.lm.phrasetable.PhraseTableValueContainer.PhraseTableValues;
import edu.berkeley.nlp.lm.phrasetable.PhraseTableValueContainer.TargetTranslationsValues;
import edu.berkeley.nlp.lm.util.Logger;
/**
*
* Experimental class for reading Moses phrase tables and storing them
* efficiently in memory using a trie.
*
* @author adampauls
*
*/
public class MosesPhraseTable implements Serializable
{
/**
*
*/
private static final long serialVersionUID = 1L;
public static class TargetSideTranslation
{
// only stores the first 4 features from a moses file (i.e. does not store the bias)
public float[] features;
public int[] trgWords;
@Override
public String toString() {
return Arrays.toString(trgWords) + " :: " + Arrays.toString(features);
}
}
private final HashNgramMap<PhraseTableValues> map;
private final WordIndexer<String> wordIndexer;
public static MosesPhraseTable readFromFile(final String file) {
final StringWordIndexer stringWordIndexer = new StringWordIndexer();
final MosesPhraseTableReaderCallback<String> callback = new MosesPhraseTableReaderCallback<String>(stringWordIndexer);
new MosesPhraseTableReader<String>(file, stringWordIndexer).parse(callback);
return new MosesPhraseTable(callback.getMap(), stringWordIndexer);
}
private MosesPhraseTable(final HashNgramMap<PhraseTableValues> map, final WordIndexer<String> wordIndexer) {
this.map = map;
this.wordIndexer = wordIndexer;
}
public List<TargetSideTranslation> getTranslations(final int[] src, final int startPos, final int endPos) {
final long offsetForNgram = map.getOffsetForNgramInModel(src, startPos, endPos);
if (offsetForNgram < 0) return Collections.emptyList();
final TargetTranslationsValues scratch = new PhraseTableValueContainer.TargetTranslationsValues();
map.getValues().getFromOffset(offsetForNgram, endPos - startPos - 1, scratch);
final List<TargetSideTranslation> ret = new ArrayList<TargetSideTranslation>();
for (int i = 0; i < scratch.targetTranslationOffsets.length; ++i) {
final FeaturePhraseTableValues features = new PhraseTableValueContainer.FeaturePhraseTableValues(null);
final long currOffset = scratch.targetTranslationOffsets[i];
final int currOrder = scratch.targetTranslationOrders[i];
map.getValues().getFromOffset(currOffset, currOrder, features);
if (features.features == null) {
Logger.warn("Should probably fix");
continue;
}
final TargetSideTranslation tr = new TargetSideTranslation();
tr.features = Arrays.copyOf(features.features, features.features.length);
int sepIndex = 0;
final int[] srcAndTrg = map.getNgramForOffset(currOffset, currOrder);
for (; sepIndex < srcAndTrg.length; ++sepIndex) {
if (srcAndTrg[sepIndex] == ((PhraseTableValueContainer) map.getValues()).getSeparatorWord()) {
break;
}
}
tr.trgWords = Arrays.copyOfRange(srcAndTrg, sepIndex + 1, srcAndTrg.length);
assert tr.trgWords.length > 0;
ret.add(tr);
}
return ret;
}
public WordIndexer<String> getWordIndexer() {
return wordIndexer;
}
}