package edu.berkeley.cs.nlp.ocular.eval;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.HYPHEN;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.SPACE;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.last;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;
import java.util.ArrayList;
import java.util.List;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType;
import edu.berkeley.cs.nlp.ocular.model.DecodeState;
import edu.berkeley.cs.nlp.ocular.model.transition.SparseTransitionModel.TransitionState;
import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import tberg.murphy.indexer.Indexer;
/**
 * Transcriptions read off a grid of per-line decode states.
 *
 * <p>The constructor walks every {@link DecodeState} of every line once and
 * fills four structures:
 * <ul>
 *   <li>a per-line <em>diplomatic</em> transcription, built from each state's
 *       glyph template character (elided glyphs are left out);</li>
 *   <li>a per-line <em>normalized</em> transcription, built from each state's
 *       language-model character (glyphs of type {@code DOUBLED} are left out);</li>
 *   <li>a single <em>running</em> normalized transcription covering all lines,
 *       in which margin states become single separating spaces, hyphenation
 *       margin states contribute nothing, and runs of spaces are collapsed;</li>
 *   <li>the per-line decode states that were actually kept.</li>
 * </ul>
 *
 * <p>Every entry is a (character, language) pair; the language is {@code null}
 * when the state's language index is negative.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class ModelTranscriptions {

  /** Per-line (character, language) pairs of the diplomatic transcription. */
  private final List<Tuple2<String,String>>[] viterbiDiplomaticCharLangLines;

  /** Per-line (character, language) pairs of the normalized transcription. */
  private final List<Tuple2<String,String>>[] viterbiNormalizedCharLangLines;

  /** A continuous string, re-assembling words hyphenated over a line. */
  private final List<Tuple2<String,String>> viterbiNormalizedCharLangRunning;

  /** Per-line decode states, minus those dropped by hyphen collapsing. */
  private final List<DecodeState>[] viterbiDecodeStates;

  /** Resolves language indices to language names. */
  private final Indexer<String> langIndexer;

  /**
   * Builds all transcriptions from the given decode states.
   *
   * @param decodeStates decode states indexed by line, then by position within the line
   * @param charIndexer  maps character indices (template and language-model) to character strings
   * @param langIndexer  maps non-negative language indices to language names
   */
  @SuppressWarnings("unchecked")
  public ModelTranscriptions(DecodeState[][] decodeStates, Indexer<String> charIndexer, Indexer<String> langIndexer) {
    this.langIndexer = langIndexer;

    final int lineCount = decodeStates.length;
    this.viterbiDiplomaticCharLangLines = new List[lineCount];
    this.viterbiNormalizedCharLangLines = new List[lineCount];
    this.viterbiNormalizedCharLangRunning = new ArrayList<Tuple2<String,String>>();
    this.viterbiDecodeStates = new List[lineCount];

    for (int lineNum = 0; lineNum < lineCount; lineNum++) {
      List<Tuple2<String,String>> diplomatic = new ArrayList<Tuple2<String,String>>();
      List<Tuple2<String,String>> normalized = new ArrayList<Tuple2<String,String>>();
      List<DecodeState> keptStates = new ArrayList<DecodeState>();
      viterbiDiplomaticCharLangLines[lineNum] = diplomatic;
      viterbiNormalizedCharLangLines[lineNum] = normalized;
      viterbiDecodeStates[lineNum] = keptStates;

      for (DecodeState ds : decodeStates[lineNum]) {
        TransitionState ts = ds.ts;
        String diplomaticChar = charIndexer.getObject(ts.getGlyphChar().templateCharIndex);

        // The previous diplomatic character is null at the start of a line; that is fine,
        // because the comparison below is null-safe.
        String precedingChar = diplomatic.isEmpty() ? null : last(diplomatic)._1;
        if (HYPHEN.equals(precedingChar) && HYPHEN.equals(diplomaticChar)) {
          // Collapse multi-hyphens: the repeated hyphen state is dropped from every output.
          continue;
        }

        keptStates.add(ds);

        // Diplomatic transcription: elided glyphs are not written.
        if (!ts.getGlyphChar().isElided()) {
          diplomatic.add(makeCharLangTuple(diplomaticChar, ts.getLanguageIndex()));
        }

        // The first in a pair of doubled characters isn't part of the normalized transcription.
        if (ts.getGlyphChar().glyphType == GlyphType.DOUBLED) {
          continue;
        }

        // Normalized transcriptions use the language-model character as-is
        // (a long-s to "s" substitution existed here but is disabled).
        String normalizedChar = charIndexer.getObject(ts.getLmCharIndex());
        normalized.add(makeCharLangTuple(normalizedChar, ts.getLanguageIndex()));
        appendToRunning(ts, normalizedChar);
      }
    }

    // The running transcription never ends in a space.
    if (runningEndsWithSpace()) {
      viterbiNormalizedCharLangRunning.remove(viterbiNormalizedCharLangRunning.size() - 1);
    }
  }

  /**
   * Extends the running normalized transcription with the contribution of one state.
   *
   * @param ts             the transition state being processed
   * @param normalizedChar the state's language-model character
   */
  private void appendToRunning(TransitionState ts, String normalizedChar) {
    switch (ts.getType()) {
      case TMPL: {
        // A space is redundant at the very beginning or directly after another space.
        boolean redundantSpace = SPACE.equals(normalizedChar)
            && (viterbiNormalizedCharLangRunning.isEmpty() || runningEndsWithSpace());
        if (!redundantSpace) {
          viterbiNormalizedCharLangRunning.add(makeCharLangTuple(normalizedChar, ts.getLanguageIndex()));
        }
        break;
      }
      case LMRGN:
      case RMRGN:
        // A plain margin acts as a word separator, unless nothing has been written yet
        // or a separator is already in place.
        if (!viterbiNormalizedCharLangRunning.isEmpty() && !runningEndsWithSpace()) {
          viterbiNormalizedCharLangRunning.add(makeCharLangTuple(SPACE, ts.getLanguageIndex()));
        }
        break;
      case RMRGN_HPHN_INIT:
      case RMRGN_HPHN:
      case LMRGN_HPHN:
        // Hyphenation margins add nothing, so the split word is joined back together.
        break;
      default:
        // Any other state type contributes nothing to the running transcription.
        break;
    }
  }

  /** Returns true when the running transcription is non-empty and its last character is a space. */
  private boolean runningEndsWithSpace() {
    return !viterbiNormalizedCharLangRunning.isEmpty()
        && SPACE.equals(last(viterbiNormalizedCharLangRunning)._1);
  }

  /**
   * Pairs a character with the name of its language.
   *
   * @param c         the character
   * @param langIndex the language index; a negative value yields a {@code null} language
   * @return the (character, language) pair
   */
  private Tuple2<String, String> makeCharLangTuple(String c, int langIndex) {
    String lang = null;
    if (langIndex >= 0) {
      lang = langIndexer.getObject(langIndex);
    }
    return Tuple2(c, lang);
  }

  /** Returns the per-line diplomatic (character, language) pairs; this is the internal array, not a copy. */
  public List<Tuple2<String, String>>[] getViterbiDiplomaticCharLangLines() {
    return viterbiDiplomaticCharLangLines;
  }

  /** Returns, for each line, a newly built list of the diplomatic characters. */
  public List<String>[] getViterbiDiplomaticCharLines() {
    return firstsPerLine(viterbiDiplomaticCharLangLines);
  }

  /** Returns, for each line, a newly built list of the languages of the diplomatic characters. */
  public List<String>[] getViterbiDiplomaticLangLines() {
    return secondsPerLine(viterbiDiplomaticCharLangLines);
  }

  /** Returns the per-line normalized (character, language) pairs; this is the internal array, not a copy. */
  public List<Tuple2<String, String>>[] getViterbiNormalizedCharLangLines() {
    return viterbiNormalizedCharLangLines;
  }

  /** Returns, for each line, a newly built list of the normalized characters. */
  public List<String>[] getViterbiNormalizedCharLines() {
    return firstsPerLine(viterbiNormalizedCharLangLines);
  }

  /** Returns, for each line, a newly built list of the languages of the normalized characters. */
  public List<String>[] getViterbiNormalizedLangLines() {
    return secondsPerLine(viterbiNormalizedCharLangLines);
  }

  /** Returns the running normalized (character, language) pairs; this is the internal list, not a copy. */
  public List<Tuple2<String, String>> getViterbiNormalizedCharLangRunning() {
    return viterbiNormalizedCharLangRunning;
  }

  /** Returns a newly built list of the characters of the running normalized transcription. */
  public List<String> getViterbiNormalizedCharRunning() {
    return firsts(viterbiNormalizedCharLangRunning);
  }

  /** Returns a newly built list of the languages of the running normalized transcription. */
  public List<String> getViterbiNormalizedLangRunning() {
    return seconds(viterbiNormalizedCharLangRunning);
  }

  /** Returns the per-line decode states that were kept; this is the internal array, not a copy. */
  public List<DecodeState>[] getViterbiDecodeStates() {
    return viterbiDecodeStates;
  }

  /** Projects every line onto the first components of its pairs. */
  private static <A,B> List<A>[] firstsPerLine(List<Tuple2<A,B>>[] lines) {
    @SuppressWarnings("unchecked")
    List<A>[] projected = new List[lines.length];
    for (int k = 0; k < lines.length; k++) {
      projected[k] = firsts(lines[k]);
    }
    return projected;
  }

  /** Projects every line onto the second components of its pairs. */
  private static <A,B> List<B>[] secondsPerLine(List<Tuple2<A,B>>[] lines) {
    @SuppressWarnings("unchecked")
    List<B>[] projected = new List[lines.length];
    for (int k = 0; k < lines.length; k++) {
      projected[k] = seconds(lines[k]);
    }
    return projected;
  }

  /** Collects the first component of every pair into a new list, preserving order. */
  private static <A,B> List<A> firsts(List<Tuple2<A,B>> pairs) {
    List<A> collected = new ArrayList<A>();
    for (Tuple2<A,B> pair : pairs) {
      collected.add(pair._1);
    }
    return collected;
  }

  /** Collects the second component of every pair into a new list, preserving order. */
  private static <A,B> List<B> seconds(List<Tuple2<A,B>> pairs) {
    List<B> collected = new ArrayList<B>();
    for (Tuple2<A,B> pair : pairs) {
      collected.add(pair._2);
    }
    return collected;
  }
}