package edu.berkeley.cs.nlp.ocular.model.transition;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeAddTildeMap;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeCanBeElidedSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeCanBeReplacedSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeDiacriticDisregardMap;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makePunctSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeValidDoublableSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeValidSubstitutionCharsSet;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeSet;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import tberg.murphy.arrays.a;
import edu.berkeley.cs.nlp.ocular.data.textreader.Charset;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import edu.berkeley.cs.nlp.ocular.lm.SingleLanguageModel;
import edu.berkeley.cs.nlp.ocular.model.TransitionStateType;
import edu.berkeley.cs.nlp.ocular.util.ArrayHelper;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import tberg.murphy.indexer.Indexer;
/**
* @author Dan Garrette (dhgarrette@gmail.com)
*/
public class CodeSwitchTransitionModel implements SparseTransitionModel {
public class CodeSwitchTransitionState implements TransitionState {
// Character indices forming the language-model context of this state. Never null (enforced by the constructor).
private final int[] context;
// Kind of state: a template (character) state or one of the margin state types.
public final TransitionStateType type;
/**
* Index of the current language of this state. A negative value (-1 is what the
* model's start state uses) means that there is no current language. This happens
* at, for example, the beginning of a document, where forcing a language decision
* before we have reached a word makes no sense. When there is no current language,
* the system uses the language *prior* instead of the language *transition*
* probability. In other words:
*
* p(destLang | no current language) = p(destLang)
*/
public final int langIndex;
// Language-model character of this state, derived from context and type by makeLmCharIndex.
public final int lmCharIndex;
// Glyph (character index plus glyph type) associated with this state. Never null (enforced by the constructor).
public final GlyphChar glyphChar;
/**
 * Creates a state and derives its language-model character from the context and type.
 *
 * @param context   language-model context as character indices; must not be null
 * @param type      kind of state (template or one of the margin types)
 * @param langIndex index of the current language, or a negative value for "no language"
 * @param glyphChar glyph associated with this state; must not be null
 * @throws IllegalArgumentException if {@code context} or {@code glyphChar} is null
 */
public CodeSwitchTransitionState(int[] context, TransitionStateType type, int langIndex, GlyphChar glyphChar) {
    if (context == null) {
        throw new IllegalArgumentException("context is null");
    }
    if (glyphChar == null) {
        throw new IllegalArgumentException("glyphChar is null");
    }

    this.context = context;
    this.type = type;
    this.langIndex = langIndex;
    this.glyphChar = glyphChar;
    // The LM character is fully determined by the context and the state type.
    this.lmCharIndex = makeLmCharIndex(context, type);
}
/**
 * Two states are equal when they agree on type, language, context contents and glyph.
 * The LM character is not compared separately: it is derived from context and type.
 */
@Override
public boolean equals(Object other) {
    if (!(other instanceof CodeSwitchTransitionState)) {
        return false;
    }
    final CodeSwitchTransitionState rhs = (CodeSwitchTransitionState) other;
    // Cheap scalar comparisons first, then the array, then the glyph.
    return this.type == rhs.type
            && this.langIndex == rhs.langIndex
            && Arrays.equals(this.context, rhs.context)
            && this.glyphChar.equals(rhs.glyphChar);
}
/**
 * Hash over the same components that {@code equals} compares:
 * context contents, type, language and glyph.
 */
@Override
public int hashCode() {
    int hash = 1013 * Arrays.hashCode(context);
    hash += 1009 * this.type.ordinal();
    hash += 1007 * this.langIndex;
    hash += 1017 * this.glyphChar.hashCode();
    return hash;
}
/**
 * Convenience overload: derives the next LM character from the next context and
 * type, then delegates to the overload that takes the LM character explicitly.
 */
private void addNoSubGlyphStates(List<Tuple2<TransitionState, Double>> result, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
    addNoSubGlyphStates(result, makeLmCharIndex(nextContext, nextType), nextContext, nextType, nextLanguage, transitionScore);
}
/**
 * Adds next states for which no substitution glyphs are considered (used for
 * spaces, punctuation and margin states).
 *
 * When glyph substitution is disabled, a single NORMAL_CHAR state is added with the
 * unmodified transition score. Otherwise the glyph log-probability is added to the
 * score of every state produced.
 *
 * @param result          list receiving (state, score) pairs
 * @param nextLmChar      language-model character of the next state
 * @param nextContext     language-model context of the next state
 * @param nextType        type of the next state
 * @param nextLanguage    language index of the next state
 * @param transitionScore log score of the transition before the glyph score is added
 */
private void addNoSubGlyphStates(List<Tuple2<TransitionState, Double>> result, int nextLmChar, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
    if (!allowGlyphSubstitution) {
        addState(result, nextContext, nextType, nextLanguage, new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR), transitionScore);
        return;
    }

    final GlyphType currentGlyphType = glyphChar.glyphType;
    final boolean nextIsHyphenMargin = nextType == TransitionStateType.RMRGN_HPHN_INIT
            || nextType == TransitionStateType.RMRGN_HPHN
            || nextType == TransitionStateType.LMRGN_HPHN;

    if (nextIsHyphenMargin) {
        /*
         * Hyphen-margin states are detached from the actual previous character,
         * so the current glyph type (e.g. a tilde-elision or elided marking) is
         * carried over unchanged. Non-hyphen margins take the other branch:
         * they are treated as spaces, and spaces can't be elided and can't
         * follow tilde-elision states.
         */
        final GlyphChar carriedGlyph = new GlyphChar(nextLmChar, currentGlyphType);
        final double carriedLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, carriedGlyph);
        addState(result, nextContext, nextType, nextLanguage, carriedGlyph, transitionScore + carriedLogProb);

        if (nextType == TransitionStateType.RMRGN_HPHN_INIT) {
            // Additionally offer a state whose glyph is a space instead of the LM
            // character (presumably a line-end hyphen that is not printed -- TODO confirm).
            final GlyphChar spaceGlyph = new GlyphChar(spaceCharIndex, currentGlyphType);
            final double spaceLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, spaceGlyph);
            addState(result, nextContext, nextType, nextLanguage, spaceGlyph, transitionScore + spaceLogProb);
        }
    }
    else if (currentGlyphType != GlyphType.ELISION_TILDE) {
        /*
         * The next glyph is simply the rendering of the LM character: a
         * short-circuit of addGlyphStates in which no substitutions are permitted.
         * Nothing is added after an elision-marking tilde, because a normal state
         * can't follow it.
         */
        final GlyphChar plainGlyph = new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR);
        final double plainLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, plainGlyph);
        addState(result, nextContext, nextType, nextLanguage, plainGlyph, transitionScore + plainLogProb);
    }
}
/**
 * Adds next states for an LM character, allowing substitutions, elisions and doubling.
 *
 * When glyph substitution is disabled, a single NORMAL_CHAR state is added with the
 * unmodified transition score. Otherwise a set of candidate glyphs is collected and
 * one state is added per candidate, scored with the glyph log-probability.
 *
 * If the current glyph is an elision-marking tilde, the only candidate is:
 *   4. a tilde-elided glyph (if the LM character can be elided)
 *
 * Otherwise the candidates are:
 *   1. the plain rendering of the LM character
 *   2. a substitution of the LM character (plus long-s for 's')
 *   3. a tilde-decorated version of the LM character, marking an elision
 *   4. a tilde-elided glyph, only if the current glyph is itself tilde-elided
 *   5. the LM character stripped of its diacritics
 *   6. a first-elided glyph at the start of a word (only when elideAnything is off)
 *   7. a doubled version of the LM character (plus doubled long-s for 's')
 *   8. a plain elided glyph (only when elideAnything is on)
 *
 * @throws RuntimeException if the current glyph is DOUBLED; callers handle that case
 *         themselves so that n-gram LM scores are not included twice
 */
private void addGlyphStates(List<Tuple2<TransitionState, Double>> result, int nextLmChar, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
    if (!allowGlyphSubstitution) {
        addState(result, nextContext, nextType, nextLanguage, new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR), transitionScore);
        return;
    }

    final Set<GlyphChar> candidates = new HashSet<GlyphChar>();
    final GlyphType currentGlyphType = glyphChar.glyphType;

    if (currentGlyphType == GlyphType.DOUBLED) {
        throw new RuntimeException("This should have been handled elsewhere so that we don't re-include ngram LM scores");
    }

    if (currentGlyphType == GlyphType.ELISION_TILDE) {
        // 4. An elision-tilde'd character must be followed by a tilde-elision
        if (canBeElided.contains(nextLmChar)) {
            candidates.add(new GlyphChar(spaceCharIndex, GlyphType.TILDE_ELIDED));
        }
    }
    else {
        // 1. Plain rendering of the LM character
        candidates.add(new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR));

        // 2. Substitutions of the LM character
        if (canBeReplaced.contains(nextLmChar)) {
            for (int substituteIndex : lm.get(nextLanguage).getActiveCharacters()) {
                if (validSubstitutionChars.contains(substituteIndex)) {
                    candidates.add(new GlyphChar(substituteIndex, GlyphType.NORMAL_CHAR));
                }
            }
        }
        if (nextLmChar == sCharIndex) {
            candidates.add(new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR));
        }

        // 3. Tilde-decorated version of the LM character
        final Integer withTilde = addTilde.get(nextLmChar);
        if (withTilde != null) {
            candidates.add(new GlyphChar(withTilde, GlyphType.ELISION_TILDE));
        }

        // 4. Continue a run of tilde-elided characters (no elision after a normal character)
        if (currentGlyphType == GlyphType.TILDE_ELIDED && canBeElided.contains(nextLmChar)) {
            candidates.add(new GlyphChar(spaceCharIndex, GlyphType.TILDE_ELIDED));
        }

        // 5. LM character stripped of its accents
        final Integer withoutDiacritics = diacriticDisregardMap.get(nextLmChar);
        if (withoutDiacritics != null) {
            candidates.add(new GlyphChar(withoutDiacritics, GlyphType.NORMAL_CHAR));
        }

        // 6. Elision of the first character after a space. Only one character may be
        //    elided (drop the FIRST_ELIDED test to allow several), and only at the start
        //    of a word, not in the middle of a hyphenated word.
        if (!elideAnything
                && currentGlyphType != GlyphType.FIRST_ELIDED
                && lmCharIndex == spaceCharIndex
                && type != TransitionStateType.LMRGN_HPHN
                && type != TransitionStateType.RMRGN_HPHN_INIT
                && type != TransitionStateType.RMRGN_HPHN
                && nextType == TransitionStateType.TMPL
                && canBeElided.contains(nextLmChar)) {
            candidates.add(new GlyphChar(spaceCharIndex, GlyphType.FIRST_ELIDED));
        }

        // 7. Doubled version of the LM character
        if (validDoublableSet.contains(nextLmChar)) {
            candidates.add(new GlyphChar(nextLmChar, GlyphType.DOUBLED));
            if (nextLmChar == sCharIndex) {
                candidates.add(new GlyphChar(longsCharIndex, GlyphType.DOUBLED));
            }
        }

        // 8. Elide the character outright
        if (elideAnything
                && nextType == TransitionStateType.TMPL
                && canBeElided.contains(nextLmChar)) {
            candidates.add(new GlyphChar(spaceCharIndex, GlyphType.ELIDED));
        }
    }

    // One state per candidate glyph, scored with its glyph log-probability.
    for (GlyphChar candidate : candidates) {
        final double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, candidate);
        addState(result, nextContext, nextType, nextLanguage, candidate, transitionScore + glyphLogProb);
    }
}
/**
 * Convenience overload: adds the transitions into template states with no
 * previously accumulated score and without clearing the context.
 */
private void addTransitionsToTmpl(List<Tuple2<TransitionState, Double>> result, int[] context) {
    final double noPreviousScore = 0.0;
    final boolean keepContext = false;
    addTransitionsToTmpl(result, context, noPreviousScore, keepContext);
}
/**
 * Adds every transition from this state into a template (TMPL) state.
 *
 * If the current glyph is DOUBLED, the only transitions are to the second copy of
 * the same character (same context, language and LM character, glyph type reset to
 * NORMAL_CHAR), scored with {@code prevScore} plus the glyph log-probability only.
 *
 * Otherwise transitions are generated for:
 *   - every non-space active character, where the set of destination languages
 *     depends on whether there is a current language and whether this state's LM
 *     character is a space (language switches are only permitted after a space);
 *   - the space character, always in the current language.
 *
 * @param result       list receiving (state, score) pairs
 * @param context      language-model context to condition on
 * @param prevScore    log score accumulated before this transition
 * @param clearContext if true, the next context holds only the next character;
 *                     otherwise the next character is appended to the shrunken context
 */
private void addTransitionsToTmpl(List<Tuple2<TransitionState, Double>> result, int[] context, double prevScore, boolean clearContext) {
    if (glyphChar.glyphType == GlyphType.DOUBLED) {
        // Duplicate the state: same context, language, lmChar, ...; but Doubled=>Normal
        TransitionStateType nextType = TransitionStateType.TMPL;
        int nextLanguage = langIndex;
        int nextLmChar = lmCharIndex;
        // TODO: Is it necessary to have some sort of LM probability factored in?
        double score = prevScore;
        if (nextLmChar == sCharIndex) { // a doubled 's' may have long-s chars
            GlyphChar nextGlyphCharS = new GlyphChar(sCharIndex, GlyphType.NORMAL_CHAR);
            double glyphLogProbS = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharS);
            addState(result, context, nextType, nextLanguage, nextGlyphCharS, score + glyphLogProbS);
            GlyphChar nextGlyphCharLongs = new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR);
            double glyphLogProbLongs = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharLongs);
            addState(result, context, nextType, nextLanguage, nextGlyphCharLongs, score + glyphLogProbLongs);
        }
        else {
            GlyphChar nextGlyphChar = new GlyphChar(lmCharIndex, GlyphType.NORMAL_CHAR);
            double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
            addState(result, context, nextType, nextLanguage, nextGlyphChar, score + glyphLogProb);
        }
        return;
    }

    if (this.langIndex < 0) {
        // There is no current language: any language may be chosen, weighted by the
        // language prior. Punctuation is no problem since we have no current language.
        for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) {
            SingleLanguageModel destLM = lm.get(destLanguage);
            for (int c : destLM.getActiveCharacters()) {
                if (c != spaceCharIndex) {
                    double pDestLang = lm.languagePrior(destLanguage); // no language to transition from
                    double score = tmplStepScore(prevScore, destLM, context, c, pDestLang);
                    int[] nextContext = tmplStepContext(context, destLM, c, clearContext);
                    addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
                }
            }
        }
    }
    else if (lmCharIndex == spaceCharIndex) {
        // There is a current language and we are right after a space, so a
        // (non-space) character may start a word in any language.
        for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) {
            SingleLanguageModel destLM = lm.get(destLanguage);
            for (int c : destLM.getActiveCharacters()) {
                if (punctSet.contains(c)) {
                    // Punctuation never takes substitution glyphs. It may change language
                    // only when that is explicitly allowed; otherwise it stays in the
                    // current language and its mass is not divided across languages.
                    if (allowLanguageSwitchOnPunct || this.langIndex == destLanguage) {
                        double pDestLang = allowLanguageSwitchOnPunct ? lm.languageTransitionProb(this.langIndex, destLanguage) : 1.0;
                        double score = tmplStepScore(prevScore, destLM, context, c, pDestLang);
                        int[] nextContext = tmplStepContext(context, destLM, c, clearContext);
                        addNoSubGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
                    }
                }
                else if (c != spaceCharIndex) {
                    double pDestLang = lm.languageTransitionProb(this.langIndex, destLanguage);
                    double score = tmplStepScore(prevScore, destLM, context, c, pDestLang);
                    int[] nextContext = tmplStepContext(context, destLM, c, clearContext);
                    addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
                }
            }
        }
    }
    else {
        // Mid-word: no switching allowed, so stay in the current language.
        // Punctuation is no problem since we're definitely not switching anyway.
        int destLanguage = this.langIndex;
        SingleLanguageModel destLM = lm.get(destLanguage);
        for (int c : destLM.getActiveCharacters()) {
            if (c != spaceCharIndex) {
                double pDestLang = 1.0; // only one language for this character, don't divide its mass across languages
                double score = tmplStepScore(prevScore, destLM, context, c, pDestLang);
                int[] nextContext = tmplStepContext(context, destLM, c, clearContext);
                addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
            }
        }
    }

    // Space character: switching is never allowed, so it is scored in the current language.
    // thisLM is looked up with the current language index, which may be negative here,
    // hence the null check before shrinking the context.
    // TODO: If current lmCharIndex==spaceCharIndex, sum over all languages?
    SingleLanguageModel thisLM = lm.get(this.langIndex);
    double pTransition = 0.0;
    double pDestLang = 1.0; // only one language for this character, don't divide its mass across languages
    pTransition += getNgramProb(thisLM, context, spaceCharIndex) * pDestLang;
    double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(pTransition);
    int[] nextContext = (!clearContext ? a.append((thisLM != null ? shrinkContext(context, thisLM) : context), spaceCharIndex) : new int[] { spaceCharIndex });
    addNoSubGlyphStates(result, spaceCharIndex, nextContext, TransitionStateType.TMPL, this.langIndex, score);
}

/**
 * Log score of moving to a template state for character {@code c}: the non-margin
 * probability, the previously accumulated score, the n-gram probability of
 * {@code c} under {@code destLM}, and the probability of the destination language.
 */
private double tmplStepScore(double prevScore, SingleLanguageModel destLM, int[] context, int c, double pDestLang) {
    return Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
}

/**
 * Context of the next template state: just {@code c} when the context is being
 * cleared, otherwise {@code c} appended to the context shrunken for {@code destLM}.
 * {@code destLM} must be non-null.
 */
private int[] tmplStepContext(int[] context, SingleLanguageModel destLM, int c, boolean clearContext) {
    if (clearContext) {
        return new int[] { c };
    }
    return a.append(shrinkContext(context, destLM), c);
}
/**
* Computes the states that can begin the next line, given that this state ended
* the current line, together with their log scores.
*
* The result depends on this state's type:
* - TMPL: a space is scored and appended to the context first; then either a
*   left-margin state or any transition into a template state is added.
* - RMRGN: a left-margin state, or any transition into a template state.
* - RMRGN_HPHN / RMRGN_HPHN_INIT: a hyphen left-margin state, or (when there is a
*   current language) a continuation of the word with a non-space, non-punctuation
*   character.
* - LMRGN / LMRGN_HPHN: a left-margin state with an empty context, or any
*   transition into a template state with the context cleared.
*
* @return list of (state, log score) pairs; states whose score is negative
*         infinity are not included (see addState)
*/
public Collection<Tuple2<TransitionState, Double>> nextLineStartStates() {
// NOTE(review): langIndex may be negative here; the null checks below suggest lm.get then yields null -- confirm.
SingleLanguageModel thisLM = lm.get(this.langIndex);
List<Tuple2<TransitionState, Double>> result = new ArrayList<Tuple2<TransitionState, Double>>();
if (type == TransitionStateType.TMPL) {
// transition from letter to space (left margin)
// The line break after a template character is scored as a space under the LM.
double scoreWithSpace = Math.log(getNgramProb(thisLM, context, spaceCharIndex));
int[] contextWithSpace = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), spaceCharIndex);
{
double score = Math.log(LINE_MRGN_PROB) + scoreWithSpace;
addNoSubGlyphStates(result, spaceCharIndex, contextWithSpace, TransitionStateType.LMRGN, this.langIndex, score);
}
// The space score is carried into the template transitions as the previous score.
addTransitionsToTmpl(result, contextWithSpace, scoreWithSpace, false);
}
else if (type == TransitionStateType.RMRGN) {
{
double score = Math.log(LINE_MRGN_PROB);
addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN, this.langIndex, score);
}
addTransitionsToTmpl(result, context);
}
else if (type == TransitionStateType.RMRGN_HPHN || type == TransitionStateType.RMRGN_HPHN_INIT) {
{
double score = Math.log(LINE_MRGN_PROB);
addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN_HPHN, this.langIndex, score);
}
if (this.langIndex >= 0) { // can't have a hyphen if there is no language, since that means there have been no characters so far
if (glyphChar.glyphType == GlyphType.DOUBLED) {
// Duplicate the state: same context, language, lmChar, ...; but Doubled=>Normal
TransitionStateType nextType = TransitionStateType.TMPL;
int nextLanguage = langIndex;
int nextLmChar = lmCharIndex;
// Only the glyph log-probability contributes to the score of the duplicated state.
double score = Math.log(1.0); //+ Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, nextLmChar)) + Math.log(1.0); // TODO: Is it necessary to have some sort of LM probability factored in?
if (nextLmChar == sCharIndex) { // a doubled 's' may have long-s chars
GlyphChar nextGlyphCharS = new GlyphChar(sCharIndex, GlyphType.NORMAL_CHAR);
double glyphLogProbS = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharS);
addState(result, context, nextType, nextLanguage, nextGlyphCharS, score + glyphLogProbS);
GlyphChar nextGlyphCharLongs = new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR);
double glyphLogProbLongs = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharLongs);
addState(result, context, nextType, nextLanguage, nextGlyphCharLongs, score + glyphLogProbLongs);
}
else {
GlyphChar nextGlyphChar = new GlyphChar(lmCharIndex, GlyphType.NORMAL_CHAR);
double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
addState(result, context, nextType, nextLanguage, nextGlyphChar, score + glyphLogProb);
}
}
else {
// Continue the hyphenated word in the current language.
for (int c : thisLM.getActiveCharacters()) {
if (c != spaceCharIndex && !punctSet.contains(c)) { // can't start a line after hyphen with space or punct
double score = Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, c)) /*+ Math.log(1.0)*/;
int[] nextContext = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), c);
addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, this.langIndex, score);
}
}
}
}
}
else if (type == TransitionStateType.LMRGN || type == TransitionStateType.LMRGN_HPHN) {
// TODO: TAYLOR: Why do we clear the context in this case?
{
double score = Math.log(LINE_MRGN_PROB);
addNoSubGlyphStates(result, new int[0], TransitionStateType.LMRGN, this.langIndex, score);
}
// clearContext=true: each next template state starts with only its own character as context.
addTransitionsToTmpl(result, context, 0.0, true);
}
return result;
}
/**
 * Log probability of ending on this state.
 *
 * @return negative infinity if the glyph is DOUBLED or ELISION_TILDE (an incomplete
 *         "double glyph" can't be the last one), otherwise 0.0
 */
public double endLogProb() {
    final GlyphType lastGlyphType = glyphChar.glyphType;
    final boolean incomplete = lastGlyphType == GlyphType.DOUBLED || lastGlyphType == GlyphType.ELISION_TILDE;
    return incomplete ? Double.NEGATIVE_INFINITY : 0.0;
}
/**
* Calculate forward transitions: the states reachable from this state within the
* same line, together with their log scores.
*
* The result depends on this state's type:
* - LMRGN: stay in the left margin, or make any transition into a template state.
* - LMRGN_HPHN: stay in the hyphen left margin, or (when there is a current
*   language) continue the word with a non-space, non-punctuation character.
* - RMRGN: stay in the right margin.
* - RMRGN_HPHN / RMRGN_HPHN_INIT: move to (or stay in) the hyphen right margin.
* - TMPL: move to the right margin (scored as a space), to the initial hyphen
*   right-margin state, or to any template state.
*
* @return list of (state, log score) pairs; states whose score is negative
*         infinity are not included (see addState)
*/
public Collection<Tuple2<TransitionState, Double>> forwardTransitions() {
// NOTE(review): langIndex may be negative here; the null checks below suggest lm.get then yields null -- confirm.
SingleLanguageModel thisLM = lm.get(this.langIndex);
List<Tuple2<TransitionState, Double>> result = new ArrayList<Tuple2<TransitionState, Double>>();
if (type == TransitionStateType.LMRGN) {
{
double score = Math.log(LINE_MRGN_PROB);
addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN, this.langIndex, score);
}
addTransitionsToTmpl(result, context);
}
else if (type == TransitionStateType.LMRGN_HPHN) {
{
double score = Math.log(LINE_MRGN_PROB);
addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN_HPHN, this.langIndex, score);
}
if (this.langIndex >= 0) { // can't have a hyphen if there is no language, since that means there have been no characters so far
if (glyphChar.glyphType == GlyphType.DOUBLED) {
// Duplicate the state: same context, language, lmChar, ...; but Doubled=>Normal
TransitionStateType nextType = TransitionStateType.TMPL;
int nextLanguage = langIndex;
int nextLmChar = lmCharIndex;
//double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
// Only the glyph log-probability contributes to the score of the duplicated state.
double score = Math.log(1.0); //+ Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, nextLmChar)) + Math.log(pDestLang); // TODO: Is it necessary to have some sort of LM probability factored in?
if (nextLmChar == sCharIndex) { // a doubled 's' may have long-s chars
GlyphChar nextGlyphCharS = new GlyphChar(sCharIndex, GlyphType.NORMAL_CHAR);
double glyphLogProbS = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharS);
addState(result, context, nextType, nextLanguage, nextGlyphCharS, score + glyphLogProbS);
GlyphChar nextGlyphCharLongs = new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR);
double glyphLogProbLongs = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharLongs);
addState(result, context, nextType, nextLanguage, nextGlyphCharLongs, score + glyphLogProbLongs);
}
else {
GlyphChar nextGlyphChar = new GlyphChar(lmCharIndex, GlyphType.NORMAL_CHAR);
double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
addState(result, context, nextType, nextLanguage, nextGlyphChar, score + glyphLogProb);
}
}
else {
// Continue the hyphenated word in the current language.
for (int c : thisLM.getActiveCharacters()) {
if (c != spaceCharIndex && !punctSet.contains(c)) { // can't start a line after hyphen with space or punct
double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
double score = Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, c)) + Math.log(pDestLang);
int[] nextContext = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), c);
addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, this.langIndex, score);
}
}
}
}
}
else if (type == TransitionStateType.RMRGN) {
// A right margin can only be followed by more right margin on the same line.
double score = Math.log(LINE_MRGN_PROB);
addNoSubGlyphStates(result, this.context, TransitionStateType.RMRGN, this.langIndex, score);
}
else if (type == TransitionStateType.RMRGN_HPHN) {
double score = Math.log(LINE_MRGN_PROB);
addNoSubGlyphStates(result, this.context, TransitionStateType.RMRGN_HPHN, this.langIndex, score);
}
else if (type == TransitionStateType.RMRGN_HPHN_INIT) {
// The initial hyphen state is visited once, then hands over to RMRGN_HPHN.
double score = Math.log(LINE_MRGN_PROB);
addNoSubGlyphStates(result, this.context, TransitionStateType.RMRGN_HPHN, this.langIndex, score);
}
else if (type == TransitionStateType.TMPL) {
{
// End the line without a hyphen: scored as a space, which is appended to the context.
double score = Math.log(LINE_MRGN_PROB) + Math.log(1.0 - LINE_END_HYPHEN_PROB) + Math.log(getNgramProb(thisLM, context, spaceCharIndex));
int[] nextContext = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), spaceCharIndex);
addNoSubGlyphStates(result, spaceCharIndex, nextContext, TransitionStateType.RMRGN, this.langIndex, score);
}
{
// End the line with a hyphen: the context is left unchanged.
double score = Math.log(LINE_MRGN_PROB) + Math.log(LINE_END_HYPHEN_PROB);
addNoSubGlyphStates(result, this.context, TransitionStateType.RMRGN_HPHN_INIT, this.langIndex, score);
}
addTransitionsToTmpl(result, context);
}
return result;
}
/** @return the language-model character index of this state */
public int getLmCharIndex() {
    return this.lmCharIndex;
}
/** @return the glyph associated with this state (never null) */
public GlyphChar getGlyphChar() {
    return this.glyphChar;
}
/**
 * Not supported by this state implementation.
 *
 * @throws UnsupportedOperationException always
 */
public int getOffset() {
    // An unsupported operation is a programming error, not a JVM-level failure,
    // so it is reported with the standard unchecked exception rather than Error.
    throw new UnsupportedOperationException("Method not implemented");
}
/**
 * Not supported by this state implementation.
 *
 * @throws UnsupportedOperationException always
 */
public int getExposure() {
    // An unsupported operation is a programming error, not a JVM-level failure,
    // so it is reported with the standard unchecked exception rather than Error.
    throw new UnsupportedOperationException("Method not implemented");
}
/** @return the kind of this state (template or one of the margin types) */
public TransitionStateType getType() {
    return this.type;
}
/** @return the index of the current language, or a negative value if there is none */
public int getLanguageIndex() {
    return langIndex;
}
/**
 * Human-readable description: language name (or "No Language"), LM character,
 * state type, the context characters concatenated inside brackets, and the glyph.
 */
@Override
public String toString() {
    final StringBuilder contextChars = new StringBuilder();
    for (int charIndex : context) {
        contextChars.append(charIndexer.getObject(charIndex));
    }
    final String languageName = (langIndex < 0) ? "No Language" : langIndexer.getObject(langIndex);

    return new StringBuilder("CodeSwitchTransitionState(")
            .append(languageName).append(", ")
            .append(charIndexer.getObject(lmCharIndex)).append(", ")
            .append(type).append(", ")
            .append("[").append(contextChars).append("]").append(", ")
            .append(glyphChar.toString(charIndexer))
            .append(")")
            .toString();
}
}
/**
 * Appends a new state with its transition score to {@code result}, unless the
 * score is negative infinity (an impossible transition), in which case nothing
 * is added.
 */
private void addState(List<Tuple2<TransitionState, Double>> result, int[] stateContext, TransitionStateType stateType, int stateLanguage, GlyphChar glyphChar, double stateTransitionScore) {
    if (stateTransitionScore == Double.NEGATIVE_INFINITY) {
        return;
    }
    final TransitionState state = new CodeSwitchTransitionState(stateContext, stateType, stateLanguage, glyphChar);
    result.add(Tuple2(state, stateTransitionScore));
}
// Probability used for transitions into margin states; its complement is used for transitions into template states.
public static final double LINE_MRGN_PROB = 0.5;
// Probability that a line ending (template -> right margin) is a hyphenated word break.
public static final double LINE_END_HYPHEN_PROB = 1e-8;
// Character and language indexers, both obtained from the language model in the constructor.
private Indexer<String> charIndexer;
private Indexer<String> langIndexer;
// Indices of special characters in charIndexer. sCharIndex is -1 when "s" is not in the indexer.
private int spaceCharIndex;
private int hyphenCharIndex;
private int sCharIndex;
private int longsCharIndex;
// Character-index sets and maps built from charIndexer by the Charset helpers; they gate which glyph variants are generated.
private Set<Integer> punctSet;
private Set<Integer> canBeReplaced;
private Set<Integer> validSubstitutionChars;
private Set<Integer> validDoublableSet;
private Set<Integer> canBeElided;
// Maps a character index to the index of its tilde-decorated form; no entry when there is none.
private Map<Integer, Integer> addTilde;
// Maps a character index to the index of the same character without diacritics; no entry when there is none.
private Map<Integer,Integer> diacriticDisregardMap;
// Number of languages, taken from the size of the language indexer.
private int numLanguages;
private CodeSwitchLanguageModel lm;
// NOTE(review): gsm is stored by the constructor but not read in the visible part of the file; presumably used by calculateGlyphLogProb -- confirm.
private GlyphSubstitutionModel gsm;
// Whether a punctuation character may change the language.
private boolean allowLanguageSwitchOnPunct;
// When false, every next state gets a plain NORMAL_CHAR glyph and no glyph score is added.
private boolean allowGlyphSubstitution;
// NOTE(review): stored by the constructor but not read in the visible part of the file -- confirm where it is used.
private double noCharSubPrior;
// Selects between general elision (ELIDED glyphs) when true and first-character-only elision (FIRST_ELIDED glyphs) when false.
private boolean elideAnything;
// State types whose LM character is always the space character (see makeLmCharIndex).
private Set<TransitionStateType> alwaysSpaceTransitionTypes;
/**
 * Determines the language-model character of a state from its context and type.
 *
 * In order of precedence:
 *   1. an empty context, or a type in alwaysSpaceTransitionTypes, yields the space character;
 *   2. the RMRGN_HPHN_INIT type yields the hyphen character;
 *   3. anything else yields the last character of the context.
 *
 * @param context language-model context as character indices; must not be null
 * @param type    type of the state
 * @return the character index to use as the state's LM character
 */
private int makeLmCharIndex(int[] context, TransitionStateType type) {
    final boolean treatedAsSpace = context.length == 0 || this.alwaysSpaceTransitionTypes.contains(type);
    if (treatedAsSpace) {
        return spaceCharIndex;
    }
    if (type == TransitionStateType.RMRGN_HPHN_INIT) {
        return hyphenCharIndex;
    }
    final int lastPosition = context.length - 1;
    return context[lastPosition];
}
/**
 * Builds the transition model from a code-switching language model and a glyph
 * substitution model, and precomputes the character indices and character sets
 * used when generating transitions.
 *
 * @param lm                         language model; also supplies the character and language indexers
 * @param allowLanguageSwitchOnPunct whether a punctuation character may change the language
 * @param gsm                        glyph substitution model
 * @param allowGlyphSubstitution     whether glyph variants (substitutions, elisions, doubling) are generated
 * @param noCharSubPrior             stored as given
 * @param elideAnything              selects general elision instead of first-character-only elision
 */
public CodeSwitchTransitionModel(CodeSwitchLanguageModel lm, boolean allowLanguageSwitchOnPunct, GlyphSubstitutionModel gsm, boolean allowGlyphSubstitution, double noCharSubPrior, boolean elideAnything) {
    // Models and configuration, stored as given.
    this.lm = lm;
    this.gsm = gsm;
    this.allowLanguageSwitchOnPunct = allowLanguageSwitchOnPunct;
    this.allowGlyphSubstitution = allowGlyphSubstitution;
    this.noCharSubPrior = noCharSubPrior;
    this.elideAnything = elideAnything;

    // Indexers come from the language model.
    this.charIndexer = lm.getCharacterIndexer();
    this.langIndexer = lm.getLanguageIndexer();

    // Special characters. The lookup order is kept as-is in case getIndex
    // registers characters that are not yet in the indexer.
    this.spaceCharIndex = charIndexer.getIndex(Charset.SPACE);
    this.hyphenCharIndex = charIndexer.getIndex(Charset.HYPHEN);
    if (charIndexer.contains("s")) {
        this.sCharIndex = charIndexer.getIndex("s");
    }
    else {
        this.sCharIndex = -1; // no 's' in this character set
    }
    this.longsCharIndex = charIndexer.getIndex(Charset.LONG_S);

    // Character classes derived from the character indexer.
    this.punctSet = makePunctSet(charIndexer);
    this.canBeReplaced = makeCanBeReplacedSet(charIndexer);
    this.validSubstitutionChars = makeValidSubstitutionCharsSet(charIndexer);
    this.validDoublableSet = makeValidDoublableSet(charIndexer);
    this.canBeElided = makeCanBeElidedSet(charIndexer);
    this.addTilde = makeAddTildeMap(charIndexer);
    this.diacriticDisregardMap = makeDiacriticDisregardMap(charIndexer);

    this.numLanguages = lm.getLanguageIndexer().size();

    // Margin types whose LM character is always a space (RMRGN_HPHN_INIT is deliberately absent).
    this.alwaysSpaceTransitionTypes = makeSet(
            TransitionStateType.LMRGN,
            TransitionStateType.LMRGN_HPHN,
            TransitionStateType.RMRGN,
            TransitionStateType.RMRGN_HPHN);
}
/**
 * Adds a start state whose LM character and glyph are both a plain space.
 * The glyph log-probability is added to the score only when glyph substitution
 * is enabled.
 */
private void addNoSubGlyphStartState(List<Tuple2<TransitionState, Double>> result, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
    final GlyphChar spaceGlyph = new GlyphChar(spaceCharIndex, GlyphType.NORMAL_CHAR);
    double stateScore = transitionScore;
    if (allowGlyphSubstitution) {
        // The next state's glyph is just the rendering of the LM character (a space).
        stateScore = transitionScore + calculateGlyphLogProb(nextType, nextLanguage, spaceCharIndex, spaceGlyph);
    }
    addState(result, nextContext, nextType, nextLanguage, spaceGlyph, stateScore);
}
/**
 * Adds start states for an LM character, allowing substitutions, elisions and doubling.
 *
 * When glyph substitution is disabled, a single NORMAL_CHAR state is added with the
 * unmodified transition score. Otherwise one state is added per candidate glyph,
 * scored with the glyph log-probability. The candidates are:
 *
 *   1. the plain rendering of the LM character
 *   2. a substitution of the LM character (plus long-s for 's')
 *   3. a tilde-decorated version of the LM character, marking an elision
 *   5. the LM character stripped of its diacritics
 *   6. a first-elided glyph (only when elideAnything is off); the start state
 *      always counts as following a "space"
 *   7. a doubled version of the LM character (plus doubled long-s for 's')
 *   8. a plain elided glyph (only when elideAnything is on)
 *
 * There is no case 4 here: the numbering mirrors addGlyphStates, whose case 4
 * (tilde-elision) needs a preceding glyph.
 */
private void addGlyphStartStates(List<Tuple2<TransitionState, Double>> result, int nextLmChar, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
    if (!allowGlyphSubstitution) {
        addState(result, nextContext, nextType, nextLanguage, new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR), transitionScore);
        return;
    }

    final Set<GlyphChar> candidates = new HashSet<GlyphChar>();

    // 1. Plain rendering of the LM character
    candidates.add(new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR));

    // 2. Substitutions of the LM character
    if (canBeReplaced.contains(nextLmChar)) {
        for (int substituteIndex : lm.get(nextLanguage).getActiveCharacters()) {
            if (validSubstitutionChars.contains(substituteIndex)) {
                candidates.add(new GlyphChar(substituteIndex, GlyphType.NORMAL_CHAR));
            }
        }
    }
    if (nextLmChar == sCharIndex) {
        candidates.add(new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR));
    }

    // 3. Tilde-decorated version of the LM character
    final Integer withTilde = addTilde.get(nextLmChar);
    if (withTilde != null) {
        candidates.add(new GlyphChar(withTilde, GlyphType.ELISION_TILDE));
    }

    // 5. LM character stripped of its accents
    final Integer withoutDiacritics = diacriticDisregardMap.get(nextLmChar);
    if (withoutDiacritics != null) {
        candidates.add(new GlyphChar(withoutDiacritics, GlyphType.NORMAL_CHAR));
    }

    // 6. Elision of the first character --- the start state is always a "space"
    if (!elideAnything
            && nextType == TransitionStateType.TMPL
            && canBeElided.contains(nextLmChar)) {
        candidates.add(new GlyphChar(spaceCharIndex, GlyphType.FIRST_ELIDED));
    }

    // 7. Doubled version of the LM character
    if (validDoublableSet.contains(nextLmChar)) {
        candidates.add(new GlyphChar(nextLmChar, GlyphType.DOUBLED));
        if (nextLmChar == sCharIndex) {
            candidates.add(new GlyphChar(longsCharIndex, GlyphType.DOUBLED));
        }
    }

    // 8. Elide the character outright
    if (elideAnything
            && nextType == TransitionStateType.TMPL
            && canBeElided.contains(nextLmChar)) {
        candidates.add(new GlyphChar(spaceCharIndex, GlyphType.ELIDED));
    }

    // One state per candidate glyph, scored with its glyph log-probability.
    for (GlyphChar candidate : candidates) {
        final double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, candidate);
        addState(result, nextContext, nextType, nextLanguage, candidate, transitionScore + glyphLogProb);
    }
}
/**
 * Builds the scored states with which a line may begin. Three groups are produced, in this order:
 *
 * <ol>
 * <li>One LMRGN state with an empty context and no language (-1), scored
 *     {@code log(LINE_MRGN_PROB)}.</li>
 * <li>For every language and every active non-space character {@code c} of that language,
 *     the TMPL states generated by {@code addGlyphStartStates} for context {@code {c}}, with
 *     base score {@code log(1 - LINE_MRGN_PROB) + log(P(c | empty context)) + log(language prior)}.</li>
 * <li>One TMPL state whose context is the space character and which has no language (-1),
 *     scored {@code log(1 - LINE_MRGN_PROB) + log(sum over languages of
 *     P(space | empty context) * language prior)}.</li>
 * </ol>
 *
 * @return the start states, each paired with its log score
 */
public Collection<Tuple2<TransitionState, Double>> startStates() {
    List<Tuple2<TransitionState, Double>> startStateList = new ArrayList<Tuple2<TransitionState, Double>>();

    // Log-probability mass of NOT starting in the left margin; shared by all TMPL start states.
    final double logNotMargin = Math.log(1.0 - LINE_MRGN_PROB);

    // Left margin: no language choice is forced.
    double marginScore = Math.log(LINE_MRGN_PROB);
    addNoSubGlyphStartState(startStateList, new int[0], TransitionStateType.LMRGN, -1, marginScore);

    // An actual (non-space) character: choose among all the languages.
    for (int destLanguage = 0; destLanguage < numLanguages; destLanguage++) {
        SingleLanguageModel languageModel = lm.get(destLanguage);
        double logLanguagePrior = Math.log(lm.languagePrior(destLanguage));
        for (int c : languageModel.getActiveCharacters()) {
            if (c == spaceCharIndex) {
                continue; // the space start state is handled once, language-free, below
            }
            double logCharProb = Math.log(getNgramProb(languageModel, new int[0], c));
            double charScore = logNotMargin + logCharProb + logLanguagePrior;
            addGlyphStartStates(startStateList, c, new int[] { c }, TransitionStateType.TMPL, destLanguage, charScore);
        }
    }

    // A leading space: there is no "first" language and no word to base a language choice on,
    // so the space probability is the prior-weighted sum of the no-context space probabilities
    // of all the languages.
    double spaceProbAcrossLanguages = 0.0;
    for (int language = 0; language < numLanguages; language++) {
        double spaceProb = getNgramProb(lm.get(language), new int[0], spaceCharIndex);
        spaceProbAcrossLanguages += spaceProb * lm.languagePrior(language);
    }
    double spaceScore = logNotMargin + Math.log(spaceProbAcrossLanguages);
    addNoSubGlyphStartState(startStateList, new int[] { spaceCharIndex }, TransitionStateType.TMPL, -1, spaceScore);

    return startStateList;
}
/**
 * Returns the probability of character {@code c} following {@code context}.
 *
 * @param slm     the language model to query, or {@code null} when there is no current language
 * @param context the preceding characters; it is shrunk for each model before being queried
 * @param c       the character whose probability is requested
 * @return the model's n-gram probability when {@code slm} is given; otherwise the sum over all
 *         languages of that language's n-gram probability times its language prior
 */
private double getNgramProb(SingleLanguageModel slm, int[] context, int c) {
    if (slm == null) {
        // No current language: marginalize over the languages, weighting by their priors.
        double priorWeightedProb = 0.0;
        for (int language = 0; language < numLanguages; language++) {
            SingleLanguageModel candidateLM = this.lm.get(language);
            double probUnderLanguage = candidateLM.getCharNgramProb(shrinkContext(context, candidateLM), c);
            priorWeightedProb += probUnderLanguage * this.lm.languagePrior(language);
        }
        return priorWeightedProb;
    }
    return slm.getCharNgramProb(shrinkContext(context, slm), c);
}
// private int[] appendToContext(int[] originalContext, int c, SingleLanguageModel slm) {
// return shrinkContext(a.append(originalContext, c), slm);
// }
/**
 * Computes the log-probability of rendering an LM character as a particular glyph.
 *
 * @param nextType      the type of the state being entered
 * @param nextLanguage  the language of the state being entered; negative when there is none
 * @param nextLmChar    the LM character being rendered
 * @param nextGlyphChar the candidate glyph
 * @return with no language: {@code 0.0} if the state type is one of the always-space types and
 *         the glyph is the space character, otherwise negative infinity; with a language: the
 *         log of the substitution-model probability scaled by {@code 1 - noCharSubPrior}, with
 *         {@code noCharSubPrior} added when the glyph is the unmodified LM character
 */
private double calculateGlyphLogProb(TransitionStateType nextType, int nextLanguage, int nextLmChar, GlyphChar nextGlyphChar) {
    if (nextLanguage < 0) {
        // Without a language, only a space glyph in an always-space state type is possible.
        boolean isForcedSpace = this.alwaysSpaceTransitionTypes.contains(nextType)
                && nextGlyphChar.templateCharIndex == spaceCharIndex;
        if (isForcedSpace) {
            return 0.0; // log(1)
        }
        return Double.NEGATIVE_INFINITY; // log(0)
    }

    double scaledGlyphProb = (1.0 - noCharSubPrior) * gsm.glyphProb(nextLanguage, nextLmChar, nextGlyphChar);

    // The reserved "no substitution" mass goes only to the plain, unchanged rendering.
    boolean isPlainRendering = nextGlyphChar.glyphType == GlyphType.NORMAL_CHAR
            && nextGlyphChar.templateCharIndex == nextLmChar;
    if (isPlainRendering) {
        return Math.log(noCharSubPrior + scaledGlyphProb);
    }
    return Math.log(scaledGlyphProb);
}
/**
 * Reduces a context to one the given language model can be queried with: first it is cut down
 * to at most {@code slm.getMaxOrder() - 1} trailing entries (via {@code ArrayHelper.takeRight}),
 * then it is passed through the model's own {@code shrinkContext}.
 *
 * @param originalContext the context to shrink; not modified by this method itself
 * @param slm             the language model that determines the limits; may be {@code null}
 * @return the shrunken context, or {@code originalContext} unchanged when {@code slm} is
 *         {@code null}
 */
private int[] shrinkContext(int[] originalContext, SingleLanguageModel slm) {
    // Checked first so that a missing language model is never dereferenced below; with no
    // model there is no order limit to apply, so the context is handed back as-is.
    if (slm == null) {
        return originalContext;
    }

    int maxContextLength = slm.getMaxOrder() - 1;
    int[] newContext = originalContext;
    while (newContext.length > maxContextLength) {
        newContext = ArrayHelper.takeRight(newContext, maxContextLength);
    }
    return slm.shrinkContext(newContext);
}
}