CodeSwitchTransitionModel.java example

Explorer

ocular-master
- src
  - main
    - java
      - edu
        berkeley
        cs
        nlp
        ocular
        data
        Document.java
        FirstFolioRawImageLoader.java
        LazyRawImageDocument.java
        LazyRawImageLoader.java
        LazyRawPdfImageDocument.java
        LazyRawSingleImageDocument.java
        PdfImageReader.java
        RawImageLoader.java
        TextAndLineImagesLoader.java
        textreader
        BasicTextReader.java
        BlacklistCharacterSetTextReader.java
        CharIndexer.java
        Charset.java
        ConvertLongSTextReader.java
        FlipUVTextReader.java
        RemoveAllDiacriticsTextReader.java
        ReplaceSomeTextReader.java
        TextReader.java
        WhitelistCharacterSetTextReader.java
        eval
        AlignedFormPair.java
        BasicMultiDocumentTranscriber.java
        BasicSingleDocumentEvaluatorAndOutputPrinter.java
        ErrorSampler.java
        EvalPrinter.java
        Evaluator.java
        Form.java
        Glyph.java
        LmPerplexity.java
        MarkovEditDistanceComputer.java
        ModelTranscriptions.java
        MultiDocumentTranscriber.java
        Operation.java
        SingleDocumentEvaluatorAndOutputPrinter.java
        font
        Font.java
        gsm
        BasicGlyphSubstitutionModel.java
        GlyphChar.java
        GlyphSubstitutionModel.java
        NoSubGlyphSubstitutionModel.java
        image
        FontRenderer.java
        ImageUtils.java
        Visualizer.java
        lm
        BasicCodeSwitchLanguageModel.java
        CodeSwitchLanguageModel.java
        CorpusCounter.java
        CountDb.java
        CountDbBig.java
        CountDbSimple.java
        CountType.java
        InterpolatingSingleLanguageModel.java
        LanguageModel.java
        LongArrWrapper.java
        LongNgram.java
        Ngram.java
        NgramCounts.java
        NgramLanguageModel.java
        NgramWrapper.java
        SingleLanguageModel.java
        UniformLanguageModel.java
        main
        ExtractLinesOnly.java
        FonttrainTranscribeShared.java
        InitializeFont.java
        InitializeGlyphSubstitutionModel.java
        InitializeLanguageModel.java
        LineExtractionOptions.java
        NoDocumentsFoundException.java
        NoDocumentsToProcessException.java
        OcularRunnable.java
        TrainFont.java
        Transcribe.java
        gui
        GridLayout2.java
        InitializeFontGUI.java
        TrainLanguageModelGUI.java
        TranscribeOrTrainFontGUI.java
        model
        CharacterTemplate.java
        DecodeState.java
        DecoderEM.java
        TransitionStateType.java
        em
        BeamingSemiMarkovDP.java
        CUDAInnerLoop.java
        DefaultInnerLoop.java
        DenseBigramTransitionModel.java
        EmissionCacheInnerLoop.java
        EmptyBeamException.java
        JOCLInnerLoop.java
        emission
        CachingEmissionModel.java
        CachingEmissionModelExplicitOffset.java
        EmissionModel.java
        transition
        CharacterNgramTransitionModel.java
        CharacterNgramTransitionModelMarkovOffset.java
        CodeSwitchTransitionModel.java
        SparseTransitionModel.java
        output
        AltoOutputWriter.java
        HtmlOutputWriter.java
        preprocessing
        Binarizer.java
        Cropper.java
        LineExtractor.java
        ManualCropper.java
        ManualStackCropperPrep.java
        Straightener.java
        Test.java
        VerticalModel.java
        VerticalProfile.java
        train
        FontTrainer.java
        ModelPathMaker.java
        TrainingRestarter.java
        util
        ArrayHelper.java
        CollectionHelper.java
        FileHelper.java
        FileUtil.java
        StringHelper.java
        Tuple2.java
        Tuple3.java
  - test
    - java
      - edu
        berkeley
        cs
        nlp
        ocular
        data
        textreader
        BasicTextReaderTests.java
        BlacklistCharacterSetTextReaderTests.java
        CharIndexerTests.java
        CharsetTests.java
        ConvertLongSTextReaderTests.java
        RemoveAllDiacriticsTextReaderTests.java
        ReplaceSomeTextReaderTests.java
        WhitelistCharacterSetTextReaderTests.java
        eval
        LmPerplexityTests.java
        gsm
        BasicGlyphSubstitutionModelTests.java
        lm
        LanguageTransitionPriorsTests.java
        model
        FontTrainEMTests.java
        PostViterbiTests.java
        util
        ArrayHelperTests.java
        CollectionHelperTests.java
        FileUtilTests.java
        StringHelperTests.java

package edu.berkeley.cs.nlp.ocular.model.transition;

import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeAddTildeMap;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeCanBeElidedSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeCanBeReplacedSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeDiacriticDisregardMap;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makePunctSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeValidDoublableSet;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.makeValidSubstitutionCharsSet;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeSet;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import tberg.murphy.arrays.a;
import edu.berkeley.cs.nlp.ocular.data.textreader.Charset;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import edu.berkeley.cs.nlp.ocular.lm.SingleLanguageModel;
import edu.berkeley.cs.nlp.ocular.model.TransitionStateType;
import edu.berkeley.cs.nlp.ocular.util.ArrayHelper;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import tberg.murphy.indexer.Indexer;

/**
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class CodeSwitchTransitionModel implements SparseTransitionModel {

	public class CodeSwitchTransitionState implements TransitionState {
		private final int[] context;
		public final TransitionStateType type;

		/**
		 * The index of the current language of this state.  This may be *-1* to indicate that
		 * there is no current language.  This will happen at, for example, the beginning of a
		 * document, where forcing a language decision before we have reached a word makes no
		 * sense.  A value of -1 tells the system to use the language *prior* instead of the
		 * language *transition* prior.  In other words:
		 * 
		 *     p(destLang | no language) = p(destLang)
		 */
		public final int langIndex;

		public final int lmCharIndex;
		public final GlyphChar glyphChar;

		/**
		 * Builds a state from its LM context, its type, its language and the glyph rendered
		 * for it.  The LM character index is derived from the context and the type via
		 * {@code makeLmCharIndex}.
		 *
		 * @param context    LM character context of this state; must not be null
		 * @param type       the kind of state (template or one of the margin types)
		 * @param langIndex  index of the current language, or a negative value if there is none
		 * @param glyphChar  glyph associated with this state; must not be null
		 * @throws IllegalArgumentException if {@code context} or {@code glyphChar} is null
		 */
		public CodeSwitchTransitionState(int[] context, TransitionStateType type, int langIndex, GlyphChar glyphChar) {
			if (context == null) {
				throw new IllegalArgumentException("context is null");
			}
			if (glyphChar == null) {
				throw new IllegalArgumentException("glyphChar is null");
			}

			this.glyphChar = glyphChar;
			this.langIndex = langIndex;
			this.type = type;
			this.context = context;
			this.lmCharIndex = makeLmCharIndex(context, type);
		}

		/**
		 * Two states are equal when they have the same type, the same language index,
		 * element-wise equal contexts and equal glyph chars.  The derived {@code lmCharIndex}
		 * is not compared separately.
		 *
		 * @param other the object to compare with; may be null
		 * @return true if {@code other} is a {@code CodeSwitchTransitionState} with the same
		 *         type, language index, context contents and glyph char
		 */
		@Override
		public boolean equals(Object other) {
			if (this == other) {
				return true; // same reference: no need to walk the context array
			}
			if (!(other instanceof CodeSwitchTransitionState)) {
				return false; // also covers other == null
			}
			CodeSwitchTransitionState that = (CodeSwitchTransitionState) other;
			// Cheap primitive/reference comparisons first, the array comparison after them.
			return this.type == that.type
					&& this.langIndex == that.langIndex
					&& Arrays.equals(this.context, that.context)
					&& this.glyphChar.equals(that.glyphChar);
		}

		/**
		 * Hash code consistent with {@link #equals(Object)}: combines the context contents, the
		 * type's ordinal, the language index and the glyph char's hash, each scaled by a
		 * distinct multiplier.  The numeric result is the same as the original formula
		 * (int arithmetic wraps, so accumulating the terms one by one changes nothing).
		 *
		 * @return the combined hash of context, type, language index and glyph char
		 */
		@Override
		public int hashCode() {
			int hash = 1013 * Arrays.hashCode(context);
			hash += 1009 * this.type.ordinal();
			hash += 1007 * this.langIndex;
			hash += 1017 * this.glyphChar.hashCode();
			return hash;
		}

		/**
		 * Convenience overload: derives the next LM character from {@code nextContext} and
		 * {@code nextType} via {@code makeLmCharIndex} and delegates to the six-argument version.
		 *
		 * @param result           list that receives the (state, log-score) pairs
		 * @param nextContext      LM context of the state(s) to add
		 * @param nextType         type of the state(s) to add
		 * @param nextLanguage     language index of the state(s) to add
		 * @param transitionScore  log-score of the transition before any glyph probability
		 */
		private void addNoSubGlyphStates(List<Tuple2<TransitionState, Double>> result, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
			addNoSubGlyphStates(result, makeLmCharIndex(nextContext, nextType), nextContext, nextType, nextLanguage, transitionScore);
		}
		
		/**
		 * Adds the successor state(s) for a next LM character without generating substitution
		 * glyphs.  Callers in this class use it for spaces, margins and (when a language switch
		 * is possible) punctuation.
		 *
		 * Behavior:
		 *   - Glyph substitution disabled: one NORMAL_CHAR state with the score passed through.
		 *   - Destination is a hyphen margin (RMRGN_HPHN_INIT, RMRGN_HPHN, LMRGN_HPHN): the new
		 *     glyph keeps this state's glyph type, because the margin state is detached from the
		 *     actual previous character and must remember whether that character was, e.g., a
		 *     tilde-elision marker.  For RMRGN_HPHN_INIT a second variant whose glyph index is
		 *     the space character is added as well.
		 *   - Any other destination: one NORMAL_CHAR state, unless this state's glyph is an
		 *     ELISION_TILDE (a normal glyph can't follow an elision-marking tilde), in which
		 *     case nothing is added.
		 *
		 * Whenever substitution is enabled, the glyph log-probability from
		 * {@code calculateGlyphLogProb} is added to {@code transitionScore}.
		 *
		 * @param result           list that receives the (state, log-score) pairs
		 * @param nextLmChar       LM character index of the state(s) to add
		 * @param nextContext      LM context of the state(s) to add
		 * @param nextType         type of the state(s) to add
		 * @param nextLanguage     language index of the state(s) to add
		 * @param transitionScore  log-score of the transition before any glyph probability
		 */
		private void addNoSubGlyphStates(List<Tuple2<TransitionState, Double>> result, int nextLmChar, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
			if (!allowGlyphSubstitution) {
				// No glyph model in play: plain rendering, score untouched.
				addState(result, nextContext, nextType, nextLanguage, new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR), transitionScore);
				return;
			}

			final GlyphType currentGlyphType = glyphChar.glyphType;
			final boolean entersHyphenMargin = nextType == TransitionStateType.RMRGN_HPHN_INIT
					|| nextType == TransitionStateType.RMRGN_HPHN
					|| nextType == TransitionStateType.LMRGN_HPHN;

			if (entersHyphenMargin) {
				// Carry the current glyph type across the margin.  Non-hyphen margins never take
				// this path: they are treated as spaces, which can't be elided and can't follow
				// tilde-elision states.
				GlyphChar carriedGlyph = new GlyphChar(nextLmChar, currentGlyphType);
				addState(result, nextContext, nextType, nextLanguage, carriedGlyph,
						transitionScore + calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, carriedGlyph));

				if (nextType == TransitionStateType.RMRGN_HPHN_INIT) {
					// Second variant rendered with the space glyph index.
					// NOTE(review): presumably this models a line-end hyphen that is not printed -- confirm.
					GlyphChar spaceGlyph = new GlyphChar(spaceCharIndex, currentGlyphType);
					addState(result, nextContext, nextType, nextLanguage, spaceGlyph,
							transitionScore + calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, spaceGlyph));
				}
				return;
			}

			if (currentGlyphType == GlyphType.ELISION_TILDE) {
				return; // a normal state can't follow an elision-marking tilde
			}

			// Short-circuit of `addGlyphStates`: only the plain rendering of the LM character.
			GlyphChar plainGlyph = new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR);
			addState(result, nextContext, nextType, nextLanguage, plainGlyph,
					transitionScore + calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, plainGlyph));
		}

		/**
		 * Add transition states for the next LM character, allowing for the possibility of
		 * substitutions or elisions.  What is generated depends on the glyph type of this
		 * (i.e. the previous) state:
		 * 
		 *   - Glyph substitution disabled: a single NORMAL_CHAR state, with no glyph
		 *     probability added to the score.
		 *   - This state is DOUBLED: a RuntimeException is thrown; callers must complete the
		 *     double themselves so that n-gram LM scores are not included twice.
		 *   - This state is ELISION_TILDE: the only candidate is a TILDE_ELIDED glyph (case 4).
		 *   - Otherwise, the candidates are:
		 * 
		 *   1. Next state's glyph is just the rendering of the LM character
		 *   2. Next state's glyph is a substitution of the LM character (including long-s for 's')
		 *   3. Next state's glyph is an elision-decorated version of the LM character
		 *   4. Next state's glyph is an elision after a tilde-decorated character
		 *   5. Next state's glyph is the LM char, stripped of its accents
		 *   6. Next state's glyph is an elision after a space (only when `elideAnything` is off)
		 *   7. Next state's glyph is a doubled version of the LM character
		 *   8. Next state's glyph is an elision of the LM character (only when `elideAnything` is on)
		 * 
		 * Candidates are collected in a set before states are created, so a glyph proposed by
		 * more than one case is only added once (assuming GlyphChar defines value equality --
		 * TODO confirm).  Each state's score is `transitionScore` plus the glyph log-probability.
		 * 
		 * @param result           list that receives the (state, log-score) pairs
		 * @param nextLmChar       LM character index of the state(s) to add
		 * @param nextContext      LM context of the state(s) to add
		 * @param nextType         type of the state(s) to add
		 * @param nextLanguage     language index of the state(s) to add
		 * @param transitionScore  log-score of the transition before any glyph probability
		 * @throws RuntimeException if substitution is enabled and this state's glyph is DOUBLED
		 */
		private void addGlyphStates(List<Tuple2<TransitionState, Double>> result, int nextLmChar, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
			// Substitution disabled: plain rendering only, score passed through unchanged.
			if (!allowGlyphSubstitution)
				addState(result, nextContext, nextType, nextLanguage, new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR), transitionScore);
			else {
				Set<GlyphChar> potentialNextGlyphChars = new HashSet<GlyphChar>(); 
				GlyphType glyphType = glyphChar.glyphType; // glyph type of *this* state, i.e. the previous glyph
				if (glyphType == GlyphType.DOUBLED) {
					// Deterministically duplicate the glyph (but no longer marked as "doubled")
					//potentialNextGlyphChars.add(new GlyphChar(glyphChar.templateCharIndex, GlyphType.NORMAL_CHAR));
					throw new RuntimeException("This should have been handled elsewhere so that we don't re-include ngram LM scores");
				}
				else if (glyphType == GlyphType.ELISION_TILDE) {
					// 4. An elision-tilde'd character must be followed by a tilde-elision
					if (canBeElided.contains(nextLmChar)) {
						potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.TILDE_ELIDED));
					}
				}
				else {
					// 1. Next state's glyph is just the rendering of the LM character
					potentialNextGlyphChars.add(new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR));

					// 2. Next state's glyph is a substitution of the LM character
					if (canBeReplaced.contains(nextLmChar)) {
						for (int nextGlyphCharIndex : lm.get(nextLanguage).getActiveCharacters()) {
							if (validSubstitutionChars.contains(nextGlyphCharIndex)) {
								potentialNextGlyphChars.add(new GlyphChar(nextGlyphCharIndex, GlyphType.NORMAL_CHAR));
							}
						}
					}
					// An 's' may always be rendered as a long-s, regardless of `canBeReplaced`
					if (nextLmChar == sCharIndex)
						potentialNextGlyphChars.add(new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR));
					
					// 3. Next state's glyph is an elision-decorated version of the LM character
					Integer tildeDecorated = addTilde.get(nextLmChar);
					if (tildeDecorated != null) {
						potentialNextGlyphChars.add(new GlyphChar(tildeDecorated, GlyphType.ELISION_TILDE));
					}
	
					// 4. Continue a tilde-elision: only possible when the previous glyph was itself
					//    TILDE_ELIDED --- no elision can take place after a normal character
					if (glyphType == GlyphType.TILDE_ELIDED) {
						if (canBeElided.contains(nextLmChar)) {
							potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.TILDE_ELIDED));
						}
					}
					
					// 5. Next state's glyph is the LM char, stripped of its accents
					Integer baseChar = diacriticDisregardMap.get(nextLmChar);
					if (baseChar != null) {
						potentialNextGlyphChars.add(new GlyphChar(baseChar, GlyphType.NORMAL_CHAR));
					}
					
					// 6. Next state's glyph is an elision after a space
					//    (skipped when `elideAnything` is on; case 8 applies instead)
					if (!elideAnything) {
					if (glyphType != GlyphType.FIRST_ELIDED) { // TODO: Comment this out if we want to allow multiple characters to be elided from the front of a word
						if (lmCharIndex == spaceCharIndex) {
							if (type != TransitionStateType.LMRGN_HPHN && type != TransitionStateType.RMRGN_HPHN_INIT && type != TransitionStateType.RMRGN_HPHN) { // only allowed at the start of a word, not in the middle of a hyphenated word
								if (nextType == TransitionStateType.TMPL) {
									if (canBeElided.contains(nextLmChar)) {
										potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.FIRST_ELIDED));
									}
								}
							}
						}
					}
					}
					
					// 7. Next state's glyph is a doubled version of the LM character
					if (validDoublableSet.contains(nextLmChar)) {
						potentialNextGlyphChars.add(new GlyphChar(nextLmChar, GlyphType.DOUBLED));
						if (nextLmChar == sCharIndex) // a doubled 's' may also start with a long-s
							potentialNextGlyphChars.add(new GlyphChar(longsCharIndex, GlyphType.DOUBLED));
					}
					
					// 8. Elide the character (only when `elideAnything` is on, and only into a TMPL state)
					if (elideAnything) {
						if (nextType == TransitionStateType.TMPL) {
							if (canBeElided.contains(nextLmChar)) {
								potentialNextGlyphChars.add(new GlyphChar(spaceCharIndex, GlyphType.ELIDED));
							}
						}
					}

				}
				
				// Create states for all the potential next glyphs, each scored with its glyph log-probability
				for (GlyphChar nextGlyphChar : potentialNextGlyphChars) {
					double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
					addState(result, nextContext, nextType, nextLanguage, nextGlyphChar, transitionScore + glyphLogProb);
				}
			}
		}
		
		/**
		 * Convenience overload: adds all transitions into TMPL states with no previously
		 * accumulated score and without clearing the context.
		 *
		 * @param result   list that receives the (state, log-score) pairs
		 * @param context  LM context to condition on
		 */
		private void addTransitionsToTmpl(List<Tuple2<TransitionState, Double>> result, int[] context) {
			final double noPreviousScore = 0.0;
			final boolean clearContext = false;
			addTransitionsToTmpl(result, context, noPreviousScore, clearContext);
		}

		/**
		 * Adds to `result` every transition from this state into a TMPL (template) state.
		 * 
		 * If this state's glyph is DOUBLED, the only transitions are the ones that complete the
		 * double: same LM character, language and context, rendered as a NORMAL_CHAR glyph (for
		 * 's', both the 's' and the long-s rendering), scored by `prevScore` plus the glyph
		 * log-probability only.
		 * 
		 * Otherwise, transitions are generated for each non-space character `c` active in each
		 * permitted destination language, with transition score
		 * 
		 *     log(1 - LINE_MRGN_PROB) + prevScore + log(p_ngram(c | context)) + log(p(destLang))
		 * 
		 * where the permitted destination languages and p(destLang) are:
		 * 
		 *   - no current language (langIndex < 0): any language, weighted by the language prior;
		 *   - this state's LM character is a space: any language, weighted by the language
		 *     transition probability.  Punctuation is added without substitution glyphs, and
		 *     may only change language if `allowLanguageSwitchOnPunct` is set; otherwise it is
		 *     only added for the current language, with weight 1;
		 *   - otherwise: only the current language, with weight 1.
		 * 
		 * Finally, a transition to the space character is added, always within the current
		 * language and without substitution glyphs.
		 * 
		 * @param result        list that receives the (state, log-score) pairs
		 * @param context       LM context to condition on (note: shadows this state's `context` field)
		 * @param prevScore     log-score already accumulated by the caller; added to every transition
		 * @param clearContext  if true, each new state's context is just the new character; if
		 *                      false, the new character is appended to the shrunken `context`
		 */
		private void addTransitionsToTmpl(List<Tuple2<TransitionState, Double>> result, int[] context, double prevScore, boolean clearContext) {
			if (glyphChar.glyphType == GlyphType.DOUBLED) {
				// Duplicate the state: same context, language, lmChar, ...; but Doubled=>Normal
				TransitionStateType nextType = TransitionStateType.TMPL;
				int nextLanguage = langIndex;
				int nextLmChar = lmCharIndex;
				//SingleLanguageModel destLM = lm.get(nextLanguage);
				//double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
				// No n-gram or margin probability is added here: only the caller's accumulated score is carried over
				double score = prevScore; //+ Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(destLM, context, nextLmChar)) + Math.log(pDestLang); // TODO: Is it necessary to have some sort of LM probability factored in?
				if (nextLmChar == sCharIndex) { // a doubled 's' may have long-s chars
					GlyphChar nextGlyphCharS = new GlyphChar(sCharIndex, GlyphType.NORMAL_CHAR);
					double glyphLogProbS = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharS);
					addState(result, context, nextType, nextLanguage, nextGlyphCharS, score + glyphLogProbS);

					GlyphChar nextGlyphCharLongs = new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR);
					double glyphLogProbLongs = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharLongs);
					addState(result, context, nextType, nextLanguage, nextGlyphCharLongs, score + glyphLogProbLongs);
				}
				else {
					GlyphChar nextGlyphChar = new GlyphChar(lmCharIndex, GlyphType.NORMAL_CHAR);
					double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
					addState(result, context, nextType, nextLanguage, nextGlyphChar, score + glyphLogProb);
				}
			}
			else {
				if (this.langIndex < 0) { // there is no current language
					for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) { // no current language, can switch to any language
						SingleLanguageModel destLM = lm.get(destLanguage);
						for (int c : destLM.getActiveCharacters()) { // punctuation no problem since we have no current language
							if (c != spaceCharIndex) {
								double pDestLang = lm.languagePrior(destLanguage); // no language to transition from
								double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
								// New context: either just `c`, or `c` appended to the context shrunk for the destination LM
								int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
								addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
							}
						}
					}
				}
				else { // there is a current language
					boolean switchAllowed = lmCharIndex == spaceCharIndex; // a language switch is only possible on the character that follows a space
					if (switchAllowed) { // switch permitted
						for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) {
							SingleLanguageModel destLM = lm.get(destLanguage);
							for (int c : destLM.getActiveCharacters()) {
								if (punctSet.contains(c)) {
									// Punctuation: never given substitution glyphs in this branch
									if (allowLanguageSwitchOnPunct) {
										double pDestLang = lm.languageTransitionProb(this.langIndex, destLanguage);
										double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
										int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
										addNoSubGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
									}
									else if (this.langIndex == destLanguage) { // switching not allowed, but this is the same language
										double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
										double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
										int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
										addNoSubGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
									}
								}
								else if (c != spaceCharIndex) {
									// Ordinary character: may switch language, weighted by the language transition probability
									double pDestLang = lm.languageTransitionProb(this.langIndex, destLanguage);
									double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
									int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
									addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
								}
							}
						}
					}
					else { // no switching allowed
						int destLanguage = this.langIndex; // there will always be a current language here
						SingleLanguageModel destLM = lm.get(destLanguage);
						for (int c : destLM.getActiveCharacters()) { // punctuation no problem since we're definitely not switching anyway
							if (c != spaceCharIndex) {
								double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
								double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(getNgramProb(destLM, context, c)) + Math.log(pDestLang);
								int[] nextContext = (!clearContext ? a.append((destLM!=null ? shrinkContext(context, destLM) : context), c) : new int[] { c });
								addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, destLanguage, score);
							}
						}
					}
				}
	
				{ // space character: switching is never allowed
					// NOTE(review): this runs even when langIndex < 0; it relies on lm.get and getNgramProb
					// coping with the missing language (see the thisLM!=null guard below) -- confirm.
					SingleLanguageModel thisLM = lm.get(this.langIndex);
					// TODO: If current lmCharIndex==spaceCharIndex, sum over all languages?
					double pTransition = 0.0;
					//				if (lmCharIndex == spaceCharIndex) {
					double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
					pTransition += getNgramProb(thisLM, context, spaceCharIndex) * pDestLang;
					//				}
					//				else {
					//					// total probability of transitioning to a space, regardless of language
					//					for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) {
					//						SingleLanguageModel destLM = lm.get(destLanguage);
					//						double pDestLang = lm.languageTransitionPrior(this.langIndex, destLanguage);
					//						int[] shrunkenContext = shrinkContext(context, thisLM);
					//						pTransition += getNgramProb(thisLM, context, spaceCharIndex) * pDestLang;
					//					}
					//				}
					double score = Math.log(1.0 - LINE_MRGN_PROB) + prevScore + Math.log(pTransition);
					int[] nextContext = (!clearContext ? a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), spaceCharIndex) : new int[] { spaceCharIndex });
					addNoSubGlyphStates(result, spaceCharIndex, nextContext, TransitionStateType.TMPL, this.langIndex, score);
				}
			}
		}

		/**
		 * Computes the states that may begin the next line, given that this state ended the
		 * current one.  The behavior depends on this state's type:
		 * 
		 *   - TMPL: the line break is treated as a space.  The n-gram log-probability of a
		 *     space is computed, and the next line either starts in an LMRGN state or goes
		 *     directly into a TMPL state, in both cases with the space appended to the context.
		 *   - RMRGN: the next line starts in an LMRGN state (same context) or directly in a
		 *     TMPL state.
		 *   - RMRGN_HPHN / RMRGN_HPHN_INIT: the next line starts in an LMRGN_HPHN state or,
		 *     if there is a current language, continues the hyphenated word directly: a
		 *     pending DOUBLED glyph is completed, otherwise any active character of the
		 *     current language other than space and punctuation may follow.
		 *   - LMRGN / LMRGN_HPHN: the next line starts in an LMRGN state with an empty context,
		 *     or directly in a TMPL state with the context cleared.
		 * 
		 * @return the (state, log-score) pairs of the possible line-start states; empty if this
		 *         state's type is none of the above
		 */
		public Collection<Tuple2<TransitionState, Double>> nextLineStartStates() {
			SingleLanguageModel thisLM = lm.get(this.langIndex);
			List<Tuple2<TransitionState, Double>> result = new ArrayList<Tuple2<TransitionState, Double>>();

			if (type == TransitionStateType.TMPL) {
				// transition from letter to space (left margin)
				double scoreWithSpace = Math.log(getNgramProb(thisLM, context, spaceCharIndex));
				int[] contextWithSpace = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), spaceCharIndex);

				{
					// Start the next line in its left margin
					double score = Math.log(LINE_MRGN_PROB) + scoreWithSpace;
					addNoSubGlyphStates(result, spaceCharIndex, contextWithSpace, TransitionStateType.LMRGN, this.langIndex, score);
				}

				// Start the next line directly on a character; the space's score is carried as prevScore
				addTransitionsToTmpl(result, contextWithSpace, scoreWithSpace, false);
			}
			else if (type == TransitionStateType.RMRGN) {
				{
					// Right margin -> left margin of the next line, context unchanged
					double score = Math.log(LINE_MRGN_PROB);
					addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN, this.langIndex, score);
				}

				addTransitionsToTmpl(result, context);
			}
			else if (type == TransitionStateType.RMRGN_HPHN || type == TransitionStateType.RMRGN_HPHN_INIT) {
				{
					// Hyphenated right margin -> hyphenated left margin of the next line
					double score = Math.log(LINE_MRGN_PROB);
					addNoSubGlyphStates(result, this.context, TransitionStateType.LMRGN_HPHN, this.langIndex, score);
				}

				if (this.langIndex >= 0) { // can't have a hyphen if there is no language, since that means there have been no characters so far
					if (glyphChar.glyphType == GlyphType.DOUBLED) {
						// Duplicate the state: same context, language, lmChar, ...; but Doubled=>Normal
						TransitionStateType nextType = TransitionStateType.TMPL;
						int nextLanguage = langIndex;
						int nextLmChar = lmCharIndex;
						// Only the glyph log-probability contributes to the score of the completed double
						double score = Math.log(1.0); //+ Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, nextLmChar)) + Math.log(1.0); // TODO: Is it necessary to have some sort of LM probability factored in?
						if (nextLmChar == sCharIndex) { // a doubled 's' may have long-s chars
							GlyphChar nextGlyphCharS = new GlyphChar(sCharIndex, GlyphType.NORMAL_CHAR);
							double glyphLogProbS = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharS);
							addState(result, context, nextType, nextLanguage, nextGlyphCharS, score + glyphLogProbS);

							GlyphChar nextGlyphCharLongs = new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR);
							double glyphLogProbLongs = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphCharLongs);
							addState(result, context, nextType, nextLanguage, nextGlyphCharLongs, score + glyphLogProbLongs);
						}
						else {
							GlyphChar nextGlyphChar = new GlyphChar(lmCharIndex, GlyphType.NORMAL_CHAR);
							double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, nextGlyphChar);
							addState(result, context, nextType, nextLanguage, nextGlyphChar, score + glyphLogProb);
						}
					}
					else {
						// Continue the hyphenated word in the same language (no language switch mid-word)
						for (int c : thisLM.getActiveCharacters()) {
							if (c != spaceCharIndex && !punctSet.contains(c)) { // can't start a line after hyphen with space or punct 
								double score = Math.log(1.0 - LINE_MRGN_PROB) + Math.log(getNgramProb(thisLM, context, c)) /*+ Math.log(1.0)*/;
								int[] nextContext = a.append((thisLM!=null ? shrinkContext(context, thisLM) : context), c);
								addGlyphStates(result, c, nextContext, TransitionStateType.TMPL, this.langIndex, score);
							}
						}
					}
				}
			}
			else if (type == TransitionStateType.LMRGN || type == TransitionStateType.LMRGN_HPHN) {
				// TODO: TAYLOR: Why do we clear the context in this case?

				{
					// Left margin of the next line, with an empty context
					double score = Math.log(LINE_MRGN_PROB);
					addNoSubGlyphStates(result, new int[0], TransitionStateType.LMRGN, this.langIndex, score);
				}

				// Any first character, with each new context consisting of that character only
				addTransitionsToTmpl(result, context, 0.0, true);
			}
			return result;
		}

		/**
		 * Log-probability of ending in this state.
		 *
		 * @return negative infinity if this state's glyph is DOUBLED or ELISION_TILDE (both
		 *         need a following glyph to be complete, so the sequence can't end on them);
		 *         0.0 otherwise
		 */
		public double endLogProb() {
			final GlyphType lastGlyphType = glyphChar.glyphType;
			final boolean awaitsFollowingGlyph = lastGlyphType == GlyphType.DOUBLED || lastGlyphType == GlyphType.ELISION_TILDE;
			return awaitsFollowingGlyph ? Double.NEGATIVE_INFINITY : 0.0;
		}

		/**
		 * Calculate forward transitions, i.e. the states that may follow this one within the
		 * current line.  By state type:
		 *
		 *   - RMRGN: stays in RMRGN.
		 *   - RMRGN_HPHN / RMRGN_HPHN_INIT: moves to (or stays in) RMRGN_HPHN.
		 *   - LMRGN: stays in LMRGN, or moves into any TMPL state.
		 *   - TMPL: moves to RMRGN (scored as a space), to RMRGN_HPHN_INIT (a line-end hyphen),
		 *     or into any TMPL state.
		 *   - LMRGN_HPHN: stays in LMRGN_HPHN or, if there is a current language, continues the
		 *     hyphenated word: a pending DOUBLED glyph is completed, otherwise any active
		 *     character of the current language other than space and punctuation may follow.
		 *
		 * @return the (state, log-score) pairs of the successor states; empty if this state's
		 *         type is none of the above
		 */
		public Collection<Tuple2<TransitionState, Double>> forwardTransitions() {
			final SingleLanguageModel currentLM = lm.get(this.langIndex);
			final List<Tuple2<TransitionState, Double>> successors = new ArrayList<Tuple2<TransitionState, Double>>();
			final double marginLogProb = Math.log(LINE_MRGN_PROB);

			if (type == TransitionStateType.RMRGN) {
				// Once in the right margin, the only option is to remain there
				addNoSubGlyphStates(successors, this.context, TransitionStateType.RMRGN, this.langIndex, marginLogProb);
				return successors;
			}

			if (type == TransitionStateType.RMRGN_HPHN || type == TransitionStateType.RMRGN_HPHN_INIT) {
				// The hyphen state (INIT) is followed by the hyphenated right margin, which then loops
				addNoSubGlyphStates(successors, this.context, TransitionStateType.RMRGN_HPHN, this.langIndex, marginLogProb);
				return successors;
			}

			if (type == TransitionStateType.LMRGN) {
				addNoSubGlyphStates(successors, this.context, TransitionStateType.LMRGN, this.langIndex, marginLogProb);
				addTransitionsToTmpl(successors, context);
				return successors;
			}

			if (type == TransitionStateType.TMPL) {
				// End of line without a hyphen: scored like a space following the current context
				double plainMarginScore = marginLogProb + Math.log(1.0 - LINE_END_HYPHEN_PROB) + Math.log(getNgramProb(currentLM, context, spaceCharIndex));
				int[] contextWithSpace = a.append((currentLM != null ? shrinkContext(context, currentLM) : context), spaceCharIndex);
				addNoSubGlyphStates(successors, spaceCharIndex, contextWithSpace, TransitionStateType.RMRGN, this.langIndex, plainMarginScore);

				// End of line with a hyphen: context is left as it is
				double hyphenMarginScore = marginLogProb + Math.log(LINE_END_HYPHEN_PROB);
				addNoSubGlyphStates(successors, this.context, TransitionStateType.RMRGN_HPHN_INIT, this.langIndex, hyphenMarginScore);

				addTransitionsToTmpl(successors, context);
				return successors;
			}

			if (type != TransitionStateType.LMRGN_HPHN) {
				return successors; // no transitions defined for any other type
			}

			// --- LMRGN_HPHN ---
			addNoSubGlyphStates(successors, this.context, TransitionStateType.LMRGN_HPHN, this.langIndex, marginLogProb);

			if (this.langIndex < 0) {
				// can't have a hyphen if there is no language, since that means there have been no characters so far
				return successors;
			}

			if (glyphChar.glyphType == GlyphType.DOUBLED) {
				// Duplicate the state: same context, language, lmChar, ...; but Doubled=>Normal.
				// A doubled 's' may be completed by either an 's' or a long-s.
				// TODO (from the original author): is it necessary to factor in some LM probability here?
				int[] completionGlyphIndices = (lmCharIndex == sCharIndex)
						? new int[] { sCharIndex, longsCharIndex }
						: new int[] { lmCharIndex };
				for (int glyphIndex : completionGlyphIndices) {
					GlyphChar completionGlyph = new GlyphChar(glyphIndex, GlyphType.NORMAL_CHAR);
					double glyphLogProb = calculateGlyphLogProb(TransitionStateType.TMPL, langIndex, lmCharIndex, completionGlyph);
					addState(successors, context, TransitionStateType.TMPL, langIndex, completionGlyph, Math.log(1.0) + glyphLogProb);
				}
				return successors;
			}

			final double templateLogProb = Math.log(1.0 - LINE_MRGN_PROB);
			for (int c : currentLM.getActiveCharacters()) {
				if (c == spaceCharIndex || punctSet.contains(c)) {
					continue; // can't start a line after hyphen with space or punct
				}
				double pDestLang = 1.0; // since there's only one language for this character, don't divide its mass across languages
				double score = templateLogProb + Math.log(getNgramProb(currentLM, context, c)) + Math.log(pDestLang);
				int[] nextContext = a.append((currentLM != null ? shrinkContext(context, currentLM) : context), c);
				addGlyphStates(successors, c, nextContext, TransitionStateType.TMPL, this.langIndex, score);
			}
			return successors;
		}

		/**
		 * @return the LM character index that was derived from this state's context and type
		 *         when the state was constructed
		 */
		public int getLmCharIndex() {
			return this.lmCharIndex;
		}
		
		/**
		 * @return the glyph associated with this state (never null)
		 */
		public GlyphChar getGlyphChar() {
			return this.glyphChar;
		}

		/**
		 * Not supported by this state type; calling it always fails.
		 *
		 * @throws Error always, with the message "Method not implemented"
		 */
		public int getOffset() {
			throw new Error("Method not implemented");
		}

		/**
		 * Not supported by this state type; calling it always fails.
		 *
		 * @throws Error always, with the message "Method not implemented"
		 */
		public int getExposure() {
			throw new Error("Method not implemented");
		}

		/**
		 * @return the transition-state type (margin, template, ...) of this state
		 */
		public TransitionStateType getType() {
			return this.type;
		}

		/**
		 * @return the language index of this state; a negative value means no
		 *         language has been chosen (see {@link #toString()})
		 */
		public int getLanguageIndex() {
			return langIndex;
		}
		
		/**
		 * Renders this state for debugging in the form
		 * {@code CodeSwitchTransitionState(language, lmChar, type, [context], glyph)}.
		 * The language is printed as "No Language" when the language index is
		 * negative, and the context characters are concatenated without a separator.
		 */
		public String toString() {
			// Resolve the context characters first, then the remaining fields.
			StringBuilder contextText = new StringBuilder("[");
			for (int contextChar : context) {
				contextText.append(charIndexer.getObject(contextChar));
			}
			contextText.append("]");

			StringBuilder out = new StringBuilder("CodeSwitchTransitionState(");
			out.append(langIndex >= 0 ? langIndexer.getObject(langIndex) : "No Language");
			out.append(", ").append(charIndexer.getObject(lmCharIndex));
			out.append(", ").append(type);
			out.append(", ").append(contextText);
			out.append(", ").append(glyphChar.toString(charIndexer));
			out.append(")");
			return out.toString();
		}
	}

	/**
	 * Appends a new {@link CodeSwitchTransitionState} and its transition score to
	 * {@code result}, unless the score is exactly negative infinity (log of zero),
	 * in which case the state is unreachable and is silently dropped.
	 *
	 * @param result               list receiving the (state, score) pair
	 * @param stateContext         context of the new state
	 * @param stateType            type of the new state
	 * @param stateLanguage        language index of the new state
	 * @param glyphChar            glyph of the new state
	 * @param stateTransitionScore log-score of transitioning into the new state
	 */
	private void addState(List<Tuple2<TransitionState, Double>> result, int[] stateContext, TransitionStateType stateType, int stateLanguage, GlyphChar glyphChar, double stateTransitionScore) {
		if (stateTransitionScore == Double.NEGATIVE_INFINITY) {
			return; // zero-probability transition: nothing to add
		}
		TransitionState state = new CodeSwitchTransitionState(stateContext, stateType, stateLanguage, glyphChar);
		result.add(Tuple2(state, stateTransitionScore));
	}

	/**
	 * Probability used when scoring a transition into (or staying in) a line-margin
	 * state; transitions into non-margin states are scored with {@code 1 - LINE_MRGN_PROB}.
	 */
	public static final double LINE_MRGN_PROB = 0.5;
	/**
	 * Probability that leaving a TMPL state for the right margin goes through the
	 * hyphenated margin (RMRGN_HPHN_INIT) rather than the plain one.
	 */
	public static final double LINE_END_HYPHEN_PROB = 1e-8;

	// Character and language indexers, both obtained from the language model in the constructor.
	private Indexer<String> charIndexer;
	private Indexer<String> langIndexer;
	// Cached character indices for the space, hyphen, "s" and long-s symbols.
	// sCharIndex is -1 when the character indexer does not contain "s".
	private int spaceCharIndex;
	private int hyphenCharIndex;
	private int sCharIndex;
	private int longsCharIndex;
	// Built by makePunctSet(charIndexer); presumably the indices of punctuation characters.
	private Set<Integer> punctSet;
	
	// Lookup tables built from the character indexer by the corresponding make* helpers.
	// They are consulted when generating candidate glyphs for an LM character.
	private Set<Integer> canBeReplaced; // LM chars that may be rendered as any char in validSubstitutionChars
	private Set<Integer> validSubstitutionChars; // chars allowed as a substitute glyph
	private Set<Integer> validDoublableSet; // LM chars that may be rendered as a DOUBLED glyph
	private Set<Integer> canBeElided; // LM chars that may be rendered as an elided glyph
	private Map<Integer, Integer> addTilde; // LM char -> char used for its ELISION_TILDE glyph
	private Map<Integer,Integer> diacriticDisregardMap; // LM char -> char used as its accent-stripped glyph

	// Number of languages known to the language model's language indexer.
	private int numLanguages;
	// Code-switching language model and glyph substitution model supplied to the constructor.
	private CodeSwitchLanguageModel lm;
	private GlyphSubstitutionModel gsm;
	// Configuration flags and prior supplied to the constructor.
	private boolean allowLanguageSwitchOnPunct;
	private boolean allowGlyphSubstitution; // when false, every state's glyph is the plain LM character
	private double noCharSubPrior; // extra probability mass given to rendering an LM char as itself
	private boolean elideAnything; // selects ELIDED (true) vs FIRST_ELIDED (false) elision glyphs

	// State types for which makeLmCharIndex always reports the space character.
	private Set<TransitionStateType> alwaysSpaceTransitionTypes;
	
	/**
	 * Determines the language-model character that a state with the given context
	 * and type stands for.
	 * 
	 * <ul>
	 *   <li>Empty context, or a type in {@code alwaysSpaceTransitionTypes}
	 *       (LMRGN, LMRGN_HPHN, RMRGN, RMRGN_HPHN): the space character.</li>
	 *   <li>Otherwise, type RMRGN_HPHN_INIT: the hyphen character.</li>
	 *   <li>Otherwise: the last character of the context.</li>
	 * </ul>
	 * 
	 * @param context LM context of the state
	 * @param type    type of the state
	 * @return the character index as described above
	 */
	private int makeLmCharIndex(int[] context, TransitionStateType type) {
		if (context.length == 0 || alwaysSpaceTransitionTypes.contains(type))
			return spaceCharIndex;
		return (type == TransitionStateType.RMRGN_HPHN_INIT)
				? hyphenCharIndex
				: context[context.length - 1];
	}

	public CodeSwitchTransitionModel(CodeSwitchLanguageModel lm, boolean allowLanguageSwitchOnPunct, GlyphSubstitutionModel gsm, boolean allowGlyphSubstitution, double noCharSubPrior, boolean elideAnything) {
		this.lm = lm;
		this.gsm = gsm;
		this.allowLanguageSwitchOnPunct = allowLanguageSwitchOnPunct;
		this.allowGlyphSubstitution = allowGlyphSubstitution;
		this.noCharSubPrior = noCharSubPrior;
		this.elideAnything = elideAnything;

		this.charIndexer = lm.getCharacterIndexer();
		this.langIndexer = lm.getLanguageIndexer();
		this.spaceCharIndex = charIndexer.getIndex(Charset.SPACE);
		this.hyphenCharIndex = charIndexer.getIndex(Charset.HYPHEN);
		this.sCharIndex = charIndexer.contains("s") ? charIndexer.getIndex("s") : -1;
		this.longsCharIndex = charIndexer.getIndex(Charset.LONG_S);
		this.punctSet = makePunctSet(charIndexer);
		this.canBeReplaced = makeCanBeReplacedSet(charIndexer);
		this.validSubstitutionChars = makeValidSubstitutionCharsSet(charIndexer);
		this.validDoublableSet = makeValidDoublableSet(charIndexer);
		this.canBeElided = makeCanBeElidedSet(charIndexer);
		this.addTilde = makeAddTildeMap(charIndexer);
		this.diacriticDisregardMap = makeDiacriticDisregardMap(charIndexer);

		this.numLanguages = lm.getLanguageIndexer().size();
		this.alwaysSpaceTransitionTypes = makeSet(TransitionStateType.LMRGN, TransitionStateType.LMRGN_HPHN, TransitionStateType.RMRGN, TransitionStateType.RMRGN_HPHN);
	}

	/**
	 * Adds a single start state whose glyph is a plain (NORMAL_CHAR) space.
	 * When glyph substitution is enabled, the glyph log-probability from
	 * {@code calculateGlyphLogProb} is added to the transition score; otherwise
	 * the transition score is used as is.
	 *
	 * @param result          list receiving the new state
	 * @param nextContext     context of the new state
	 * @param nextType        type of the new state
	 * @param nextLanguage    language index of the new state
	 * @param transitionScore log-score of the transition, before any glyph term
	 */
	private void addNoSubGlyphStartState(List<Tuple2<TransitionState, Double>> result, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
		GlyphChar spaceGlyph = new GlyphChar(spaceCharIndex, GlyphType.NORMAL_CHAR);
		double score = transitionScore;
		if (allowGlyphSubstitution) {
			score += calculateGlyphLogProb(nextType, nextLanguage, spaceCharIndex, spaceGlyph);
		}
		addState(result, nextContext, nextType, nextLanguage, spaceGlyph, score);
	}

	/**
	 * Adds the start states for LM character {@code nextLmChar}, one per glyph
	 * that the character may be rendered as.
	 * 
	 * When glyph substitution is disabled, the only glyph is the plain LM
	 * character and the transition score is used unchanged. Otherwise the
	 * candidate glyphs are:
	 * 
	 *   1. the plain rendering of the LM character
	 *   2. a substitution: any active character of the language that is a valid
	 *      substitution char (if the LM character can be replaced), and the
	 *      long-s when the LM character is "s"
	 *   3. the tilde-decorated (ELISION_TILDE) version of the LM character
	 *   5. the LM character stripped of its accents
	 *   6. FIRST_ELIDED space glyph (only when !elideAnything, type is TMPL and
	 *      the LM character can be elided)
	 *   7. a DOUBLED version of the LM character (and of the long-s for "s")
	 *   8. ELIDED space glyph (only when elideAnything, type is TMPL and the LM
	 *      character can be elided)
	 * 
	 * Each candidate is scored as {@code transitionScore} plus its glyph
	 * log-probability; duplicates are collapsed by the candidate set.
	 */
	private void addGlyphStartStates(List<Tuple2<TransitionState, Double>> result, int nextLmChar, int[] nextContext, TransitionStateType nextType, int nextLanguage, double transitionScore) {
		if (!allowGlyphSubstitution) {
			addState(result, nextContext, nextType, nextLanguage, new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR), transitionScore);
			return;
		}

		Set<GlyphChar> candidates = new HashSet<GlyphChar>();
		boolean lmCharIsS = (nextLmChar == sCharIndex);
		boolean isTmpl = (nextType == TransitionStateType.TMPL);

		// 1. Plain rendering of the LM character
		candidates.add(new GlyphChar(nextLmChar, GlyphType.NORMAL_CHAR));

		// 2. Substitutions of the LM character
		if (canBeReplaced.contains(nextLmChar)) {
			for (int substitute : lm.get(nextLanguage).getActiveCharacters()) {
				if (!validSubstitutionChars.contains(substitute))
					continue;
				candidates.add(new GlyphChar(substitute, GlyphType.NORMAL_CHAR));
			}
		}
		if (lmCharIsS) {
			candidates.add(new GlyphChar(longsCharIndex, GlyphType.NORMAL_CHAR));
		}

		// 3. Elision-decorated (tilde) version of the LM character
		Integer withTilde = addTilde.get(nextLmChar);
		if (withTilde != null) {
			candidates.add(new GlyphChar(withTilde, GlyphType.ELISION_TILDE));
		}

		// 5. LM character stripped of its accents
		Integer withoutDiacritics = diacriticDisregardMap.get(nextLmChar);
		if (withoutDiacritics != null) {
			candidates.add(new GlyphChar(withoutDiacritics, GlyphType.NORMAL_CHAR));
		}

		// 6. Elision after a space --- and the start state is always a "space"
		if (!elideAnything && isTmpl && canBeElided.contains(nextLmChar)) {
			candidates.add(new GlyphChar(spaceCharIndex, GlyphType.FIRST_ELIDED));
		}

		// 7. Doubled version of the LM character
		if (validDoublableSet.contains(nextLmChar)) {
			candidates.add(new GlyphChar(nextLmChar, GlyphType.DOUBLED));
			if (lmCharIsS) {
				candidates.add(new GlyphChar(longsCharIndex, GlyphType.DOUBLED));
			}
		}

		// 8. Elide the character
		if (elideAnything && isTmpl && canBeElided.contains(nextLmChar)) {
			candidates.add(new GlyphChar(spaceCharIndex, GlyphType.ELIDED));
		}

		// One state per candidate glyph, scored with its glyph log-probability
		for (GlyphChar candidate : candidates) {
			double glyphLogProb = calculateGlyphLogProb(nextType, nextLanguage, nextLmChar, candidate);
			addState(result, nextContext, nextType, nextLanguage, candidate, transitionScore + glyphLogProb);
		}
	}

	/**
	 * Builds the states in which a line may start, with their log-scores.
	 * 
	 * 1. A left margin (LMRGN) with empty context and no language:
	 *    log(LINE_MRGN_PROB).
	 * 2. For every language and every active non-space character c of that
	 *    language, TMPL states with context [c] (one per candidate glyph):
	 *    log(1 - LINE_MRGN_PROB) + log(P(c | no context)) + log(language prior).
	 * 3. A TMPL state with context [space] and no language. No language is
	 *    forced before an actual word is seen, so the space probability is the
	 *    no-context space probability summed over all languages, weighted by
	 *    the language priors:
	 *    log(1 - LINE_MRGN_PROB) + log(sum_l P_l(space | no context) * prior(l)).
	 * 
	 * @return the (state, log-score) pairs for all possible line starts
	 */
	public Collection<Tuple2<TransitionState, Double>> startStates() {
		List<Tuple2<TransitionState, Double>> starts = new ArrayList<Tuple2<TransitionState, Double>>();
		final int noLanguage = -1;
		final double logMargin = Math.log(LINE_MRGN_PROB);
		final double logNotMargin = Math.log(1.0 - LINE_MRGN_PROB);

		// 1. Left margin; don't force a language choice.
		addNoSubGlyphStartState(starts, new int[0], TransitionStateType.LMRGN, noLanguage, logMargin);

		// 2. An actual (non-space) character: choose among all the languages.
		for (int language = 0; language < numLanguages; language++) {
			SingleLanguageModel languageLM = lm.get(language);
			double logPrior = Math.log(lm.languagePrior(language));
			for (int c : languageLM.getActiveCharacters()) {
				if (c == spaceCharIndex)
					continue;
				double score = logNotMargin + Math.log(getNgramProb(languageLM, new int[0], c)) + logPrior;
				addGlyphStartStates(starts, c, new int[] { c }, TransitionStateType.TMPL, language, score);
			}
		}

		// 3. A leading space: marginalize over languages (the language-less case of getNgramProb).
		double spaceProb = getNgramProb((SingleLanguageModel) null, new int[0], spaceCharIndex);
		addNoSubGlyphStartState(starts, new int[] { spaceCharIndex }, TransitionStateType.TMPL, noLanguage, logNotMargin + Math.log(spaceProb));

		return starts;
	}

	/**
	 * Probability of character {@code c} following {@code context}.
	 *
	 * With a language model, this is that model's n-gram probability on the
	 * shrunken context. With no language model ({@code slm == null}), it is the
	 * sum over all languages of each language's n-gram probability times that
	 * language's prior.
	 *
	 * @param slm     language model of the current language, or null if there is none
	 * @param context preceding character indices
	 * @param c       character index being predicted
	 * @return the (not log) probability of {@code c}
	 */
	private double getNgramProb(SingleLanguageModel slm, int[] context, int c) {
		if (slm == null) {
			// No current language: marginalize over languages.
			double marginalProb = 0.0;
			for (int l = 0; l < numLanguages; l++) {
				SingleLanguageModel model = this.lm.get(l);
				double probInLanguage = model.getCharNgramProb(shrinkContext(context, model), c);
				marginalProb += probInLanguage * this.lm.languagePrior(l);
			}
			return marginalProb;
		}
		return slm.getCharNgramProb(shrinkContext(context, slm), c);
	}

	//	private int[] appendToContext(int[] originalContext, int c, SingleLanguageModels lm) {
	//		return shrinkContext(a.append(originalContext, c), slm);
	//	}

	/**
	 * Log-probability of rendering LM character {@code nextLmChar} as
	 * {@code nextGlyphChar} in a state of the given type and language.
	 *
	 * Without a language ({@code nextLanguage < 0}), the result is log(1) if the
	 * type is one of {@code alwaysSpaceTransitionTypes} and the glyph's template
	 * character is a space, and log(0) otherwise. With a language, the glyph
	 * substitution model's probability is scaled by {@code 1 - noCharSubPrior},
	 * and {@code noCharSubPrior} is added when the glyph is the plain rendering
	 * of the LM character itself.
	 *
	 * NOTE(review): a language-less TMPL state therefore always scores log(0)
	 * here and is dropped by addState; startStates creates such a state for a
	 * leading space when glyph substitution is enabled -- confirm this is intended.
	 *
	 * @return the log-probability, possibly {@code Double.NEGATIVE_INFINITY}
	 */
	private double calculateGlyphLogProb(TransitionStateType nextType, int nextLanguage, int nextLmChar, GlyphChar nextGlyphChar) {
		if (nextLanguage < 0) {
			boolean certainSpace = this.alwaysSpaceTransitionTypes.contains(nextType) && nextGlyphChar.templateCharIndex == spaceCharIndex;
			return certainSpace ? 0.0 /* log(1) */ : Double.NEGATIVE_INFINITY /* log(0) */;
		}

		double scaledGsmProb = (1.0 - noCharSubPrior) * gsm.glyphProb(nextLanguage, nextLmChar, nextGlyphChar);
		boolean rendersItself = nextGlyphChar.glyphType == GlyphType.NORMAL_CHAR && nextGlyphChar.templateCharIndex == nextLmChar;
		if (rendersItself) {
			return Math.log(noCharSubPrior + scaledGsmProb);
		}
		return Math.log(scaledGsmProb);
	}

	/**
	 * Reduces a context to what the given language model can use: first keeps
	 * only the right-most {@code maxOrder - 1} characters, then lets the model
	 * shrink it further via {@code slm.shrinkContext}.
	 *
	 * @param originalContext preceding character indices; not modified here
	 * @param slm             language model the context is for; if null, the
	 *                        context is returned unchanged
	 * @return the shrunken context
	 */
	private int[] shrinkContext(int[] originalContext, SingleLanguageModel slm) {
		// Guard before the first dereference: previously slm.getMaxOrder() ran
		// ahead of the null check, so a null model threw a NullPointerException
		// and the check itself was dead code.
		if (slm == null) {
			return originalContext;
		}
		int maxContextLength = slm.getMaxOrder() - 1;
		int[] newContext = originalContext;
		while (newContext.length > maxContextLength)
			newContext = ArrayHelper.takeRight(newContext, maxContextLength);
		return slm.shrinkContext(newContext);
	}
}