package edu.berkeley.cs.nlp.ocular.lm;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import tberg.murphy.indexer.Indexer;

/**
 * A code-switching language model built from one {@link SingleLanguageModel} per language,
 * a normalized prior over languages, and a table of language-to-language transition
 * probabilities derived from those priors and a "keep the same language" probability.
 *
 * TODO: Move some of the probability calculations from CodeSwitchTransitionModel to here?
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class BasicCodeSwitchLanguageModel implements CodeSwitchLanguageModel {
  private static final long serialVersionUID = 3298359823L;

  private final Indexer<String> langIndexer;

  /**
   * Map[destinationLanguage -> destinationLM]
   */
  private final List<SingleLanguageModel> subModels;

  /**
   * Map[destinationLanguage -> "prior prob of seeing destinationLanguage"]
   */
  private final List<Double> languagePriors;

  /**
   * Map[destinationLanguage -> Map[fromLanguage, "prior prob of switching from "from" to "destination"]]
   */
  private final List<List<Double>> languageTransitionProbs;

  private final Indexer<String> charIndexer;

  private final double pKeepSameLanguage;

  /**
   * Builds the model, normalizing the given priors so that they sum to 1.0 and deriving the
   * language transition table via {@link #makeLanguageTransitionProbs}.
   *
   * @param subModelsAndPriors one (language model, unnormalized prior) pair per language, in
   *        language-index order; must be non-empty and contain exactly
   *        {@code langIndexer.size()} entries
   * @param charIndexer the character indexer, stored and returned as-is
   * @param langIndexer the language indexer, stored and returned as-is
   * @param pKeepSameLanguage probability of keeping the same language; must be in (0, 1]
   * @throws IllegalArgumentException if the list is empty, if its size differs from the number
   *         of indexed languages, if {@code pKeepSameLanguage} is outside (0, 1] or NaN, or if
   *         any prior is not strictly positive (including NaN)
   */
  public BasicCodeSwitchLanguageModel(List<Tuple2<SingleLanguageModel, Double>> subModelsAndPriors, Indexer<String> charIndexer, Indexer<String> langIndexer, double pKeepSameLanguage) {
    if (subModelsAndPriors.isEmpty()) {
      throw new IllegalArgumentException("languageModelsAndPriors may not be empty");
    }
    checkKeepProbability(pKeepSameLanguage);

    // The sum below is taken over the indexed languages while the normalized priors are built
    // from every list entry, so the two sizes must agree for the priors to sum to 1.0.
    int numLanguages = langIndexer.size();
    if (subModelsAndPriors.size() != numLanguages) {
      throw new IllegalArgumentException("expected one (model, prior) pair per language, but got "
          + subModelsAndPriors.size() + " pairs for " + numLanguages + " languages");
    }

    // Total prob, for normalizing
    double languagePriorSum = 0.0;
    for (int langIndex = 0; langIndex < numLanguages; ++langIndex) {
      double prior = subModelsAndPriors.get(langIndex)._2;
      checkPositivePrior(prior, langIndexer, langIndex);
      languagePriorSum += prior;
    }

    List<SingleLanguageModel> models = new ArrayList<SingleLanguageModel>(numLanguages);
    List<Double> normalizedPriors = new ArrayList<Double>(numLanguages);
    for (Tuple2<SingleLanguageModel, Double> lmAndPrior : subModelsAndPriors) {
      models.add(lmAndPrior._1);
      normalizedPriors.add(lmAndPrior._2 / languagePriorSum);
    }

    this.subModels = models;
    this.languagePriors = normalizedPriors;
    this.languageTransitionProbs = makeLanguageTransitionProbs(normalizedPriors, pKeepSameLanguage, langIndexer);
    this.charIndexer = charIndexer;
    this.langIndexer = langIndexer;
    this.pKeepSameLanguage = pKeepSameLanguage;
  }

  /**
   * @return the language indexer supplied at construction
   */
  public Indexer<String> getLanguageIndexer() {
    return langIndexer;
  }

  /**
   * Looks up the sub-model for a language index.
   *
   * @param language a language index, or -1 (presumably the "no language" marker -- confirm
   *        against callers)
   * @return the corresponding language model, or {@code null} when {@code language} is -1
   */
  public SingleLanguageModel get(int language) {
    return (language == -1) ? null : subModels.get(language);
  }

  /**
   * @param language a language index
   * @return the normalized prior probability of that language
   */
  public double languagePrior(int language) {
    return languagePriors.get(language);
  }

  /**
   * @param fromLanguage index of the language being switched from
   * @param destinationLanguage index of the language being switched to
   * @return the probability of moving from {@code fromLanguage} to {@code destinationLanguage}
   */
  public double languageTransitionProb(int fromLanguage, int destinationLanguage) {
    // The table is keyed by destination first, then by source.
    return languageTransitionProbs.get(destinationLanguage).get(fromLanguage);
  }

  /**
   * @return the character indexer supplied at construction
   */
  public Indexer<String> getCharacterIndexer() {
    return charIndexer;
  }

  /**
   * @return the "keep the same language" probability supplied at construction
   */
  public double getProbKeepSameLanguage() {
    return pKeepSameLanguage;
  }

  /**
   * Builds the language transition table. Each raw entry is
   * {@code (from == dest ? pKeepSameLanguage : pSwitch) * prior(dest)}, where {@code pSwitch}
   * spreads {@code 1 - pKeepSameLanguage} evenly over the other languages; the entries for each
   * "from" language are then normalized to sum to 1.0 across destinations.
   *
   * @param languagePriors Map[destinationLanguage, "prior prob of destinationLanguage"]; must be
   *        non-empty and have an entry for every index in {@code langIndexer}
   * @param pKeepSameLanguage The prior probability of deterministically keeping the same language on a word boundary.
   * @param langIndexer the language indexer; its size determines the number of languages
   * @return Map[destinationLanguage -> Map[fromLanguage, "prob of switching from "from" to "to"]]
   * @throws IllegalArgumentException if {@code languagePriors} is empty, if
   *         {@code pKeepSameLanguage} is outside (0, 1] or NaN, or (when there is more than one
   *         language) if any prior is not strictly positive (including NaN)
   */
  public static List<List<Double>> makeLanguageTransitionProbs(List<Double> languagePriors, double pKeepSameLanguage, Indexer<String> langIndexer) {
    if (languagePriors.isEmpty()) {
      throw new IllegalArgumentException("languagePriors may not be empty");
    }
    checkKeepProbability(pKeepSameLanguage);

    int numLanguages = langIndexer.size();
    if (numLanguages <= 1) {
      // Only one language means no switching ever, so probability of keeping the same language is 1.0
      return Collections.singletonList(Collections.singletonList(1.0));
    }

    double pSwitchLanguages = (1.0 - pKeepSameLanguage) / (numLanguages - 1);

    // Map[destinationLanguage -> Map[fromLanguage, "prob of switching from "from" to "to"]]
    List<List<Double>> result = new ArrayList<List<Double>>(numLanguages);
    for (int destLanguage = 0; destLanguage < numLanguages; ++destLanguage) {
      double destPrior = languagePriors.get(destLanguage);
      checkPositivePrior(destPrior, langIndexer, destLanguage);

      List<Double> transitionPriors = new ArrayList<Double>(numLanguages);
      for (int fromLanguage = 0; fromLanguage < numLanguages; ++fromLanguage) {
        // Keep/switch probability, weighted by the (normalized) prior of the destination language.
        double transitionProb = (fromLanguage == destLanguage) ? pKeepSameLanguage : pSwitchLanguages;
        transitionPriors.add(transitionProb * destPrior);
      }
      result.add(transitionPriors);
    }

    // Normalize so that, for each "from" language, the probabilities over destinations sum to 1.0.
    for (int fromLanguage = 0; fromLanguage < numLanguages; ++fromLanguage) {
      double transitionPriorSum = 0.0;
      for (List<Double> transitionPriors : result) {
        transitionPriorSum += transitionPriors.get(fromLanguage);
      }
      for (List<Double> transitionPriors : result) {
        transitionPriors.set(fromLanguage, transitionPriors.get(fromLanguage) / transitionPriorSum);
      }
    }
    return result;
  }

  /**
   * Returns the probability of character {@code c} given {@code context}, computed as the sum
   * over all languages of that language's n-gram probability weighted by its prior.
   *
   * TODO: This is really just here for DenseBigramTransitionModel to use.
   * I have *NO IDEA* whether it matters that this doesn't consider:
   *   a) the role of spaces in determining language switches (it
   *      kind of assumes that every character can be a different
   *      language)
   *   b) languageTransitionProb (since we can't track the language
   *      of the context)
   *
   * No special handling is applied when the context ends in a space: every language is
   * treated as possible at every position.
   *
   * @param context the preceding character indices, passed through to each sub-model
   * @param c the index of the character being scored
   * @return the prior-weighted mixture of the sub-models' n-gram probabilities
   */
  public double getCharNgramProb(int[] context, int c) {
    double probSum = 0.0;
    for (int language = 0; language < this.langIndexer.size(); ++language) {
      probSum += subModels.get(language).getCharNgramProb(context, c) * languagePriors.get(language);
    }
    return probSum;
  }

  /** Rejects keep-probabilities outside (0, 1]; written so that NaN is rejected as well. */
  private static void checkKeepProbability(double pKeepSameLanguage) {
    if (!(pKeepSameLanguage > 0.0 && pKeepSameLanguage <= 1.0)) {
      throw new IllegalArgumentException("pKeepSameLanguage must be between 0 and 1, was " + pKeepSameLanguage);
    }
  }

  /** Rejects priors that are not strictly positive; written so that NaN is rejected as well. */
  private static void checkPositivePrior(double prior, Indexer<String> langIndexer, int langIndex) {
    if (!(prior > 0.0)) {
      throw new IllegalArgumentException("prior on " + langIndexer.getObject(langIndex) + " is not positive (it's " + prior + ")");
    }
  }
}