LanguageCipher.java example

Explorer
SquidLib-master
package squidpony;

import regexodus.Category;
import regexodus.MatchResult;
import regexodus.Pattern;
import regexodus.Replacer;
import regexodus.Substitution;
import regexodus.TextBuffer;
import squidpony.squidmath.CrossHash;
import squidpony.squidmath.StatefulRNG;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

/**
 * Class that builds up a dictionary of words in a source text to words generated by a FakeLanguageGen, and can
 * "translate" a source text to a similarly-punctuated, similarly-capitalized fake text. Uses a hash of each word in the
 * source text to determine the RNG seed that FakeLanguageGen will use, so the translation is not random. Can cipher a
 * typically English text and generate a text with FakeLanguageGen, but also decipher such a generated text with a
 * fully-complete, partially-complete, or partially-incorrect vocabulary.
 * <br>
 * This defaults to caching source-language words to their generated-language word translations in the field table, as
 * well as the reverse translation in reverse. This can be changed to reduce memory usage for large vocabularies with
 * {@code setCacheLevel()}, where it starts at 2 (writing to table and reverse), and can be lowered to 1 (writing to
 * table only) if you don't need reverse to decipher a language easily, or to 0 (writing to neither) if you expect that
 * memory will be at a premium and don't mind re-generating the same word each time it occurs in a source text. If
 * cacheLevel is 1 or less, then this will not check for overlap between previously-generated words (it won't have an
 * easy way to look up previously-generated ones) and so may be impossible to accurately decipher. As an example, one
 * test of level 1 generated "he" as the translation for both "a" and "at", so every time "a" had been ciphered and then
 * deciphered, the reproduced version said "at" instead. This won't happen by default, but the default instead relies on
 * words being entered as inputs to cipher() or lookup() in the same order. If words are entered in two different orders
 * to different runs of the program, they may have different generated results if cacheLevel is 2. One way to handle
 * this is to use cacheLevel 2 and cipher the whole game script, or just the unique words in it (maybe just a large word
 * list, such as http://wordlist.aspell.net/12dicts/ ), then serialize the LanguageCipher for later usage.
 * Created by Tommy Ettinger on 5/1/2016.
 * @author Tommy Ettinger
 * @see NaturalLanguageCipher NaturalLanguageCipher offers similar features but handles common prefixes and suffixes.
 */
public class LanguageCipher implements Serializable{
    private static final long serialVersionUID = 1287835632461186341L;
    /**
     * The FakeLanguageGen this will use to construct words; normally one of the static fields in FakeLanguageGen or a
     * FakeLanguageGen produced by using the mix() method of one of them. Manually constructing FakeLanguageGen objects
     * isn't especially easy, and if you decide to do that it's recommended you look at SquidLib's source to see how the
     * existing calls to constructors work.
     */
    public FakeLanguageGen language;
    /**
     * Re-seeded in {@link #lookup(String)} from the hash of each new source word (plus {@link #shift}), so the word
     * generated for a given source word does not depend on earlier random draws.
     */
    private StatefulRNG rng;
    // not an OrderedMap because this should never need a random element to be requested
    /**
     * The mapping of lower-case word keys to lower-case word values, where keys are in the source language and values
     * are generated by language.
     */
    public HashMap<String, String> table;
    /**
     * The mapping of lower-case word keys to lower-case word values, where keys are generated by language and values
     * are in the source language. Can be used as a complete vocabulary when passed to decipher.
     */
    public HashMap<String, String> reverse;
    /**
     * Matches the words in a source text that {@link #cipher(CharSequence)} will replace.
     */
    private static final Pattern wordMatch = Pattern.compile("(\\pL+)|(\\pL[\\pL-]*\\pL)");

    /**
     * The degree of vocabulary to cache to speed up future searches at the expense of memory usage.
     * <ul>
     * <li>2 will cache source words to generated words in table, and generated to source in reverse.</li>
     * <li>1 will cache source words to generated words in table, and won't write to reverse.</li>
     * <li>0 won't write to table or reverse.</li>
     * </ul>
     * Defaults to 2, writing to both table and reverse. Prefer {@link #setCacheLevel(int)} over assigning this
     * directly, since the setter clamps the value into the range 0 to 2.
     */
    public int cacheLevel = 2;

    /**
     * Added to the 64-bit hash of each source word to produce the RNG seed used when generating that word's
     * translation; two ciphers with the same language but different shifts seed their generators differently.
     */
    public final long shift;

    /**
     * Constructs a LanguageCipher that uses a copy of {@code FakeLanguageGen.ENGLISH} and a shift of 0.
     */
    public LanguageCipher()
    {
        this(FakeLanguageGen.ENGLISH);
    }

    /**
     * Constructs a LanguageCipher that will use the given style of language generator to produce its text, with a
     * shift of 0.
     * @param language a FakeLanguageGen, typically one of the static constants in that class or a mix of them.
     */
    public LanguageCipher(FakeLanguageGen language)
    {
        this(language, 0);
    }

    /**
     * Constructs a LanguageCipher that will use the given style of language generator to produce its text.
     * The generator is copied, so later changes to the argument do not affect this cipher.
     * @param language a FakeLanguageGen, typically one of the static constants in that class or a mix of them.
     * @param shift any long; this will be used to alter the specific words generated unless it is 0
     */
    public LanguageCipher(FakeLanguageGen language, long shift)
    {
        this.shift = shift;
        this.language = language.copy();
        rng = new StatefulRNG();
        table = new HashMap<>(512);
        reverse = new HashMap<>(512);
    }

    /**
     * Copies another LanguageCipher and constructs this one with the information in the other. Copies the dictionary
     * of known words (both table and reverse), the FakeLanguageGen style, the shift, and the cache level.
     * @param other a previously-constructed LanguageCipher.
     */
    public LanguageCipher(LanguageCipher other)
    {
        this.language = other.language.copy();
        this.rng = new StatefulRNG();
        this.table = new HashMap<>(other.table);
        this.reverse = new HashMap<>(other.reverse);
        this.shift = other.shift;
        // Without this the copy would silently revert to the default level of 2.
        this.cacheLevel = other.cacheLevel;
    }

    /**
     * Copies the capitalization pattern of a matched word onto a replacement word, in place.
     * Does nothing when the replacement is empty, so an empty translation cannot cause an index error.
     * @param chars the lower-case replacement word; modified in place
     * @param capitalizeFirst true if the first char of the replacement should be upper-cased
     * @param capitalizeRest true if every char after the first should be upper-cased
     */
    private static void applyCase(char[] chars, boolean capitalizeFirst, boolean capitalizeRest)
    {
        if(chars.length == 0)
            return;
        if(capitalizeFirst)
            chars[0] = Character.toUpperCase(chars[0]);
        if(capitalizeRest) {
            for (int i = 1; i < chars.length; i++) {
                chars[i] = Character.toUpperCase(chars[i]);
            }
        }
    }

    /**
     * Given a word in the source language (usually English), looks up an existing translation for that word, or if none
     * exists, generates a new word based on the hash of the source word and this LanguageCipher's FakeLanguageGen.
     * The result is capitalized like source: the first letter is upper-cased if source's first char is an upper-case
     * letter, and all later letters are upper-cased if source's second char is an upper-case letter.
     * @param source a word in the source language
     * @return a word in the fake language, or the empty String if source is null or empty
     */
    public String lookup(String source)
    {
        if(source == null || source.isEmpty())
            return "";
        final String lower = source.toLowerCase();
        String ciphered = table.get(lower);
        if(ciphered == null) {
            // Seeding from the word's hash makes the first candidate for a word repeatable.
            rng.setState(CrossHash.hash64(lower) + shift);
            int attempts = 0;
            do {
                ciphered = language.word(rng, false, (int) Math.ceil(lower.length() / (2.2 + rng.nextDouble())));
                // Collisions can only be detected when reverse is being maintained (level 2); after the retry
                // budget is used up, the colliding word is accepted and overwrites the older entry in reverse.
                if(cacheLevel < 2 || attempts++ > 9)
                    break;
            } while (reverse.containsKey(ciphered));
            if(cacheLevel == 2)
                reverse.put(ciphered, lower);
            if(cacheLevel == 1 || cacheLevel == 2)
                table.put(lower, ciphered);
        }
        char[] chars = ciphered.toCharArray();
        // Lu is the upper case letter category in Unicode; we're using regexodus for this because GWT won't
        // respect unicode case data on its own (see
        // https://github.com/gwtproject/gwt/blob/2.6.1/user/super/com/google/gwt/emul/java/lang/Character.java#L54-L61
        // ). We are using GWT to capitalize, though, which appears to work in practice and the docs agree.
        applyCase(chars,
                Category.Lu.contains(source.charAt(0)),
                source.length() > 1 && Category.Lu.contains(source.charAt(1)));
        return new String(chars);
    }

    /**
     * Given a String, StringBuilder, or other CharSequence that should contain words in the source language, this
     * translates each word to the fake language, using existing translations if previous calls to cipher() or lookup()
     * had translated that word.
     * @param text a CharSequence, such as a String, that contains words in the source language
     * @return a String of the translated text.
     */
    public String cipher(CharSequence text)
    {
        Replacer rep = wordMatch.replacer(new CipherSubstitution());
        return rep.replace(text);
    }

    /**
     * Replaces each matched source word with its translation from {@link #lookup(String)}. This stays a non-static
     * inner class because it needs the enclosing cipher's lookup().
     */
    private class CipherSubstitution implements Substitution
    {
        @Override
        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            dest.append(lookup(match.group(0)));
        }
    }

    /**
     * Replaces each matched fake-language word with its entry in a vocabulary, copying the matched word's
     * capitalization onto the replacement. Words with no entry are emitted unchanged.
     */
    private static class DecipherSubstitution implements Substitution
    {
        private final Map<String, String> vocabulary;
        DecipherSubstitution(final Map<String, String> vocabulary)
        {
            this.vocabulary = vocabulary;
        }
        @Override
        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            final String matched = match.group(0);
            if(matched == null) {
                return;
            }
            // Vocabulary keys are lower-case, so the lookup has to be lower-case too.
            final String translated = vocabulary.get(matched.toLowerCase());
            if(translated == null) {
                dest.append(matched);
                return;
            }
            char[] chars = translated.toCharArray();
            applyCase(chars,
                    !matched.isEmpty() && Category.Lu.contains(matched.charAt(0)),
                    matched.length() > 1 && Category.Lu.contains(matched.charAt(1)));
            dest.append(chars, 0, chars.length);
        }
    }

    /**
     * Deciphers words in an already-ciphered text with a given String-to-String Map for a vocabulary. This Map could be
     * the reverse field of this LanguageCipher, which would give a complete translation, or it could be a
     * partially-complete or partially-correct vocabulary of words the player has learned. The vocabulary should
     * typically have entries added using the quick and accurate learnTranslations() method, unless you want to add
     * translations one word at a time (then use learnTranslation() ) or you want incorrect or biased translations added
     * (then use mismatchTranslation() ). You don't need to use one of these methods if you just pass the whole of the
     * reverse field as a vocabulary, which will translate every word. If making your own vocabulary without the learn
     * methods, the keys need to be lower-case because while regex Patterns can be case-insensitive, Map lookups cannot.
     * <br>
     * If vocabulary is null or empty there is nothing that could be translated, so text is returned unchanged.
     * @param text a text in the fake language
     * @param vocabulary a Map of Strings in the fake language to Strings in the source language
     * @return a deciphered version of text that has any words as keys in vocabulary translated to the source language
     */
    public String decipher(String text, final Map<String, String> vocabulary)
    {
        // An empty alternation would produce an invalid pattern, so handle the "nothing known" case up front.
        if(vocabulary == null || vocabulary.isEmpty())
            return text;
        // Builds \b(?:(?:\Qkey1\E)|(?:\Qkey2\E)|...)\b ; \Q...\E quotes each key so it is matched literally.
        StringBuilder sb = new StringBuilder(128);
        sb.append("\\b(?:");
        boolean first = true;
        for(String k : vocabulary.keySet())
        {
            if(!first)
                sb.append('|');
            first = false;
            sb.append("(?:\\Q").append(k).append("\\E)");
        }
        sb.append(")\\b");

        Pattern pat = Pattern.compile(sb.toString(), "ui");
        Replacer rep = pat.replacer(new DecipherSubstitution(vocabulary));
        return rep.replace(text);
    }

    /**
     * Adds a translation pair to vocabulary so it can be used in decipher, giving a correct translation for sourceWord.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct a mismatched
     * translation added to vocabulary with mismatchTranslation. The key is the ciphered form of the lower-cased
     * sourceWord; the value is sourceWord exactly as given.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param sourceWord a word in the source language, typically English; the meaning will be "learned" for decipher
     * @return this, for chaining
     */
    public LanguageCipher learnTranslation(Map<String, String> vocabulary, String sourceWord)
    {
        vocabulary.put(lookup(sourceWord.toLowerCase()), sourceWord);
        return this;
    }

    /**
     * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched
     * translations added to vocabulary with mismatchTranslation.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param sourceWords an array or vararg of words in the source language, typically English; their meanings will
     *                    be "learned" for decipher
     * @return this, for chaining
     */
    public LanguageCipher learnTranslations(Map<String, String> vocabulary, String... sourceWords)
    {
        for (String word : sourceWords) {
            learnTranslation(vocabulary, word);
        }
        return this;
    }

    /**
     * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched
     * translations added to vocabulary with mismatchTranslation.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param sourceWords an Iterable of words in the source language, typically English; their meanings will be
     *                   "learned" for decipher
     * @return this, for chaining
     */
    public LanguageCipher learnTranslations(Map<String, String> vocabulary, Iterable<String> sourceWords)
    {
        for (String word : sourceWords) {
            learnTranslation(vocabulary, word);
        }
        return this;
    }

    /**
     * Adds a translation pair to vocabulary so it can be used in decipher, giving a typically-incorrect translation for
     * correctWord where it provides mismatchWord instead when the ciphered version of correctWord appears.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. You can use learnTranslation() to
     * correct a mismatched vocabulary word, or mismatchTranslation() again to change the mismatched word.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param correctWord a word in the source language, typically English; where the ciphered version of this
     *                    appears and the text is deciphered, mismatchWord will be used instead
     * @param mismatchWord a String that will be used for deciphering in place of the translation of correctWord.
     * @return this, for chaining
     */
    public LanguageCipher mismatchTranslation(Map<String, String> vocabulary, String correctWord, String mismatchWord)
    {
        vocabulary.put(lookup(correctWord.toLowerCase()), mismatchWord);
        return this;
    }

    /**
     * Gets the current cache level; see {@link #cacheLevel} for what each level means.
     * @return the current value of cacheLevel
     */
    public int getCacheLevel() {
        return cacheLevel;
    }

    /**
     * Sets the cache level, clamping the given value into the range 0 to 2 inclusive.
     * @param cacheLevel the desired level; values below 0 are treated as 0 and values above 2 as 2
     */
    public void setCacheLevel(int cacheLevel) {
        this.cacheLevel = Math.max(0, Math.min(2, cacheLevel));
    }
}