package squidpony; import regexodus.Category; import regexodus.MatchResult; import regexodus.Pattern; import regexodus.Replacer; import regexodus.Substitution; import regexodus.TextBuffer; import squidpony.squidmath.CrossHash; import squidpony.squidmath.StatefulRNG; import java.io.Serializable; import java.util.HashMap; import java.util.Map; /** * Class that builds up a dictionary of words in a source text to words generated by a FakeLanguageGen, and can * "translate" a source text to a similarly-punctuated, similarly-capitalized fake text. Uses a hash of each word in the * source text to determine the RNG seed that FakeLanguageGen will use, so the translation is not random. Can cipher a * typically English text and generate a text with FakeLanguageGen, but also decipher such a generated text with a * fully-complete, partially-complete, or partially-incorrect vocabulary. * <br> * This defaults to caching source-language words to their generated-language word translations in the field table, as * well as the reverse translation in reverse. This can be changed to reduce memory usage for large vocabularies with * {@code setCacheLevel()}, where it starts at 2 (writing to table and reverse), and can be lowered to 1 (writing to * table only) if you don't need reverse to decipher a language easily, or to 0 (writing to neither) if you expect that * memory will be at a premium and don't mind re-generating the same word each time it occurs in a source text. If * cacheLevel is 1 or less, then this will not check for overlap between previously-generated words (it won't have an * easy way to look up previously-generated ones) and so may be impossible to accurately decipher. As an example, one * test of level 1 generated "he" as the translation for both "a" and "at", so every time "a" had been ciphered and then * deciphered, the reproduced version said "at" instead. This won't happen by default, but the default instead relies on * words being entered as inputs to cipher() or lookup() in the same order. If words are entered in two different orders * to different runs of the program, they may have different generated results if cacheLevel is 2. One way to handle * this is to use cacheLevel 2 and cipher the whole game script, or just the unique words in it (maybe just a large word * list, such as http://wordlist.aspell.net/12dicts/ ), then serialize the LanguageCipher for later usage. * Created by Tommy Ettinger on 5/1/2016. * @author Tommy Ettinger * @see NaturalLanguageCipher NaturalLanguageCipher offers similar features but handles common prefixes and suffixes. */ public class LanguageCipher implements Serializable{ private static final long serialVersionUID = 1287835632461186341L; /** * The FakeLanguageGen this will use to construct words; normally one of the static fields in FakeLanguageGen or a * FakeLanguageGen produced by using the mix() method of one of them. Manually constructing FakeLanguageGen objects * isn't especially easy, and if you decide to do that it's recommended you look at SquidLib's source to see how the * existing calls to constructors work. */ public FakeLanguageGen language; private StatefulRNG rng; // not an OrderedMap because this should never be need a random element to be requested /** * The mapping of lower-case word keys to lower-case word values, where keys are in the source language and values * are generated by language. */ public HashMap<String, String> table, /** * The mapping of lower-case word keys to lower-case word values, where keys are generated by language and values * are in the source language. Can be used as a complete vocabulary when passed to decipher. */ reverse; private static final Pattern wordMatch = Pattern.compile("(\\pL+)|(\\pL[\\pL-]*\\pL)"); /** * The degree of vocabulary to cache to speed up future searches at the expense of memory usage. * <ul> * <li>2 will cache source words to generated words in table, and generated to source in reverse.</li> * <li>1 will cache source words to generated words in table, and won't write to reverse.</li> * <li>0 won't write to table or reverse.</li> * </ul> * Defaults to 2, writing to both table and reverse. */ public int cacheLevel = 2; public final long shift; /** * Constructs a LanguageCipher that will generate English-like or Dutch-like text by default. */ public LanguageCipher() { this(FakeLanguageGen.ENGLISH); } /** * Constructs a LanguageCipher that will use the given style of language generator to produce its text. * @param language a FakeLanguageGen, typically one of the static constants in that class or a mix of them. */ public LanguageCipher(FakeLanguageGen language) { this(language, 0); } /** * Constructs a LanguageCipher that will use the given style of language generator to produce its text. * @param language a FakeLanguageGen, typically one of the static constants in that class or a mix of them. * @param shift any long; this will be used to alter the specific words generated unless it is 0 */ public LanguageCipher(FakeLanguageGen language, long shift) { this.shift = shift; this.language = language.copy(); rng = new StatefulRNG(); table = new HashMap<>(512); reverse = new HashMap<>(512); } /** * Copies another LanguageCipher and constructs this one with the information in the other. Copies the dictionary * of known words, as well as the FakeLanguageGen style and everything else. * @param other a previously-constructed LanguageCipher. */ public LanguageCipher(LanguageCipher other) { this.language = other.language.copy(); this.rng = new StatefulRNG(); this.table = new HashMap<>(other.table); this.reverse = new HashMap<>(other.reverse); this.shift = other.shift; } /** * Given a word in the source language (usually English), looks up an existing translation for that word, or if none * exists, generates a new word based on the hash of the source word and this LanguageCipher's FakeLanguageGen. * @param source a word in the source language * @return a word in the fake language */ public String lookup(String source) { if(source == null || source.isEmpty()) return ""; String s2 = source.toLowerCase(), ciphered; if(table.containsKey(s2)) ciphered = table.get(s2); else { long h = CrossHash.hash64(s2) + shift, frustration = 0; rng.setState(h); do { ciphered = language.word(rng, false, (int) Math.ceil(s2.length() / (2.2 + rng.nextDouble()))); if(cacheLevel < 2 || frustration++ > 9) break; }while (reverse.containsKey(ciphered)); switch (cacheLevel) { case 2: reverse.put(ciphered, s2); case 1: table.put(s2, ciphered); } } char[] chars = ciphered.toCharArray(); // Lu is the upper case letter category in Unicode; we're using regexodus for this because GWT won't // respect unicode case data on its own (see // https://github.com/gwtproject/gwt/blob/2.6.1/user/super/com/google/gwt/emul/java/lang/Character.java#L54-L61 // ). We are using GWT to capitalize, though, which appears to work in practice and the docs agree. if(Category.Lu.contains(source.charAt(0))) chars[0] = Character.toUpperCase(chars[0]); if(source.length() > 1 && Category.Lu.contains(source.charAt(1))) { for (int i = 1; i < chars.length; i++) { chars[i] = Character.toUpperCase(chars[i]); } } return new String(chars); } /** * Given a String, StringBuilder, or other CharSequence that should contain words in the source language, this * translates each word to the fake language, using existing translations if previous calls to cipher() or lookup() * had translated that word. * @param text a CharSequence, such as a String, that contains words in the source language * @return a String of the translated text. */ public String cipher(CharSequence text) { Replacer rep = wordMatch.replacer(new CipherSubstitution()); return rep.replace(text); } private class CipherSubstitution implements Substitution { @Override public void appendSubstitution(MatchResult match, TextBuffer dest) { dest.append(lookup(match.group(0))); } } private class DecipherSubstition implements Substitution { private final Map<String, String> vocabulary; DecipherSubstition(final Map<String, String> vocabulary) { this.vocabulary = vocabulary; } public void appendSubstitution(MatchResult match, TextBuffer dest) { String translated = match.group(0); if(translated == null) { return; } translated = translated.toLowerCase(); translated = vocabulary.get(translated); if(translated == null) { dest.append(match.group(0)); return; } char[] chars = translated.toCharArray(); if(Category.Lu.contains(match.charAt(0))) chars[0] = Character.toUpperCase(chars[0]); if(match.length() > 1 && Category.Lu.contains(match.charAt(1))) { for (int i = 1; i < chars.length; i++) { chars[i] = Character.toUpperCase(chars[i]); } } dest.append(chars, 0, chars.length); } } /** * Deciphers words in an already-ciphered text with a given String-to-String Map for a vocabulary. This Map could be * the reverse field of this LanguageCipher, which would give a complete translation, or it could be a * partially-complete or partially-correct vocabulary of words the player has learned. The vocabulary should * typically have entries added using the quick and accurate learnTranslations() method, unless you want to add * translations one word at a time (then use learnTranslation() ) or you want incorrect or biased translations added * (then use mismatchTranslation() ). You don't need to use one of these methods if you just pass the whole of the * reverse field as a vocabulary, which will translate every word. If making your own vocabulary without the learn * methods, the keys need to be lower-case because while regex Patterns can be case-insensitive, Map lookups cannot. * @param text a text in the fake language * @param vocabulary a Map of Strings in the fake language to Strings in the source language * @return a deciphered version of text that has any words as keys in vocabulary translated to the source language */ public String decipher(String text, final Map<String, String> vocabulary) { Pattern pat; Replacer rep; StringBuilder sb = new StringBuilder(128); sb.append("(?:"); for(String k : vocabulary.keySet()) { sb.append("(?:\\Q"); sb.append(k); sb.append("\\E)|"); } sb.deleteCharAt(sb.length() - 1); sb.append(')'); pat = Pattern.compile("\\b" + sb + "\\b", "ui"); rep = pat.replacer(new DecipherSubstition(vocabulary)); return rep.replace(text); } /** * Adds a translation pair to vocabulary so it can be used in decipher, giving a correct translation for sourceWord. * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct a mismatched * translation added to vocabulary with mismatchTranslation. * @param vocabulary a Map of String keys to String values that will be modified in-place * @param sourceWord a word in the source language, typically English; the meaning will be "learned" for decipher * @return this, for chaining */ public LanguageCipher learnTranslation(Map<String, String> vocabulary, String sourceWord) { vocabulary.put(lookup(sourceWord.toLowerCase()), sourceWord); return this; } /** * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords. * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched * translations added to vocabulary with mismatchTranslation. * @param vocabulary a Map of String keys to String values that will be modified in-place * @param sourceWords an array or vararg of words in the source language, typically English; their meanings will * be "learned" for decipher * @return this, for chaining */ public LanguageCipher learnTranslations(Map<String, String> vocabulary, String... sourceWords) { for (int i = 0; i < sourceWords.length; i++) { learnTranslation(vocabulary, sourceWords[i]); } return this; } /** * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords. * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched * translations added to vocabulary with mismatchTranslation. * @param vocabulary a Map of String keys to String values that will be modified in-place * @param sourceWords an Iterable of words in the source language, typically English; their meanings will be * "learned" for decipher * @return this, for chaining */ public LanguageCipher learnTranslations(Map<String, String> vocabulary, Iterable<String> sourceWords) { for (String s : sourceWords) { learnTranslation(vocabulary, s); } return this; } /** * Adds a translation pair to vocabulary so it can be used in decipher, giving a typically-incorrect translation for * correctWord where it provides mismatchWord instead when the ciphered version of correctWord appears. * Modifies vocabulary in-place and returns this LanguageCipher for chaining. You can use learnTranslation() to * correct a mismatched vocabulary word, or mismatchTranslation() again to change the mismatched word. * @param vocabulary a Map of String keys to String values that will be modified in-place * @param correctWord a word in the source language, typically English; where the ciphered version of this * appears and the text is deciphered, mismatchWord will be used instead * @param mismatchWord a String that will be used for deciphering in place of the translation of correctWord. * @return this, for chaining */ public LanguageCipher mismatchTranslation(Map<String, String> vocabulary, String correctWord, String mismatchWord) { vocabulary.put(lookup(correctWord.toLowerCase()), mismatchWord); return this; } public int getCacheLevel() { return cacheLevel; } public void setCacheLevel(int cacheLevel) { if(cacheLevel >= 2) this.cacheLevel = 2; else if(cacheLevel <= 0) this.cacheLevel = 0; else this.cacheLevel = cacheLevel; } }