package squidpony;
import regexodus.Category;
import regexodus.MatchResult;
import regexodus.Pattern;
import regexodus.Replacer;
import regexodus.Substitution;
import regexodus.TextBuffer;
import squidpony.squidmath.CrossHash;
import squidpony.squidmath.StatefulRNG;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
/**
* Class that builds up a dictionary of words in a source text to words generated by a FakeLanguageGen, and can
* "translate" a source text to a similarly-punctuated, similarly-capitalized fake text. Uses a hash of each word in the
* source text to determine the RNG seed that FakeLanguageGen will use, so the translation is not random. Can cipher a
* typically English text and generate a text with FakeLanguageGen, but also decipher such a generated text with a
* fully-complete, partially-complete, or partially-incorrect vocabulary.
* <br>
* This defaults to caching source-language words to their generated-language word translations in the field table, as
* well as the reverse translation in reverse. This can be changed to reduce memory usage for large vocabularies with
* {@code setCacheLevel()}, where it starts at 2 (writing to table and reverse), and can be lowered to 1 (writing to
* table only) if you don't need reverse to decipher a language easily, or to 0 (writing to neither) if you expect that
* memory will be at a premium and don't mind re-generating the same word each time it occurs in a source text. If
* cacheLevel is 1 or less, then this will not check for overlap between previously-generated words (it won't have an
* easy way to look up previously-generated ones) and so may be impossible to accurately decipher. As an example, one
* test of level 1 generated "he" as the translation for both "a" and "at", so every time "a" had been ciphered and then
* deciphered, the reproduced version said "at" instead. This won't happen by default, but the default instead relies on
* words being entered as inputs to cipher() or lookup() in the same order. If words are entered in two different orders
* to different runs of the program, they may have different generated results if cacheLevel is 2. One way to handle
* this is to use cacheLevel 2 and cipher the whole game script, or just the unique words in it (maybe just a large word
* list, such as http://wordlist.aspell.net/12dicts/ ), then serialize the LanguageCipher for later usage.
* Created by Tommy Ettinger on 5/1/2016.
* @author Tommy Ettinger
* @see NaturalLanguageCipher NaturalLanguageCipher offers similar features but handles common prefixes and suffixes.
*/
public class LanguageCipher implements Serializable {
    private static final long serialVersionUID = 1287835632461186341L;

    /**
     * The FakeLanguageGen this will use to construct words; normally one of the static fields in FakeLanguageGen or a
     * FakeLanguageGen produced by using the mix() method of one of them. Manually constructing FakeLanguageGen objects
     * isn't especially easy, and if you decide to do that it's recommended you look at SquidLib's source to see how the
     * existing calls to constructors work.
     */
    public FakeLanguageGen language;

    /**
     * The generator handed to {@link #language} when producing a word; its state is overwritten at the start of every
     * uncached {@link #lookup(String)} with a value derived from the source word's hash and {@link #shift}.
     */
    private StatefulRNG rng;

    // not an OrderedMap because this should never need a random element to be requested
    /**
     * The mapping of lower-case word keys to lower-case word values, where keys are in the source language and values
     * are generated by language.
     */
    public HashMap<String, String> table;

    /**
     * The mapping of lower-case word keys to lower-case word values, where keys are generated by language and values
     * are in the source language. Can be used as a complete vocabulary when passed to decipher.
     */
    public HashMap<String, String> reverse;

    // NOTE(review): the first alternative already matches any run of letters, so the hyphen-aware second alternative
    // appears to be unreachable and hyphenated words look like they are ciphered piece by piece. Left untouched
    // because changing it would change the text generated for existing inputs -- confirm the intent.
    private static final Pattern wordMatch = Pattern.compile("(\\pL+)|(\\pL[\\pL-]*\\pL)");

    /**
     * The degree of vocabulary to cache to speed up future searches at the expense of memory usage.
     * <ul>
     * <li>2 will cache source words to generated words in table, and generated to source in reverse.</li>
     * <li>1 will cache source words to generated words in table, and won't write to reverse.</li>
     * <li>0 won't write to table or reverse.</li>
     * </ul>
     * Defaults to 2, writing to both table and reverse.
     */
    public int cacheLevel = 2;

    /**
     * Added to the 64-bit hash of each lower-cased source word before that sum is used as the RNG state for word
     * generation, so two ciphers with different shifts seed generation differently for the same word.
     */
    public final long shift;

    /**
     * Constructs a LanguageCipher that will generate English-like or Dutch-like text by default.
     */
    public LanguageCipher()
    {
        this(FakeLanguageGen.ENGLISH);
    }

    /**
     * Constructs a LanguageCipher that will use the given style of language generator to produce its text.
     * @param language a FakeLanguageGen, typically one of the static constants in that class or a mix of them.
     */
    public LanguageCipher(FakeLanguageGen language)
    {
        this(language, 0);
    }

    /**
     * Constructs a LanguageCipher that will use the given style of language generator to produce its text.
     * The generator is copied, and both caches start out empty.
     * @param language a FakeLanguageGen, typically one of the static constants in that class or a mix of them.
     * @param shift any long; this will be used to alter the specific words generated unless it is 0
     */
    public LanguageCipher(FakeLanguageGen language, long shift)
    {
        this.shift = shift;
        this.language = language.copy();
        rng = new StatefulRNG();
        table = new HashMap<>(512);
        reverse = new HashMap<>(512);
    }

    /**
     * Copies another LanguageCipher and constructs this one with the information in the other. Copies the dictionary
     * of known words (both table and reverse), the FakeLanguageGen style, the shift, and the cache level. The new
     * object gets its own maps and its own RNG, so later changes to one cipher's caches do not affect the other.
     * @param other a previously-constructed LanguageCipher.
     */
    public LanguageCipher(LanguageCipher other)
    {
        this.language = other.language.copy();
        this.rng = new StatefulRNG();
        this.table = new HashMap<>(other.table);
        this.reverse = new HashMap<>(other.reverse);
        this.shift = other.shift;
        // Without this the copy would silently fall back to the default level of 2.
        this.cacheLevel = other.cacheLevel;
    }

    /**
     * Given a word in the source language (usually English), looks up an existing translation for that word, or if none
     * exists, generates a new word based on the hash of the source word and this LanguageCipher's FakeLanguageGen.
     * <br>
     * The lookup key is the lower-cased source word. The result has its first character upper-cased when the first
     * character of source is an upper-case letter, and all of its remaining characters upper-cased when the second
     * character of source is an upper-case letter. Depending on cacheLevel, a newly generated word is stored in
     * table and reverse.
     * @param source a word in the source language
     * @return a word in the fake language, or the empty String if source is null or empty
     */
    public String lookup(String source)
    {
        if(source == null || source.isEmpty())
            return "";
        final String lower = source.toLowerCase();
        String ciphered = table.get(lower);
        if(ciphered == null) {
            // Seeding from the word's hash is what makes the translation repeatable rather than random.
            rng.setState(CrossHash.hash64(lower) + shift);
            int frustration = 0;
            do {
                ciphered = language.word(rng, false, (int) Math.ceil(lower.length() / (2.2 + rng.nextDouble())));
                // Below level 2 nothing is written to reverse, so there is no overlap to check. At level 2, stop
                // retrying after a bounded number of collisions and accept the overlapping word.
                if(cacheLevel < 2 || frustration++ > 9)
                    break;
            } while (reverse.containsKey(ciphered));
            switch (cacheLevel) {
                case 2:
                    reverse.put(ciphered, lower);
                    // intentional fall-through: level 2 also writes to table
                case 1:
                    table.put(lower, ciphered);
                    break;
                default:
                    break;
            }
        }
        // Lu is the upper case letter category in Unicode; we're using regexodus for this because GWT won't
        // respect unicode case data on its own (see
        // https://github.com/gwtproject/gwt/blob/2.6.1/user/super/com/google/gwt/emul/java/lang/Character.java#L54-L61
        // ). We are using GWT to capitalize, though, which appears to work in practice and the docs agree.
        final boolean firstUpper = Category.Lu.contains(source.charAt(0));
        final boolean restUpper = source.length() > 1 && Category.Lu.contains(source.charAt(1));
        return new String(applyCase(ciphered, firstUpper, restUpper));
    }

    /**
     * Produces the characters of word with the requested capitalization applied.
     * @param word the lower-case word to re-capitalize; may be empty, in which case an empty array is returned
     * @param capitalizeFirst if true, the first character is upper-cased
     * @param capitalizeRest if true, every character after the first is upper-cased
     * @return a new char array holding the adjusted characters
     */
    private static char[] applyCase(String word, boolean capitalizeFirst, boolean capitalizeRest)
    {
        final char[] chars = word.toCharArray();
        if(chars.length == 0)
            return chars;
        if(capitalizeFirst)
            chars[0] = Character.toUpperCase(chars[0]);
        if(capitalizeRest) {
            for (int i = 1; i < chars.length; i++) {
                chars[i] = Character.toUpperCase(chars[i]);
            }
        }
        return chars;
    }

    /**
     * Given a String, StringBuilder, or other CharSequence that should contain words in the source language, this
     * translates each word to the fake language, using existing translations if previous calls to cipher() or lookup()
     * had translated that word. Text that is not matched as a word is left to the Replacer to carry over.
     * @param text a CharSequence, such as a String, that contains words in the source language
     * @return a String of the translated text.
     */
    public String cipher(CharSequence text)
    {
        Replacer rep = wordMatch.replacer(new CipherSubstitution());
        return rep.replace(text);
    }

    /**
     * Replaces each matched source-language word with the result of {@link #lookup(String)}. This is an inner
     * (non-static) class because it needs the enclosing cipher's lookup().
     */
    private class CipherSubstitution implements Substitution
    {
        @Override
        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            dest.append(lookup(match.group(0)));
        }
    }

    /**
     * Replaces each matched fake-language word with its entry in a vocabulary, copying the capitalization of the
     * matched text onto the replacement. Words with no (or a null) vocabulary entry are written back unchanged.
     */
    private static class DecipherSubstitution implements Substitution
    {
        private final Map<String, String> vocabulary;

        DecipherSubstitution(final Map<String, String> vocabulary)
        {
            this.vocabulary = vocabulary;
        }

        @Override
        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            final String found = match.group(0);
            if(found == null) {
                return;
            }
            // Vocabulary keys are lower-case, while the pattern that found this text is case-insensitive.
            final String translated = vocabulary.get(found.toLowerCase());
            if(translated == null) {
                dest.append(found);
                return;
            }
            final boolean firstUpper = !found.isEmpty() && Category.Lu.contains(found.charAt(0));
            final boolean restUpper = found.length() > 1 && Category.Lu.contains(found.charAt(1));
            final char[] chars = applyCase(translated, firstUpper, restUpper);
            // An empty translation simply contributes nothing instead of failing on chars[0].
            if(chars.length > 0)
                dest.append(chars, 0, chars.length);
        }
    }

    /**
     * Deciphers words in an already-ciphered text with a given String-to-String Map for a vocabulary. This Map could be
     * the reverse field of this LanguageCipher, which would give a complete translation, or it could be a
     * partially-complete or partially-correct vocabulary of words the player has learned. The vocabulary should
     * typically have entries added using the quick and accurate learnTranslations() method, unless you want to add
     * translations one word at a time (then use learnTranslation() ) or you want incorrect or biased translations added
     * (then use mismatchTranslation() ). You don't need to use one of these methods if you just pass the whole of the
     * reverse field as a vocabulary, which will translate every word. If making your own vocabulary without the learn
     * methods, the keys need to be lower-case because while regex Patterns can be case-insensitive, Map lookups cannot.
     * <br>
     * If vocabulary is null, is empty, or has no non-empty keys, there is nothing to translate and text is returned
     * unchanged. Null and empty keys are ignored when building the search pattern.
     * @param text a text in the fake language
     * @param vocabulary a Map of Strings in the fake language to Strings in the source language
     * @return a deciphered version of text that has any words as keys in vocabulary translated to the source language
     */
    public String decipher(String text, final Map<String, String> vocabulary)
    {
        if(vocabulary == null || vocabulary.isEmpty())
            return text;
        // Builds \b(?:(?:\Qkey1\E)|(?:\Qkey2\E)|...)\b with every key quoted so it is matched literally.
        final StringBuilder sb = new StringBuilder(128);
        sb.append("\\b(?:");
        boolean any = false;
        for(String k : vocabulary.keySet())
        {
            // An empty alternative could match nothing at a word boundary and shadow the real words.
            if(k == null || k.isEmpty())
                continue;
            if(any)
                sb.append('|');
            sb.append("(?:\\Q").append(k).append("\\E)");
            any = true;
        }
        if(!any)
            return text;
        sb.append(")\\b");
        final Pattern pat = Pattern.compile(sb.toString(), "ui");
        final Replacer rep = pat.replacer(new DecipherSubstitution(vocabulary));
        return rep.replace(text);
    }

    /**
     * Adds a translation pair to vocabulary so it can be used in decipher, giving a correct translation for sourceWord.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct a mismatched
     * translation added to vocabulary with mismatchTranslation. The key stored is the lower-case ciphered form of
     * sourceWord and the value is sourceWord as given. A null or empty sourceWord is ignored, since it has no
     * ciphered form to use as a key.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param sourceWord a word in the source language, typically English; the meaning will be "learned" for decipher
     * @return this, for chaining
     */
    public LanguageCipher learnTranslation(Map<String, String> vocabulary, String sourceWord)
    {
        if(sourceWord == null || sourceWord.isEmpty())
            return this;
        vocabulary.put(lookup(sourceWord.toLowerCase()), sourceWord);
        return this;
    }

    /**
     * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched
     * translations added to vocabulary with mismatchTranslation.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param sourceWords an array or vararg of words in the source language, typically English; their meanings will
     *                    be "learned" for decipher
     * @return this, for chaining
     */
    public LanguageCipher learnTranslations(Map<String, String> vocabulary, String... sourceWords)
    {
        for (String word : sourceWords) {
            learnTranslation(vocabulary, word);
        }
        return this;
    }

    /**
     * Adds translation pairs to vocabulary so it can be used in decipher, giving a correct translation for sourceWords.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. Can be used to correct mismatched
     * translations added to vocabulary with mismatchTranslation.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param sourceWords an Iterable of words in the source language, typically English; their meanings will be
     *                    "learned" for decipher
     * @return this, for chaining
     */
    public LanguageCipher learnTranslations(Map<String, String> vocabulary, Iterable<String> sourceWords)
    {
        for (String word : sourceWords) {
            learnTranslation(vocabulary, word);
        }
        return this;
    }

    /**
     * Adds a translation pair to vocabulary so it can be used in decipher, giving a typically-incorrect translation for
     * correctWord where it provides mismatchWord instead when the ciphered version of correctWord appears.
     * Modifies vocabulary in-place and returns this LanguageCipher for chaining. You can use learnTranslation() to
     * correct a mismatched vocabulary word, or mismatchTranslation() again to change the mismatched word.
     * A null or empty correctWord is ignored, since it has no ciphered form to use as a key.
     * @param vocabulary a Map of String keys to String values that will be modified in-place
     * @param correctWord a word in the source language, typically English; where the ciphered version of this
     *                    appears and the text is deciphered, mismatchWord will be used instead
     * @param mismatchWord a String that will be used for deciphering in place of the translation of correctWord.
     * @return this, for chaining
     */
    public LanguageCipher mismatchTranslation(Map<String, String> vocabulary, String correctWord, String mismatchWord)
    {
        if(correctWord == null || correctWord.isEmpty())
            return this;
        vocabulary.put(lookup(correctWord.toLowerCase()), mismatchWord);
        return this;
    }

    /**
     * Gets the current cache level; see {@link #cacheLevel} for the meaning of each value.
     * @return the value of the cacheLevel field
     */
    public int getCacheLevel() {
        return cacheLevel;
    }

    /**
     * Sets the cache level, clamping the argument into the supported range of 0 to 2 inclusive.
     * @param cacheLevel the desired level; values above 2 are treated as 2 and values below 0 as 0
     */
    public void setCacheLevel(int cacheLevel) {
        this.cacheLevel = Math.max(0, Math.min(2, cacheLevel));
    }
}