package moviescraper.doctord.controller.languagetranslation;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile;
import org.apache.commons.lang3.text.WordUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class TranslateString {
private static final String japaneseSentenceEnders = "[。?�?]";
private static final int maxCharsPerRequest = 100;
/**
* @param japanesePersonName - Name of the person to translate. Method works best if the name is hiragana or katakana
* @return The name of person in Romaji
*/
public static String translateJapanesePersonNameToRomaji(String japanesePersonName)
{
//if we have any kanji at all in the string, we'll have to use google translate
for(int i = 0; i < japanesePersonName.length(); i++)
{
if(JapaneseCharacter.isKanji(japanesePersonName.charAt(i)))
return translateStringJapaneseToEnglish(japanesePersonName);
}
String romaji = JapaneseCharacter.convertToRomaji(japanesePersonName);
if(romaji != null)
{
romaji = WordUtils.capitalize(romaji).trim();
return romaji;
}
else return translateStringJapaneseToEnglish(japanesePersonName);
}
public static String translateStringJapaneseToEnglish(String japaneseKanjiString)
{
//Our overall approach for translation here is first try to split up string and translate one sentence at a time.
//If a sentence is still too long, then just translate part of it at one time
//We're splitting up strings because there's only so much we can pack in a URL at a time per request
//contains the original string split up by the japaneseSentenceEnders
//This uses lookahead to still keep the punctuation instead of discarding it during the split.
String [] splitBySentenceEnders = japaneseKanjiString.split("(?=" + japaneseSentenceEnders + ")");
//Split our sentences into maxCharPerRequest sized chunks, in case we had some super long sentence
List<List<String>> allSplits = new ArrayList<>();
for(int i = 0; i < splitBySentenceEnders.length; i++)
{
allSplits.add(splitStringIntoArrayList(splitBySentenceEnders[i],maxCharsPerRequest));
}
//flatten out our list so we can look through it easily
List<String> runTranslationFromThisList = new ArrayList<>(splitBySentenceEnders.length);
for(List<String> current : allSplits)
{
runTranslationFromThisList.addAll(current);
}
String englishStringBuilder = "";
final String encodingType = "UTF-8";
for(String japaneseString : runTranslationFromThisList)
{
try {
japaneseString = replaceAllKnownTranslations(japaneseString);
String translateBaseURL = "http://translate.google.com/?sl=ja&tl=en&js=n&prev=_t&hl=en&ie=utf-8&eotf=1&text=";
String postURLString = "&file=";
int urlLengthLimit = 2038;
int totalUrlLength = URLEncoder.encode(japaneseString, encodingType).length() + translateBaseURL.length() + postURLString.length();
//There's a 2000 character limit on URLs, so we may need to truncate in case we try to do something dumb like paste in some really long article
//This was needed using the old method of translation, but I don't think this if statement should happen anymore.
//I've kept this code in as a defensive measure in case for some reason it does, at least we'll still get some kind of translation
//back instead of getting a HTTP error
if (totalUrlLength >= urlLengthLimit)
{
int numberOfCharactersToGetRidOf = totalUrlLength - urlLengthLimit;
int newStringLength = (numberOfCharactersToGetRidOf / 3); //each unicode character corresponds to 3 characters in the URL
//maybe it had multi-byte Kanji or weird characters and it took more than 3 URL characters per string? This workaround seems to fix this issue, but more testing is needed
if(newStringLength > japaneseString.length())
{
//divide by 8 because I think that's how big multi-byte kanji are in a URL? For now this seems to work but it may be worth revisting this later
newStringLength = numberOfCharactersToGetRidOf / 8;
}
japaneseString = japaneseString.substring(newStringLength);
}
String translationServicePostURL = translateBaseURL + URLEncoder.encode(japaneseString, encodingType) + postURLString;
Document doc = Jsoup.connect(translationServicePostURL).referrer("http://translate.google.com").userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0").timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get();
Element translatedTextElement = doc.select(".short_text").first();
if (translatedTextElement == null)
translatedTextElement = doc.select(".long_text").first();
if(translatedTextElement == null)
englishStringBuilder += "";
else englishStringBuilder += translatedTextElement.text();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
return englishStringBuilder; //return a blank translation in case of error for now
}
//Sometimes we just know some string isn't going to be translated right (a good example is a person's name) and we want to override the web translation engine for this string
//in that case. make a call to replaceAll with the japanese and english equivalent so we can manually override things
private static String replaceAllKnownTranslations(String japaneseString) {
String returnString = japaneseString;
returnString = japaneseString.replaceAll("�?��?��?�", "Tsubomi");
returnString = japaneseString.replaceAll("芦�??ユリア", "Yuria Ashina");
returnString = japaneseString.replaceAll("�?��?��?��?��??ら", "Sakura Aida");
returnString = japaneseString.replaceAll("野原ニコ", "Nico Nohara");
return returnString;
}
private static List<String> splitStringIntoArrayList(String text, int stringSizePerIndex){
List<String> strings = new ArrayList<>();
int index = 0;
while (index < text.length()) {
strings.add(text.substring(index, Math.min(index + stringSizePerIndex,text.length())));
index += stringSizePerIndex;
}
return strings;
}
}