package org.jabref.logic.layout.format; import org.jabref.logic.layout.LayoutFormatter; import org.jabref.logic.layout.StringInt; import org.jabref.logic.util.strings.RtfCharMap; import org.jabref.model.strings.StringUtil; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Transform a LaTeX-String to RTF. * * This method will: * * 1.) Remove LaTeX-Command sequences. * * 2.) Replace LaTeX-Special chars with RTF aquivalents. * * 3.) Replace emph and textit and textbf with their RTF replacements. * * 4.) Take special care to save all unicode characters correctly. * * 5.) Replace --- by \emdash and -- by \endash. */ public class RTFChars implements LayoutFormatter { private static final Log LOGGER = LogFactory.getLog(LayoutFormatter.class); private static final RtfCharMap RTF_CHARS = new RtfCharMap(); @Override public String format(String field) { StringBuilder sb = new StringBuilder(""); StringBuilder currentCommand = null; boolean escaped = false; boolean incommand = false; for (int i = 0; i < field.length(); i++) { char c = field.charAt(i); if (escaped && (c == '\\')) { sb.append('\\'); escaped = false; } else if (c == '\\') { escaped = true; incommand = true; currentCommand = new StringBuilder(); } else if (!incommand && ((c == '{') || (c == '}'))) { // Swallow the brace. } else if (Character.isLetter(c) || StringUtil.SPECIAL_COMMAND_CHARS.contains(String.valueOf(c))) { escaped = false; if (incommand) { // Else we are in a command, and should not keep the letter. currentCommand.append(c); testCharCom: if ((currentCommand.length() == 1) && StringUtil.SPECIAL_COMMAND_CHARS.contains(currentCommand.toString())) { // This indicates that we are in a command of the type // \^o or \~{n} if (i >= (field.length() - 1)) { break testCharCom; } String command = currentCommand.toString(); i++; c = field.charAt(i); String combody; if (c == '{') { StringInt part = getPart(field, i, true); i += part.i; combody = part.s; } else { combody = field.substring(i, i + 1); } String result = RTF_CHARS.get(command + combody); if (result != null) { sb.append(result); } incommand = false; escaped = false; } } else { sb.append(c); } } else { testContent: if (!incommand || (!Character.isWhitespace(c) && (c != '{') && (c != '}'))) { sb.append(c); } else { assert incommand; // First test for braces that may be part of a LaTeX command: if ((c == '{') && (currentCommand.length() == 0)) { // We have seen something like \{, which is probably the start // of a command like \{aa}. Swallow the brace. continue; } else if ((c == '}') && (currentCommand.length() > 0)) { // Seems to be the end of a command like \{aa}. Look it up: String command = currentCommand.toString(); String result = RTF_CHARS.get(command); if (result != null) { sb.append(result); } incommand = false; escaped = false; continue; } // Then look for italics etc., // but first check if we are already at the end of the string. if (i >= (field.length() - 1)) { break testContent; } if (((c == '{') || (c == ' ')) && (currentCommand.length() > 0)) { String command = currentCommand.toString(); // Then test if we are dealing with a italics or bold // command. If so, handle. if ("em".equals(command) || "emph".equals(command) || "textit".equals(command) || "it".equals(command)) { StringInt part = getPart(field, i, c == '{'); i += part.i; sb.append("{\\i ").append(part.s).append('}'); } else if ("textbf".equals(command) || "bf".equals(command)) { StringInt part = getPart(field, i, c == '{'); i += part.i; sb.append("{\\b ").append(part.s).append('}'); } else { LOGGER.info("Unknown command " + command); } if (c == ' ') { // command was separated with the content by ' ' // We have to add the space a } } else { sb.append(c); } } incommand = false; escaped = false; } } char[] chars = sb.toString().toCharArray(); sb = new StringBuilder(); for (char c : chars) { if (c < 128) { sb.append(c); } else { sb.append("\\u").append((long) c).append(transformSpecialCharacter(c)); } } return sb.toString().replace("---", "{\\emdash}").replace("--", "{\\endash}").replace("``", "{\\ldblquote}") .replace("''", "{\\rdblquote}"); } /** * @param text the text to extract the part from * @param i the position to start * @param commandNestedInBraces true if the command is nested in braces (\emph{xy}), false if spaces are sued (\emph xy) * @return a tuple of number of added characters and the extracted part */ private StringInt getPart(String text, int i, boolean commandNestedInBraces) { char c; int count = 0; int icount = i; StringBuilder part = new StringBuilder(); loop: while ((count >= 0) && (icount < text.length())) { icount++; c = text.charAt(icount); switch (c) { case '}': count--; break; case '{': count++; break; case ' ': if (!commandNestedInBraces) { // in any case, a space terminates the loop break loop; } break; default: break; } part.append(c); } String res = part.toString(); // the wrong "}" at the end is removed by "format(res)" return new StringInt(format(res), part.length()); } /** * This method transforms the unicode of a special character into its base character: 233 (é) - > e * @param c long * @return returns the basic character of the given unicode */ private String transformSpecialCharacter(long c) { if (((192 <= c) && (c <= 197)) || (c == 256) || (c == 258) || (c == 260)) { return "A"; } if (((224 <= c) && (c <= 229)) || (c == 257) || (c == 259) || (c == 261)) { return "a"; } if ((199 == c) || (262 == c) || (264 == c) || (266 == c) || (268 == c)) { return "C"; } if ((231 == c) || (263 == c) || (265 == c) || (267 == c) || (269 == c)) { return "c"; } if ((208 == c) || (272 == c)) { return "D"; } if ((240 == c) || (273 == c)) { return "d"; } if (((200 <= c) && (c <= 203)) || (274 == c) || (276 == c) || (278 == c) || (280 == c) || (282 == c)) { return "E"; } if (((232 <= c) && (c <= 235)) || (275 == c) || (277 == c) || (279 == c) || (281 == c) || (283 == c)) { return "e"; } if (((284 == c) || (286 == c)) || (288 == c) || (290 == c) || (330 == c)) { return "G"; } if ((285 == c) || (287 == c) || (289 == c) || (291 == c) || (331 == c)) { return "g"; } if ((292 == c) || (294 == c)) { return "H"; } if ((293 == c) || (295 == c)) { return "h"; } if (((204 <= c) && (c <= 207)) || (296 == c) || (298 == c) || (300 == c) || (302 == c) || (304 == c)) { return "I"; } if (((236 <= c) && (c <= 239)) || (297 == c) || (299 == c) || (301 == c) || (303 == c)) { return "i"; } if (308 == c) { return "J"; } if (309 == c) { return "j"; } if (310 == c) { return "K"; } if (311 == c) { return "k"; } if ((313 == c) || (315 == c) || (319 == c)) { return "L"; } if ((314 == c) || (316 == c) || (320 == c) || (322 == c)) { return "l"; } if ((209 == c) || (323 == c) || (325 == c) || (327 == c)) { return "N"; } if ((241 == c) || (324 == c) || (326 == c) || (328 == c)) { return "n"; } if (((210 <= c) && (c <= 214)) || (c == 216) || (332 == c) || (334 == c)) { return "O"; } if (((242 <= c) && (c <= 248) && (247 != c)) || (333 == c) || (335 == c)) { return "o"; } if ((340 == c) || (342 == c) || (344 == c)) { return "R"; } if ((341 == c) || (343 == c) || (345 == c)) { return "r"; } if ((346 == c) || (348 == c) || (350 == c) || (352 == c)) { return "S"; } if ((347 == c) || (349 == c) || (351 == c) || (353 == c)) { return "s"; } if ((354 == c) || (356 == c) || (358 == c)) { return "T"; } if ((355 == c) || (359 == c)) { return "t"; } if (((217 <= c) && (c <= 220)) || (360 == c) || (362 == c) || (364 == c) || (366 == c) || (370 == c)) { return "U"; } if (((249 <= c) && (c <= 251)) || (361 == c) || (363 == c) || (365 == c) || (367 == c) || (371 == c)) { return "u"; } if (372 == c) { return "W"; } if (373 == c) { return "w"; } if ((374 == c) || (376 == c) || (221 == c)) { return "Y"; } if ((375 == c) || (255 == c)) { return "y"; } if ((377 == c) || (379 == c) || (381 == c)) { return "Z"; } if ((378 == c) || (380 == c) || (382 == c)) { return "z"; } if (198 == c) { return "AE"; } if (230 == c) { return "ae"; } if (338 == c) { return "OE"; } if (339 == c) { return "oe"; } if (222 == c) { return "TH"; } if (223 == c) { return "ss"; } if (161 == c) { return "!"; } return "?"; } }