package net.sf.jabref.export.layout.format; import net.sf.jabref.Globals; import net.sf.jabref.export.layout.LayoutFormatter; import java.util.HashMap; /** * This formatter converts LaTeX character sequences their equicalent unicode characters, * and removes other LaTeX commands without handling them. */ public class FormatChars implements LayoutFormatter { public static HashMap<String, String> CHARS = new HashMap<String, String>(); static { CHARS.put("`A", "À"); // #192 CHARS.put("'A", "Á"); // #193 CHARS.put("^A", "Â"); // #194 CHARS.put("~A", "Ã"); // #195 CHARS.put("\"A", "Ä"); // #196 CHARS.put("AA", "Å"); // #197 CHARS.put("AE", "Æ"); // #198 CHARS.put("cC", "Ç"); // #199 CHARS.put("`E", "È"); // #200 CHARS.put("'E", "É"); // #201 CHARS.put("^E", "Ê"); // #202 CHARS.put("\"E", "Ë"); // #203 CHARS.put("`I", "Ì"); // #204 CHARS.put("'I", "Í"); // #205 CHARS.put("^I", "Î"); // #206 CHARS.put("\"I", "Ï"); // #207 CHARS.put("DH", "Ð"); // #208 CHARS.put("~N", "Ñ"); // #209 CHARS.put("`O", "Ò"); // #210 CHARS.put("'O", "Ó"); // #211 CHARS.put("^O", "Ô"); // #212 CHARS.put("~O", "Õ"); // #213 CHARS.put("\"O", "Ö"); // #214 // According to ISO 8859-1 the "\times" symbol should be placed here // (#215). // Omitting this, because it is a mathematical symbol. CHARS.put("O", "Ø"); // #216 CHARS.put("`U", "Ù"); // #217 CHARS.put("'U", "Ú"); // #218 CHARS.put("^U", "Û"); // #219 CHARS.put("\"U", "Ü"); // #220 CHARS.put("'Y", "Ý"); // #221 CHARS.put("TH", "Þ"); // #222 CHARS.put("ss", "ß"); // #223 CHARS.put("`a", "à"); // #224 CHARS.put("'a", "á"); // #225 CHARS.put("^a", "â"); // #226 CHARS.put("~a", "ã"); // #227 CHARS.put("\"a", "ä"); // #228 CHARS.put("aa", "å"); // #229 CHARS.put("ae", "æ"); // #230 CHARS.put("cc", "ç"); // #231 CHARS.put("`e", "è"); // #232 CHARS.put("'e", "é"); // #233 CHARS.put("^e", "ê"); // #234 CHARS.put("\"e", "ë"); // #235 CHARS.put("`i", "ì"); // #236 CHARS.put("'i", "í"); // #237 CHARS.put("^i", "î"); // #238 CHARS.put("\"i", "ï"); // #239 CHARS.put("dh", "ð"); // #240 CHARS.put("~n", "ñ"); // #241 CHARS.put("`o", "ò"); // #242 CHARS.put("'o", "ó"); // #243 CHARS.put("^o", "ô"); // #244 CHARS.put("~o", "õ"); // #245 CHARS.put("\"o", "ö"); // #246 // According to ISO 8859-1 the "\div" symbol should be placed here // (#247). // Omitting this, because it is a mathematical symbol. CHARS.put("o", "ø"); // #248 CHARS.put("`u", "ù"); // #249 CHARS.put("'u", "ú"); // #250 CHARS.put("^u", "û"); // #251 CHARS.put("\"u", "ü"); // #252 CHARS.put("'y", "ý"); // #253 CHARS.put("th", "þ"); // #254 CHARS.put("\"y", "ÿ"); // #255 // HTML special characters without names (UNICODE Latin Extended-A), // indicated by UNICODE number CHARS.put("=A", "Ā"); // "Amacr" CHARS.put("=a", "ā"); // "amacr" CHARS.put("uA", "Ă"); // "Abreve" CHARS.put("ua", "ă"); // "abreve" CHARS.put("kA", "Ą"); // "Aogon" CHARS.put("ka", "ą"); // "aogon" CHARS.put("'C", "Ć"); // "Cacute" CHARS.put("'c", "ć"); // "cacute" CHARS.put("^C", "Ĉ"); // "Ccirc" CHARS.put("^c", "ĉ"); // "ccirc" CHARS.put(".C", "Ċ"); // "Cdot" CHARS.put(".c", "ċ"); // "cdot" CHARS.put("vC", "Č"); // "Ccaron" CHARS.put("vc", "č"); // "ccaron" CHARS.put("vD", "Ď"); // "Dcaron" // Symbol #271 (d�) has no special Latex command CHARS.put("DJ", "Đ"); // "Dstrok" CHARS.put("dj", "đ"); // "dstrok" CHARS.put("=E", "Ē"); // "Emacr" CHARS.put("=e", "ē"); // "emacr" CHARS.put("uE", "Ĕ"); // "Ebreve" CHARS.put("ue", "ĕ"); // "ebreve" CHARS.put(".E", "Ė"); // "Edot" CHARS.put(".e", "ė"); // "edot" CHARS.put("kE", "Ę"); // "Eogon" CHARS.put("ke", "ę"); // "eogon" CHARS.put("vE", "Ě"); // "Ecaron" CHARS.put("ve", "ě"); // "ecaron" CHARS.put("^G", "Ĝ"); // "Gcirc" CHARS.put("^g", "ĝ"); // "gcirc" CHARS.put("uG", "Ğ"); // "Gbreve" CHARS.put("ug", "ğ"); // "gbreve" CHARS.put(".G", "Ġ"); // "Gdot" CHARS.put(".g", "ġ"); // "gdot" CHARS.put("cG", "Ģ"); // "Gcedil" CHARS.put("'g", "ģ"); // "gacute" CHARS.put("^H", "Ĥ"); // "Hcirc" CHARS.put("^h", "ĥ"); // "hcirc" CHARS.put("Hstrok", "Ħ"); // "Hstrok" CHARS.put("hstrok", "ħ"); // "hstrok" CHARS.put("~I", "Ĩ"); // "Itilde" CHARS.put("~i", "ĩ"); // "itilde" CHARS.put("=I", "Ī"); // "Imacr" CHARS.put("=i", "ī"); // "imacr" CHARS.put("uI", "Ĭ"); // "Ibreve" CHARS.put("ui", "ĭ"); // "ibreve" CHARS.put("kI", "Į"); // "Iogon" CHARS.put("ki", "į"); // "iogon" CHARS.put(".I", "İ"); // "Idot" CHARS.put("i", "ı"); // "inodot" // Symbol #306 (IJ) has no special Latex command // Symbol #307 (ij) has no special Latex command CHARS.put("^J", "Ĵ"); // "Jcirc" CHARS.put("^j", "ĵ"); // "jcirc" CHARS.put("cK", "Ķ"); // "Kcedil" CHARS.put("ck", "ķ"); // "kcedil" // Symbol #312 (k) has no special Latex command CHARS.put("'L", "Ĺ"); // "Lacute" CHARS.put("'l", "ĺ"); // "lacute" CHARS.put("cL", "Ļ"); // "Lcedil" CHARS.put("cl", "ļ"); // "lcedil" // Symbol #317 (L�) has no special Latex command // Symbol #318 (l�) has no special Latex command CHARS.put("Lmidot", "Ŀ"); // "Lmidot" CHARS.put("lmidot", "ŀ"); // "lmidot" CHARS.put("L", "Ł"); // "Lstrok" CHARS.put("l", "ł"); // "lstrok" CHARS.put("'N", "Ń"); // "Nacute" CHARS.put("'n", "ń"); // "nacute" CHARS.put("cN", "Ņ"); // "Ncedil" CHARS.put("cn", "ņ"); // "ncedil" CHARS.put("vN", "Ň"); // "Ncaron" CHARS.put("vn", "ň"); // "ncaron" // Symbol #329 (�n) has no special Latex command CHARS.put("NG", "Ŋ"); // "ENG" CHARS.put("ng", "ŋ"); // "eng" CHARS.put("=O", "Ō"); // "Omacr" CHARS.put("=o", "ō"); // "omacr" CHARS.put("uO", "Ŏ"); // "Obreve" CHARS.put("uo", "ŏ"); // "obreve" CHARS.put("HO", "Ő"); // "Odblac" CHARS.put("Ho", "ő"); // "odblac" CHARS.put("OE", "Œ"); // "OElig" CHARS.put("oe", "œ"); // "oelig" CHARS.put("'R", "Ŕ"); // "Racute" CHARS.put("'r", "ŕ"); // "racute" CHARS.put("cR", "Ŗ"); // "Rcedil" CHARS.put("cr", "ŗ"); // "rcedil" CHARS.put("vR", "Ř"); // "Rcaron" CHARS.put("vr", "ř"); // "rcaron" CHARS.put("'S", "Ś"); // "Sacute" CHARS.put("'s", "ś"); // "sacute" CHARS.put("^S", "Ŝ"); // "Scirc" CHARS.put("^s", "ŝ"); // "scirc" CHARS.put("cS", "Ş"); // "Scedil" CHARS.put("cs", "ş"); // "scedil" CHARS.put("vS", "Š"); // "Scaron" CHARS.put("vs", "š"); // "scaron" CHARS.put("cT", "Ţ"); // "Tcedil" CHARS.put("ct", "ţ"); // "tcedil" CHARS.put("vT", "Ť"); // "Tcaron" // Symbol #357 (t�) has no special Latex command CHARS.put("Tstrok", "Ŧ"); // "Tstrok" CHARS.put("tstrok", "ŧ"); // "tstrok" CHARS.put("~U", "Ũ"); // "Utilde" CHARS.put("~u", "ũ"); // "utilde" CHARS.put("=U", "Ū"); // "Umacr" CHARS.put("=u", "ū"); // "umacr" CHARS.put("uU", "Ŭ"); // "Ubreve" CHARS.put("uu", "ŭ"); // "ubreve" CHARS.put("rU", "Ů"); // "Uring" CHARS.put("ru", "ů"); // "uring" CHARS.put("HU", "ů"); // "Odblac" CHARS.put("Hu", "ű"); // "odblac" CHARS.put("kU", "Ų"); // "Uogon" CHARS.put("ku", "ų"); // "uogon" CHARS.put("^W", "Ŵ"); // "Wcirc" CHARS.put("^w", "ŵ"); // "wcirc" CHARS.put("^Y", "Ŷ"); // "Ycirc" CHARS.put("^y", "ŷ"); // "ycirc" CHARS.put("\"Y", "Ÿ"); // "Yuml" CHARS.put("'Z", "Ź"); // "Zacute" CHARS.put("'z", "ź"); // "zacute" CHARS.put(".Z", "Ż"); // "Zdot" CHARS.put(".z", "ż"); // "zdot" CHARS.put("vZ", "Ž"); // "Zcaron" CHARS.put("vz", "ž"); // "zcaron" // Symbol #383 (f) has no special Latex command CHARS.put("%", "%"); // percent sign } public String format(String field) { int i; field = field.replaceAll("&|\\\\&", "&").replaceAll("[\\n]{1,}", "<p>"); StringBuffer sb = new StringBuffer(); StringBuffer currentCommand = null; char c; boolean escaped = false, incommand = false; for (i = 0; i < field.length(); i++) { c = field.charAt(i); if (escaped && (c == '\\')) { sb.append('\\'); escaped = false; } else if (c == '\\') { if (incommand){ /* Close Command */ String command = currentCommand.toString(); Object result = CHARS.get(command); if (result != null) { sb.append((String) result); } else { sb.append(command); } } escaped = true; incommand = true; currentCommand = new StringBuffer(); } else if (!incommand && (c == '{' || c == '}')) { // Swallow the brace. } else if (Character.isLetter(c) || (c == '%') || (Globals.SPECIAL_COMMAND_CHARS.indexOf(String.valueOf(c)) >= 0)) { escaped = false; if (!incommand) sb.append(c); // Else we are in a command, and should not keep the letter. else { currentCommand.append(c); testCharCom: if ((currentCommand.length() == 1) && (Globals.SPECIAL_COMMAND_CHARS.indexOf(currentCommand.toString()) >= 0)) { // This indicates that we are in a command of the type // \^o or \~{n} if (i >= field.length() - 1) break testCharCom; String command = currentCommand.toString(); i++; c = field.charAt(i); // System.out.println("next: "+(char)c); String combody; if (c == '{') { IntAndString part = getPart(field, i, false); i += part.i; combody = part.s; } else { combody = field.substring(i, i + 1); // System.out.println("... "+combody); } Object result = CHARS.get(command + combody); if (result != null) sb.append((String) result); incommand = false; escaped = false; } else { // Are we already at the end of the string? if (i + 1 == field.length()){ String command = currentCommand.toString(); Object result = CHARS.get(command); /* If found, then use translated version. If not, * then keep * the text of the parameter intact. */ if (result != null) { sb.append((String) result); } else { sb.append(command); } } } } } else { String argument = null; if (!incommand) { sb.append(c); } else if (Character.isWhitespace(c) || (c == '{') || (c == '}')) { // First test if we are already at the end of the string. // if (i >= field.length()-1) // break testContent; String command = currentCommand.toString(); if (c == '{') { IntAndString part = getPart(field, i, true); i += part.i; argument = part.s; if (argument != null) { // handle common case of general latex command Object result = CHARS.get(command + argument); // System.out.print("command: "+command+", arg: "+argument); // System.out.print(", result: "); // If found, then use translated version. If not, then keep // the // text of the parameter intact. if (result != null) { sb.append((String) result); } else { sb.append(argument); } } } else if (c == '}') { // This end brace terminates a command. This can be the case in // constructs like {\aa}. The correct behaviour should be to // substitute the evaluated command and swallow the brace: Object result = CHARS.get(command); if (result != null) { sb.append((String) result); } else { // If the command is unknown, just print it: sb.append(command); } } else { Object result = CHARS.get(command); if (result != null) { sb.append((String) result); } else { sb.append(command); } sb.append(' '); } }/* else if (c == '}') { System.out.printf("com term by }: '%s'\n", currentCommand.toString()); argument = ""; }*/ else { /* * TODO: this point is reached, apparently, if a command is * terminated in a strange way, such as with "$\omega$". * Also, the command "\&" causes us to get here. The former * issue is maybe a little difficult to address, since it * involves the LaTeX math mode. We don't have a complete * LaTeX parser, so maybe it's better to ignore these * commands? */ } incommand = false; escaped = false; } } return sb.toString(); } private IntAndString getPart(String text, int i, boolean terminateOnEndBraceOnly) { char c; int count = 0; StringBuffer part = new StringBuffer(); // advance to first char and skip wihitespace i++; while (i < text.length() && Character.isWhitespace(text.charAt(i))){ i++; } // then grab whathever is the first token (counting braces) while (i < text.length()){ c = text.charAt(i); if (!terminateOnEndBraceOnly && count == 0 && Character.isWhitespace(c)) { i--; // end argument and leave whitespace for further // processing break; } if (c == '}' && --count < 0) break; else if (c == '{') count++; part.append(c); i++; } return new IntAndString(part.length(), format(part.toString())); } private class IntAndString { public int i; String s; public IntAndString(int i, String s) { this.i = i; this.s = s; } } }