FormatChars.java example

Explorer
Docear-master
package net.sf.jabref.export.layout.format;

import net.sf.jabref.Globals;
import net.sf.jabref.export.layout.LayoutFormatter;

import java.util.HashMap;

/**
 * This formatter converts LaTeX character sequences their equicalent unicode characters,
 * and removes other LaTeX commands without handling them.
 */
public class FormatChars implements LayoutFormatter {

    public static HashMap<String, String> CHARS = new HashMap<String, String>();

    static {
		CHARS.put("`A", "À"); // #192
		CHARS.put("'A", "Á"); // #193
		CHARS.put("^A", "Â"); // #194
		CHARS.put("~A", "Ã"); // #195
		CHARS.put("\"A", "Ä"); // #196
		CHARS.put("AA", "Å"); // #197
		CHARS.put("AE", "Æ"); // #198
		CHARS.put("cC", "Ç"); // #199
        CHARS.put("`E", "È"); // #200
		CHARS.put("'E", "É"); // #201
		CHARS.put("^E", "Ê"); // #202
		CHARS.put("\"E", "Ë"); // #203
		CHARS.put("`I", "Ì"); // #204
		CHARS.put("'I", "Í"); // #205
		CHARS.put("^I", "Î"); // #206
		CHARS.put("\"I", "Ï"); // #207
		CHARS.put("DH", "Ð"); // #208
		CHARS.put("~N", "Ñ"); // #209
		CHARS.put("`O", "Ò"); // #210
		CHARS.put("'O", "Ó"); // #211
		CHARS.put("^O", "Ô"); // #212
		CHARS.put("~O", "Õ"); // #213
		CHARS.put("\"O", "Ö"); // #214
		// According to ISO 8859-1 the "\times" symbol should be placed here
		// (#215).
		// Omitting this, because it is a mathematical symbol.
		CHARS.put("O", "Ø"); // #216
		CHARS.put("`U", "Ù"); // #217
		CHARS.put("'U", "Ú"); // #218
		CHARS.put("^U", "Û"); // #219
		CHARS.put("\"U", "Ü"); // #220
		CHARS.put("'Y", "Ý"); // #221
		CHARS.put("TH", "Þ"); // #222
		CHARS.put("ss", "ß"); // #223
		CHARS.put("`a", "à"); // #224
		CHARS.put("'a", "á"); // #225
		CHARS.put("^a", "â"); // #226
		CHARS.put("~a", "ã"); // #227
		CHARS.put("\"a", "ä"); // #228
		CHARS.put("aa", "å"); // #229
		CHARS.put("ae", "æ"); // #230
		CHARS.put("cc", "ç"); // #231
		CHARS.put("`e", "è"); // #232
		CHARS.put("'e", "é"); // #233
		CHARS.put("^e", "ê"); // #234
		CHARS.put("\"e", "ë"); // #235
		CHARS.put("`i", "ì"); // #236
		CHARS.put("'i", "í"); // #237
		CHARS.put("^i", "î"); // #238
		CHARS.put("\"i", "ï"); // #239
		CHARS.put("dh", "ð"); // #240
		CHARS.put("~n", "ñ"); // #241
		CHARS.put("`o", "ò"); // #242
		CHARS.put("'o", "ó"); // #243
		CHARS.put("^o", "ô"); // #244
		CHARS.put("~o", "õ"); // #245
		CHARS.put("\"o", "ö"); // #246
		// According to ISO 8859-1 the "\div" symbol should be placed here
		// (#247).
		// Omitting this, because it is a mathematical symbol.
		CHARS.put("o", "ø"); // #248
		CHARS.put("`u", "ù"); // #249
		CHARS.put("'u", "ú"); // #250
		CHARS.put("^u", "û"); // #251
		CHARS.put("\"u", "ü"); // #252
		CHARS.put("'y", "ý"); // #253
		CHARS.put("th", "þ"); // #254
		CHARS.put("\"y", "ÿ"); // #255

		// HTML special characters without names (UNICODE Latin Extended-A),
		// indicated by UNICODE number
		CHARS.put("=A", "Ā"); // "Amacr"
		CHARS.put("=a", "ā"); // "amacr"
		CHARS.put("uA", "Ă"); // "Abreve"
		CHARS.put("ua", "ă"); // "abreve"
		CHARS.put("kA", "Ą"); // "Aogon"
		CHARS.put("ka", "ą"); // "aogon"
		CHARS.put("'C", "Ć"); // "Cacute"
		CHARS.put("'c", "ć"); // "cacute"
		CHARS.put("^C", "Ĉ"); // "Ccirc"
		CHARS.put("^c", "ĉ"); // "ccirc"
		CHARS.put(".C", "Ċ"); // "Cdot"
		CHARS.put(".c", "ċ"); // "cdot"
		CHARS.put("vC", "Č"); // "Ccaron"
		CHARS.put("vc", "č"); // "ccaron"
		CHARS.put("vD", "Ď"); // "Dcaron"
		// Symbol #271 (d�) has no special Latex command
		CHARS.put("DJ", "Đ"); // "Dstrok"
		CHARS.put("dj", "đ"); // "dstrok"
		CHARS.put("=E", "Ē"); // "Emacr"
		CHARS.put("=e", "ē"); // "emacr"
		CHARS.put("uE", "Ĕ"); // "Ebreve"
		CHARS.put("ue", "ĕ"); // "ebreve"
		CHARS.put(".E", "Ė"); // "Edot"
		CHARS.put(".e", "ė"); // "edot"
		CHARS.put("kE", "Ę"); // "Eogon"
		CHARS.put("ke", "ę"); // "eogon"
		CHARS.put("vE", "Ě"); // "Ecaron"
		CHARS.put("ve", "ě"); // "ecaron"
		CHARS.put("^G", "Ĝ"); // "Gcirc"
		CHARS.put("^g", "ĝ"); // "gcirc"
		CHARS.put("uG", "Ğ"); // "Gbreve"
		CHARS.put("ug", "ğ"); // "gbreve"
		CHARS.put(".G", "Ġ"); // "Gdot"
		CHARS.put(".g", "ġ"); // "gdot"
		CHARS.put("cG", "Ģ"); // "Gcedil"
		CHARS.put("'g", "ģ"); // "gacute"
		CHARS.put("^H", "Ĥ"); // "Hcirc"
		CHARS.put("^h", "ĥ"); // "hcirc"
		CHARS.put("Hstrok", "Ħ"); // "Hstrok"
		CHARS.put("hstrok", "ħ"); // "hstrok"
		CHARS.put("~I", "Ĩ"); // "Itilde"
		CHARS.put("~i", "ĩ"); // "itilde"
		CHARS.put("=I", "Ī"); // "Imacr"
		CHARS.put("=i", "ī"); // "imacr"
		CHARS.put("uI", "Ĭ"); // "Ibreve"
		CHARS.put("ui", "ĭ"); // "ibreve"
		CHARS.put("kI", "Į"); // "Iogon"
		CHARS.put("ki", "į"); // "iogon"
		CHARS.put(".I", "İ"); // "Idot"
		CHARS.put("i", "ı"); // "inodot"
		// Symbol #306 (IJ) has no special Latex command
		// Symbol #307 (ij) has no special Latex command
		CHARS.put("^J", "Ĵ"); // "Jcirc"
		CHARS.put("^j", "ĵ"); // "jcirc"
		CHARS.put("cK", "Ķ"); // "Kcedil"
		CHARS.put("ck", "ķ"); // "kcedil"
		// Symbol #312 (k) has no special Latex command
		CHARS.put("'L", "Ĺ"); // "Lacute"
		CHARS.put("'l", "ĺ"); // "lacute"
		CHARS.put("cL", "Ļ"); // "Lcedil"
		CHARS.put("cl", "ļ"); // "lcedil"
		// Symbol #317 (L�) has no special Latex command
		// Symbol #318 (l�) has no special Latex command
		CHARS.put("Lmidot", "Ŀ"); // "Lmidot"
		CHARS.put("lmidot", "ŀ"); // "lmidot"
		CHARS.put("L", "Ł"); // "Lstrok"
		CHARS.put("l", "ł"); // "lstrok"
		CHARS.put("'N", "Ń"); // "Nacute"
		CHARS.put("'n", "ń"); // "nacute"
		CHARS.put("cN", "Ņ"); // "Ncedil"
		CHARS.put("cn", "ņ"); // "ncedil"
		CHARS.put("vN", "Ň"); // "Ncaron"
		CHARS.put("vn", "ň"); // "ncaron"
		// Symbol #329 (�n) has no special Latex command
		CHARS.put("NG", "Ŋ"); // "ENG"
		CHARS.put("ng", "ŋ"); // "eng"
		CHARS.put("=O", "Ō"); // "Omacr"
		CHARS.put("=o", "ō"); // "omacr"
		CHARS.put("uO", "Ŏ"); // "Obreve"
		CHARS.put("uo", "ŏ"); // "obreve"
		CHARS.put("HO", "Ő"); // "Odblac"
		CHARS.put("Ho", "ő"); // "odblac"
		CHARS.put("OE", "Œ"); // "OElig"
		CHARS.put("oe", "œ"); // "oelig"
		CHARS.put("'R", "Ŕ"); // "Racute"
		CHARS.put("'r", "ŕ"); // "racute"
		CHARS.put("cR", "Ŗ"); // "Rcedil"
		CHARS.put("cr", "ŗ"); // "rcedil"
		CHARS.put("vR", "Ř"); // "Rcaron"
		CHARS.put("vr", "ř"); // "rcaron"
		CHARS.put("'S", "Ś"); // "Sacute"
		CHARS.put("'s", "ś"); // "sacute"
		CHARS.put("^S", "Ŝ"); // "Scirc"
		CHARS.put("^s", "ŝ"); // "scirc"
		CHARS.put("cS", "Ş"); // "Scedil"
		CHARS.put("cs", "ş"); // "scedil"
		CHARS.put("vS", "Š"); // "Scaron"
		CHARS.put("vs", "š"); // "scaron"
		CHARS.put("cT", "Ţ"); // "Tcedil"
		CHARS.put("ct", "ţ"); // "tcedil"
		CHARS.put("vT", "Ť"); // "Tcaron"
		// Symbol #357 (t�) has no special Latex command
		CHARS.put("Tstrok", "Ŧ"); // "Tstrok"
		CHARS.put("tstrok", "ŧ"); // "tstrok"
		CHARS.put("~U", "Ũ"); // "Utilde"
		CHARS.put("~u", "ũ"); // "utilde"
		CHARS.put("=U", "Ū"); // "Umacr"
		CHARS.put("=u", "ū"); // "umacr"
		CHARS.put("uU", "Ŭ"); // "Ubreve"
		CHARS.put("uu", "ŭ"); // "ubreve"
		CHARS.put("rU", "Ů"); // "Uring"
		CHARS.put("ru", "ů"); // "uring"
		CHARS.put("HU", "ů"); // "Odblac"
		CHARS.put("Hu", "ű"); // "odblac"
		CHARS.put("kU", "Ų"); // "Uogon"
		CHARS.put("ku", "ų"); // "uogon"
		CHARS.put("^W", "Ŵ"); // "Wcirc"
		CHARS.put("^w", "ŵ"); // "wcirc"
		CHARS.put("^Y", "Ŷ"); // "Ycirc"
		CHARS.put("^y", "ŷ"); // "ycirc"
		CHARS.put("\"Y", "Ÿ"); // "Yuml"
		CHARS.put("'Z", "Ź"); // "Zacute"
		CHARS.put("'z", "ź"); // "zacute"
		CHARS.put(".Z", "Ż"); // "Zdot"
		CHARS.put(".z", "ż"); // "zdot"
		CHARS.put("vZ", "Ž"); // "Zcaron"
		CHARS.put("vz", "ž"); // "zcaron"
		// Symbol #383 (f) has no special Latex command
        CHARS.put("%", "%"); // percent sign
    }

    public String format(String field) {
		int i;
		field = field.replaceAll("&|\\\\&", "&").replaceAll("[\\n]{1,}", "<p>");

		StringBuffer sb = new StringBuffer();
		StringBuffer currentCommand = null;
		
		char c;
		boolean escaped = false, incommand = false;
		
		for (i = 0; i < field.length(); i++) {
			c = field.charAt(i);
			if (escaped && (c == '\\')) {
				sb.append('\\');
				escaped = false;
			} else if (c == '\\') {
				if (incommand){
					/* Close Command */
					String command = currentCommand.toString();
					Object result = CHARS.get(command);
					if (result != null) {
						sb.append((String) result);
					} else {
						sb.append(command);
					}
				}
				escaped = true;
				incommand = true;
				currentCommand = new StringBuffer();
			} else if (!incommand && (c == '{' || c == '}')) {
				// Swallow the brace.
			} else if (Character.isLetter(c) || (c == '%')
				|| (Globals.SPECIAL_COMMAND_CHARS.indexOf(String.valueOf(c)) >= 0)) {
				escaped = false;

                if (!incommand)
					sb.append(c);
					// Else we are in a command, and should not keep the letter.
				else {
					currentCommand.append(c);
                    testCharCom: if ((currentCommand.length() == 1)
						&& (Globals.SPECIAL_COMMAND_CHARS.indexOf(currentCommand.toString()) >= 0)) {
						// This indicates that we are in a command of the type
						// \^o or \~{n}
						if (i >= field.length() - 1)
							break testCharCom;

						String command = currentCommand.toString();
						i++;
						c = field.charAt(i);
						// System.out.println("next: "+(char)c);
						String combody;
						if (c == '{') {
							IntAndString part = getPart(field, i, false);
							i += part.i;
							combody = part.s;
						} else {
							combody = field.substring(i, i + 1);
							// System.out.println("... "+combody);
						}
						Object result = CHARS.get(command + combody);

						if (result != null)
							sb.append((String) result);

						incommand = false;
						escaped = false;
					} else { 
						//	Are we already at the end of the string?
						if (i + 1 == field.length()){
							String command = currentCommand.toString();
                            Object result = CHARS.get(command);
							/* If found, then use translated version. If not,
							 * then keep
							 * the text of the parameter intact.
							 */
							if (result != null) {
								sb.append((String) result);
							} else {
								sb.append(command);
							}
							
						}
					}
				}
			} else {
				String argument = null;

				if (!incommand) {
					sb.append(c);
				} else if (Character.isWhitespace(c) || (c == '{') || (c == '}')) {
					// First test if we are already at the end of the string.
					// if (i >= field.length()-1)
					// break testContent;

					String command = currentCommand.toString();
                                                
                    if (c == '{') {
						IntAndString part = getPart(field, i, true);
						i += part.i;
						argument = part.s;
						if (argument != null) {
							// handle common case of general latex command
							Object result = CHARS.get(command + argument);
							// System.out.print("command: "+command+", arg: "+argument);
							// System.out.print(", result: ");
							// If found, then use translated version. If not, then keep
							// the
							// text of the parameter intact.
							if (result != null) {
								sb.append((String) result);
							} else {
								sb.append(argument);
							}
						}
                    } else if (c == '}') {
                        // This end brace terminates a command. This can be the case in
                        // constructs like {\aa}. The correct behaviour should be to
                        // substitute the evaluated command and swallow the brace:
                        Object result = CHARS.get(command);
                        if (result != null) {
                            sb.append((String) result);
                        } else {
                            // If the command is unknown, just print it:
                            sb.append(command);
                        }
                    } else {
						Object result = CHARS.get(command);
						if (result != null) {
							sb.append((String) result);
						} else {
							sb.append(command);
						}
						sb.append(' ');
					}
				}/* else if (c == '}') {
                    System.out.printf("com term by }: '%s'\n", currentCommand.toString());

                    argument = "";
				}*/ else {
					/*
					 * TODO: this point is reached, apparently, if a command is
					 * terminated in a strange way, such as with "$\omega$".
					 * Also, the command "\&" causes us to get here. The former
					 * issue is maybe a little difficult to address, since it
					 * involves the LaTeX math mode. We don't have a complete
					 * LaTeX parser, so maybe it's better to ignore these
					 * commands?
					 */
				}
				
				incommand = false;
				escaped = false;
			}
		}

		return sb.toString();
	}

	private IntAndString getPart(String text, int i, boolean terminateOnEndBraceOnly) {
		char c;
		int count = 0;
		
		StringBuffer part = new StringBuffer();
		
		// advance to first char and skip wihitespace
		i++;
		while (i < text.length() && Character.isWhitespace(text.charAt(i))){
			i++;
		}
		
		// then grab whathever is the first token (counting braces)
		while (i < text.length()){
			c = text.charAt(i);
			if (!terminateOnEndBraceOnly && count == 0 && Character.isWhitespace(c)) {
				i--; // end argument and leave whitespace for further
					 // processing
				break;
			}
			if (c == '}' && --count < 0)
				break;
			else if (c == '{')
				count++;
			part.append(c);
			i++;
		}
		return new IntAndString(part.length(), format(part.toString()));
	}

	private class IntAndString {
		public int i;

		String s;

		public IntAndString(int i, String s) {
			this.i = i;
			this.s = s;
		}
	}
}