/* Copyright (C) 2003-2012 JabRef contributors. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ package net.sf.jabref.imports; import java.util.HashMap; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import net.sf.jabref.export.layout.LayoutFormatter; import net.sf.jabref.Globals; public class HTMLConverter implements LayoutFormatter { /* Portions © International Organization for Standardization 1986: Permission to copy in any form is granted for use with conforming SGML systems and applications as defined in ISO 8879, provided this notice is included in all copies. */ // most of the LaTeX commands can be read at http://en.wikibooks.org/wiki/LaTeX/Accents // The symbols can be looked at http://www.fileformat.info/info/unicode/char/a4/index.htm. 
Replace "a4" with the U+ number // http://detexify.kirelabs.org/classify.html and http://www.ctan.org/tex-archive/info/symbols/comprehensive/ might help to find the right LaTeX command // http://llg.cubic.org/docs/ent2latex.html and http://www.w3.org/TR/xml-entity-names/byalpha.html are also useful // as well as http://www.w3.org/Math/characters/unicode.xml // An array of arrays of strings in the format: // {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"} // Leaving a field empty is OK as it then will not be included private String[][] conversionList = new String[][]{ {"160", "nbsp", "\\{~\\}"}, // no-break space = non-breaking space, // U+00A0 ISOnum {"161", "iexcl", "\\{\\\\textexclamdown\\}"}, // inverted exclamation mark, U+00A1 ISOnum {"162", "cent", "\\{\\\\textcent\\}"}, // cent sign, U+00A2 ISOnum {"163", "pound", "\\{\\\\pounds\\}"}, // pound sign, U+00A3 ISOnum {"164", "curren", "\\{\\\\textcurrency\\}"}, // currency sign, U+00A4 ISOnum {"165", "yen", "\\{\\\\textyen\\}"}, // yen sign = yuan sign, U+00A5 ISOnum {"166", "brvbar", "\\{\\\\textbrokenbar\\}"}, // broken bar = broken vertical bar, // U+00A6 ISOnum {"167", "sect", "\\{\\\\S\\}"}, // section sign, U+00A7 ISOnum {"168", "uml", "\\{\\\\\"\\{\\}\\}"}, // diaeresis = spacing diaeresis, // U+00A8 ISOdia {"169", "copy", "\\{\\\\copyright\\}"}, // copyright sign, U+00A9 ISOnum {"170", "ordf", "\\{\\\\textordfeminine\\}"}, // feminine ordinal indicator, U+00AA ISOnum {"171", "laquo", "\\{\\\\guillemotleft\\}"}, // left-pointing double angle quotation mark // = left pointing guillemet, U+00AB ISOnum {"172", "not", "\\$\\\\neg\\$"}, // not sign, U+00AC ISOnum {"173", "shy", "\\\\-"}, // soft hyphen = discretionary hyphen, // U+00AD ISOnum {"174", "reg", "\\{\\\\textregistered\\}"}, // registered sign = registered trade mark sign, // U+00AE ISOnum {"175", "macr", "\\{\\\\=\\{\\}\\}"}, // macron = spacing macron = overline // = APL overbar, U+00AF ISOdia {"176", "deg", 
"\\$\\\\deg\\$"}, // degree sign, U+00B0 ISOnum {"177", "plusmn", "\\$\\\\pm\\$"}, // plus-minus sign = plus-or-minus sign, // U+00B1 ISOnum {"178", "sup2", "\\\\textsuperscript\\{2\\}"}, // superscript two = superscript digit two // = squared, U+00B2 ISOnum {"179", "sup3", "\\\\textsuperscript\\{3\\}"}, // superscript three = superscript digit three // = cubed, U+00B3 ISOnum {"180", "acute", "\\{\\\\'\\{\\}\\}"}, // acute accent = spacing acute, // U+00B4 ISOdia {"181", "micro", "\\$\\\\mu\\$"}, // micro sign, U+00B5 ISOnum {"182", "para", "\\{\\\\P\\}"}, // pilcrow sign = paragraph sign, // U+00B6 ISOnum {"183", "middot", "\\$\\\\cdot\\$"}, // middle dot = Georgian comma // = Greek middle dot, U+00B7 ISOnum {"184", "cedil", "\\{\\\\c\\{\\}\\}"}, // cedilla = spacing cedilla, U+00B8 ISOdia {"185", "sup1", "\\\\textsuperscript\\{1\\}"}, // superscript one = superscript digit one, // U+00B9 ISOnum {"186", "ordm", "\\{\\\\textordmasculine\\}"}, // masculine ordinal indicator, // U+00BA ISOnum {"187", "raquo", "\\{\\\\guillemotright\\}"}, // right-pointing double angle quotation mark // = right pointing guillemet, U+00BB ISOnum {"188", "frac14", "\\$\\\\sfrac\\{1\\}\\{4\\}\\$"}, // vulgar fraction one quarter // = fraction one quarter, U+00BC ISOnum {"189", "frac12", "\\$\\\\sfrac\\{1\\}\\{2\\}\\$"}, // vulgar fraction one half // = fraction one half, U+00BD ISOnum {"190", "frac34", "\\$\\\\sfrac\\{3\\}\\{4\\}\\$"}, // vulgar fraction three quarters // = fraction three quarters, U+00BE ISOnum {"191", "iquest", "\\{\\\\textquestiondown\\}"}, // inverted question mark // = turned question mark, U+00BF ISOnum {"192", "Agrave", "\\{\\\\`\\{A\\}\\}"}, // latin capital letter A with grave // = latin capital letter A grave, // U+00C0 ISOlat1 {"193", "Aacute", "\\{\\\\'\\{A\\}\\}"}, // latin capital letter A with acute, // U+00C1 ISOlat1 {"194", "Acirc", "\\{\\\\\\^\\{A\\}\\}"}, // latin capital letter A with circumflex, // U+00C2 ISOlat1 {"195", "Atilde", 
"\\{\\\\~\\{A\\}\\}"}, // latin capital letter A with tilde, // U+00C3 ISOlat1 {"196", "Auml", "\\{\\\\\"\\{A\\}\\}"}, // latin capital letter A with diaeresis, // U+00C4 ISOlat1 {"197", "Aring", "\\{\\\\AA\\}"}, // latin capital letter A with ring above // = latin capital letter A ring, // U+00C5 ISOlat1 {"198", "AElig", "\\{\\\\AE\\}"}, // latin capital letter AE // = latin capital ligature AE, // U+00C6 ISOlat1 {"199", "Ccedil", "\\{\\\\c\\{C\\}\\}"}, // latin capital letter C with cedilla, // U+00C7 ISOlat1 {"200", "Egrave", "\\{\\\\`\\{E\\}\\}"}, // latin capital letter E with grave, // U+00C8 ISOlat1 {"201", "Eacute", "\\{\\\\'\\{E\\}\\}"}, // latin capital letter E with acute, // U+00C9 ISOlat1 {"202", "Ecirc", "\\{\\\\\\^\\{E\\}\\}"}, // latin capital letter E with circumflex, // U+00CA ISOlat1 {"203", "Euml", "\\{\\\\\"\\{E\\}\\}"}, // latin capital letter E with diaeresis, // U+00CB ISOlat1 {"204", "Igrave", "\\{\\\\`\\{I\\}\\}"}, // latin capital letter I with grave, // U+00CC ISOlat1 {"205", "Iacute", "\\{\\\\'\\{I\\}\\}"}, // latin capital letter I with acute, // U+00CD ISOlat1 {"206", "Icirc", "\\{\\\\\\^\\{I\\}\\}"}, // latin capital letter I with circumflex, // U+00CE ISOlat1 {"207", "Iuml", "\\{\\\\\"\\{I\\}\\}"}, // latin capital letter I with diaeresis, // U+00CF ISOlat1 {"208", "ETH", "\\{\\\\DH\\}"}, // latin capital letter ETH, U+00D0 ISOlat1 {"209", "Ntilde", "\\{\\\\~\\{N\\}\\}"}, // latin capital letter N with tilde, // U+00D1 ISOlat1 {"210", "Ograve", "\\{\\\\`\\{O\\}\\}"}, // latin capital letter O with grave, // U+00D2 ISOlat1 {"211", "Oacute", "\\{\\\\'\\{O\\}\\}"}, // latin capital letter O with acute, // U+00D3 ISOlat1 {"212", "Ocirc", "\\{\\\\\\^\\{O\\}\\}"}, // latin capital letter O with circumflex, // U+00D4 ISOlat1 {"213", "Otilde", "\\{\\\\~\\{O\\}\\}"}, // latin capital letter O with tilde, // U+00D5 ISOlat1 {"214", "Ouml", "\\{\\\\\"\\{O\\}\\}"}, // latin capital letter O with diaeresis, // U+00D6 ISOlat1 {"215", "times", 
"\\$\\\\times\\$"}, // multiplication sign, U+00D7 ISOnum {"216", "Oslash", "\\{\\\\O\\}"}, // latin capital letter O with stroke // = latin capital letter O slash, // U+00D8 ISOlat1 {"217", "Ugrave", "\\{\\\\`\\{U\\}\\}"}, // latin capital letter U with grave, // U+00D9 ISOlat1 {"218", "Uacute", "\\{\\\\'\\{U\\}\\}"}, // latin capital letter U with acute, // U+00DA ISOlat1 {"219", "Ucirc", "\\{\\\\\\^\\{U\\}\\}"}, // latin capital letter U with circumflex, // U+00DB ISOlat1 {"220", "Uuml", "\\{\\\\\"\\{U\\}\\}"}, // latin capital letter U with diaeresis, // U+00DC ISOlat1 {"221", "Yacute", "\\{\\\\'\\{Y\\}\\}"}, // latin capital letter Y with acute, // U+00DD ISOlat1 {"222", "THORN", "\\{\\\\TH\\}"}, // latin capital letter THORN, // U+00DE ISOlat1 {"223", "szlig", "\\{\\\\ss\\}"}, // latin small letter sharp s = ess-zed, // U+00DF ISOlat1 {"224", "agrave", "\\{\\\\`\\{a\\}\\}"}, // latin small letter a with grave // = latin small letter a grave, // U+00E0 ISOlat1 {"225", "aacute", "\\{\\\\'\\{a\\}\\}"}, // latin small letter a with acute, // U+00E1 ISOlat1 {"226", "acirc", "\\{\\\\\\^\\{a\\}\\}"}, // latin small letter a with circumflex, // U+00E2 ISOlat1 {"227", "atilde", "\\{\\\\~\\{a\\}\\}"}, // latin small letter a with tilde, // U+00E3 ISOlat1 {"228", "auml", "\\{\\\\\"\\{a\\}\\}"}, // latin small letter a with diaeresis, // U+00E4 ISOlat1 {"229", "aring", "\\{\\\\aa\\}"}, // latin small letter a with ring above // = latin small letter a ring, // U+00E5 ISOlat1 {"230", "aelig", "\\{\\\\ae\\}"}, // latin small letter ae // = latin small ligature ae, U+00E6 ISOlat1 {"231", "ccedil", "\\{\\\\c\\{c\\}\\}"}, // latin small letter c with cedilla, // U+00E7 ISOlat1 {"232", "egrave", "\\{\\\\`\\{e\\}\\}"}, // latin small letter e with grave, // U+00E8 ISOlat1 {"233", "eacute", "\\{\\\\'\\{e\\}\\}"}, // latin small letter e with acute, // U+00E9 ISOlat1 {"234", "ecirc", "\\{\\\\\\^\\{e\\}\\}"}, // latin small letter e with circumflex, // U+00EA ISOlat1 {"235", 
"euml", "\\{\\\\\"\\{e\\}\\}"}, // latin small letter e with diaeresis, // U+00EB ISOlat1 {"236", "igrave", "\\{\\\\`\\{\\\\i\\}\\}"}, // latin small letter i with grave, // U+00EC ISOlat1 {"237", "iacute", "\\{\\\\'\\{\\\\i\\}\\}"}, // latin small letter i with acute, // U+00ED ISOlat1 {"238", "icirc", "\\{\\\\\\^\\{\\\\i\\}\\}"}, // latin small letter i with circumflex, // U+00EE ISOlat1 {"239", "iuml", "\\{\\\\\"\\{\\\\i\\}\\}"}, // latin small letter i with diaeresis, // U+00EF ISOlat1 {"240", "eth", "\\{\\\\dh\\}"}, // latin small letter eth, U+00F0 ISOlat1 {"241", "ntilde", "\\{\\\\~\\{n\\}\\}"}, // latin small letter n with tilde, // U+00F1 ISOlat1 {"242", "ograve", "\\{\\\\`\\{o\\}\\}"}, // latin small letter o with grave, // U+00F2 ISOlat1 {"243", "oacute", "\\{\\\\'\\{o\\}\\}"}, // latin small letter o with acute, // U+00F3 ISOlat1 {"244", "ocirc", "\\{\\\\\\^\\{o\\}\\}"}, // latin small letter o with circumflex, // U+00F4 ISOlat1 {"245", "otilde", "\\{\\\\~\\{o\\}\\}"}, // latin small letter o with tilde, // U+00F5 ISOlat1 {"246", "ouml", "\\{\\\\\"\\{o\\}\\}"}, // latin small letter o with diaeresis, // U+00F6 ISOlat1 {"247", "divide", "\\$\\\\div\\$"}, // division sign, U+00F7 ISOnum {"248", "oslash", "\\{\\\\o\\}"}, // latin small letter o with stroke, // = latin small letter o slash, // U+00F8 ISOlat1 {"249", "ugrave", "\\{\\\\`\\{u\\}\\}"}, // latin small letter u with grave, // U+00F9 ISOlat1 {"250", "uacute", "\\{\\\\'\\{u\\}\\}"}, // latin small letter u with acute, // U+00FA ISOlat1 {"251", "ucirc", "\\{\\\\\\^\\{u\\}\\}"}, // latin small letter u with circumflex, // U+00FB ISOlat1 {"252", "uuml", "\\{\\\\\"\\{u\\}\\}"}, // latin small letter u with diaeresis, // U+00FC ISOlat1 {"253", "yacute", "\\{\\\\'\\{y\\}\\}"}, // latin small letter y with acute, // U+00FD ISOlat1 {"254", "thorn", "\\{\\\\th\\}"}, // latin small letter thorn, // U+00FE ISOlat1 {"255", "yuml", "\\{\\\\\"\\{y\\}\\}"}, // latin small letter y with diaeresis, // U+00FF 
ISOlat1 {"402", "fnof", "\\$f\\$"}, // latin small f with hook = function // = florin, U+0192 ISOtech /* Greek */ {"913", "Alpha", "\\{\\$\\\\Alpha\\$\\}"}, // greek capital letter alpha, U+0391 {"914", "Beta", "\\{\\$\\\\Beta\\$\\}"}, // greek capital letter beta, U+0392 {"915", "Gamma", "\\{\\$\\\\Gamma\\$\\}"}, // greek capital letter gamma, // U+0393 ISOgrk3 {"916", "Delta", "\\{\\$\\\\Delta\\$\\}"}, // greek capital letter delta, // U+0394 ISOgrk3 {"917", "Epsilon", "\\{\\$\\\\Epsilon\\$\\}"}, // greek capital letter epsilon, U+0395 {"918", "Zeta", "\\{\\$\\\\Zeta\\$\\}"}, // greek capital letter zeta, U+0396 {"919", "Eta", "\\{\\$\\\\Eta\\$\\}"}, // greek capital letter eta, U+0397 {"920", "Theta", "\\{\\$\\\\Theta\\$\\}"}, // greek capital letter theta, // U+0398 ISOgrk3 {"921", "Iota", "\\{\\$\\\\Iota\\$\\}"}, // greek capital letter iota, U+0399 {"922", "Kappa", "\\{\\$\\\\Kappa\\$\\}"}, // greek capital letter kappa, U+039A {"923", "Lambda", "\\{\\$\\\\Lambda\\$\\}"}, // greek capital letter lambda, // U+039B ISOgrk3 {"924", "Mu", "\\{\\$\\\\Mu\\$\\}"}, // greek capital letter mu, U+039C {"925", "Nu", "\\{\\$\\\\Nu\\$\\}"}, // greek capital letter nu, U+039D {"926", "Xi", "\\{\\$\\\\Xi\\$\\}"}, // greek capital letter xi, U+039E ISOgrk3 {"927", "Omicron", "\\{\\$\\\\Omicron\\$\\}"}, // greek capital letter omicron, U+039F {"928", "Pi", "\\{\\$\\\\Pi\\$\\}"}, // greek capital letter pi, U+03A0 ISOgrk3 {"929", "Rho", "\\{\\$\\\\Rho\\$\\}"}, // greek capital letter rho, U+03A1 /* there is no Sigmaf, and no U+03A2 character either */ {"931", "Sigma", "\\{\\$\\\\Sigma\\$\\}"}, // greek capital letter sigma, // U+03A3 ISOgrk3 {"932", "Tau", "\\{\\$\\\\Tau\\$\\}"}, // greek capital letter tau, U+03A4 {"933", "Upsilon", "\\{\\$\\\\Upsilon\\$\\}"}, // greek capital letter upsilon, // U+03A5 ISOgrk3 {"934", "Phi", "\\{\\$\\\\Phi\\$\\}"}, // greek capital letter phi, // U+03A6 ISOgrk3 {"935", "Chi", "\\{\\$\\\\Chi\\$\\}"}, // greek capital letter chi, U+03A7 {"936", 
"Psi", "\\{\\$\\\\Psi\\$\\}"}, // greek capital letter psi, // U+03A8 ISOgrk3 {"937", "Omega", "\\{\\$\\\\Omega\\$\\}"}, // greek capital letter omega, // U+03A9 ISOgrk3 {"945", "alpha", "\\$\\\\alpha\\$"}, // greek small letter alpha, // U+03B1 ISOgrk3 {"946", "beta", "\\$\\\\beta\\$"}, // greek small letter beta, U+03B2 ISOgrk3 {"947", "gamma", "\\$\\\\gamma\\$"}, // greek small letter gamma, // U+03B3 ISOgrk3 {"948", "delta", "\\$\\\\delta\\$"}, // greek small letter delta, // U+03B4 ISOgrk3 {"949", "epsilon", "\\$\\\\epsilon\\$"}, // greek small letter epsilon, // U+03B5 ISOgrk3 {"950", "zeta", "\\$\\\\zeta\\$"}, // greek small letter zeta, U+03B6 ISOgrk3 {"951", "eta", "\\$\\\\eta\\$"}, // greek small letter eta, U+03B7 ISOgrk3 {"952", "theta", "\\$\\\\theta\\$"}, // greek small letter theta, // U+03B8 ISOgrk3 {"953", "iota", "\\$\\\\iota\\$"}, // greek small letter iota, U+03B9 ISOgrk3 {"954", "kappa", "\\$\\\\kappa\\$"}, // greek small letter kappa, // U+03BA ISOgrk3 {"955", "lambda", "\\$\\\\lambda\\$"}, // greek small letter lambda, // U+03BB ISOgrk3 {"956", "mu", "\\$\\\\mu\\$"}, // greek small letter mu, U+03BC ISOgrk3 {"957", "nu", "\\$\\\\nu\\$"}, // greek small letter nu, U+03BD ISOgrk3 {"958", "xi", "\\$\\\\xi\\$"}, // greek small letter xi, U+03BE ISOgrk3 {"959", "omicron", "\\$\\\\omicron\\$"}, // greek small letter omicron, U+03BF NEW {"960", "pi", "\\$\\\\phi\\$"}, // greek small letter pi, U+03C0 ISOgrk3 {"961", "rho", "\\$\\\\rho\\$"}, // greek small letter rho, U+03C1 ISOgrk3 {"962", "sigmaf", "\\$\\\\varsigma\\$"}, // greek small letter final sigma, // U+03C2 ISOgrk3 {"963", "sigma", "\\$\\\\sigma\\$"}, // greek small letter sigma, // U+03C3 ISOgrk3 {"964", "tau", "\\$\\\\tau\\$"}, // greek small letter tau, U+03C4 ISOgrk3 {"965", "upsilon", "\\$\\\\upsilon\\$"}, // greek small letter upsilon, {"", "upsi", "\\$\\\\upsilon\\$"}, // alias // U+03C5 ISOgrk3 {"966", "phi", "\\$\\\\phi\\$"}, // greek small letter phi, U+03C6 ISOgrk3 {"967", "chi", 
"\\$\\\\chi\\$"}, // greek small letter chi, U+03C7 ISOgrk3 {"968", "psi", "\\$\\\\psi\\$"}, // greek small letter psi, U+03C8 ISOgrk3 {"969", "omega", "\\$\\\\omega\\$"}, // greek small letter omega, // U+03C9 ISOgrk3 {"977", "thetasym", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, {"", "thetav", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, {"", "vartheta", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol, // U+03D1 NEW {"978", "upsih", "\\{\\$\\\\Upsilon\\$\\}"}, // greek upsilon with hook symbol, // U+03D2 NEW {"982", "piv", "\\$\\\\varphi\\$"}, // greek pi symbol, U+03D6 ISOgrk3 /* General Punctuation */ {"8226", "bull", "\\$\\\\bullet\\$"}, // bullet = black small circle, // U+2022 ISOpub /* bullet is NOT the same as bullet operator, U+2219 */ {"8230", "hellip", "\\{\\\\ldots\\}"}, // horizontal ellipsis = three dot leader, // U+2026 ISOpub {"8242", "prime", "\\$\\\\prime\\$"}, // prime = minutes = feet, U+2032 ISOtech {"8243", "Prime", "\\$\\{''\\}\\$"}, // double prime = seconds = inches, // U+2033 ISOtech {"8254", "oline", "\\{\\\\=\\{\\}\\}"}, // overline = spacing overscore, // U+203E NEW {"8260", "frasl", "/"}, // fraction slash, U+2044 NEW /* Letterlike Symbols */ {"8472", "weierp", "\\$\\\\wp\\$"}, // script capital P = power set // = Weierstrass p, U+2118 ISOamso {"8465", "image", "\\{\\$\\\\Im\\$\\}"}, // blackletter capital I = imaginary part, // U+2111 ISOamso {"8476", "real", "\\{\\$\\\\Re\\$\\}"}, // blackletter capital R = real part symbol, // U+211C ISOamso {"8482", "trade", "\\{\\\\texttrademark\\}"}, // trade mark sign, U+2122 ISOnum {"8501", "alefsym", "\\$\\\\aleph\\$"}, // alef symbol = first transfinite cardinal, // U+2135 NEW /* alef symbol is NOT the same as hebrew letter alef, U+05D0 although the same glyph could be used to depict both characters */ /* Arrows */ {"8592", "larr", "\\$\\\\leftarrow\\$"}, // leftwards arrow, U+2190 ISOnum {"8593", "uarr", "\\$\\\\uparrow\\$"}, // upwards arrow, U+2191 
ISOnum {"8594", "rarr", "\\$\\\\rightarrow\\$"}, // rightwards arrow, U+2192 ISOnum {"8595", "darr", "\\$\\\\downarrow\\$"}, // downwards arrow, U+2193 ISOnum {"8596", "harr", "\\$\\\\leftrightarrow\\$"}, // left right arrow, U+2194 ISOamsa {"8629", "crarr", "\\$\\\\dlsh\\$"}, // downwards arrow with corner leftwards // = carriage return, U+21B5 NEW - require mathabx {"8656", "lArr", "\\{\\$\\\\Leftarrow\\$\\}"}, // leftwards double arrow, U+21D0 ISOtech /* ISO 10646 does not say that lArr is the same as the 'is implied by' arrow but also does not have any other character for that function. So ? lArr can be used for 'is implied by' as ISOtech suggests */ {"8657", "uArr", "\\{\\$\\\\Uparrow\\$\\}"}, // upwards double arrow, U+21D1 ISOamsa {"8658", "rArr", "\\{\\$\\\\Rightarrow\\$\\}"}, // rightwards double arrow, // U+21D2 ISOtech /* ISO 10646 does not say this is the 'implies' character but does not have another character with this function so ? rArr can be used for 'implies' as ISOtech suggests */ {"8659", "dArr", "\\{\\$\\\\Downarrow\\$\\}"}, // downwards double arrow, U+21D3 ISOamsa {"8660", "hArr", "\\{\\$\\\\Leftrightarrow\\$\\}"}, // left right double arrow, // U+21D4 ISOamsa /* Mathematical Operators */ {"8704", "forall", "\\$\\\\forall\\$"}, // for all, U+2200 ISOtech {"8706", "part", "\\$\\\\partial\\$"}, // partial differential, U+2202 ISOtech {"8707", "exist", "\\$\\\\exists\\$"}, // there exists, U+2203 ISOtech {"8709", "empty", "\\$\\\\emptyset\\$"}, // empty set = null set = diameter, // U+2205 ISOamso {"8711", "nabla", "\\$\\\\nabla\\$"}, // nabla = backward difference, // U+2207 ISOtech {"8712", "isin", "\\$\\\\in\\$"}, // element of, U+2208 ISOtech {"8713", "notin", "\\$\\\\notin\\$"}, // not an element of, U+2209 ISOtech {"8715", "ni", "\\$\\\\ni\\$"}, // contains as member, U+220B ISOtech /* should there be a more memorable name than 'ni'? 
*/ {"8719", "prod", "\\$\\\\prod\\$"}, // n-ary product = product sign, // U+220F ISOamsb /* prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both */ {"8721", "sum", "\\$\\\\sum\\$"}, // n-ary sumation, U+2211 ISOamsb /* sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both */ {"8722", "minus", "\\$-\\$"}, // minus sign, U+2212 ISOtech {"8727", "lowast", "\\$\\\\ast\\$"}, // asterisk operator, U+2217 ISOtech {"8730", "radic", "\\$\\\\sqrt{}\\$"}, // square root = radical sign, // U+221A ISOtech {"8733", "prop", "\\$\\\\propto\\$"}, // proportional to, U+221D ISOtech {"8734", "infin", "\\$\\\\infty\\$"}, // infinity, U+221E ISOtech {"8736", "ang", "\\$\\\\angle\\$"}, // angle, U+2220 ISOamso {"8743", "and", "\\$\\\\land\\$"}, // logical and = wedge, U+2227 ISOtech {"8744", "or", "\\$\\\\lor\\$"}, // logical or = vee, U+2228 ISOtech {"8745", "cap", "\\$\\\\cap\\$"}, // intersection = cap, U+2229 ISOtech {"8746", "cup", "\\$\\\\cup\\$"}, // union = cup, U+222A ISOtech {"8747", "int", "\\$\\\\int\\$"}, // integral, U+222B ISOtech {"8756", "there4", "\\$\\\\uptherefore\\$"}, // therefore, U+2234 ISOtech; only in LaTeX package MnSymbol {"8764", "sim", "\\$\\\\sim\\$"}, // tilde operator = varies with = similar to, // U+223C ISOtech /* tilde operator is NOT the same character as the tilde, U+007E, although the same glyph might be used to represent both */ {"8773", "cong", "\\$\\\\cong\\$"}, // approximately equal to, U+2245 ISOtech {"8776", "asymp", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to, // U+2248 ISOamsr {"8800", "ne", "\\$\\\\neq\\$"}, // not equal to, U+2260 ISOtech {"8801", "equiv", "\\$\\\\equiv\\$"}, // identical to, U+2261 ISOtech {"8804", "le", "\\$\\\\leq\\$"}, // less-than or equal to, U+2264 ISOtech {"8805", "ge", "\\$\\\\geq\\$"}, // greater-than or equal to, // U+2265 ISOtech {"8834", "sub", "\\$\\\\subset\\$"}, // subset of, 
U+2282 ISOtech {"8835", "sup", "\\$\\\\supset\\$"}, // superset of, U+2283 ISOtech /* note that nsup, 'not a superset of, U+2283' is not covered by the Symbol font encoding and is not included. Should it be, for symmetry? It is in ISOamsn */ {"8836", "nsub", "\\$\\\\nsubset\\$"}, // not a subset of, U+2284 ISOamsn {"8838", "sube", "\\$\\\\subseteq\\$"}, // subset of or equal to, U+2286 ISOtech {"8839", "supe", "\\$\\\\supseteq\\$"}, // superset of or equal to, // U+2287 ISOtech {"8853", "oplus", "\\$\\\\oplus\\$"}, // circled plus = direct sum, // U+2295 ISOamsb {"8855", "otimes", "\\$\\\\otimes\\$"}, // circled times = vector product, // U+2297 ISOamsb {"8869", "perp", "\\$\\\\perp\\$"}, // up tack = orthogonal to = perpendicular, // U+22A5 ISOtech {"8901", "sdot", "\\$\\\\cdot\\$"}, // dot operator, U+22C5 ISOamsb /* dot operator is NOT the same character as U+00B7 middle dot */ /* Miscellaneous Technical */ {"8968", "lceil", "\\$\\\\lceil\\$"}, // left ceiling = apl upstile, // U+2308 ISOamsc {"8969", "rceil", "\\$\\\\rceil\\$"}, // right ceiling, U+2309 ISOamsc {"8970", "lfloor", "\\$\\\\lfloor\\$"}, // left floor = apl downstile, // U+230A ISOamsc {"8971", "rfloor", "\\$\\\\rfloor\\$"}, // right floor, U+230B ISOamsc {"9001", "lang", "\\$\\\\langle\\$"}, // left-pointing angle bracket = bra, // U+2329 ISOtech /* lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark' */ {"9002", "rang", "\\$\\\\rangle\\$"}, // right-pointing angle bracket = ket, // U+232A ISOtech /* rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark' */ /* Geometric Shapes */ {"9674", "loz", "\\$\\\\lozenge\\$"}, // lozenge, U+25CA ISOpub /* Miscellaneous Symbols */ {"9824", "spades", "\\$\\\\spadesuit\\$"}, // black spade suit, U+2660 ISOpub /* black here seems to mean filled as opposed to hollow */ {"9827", "clubs", "\\$\\\\clubsuit\\$"}, // black club suit = shamrock, // U+2663 
ISOpub {"9829", "hearts", "\\$\\\\heartsuit\\$"}, // black heart suit = valentine, // U+2665 ISOpub {"9830", "diams", "\\$\\\\diamondsuit\\$"}, // black diamond suit, U+2666 ISOpub {"34", "quot", "\""}, // quotation mark = APL quote, // U+0022 ISOnum {"38", "amp", "\\\\&"}, // ampersand, U+0026 ISOnum {"60", "lt", "\\$<\\$"}, // less-than sign, U+003C ISOnum {"62", "gt", "\\$>\\$"}, // greater-than sign, U+003E ISOnum /* Latin Extended-A */ {"338", "OElig", "\\{\\\\OE\\}"}, // latin capital ligature OE, // U+0152 ISOlat2 {"339", "oelig", "\\{\\\\oe\\}"}, // latin small ligature oe, U+0153 ISOlat2 /* ligature is a misnomer, this is a separate character in some languages */ {"352", "Scaron", "\\{\\\\v\\{S\\}\\}"}, // latin capital letter S with caron, // U+0160 ISOlat2 {"353", "scaron", "\\{\\\\v\\{s\\}\\}"}, // latin small letter s with caron, // U+0161 ISOlat2 {"376", "Yuml", "\\{\\\\\"\\{Y\\}\\}"}, // latin capital letter Y with diaeresis, // U+0178 ISOlat2 /* Spacing Modifier Letters */ {"710", "circ", "\\{\\\\textasciicircum\\}"}, // modifier letter circumflex accent, // U+02C6 ISOpub {"732", "tilde", "\\{\\\\textasciitilde\\}"}, // small tilde, U+02DC ISOdia /* General Punctuation */ {"8194", "ensp", "\\\\hspace\\{0.5em\\}"}, // en space, U+2002 ISOpub {"8195", "emsp", "\\\\hspace\\{1em\\}"}, // em space, U+2003 ISOpub {"8201", "thinsp", "\\\\hspace\\{0.167em\\}"}, // thin space, U+2009 ISOpub {"8204", "zwnj", ""}, // zero width non-joiner, // U+200C NEW RFC 2070 {"8205", "zwj", ""}, // zero width joiner, U+200D NEW RFC 2070 {"8206", "lrm", ""}, // left-to-right mark, U+200E NEW RFC 2070 {"8207", "rlm", ""}, // right-to-left mark, U+200F NEW RFC 2070 {"8211", "ndash", "--"}, // en dash, U+2013 ISOpub {"8212", "mdash", "---"}, // em dash, U+2014 ISOpub {"8216", "lsquo", "\\{\\\\textquoteleft\\}"}, // left single quotation mark, // U+2018 ISOnum {"8217", "rsquo", "\\{\\\\textquoteright\\}"}, // right single quotation mark, // U+2019 ISOnum {"8218", "sbquo", 
"\\{\\\\quotesinglbase\\}"}, // single low-9 quotation mark, U+201A NEW {"8220", "ldquo", "\\{\\\\textquotedblleft\\}"}, // left double quotation mark, // U+201C ISOnum {"8221", "rdquo", "\\{\\\\textquotedblright\\}"}, // right double quotation mark, // U+201D ISOnum {"8222", "bdquo", "\\{\\\\quotedblbase\\}"}, // double low-9 quotation mark, U+201E NEW {"8224", "dagger", "\\{\\\\dag\\}"}, // dagger, U+2020 ISOpub {"8225", "Dagger", "\\{\\\\ddag\\}"}, // double dagger, U+2021 ISOpub {"8240", "permil", "\\{\\\\textperthousand\\}"}, // per mille sign, U+2030 ISOtech {"8249", "lsaquo", "\\{\\\\guilsinglleft\\}"}, // single left-pointing angle quotation mark, // U+2039 ISO proposed /* lsaquo is proposed but not yet ISO standardized */ {"8250", "rsaquo", "\\{\\\\guilsinglright\\}"}, // single right-pointing angle quotation mark, // U+203A ISO proposed /* rsaquo is proposed but not yet ISO standardized */ {"8364", "euro", "\\{\\\\texteuro\\}"}, // euro sign, U+20AC NEW /* Manually added */ {"37", "percnt", "\\\\%"}, // Percent {"39", "", "'"}, // Apostrophe {"40", "", "("}, // Left bracket {"41", "", ")"}, // Right bracket {"43", "plus", "\\+"}, // Plus {"44", "comma", ","}, // Comma {"45", "hyphen", "-"}, // Hyphen {"46", "period", "\\."}, // Period {"47", "slash", "/"}, // Slash (solidus) {"58", "colon", ":"}, // Colon {"59", "semi", ";"}, // Semi colon {"91", "lsqb", "\\["}, // Left square bracket {"92", "bsol", "\\{\\\\textbackslash\\}"}, // Backslash {"93", "rsqb", "\\]"}, // Right square bracket {"94", "Hat", "\\{\\\\\\^\\{\\}\\}"}, // Circumflex {"95", "lowbar", "\\\\_"}, // Underscore {"96", "grave", "\\{\\\\`\\{\\}\\}"}, // Grave {"123", "lbrace", "\\\\\\{"}, // Left curly bracket {"", "lcub", "\\\\\\{"}, // Left curly bracket {"124", "vert", "\\|"}, // Vertical bar {"", "verbar", "\\|"}, // Vertical bar {"", "VerticalLine", "\\|"}, // Vertical bar {"125", "rbrace", "\\\\\\}"}, // Right curly bracket {"", "rcub", "\\\\\\}"}, // Right curly bracket {"138", "", 
"\\{\\\\v\\{S\\}\\}"}, // Line tabulation set // {"141", "", ""}, // Reverse line feed {"145", "", "`"}, // Apostrophe {"146", "", "'"}, // Apostrophe {"147", "", "``"}, // Quotation mark {"148", "", "''"}, // Quotation mark {"150", "", "--"}, // En dash {"154", "", "\\{\\\\v\\{s\\}\\}"}, // Single character introducer {"260", "Aogon", "\\{\\\\k\\{A\\}\\}"}, // capital A with ogonek {"261", "aogon", "\\{\\\\k\\{a\\}\\}"}, // small a with ogonek {"262", "Cacute", "\\{\\\\'\\{C\\}\\}"}, // capital C with acute {"263", "cacute", "\\{\\\\'\\{c\\}\\}"}, // small C with acute {"264", "Ccirc", "\\{\\\\\\^\\{C\\}\\}"}, // capital C with circumflex {"265", "ccirc", "\\{\\\\\\^\\{c\\}\\}"}, // small C with circumflex {"266", "Cdot", "\\{\\\\\\.\\{C\\}\\}"}, // capital C with dot above {"267", "cdot", "\\{\\\\\\.\\{c\\}\\}"}, // small C with dot above {"268", "Ccaron", "\\{\\\\v\\{C\\}\\}"}, // capital C with caron {"269", "ccaron", "\\{\\\\v\\{c\\}\\}"}, // small C with caron {"272", "Dstrok", "\\{\\\\DJ\\}"}, // capital D with stroke {"273", "dstrok", "\\{\\\\dj\\}"}, // small d with stroke {"280", "Eogon", "\\{\\\\k\\{E\\}\\}"}, // capital E with ogonek {"281", "eogon", "\\{\\\\k\\{e\\}\\}"}, // small e with ogonek {"298", "Imacr", "\\{\\\\=\\{I\\}\\}"}, // capital I with macron {"299", "imacr", "\\{\\\\=\\{\\\\i\\}\\}"}, // small i with macron {"302", "Iogon", "\\{\\\\k\\{I\\}\\}"}, // capital I with ogonek {"303", "iogon", "\\{\\\\k\\{i\\}\\}"}, // small i with ogonek {"304", "Idot", "\\{\\\\.\\{I\\}\\}"}, // capital I with dot above {"305", "inodot", "\\{\\\\i\\}"}, // Small i without the dot {"", "imath", "\\{\\\\i\\}"}, // Small i without the dot {"321", "Lstrok", "\\{\\\\L\\}"}, // upper case l with stroke {"322", "lstrok", "\\{\\\\l\\}"}, // lower case l with stroke {"370", "Uogon", "\\{\\\\k\\{U\\}\\}"}, // capital U with ogonek {"371", "uogon", "\\{\\\\k\\{u\\}\\}"}, // small u with ogonek {"490", "Oogon", "\\{\\\\k\\{O\\}\\}"}, // capital letter O with ogonek 
{"491", "oogon", "\\{\\\\k\\{o\\}\\}"}, // small letter o with ogonek {"492", "", "\\{\\\\k\\{\\\\=\\{O\\}\\}\\}"}, // capital letter O with ogonek and macron {"493", "", "\\{\\\\k\\{\\\\=\\{o\\}\\}\\}"}, // small letter o with ogonek and macron {"536", "", "\\{\\\\cb\\{S\\}\\}"}, // capital letter S with comma below, require combelow {"537", "", "\\{\\\\cb\\{s\\}\\}"}, // small letter S with comma below, require combelow {"538", "", "\\{\\\\cb\\{T\\}\\}"}, // capital letter T with comma below, require combelow {"539", "", "\\{\\\\cb\\{t\\}\\}"}, // small letter T with comma below, require combelow {"727", "caron", "\\{\\\\v\\{\\}\\}"}, // Caron {"", "Hacek", "\\{\\\\v\\{\\}\\}"}, // Caron {"728", "breve", "\\{\\\\u\\{\\}\\}"}, // Breve {"", "Breve", "\\{\\\\u\\{\\}\\}"}, // Breve {"729", "dot", "\\{\\\\\\.\\{\\}\\}"}, // Dot above {"730", "ring", "\\{\\\\r\\{\\}\\}"}, // Ring above {"731", "ogon", "\\{\\\\k\\{\\}\\}"}, // Ogonek {"733", "dblac", "\\{\\\\H\\{\\}\\}"}, // Double acute {"949", "epsi", "\\$\\\\epsilon\\$"}, // Epsilon - double check {"1013", "epsiv", "\\$\\\\varepsilonup\\$"}, // lunate epsilon, requires txfonts {"1055", "", "\\{\\\\cyrchar\\\\CYRP\\}"}, // Cyrillic capital Pe {"1082", "", "\\{\\\\cyrchar\\\\cyrk\\}"}, // Cyrillic small Ka // {"2013", "", ""}, // NKO letter FA -- Maybe en dash = 0x2013? // {"2014", "", ""}, // NKO letter FA -- Maybe em dash = 0x2014? 
{"8192", "", "\\\\hspace\\{0.5em\\}"}, // en quad {"8193", "", "\\\\hspace\\{1em\\}"}, // em quad {"8196", "", "\\\\hspace\\{0.333em\\}"}, // Three-Per-Em Space {"8197", "", "\\\\hspace\\{0.25em\\}"}, // Four-Per-Em Space {"8198", "", "\\\\hspace\\{0.167em\\}"}, // Six-Per-Em Space {"8208", "hyphen", "-"}, // Hyphen {"8229", "nldr", "\\.\\."}, // Double dots - en leader {"8451", "", "\\$\\\\deg\\$\\{C\\}"}, // Degree Celsius {"8459", "Hscr", "\\$\\\\mathcal\\{H\\}\\$"}, // script capital H -- possibly use \mathscr {"8460", "Hfr", "\\$\\\\mathbb\\{H\\}\\$"}, // black letter capital H -- requires e.g. amsfonts {"8466", "Lscr", "\\$\\\\mathcal\\{L\\}\\$"}, // script capital L -- possibly use \mathscr {"8467", "ell", "\\{\\\\ell\\}"}, // script small l {"8469", "naturals", "\\$\\\\mathbb\\{N\\}\\$"}, // double struck capital N -- requires e.g. amsfonts {"8486", "", "\\$\\{\\\\Omega\\}\\$"}, // Omega {"8491", "angst", "\\{\\\\AA\\}"}, // Angstrom {"8496", "Escr", "\\$\\\\mathcal\\{E\\}\\$"}, // script capital E {"8531", "frac13", "\\$\\\\sfrac\\{1\\}\\{3\\}\\$"}, // Vulgar fraction one third {"8532", "frac23", "\\$\\\\sfrac\\{2\\}\\{3\\}\\$"}, // Vulgar fraction two thirds {"8533", "frac15", "\\$\\\\sfrac\\{1\\}\\{5\\}\\$"}, // Vulgar fraction one fifth {"8534", "frac25", "\\$\\\\sfrac\\{2\\}\\{5\\}\\$"}, // Vulgar fraction two fifths {"8535", "frac35", "\\$\\\\sfrac\\{3\\}\\{5\\}\\$"}, // Vulgar fraction three fifths {"8536", "frac45", "\\$\\\\sfrac\\{4\\}\\{5\\}\\$"}, // Vulgar fraction four fifths {"8537", "frac16", "\\$\\\\sfrac\\{1\\}\\{6\\}\\$"}, // Vulgar fraction one sixth {"8538", "frac56", "\\$\\\\sfrac\\{5\\}\\{6\\}\\$"}, // Vulgar fraction five sixths {"8539", "frac18", "\\$\\\\sfrac\\{1\\}\\{8\\}\\$"}, // Vulgar fraction one eighth {"8540", "frac38", "\\$\\\\sfrac\\{3\\}\\{8\\}\\$"}, // Vulgar fraction three eighths {"8541", "frac58", "\\$\\\\sfrac\\{5\\}\\{8\\}\\$"}, // Vulgar fraction five eighths {"8542", "frac78", "\\$\\\\sfrac\\{7\\}\\{8\\}\\$"}, // 
// Vulgar fraction seven eighths
        {"8710", "", "\\$\\\\triangle\\$"}, // Increment - could use a more appropriate symbol
        {"8714", "", "\\$\\\\in\\$"}, // Small element in
        {"8729", "bullet", "\\$\\\\bullet\\$"}, // Bullet operator
        {"8758", "ratio", ":"}, // Colon/ratio
        {"8771", "sime", "\\$\\\\simeq\\$"}, // almost equal to = asymptotic to,
        {"8776", "ap", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to,
        {"8810", "ll", "\\$\\\\ll\\$"}, // Much less than
        {"", "Lt", "\\$\\\\ll\\$"}, // Much less than
        {"8811", "gg", "\\$\\\\gg\\$"}, // Much greater than
        {"", "Gt", "\\$\\\\gg\\$"}, // Much greater than
        {"8819", "gsim", "\\$\\\\gtrsim\\$"}, // Greater than or equivalent to
        {"8882", "vltri", "\\$\\\\triangleleft\\$"}, // Left triangle
        {"8883", "vrtri", "\\$\\\\triangleright\\$"}, // Right triangle
        {"8896", "xwedge", "\\$\\\\bigwedge\\$"}, // Big wedge
        {"8897", "xvee", "\\$\\\\bigvee\\$"}, // Big vee
        {"9426", "", "\\{\\\\copyright\\}"}, // circled small letter C
        {"9633", "square", "\\$\\\\square\\$"}, // White square
        {"9653", "utri", "\\$\\\\triangle\\$"}, // White up-pointing small triangle -- \vartriangle probably
                                                // better but requires amssymb
        {"10877", "les", "\\$\\\\leqslant\\$"}, // Less than slanted equal -- requires amssymb
        {"10878", "ges", "\\$\\\\geqslant\\$"}, // Greater than slanted equal -- requires amssymb
        {"119978", "Oscr", "\\$\\\\mathcal\\{O\\}\\$"} // script capital O -- possibly use \mathscr
    };

    // List of combining accents in the format:
    // {"decimal number of the combining character", "name of the LaTeX accent command"}
    // The command names are written so that they can be embedded in a regex replacement string.
    private String[][] accentList = new String[][] {
        {"768", "`"}, // Grave
        {"769", "'"}, // Acute
        {"770", "\\^"}, // Circumflex
        {"771", "~"}, // Tilde
        {"772", "="}, // Macron
        {"773", "="}, // Overline - not completely correct
        {"774", "u"}, // Breve
        {"775", "\\."}, // Dot above
        {"776", "\""}, // Diaeresis
        {"777", "h"}, // Hook above
        {"778", "r"}, // Ring
        {"779", "H"}, // Double acute
        {"780", "v"}, // Caron
        {"781", "\\|"}, // Vertical line above
        {"782", "U"}, // Double vertical line above
        {"783", "G"}, // Double grave
        {"784", "textdotbreve"}, // Candrabindu
        {"785", "t"}, // Inverted breve
        // {"786", ""}, // Turned comma above
        // {"787", ""}, // Comma above
        // {"788", ""}, // Reversed comma above
        // {"789", ""}, // Comma above right
        {"790", "textsubgrave"}, // Grave accent below -requires tipa
        {"791", "textsubacute"}, // Acute accent below - requires tipa
        {"792", "textadvancing"}, // Left tack below - requires tipa
        {"793", "textretracting"}, // Right tack below - requires tipa
        // {"794", ""}, // Left angle above
        // {"795", ""}, // Horn
        {"796", "textsublhalfring"}, // Left half ring below - requires tipa
        {"797", "textraising"}, // Up tack below - requires tipa
        {"798", "textlowering"}, // Down tack below - requires tipa
        {"799", "textsubplus"}, // Plus sign below - requires tipa
        // {"800", ""}, // Minus sign below
        // {"801", ""}, // Palatalized hook below
        // {"802", ""}, // Retroflex hook below
        {"803", "d"}, // Dot below
        {"804", "textsubumlaut"}, // Diaeresis below - requires tipa
        {"805", "textsubring"}, // Ring below - requires tipa
        {"806", "cb"}, // Comma below - requires combelow
        {"807", "c"}, // Cedilla
        {"808", "k"}, // Ogonek
        {"809", "textsyllabic"}, // Vertical line below - requires tipa
        {"810", "textsubbridge"}, // Bridge below - requires tipa
        {"811", "textsubw"}, // Inverted double arch below - requires tipa
        {"812", "textsubwedge"}, // Caron below
        {"813", "textsubcircum"}, // Circumflex accent below - requires tipa
        // {"814", ""}, // Breve below
        {"815", "textsubarch"}, // Inverted breve below - requires tipa
        {"816", "textsubtilde"}, // Tilde below - requires tipa
        {"817", "b"}, // Macron below - not completely correct
        {"818", "b"}, // Underline
        {"819", "subdoublebar"}, // Double low line -- requires extraipa
        {"820", "textsuperimposetilde"}, // Tilde overlay - requires tipa
        // {"821", ""}, // Short stroke overlay
        // {"822", ""}, // Long stroke overlay
        // {"823", ""}, // Short solidus overlay
        // {"824", ""}, // Long solidus overlay
        {"825", "textsubrhalfring"}, // Right half ring below - requires tipa
        {"826", "textinvsubbridge"}, // inverted bridge below - requires tipa
        {"827", "textsubsquare"}, // Square below - requires tipa
        {"828", "textseagull"}, // Seagull below - requires tipa
        {"829", "textovercross"}, // X above - requires tipa
        // {"830", ""}, // Vertical tilde
        // {"831", ""}, // Double overline
        // {"832", ""}, // Grave tone mark
        // {"833", ""}, // Acute tone mark
        // {"834", ""}, // Greek perispomeni
        // {"835", ""}, // Greek koronis
        // {"836", ""}, // Greek dialytika tonos
        // {"837", ""}, // Greek ypogegrammeni
        {"838", "overbridge"}, // Bridge above - requires extraipa
        {"839", "subdoublebar"}, // Equals sign below - requires extraipa
        {"840", "subdoublevert"}, // Double vertical line below - requires extraipa
        {"841", "subcorner"}, // Left angle below - requires extraipa
        {"842", "crtilde"}, // Not tilde above - requires extraipa
        {"843", "dottedtilde"}, // Homothetic above - requires extraipa
        {"844", "doubletilde"}, // Almost equal to above - requires extraipa
        {"845", "spreadlips"}, // Left right arrow below - requires extraipa
        {"846", "whistle"}, // Upwards arrow below - requires extraipa
        // {"864", ""}, // Double tilde
        // {"865", ""}, // Double inverted breve
        {"866", "sliding"}, // Double rightwards arrow below - requires extraipa
    };

    // Matches a numerical entity: group 1 = "x" marker(s), group 2 = leading zeros,
    // group 3 = the digits. Leading zeros are captured separately so that
    // Integer.decode does not interpret the number as octal.
    private static final Pattern NUMERIC_ENTITY =
            Pattern.compile("&#([x]*)([0]*)(\\p{XDigit}+);");

    // Same as NUMERIC_ENTITY, but additionally captures the character in front of the
    // entity (group 1), which is the base character of a combining accent.
    private static final Pattern ACCENTED_CHARACTER =
            Pattern.compile("(.)&#([x]*)([0]*)(\\p{XDigit}+);");

    // Matches any remaining text based entity such as "&foo;"
    private static final Pattern NAMED_ENTITY = Pattern.compile("&(\\w+);");

    // Longest distance between '<' and '>' that is still regarded as a tag
    private static final int MAX_TAG_LENGTH = 30;

    // "&name;" -> LaTeX replacement (in regex replacement syntax)
    private final HashMap<String, String> escapedSymbols = new HashMap<String, String>();
    // code point of combining accent -> LaTeX accent command (in regex replacement syntax)
    private final HashMap<Integer, String> escapedAccents = new HashMap<Integer, String>();
    // code point -> LaTeX replacement (in regex replacement syntax)
    private final HashMap<Integer, String> numSymbols = new HashMap<Integer, String>();

    /**
     * Builds the lookup tables from conversionList and accentList.
     * Rows of conversionList without a LaTeX command are ignored; an empty
     * number or an empty entity name only disables that particular lookup.
     */
    public HTMLConverter() {
        super();
        for (String[] row : conversionList) {
            String number = row[0];
            String name = row[1];
            String latex = row[2];
            if (latex.length() < 1) {
                continue;
            }
            if (name.length() >= 1) {
                escapedSymbols.put("&" + name + ";", latex);
            }
            if (number.length() >= 1) {
                numSymbols.put(Integer.decode(number), latex);
            }
        }
        for (String[] row : accentList) {
            escapedAccents.put(Integer.decode(row[0]), row[1]);
        }
    }

    /**
     * Converts HTML markup and HTML entities in the given text to LaTeX.
     * <ol>
     * <li>&lt;sup&gt; and &lt;sub&gt; are turned into equations or into
     *     \textsuperscript / \textsubscript, depending on the
     *     "useConvertToEquation" preference.</li>
     * <li>Tags shorter than MAX_TAG_LENGTH are removed. A '&lt;' that does not
     *     start such a tag is dropped while the text after it is kept.</li>
     * <li>Text based and numerical entities are replaced using the conversion table.</li>
     * <li>A character followed by a combining accent entity is turned into the
     *     corresponding LaTeX accent command; i and j become dotless.</li>
     * <li>Entities that could not be converted are reported on System.err and
     *     left in the text.</li>
     * </ol>
     * Malformed numerical entities (for example a decimal entity containing hex
     * digits, or a number that does not fit into an int) are treated as not
     * convertible instead of raising an exception.
     *
     * @param text the text to convert, may be null
     * @return the converted and trimmed text, or null if text is null
     */
    public String format(String text) {
        if (text == null) {
            return null;
        }

        // Deal with the form <sup>k</sup> and <sub>k</sub>
        // If the result is in text or equation form can be controlled
        // from the "Advanced settings" tab
        if (Globals.prefs.getBoolean("useConvertToEquation")) {
            text = text.replaceAll("<sup>([^<]+)</sup>", "\\$\\^\\{$1\\}\\$");
            text = text.replaceAll("<sub>([^<]+)</sub>", "\\$_\\{$1\\}\\$");
        } else {
            text = text.replaceAll("<sup>([^<]+)</sup>", "\\\\textsuperscript\\{$1\\}");
            text = text.replaceAll("<sub>([^<]+)</sub>", "\\\\textsubscript\\{$1\\}");
        }

        // TODO: maybe rewrite this based on regular expressions instead
        // Note that (at least) the IEEE Xplore fetcher must be fixed as it relies on the current way to
        // remove tags for its image alt-tag to equation converter. For that reason a '<' that
        // does not start a short tag is still dropped here and not copied to the output.
        StringBuilder stripped = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c == '<') {
                i = readTag(text, i);
            } else {
                stripped.append(c);
            }
        }
        text = stripped.toString();

        // Handle text based HTML entities
        Set<String> entities = escapedSymbols.keySet();
        for (String entity : entities) {
            text = text.replaceAll(entity, escapedSymbols.get(entity));
        }

        // Handle numerical HTML entities. The matcher keeps scanning the text as it was
        // before the loop; every distinct entity found there is replaced in the current text.
        Matcher m = NUMERIC_ENTITY.matcher(text);
        while (m.find()) {
            int num = decodeEntityNumber(m.group(1), m.group(3));
            if (numSymbols.containsKey(num)) {
                text = text.replaceAll(Pattern.quote(m.group()), numSymbols.get(num));
            }
        }

        // Handle combining accents: <base character><entity of the accent>
        m = ACCENTED_CHARACTER.matcher(text);
        while (m.find()) {
            int num = decodeEntityNumber(m.group(2), m.group(4));
            if (!escapedAccents.containsKey(num)) {
                continue;
            }
            String base = m.group(1);
            String latexBase;
            if (base.equals("i")) {
                latexBase = "\\\\i"; // dotless i
            } else if (base.equals("j")) {
                latexBase = "\\\\j"; // dotless j
            } else {
                // The base character is arbitrary input: protect '$' and '\' so that they
                // are not interpreted as group reference or escape in the replacement.
                latexBase = Matcher.quoteReplacement(base);
            }
            // The target is quoted as the base character may be a regex metacharacter
            // such as '(' or '.', which must be matched literally.
            text = text.replaceAll(Pattern.quote(m.group()),
                    "\\{\\\\" + escapedAccents.get(num) + "\\{" + latexBase + "\\}\\}");
        }

        // Report the numerical entities that are still left
        m = NUMERIC_ENTITY.matcher(text);
        while (m.find()) {
            int num = decodeEntityNumber(m.group(1), m.group(3));
            System.err.println("HTML escaped char not converted: " + m.group(1) + m.group(2) + m.group(3)
                    + " = " + (num < 0 ? "invalid" : Integer.toString(num)));
        }

        // Remove $$ in case of two adjacent conversions
        text = text.replace("$$", "");

        // Find non-covered special characters with alphabetic codes
        m = NAMED_ENTITY.matcher(text);
        while (m.find()) {
            System.err.println("HTML escaped char not converted: " + m.group(1));
        }

        return text.trim();
    }

    /**
     * Decodes the number of a numerical HTML entity.
     *
     * @param radixMarker the "x" marker(s) found after "&#", empty for a decimal entity
     * @param digits the digits of the entity without leading zeros
     * @return the decoded number, or -1 if the entity is malformed or out of the int range
     */
    private static int decodeEntityNumber(String radixMarker, String digits) {
        try {
            // Integer.decode understands "#" as hexadecimal prefix
            return Integer.decode(radixMarker.replace("x", "#") + digits);
        } catch (NumberFormatException e) {
            // e.g. "&#a4;", "&#xx41;" or a value above Integer.MAX_VALUE
            return -1;
        }
    }

    /**
     * Looks for the end of a tag. Called when the character at position is '<'.
     *
     * @param text the text being scanned
     * @param position index of the '<' character
     * @return the index of the closing '>' if it is found less than MAX_TAG_LENGTH
     *         characters after position (the caller then skips the whole tag),
     *         otherwise position itself
     */
    private int readTag(String text, int position) {
        int index = text.indexOf('>', position);
        if ((index > position) && (index - position < MAX_TAG_LENGTH)) {
            return index; // Just skip the tag.
        }
        return position; // No (short enough) tag found.
    }
}