/* Copyright (C) 2003-2012 JabRef contributors.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref.imports;
import java.util.HashMap;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sf.jabref.export.layout.LayoutFormatter;
import net.sf.jabref.Globals;
public class HTMLConverter implements LayoutFormatter {
/* Portions © International Organization for Standardization 1986:
Permission to copy in any form is granted for use with
conforming SGML systems and applications as defined in
ISO 8879, provided this notice is included in all copies.
*/
// Most of the LaTeX accent commands are listed at http://en.wikibooks.org/wiki/LaTeX/Accents
// Each symbol can be looked up at http://www.fileformat.info/info/unicode/char/a4/index.htm (replace "a4" with the symbol's U+ number)
// http://detexify.kirelabs.org/classify.html and http://www.ctan.org/tex-archive/info/symbols/comprehensive/ might help to find the right LaTeX command
// http://llg.cubic.org/docs/ent2latex.html and http://www.w3.org/TR/xml-entity-names/byalpha.html are also useful,
// as is http://www.w3.org/Math/characters/unicode.xml
// An array of arrays of strings in the format:
// {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"}
// Leaving a field empty is OK as it then will not be included
private String[][] conversionList = new String[][]{
{"160", "nbsp", "\\{~\\}"}, // no-break space = non-breaking space,
// U+00A0 ISOnum
{"161", "iexcl", "\\{\\\\textexclamdown\\}"}, // inverted exclamation mark, U+00A1 ISOnum
{"162", "cent", "\\{\\\\textcent\\}"}, // cent sign, U+00A2 ISOnum
{"163", "pound", "\\{\\\\pounds\\}"}, // pound sign, U+00A3 ISOnum
{"164", "curren", "\\{\\\\textcurrency\\}"}, // currency sign, U+00A4 ISOnum
{"165", "yen", "\\{\\\\textyen\\}"}, // yen sign = yuan sign, U+00A5 ISOnum
{"166", "brvbar", "\\{\\\\textbrokenbar\\}"}, // broken bar = broken vertical bar,
// U+00A6 ISOnum
{"167", "sect", "\\{\\\\S\\}"}, // section sign, U+00A7 ISOnum
{"168", "uml", "\\{\\\\\"\\{\\}\\}"}, // diaeresis = spacing diaeresis,
// U+00A8 ISOdia
{"169", "copy", "\\{\\\\copyright\\}"}, // copyright sign, U+00A9 ISOnum
{"170", "ordf", "\\{\\\\textordfeminine\\}"}, // feminine ordinal indicator, U+00AA ISOnum
{"171", "laquo", "\\{\\\\guillemotleft\\}"}, // left-pointing double angle quotation mark
// = left pointing guillemet, U+00AB ISOnum
{"172", "not", "\\$\\\\neg\\$"}, // not sign, U+00AC ISOnum
{"173", "shy", "\\\\-"}, // soft hyphen = discretionary hyphen,
// U+00AD ISOnum
{"174", "reg", "\\{\\\\textregistered\\}"}, // registered sign = registered trade mark sign,
// U+00AE ISOnum
{"175", "macr", "\\{\\\\=\\{\\}\\}"}, // macron = spacing macron = overline
// = APL overbar, U+00AF ISOdia
{"176", "deg", "\\$\\\\deg\\$"}, // degree sign, U+00B0 ISOnum
{"177", "plusmn", "\\$\\\\pm\\$"}, // plus-minus sign = plus-or-minus sign,
// U+00B1 ISOnum
{"178", "sup2", "\\\\textsuperscript\\{2\\}"}, // superscript two = superscript digit two
// = squared, U+00B2 ISOnum
{"179", "sup3", "\\\\textsuperscript\\{3\\}"}, // superscript three = superscript digit three
// = cubed, U+00B3 ISOnum
{"180", "acute", "\\{\\\\'\\{\\}\\}"}, // acute accent = spacing acute,
// U+00B4 ISOdia
{"181", "micro", "\\$\\\\mu\\$"}, // micro sign, U+00B5 ISOnum
{"182", "para", "\\{\\\\P\\}"}, // pilcrow sign = paragraph sign,
// U+00B6 ISOnum
{"183", "middot", "\\$\\\\cdot\\$"}, // middle dot = Georgian comma
// = Greek middle dot, U+00B7 ISOnum
{"184", "cedil", "\\{\\\\c\\{\\}\\}"}, // cedilla = spacing cedilla, U+00B8 ISOdia
{"185", "sup1", "\\\\textsuperscript\\{1\\}"}, // superscript one = superscript digit one,
// U+00B9 ISOnum
{"186", "ordm", "\\{\\\\textordmasculine\\}"}, // masculine ordinal indicator,
// U+00BA ISOnum
{"187", "raquo", "\\{\\\\guillemotright\\}"}, // right-pointing double angle quotation mark
// = right pointing guillemet, U+00BB ISOnum
{"188", "frac14", "\\$\\\\sfrac\\{1\\}\\{4\\}\\$"}, // vulgar fraction one quarter
// = fraction one quarter, U+00BC ISOnum
{"189", "frac12", "\\$\\\\sfrac\\{1\\}\\{2\\}\\$"}, // vulgar fraction one half
// = fraction one half, U+00BD ISOnum
{"190", "frac34", "\\$\\\\sfrac\\{3\\}\\{4\\}\\$"}, // vulgar fraction three quarters
// = fraction three quarters, U+00BE ISOnum
{"191", "iquest", "\\{\\\\textquestiondown\\}"}, // inverted question mark
// = turned question mark, U+00BF ISOnum
{"192", "Agrave", "\\{\\\\`\\{A\\}\\}"}, // latin capital letter A with grave
// = latin capital letter A grave,
// U+00C0 ISOlat1
{"193", "Aacute", "\\{\\\\'\\{A\\}\\}"}, // latin capital letter A with acute,
// U+00C1 ISOlat1
{"194", "Acirc", "\\{\\\\\\^\\{A\\}\\}"}, // latin capital letter A with circumflex,
// U+00C2 ISOlat1
{"195", "Atilde", "\\{\\\\~\\{A\\}\\}"}, // latin capital letter A with tilde,
// U+00C3 ISOlat1
{"196", "Auml", "\\{\\\\\"\\{A\\}\\}"}, // latin capital letter A with diaeresis,
// U+00C4 ISOlat1
{"197", "Aring", "\\{\\\\AA\\}"}, // latin capital letter A with ring above
// = latin capital letter A ring,
// U+00C5 ISOlat1
{"198", "AElig", "\\{\\\\AE\\}"}, // latin capital letter AE
// = latin capital ligature AE,
// U+00C6 ISOlat1
{"199", "Ccedil", "\\{\\\\c\\{C\\}\\}"}, // latin capital letter C with cedilla,
// U+00C7 ISOlat1
{"200", "Egrave", "\\{\\\\`\\{E\\}\\}"}, // latin capital letter E with grave,
// U+00C8 ISOlat1
{"201", "Eacute", "\\{\\\\'\\{E\\}\\}"}, // latin capital letter E with acute,
// U+00C9 ISOlat1
{"202", "Ecirc", "\\{\\\\\\^\\{E\\}\\}"}, // latin capital letter E with circumflex,
// U+00CA ISOlat1
{"203", "Euml", "\\{\\\\\"\\{E\\}\\}"}, // latin capital letter E with diaeresis,
// U+00CB ISOlat1
{"204", "Igrave", "\\{\\\\`\\{I\\}\\}"}, // latin capital letter I with grave,
// U+00CC ISOlat1
{"205", "Iacute", "\\{\\\\'\\{I\\}\\}"}, // latin capital letter I with acute,
// U+00CD ISOlat1
{"206", "Icirc", "\\{\\\\\\^\\{I\\}\\}"}, // latin capital letter I with circumflex,
// U+00CE ISOlat1
{"207", "Iuml", "\\{\\\\\"\\{I\\}\\}"}, // latin capital letter I with diaeresis,
// U+00CF ISOlat1
{"208", "ETH", "\\{\\\\DH\\}"}, // latin capital letter ETH, U+00D0 ISOlat1
{"209", "Ntilde", "\\{\\\\~\\{N\\}\\}"}, // latin capital letter N with tilde,
// U+00D1 ISOlat1
{"210", "Ograve", "\\{\\\\`\\{O\\}\\}"}, // latin capital letter O with grave,
// U+00D2 ISOlat1
{"211", "Oacute", "\\{\\\\'\\{O\\}\\}"}, // latin capital letter O with acute,
// U+00D3 ISOlat1
{"212", "Ocirc", "\\{\\\\\\^\\{O\\}\\}"}, // latin capital letter O with circumflex,
// U+00D4 ISOlat1
{"213", "Otilde", "\\{\\\\~\\{O\\}\\}"}, // latin capital letter O with tilde,
// U+00D5 ISOlat1
{"214", "Ouml", "\\{\\\\\"\\{O\\}\\}"}, // latin capital letter O with diaeresis,
// U+00D6 ISOlat1
{"215", "times", "\\$\\\\times\\$"}, // multiplication sign, U+00D7 ISOnum
{"216", "Oslash", "\\{\\\\O\\}"}, // latin capital letter O with stroke
// = latin capital letter O slash,
// U+00D8 ISOlat1
{"217", "Ugrave", "\\{\\\\`\\{U\\}\\}"}, // latin capital letter U with grave,
// U+00D9 ISOlat1
{"218", "Uacute", "\\{\\\\'\\{U\\}\\}"}, // latin capital letter U with acute,
// U+00DA ISOlat1
{"219", "Ucirc", "\\{\\\\\\^\\{U\\}\\}"}, // latin capital letter U with circumflex,
// U+00DB ISOlat1
{"220", "Uuml", "\\{\\\\\"\\{U\\}\\}"}, // latin capital letter U with diaeresis,
// U+00DC ISOlat1
{"221", "Yacute", "\\{\\\\'\\{Y\\}\\}"}, // latin capital letter Y with acute,
// U+00DD ISOlat1
{"222", "THORN", "\\{\\\\TH\\}"}, // latin capital letter THORN,
// U+00DE ISOlat1
{"223", "szlig", "\\{\\\\ss\\}"}, // latin small letter sharp s = ess-zed,
// U+00DF ISOlat1
{"224", "agrave", "\\{\\\\`\\{a\\}\\}"}, // latin small letter a with grave
// = latin small letter a grave,
// U+00E0 ISOlat1
{"225", "aacute", "\\{\\\\'\\{a\\}\\}"}, // latin small letter a with acute,
// U+00E1 ISOlat1
{"226", "acirc", "\\{\\\\\\^\\{a\\}\\}"}, // latin small letter a with circumflex,
// U+00E2 ISOlat1
{"227", "atilde", "\\{\\\\~\\{a\\}\\}"}, // latin small letter a with tilde,
// U+00E3 ISOlat1
{"228", "auml", "\\{\\\\\"\\{a\\}\\}"}, // latin small letter a with diaeresis,
// U+00E4 ISOlat1
{"229", "aring", "\\{\\\\aa\\}"}, // latin small letter a with ring above
// = latin small letter a ring,
// U+00E5 ISOlat1
{"230", "aelig", "\\{\\\\ae\\}"}, // latin small letter ae
// = latin small ligature ae, U+00E6 ISOlat1
{"231", "ccedil", "\\{\\\\c\\{c\\}\\}"}, // latin small letter c with cedilla,
// U+00E7 ISOlat1
{"232", "egrave", "\\{\\\\`\\{e\\}\\}"}, // latin small letter e with grave,
// U+00E8 ISOlat1
{"233", "eacute", "\\{\\\\'\\{e\\}\\}"}, // latin small letter e with acute,
// U+00E9 ISOlat1
{"234", "ecirc", "\\{\\\\\\^\\{e\\}\\}"}, // latin small letter e with circumflex,
// U+00EA ISOlat1
{"235", "euml", "\\{\\\\\"\\{e\\}\\}"}, // latin small letter e with diaeresis,
// U+00EB ISOlat1
{"236", "igrave", "\\{\\\\`\\{\\\\i\\}\\}"}, // latin small letter i with grave,
// U+00EC ISOlat1
{"237", "iacute", "\\{\\\\'\\{\\\\i\\}\\}"}, // latin small letter i with acute,
// U+00ED ISOlat1
{"238", "icirc", "\\{\\\\\\^\\{\\\\i\\}\\}"}, // latin small letter i with circumflex,
// U+00EE ISOlat1
{"239", "iuml", "\\{\\\\\"\\{\\\\i\\}\\}"}, // latin small letter i with diaeresis,
// U+00EF ISOlat1
{"240", "eth", "\\{\\\\dh\\}"}, // latin small letter eth, U+00F0 ISOlat1
{"241", "ntilde", "\\{\\\\~\\{n\\}\\}"}, // latin small letter n with tilde,
// U+00F1 ISOlat1
{"242", "ograve", "\\{\\\\`\\{o\\}\\}"}, // latin small letter o with grave,
// U+00F2 ISOlat1
{"243", "oacute", "\\{\\\\'\\{o\\}\\}"}, // latin small letter o with acute,
// U+00F3 ISOlat1
{"244", "ocirc", "\\{\\\\\\^\\{o\\}\\}"}, // latin small letter o with circumflex,
// U+00F4 ISOlat1
{"245", "otilde", "\\{\\\\~\\{o\\}\\}"}, // latin small letter o with tilde,
// U+00F5 ISOlat1
{"246", "ouml", "\\{\\\\\"\\{o\\}\\}"}, // latin small letter o with diaeresis,
// U+00F6 ISOlat1
{"247", "divide", "\\$\\\\div\\$"}, // division sign, U+00F7 ISOnum
{"248", "oslash", "\\{\\\\o\\}"}, // latin small letter o with stroke,
// = latin small letter o slash,
// U+00F8 ISOlat1
{"249", "ugrave", "\\{\\\\`\\{u\\}\\}"}, // latin small letter u with grave,
// U+00F9 ISOlat1
{"250", "uacute", "\\{\\\\'\\{u\\}\\}"}, // latin small letter u with acute,
// U+00FA ISOlat1
{"251", "ucirc", "\\{\\\\\\^\\{u\\}\\}"}, // latin small letter u with circumflex,
// U+00FB ISOlat1
{"252", "uuml", "\\{\\\\\"\\{u\\}\\}"}, // latin small letter u with diaeresis,
// U+00FC ISOlat1
{"253", "yacute", "\\{\\\\'\\{y\\}\\}"}, // latin small letter y with acute,
// U+00FD ISOlat1
{"254", "thorn", "\\{\\\\th\\}"}, // latin small letter thorn,
// U+00FE ISOlat1
{"255", "yuml", "\\{\\\\\"\\{y\\}\\}"}, // latin small letter y with diaeresis,
// U+00FF ISOlat1
{"402", "fnof", "\\$f\\$"}, // latin small f with hook = function
// = florin, U+0192 ISOtech
/* Greek */
{"913", "Alpha", "\\{\\$\\\\Alpha\\$\\}"}, // greek capital letter alpha, U+0391
{"914", "Beta", "\\{\\$\\\\Beta\\$\\}"}, // greek capital letter beta, U+0392
{"915", "Gamma", "\\{\\$\\\\Gamma\\$\\}"}, // greek capital letter gamma,
// U+0393 ISOgrk3
{"916", "Delta", "\\{\\$\\\\Delta\\$\\}"}, // greek capital letter delta,
// U+0394 ISOgrk3
{"917", "Epsilon", "\\{\\$\\\\Epsilon\\$\\}"}, // greek capital letter epsilon, U+0395
{"918", "Zeta", "\\{\\$\\\\Zeta\\$\\}"}, // greek capital letter zeta, U+0396
{"919", "Eta", "\\{\\$\\\\Eta\\$\\}"}, // greek capital letter eta, U+0397
{"920", "Theta", "\\{\\$\\\\Theta\\$\\}"}, // greek capital letter theta,
// U+0398 ISOgrk3
{"921", "Iota", "\\{\\$\\\\Iota\\$\\}"}, // greek capital letter iota, U+0399
{"922", "Kappa", "\\{\\$\\\\Kappa\\$\\}"}, // greek capital letter kappa, U+039A
{"923", "Lambda", "\\{\\$\\\\Lambda\\$\\}"}, // greek capital letter lambda,
// U+039B ISOgrk3
{"924", "Mu", "\\{\\$\\\\Mu\\$\\}"}, // greek capital letter mu, U+039C
{"925", "Nu", "\\{\\$\\\\Nu\\$\\}"}, // greek capital letter nu, U+039D
{"926", "Xi", "\\{\\$\\\\Xi\\$\\}"}, // greek capital letter xi, U+039E ISOgrk3
{"927", "Omicron", "\\{\\$\\\\Omicron\\$\\}"}, // greek capital letter omicron, U+039F
{"928", "Pi", "\\{\\$\\\\Pi\\$\\}"}, // greek capital letter pi, U+03A0 ISOgrk3
{"929", "Rho", "\\{\\$\\\\Rho\\$\\}"}, // greek capital letter rho, U+03A1
/* there is no Sigmaf, and no U+03A2 character either */
{"931", "Sigma", "\\{\\$\\\\Sigma\\$\\}"}, // greek capital letter sigma,
// U+03A3 ISOgrk3
{"932", "Tau", "\\{\\$\\\\Tau\\$\\}"}, // greek capital letter tau, U+03A4
{"933", "Upsilon", "\\{\\$\\\\Upsilon\\$\\}"}, // greek capital letter upsilon,
// U+03A5 ISOgrk3
{"934", "Phi", "\\{\\$\\\\Phi\\$\\}"}, // greek capital letter phi,
// U+03A6 ISOgrk3
{"935", "Chi", "\\{\\$\\\\Chi\\$\\}"}, // greek capital letter chi, U+03A7
{"936", "Psi", "\\{\\$\\\\Psi\\$\\}"}, // greek capital letter psi,
// U+03A8 ISOgrk3
{"937", "Omega", "\\{\\$\\\\Omega\\$\\}"}, // greek capital letter omega,
// U+03A9 ISOgrk3
{"945", "alpha", "\\$\\\\alpha\\$"}, // greek small letter alpha,
// U+03B1 ISOgrk3
{"946", "beta", "\\$\\\\beta\\$"}, // greek small letter beta, U+03B2 ISOgrk3
{"947", "gamma", "\\$\\\\gamma\\$"}, // greek small letter gamma,
// U+03B3 ISOgrk3
{"948", "delta", "\\$\\\\delta\\$"}, // greek small letter delta,
// U+03B4 ISOgrk3
{"949", "epsilon", "\\$\\\\epsilon\\$"}, // greek small letter epsilon,
// U+03B5 ISOgrk3
{"950", "zeta", "\\$\\\\zeta\\$"}, // greek small letter zeta, U+03B6 ISOgrk3
{"951", "eta", "\\$\\\\eta\\$"}, // greek small letter eta, U+03B7 ISOgrk3
{"952", "theta", "\\$\\\\theta\\$"}, // greek small letter theta,
// U+03B8 ISOgrk3
{"953", "iota", "\\$\\\\iota\\$"}, // greek small letter iota, U+03B9 ISOgrk3
{"954", "kappa", "\\$\\\\kappa\\$"}, // greek small letter kappa,
// U+03BA ISOgrk3
{"955", "lambda", "\\$\\\\lambda\\$"}, // greek small letter lambda,
// U+03BB ISOgrk3
{"956", "mu", "\\$\\\\mu\\$"}, // greek small letter mu, U+03BC ISOgrk3
{"957", "nu", "\\$\\\\nu\\$"}, // greek small letter nu, U+03BD ISOgrk3
{"958", "xi", "\\$\\\\xi\\$"}, // greek small letter xi, U+03BE ISOgrk3
{"959", "omicron", "\\$\\\\omicron\\$"}, // greek small letter omicron, U+03BF NEW
{"960", "pi", "\\$\\\\phi\\$"}, // greek small letter pi, U+03C0 ISOgrk3
{"961", "rho", "\\$\\\\rho\\$"}, // greek small letter rho, U+03C1 ISOgrk3
{"962", "sigmaf", "\\$\\\\varsigma\\$"}, // greek small letter final sigma,
// U+03C2 ISOgrk3
{"963", "sigma", "\\$\\\\sigma\\$"}, // greek small letter sigma,
// U+03C3 ISOgrk3
{"964", "tau", "\\$\\\\tau\\$"}, // greek small letter tau, U+03C4 ISOgrk3
{"965", "upsilon", "\\$\\\\upsilon\\$"}, // greek small letter upsilon,
{"", "upsi", "\\$\\\\upsilon\\$"}, // alias
// U+03C5 ISOgrk3
{"966", "phi", "\\$\\\\phi\\$"}, // greek small letter phi, U+03C6 ISOgrk3
{"967", "chi", "\\$\\\\chi\\$"}, // greek small letter chi, U+03C7 ISOgrk3
{"968", "psi", "\\$\\\\psi\\$"}, // greek small letter psi, U+03C8 ISOgrk3
{"969", "omega", "\\$\\\\omega\\$"}, // greek small letter omega,
// U+03C9 ISOgrk3
{"977", "thetasym", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol,
{"", "thetav", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol,
{"", "vartheta", "\\$\\\\vartheta\\$"}, // greek small letter theta symbol,
// U+03D1 NEW
{"978", "upsih", "\\{\\$\\\\Upsilon\\$\\}"}, // greek upsilon with hook symbol,
// U+03D2 NEW
{"982", "piv", "\\$\\\\varphi\\$"}, // greek pi symbol, U+03D6 ISOgrk3
/* General Punctuation */
{"8226", "bull", "\\$\\\\bullet\\$"}, // bullet = black small circle,
// U+2022 ISOpub
/* bullet is NOT the same as bullet operator, U+2219 */
{"8230", "hellip", "\\{\\\\ldots\\}"}, // horizontal ellipsis = three dot leader,
// U+2026 ISOpub
{"8242", "prime", "\\$\\\\prime\\$"}, // prime = minutes = feet, U+2032 ISOtech
{"8243", "Prime", "\\$\\{''\\}\\$"}, // double prime = seconds = inches,
// U+2033 ISOtech
{"8254", "oline", "\\{\\\\=\\{\\}\\}"}, // overline = spacing overscore,
// U+203E NEW
{"8260", "frasl", "/"}, // fraction slash, U+2044 NEW
/* Letterlike Symbols */
{"8472", "weierp", "\\$\\\\wp\\$"}, // script capital P = power set
// = Weierstrass p, U+2118 ISOamso
{"8465", "image", "\\{\\$\\\\Im\\$\\}"}, // blackletter capital I = imaginary part,
// U+2111 ISOamso
{"8476", "real", "\\{\\$\\\\Re\\$\\}"}, // blackletter capital R = real part symbol,
// U+211C ISOamso
{"8482", "trade", "\\{\\\\texttrademark\\}"}, // trade mark sign, U+2122 ISOnum
{"8501", "alefsym", "\\$\\\\aleph\\$"}, // alef symbol = first transfinite cardinal,
// U+2135 NEW
/* alef symbol is NOT the same as hebrew letter alef,
U+05D0 although the same glyph could be used to depict both characters */
/* Arrows */
{"8592", "larr", "\\$\\\\leftarrow\\$"}, // leftwards arrow, U+2190 ISOnum
{"8593", "uarr", "\\$\\\\uparrow\\$"}, // upwards arrow, U+2191 ISOnum
{"8594", "rarr", "\\$\\\\rightarrow\\$"}, // rightwards arrow, U+2192 ISOnum
{"8595", "darr", "\\$\\\\downarrow\\$"}, // downwards arrow, U+2193 ISOnum
{"8596", "harr", "\\$\\\\leftrightarrow\\$"}, // left right arrow, U+2194 ISOamsa
{"8629", "crarr", "\\$\\\\dlsh\\$"}, // downwards arrow with corner leftwards
// = carriage return, U+21B5 NEW - require mathabx
{"8656", "lArr", "\\{\\$\\\\Leftarrow\\$\\}"}, // leftwards double arrow, U+21D0 ISOtech
/* ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests */
{"8657", "uArr", "\\{\\$\\\\Uparrow\\$\\}"}, // upwards double arrow, U+21D1 ISOamsa
{"8658", "rArr", "\\{\\$\\\\Rightarrow\\$\\}"}, // rightwards double arrow,
// U+21D2 ISOtech
/* ISO 10646 does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests */
{"8659", "dArr", "\\{\\$\\\\Downarrow\\$\\}"}, // downwards double arrow, U+21D3 ISOamsa
{"8660", "hArr", "\\{\\$\\\\Leftrightarrow\\$\\}"}, // left right double arrow,
// U+21D4 ISOamsa
/* Mathematical Operators */
{"8704", "forall", "\\$\\\\forall\\$"}, // for all, U+2200 ISOtech
{"8706", "part", "\\$\\\\partial\\$"}, // partial differential, U+2202 ISOtech
{"8707", "exist", "\\$\\\\exists\\$"}, // there exists, U+2203 ISOtech
{"8709", "empty", "\\$\\\\emptyset\\$"}, // empty set = null set = diameter,
// U+2205 ISOamso
{"8711", "nabla", "\\$\\\\nabla\\$"}, // nabla = backward difference,
// U+2207 ISOtech
{"8712", "isin", "\\$\\\\in\\$"}, // element of, U+2208 ISOtech
{"8713", "notin", "\\$\\\\notin\\$"}, // not an element of, U+2209 ISOtech
{"8715", "ni", "\\$\\\\ni\\$"}, // contains as member, U+220B ISOtech
/* should there be a more memorable name than 'ni'? */
{"8719", "prod", "\\$\\\\prod\\$"}, // n-ary product = product sign,
// U+220F ISOamsb
/* prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both */
{"8721", "sum", "\\$\\\\sum\\$"}, // n-ary summation, U+2211 ISOamsb
/* sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both */
{"8722", "minus", "\\$-\\$"}, // minus sign, U+2212 ISOtech
{"8727", "lowast", "\\$\\\\ast\\$"}, // asterisk operator, U+2217 ISOtech
{"8730", "radic", "\\$\\\\sqrt{}\\$"}, // square root = radical sign,
// U+221A ISOtech
{"8733", "prop", "\\$\\\\propto\\$"}, // proportional to, U+221D ISOtech
{"8734", "infin", "\\$\\\\infty\\$"}, // infinity, U+221E ISOtech
{"8736", "ang", "\\$\\\\angle\\$"}, // angle, U+2220 ISOamso
{"8743", "and", "\\$\\\\land\\$"}, // logical and = wedge, U+2227 ISOtech
{"8744", "or", "\\$\\\\lor\\$"}, // logical or = vee, U+2228 ISOtech
{"8745", "cap", "\\$\\\\cap\\$"}, // intersection = cap, U+2229 ISOtech
{"8746", "cup", "\\$\\\\cup\\$"}, // union = cup, U+222A ISOtech
{"8747", "int", "\\$\\\\int\\$"}, // integral, U+222B ISOtech
{"8756", "there4", "\\$\\\\uptherefore\\$"}, // therefore, U+2234 ISOtech; only in LaTeX package MnSymbol
{"8764", "sim", "\\$\\\\sim\\$"}, // tilde operator = varies with = similar to,
// U+223C ISOtech
/* tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both */
{"8773", "cong", "\\$\\\\cong\\$"}, // approximately equal to, U+2245 ISOtech
{"8776", "asymp", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to,
// U+2248 ISOamsr
{"8800", "ne", "\\$\\\\neq\\$"}, // not equal to, U+2260 ISOtech
{"8801", "equiv", "\\$\\\\equiv\\$"}, // identical to, U+2261 ISOtech
{"8804", "le", "\\$\\\\leq\\$"}, // less-than or equal to, U+2264 ISOtech
{"8805", "ge", "\\$\\\\geq\\$"}, // greater-than or equal to,
// U+2265 ISOtech
{"8834", "sub", "\\$\\\\subset\\$"}, // subset of, U+2282 ISOtech
{"8835", "sup", "\\$\\\\supset\\$"}, // superset of, U+2283 ISOtech
/* note that nsup, 'not a superset of, U+2285' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn */
{"8836", "nsub", "\\$\\\\nsubset\\$"}, // not a subset of, U+2284 ISOamsn
{"8838", "sube", "\\$\\\\subseteq\\$"}, // subset of or equal to, U+2286 ISOtech
{"8839", "supe", "\\$\\\\supseteq\\$"}, // superset of or equal to,
// U+2287 ISOtech
{"8853", "oplus", "\\$\\\\oplus\\$"}, // circled plus = direct sum,
// U+2295 ISOamsb
{"8855", "otimes", "\\$\\\\otimes\\$"}, // circled times = vector product,
// U+2297 ISOamsb
{"8869", "perp", "\\$\\\\perp\\$"}, // up tack = orthogonal to = perpendicular,
// U+22A5 ISOtech
{"8901", "sdot", "\\$\\\\cdot\\$"}, // dot operator, U+22C5 ISOamsb
/* dot operator is NOT the same character as U+00B7 middle dot */
/* Miscellaneous Technical */
{"8968", "lceil", "\\$\\\\lceil\\$"}, // left ceiling = apl upstile,
// U+2308 ISOamsc
{"8969", "rceil", "\\$\\\\rceil\\$"}, // right ceiling, U+2309 ISOamsc
{"8970", "lfloor", "\\$\\\\lfloor\\$"}, // left floor = apl downstile,
// U+230A ISOamsc
{"8971", "rfloor", "\\$\\\\rfloor\\$"}, // right floor, U+230B ISOamsc
{"9001", "lang", "\\$\\\\langle\\$"}, // left-pointing angle bracket = bra,
// U+2329 ISOtech
/* lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' */
{"9002", "rang", "\\$\\\\rangle\\$"}, // right-pointing angle bracket = ket,
// U+232A ISOtech
/* rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' */
/* Geometric Shapes */
{"9674", "loz", "\\$\\\\lozenge\\$"}, // lozenge, U+25CA ISOpub
/* Miscellaneous Symbols */
{"9824", "spades", "\\$\\\\spadesuit\\$"}, // black spade suit, U+2660 ISOpub
/* black here seems to mean filled as opposed to hollow */
{"9827", "clubs", "\\$\\\\clubsuit\\$"}, // black club suit = shamrock,
// U+2663 ISOpub
{"9829", "hearts", "\\$\\\\heartsuit\\$"}, // black heart suit = valentine,
// U+2665 ISOpub
{"9830", "diams", "\\$\\\\diamondsuit\\$"}, // black diamond suit, U+2666 ISOpub
{"34", "quot", "\""}, // quotation mark = APL quote,
// U+0022 ISOnum
{"38", "amp", "\\\\&"}, // ampersand, U+0026 ISOnum
{"60", "lt", "\\$<\\$"}, // less-than sign, U+003C ISOnum
{"62", "gt", "\\$>\\$"}, // greater-than sign, U+003E ISOnum
/* Latin Extended-A */
{"338", "OElig", "\\{\\\\OE\\}"}, // latin capital ligature OE,
// U+0152 ISOlat2
{"339", "oelig", "\\{\\\\oe\\}"}, // latin small ligature oe, U+0153 ISOlat2
/* ligature is a misnomer, this is a separate character in some languages */
{"352", "Scaron", "\\{\\\\v\\{S\\}\\}"}, // latin capital letter S with caron,
// U+0160 ISOlat2
{"353", "scaron", "\\{\\\\v\\{s\\}\\}"}, // latin small letter s with caron,
// U+0161 ISOlat2
{"376", "Yuml", "\\{\\\\\"\\{Y\\}\\}"}, // latin capital letter Y with diaeresis,
// U+0178 ISOlat2
/* Spacing Modifier Letters */
{"710", "circ", "\\{\\\\textasciicircum\\}"}, // modifier letter circumflex accent,
// U+02C6 ISOpub
{"732", "tilde", "\\{\\\\textasciitilde\\}"}, // small tilde, U+02DC ISOdia
/* General Punctuation */
{"8194", "ensp", "\\\\hspace\\{0.5em\\}"}, // en space, U+2002 ISOpub
{"8195", "emsp", "\\\\hspace\\{1em\\}"}, // em space, U+2003 ISOpub
{"8201", "thinsp", "\\\\hspace\\{0.167em\\}"}, // thin space, U+2009 ISOpub
{"8204", "zwnj", ""}, // zero width non-joiner,
// U+200C NEW RFC 2070
{"8205", "zwj", ""}, // zero width joiner, U+200D NEW RFC 2070
{"8206", "lrm", ""}, // left-to-right mark, U+200E NEW RFC 2070
{"8207", "rlm", ""}, // right-to-left mark, U+200F NEW RFC 2070
{"8211", "ndash", "--"}, // en dash, U+2013 ISOpub
{"8212", "mdash", "---"}, // em dash, U+2014 ISOpub
{"8216", "lsquo", "\\{\\\\textquoteleft\\}"}, // left single quotation mark,
// U+2018 ISOnum
{"8217", "rsquo", "\\{\\\\textquoteright\\}"}, // right single quotation mark,
// U+2019 ISOnum
{"8218", "sbquo", "\\{\\\\quotesinglbase\\}"}, // single low-9 quotation mark, U+201A NEW
{"8220", "ldquo", "\\{\\\\textquotedblleft\\}"}, // left double quotation mark,
// U+201C ISOnum
{"8221", "rdquo", "\\{\\\\textquotedblright\\}"}, // right double quotation mark,
// U+201D ISOnum
{"8222", "bdquo", "\\{\\\\quotedblbase\\}"}, // double low-9 quotation mark, U+201E NEW
{"8224", "dagger", "\\{\\\\dag\\}"}, // dagger, U+2020 ISOpub
{"8225", "Dagger", "\\{\\\\ddag\\}"}, // double dagger, U+2021 ISOpub
{"8240", "permil", "\\{\\\\textperthousand\\}"}, // per mille sign, U+2030 ISOtech
{"8249", "lsaquo", "\\{\\\\guilsinglleft\\}"}, // single left-pointing angle quotation mark,
// U+2039 ISO proposed
/* lsaquo is proposed but not yet ISO standardized */
{"8250", "rsaquo", "\\{\\\\guilsinglright\\}"}, // single right-pointing angle quotation mark,
// U+203A ISO proposed
/* rsaquo is proposed but not yet ISO standardized */
{"8364", "euro", "\\{\\\\texteuro\\}"}, // euro sign, U+20AC NEW
/* Manually added */
{"37", "percnt", "\\\\%"}, // Percent
{"39", "", "'"}, // Apostrophe
{"40", "", "("}, // Left bracket
{"41", "", ")"}, // Right bracket
{"43", "plus", "\\+"}, // Plus
{"44", "comma", ","}, // Comma
{"45", "hyphen", "-"}, // Hyphen
{"46", "period", "\\."}, // Period
{"47", "slash", "/"}, // Slash (solidus)
{"58", "colon", ":"}, // Colon
{"59", "semi", ";"}, // Semi colon
{"91", "lsqb", "\\["}, // Left square bracket
{"92", "bsol", "\\{\\\\textbackslash\\}"}, // Backslash
{"93", "rsqb", "\\]"}, // Right square bracket
{"94", "Hat", "\\{\\\\\\^\\{\\}\\}"}, // Circumflex
{"95", "lowbar", "\\\\_"}, // Underscore
{"96", "grave", "\\{\\\\`\\{\\}\\}"}, // Grave
{"123", "lbrace", "\\\\\\{"}, // Left curly bracket
{"", "lcub", "\\\\\\{"}, // Left curly bracket
{"124", "vert", "\\|"}, // Vertical bar
{"", "verbar", "\\|"}, // Vertical bar
{"", "VerticalLine", "\\|"}, // Vertical bar
{"125", "rbrace", "\\\\\\}"}, // Right curly bracket
{"", "rcub", "\\\\\\}"}, // Right curly bracket
{"138", "", "\\{\\\\v\\{S\\}\\}"}, // capital S with caron -- presumably the Windows-1252 byte 0x8A (U+008A itself is the C1 control "line tabulation set")
// {"141", "", ""}, // Reverse line feed
{"145", "", "`"}, // Apostrophe
{"146", "", "'"}, // Apostrophe
{"147", "", "``"}, // Quotation mark
{"148", "", "''"}, // Quotation mark
{"150", "", "--"}, // En dash
{"154", "", "\\{\\\\v\\{s\\}\\}"}, // small s with caron -- presumably the Windows-1252 byte 0x9A (U+009A itself is the C1 control "single character introducer")
{"260", "Aogon", "\\{\\\\k\\{A\\}\\}"}, // capital A with ogonek
{"261", "aogon", "\\{\\\\k\\{a\\}\\}"}, // small a with ogonek
{"262", "Cacute", "\\{\\\\'\\{C\\}\\}"}, // capital C with acute
{"263", "cacute", "\\{\\\\'\\{c\\}\\}"}, // small C with acute
{"264", "Ccirc", "\\{\\\\\\^\\{C\\}\\}"}, // capital C with circumflex
{"265", "ccirc", "\\{\\\\\\^\\{c\\}\\}"}, // small C with circumflex
{"266", "Cdot", "\\{\\\\\\.\\{C\\}\\}"}, // capital C with dot above
{"267", "cdot", "\\{\\\\\\.\\{c\\}\\}"}, // small C with dot above
{"268", "Ccaron", "\\{\\\\v\\{C\\}\\}"}, // capital C with caron
{"269", "ccaron", "\\{\\\\v\\{c\\}\\}"}, // small C with caron
{"272", "Dstrok", "\\{\\\\DJ\\}"}, // capital D with stroke
{"273", "dstrok", "\\{\\\\dj\\}"}, // small d with stroke
{"280", "Eogon", "\\{\\\\k\\{E\\}\\}"}, // capital E with ogonek
{"281", "eogon", "\\{\\\\k\\{e\\}\\}"}, // small e with ogonek
{"298", "Imacr", "\\{\\\\=\\{I\\}\\}"}, // capital I with macron
{"299", "imacr", "\\{\\\\=\\{\\\\i\\}\\}"}, // small i with macron
{"302", "Iogon", "\\{\\\\k\\{I\\}\\}"}, // capital I with ogonek
{"303", "iogon", "\\{\\\\k\\{i\\}\\}"}, // small i with ogonek
{"304", "Idot", "\\{\\\\.\\{I\\}\\}"}, // capital I with dot above
{"305", "inodot", "\\{\\\\i\\}"}, // Small i without the dot
{"", "imath", "\\{\\\\i\\}"}, // Small i without the dot
{"321", "Lstrok", "\\{\\\\L\\}"}, // upper case l with stroke
{"322", "lstrok", "\\{\\\\l\\}"}, // lower case l with stroke
{"370", "Uogon", "\\{\\\\k\\{U\\}\\}"}, // capital U with ogonek
{"371", "uogon", "\\{\\\\k\\{u\\}\\}"}, // small u with ogonek
{"490", "Oogon", "\\{\\\\k\\{O\\}\\}"}, // capital letter O with ogonek
{"491", "oogon", "\\{\\\\k\\{o\\}\\}"}, // small letter o with ogonek
{"492", "", "\\{\\\\k\\{\\\\=\\{O\\}\\}\\}"}, // capital letter O with ogonek and macron
{"493", "", "\\{\\\\k\\{\\\\=\\{o\\}\\}\\}"}, // small letter o with ogonek and macron
{"536", "", "\\{\\\\cb\\{S\\}\\}"}, // capital letter S with comma below, require combelow
{"537", "", "\\{\\\\cb\\{s\\}\\}"}, // small letter S with comma below, require combelow
{"538", "", "\\{\\\\cb\\{T\\}\\}"}, // capital letter T with comma below, require combelow
{"539", "", "\\{\\\\cb\\{t\\}\\}"}, // small letter T with comma below, require combelow
{"727", "caron", "\\{\\\\v\\{\\}\\}"}, // Caron
{"", "Hacek", "\\{\\\\v\\{\\}\\}"}, // Caron
{"728", "breve", "\\{\\\\u\\{\\}\\}"}, // Breve
{"", "Breve", "\\{\\\\u\\{\\}\\}"}, // Breve
{"729", "dot", "\\{\\\\\\.\\{\\}\\}"}, // Dot above
{"730", "ring", "\\{\\\\r\\{\\}\\}"}, // Ring above
{"731", "ogon", "\\{\\\\k\\{\\}\\}"}, // Ogonek
{"733", "dblac", "\\{\\\\H\\{\\}\\}"}, // Double acute
{"949", "epsi", "\\$\\\\epsilon\\$"}, // Epsilon - double check
{"1013", "epsiv", "\\$\\\\varepsilonup\\$"}, // lunate epsilon, requires txfonts
{"1055", "", "\\{\\\\cyrchar\\\\CYRP\\}"}, // Cyrillic capital Pe
{"1082", "", "\\{\\\\cyrchar\\\\cyrk\\}"}, // Cyrillic small Ka
// {"2013", "", ""}, // NKO letter FA -- Maybe en dash = 0x2013?
// {"2014", "", ""}, // NKO letter FA -- Maybe em dash = 0x2014?
{"8192", "", "\\\\hspace\\{0.5em\\}"}, // en quad
{"8193", "", "\\\\hspace\\{1em\\}"}, // em quad
{"8196", "", "\\\\hspace\\{0.333em\\}"}, // Three-Per-Em Space
{"8197", "", "\\\\hspace\\{0.25em\\}"}, // Four-Per-Em Space
{"8198", "", "\\\\hspace\\{0.167em\\}"}, // Six-Per-Em Space
{"8208", "hyphen", "-"}, // Hyphen
{"8229", "nldr", "\\.\\."}, // Double dots - en leader
{"8451", "", "\\$\\\\deg\\$\\{C\\}"}, // Degree Celsius
{"8459", "Hscr", "\\$\\\\mathcal\\{H\\}\\$"}, // script capital H -- possibly use \mathscr
{"8460", "Hfr", "\\$\\\\mathbb\\{H\\}\\$"}, // black letter capital H -- requires e.g. amsfonts
{"8466", "Lscr", "\\$\\\\mathcal\\{L\\}\\$"}, // script capital L -- possibly use \mathscr
{"8467", "ell", "\\{\\\\ell\\}"}, // script small l
{"8469", "naturals", "\\$\\\\mathbb\\{N\\}\\$"}, // double struck capital N -- requires e.g. amsfonts
{"8486", "", "\\$\\{\\\\Omega\\}\\$"}, // Omega
{"8491", "angst", "\\{\\\\AA\\}"}, // Angstrom
{"8496", "Escr", "\\$\\\\mathcal\\{E\\}\\$"}, // script capital E
{"8531", "frac13", "\\$\\\\sfrac\\{1\\}\\{3\\}\\$"}, // Vulgar fraction one third
{"8532", "frac23", "\\$\\\\sfrac\\{2\\}\\{3\\}\\$"}, // Vulgar fraction two thirds
{"8533", "frac15", "\\$\\\\sfrac\\{1\\}\\{5\\}\\$"}, // Vulgar fraction one fifth
{"8534", "frac25", "\\$\\\\sfrac\\{2\\}\\{5\\}\\$"}, // Vulgar fraction two fifths
{"8535", "frac35", "\\$\\\\sfrac\\{3\\}\\{5\\}\\$"}, // Vulgar fraction three fifths
{"8536", "frac45", "\\$\\\\sfrac\\{4\\}\\{5\\}\\$"}, // Vulgar fraction four fifths
{"8537", "frac16", "\\$\\\\sfrac\\{1\\}\\{6\\}\\$"}, // Vulgar fraction one sixth
{"8538", "frac56", "\\$\\\\sfrac\\{5\\}\\{6\\}\\$"}, // Vulgar fraction five sixths
{"8539", "frac18", "\\$\\\\sfrac\\{1\\}\\{8\\}\\$"}, // Vulgar fraction one eighth
{"8540", "frac38", "\\$\\\\sfrac\\{3\\}\\{8\\}\\$"}, // Vulgar fraction three eighths
{"8541", "frac58", "\\$\\\\sfrac\\{5\\}\\{8\\}\\$"}, // Vulgar fraction five eighths
{"8542", "frac78", "\\$\\\\sfrac\\{7\\}\\{8\\}\\$"}, // Vulgar fraction seven eighths
{"8710", "", "\\$\\\\triangle\\$"}, // Increment - could use a more appropriate symbol
{"8714", "", "\\$\\\\in\\$"}, // Small element in
{"8729", "bullet", "\\$\\\\bullet\\$"}, // Bullet operator
{"8758", "ratio", ":"}, // Colon/ratio
{"8771", "sime", "\\$\\\\simeq\\$"}, // almost equal to = asymptotic to,
{"8776", "ap", "\\$\\\\approx\\$"}, // almost equal to = asymptotic to,
{"8810", "ll", "\\$\\\\ll\\$"}, // Much less than
{"", "Lt", "\\$\\\\ll\\$"}, // Much less than
{"8811", "gg", "\\$\\\\gg\\$"}, // Much greater than
{"", "Gt", "\\$\\\\gg\\$"}, // Much greater than
{"8819", "gsim", "\\$\\\\gtrsim\\$"}, // Greater than or equivalent to
{"8882", "vltri", "\\$\\\\triangleleft\\$"}, // Left triangle
{"8883", "vrtri", "\\$\\\\triangleright\\$"}, // Right triangle
{"8896", "xwedge", "\\$\\\\bigwedge\\$"}, // Big wedge
{"8897", "xvee", "\\$\\\\bigvee\\$"}, // Big vee
{"9426", "", "\\{\\\\copyright\\}"}, // circled small letter C
{"9633", "square", "\\$\\\\square\\$"}, // White square
{"9653", "utri", "\\$\\\\triangle\\$"}, // White up-pointing small triangle -- \vartriangle probably
// better but requires amssymb
{"10877", "les", "\\$\\\\leqslant\\$"}, // Less than slanted equal -- requires amssymb
{"10878", "ges", "\\$\\\\geqslant\\$"}, // Less than slanted equal -- requires amssymb
{"119978", "Oscr", "\\$\\\\mathcal\\{O\\}\\$"} // script capital O -- possibly use \mathscr
};
// List of combining accents, one row per combining character, in the format:
// {"decimal number of the combining character", "name of the LaTeX accent command (without the leading backslash)"}
// The constructor loads these rows into escapedAccents; format() then wraps the
// character preceding the accent as {\<command>{<character>}}.
// The command names are spliced into a regex replacement string, which is why
// some single-character commands are written with an escaping backslash.
// Rows that are commented out have no LaTeX command assigned.
private String[][] accentList = new String[][] {
{"768", "`"}, // Grave
{"769", "'"}, // Acute
{"770", "\\^"}, // Circumflex
{"771", "~"}, // Tilde
{"772", "="}, // Macron
{"773", "="}, // Overline - not completely correct
{"774", "u"}, // Breve
{"775", "\\."}, // Dot above
{"776", "\""}, // Diaeresis
{"777", "h"}, // Hook above
{"778", "r"}, // Ring
{"779", "H"}, // Double acute
{"780", "v"}, // Caron
{"781", "\\|"}, // Vertical line above
{"782", "U"}, // Double vertical line above
{"783", "G"}, // Double grave
{"784", "textdotbreve"}, // Candrabindu
{"785", "t"}, // Inverted breve
// {"786", ""}, // Turned comma above
// {"787", ""}, // Comma above
// {"788", ""}, // Reversed comma above
// {"789", ""}, // Comma above right
{"790", "textsubgrave"}, // Grave accent below - requires tipa
{"791", "textsubacute"}, // Acute accent below - requires tipa
{"792", "textadvancing"}, // Left tack below - requires tipa
{"793", "textretracting"}, // Right tack below - requires tipa
// {"794", ""}, // Left angle above
// {"795", ""}, // Horn
{"796", "textsublhalfring"}, // Left half ring below - requires tipa
{"797", "textraising"}, // Up tack below - requires tipa
{"798", "textlowering"}, // Down tack below - requires tipa
{"799", "textsubplus"}, // Plus sign below - requires tipa
// {"800", ""}, // Minus sign below
// {"801", ""}, // Palatalized hook below
// {"802", ""}, // Retroflex hook below
{"803", "d"}, // Dot below
{"804", "textsubumlaut"}, // Diaeresis below - requires tipa
{"805", "textsubring"}, // Ring below - requires tipa
{"806", "cb"}, // Comma below - requires combelow
{"807", "c"}, // Cedilla
{"808", "k"}, // Ogonek
{"809", "textsyllabic"}, // Vertical line below - requires tipa
{"810", "textsubbridge"}, // Bridge below - requires tipa
{"811", "textsubw"}, // Inverted double arch below - requires tipa
{"812", "textsubwedge"}, // Caron below
{"813", "textsubcircum"}, // Circumflex accent below - requires tipa
// {"814", ""}, // Breve below
{"815", "textsubarch"}, // Inverted breve below - requires tipa
{"816", "textsubtilde"}, // Tilde below - requires tipa
{"817", "b"}, // Macron below - not completely correct
{"818", "b"}, // Underline
{"819", "subdoublebar"}, // Double low line -- requires extraipa
{"820", "textsuperimposetilde"}, // Tilde overlay - requires tipa
// {"821", ""}, // Short stroke overlay
// {"822", ""}, // Long stroke overlay
// {"823", ""}, // Short solidus overlay
// {"824", ""}, // Long solidus overlay
{"825", "textsubrhalfring"}, // Right half ring below - requires tipa
{"826", "textinvsubbridge"}, // Inverted bridge below - requires tipa
{"827", "textsubsquare"}, // Square below - requires tipa
{"828", "textseagull"}, // Seagull below - requires tipa
{"829", "textovercross"}, // X above - requires tipa
// {"830", ""}, // Vertical tilde
// {"831", ""}, // Double overline
// {"832", ""}, // Grave tone mark
// {"833", ""}, // Acute tone mark
// {"834", ""}, // Greek perispomeni
// {"835", ""}, // Greek koronis
// {"836", ""}, // Greek dialytika tonos
// {"837", ""}, // Greek ypogegrammeni
{"838", "overbridge"}, // Bridge above - requires extraipa
{"839", "subdoublebar"}, // Equals sign below - requires extraipa
{"840", "subdoublevert"}, // Double vertical line below - requires extraipa
{"841", "subcorner"}, // Left angle below - requires extraipa
{"842", "crtilde"}, // Not tilde above - requires extraipa
{"843", "dottedtilde"}, // Homothetic above - requires extraipa
{"844", "doubletilde"}, // Almost equal to above - requires extraipa
{"845", "spreadlips"}, // Left right arrow below - requires extraipa
{"846", "whistle"}, // Upwards arrow below - requires extraipa
// {"864", ""}, // Double tilde
// {"865", ""}, // Double inverted breve
{"866", "sliding"}, // Double rightwards arrow below - requires extraipa
};
// Lookup tables filled once by the constructor from conversionList and accentList.
// Named entity including its delimiters ("&" + name + ";") -> LaTeX replacement string
private HashMap<String, String> escapedSymbols = new HashMap<String, String>();
// Number of a combining accent character -> LaTeX accent command name
private HashMap<Integer, String> escapedAccents = new HashMap<Integer, String>();
// Number of a character -> LaTeX replacement string
private HashMap<Integer, String> numSymbols = new HashMap<Integer, String>();
/**
 * Builds the lookup tables used by {@link #format(String)}.
 * <p>
 * Every row of conversionList that carries a LaTeX replacement is registered
 * under its entity name (if any) and under its number (if any); every row of
 * accentList is registered under its number.
 */
public HTMLConverter() {
    super();

    for (String[] row : conversionList) {
        String latex = row[2];
        if (latex.length() < 1) {
            // Rows without a LaTeX replacement are not included at all
            continue;
        }

        String entityName = row[1];
        if (entityName.length() >= 1) {
            escapedSymbols.put("&" + entityName + ";", latex);
        }

        String entityNumber = row[0];
        if (entityNumber.length() >= 1) {
            numSymbols.put(Integer.decode(entityNumber), latex);
        }
    }

    for (String[] accent : accentList) {
        escapedAccents.put(Integer.decode(accent[0]), accent[1]);
    }
}
// Numeric character reference: "&#", optional hex marker, leading zeros, digits, ";"
private static final Pattern NUMERIC_ENTITY = Pattern.compile("&#([xX]?)(0*)(\\p{XDigit}+);");
// A numeric character reference together with the single character in front of it,
// i.e. the base character a combining accent is applied to
private static final Pattern ACCENTED_CHAR = Pattern.compile("(.)&#([xX]?)(0*)(\\p{XDigit}+);");

/**
 * Converts HTML markup and character entities in the given text to LaTeX.
 * <p>
 * The steps are, in order: rewrite {@code <sup>}/{@code <sub>} elements,
 * drop remaining tags, replace named entities, replace numeric character
 * references, turn "base character + combining accent reference" into a
 * LaTeX accent command, and finally report (on {@code System.err}) every
 * entity that could not be converted.
 *
 * @param text the HTML fragment to convert, may be {@code null}
 * @return the converted and trimmed text, or {@code null} if {@code text} was {@code null}
 */
public String format(String text) {
    if (text == null) {
        return null;
    }

    // Deal with the form <sup>k</sup> and <sub>k</sub>.
    // Whether the result is in text or equation form can be controlled
    // from the "Advanced settings" tab.
    if (Globals.prefs.getBoolean("useConvertToEquation")) {
        text = text.replaceAll("<sup>([^<]+)</sup>", "\\$\\^\\{$1\\}\\$");
        text = text.replaceAll("<sub>([^<]+)</sub>", "\\$_\\{$1\\}\\$");
    } else {
        text = text.replaceAll("<sup>([^<]+)</sup>", "\\\\textsuperscript\\{$1\\}");
        text = text.replaceAll("<sub>([^<]+)</sub>", "\\\\textsubscript\\{$1\\}");
    }

    // TODO: maybe rewrite this based on regular expressions instead
    // Note that (at least) the IEEE Xplore fetcher must be fixed as it relies on the current way to
    // remove tags for its image alt-tag to equation converter
    // NOTE: a '<' that readTag does not accept as a tag is dropped as well,
    // because the character itself is never appended.
    StringBuffer sb = new StringBuffer();
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (c == '<') {
            i = readTag(text, sb, i);
        } else {
            sb.append(c);
        }
    }
    text = sb.toString();

    // Handle text based HTML entities
    Set<String> patterns = escapedSymbols.keySet();
    for (String pattern : patterns) {
        text = text.replaceAll(pattern, escapedSymbols.get(pattern));
    }

    // Handle numerical HTML entities and combining accents
    text = replaceNumericSymbols(text);
    text = replaceCombiningAccents(text);

    // Whatever numeric reference is left at this point has no known conversion
    reportUnconvertedNumericEntities(text);

    // Remove $$ in case of two adjacent conversions
    text = text.replace("$$", "");

    // Find non-covered special characters with alphabetic codes
    Matcher named = Pattern.compile("&(\\w+);").matcher(text);
    while (named.find()) {
        System.err.println("HTML escaped char not converted: " + named.group(1));
    }

    return text.trim();
}

// Parses the digits of a numeric character reference; returns null when they are not a valid int.
private static Integer parseCharacterNumber(String hexMarker, String digits) {
    try {
        return Integer.valueOf(digits, hexMarker.length() == 0 ? 10 : 16);
    } catch (NumberFormatException ignored) {
        // Hex digits in a decimal reference, or a value that overflows int:
        // treat as an unknown character instead of aborting the conversion.
        return null;
    }
}

// Replaces every numeric character reference listed in numSymbols with its LaTeX form.
private String replaceNumericSymbols(String text) {
    Matcher m = NUMERIC_ENTITY.matcher(text);
    StringBuffer out = new StringBuffer();
    while (m.find()) {
        Integer num = parseCharacterNumber(m.group(1), m.group(3));
        String latex = (num == null) ? null : numSymbols.get(num);
        // Table values are already written in replacement syntax;
        // unknown references are copied through unchanged.
        m.appendReplacement(out, (latex != null) ? latex : Matcher.quoteReplacement(m.group()));
    }
    m.appendTail(out);
    return out.toString();
}

// Replaces "character + combining accent reference" with {\accent{character}}.
private String replaceCombiningAccents(String text) {
    Matcher m = ACCENTED_CHAR.matcher(text);
    StringBuffer out = new StringBuffer();
    while (m.find()) {
        Integer num = parseCharacterNumber(m.group(2), m.group(4));
        String accent = (num == null) ? null : escapedAccents.get(num);
        if (accent == null) {
            m.appendReplacement(out, Matcher.quoteReplacement(m.group()));
            continue;
        }
        String base = m.group(1);
        String argument;
        if (base.equals("i")) {
            argument = "\\\\i"; // accents go on the dotless i
        } else if (base.equals("j")) {
            argument = "\\\\j"; // accents go on the dotless j
        } else {
            // The base character is arbitrary input: quote it so that '$' and '\'
            // are not interpreted by the replacement syntax.
            argument = Matcher.quoteReplacement(base);
        }
        m.appendReplacement(out, "\\{\\\\" + accent + "\\{" + argument + "\\}\\}");
    }
    m.appendTail(out);
    return out.toString();
}

// Prints a diagnostic on System.err for every numeric character reference still present in the text.
private void reportUnconvertedNumericEntities(String text) {
    Matcher m = NUMERIC_ENTITY.matcher(text);
    while (m.find()) {
        Integer num = parseCharacterNumber(m.group(1), m.group(3));
        System.err.println("HTML escaped char not converted: " + m.group(1) + m.group(2) + m.group(3)
                + " = " + (num == null ? "invalid number" : num.toString()));
    }
}
// Exclusive upper bound on the distance between a '<' and the next '>' for
// readTag to treat the span as a tag and skip it.
private final int MAX_TAG_LENGTH = 30;
/*private final int MAX_CHAR_LENGTH = 10;
private int readHtmlChar(String text, StringBuffer sb, int position) {
// Have just read the < character that starts the tag.
int index = text.indexOf(';', position);
if ((index > position) && (index-position < MAX_CHAR_LENGTH)) {
//String code = text.substring(position, index);
//System.out.println("Removed code: "+text.substring(position, index));
return index; // Just skip the tag.
} else return position; // Don't do anything.
}*/
/**
 * Decides whether the '<' at {@code position} opens a tag that should be skipped.
 *
 * @param text     the text being scanned
 * @param sb       output buffer of the caller; not used by this method
 * @param position index of the '<' character that was just read
 * @return the index of the closing '>' if one follows within MAX_TAG_LENGTH
 *         characters, otherwise {@code position} unchanged
 */
private int readTag(String text, StringBuffer sb, int position) {
    int closing = text.indexOf('>', position);

    // No '>' after the start, or too far away to be a tag: leave the position alone
    if (closing <= position || closing - position >= MAX_TAG_LENGTH) {
        return position;
    }

    // Jump to the closing bracket so the caller continues after the tag
    return closing;
}
}