From 6e19183171315aaa59b67da5a8c02bcd7a794500 Mon Sep 17 00:00:00 2001 From: Kevin Ballard Date: Fri, 2 May 2014 00:16:08 -0700 Subject: [PATCH 1/6] Add libhtml libhtml provides escaping/unescaping of HTML entities. It matches the HTML5 parsing rules as closely as possible. It provides convenience functions to escape/unescape, helpers to perform the escaping/unescaping during Show, and Writers that can escape/unescape in a streaming manner. References: http://www.w3.org/html/wg/drafts/html/CR/syntax.html --- mk/crates.mk | 5 +- src/README.md | 1 + src/libhtml/entity.rs | 2137 ++++++++++++++++++++++++++++++ src/libhtml/escape.rs | 497 +++++++ src/libhtml/fmt.rs | 43 + src/libhtml/lib.rs | 146 ++ src/librustdoc/html/escape.rs | 53 - src/librustdoc/html/highlight.rs | 2 +- src/librustdoc/lib.rs | 2 +- src/librustdoc/markdown.rs | 2 +- 10 files changed, 2830 insertions(+), 58 deletions(-) create mode 100644 src/libhtml/entity.rs create mode 100644 src/libhtml/escape.rs create mode 100644 src/libhtml/fmt.rs create mode 100644 src/libhtml/lib.rs delete mode 100644 src/librustdoc/html/escape.rs diff --git a/mk/crates.mk b/mk/crates.mk index a0a0bc800c45a..e17245e1fd476 100644 --- a/mk/crates.mk +++ b/mk/crates.mk @@ -51,7 +51,7 @@ TARGET_CRATES := libc std green rustuv native flate arena glob term semver \ uuid serialize sync getopts collections num test time rand \ - workcache url log regex graphviz core rlibc alloc + workcache url log regex graphviz core rlibc alloc html HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros fmt_macros CRATES := $(TARGET_CRATES) $(HOST_CRATES) TOOLS := compiletest rustdoc rustc @@ -68,7 +68,7 @@ DEPS_syntax := std term serialize collections log fmt_macros DEPS_rustc := syntax native:rustllvm flate arena serialize sync getopts \ collections time log graphviz DEPS_rustdoc := rustc native:hoedown serialize sync getopts collections \ - test time + test time html DEPS_flate := std native:miniz DEPS_arena := std collections 
DEPS_graphviz := std @@ -92,6 +92,7 @@ DEPS_log := std sync DEPS_regex := std collections DEPS_regex_macros = syntax std regex DEPS_fmt_macros = std +DEPS_html := std TOOL_DEPS_compiletest := test green rustuv getopts TOOL_DEPS_rustdoc := rustdoc native diff --git a/src/README.md b/src/README.md index de9a793bafc96..5548930475f5a 100644 --- a/src/README.md +++ b/src/README.md @@ -19,6 +19,7 @@ Source layout: | `libfourcc/` | Data format identifier library | | `libgetopts/` | Get command-line-options library | | `libglob/` | Unix glob patterns library | +| `libhtml/` | HTML escaping and unescaping library | | `libregex/` | Regular expressions | | `libsemver/` | Rust's semantic versioning library | | `libserialize/` | Encode-Decode types library | diff --git a/src/libhtml/entity.rs b/src/libhtml/entity.rs new file mode 100644 index 0000000000000..aff28b6c2082d --- /dev/null +++ b/src/libhtml/entity.rs @@ -0,0 +1,2137 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +pub static ENTITIES: &'static [(&'static str, &'static str, bool)] = &'static [ + ("Æ", "\xc6", false), + ("&", "&", false), + ("Á", "\xc1", false), + ("&Abreve", "\u0102", true), + ("Â", "\xc2", false), + ("&Acy", "\u0410", true), + ("&Afr", "\U0001d504", true), + ("À", "\xc0", false), + ("&Alpha", "\u0391", true), + ("&Amacr", "\u0100", true), + ("&And", "\u2a53", true), + ("&Aogon", "\u0104", true), + ("&Aopf", "\U0001d538", true), + ("&ApplyFunction", "\u2061", true), + ("Å", "\xc5", false), + ("&Ascr", "\U0001d49c", true), + ("&Assign", "\u2254", true), + ("Ã", "\xc3", false), + ("Ä", "\xc4", false), + ("&Backslash", "\u2216", true), + ("&Barv", "\u2ae7", true), + ("&Barwed", "\u2306", true), + ("&Bcy", "\u0411", true), + ("&Because", "\u2235", true), + ("&Bernoullis", "\u212c", true), + ("&Beta", "\u0392", true), + ("&Bfr", "\U0001d505", true), + ("&Bopf", "\U0001d539", true), + ("&Breve", "\u02d8", true), + ("&Bscr", "\u212c", true), + ("&Bumpeq", "\u224e", true), + ("&CHcy", "\u0427", true), + ("©", "\xa9", false), + ("&Cacute", "\u0106", true), + ("&Cap", "\u22d2", true), + ("&CapitalDifferentialD", "\u2145", true), + ("&Cayleys", "\u212d", true), + ("&Ccaron", "\u010c", true), + ("Ç", "\xc7", false), + ("&Ccirc", "\u0108", true), + ("&Cconint", "\u2230", true), + ("&Cdot", "\u010a", true), + ("&Cedilla", "\xb8", true), + ("&CenterDot", "\xb7", true), + ("&Cfr", "\u212d", true), + ("&Chi", "\u03a7", true), + ("&CircleDot", "\u2299", true), + ("&CircleMinus", "\u2296", true), + ("&CirclePlus", "\u2295", true), + ("&CircleTimes", "\u2297", true), + ("&ClockwiseContourIntegral", "\u2232", true), + ("&CloseCurlyDoubleQuote", "\u201d", true), + ("&CloseCurlyQuote", "\u2019", true), + ("&Colon", "\u2237", true), + ("&Colone", "\u2a74", true), + ("&Congruent", "\u2261", true), + ("&Conint", "\u222f", true), + ("&ContourIntegral", "\u222e", true), + ("&Copf", "\u2102", true), + ("&Coproduct", "\u2210", true), + ("&CounterClockwiseContourIntegral", "\u2233", 
true), + ("&Cross", "\u2a2f", true), + ("&Cscr", "\U0001d49e", true), + ("&Cup", "\u22d3", true), + ("&CupCap", "\u224d", true), + ("&DD", "\u2145", true), + ("&DDotrahd", "\u2911", true), + ("&DJcy", "\u0402", true), + ("&DScy", "\u0405", true), + ("&DZcy", "\u040f", true), + ("&Dagger", "\u2021", true), + ("&Darr", "\u21a1", true), + ("&Dashv", "\u2ae4", true), + ("&Dcaron", "\u010e", true), + ("&Dcy", "\u0414", true), + ("&Del", "\u2207", true), + ("&Delta", "\u0394", true), + ("&Dfr", "\U0001d507", true), + ("&DiacriticalAcute", "\xb4", true), + ("&DiacriticalDot", "\u02d9", true), + ("&DiacriticalDoubleAcute", "\u02dd", true), + ("&DiacriticalGrave", "`", true), + ("&DiacriticalTilde", "\u02dc", true), + ("&Diamond", "\u22c4", true), + ("&DifferentialD", "\u2146", true), + ("&Dopf", "\U0001d53b", true), + ("&Dot", "\xa8", true), + ("&DotDot", "\u20dc", true), + ("&DotEqual", "\u2250", true), + ("&DoubleContourIntegral", "\u222f", true), + ("&DoubleDot", "\xa8", true), + ("&DoubleDownArrow", "\u21d3", true), + ("&DoubleLeftArrow", "\u21d0", true), + ("&DoubleLeftRightArrow", "\u21d4", true), + ("&DoubleLeftTee", "\u2ae4", true), + ("&DoubleLongLeftArrow", "\u27f8", true), + ("&DoubleLongLeftRightArrow", "\u27fa", true), + ("&DoubleLongRightArrow", "\u27f9", true), + ("&DoubleRightArrow", "\u21d2", true), + ("&DoubleRightTee", "\u22a8", true), + ("&DoubleUpArrow", "\u21d1", true), + ("&DoubleUpDownArrow", "\u21d5", true), + ("&DoubleVerticalBar", "\u2225", true), + ("&DownArrow", "\u2193", true), + ("&DownArrowBar", "\u2913", true), + ("&DownArrowUpArrow", "\u21f5", true), + ("&DownBreve", "\u0311", true), + ("&DownLeftRightVector", "\u2950", true), + ("&DownLeftTeeVector", "\u295e", true), + ("&DownLeftVector", "\u21bd", true), + ("&DownLeftVectorBar", "\u2956", true), + ("&DownRightTeeVector", "\u295f", true), + ("&DownRightVector", "\u21c1", true), + ("&DownRightVectorBar", "\u2957", true), + ("&DownTee", "\u22a4", true), + ("&DownTeeArrow", "\u21a7", true), 
+ ("&Downarrow", "\u21d3", true), + ("&Dscr", "\U0001d49f", true), + ("&Dstrok", "\u0110", true), + ("&ENG", "\u014a", true), + ("Ð", "\xd0", false), + ("É", "\xc9", false), + ("&Ecaron", "\u011a", true), + ("Ê", "\xca", false), + ("&Ecy", "\u042d", true), + ("&Edot", "\u0116", true), + ("&Efr", "\U0001d508", true), + ("È", "\xc8", false), + ("&Element", "\u2208", true), + ("&Emacr", "\u0112", true), + ("&EmptySmallSquare", "\u25fb", true), + ("&EmptyVerySmallSquare", "\u25ab", true), + ("&Eogon", "\u0118", true), + ("&Eopf", "\U0001d53c", true), + ("&Epsilon", "\u0395", true), + ("&Equal", "\u2a75", true), + ("&EqualTilde", "\u2242", true), + ("&Equilibrium", "\u21cc", true), + ("&Escr", "\u2130", true), + ("&Esim", "\u2a73", true), + ("&Eta", "\u0397", true), + ("Ë", "\xcb", false), + ("&Exists", "\u2203", true), + ("&ExponentialE", "\u2147", true), + ("&Fcy", "\u0424", true), + ("&Ffr", "\U0001d509", true), + ("&FilledSmallSquare", "\u25fc", true), + ("&FilledVerySmallSquare", "\u25aa", true), + ("&Fopf", "\U0001d53d", true), + ("&ForAll", "\u2200", true), + ("&Fouriertrf", "\u2131", true), + ("&Fscr", "\u2131", true), + ("&GJcy", "\u0403", true), + (">", ">", false), + ("&Gamma", "\u0393", true), + ("&Gammad", "\u03dc", true), + ("&Gbreve", "\u011e", true), + ("&Gcedil", "\u0122", true), + ("&Gcirc", "\u011c", true), + ("&Gcy", "\u0413", true), + ("&Gdot", "\u0120", true), + ("&Gfr", "\U0001d50a", true), + ("&Gg", "\u22d9", true), + ("&Gopf", "\U0001d53e", true), + ("&GreaterEqual", "\u2265", true), + ("&GreaterEqualLess", "\u22db", true), + ("&GreaterFullEqual", "\u2267", true), + ("&GreaterGreater", "\u2aa2", true), + ("&GreaterLess", "\u2277", true), + ("&GreaterSlantEqual", "\u2a7e", true), + ("&GreaterTilde", "\u2273", true), + ("&Gscr", "\U0001d4a2", true), + ("&Gt", "\u226b", true), + ("&HARDcy", "\u042a", true), + ("&Hacek", "\u02c7", true), + ("&Hat", "^", true), + ("&Hcirc", "\u0124", true), + ("&Hfr", "\u210c", true), + ("&HilbertSpace", "\u210b", 
true), + ("&Hopf", "\u210d", true), + ("&HorizontalLine", "\u2500", true), + ("&Hscr", "\u210b", true), + ("&Hstrok", "\u0126", true), + ("&HumpDownHump", "\u224e", true), + ("&HumpEqual", "\u224f", true), + ("&IEcy", "\u0415", true), + ("&IJlig", "\u0132", true), + ("&IOcy", "\u0401", true), + ("Í", "\xcd", false), + ("Î", "\xce", false), + ("&Icy", "\u0418", true), + ("&Idot", "\u0130", true), + ("&Ifr", "\u2111", true), + ("Ì", "\xcc", false), + ("&Im", "\u2111", true), + ("&Imacr", "\u012a", true), + ("&ImaginaryI", "\u2148", true), + ("&Implies", "\u21d2", true), + ("&Int", "\u222c", true), + ("&Integral", "\u222b", true), + ("&Intersection", "\u22c2", true), + ("&InvisibleComma", "\u2063", true), + ("&InvisibleTimes", "\u2062", true), + ("&Iogon", "\u012e", true), + ("&Iopf", "\U0001d540", true), + ("&Iota", "\u0399", true), + ("&Iscr", "\u2110", true), + ("&Itilde", "\u0128", true), + ("&Iukcy", "\u0406", true), + ("Ï", "\xcf", false), + ("&Jcirc", "\u0134", true), + ("&Jcy", "\u0419", true), + ("&Jfr", "\U0001d50d", true), + ("&Jopf", "\U0001d541", true), + ("&Jscr", "\U0001d4a5", true), + ("&Jsercy", "\u0408", true), + ("&Jukcy", "\u0404", true), + ("&KHcy", "\u0425", true), + ("&KJcy", "\u040c", true), + ("&Kappa", "\u039a", true), + ("&Kcedil", "\u0136", true), + ("&Kcy", "\u041a", true), + ("&Kfr", "\U0001d50e", true), + ("&Kopf", "\U0001d542", true), + ("&Kscr", "\U0001d4a6", true), + ("&LJcy", "\u0409", true), + ("<", "<", false), + ("&Lacute", "\u0139", true), + ("&Lambda", "\u039b", true), + ("&Lang", "\u27ea", true), + ("&Laplacetrf", "\u2112", true), + ("&Larr", "\u219e", true), + ("&Lcaron", "\u013d", true), + ("&Lcedil", "\u013b", true), + ("&Lcy", "\u041b", true), + ("&LeftAngleBracket", "\u27e8", true), + ("&LeftArrow", "\u2190", true), + ("&LeftArrowBar", "\u21e4", true), + ("&LeftArrowRightArrow", "\u21c6", true), + ("&LeftCeiling", "\u2308", true), + ("&LeftDoubleBracket", "\u27e6", true), + ("&LeftDownTeeVector", "\u2961", true), + 
("&LeftDownVector", "\u21c3", true), + ("&LeftDownVectorBar", "\u2959", true), + ("&LeftFloor", "\u230a", true), + ("&LeftRightArrow", "\u2194", true), + ("&LeftRightVector", "\u294e", true), + ("&LeftTee", "\u22a3", true), + ("&LeftTeeArrow", "\u21a4", true), + ("&LeftTeeVector", "\u295a", true), + ("&LeftTriangle", "\u22b2", true), + ("&LeftTriangleBar", "\u29cf", true), + ("&LeftTriangleEqual", "\u22b4", true), + ("&LeftUpDownVector", "\u2951", true), + ("&LeftUpTeeVector", "\u2960", true), + ("&LeftUpVector", "\u21bf", true), + ("&LeftUpVectorBar", "\u2958", true), + ("&LeftVector", "\u21bc", true), + ("&LeftVectorBar", "\u2952", true), + ("&Leftarrow", "\u21d0", true), + ("&Leftrightarrow", "\u21d4", true), + ("&LessEqualGreater", "\u22da", true), + ("&LessFullEqual", "\u2266", true), + ("&LessGreater", "\u2276", true), + ("&LessLess", "\u2aa1", true), + ("&LessSlantEqual", "\u2a7d", true), + ("&LessTilde", "\u2272", true), + ("&Lfr", "\U0001d50f", true), + ("&Ll", "\u22d8", true), + ("&Lleftarrow", "\u21da", true), + ("&Lmidot", "\u013f", true), + ("&LongLeftArrow", "\u27f5", true), + ("&LongLeftRightArrow", "\u27f7", true), + ("&LongRightArrow", "\u27f6", true), + ("&Longleftarrow", "\u27f8", true), + ("&Longleftrightarrow", "\u27fa", true), + ("&Longrightarrow", "\u27f9", true), + ("&Lopf", "\U0001d543", true), + ("&LowerLeftArrow", "\u2199", true), + ("&LowerRightArrow", "\u2198", true), + ("&Lscr", "\u2112", true), + ("&Lsh", "\u21b0", true), + ("&Lstrok", "\u0141", true), + ("&Lt", "\u226a", true), + ("&Map", "\u2905", true), + ("&Mcy", "\u041c", true), + ("&MediumSpace", "\u205f", true), + ("&Mellintrf", "\u2133", true), + ("&Mfr", "\U0001d510", true), + ("&MinusPlus", "\u2213", true), + ("&Mopf", "\U0001d544", true), + ("&Mscr", "\u2133", true), + ("&Mu", "\u039c", true), + ("&NJcy", "\u040a", true), + ("&Nacute", "\u0143", true), + ("&Ncaron", "\u0147", true), + ("&Ncedil", "\u0145", true), + ("&Ncy", "\u041d", true), + ("&NegativeMediumSpace", 
"\u200b", true), + ("&NegativeThickSpace", "\u200b", true), + ("&NegativeThinSpace", "\u200b", true), + ("&NegativeVeryThinSpace", "\u200b", true), + ("&NestedGreaterGreater", "\u226b", true), + ("&NestedLessLess", "\u226a", true), + ("&NewLine", "\n", true), + ("&Nfr", "\U0001d511", true), + ("&NoBreak", "\u2060", true), + ("&NonBreakingSpace", "\xa0", true), + ("&Nopf", "\u2115", true), + ("&Not", "\u2aec", true), + ("&NotCongruent", "\u2262", true), + ("&NotCupCap", "\u226d", true), + ("&NotDoubleVerticalBar", "\u2226", true), + ("&NotElement", "\u2209", true), + ("&NotEqual", "\u2260", true), + ("&NotEqualTilde", "\u2242\u0338", true), + ("&NotExists", "\u2204", true), + ("&NotGreater", "\u226f", true), + ("&NotGreaterEqual", "\u2271", true), + ("&NotGreaterFullEqual", "\u2267\u0338", true), + ("&NotGreaterGreater", "\u226b\u0338", true), + ("&NotGreaterLess", "\u2279", true), + ("&NotGreaterSlantEqual", "\u2a7e\u0338", true), + ("&NotGreaterTilde", "\u2275", true), + ("&NotHumpDownHump", "\u224e\u0338", true), + ("&NotHumpEqual", "\u224f\u0338", true), + ("&NotLeftTriangle", "\u22ea", true), + ("&NotLeftTriangleBar", "\u29cf\u0338", true), + ("&NotLeftTriangleEqual", "\u22ec", true), + ("&NotLess", "\u226e", true), + ("&NotLessEqual", "\u2270", true), + ("&NotLessGreater", "\u2278", true), + ("&NotLessLess", "\u226a\u0338", true), + ("&NotLessSlantEqual", "\u2a7d\u0338", true), + ("&NotLessTilde", "\u2274", true), + ("&NotNestedGreaterGreater", "\u2aa2\u0338", true), + ("&NotNestedLessLess", "\u2aa1\u0338", true), + ("&NotPrecedes", "\u2280", true), + ("&NotPrecedesEqual", "\u2aaf\u0338", true), + ("&NotPrecedesSlantEqual", "\u22e0", true), + ("&NotReverseElement", "\u220c", true), + ("&NotRightTriangle", "\u22eb", true), + ("&NotRightTriangleBar", "\u29d0\u0338", true), + ("&NotRightTriangleEqual", "\u22ed", true), + ("&NotSquareSubset", "\u228f\u0338", true), + ("&NotSquareSubsetEqual", "\u22e2", true), + ("&NotSquareSuperset", "\u2290\u0338", true), + 
("&NotSquareSupersetEqual", "\u22e3", true), + ("&NotSubset", "\u2282\u20d2", true), + ("&NotSubsetEqual", "\u2288", true), + ("&NotSucceeds", "\u2281", true), + ("&NotSucceedsEqual", "\u2ab0\u0338", true), + ("&NotSucceedsSlantEqual", "\u22e1", true), + ("&NotSucceedsTilde", "\u227f\u0338", true), + ("&NotSuperset", "\u2283\u20d2", true), + ("&NotSupersetEqual", "\u2289", true), + ("&NotTilde", "\u2241", true), + ("&NotTildeEqual", "\u2244", true), + ("&NotTildeFullEqual", "\u2247", true), + ("&NotTildeTilde", "\u2249", true), + ("&NotVerticalBar", "\u2224", true), + ("&Nscr", "\U0001d4a9", true), + ("Ñ", "\xd1", false), + ("&Nu", "\u039d", true), + ("&OElig", "\u0152", true), + ("Ó", "\xd3", false), + ("Ô", "\xd4", false), + ("&Ocy", "\u041e", true), + ("&Odblac", "\u0150", true), + ("&Ofr", "\U0001d512", true), + ("Ò", "\xd2", false), + ("&Omacr", "\u014c", true), + ("&Omega", "\u03a9", true), + ("&Omicron", "\u039f", true), + ("&Oopf", "\U0001d546", true), + ("&OpenCurlyDoubleQuote", "\u201c", true), + ("&OpenCurlyQuote", "\u2018", true), + ("&Or", "\u2a54", true), + ("&Oscr", "\U0001d4aa", true), + ("Ø", "\xd8", false), + ("Õ", "\xd5", false), + ("&Otimes", "\u2a37", true), + ("Ö", "\xd6", false), + ("&OverBar", "\u203e", true), + ("&OverBrace", "\u23de", true), + ("&OverBracket", "\u23b4", true), + ("&OverParenthesis", "\u23dc", true), + ("&PartialD", "\u2202", true), + ("&Pcy", "\u041f", true), + ("&Pfr", "\U0001d513", true), + ("&Phi", "\u03a6", true), + ("&Pi", "\u03a0", true), + ("&PlusMinus", "\xb1", true), + ("&Poincareplane", "\u210c", true), + ("&Popf", "\u2119", true), + ("&Pr", "\u2abb", true), + ("&Precedes", "\u227a", true), + ("&PrecedesEqual", "\u2aaf", true), + ("&PrecedesSlantEqual", "\u227c", true), + ("&PrecedesTilde", "\u227e", true), + ("&Prime", "\u2033", true), + ("&Product", "\u220f", true), + ("&Proportion", "\u2237", true), + ("&Proportional", "\u221d", true), + ("&Pscr", "\U0001d4ab", true), + ("&Psi", "\u03a8", true), + (""", "\"", 
false), + ("&Qfr", "\U0001d514", true), + ("&Qopf", "\u211a", true), + ("&Qscr", "\U0001d4ac", true), + ("&RBarr", "\u2910", true), + ("®", "\xae", false), + ("&Racute", "\u0154", true), + ("&Rang", "\u27eb", true), + ("&Rarr", "\u21a0", true), + ("&Rarrtl", "\u2916", true), + ("&Rcaron", "\u0158", true), + ("&Rcedil", "\u0156", true), + ("&Rcy", "\u0420", true), + ("&Re", "\u211c", true), + ("&ReverseElement", "\u220b", true), + ("&ReverseEquilibrium", "\u21cb", true), + ("&ReverseUpEquilibrium", "\u296f", true), + ("&Rfr", "\u211c", true), + ("&Rho", "\u03a1", true), + ("&RightAngleBracket", "\u27e9", true), + ("&RightArrow", "\u2192", true), + ("&RightArrowBar", "\u21e5", true), + ("&RightArrowLeftArrow", "\u21c4", true), + ("&RightCeiling", "\u2309", true), + ("&RightDoubleBracket", "\u27e7", true), + ("&RightDownTeeVector", "\u295d", true), + ("&RightDownVector", "\u21c2", true), + ("&RightDownVectorBar", "\u2955", true), + ("&RightFloor", "\u230b", true), + ("&RightTee", "\u22a2", true), + ("&RightTeeArrow", "\u21a6", true), + ("&RightTeeVector", "\u295b", true), + ("&RightTriangle", "\u22b3", true), + ("&RightTriangleBar", "\u29d0", true), + ("&RightTriangleEqual", "\u22b5", true), + ("&RightUpDownVector", "\u294f", true), + ("&RightUpTeeVector", "\u295c", true), + ("&RightUpVector", "\u21be", true), + ("&RightUpVectorBar", "\u2954", true), + ("&RightVector", "\u21c0", true), + ("&RightVectorBar", "\u2953", true), + ("&Rightarrow", "\u21d2", true), + ("&Ropf", "\u211d", true), + ("&RoundImplies", "\u2970", true), + ("&Rrightarrow", "\u21db", true), + ("&Rscr", "\u211b", true), + ("&Rsh", "\u21b1", true), + ("&RuleDelayed", "\u29f4", true), + ("&SHCHcy", "\u0429", true), + ("&SHcy", "\u0428", true), + ("&SOFTcy", "\u042c", true), + ("&Sacute", "\u015a", true), + ("&Sc", "\u2abc", true), + ("&Scaron", "\u0160", true), + ("&Scedil", "\u015e", true), + ("&Scirc", "\u015c", true), + ("&Scy", "\u0421", true), + ("&Sfr", "\U0001d516", true), + ("&ShortDownArrow", 
"\u2193", true), + ("&ShortLeftArrow", "\u2190", true), + ("&ShortRightArrow", "\u2192", true), + ("&ShortUpArrow", "\u2191", true), + ("&Sigma", "\u03a3", true), + ("&SmallCircle", "\u2218", true), + ("&Sopf", "\U0001d54a", true), + ("&Sqrt", "\u221a", true), + ("&Square", "\u25a1", true), + ("&SquareIntersection", "\u2293", true), + ("&SquareSubset", "\u228f", true), + ("&SquareSubsetEqual", "\u2291", true), + ("&SquareSuperset", "\u2290", true), + ("&SquareSupersetEqual", "\u2292", true), + ("&SquareUnion", "\u2294", true), + ("&Sscr", "\U0001d4ae", true), + ("&Star", "\u22c6", true), + ("&Sub", "\u22d0", true), + ("&Subset", "\u22d0", true), + ("&SubsetEqual", "\u2286", true), + ("&Succeeds", "\u227b", true), + ("&SucceedsEqual", "\u2ab0", true), + ("&SucceedsSlantEqual", "\u227d", true), + ("&SucceedsTilde", "\u227f", true), + ("&SuchThat", "\u220b", true), + ("&Sum", "\u2211", true), + ("&Sup", "\u22d1", true), + ("&Superset", "\u2283", true), + ("&SupersetEqual", "\u2287", true), + ("&Supset", "\u22d1", true), + ("Þ", "\xde", false), + ("&TRADE", "\u2122", true), + ("&TSHcy", "\u040b", true), + ("&TScy", "\u0426", true), + ("&Tab", "\t", true), + ("&Tau", "\u03a4", true), + ("&Tcaron", "\u0164", true), + ("&Tcedil", "\u0162", true), + ("&Tcy", "\u0422", true), + ("&Tfr", "\U0001d517", true), + ("&Therefore", "\u2234", true), + ("&Theta", "\u0398", true), + ("&ThickSpace", "\u205f\u200a", true), + ("&ThinSpace", "\u2009", true), + ("&Tilde", "\u223c", true), + ("&TildeEqual", "\u2243", true), + ("&TildeFullEqual", "\u2245", true), + ("&TildeTilde", "\u2248", true), + ("&Topf", "\U0001d54b", true), + ("&TripleDot", "\u20db", true), + ("&Tscr", "\U0001d4af", true), + ("&Tstrok", "\u0166", true), + ("Ú", "\xda", false), + ("&Uarr", "\u219f", true), + ("&Uarrocir", "\u2949", true), + ("&Ubrcy", "\u040e", true), + ("&Ubreve", "\u016c", true), + ("Û", "\xdb", false), + ("&Ucy", "\u0423", true), + ("&Udblac", "\u0170", true), + ("&Ufr", "\U0001d518", true), + ("Ù", 
"\xd9", false), + ("&Umacr", "\u016a", true), + ("&UnderBar", "_", true), + ("&UnderBrace", "\u23df", true), + ("&UnderBracket", "\u23b5", true), + ("&UnderParenthesis", "\u23dd", true), + ("&Union", "\u22c3", true), + ("&UnionPlus", "\u228e", true), + ("&Uogon", "\u0172", true), + ("&Uopf", "\U0001d54c", true), + ("&UpArrow", "\u2191", true), + ("&UpArrowBar", "\u2912", true), + ("&UpArrowDownArrow", "\u21c5", true), + ("&UpDownArrow", "\u2195", true), + ("&UpEquilibrium", "\u296e", true), + ("&UpTee", "\u22a5", true), + ("&UpTeeArrow", "\u21a5", true), + ("&Uparrow", "\u21d1", true), + ("&Updownarrow", "\u21d5", true), + ("&UpperLeftArrow", "\u2196", true), + ("&UpperRightArrow", "\u2197", true), + ("&Upsi", "\u03d2", true), + ("&Upsilon", "\u03a5", true), + ("&Uring", "\u016e", true), + ("&Uscr", "\U0001d4b0", true), + ("&Utilde", "\u0168", true), + ("Ü", "\xdc", false), + ("&VDash", "\u22ab", true), + ("&Vbar", "\u2aeb", true), + ("&Vcy", "\u0412", true), + ("&Vdash", "\u22a9", true), + ("&Vdashl", "\u2ae6", true), + ("&Vee", "\u22c1", true), + ("&Verbar", "\u2016", true), + ("&Vert", "\u2016", true), + ("&VerticalBar", "\u2223", true), + ("&VerticalLine", "|", true), + ("&VerticalSeparator", "\u2758", true), + ("&VerticalTilde", "\u2240", true), + ("&VeryThinSpace", "\u200a", true), + ("&Vfr", "\U0001d519", true), + ("&Vopf", "\U0001d54d", true), + ("&Vscr", "\U0001d4b1", true), + ("&Vvdash", "\u22aa", true), + ("&Wcirc", "\u0174", true), + ("&Wedge", "\u22c0", true), + ("&Wfr", "\U0001d51a", true), + ("&Wopf", "\U0001d54e", true), + ("&Wscr", "\U0001d4b2", true), + ("&Xfr", "\U0001d51b", true), + ("&Xi", "\u039e", true), + ("&Xopf", "\U0001d54f", true), + ("&Xscr", "\U0001d4b3", true), + ("&YAcy", "\u042f", true), + ("&YIcy", "\u0407", true), + ("&YUcy", "\u042e", true), + ("Ý", "\xdd", false), + ("&Ycirc", "\u0176", true), + ("&Ycy", "\u042b", true), + ("&Yfr", "\U0001d51c", true), + ("&Yopf", "\U0001d550", true), + ("&Yscr", "\U0001d4b4", true), + ("&Yuml", 
"\u0178", true), + ("&ZHcy", "\u0416", true), + ("&Zacute", "\u0179", true), + ("&Zcaron", "\u017d", true), + ("&Zcy", "\u0417", true), + ("&Zdot", "\u017b", true), + ("&ZeroWidthSpace", "\u200b", true), + ("&Zeta", "\u0396", true), + ("&Zfr", "\u2128", true), + ("&Zopf", "\u2124", true), + ("&Zscr", "\U0001d4b5", true), + ("á", "\xe1", false), + ("&abreve", "\u0103", true), + ("&ac", "\u223e", true), + ("&acE", "\u223e\u0333", true), + ("&acd", "\u223f", true), + ("â", "\xe2", false), + ("´", "\xb4", false), + ("&acy", "\u0430", true), + ("æ", "\xe6", false), + ("&af", "\u2061", true), + ("&afr", "\U0001d51e", true), + ("à", "\xe0", false), + ("&alefsym", "\u2135", true), + ("&aleph", "\u2135", true), + ("&alpha", "\u03b1", true), + ("&amacr", "\u0101", true), + ("&amalg", "\u2a3f", true), + ("&", "&", false), + ("&and", "\u2227", true), + ("&andand", "\u2a55", true), + ("&andd", "\u2a5c", true), + ("&andslope", "\u2a58", true), + ("&andv", "\u2a5a", true), + ("&ang", "\u2220", true), + ("&ange", "\u29a4", true), + ("&angle", "\u2220", true), + ("&angmsd", "\u2221", true), + ("&angmsdaa", "\u29a8", true), + ("&angmsdab", "\u29a9", true), + ("&angmsdac", "\u29aa", true), + ("&angmsdad", "\u29ab", true), + ("&angmsdae", "\u29ac", true), + ("&angmsdaf", "\u29ad", true), + ("&angmsdag", "\u29ae", true), + ("&angmsdah", "\u29af", true), + ("&angrt", "\u221f", true), + ("&angrtvb", "\u22be", true), + ("&angrtvbd", "\u299d", true), + ("&angsph", "\u2222", true), + ("&angst", "\xc5", true), + ("&angzarr", "\u237c", true), + ("&aogon", "\u0105", true), + ("&aopf", "\U0001d552", true), + ("&ap", "\u2248", true), + ("&apE", "\u2a70", true), + ("&apacir", "\u2a6f", true), + ("&ape", "\u224a", true), + ("&apid", "\u224b", true), + ("&apos", "\'", true), + ("&approx", "\u2248", true), + ("&approxeq", "\u224a", true), + ("å", "\xe5", false), + ("&ascr", "\U0001d4b6", true), + ("&ast", "*", true), + ("&asymp", "\u2248", true), + ("&asympeq", "\u224d", true), + ("ã", "\xe3", 
false), + ("ä", "\xe4", false), + ("&awconint", "\u2233", true), + ("&awint", "\u2a11", true), + ("&bNot", "\u2aed", true), + ("&backcong", "\u224c", true), + ("&backepsilon", "\u03f6", true), + ("&backprime", "\u2035", true), + ("&backsim", "\u223d", true), + ("&backsimeq", "\u22cd", true), + ("&barvee", "\u22bd", true), + ("&barwed", "\u2305", true), + ("&barwedge", "\u2305", true), + ("&bbrk", "\u23b5", true), + ("&bbrktbrk", "\u23b6", true), + ("&bcong", "\u224c", true), + ("&bcy", "\u0431", true), + ("&bdquo", "\u201e", true), + ("&becaus", "\u2235", true), + ("&because", "\u2235", true), + ("&bemptyv", "\u29b0", true), + ("&bepsi", "\u03f6", true), + ("&bernou", "\u212c", true), + ("&beta", "\u03b2", true), + ("&beth", "\u2136", true), + ("&between", "\u226c", true), + ("&bfr", "\U0001d51f", true), + ("&bigcap", "\u22c2", true), + ("&bigcirc", "\u25ef", true), + ("&bigcup", "\u22c3", true), + ("&bigodot", "\u2a00", true), + ("&bigoplus", "\u2a01", true), + ("&bigotimes", "\u2a02", true), + ("&bigsqcup", "\u2a06", true), + ("&bigstar", "\u2605", true), + ("&bigtriangledown", "\u25bd", true), + ("&bigtriangleup", "\u25b3", true), + ("&biguplus", "\u2a04", true), + ("&bigvee", "\u22c1", true), + ("&bigwedge", "\u22c0", true), + ("&bkarow", "\u290d", true), + ("&blacklozenge", "\u29eb", true), + ("&blacksquare", "\u25aa", true), + ("&blacktriangle", "\u25b4", true), + ("&blacktriangledown", "\u25be", true), + ("&blacktriangleleft", "\u25c2", true), + ("&blacktriangleright", "\u25b8", true), + ("&blank", "\u2423", true), + ("&blk12", "\u2592", true), + ("&blk14", "\u2591", true), + ("&blk34", "\u2593", true), + ("&block", "\u2588", true), + ("&bne", "=\u20e5", true), + ("&bnequiv", "\u2261\u20e5", true), + ("&bnot", "\u2310", true), + ("&bopf", "\U0001d553", true), + ("&bot", "\u22a5", true), + ("&bottom", "\u22a5", true), + ("&bowtie", "\u22c8", true), + ("&boxDL", "\u2557", true), + ("&boxDR", "\u2554", true), + ("&boxDl", "\u2556", true), + ("&boxDr", "\u2553", 
true), + ("&boxH", "\u2550", true), + ("&boxHD", "\u2566", true), + ("&boxHU", "\u2569", true), + ("&boxHd", "\u2564", true), + ("&boxHu", "\u2567", true), + ("&boxUL", "\u255d", true), + ("&boxUR", "\u255a", true), + ("&boxUl", "\u255c", true), + ("&boxUr", "\u2559", true), + ("&boxV", "\u2551", true), + ("&boxVH", "\u256c", true), + ("&boxVL", "\u2563", true), + ("&boxVR", "\u2560", true), + ("&boxVh", "\u256b", true), + ("&boxVl", "\u2562", true), + ("&boxVr", "\u255f", true), + ("&boxbox", "\u29c9", true), + ("&boxdL", "\u2555", true), + ("&boxdR", "\u2552", true), + ("&boxdl", "\u2510", true), + ("&boxdr", "\u250c", true), + ("&boxh", "\u2500", true), + ("&boxhD", "\u2565", true), + ("&boxhU", "\u2568", true), + ("&boxhd", "\u252c", true), + ("&boxhu", "\u2534", true), + ("&boxminus", "\u229f", true), + ("&boxplus", "\u229e", true), + ("&boxtimes", "\u22a0", true), + ("&boxuL", "\u255b", true), + ("&boxuR", "\u2558", true), + ("&boxul", "\u2518", true), + ("&boxur", "\u2514", true), + ("&boxv", "\u2502", true), + ("&boxvH", "\u256a", true), + ("&boxvL", "\u2561", true), + ("&boxvR", "\u255e", true), + ("&boxvh", "\u253c", true), + ("&boxvl", "\u2524", true), + ("&boxvr", "\u251c", true), + ("&bprime", "\u2035", true), + ("&breve", "\u02d8", true), + ("¦", "\xa6", false), + ("&bscr", "\U0001d4b7", true), + ("&bsemi", "\u204f", true), + ("&bsim", "\u223d", true), + ("&bsime", "\u22cd", true), + ("&bsol", "\\", true), + ("&bsolb", "\u29c5", true), + ("&bsolhsub", "\u27c8", true), + ("&bull", "\u2022", true), + ("&bullet", "\u2022", true), + ("&bump", "\u224e", true), + ("&bumpE", "\u2aae", true), + ("&bumpe", "\u224f", true), + ("&bumpeq", "\u224f", true), + ("&cacute", "\u0107", true), + ("&cap", "\u2229", true), + ("&capand", "\u2a44", true), + ("&capbrcup", "\u2a49", true), + ("&capcap", "\u2a4b", true), + ("&capcup", "\u2a47", true), + ("&capdot", "\u2a40", true), + ("&caps", "\u2229\ufe00", true), + ("&caret", "\u2041", true), + ("&caron", "\u02c7", true), + 
("&ccaps", "\u2a4d", true), + ("&ccaron", "\u010d", true), + ("ç", "\xe7", false), + ("&ccirc", "\u0109", true), + ("&ccups", "\u2a4c", true), + ("&ccupssm", "\u2a50", true), + ("&cdot", "\u010b", true), + ("¸", "\xb8", false), + ("&cemptyv", "\u29b2", true), + ("¢", "\xa2", false), + ("¢erdot", "\xb7", true), + ("&cfr", "\U0001d520", true), + ("&chcy", "\u0447", true), + ("&check", "\u2713", true), + ("&checkmark", "\u2713", true), + ("&chi", "\u03c7", true), + ("&cir", "\u25cb", true), + ("&cirE", "\u29c3", true), + ("&circ", "\u02c6", true), + ("&circeq", "\u2257", true), + ("&circlearrowleft", "\u21ba", true), + ("&circlearrowright", "\u21bb", true), + ("&circledR", "\xae", true), + ("&circledS", "\u24c8", true), + ("&circledast", "\u229b", true), + ("&circledcirc", "\u229a", true), + ("&circleddash", "\u229d", true), + ("&cire", "\u2257", true), + ("&cirfnint", "\u2a10", true), + ("&cirmid", "\u2aef", true), + ("&cirscir", "\u29c2", true), + ("&clubs", "\u2663", true), + ("&clubsuit", "\u2663", true), + ("&colon", ":", true), + ("&colone", "\u2254", true), + ("&coloneq", "\u2254", true), + ("&comma", ",", true), + ("&commat", "@", true), + ("&comp", "\u2201", true), + ("&compfn", "\u2218", true), + ("&complement", "\u2201", true), + ("&complexes", "\u2102", true), + ("&cong", "\u2245", true), + ("&congdot", "\u2a6d", true), + ("&conint", "\u222e", true), + ("&copf", "\U0001d554", true), + ("&coprod", "\u2210", true), + ("©", "\xa9", false), + ("©sr", "\u2117", true), + ("&crarr", "\u21b5", true), + ("&cross", "\u2717", true), + ("&cscr", "\U0001d4b8", true), + ("&csub", "\u2acf", true), + ("&csube", "\u2ad1", true), + ("&csup", "\u2ad0", true), + ("&csupe", "\u2ad2", true), + ("&ctdot", "\u22ef", true), + ("&cudarrl", "\u2938", true), + ("&cudarrr", "\u2935", true), + ("&cuepr", "\u22de", true), + ("&cuesc", "\u22df", true), + ("&cularr", "\u21b6", true), + ("&cularrp", "\u293d", true), + ("&cup", "\u222a", true), + ("&cupbrcap", "\u2a48", true), + ("&cupcap", 
"\u2a46", true), + ("&cupcup", "\u2a4a", true), + ("&cupdot", "\u228d", true), + ("&cupor", "\u2a45", true), + ("&cups", "\u222a\ufe00", true), + ("&curarr", "\u21b7", true), + ("&curarrm", "\u293c", true), + ("&curlyeqprec", "\u22de", true), + ("&curlyeqsucc", "\u22df", true), + ("&curlyvee", "\u22ce", true), + ("&curlywedge", "\u22cf", true), + ("¤", "\xa4", false), + ("&curvearrowleft", "\u21b6", true), + ("&curvearrowright", "\u21b7", true), + ("&cuvee", "\u22ce", true), + ("&cuwed", "\u22cf", true), + ("&cwconint", "\u2232", true), + ("&cwint", "\u2231", true), + ("&cylcty", "\u232d", true), + ("&dArr", "\u21d3", true), + ("&dHar", "\u2965", true), + ("&dagger", "\u2020", true), + ("&daleth", "\u2138", true), + ("&darr", "\u2193", true), + ("&dash", "\u2010", true), + ("&dashv", "\u22a3", true), + ("&dbkarow", "\u290f", true), + ("&dblac", "\u02dd", true), + ("&dcaron", "\u010f", true), + ("&dcy", "\u0434", true), + ("&dd", "\u2146", true), + ("&ddagger", "\u2021", true), + ("&ddarr", "\u21ca", true), + ("&ddotseq", "\u2a77", true), + ("°", "\xb0", false), + ("&delta", "\u03b4", true), + ("&demptyv", "\u29b1", true), + ("&dfisht", "\u297f", true), + ("&dfr", "\U0001d521", true), + ("&dharl", "\u21c3", true), + ("&dharr", "\u21c2", true), + ("&diam", "\u22c4", true), + ("&diamond", "\u22c4", true), + ("&diamondsuit", "\u2666", true), + ("&diams", "\u2666", true), + ("&die", "\xa8", true), + ("&digamma", "\u03dd", true), + ("&disin", "\u22f2", true), + ("&div", "\xf7", true), + ("÷", "\xf7", false), + ("÷ontimes", "\u22c7", true), + ("&divonx", "\u22c7", true), + ("&djcy", "\u0452", true), + ("&dlcorn", "\u231e", true), + ("&dlcrop", "\u230d", true), + ("&dollar", "$", true), + ("&dopf", "\U0001d555", true), + ("&dot", "\u02d9", true), + ("&doteq", "\u2250", true), + ("&doteqdot", "\u2251", true), + ("&dotminus", "\u2238", true), + ("&dotplus", "\u2214", true), + ("&dotsquare", "\u22a1", true), + ("&doublebarwedge", "\u2306", true), + ("&downarrow", "\u2193", 
true), + ("&downdownarrows", "\u21ca", true), + ("&downharpoonleft", "\u21c3", true), + ("&downharpoonright", "\u21c2", true), + ("&drbkarow", "\u2910", true), + ("&drcorn", "\u231f", true), + ("&drcrop", "\u230c", true), + ("&dscr", "\U0001d4b9", true), + ("&dscy", "\u0455", true), + ("&dsol", "\u29f6", true), + ("&dstrok", "\u0111", true), + ("&dtdot", "\u22f1", true), + ("&dtri", "\u25bf", true), + ("&dtrif", "\u25be", true), + ("&duarr", "\u21f5", true), + ("&duhar", "\u296f", true), + ("&dwangle", "\u29a6", true), + ("&dzcy", "\u045f", true), + ("&dzigrarr", "\u27ff", true), + ("&eDDot", "\u2a77", true), + ("&eDot", "\u2251", true), + ("é", "\xe9", false), + ("&easter", "\u2a6e", true), + ("&ecaron", "\u011b", true), + ("&ecir", "\u2256", true), + ("ê", "\xea", false), + ("&ecolon", "\u2255", true), + ("&ecy", "\u044d", true), + ("&edot", "\u0117", true), + ("&ee", "\u2147", true), + ("&efDot", "\u2252", true), + ("&efr", "\U0001d522", true), + ("&eg", "\u2a9a", true), + ("è", "\xe8", false), + ("&egs", "\u2a96", true), + ("&egsdot", "\u2a98", true), + ("&el", "\u2a99", true), + ("&elinters", "\u23e7", true), + ("&ell", "\u2113", true), + ("&els", "\u2a95", true), + ("&elsdot", "\u2a97", true), + ("&emacr", "\u0113", true), + ("&empty", "\u2205", true), + ("&emptyset", "\u2205", true), + ("&emptyv", "\u2205", true), + ("&emsp", "\u2003", true), + ("&emsp13", "\u2004", true), + ("&emsp14", "\u2005", true), + ("&eng", "\u014b", true), + ("&ensp", "\u2002", true), + ("&eogon", "\u0119", true), + ("&eopf", "\U0001d556", true), + ("&epar", "\u22d5", true), + ("&eparsl", "\u29e3", true), + ("&eplus", "\u2a71", true), + ("&epsi", "\u03b5", true), + ("&epsilon", "\u03b5", true), + ("&epsiv", "\u03f5", true), + ("&eqcirc", "\u2256", true), + ("&eqcolon", "\u2255", true), + ("&eqsim", "\u2242", true), + ("&eqslantgtr", "\u2a96", true), + ("&eqslantless", "\u2a95", true), + ("&equals", "=", true), + ("&equest", "\u225f", true), + ("&equiv", "\u2261", true), + 
("&equivDD", "\u2a78", true), + ("&eqvparsl", "\u29e5", true), + ("&erDot", "\u2253", true), + ("&erarr", "\u2971", true), + ("&escr", "\u212f", true), + ("&esdot", "\u2250", true), + ("&esim", "\u2242", true), + ("&eta", "\u03b7", true), + ("ð", "\xf0", false), + ("ë", "\xeb", false), + ("&euro", "\u20ac", true), + ("&excl", "!", true), + ("&exist", "\u2203", true), + ("&expectation", "\u2130", true), + ("&exponentiale", "\u2147", true), + ("&fallingdotseq", "\u2252", true), + ("&fcy", "\u0444", true), + ("&female", "\u2640", true), + ("&ffilig", "\ufb03", true), + ("&fflig", "\ufb00", true), + ("&ffllig", "\ufb04", true), + ("&ffr", "\U0001d523", true), + ("&filig", "\ufb01", true), + ("&fjlig", "fj", true), + ("&flat", "\u266d", true), + ("&fllig", "\ufb02", true), + ("&fltns", "\u25b1", true), + ("&fnof", "\u0192", true), + ("&fopf", "\U0001d557", true), + ("&forall", "\u2200", true), + ("&fork", "\u22d4", true), + ("&forkv", "\u2ad9", true), + ("&fpartint", "\u2a0d", true), + ("½", "\xbd", false), + ("&frac13", "\u2153", true), + ("¼", "\xbc", false), + ("&frac15", "\u2155", true), + ("&frac16", "\u2159", true), + ("&frac18", "\u215b", true), + ("&frac23", "\u2154", true), + ("&frac25", "\u2156", true), + ("¾", "\xbe", false), + ("&frac35", "\u2157", true), + ("&frac38", "\u215c", true), + ("&frac45", "\u2158", true), + ("&frac56", "\u215a", true), + ("&frac58", "\u215d", true), + ("&frac78", "\u215e", true), + ("&frasl", "\u2044", true), + ("&frown", "\u2322", true), + ("&fscr", "\U0001d4bb", true), + ("&gE", "\u2267", true), + ("&gEl", "\u2a8c", true), + ("&gacute", "\u01f5", true), + ("&gamma", "\u03b3", true), + ("&gammad", "\u03dd", true), + ("&gap", "\u2a86", true), + ("&gbreve", "\u011f", true), + ("&gcirc", "\u011d", true), + ("&gcy", "\u0433", true), + ("&gdot", "\u0121", true), + ("&ge", "\u2265", true), + ("&gel", "\u22db", true), + ("&geq", "\u2265", true), + ("&geqq", "\u2267", true), + ("&geqslant", "\u2a7e", true), + ("&ges", "\u2a7e", true), + 
("&gescc", "\u2aa9", true), + ("&gesdot", "\u2a80", true), + ("&gesdoto", "\u2a82", true), + ("&gesdotol", "\u2a84", true), + ("&gesl", "\u22db\ufe00", true), + ("&gesles", "\u2a94", true), + ("&gfr", "\U0001d524", true), + ("&gg", "\u226b", true), + ("&ggg", "\u22d9", true), + ("&gimel", "\u2137", true), + ("&gjcy", "\u0453", true), + ("&gl", "\u2277", true), + ("&glE", "\u2a92", true), + ("&gla", "\u2aa5", true), + ("&glj", "\u2aa4", true), + ("&gnE", "\u2269", true), + ("&gnap", "\u2a8a", true), + ("&gnapprox", "\u2a8a", true), + ("&gne", "\u2a88", true), + ("&gneq", "\u2a88", true), + ("&gneqq", "\u2269", true), + ("&gnsim", "\u22e7", true), + ("&gopf", "\U0001d558", true), + ("&grave", "`", true), + ("&gscr", "\u210a", true), + ("&gsim", "\u2273", true), + ("&gsime", "\u2a8e", true), + ("&gsiml", "\u2a90", true), + (">", ">", false), + (">cc", "\u2aa7", true), + (">cir", "\u2a7a", true), + (">dot", "\u22d7", true), + (">lPar", "\u2995", true), + (">quest", "\u2a7c", true), + (">rapprox", "\u2a86", true), + (">rarr", "\u2978", true), + (">rdot", "\u22d7", true), + (">reqless", "\u22db", true), + (">reqqless", "\u2a8c", true), + (">rless", "\u2277", true), + (">rsim", "\u2273", true), + ("&gvertneqq", "\u2269\ufe00", true), + ("&gvnE", "\u2269\ufe00", true), + ("&hArr", "\u21d4", true), + ("&hairsp", "\u200a", true), + ("&half", "\xbd", true), + ("&hamilt", "\u210b", true), + ("&hardcy", "\u044a", true), + ("&harr", "\u2194", true), + ("&harrcir", "\u2948", true), + ("&harrw", "\u21ad", true), + ("&hbar", "\u210f", true), + ("&hcirc", "\u0125", true), + ("&hearts", "\u2665", true), + ("&heartsuit", "\u2665", true), + ("&hellip", "\u2026", true), + ("&hercon", "\u22b9", true), + ("&hfr", "\U0001d525", true), + ("&hksearow", "\u2925", true), + ("&hkswarow", "\u2926", true), + ("&hoarr", "\u21ff", true), + ("&homtht", "\u223b", true), + ("&hookleftarrow", "\u21a9", true), + ("&hookrightarrow", "\u21aa", true), + ("&hopf", "\U0001d559", true), + ("&horbar", 
"\u2015", true), + ("&hscr", "\U0001d4bd", true), + ("&hslash", "\u210f", true), + ("&hstrok", "\u0127", true), + ("&hybull", "\u2043", true), + ("&hyphen", "\u2010", true), + ("í", "\xed", false), + ("&ic", "\u2063", true), + ("î", "\xee", false), + ("&icy", "\u0438", true), + ("&iecy", "\u0435", true), + ("¡", "\xa1", false), + ("&iff", "\u21d4", true), + ("&ifr", "\U0001d526", true), + ("ì", "\xec", false), + ("&ii", "\u2148", true), + ("&iiiint", "\u2a0c", true), + ("&iiint", "\u222d", true), + ("&iinfin", "\u29dc", true), + ("&iiota", "\u2129", true), + ("&ijlig", "\u0133", true), + ("&imacr", "\u012b", true), + ("&image", "\u2111", true), + ("&imagline", "\u2110", true), + ("&imagpart", "\u2111", true), + ("&imath", "\u0131", true), + ("&imof", "\u22b7", true), + ("&imped", "\u01b5", true), + ("&in", "\u2208", true), + ("&incare", "\u2105", true), + ("&infin", "\u221e", true), + ("&infintie", "\u29dd", true), + ("&inodot", "\u0131", true), + ("&int", "\u222b", true), + ("&intcal", "\u22ba", true), + ("&integers", "\u2124", true), + ("&intercal", "\u22ba", true), + ("&intlarhk", "\u2a17", true), + ("&intprod", "\u2a3c", true), + ("&iocy", "\u0451", true), + ("&iogon", "\u012f", true), + ("&iopf", "\U0001d55a", true), + ("&iota", "\u03b9", true), + ("&iprod", "\u2a3c", true), + ("¿", "\xbf", false), + ("&iscr", "\U0001d4be", true), + ("&isin", "\u2208", true), + ("&isinE", "\u22f9", true), + ("&isindot", "\u22f5", true), + ("&isins", "\u22f4", true), + ("&isinsv", "\u22f3", true), + ("&isinv", "\u2208", true), + ("&it", "\u2062", true), + ("&itilde", "\u0129", true), + ("&iukcy", "\u0456", true), + ("ï", "\xef", false), + ("&jcirc", "\u0135", true), + ("&jcy", "\u0439", true), + ("&jfr", "\U0001d527", true), + ("&jmath", "\u0237", true), + ("&jopf", "\U0001d55b", true), + ("&jscr", "\U0001d4bf", true), + ("&jsercy", "\u0458", true), + ("&jukcy", "\u0454", true), + ("&kappa", "\u03ba", true), + ("&kappav", "\u03f0", true), + ("&kcedil", "\u0137", true), + 
("&kcy", "\u043a", true), + ("&kfr", "\U0001d528", true), + ("&kgreen", "\u0138", true), + ("&khcy", "\u0445", true), + ("&kjcy", "\u045c", true), + ("&kopf", "\U0001d55c", true), + ("&kscr", "\U0001d4c0", true), + ("&lAarr", "\u21da", true), + ("&lArr", "\u21d0", true), + ("&lAtail", "\u291b", true), + ("&lBarr", "\u290e", true), + ("&lE", "\u2266", true), + ("&lEg", "\u2a8b", true), + ("&lHar", "\u2962", true), + ("&lacute", "\u013a", true), + ("&laemptyv", "\u29b4", true), + ("&lagran", "\u2112", true), + ("&lambda", "\u03bb", true), + ("&lang", "\u27e8", true), + ("&langd", "\u2991", true), + ("&langle", "\u27e8", true), + ("&lap", "\u2a85", true), + ("«", "\xab", false), + ("&larr", "\u2190", true), + ("&larrb", "\u21e4", true), + ("&larrbfs", "\u291f", true), + ("&larrfs", "\u291d", true), + ("&larrhk", "\u21a9", true), + ("&larrlp", "\u21ab", true), + ("&larrpl", "\u2939", true), + ("&larrsim", "\u2973", true), + ("&larrtl", "\u21a2", true), + ("&lat", "\u2aab", true), + ("&latail", "\u2919", true), + ("&late", "\u2aad", true), + ("&lates", "\u2aad\ufe00", true), + ("&lbarr", "\u290c", true), + ("&lbbrk", "\u2772", true), + ("&lbrace", "{", true), + ("&lbrack", "[", true), + ("&lbrke", "\u298b", true), + ("&lbrksld", "\u298f", true), + ("&lbrkslu", "\u298d", true), + ("&lcaron", "\u013e", true), + ("&lcedil", "\u013c", true), + ("&lceil", "\u2308", true), + ("&lcub", "{", true), + ("&lcy", "\u043b", true), + ("&ldca", "\u2936", true), + ("&ldquo", "\u201c", true), + ("&ldquor", "\u201e", true), + ("&ldrdhar", "\u2967", true), + ("&ldrushar", "\u294b", true), + ("&ldsh", "\u21b2", true), + ("&le", "\u2264", true), + ("&leftarrow", "\u2190", true), + ("&leftarrowtail", "\u21a2", true), + ("&leftharpoondown", "\u21bd", true), + ("&leftharpoonup", "\u21bc", true), + ("&leftleftarrows", "\u21c7", true), + ("&leftrightarrow", "\u2194", true), + ("&leftrightarrows", "\u21c6", true), + ("&leftrightharpoons", "\u21cb", true), + ("&leftrightsquigarrow", "\u21ad", 
true), + ("&leftthreetimes", "\u22cb", true), + ("&leg", "\u22da", true), + ("&leq", "\u2264", true), + ("&leqq", "\u2266", true), + ("&leqslant", "\u2a7d", true), + ("&les", "\u2a7d", true), + ("&lescc", "\u2aa8", true), + ("&lesdot", "\u2a7f", true), + ("&lesdoto", "\u2a81", true), + ("&lesdotor", "\u2a83", true), + ("&lesg", "\u22da\ufe00", true), + ("&lesges", "\u2a93", true), + ("&lessapprox", "\u2a85", true), + ("&lessdot", "\u22d6", true), + ("&lesseqgtr", "\u22da", true), + ("&lesseqqgtr", "\u2a8b", true), + ("&lessgtr", "\u2276", true), + ("&lesssim", "\u2272", true), + ("&lfisht", "\u297c", true), + ("&lfloor", "\u230a", true), + ("&lfr", "\U0001d529", true), + ("&lg", "\u2276", true), + ("&lgE", "\u2a91", true), + ("&lhard", "\u21bd", true), + ("&lharu", "\u21bc", true), + ("&lharul", "\u296a", true), + ("&lhblk", "\u2584", true), + ("&ljcy", "\u0459", true), + ("&ll", "\u226a", true), + ("&llarr", "\u21c7", true), + ("&llcorner", "\u231e", true), + ("&llhard", "\u296b", true), + ("&lltri", "\u25fa", true), + ("&lmidot", "\u0140", true), + ("&lmoust", "\u23b0", true), + ("&lmoustache", "\u23b0", true), + ("&lnE", "\u2268", true), + ("&lnap", "\u2a89", true), + ("&lnapprox", "\u2a89", true), + ("&lne", "\u2a87", true), + ("&lneq", "\u2a87", true), + ("&lneqq", "\u2268", true), + ("&lnsim", "\u22e6", true), + ("&loang", "\u27ec", true), + ("&loarr", "\u21fd", true), + ("&lobrk", "\u27e6", true), + ("&longleftarrow", "\u27f5", true), + ("&longleftrightarrow", "\u27f7", true), + ("&longmapsto", "\u27fc", true), + ("&longrightarrow", "\u27f6", true), + ("&looparrowleft", "\u21ab", true), + ("&looparrowright", "\u21ac", true), + ("&lopar", "\u2985", true), + ("&lopf", "\U0001d55d", true), + ("&loplus", "\u2a2d", true), + ("&lotimes", "\u2a34", true), + ("&lowast", "\u2217", true), + ("&lowbar", "_", true), + ("&loz", "\u25ca", true), + ("&lozenge", "\u25ca", true), + ("&lozf", "\u29eb", true), + ("&lpar", "(", true), + ("&lparlt", "\u2993", true), + ("&lrarr", 
"\u21c6", true), + ("&lrcorner", "\u231f", true), + ("&lrhar", "\u21cb", true), + ("&lrhard", "\u296d", true), + ("&lrm", "\u200e", true), + ("&lrtri", "\u22bf", true), + ("&lsaquo", "\u2039", true), + ("&lscr", "\U0001d4c1", true), + ("&lsh", "\u21b0", true), + ("&lsim", "\u2272", true), + ("&lsime", "\u2a8d", true), + ("&lsimg", "\u2a8f", true), + ("&lsqb", "[", true), + ("&lsquo", "\u2018", true), + ("&lsquor", "\u201a", true), + ("&lstrok", "\u0142", true), + ("<", "<", false), + ("<cc", "\u2aa6", true), + ("<cir", "\u2a79", true), + ("<dot", "\u22d6", true), + ("<hree", "\u22cb", true), + ("<imes", "\u22c9", true), + ("<larr", "\u2976", true), + ("<quest", "\u2a7b", true), + ("<rPar", "\u2996", true), + ("<ri", "\u25c3", true), + ("<rie", "\u22b4", true), + ("<rif", "\u25c2", true), + ("&lurdshar", "\u294a", true), + ("&luruhar", "\u2966", true), + ("&lvertneqq", "\u2268\ufe00", true), + ("&lvnE", "\u2268\ufe00", true), + ("&mDDot", "\u223a", true), + ("¯", "\xaf", false), + ("&male", "\u2642", true), + ("&malt", "\u2720", true), + ("&maltese", "\u2720", true), + ("&map", "\u21a6", true), + ("&mapsto", "\u21a6", true), + ("&mapstodown", "\u21a7", true), + ("&mapstoleft", "\u21a4", true), + ("&mapstoup", "\u21a5", true), + ("&marker", "\u25ae", true), + ("&mcomma", "\u2a29", true), + ("&mcy", "\u043c", true), + ("&mdash", "\u2014", true), + ("&measuredangle", "\u2221", true), + ("&mfr", "\U0001d52a", true), + ("&mho", "\u2127", true), + ("µ", "\xb5", false), + ("&mid", "\u2223", true), + ("&midast", "*", true), + ("&midcir", "\u2af0", true), + ("·", "\xb7", false), + ("&minus", "\u2212", true), + ("&minusb", "\u229f", true), + ("&minusd", "\u2238", true), + ("&minusdu", "\u2a2a", true), + ("&mlcp", "\u2adb", true), + ("&mldr", "\u2026", true), + ("&mnplus", "\u2213", true), + ("&models", "\u22a7", true), + ("&mopf", "\U0001d55e", true), + ("&mp", "\u2213", true), + ("&mscr", "\U0001d4c2", true), + ("&mstpos", "\u223e", true), + ("&mu", "\u03bc", true), + 
("&multimap", "\u22b8", true), + ("&mumap", "\u22b8", true), + ("&nGg", "\u22d9\u0338", true), + ("&nGt", "\u226b\u20d2", true), + ("&nGtv", "\u226b\u0338", true), + ("&nLeftarrow", "\u21cd", true), + ("&nLeftrightarrow", "\u21ce", true), + ("&nLl", "\u22d8\u0338", true), + ("&nLt", "\u226a\u20d2", true), + ("&nLtv", "\u226a\u0338", true), + ("&nRightarrow", "\u21cf", true), + ("&nVDash", "\u22af", true), + ("&nVdash", "\u22ae", true), + ("&nabla", "\u2207", true), + ("&nacute", "\u0144", true), + ("&nang", "\u2220\u20d2", true), + ("&nap", "\u2249", true), + ("&napE", "\u2a70\u0338", true), + ("&napid", "\u224b\u0338", true), + ("&napos", "\u0149", true), + ("&napprox", "\u2249", true), + ("&natur", "\u266e", true), + ("&natural", "\u266e", true), + ("&naturals", "\u2115", true), + (" ", "\xa0", false), + ("&nbump", "\u224e\u0338", true), + ("&nbumpe", "\u224f\u0338", true), + ("&ncap", "\u2a43", true), + ("&ncaron", "\u0148", true), + ("&ncedil", "\u0146", true), + ("&ncong", "\u2247", true), + ("&ncongdot", "\u2a6d\u0338", true), + ("&ncup", "\u2a42", true), + ("&ncy", "\u043d", true), + ("&ndash", "\u2013", true), + ("&ne", "\u2260", true), + ("&neArr", "\u21d7", true), + ("&nearhk", "\u2924", true), + ("&nearr", "\u2197", true), + ("&nearrow", "\u2197", true), + ("&nedot", "\u2250\u0338", true), + ("&nequiv", "\u2262", true), + ("&nesear", "\u2928", true), + ("&nesim", "\u2242\u0338", true), + ("&nexist", "\u2204", true), + ("&nexists", "\u2204", true), + ("&nfr", "\U0001d52b", true), + ("&ngE", "\u2267\u0338", true), + ("&nge", "\u2271", true), + ("&ngeq", "\u2271", true), + ("&ngeqq", "\u2267\u0338", true), + ("&ngeqslant", "\u2a7e\u0338", true), + ("&nges", "\u2a7e\u0338", true), + ("&ngsim", "\u2275", true), + ("&ngt", "\u226f", true), + ("&ngtr", "\u226f", true), + ("&nhArr", "\u21ce", true), + ("&nharr", "\u21ae", true), + ("&nhpar", "\u2af2", true), + ("&ni", "\u220b", true), + ("&nis", "\u22fc", true), + ("&nisd", "\u22fa", true), + ("&niv", "\u220b", 
true), + ("&njcy", "\u045a", true), + ("&nlArr", "\u21cd", true), + ("&nlE", "\u2266\u0338", true), + ("&nlarr", "\u219a", true), + ("&nldr", "\u2025", true), + ("&nle", "\u2270", true), + ("&nleftarrow", "\u219a", true), + ("&nleftrightarrow", "\u21ae", true), + ("&nleq", "\u2270", true), + ("&nleqq", "\u2266\u0338", true), + ("&nleqslant", "\u2a7d\u0338", true), + ("&nles", "\u2a7d\u0338", true), + ("&nless", "\u226e", true), + ("&nlsim", "\u2274", true), + ("&nlt", "\u226e", true), + ("&nltri", "\u22ea", true), + ("&nltrie", "\u22ec", true), + ("&nmid", "\u2224", true), + ("&nopf", "\U0001d55f", true), + ("¬", "\xac", false), + ("¬in", "\u2209", true), + ("¬inE", "\u22f9\u0338", true), + ("¬indot", "\u22f5\u0338", true), + ("¬inva", "\u2209", true), + ("¬invb", "\u22f7", true), + ("¬invc", "\u22f6", true), + ("¬ni", "\u220c", true), + ("¬niva", "\u220c", true), + ("¬nivb", "\u22fe", true), + ("¬nivc", "\u22fd", true), + ("&npar", "\u2226", true), + ("&nparallel", "\u2226", true), + ("&nparsl", "\u2afd\u20e5", true), + ("&npart", "\u2202\u0338", true), + ("&npolint", "\u2a14", true), + ("&npr", "\u2280", true), + ("&nprcue", "\u22e0", true), + ("&npre", "\u2aaf\u0338", true), + ("&nprec", "\u2280", true), + ("&npreceq", "\u2aaf\u0338", true), + ("&nrArr", "\u21cf", true), + ("&nrarr", "\u219b", true), + ("&nrarrc", "\u2933\u0338", true), + ("&nrarrw", "\u219d\u0338", true), + ("&nrightarrow", "\u219b", true), + ("&nrtri", "\u22eb", true), + ("&nrtrie", "\u22ed", true), + ("&nsc", "\u2281", true), + ("&nsccue", "\u22e1", true), + ("&nsce", "\u2ab0\u0338", true), + ("&nscr", "\U0001d4c3", true), + ("&nshortmid", "\u2224", true), + ("&nshortparallel", "\u2226", true), + ("&nsim", "\u2241", true), + ("&nsime", "\u2244", true), + ("&nsimeq", "\u2244", true), + ("&nsmid", "\u2224", true), + ("&nspar", "\u2226", true), + ("&nsqsube", "\u22e2", true), + ("&nsqsupe", "\u22e3", true), + ("&nsub", "\u2284", true), + ("&nsubE", "\u2ac5\u0338", true), + ("&nsube", "\u2288", 
true), + ("&nsubset", "\u2282\u20d2", true), + ("&nsubseteq", "\u2288", true), + ("&nsubseteqq", "\u2ac5\u0338", true), + ("&nsucc", "\u2281", true), + ("&nsucceq", "\u2ab0\u0338", true), + ("&nsup", "\u2285", true), + ("&nsupE", "\u2ac6\u0338", true), + ("&nsupe", "\u2289", true), + ("&nsupset", "\u2283\u20d2", true), + ("&nsupseteq", "\u2289", true), + ("&nsupseteqq", "\u2ac6\u0338", true), + ("&ntgl", "\u2279", true), + ("ñ", "\xf1", false), + ("&ntlg", "\u2278", true), + ("&ntriangleleft", "\u22ea", true), + ("&ntrianglelefteq", "\u22ec", true), + ("&ntriangleright", "\u22eb", true), + ("&ntrianglerighteq", "\u22ed", true), + ("&nu", "\u03bd", true), + ("&num", "#", true), + ("&numero", "\u2116", true), + ("&numsp", "\u2007", true), + ("&nvDash", "\u22ad", true), + ("&nvHarr", "\u2904", true), + ("&nvap", "\u224d\u20d2", true), + ("&nvdash", "\u22ac", true), + ("&nvge", "\u2265\u20d2", true), + ("&nvgt", ">\u20d2", true), + ("&nvinfin", "\u29de", true), + ("&nvlArr", "\u2902", true), + ("&nvle", "\u2264\u20d2", true), + ("&nvlt", "<\u20d2", true), + ("&nvltrie", "\u22b4\u20d2", true), + ("&nvrArr", "\u2903", true), + ("&nvrtrie", "\u22b5\u20d2", true), + ("&nvsim", "\u223c\u20d2", true), + ("&nwArr", "\u21d6", true), + ("&nwarhk", "\u2923", true), + ("&nwarr", "\u2196", true), + ("&nwarrow", "\u2196", true), + ("&nwnear", "\u2927", true), + ("&oS", "\u24c8", true), + ("ó", "\xf3", false), + ("&oast", "\u229b", true), + ("&ocir", "\u229a", true), + ("ô", "\xf4", false), + ("&ocy", "\u043e", true), + ("&odash", "\u229d", true), + ("&odblac", "\u0151", true), + ("&odiv", "\u2a38", true), + ("&odot", "\u2299", true), + ("&odsold", "\u29bc", true), + ("&oelig", "\u0153", true), + ("&ofcir", "\u29bf", true), + ("&ofr", "\U0001d52c", true), + ("&ogon", "\u02db", true), + ("ò", "\xf2", false), + ("&ogt", "\u29c1", true), + ("&ohbar", "\u29b5", true), + ("&ohm", "\u03a9", true), + ("&oint", "\u222e", true), + ("&olarr", "\u21ba", true), + ("&olcir", "\u29be", true), + 
("&olcross", "\u29bb", true), + ("&oline", "\u203e", true), + ("&olt", "\u29c0", true), + ("&omacr", "\u014d", true), + ("&omega", "\u03c9", true), + ("&omicron", "\u03bf", true), + ("&omid", "\u29b6", true), + ("&ominus", "\u2296", true), + ("&oopf", "\U0001d560", true), + ("&opar", "\u29b7", true), + ("&operp", "\u29b9", true), + ("&oplus", "\u2295", true), + ("&or", "\u2228", true), + ("&orarr", "\u21bb", true), + ("&ord", "\u2a5d", true), + ("&order", "\u2134", true), + ("&orderof", "\u2134", true), + ("ª", "\xaa", false), + ("º", "\xba", false), + ("&origof", "\u22b6", true), + ("&oror", "\u2a56", true), + ("&orslope", "\u2a57", true), + ("&orv", "\u2a5b", true), + ("&oscr", "\u2134", true), + ("ø", "\xf8", false), + ("&osol", "\u2298", true), + ("õ", "\xf5", false), + ("&otimes", "\u2297", true), + ("&otimesas", "\u2a36", true), + ("ö", "\xf6", false), + ("&ovbar", "\u233d", true), + ("&par", "\u2225", true), + ("¶", "\xb6", false), + ("¶llel", "\u2225", true), + ("&parsim", "\u2af3", true), + ("&parsl", "\u2afd", true), + ("&part", "\u2202", true), + ("&pcy", "\u043f", true), + ("&percnt", "%", true), + ("&period", ".", true), + ("&permil", "\u2030", true), + ("&perp", "\u22a5", true), + ("&pertenk", "\u2031", true), + ("&pfr", "\U0001d52d", true), + ("&phi", "\u03c6", true), + ("&phiv", "\u03d5", true), + ("&phmmat", "\u2133", true), + ("&phone", "\u260e", true), + ("&pi", "\u03c0", true), + ("&pitchfork", "\u22d4", true), + ("&piv", "\u03d6", true), + ("&planck", "\u210f", true), + ("&planckh", "\u210e", true), + ("&plankv", "\u210f", true), + ("&plus", "+", true), + ("&plusacir", "\u2a23", true), + ("&plusb", "\u229e", true), + ("&pluscir", "\u2a22", true), + ("&plusdo", "\u2214", true), + ("&plusdu", "\u2a25", true), + ("&pluse", "\u2a72", true), + ("±", "\xb1", false), + ("&plussim", "\u2a26", true), + ("&plustwo", "\u2a27", true), + ("&pm", "\xb1", true), + ("&pointint", "\u2a15", true), + ("&popf", "\U0001d561", true), + ("£", "\xa3", false), + 
("&pr", "\u227a", true), + ("&prE", "\u2ab3", true), + ("&prap", "\u2ab7", true), + ("&prcue", "\u227c", true), + ("&pre", "\u2aaf", true), + ("&prec", "\u227a", true), + ("&precapprox", "\u2ab7", true), + ("&preccurlyeq", "\u227c", true), + ("&preceq", "\u2aaf", true), + ("&precnapprox", "\u2ab9", true), + ("&precneqq", "\u2ab5", true), + ("&precnsim", "\u22e8", true), + ("&precsim", "\u227e", true), + ("&prime", "\u2032", true), + ("&primes", "\u2119", true), + ("&prnE", "\u2ab5", true), + ("&prnap", "\u2ab9", true), + ("&prnsim", "\u22e8", true), + ("&prod", "\u220f", true), + ("&profalar", "\u232e", true), + ("&profline", "\u2312", true), + ("&profsurf", "\u2313", true), + ("&prop", "\u221d", true), + ("&propto", "\u221d", true), + ("&prsim", "\u227e", true), + ("&prurel", "\u22b0", true), + ("&pscr", "\U0001d4c5", true), + ("&psi", "\u03c8", true), + ("&puncsp", "\u2008", true), + ("&qfr", "\U0001d52e", true), + ("&qint", "\u2a0c", true), + ("&qopf", "\U0001d562", true), + ("&qprime", "\u2057", true), + ("&qscr", "\U0001d4c6", true), + ("&quaternions", "\u210d", true), + ("&quatint", "\u2a16", true), + ("&quest", "?", true), + ("&questeq", "\u225f", true), + (""", "\"", false), + ("&rAarr", "\u21db", true), + ("&rArr", "\u21d2", true), + ("&rAtail", "\u291c", true), + ("&rBarr", "\u290f", true), + ("&rHar", "\u2964", true), + ("&race", "\u223d\u0331", true), + ("&racute", "\u0155", true), + ("&radic", "\u221a", true), + ("&raemptyv", "\u29b3", true), + ("&rang", "\u27e9", true), + ("&rangd", "\u2992", true), + ("&range", "\u29a5", true), + ("&rangle", "\u27e9", true), + ("»", "\xbb", false), + ("&rarr", "\u2192", true), + ("&rarrap", "\u2975", true), + ("&rarrb", "\u21e5", true), + ("&rarrbfs", "\u2920", true), + ("&rarrc", "\u2933", true), + ("&rarrfs", "\u291e", true), + ("&rarrhk", "\u21aa", true), + ("&rarrlp", "\u21ac", true), + ("&rarrpl", "\u2945", true), + ("&rarrsim", "\u2974", true), + ("&rarrtl", "\u21a3", true), + ("&rarrw", "\u219d", true), + 
("&ratail", "\u291a", true), + ("&ratio", "\u2236", true), + ("&rationals", "\u211a", true), + ("&rbarr", "\u290d", true), + ("&rbbrk", "\u2773", true), + ("&rbrace", "}", true), + ("&rbrack", "]", true), + ("&rbrke", "\u298c", true), + ("&rbrksld", "\u298e", true), + ("&rbrkslu", "\u2990", true), + ("&rcaron", "\u0159", true), + ("&rcedil", "\u0157", true), + ("&rceil", "\u2309", true), + ("&rcub", "}", true), + ("&rcy", "\u0440", true), + ("&rdca", "\u2937", true), + ("&rdldhar", "\u2969", true), + ("&rdquo", "\u201d", true), + ("&rdquor", "\u201d", true), + ("&rdsh", "\u21b3", true), + ("&real", "\u211c", true), + ("&realine", "\u211b", true), + ("&realpart", "\u211c", true), + ("&reals", "\u211d", true), + ("&rect", "\u25ad", true), + ("®", "\xae", false), + ("&rfisht", "\u297d", true), + ("&rfloor", "\u230b", true), + ("&rfr", "\U0001d52f", true), + ("&rhard", "\u21c1", true), + ("&rharu", "\u21c0", true), + ("&rharul", "\u296c", true), + ("&rho", "\u03c1", true), + ("&rhov", "\u03f1", true), + ("&rightarrow", "\u2192", true), + ("&rightarrowtail", "\u21a3", true), + ("&rightharpoondown", "\u21c1", true), + ("&rightharpoonup", "\u21c0", true), + ("&rightleftarrows", "\u21c4", true), + ("&rightleftharpoons", "\u21cc", true), + ("&rightrightarrows", "\u21c9", true), + ("&rightsquigarrow", "\u219d", true), + ("&rightthreetimes", "\u22cc", true), + ("&ring", "\u02da", true), + ("&risingdotseq", "\u2253", true), + ("&rlarr", "\u21c4", true), + ("&rlhar", "\u21cc", true), + ("&rlm", "\u200f", true), + ("&rmoust", "\u23b1", true), + ("&rmoustache", "\u23b1", true), + ("&rnmid", "\u2aee", true), + ("&roang", "\u27ed", true), + ("&roarr", "\u21fe", true), + ("&robrk", "\u27e7", true), + ("&ropar", "\u2986", true), + ("&ropf", "\U0001d563", true), + ("&roplus", "\u2a2e", true), + ("&rotimes", "\u2a35", true), + ("&rpar", ")", true), + ("&rpargt", "\u2994", true), + ("&rppolint", "\u2a12", true), + ("&rrarr", "\u21c9", true), + ("&rsaquo", "\u203a", true), + ("&rscr", 
"\U0001d4c7", true), + ("&rsh", "\u21b1", true), + ("&rsqb", "]", true), + ("&rsquo", "\u2019", true), + ("&rsquor", "\u2019", true), + ("&rthree", "\u22cc", true), + ("&rtimes", "\u22ca", true), + ("&rtri", "\u25b9", true), + ("&rtrie", "\u22b5", true), + ("&rtrif", "\u25b8", true), + ("&rtriltri", "\u29ce", true), + ("&ruluhar", "\u2968", true), + ("&rx", "\u211e", true), + ("&sacute", "\u015b", true), + ("&sbquo", "\u201a", true), + ("&sc", "\u227b", true), + ("&scE", "\u2ab4", true), + ("&scap", "\u2ab8", true), + ("&scaron", "\u0161", true), + ("&sccue", "\u227d", true), + ("&sce", "\u2ab0", true), + ("&scedil", "\u015f", true), + ("&scirc", "\u015d", true), + ("&scnE", "\u2ab6", true), + ("&scnap", "\u2aba", true), + ("&scnsim", "\u22e9", true), + ("&scpolint", "\u2a13", true), + ("&scsim", "\u227f", true), + ("&scy", "\u0441", true), + ("&sdot", "\u22c5", true), + ("&sdotb", "\u22a1", true), + ("&sdote", "\u2a66", true), + ("&seArr", "\u21d8", true), + ("&searhk", "\u2925", true), + ("&searr", "\u2198", true), + ("&searrow", "\u2198", true), + ("§", "\xa7", false), + ("&semi", ";", true), + ("&seswar", "\u2929", true), + ("&setminus", "\u2216", true), + ("&setmn", "\u2216", true), + ("&sext", "\u2736", true), + ("&sfr", "\U0001d530", true), + ("&sfrown", "\u2322", true), + ("&sharp", "\u266f", true), + ("&shchcy", "\u0449", true), + ("&shcy", "\u0448", true), + ("&shortmid", "\u2223", true), + ("&shortparallel", "\u2225", true), + ("­", "\xad", false), + ("&sigma", "\u03c3", true), + ("&sigmaf", "\u03c2", true), + ("&sigmav", "\u03c2", true), + ("&sim", "\u223c", true), + ("&simdot", "\u2a6a", true), + ("&sime", "\u2243", true), + ("&simeq", "\u2243", true), + ("&simg", "\u2a9e", true), + ("&simgE", "\u2aa0", true), + ("&siml", "\u2a9d", true), + ("&simlE", "\u2a9f", true), + ("&simne", "\u2246", true), + ("&simplus", "\u2a24", true), + ("&simrarr", "\u2972", true), + ("&slarr", "\u2190", true), + ("&smallsetminus", "\u2216", true), + ("&smashp", "\u2a33", 
true), + ("&smeparsl", "\u29e4", true), + ("&smid", "\u2223", true), + ("&smile", "\u2323", true), + ("&smt", "\u2aaa", true), + ("&smte", "\u2aac", true), + ("&smtes", "\u2aac\ufe00", true), + ("&softcy", "\u044c", true), + ("&sol", "/", true), + ("&solb", "\u29c4", true), + ("&solbar", "\u233f", true), + ("&sopf", "\U0001d564", true), + ("&spades", "\u2660", true), + ("&spadesuit", "\u2660", true), + ("&spar", "\u2225", true), + ("&sqcap", "\u2293", true), + ("&sqcaps", "\u2293\ufe00", true), + ("&sqcup", "\u2294", true), + ("&sqcups", "\u2294\ufe00", true), + ("&sqsub", "\u228f", true), + ("&sqsube", "\u2291", true), + ("&sqsubset", "\u228f", true), + ("&sqsubseteq", "\u2291", true), + ("&sqsup", "\u2290", true), + ("&sqsupe", "\u2292", true), + ("&sqsupset", "\u2290", true), + ("&sqsupseteq", "\u2292", true), + ("&squ", "\u25a1", true), + ("&square", "\u25a1", true), + ("&squarf", "\u25aa", true), + ("&squf", "\u25aa", true), + ("&srarr", "\u2192", true), + ("&sscr", "\U0001d4c8", true), + ("&ssetmn", "\u2216", true), + ("&ssmile", "\u2323", true), + ("&sstarf", "\u22c6", true), + ("&star", "\u2606", true), + ("&starf", "\u2605", true), + ("&straightepsilon", "\u03f5", true), + ("&straightphi", "\u03d5", true), + ("&strns", "\xaf", true), + ("&sub", "\u2282", true), + ("&subE", "\u2ac5", true), + ("&subdot", "\u2abd", true), + ("&sube", "\u2286", true), + ("&subedot", "\u2ac3", true), + ("&submult", "\u2ac1", true), + ("&subnE", "\u2acb", true), + ("&subne", "\u228a", true), + ("&subplus", "\u2abf", true), + ("&subrarr", "\u2979", true), + ("&subset", "\u2282", true), + ("&subseteq", "\u2286", true), + ("&subseteqq", "\u2ac5", true), + ("&subsetneq", "\u228a", true), + ("&subsetneqq", "\u2acb", true), + ("&subsim", "\u2ac7", true), + ("&subsub", "\u2ad5", true), + ("&subsup", "\u2ad3", true), + ("&succ", "\u227b", true), + ("&succapprox", "\u2ab8", true), + ("&succcurlyeq", "\u227d", true), + ("&succeq", "\u2ab0", true), + ("&succnapprox", "\u2aba", true), + 
("&succneqq", "\u2ab6", true), + ("&succnsim", "\u22e9", true), + ("&succsim", "\u227f", true), + ("&sum", "\u2211", true), + ("&sung", "\u266a", true), + ("&sup", "\u2283", true), + ("¹", "\xb9", false), + ("²", "\xb2", false), + ("³", "\xb3", false), + ("&supE", "\u2ac6", true), + ("&supdot", "\u2abe", true), + ("&supdsub", "\u2ad8", true), + ("&supe", "\u2287", true), + ("&supedot", "\u2ac4", true), + ("&suphsol", "\u27c9", true), + ("&suphsub", "\u2ad7", true), + ("&suplarr", "\u297b", true), + ("&supmult", "\u2ac2", true), + ("&supnE", "\u2acc", true), + ("&supne", "\u228b", true), + ("&supplus", "\u2ac0", true), + ("&supset", "\u2283", true), + ("&supseteq", "\u2287", true), + ("&supseteqq", "\u2ac6", true), + ("&supsetneq", "\u228b", true), + ("&supsetneqq", "\u2acc", true), + ("&supsim", "\u2ac8", true), + ("&supsub", "\u2ad4", true), + ("&supsup", "\u2ad6", true), + ("&swArr", "\u21d9", true), + ("&swarhk", "\u2926", true), + ("&swarr", "\u2199", true), + ("&swarrow", "\u2199", true), + ("&swnwar", "\u292a", true), + ("ß", "\xdf", false), + ("&target", "\u2316", true), + ("&tau", "\u03c4", true), + ("&tbrk", "\u23b4", true), + ("&tcaron", "\u0165", true), + ("&tcedil", "\u0163", true), + ("&tcy", "\u0442", true), + ("&tdot", "\u20db", true), + ("&telrec", "\u2315", true), + ("&tfr", "\U0001d531", true), + ("&there4", "\u2234", true), + ("&therefore", "\u2234", true), + ("&theta", "\u03b8", true), + ("&thetasym", "\u03d1", true), + ("&thetav", "\u03d1", true), + ("&thickapprox", "\u2248", true), + ("&thicksim", "\u223c", true), + ("&thinsp", "\u2009", true), + ("&thkap", "\u2248", true), + ("&thksim", "\u223c", true), + ("þ", "\xfe", false), + ("&tilde", "\u02dc", true), + ("×", "\xd7", false), + ("×b", "\u22a0", true), + ("×bar", "\u2a31", true), + ("×d", "\u2a30", true), + ("&tint", "\u222d", true), + ("&toea", "\u2928", true), + ("&top", "\u22a4", true), + ("&topbot", "\u2336", true), + ("&topcir", "\u2af1", true), + ("&topf", "\U0001d565", true), + 
("&topfork", "\u2ada", true), + ("&tosa", "\u2929", true), + ("&tprime", "\u2034", true), + ("&trade", "\u2122", true), + ("&triangle", "\u25b5", true), + ("&triangledown", "\u25bf", true), + ("&triangleleft", "\u25c3", true), + ("&trianglelefteq", "\u22b4", true), + ("&triangleq", "\u225c", true), + ("&triangleright", "\u25b9", true), + ("&trianglerighteq", "\u22b5", true), + ("&tridot", "\u25ec", true), + ("&trie", "\u225c", true), + ("&triminus", "\u2a3a", true), + ("&triplus", "\u2a39", true), + ("&trisb", "\u29cd", true), + ("&tritime", "\u2a3b", true), + ("&trpezium", "\u23e2", true), + ("&tscr", "\U0001d4c9", true), + ("&tscy", "\u0446", true), + ("&tshcy", "\u045b", true), + ("&tstrok", "\u0167", true), + ("&twixt", "\u226c", true), + ("&twoheadleftarrow", "\u219e", true), + ("&twoheadrightarrow", "\u21a0", true), + ("&uArr", "\u21d1", true), + ("&uHar", "\u2963", true), + ("ú", "\xfa", false), + ("&uarr", "\u2191", true), + ("&ubrcy", "\u045e", true), + ("&ubreve", "\u016d", true), + ("û", "\xfb", false), + ("&ucy", "\u0443", true), + ("&udarr", "\u21c5", true), + ("&udblac", "\u0171", true), + ("&udhar", "\u296e", true), + ("&ufisht", "\u297e", true), + ("&ufr", "\U0001d532", true), + ("ù", "\xf9", false), + ("&uharl", "\u21bf", true), + ("&uharr", "\u21be", true), + ("&uhblk", "\u2580", true), + ("&ulcorn", "\u231c", true), + ("&ulcorner", "\u231c", true), + ("&ulcrop", "\u230f", true), + ("&ultri", "\u25f8", true), + ("&umacr", "\u016b", true), + ("¨", "\xa8", false), + ("&uogon", "\u0173", true), + ("&uopf", "\U0001d566", true), + ("&uparrow", "\u2191", true), + ("&updownarrow", "\u2195", true), + ("&upharpoonleft", "\u21bf", true), + ("&upharpoonright", "\u21be", true), + ("&uplus", "\u228e", true), + ("&upsi", "\u03c5", true), + ("&upsih", "\u03d2", true), + ("&upsilon", "\u03c5", true), + ("&upuparrows", "\u21c8", true), + ("&urcorn", "\u231d", true), + ("&urcorner", "\u231d", true), + ("&urcrop", "\u230e", true), + ("&uring", "\u016f", true), + 
("&urtri", "\u25f9", true), + ("&uscr", "\U0001d4ca", true), + ("&utdot", "\u22f0", true), + ("&utilde", "\u0169", true), + ("&utri", "\u25b5", true), + ("&utrif", "\u25b4", true), + ("&uuarr", "\u21c8", true), + ("ü", "\xfc", false), + ("&uwangle", "\u29a7", true), + ("&vArr", "\u21d5", true), + ("&vBar", "\u2ae8", true), + ("&vBarv", "\u2ae9", true), + ("&vDash", "\u22a8", true), + ("&vangrt", "\u299c", true), + ("&varepsilon", "\u03f5", true), + ("&varkappa", "\u03f0", true), + ("&varnothing", "\u2205", true), + ("&varphi", "\u03d5", true), + ("&varpi", "\u03d6", true), + ("&varpropto", "\u221d", true), + ("&varr", "\u2195", true), + ("&varrho", "\u03f1", true), + ("&varsigma", "\u03c2", true), + ("&varsubsetneq", "\u228a\ufe00", true), + ("&varsubsetneqq", "\u2acb\ufe00", true), + ("&varsupsetneq", "\u228b\ufe00", true), + ("&varsupsetneqq", "\u2acc\ufe00", true), + ("&vartheta", "\u03d1", true), + ("&vartriangleleft", "\u22b2", true), + ("&vartriangleright", "\u22b3", true), + ("&vcy", "\u0432", true), + ("&vdash", "\u22a2", true), + ("&vee", "\u2228", true), + ("&veebar", "\u22bb", true), + ("&veeeq", "\u225a", true), + ("&vellip", "\u22ee", true), + ("&verbar", "|", true), + ("&vert", "|", true), + ("&vfr", "\U0001d533", true), + ("&vltri", "\u22b2", true), + ("&vnsub", "\u2282\u20d2", true), + ("&vnsup", "\u2283\u20d2", true), + ("&vopf", "\U0001d567", true), + ("&vprop", "\u221d", true), + ("&vrtri", "\u22b3", true), + ("&vscr", "\U0001d4cb", true), + ("&vsubnE", "\u2acb\ufe00", true), + ("&vsubne", "\u228a\ufe00", true), + ("&vsupnE", "\u2acc\ufe00", true), + ("&vsupne", "\u228b\ufe00", true), + ("&vzigzag", "\u299a", true), + ("&wcirc", "\u0175", true), + ("&wedbar", "\u2a5f", true), + ("&wedge", "\u2227", true), + ("&wedgeq", "\u2259", true), + ("&weierp", "\u2118", true), + ("&wfr", "\U0001d534", true), + ("&wopf", "\U0001d568", true), + ("&wp", "\u2118", true), + ("&wr", "\u2240", true), + ("&wreath", "\u2240", true), + ("&wscr", "\U0001d4cc", true), 
+ ("&xcap", "\u22c2", true), + ("&xcirc", "\u25ef", true), + ("&xcup", "\u22c3", true), + ("&xdtri", "\u25bd", true), + ("&xfr", "\U0001d535", true), + ("&xhArr", "\u27fa", true), + ("&xharr", "\u27f7", true), + ("&xi", "\u03be", true), + ("&xlArr", "\u27f8", true), + ("&xlarr", "\u27f5", true), + ("&xmap", "\u27fc", true), + ("&xnis", "\u22fb", true), + ("&xodot", "\u2a00", true), + ("&xopf", "\U0001d569", true), + ("&xoplus", "\u2a01", true), + ("&xotime", "\u2a02", true), + ("&xrArr", "\u27f9", true), + ("&xrarr", "\u27f6", true), + ("&xscr", "\U0001d4cd", true), + ("&xsqcup", "\u2a06", true), + ("&xuplus", "\u2a04", true), + ("&xutri", "\u25b3", true), + ("&xvee", "\u22c1", true), + ("&xwedge", "\u22c0", true), + ("ý", "\xfd", false), + ("&yacy", "\u044f", true), + ("&ycirc", "\u0177", true), + ("&ycy", "\u044b", true), + ("¥", "\xa5", false), + ("&yfr", "\U0001d536", true), + ("&yicy", "\u0457", true), + ("&yopf", "\U0001d56a", true), + ("&yscr", "\U0001d4ce", true), + ("&yucy", "\u044e", true), + ("ÿ", "\xff", false), + ("&zacute", "\u017a", true), + ("&zcaron", "\u017e", true), + ("&zcy", "\u0437", true), + ("&zdot", "\u017c", true), + ("&zeetrf", "\u2128", true), + ("&zeta", "\u03b6", true), + ("&zfr", "\U0001d537", true), + ("&zhcy", "\u0436", true), + ("&zigrarr", "\u21dd", true), + ("&zopf", "\U0001d56b", true), + ("&zscr", "\U0001d4cf", true), + ("&zwj", "\u200d", true), + ("&zwnj", "\u200c", true), +]; diff --git a/src/libhtml/escape.rs b/src/libhtml/escape.rs new file mode 100644 index 0000000000000..81fa5e000f6aa --- /dev/null +++ b/src/libhtml/escape.rs @@ -0,0 +1,497 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! HTML Escaping +//! 
+//! This module contains `Writer`s for escaping/unescaping HTML. + +use std::io::{Writer, IoResult}; +use std::{char, str}; +use entity::ENTITIES; + +/// A `Writer` adaptor that escapes any HTML characters written to it. +pub struct EscapeWriter { + inner: W, + mode: EscapeMode +} + +/// The mode that controls which characters need escaping. +#[deriving(Eq,Show)] +pub enum EscapeMode { + /// The general-purpose mode. Escapes ``&<>"'`. + EscapeDefault, + /// Escapes characters for text nodes. Escapes `&<>`. + EscapeText, + /// Escapes characters for double-quoted attribute values. Escapes `&"`. + EscapeAttr, + /// Escapes characters for single-quoted attribute values. Escapes `&'`. + EscapeSingleQuoteAttr, + /// Escapes all non-printable or non-ASCII characters, with the exception of U+0000. + EscapeAll +} + +impl EscapeWriter { + /// Creates a new `EscapeWriter` with the given mode. + pub fn new(inner: W, mode: EscapeMode) -> EscapeWriter { + EscapeWriter { + inner: inner, + mode: mode + } + } + + /// Gets a reference to the underlying `Writer`. + pub fn get_ref<'a>(&'a self) -> &'a W { + &self.inner + } + + /// Gets a mutable reference to the underlying `Writer`. + pub fn get_mut_ref<'a>(&'a mut self) -> &'a mut W { + &mut self.inner + } + + /// Unwraps this `EscapeWriter`, returning the underlying writer. + pub fn unwrap(self) -> W { + self.inner + } +} + +impl Writer for EscapeWriter { + fn write(&mut self, bytes: &[u8]) -> IoResult<()> { + if self.mode == EscapeAll { + // This mode needs to operate on chars. Everything else is handled below. 
+ let s = str::from_utf8_lossy(bytes); + let s = s.as_slice(); + let mut last = 0u; + for (i, c) in s.char_indices() { + match c { + '&'|'<'|'>'|'"'|'\'' => (), + '\0' | '\x20'..'\x7E' => continue, + _ => () + } + if last < i { + try!(self.inner.write_str(s.slice(last, i))); + } + match c { + '&'|'<'|'>'|'"'|'\'' => { + let ent = match c { + '&' => "&", + '<' => "<", + '>' => ">", + '"' => """, + '\'' => "'", + _ => unreachable!() + }; + try!(self.inner.write_str(ent)); + } + _ => { + let c = c as u32; + try!(write!(&mut self.inner as &mut ::std::io::Writer, r"&\#x{:x};", c)); + } + } + last = i + char::len_utf8_bytes(c); + } + if last < s.as_slice().len() { + try!(self.inner.write_str(s.slice_from(last))); + } + } else { + // We only want to escape ASCII values, so we can safely operate on bytes + let mut last = 0; + for (i, b) in bytes.iter().enumerate() { + let ent = match (self.mode, *b as char) { + (_,'&') => "&", + (EscapeDefault,'<') |(EscapeText,'<') => "<", + (EscapeDefault,'>') |(EscapeText,'>') => ">", + (EscapeDefault,'\'')|(EscapeSingleQuoteAttr,'\'') => "'", + (EscapeDefault,'"') |(EscapeAttr,'"') => """, + _ => continue + }; + if last < i { + try!(self.inner.write(bytes.slice(last, i))); + } + try!(self.inner.write_str(ent)); + last = i + 1; + } + if last < bytes.len() { + try!(self.inner.write(bytes.slice_from(last))); + } + } + Ok(()) + } + + fn flush(&mut self) -> IoResult<()> { + self.inner.flush() + } +} + +/// A `Writer` adaptor that decodes any HTML entities in the text written to it. +pub struct UnescapeWriter { + state: UnescapeState, + inner: Option, + allowed: Option +} + +enum UnescapeState { + CharData, + Begin, + Named(uint, uint), // index into ENTITIES, and prefix len + HexStart, + Hex(u32), + DecStart, + Dec(u32) +} + +impl UnescapeWriter { + /// Creates a new `UnescapeWriter`. 
+ pub fn new(inner: W) -> UnescapeWriter { + UnescapeWriter { + state: CharData, + inner: Some(inner), + allowed: None + } + } + + /// Creates a new `UnescapeWriter` with the specified allowed additional character. + /// + /// The allowed additional character may occur after `'&'` to indicate that this is not + /// an entity. + pub fn with_allowed_char(inner: W, allowed: char) -> UnescapeWriter { + UnescapeWriter { + state: CharData, + inner: Some(inner), + allowed: Some(allowed) + } + } + + /// Gets a reference to the underlying `Writer`. + /// + /// This type does not expose the ability to get a mutable reference to the + /// underlying `Writer` because that could possibly corrupt the buffer. + pub fn get_ref<'a>(&'a self) -> &'a W { + self.inner.get_ref() + } + + /// Unwraps this `UnescapeWriter`, returning the underlying `Writer`. + /// + /// The `UnescapeWriter` is flushed before returning the `Writer`, but the + /// `Writer` is not flushed. + /// + /// # Failure + /// + /// Fails if the outer flush returns an error. Call `flush_outer()` + /// explicitly to handle this. + pub fn unwrap(mut self) -> W { + self.flush_outer().unwrap(); + self.inner.take_unwrap() + } + + /// Flushes the `UnescapeWriter` without flushing the wrapped `Writer`. + /// + /// If the `UnescapeWriter` is in the middle of parsing an entity + /// reference, it will behave as though EOF were encountered and write the + /// approprite characters. Otherwise, this does nothing. 
+ pub fn flush_outer(&mut self) -> IoResult<()> { + self.abort_entity() + } + + // Called when a character is encountered that isn't valid + fn abort_entity(&mut self) -> IoResult<()> { + let state = self.state; + self.state = CharData; + match state { + CharData => (), + Begin => { + try!(self.inner.get_mut_ref().write_str("&")); + } + Named(cursor, plen) => { + let (name, chars, needs_semi) = ENTITIES[cursor]; + if !needs_semi && name.len() == plen { + try!(self.inner.get_mut_ref().write_str(chars)); + } else { + try!(self.inner.get_mut_ref().write_str(name.slice_to(plen))); + } + } + DecStart => { + try!(self.inner.get_mut_ref().write_str("&#")); + } + HexStart => { + try!(self.inner.get_mut_ref().write_str("&#x")); + } + Hex(val) | Dec(val) => { + let c = match char::from_u32(val) { + None|Some('\0') => '\uFFFD', + Some(c@'\x80'..'\x9F') => { + COMPAT_TABLE[c as uint - 0x80] + } + Some(c) => c + }; + let mut buf = [0u8, ..4]; + let n = c.encode_utf8(buf); + try!(self.inner.get_mut_ref().write(buf.slice_to(n))); + self.state = CharData; + } + } + Ok(()) + } + + fn inner_write(&mut self, bytes: &[u8]) -> IoResult<()> { + match self.inner.get_mut_ref().write(bytes) { + ok@Ok(_) => ok, + err@Err(_) => { + self.state = CharData; + err + } + } + } + + fn inner_write_str(&mut self, s: &str) -> IoResult<()> { + match self.inner.get_mut_ref().write_str(s) { + ok@Ok(_) => ok, + err@Err(_) => { + self.state = CharData; + err + } + } + } +} + +#[unsafe_destructor] +impl Drop for UnescapeWriter { + fn drop(&mut self) { + if self.inner.is_some() { + // Ignore this error, we don't want to fail in Drop + let _ = self.flush_outer(); + } + } +} + +impl Writer for UnescapeWriter { + /// Writes `bytes` to the underlying `Writer`, unescaping any HTML entities. + /// + /// If an error is returned, this `UnescapeWriter` discards its internal state, + /// forgetting any in-progress entities. 
+ fn write(&mut self, bytes: &[u8]) -> IoResult<()> { + let mut it = bytes.iter().enumerate().peekable(); + let mut cdata = 0; + loop { + let (i, b) = match it.peek() { + None => break, + Some(&(i, &b)) => (i, b) + }; + match (self.state, b as char) { + (CharData, '&') => { + it.next(); // consume & + match it.peek().map(|&(_,&b)| b as char) { + None|Some('\x09')|Some('\x0A')|Some('\x0C')| + Some(' ')|Some('<')|Some('&') => { + // This is an allowed character + continue; + } + Some(c) if self.allowed == Some(c) => { + // this is the additional allowed character + continue; + } + _ => () + } + if i > cdata { + try!(self.inner_write(bytes.slice(cdata,i))); + } + self.state = Begin + } + (CharData, _) => { + it.next(); // consume character + } + (Begin, '#') => { + it.next(); // consume # + self.state = match it.peek().map(|&(_,&b)| b as char) { + Some('x') => { + it.next(); // consume x + HexStart + } + _ => DecStart + } + } + (Begin, 'a'..'z')|(Begin, 'A'..'Z') => { + // No entities start with digits, so we don't have to check that + it.next(); // consume character + // Find the first entity that starts with this character + // The array is sorted, so we can binary search. + // Unfortunately there's no existing function to find the "insert location" + // for a key in a sorted vector, so let's implement it now. + let key: &[u8] = [b]; + let mut base: uint = 0; + let mut lim: uint = ENTITIES.len(); + while lim != 0 { + let ix = base + (lim >> 1); + let (name, _, _) = ENTITIES[ix]; + let name = name.slice_from(1); // trim off & + if key > name.as_bytes() { + base = ix + 1; + lim -= 1; + } + // key will never == name, there are no 1-char entities + lim >>= 1; + } + // base contains the insertion index, which is the first element + // with our character as a prefix. + // There's at least one entity that starts with every letter, so we don't + // have to worry about not finding one. 
+ self.state = Named(base, 2); // plen is 2 to include & + } + (Named(cursor, plen), ';') => { + let (name, chars, _) = ENTITIES[cursor]; + if name.len() == plen { + // valid entity + it.next(); // consume ; + try!(self.inner_write_str(chars)); + self.state = CharData; + cdata = i+1; + } else { + try!(self.abort_entity()); + self.state = CharData; + cdata = i; + } + } + (Named(cursor, plen), 'a'..'z') | + (Named(cursor, plen), 'A'..'Z') | + (Named(cursor, plen), '0'..'9') => { + let mut cursor = cursor; + let (name, _, _) = ENTITIES[cursor]; + if name.len() > plen && name[plen] == b { + // existing cursor is still a match + } else { + // search forward to find the next entity with our prefix + let prefix = name.slice_to(plen); + for ix in range(cursor+1, ENTITIES.len()) { + let (name, _, _) = ENTITIES[ix]; + if !name.starts_with(prefix) { + // no match + cursor = -1; + break; + } + if name.len() > plen && name[plen] == b { + cursor = ix; + break; + } + } + } + if cursor == -1 { + // no match + try!(self.abort_entity()); + self.state = CharData; + cdata = i; + } else { + it.next(); // consume character + self.state = Named(cursor, plen+1); + } + } + (HexStart, 'a'..'f')|(HexStart, 'A'..'F')|(HexStart, '0'..'9') => { + self.state = Hex(0); + // don't consume, re-try this digit in the Hex state + } + (DecStart, '0'..'9') => { + self.state = Dec(0); + // don't consume, re-try this digit in the Dec state + } + (Hex(val), '0'..'9') => { + it.next(); // consume character + if val <= char::MAX as u32 { + let digit = (b - '0' as u8) as u32; + self.state = Hex(val*16 + digit); + } + } + (Hex(val), 'a'..'f') => { + it.next(); // consume character + if val <= char::MAX as u32 { + let digit = 10 + (b - 'a' as u8) as u32; + self.state = Hex(val*16 + digit); + } + } + (Hex(val), 'A'..'F') => { + it.next(); // consume character + if val <= char::MAX as u32 { + let digit = 10 + (b - 'A' as u8) as u32; + self.state = Hex(val*16 + digit); + } + } + (Dec(val), '0'..'9') => { + 
it.next(); // consume character + if val <= char::MAX as u32 { + let digit = (b - '0' as u8) as u32; + self.state = Dec(val*10 + digit); + } + } + (Hex(_), ';')|(Dec(_), ';') => { + it.next(); // consume character + // behavior here is identical to aborting, so let's do that + try!(self.abort_entity()); + cdata = i+1; + } + _ => { + // parse error that does not emit characters + try!(self.abort_entity()); + self.state = CharData; + cdata = i; + } + } + } + match self.state { + CharData => { + if cdata < bytes.len() { + try!(self.inner_write(bytes.slice_from(cdata))); + } + } + _ => () + } + Ok(()) + } + + fn flush(&mut self) -> IoResult<()> { + try!(self.flush_outer()); + self.inner.get_mut_ref().flush() + } +} + + + +static COMPAT_TABLE: [char, ..32] = [ + '\u20AC', // First entry is what 0x80 should be replaced with. + '\u0081', + '\u201A', + '\u0192', + '\u201E', + '\u2026', + '\u2020', + '\u2021', + '\u02C6', + '\u2030', + '\u0160', + '\u2039', + '\u0152', + '\u008D', + '\u017D', + '\u008F', + '\u0090', + '\u2018', + '\u2019', + '\u201C', + '\u201D', + '\u2022', + '\u2013', + '\u2014', + '\u02DC', + '\u2122', + '\u0161', + '\u203A', + '\u0153', + '\u009D', + '\u017E', + '\u0178', // Last entry is 0x9F. + // 0x00->'\uFFFD' is handled programmatically. +]; diff --git a/src/libhtml/fmt.rs b/src/libhtml/fmt.rs new file mode 100644 index 0000000000000..a9b3b8a8342a6 --- /dev/null +++ b/src/libhtml/fmt.rs @@ -0,0 +1,43 @@ +// Copyright 2013-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! HTML fmt support +//! +//! This module contains helper structs for HTML escaping/unescaping with +//! format strings. 
+ +use std::fmt; +use escape::{EscapeWriter, UnescapeWriter, EscapeDefault}; + +/// Wrapper struct which will emit the HTML-escaped version of the contained +/// string when passed to a format string. +/// +/// Escapes using the semantics of `html::escape::EscapeDefault`. +pub struct Escape(pub T); + +impl fmt::Show for Escape { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let Escape(ref inner) = *self; + (write!(EscapeWriter::new(fmt.by_ref(), EscapeDefault), "{}", inner)) + .map_err(|_| fmt::WriteError) + } +} + +/// Wrapper struct which will unescape HTML entities in the contained +/// string when passed to a format string. +pub struct Unescape(pub T); + +impl fmt::Show for Unescape { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let Unescape(ref inner) = *self; + let mut w = UnescapeWriter::new(fmt.by_ref()); + (write!(w, "{}", inner)).and_then(|_| w.flush_outer()).map_err(|_| fmt::WriteError) + } +} diff --git a/src/libhtml/lib.rs b/src/libhtml/lib.rs new file mode 100644 index 0000000000000..0472ea928678c --- /dev/null +++ b/src/libhtml/lib.rs @@ -0,0 +1,146 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! HTML Escaping +//! +//! # Examples +//! +//! Escaping some HTML text: +//! +//! ```rust +//! use html; +//! +//! fn main() { +//! let original = "

Dr. Jekyll & Mr. Hyde

"; +//! let escaped = html::escape(original); +//! assert_eq!(escaped.as_slice(), "<p>Dr. Jekyll & Mr. Hyde<p>"); +//! let unescaped = html::unescape(escaped); +//! assert_eq!(unescaped.as_slice(), original); +//! } +//! ``` +//! +//! Or, if you are formatting multiple strings, `html::fmt::Escape` or +//! `html::fmt::Unescape` can be used to reduce allocations, increasing performance. +//! +//! ```rust +//! use html::fmt::Escape; +//! +//! fn main() { +//! println!("

{}

{}

", Escape(""), Escape("in ")); +//! } +//! ``` +//! +//! Finally, `html::escape` has two `Writer` adaptors, `html::escape::EscapeWriter` +//! and `html::escape::UnescapeWriter` that can be used as desired. +//! +//! ```rust +//! use html::escape::UnescapeWriter; +//! use std::io; +//! +//! fn main() { +//! let mut w = UnescapeWriter::new(io::stdout()); +//! let _ = io::copy(&mut io::stdin(), &mut w); +//! } +//! ``` + +#![crate_id = "html#0.11-pre"] +#![license = "MIT/ASL2"] +#![crate_type = "dylib"] +#![crate_type = "rlib"] + +use std::fmt::Show; +use fmt::{Escape, Unescape}; + +pub mod escape; +pub mod fmt; +mod entity; + +/// Returns a new string with special characters escaped as HTML entities. +/// +/// This will escape only 5 characters: `<`, `>`, `&`, `'`, and `"`. +/// `unescape(escape(s)) == s` is always true, but the converse isn't necessarily true. +pub fn escape(s: T) -> ~str { + format!("{}", Escape(s)) +} + +/// Returns a new string with HTML entities transformed to unicode characters. +/// +/// It unescapes a larger range of entities than `escape` produces. For example, +/// `&aacute;` unescapes to "á", as do `&#225;` and `&#xE1;`. +/// `unescape(escape(s)) == s` is always true, but the converse isn't necessarily true.
+pub fn unescape(s: T) -> ~str { + format!("{}", Unescape(s)) +} + +#[cfg(test)] +mod tests { + extern crate test; + use std::fmt; + use super::{escape, unescape}; + + struct Test(StrBuf); + + impl fmt::Show for Test { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let Test(ref s) = *self; + write!(fmt, "{}", s) + } + } + + struct UnTest(&'static str, &'static str); + + impl fmt::Show for UnTest { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let UnTest(s1, s2) = *self; + try!(write!(fmt, "{}", s1)); + write!(fmt, "{}", s2) + } + } + + #[test] + fn test_escape() { + let s = r#"'"#).as_slice(), + "'<script>"&foo;"</script>'"); + let mut w = EscapeWriter::new(MemWriter::new(), EscapeText); + assert!(w.write_str(r#"''"#).is_ok()); + assert_eq!(w.unwrap().unwrap().as_slice(), + r#"'<script>"&foo;"</script>'"#.as_bytes()); + } + + #[test] + fn test_unescape() { + macro_rules! check{ + ($text:expr, $exp:expr) => { + assert_eq!(unescape($text).as_slice(), $exp.as_slice()); + }; + (num: $num:expr, $exp:expr) => {{ + let num = $num; + let exp = $exp; + check!(format!(r"&\#{}", num), exp); + check!(format!(r"&\#{};", num), exp); + check!(format!(r"&\#x{:x}", num), exp); + check!(format!(r"&\#x{:x};", num), exp); + }}; + } + + // check text with no character references + check!("no character references", "no character references"); + // check & followed by invalid chars + check!("&\n&\t& &&", "&\n&\t& &&"); + // check & followed by numbers and letters + check!("&0 &9 &a &0; &9; &a;", "&0 &9 &a &0; &9; &a;"); + // check incomplete entities at the end of the string + for x in ["&", "&#", "&#x", "&#X", "&#y", "&#xy", "&#Xy"].iter() { + check!(x, x); + check!(x+";", x+";"); + } + // check several combinations of numeric character references, + // possibly followed by different characters + // NB: no runtime formatting strings so the loop has been unrolled + for (&num, &c) in [65u32, 97, 34, 38, 0x2603, 0x101234].iter() + .zip(["A", "a", "\"", "&", 
"\u2603", "\U00101234"].iter()) { + let v = [format!(r"&\#{}",num), format!(r"&\#{:07}",num), + format!(r"&\#{};",num), format!(r"&\#{:07};",num), + format!(r"&\#x{:x}",num), format!(r"&\#x{:06x}",num), + format!(r"&\#x{:x};",num), format!(r"&\#x{:06x};",num), + format!(r"&\#x{:X}",num), format!(r"&\#x{:06X}",num), + format!(r"&\#X{:x};",num), format!(r"&\#X{:06x};",num)]; + for s in v.iter() { + check!(s.as_slice(), c); + for end in [" ", "X"].iter() { + check!(*s+*end, c+*end); + } + } + } + // check invalid codepoints + for &cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000].iter() { + check!(num: cp, "\uFFFD"); + } + // check more invalid codepoints + // this test is elided because it's wrong. I don't know why cpython thinks codepoints + // [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff] should return nothing. + // check invalid numbers + for (&num, &c) in [0x0d, 0x80, 0x95, 0x9d].iter() + .zip(["\r", "\u20ac", "\u2022", "\x9d"].iter()) { + check!(num: num, c); + } + // check small numbers + check!(num: 0, "\uFFFD"); + check!(num: 9, "\t"); + // check a big number + check!(num: 1000000000000000000u64, "\uFFFD"); + // check that multiple trailing semicolons are handled correctly + for e in ["";", "";", "";", "";"].iter() { + check!(*e, "\";"); + } + // check that semicolons in the middle don't create problems + for e in [""quot;", ""quot;", ""quot;", ""quot;"].iter() { + check!(*e, "\"quot;"); + } + // check triple adjacent charrefs + for e in [""", """, """, """].iter() { + check!(e.repeat(3), r#"""""#); + check!((*e+";").repeat(3), r#"""""#); + } + // check that the case is respected + for e in ["&", "&", "&", "&"].iter() { + check!(*e, "&"); + } + for e in ["&Amp", "&Amp;"].iter() { + check!(*e, *e); + } + // check that non-existent named entities are returned unchanged + check!("&svadilfari;", "&svadilfari;"); + // the following examples are in the html5 specs + check!("¬it", "¬it"); + check!("¬it;", "¬it;"); + check!("¬in", "¬in"); + check!("∉", 
"∉"); + // a similar example with a long name + check!("¬ReallyAnExistingNamedCharacterReference;", + "¬ReallyAnExistingNamedCharacterReference;"); + // longest valid name + check!("∳", "∳"); + // check a charref that maps to two unicode chars + check!("∾̳", "\u223E\u0333"); + check!("&acE", "&acE"); + // test a large number of entities + check!("{ ".repeat(1050), "{ ".repeat(1050)); + // check some html5 entities + check!("ÉricÉric&alphacentauriαcentauri", + "ÉricÉric&alphacentauriαcentauri"); + check!("&co;", "&co;"); + } +} + +#[bench] +fn bench_escape(b: &mut test::Bencher) { + let s = "'"#).as_slice(), - "'<script>"&foo;"</script>'"); + "'<script>"&foo;"</script>'"); let mut w = EscapeWriter::new(MemWriter::new(), EscapeText); assert!(w.write_str(r#"''"#).is_ok()); assert_eq!(w.unwrap().unwrap().as_slice(), From fff0b8ea9296b57727807b74cf2f52bbb3097f62 Mon Sep 17 00:00:00 2001 From: Kevin Ballard Date: Sat, 17 May 2014 13:55:11 -0700 Subject: [PATCH 6/6] Add tests for the various escaping modes --- src/libhtml/tests.rs | 69 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/libhtml/tests.rs b/src/libhtml/tests.rs index cd5580694028a..c8c09d0d44b99 100644 --- a/src/libhtml/tests.rs +++ b/src/libhtml/tests.rs @@ -10,7 +10,10 @@ extern crate test; use std::fmt; +use std::io::MemWriter; use super::{escape, unescape}; +use super::escape::{EscapeWriter, UnescapeWriter}; +use super::escape::{EscapeDefault, EscapeText, EscapeAttr, EscapeSingleQuoteAttr}; struct Test(StrBuf); @@ -60,6 +63,72 @@ fn test_unescape() { assert_eq!(unescape("• &#XYZ;"), "\u2022 &#XYZ;".to_owned()); // this next escape overflows a u64. WebKit incorrectly treats this as • assert_eq!(unescape("�"), "\uFFFD".to_owned()); + assert_eq!(unescape("€Ÿ�"), "\u20AC\x81\u0178\uFFFD".to_owned()); +} + +macro_rules! 
escape_test{ + ($mode:ident, $input:expr, $result:expr) => {{ + use std::path::BytesContainer; + let mode = concat_idents!(Escape, $mode); + let mut w = EscapeWriter::new(MemWriter::new(), mode); + w.write($input.container_as_bytes()).unwrap(); + let v = w.unwrap().unwrap(); + // provide better errors by comparing strings when possible + let result = $result; + match (StrBuf::from_utf8(v), result.container_as_str()) { + (Ok(s), Some(res)) => assert_eq!(s.as_slice(), res), + (Ok(s), None) => assert_eq!(s.as_bytes(), result.container_as_bytes()), + (Err(v), _) => assert_eq!(v.as_slice(), result.container_as_bytes()) + } + }} +} + +#[test] +fn test_escapewriter_default() { + escape_test!(Default, "<>&\"'abc()\u2022", "<>&"'abc()\u2022"); + escape_test!(Default, "", ""); + escape_test!(Default, bytes!(0, 1, 0x80, "\x80"), bytes!(0, 1, 0x80, "\x80")); +} + +#[test] +fn test_escapewriter_text() { + escape_test!(Text, "<>&\"'abc()\u2022", "<>&\"'abc()\u2022"); + escape_test!(Text, "", ""); + escape_test!(Text, bytes!(0, 1, 0x80, "\x80"), bytes!(0, 1, 0x80, "\x80")); +} + +#[test] +fn test_escapewriter_attr() { + escape_test!(Attr, "<>&\"'abc()\u2022", "<>&"'abc()\u2022"); + escape_test!(Attr, "", ""); + escape_test!(Attr, bytes!(0, 1, 0x80, "\x80"), bytes!(0, 1, 0x80, "\x80")); +} + +#[test] +fn test_escapewriter_singlequote_attr() { + escape_test!(SingleQuoteAttr, "<>&\"'abc()\u2022", "<>&\"'abc()\u2022"); + escape_test!(SingleQuoteAttr, "", ""); + escape_test!(SingleQuoteAttr, bytes!(0, 1, 0x80, "\x80"), bytes!(0, 1, 0x80, "\x80")); +} + +#[test] +fn test_roundtrip_writer() { + let mut w = EscapeWriter::new(MemWriter::new(), EscapeDefault); + w.write_str("<>&\"'abc()\u2022").unwrap(); + w.write(bytes!(0, 1, 0x80, "\x80")).unwrap(); + let v = w.unwrap().unwrap(); + let mut w = UnescapeWriter::new(MemWriter::new()); + w.write(v.as_slice()).unwrap(); + let v = w.unwrap().unwrap(); + assert_eq!(v.as_slice(), bytes!("<>&\"'abc()\u2022", 0, 1, 0x80, "\x80")); +} + +#[test] 
+fn test_unescapewriter_with_allowed_char() { + let mut w = UnescapeWriter::with_allowed_char(MemWriter::new(), 'q'); + w.write_str("<>"").unwrap(); + let v = w.unwrap().unwrap(); + assert_eq!(v.as_slice(), "<>"".as_bytes()); } // Tests from python's html module