Skip to content

Commit 11bce6a

Browse files
authored
(enh) private __emitTokens callback to allow custom grammar parsers (#3620)
* private __emitTokens API * remove addKeyword from Emitter API * use language: namespace scope prefix to handle sublanguages * add emitTokens docs
1 parent aa58ffa commit 11bce6a

File tree

5 files changed

+114
-51
lines changed

5 files changed

+114
-51
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
Improvements:
44

55
- added a function to default export to generate a fresh highlighter instance to be used by extensions [WisamMechano][]
6+
- added BETA `__emitTokens` key to grammars to allow them to direct their own parsing, only using Highlight.js for the HTML rendering [Josh Goebel][]
67

78
New Grammars:
89

@@ -24,6 +25,7 @@ Parser:
2425

2526
- add removePlugin api [faga295][]
2627

28+
[Josh Goebel]: https://github.com/joshgoebel
2729
[Timur Kamaev]: https://github.com/doiftrue
2830
[Leopard20]: https://github.com/Leopard20/
2931
[WisamMechano]: https://github.com/wisammechano

docs/mode-reference.rst

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,52 @@ Disables autodetection for this language.
105105
(defaults to false, meaning auto-detect is enabled)
106106

107107

108+
__emitTokens
109+
^^^^^^^^^^^^
110+
111+
.. warning::
112+
113+
**This is currently still private/beta API**, though it's expected to be fairly stable.
114+
115+
It should land in version 12.0.
116+
117+
Allows grammars to bundle custom parsers - bypassing the default parser and grammar mode definitions. This should be a function that accepts the raw source code as the first argument and an "Emitter" object as the second.
118+
119+
A custom parser may parse the source as it sees fit - making calls to the Emitter along the way - allowing Highlight.js to generate and theme the final HTML.
120+
121+
The **Emitter** API is trivial:
122+
123+
- ``addText(text)``
124+
- ``startScope(name)``
125+
- ``endScope()``
126+
127+
Given:
128+
129+
::
130+
131+
hello beautiful world!
132+
133+
134+
Assuming ``beautiful`` is a keyword, our Emitter calls might look something like:
135+
136+
::
137+
138+
addText("hello ")
139+
startScope("keyword")
140+
addText("beautiful")
141+
endScope()
142+
addText(" world!")
143+
144+
Resulting in the following generated HTML:
145+
146+
.. code-block:: html
147+
148+
hello <span class="hljs-keyword">beautiful</span> world!
149+
150+
.. note::
151+
152+
The intended use of ``addText`` is larger chunks of plain text, not individual characters. Custom parsers should buffer plain text output into complete strings rather than sending output one character at a time.
153+
108154
compilerExtensions
109155
^^^^^^^^^^^^^^^^^^
110156

src/highlight.js

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import * as packageJSON from '../package.json';
1515
import * as logger from "./lib/logger.js";
1616
import HTMLInjectionError from "./lib/html_injection_error.js";
1717

18+
1819
/**
1920
@typedef {import('highlight.js').Mode} Mode
2021
@typedef {import('highlight.js').CompiledMode} CompiledMode
@@ -224,7 +225,7 @@ const HLJS = function(hljs) {
224225
buf += match[0];
225226
} else {
226227
const cssClass = language.classNameAliases[kind] || kind;
227-
emitter.addKeyword(match[0], cssClass);
228+
emitKeyword(match[0], cssClass);
228229
}
229230
} else {
230231
buf += match[0];
@@ -259,7 +260,7 @@ const HLJS = function(hljs) {
259260
if (top.relevance > 0) {
260261
relevance += result.relevance;
261262
}
262-
emitter.addSublanguage(result._emitter, result.language);
263+
emitter.__addSublanguage(result._emitter, result.language);
263264
}
264265

265266
function processBuffer() {
@@ -271,6 +272,18 @@ const HLJS = function(hljs) {
271272
modeBuffer = '';
272273
}
273274

275+
/**
276+
* @param {string} keyword
277+
* @param {string} scope
278+
*/
279+
function emitKeyword(keyword, scope) {
280+
if (keyword === "") return;
281+
282+
emitter.startScope(scope);
283+
emitter.addText(keyword);
284+
emitter.endScope();
285+
}
286+
274287
/**
275288
* @param {CompiledScope} scope
276289
* @param {RegExpMatchArray} match
@@ -283,7 +296,7 @@ const HLJS = function(hljs) {
283296
const klass = language.classNameAliases[scope[i]] || scope[i];
284297
const text = match[i];
285298
if (klass) {
286-
emitter.addKeyword(text, klass);
299+
emitKeyword(text, klass);
287300
} else {
288301
modeBuffer = text;
289302
processKeywords();
@@ -304,7 +317,7 @@ const HLJS = function(hljs) {
304317
if (mode.beginScope) {
305318
// beginScope just wraps the begin match itself in a scope
306319
if (mode.beginScope._wrap) {
307-
emitter.addKeyword(modeBuffer, language.classNameAliases[mode.beginScope._wrap] || mode.beginScope._wrap);
320+
emitKeyword(modeBuffer, language.classNameAliases[mode.beginScope._wrap] || mode.beginScope._wrap);
308321
modeBuffer = "";
309322
} else if (mode.beginScope._multi) {
310323
// at this point modeBuffer should just be the match
@@ -415,7 +428,7 @@ const HLJS = function(hljs) {
415428
const origin = top;
416429
if (top.endScope && top.endScope._wrap) {
417430
processBuffer();
418-
emitter.addKeyword(lexeme, top.endScope._wrap);
431+
emitKeyword(lexeme, top.endScope._wrap);
419432
} else if (top.endScope && top.endScope._multi) {
420433
processBuffer();
421434
emitMultiClass(top.endScope, match);
@@ -558,30 +571,34 @@ const HLJS = function(hljs) {
558571
let resumeScanAtSamePosition = false;
559572

560573
try {
561-
top.matcher.considerAll();
562-
563-
for (;;) {
564-
iterations++;
565-
if (resumeScanAtSamePosition) {
566-
// only regexes not matched previously will now be
567-
// considered for a potential match
568-
resumeScanAtSamePosition = false;
569-
} else {
570-
top.matcher.considerAll();
571-
}
572-
top.matcher.lastIndex = index;
574+
if (!language.__emitTokens) {
575+
top.matcher.considerAll();
576+
577+
for (;;) {
578+
iterations++;
579+
if (resumeScanAtSamePosition) {
580+
// only regexes not matched previously will now be
581+
// considered for a potential match
582+
resumeScanAtSamePosition = false;
583+
} else {
584+
top.matcher.considerAll();
585+
}
586+
top.matcher.lastIndex = index;
573587

574-
const match = top.matcher.exec(codeToHighlight);
575-
// console.log("match", match[0], match.rule && match.rule.begin)
588+
const match = top.matcher.exec(codeToHighlight);
589+
// console.log("match", match[0], match.rule && match.rule.begin)
576590

577-
if (!match) break;
591+
if (!match) break;
578592

579-
const beforeMatch = codeToHighlight.substring(index, match.index);
580-
const processedCount = processLexeme(beforeMatch, match);
581-
index = match.index + processedCount;
593+
const beforeMatch = codeToHighlight.substring(index, match.index);
594+
const processedCount = processLexeme(beforeMatch, match);
595+
index = match.index + processedCount;
596+
}
597+
processLexeme(codeToHighlight.substring(index));
598+
} else {
599+
language.__emitTokens(codeToHighlight, emitter);
582600
}
583-
processLexeme(codeToHighlight.substring(index));
584-
emitter.closeAllNodes();
601+
585602
emitter.finalize();
586603
result = emitter.toHTML();
587604

src/lib/html_renderer.js

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ const SPAN_CLOSE = '</span>';
2121
const emitsWrappingTags = (node) => {
2222
// rarely we can have a sublanguage where language is undefined
2323
// TODO: track down why
24-
return !!node.scope || (node.sublanguage && node.language);
24+
return !!node.scope;
2525
};
2626

2727
/**
@@ -30,13 +30,19 @@ const emitsWrappingTags = (node) => {
3030
* @param {{prefix:string}} options
3131
*/
3232
const scopeToCSSClass = (name, { prefix }) => {
33+
// sub-language
34+
if (name.startsWith("language:")) {
35+
return name.replace("language:", "language-");
36+
}
37+
// tiered scope: comment.line
3338
if (name.includes(".")) {
3439
const pieces = name.split(".");
3540
return [
3641
`${prefix}${pieces.shift()}`,
3742
...(pieces.map((x, i) => `${x}${"_".repeat(i + 1)}`))
3843
].join(" ");
3944
}
45+
// simple scope
4046
return `${prefix}${name}`;
4147
};
4248

@@ -69,12 +75,8 @@ export default class HTMLRenderer {
6975
openNode(node) {
7076
if (!emitsWrappingTags(node)) return;
7177

72-
let className = "";
73-
if (node.sublanguage) {
74-
className = `language-${node.language}`;
75-
} else {
76-
className = scopeToCSSClass(node.scope, { prefix: this.classPrefix });
77-
}
78+
const className = scopeToCSSClass(node.scope,
79+
{ prefix: this.classPrefix });
7880
this.span(className);
7981
}
8082

src/lib/token_tree.js

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,11 @@ class TokenTree {
106106
107107
Minimal interface:
108108
109-
- addKeyword(text, scope)
110109
- addText(text)
111-
- addSublanguage(emitter, subLanguageName)
110+
- __addSublanguage(emitter, subLanguageName)
111+
- startScope(scope)
112+
- endScope()
112113
- finalize()
113-
- openNode(scope)
114-
- closeNode()
115-
- closeAllNodes()
116114
- toHTML()
117115
118116
*/
@@ -131,34 +129,31 @@ export default class TokenTreeEmitter extends TokenTree {
131129

132130
/**
133131
* @param {string} text
134-
* @param {string} scope
135132
*/
136-
addKeyword(text, scope) {
133+
addText(text) {
137134
if (text === "") { return; }
138135

139-
this.openNode(scope);
140-
this.addText(text);
141-
this.closeNode();
136+
this.add(text);
142137
}
143138

144-
/**
145-
* @param {string} text
146-
*/
147-
addText(text) {
148-
if (text === "") { return; }
139+
/** @param {string} scope */
140+
startScope(scope) {
141+
this.openNode(scope);
142+
}
149143

150-
this.add(text);
144+
endScope() {
145+
this.closeNode();
151146
}
152147

153148
/**
154149
* @param {Emitter & {root: DataNode}} emitter
155150
* @param {string} name
156151
*/
157-
addSublanguage(emitter, name) {
152+
__addSublanguage(emitter, name) {
158153
/** @type DataNode */
159154
const node = emitter.root;
160-
node.sublanguage = true;
161-
node.language = name;
155+
if (name) node.scope = `language:${name}`;
156+
162157
this.add(node);
163158
}
164159

@@ -168,6 +163,7 @@ export default class TokenTreeEmitter extends TokenTree {
168163
}
169164

170165
finalize() {
166+
this.closeAllNodes();
171167
return true;
172168
}
173169
}

0 commit comments

Comments
 (0)