Skip to content

Commit 46d5e1a

Browse files
author
Peter Bengtsson
authored
Correct translation content from reusables too (#37722)
1 parent dc54233 commit 46d5e1a

File tree

8 files changed

+158
-94
lines changed

8 files changed

+158
-94
lines changed

lib/correct-translation-content.js

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/**
2+
* A lot of translations have minor corruptions that will lead to rendering
3+
* failing (and having to rely on English fallback). Many of these are
4+
* easy to manually correct for.
5+
*
6+
* This function is a temporary solution to correct for these corruptions.
7+
* It looks for easy "low hanging fruit" that we can correct for.
8+
*
9+
*/
10+
export function correctTranslatedContentStrings(content, englishContent, debug = false) {
11+
// A lot of translations have corruptions around the AUTOTITLE links.
12+
// We've requested that these are corrected back but as a temporary
13+
// solution we'll manually recover now.
14+
// See internal issue #2762
15+
// In late 2023, search in the translations repos if these things are
16+
// still happening and if not, the following lines can be removed.
17+
content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]')
18+
content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]')
19+
content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]')
20+
content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](')
21+
22+
// A lot of Liquid tags lose their linebreak after the `}` which can
23+
// result in formatting problems, especially around Markdown tables.
24+
// This code here, compares each Liquid statement, in the translation,
25+
// and tests if it appears like that but with a newline in the English.
26+
// English example:
27+
//
28+
// {%- ifversion ghes %}
29+
// | Thing | ✔️ |
30+
// {%- endif %}
31+
//
32+
// Translation example:
33+
//
34+
// {%- ifversion ghes %} | Thing | ✔️ | {%- endif %}
35+
//
36+
// There exists the risk that different Liquid statements gets compared
37+
// different Liquid statements in the English, but the risk is worth
38+
// taking because even if this accidentally introduces a newline, it's
39+
// unlikely to cause a problem. At worst that a sentence displays on its
40+
// own paragraph.
41+
content = content.replace(/\{%(.+?)%\} /g, (match) => {
42+
if (match.lastIndexOf('{%') > 0) {
43+
// For example:
44+
//
45+
// `{% bla bla %}, and {% foo bar %} `
46+
//
47+
// Our regex is not greedy, but technically, if you look closely
48+
// you'll see this is the first match that starts with `{%` and
49+
// ends with `%} `. Let's skip these.
50+
return match
51+
}
52+
53+
const withLinebreak = match.slice(0, -1) + '\n'
54+
if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) {
55+
return withLinebreak
56+
}
57+
return match
58+
})
59+
// The above corrections deepend on looking for `{% foo %} ` and replacing
60+
// it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English
61+
// content and `{% foo %} ` was *not*.
62+
// However we see a lot of cases of this:
63+
//
64+
// ... {% endif %} | First Column ...
65+
//
66+
// Which needs to become this:
67+
//
68+
// ... {% endif %}
69+
// | First Column ...
70+
//
71+
// And since `{% endif %}` is such a common Liquid tag we can't reply
72+
// on lookig for it with `{% endif %}\n` in the English content.
73+
content = content.replace(/\{% endif %\} \| /g, (match) => {
74+
const potentiallyBetter = '{% endif %}\n| '
75+
if (englishContent.includes(potentiallyBetter)) {
76+
return potentiallyBetter
77+
}
78+
return match
79+
})
80+
81+
// All too often we see translations that look like this:
82+
//
83+
// | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %}
84+
//
85+
// Yes, that's one very long line. Notice how all the necessary linebreaks
86+
// are suddenly gone.
87+
content = content.replaceAll(' | | ', ' |\n| ')
88+
89+
return content
90+
}

lib/get-data.js

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import matter from 'gray-matter'
66
import { merge, get } from 'lodash-es'
77

88
import languages from './languages.js'
9+
import { correctTranslatedContentStrings } from './correct-translation-content.js'
910

1011
// If you run `export DEBUG_JIT_DATA_READS=true` in your terminal,
1112
// next time it will mention every file it reads from disk.
@@ -175,7 +176,33 @@ function getDataByDir(dottedPath, dir, englishRoot) {
175176
fullPath.push(...split)
176177
fullPath.push(`${nakedname}.md`)
177178
const markdown = getMarkdownContent(dir, fullPath.join(path.sep), englishRoot)
178-
return matter(markdown).content
179+
let { content } = matter(markdown)
180+
if (dir !== englishRoot) {
181+
// If we're reading a translation, we need to replace the possible
182+
// corruptions. For example `[AUTOTITLE"을](/foo/bar)`.
183+
// To do this we'll need the English equivalent
184+
let englishContent = content
185+
try {
186+
englishContent = getMarkdownContent(englishRoot, fullPath.join(path.sep), englishRoot)
187+
} catch (error) {
188+
// In some real but rare cases a reusable doesn't exist in English.
189+
// At all.
190+
// This can happen when the translation is really out of date.
191+
// You might have an old `docs-internal.locale/content/**/*.md`
192+
// file that mentions `{% data reusables.foo.bar %}`. And it's
193+
// working fine, except none of that exists in English.
194+
// If this is the case, we still want to executed the
195+
// correctTranslatedContentStrings() function, but we can't
196+
// genuinely give it the English equivalent content, which it
197+
// sometimes uses to correct some Liquid tags. At least other
198+
// good corrections might happen.
199+
if (error.code !== 'ENOENT') {
200+
throw error
201+
}
202+
}
203+
content = correctTranslatedContentStrings(content, englishContent)
204+
}
205+
return content
179206
}
180207

181208
// E.g. {% data ui.pages.foo.bar %}

lib/page-data.js

Lines changed: 1 addition & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import nonEnterpriseDefaultVersion from './non-enterprise-default-version.js'
77
import readFileContents from './read-file-contents.js'
88
import Page from './page.js'
99
import frontmatterSchema from './frontmatter.js'
10+
import { correctTranslatedContentStrings } from './correct-translation-content.js'
1011

1112
// If you run `export DEBUG_TRANSLATION_FALLBACKS=true` in your terminal,
1213
// every time a translation file fails to initialize we fall back to English
@@ -181,97 +182,6 @@ async function translateTree(dir, langObj, enTree) {
181182
return item
182183
}
183184

184-
/**
185-
* A lot of translations have minor corruptions that will lead to rendering
186-
* failing (and having to rely on English fallback). Many of these are
187-
* easy to manually correct for.
188-
*
189-
* This function is a temporary solution to correct for these corruptions.
190-
* It looks for easy "low hanging fruit" that we can correct for.
191-
*
192-
*/
193-
export function correctTranslatedContentStrings(content, englishContent, debug = false) {
194-
// A lot of translations have corruptions around the AUTOTITLE links.
195-
// We've requested that these are corrected back but as a temporary
196-
// solution we'll manually recover now.
197-
// See internal issue #2762
198-
// In late 2023, search in the translations repos if these things are
199-
// still happening and if not, the following lines can be removed.
200-
content = content.replaceAll('[AUTOTITLE"을 참조하세요]', '[AUTOTITLE]')
201-
content = content.replaceAll('[AUTOTITLE"을]', '[AUTOTITLE]')
202-
content = content.replaceAll('["AUTOTITLE]', '"[AUTOTITLE]')
203-
content = content.replaceAll('[AUTOTITLE"을 참조하세요.](', '[AUTOTITLE](')
204-
205-
// A lot of Liquid tags lose their linebreak after the `}` which can
206-
// result in formatting problems, especially around Markdown tables.
207-
// This code here, compares each Liquid statement, in the translation,
208-
// and tests if it appears like that but with a newline in the English.
209-
// English example:
210-
//
211-
// {%- ifversion ghes %}
212-
// | Thing | ✔️ |
213-
// {%- endif %}
214-
//
215-
// Translation example:
216-
//
217-
// {%- ifversion ghes %} | Thing | ✔️ | {%- endif %}
218-
//
219-
// There exists the risk that different Liquid statements gets compared
220-
// different Liquid statements in the English, but the risk is worth
221-
// taking because even if this accidentally introduces a newline, it's
222-
// unlikely to cause a problem. At worst that a sentence displays on its
223-
// own paragraph.
224-
content = content.replace(/\{%(.+?)%\} /g, (match) => {
225-
if (match.lastIndexOf('{%') > 0) {
226-
// For example:
227-
//
228-
// `{% bla bla %}, and {% foo bar %} `
229-
//
230-
// Our regex is not greedy, but technically, if you look closely
231-
// you'll see this is the first match that starts with `{%` and
232-
// ends with `%} `. Let's skip these.
233-
return match
234-
}
235-
236-
const withLinebreak = match.slice(0, -1) + '\n'
237-
if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) {
238-
return withLinebreak
239-
}
240-
return match
241-
})
242-
// The above corrections deepend on looking for `{% foo %} ` and replacing
243-
// it with `{% foo %}\n`. ...if `{% foo %}\n` was in the English
244-
// content and `{% foo %} ` was *not*.
245-
// However we see a lot of cases of this:
246-
//
247-
// ... {% endif %} | First Column ...
248-
//
249-
// Which needs to become this:
250-
//
251-
// ... {% endif %}
252-
// | First Column ...
253-
//
254-
// And since `{% endif %}` is such a common Liquid tag we can't reply
255-
// on lookig for it with `{% endif %}\n` in the English content.
256-
content = content.replace(/\{% endif %\} \| /g, (match) => {
257-
const potentiallyBetter = '{% endif %}\n| '
258-
if (englishContent.includes(potentiallyBetter)) {
259-
return potentiallyBetter
260-
}
261-
return match
262-
})
263-
264-
// All too often we see translations that look like this:
265-
//
266-
// | Qualifizierer | Beschreibung | | -------- | -------- | {% ifversion ghec or ghes > 3.8 %} | `advanced-security:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_GH_advanced_security %} aktiviert wurde | {% endif %} | `code-scanning-pull-request-alerts:enabled`| Zeigt Repositorys an, für die die {% data variables.product.prodname_code_scanning %} zur Ausführung bei Pull Requests konfiguriert wurde | | `dependabot-security-updates:enabled` | Zeigt Repositorys an, für die {% data variables.product.prodname_dependabot %}-Sicherheitsupdates aktiviert wurden | | `secret-scanning-push-protection:enabled` | Zeigt Repositorys an, für die der Pushschutz für die {% data variables.product.prodname_secret_scanning %} aktiviert wurde | {% endif %}
267-
//
268-
// Yes, that's one very long line. Notice how all the necessary linebreaks
269-
// are suddenly gone.
270-
content = content.replaceAll(' | | ', ' |\n| ')
271-
272-
return content
273-
}
274-
275185
/**
276186
* The siteTree is a nested object with pages for every language and version, useful for nav because it
277187
* contains parent, child, and sibling relationships:

middleware/contextualizers/glossaries.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { getDataByLanguage } from '../../lib/get-data.js'
22
import liquid from '../../lib/render-content/liquid.js'
33
import { executeWithFallback } from '../../lib/render-with-fallback.js'
4-
import { correctTranslatedContentStrings } from '../../lib/page-data.js'
4+
import { correctTranslatedContentStrings } from '../../lib/correct-translation-content.js'
55

66
export default async function glossaries(req, res, next) {
77
if (!req.pagePath.endsWith('get-started/quickstart/github-glossary')) return next()

tests/fixtures/content/get-started/quickstart/hello-world.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,7 @@ like "Enterprise Server X.Y". It should change the above sentence.
3030
"[AUTOTITLE](/get-started/quickstart/dynamic-title)"
3131

3232
"[AUTOTITLE](/get-started/foo/cross-version-linking)"
33+
34+
## Use of a reusable that might have auto-title links
35+
36+
{% data reusables.gated-features.more-info %}

tests/fixtures/translations/ja-jp/content/get-started/quickstart/hello-world.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,15 @@ intro: 'この Hello World 演習に従って、{% data variables.product.produc
1111
1212
## 可変タイトルのページへのリンク
1313

14-
"[AUTOTITLE](/get-started/quickstart/dynamic-title)"
14+
Hi, if you look carefully at this next line, the way the auto-title keyword
15+
is used, is wrong. The quotation is inside the square bracket! Naughty.
16+
But this is a common occurrence in translations and we need to smartly
17+
recover from it.
18+
19+
["AUTOTITLE](/get-started/quickstart/dynamic-title)"
1520

1621
"[AUTOTITLE](/get-started/foo/cross-version-linking)"
22+
23+
## Use of a reusable that might have auto-title links
24+
25+
{% data reusables.gated-features.more-info %}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{% ifversion fpt or ghec %}詳細については, see ["AUTOTITLE](/get-started/quickstart/hello-world)."{% endif %}

tests/rendering-fixtures/translations.js

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,27 @@ describe('translations', () => {
8888
expect(tds[1]).toBe('Present')
8989
}
9090
})
91+
92+
test('automatic correction of bad AUTOTITLE in reusables', async () => {
93+
const $ = await getDOM('/ja/get-started/quickstart/hello-world')
94+
const links = $('#article-contents a[href]')
95+
const texts = links.map((i, element) => $(element).text()).get()
96+
// That Japanese page uses AUTOTITLE links. Both in the main `.md` file
97+
// but also inside a reusable.
98+
// E.g. `["AUTOTITLE](/get-started/quickstart/hello-world)."`
99+
// If we didn't do the necessary string corrections on translations'
100+
// content and reusables what *would* remain is a HTML link that
101+
// would look like this:
102+
//
103+
// <a href="/ja/get-started/quickstart/hello-world">&quot;AUTOTITLE</a>
104+
//
105+
// This test makes sure no such string is left in any of the article
106+
// content links.
107+
// Note that, in English, it's not acceptable to have such a piece of
108+
// Markdown. It would not be let into `main` by our CI checks. But
109+
// by their nature, translations are not checked by CI in the same way.
110+
// Its "flaws" have to be corrected at runtime.
111+
const stillAutotitle = texts.filter((text) => /autotitle/i.test(text))
112+
expect(stillAutotitle.length).toBe(0)
113+
})
91114
})

0 commit comments

Comments
 (0)