From a1df99fff007cb8835842bc4961bd27b9febf7a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 20:37:43 +0200 Subject: [PATCH 01/13] Generate a llms-full version of the docs --- bun.lock | 37 +++- .../[siteData]/llms-full.txt/route.ts | 14 ++ packages/gitbook/package.json | 7 + packages/gitbook/src/lib/urls.ts | 14 ++ packages/gitbook/src/routes/llms-full.ts | 185 ++++++++++++++++++ 5 files changed, 254 insertions(+), 3 deletions(-) create mode 100644 packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts create mode 100644 packages/gitbook/src/routes/llms-full.ts diff --git a/bun.lock b/bun.lock index cf33dddcca..f4e3bfd06b 100644 --- a/bun.lock +++ b/bun.lock @@ -80,8 +80,13 @@ "jwt-decode": "^4.0.0", "katex": "^0.16.9", "mathjax": "^3.2.2", + "mdast-util-from-markdown": "^2.0.2", + "mdast-util-frontmatter": "^2.0.1", + "mdast-util-gfm": "^3.1.0", "mdast-util-to-markdown": "^2.1.2", "memoizee": "^0.4.17", + "micromark-extension-frontmatter": "^2.0.0", + "micromark-extension-gfm": "^3.0.0", "next": "14.2.26", "next-themes": "^0.2.1", "nuqs": "^2.2.3", @@ -104,6 +109,8 @@ "tailwind-merge": "^2.2.0", "tailwind-shades": "^1.1.2", "unified": "^11.0.5", + "unist-util-remove": "^4.0.0", + "unist-util-visit": "^5.0.0", "url-join": "^5.0.0", "usehooks-ts": "^3.1.0", "zod": "^3.24.2", @@ -1809,6 +1816,8 @@ "fastq": ["fastq@1.17.1", "", { "dependencies": { "reusify": "^1.0.4" } }, "sha512-sRVD3lWVIXWg6By68ZN7vho9a1pQcN/WBFaAAsDDFzlJjvoGx0P8z7V1t72grFJfJhu3YPZBuu25f7Kaw2jN1w=="], + "fault": ["fault@2.0.1", "", { "dependencies": { "format": "^0.2.0" } }, "sha512-WtySTkS4OKev5JtpHXnib4Gxiurzh5NCGvWrFaZ34m6JehfTUhKZvn9njTfw48t6JumVQOmrKqpmGcdwxnhqBQ=="], + "fd-slicer": ["fd-slicer@1.1.0", "", { "dependencies": { "pend": "~1.2.0" } }, "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g=="], "fdir": ["fdir@6.4.3", "", { "peerDependencies": { "picomatch": "^3 || ^4" }, 
"optionalPeers": ["picomatch"] }, "sha512-PMXmW2y1hDDfTSRc9gaXIuCCRpuoz3Kaz8cUelp3smouvfT632ozg2vrT6lJsHKKOF59YLbOGfAWGUcKEfRMQw=="], @@ -1835,6 +1844,8 @@ "form-data": ["form-data@4.0.1", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "mime-types": "^2.1.12" } }, "sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw=="], + "format": ["format@0.2.2", "", {}, "sha512-wzsgA6WOq+09wrU1tsJ09udeR/YZRaeArL9e1wPbFg3GG2yDnC2ldKpxs4xunpFF9DgqCqOIra3bc1HWrJ37Ww=="], + "forwarded": ["forwarded@0.2.0", "", {}, "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow=="], "fraction.js": ["fraction.js@4.3.7", "", {}, "sha512-ZsDfxO51wGAXREY55a7la9LScWpwv9RxIrYABrlvOFBlH/ShPnrtsXeuUIfXKKOVicNxQ+o8JTbJvjS4M89yew=="], @@ -2181,9 +2192,11 @@ "mdast-util-find-and-replace": ["mdast-util-find-and-replace@3.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "escape-string-regexp": "^5.0.0", "unist-util-is": "^6.0.0", "unist-util-visit-parents": "^6.0.0" } }, "sha512-SG21kZHGC3XRTSUhtofZkBzZTJNM5ecCi0SK2IMKmSXR8vO3peL+kb1O0z7Zl83jKtutG4k5Wv/W7V3/YHvzPA=="], - "mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-aJEUyzZ6TzlsX2s5B4Of7lN7EQtAxvtradMMglCQDyaTFgse6CmtmdJ15ElnVRlCg1vpNyVtbem0PWzlNieZsA=="], + "mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.2", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": 
"^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-uZhTV/8NBuw0WHkPTrCqDOl0zVe1BIng5ZtHoDk49ME1qqcjYmmLmOf0gELgcRMxN4w2iuIeVso5/6QymSrgmA=="], + + "mdast-util-frontmatter": ["mdast-util-frontmatter@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "devlop": "^1.0.0", "escape-string-regexp": "^5.0.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0", "micromark-extension-frontmatter": "^2.0.0" } }, "sha512-LRqI9+wdgC25P0URIJY9vwocIzCcksduHQ9OF2joxQoyTNVduwLAFUzjoopuRJbJAReaKrNQKAZKL3uCMugWJA=="], - "mdast-util-gfm": ["mdast-util-gfm@3.0.0", "", { "dependencies": { "mdast-util-from-markdown": "^2.0.0", "mdast-util-gfm-autolink-literal": "^2.0.0", "mdast-util-gfm-footnote": "^2.0.0", "mdast-util-gfm-strikethrough": "^2.0.0", "mdast-util-gfm-table": "^2.0.0", "mdast-util-gfm-task-list-item": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-dgQEX5Amaq+DuUqf26jJqSK9qgixgd6rYDHAv4aTBuA92cTknZlKpPfa86Z/s8Dj8xsAQpFfBmPUHWJBWqS4Bw=="], + "mdast-util-gfm": ["mdast-util-gfm@3.1.0", "", { "dependencies": { "mdast-util-from-markdown": "^2.0.0", "mdast-util-gfm-autolink-literal": "^2.0.0", "mdast-util-gfm-footnote": "^2.0.0", "mdast-util-gfm-strikethrough": "^2.0.0", "mdast-util-gfm-table": "^2.0.0", "mdast-util-gfm-task-list-item": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ=="], "mdast-util-gfm-autolink-literal": ["mdast-util-gfm-autolink-literal@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "ccount": "^2.0.0", "devlop": "^1.0.0", "mdast-util-find-and-replace": "^3.0.0", "micromark-util-character": "^2.0.0" } }, 
"sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ=="], @@ -2227,6 +2240,8 @@ "micromark-core-commonmark": ["micromark-core-commonmark@2.0.1", "", { "dependencies": { "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "micromark-factory-destination": "^2.0.0", "micromark-factory-label": "^2.0.0", "micromark-factory-space": "^2.0.0", "micromark-factory-title": "^2.0.0", "micromark-factory-whitespace": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-chunked": "^2.0.0", "micromark-util-classify-character": "^2.0.0", "micromark-util-html-tag-name": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-resolve-all": "^2.0.0", "micromark-util-subtokenize": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-CUQyKr1e///ZODyD1U3xit6zXwy1a8q2a1S1HKtIlmgvurrEpaw/Y9y6KSIbF8P59cn/NjzHyO+Q2fAyYLQrAA=="], + "micromark-extension-frontmatter": ["micromark-extension-frontmatter@2.0.0", "", { "dependencies": { "fault": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-C4AkuM3dA58cgZha7zVnuVxBhDsbttIMiytjgsM2XbHAB2faRVaHRle40558FBN+DJcrLNCoqG5mlrpdU4cRtg=="], + "micromark-extension-gfm": ["micromark-extension-gfm@3.0.0", "", { "dependencies": { "micromark-extension-gfm-autolink-literal": "^2.0.0", "micromark-extension-gfm-footnote": "^2.0.0", "micromark-extension-gfm-strikethrough": "^2.0.0", "micromark-extension-gfm-table": "^2.0.0", "micromark-extension-gfm-tagfilter": "^2.0.0", "micromark-extension-gfm-task-list-item": "^2.0.0", "micromark-util-combine-extensions": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w=="], "micromark-extension-gfm-autolink-literal": ["micromark-extension-gfm-autolink-literal@2.1.0", "", { "dependencies": { "micromark-util-character": "^2.0.0", 
"micromark-util-sanitize-uri": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw=="], @@ -2829,6 +2844,8 @@ "unist-util-position": ["unist-util-position@5.0.0", "", { "dependencies": { "@types/unist": "^3.0.0" } }, "sha512-fucsC7HjXvkB5R3kTCO7kUjRdrS0BJt3M/FPxmHMBOm8JQi2BsHAHFsy27E0EolP8rp0NzXsJ+jNPyDWvOJZPA=="], + "unist-util-remove": ["unist-util-remove@4.0.0", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-is": "^6.0.0", "unist-util-visit-parents": "^6.0.0" } }, "sha512-b4gokeGId57UVRX/eVKej5gXqGlc9+trkORhFJpu9raqZkZhU0zm8Doi05+HaiBsMEIJowL+2WtQ5ItjsngPXg=="], + "unist-util-stringify-position": ["unist-util-stringify-position@4.0.0", "", { "dependencies": { "@types/unist": "^3.0.0" } }, "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ=="], "unist-util-visit": ["unist-util-visit@5.0.0", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-is": "^6.0.0", "unist-util-visit-parents": "^6.0.0" } }, "sha512-MR04uvD+07cwl/yhVuVWAtw+3GOR/knlL55Nd/wAdblk27GCVt3lqpTivy/tkJcZoNPzTwS1Y+KMojlLDhoTzg=="], @@ -4115,14 +4132,20 @@ "make-dir/semver": ["semver@6.3.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="], - "mdast-util-gfm/mdast-util-to-markdown": ["mdast-util-to-markdown@2.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", "micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" } }, "sha512-SR2VnIEdVNCJbP6y7kVTJgPLifdr8WEU440fQec7qHoHOUz/oJ2jmNRqdDQ3rbiStOXb2mCDGTuwsK5OPUgYlQ=="], + "mdast-util-gfm-footnote/mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", 
"@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-aJEUyzZ6TzlsX2s5B4Of7lN7EQtAxvtradMMglCQDyaTFgse6CmtmdJ15ElnVRlCg1vpNyVtbem0PWzlNieZsA=="], "mdast-util-gfm-footnote/mdast-util-to-markdown": ["mdast-util-to-markdown@2.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", "micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" } }, "sha512-SR2VnIEdVNCJbP6y7kVTJgPLifdr8WEU440fQec7qHoHOUz/oJ2jmNRqdDQ3rbiStOXb2mCDGTuwsK5OPUgYlQ=="], + "mdast-util-gfm-strikethrough/mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-aJEUyzZ6TzlsX2s5B4Of7lN7EQtAxvtradMMglCQDyaTFgse6CmtmdJ15ElnVRlCg1vpNyVtbem0PWzlNieZsA=="], + "mdast-util-gfm-strikethrough/mdast-util-to-markdown": ["mdast-util-to-markdown@2.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", "micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" } }, 
"sha512-SR2VnIEdVNCJbP6y7kVTJgPLifdr8WEU440fQec7qHoHOUz/oJ2jmNRqdDQ3rbiStOXb2mCDGTuwsK5OPUgYlQ=="], + "mdast-util-gfm-table/mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-aJEUyzZ6TzlsX2s5B4Of7lN7EQtAxvtradMMglCQDyaTFgse6CmtmdJ15ElnVRlCg1vpNyVtbem0PWzlNieZsA=="], + "mdast-util-gfm-table/mdast-util-to-markdown": ["mdast-util-to-markdown@2.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", "micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" } }, "sha512-SR2VnIEdVNCJbP6y7kVTJgPLifdr8WEU440fQec7qHoHOUz/oJ2jmNRqdDQ3rbiStOXb2mCDGTuwsK5OPUgYlQ=="], + "mdast-util-gfm-task-list-item/mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-aJEUyzZ6TzlsX2s5B4Of7lN7EQtAxvtradMMglCQDyaTFgse6CmtmdJ15ElnVRlCg1vpNyVtbem0PWzlNieZsA=="], + "mdast-util-gfm-task-list-item/mdast-util-to-markdown": ["mdast-util-to-markdown@2.1.0", "", { "dependencies": { "@types/mdast": 
"^4.0.0", "@types/unist": "^3.0.0", "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", "micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" } }, "sha512-SR2VnIEdVNCJbP6y7kVTJgPLifdr8WEU440fQec7qHoHOUz/oJ2jmNRqdDQ3rbiStOXb2mCDGTuwsK5OPUgYlQ=="], "meow/type-fest": ["type-fest@0.13.1", "", {}, "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg=="], @@ -4191,6 +4214,10 @@ "read-yaml-file/pify": ["pify@4.0.1", "", {}, "sha512-uB80kBFb/tfd68bVleG9T5GGsGPjJrLAUpR5PZIrhBnIaRTQRjqdJSsIKkOP6OAIFbj7GOrcudc5pNjZ+geV2g=="], + "remark-gfm/mdast-util-gfm": ["mdast-util-gfm@3.0.0", "", { "dependencies": { "mdast-util-from-markdown": "^2.0.0", "mdast-util-gfm-autolink-literal": "^2.0.0", "mdast-util-gfm-footnote": "^2.0.0", "mdast-util-gfm-strikethrough": "^2.0.0", "mdast-util-gfm-table": "^2.0.0", "mdast-util-gfm-task-list-item": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-dgQEX5Amaq+DuUqf26jJqSK9qgixgd6rYDHAv4aTBuA92cTknZlKpPfa86Z/s8Dj8xsAQpFfBmPUHWJBWqS4Bw=="], + + "remark-parse/mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-aJEUyzZ6TzlsX2s5B4Of7lN7EQtAxvtradMMglCQDyaTFgse6CmtmdJ15ElnVRlCg1vpNyVtbem0PWzlNieZsA=="], + "remark-stringify/mdast-util-to-markdown": ["mdast-util-to-markdown@2.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", 
"micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" } }, "sha512-SR2VnIEdVNCJbP6y7kVTJgPLifdr8WEU440fQec7qHoHOUz/oJ2jmNRqdDQ3rbiStOXb2mCDGTuwsK5OPUgYlQ=="], "rimraf/glob": ["glob@10.4.5", "", { "dependencies": { "foreground-child": "^3.1.0", "jackspeak": "^3.1.2", "minimatch": "^9.0.4", "minipass": "^7.1.2", "package-json-from-dist": "^1.0.0", "path-scurry": "^1.11.1" }, "bin": { "glob": "dist/esm/bin.mjs" } }, "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg=="], @@ -5019,6 +5046,10 @@ "read-yaml-file/js-yaml/argparse": ["argparse@1.0.10", "", { "dependencies": { "sprintf-js": "~1.0.2" } }, "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg=="], + "remark-gfm/mdast-util-gfm/mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-aJEUyzZ6TzlsX2s5B4Of7lN7EQtAxvtradMMglCQDyaTFgse6CmtmdJ15ElnVRlCg1vpNyVtbem0PWzlNieZsA=="], + + "remark-gfm/mdast-util-gfm/mdast-util-to-markdown": ["mdast-util-to-markdown@2.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", "micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" } }, "sha512-SR2VnIEdVNCJbP6y7kVTJgPLifdr8WEU440fQec7qHoHOUz/oJ2jmNRqdDQ3rbiStOXb2mCDGTuwsK5OPUgYlQ=="], + "rimraf/glob/jackspeak": ["jackspeak@3.4.3", "", { "dependencies": { "@isaacs/cliui": "^8.0.2" }, 
"optionalDependencies": { "@pkgjs/parseargs": "^0.11.0" } }, "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw=="], "rimraf/glob/minimatch": ["minimatch@9.0.5", "", { "dependencies": { "brace-expansion": "^2.0.1" } }, "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow=="], diff --git a/packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts b/packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts new file mode 100644 index 0000000000..e372357721 --- /dev/null +++ b/packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts @@ -0,0 +1,14 @@ +import type { NextRequest } from 'next/server'; + +import { serveLLMsFullTxt } from '@/routes/llms-full'; +import { type RouteLayoutParams, getStaticSiteContext } from '@v2/app/utils'; + +export const dynamic = 'force-static'; + +export async function GET( + _request: NextRequest, + { params }: { params: Promise } +) { + const { context } = await getStaticSiteContext(await params); + return serveLLMsFullTxt(context, { withMarkdownPages: true }); +} diff --git a/packages/gitbook/package.json b/packages/gitbook/package.json index 8443bb615a..96b79070a4 100644 --- a/packages/gitbook/package.json +++ b/packages/gitbook/package.json @@ -46,6 +46,13 @@ "katex": "^0.16.9", "mathjax": "^3.2.2", "mdast-util-to-markdown": "^2.1.2", + "mdast-util-from-markdown": "^2.0.2", + "mdast-util-frontmatter": "^2.0.1", + "mdast-util-gfm": "^3.1.0", + "micromark-extension-gfm": "^3.0.0", + "micromark-extension-frontmatter": "^2.0.0", + "unist-util-remove": "^4.0.0", + "unist-util-visit": "^5.0.0", "memoizee": "^0.4.17", "next": "14.2.26", "next-themes": "^0.2.1", diff --git a/packages/gitbook/src/lib/urls.ts b/packages/gitbook/src/lib/urls.ts index 0bfa89e769..05dba4f3d0 100644 --- a/packages/gitbook/src/lib/urls.ts +++ b/packages/gitbook/src/lib/urls.ts @@ 
-8,3 +8,17 @@ export function checkIsHttpURL(input: string | URL): boolean { const parsed = new URL(input); return parsed.protocol === 'http:' || parsed.protocol === 'https:'; } + +/** + * True for absolute URLs (`scheme:*`); relative paths and hash-only anchors return false. + */ +export function checkIsExternalURL(input: string): boolean { + return URL.canParse(input); +} + +/** + * True for a hash-only anchor. + */ +export function checkIsAnchor(input: string): boolean { + return input.startsWith('#'); +} diff --git a/packages/gitbook/src/routes/llms-full.ts b/packages/gitbook/src/routes/llms-full.ts new file mode 100644 index 0000000000..c999b48c8c --- /dev/null +++ b/packages/gitbook/src/routes/llms-full.ts @@ -0,0 +1,185 @@ +import path from 'node:path'; +import { joinPath } from '@/lib/paths'; +import { getIndexablePages } from '@/lib/sitemap'; +import { getSiteStructureSections } from '@/lib/sites'; +import { checkIsAnchor, checkIsExternalURL } from '@/lib/urls'; +import type { RevisionPageDocument, SiteSection, SiteSpace } from '@gitbook/api'; +import { type GitBookSiteContext, checkIsRootSiteContext } from '@v2/lib/context'; +import { throwIfDataError } from '@v2/lib/data'; +import assertNever from 'assert-never'; +import type { Root, RootContent } from 'mdast'; +import { fromMarkdown } from 'mdast-util-from-markdown'; +import { frontmatterFromMarkdown } from 'mdast-util-frontmatter'; +import { gfmFromMarkdown, gfmToMarkdown } from 'mdast-util-gfm'; +import { toMarkdown } from 'mdast-util-to-markdown'; +import { frontmatter } from 'micromark-extension-frontmatter'; +import { gfm } from 'micromark-extension-gfm'; +import { remove } from 'unist-util-remove'; +import { visit } from 'unist-util-visit'; + +/** + * Generate a llms-full.txt file for the site. 
+ */ +export async function serveLLMsFullTxt(context: GitBookSiteContext) { + if (!checkIsRootSiteContext(context)) { + return new Response('llms.txt is only served from the root of the site', { status: 404 }); + } + + const tree: Root = { + type: 'root', + children: await getNodesFromSiteStructure(context), + }; + + return new Response(toMarkdown(tree, { extensions: [gfmToMarkdown()] }), { + headers: { + 'Content-Type': 'text/plain; charset=utf-8', + }, + }); +} + +/** + * Get MDAST nodes from site structure. + */ +async function getNodesFromSiteStructure(context: GitBookSiteContext): Promise { + switch (context.structure.type) { + case 'sections': + return getNodesFromSections( + context, + getSiteStructureSections(context.structure, { ignoreGroups: true }) + ); + case 'siteSpaces': + return getNodesFromSiteSpaces(context, context.structure.structure, ''); + default: + assertNever(context.structure); + } +} + +/** + * Get MDAST nodes from site sections. + */ +async function getNodesFromSections( + context: GitBookSiteContext, + siteSections: SiteSection[] +): Promise { + const all = await Promise.all( + siteSections.map(async (siteSection): Promise => { + const siteSpaceNodes = await getNodesFromSiteSpaces( + context, + siteSection.siteSpaces, + siteSection.path + ); + return siteSpaceNodes; + }) + ); + return all.flat(); +} + +/** + * Get MDAST nodes from site spaces. 
+ */ +async function getNodesFromSiteSpaces( + context: GitBookSiteContext, + siteSpaces: SiteSpace[], + basePath: string +): Promise { + const { dataFetcher } = context; + + const all = await Promise.all( + siteSpaces.map(async (siteSpace): Promise => { + const siteSpaceUrl = siteSpace.urls.published; + if (!siteSpaceUrl) { + return []; + } + const rootPages = await throwIfDataError( + dataFetcher.getRevisionPages({ + spaceId: siteSpace.space.id, + revisionId: siteSpace.space.revision, + metadata: false, + }) + ); + const pages = getIndexablePages(rootPages); + const nodes = ( + await Promise.all( + pages.map(async ({ page }): Promise => { + if (page.type !== 'document' || !page.documentId) { + return []; + } + + return getNodesFromPage( + context, + siteSpace, + page, + joinPath(basePath, siteSpace.path) + ); + }) + ) + ).flat(); + return nodes; + }) + ); + return all.flat(); +} + +/** + * Get MDAST nodes from a page. + */ +async function getNodesFromPage( + context: GitBookSiteContext, + siteSpace: SiteSpace, + page: RevisionPageDocument, + basePath: string +): Promise { + const { dataFetcher } = context; + + const pageMarkdown = await throwIfDataError( + dataFetcher.getRevisionPageMarkdown({ + spaceId: siteSpace.space.id, + revisionId: siteSpace.space.revision, + pageId: page.id, + }) + ); + + const tree = fromMarkdown(pageMarkdown, { + extensions: [frontmatter(['yaml']), gfm()], + mdastExtensions: [frontmatterFromMarkdown(['yaml']), gfmFromMarkdown()], + }); + + // Remove frontmatter + remove(tree, 'yaml'); + + // Rewrite relative links to absolute links + transformLinks(context, tree, { currentPagePath: page.path, basePath }); + + return tree.children; +} + +/** + * Re-writes the URL of every relative link so it is expressed from the site-root. 
+ */ +export function transformLinks( + context: GitBookSiteContext, + tree: Root, + options: { currentPagePath: string; basePath: string } +): Root { + const { linker } = context; + const { currentPagePath, basePath } = options; + const currentDir = path.posix.dirname(currentPagePath); + + visit(tree, 'link', (node) => { + const original = (node as any).url as string; + + // Skip anchors, mailto:, http(s):, protocol-like, or already-rooted paths + if (checkIsExternalURL(original) || checkIsAnchor(original) || original.startsWith('/')) { + return; + } + + // Resolve against the current page’s directory and strip any leading “/” + const pathInSite = path.posix + .normalize(path.posix.join(basePath, currentDir, original)) + .replace(/^\/+/, ''); + + (node as any).url = linker.toPathInSite(pathInSite); + }); + + return tree; +} From a035bf8389082f08a44600297fa256a1664f0523 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 20:39:32 +0200 Subject: [PATCH 02/13] Changeset --- .changeset/orange-hounds-sparkle.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/orange-hounds-sparkle.md diff --git a/.changeset/orange-hounds-sparkle.md b/.changeset/orange-hounds-sparkle.md new file mode 100644 index 0000000000..28f2b6aa6e --- /dev/null +++ b/.changeset/orange-hounds-sparkle.md @@ -0,0 +1,5 @@ +--- +"gitbook-v2": patch +--- + +Generate a llms-full.txt version of the docs site From dcf80d21321643167f0fee5bdfa6a02f758c9be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 20:40:14 +0200 Subject: [PATCH 03/13] Improve code --- packages/gitbook/src/routes/llms-full.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/gitbook/src/routes/llms-full.ts b/packages/gitbook/src/routes/llms-full.ts index c999b48c8c..5e5019ddb5 100644 --- a/packages/gitbook/src/routes/llms-full.ts +++ b/packages/gitbook/src/routes/llms-full.ts @@ -7,7 +7,7 @@ import type { 
RevisionPageDocument, SiteSection, SiteSpace } from '@gitbook/api' import { type GitBookSiteContext, checkIsRootSiteContext } from '@v2/lib/context'; import { throwIfDataError } from '@v2/lib/data'; import assertNever from 'assert-never'; -import type { Root, RootContent } from 'mdast'; +import type { Link, Root, RootContent } from 'mdast'; import { fromMarkdown } from 'mdast-util-from-markdown'; import { frontmatterFromMarkdown } from 'mdast-util-frontmatter'; import { gfmFromMarkdown, gfmToMarkdown } from 'mdast-util-gfm'; @@ -165,8 +165,8 @@ export function transformLinks( const { currentPagePath, basePath } = options; const currentDir = path.posix.dirname(currentPagePath); - visit(tree, 'link', (node) => { - const original = (node as any).url as string; + visit(tree, 'link', (node: Link) => { + const original = node.url; // Skip anchors, mailto:, http(s):, protocol-like, or already-rooted paths if (checkIsExternalURL(original) || checkIsAnchor(original) || original.startsWith('/')) { @@ -178,7 +178,7 @@ export function transformLinks( .normalize(path.posix.join(basePath, currentDir, original)) .replace(/^\/+/, ''); - (node as any).url = linker.toPathInSite(pathInSite); + node.url = linker.toPathInSite(pathInSite); }); return tree; From 67b3cf56aa66b887aa19aa505f61f3dea60e79fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 20:41:28 +0200 Subject: [PATCH 04/13] Add tests --- packages/gitbook/e2e/internal.spec.ts | 32 +++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/packages/gitbook/e2e/internal.spec.ts b/packages/gitbook/e2e/internal.spec.ts index 17f03b9318..925521e02b 100644 --- a/packages/gitbook/e2e/internal.spec.ts +++ b/packages/gitbook/e2e/internal.spec.ts @@ -431,6 +431,38 @@ const testCases: TestsCase[] = [ }, ], }, + { + name: 'llms.txt', + skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel', + contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/', + tests: [ + { + name: 'llms.txt', + 
url: 'llms.txt', + screenshot: false, + run: async (_page, response) => { + expect(response?.status()).toBe(200); + expect(response?.headers()['content-type']).toContain('text/markdown'); + }, + }, + ], + }, + { + name: 'llms-full.txt', + skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel', + contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/', + tests: [ + { + name: 'llms-full.txt', + url: 'llms-full.txt', + screenshot: false, + run: async (_page, response) => { + expect(response?.status()).toBe(200); + expect(response?.headers()['content-type']).toContain('text/markdown'); + }, + }, + ], + }, { name: 'Site subdirectory (proxy)', skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel', From cab3a74095253b27aa85163f23491916496075c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 20:42:28 +0200 Subject: [PATCH 05/13] Fix TS --- .../static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts b/packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts index e372357721..7702d6101c 100644 --- a/packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts +++ b/packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts @@ -10,5 +10,5 @@ export async function GET( { params }: { params: Promise } ) { const { context } = await getStaticSiteContext(await params); - return serveLLMsFullTxt(context, { withMarkdownPages: true }); + return serveLLMsFullTxt(context); } From dba5b5c5fcbf77e290d5fc9d1afa4de418fb4f09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 20:48:04 +0200 Subject: [PATCH 06/13] Add page description --- packages/gitbook/src/routes/llms-full.ts | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git 
a/packages/gitbook/src/routes/llms-full.ts b/packages/gitbook/src/routes/llms-full.ts index 5e5019ddb5..411aee3435 100644 --- a/packages/gitbook/src/routes/llms-full.ts +++ b/packages/gitbook/src/routes/llms-full.ts @@ -7,7 +7,7 @@ import type { RevisionPageDocument, SiteSection, SiteSpace } from '@gitbook/api' import { type GitBookSiteContext, checkIsRootSiteContext } from '@v2/lib/context'; import { throwIfDataError } from '@v2/lib/data'; import assertNever from 'assert-never'; -import type { Link, Root, RootContent } from 'mdast'; +import type { Link, Paragraph, Root, RootContent } from 'mdast'; import { fromMarkdown } from 'mdast-util-from-markdown'; import { frontmatterFromMarkdown } from 'mdast-util-frontmatter'; import { gfmFromMarkdown, gfmToMarkdown } from 'mdast-util-gfm'; @@ -147,6 +147,16 @@ async function getNodesFromPage( // Remove frontmatter remove(tree, 'yaml'); + if (page.description) { + // The first node is the page title as a H1, we insert the description as a paragraph + // after it. 
+ const descriptionNode: Paragraph = { + type: 'paragraph', + children: [{ type: 'text', value: page.description }], + }; + tree.children.splice(1, 0, descriptionNode); + } + // Rewrite relative links to absolute links transformLinks(context, tree, { currentPagePath: page.path, basePath }); From 68e74b96f6c802400edf83ec075e32650a7e2919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 20:56:17 +0200 Subject: [PATCH 07/13] Limit to 500 --- packages/gitbook/src/routes/llms-full.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/gitbook/src/routes/llms-full.ts b/packages/gitbook/src/routes/llms-full.ts index 411aee3435..8cc2156893 100644 --- a/packages/gitbook/src/routes/llms-full.ts +++ b/packages/gitbook/src/routes/llms-full.ts @@ -97,7 +97,10 @@ async function getNodesFromSiteSpaces( metadata: false, }) ); - const pages = getIndexablePages(rootPages); + const pages = getIndexablePages(rootPages) + // We currently limit the number of pages to 500 to avoid generating a too large markdown output + // and because of limits with the server on how many requests / files can be opened. 
+ .slice(0, 500); const nodes = ( await Promise.all( pages.map(async ({ page }): Promise => { From 5c34e8e41ea127e6bc3c012cbea5c80acbd824c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 21:15:41 +0200 Subject: [PATCH 08/13] Respond with markdown content type --- packages/gitbook/src/routes/llms-full.ts | 2 +- packages/gitbook/src/routes/llms.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/gitbook/src/routes/llms-full.ts b/packages/gitbook/src/routes/llms-full.ts index 8cc2156893..cfc16fa51c 100644 --- a/packages/gitbook/src/routes/llms-full.ts +++ b/packages/gitbook/src/routes/llms-full.ts @@ -32,7 +32,7 @@ export async function serveLLMsFullTxt(context: GitBookSiteContext) { return new Response(toMarkdown(tree, { extensions: [gfmToMarkdown()] }), { headers: { - 'Content-Type': 'text/plain; charset=utf-8', + 'Content-Type': 'text/markdown; charset=utf-8', }, }); } diff --git a/packages/gitbook/src/routes/llms.ts b/packages/gitbook/src/routes/llms.ts index 24f9119505..a7e953cbad 100644 --- a/packages/gitbook/src/routes/llms.ts +++ b/packages/gitbook/src/routes/llms.ts @@ -46,7 +46,7 @@ export async function serveLLMsTxt( }), { headers: { - 'Content-Type': 'text/plain; charset=utf-8', + 'Content-Type': 'text/markdown; charset=utf-8', }, } ); From 3971ce5cad559ae1d3b44985813a451bfaf44b33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 21:17:09 +0200 Subject: [PATCH 09/13] Remove condition for our own doc --- packages/gitbook/src/routes/llms-full.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/gitbook/src/routes/llms-full.ts b/packages/gitbook/src/routes/llms-full.ts index cfc16fa51c..403e44e54b 100644 --- a/packages/gitbook/src/routes/llms-full.ts +++ b/packages/gitbook/src/routes/llms-full.ts @@ -104,7 +104,7 @@ async function getNodesFromSiteSpaces( const nodes = ( await Promise.all( pages.map(async ({ page }): Promise => { - if 
(page.type !== 'document' || !page.documentId) { + if (page.type !== 'document') { return []; } From e5bb89a3a7773c7a591bf249e1a6b7a83294500e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sat, 7 Jun 2025 23:08:33 +0200 Subject: [PATCH 10/13] Stream it --- bun.lock | 4 +- packages/gitbook/package.json | 2 +- packages/gitbook/src/routes/llms-full.ts | 155 ++++++++++++----------- 3 files changed, 86 insertions(+), 75 deletions(-) diff --git a/bun.lock b/bun.lock index f4e3bfd06b..5a2653c915 100644 --- a/bun.lock +++ b/bun.lock @@ -92,7 +92,7 @@ "nuqs": "^2.2.3", "object-hash": "^3.0.0", "openapi-types": "^12.1.3", - "p-map": "^7.0.0", + "p-map": "^7.0.3", "parse-cache-control": "^1.0.1", "partial-json": "^0.1.7", "react": "^19.0.0", @@ -2410,7 +2410,7 @@ "p-locate": ["p-locate@4.1.0", "", { "dependencies": { "p-limit": "^2.2.0" } }, "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A=="], - "p-map": ["p-map@7.0.2", "", {}, "sha512-z4cYYMMdKHzw4O5UkWJImbZynVIo0lSGTXc7bzB1e/rrDqkgGUNysK/o4bTr+0+xKvvLoTyGqYC4Fgljy9qe1Q=="], + "p-map": ["p-map@7.0.3", "", {}, "sha512-VkndIv2fIB99swvQoA65bm+fsmt6UNdGeIB0oxBs+WhAhdh08QA04JXpI7rbB9r08/nkbysKoya9rtDERYOYMA=="], "p-try": ["p-try@2.2.0", "", {}, "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ=="], diff --git a/packages/gitbook/package.json b/packages/gitbook/package.json index 96b79070a4..62952f844f 100644 --- a/packages/gitbook/package.json +++ b/packages/gitbook/package.json @@ -59,7 +59,7 @@ "nuqs": "^2.2.3", "object-hash": "^3.0.0", "openapi-types": "^12.1.3", - "p-map": "^7.0.0", + "p-map": "^7.0.3", "parse-cache-control": "^1.0.1", "partial-json": "^0.1.7", "react": "^19.0.0", diff --git a/packages/gitbook/src/routes/llms-full.ts b/packages/gitbook/src/routes/llms-full.ts index 403e44e54b..320eac31ca 100644 --- a/packages/gitbook/src/routes/llms-full.ts +++ b/packages/gitbook/src/routes/llms-full.ts @@ -7,131 
+7,141 @@ import type { RevisionPageDocument, SiteSection, SiteSpace } from '@gitbook/api' import { type GitBookSiteContext, checkIsRootSiteContext } from '@v2/lib/context'; import { throwIfDataError } from '@v2/lib/data'; import assertNever from 'assert-never'; -import type { Link, Paragraph, Root, RootContent } from 'mdast'; +import type { Link, Paragraph, Root } from 'mdast'; import { fromMarkdown } from 'mdast-util-from-markdown'; import { frontmatterFromMarkdown } from 'mdast-util-frontmatter'; import { gfmFromMarkdown, gfmToMarkdown } from 'mdast-util-gfm'; import { toMarkdown } from 'mdast-util-to-markdown'; import { frontmatter } from 'micromark-extension-frontmatter'; import { gfm } from 'micromark-extension-gfm'; +import { pMapIterable } from 'p-map'; import { remove } from 'unist-util-remove'; import { visit } from 'unist-util-visit'; +// We limit the concurrency to 100 to avoid reaching limit with concurrent requests +// or file descriptor limits. +const MAX_CONCURRENCY = 100; + /** * Generate a llms-full.txt file for the site. + * As the result can be large, we stream it as we generate it. */ export async function serveLLMsFullTxt(context: GitBookSiteContext) { if (!checkIsRootSiteContext(context)) { return new Response('llms.txt is only served from the root of the site', { status: 404 }); } - const tree: Root = { - type: 'root', - children: await getNodesFromSiteStructure(context), - }; - - return new Response(toMarkdown(tree, { extensions: [gfmToMarkdown()] }), { - headers: { - 'Content-Type': 'text/markdown; charset=utf-8', - }, - }); + return new Response( + new ReadableStream({ + async pull(controller) { + await streamMarkdownFromSiteStructure(context, controller); + controller.close(); + }, + }), + { + headers: { + 'Content-Type': 'text/markdown; charset=utf-8', + }, + } + ); } /** - * Get MDAST nodes from site structure. + * Stream markdown from site structure. 
*/ -async function getNodesFromSiteStructure(context: GitBookSiteContext): Promise { +async function streamMarkdownFromSiteStructure( + context: GitBookSiteContext, + stream: ReadableStreamDefaultController +): Promise { switch (context.structure.type) { case 'sections': - return getNodesFromSections( + return streamMarkdownFromSections( context, + stream, getSiteStructureSections(context.structure, { ignoreGroups: true }) ); case 'siteSpaces': - return getNodesFromSiteSpaces(context, context.structure.structure, ''); + return streamMarkdownFromSiteSpaces(context, stream, context.structure.structure, ''); default: assertNever(context.structure); } } /** - * Get MDAST nodes from site sections. + * Stream markdown from site sections. */ -async function getNodesFromSections( +async function streamMarkdownFromSections( context: GitBookSiteContext, + stream: ReadableStreamDefaultController, siteSections: SiteSection[] -): Promise { - const all = await Promise.all( - siteSections.map(async (siteSection): Promise => { - const siteSpaceNodes = await getNodesFromSiteSpaces( - context, - siteSection.siteSpaces, - siteSection.path - ); - return siteSpaceNodes; - }) - ); - return all.flat(); +): Promise { + for (const siteSection of siteSections) { + await streamMarkdownFromSiteSpaces( + context, + stream, + siteSection.siteSpaces, + siteSection.path + ); + } } /** - * Get MDAST nodes from site spaces. + * Stream markdown from site spaces. 
*/ -async function getNodesFromSiteSpaces( +async function streamMarkdownFromSiteSpaces( context: GitBookSiteContext, + stream: ReadableStreamDefaultController, siteSpaces: SiteSpace[], basePath: string -): Promise { +): Promise { const { dataFetcher } = context; - const all = await Promise.all( - siteSpaces.map(async (siteSpace): Promise => { - const siteSpaceUrl = siteSpace.urls.published; - if (!siteSpaceUrl) { - return []; + for (const siteSpace of siteSpaces) { + const siteSpaceUrl = siteSpace.urls.published; + if (!siteSpaceUrl) { + continue; + } + const rootPages = await throwIfDataError( + dataFetcher.getRevisionPages({ + spaceId: siteSpace.space.id, + revisionId: siteSpace.space.revision, + metadata: false, + }) + ); + const pages = getIndexablePages(rootPages); + + for await (const markdown of pMapIterable( + pages, + async ({ page }) => { + if (page.type !== 'document') { + return ''; + } + + return getMarkdownForPage( + context, + siteSpace, + page, + joinPath(basePath, siteSpace.path) + ); + }, + { + concurrency: MAX_CONCURRENCY, } - const rootPages = await throwIfDataError( - dataFetcher.getRevisionPages({ - spaceId: siteSpace.space.id, - revisionId: siteSpace.space.revision, - metadata: false, - }) - ); - const pages = getIndexablePages(rootPages) - // We currently limit the number of pages to 500 to avoid generating a too large markdown output - // and because of limits with the server on how many requests / files can be opened. - .slice(0, 500); - const nodes = ( - await Promise.all( - pages.map(async ({ page }): Promise => { - if (page.type !== 'document') { - return []; - } - - return getNodesFromPage( - context, - siteSpace, - page, - joinPath(basePath, siteSpace.path) - ); - }) - ) - ).flat(); - return nodes; - }) - ); - return all.flat(); + )) { + stream.enqueue(markdown); + } + } } /** - * Get MDAST nodes from a page. + * Get markdown from a page. 
*/ -async function getNodesFromPage( +async function getMarkdownForPage( context: GitBookSiteContext, siteSpace: SiteSpace, page: RevisionPageDocument, basePath: string -): Promise { +): Promise { const { dataFetcher } = context; const pageMarkdown = await throwIfDataError( @@ -163,7 +173,8 @@ async function getNodesFromPage( // Rewrite relative links to absolute links transformLinks(context, tree, { currentPagePath: page.path, basePath }); - return tree.children; + const markdown = toMarkdown(tree, { extensions: [gfmToMarkdown()] }); + return `${markdown}\n\n`; } /** From 79fd3829484a5bb25d437170338b4460bfbec0f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sun, 8 Jun 2025 01:02:27 +0200 Subject: [PATCH 11/13] Try with Uint8Array --- packages/gitbook/src/routes/llms-full.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/gitbook/src/routes/llms-full.ts b/packages/gitbook/src/routes/llms-full.ts index 320eac31ca..90efb73f63 100644 --- a/packages/gitbook/src/routes/llms-full.ts +++ b/packages/gitbook/src/routes/llms-full.ts @@ -32,7 +32,7 @@ export async function serveLLMsFullTxt(context: GitBookSiteContext) { } return new Response( - new ReadableStream({ + new ReadableStream({ async pull(controller) { await streamMarkdownFromSiteStructure(context, controller); controller.close(); @@ -51,7 +51,7 @@ export async function serveLLMsFullTxt(context: GitBookSiteContext) { */ async function streamMarkdownFromSiteStructure( context: GitBookSiteContext, - stream: ReadableStreamDefaultController + stream: ReadableStreamDefaultController ): Promise { switch (context.structure.type) { case 'sections': @@ -72,7 +72,7 @@ async function streamMarkdownFromSiteStructure( */ async function streamMarkdownFromSections( context: GitBookSiteContext, - stream: ReadableStreamDefaultController, + stream: ReadableStreamDefaultController, siteSections: SiteSection[] ): Promise { for (const siteSection of siteSections) { @@ -90,7 
+90,7 @@ async function streamMarkdownFromSections( */ async function streamMarkdownFromSiteSpaces( context: GitBookSiteContext, - stream: ReadableStreamDefaultController, + stream: ReadableStreamDefaultController, siteSpaces: SiteSpace[], basePath: string ): Promise { @@ -128,7 +128,7 @@ async function streamMarkdownFromSiteSpaces( concurrency: MAX_CONCURRENCY, } )) { - stream.enqueue(markdown); + stream.enqueue(new TextEncoder().encode(markdown)); } } } From 3c05a65554f58c28f4363a9905dc2a27654e4670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sun, 8 Jun 2025 11:32:41 +0200 Subject: [PATCH 12/13] Make it work on v1 --- packages/gitbook/e2e/internal.spec.ts | 2 -- .../(site)/(core)/llms-full.txt/route.ts | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 packages/gitbook/src/app/middleware/(site)/(core)/llms-full.txt/route.ts diff --git a/packages/gitbook/e2e/internal.spec.ts b/packages/gitbook/e2e/internal.spec.ts index 925521e02b..4a31aa15e1 100644 --- a/packages/gitbook/e2e/internal.spec.ts +++ b/packages/gitbook/e2e/internal.spec.ts @@ -433,7 +433,6 @@ const testCases: TestsCase[] = [ }, { name: 'llms.txt', - skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel', contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/', tests: [ { @@ -449,7 +448,6 @@ const testCases: TestsCase[] = [ }, { name: 'llms-full.txt', - skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel', contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/', tests: [ { diff --git a/packages/gitbook/src/app/middleware/(site)/(core)/llms-full.txt/route.ts b/packages/gitbook/src/app/middleware/(site)/(core)/llms-full.txt/route.ts new file mode 100644 index 0000000000..58a98beaec --- /dev/null +++ b/packages/gitbook/src/app/middleware/(site)/(core)/llms-full.txt/route.ts @@ -0,0 +1,14 @@ +import type { NextRequest } from 'next/server'; + +import { getSiteContentPointer } from '@/lib/pointer'; +import { fetchV1ContextForSitePointer } 
from '@/lib/v1'; +import { serveLLMsFullTxt } from '@/routes/llms-full'; + +export const runtime = 'edge'; + +export async function GET(_req: NextRequest) { + const pointer = await getSiteContentPointer(); + const context = await fetchV1ContextForSitePointer(pointer); + + return serveLLMsFullTxt(context); +} From 0c0041e3fd97048bcdbffc46ebf00ba2bea48f21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Samy=20Pess=C3=A9?= Date: Sun, 8 Jun 2025 11:44:16 +0200 Subject: [PATCH 13/13] Revert "Make it work on v1" This reverts commit 3c05a65554f58c28f4363a9905dc2a27654e4670. --- packages/gitbook/e2e/internal.spec.ts | 2 ++ .../(site)/(core)/llms-full.txt/route.ts | 14 -------------- 2 files changed, 2 insertions(+), 14 deletions(-) delete mode 100644 packages/gitbook/src/app/middleware/(site)/(core)/llms-full.txt/route.ts diff --git a/packages/gitbook/e2e/internal.spec.ts b/packages/gitbook/e2e/internal.spec.ts index 4a31aa15e1..925521e02b 100644 --- a/packages/gitbook/e2e/internal.spec.ts +++ b/packages/gitbook/e2e/internal.spec.ts @@ -433,6 +433,7 @@ const testCases: TestsCase[] = [ }, { name: 'llms.txt', + skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel', contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/', tests: [ { @@ -448,6 +449,7 @@ const testCases: TestsCase[] = [ }, { name: 'llms-full.txt', + skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel', contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/', tests: [ { diff --git a/packages/gitbook/src/app/middleware/(site)/(core)/llms-full.txt/route.ts b/packages/gitbook/src/app/middleware/(site)/(core)/llms-full.txt/route.ts deleted file mode 100644 index 58a98beaec..0000000000 --- a/packages/gitbook/src/app/middleware/(site)/(core)/llms-full.txt/route.ts +++ /dev/null @@ -1,14 +0,0 @@ -import type { NextRequest } from 'next/server'; - -import { getSiteContentPointer } from '@/lib/pointer'; -import { fetchV1ContextForSitePointer } from '@/lib/v1'; -import { serveLLMsFullTxt } from '@/routes/llms-full'; - 
-export const runtime = 'edge'; - -export async function GET(_req: NextRequest) { - const pointer = await getSiteContentPointer(); - const context = await fetchV1ContextForSitePointer(pointer); - - return serveLLMsFullTxt(context); -}