
Commit 091b432

docs: update
1 parent fc9b266 commit 091b432

6 files changed: 87 additions, 42 deletions

docs/plugins/search/meilisearch.md

Lines changed: 32 additions & 13 deletions
@@ -112,7 +112,7 @@ Then, create a **correct configuration file** for the scraper. Here, we provide
 ```
 
 - `index_uid` should be a unique name for your index, which will be used to search.
-- `start_urls` and `sitemap_urls` (optional) shall be customized according to the website to be scraped.
+- `start_urls` and `sitemap_urls` (optional) shall be customized according to the website to be scraped. We recommend using it together with the [`@vuepress/plugin-sitemap`](../seo/sitemap/README.md) plugin and providing the corresponding `sitemap.xml` URL.
 - `selectors` field can be customized according to third-party theme DOM structure.
 - You can add new fields to `custom_settings` according to your needs.
 
@@ -144,23 +144,35 @@ Here:
 
 When the scraper completes, MeiliSearch will update the existing index with latest document content.
 
-Each time the scraper deletes and recreates the index. During this process, all the documents will be deleted and re-added. This might be slow for too many documents. However, when we only need to update part of the document content, we can use `only_urls` to tell the scraper to update only the specified urls instead of crawling all of them once.
+Each time it runs, the scraper deletes and recreates the index, so all documents are deleted and re-added. This can be slow for a large number of documents. Therefore, our `jqiue/docs-scraper` allows you to provide `only_urls` to scrape only the changed documents.
 
-```json
-{
-  "only_urls": ["https://<YOUR_WEBSITE_URL>/specifies/"]
-}
-```
+```sh
+Usage: vp-meilisearch-crawler [options] <source> [scraper-path]
+
+Generate crawler config for meilisearch
+
+Arguments:
+  source                 Source directory of VuePress project
+  scraper-path           Scraper config file path (default: .vuepress/meilisearch-config.json relative to source folder)
+
+Options:
+  -c, --config [config]  Set path to config file
+  --cache [cache]        Set the directory of the cache files
+  --temp [temp]          Set the directory of the temporary files
+  --clean-cache          Clean the cache files before generation
+  --clean-temp           Clean the temporary files before generation
+  -V, --version          output the version number
+  -h, --help             display help for command
+```
 
-Using `npx gous <docsDir> <replaceUrl> <scraperPath>` in your project can automatically generate `only_urls` for your scraper configuration file.
+You can use `vp-meilisearch-scrapper <docsDir> <scraperPath>` in CI or Git Hooks to automatically generate `only_urls` for your scraper configuration file.
 
-::: tip description
+::: note
 
-If your project is not managed using Git or the os does not have Git installed, it cannot run.
-
-- `docsDir` The parent directory of `.vuepress`. For example, if your directory is `docs/.vuepress`, then this value is `docs`
-- `replaceUrl` The URL of your document.
-- `scraperPath` The path of the scraper configuration file
+- `vp-meilisearch-scrapper` needs to be run in a Git project.
+- `scraper-path` must correctly point to your scraper configuration file, which should be properly set up with all necessary fields except `only_urls`.
 
 :::
 
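To make the effect of `only_urls` concrete, here is a hedged TypeScript sketch (not part of this commit; the field values are placeholders) of attaching a generated `only_urls` list to a scraper config object:

```typescript
// Hypothetical sketch: attach a generated `only_urls` list to a scraper
// config. Field names besides `only_urls` follow the config fields discussed
// above; the concrete values are placeholders.
interface ScraperConfig {
  index_uid: string
  start_urls: string[]
  only_urls?: string[]
  [key: string]: unknown
}

const withOnlyUrls = (
  config: ScraperConfig,
  changedUrls: string[],
): ScraperConfig => ({
  // keep every existing field, add (or overwrite) only_urls
  ...config,
  only_urls: changedUrls,
})

const updated = withOnlyUrls(
  { index_uid: 'docs', start_urls: ['https://example.com/'] },
  ['https://example.com/guide/page.html'],
)
```

With `only_urls` present, the scraper updates just the listed pages instead of recrawling the whole site.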
@@ -258,6 +270,13 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          # This is required for the helper to compare the current and previous commits
+          fetch-depth: 2
+
+      - name: Generate Only URLs
+        # You may need to cd to the directory where `@vuepress/plugin-meilisearch` is installed first
+        run: pnpm vp-meilisearch-scrapper <docsDir> <path/to/your/scraper/config.json>
 
       - name: Run scraper
         env:

docs/zh/plugins/search/meilisearch.md

Lines changed: 34 additions & 17 deletions
@@ -112,7 +112,7 @@ docker pull jqiue/docs-scraper:latest
 ```
 
 - `index_uid` should assign a unique name to your index, which is used for searching.
-- `start_urls` and `sitemap_urls` (optional) should be customized according to the website to be scraped.
+- `start_urls` and `sitemap_urls` (optional) should be customized according to the website to be scraped. We recommend using it together with the [`@vuepress/plugin-sitemap`](../seo/sitemap/README.md) plugin and providing the corresponding `sitemap.xml` URL.
 - The `selectors` field can be customized according to the third-party theme's DOM structure.
 - You can add new fields to `custom_settings` as needed.
 
@@ -144,23 +144,33 @@ docker run -t --rm \
 
 After the scraper completes, MeiliSearch will update the existing index with the latest document content.
 
-Each time it runs, the scraper deletes and recreates the index, so all documents are deleted and re-added, which can be slow for a large number of documents. But when we only need to update part of the documents, we can use `only_urls` to tell the scraper to update only the specified URLs instead of scraping everything again.
+Each time it runs, the scraper deletes and recreates the index, so all documents are deleted and re-added. This can be slow for a large number of documents. Therefore, our `jqiue/docs-scraper` allows you to provide `only_urls` to scrape only the changed documents.
 
-```json
-{
-  "only_urls": ["https://<YOUR_WEBSITE_URL>/specifies/"]
-}
-```
+You can use `vp-meilisearch-scrapper <docsDir> <scraperPath>` in CI or Git Hooks to automatically generate `only_urls` for your scraper configuration file.
 
-Using `npx gous <docsDir> <replaceUrl> <scraperPath>` in your project can automatically generate `only_urls` for your scraper configuration file.
-
-::: tip Note
+```sh
+Usage: vp-meilisearch-crawler [options] <source> [scraper-path]
+
+Generate crawler config for meilisearch
+
+Arguments:
+  source                 Source directory of the VuePress project
+  scraper-path           Scraper config file path (default: .vuepress/meilisearch-config.json relative to source folder)
+
+Options:
+  -c, --config [config]  Set path to config file
+  --cache [cache]        Set the directory of the cache files
+  --temp [temp]          Set the directory of the temporary files
+  --clean-cache          Clean the cache files before generation
+  --clean-temp           Clean the temporary files before generation
+  -V, --version          output the version number
+  -h, --help             display help for command
+```
 
-If your project is not managed with Git, or Git is not installed on the system, it cannot run.
+::: note
 
-- `docsDir` The parent directory of `.vuepress`. For example, if your directory is `docs/.vuepress`, this value is `docs`
-- `replaceUrl` The URL of your website
-- `scraperPath` The path to the scraper configuration file
+- `vp-meilisearch-scrapper` needs to be run in a Git project.
+- `scraper-path` must correctly point to your scraper configuration file, which should be properly set up with all necessary fields except `only_urls`.
 
 :::
 
@@ -256,11 +266,18 @@ jobs:
     runs-on: ubuntu-latest
     name: Rescrape MeiliSearch docs
     steps:
-      - 名称:Checkout
+      - name: Checkout
         uses: actions/checkout@v4
+        with:
+          # This is required to compare the current and the previous commit
+          fetch-depth: 2
+
+      - name: Generate Only URLs
+        # You may need to cd to the directory where `@vuepress/plugin-meilisearch` is installed first
+        run: pnpm vp-meilisearch-scrapper <docsDir> <path/to/your/scraper/config.json>
 
-      - 名称:运行抓取器
-        env
+      - name: Run scraper
+        env:
           # Replace with your own MeiliSearch host URL
           HOST_URL: <YOUR_MEILISEARCH_HOST_URL>
           API_KEY: ${{ secrets.MEILISEARCH_MASTER_KEY }}

plugins/search/plugin-meilisearch/package.json

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@
   "style": "sass src:lib --embed-sources --style=compressed --pkg-importer=node"
 },
 "bin": {
-  "vp-meilisearch-crawler": "./lib/cli/index.js"
+  "vp-meilisearch-scrapper": "./lib/cli/index.js"
 },
 "dependencies": {
   "@vuepress/helper": "workspace:*",

plugins/search/plugin-meilisearch/src/cli/generateScraperConfig.ts

Lines changed: 13 additions & 6 deletions
@@ -41,8 +41,9 @@ const generateOnlyUrls = (
   },
   {},
 )
+
 const siteDestLocation =
-  new URL(scraperConfig.start_urls[0]).hostname + app.options.base
+  new URL(scraperConfig.start_urls[0]).origin + app.options.base.slice(0, -1)
 
 return changedMarkdownFilesPathRelative.map(
   (markdownFilePathRelative) =>
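The `hostname` to `origin` change above matters because `hostname` drops the scheme, so the destination prefix was not a valid URL. A small illustrative sketch (the URL and base values here are assumed, not taken from a real config):

```typescript
// Why `origin` instead of `hostname`: `hostname` has no protocol, so the
// destination prefix built from it cannot be used as an absolute URL.
const startUrl = 'https://example.com/guide/index.html'
const base = '/docs/' // VuePress `base` starts and ends with '/'

const broken = new URL(startUrl).hostname + base // no scheme
const fixed = new URL(startUrl).origin + base.slice(0, -1) // trailing '/' trimmed
```

`broken` evaluates to `'example.com/docs/'` while `fixed` evaluates to `'https://example.com/docs'`, which can safely be prefixed to page routes.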
@@ -102,21 +103,19 @@ export const generateScraperConfig = async (
   await fs.remove(app.dir.cache())
 }
 
-const outputPath = output
+const scraperPath = output
   ? path.join(process.cwd(), output)
   : path.join(app.dir.source(), '.vuepress', 'meilisearch-config.json')
 
 if (!fs.existsSync(source)) {
   throw new Error(`Source directory ${source} does not exist!`)
 }
 
-const scraperPath = path.resolve(output)
-
-if (!fs.existsSync(outputPath)) {
+if (!fs.existsSync(scraperPath)) {
   throw new Error(`Scraper file not found at ${scraperPath}`)
 }
 
-const scraperConfig = fs.readJSONSync(outputPath, 'utf-8') as ScraperConfig
+const scraperConfig = fs.readJSONSync(scraperPath, 'utf-8') as ScraperConfig
 
 const sourceRelativePath = getGitRelativePath(app.dir.source())
 
@@ -132,6 +131,14 @@ export const generateScraperConfig = async (
   return
 }
 
+// initialize vuepress app to get pages
+logger.info('Initializing VuePress and preparing data...')
+
+await app.init()
+
+logger.info('Generating only_urls...')
+
 const onlyUrls = generateOnlyUrls(
   app,
   changedMarkdownFilesPathRelative,
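Conceptually, `generateOnlyUrls` turns each changed Markdown path into the URL of the page it builds, prefixed with the site destination computed above. The sketch below is a simplified, hypothetical version of that mapping (real VuePress routing handles more cases, such as permalinks and clean URLs):

```typescript
// Hypothetical sketch of mapping a changed Markdown source path to a page
// URL. The destination prefix and routing rules are simplified assumptions.
const siteDestLocation = 'https://example.com/docs'

const markdownPathToUrl = (relativePath: string): string => {
  const route = relativePath
    .replace(/README\.md$/, 'index.html') // README.md renders as index.html
    .replace(/\.md$/, '.html') // other pages render as <name>.html
  return `${siteDestLocation}/${route}`
}
```

Feeding each changed file through such a mapping yields the `only_urls` array written back into the scraper config.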

plugins/search/plugin-meilisearch/src/cli/index.ts

Lines changed: 6 additions & 4 deletions
@@ -2,8 +2,9 @@
 
 import { createCommand } from 'commander'
 
+import { logger } from 'vuepress/utils'
 import pkg from '../../package.json' with { type: 'json' }
-import { generateScraperConfig } from './generateScraperConfig'
+import { generateScraperConfig } from './generateScraperConfig.js'
 
 interface MeiliSearchCommandOptions {
   config?: string
@@ -25,8 +26,8 @@ program
   .option('--clean-temp', 'Clean the temporary files before generation')
   .argument('<source>', 'Source directory of VuePress project')
   .argument(
-    '[output]',
-    'Output folder (default: .vuepress/meilisearch-config.json relative to source folder)',
+    '[scraper-path]',
+    'Scraper config file path (default: .vuepress/meilisearch-config.json relative to source folder)',
   )
   .action(
     async (
@@ -37,7 +38,8 @@ program
       try {
         await generateScraperConfig(sourceDir, output, commandOptions)
       } catch (error) {
-        program.error(`Command execution error: ${(error as Error).message}`)
+        logger.error(error)
+        program.error(`Command execution error.`)
       }
     },
   )
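The change above logs the full error object before exiting with a short, stable message. A minimal sketch of that pattern, with a `fail` callback standing in for commander's `program.error` (which also terminates the process):

```typescript
// Sketch of the error-handling pattern: print the full error for debugging,
// then fail with a short, stable message. `fail` is a stand-in for
// commander's `program.error`.
const run = (task: () => void, fail: (msg: string) => void): void => {
  try {
    task()
  } catch (error) {
    console.error(error) // full stack for debugging (the CLI uses vuepress's logger)
    fail('Command execution error.')
  }
}

let message = ''
run(
  () => {
    throw new Error('boom')
  },
  (msg) => {
    message = msg
  },
)
```

Keeping the user-facing message constant while logging the underlying error separately makes CI failures easier to grep for.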

plugins/search/plugin-meilisearch/src/cli/shouldRescrape.ts

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 import type { SpawnSyncReturns } from 'node:child_process'
 import { spawnSync } from 'node:child_process'
 import { logger } from 'vuepress/utils'
-import { getWorkspaceStatus } from './utils'
+import { getWorkspaceStatus } from './utils.js'
 
 /**
  * Checks if a full rescrape is needed by examining the most recent commit
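`shouldRescrape` relies on Git to inspect the most recent commit, which is why the workflow checks out with `fetch-depth: 2`. A hedged sketch of that kind of check (the function names here are illustrative, not the plugin's actual API):

```typescript
// Illustrative sketch: list the files touched by the latest commit, then
// decide whether any Markdown source changed. Requires a Git repository
// with at least two commits available (hence `fetch-depth: 2` in CI).
import { spawnSync } from 'node:child_process'

const getChangedFiles = (cwd = process.cwd()): string[] => {
  const result = spawnSync('git', ['diff', '--name-only', 'HEAD~1', 'HEAD'], {
    cwd,
    encoding: 'utf-8',
  })
  // non-zero status covers missing git, shallow history, or no parent commit
  if (result.status !== 0) return []
  return result.stdout.split('\n').filter(Boolean)
}

const hasMarkdownChanges = (files: string[]): boolean =>
  files.some((file) => file.endsWith('.md'))
```

If no Markdown source changed, a rescrape can be skipped entirely; otherwise the changed paths feed the `only_urls` generation.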
