diff --git a/clis/openclaw/all.ts b/clis/openclaw/all.ts
new file mode 100644
index 000000000..f30fd0bce
--- /dev/null
+++ b/clis/openclaw/all.ts
@@ -0,0 +1,140 @@
+/**
+ * OpenCLI all - download every document
+ *
+ * Usage:
+ *   opencli openclaw all
+ *   opencli openclaw all --lang zh-CN
+ *
+ * Depends on:
+ *   extractCategories() from './list' - collects the category list
+ *   extractContent() from './read' - extracts the document content
+ *   postProcessFile() from './read' - post-processes the written file
+ */
+
+import { cli, Strategy } from '@jackwener/opencli/registry';
+import { getBaseUrl, getDefaultLang, buildDocUrl, safeTitle } from './tool.js';
+import { downloadArticle } from '@jackwener/opencli/download/article-download';
+import { extractCategories, type Category } from './list.js';
+import { extractContent, postProcessFile } from './read.js';
+
+/** One row of the command output. */
+interface DownloadResult {
+  title: string;
+  path: string;
+  status: string;
+}
+
+// page.wait() is given seconds everywhere else in this adapter (wait(3) after
+// page load, and the `wait` argument of `read`), so all delays are in seconds.
+const HOME_SETTLE_SECONDS = 3;
+const PAGE_DELAY_MIN_SECONDS = 3;
+const PAGE_DELAY_JITTER_SECONDS = 2;
+const RETRY_BACKOFF_SECONDS = 5;
+const MAX_RETRIES = 3;
+
+cli({
+  site: 'openclaw',
+  name: 'all',
+  description: '下载所有 OpenCLI 文档',
+  domain: 'docs.openclaw.ai',
+  strategy: Strategy.PUBLIC,
+  browser: true,
+  timeoutSeconds: 300,
+  args: [
+    { name: 'lang', default: 'en', help: '语言: en, zh-CN, ja-JP 等' },
+    { name: 'output', default: './openclaw-docs', help: '输出目录' },
+    { name: 'download-images', type: 'boolean', default: false, help: '是否下载图片' },
+  ],
+  columns: ['title', 'path', 'status'],
+  func: async (page, kwargs) => {
+    const lang = kwargs.lang || getDefaultLang();
+    const baseUrl = await getBaseUrl(lang);
+
+    // 1. Read the top-level category list from the documentation home page.
+    await page.goto(baseUrl);
+    await page.wait(HOME_SETTLE_SECONDS);
+    const categories: Category[] = await extractCategories(page);
+
+    if (!categories || categories.length === 0) {
+      return [{ title: 'No categories found', path: baseUrl, status: 'failed' }];
+    }
+
+    // 2. First pass: try every category once. Failures are only recorded
+    //    here and retried after the whole list has been attempted.
+    const results: DownloadResult[] = [];
+    const pending: Array<{ cat: Category; row: DownloadResult }> = [];
+
+    for (const cat of categories) {
+      const ok = await downloadCategory(page, cat, baseUrl, kwargs);
+      const row: DownloadResult = {
+        title: cat.title,
+        path: cat.path,
+        status: ok ? 'success' : 'pending',
+      };
+      results.push(row);
+      if (!ok) pending.push({ cat, row });
+    }
+
+    // 3. Second pass: retry the failures, waiting longer before each attempt.
+    for (const { cat, row } of pending) {
+      let ok = false;
+      for (let attempt = 1; attempt <= MAX_RETRIES && !ok; attempt++) {
+        await page.wait(RETRY_BACKOFF_SECONDS * attempt);
+        ok = await downloadCategory(page, cat, baseUrl, kwargs);
+      }
+      row.status = ok ? 'success' : 'failed';
+    }
+
+    return results;
+  },
+});
+
+/**
+ * Download one category page and post-process the Markdown written for it.
+ *
+ * @param page - page object handed to the command by the CLI runtime
+ * @param cat - category whose page is downloaded
+ * @param baseUrl - base URL returned by getBaseUrl()
+ * @param kwargs - command arguments; `output` and `download-images` are read
+ * @returns true on success, false when any step threw
+ */
+async function downloadCategory(
+  page: any,
+  cat: Category,
+  baseUrl: string,
+  kwargs: any
+): Promise<boolean> {
+  try {
+    const catUrl = buildDocUrl(baseUrl, cat.path);
+    await page.goto(catUrl);
+    // Wait a random 3-5 seconds so requests are not sent too quickly.
+    await page.wait(PAGE_DELAY_MIN_SECONDS + Math.random() * PAGE_DELAY_JITTER_SECONDS);
+
+    const data = await extractContent(page);
+    const title = data.title || cat.title;
+
+    await downloadArticle(
+      {
+        title,
+        sourceUrl: catUrl,
+        contentHtml: data.contentHtml,
+        imageUrls: data.imageUrls,
+      },
+      {
+        output: kwargs.output,
+        downloadImages: kwargs['download-images'],
+      }
+    );
+
+    // NOTE(review): assumes downloadArticle writes
+    // <output>/<safeTitle>/<safeTitle>.md - confirm against its implementation.
+    const safeTitleStr = safeTitle(title);
+    const mdPath = `${kwargs.output}/${safeTitleStr}/${safeTitleStr}.md`;
+    postProcessFile(mdPath);
+
+    return true;
+  } catch {
+    // Best effort: the caller records the failure and retries it later.
+    return false;
+  }
+}
diff --git a/clis/openclaw/list.ts b/clis/openclaw/list.ts
new file mode 100644
index 000000000..de31f5141
--- /dev/null
+++ b/clis/openclaw/list.ts
@@ -0,0 +1,107 @@
+/**
+ * OpenCLI list - list the top-level documentation categories
+ *
+ * Usage:
+ *   opencli openclaw list
+ *   opencli openclaw list --lang zh-CN
+ */
+
+import { cli, Strategy } from '@jackwener/opencli/registry';
+import { getBaseUrl } from './tool.js';
+
+export interface Category {
+  title: string;
+  path: string;
+}
+
+/**
+ * Extract the documentation category list from the current page.
+ *
+ * Header/navigation links are tried first; when they yield fewer than three
+ * entries they are discarded and the sidebar links are used instead.
+ *
+ * @param page - page object (must already be on the documentation home page)
+ * @returns the links found, de-duplicated by href
+ */
+export async function extractCategories(page: any): Promise<Category[]> {
+  return await page.evaluate(`
+    (() => {
+      const results = [];
+      const seen = new Set();
+
+      const headerSelectors = [
+        'header a[href]', 'nav a[href]', '.vp-header a[href]',
+        '.navbar a[href]', '[class*="header"] a[href]', '[class*="nav"] a[href]'
+      ];
+
+      for (const selector of headerSelectors) {
+        const elements = document.querySelectorAll(selector);
+        elements.forEach(el => {
+          const href = el.getAttribute('href');
+          const text = el.textContent?.trim() || '';
+          if (text && href && !seen.has(href) &&
+              (href.startsWith('/') || href.startsWith('.')) &&
+              text.length > 1 && text.length < 30 &&
+              !href.includes('#') && !href.includes('?')) {
+            seen.add(href);
+            results.push({ title: text, path: href });
+          }
+        });
+        if (results.length > 0) break;
+      }
+
+      if (results.length < 3) {
+        results.length = 0;
+        seen.clear();
+        const sidebarSelectors = [
+          '.vp-sidebar .sidebar-item', '.vp-sidebar > ul > li > a',
+          '.sidebar > .sidebar-links > li > a', '.sidebar a[href]', 'aside.sidebar a[href]'
+        ];
+        for (const selector of sidebarSelectors) {
+          const elements = document.querySelectorAll(selector);
+          if (elements.length === 0) continue;
+          elements.forEach(el => {
+            const href = el.getAttribute('href');
+            const text = el.textContent?.trim() || '';
+            const parent = el.parentElement;
+            const grandparent = parent?.parentElement;
+            const isTopLevel = grandparent && (
+              grandparent.classList?.contains('sidebar') ||
+              grandparent.classList?.contains('vp-sidebar') ||
+              grandparent.tagName === 'ASIDE'
+            );
+            if (text && href && !seen.has(href) &&
+                (href.startsWith('/') || href.startsWith('.')) &&
+                text.length > 1 && text.length < 30 &&
+                !href.includes('#') && (isTopLevel || elements.length < 15)) {
+              seen.add(href);
+              results.push({ title: text, path: href });
+            }
+          });
+          if (results.length >= 3) break;
+        }
+      }
+
+      return results;
+    })()
+  `) as Category[];
+}
+
+cli({
+  site: 'openclaw',
+  name: 'list',
+  description: '获取 OpenCLI 文档顶层分类列表',
+  domain: 'docs.openclaw.ai',
+  strategy: Strategy.PUBLIC,
+  browser: true,
+  args: [
+    { name: 'lang', default: 'en', help: '语言: en 或 zh-CN' },
+  ],
+  columns: ['title', 'path'],
+  func: async (page, kwargs) => {
+    const baseUrl = await getBaseUrl(kwargs.lang);
+    await page.goto(baseUrl);
+    await page.wait(3);
+    return await extractCategories(page);
+  },
+});
diff --git a/clis/openclaw/read.ts b/clis/openclaw/read.ts
new file mode 100644
index 000000000..cce80906e
--- /dev/null
+++ b/clis/openclaw/read.ts
@@ -0,0 +1,269 @@
+/**
+ * OpenCLI read - read a documentation page
+ *
+ * Usage:
+ *   opencli openclaw read --path "/guide/intro"
+ *   opencli openclaw read --path "/guide/intro" --lang zh-CN
+ *
+ * Core functions:
+ *   extractContent(page) - extracts the page content
+ *   postProcessFile(mdPath) - post-processes the Markdown file
+ */
+
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { cli, Strategy } from '@jackwener/opencli/registry';
+import { downloadArticle } from '@jackwener/opencli/download/article-download';
+import { getBaseUrl, getDefaultLang, getOrigin, safeTitle, fixWindowsPath } from './tool.js';
+
+export interface Content {
+  title: string;
+  contentHtml: string;
+  imageUrls: string[];
+  sourceUrl: string;
+}
+
+/**
+ * Extract the title, cleaned content HTML and image URLs of the current page.
+ *
+ * NOTE(review): the title clean-up strips an "OpenCLI" suffix, while the
+ * Markdown post-processing looks for " - OpenClaw" - confirm which suffix the
+ * site really uses.
+ *
+ * @param page - page object (must already be on the documentation page)
+ * @returns the extracted title, contentHtml, imageUrls and sourceUrl
+ */
+export async function extractContent(page: any): Promise<Content> {
+  return await page.evaluate(`
+    (() => {
+      const result = { title: '', contentHtml: '', imageUrls: [], sourceUrl: location.href };
+
+      // 标题提取 - 优先使用 og:title meta 标签
+      const ogTitle = document.querySelector('meta[property="og:title"]');
+      if (ogTitle) {
+        result.title = ogTitle.getAttribute('content')?.trim() || '';
+      }
+      // 降级到 document.title
+      if (!result.title) {
+        result.title = document.title?.trim() || '';
+      }
+      // 清理标题后缀(如 " - OpenCLI")
+      result.title = result.title.replace(/\\s*[|\\-–—]\\s*OpenCLI.*$/, '').trim() || 'untitled';
+
+      // 内容提取 - 尝试多个选择器找到主要内容区域
+      let contentEl = null;
+      const contentSelectors = [
+        'article', // 标准文章标签
+        '[role="main"]', // ARIA 主内容区
+        'main', // HTML5 main 标签
+        '.mdx-content', // MDX 内容容器
+        '.content', // 通用内容类名
+        '.docs-content' // 文档内容类名
+      ];
+
+      for (const selector of contentSelectors) {
+        const el = document.querySelector(selector);
+        if (el && (el.textContent?.length || 0) > 200) {
+          contentEl = el;
+          break;
+        }
+      }
+
+      // 如果都没找到,使用 body
+      if (!contentEl) {
+        contentEl = document.body;
+      }
+
+      // 克隆并清理噪音元素(导航、侧边栏、搜索等)
+      const clone = contentEl.cloneNode(true);
+      const noise = [
+        'nav', 'header:not(article header)', 'footer:not(article footer)', 'aside',
+        '.navbar', '.nav', '.sidebar', '.menu', '.header', '.footer',
+        '.sidebar-content', '#sidebar', '#navbar', '#table-of-contents',
+        '[class*="sidebar"]', '[class*="nav-"]', '[class*="navbar"]',
+        '.search', '#search', '[class*="search"]',
+        '[data-testid*="search"]', '#search-bar-entry', '#search-bar-entry-mobile',
+        '.nav-logo', '[class*="logo"]', '[class*="brand"]', 'picture',
+        '#localization-select-trigger', '[class*="localization"]', '[class*="language"]',
+        '.toc', '#table-of-contents', '[class*="table-of-contents"]',
+        '#content-side-layout', '.content-side-layout', '#background-color',
+        '.comments', '.comment', '.ad', '.ads', '.advertisement', '[class*="advertisement"]',
+        'script', 'style', 'noscript', 'iframe', 'template',
+        '[hidden]', '[aria-hidden="true"]', '.sr-only', '.visually-hidden',
+        '[data-component-part="copy-code-button"]', '[data-rmiz]', '[aria-owns*="rmiz"]',
+        '.code-block[data-floating-buttons]', '[data-custom-css-index]'
+      ].join(', ');
+
+      clone.querySelectorAll(noise).forEach(el => el.remove());
+      result.contentHtml = clone.innerHTML;
+
+      // 图片提取 - 去重
+      const seen = new Set();
+      clone.querySelectorAll('img').forEach(img => {
+        const src = img.getAttribute('data-src')
+          || img.getAttribute('data-original')
+          || img.getAttribute('src');
+        if (src && !src.startsWith('data:') && !seen.has(src)) {
+          seen.add(src);
+          result.imageUrls.push(src);
+        }
+      });
+
+      return result;
+    })()
+  `) as Content;
+}
+
+/**
+ * Clean up the Markdown text written for a documentation page.
+ *
+ * Repairs artefacts of the extraction and conversion steps:
+ * - headings split across several lines
+ * - leftover empty heading markers and empty anchor links
+ * - the title / frontmatter lines at the top
+ * - the position of the source-link line (moved to the end)
+ *
+ * @param markdown - Markdown text to clean
+ * @returns the cleaned Markdown text
+ */
+function postProcessMarkdown(markdown: string): string {
+  let content = markdown;
+
+  // 1. Re-join split headings: "##" + "[](#xxx)" + "text" -> "## text".
+  //    \u200B is the zero-width space that may sit inside the empty link.
+  content = content.replace(
+    /^##\s*\n+\[\s*\u200B?\s*\]\(#([^)]+)\)\s*\n+([^#\n-][^\n]+)/gm,
+    (_match: string, _anchor: string, title: string) => `## ${title.trim()}`
+  );
+
+  // 2. Rewrite the title line: "# Title - OpenClaw" -> "> OpenClaw - Title".
+  content = content.replace(/^# (.+) - OpenClaw\s*$/m, '> OpenClaw - $1');
+
+  // 3. Pull the source-link line ("> 原文链接: ...") out; it is appended in step 9.
+  let sourceUrl = '';
+  content = content.replace(/^> 原文链接:\s*(.+)\s*$/gm, (_match: string, url: string) => {
+    sourceUrl = `> 原文链接: ${url.trim()}`;
+    return '';
+  });
+
+  // 4. Drop leftover empty heading markers ("##" and "#").
+  content = content.replace(/^##\s*$/gm, '');
+  content = content.replace(/^#\s*$/gm, '');
+
+  // 5. Drop leftover empty anchor-link lines.
+  content = content.replace(/^\[\s*\u200B?\s*\]\(#[^)]+\)\s*$/gm, '');
+  content = content.replace(/^\[\s*\u200B?\s*\n+\s*\]\(#[^)]+\)\s*$/gm, '');
+
+  // 6. When the text after "---" starts with a CJK character (the page h1), prefix it with "# ".
+  content = content.replace(/^---[\s\n]*([\u4e00-\u9fff])/m, '---\n\n# $1');
+
+  // 7. Remove the leading noise: "> OpenClaw - xxx" blockquotes and "---" rules.
+  content = content.replace(/^> OpenClaw - [^\n]+\n*/gm, '');
+  content = content.replace(/^---\n*/gm, '');
+
+  // 8. Collapse runs of blank lines. The removals above leave runs of three
+  //    newlines behind, so the threshold is 3 rather than 4.
+  content = content.replace(/\n{3,}/g, '\n\n');
+
+  // 9. Append the source link at the very end of the file.
+  if (sourceUrl) {
+    content = content.trim() + '\n\n' + sourceUrl + '\n';
+  }
+
+  return content;
+}
+
+/**
+ * Post-process a downloaded Markdown file in place.
+ *
+ * Best effort: errors while reading or writing the file are ignored.
+ *
+ * @param mdPath - path of the Markdown file
+ */
+export function postProcessFile(mdPath: string): void {
+  try {
+    const original = fs.readFileSync(mdPath, 'utf-8');
+    fs.writeFileSync(mdPath, postProcessMarkdown(original), 'utf-8');
+  } catch {
+    // Deliberately ignored: post-processing is best effort.
+  }
+}
+
+/**
+ * Build the URL to open for the `path` argument.
+ *
+ * @param pathArg - value of --path: a full URL or a document path
+ * @param lang - language code the command runs with
+ * @param baseUrl - base URL returned by getBaseUrl() for that language
+ * @returns the URL to open
+ */
+function resolveDocUrl(pathArg: string, lang: string, baseUrl: string): string {
+  if (pathArg.startsWith('http')) return pathArg;
+
+  // Avoid the Windows path problem: "/D:/xxx" is corrected to "/xxx".
+  let docPath = fixWindowsPath(pathArg);
+  if (!docPath.startsWith('/')) docPath = '/' + docPath;
+
+  // A path that already carries the language prefix (e.g. /zh-CN/xxx) is
+  // appended to the bare origin; otherwise baseUrl supplies the prefix.
+  const langPrefix = '/' + lang;
+  const hasLangPrefix = docPath === langPrefix || docPath.startsWith(langPrefix + '/');
+  return (hasLangPrefix ? getOrigin(baseUrl) : baseUrl) + docPath;
+}
+
+cli({
+  site: 'openclaw',
+  name: 'read',
+  description: '读取 OpenCLI docs',
+  domain: 'docs.openclaw.ai',
+  strategy: Strategy.PUBLIC,
+  browser: true,
+  args: [
+    { name: 'path', required: true, help: '文档路径,如 /guide/intro(不需要包含 /zh-CN 前缀)' },
+    { name: 'lang', default: 'en', help: '语言: en 或 zh-CN' },
+    { name: 'output', default: './openclaw-docs', help: '输出目录' },
+    { name: 'download-images', type: 'boolean', default: false, help: '是否下载图片' },
+    { name: 'wait', type: 'int', default: 3, help: '页面加载后等待秒数' }
+  ],
+  columns: ['title', 'url', 'size'],
+  func: async (page, kwargs) => {
+    const lang = kwargs.lang || getDefaultLang();
+    const baseUrl = await getBaseUrl(lang);
+    const waitSeconds = kwargs.wait ?? 3;
+    const url = resolveDocUrl(kwargs.path, lang, baseUrl);
+
+    // Open the document page and wait the requested number of seconds.
+    await page.goto(url);
+    await page.wait(waitSeconds);
+
+    const data = await extractContent(page);
+    const title = data?.title || 'untitled';
+    const contentHtml = data?.contentHtml || '';
+
+    const downloadResult = await downloadArticle(
+      {
+        title,
+        sourceUrl: url,
+        contentHtml,
+        imageUrls: data?.imageUrls,
+      },
+      {
+        output: kwargs.output,
+        downloadImages: kwargs['download-images'],
+      }
+    );
+
+    // NOTE(review): assumes downloadArticle writes
+    // <output>/<safeTitle>/<safeTitle>.md - confirm against its implementation.
+    const safeTitleStr = safeTitle(title);
+    postProcessFile(path.join(kwargs.output, safeTitleStr, `${safeTitleStr}.md`));
+
+    const downloaded = downloadResult[0] || {};
+    return [{
+      title: data?.title || downloaded.title || 'untitled',
+      url,
+      // The reported size is the length of the extracted HTML string.
+      size: contentHtml.length,
+    }];
+  },
+});
diff --git a/clis/openclaw/tool.ts b/clis/openclaw/tool.ts
new file mode 100644
index 000000000..7695af9d1
--- /dev/null
+++ b/clis/openclaw/tool.ts
@@ -0,0 +1,166 @@
+/**
+ * Documentation language and URL helpers
+ *
+ * Used by all.ts, list.ts and read.ts
+ */
+
+export interface Language {
+  code: string;
+  name: string;
+  url: string;
+}
+
+/** Root URL of the documentation site; also the English base URL. */
+const DOCS_ROOT = 'https://docs.openclaw.ai';
+
+// Known languages (lookup table)
+const LANGS: readonly Language[] = [
+  { code: 'en', name: 'English', url: DOCS_ROOT },
+  { code: 'zh-CN', name: '简体中文', url: `${DOCS_ROOT}/zh-CN` }
+];
+
+/**
+ * Get the supported language codes.
+ */
+export function getSupportedLangCodes(): string[] {
+  return LANGS.map(l => l.code);
+}
+
+/**
+ * Get the default language code.
+ */
+export function getDefaultLang(): string {
+  return 'en';
+}
+
+/**
+ * Get the base URL for a language code.
+ *
+ * Lookup order: an exact (case-insensitive) match in LANGS, then the URL
+ * built from the code itself if it exists, then the URL of a fuzzily matched
+ * LANGS entry if it exists, and finally the English root.
+ *
+ * @param lang - language code (e.g. 'en', 'zh-CN', 'cn', 'zh', 'ja-JP')
+ * @returns the matching base URL; the English root when nothing matches
+ */
+export async function getBaseUrl(lang?: string): Promise<string> {
+  const code = lang || getDefaultLang();
+  const codeLower = code.toLowerCase();
+
+  const exact = LANGS.find(l => l.code.toLowerCase() === codeLower);
+  if (exact) return exact.url;
+
+  // The code is not listed, so probe the URL built from the raw input first
+  // (it may be more precise than a fuzzy match). The code is user input, so
+  // it is percent-encoded before being placed in the path.
+  const directUrl = `${DOCS_ROOT}/${encodeURIComponent(code)}`;
+  if (await urlExists(directUrl)) return directUrl;
+
+  // Fuzzy match: the input may extend a listed code ("<code>-" / "<code>_"
+  // prefix) or be a fragment of one (substring match, e.g. cn -> zh-CN).
+  const fuzzy =
+    LANGS.find(l => {
+      const known = l.code.toLowerCase();
+      return codeLower.startsWith(known + '-') || codeLower.startsWith(known + '_');
+    }) ?? LANGS.find(l => l.code.toLowerCase().includes(codeLower));
+  if (fuzzy && (await urlExists(fuzzy.url))) return fuzzy.url;
+
+  return DOCS_ROOT;
+}
+
+/**
+ * Extract the origin of a base URL, dropping any language prefix.
+ *
+ * @param baseUrl - URL with a language prefix, e.g. https://docs.openclaw.ai/zh-CN
+ * @returns the bare origin, e.g. https://docs.openclaw.ai
+ */
+export function getOrigin(baseUrl: string): string {
+  return new URL(baseUrl).origin;
+}
+
+/**
+ * Build the full documentation URL for a path.
+ *
+ * @param baseUrl - base URL with the language prefix
+ * @param path - document path (may already contain the language prefix)
+ * @returns the full documentation URL
+ */
+export function buildDocUrl(baseUrl: string, path: string): string {
+  // A path that is already a full URL is returned unchanged.
+  if (path.startsWith('http')) return path;
+
+  const origin = getOrigin(baseUrl);
+
+  // A root-relative path (which may already carry the language prefix, e.g.
+  // /zh-CN/channels) is appended to the origin.
+  if (path.startsWith('/')) {
+    return origin + path;
+  }
+
+  // A relative path is appended to baseUrl.
+  return baseUrl + '/' + path;
+}
+
+/**
+ * Turn a title into a string that is safe to use as a file name.
+ *
+ * Every character other than word characters, CJK ideographs (U+4E00-U+9FFF)
+ * and "-" becomes "_"; the result is cut to 80 characters.
+ *
+ * @param title - original title
+ * @returns the sanitised title
+ */
+export function safeTitle(title: string): string {
+  return title.replace(/[^\w\u4e00-\u9fff-]/g, '_').substring(0, 80);
+}
+
+/**
+ * Fix the Windows path problem.
+ *
+ * A path shaped like /D:/xxx is corrected to /xxx.
+ *
+ * @param path - path to fix
+ * @returns the corrected path
+ */
+export function fixWindowsPath(path: string): string {
+  const windowsPathMatch = path.match(/^\/([A-Za-z]):\/(.*)/);
+  if (windowsPathMatch) {
+    return '/' + windowsPathMatch[2];
+  }
+  return path;
+}
+
+/** Redirect statuses that are treated as "the URL exists". */
+const REDIRECT_STATUSES = [301, 302, 303, 307, 308];
+
+/**
+ * Check whether a URL exists.
+ *
+ * Sends a HEAD request without following redirects; an OK response or a
+ * redirect counts as existing, a failed request as not existing.
+ */
+async function urlExists(url: string): Promise<boolean> {
+  try {
+    const res = await fetch(url, { method: 'HEAD', redirect: 'manual' });
+    return res.ok || REDIRECT_STATUSES.includes(res.status);
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Get the language list.
+ */
+export function getLanguages(): Language[] {
+  // Copies are returned so callers cannot modify the module's table.
+  return LANGS.map(l => ({ ...l }));
+}
+
+/**
+ * Get the available languages (interface kept; LANGS is used internally).
+ *
+ * @deprecated use getLanguages() instead
+ */
+export async function getAvailableLanguages(): Promise<Language[]> {
+  return getLanguages();
+}
diff --git a/package.json b/package.json
index eb20d32f2..ef2a0ff5f 100644
--- a/package.json
+++ b/package.json
@@ -51,7 +51,7 @@
     "postinstall": "node scripts/postinstall.js || true; node scripts/fetch-adapters.js || true",
     "typecheck": "tsc --noEmit",
     "lint": "tsc --noEmit",
-    "prepare": "[ -d src ] && npm run build || true",
+    "prepare": "node scripts/prepare.cjs",
     "prepublishOnly": "npm run build",
     "test": "vitest run --project unit",
     "test:bun": "bun vitest run --project unit",
diff --git a/scripts/prepare.cjs b/scripts/prepare.cjs
new file mode 100644
index 000000000..79428cbcd
--- /dev/null
+++ b/scripts/prepare.cjs
@@ -0,0 +1,35 @@
+#!/usr/bin/env node
+/**
+ * Cross-platform prepare script — builds the project if src/ exists.
+ * Works on Linux, macOS, and Windows.
+ */
+
+const { execSync } = require('child_process');
+const { existsSync } = require('fs');
+const path = require('path');
+
+function main() {
+  // Resolve against the repository root (the parent of scripts/) so the
+  // src/ check and the build use the same directory whatever the cwd is.
+  const projectRoot = path.resolve(__dirname, '..');
+
+  // Nothing to build when src/ is absent.
+  if (!existsSync(path.join(projectRoot, 'src'))) {
+    return;
+  }
+
+  try {
+    console.log('Building project...');
+    execSync('npm run build', {
+      stdio: 'inherit',
+      shell: true,
+      cwd: projectRoot
+    });
+  } catch {
+    // Build failure is non-fatal for prepare
+    console.warn('Build failed; continuing because prepare is non-fatal.');
+    process.exit(0);
+  }
+}
+
+main();