diff --git a/README.md b/README.md index cbe9417080..bbf963080d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ $ npm install @adobe/spacecat-audit-worker ## Usage -See the [API documentation](docs/API.md). +See the detailed [API documentation](docs/API.md). ## Development diff --git a/src/common/step-audit.js b/src/common/step-audit.js index 30729cda2f..a0183188ba 100644 --- a/src/common/step-audit.js +++ b/src/common/step-audit.js @@ -83,6 +83,10 @@ export class StepAudit extends BaseAudit { if (step.destination === AUDIT_STEP_DESTINATIONS.SCRAPE_CLIENT) { const scrapeClient = ScrapeClient.createFrom(context); const payload = destination.formatPayload(stepResult, auditContext, context); + if (stepResult?.status === 'skipped' || !Array.isArray(payload?.urls) || payload.urls.length === 0) { + log.info(`Skipping scrapeJob creation for step ${step.name}: no URLs to scrape`); + return stepResult; + } log.debug(`Creating new scrapeJob with the ScrapeClient. Payload: ${JSON.stringify(payload)}`); const scrapeJob = await scrapeClient.createScrapeJob(payload); log.info(`Created scrapeJob with id: ${scrapeJob.id}`); diff --git a/src/internal-links/base-url.js b/src/internal-links/base-url.js new file mode 100644 index 0000000000..7d81f9deac --- /dev/null +++ b/src/internal-links/base-url.js @@ -0,0 +1,40 @@ +/* + * Copyright 2026 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ + +import { isValidUrl } from '@adobe/spacecat-shared-utils'; +import { wwwUrlResolver } from '../common/base-audit.js'; + +export function getInternalLinksFetchConfig(site) { + const siteConfig = site?.getConfig?.(); + return siteConfig?.getFetchConfig?.() + || siteConfig?.fetchConfig + || siteConfig?.config?.fetchConfig + || {}; +} + +export function resolveInternalLinksBaseURL(site) { + const overrideBaseURL = getInternalLinksFetchConfig(site)?.overrideBaseURL; + if (isValidUrl(overrideBaseURL)) { + return overrideBaseURL; + } + + return site?.getBaseURL?.() || ''; +} + +export async function resolveInternalLinksRumDomain(site, context) { + return wwwUrlResolver({ + getBaseURL: () => site?.getBaseURL?.(), + getConfig: () => ({ + getFetchConfig: () => ({}), + }), + }, context); +} diff --git a/src/internal-links/config.js b/src/internal-links/config.js index 01872a9e3c..3f614b9e23 100644 --- a/src/internal-links/config.js +++ b/src/internal-links/config.js @@ -14,6 +14,11 @@ import { hasText } from '@adobe/spacecat-shared-utils'; import { PAGES_PER_BATCH } from './crawl-detection.js'; import { MAX_BROKEN_LINKS_REPORTED } from './result-utils.js'; +export { + getInternalLinksFetchConfig, + resolveInternalLinksBaseURL, +} from './base-url.js'; + const MAX_URLS_TO_PROCESS = 100; const DEFAULT_LINKCHECKER_MIN_TIME_NEEDED_MS = 5 * 60 * 1000; const MAX_BROKEN_LINKS = 100; @@ -131,7 +136,36 @@ export class InternalLinksConfigResolver { } isLinkCheckerEnabled() { - return this.handlerConfig.isLinkcheckerEnabled ?? 
false; + const camelCaseValue = getBooleanConfig(this.handlerConfig.isLinkCheckerEnabled, undefined); + const legacyValue = getBooleanConfig(this.handlerConfig.isLinkcheckerEnabled, undefined); + + if (typeof camelCaseValue === 'boolean') { + return camelCaseValue; + } + if (typeof legacyValue === 'boolean') { + return legacyValue; + } + return false; + } + + getLinkCheckerFlagDebugInfo() { + const camelCaseValue = getBooleanConfig(this.handlerConfig.isLinkCheckerEnabled, undefined); + const legacyValue = getBooleanConfig(this.handlerConfig.isLinkcheckerEnabled, undefined); + let source = 'default:false'; + if (typeof camelCaseValue === 'boolean') { + source = 'isLinkCheckerEnabled'; + } else if (typeof legacyValue === 'boolean') { + source = 'isLinkcheckerEnabled'; + } + + return { + enabled: this.isLinkCheckerEnabled(), + source, + camelCaseRaw: this.handlerConfig.isLinkCheckerEnabled, + legacyRaw: this.handlerConfig.isLinkcheckerEnabled, + camelCaseValue, + legacyValue, + }; } getLinkCheckerProgramId() { diff --git a/src/internal-links/crawl-detection.js b/src/internal-links/crawl-detection.js index 7d784b5bbc..3b83faddf4 100644 --- a/src/internal-links/crawl-detection.js +++ b/src/internal-links/crawl-detection.js @@ -73,7 +73,8 @@ function getSourceItemType(parentTag) { function getAssetTypeFromUrl(url, pageUrl = 'https://example.com') { try { const pathname = new URL(url, pageUrl).pathname.toLowerCase(); - if (/\.(svg|png|jpe?g|gif|webp|avif)$/.test(pathname)) return 'image'; + if (/\.svg$/.test(pathname)) return 'svg'; + if (/\.(png|jpe?g|gif|webp|avif)$/.test(pathname)) return 'image'; /* c8 ignore start - Asset type branches covered by integration tests at extraction level */ if (/\.css$/.test(pathname)) return 'css'; if (/\.js$/.test(pathname)) return 'js'; diff --git a/src/internal-links/finalization.js b/src/internal-links/finalization.js index 1f33cc2af1..9eb2217bba 100644 --- a/src/internal-links/finalization.js +++ 
b/src/internal-links/finalization.js @@ -15,6 +15,7 @@ import { createInternalLinksStepLogger } from './logging.js'; import { classifyStatusBucket, isLinkInaccessible } from './helpers.js'; import { isWithinAuditScope } from './subpath-filter.js'; import { isSharedInternalResource } from './scope-utils.js'; +import { resolveInternalLinksBaseURL } from './base-url.js'; function isOnAuditHost(url, baseURL) { try { @@ -27,10 +28,52 @@ function isOnAuditHost(url, baseURL) { } } +function normalizeLinkCheckerUrl(url, baseURL) { + if (!url || !baseURL) { + return url; + } + + try { + const parsedBaseURL = new URL(prependSchema(baseURL)); + const trimmedUrl = String(url).trim(); + + if (trimmedUrl.startsWith('http://') || trimmedUrl.startsWith('https://')) { + return trimmedUrl; + } + + let normalizedPath = trimmedUrl; + + // LinkChecker emits repository-style content paths for source pages in AEM CS logs. + // Resolve them back onto the publish host so they can be scoped like crawl/RUM URLs. + if (normalizedPath.startsWith('/content/ASO/')) { + normalizedPath = normalizedPath.replace(/^\/content\/ASO/, ''); + } + + return new URL(normalizedPath, `${parsedBaseURL.protocol}//${parsedBaseURL.host}`).toString(); + } catch (error) { + return url; + } +} + function normalizeLinkCheckerValidity(validity) { return String(validity || 'UNKNOWN').trim().toUpperCase(); } +const LINKCHECKER_VALIDITY_ONLY_ASSET_TYPES = new Set([ + 'image', + 'svg', + 'css', + 'js', + 'iframe', + 'video', + 'audio', + 'media', +]); + +function requiresExplicitBrokenStatus(itemType) { + return LINKCHECKER_VALIDITY_ONLY_ASSET_TYPES.has(itemType); +} + function disableEventLoopWait(context) { if (context && 'callbackWaitsForEmptyEventLoop' in context) { context.callbackWaitsForEmptyEventLoop = false; @@ -130,7 +173,7 @@ export function createFinalizeCrawlDetection({ step: 'finalize-crawl-detection', }); const shouldCleanup = !skipCrawlDetection; - const baseURL = typeof site.getBaseURL === 'function' ? 
site.getBaseURL() : ''; + const baseURL = resolveInternalLinksBaseURL(site); let finalizationLockAcquired = false; let finalizationLockEtag = null; @@ -149,6 +192,7 @@ export function createFinalizeCrawlDetection({ const timeoutStatus = getTimeoutStatus(lambdaStartTime, context); log.info('====== Finalize: Merge and Generate Suggestions ======'); log.info(`auditId: ${auditId}`); + log.info(`Using audit scope URL for finalization: ${baseURL}`); log.info(`Timeout status: ${timeoutStatus.percentUsed.toFixed(1)}% used, ${Math.floor(timeoutStatus.safeTimeRemaining / 1000)}s safe time remaining`); /* c8 ignore next 4 - Defensive timeout warning path depends on invocation timing */ @@ -191,41 +235,57 @@ export function createFinalizeCrawlDetection({ log.info(`Crawl detected ${crawlLinks.length} broken links`); /* c8 ignore start - defensive normalization defaults */ - let skippedLinkCheckerRows = 0; + const linkCheckerSkipReasons = { + missingUrl: 0, + noBrokenSignal: 0, + outsideScope: 0, + }; + let normalizedLinkCheckerUrls = 0; + let normalizedRepositoryPaths = 0; const linkCheckerLinks = linkCheckerResults .map((lc) => { + const normalizedUrlFrom = normalizeLinkCheckerUrl(lc.urlFrom, baseURL); + const normalizedUrlTo = normalizeLinkCheckerUrl(lc.urlTo, baseURL); + if (normalizedUrlFrom !== lc.urlFrom || normalizedUrlTo !== lc.urlTo) { + normalizedLinkCheckerUrls += 1; + } + if (String(lc.urlFrom || '').startsWith('/content/ASO/')) { + normalizedRepositoryPaths += 1; + } const itemType = lc.itemType || 'link'; const validity = normalizeLinkCheckerValidity(lc.validity); const httpStatus = Number.parseInt(lc.httpStatus, 10); const statusBucket = classifyStatusBucket(httpStatus); const isExplicitBrokenValidity = validity === 'INVALID'; const hasBrokenStatus = Boolean(statusBucket); + const requireBrokenStatus = requiresExplicitBrokenStatus(itemType); - if (!lc.urlFrom || !lc.urlTo) { - skippedLinkCheckerRows += 1; + if (!normalizedUrlFrom || !normalizedUrlTo) { + 
linkCheckerSkipReasons.missingUrl += 1; return null; } - if (!hasBrokenStatus && !isExplicitBrokenValidity) { - skippedLinkCheckerRows += 1; + if ((requireBrokenStatus && !hasBrokenStatus) + || (!hasBrokenStatus && !isExplicitBrokenValidity)) { + linkCheckerSkipReasons.noBrokenSignal += 1; return null; } if (baseURL) { - const targetInScope = isWithinAuditScope(lc.urlTo, baseURL) - || isSharedInternalResource(lc.urlTo, baseURL, itemType); - if (!(isOnAuditHost(lc.urlFrom, baseURL) - && isOnAuditHost(lc.urlTo, baseURL) - && isWithinAuditScope(lc.urlFrom, baseURL) + const targetInScope = isWithinAuditScope(normalizedUrlTo, baseURL) + || isSharedInternalResource(normalizedUrlTo, baseURL, itemType); + if (!(isOnAuditHost(normalizedUrlFrom, baseURL) + && isOnAuditHost(normalizedUrlTo, baseURL) + && isWithinAuditScope(normalizedUrlFrom, baseURL) && targetInScope)) { - skippedLinkCheckerRows += 1; + linkCheckerSkipReasons.outsideScope += 1; return null; } } return { - urlFrom: lc.urlFrom, - urlTo: lc.urlTo, + urlFrom: normalizedUrlFrom, + urlTo: normalizedUrlTo, anchorText: lc.anchorText || '', itemType, detectionSource: 'linkchecker', @@ -238,9 +298,20 @@ export function createFinalizeCrawlDetection({ .filter(Boolean); /* c8 ignore stop */ + log.info( + `LinkChecker normalization v2 active: normalized=${normalizedLinkCheckerUrls}, ` + + `repositoryPaths=${normalizedRepositoryPaths}`, + ); log.info(`LinkChecker links transformed: ${linkCheckerLinks.length} broken links`); + const skippedLinkCheckerRows = Object.values(linkCheckerSkipReasons) + .reduce((sum, count) => sum + count, 0); if (skippedLinkCheckerRows > 0) { - log.info(`Skipped ${skippedLinkCheckerRows} LinkChecker rows without a broken signal or outside audit scope`); + log.info( + `Skipped ${skippedLinkCheckerRows} LinkChecker rows ` + + `(missingUrl=${linkCheckerSkipReasons.missingUrl}, ` + + `noBrokenSignal=${linkCheckerSkipReasons.noBrokenSignal}, ` + + `outsideScope=${linkCheckerSkipReasons.outsideScope})`, 
+ ); } const preValidationStatus = getTimeoutStatus(lambdaStartTime, context); diff --git a/src/internal-links/handler.js b/src/internal-links/handler.js index 91bb1980ef..8e92baa746 100644 --- a/src/internal-links/handler.js +++ b/src/internal-links/handler.js @@ -14,7 +14,6 @@ import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client'; import { Audit, Opportunity as Oppty, Suggestion as SuggestionDataAccess } from '@adobe/spacecat-shared-data-access'; import { isNonEmptyArray } from '@adobe/spacecat-shared-utils'; import { AuditBuilder } from '../common/audit-builder.js'; -import { wwwUrlResolver } from '../common/base-audit.js'; import { convertToOpportunity } from '../common/opportunity.js'; import { createContextLogger } from '../common/context-logger.js'; import { isUnscrapeable, filterBrokenSuggestedUrls } from '../utils/url-utils.js'; @@ -72,6 +71,7 @@ import { createInternalLinksOrchestration } from './orchestration.js'; import { createInternalLinksConfigResolver } from './config.js'; import { createSplunkClient } from './splunk-client.js'; import { extractLocalePathPrefix } from './scope-utils.js'; +import { resolveInternalLinksRumDomain } from './base-url.js'; const { AUDIT_STEP_DESTINATIONS } = Audit; const INTERVAL = 30; @@ -91,8 +91,9 @@ export const { auditType: AUDIT_TYPE, interval: INTERVAL, createContextLogger, + createConfigResolver: createInternalLinksConfigResolver, createRUMAPIClient: (context) => RUMAPIClient.createFrom(context), - resolveFinalUrl: wwwUrlResolver, + resolveFinalUrl: resolveInternalLinksRumDomain, isLinkInaccessible, calculatePriority, isWithinAuditScope, @@ -163,7 +164,7 @@ export const { }); export default new AuditBuilder() - .withUrlResolver(wwwUrlResolver) + .withUrlResolver(resolveInternalLinksRumDomain) .addStep( 'runAuditAndImportTopPagesStep', runAuditAndImportTopPagesStep, diff --git a/src/internal-links/linkchecker-orchestration.js b/src/internal-links/linkchecker-orchestration.js index 64a2184502..4c23d37c55 
100644 --- a/src/internal-links/linkchecker-orchestration.js +++ b/src/internal-links/linkchecker-orchestration.js @@ -21,6 +21,7 @@ import { releaseExecutionLock, } from './batch-state.js'; import { sleep } from '../support/utils.js'; +import { resolveInternalLinksBaseURL } from './base-url.js'; const MAX_POLLING_CONTINUATIONS = 10; @@ -160,7 +161,8 @@ export function createLinkCheckerOrchestration({ const auditId = audit.getId(); const executionLockKey = 'linkchecker-start'; - const isLinkcheckerEnabled = config.isLinkCheckerEnabled(); + const linkCheckerFlagDebug = config.getLinkCheckerFlagDebugInfo(); + const isLinkcheckerEnabled = linkCheckerFlagDebug.enabled; const log = createInternalLinksStepLogger({ createContextLogger, log: baseLog, @@ -171,7 +173,13 @@ export function createLinkCheckerOrchestration({ }); log.info('====== LinkChecker Detection Step ======'); - log.info(`auditId: ${auditId}, isLinkcheckerEnabled: ${isLinkcheckerEnabled}`); + log.info( + `auditId: ${auditId}, isLinkcheckerEnabled: ${isLinkcheckerEnabled}, ` + + `flagSource=${linkCheckerFlagDebug.source}, ` + + `camelCaseRaw=${String(linkCheckerFlagDebug.camelCaseRaw)}, ` + + `legacyRaw=${String(linkCheckerFlagDebug.legacyRaw)}, ` + + 'resolverVersion=v2', + ); const workflowCompletedAt = getWorkflowCompletedAt(audit); if (workflowCompletedAt) { @@ -219,6 +227,7 @@ export function createLinkCheckerOrchestration({ programId, environmentId, lookbackMinutes, + scopeBaseURL: resolveInternalLinksBaseURL(site), }); log.info('Submitting Splunk job for LinkChecker logs'); diff --git a/src/internal-links/linkchecker-splunk.js b/src/internal-links/linkchecker-splunk.js index 7289fd614c..1f70a8822a 100644 --- a/src/internal-links/linkchecker-splunk.js +++ b/src/internal-links/linkchecker-splunk.js @@ -19,21 +19,35 @@ const DEFAULT_MAX_RESULTS = 10000; const TRANSIENT_SPLUNK_STATUS_CODES = new Set([429, 500, 502, 503, 504]); const SPLUNK_REQUEST_MAX_RETRIES = 3; const SPLUNK_RETRY_BASE_DELAY_MS = 1000; 
+const DEFAULT_SPLUNK_SEARCH_NAMESPACE = 'admin/search'; function escapeSplunkString(value) { return String(value).replace(/\\/g, '\\\\').replace(/"/g, '\\"'); } +function buildScopeFilter(scopeBaseURL) { + if (!scopeBaseURL) { + return null; + } + + try { + const { pathname } = new URL(scopeBaseURL); + if (!pathname || pathname === '/') { + return null; + } + + const repoScopedPrefix = `/content/ASO${pathname}`; + return `| where like(urlFrom, "${escapeSplunkString(repoScopedPrefix)}%") OR like(urlFrom, "${escapeSplunkString(pathname)}%")`; + } catch (error) { + return null; + } +} + function getSplunkSearchNamespace(context = {}) { const configuredNamespace = context?.env?.SPLUNK_SEARCH_NAMESPACE; /* c8 ignore next - fallback coercion branch */ const normalizedNamespace = String(configuredNamespace || '').replace(/^\/+|\/+$/g, ''); - - if (!normalizedNamespace) { - throw new Error('SPLUNK_SEARCH_NAMESPACE must be configured for internal-links LinkChecker'); - } - - return normalizedNamespace; + return normalizedNamespace || DEFAULT_SPLUNK_SEARCH_NAMESPACE; } async function fetchSplunkAPIWithRetry(client, request, log) { @@ -87,13 +101,15 @@ export function buildLinkCheckerQuery({ environmentId, lookbackMinutes = DEFAULT_LOOKBACK_MINUTES, maxResults = DEFAULT_MAX_RESULTS, + scopeBaseURL, }) { // Query structure: // 1. Base index and time range // 2. Filter by program and environment (automatically logged by AEM) // 3. Filter for LinkChecker internal link removal events (FT_SITES-39847) - // 4. Parse JSON structure - // 5. Extract fields and limit results + // 4. Parse the top-level event JSON, then extract the embedded JSON payload from msg + // 5. 
Parse embedded LinkChecker payload, extract fields, and limit results + const scopeFilter = buildScopeFilter(scopeBaseURL); return [ 'search', 'index=dx_aem_engineering', @@ -101,8 +117,10 @@ export function buildLinkCheckerQuery({ 'latest=@m', `aem_program_id="${escapeSplunkString(programId)}"`, `aem_envId="${escapeSplunkString(environmentId)}"`, - '"linkchecker.removed_internal_link"', // FT_SITES-39847 ensures this field exists - '| spath', // Parse JSON structure + 'msg="*linkchecker.removed_internal_link*"', + '| spath', // Parse top-level Skyline event JSON + '| rex field=msg "LinkCheckerTransformer (?<linkchecker_json>\\{.*\\})$"', + '| spath input=linkchecker_json', // Parse embedded LinkChecker payload from msg '| rename linkchecker.removed_internal_link.urlFrom as urlFrom', '| rename linkchecker.removed_internal_link.urlTo as urlTo', '| rename linkchecker.removed_internal_link.validity as validity', @@ -113,6 +131,7 @@ export function buildLinkCheckerQuery({ '| rename linkchecker.removed_internal_link.httpStatus as httpStatus', '| rename linkchecker.removed_internal_link.timestamp as timestamp', '| where isnotnull(urlFrom) AND isnotnull(urlTo)', + ...(scopeFilter ? [scopeFilter] : []), '| dedup urlFrom, urlTo, itemType', '| table urlFrom, urlTo, validity, elementName, attributeName, itemType, anchorText, httpStatus, timestamp', `| head ${maxResults}`, diff --git a/src/internal-links/opportunity-suggestions.js b/src/internal-links/opportunity-suggestions.js index d2ba4d69e7..7fc34fb3b8 100644 --- a/src/internal-links/opportunity-suggestions.js +++ b/src/internal-links/opportunity-suggestions.js @@ -10,7 +10,9 @@ * governing permissions and limitations under the License. 
*/ -import { createInternalLinksConfigResolver } from './config.js'; +import { + createInternalLinksConfigResolver, +} from './config.js'; import { createInternalLinksStepLogger } from './logging.js'; export function createOpportunityAndSuggestionsStep({ diff --git a/src/internal-links/rum-detection.js b/src/internal-links/rum-detection.js index a9c8ebe57a..170957c372 100644 --- a/src/internal-links/rum-detection.js +++ b/src/internal-links/rum-detection.js @@ -11,6 +11,7 @@ */ import { createInternalLinksStepLogger } from './logging.js'; +import { getInternalLinksFetchConfig } from './base-url.js'; const RUM_VALIDATION_CONCURRENCY = 10; @@ -41,6 +42,7 @@ export function createInternalLinksRumSteps({ auditType, interval, createContextLogger, + createConfigResolver = () => ({ isLinkCheckerEnabled: () => false }), createRUMAPIClient, resolveFinalUrl, isLinkInaccessible, @@ -57,11 +59,14 @@ export function createInternalLinksRumSteps({ auditType, siteId: site.getId(), auditId, - step: 'rum-detection', + step: 'rum-detector-phase', }); const finalUrl = context?.finalUrl || await resolveFinalUrl(site, context); + const siteBaseURL = site.getBaseURL(); + const { overrideBaseURL } = getInternalLinksFetchConfig(site); - log.info('====== RUM Detection Phase ======'); + log.info('====== RUM Detection Phase started ======'); + log.info(`RUM resolver v2: siteBaseURL=${siteBaseURL}, overrideBaseURL=${overrideBaseURL || 'none'}, resolvedRumDomain=${finalUrl}`); log.info(`Site: ${site.getId()}, Domain: ${finalUrl}`); try { @@ -92,7 +97,7 @@ export function createInternalLinksRumSteps({ }; } - const baseURL = site.getBaseURL(); + const baseURL = siteBaseURL; const scopedInternal404Links = internal404Links.filter((link) => ( isWithinAuditScope(link.url_from, baseURL) && isWithinAuditScope(link.url_to, baseURL) @@ -181,7 +186,7 @@ export function createInternalLinksRumSteps({ async function runAuditAndImportTopPagesStep(context) { const { - site, log: baseLog, finalUrl, audit, + 
site, log: baseLog, finalUrl, audit, env, } = context; /* c8 ignore next - defensive logger context when audit is absent or incomplete */ const auditId = audit && typeof audit.getId === 'function' ? audit.getId() : undefined; @@ -205,8 +210,28 @@ export function createInternalLinksRumSteps({ const { success } = internalLinksAuditRunnerResult.auditResult; if (!success) { - log.error('RUM detection audit failed'); - throw new Error('Audit failed, skip scraping and suggestion generation'); + const { error: rumError } = internalLinksAuditRunnerResult.auditResult; + const config = createConfigResolver(site, env); + if (config.isLinkCheckerEnabled()) { + log.warn(`RUM detection audit failed, continuing with LinkChecker enabled: ${rumError}`); + return { + auditResult: { + ...internalLinksAuditRunnerResult.auditResult, + brokenInternalLinks: [], + success: true, + auditContext: { + interval, + rumError, + }, + }, + fullAuditRef: finalUrl, + type: 'top-pages', + siteId: site.getId(), + }; + } + + log.error(`RUM detection audit failed: ${rumError}`); + throw new Error(rumError); } log.info(`RUM detection complete. Found ${internalLinksAuditRunnerResult.auditResult.brokenInternalLinks?.length || 0} broken links`); diff --git a/src/internal-links/scrape-submission.js b/src/internal-links/scrape-submission.js index 7880561822..a63560eacd 100644 --- a/src/internal-links/scrape-submission.js +++ b/src/internal-links/scrape-submission.js @@ -10,7 +10,9 @@ * governing permissions and limitations under the License. 
*/ -import { createInternalLinksConfigResolver } from './config.js'; +import { + createInternalLinksConfigResolver, +} from './config.js'; import { createInternalLinksStepLogger } from './logging.js'; export function createSubmitForScraping({ @@ -79,6 +81,7 @@ export function createSubmitForScraping({ log.warn('No URLs available for scraping'); log.info('=========================================='); return { + status: 'skipped', auditResult: audit.getAuditResult(), fullAuditRef: audit.getFullAuditRef(), urls: [], diff --git a/test/audits/internal-links/config.test.js b/test/audits/internal-links/config.test.js index 00d9abb06c..8947b60169 100644 --- a/test/audits/internal-links/config.test.js +++ b/test/audits/internal-links/config.test.js @@ -16,7 +16,9 @@ import { expect } from 'chai'; import { InternalLinksConfigResolver, createInternalLinksConfigResolver, + resolveInternalLinksBaseURL, } from '../../../src/internal-links/config.js'; +import { resolveInternalLinksRumDomain } from '../../../src/internal-links/base-url.js'; function createSite(config = {}, deliveryConfig = {}) { return { @@ -220,6 +222,130 @@ describe('internal-links config resolver', () => { expect(resolver.getLinkCheckerEnvironmentId()).to.equal('env-456'); }); + it('supports the camelCase site-config flag for enabling LinkChecker', () => { + const resolver = new InternalLinksConfigResolver(createSite({ + isLinkCheckerEnabled: true, + }), {}); + + expect(resolver.isLinkCheckerEnabled()).to.equal(true); + }); + + it('supports the legacy site-config flag for enabling LinkChecker', () => { + const resolver = new InternalLinksConfigResolver(createSite({ + isLinkcheckerEnabled: true, + }), {}); + + expect(resolver.isLinkCheckerEnabled()).to.equal(true); + }); + + it('exposes LinkChecker flag debug info for troubleshooting', () => { + const resolver = new InternalLinksConfigResolver(createSite({ + isLinkCheckerEnabled: true, + isLinkcheckerEnabled: false, + }), {}); + + 
expect(resolver.getLinkCheckerFlagDebugInfo()).to.deep.equal({ + enabled: true, + source: 'isLinkCheckerEnabled', + camelCaseRaw: true, + legacyRaw: false, + camelCaseValue: true, + legacyValue: false, + }); + }); + + it('prefers the legacy LinkChecker flag in debug info when camelCase is absent', () => { + const resolver = new InternalLinksConfigResolver(createSite({ + isLinkcheckerEnabled: true, + }), {}); + + expect(resolver.getLinkCheckerFlagDebugInfo()).to.deep.equal({ + enabled: true, + source: 'isLinkcheckerEnabled', + camelCaseRaw: undefined, + legacyRaw: true, + camelCaseValue: undefined, + legacyValue: true, + }); + }); + + it('prefers fetchConfig.overrideBaseURL for internal-links scope when valid', () => { + const site = { + getBaseURL: () => 'https://example.com/en.html', + getConfig: () => ({ + getFetchConfig: () => ({ + overrideBaseURL: 'https://example.com/en', + }), + }), + }; + + expect(resolveInternalLinksBaseURL(site)).to.equal('https://example.com/en'); + }); + + it('reads overrideBaseURL from nested config.fetchConfig fallback', () => { + const site = { + getBaseURL: () => 'https://example.com/en.html', + getConfig: () => ({ + config: { + fetchConfig: { + overrideBaseURL: 'https://example.com/en', + }, + }, + }), + }; + + expect(resolveInternalLinksBaseURL(site)).to.equal('https://example.com/en'); + }); + + it('falls back to site baseURL when overrideBaseURL is missing or invalid', () => { + const invalidOverrideSite = { + getBaseURL: () => 'https://example.com/en.html', + getConfig: () => ({ + getFetchConfig: () => ({ + overrideBaseURL: 'not-a-valid-url', + }), + }), + }; + const missingOverrideSite = { + getBaseURL: () => 'https://example.com/en.html', + getConfig: () => ({ + getFetchConfig: () => ({}), + }), + }; + + expect(resolveInternalLinksBaseURL(invalidOverrideSite)).to.equal('https://example.com/en.html'); + expect(resolveInternalLinksBaseURL(missingOverrideSite)).to.equal('https://example.com/en.html'); + }); + + it('resolves rum 
domain without using overrideBaseURL', async () => { + const site = { + getBaseURL: () => 'https://publish-p165653-e1774234.adobeaemcloud.com/wknd-abhigarg-0001/us/en.html', + getConfig: () => ({ + getFetchConfig: () => ({ + overrideBaseURL: 'https://publish-p165653-e1774234.adobeaemcloud.com/wknd-abhigarg-0001/us/en', + }), + }), + }; + const context = { + log: { + debug: () => {}, + error: () => {}, + }, + rumApiClient: { + retrieveDomainkey: async (domain) => { + if (domain === 'www.publish-p165653-e1774234.adobeaemcloud.com') { + return { domain }; + } + throw new Error('not found'); + }, + }, + }; + + const result = await resolveInternalLinksRumDomain(site, context); + + expect(result).to.equal('publish-p165653-e1774234.adobeaemcloud.com'); + }); + it('prefers deliveryConfig program and environment IDs over handler config', () => { const resolver = new InternalLinksConfigResolver(createSite({ aemProgramId: 'handler-program', diff --git a/test/audits/internal-links/crawl-detection.test.js b/test/audits/internal-links/crawl-detection.test.js index f6ecb8c7a7..ef12172ed2 100644 --- a/test/audits/internal-links/crawl-detection.test.js +++ b/test/audits/internal-links/crawl-detection.test.js @@ -1589,6 +1589,49 @@ describe('Crawl Detection Module', () => { expect(result.results.some((entry) => entry.urlTo === 'https://example.com/content/dam/apcolourcatalogue/asset/hero.webp' && entry.itemType === 'image')).to.equal(true); }); + it('should classify SVG assets referenced from CSS url() as svg', async () => { + const scrapeResultPaths = new Map([ + ['https://example.com/page1', 'scrapes/page1.json'], + ]); + + const htmlWithCssSvgAsset = ` + + + + + + + `; + + getObjectFromKeyStub.resolves({ + scrapeResult: { rawBody: htmlWithCssSvgAsset }, + finalUrl: 'https://example.com/page1', + }); + + isLinkInaccessibleStub + .withArgs('https://example.com/webassets/icons/search.svg') + .resolves(createValidationResponse(true, 404, '4xx', { contentType: 'image/svg+xml' })); + + 
const result = await detectBrokenLinksFromCrawlBatch({ + scrapeResultPaths, + batchStartIndex: 0, + batchSize: 1, + initialBrokenUrls: [], + initialWorkingUrls: [], + }, mockContext); + + expect(result.results).to.have.lengthOf(1); + expect(result.results[0]).to.deep.include({ + urlTo: 'https://example.com/webassets/icons/search.svg', + itemType: 'svg', + anchorText: '[style url()]', + }); + }); + it('should ignore CSS URLs with invalid escaped code points', async () => { const scrapeResultPaths = new Map([ ['https://example.com/page1', 'scrapes/page1.json'], diff --git a/test/audits/internal-links/finalization.test.js b/test/audits/internal-links/finalization.test.js index 8be2a4a0f3..bdfa2cf31d 100644 --- a/test/audits/internal-links/finalization.test.js +++ b/test/audits/internal-links/finalization.test.js @@ -195,6 +195,80 @@ describe('internal-links finalization', () => { expect(updateAuditResult.firstCall.args[6]).to.have.property('internalLinksWorkflowCompletedAt'); }); + it('uses overrideBaseURL when normalizing and scoping linkchecker results', async () => { + const updateAuditResult = sinon.stub().resolves({}); + + const finalize = createFinalizeCrawlDetection({ + auditType: 'broken-internal-links', + createContextLogger: (log) => log, + createConfigResolver: () => ({ + getIncludedStatusBuckets: () => ['not_found_404', 'masked_by_linkchecker'], + getIncludedItemTypes: () => ['link'], + }), + calculatePriority: (links) => links, + mergeAndDeduplicate: (firstLinks, secondLinks) => [...secondLinks, ...firstLinks], + loadFinalResults: sinon.stub().resolves([]), + cleanupBatchState: sinon.stub().resolves(), + getTimeoutStatus: sinon.stub().returns({ + percentUsed: 1, + safeTimeRemaining: 100000, + isApproachingTimeout: false, + }), + updateAuditResult, + opportunityAndSuggestionsStep: sinon.stub().resolves({ status: 'complete' }), + filterByStatusIfNeeded, + filterByItemTypes, + }); + + await finalize({ + log: { + info: sinon.stub(), + warn: sinon.stub(), + 
error: sinon.stub(), + debug: sinon.stub(), + }, + site: { + getId: () => 'site-1', + getBaseURL: () => 'https://example.com/en.html', + getConfig: () => ({ + getFetchConfig: () => ({ + overrideBaseURL: 'https://example.com/en', + }), + }), + }, + env: {}, + audit: { + getId: () => 'audit-1', + getAuditResult: () => ({ + brokenInternalLinks: [], + }), + }, + dataAccess: {}, + linkCheckerResults: [ + { + urlFrom: '/content/ASO/en/adventures.html', + urlTo: '/en/workshop', + itemType: 'link', + httpStatus: 404, + }, + ], + }, { skipCrawlDetection: false }); + + expect(updateAuditResult.firstCall.args[2]).to.deep.equal([ + { + urlFrom: 'https://example.com/en/adventures.html', + urlTo: 'https://example.com/en/workshop', + anchorText: '', + itemType: 'link', + detectionSource: 'linkchecker', + trafficDomain: 1, + httpStatus: 404, + statusBucket: 'not_found_404', + validity: 'UNKNOWN', + }, + ]); + }); + it('preserves batch state when finalization fails before completion', async () => { const cleanupBatchState = sinon.stub().resolves(); const updateAuditResult = sinon.stub().resolves({}); @@ -444,6 +518,135 @@ describe('internal-links finalization', () => { ]); }); + it('normalizes relative LinkChecker URLs onto the audited host before scope checks', async () => { + const updateAuditResult = sinon.stub().resolves({}); + const log = { + info: sinon.stub(), + warn: sinon.stub(), + error: sinon.stub(), + debug: sinon.stub(), + }; + + const finalize = createFinalizeCrawlDetection({ + auditType: 'broken-internal-links', + createContextLogger: (baseLog) => baseLog, + createConfigResolver: () => ({ + getIncludedStatusBuckets: () => ['not_found_404'], + getIncludedItemTypes: () => ['link'], + }), + calculatePriority: (links) => links.map((link) => ({ ...link, priority: 'high' })), + mergeAndDeduplicate: (firstLinks, secondLinks) => [...secondLinks, ...firstLinks], + loadFinalResults: sinon.stub().resolves([]), + cleanupBatchState: sinon.stub().resolves(), + getTimeoutStatus: 
sinon.stub().returns({ + percentUsed: 1, + safeTimeRemaining: 100000, + isApproachingTimeout: false, + }), + updateAuditResult, + opportunityAndSuggestionsStep: sinon.stub().resolves({ status: 'complete' }), + filterByStatusIfNeeded, + filterByItemTypes, + }); + + await finalize({ + log, + site: { + getId: () => 'site-1', + getBaseURL: () => 'https://publish.example.com/wknd-abhigarg-0001/us/en', + }, + env: {}, + audit: { + getId: () => 'audit-1', + getAuditResult: () => ({ brokenInternalLinks: [] }), + }, + dataAccess: {}, + linkCheckerResults: [ + { + urlFrom: '/content/ASO/wknd-abhigarg-0001/us/en/adventures.html', + urlTo: '/wknd-abhigarg-0001/us/en/workshop', + itemType: 'link', + validity: 'INVALID', + httpStatus: 404, + }, + ], + }, { skipCrawlDetection: false }); + + expect(updateAuditResult.firstCall.args[2]).to.deep.equal([ + { + urlFrom: 'https://publish.example.com/wknd-abhigarg-0001/us/en/adventures.html', + urlTo: 'https://publish.example.com/wknd-abhigarg-0001/us/en/workshop', + anchorText: '', + itemType: 'link', + detectionSource: 'linkchecker', + trafficDomain: 1, + httpStatus: 404, + statusBucket: 'not_found_404', + validity: 'INVALID', + priority: 'high', + }, + ]); + expect(log.info).to.have.been.calledWith( + sinon.match('LinkChecker normalization v2 active: normalized=1, repositoryPaths=1'), + ); + }); + + it('falls back safely when LinkChecker URL normalization cannot parse the base URL', async () => { + const updateAuditResult = sinon.stub().resolves({}); + + const finalize = createFinalizeCrawlDetection({ + auditType: 'broken-internal-links', + createContextLogger: (log) => log, + createConfigResolver: () => ({ + getIncludedStatusBuckets: () => ['not_found_404'], + getIncludedItemTypes: () => ['link'], + }), + calculatePriority: (links) => links.map((link) => ({ ...link, priority: 'high' })), + mergeAndDeduplicate: (firstLinks, secondLinks) => [...secondLinks, ...firstLinks], + loadFinalResults: sinon.stub().resolves([]), + 
cleanupBatchState: sinon.stub().resolves(), + getTimeoutStatus: sinon.stub().returns({ + percentUsed: 1, + safeTimeRemaining: 100000, + isApproachingTimeout: false, + }), + updateAuditResult, + opportunityAndSuggestionsStep: sinon.stub().resolves({ status: 'complete' }), + filterByStatusIfNeeded, + filterByItemTypes, + }); + + await finalize({ + log: { + info: sinon.stub(), + warn: sinon.stub(), + error: sinon.stub(), + debug: sinon.stub(), + }, + site: { + getId: () => 'site-1', + getBaseURL: () => 'https://[::1', + }, + env: {}, + audit: { + getId: () => 'audit-1', + getAuditResult: () => ({ brokenInternalLinks: [] }), + }, + dataAccess: {}, + linkCheckerResults: [ + { + urlFrom: '/content/ASO/wknd-abhigarg-0001/us/en/adventures.html', + urlTo: '/wknd-abhigarg-0001/us/en/workshop', + itemType: 'link', + validity: 'INVALID', + httpStatus: 404, + }, + ], + }, { skipCrawlDetection: false }); + + expect(updateAuditResult.firstCall.args[2]).to.deep.equal([]); + }); + it('drops LinkChecker rows without broken validity or broken status', async () => { const updateAuditResult = sinon.stub().resolves({}); @@ -683,7 +886,7 @@ describe('internal-links finalization', () => { }, dataAccess: {}, linkCheckerResults: [ - { urlFrom: ':::invalid', urlTo: 'https://example.com/missing', itemType: 'link', httpStatus: 404 }, + { urlFrom: 'https://[::1', urlTo: 'https://example.com/missing', itemType: 'link', httpStatus: 404 }, ], }, { skipCrawlDetection: false }); @@ -823,6 +1026,58 @@ describe('internal-links finalization', () => { expect(linkCheckerLinks[0].statusBucket).to.equal('not_found_404'); }); + it('should drop LinkChecker SVG assets without an explicit broken HTTP status', async () => { + const updateAuditResult = sinon.stub().resolves({}); + const finalize = buildFinalize({ updateAuditResult }); + + await finalize(buildContext([ + { + urlFrom: 'https://example.com/page', + urlTo: 'https://example.com/webassets/icons/search.svg', + itemType: 'svg', + validity: 'INVALID', + 
}, + ]), { skipCrawlDetection: false }); + + expect(mockIsLinkInaccessible).to.not.have.been.called; + const reportedLinks = updateAuditResult.firstCall.args[2]; + const linkCheckerLinks = reportedLinks.filter((l) => l.detectionSource === 'linkchecker'); + expect(linkCheckerLinks).to.have.lengthOf(0); + }); + + it('should keep LinkChecker SVG assets when they have an explicit broken HTTP status', async () => { + mockIsLinkInaccessible.resolves({ + isBroken: false, + inconclusive: true, + httpStatus: null, + statusBucket: null, + }); + + const updateAuditResult = sinon.stub().resolves({}); + const finalize = buildFinalize({ + updateAuditResult, + createConfigResolver: () => ({ + getIncludedStatusBuckets: () => ['not_found_404', 'masked_by_linkchecker'], + getIncludedItemTypes: () => ['link', 'svg'], + }), + }); + + await finalize(buildContext([ + { + urlFrom: 'https://example.com/page', + urlTo: 'https://example.com/webassets/icons/search.svg', + itemType: 'svg', + httpStatus: 404, + validity: 'INVALID', + }, + ]), { skipCrawlDetection: false }); + + const reportedLinks = updateAuditResult.firstCall.args[2]; + const linkCheckerLinks = reportedLinks.filter((l) => l.detectionSource === 'linkchecker'); + expect(linkCheckerLinks).to.have.lengthOf(1); + expect(linkCheckerLinks[0].statusBucket).to.equal('not_found_404'); + }); + it('should skip re-validation when insufficient time remains', async () => { const updateAuditResult = sinon.stub().resolves({}); const getTimeoutStatus = sinon.stub().returns({ diff --git a/test/audits/internal-links/handler.test.js b/test/audits/internal-links/handler.test.js index 2d3acdf767..3b6cda11d4 100755 --- a/test/audits/internal-links/handler.test.js +++ b/test/audits/internal-links/handler.test.js @@ -233,8 +233,8 @@ describe('Broken internal links audit', () => { const rumQueryStub = sinon.stub().resolves([]); const { internalLinksAuditRunner: mockedRunner } = await esmock('../../../src/internal-links/handler.js', { - 
'../../../src/common/base-audit.js': { - wwwUrlResolver: resolverStub, + '../../../src/internal-links/base-url.js': { + resolveInternalLinksRumDomain: resolverStub, }, '@adobe/spacecat-shared-rum-api-client': { default: { @@ -643,6 +643,46 @@ describe('Broken internal links audit', () => { expect(result.urls).to.deep.include({ url: 'https://example.com/included2' }); }).timeout(10000); + it('submitForScraping should continue using site baseURL for audit scope filtering', async () => { + const mockSiteTopPage = { + allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([ + { getUrl: () => 'https://example.com/en/adventures.html' }, + ]), + }; + + const mockSite = { + ...site, + getBaseURL: () => 'https://example.com/en.html', + getConfig: () => ({ + getIncludedURLs: () => [], + getFetchConfig: () => ({ + overrideBaseURL: 'https://example.com/en', + }), + }), + }; + + const mockAudit = { + getId: () => 'audit-123', + getAuditResult: () => ({ success: true }), + getFullAuditRef: () => 'www.example.com', + }; + + const testContext = { + ...context, + site: mockSite, + audit: mockAudit, + dataAccess: { + ...context.dataAccess, + SiteTopPage: mockSiteTopPage, + }, + }; + + const result = await submitForScraping(testContext); + + expect(result.status).to.equal('skipped'); + expect(result.urls).to.deep.equal([]); + }).timeout(10000); + it('submitForScraping should filter out unscrape-able files', async () => { const mockSiteTopPage = { @@ -723,6 +763,7 @@ describe('Broken internal links audit', () => { const result = await submitForScraping(testContext); // All URLs should be filtered out (PDFs are unscrape-able) + expect(result.status).to.equal('skipped'); expect(result.urls).to.be.an('array'); expect(result.urls.length).to.equal(0); }).timeout(10000); @@ -909,7 +950,7 @@ describe('Broken internal links audit', () => { expect(result.urls.length).to.be.greaterThan(0); }).timeout(10000); - it('runAuditAndImportTopPagesStep should throw error when RUM audit returns 
success=false', async () => { + it('runAuditAndImportTopPagesStep should continue when RUM audit returns success=false and LinkChecker is enabled', async () => { // Mock RUMAPIClient to throw an error, which causes internalLinksAuditRunner to catch it // and return success: false const mockRumClient = { @@ -923,13 +964,33 @@ describe('Broken internal links audit', () => { const { runAuditAndImportTopPagesStep: runAuditAndImportTopPagesStepMocked } = await esmock('../../../src/internal-links/handler.js', { '@adobe/spacecat-shared-rum-api-client': { default: RUMAPIClientMock }, }); - const testContext = { ...context }; + const testContext = { + ...context, + env: {}, + site: { + ...context.site, + getConfig: () => ({ + getHandlers: () => ({ + 'broken-internal-links': { + config: { + isLinkCheckerEnabled: true, + }, + }, + }), + }), + }, + }; delete testContext.rumApiClient; - // Should throw because internalLinksAuditRunner caught an error and returned success: false - await expect(runAuditAndImportTopPagesStepMocked(testContext)) - .to.be.rejectedWith('Audit failed, skip scraping and suggestion generation'); + const result = await runAuditAndImportTopPagesStepMocked(testContext); + expect(result.type).to.equal('top-pages'); + expect(result.auditResult.success).to.equal(true); + expect(result.auditResult.brokenInternalLinks).to.deep.equal([]); + expect(result.auditResult.auditContext).to.deep.equal({ + interval: 30, + rumError: 'audit failed with error: RUM API connection failed', + }); expect(mockRumClient.query).to.have.been.called; }).timeout(10000); diff --git a/test/audits/internal-links/linkchecker-splunk.test.js b/test/audits/internal-links/linkchecker-splunk.test.js index 21f8206b58..9b18918bb4 100644 --- a/test/audits/internal-links/linkchecker-splunk.test.js +++ b/test/audits/internal-links/linkchecker-splunk.test.js @@ -79,8 +79,10 @@ describe('linkchecker-splunk', () => { expect(query).to.include('latest=@m'); 
expect(query).to.include('aem_program_id="program123"'); expect(query).to.include('aem_envId="env456"'); - expect(query).to.include('"linkchecker.removed_internal_link"'); + expect(query).to.include('msg="*linkchecker.removed_internal_link*"'); expect(query).to.include('| spath'); + expect(query).to.include('| rex field=msg "LinkCheckerTransformer (?\\{.*\\})$"'); + expect(query).to.include('| spath input=linkchecker_json'); expect(query).to.include('| rename linkchecker.removed_internal_link.urlFrom as urlFrom'); expect(query).to.include('| where isnotnull(urlFrom) AND isnotnull(urlTo)'); expect(query).to.include('| head 10000'); @@ -115,6 +117,36 @@ describe('linkchecker-splunk', () => { expect(query).to.include('aem_program_id="program\\"123"'); expect(query).to.include('aem_envId="env\\\\456"'); }); + + it('adds a scope filter when scopeBaseURL has a subpath', () => { + const query = buildLinkCheckerQuery({ + programId: 'program123', + environmentId: 'env456', + scopeBaseURL: 'https://publish.example.com/wknd-abhigarg-0001/us/en', + }); + + expect(query).to.include('| where like(urlFrom, "/content/ASO/wknd-abhigarg-0001/us/en%") OR like(urlFrom, "/wknd-abhigarg-0001/us/en%")'); + }); + + it('does not add a scope filter when scopeBaseURL has no subpath', () => { + const query = buildLinkCheckerQuery({ + programId: 'program123', + environmentId: 'env456', + scopeBaseURL: 'https://publish.example.com', + }); + + expect(query).to.not.include('like(urlFrom'); + }); + + it('does not add a scope filter when scopeBaseURL is invalid', () => { + const query = buildLinkCheckerQuery({ + programId: 'program123', + environmentId: 'env456', + scopeBaseURL: 'not a valid url', + }); + + expect(query).to.not.include('like(urlFrom'); + }); }); describe('submitSplunkJob', () => { @@ -146,17 +178,32 @@ describe('linkchecker-splunk', () => { ); }); - it('throws when configured Splunk namespace is blank after normalization', async () => { + it('falls back to admin/search when 
configured Splunk namespace is blank after normalization', async () => { mockClient.env = { SPLUNK_SEARCH_NAMESPACE: '///' }; - await expect(submitSplunkJob(mockClient, 'search query', mockLog)) - .to.be.rejectedWith('SPLUNK_SEARCH_NAMESPACE must be configured'); + mockClient.fetchAPI.resolves({ + status: 201, + json: sandbox.stub().resolves({ sid: 'job-id-123' }), + }); + + await submitSplunkJob(mockClient, 'search query', mockLog); + + expect(mockClient.fetchAPI.firstCall.args[0]).to.equal( + 'https://splunk.example.com:8089/servicesNS/admin/search/search/search/jobs', + ); }); - it('throws when client env is missing', async () => { + it('falls back to admin/search when client env is missing', async () => { mockClient.env = undefined; + mockClient.fetchAPI.resolves({ + status: 201, + json: sandbox.stub().resolves({ sid: 'job-id-123' }), + }); - await expect(submitSplunkJob(mockClient, 'search query', mockLog)) - .to.be.rejectedWith('SPLUNK_SEARCH_NAMESPACE must be configured'); + await submitSplunkJob(mockClient, 'search query', mockLog); + + expect(mockClient.fetchAPI.firstCall.args[0]).to.equal( + 'https://splunk.example.com:8089/servicesNS/admin/search/search/search/jobs', + ); }); it('throws error if submission fails', async () => { diff --git a/test/audits/internal-links/rum-detection.test.js b/test/audits/internal-links/rum-detection.test.js index a8fd8dc39b..f101fef52f 100644 --- a/test/audits/internal-links/rum-detection.test.js +++ b/test/audits/internal-links/rum-detection.test.js @@ -43,6 +43,9 @@ describe('internal-links rum-detection', () => { auditType: 'broken-internal-links', interval: 30, createContextLogger: (log) => log, + createConfigResolver: () => ({ + isLinkCheckerEnabled: () => false, + }), createRUMAPIClient: sinon.stub(), resolveFinalUrl: sinon.stub().resolves('https://example.com'), isLinkInaccessible: sinon.stub(), @@ -162,6 +165,49 @@ describe('internal-links rum-detection', () => { ); }); + it('uses site baseURL when filtering rum 
links by audit scope even if overrideBaseURL exists', async () => { + const isWithinAuditScope = sinon.stub().returns(true); + const rumApiClient = { + query: sinon.stub().resolves([ + { + url_from: 'https://example.com/en/source', + url_to: 'https://example.com/en/missing', + traffic_domain: 42, + }, + ]), + }; + const isLinkInaccessible = sinon.stub().resolves({ + isBroken: true, + inconclusive: false, + httpStatus: 404, + statusBucket: 'not_found_404', + contentType: 'text/html', + }); + const log = createLog(); + const { runAuditAndImportTopPagesStep } = createSteps({ isWithinAuditScope, isLinkInaccessible }); + + await runAuditAndImportTopPagesStep({ + log, + site: { + getId: () => 'site-1', + getBaseURL: () => 'https://example.com/en.html', + getConfig: () => ({ + getFetchConfig: () => ({ + overrideBaseURL: 'https://example.com/en', + }), + }), + }, + rumApiClient, + finalUrl: 'https://example.com', + audit: { + getId: () => 'audit-1', + }, + }); + + expect(isWithinAuditScope.firstCall.args[1]).to.equal('https://example.com/en.html'); + expect(isWithinAuditScope.secondCall.args[1]).to.equal('https://example.com/en.html'); + }); + it('strips hashes from persisted rum source and target URLs', async () => { const rumApiClient = { query: sinon.stub().resolves([ @@ -341,9 +387,72 @@ describe('internal-links rum-detection', () => { site: createSite(), rumApiClient, finalUrl: 'https://example.com', - })).to.be.rejectedWith('Audit failed, skip scraping and suggestion generation'); + })).to.be.rejectedWith('audit failed with error: upstream down'); + + expect(log.error).to.have.been.calledWith('RUM detection audit failed: audit failed with error: upstream down'); + }); - expect(log.error).to.have.been.calledWith('RUM detection audit failed'); + it('uses the default LinkChecker config resolver when none is provided', async () => { + const rumApiClient = { + query: sinon.stub().rejects(new Error('upstream down')), + }; + const log = createLog(); + const { 
runAuditAndImportTopPagesStep } = createInternalLinksRumSteps({ + auditType: 'broken-internal-links', + interval: 30, + createContextLogger: (baseLog) => baseLog, + createRUMAPIClient: sinon.stub(), + resolveFinalUrl: sinon.stub().resolves('https://example.com'), + isLinkInaccessible: sinon.stub(), + calculatePriority: (links) => links, + isWithinAuditScope: sinon.stub().returns(true), + }); + + await expect(runAuditAndImportTopPagesStep({ + log, + site: createSite(), + rumApiClient, + finalUrl: 'https://example.com', + })).to.be.rejectedWith('audit failed with error: upstream down'); + }); + + it('continues from the top-pages step when rum detection fails and LinkChecker is enabled', async () => { + const rumApiClient = { + query: sinon.stub().rejects(new Error('upstream down')), + }; + const log = createLog(); + const { runAuditAndImportTopPagesStep } = createSteps({ + createConfigResolver: () => ({ + isLinkCheckerEnabled: () => true, + }), + }); + + const result = await runAuditAndImportTopPagesStep({ + log, + site: createSite(), + rumApiClient, + finalUrl: 'https://example.com', + env: {}, + }); + + expect(result).to.deep.equal({ + auditResult: { + finalUrl: 'https://example.com', + error: 'audit failed with error: upstream down', + success: true, + brokenInternalLinks: [], + auditContext: { + interval: 30, + rumError: 'audit failed with error: upstream down', + }, + }, + fullAuditRef: 'https://example.com', + type: 'top-pages', + siteId: 'site-1', + }); + expect(log.warn).to.have.been.calledWith( + 'RUM detection audit failed, continuing with LinkChecker enabled: audit failed with error: upstream down', + ); }); it('logs zero broken links when prioritized rum links are undefined', async () => { diff --git a/test/common/step-audit.test.js b/test/common/step-audit.test.js index e6e22aa986..7ffe8bcf73 100644 --- a/test/common/step-audit.test.js +++ b/test/common/step-audit.test.js @@ -436,6 +436,51 @@ describe('Step-based Audit Tests', () => { 
expect(context.log.info).to.have.been.calledWith('Created scrapeJob with id: scrape-job-123'); }); + it('skips SCRAPE_CLIENT job creation when the step returns no URLs', async () => { + nock('https://space.cat') + .get('/') + .reply(200, 'Success'); + + const mockScrapeClient = { + createScrapeJob: sandbox.stub().resolves({ id: 'scrape-job-123' }), + }; + sandbox.stub(ScrapeClient, 'createFrom').returns(mockScrapeClient); + + const scrapeAudit = new AuditBuilder() + .addStep('scrape-step', async () => ({ + status: 'skipped', + urls: [], + siteId: '42322ae6-b8b1-4a61-9c88-25205fa65b07', + }), AUDIT_STEP_DESTINATIONS.SCRAPE_CLIENT) + .addStep('final', async () => ({ status: 'complete' })) + .build(); + + const existingAudit = { + getId: () => '109b71f7-2005-454e-8191-8e92e05daac2', + getAuditType: () => 'content-audit', + getFullAuditRef: () => 's3://test/123', + }; + context.dataAccess.Audit.findById.resolves(existingAudit); + + const scrapeMessage = { + type: 'content-audit', + siteId: '42322ae6-b8b1-4a61-9c88-25205fa65b07', + auditContext: { + next: 'scrape-step', + auditId: '109b71f7-2005-454e-8191-8e92e05daac2', + }, + }; + + const result = await scrapeAudit.run(scrapeMessage, context); + + expect(result.status).to.equal(200); + expect(ScrapeClient.createFrom).to.have.been.calledOnce; + expect(mockScrapeClient.createScrapeJob).to.not.have.been.called; + expect(context.log.info).to.have.been.calledWith( + sinon.match('Skipping scrapeJob creation for step scrape-step: no URLs to scrape'), + ); + }); + it('loads scrape result paths when scrapeJobId is provided', async () => { // Mock HTTP request for URL resolution nock('https://space.cat')