Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ $ npm install @adobe/spacecat-audit-worker

## Usage

See the [API documentation](docs/API.md).
See the detailed [API documentation](docs/API.md).

## Development

Expand Down
4 changes: 4 additions & 0 deletions src/common/step-audit.js
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ export class StepAudit extends BaseAudit {
if (step.destination === AUDIT_STEP_DESTINATIONS.SCRAPE_CLIENT) {
const scrapeClient = ScrapeClient.createFrom(context);
const payload = destination.formatPayload(stepResult, auditContext, context);
if (stepResult?.status === 'skipped' || !Array.isArray(payload?.urls) || payload.urls.length === 0) {
log.info(`Skipping scrapeJob creation for step ${step.name}: no URLs to scrape`);
return stepResult;
}
log.debug(`Creating new scrapeJob with the ScrapeClient. Payload: ${JSON.stringify(payload)}`);
const scrapeJob = await scrapeClient.createScrapeJob(payload);
log.info(`Created scrapeJob with id: ${scrapeJob.id}`);
Expand Down
40 changes: 40 additions & 0 deletions src/internal-links/base-url.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright 2026 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import { isValidUrl } from '@adobe/spacecat-shared-utils';
import { wwwUrlResolver } from '../common/base-audit.js';

/**
 * Looks up the fetch configuration attached to a site.
 *
 * The lookup tries, in order: the `getFetchConfig()` accessor on the site
 * config, a plain `fetchConfig` property, and a nested `config.fetchConfig`
 * property. The first truthy value wins.
 *
 * @param {object} [site] - Site-like object exposing an optional `getConfig()`.
 * @returns {object} The first truthy fetch config found, or a new empty object.
 */
export function getInternalLinksFetchConfig(site) {
  const cfg = site?.getConfig?.();

  // Each reader is evaluated lazily so later sources are only touched
  // when the earlier ones yield nothing usable.
  const readers = [
    () => cfg?.getFetchConfig?.(),
    () => cfg?.fetchConfig,
    () => cfg?.config?.fetchConfig,
  ];

  for (const read of readers) {
    const found = read();
    if (found) {
      return found;
    }
  }

  return {};
}

/**
 * Picks the base URL for the site.
 *
 * A valid `overrideBaseURL` in the site's fetch config takes precedence.
 * Otherwise the site's own `getBaseURL()` result is used.
 *
 * @param {object} [site] - Site-like object with optional `getConfig()` / `getBaseURL()`.
 * @returns {string} The override URL, the site base URL, or `''` when neither is available.
 */
export function resolveInternalLinksBaseURL(site) {
  const fetchConfig = getInternalLinksFetchConfig(site);
  const candidate = fetchConfig?.overrideBaseURL;

  return isValidUrl(candidate)
    ? candidate
    : (site?.getBaseURL?.() || '');
}

/**
 * Resolves a domain for the site by delegating to `wwwUrlResolver`.
 *
 * The resolver is handed a minimal stand-in for the site: its base URL is
 * forwarded from the real site, but its fetch config is always empty, so
 * nothing from the real site's fetch config (e.g. `overrideBaseURL`) is
 * visible to the resolver.
 * NOTE(review): presumably this keeps the RUM lookup on the site's own
 * domain even when an override base URL is configured — confirm.
 *
 * @param {object} [site] - Site-like object with an optional `getBaseURL()`.
 * @param {object} context - Passed through unchanged to `wwwUrlResolver`.
 * @returns {Promise<*>} Whatever `wwwUrlResolver` resolves to.
 */
export async function resolveInternalLinksRumDomain(site, context) {
  const emptyFetchConfigHolder = {
    getFetchConfig: () => ({}),
  };
  const siteWithoutFetchConfig = {
    getBaseURL: () => site?.getBaseURL?.(),
    getConfig: () => emptyFetchConfigHolder,
  };

  return wwwUrlResolver(siteWithoutFetchConfig, context);
}
36 changes: 35 additions & 1 deletion src/internal-links/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ import { hasText } from '@adobe/spacecat-shared-utils';
import { PAGES_PER_BATCH } from './crawl-detection.js';
import { MAX_BROKEN_LINKS_REPORTED } from './result-utils.js';

export {
getInternalLinksFetchConfig,
resolveInternalLinksBaseURL,
} from './base-url.js';

const MAX_URLS_TO_PROCESS = 100;
const DEFAULT_LINKCHECKER_MIN_TIME_NEEDED_MS = 5 * 60 * 1000;
const MAX_BROKEN_LINKS = 100;
Expand Down Expand Up @@ -131,7 +136,36 @@ export class InternalLinksConfigResolver {
}

isLinkCheckerEnabled() {
return this.handlerConfig.isLinkcheckerEnabled ?? false;
const camelCaseValue = getBooleanConfig(this.handlerConfig.isLinkCheckerEnabled, undefined);
const legacyValue = getBooleanConfig(this.handlerConfig.isLinkcheckerEnabled, undefined);

if (typeof camelCaseValue === 'boolean') {
return camelCaseValue;
}
if (typeof legacyValue === 'boolean') {
return legacyValue;
}
return false;
}

getLinkCheckerFlagDebugInfo() {
const camelCaseValue = getBooleanConfig(this.handlerConfig.isLinkCheckerEnabled, undefined);
const legacyValue = getBooleanConfig(this.handlerConfig.isLinkcheckerEnabled, undefined);
let source = 'default:false';
if (typeof camelCaseValue === 'boolean') {
source = 'isLinkCheckerEnabled';
} else if (typeof legacyValue === 'boolean') {
source = 'isLinkcheckerEnabled';
}

return {
enabled: this.isLinkCheckerEnabled(),
source,
camelCaseRaw: this.handlerConfig.isLinkCheckerEnabled,
legacyRaw: this.handlerConfig.isLinkcheckerEnabled,
camelCaseValue,
legacyValue,
};
}

getLinkCheckerProgramId() {
Expand Down
3 changes: 2 additions & 1 deletion src/internal-links/crawl-detection.js
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ function getSourceItemType(parentTag) {
function getAssetTypeFromUrl(url, pageUrl = 'https://example.com') {
try {
const pathname = new URL(url, pageUrl).pathname.toLowerCase();
if (/\.(svg|png|jpe?g|gif|webp|avif)$/.test(pathname)) return 'image';
if (/\.svg$/.test(pathname)) return 'svg';
if (/\.(png|jpe?g|gif|webp|avif)$/.test(pathname)) return 'image';
/* c8 ignore start - Asset type branches covered by integration tests at extraction level */
if (/\.css$/.test(pathname)) return 'css';
if (/\.js$/.test(pathname)) return 'js';
Expand Down
101 changes: 86 additions & 15 deletions src/internal-links/finalization.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import { createInternalLinksStepLogger } from './logging.js';
import { classifyStatusBucket, isLinkInaccessible } from './helpers.js';
import { isWithinAuditScope } from './subpath-filter.js';
import { isSharedInternalResource } from './scope-utils.js';
import { resolveInternalLinksBaseURL } from './base-url.js';

function isOnAuditHost(url, baseURL) {
try {
Expand All @@ -27,10 +28,52 @@ function isOnAuditHost(url, baseURL) {
}
}

/**
 * Turns a LinkChecker URL or path into an absolute URL on the audit host.
 *
 * - Falsy `url` or `baseURL`: `url` is returned untouched.
 * - Values starting with `http://` or `https://` are returned trimmed.
 * - Anything else is resolved against the protocol and host of `baseURL`,
 *   after dropping a leading `/content/ASO` repository segment if present.
 * - Any failure while parsing or resolving returns the original `url`.
 *
 * @param {*} url - URL or path reported by LinkChecker.
 * @param {string} baseURL - Base URL of the audit scope.
 * @returns {*} The normalized absolute URL string, or `url` unchanged.
 */
function normalizeLinkCheckerUrl(url, baseURL) {
  const hasBothInputs = Boolean(url) && Boolean(baseURL);
  if (!hasBothInputs) {
    return url;
  }

  try {
    const { protocol, host } = new URL(prependSchema(baseURL));
    const candidate = String(url).trim();

    // Already absolute: hand it back as-is (apart from the trim).
    if (/^https?:\/\//.test(candidate)) {
      return candidate;
    }

    // LinkChecker emits repository-style content paths for source pages in AEM CS logs.
    // Resolve them back onto the publish host so they can be scoped like crawl/RUM URLs.
    const repositoryRoot = '/content/ASO';
    const publishPath = candidate.startsWith(`${repositoryRoot}/`)
      ? candidate.slice(repositoryRoot.length)
      : candidate;

    const origin = `${protocol}//${host}`;
    return new URL(publishPath, origin).toString();
  } catch (resolveError) {
    // Best effort: an unparsable value is passed through unchanged.
    return url;
  }
}

/**
 * Canonicalizes a LinkChecker validity value to a trimmed, upper-case string.
 *
 * Missing values (`undefined`, `null`, `''`, other falsy input) and values
 * that are blank after trimming both normalize to `'UNKNOWN'`. Previously a
 * whitespace-only value slipped past the fallback and came back as `''`.
 *
 * @param {*} validity - Raw validity value from a LinkChecker row.
 * @returns {string} Upper-cased validity, or `'UNKNOWN'` when none is given.
 */
function normalizeLinkCheckerValidity(validity) {
  const normalized = String(validity || '').trim().toUpperCase();
  // Apply the fallback after trimming so blank strings are treated as missing.
  return normalized || 'UNKNOWN';
}

// Item types for which requiresExplicitBrokenStatus() answers true.
const LINKCHECKER_VALIDITY_ONLY_ASSET_TYPES = new Set([
  'image', 'svg', 'css', 'js',
  'iframe', 'video', 'audio', 'media',
]);

/**
 * Checks whether an item type belongs to the asset types listed in
 * LINKCHECKER_VALIDITY_ONLY_ASSET_TYPES.
 *
 * @param {string} itemType - Item type of a LinkChecker row (e.g. 'link', 'image').
 * @returns {boolean} True when the type is one of the listed asset types.
 */
function requiresExplicitBrokenStatus(itemType) {
  const isListedAssetType = LINKCHECKER_VALIDITY_ONLY_ASSET_TYPES.has(itemType);
  return isListedAssetType;
}

function disableEventLoopWait(context) {
if (context && 'callbackWaitsForEmptyEventLoop' in context) {
context.callbackWaitsForEmptyEventLoop = false;
Expand Down Expand Up @@ -130,7 +173,7 @@ export function createFinalizeCrawlDetection({
step: 'finalize-crawl-detection',
});
const shouldCleanup = !skipCrawlDetection;
const baseURL = typeof site.getBaseURL === 'function' ? site.getBaseURL() : '';
const baseURL = resolveInternalLinksBaseURL(site);
let finalizationLockAcquired = false;
let finalizationLockEtag = null;

Expand All @@ -149,6 +192,7 @@ export function createFinalizeCrawlDetection({
const timeoutStatus = getTimeoutStatus(lambdaStartTime, context);
log.info('====== Finalize: Merge and Generate Suggestions ======');
log.info(`auditId: ${auditId}`);
log.info(`Using audit scope URL for finalization: ${baseURL}`);
log.info(`Timeout status: ${timeoutStatus.percentUsed.toFixed(1)}% used, ${Math.floor(timeoutStatus.safeTimeRemaining / 1000)}s safe time remaining`);

/* c8 ignore next 4 - Defensive timeout warning path depends on invocation timing */
Expand Down Expand Up @@ -191,41 +235,57 @@ export function createFinalizeCrawlDetection({
log.info(`Crawl detected ${crawlLinks.length} broken links`);

/* c8 ignore start - defensive normalization defaults */
let skippedLinkCheckerRows = 0;
const linkCheckerSkipReasons = {
missingUrl: 0,
noBrokenSignal: 0,
outsideScope: 0,
};
let normalizedLinkCheckerUrls = 0;
let normalizedRepositoryPaths = 0;
const linkCheckerLinks = linkCheckerResults
.map((lc) => {
const normalizedUrlFrom = normalizeLinkCheckerUrl(lc.urlFrom, baseURL);
const normalizedUrlTo = normalizeLinkCheckerUrl(lc.urlTo, baseURL);
if (normalizedUrlFrom !== lc.urlFrom || normalizedUrlTo !== lc.urlTo) {
normalizedLinkCheckerUrls += 1;
}
if (String(lc.urlFrom || '').startsWith('/content/ASO/')) {
normalizedRepositoryPaths += 1;
}
const itemType = lc.itemType || 'link';
const validity = normalizeLinkCheckerValidity(lc.validity);
const httpStatus = Number.parseInt(lc.httpStatus, 10);
const statusBucket = classifyStatusBucket(httpStatus);
const isExplicitBrokenValidity = validity === 'INVALID';
const hasBrokenStatus = Boolean(statusBucket);
const requireBrokenStatus = requiresExplicitBrokenStatus(itemType);

if (!lc.urlFrom || !lc.urlTo) {
skippedLinkCheckerRows += 1;
if (!normalizedUrlFrom || !normalizedUrlTo) {
linkCheckerSkipReasons.missingUrl += 1;
return null;
}

if (!hasBrokenStatus && !isExplicitBrokenValidity) {
skippedLinkCheckerRows += 1;
if ((requireBrokenStatus && !hasBrokenStatus)
|| (!hasBrokenStatus && !isExplicitBrokenValidity)) {
linkCheckerSkipReasons.noBrokenSignal += 1;
return null;
}

if (baseURL) {
const targetInScope = isWithinAuditScope(lc.urlTo, baseURL)
|| isSharedInternalResource(lc.urlTo, baseURL, itemType);
if (!(isOnAuditHost(lc.urlFrom, baseURL)
&& isOnAuditHost(lc.urlTo, baseURL)
&& isWithinAuditScope(lc.urlFrom, baseURL)
const targetInScope = isWithinAuditScope(normalizedUrlTo, baseURL)
|| isSharedInternalResource(normalizedUrlTo, baseURL, itemType);
if (!(isOnAuditHost(normalizedUrlFrom, baseURL)
&& isOnAuditHost(normalizedUrlTo, baseURL)
&& isWithinAuditScope(normalizedUrlFrom, baseURL)
&& targetInScope)) {
skippedLinkCheckerRows += 1;
linkCheckerSkipReasons.outsideScope += 1;
return null;
}
}

return {
urlFrom: lc.urlFrom,
urlTo: lc.urlTo,
urlFrom: normalizedUrlFrom,
urlTo: normalizedUrlTo,
anchorText: lc.anchorText || '',
itemType,
detectionSource: 'linkchecker',
Expand All @@ -238,9 +298,20 @@ export function createFinalizeCrawlDetection({
.filter(Boolean);
/* c8 ignore stop */

log.info(
`LinkChecker normalization v2 active: normalized=${normalizedLinkCheckerUrls}, `
+ `repositoryPaths=${normalizedRepositoryPaths}`,
);
log.info(`LinkChecker links transformed: ${linkCheckerLinks.length} broken links`);
const skippedLinkCheckerRows = Object.values(linkCheckerSkipReasons)
.reduce((sum, count) => sum + count, 0);
if (skippedLinkCheckerRows > 0) {
log.info(`Skipped ${skippedLinkCheckerRows} LinkChecker rows without a broken signal or outside audit scope`);
log.info(
`Skipped ${skippedLinkCheckerRows} LinkChecker rows `
+ `(missingUrl=${linkCheckerSkipReasons.missingUrl}, `
+ `noBrokenSignal=${linkCheckerSkipReasons.noBrokenSignal}, `
+ `outsideScope=${linkCheckerSkipReasons.outsideScope})`,
);
}

const preValidationStatus = getTimeoutStatus(lambdaStartTime, context);
Expand Down
7 changes: 4 additions & 3 deletions src/internal-links/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client';
import { Audit, Opportunity as Oppty, Suggestion as SuggestionDataAccess } from '@adobe/spacecat-shared-data-access';
import { isNonEmptyArray } from '@adobe/spacecat-shared-utils';
import { AuditBuilder } from '../common/audit-builder.js';
import { wwwUrlResolver } from '../common/base-audit.js';
import { convertToOpportunity } from '../common/opportunity.js';
import { createContextLogger } from '../common/context-logger.js';
import { isUnscrapeable, filterBrokenSuggestedUrls } from '../utils/url-utils.js';
Expand Down Expand Up @@ -72,6 +71,7 @@ import { createInternalLinksOrchestration } from './orchestration.js';
import { createInternalLinksConfigResolver } from './config.js';
import { createSplunkClient } from './splunk-client.js';
import { extractLocalePathPrefix } from './scope-utils.js';
import { resolveInternalLinksRumDomain } from './base-url.js';

const { AUDIT_STEP_DESTINATIONS } = Audit;
const INTERVAL = 30;
Expand All @@ -91,8 +91,9 @@ export const {
auditType: AUDIT_TYPE,
interval: INTERVAL,
createContextLogger,
createConfigResolver: createInternalLinksConfigResolver,
createRUMAPIClient: (context) => RUMAPIClient.createFrom(context),
resolveFinalUrl: wwwUrlResolver,
resolveFinalUrl: resolveInternalLinksRumDomain,
isLinkInaccessible,
calculatePriority,
isWithinAuditScope,
Expand Down Expand Up @@ -163,7 +164,7 @@ export const {
});

export default new AuditBuilder()
.withUrlResolver(wwwUrlResolver)
.withUrlResolver(resolveInternalLinksRumDomain)
.addStep(
'runAuditAndImportTopPagesStep',
runAuditAndImportTopPagesStep,
Expand Down
13 changes: 11 additions & 2 deletions src/internal-links/linkchecker-orchestration.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import {
releaseExecutionLock,
} from './batch-state.js';
import { sleep } from '../support/utils.js';
import { resolveInternalLinksBaseURL } from './base-url.js';

const MAX_POLLING_CONTINUATIONS = 10;

Expand Down Expand Up @@ -160,7 +161,8 @@ export function createLinkCheckerOrchestration({

const auditId = audit.getId();
const executionLockKey = 'linkchecker-start';
const isLinkcheckerEnabled = config.isLinkCheckerEnabled();
const linkCheckerFlagDebug = config.getLinkCheckerFlagDebugInfo();
const isLinkcheckerEnabled = linkCheckerFlagDebug.enabled;
const log = createInternalLinksStepLogger({
createContextLogger,
log: baseLog,
Expand All @@ -171,7 +173,13 @@ export function createLinkCheckerOrchestration({
});

log.info('====== LinkChecker Detection Step ======');
log.info(`auditId: ${auditId}, isLinkcheckerEnabled: ${isLinkcheckerEnabled}`);
log.info(
`auditId: ${auditId}, isLinkcheckerEnabled: ${isLinkcheckerEnabled}, `
+ `flagSource=${linkCheckerFlagDebug.source}, `
+ `camelCaseRaw=${String(linkCheckerFlagDebug.camelCaseRaw)}, `
+ `legacyRaw=${String(linkCheckerFlagDebug.legacyRaw)}, `
+ 'resolverVersion=v2',
);

const workflowCompletedAt = getWorkflowCompletedAt(audit);
if (workflowCompletedAt) {
Expand Down Expand Up @@ -219,6 +227,7 @@ export function createLinkCheckerOrchestration({
programId,
environmentId,
lookbackMinutes,
scopeBaseURL: resolveInternalLinksBaseURL(site),
});

log.info('Submitting Splunk job for LinkChecker logs');
Expand Down
Loading
Loading