Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
5e31734
feat: Manage a custom list of audit target URLs
slitviachenko Apr 2, 2026
7edf209
Merge remote-tracking branch 'origin/main' into SITES-42547-Manage-a-…
slitviachenko Apr 2, 2026
c7999ca
feat: Manage a custom list of audit target URLs
slitviachenko Apr 2, 2026
fec9679
feat: Manage a custom list of audit target URLs
slitviachenko Apr 3, 2026
0f97843
feat: Manage a custom list of audit target URLs
slitviachenko Apr 3, 2026
f9844c7
Add process.env.SPACECAT_DISABLE_CUSTOM_AUDIT_TARGET_URLS
slitviachenko Apr 3, 2026
bf495bc
Add process.env.SPACECAT_ENABLE_CUSTOM_AUDIT_TARGET_URLS
slitviachenko Apr 3, 2026
ee117b6
Merge remote-tracking branch 'origin/main' into SITES-42547-Manage-a-…
slitviachenko Apr 7, 2026
79c1eed
Merge remote-tracking branch 'origin/main' into SITES-42547-Manage-a-…
slitviachenko Apr 7, 2026
b2f25d8
Refactor URL merging logic across multiple handlers to use `mergeTopP…
slitviachenko Apr 8, 2026
f72df51
Refactor URL handling across multiple handlers to utilize `getMergedA…
slitviachenko Apr 8, 2026
ed9a54d
Refactor `getTopPagesForSiteId` to simplify URL retrieval and improve…
slitviachenko Apr 8, 2026
46ea4e6
Merge remote-tracking branch 'origin/main' into SITES-42547-Manage-a-…
slitviachenko Apr 8, 2026
b5ee643
Merge remote-tracking branch 'origin/main' into SITES-42547-Manage-a-…
slitviachenko Apr 8, 2026
bb3914e
Remove the INCLUDE_CUSTOM_URLS flag
slitviachenko Apr 9, 2026
943d89a
Merge remote-tracking branch 'origin/main' into SITES-42547-Manage-a-…
slitviachenko Apr 9, 2026
654947b
Merge remote-tracking branch 'origin/main' into SITES-42547-Manage-a-…
slitviachenko Apr 9, 2026
508edb0
fix some linter issues
slitviachenko Apr 9, 2026
080a7e3
Merge remote-tracking branch 'origin/main' into SITES-42547-Manage-a-…
slitviachenko Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 26 additions & 19 deletions src/backlinks/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ import calculateKpiMetrics from './kpi-metrics.js';
import { convertToOpportunity } from '../common/opportunity.js';
import { createOpportunityData } from './opportunity-data-mapper.js';
import { syncSuggestionsWithPublishDetection, warnOnInvalidSuggestionData } from '../utils/data-access.js';
import { getMergedAuditInputUrls } from '../utils/audit-input-urls.js';
import { filterByAuditScope, extractPathPrefix } from '../internal-links/subpath-filter.js';
import {
filterBrokenSuggestedUrls,
isUnscrapeable,
urlsMatch,
} from '../utils/url-utils.js';
import BrightDataClient, { buildLocaleSearchUrl } from '../support/bright-data-client.js';
Expand Down Expand Up @@ -238,24 +238,30 @@ export async function submitForScraping(context) {
const {
site, dataAccess, audit, log,
} = context;
const { SiteTopPage } = dataAccess;
const auditResult = audit.getAuditResult();
if (auditResult.success === false) {
throw new Error('Audit failed, skipping scraping and suggestions generation');
}
const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'seo', 'global');
const { urls: allUrls } = await getMergedAuditInputUrls({
site,
dataAccess,
auditType: 'broken-backlinks',
getAgenticUrls: () => Promise.resolve([]),
log,
});
const allPages = allUrls.map((url) => ({ getUrl: () => url }));

// Filter top pages by audit scope (subpath/locale) if baseURL has a subpath
const baseURL = site.getBaseURL();
const filteredTopPages = filterByAuditScope(topPages, baseURL, { urlProperty: 'getUrl' }, log);
// Filter top pages by audit scope (subpath/locale) if baseURL has a subpath
const filteredTopPages = filterByAuditScope(allPages, baseURL, { urlProperty: 'getUrl' }, log);

log.info(`Found ${topPages.length} top pages, ${filteredTopPages.length} within audit scope`);
log.info(`Found ${allPages.length} top pages (${allUrls.length} merged), ${filteredTopPages.length} within audit scope`);

if (filteredTopPages.length === 0) {
if (topPages.length === 0) {
if (allPages.length === 0) {
throw new Error(`No top pages found in database for site ${site.getId()}. SEO data import required.`);
} else {
throw new Error(`All ${topPages.length} top pages filtered out by audit scope. BaseURL: ${baseURL} requires subpath match but no pages match scope.`);
throw new Error(`All ${allPages.length} top pages filtered out by audit scope. BaseURL: ${baseURL} requires subpath match but no pages match scope.`);
}
}

Expand All @@ -270,7 +276,7 @@ export const generateSuggestionData = async (context) => {
const {
site, audit, dataAccess, log, sqs, env, finalUrl,
} = context;
const { Configuration, Suggestion, SiteTopPage } = dataAccess;
const { Configuration, Suggestion } = dataAccess;

const auditResult = audit.getAuditResult();
if (auditResult.success === false) {
Expand Down Expand Up @@ -479,12 +485,20 @@ export const generateSuggestionData = async (context) => {
);

// Get top pages and filter by audit scope
const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'seo', 'global');
const baseURL = site.getBaseURL();
const filteredTopPages = filterByAuditScope(topPages, baseURL, { urlProperty: 'getUrl' }, log);
const { urls: allUrls } = await getMergedAuditInputUrls({
site,
dataAccess,
auditType: 'broken-backlinks',
getAgenticUrls: () => Promise.resolve([]),
log,
});
const allPages = allUrls.map((url) => ({ getUrl: () => url }));

const baseURL = site.getBaseURL();
// Filter alternatives by locales/subpaths present in broken links
// This limits suggestions to relevant locales only
const filteredTopPages = filterByAuditScope(allPages, baseURL, { urlProperty: 'getUrl' }, log);

const allTopPageUrls = filteredTopPages.map((page) => page.getUrl());

// Extract unique locales/subpaths from broken links
Expand All @@ -511,13 +525,6 @@ export const generateSuggestionData = async (context) => {
alternativeUrls = allTopPageUrls;
}

// Filter out unscrape-able file types before sending to Mystique
const originalCount = alternativeUrls.length;
alternativeUrls = alternativeUrls.filter((url) => !isUnscrapeable(url));
if (alternativeUrls.length < originalCount) {
log.info(`Filtered out ${originalCount - alternativeUrls.length} unscrape-able file URLs (PDFs, Office docs, etc.) from alternative URLs before sending to Mystique`);
}

// Validate before sending to Mystique
if (brokenLinksForMystique.length === 0) {
log.info('All broken links resolved via Bright Data. Skipping Mystique.');
Expand Down
31 changes: 13 additions & 18 deletions src/canonical/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import {
syncSuggestions,
keepLatestMergeDataFunction,
} from '../utils/data-access.js';
import { getMergedAuditInputUrls } from '../utils/audit-input-urls.js';
import { convertToOpportunity } from '../common/opportunity.js';
import { createOpportunityData, createOpportunityDataForElmo } from './opportunity-data-mapper.js';
import { CANONICAL_CHECKS } from './constants.js';
Expand Down Expand Up @@ -81,14 +82,18 @@ export async function submitForScraping(context) {
};
}

const { SiteTopPage } = dataAccess;

const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'seo', 'global');
const { urls: allUrls } = await getMergedAuditInputUrls({
site,
dataAccess,
auditType,
getAgenticUrls: () => Promise.resolve([]),
log,
});

log.info(`[canonical] Found ${topPages?.length || 0} top pages for scraping`);
log.info(`[canonical] Found ${allUrls.length} pages for scraping`);

if (!topPages || topPages.length === 0) {
log.info(`[canonical] No top pages found for site ${site.getId()}, skipping scraping`);
if (allUrls.length === 0) {
log.info(`[canonical] No pages found for site ${site.getId()}, skipping scraping`);
return {
auditResult: {
status: 'NO_OPPORTUNITIES',
Expand All @@ -98,18 +103,8 @@ export async function submitForScraping(context) {
};
}

const topPagesUrls = topPages.map((page) => page.getUrl());

// Filter out auth pages and PDFs before scraping
const filteredUrls = topPagesUrls.filter((url) => {
if (isAuthUrl(url)) {
return false;
}
if (isPdfUrl(url)) {
return false;
}
return true;
});
// Filter out auth pages (non-HTML/PDFs already filtered by getMergedAuditInputUrls)
const filteredUrls = allUrls.filter((url) => !isAuthUrl(url));

// Filter out pages disallowed by robots.txt
const robots = await fetchRobotsTxt(site.getBaseURL(), log);
Expand Down
26 changes: 12 additions & 14 deletions src/headings/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import { noopUrlResolver } from '../common/index.js';
import { syncSuggestions } from '../utils/data-access.js';
import { convertToOpportunity } from '../common/opportunity.js';
import { getTopAgenticUrlsFromAthena } from '../utils/agentic-urls.js';
import { getMergedAuditInputUrls } from '../utils/audit-input-urls.js';
import { createOpportunityData } from './opportunity-data-mapper.js';

import {
Expand All @@ -29,7 +30,6 @@ import {
cheerioLoad,
loadScrapeJson,
getBrandGuidelines,
getTopPages,
initializeAuditContext,
} from './shared-utils.js';

Expand Down Expand Up @@ -379,22 +379,20 @@ export async function headingsAuditRunner(baseURL, context, site) {
const { S3_SCRAPER_BUCKET_NAME } = context.env;

try {
// Get top 200 pages - try Athena first, fall back to SEO provider
// Merge all URL sources: custom audit targets, included, agentic (Athena), SEO top pages
log.debug(`[Headings Audit] Fetching top pages for site: ${siteId}`);

let topPages = [];

// Try to get top agentic URLs from Athena first
const athenaUrls = await getTopAgenticUrlsFromAthena(site, context);
if (athenaUrls && athenaUrls.length > 0) {
topPages = athenaUrls.slice(0, 200).map((url) => ({ url }));
} else {
// Fallback to SEO provider if Athena returns no data
log.info('[Headings Audit] No agentic URLs from Athena, falling back to SEO top pages');
topPages = await getTopPages(dataAccess, siteId, context, log, 200);
}
const mergedInput = await getMergedAuditInputUrls({
site,
dataAccess,
auditType,
getAgenticUrls: () => getTopAgenticUrlsFromAthena(site, context),
topOrganicLimit: 200,
log,
});
const topPages = mergedInput.urls.map((url) => ({ url }));

log.debug(`[Headings Audit] Processing ${topPages.length} top pages for headings audit (limited to 200)`);
log.debug(`[Headings Audit] Processing ${topPages.length} top pages for headings audit`);
log.debug(`[Headings Audit] Top pages sample: ${topPages.slice(0, 3).map((p) => p.url).join(', ')}`);
if (topPages.length === 0) {
log.warn('[Headings Audit] No top pages found, ending audit.');
Expand Down
23 changes: 0 additions & 23 deletions src/headings/shared-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import { getPrompt } from '@adobe/spacecat-shared-utils';
import { AzureOpenAIClient } from '@adobe/spacecat-shared-gpt-client';
import { load as cheerioLoad } from 'cheerio';
import SeoChecks from '../metatags/seo-checks.js';
import { getTopPagesForSiteId } from '../utils/data-access.js';
import { getObjectKeysUsingPrefix, getObjectFromKey } from '../utils/s3-utils.js';
import {
getHeadingLevel,
Expand Down Expand Up @@ -224,28 +223,6 @@ export async function getBrandGuidelines(healthyTagsObject, log, context, site =
return aiResponseContent;
}

/**
 * Fetch the top pages of a site and cap the result at `limit` entries.
 *
 * The lookup itself is delegated to `getTopPagesForSiteId`; this wrapper only
 * truncates the returned list and emits debug logging around it.
 *
 * @param {Object} dataAccess - Data access object, forwarded to the lookup helper
 * @param {string} siteId - Site ID
 * @param {Object} context - Audit context, forwarded to the lookup helper
 * @param {Object} log - Logger instance (only `debug` is used here)
 * @param {number} limit - Maximum number of pages to return (defaults to 200)
 * @returns {Promise<Array>} The first `limit` pages returned by the lookup helper
 */
export async function getTopPages(dataAccess, siteId, context, log, limit = 200) {
  log.debug(`Fetching top pages for site: ${siteId}`);

  // NOTE(review): assumes the helper resolves to an array of objects exposing
  // a `url` property - confirm against getTopPagesForSiteId.
  const fetchedPages = await getTopPagesForSiteId(dataAccess, siteId, context, log);
  const limitedPages = fetchedPages.slice(0, limit);
  log.debug(`Processing ${limitedPages.length} top pages (limited to ${limit})`);

  // Nothing to sample when the (truncated) list is empty.
  if (limitedPages.length === 0) {
    return limitedPages;
  }

  // Log at most the first three URLs as a quick sanity sample.
  const sampleUrls = [];
  for (const page of limitedPages.slice(0, 3)) {
    sampleUrls.push(page.url);
  }
  log.debug(`Top pages sample: ${sampleUrls.join(', ')}`);

  return limitedPages;
}

/**
* Initialize context for headings/TOC audit
* @param {Object} context - Audit context
Expand Down
18 changes: 14 additions & 4 deletions src/hreflang/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@ import { isLangCode } from 'is-language-code';

import { AuditBuilder } from '../common/audit-builder.js';
import { noopUrlResolver } from '../common/index.js';
import { syncSuggestions, keepLatestMergeDataFunction, getTopPagesForSiteId } from '../utils/data-access.js';
import {
syncSuggestions, keepLatestMergeDataFunction,
} from '../utils/data-access.js';
import { getMergedAuditInputUrls } from '../utils/audit-input-urls.js';
import { convertToOpportunity } from '../common/opportunity.js';
import { createOpportunityData, createOpportunityDataForElmo } from './opportunity-data-mapper.js';
import { limitConcurrencyAllSettled } from '../support/utils.js';
Expand Down Expand Up @@ -166,9 +169,16 @@ export async function hreflangAuditRunner(baseURL, context, site) {
log.debug(`Starting Hreflang Audit with siteId: ${siteId}`);

try {
// Get top 200 pages
const allTopPages = await getTopPagesForSiteId(dataAccess, siteId, context, log);
const topPages = allTopPages.slice(0, 200);
// Merge all URL sources: custom audit targets, included, SEO top pages
const mergedInput = await getMergedAuditInputUrls({
site,
dataAccess,
auditType,
getAgenticUrls: () => Promise.resolve([]),
topOrganicLimit: 200,
log,
});
const topPages = mergedInput.urls.map((url) => ({ url }));

log.debug(`Processing ${topPages.length} top pages for hreflang audit (limited to 200)`);

Expand Down
68 changes: 42 additions & 26 deletions src/image-alt-text/url-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { Audit as AuditModel } from '@adobe/spacecat-shared-data-access';
import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client';
import { getRUMUrl } from '../support/utils.js';
import { RUM_INTERVAL } from './constants.js';
import { getAuditTargetUrls } from '../utils/data-access.js';

const AUDIT_TYPE = AuditModel.AUDIT_TYPES.ALT_TEXT;

Expand All @@ -32,7 +33,8 @@ function normalizeUrl(url) {
}

/**
* Fetches top page URLs using a fallback chain: SEO → RUM → includedURLs.
* Fetches top page URLs using a fallback chain: SEO → RUM → includedURLs,
* then merges custom audit target URLs from site config.
* @param {Object} params
* @param {string} params.siteId - Site ID
* @param {Object} params.site - Site object
Expand All @@ -45,43 +47,57 @@ export async function getTopPageUrls({
siteId, site, dataAccess, context, log,
}) {
const { SiteTopPage } = dataAccess;
let baseUrls = [];

// 1. Try SEO top pages
const seoPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(siteId, 'seo', 'global');
if (seoPages.length > 0) {
log.info(`[${AUDIT_TYPE}]: Found ${seoPages.length} top pages from SEO provider`);
return seoPages.map((page) => page.getUrl());
baseUrls = seoPages.map((page) => page.getUrl());
}

// 2. Fallback to RUM traffic-acquisition
log.info(`[${AUDIT_TYPE}]: No SEO top pages, falling back to RUM`);
try {
const finalUrl = await getRUMUrl(site.getBaseURL());
const rumAPIClient = RUMAPIClient.createFrom(context);
const options = {
domain: finalUrl,
interval: RUM_INTERVAL,
};
const results = await rumAPIClient.query('traffic-acquisition', options);
if (results && results.length > 0) {
const rumUrls = results
.sort((a, b) => (b.earned || 0) - (a.earned || 0))
.map((r) => normalizeUrl(r.url));
log.info(`[${AUDIT_TYPE}]: Found ${rumUrls.length} URLs from RUM`);
return rumUrls;
if (baseUrls.length === 0) {
log.info(`[${AUDIT_TYPE}]: No SEO top pages, falling back to RUM`);
try {
const finalUrl = await getRUMUrl(site.getBaseURL());
const rumAPIClient = RUMAPIClient.createFrom(context);
const options = {
domain: finalUrl,
interval: RUM_INTERVAL,
};
const results = await rumAPIClient.query('traffic-acquisition', options);
if (results && results.length > 0) {
baseUrls = results
.sort((a, b) => (b.earned || 0) - (a.earned || 0))
.map((r) => normalizeUrl(r.url));
log.info(`[${AUDIT_TYPE}]: Found ${baseUrls.length} URLs from RUM`);
}
} catch (err) {
log.warn(`[${AUDIT_TYPE}]: RUM fallback failed: ${err.message}`);
}
} catch (err) {
log.warn(`[${AUDIT_TYPE}]: RUM fallback failed: ${err.message}`);
}

// 3. Fallback to includedURLs from site config
log.info(`[${AUDIT_TYPE}]: No URLs from RUM, falling back to includedURLs`);
const includedURLs = site?.getConfig?.()?.getIncludedURLs('alt-text') || [];
if (includedURLs.length > 0) {
log.info(`[${AUDIT_TYPE}]: Found ${includedURLs.length} included URLs from site config`);
return includedURLs;
if (baseUrls.length === 0) {
log.info(`[${AUDIT_TYPE}]: No URLs from RUM, falling back to includedURLs`);
const includedURLs = site?.getConfig?.()?.getIncludedURLs('alt-text') || [];
if (includedURLs.length > 0) {
log.info(`[${AUDIT_TYPE}]: Found ${includedURLs.length} included URLs from site config`);
baseUrls = includedURLs;
}
}

// 4. Merge custom audit target URLs
const customUrls = getAuditTargetUrls(site, log);
if (customUrls.length > 0) {
const merged = [...new Set([...baseUrls, ...customUrls])];
log.info(`[${AUDIT_TYPE}]: Merged ${baseUrls.length} base URLs + ${customUrls.length} custom URLs = ${merged.length} unique`);
return merged;
}

log.warn(`[${AUDIT_TYPE}]: No URLs found from any source (SEO, RUM, includedURLs)`);
return [];
if (baseUrls.length === 0) {
log.warn(`[${AUDIT_TYPE}]: No URLs found from any source (SEO, RUM, includedURLs, custom)`);
}
return baseUrls;
}
4 changes: 1 addition & 3 deletions src/internal-links/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import { AuditBuilder } from '../common/audit-builder.js';
import { wwwUrlResolver } from '../common/base-audit.js';
import { convertToOpportunity } from '../common/opportunity.js';
import { createContextLogger } from '../common/context-logger.js';
import { isUnscrapeable, filterBrokenSuggestedUrls } from '../utils/url-utils.js';
import { filterBrokenSuggestedUrls } from '../utils/url-utils.js';
import { syncBrokenInternalLinksSuggestions } from './suggestions-generator.js';
import {
isLinkInaccessible,
Expand Down Expand Up @@ -98,7 +98,6 @@ export const submitForScraping = createSubmitForScraping({
auditType: AUDIT_TYPE,
createContextLogger,
isWithinAuditScope,
isUnscrapeable,
});

export const opportunityAndSuggestionsStep = createOpportunityAndSuggestionsStep({
Expand All @@ -113,7 +112,6 @@ export const opportunityAndSuggestionsStep = createOpportunityAndSuggestionsStep
syncBrokenInternalLinksSuggestions,
filterByAuditScope,
extractPathPrefix: extractLocalePathPrefix,
isUnscrapeable,
filterBrokenSuggestedUrls,
BrightDataClient,
buildLocaleSearchUrl,
Expand Down
Loading
Loading