adobe · slitviachenko · Apr 9, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/src/backlinks/handler.js b/src/backlinks/handler.js
@@ -22,10 +22,10 @@ import calculateKpiMetrics from './kpi-metrics.js';
 import { convertToOpportunity } from '../common/opportunity.js';
 import { createOpportunityData } from './opportunity-data-mapper.js';
 import { syncSuggestionsWithPublishDetection, warnOnInvalidSuggestionData } from '../utils/data-access.js';
+import { getMergedAuditInputUrls } from '../utils/audit-input-urls.js';
 import { filterByAuditScope, extractPathPrefix } from '../internal-links/subpath-filter.js';
 import {
   filterBrokenSuggestedUrls,
-  isUnscrapeable,
   urlsMatch,
 } from '../utils/url-utils.js';
 import BrightDataClient, { buildLocaleSearchUrl } from '../support/bright-data-client.js';
@@ -238,24 +238,30 @@ export async function submitForScraping(context) {
   const {
     site, dataAccess, audit, log,
   } = context;
-  const { SiteTopPage } = dataAccess;
   const auditResult = audit.getAuditResult();
   if (auditResult.success === false) {
     throw new Error('Audit failed, skipping scraping and suggestions generation');
   }
-  const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'seo', 'global');
+  const { urls: allUrls } = await getMergedAuditInputUrls({
+    site,
+    dataAccess,
+    auditType: 'broken-backlinks',
+    getAgenticUrls: () => Promise.resolve([]),
+    log,
+  });
+  const allPages = allUrls.map((url) => ({ getUrl: () => url }));
 
-  // Filter top pages by audit scope (subpath/locale) if baseURL has a subpath
   const baseURL = site.getBaseURL();
-  const filteredTopPages = filterByAuditScope(topPages, baseURL, { urlProperty: 'getUrl' }, log);
+  // Filter merged audit input URLs by audit scope (subpath/locale) if baseURL has a subpath
+  const filteredTopPages = filterByAuditScope(allPages, baseURL, { urlProperty: 'getUrl' }, log);
 
-  log.info(`Found ${topPages.length} top pages, ${filteredTopPages.length} within audit scope`);
+  log.info(`Found ${allPages.length} top pages (${allUrls.length} merged), ${filteredTopPages.length} within audit scope`);
 
   if (filteredTopPages.length === 0) {
-    if (topPages.length === 0) {
+    if (allPages.length === 0) {
       throw new Error(`No top pages found in database for site ${site.getId()}. SEO data import required.`);
     } else {
-      throw new Error(`All ${topPages.length} top pages filtered out by audit scope. BaseURL: ${baseURL} requires subpath match but no pages match scope.`);
+      throw new Error(`All ${allPages.length} top pages filtered out by audit scope. BaseURL: ${baseURL} requires subpath match but no pages match scope.`);
     }
   }
 
@@ -270,7 +276,7 @@ export const generateSuggestionData = async (context) => {
   const {
     site, audit, dataAccess, log, sqs, env, finalUrl,
   } = context;
-  const { Configuration, Suggestion, SiteTopPage } = dataAccess;
+  const { Configuration, Suggestion } = dataAccess;
 
   const auditResult = audit.getAuditResult();
   if (auditResult.success === false) {
@@ -479,12 +485,20 @@ export const generateSuggestionData = async (context) => {
   );
 
   // Get top pages and filter by audit scope
-  const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'seo', 'global');
-  const baseURL = site.getBaseURL();
-  const filteredTopPages = filterByAuditScope(topPages, baseURL, { urlProperty: 'getUrl' }, log);
+  const { urls: allUrls } = await getMergedAuditInputUrls({
+    site,
+    dataAccess,
+    auditType: 'broken-backlinks',
+    getAgenticUrls: () => Promise.resolve([]),
+    log,
+  });
+  const allPages = allUrls.map((url) => ({ getUrl: () => url }));
 
+  const baseURL = site.getBaseURL();
   // Filter alternatives by locales/subpaths present in broken links
   // This limits suggestions to relevant locales only
+  const filteredTopPages = filterByAuditScope(allPages, baseURL, { urlProperty: 'getUrl' }, log);
+
   const allTopPageUrls = filteredTopPages.map((page) => page.getUrl());
 
   // Extract unique locales/subpaths from broken links
@@ -511,13 +525,6 @@ export const generateSuggestionData = async (context) => {
     alternativeUrls = allTopPageUrls;
   }
 
-  // Filter out unscrape-able file types before sending to Mystique
-  const originalCount = alternativeUrls.length;
-  alternativeUrls = alternativeUrls.filter((url) => !isUnscrapeable(url));
-  if (alternativeUrls.length < originalCount) {
-    log.info(`Filtered out ${originalCount - alternativeUrls.length} unscrape-able file URLs (PDFs, Office docs, etc.) from alternative URLs before sending to Mystique`);
-  }
-
   // Validate before sending to Mystique
   if (brokenLinksForMystique.length === 0) {
     log.info('All broken links resolved via Bright Data. Skipping Mystique.');

diff --git a/src/canonical/handler.js b/src/canonical/handler.js
@@ -23,6 +23,7 @@ import {
   syncSuggestions,
   keepLatestMergeDataFunction,
 } from '../utils/data-access.js';
+import { getMergedAuditInputUrls } from '../utils/audit-input-urls.js';
 import { convertToOpportunity } from '../common/opportunity.js';
 import { createOpportunityData, createOpportunityDataForElmo } from './opportunity-data-mapper.js';
 import { CANONICAL_CHECKS } from './constants.js';
@@ -81,14 +82,18 @@ export async function submitForScraping(context) {
     };
   }
 
-  const { SiteTopPage } = dataAccess;
-
-  const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'seo', 'global');
+  const { urls: allUrls } = await getMergedAuditInputUrls({
+    site,
+    dataAccess,
+    auditType,
+    getAgenticUrls: () => Promise.resolve([]),
+    log,
+  });
 
-  log.info(`[canonical] Found ${topPages?.length || 0} top pages for scraping`);
+  log.info(`[canonical] Found ${allUrls.length} pages for scraping`);
 
-  if (!topPages || topPages.length === 0) {
-    log.info(`[canonical] No top pages found for site ${site.getId()}, skipping scraping`);
+  if (allUrls.length === 0) {
+    log.info(`[canonical] No pages found for site ${site.getId()}, skipping scraping`);
     return {
       auditResult: {
         status: 'NO_OPPORTUNITIES',
@@ -98,18 +103,8 @@ export async function submitForScraping(context) {
     };
   }
 
-  const topPagesUrls = topPages.map((page) => page.getUrl());
-
-  // Filter out auth pages and PDFs before scraping
-  const filteredUrls = topPagesUrls.filter((url) => {
-    if (isAuthUrl(url)) {
-      return false;
-    }
-    if (isPdfUrl(url)) {
-      return false;
-    }
-    return true;
-  });
+  // Filter out auth pages (non-HTML/PDFs already filtered by getMergedAuditInputUrls)
+  const filteredUrls = allUrls.filter((url) => !isAuthUrl(url));
 
   // Filter out pages disallowed by robots.txt
   const robots = await fetchRobotsTxt(site.getBaseURL(), log);

diff --git a/src/headings/handler.js b/src/headings/handler.js
@@ -19,6 +19,7 @@ import { noopUrlResolver } from '../common/index.js';
 import { syncSuggestions } from '../utils/data-access.js';
 import { convertToOpportunity } from '../common/opportunity.js';
 import { getTopAgenticUrlsFromAthena } from '../utils/agentic-urls.js';
+import { getMergedAuditInputUrls } from '../utils/audit-input-urls.js';
 import { createOpportunityData } from './opportunity-data-mapper.js';
 
 import {
@@ -29,7 +30,6 @@ import {
   cheerioLoad,
   loadScrapeJson,
   getBrandGuidelines,
-  getTopPages,
   initializeAuditContext,
 } from './shared-utils.js';
 
@@ -379,22 +379,20 @@ export async function headingsAuditRunner(baseURL, context, site) {
   const { S3_SCRAPER_BUCKET_NAME } = context.env;
 
   try {
-    // Get top 200 pages - try Athena first, fall back to SEO provider
+    // Merge URL sources: custom audit targets, included URLs, agentic (Athena), SEO top pages
     log.debug(`[Headings Audit] Fetching top pages for site: ${siteId}`);
 
-    let topPages = [];
-
-    // Try to get top agentic URLs from Athena first
-    const athenaUrls = await getTopAgenticUrlsFromAthena(site, context);
-    if (athenaUrls && athenaUrls.length > 0) {
-      topPages = athenaUrls.slice(0, 200).map((url) => ({ url }));
-    } else {
-      // Fallback to SEO provider if Athena returns no data
-      log.info('[Headings Audit] No agentic URLs from Athena, falling back to SEO top pages');
-      topPages = await getTopPages(dataAccess, siteId, context, log, 200);
-    }
+    const mergedInput = await getMergedAuditInputUrls({
+      site,
+      dataAccess,
+      auditType,
+      getAgenticUrls: () => getTopAgenticUrlsFromAthena(site, context),
+      topOrganicLimit: 200,
+      log,
+    });
+    const topPages = mergedInput.urls.map((url) => ({ url }));
 
-    log.debug(`[Headings Audit] Processing ${topPages.length} top pages for headings audit (limited to 200)`);
+    log.debug(`[Headings Audit] Processing ${topPages.length} top pages for headings audit`);
     log.debug(`[Headings Audit] Top pages sample: ${topPages.slice(0, 3).map((p) => p.url).join(', ')}`);
     if (topPages.length === 0) {
       log.warn('[Headings Audit] No top pages found, ending audit.');

diff --git a/src/headings/shared-utils.js b/src/headings/shared-utils.js
@@ -14,7 +14,6 @@ import { getPrompt } from '@adobe/spacecat-shared-utils';
 import { AzureOpenAIClient } from '@adobe/spacecat-shared-gpt-client';
 import { load as cheerioLoad } from 'cheerio';
 import SeoChecks from '../metatags/seo-checks.js';
-import { getTopPagesForSiteId } from '../utils/data-access.js';
 import { getObjectKeysUsingPrefix, getObjectFromKey } from '../utils/s3-utils.js';
 import {
   getHeadingLevel,
@@ -224,28 +223,6 @@ export async function getBrandGuidelines(healthyTagsObject, log, context, site =
   return aiResponseContent;
 }
 
-/**
- * Get top pages for a site with validation
- * @param {Object} dataAccess - Data access object
- * @param {string} siteId - Site ID
- * @param {Object} context - Audit context
- * @param {Object} log - Logger instance
- * @param {number} limit - Maximum number of pages to return
- * @returns {Promise<Array>} Array of top pages
- */
-export async function getTopPages(dataAccess, siteId, context, log, limit = 200) {
-  log.debug(`Fetching top pages for site: ${siteId}`);
-  const allTopPages = await getTopPagesForSiteId(dataAccess, siteId, context, log);
-  const topPages = allTopPages.slice(0, limit);
-
-  log.debug(`Processing ${topPages.length} top pages (limited to ${limit})`);
-  if (topPages.length > 0) {
-    log.debug(`Top pages sample: ${topPages.slice(0, 3).map((p) => p.url).join(', ')}`);
-  }
-
-  return topPages;
-}
-
 /**
  * Initialize context for headings/TOC audit
  * @param {Object} context - Audit context

diff --git a/src/hreflang/handler.js b/src/hreflang/handler.js
@@ -17,7 +17,10 @@ import { isLangCode } from 'is-language-code';
 
 import { AuditBuilder } from '../common/audit-builder.js';
 import { noopUrlResolver } from '../common/index.js';
-import { syncSuggestions, keepLatestMergeDataFunction, getTopPagesForSiteId } from '../utils/data-access.js';
+import {
+  syncSuggestions, keepLatestMergeDataFunction,
+} from '../utils/data-access.js';
+import { getMergedAuditInputUrls } from '../utils/audit-input-urls.js';
 import { convertToOpportunity } from '../common/opportunity.js';
 import { createOpportunityData, createOpportunityDataForElmo } from './opportunity-data-mapper.js';
 import { limitConcurrencyAllSettled } from '../support/utils.js';
@@ -166,9 +169,16 @@ export async function hreflangAuditRunner(baseURL, context, site) {
   log.debug(`Starting Hreflang Audit with siteId: ${siteId}`);
 
   try {
-    // Get top 200 pages
-    const allTopPages = await getTopPagesForSiteId(dataAccess, siteId, context, log);
-    const topPages = allTopPages.slice(0, 200);
+    // Merge all URL sources: custom audit targets, included URLs, SEO top pages
+    const mergedInput = await getMergedAuditInputUrls({
+      site,
+      dataAccess,
+      auditType,
+      getAgenticUrls: () => Promise.resolve([]),
+      topOrganicLimit: 200,
+      log,
+    });
+    const topPages = mergedInput.urls.map((url) => ({ url }));
 
     log.debug(`Processing ${topPages.length} top pages for hreflang audit (limited to 200)`);
 

diff --git a/src/image-alt-text/url-utils.js b/src/image-alt-text/url-utils.js
@@ -14,6 +14,7 @@ import { Audit as AuditModel } from '@adobe/spacecat-shared-data-access';
 import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client';
 import { getRUMUrl } from '../support/utils.js';
 import { RUM_INTERVAL } from './constants.js';
+import { getAuditTargetUrls } from '../utils/data-access.js';
 
 const AUDIT_TYPE = AuditModel.AUDIT_TYPES.ALT_TEXT;
 
@@ -32,7 +33,8 @@ function normalizeUrl(url) {
 }
 
 /**
- * Fetches top page URLs using a fallback chain: SEO → RUM → includedURLs.
+ * Fetches top page URLs using a fallback chain: SEO → RUM → includedURLs,
+ * then merges custom audit target URLs from site config.
  * @param {Object} params
  * @param {string} params.siteId - Site ID
  * @param {Object} params.site - Site object
@@ -45,43 +47,57 @@ export async function getTopPageUrls({
   siteId, site, dataAccess, context, log,
 }) {
   const { SiteTopPage } = dataAccess;
+  let baseUrls = [];
 
   // 1. Try SEO top pages
   const seoPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(siteId, 'seo', 'global');
   if (seoPages.length > 0) {
     log.info(`[${AUDIT_TYPE}]: Found ${seoPages.length} top pages from SEO provider`);
-    return seoPages.map((page) => page.getUrl());
+    baseUrls = seoPages.map((page) => page.getUrl());
   }
 
   // 2. Fallback to RUM traffic-acquisition
-  log.info(`[${AUDIT_TYPE}]: No SEO top pages, falling back to RUM`);
-  try {
-    const finalUrl = await getRUMUrl(site.getBaseURL());
-    const rumAPIClient = RUMAPIClient.createFrom(context);
-    const options = {
-      domain: finalUrl,
-      interval: RUM_INTERVAL,
-    };
-    const results = await rumAPIClient.query('traffic-acquisition', options);
-    if (results && results.length > 0) {
-      const rumUrls = results
-        .sort((a, b) => (b.earned || 0) - (a.earned || 0))
-        .map((r) => normalizeUrl(r.url));
-      log.info(`[${AUDIT_TYPE}]: Found ${rumUrls.length} URLs from RUM`);
-      return rumUrls;
+  if (baseUrls.length === 0) {
+    log.info(`[${AUDIT_TYPE}]: No SEO top pages, falling back to RUM`);
+    try {
+      const finalUrl = await getRUMUrl(site.getBaseURL());
+      const rumAPIClient = RUMAPIClient.createFrom(context);
+      const options = {
+        domain: finalUrl,
+        interval: RUM_INTERVAL,
+      };
+      const results = await rumAPIClient.query('traffic-acquisition', options);
+      if (results && results.length > 0) {
+        baseUrls = results
+          .sort((a, b) => (b.earned || 0) - (a.earned || 0))
+          .map((r) => normalizeUrl(r.url));
+        log.info(`[${AUDIT_TYPE}]: Found ${baseUrls.length} URLs from RUM`);
+      }
+    } catch (err) {
+      log.warn(`[${AUDIT_TYPE}]: RUM fallback failed: ${err.message}`);
     }
-  } catch (err) {
-    log.warn(`[${AUDIT_TYPE}]: RUM fallback failed: ${err.message}`);
   }
 
   // 3. Fallback to includedURLs from site config
-  log.info(`[${AUDIT_TYPE}]: No URLs from RUM, falling back to includedURLs`);
-  const includedURLs = site?.getConfig?.()?.getIncludedURLs('alt-text') || [];
-  if (includedURLs.length > 0) {
-    log.info(`[${AUDIT_TYPE}]: Found ${includedURLs.length} included URLs from site config`);
-    return includedURLs;
+  if (baseUrls.length === 0) {
+    log.info(`[${AUDIT_TYPE}]: No URLs from RUM, falling back to includedURLs`);
+    const includedURLs = site?.getConfig?.()?.getIncludedURLs('alt-text') || [];
+    if (includedURLs.length > 0) {
+      log.info(`[${AUDIT_TYPE}]: Found ${includedURLs.length} included URLs from site config`);
+      baseUrls = includedURLs;
+    }
+  }
+
+  // 4. Merge custom audit target URLs
+  const customUrls = getAuditTargetUrls(site, log);
+  if (customUrls.length > 0) {
+    const merged = [...new Set([...baseUrls, ...customUrls])];
+    log.info(`[${AUDIT_TYPE}]: Merged ${baseUrls.length} base URLs + ${customUrls.length} custom URLs = ${merged.length} unique`);
+    return merged;
   }
 
-  log.warn(`[${AUDIT_TYPE}]: No URLs found from any source (SEO, RUM, includedURLs)`);
-  return [];
+  if (baseUrls.length === 0) {
+    log.warn(`[${AUDIT_TYPE}]: No URLs found from any source (SEO, RUM, includedURLs, custom)`);
+  }
+  return baseUrls;
 }
diff --git a/src/internal-links/handler.js b/src/internal-links/handler.js
@@ -17,7 +17,7 @@ import { AuditBuilder } from '../common/audit-builder.js';
 import { wwwUrlResolver } from '../common/base-audit.js';
 import { convertToOpportunity } from '../common/opportunity.js';
 import { createContextLogger } from '../common/context-logger.js';
-import { isUnscrapeable, filterBrokenSuggestedUrls } from '../utils/url-utils.js';
+import { filterBrokenSuggestedUrls } from '../utils/url-utils.js';
 import { syncBrokenInternalLinksSuggestions } from './suggestions-generator.js';
 import {
   isLinkInaccessible,
@@ -98,7 +98,6 @@ export const submitForScraping = createSubmitForScraping({
   auditType: AUDIT_TYPE,
   createContextLogger,
   isWithinAuditScope,
-  isUnscrapeable,
 });
 
 export const opportunityAndSuggestionsStep = createOpportunityAndSuggestionsStep({
@@ -113,7 +112,6 @@ export const opportunityAndSuggestionsStep = createOpportunityAndSuggestionsStep
   syncBrokenInternalLinksSuggestions,
   filterByAuditScope,
   extractPathPrefix: extractLocalePathPrefix,
-  isUnscrapeable,
   filterBrokenSuggestedUrls,
   BrightDataClient,
   buildLocaleSearchUrl,