diff --git a/src/prerender/handler.js b/src/prerender/handler.js index 64f7bab234..a16a972596 100644 --- a/src/prerender/handler.js +++ b/src/prerender/handler.js @@ -255,6 +255,21 @@ async function getRecentlyProcessedPathnames(context, siteId) { } } +/** + * Returns true when the URL's pathname is NOT in the set of recently processed pathnames. + * URLs that cannot be parsed are treated as not recent (included by default). + * @param {string} url + * @param {Set} recentPathnames + * @returns {boolean} + */ +function isNotRecentUrl(url, recentPathnames) { + try { + return !recentPathnames.has(new URL(url).pathname); + } catch { + return true; + } +} + function normalizePathname(url) { try { const { pathname } = new URL(url); @@ -746,21 +761,6 @@ export async function importTopPages(context) { }; } -/** - * Returns true when the URL's pathname is NOT in the set of recently processed pathnames. - * URLs that cannot be parsed are treated as not recent (included by default). - * @param {string} url - * @param {Set} recentPathnames - * @returns {boolean} - */ -function isNotRecentUrl(url, recentPathnames) { - try { - return !recentPathnames.has(new URL(url).pathname); - } catch { - return true; - } -} - /** * Step 2: Submit URLs for scraping OR skip if in ai-only mode * @param {Object} context - Audit context with site and dataAccess @@ -811,55 +811,70 @@ export async function submitForScraping(context) { } const topPagesUrls = await getTopOrganicUrlsFromSeo(context); - // getTopAgenticUrls internally handles errors and returns [] on failure - const agenticUrls = await getTopAgenticUrls(site, context); - const preferredBase = getPreferredBaseUrl(site, context); const rebasedTopPagesUrls = topPagesUrls.map((url) => rebaseUrl(url, preferredBase, log)); const rebasedIncludedURLs = ((await site?.getConfig?.()?.getIncludedURLs?.(AUDIT_TYPE)) || []) .map((url) => rebaseUrl(url, preferredBase, log)); - // Daily batching: filter URLs recently processed within the rolling recent window - const recentPathnames = await getRecentlyProcessedPathnames(context, siteId); - - const filteredOrganicUrls = rebasedTopPagesUrls - .filter((url) => isNotRecentUrl(url, recentPathnames)); - const filteredIncludedURLs = rebasedIncludedURLs - .filter((url) => isNotRecentUrl(url, recentPathnames)); - const filteredAgenticUrls = agenticUrls.filter((url) => isNotRecentUrl(url, recentPathnames)); - - const hasRecentOrganic = filteredOrganicUrls.length !== topPagesUrls.length; - const isFirstRunOfCycle = !hasRecentOrganic; - - // Build a single ordered queue across all URL sources and slice the next daily batch - // after removing anything processed within the recent window. - const orderedCandidateUrls = [ - ...filteredOrganicUrls, - ...filteredIncludedURLs, - ...filteredAgenticUrls, - ]; - const batchedUrls = orderedCandidateUrls.slice(0, DAILY_BATCH_SIZE); - - const organicUrlSet = new Set(filteredOrganicUrls); - const includedUrlSet = new Set(filteredIncludedURLs); - const batchedOrganicUrls = batchedUrls.filter((url) => organicUrlSet.has(url)); - const batchedIncludedURLs = batchedUrls.filter((url) => includedUrlSet.has(url)); - const batchedAgenticUrls = batchedUrls.filter( - (url) => !organicUrlSet.has(url) && !includedUrlSet.has(url), - ); - - // Merge URLs ensuring uniqueness while handling www vs non-www differences - // Also filters out non-HTML URLs (PDFs, images, etc.) in a single pass - const { urls: finalUrls, filteredCount } = mergeAndGetUniqueHtmlUrls(batchedUrls); + // When triggered from Slack, skip agentic sources and daily batching + const isSlackTriggered = !!(auditContext?.slackContext?.channelId); + + let finalUrls; + let filteredCount; + let agenticUrlsCount = 0; + let currentAgentic = 0; + let currentOrganic; + let currentIncludedUrls; + let isFirstRunOfCycle; + let agenticNewThisCycle = 0; + + if (isSlackTriggered) { + ({ urls: finalUrls, filteredCount } = mergeAndGetUniqueHtmlUrls([ + ...rebasedTopPagesUrls, + ...rebasedIncludedURLs, + ])); + currentOrganic = rebasedTopPagesUrls.length; + currentIncludedUrls = rebasedIncludedURLs.length; + isFirstRunOfCycle = true; + } else { + // getTopAgenticUrls internally handles errors and returns [] on failure + const agenticUrls = await getTopAgenticUrls(site, context); + agenticUrlsCount = agenticUrls.length; + + // Daily batching: filter URLs recently processed within the rolling recent window + const recentPathnames = await getRecentlyProcessedPathnames(context, siteId); + + const filteredOrganicUrls = rebasedTopPagesUrls + .filter((url) => isNotRecentUrl(url, recentPathnames)); + const filteredIncludedURLs = rebasedIncludedURLs + .filter((url) => isNotRecentUrl(url, recentPathnames)); + const filteredAgenticUrls = agenticUrls.filter((url) => isNotRecentUrl(url, recentPathnames)); + + const hasRecentOrganic = filteredOrganicUrls.length !== topPagesUrls.length; + isFirstRunOfCycle = !hasRecentOrganic; + agenticNewThisCycle = filteredAgenticUrls.length; + + const orderedCandidateUrls = [ + ...filteredOrganicUrls, + ...filteredIncludedURLs, + ...filteredAgenticUrls, + ]; + const batchedUrls = orderedCandidateUrls.slice(0, DAILY_BATCH_SIZE); + + const organicUrlSet = new Set(filteredOrganicUrls); + const includedUrlSet = new Set(filteredIncludedURLs); + currentOrganic = batchedUrls.filter((url) => organicUrlSet.has(url)).length; + currentIncludedUrls = batchedUrls.filter((url) => includedUrlSet.has(url)).length; + currentAgentic = batchedUrls.filter( + (url) => !organicUrlSet.has(url) && !includedUrlSet.has(url), + ).length; - const currentAgentic = batchedAgenticUrls.length; - const currentOrganic = batchedOrganicUrls.length; - const currentIncludedUrls = batchedIncludedURLs.length; + ({ urls: finalUrls, filteredCount } = mergeAndGetUniqueHtmlUrls(batchedUrls)); + } - log.info(`${LOG_PREFIX} - prerender_submit_scraping_metrics: + log.info(`${LOG_PREFIX} prerender_submit_scraping_metrics: submittedUrls=${finalUrls.length}, - agenticUrls=${agenticUrls.length}, + agenticUrls=${agenticUrlsCount}, topPagesUrls=${topPagesUrls.length}, includedURLs=${rebasedIncludedURLs.length}, filteredOutUrls=${filteredCount}, @@ -867,9 +882,9 @@ export async function submitForScraping(context) { currentOrganic=${currentOrganic}, currentIncludedUrls=${currentIncludedUrls}, isFirstRunOfCycle=${isFirstRunOfCycle}, - agenticNewThisCycle=${filteredAgenticUrls.length}, + agenticNewThisCycle=${agenticNewThisCycle}, baseUrl=${site.getBaseURL()}, - siteId=${siteId},`); + siteId=${siteId}`); if (finalUrls.length === 0) { // Fallback to base URL if no URLs found @@ -880,7 +895,7 @@ export async function submitForScraping(context) { return { urls: finalUrls.map((url) => ({ url })), - siteId: site.getId(), + siteId, processingType: AUDIT_TYPE, maxScrapeAge: 0, options: { @@ -1441,7 +1456,7 @@ export async function getScrapeJobStats( */ export async function processContentAndGenerateOpportunities(context) { const { - site, audit, log, scrapeResultPaths, data, dataAccess, + site, audit, log, scrapeResultPaths, data, dataAccess, auditContext, } = context; // Check for AI-only mode - skip processing step (step 1 already triggered Mystique) @@ -1453,6 +1468,7 @@ export async function processContentAndGenerateOpportunities(context) { const siteId = site.getId(); const startTime = process.hrtime(); + const isSlackTriggered = !!(auditContext?.slackContext?.channelId); // Check if this is a paid LLMO customer early so we can use it in all logs const isPaid = await isPaidLLMOCustomer(context); @@ -1470,11 +1486,13 @@ export async function processContentAndGenerateOpportunities(context) { log.info(`${LOG_PREFIX} Found ${urlsToCheck.length} URLs from scrape results`); } else { /* c8 ignore start */ - // Fetch agentic URLs only for URL list fallback - try { - agenticUrls = await getTopAgenticUrls(site, context); - } catch (e) { - log.warn(`${LOG_PREFIX} Failed to fetch agentic URLs for fallback: ${e.message}. baseUrl=${site.getBaseURL()}`); + // Fetch agentic URLs for URL list fallback (skipped for Slack-triggered runs) + if (!isSlackTriggered) { + try { + agenticUrls = await getTopAgenticUrls(site, context); + } catch (e) { + log.warn(`${LOG_PREFIX} Failed to fetch agentic URLs for fallback: ${e.message}. baseUrl=${site.getBaseURL()}`); + } } // Load top organic pages cache for fallback merging @@ -1522,7 +1540,6 @@ export async function processContentAndGenerateOpportunities(context) { log.info(`${LOG_PREFIX} Found ${urlsNeedingPrerender.length}/${successfulComparisons.length} URLs needing prerender from total ${urlsToCheck.length} URLs scraped. isPaidLLMOCustomer=${isPaid}`); - const { auditContext } = context; const { scrapeJobId } = auditContext || {}; // getScrapeJobStats combines 403s from COMPLETE-status URLs (already in comparisonResults) // and FAILED-status URLs (absent from comparisonResults, fetched from ScrapeUrl table). @@ -1672,7 +1689,6 @@ export async function processContentAndGenerateOpportunities(context) { }; // Upload status.json on error so UI can show audit status via S3 fallback - const { auditContext } = context; await uploadStatusSummaryToS3(site.getBaseURL(), { siteId, auditId: audit.getId(), diff --git a/test/audits/prerender/handler.test.js b/test/audits/prerender/handler.test.js index 2e3058d810..8ce4f49e34 100644 --- a/test/audits/prerender/handler.test.js +++ b/test/audits/prerender/handler.test.js @@ -1028,6 +1028,106 @@ describe('Prerender Audit', () => { }); + it('should include organic URLs even when all are in the recency window when triggered from Slack', async () => { + const athenaStub = sandbox.stub().resolves([]); + const mockHandler = await esmock('../../../src/prerender/handler.js', { + '../../../src/utils/agentic-urls.js': { + getTopAgenticUrlsFromAthena: athenaStub, + }, + }); + + const context = { + site: { + getId: () => 'site-1', + getBaseURL: () => 'https://example.com', + getConfig: () => ({ getIncludedURLs: () => [] }), + }, + auditContext: { slackContext: { channelId: 'C123', threadTs: '1.0' } }, + dataAccess: { + SiteTopPage: { + allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([ + { getUrl: () => 'https://example.com/organic-page-1' }, + { getUrl: () => 'https://example.com/organic-page-2' }, + ]), + }, + }, + log: { info: sandbox.stub(), warn: sandbox.stub(), debug: sandbox.stub() }, + env: {}, + }; + + const result = await mockHandler.submitForScraping(context); + + // Both URLs must be present even though they would be "recent" in a scheduled run + expect(result.urls).to.deep.equal([ + { url: 'https://example.com/organic-page-1' }, + { url: 'https://example.com/organic-page-2' }, + ]); + }); + + it('should not fetch agentic URLs when triggered from Slack', async () => { + const athenaStub = sandbox.stub().resolves(['https://example.com/agentic-1']); + const mockHandler = await esmock('../../../src/prerender/handler.js', { + '../../../src/utils/agentic-urls.js': { + getTopAgenticUrlsFromAthena: athenaStub, + }, + }); + + const context = { + site: { + getId: () => 'site-1', + getBaseURL: () => 'https://example.com', + getConfig: () => ({ getIncludedURLs: () => [] }), + }, + auditContext: { slackContext: { channelId: 'C123', threadTs: '1.0' } }, + dataAccess: { + SiteTopPage: { + allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([ + { getUrl: () => 'https://example.com/organic-page-1' }, + { getUrl: () => 'https://example.com/organic-page-2' }, + ]), + }, + }, + log: { info: sandbox.stub(), warn: sandbox.stub(), debug: sandbox.stub() }, + env: {}, + }; + + const result = await mockHandler.submitForScraping(context); + + expect(athenaStub).to.not.have.been.called; + expect(result.urls).to.deep.equal([ + { url: 'https://example.com/organic-page-1' }, + { url: 'https://example.com/organic-page-2' }, + ]); + }); + + it('should still fetch agentic URLs for scheduled (non-Slack) runs', async () => { + const athenaStub = sandbox.stub().resolves(['https://example.com/agentic-1']); + const mockHandler = await esmock('../../../src/prerender/handler.js', { + '../../../src/utils/agentic-urls.js': { + getTopAgenticUrlsFromAthena: athenaStub, + }, + }); + + const context = { + site: { + getId: () => 'site-1', + getBaseURL: () => 'https://example.com', + getConfig: () => ({ getIncludedURLs: () => [] }), + }, + dataAccess: { + SiteTopPage: { + allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([]), + }, + PageCitability: { allByIndexKeys: sandbox.stub().resolves([]) }, + }, + log: { info: sandbox.stub(), warn: sandbox.stub(), debug: sandbox.stub() }, + env: {}, + }; + + await mockHandler.submitForScraping(context); + + expect(athenaStub).to.have.been.called; + }); }); @@ -1237,6 +1337,51 @@ describe('Prerender Audit', () => { expect(context.log.info).to.have.been.calledWith('Prerender - No URLs found for comparison. baseUrl=https://example.com, siteId=test-site-id'); }); + it('should not fetch agentic URLs in fallback path when triggered from Slack', async () => { + const athenaStub = sandbox.stub().resolves(['https://example.com/agentic-1']); + const mockHandler = await esmock('../../../src/prerender/handler.js', { + '../../../src/utils/agentic-urls.js': { + getTopAgenticUrlsFromAthena: athenaStub, + getPreferredBaseUrl: () => 'https://example.com', + }, + }); + + const context = { + site: { + getId: () => 'test-site-id', + getBaseURL: () => 'https://example.com', + getConfig: () => ({ getIncludedURLs: () => [] }), + }, + audit: { getId: () => 'audit-id' }, + dataAccess: { + SiteTopPage: { + allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([ + { getUrl: () => 'https://example.com/organic-1', getTraffic: () => 100 }, + ]), + }, + Opportunity: { allBySiteIdAndStatus: sandbox.stub().resolves([]) }, + LatestAudit: { updateByKeys: sandbox.stub().resolves() }, + }, + log: { + info: sandbox.stub(), + debug: sandbox.stub(), + warn: sandbox.stub(), + error: sandbox.stub(), + }, + scrapeResultPaths: new Map(), // No scrape results → triggers fallback path + s3Client: { send: sandbox.stub().rejects(new Error('No S3 data')) }, + env: { S3_SCRAPER_BUCKET_NAME: 'test-bucket' }, + auditContext: { + scrapeJobId: 'test-job-id', + slackContext: { channelId: 'C123', threadTs: '1.0' }, + }, + }; + + await mockHandler.processContentAndGenerateOpportunities(context); + + expect(athenaStub).to.not.have.been.called; + }); + it('should trigger opportunity processing path when prerender is detected', async () => { // This test covers line 341 by ensuring the full opportunity processing flow executes const mockOpportunity = {