Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
17 commits
Select commit. Hold Shift + click to select a range.
61255c8
docs: add prerender organic mode design spec
ssilare-adobe Apr 6, 2026
b53493e
docs: add prerender organic mode implementation plan
ssilare-adobe Apr 6, 2026
5eb1df9
feat(prerender): add organic-only mode to skip agentic URL sources
ssilare-adobe Apr 6, 2026
649aebf
test(prerender): remove unnecessary getPreferredBaseUrl mock in organ…
ssilare-adobe Apr 6, 2026
a59756f
fix(prerender): bypass recency filter and batching in organic mode
ssilare-adobe Apr 6, 2026
ab99f13
Merge branch 'main' into feat/prerender-organic-mode
ssilare-adobe Apr 6, 2026
cf02794
feat(prerender): detect slack-triggered runs and use organic+included…
ssilare-adobe Apr 6, 2026
f33f578
Merge branch 'main' into feat/prerender-organic-mode
ssilare-adobe Apr 6, 2026
5659973
fix(prerender): guard agentic URL fallback in step 3 for Slack-trigge…
ssilare-adobe Apr 6, 2026
eefe10e
Merge branch 'main' into feat/prerender-organic-mode
ssilare-adobe Apr 6, 2026
b550078
chore(prerender): remove docs, restore agenticNewThisCycle metric
sahil9001 Apr 7, 2026
59e8b64
Merge branch 'main' into feat/prerender-organic-mode
ssilare-adobe Apr 7, 2026
c69e0e9
Merge branch 'main' into feat/prerender-organic-mode
ssilare-adobe Apr 7, 2026
9742244
Merge branch 'main' into feat/prerender-organic-mode
ssilare-adobe Apr 7, 2026
8ec0464
Merge branch 'main' into feat/prerender-organic-mode
ssilare-adobe Apr 8, 2026
f0e7fe8
Merge branch 'main' into feat/prerender-organic-mode
ssilare-adobe Apr 8, 2026
9436316
Merge branch 'main' into feat/prerender-organic-mode
ssilare-adobe Apr 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 83 additions & 67 deletions src/prerender/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,21 @@ async function getRecentlyProcessedPathnames(context, siteId) {
}
}

/**
 * Checks whether a URL should still be processed, i.e. its pathname does not
 * appear in the set of recently processed pathnames.
 * URLs that cannot be parsed are considered not recent (included by default).
 * @param {string} url - Absolute URL to test.
 * @param {Set<string>} recentPathnames - Pathnames processed within the recent window.
 * @returns {boolean} true when the URL was NOT recently processed.
 */
function isNotRecentUrl(url, recentPathnames) {
  let pathname;
  try {
    ({ pathname } = new URL(url));
  } catch {
    // Malformed URL: treat as never processed and keep it in the queue.
    return true;
  }
  return !recentPathnames.has(pathname);
}

function normalizePathname(url) {
try {
const { pathname } = new URL(url);
Expand Down Expand Up @@ -728,21 +743,6 @@ export async function importTopPages(context) {
};
}

/**
 * Predicate: true when the URL's pathname has not been processed within the
 * recent window. URLs that fail to parse are included by default (treated as
 * not recent).
 * @param {string} url
 * @param {Set<string>} recentPathnames
 * @returns {boolean}
 */
function isNotRecentUrl(url, recentPathnames) {
  try {
    const { pathname } = new URL(url);
    return recentPathnames.has(pathname) === false;
  } catch {
    // Unparseable input cannot match any recorded pathname.
    return true;
  }
}

/**
* Step 2: Submit URLs for scraping OR skip if in ai-only mode
* @param {Object} context - Audit context with site and dataAccess
Expand Down Expand Up @@ -793,65 +793,80 @@ export async function submitForScraping(context) {
}

const topPagesUrls = await getTopOrganicUrlsFromSeo(context);
// getTopAgenticUrls internally handles errors and returns [] on failure
const agenticUrls = await getTopAgenticUrls(site, context);

const preferredBase = getPreferredBaseUrl(site, context);
const rebasedTopPagesUrls = topPagesUrls.map((url) => rebaseUrl(url, preferredBase, log));
const rebasedIncludedURLs = ((await site?.getConfig?.()?.getIncludedURLs?.(AUDIT_TYPE)) || [])
.map((url) => rebaseUrl(url, preferredBase, log));

// Daily batching: filter URLs recently processed within the rolling recent window
const recentPathnames = await getRecentlyProcessedPathnames(context, siteId);

const filteredOrganicUrls = rebasedTopPagesUrls
.filter((url) => isNotRecentUrl(url, recentPathnames));
const filteredIncludedURLs = rebasedIncludedURLs
.filter((url) => isNotRecentUrl(url, recentPathnames));
const filteredAgenticUrls = agenticUrls.filter((url) => isNotRecentUrl(url, recentPathnames));

const hasRecentOrganic = filteredOrganicUrls.length !== topPagesUrls.length;
const isFirstRunOfCycle = !hasRecentOrganic;

// Build a single ordered queue across all URL sources and slice the next daily batch
// after removing anything processed within the recent window.
const orderedCandidateUrls = [
...filteredOrganicUrls,
...filteredIncludedURLs,
...filteredAgenticUrls,
];
const batchedUrls = orderedCandidateUrls.slice(0, DAILY_BATCH_SIZE);

const organicUrlSet = new Set(filteredOrganicUrls);
const includedUrlSet = new Set(filteredIncludedURLs);
const batchedOrganicUrls = batchedUrls.filter((url) => organicUrlSet.has(url));
const batchedIncludedURLs = batchedUrls.filter((url) => includedUrlSet.has(url));
const batchedAgenticUrls = batchedUrls.filter(
(url) => !organicUrlSet.has(url) && !includedUrlSet.has(url),
);

// Merge URLs ensuring uniqueness while handling www vs non-www differences
// Also filters out non-HTML URLs (PDFs, images, etc.) in a single pass
const { urls: finalUrls, filteredCount } = mergeAndGetUniqueHtmlUrls(batchedUrls);
// When triggered from Slack, skip agentic sources and daily batching
const isSlackTriggered = !!(auditContext?.slackContext?.channelId);

let finalUrls;
let filteredCount;
let agenticUrlsCount = 0;
let currentAgentic = 0;
let currentOrganic;
let currentIncludedUrls;
let isFirstRunOfCycle;
let agenticNewThisCycle = 0;

if (isSlackTriggered) {
({ urls: finalUrls, filteredCount } = mergeAndGetUniqueHtmlUrls([
...rebasedTopPagesUrls,
...rebasedIncludedURLs,
]));
currentOrganic = rebasedTopPagesUrls.length;
currentIncludedUrls = rebasedIncludedURLs.length;
isFirstRunOfCycle = true;
} else {
// getTopAgenticUrls internally handles errors and returns [] on failure
const agenticUrls = await getTopAgenticUrls(site, context);
agenticUrlsCount = agenticUrls.length;

// Daily batching: filter URLs recently processed within the rolling recent window
const recentPathnames = await getRecentlyProcessedPathnames(context, siteId);

const filteredOrganicUrls = rebasedTopPagesUrls
.filter((url) => isNotRecentUrl(url, recentPathnames));
const filteredIncludedURLs = rebasedIncludedURLs
.filter((url) => isNotRecentUrl(url, recentPathnames));
const filteredAgenticUrls = agenticUrls.filter((url) => isNotRecentUrl(url, recentPathnames));

const hasRecentOrganic = filteredOrganicUrls.length !== topPagesUrls.length;
isFirstRunOfCycle = !hasRecentOrganic;
agenticNewThisCycle = filteredAgenticUrls.length;

const orderedCandidateUrls = [
...filteredOrganicUrls,
...filteredIncludedURLs,
...filteredAgenticUrls,
];
const batchedUrls = orderedCandidateUrls.slice(0, DAILY_BATCH_SIZE);

const organicUrlSet = new Set(filteredOrganicUrls);
const includedUrlSet = new Set(filteredIncludedURLs);
currentOrganic = batchedUrls.filter((url) => organicUrlSet.has(url)).length;
currentIncludedUrls = batchedUrls.filter((url) => includedUrlSet.has(url)).length;
currentAgentic = batchedUrls.filter(
(url) => !organicUrlSet.has(url) && !includedUrlSet.has(url),
).length;

const currentAgentic = batchedAgenticUrls.length;
const currentOrganic = batchedOrganicUrls.length;
const currentIncludedUrls = batchedIncludedURLs.length;
({ urls: finalUrls, filteredCount } = mergeAndGetUniqueHtmlUrls(batchedUrls));
}

log.info(`${LOG_PREFIX}
prerender_submit_scraping_metrics:
log.info(`${LOG_PREFIX} prerender_submit_scraping_metrics:
submittedUrls=${finalUrls.length},
agenticUrls=${agenticUrls.length},
agenticUrls=${agenticUrlsCount},
topPagesUrls=${topPagesUrls.length},
includedURLs=${rebasedIncludedURLs.length},
filteredOutUrls=${filteredCount},
currentAgentic=${currentAgentic},
currentOrganic=${currentOrganic},
currentIncludedUrls=${currentIncludedUrls},
isFirstRunOfCycle=${isFirstRunOfCycle},
agenticNewThisCycle=${filteredAgenticUrls.length},
agenticNewThisCycle=${agenticNewThisCycle},
baseUrl=${site.getBaseURL()},
siteId=${siteId},`);
siteId=${siteId}`);

if (finalUrls.length === 0) {
// Fallback to base URL if no URLs found
Expand All @@ -862,7 +877,7 @@ export async function submitForScraping(context) {

return {
urls: finalUrls.map((url) => ({ url })),
siteId: site.getId(),
siteId,
processingType: AUDIT_TYPE,
maxScrapeAge: 0,
options: {
Expand Down Expand Up @@ -1422,7 +1437,7 @@ export async function getScrapeJobStats(
*/
export async function processContentAndGenerateOpportunities(context) {
const {
site, audit, log, scrapeResultPaths, data, dataAccess,
site, audit, log, scrapeResultPaths, data, dataAccess, auditContext,
} = context;

// Check for AI-only mode - skip processing step (step 1 already triggered Mystique)
Expand All @@ -1434,6 +1449,7 @@ export async function processContentAndGenerateOpportunities(context) {

const siteId = site.getId();
const startTime = process.hrtime();
const isSlackTriggered = !!(auditContext?.slackContext?.channelId);

// Check if this is a paid LLMO customer early so we can use it in all logs
const isPaid = await isPaidLLMOCustomer(context);
Expand All @@ -1451,11 +1467,13 @@ export async function processContentAndGenerateOpportunities(context) {
log.info(`${LOG_PREFIX} Found ${urlsToCheck.length} URLs from scrape results`);
} else {
/* c8 ignore start */
// Fetch agentic URLs only for URL list fallback
try {
agenticUrls = await getTopAgenticUrls(site, context);
} catch (e) {
log.warn(`${LOG_PREFIX} Failed to fetch agentic URLs for fallback: ${e.message}. baseUrl=${site.getBaseURL()}`);
// Fetch agentic URLs for URL list fallback (skipped for Slack-triggered runs)
if (!isSlackTriggered) {
try {
agenticUrls = await getTopAgenticUrls(site, context);
} catch (e) {
log.warn(`${LOG_PREFIX} Failed to fetch agentic URLs for fallback: ${e.message}. baseUrl=${site.getBaseURL()}`);
}
}

// Load top organic pages cache for fallback merging
Expand Down Expand Up @@ -1503,7 +1521,6 @@ export async function processContentAndGenerateOpportunities(context) {

log.info(`${LOG_PREFIX} Found ${urlsNeedingPrerender.length}/${successfulComparisons.length} URLs needing prerender from total ${urlsToCheck.length} URLs scraped. isPaidLLMOCustomer=${isPaid}`);

const { auditContext } = context;
const { scrapeJobId } = auditContext || {};
// getScrapeJobStats combines 403s from COMPLETE-status URLs (already in comparisonResults)
// and FAILED-status URLs (absent from comparisonResults, fetched from ScrapeUrl table).
Expand Down Expand Up @@ -1653,7 +1670,6 @@ export async function processContentAndGenerateOpportunities(context) {
};

// Upload status.json on error so UI can show audit status via S3 fallback
const { auditContext } = context;
await uploadStatusSummaryToS3(site.getBaseURL(), {
siteId,
auditId: audit.getId(),
Expand Down
145 changes: 145 additions & 0 deletions test/audits/prerender/handler.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,106 @@ describe('Prerender Audit', () => {

});

// Verifies that a Slack-triggered run (detected via auditContext.slackContext.channelId)
// bypasses the recency filter: organic URLs are submitted even if a scheduled run
// would have excluded them as recently processed.
it('should include organic URLs even when all are in the recency window when triggered from Slack', async () => {
  // Athena source stubbed empty; agentic URLs are irrelevant to this scenario.
  const athenaStub = sandbox.stub().resolves([]);
  const mockHandler = await esmock('../../../src/prerender/handler.js', {
    '../../../src/utils/agentic-urls.js': {
      getTopAgenticUrlsFromAthena: athenaStub,
    },
  });

  const context = {
    site: {
      getId: () => 'site-1',
      getBaseURL: () => 'https://example.com',
      getConfig: () => ({ getIncludedURLs: () => [] }),
    },
    // Presence of slackContext.channelId marks this run as Slack-triggered.
    auditContext: { slackContext: { channelId: 'C123', threadTs: '1.0' } },
    dataAccess: {
      SiteTopPage: {
        allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([
          { getUrl: () => 'https://example.com/organic-page-1' },
          { getUrl: () => 'https://example.com/organic-page-2' },
        ]),
      },
    },
    log: { info: sandbox.stub(), warn: sandbox.stub(), debug: sandbox.stub() },
    env: {},
  };

  const result = await mockHandler.submitForScraping(context);

  // Both URLs must be present even though they would be "recent" in a scheduled run
  expect(result.urls).to.deep.equal([
    { url: 'https://example.com/organic-page-1' },
    { url: 'https://example.com/organic-page-2' },
  ]);
});

// Verifies that Slack-triggered runs never query the agentic URL source:
// the Athena stub resolves with a URL, but must not be invoked at all,
// and the result contains only the organic pages.
it('should not fetch agentic URLs when triggered from Slack', async () => {
  const athenaStub = sandbox.stub().resolves(['https://example.com/agentic-1']);
  const mockHandler = await esmock('../../../src/prerender/handler.js', {
    '../../../src/utils/agentic-urls.js': {
      getTopAgenticUrlsFromAthena: athenaStub,
    },
  });

  const context = {
    site: {
      getId: () => 'site-1',
      getBaseURL: () => 'https://example.com',
      getConfig: () => ({ getIncludedURLs: () => [] }),
    },
    // Slack trigger marker: channelId present on auditContext.slackContext.
    auditContext: { slackContext: { channelId: 'C123', threadTs: '1.0' } },
    dataAccess: {
      SiteTopPage: {
        allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([
          { getUrl: () => 'https://example.com/organic-page-1' },
          { getUrl: () => 'https://example.com/organic-page-2' },
        ]),
      },
    },
    log: { info: sandbox.stub(), warn: sandbox.stub(), debug: sandbox.stub() },
    env: {},
  };

  const result = await mockHandler.submitForScraping(context);

  // The agentic source must be skipped entirely, not merely filtered out.
  expect(athenaStub).to.not.have.been.called;
  expect(result.urls).to.deep.equal([
    { url: 'https://example.com/organic-page-1' },
    { url: 'https://example.com/organic-page-2' },
  ]);
});

// Counterpart to the Slack tests: with no slackContext on the context,
// the scheduled path still queries the agentic URL source.
it('should still fetch agentic URLs for scheduled (non-Slack) runs', async () => {
  const athenaStub = sandbox.stub().resolves(['https://example.com/agentic-1']);
  const mockHandler = await esmock('../../../src/prerender/handler.js', {
    '../../../src/utils/agentic-urls.js': {
      getTopAgenticUrlsFromAthena: athenaStub,
    },
  });

  // No auditContext here — this models a scheduled (non-Slack) invocation.
  const context = {
    site: {
      getId: () => 'site-1',
      getBaseURL: () => 'https://example.com',
      getConfig: () => ({ getIncludedURLs: () => [] }),
    },
    dataAccess: {
      SiteTopPage: {
        allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([]),
      },
      PageCitability: { allByIndexKeys: sandbox.stub().resolves([]) },
    },
    log: { info: sandbox.stub(), warn: sandbox.stub(), debug: sandbox.stub() },
    env: {},
  };

  await mockHandler.submitForScraping(context);

  expect(athenaStub).to.have.been.called;
});

});

Expand Down Expand Up @@ -1237,6 +1337,51 @@ describe('Prerender Audit', () => {
expect(context.log.info).to.have.been.calledWith('Prerender - No URLs found for comparison. baseUrl=https://example.com, siteId=test-site-id');
});

// Verifies the step-3 fallback path: when there are no scrape results and the
// run is Slack-triggered, the URL-list fallback must not fetch agentic URLs.
it('should not fetch agentic URLs in fallback path when triggered from Slack', async () => {
  const athenaStub = sandbox.stub().resolves(['https://example.com/agentic-1']);
  const mockHandler = await esmock('../../../src/prerender/handler.js', {
    '../../../src/utils/agentic-urls.js': {
      getTopAgenticUrlsFromAthena: athenaStub,
      getPreferredBaseUrl: () => 'https://example.com',
    },
  });

  const context = {
    site: {
      getId: () => 'test-site-id',
      getBaseURL: () => 'https://example.com',
      getConfig: () => ({ getIncludedURLs: () => [] }),
    },
    audit: { getId: () => 'audit-id' },
    dataAccess: {
      SiteTopPage: {
        allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([
          { getUrl: () => 'https://example.com/organic-1', getTraffic: () => 100 },
        ]),
      },
      Opportunity: { allBySiteIdAndStatus: sandbox.stub().resolves([]) },
      LatestAudit: { updateByKeys: sandbox.stub().resolves() },
    },
    log: {
      info: sandbox.stub(),
      debug: sandbox.stub(),
      warn: sandbox.stub(),
      error: sandbox.stub(),
    },
    scrapeResultPaths: new Map(), // No scrape results → triggers fallback path
    // S3 cache lookup also fails so the fallback cannot short-circuit there.
    s3Client: { send: sandbox.stub().rejects(new Error('No S3 data')) },
    env: { S3_SCRAPER_BUCKET_NAME: 'test-bucket' },
    auditContext: {
      scrapeJobId: 'test-job-id',
      // Slack trigger marker — must suppress the agentic fallback fetch.
      slackContext: { channelId: 'C123', threadTs: '1.0' },
    },
  };

  await mockHandler.processContentAndGenerateOpportunities(context);

  expect(athenaStub).to.not.have.been.called;
});

it('should trigger opportunity processing path when prerender is detected', async () => {
// This test covers line 341 by ensuring the full opportunity processing flow executes
const mockOpportunity = {
Expand Down
Loading