Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 83 additions & 67 deletions src/prerender/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,21 @@ async function getRecentlyProcessedPathnames(context, siteId) {
}
}

/**
 * Returns true when the URL's pathname is NOT in the set of recently processed pathnames.
 * URLs that cannot be parsed are treated as not recent (included by default).
 * @param {string} url
 * @param {Set<string>} recentPathnames
 * @returns {boolean}
 */
function isNotRecentUrl(url, recentPathnames) {
  let pathname;
  try {
    ({ pathname } = new URL(url));
  } catch {
    // Unparseable URL: never treat as "recent", so it stays eligible for processing.
    return true;
  }
  return !recentPathnames.has(pathname);
}

function normalizePathname(url) {
try {
const { pathname } = new URL(url);
Expand Down Expand Up @@ -728,21 +743,6 @@ export async function importTopPages(context) {
};
}

/**
 * Returns true when the URL's pathname is NOT in the set of recently processed pathnames.
 * URLs that cannot be parsed are treated as not recent (included by default).
 * @param {string} url
 * @param {Set<string>} recentPathnames
 * @returns {boolean}
 */
function isNotRecentUrl(url, recentPathnames) {
try {
// Match on pathname only so www/non-www and query-string variants of the
// same page compare equal against the recently-processed set.
return !recentPathnames.has(new URL(url).pathname);
} catch {
// new URL() threw: unparseable URLs are kept eligible (treated as not recent).
return true;
}
}

/**
* Step 2: Submit URLs for scraping OR skip if in ai-only mode
* @param {Object} context - Audit context with site and dataAccess
Expand Down Expand Up @@ -793,65 +793,80 @@ export async function submitForScraping(context) {
}

const topPagesUrls = await getTopOrganicUrlsFromSeo(context);
// getTopAgenticUrls internally handles errors and returns [] on failure
const agenticUrls = await getTopAgenticUrls(site, context);

const preferredBase = getPreferredBaseUrl(site, context);
const rebasedTopPagesUrls = topPagesUrls.map((url) => rebaseUrl(url, preferredBase, log));
const rebasedIncludedURLs = ((await site?.getConfig?.()?.getIncludedURLs?.(AUDIT_TYPE)) || [])
.map((url) => rebaseUrl(url, preferredBase, log));

// Daily batching: filter URLs recently processed within the rolling recent window
const recentPathnames = await getRecentlyProcessedPathnames(context, siteId);

const filteredOrganicUrls = rebasedTopPagesUrls
.filter((url) => isNotRecentUrl(url, recentPathnames));
const filteredIncludedURLs = rebasedIncludedURLs
.filter((url) => isNotRecentUrl(url, recentPathnames));
const filteredAgenticUrls = agenticUrls.filter((url) => isNotRecentUrl(url, recentPathnames));

const hasRecentOrganic = filteredOrganicUrls.length !== topPagesUrls.length;
const isFirstRunOfCycle = !hasRecentOrganic;

// Build a single ordered queue across all URL sources and slice the next daily batch
// after removing anything processed within the recent window.
const orderedCandidateUrls = [
...filteredOrganicUrls,
...filteredIncludedURLs,
...filteredAgenticUrls,
];
const batchedUrls = orderedCandidateUrls.slice(0, DAILY_BATCH_SIZE);

const organicUrlSet = new Set(filteredOrganicUrls);
const includedUrlSet = new Set(filteredIncludedURLs);
const batchedOrganicUrls = batchedUrls.filter((url) => organicUrlSet.has(url));
const batchedIncludedURLs = batchedUrls.filter((url) => includedUrlSet.has(url));
const batchedAgenticUrls = batchedUrls.filter(
(url) => !organicUrlSet.has(url) && !includedUrlSet.has(url),
);

// Merge URLs ensuring uniqueness while handling www vs non-www differences
// Also filters out non-HTML URLs (PDFs, images, etc.) in a single pass
const { urls: finalUrls, filteredCount } = mergeAndGetUniqueHtmlUrls(batchedUrls);
// When triggered from Slack, skip agentic sources and daily batching
const isSlackTriggered = !!(auditContext?.slackContext?.channelId);

let finalUrls;
let filteredCount;
let agenticUrlsCount = 0;
let currentAgentic = 0;
let currentOrganic;
let currentIncludedUrls;
let isFirstRunOfCycle;
let agenticNewThisCycle = 0;

if (isSlackTriggered) {
({ urls: finalUrls, filteredCount } = mergeAndGetUniqueHtmlUrls([
...rebasedTopPagesUrls,
...rebasedIncludedURLs,
]));
currentOrganic = rebasedTopPagesUrls.length;
currentIncludedUrls = rebasedIncludedURLs.length;
isFirstRunOfCycle = true;
} else {
// getTopAgenticUrls internally handles errors and returns [] on failure
const agenticUrls = await getTopAgenticUrls(site, context);
agenticUrlsCount = agenticUrls.length;

// Daily batching: filter URLs recently processed within the rolling recent window
const recentPathnames = await getRecentlyProcessedPathnames(context, siteId);

const filteredOrganicUrls = rebasedTopPagesUrls
.filter((url) => isNotRecentUrl(url, recentPathnames));
const filteredIncludedURLs = rebasedIncludedURLs
.filter((url) => isNotRecentUrl(url, recentPathnames));
const filteredAgenticUrls = agenticUrls.filter((url) => isNotRecentUrl(url, recentPathnames));

const hasRecentOrganic = filteredOrganicUrls.length !== topPagesUrls.length;
isFirstRunOfCycle = !hasRecentOrganic;
agenticNewThisCycle = filteredAgenticUrls.length;

const orderedCandidateUrls = [
...filteredOrganicUrls,
...filteredIncludedURLs,
...filteredAgenticUrls,
];
const batchedUrls = orderedCandidateUrls.slice(0, DAILY_BATCH_SIZE);

const organicUrlSet = new Set(filteredOrganicUrls);
const includedUrlSet = new Set(filteredIncludedURLs);
currentOrganic = batchedUrls.filter((url) => organicUrlSet.has(url)).length;
currentIncludedUrls = batchedUrls.filter((url) => includedUrlSet.has(url)).length;
currentAgentic = batchedUrls.filter(
(url) => !organicUrlSet.has(url) && !includedUrlSet.has(url),
).length;

const currentAgentic = batchedAgenticUrls.length;
const currentOrganic = batchedOrganicUrls.length;
const currentIncludedUrls = batchedIncludedURLs.length;
({ urls: finalUrls, filteredCount } = mergeAndGetUniqueHtmlUrls(batchedUrls));
}

log.info(`${LOG_PREFIX}
prerender_submit_scraping_metrics:
log.info(`${LOG_PREFIX} prerender_submit_scraping_metrics:
submittedUrls=${finalUrls.length},
agenticUrls=${agenticUrls.length},
agenticUrls=${agenticUrlsCount},
topPagesUrls=${topPagesUrls.length},
includedURLs=${rebasedIncludedURLs.length},
filteredOutUrls=${filteredCount},
currentAgentic=${currentAgentic},
currentOrganic=${currentOrganic},
currentIncludedUrls=${currentIncludedUrls},
isFirstRunOfCycle=${isFirstRunOfCycle},
agenticNewThisCycle=${filteredAgenticUrls.length},
agenticNewThisCycle=${agenticNewThisCycle},
baseUrl=${site.getBaseURL()},
siteId=${siteId},`);
siteId=${siteId}`);

if (finalUrls.length === 0) {
// Fallback to base URL if no URLs found
Expand All @@ -862,7 +877,7 @@ export async function submitForScraping(context) {

return {
urls: finalUrls.map((url) => ({ url })),
siteId: site.getId(),
siteId,
processingType: AUDIT_TYPE,
maxScrapeAge: 0,
options: {
Expand Down Expand Up @@ -1422,7 +1437,7 @@ export async function getScrapeJobStats(
*/
export async function processContentAndGenerateOpportunities(context) {
const {
site, audit, log, scrapeResultPaths, data, dataAccess,
site, audit, log, scrapeResultPaths, data, dataAccess, auditContext,
} = context;

// Check for AI-only mode - skip processing step (step 1 already triggered Mystique)
Expand All @@ -1434,6 +1449,7 @@ export async function processContentAndGenerateOpportunities(context) {

const siteId = site.getId();
const startTime = process.hrtime();
const isSlackTriggered = !!(auditContext?.slackContext?.channelId);

// Check if this is a paid LLMO customer early so we can use it in all logs
const isPaid = await isPaidLLMOCustomer(context);
Expand All @@ -1451,11 +1467,13 @@ export async function processContentAndGenerateOpportunities(context) {
log.info(`${LOG_PREFIX} Found ${urlsToCheck.length} URLs from scrape results`);
} else {
/* c8 ignore start */
// Fetch agentic URLs only for URL list fallback
try {
agenticUrls = await getTopAgenticUrls(site, context);
} catch (e) {
log.warn(`${LOG_PREFIX} Failed to fetch agentic URLs for fallback: ${e.message}. baseUrl=${site.getBaseURL()}`);
// Fetch agentic URLs for URL list fallback (skipped for Slack-triggered runs)
if (!isSlackTriggered) {
try {
agenticUrls = await getTopAgenticUrls(site, context);
} catch (e) {
log.warn(`${LOG_PREFIX} Failed to fetch agentic URLs for fallback: ${e.message}. baseUrl=${site.getBaseURL()}`);
}
}

// Load top organic pages cache for fallback merging
Expand Down Expand Up @@ -1503,7 +1521,6 @@ export async function processContentAndGenerateOpportunities(context) {

log.info(`${LOG_PREFIX} Found ${urlsNeedingPrerender.length}/${successfulComparisons.length} URLs needing prerender from total ${urlsToCheck.length} URLs scraped. isPaidLLMOCustomer=${isPaid}`);

const { auditContext } = context;
const { scrapeJobId } = auditContext || {};
// getScrapeJobStats combines 403s from COMPLETE-status URLs (already in comparisonResults)
// and FAILED-status URLs (absent from comparisonResults, fetched from ScrapeUrl table).
Expand Down Expand Up @@ -1653,7 +1670,6 @@ export async function processContentAndGenerateOpportunities(context) {
};

// Upload status.json on error so UI can show audit status via S3 fallback
const { auditContext } = context;
await uploadStatusSummaryToS3(site.getBaseURL(), {
siteId,
auditId: audit.getId(),
Expand Down
145 changes: 145 additions & 0 deletions test/audits/prerender/handler.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,106 @@ describe('Prerender Audit', () => {

});

// Slack-triggered runs (auditContext.slackContext.channelId present) must bypass
// the daily recency filter: all organic URLs are submitted, none dropped as "recent".
it('should include organic URLs even when all are in the recency window when triggered from Slack', async () => {
// Athena returns no agentic URLs; stubbed so the handler's import is controlled.
const athenaStub = sandbox.stub().resolves([]);
const mockHandler = await esmock('../../../src/prerender/handler.js', {
'../../../src/utils/agentic-urls.js': {
getTopAgenticUrlsFromAthena: athenaStub,
},
});

const context = {
site: {
getId: () => 'site-1',
getBaseURL: () => 'https://example.com',
getConfig: () => ({ getIncludedURLs: () => [] }),
},
// channelId marks this run as Slack-triggered.
auditContext: { slackContext: { channelId: 'C123', threadTs: '1.0' } },
dataAccess: {
SiteTopPage: {
allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([
{ getUrl: () => 'https://example.com/organic-page-1' },
{ getUrl: () => 'https://example.com/organic-page-2' },
]),
},
},
log: { info: sandbox.stub(), warn: sandbox.stub(), debug: sandbox.stub() },
env: {},
};

const result = await mockHandler.submitForScraping(context);

// Both URLs must be present even though they would be "recent" in a scheduled run
expect(result.urls).to.deep.equal([
{ url: 'https://example.com/organic-page-1' },
{ url: 'https://example.com/organic-page-2' },
]);
});

// Slack-triggered runs skip the agentic-URL source entirely: the Athena stub
// must never be invoked, and only organic URLs are returned.
it('should not fetch agentic URLs when triggered from Slack', async () => {
const athenaStub = sandbox.stub().resolves(['https://example.com/agentic-1']);
const mockHandler = await esmock('../../../src/prerender/handler.js', {
'../../../src/utils/agentic-urls.js': {
getTopAgenticUrlsFromAthena: athenaStub,
},
});

const context = {
site: {
getId: () => 'site-1',
getBaseURL: () => 'https://example.com',
getConfig: () => ({ getIncludedURLs: () => [] }),
},
// channelId marks this run as Slack-triggered.
auditContext: { slackContext: { channelId: 'C123', threadTs: '1.0' } },
dataAccess: {
SiteTopPage: {
allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([
{ getUrl: () => 'https://example.com/organic-page-1' },
{ getUrl: () => 'https://example.com/organic-page-2' },
]),
},
},
log: { info: sandbox.stub(), warn: sandbox.stub(), debug: sandbox.stub() },
env: {},
};

const result = await mockHandler.submitForScraping(context);

// Agentic source untouched; result contains only the organic URLs.
expect(athenaStub).to.not.have.been.called;
expect(result.urls).to.deep.equal([
{ url: 'https://example.com/organic-page-1' },
{ url: 'https://example.com/organic-page-2' },
]);
});

// Counterpart to the Slack test: with no slackContext in auditContext, the
// scheduled path still queries Athena for agentic URLs.
it('should still fetch agentic URLs for scheduled (non-Slack) runs', async () => {
const athenaStub = sandbox.stub().resolves(['https://example.com/agentic-1']);
const mockHandler = await esmock('../../../src/prerender/handler.js', {
'../../../src/utils/agentic-urls.js': {
getTopAgenticUrlsFromAthena: athenaStub,
},
});

// Note: no auditContext here, so the handler treats this as a scheduled run.
const context = {
site: {
getId: () => 'site-1',
getBaseURL: () => 'https://example.com',
getConfig: () => ({ getIncludedURLs: () => [] }),
},
dataAccess: {
SiteTopPage: {
allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([]),
},
PageCitability: { allByIndexKeys: sandbox.stub().resolves([]) },
},
log: { info: sandbox.stub(), warn: sandbox.stub(), debug: sandbox.stub() },
env: {},
};

await mockHandler.submitForScraping(context);

expect(athenaStub).to.have.been.called;
});

});

Expand Down Expand Up @@ -1237,6 +1337,51 @@ describe('Prerender Audit', () => {
expect(context.log.info).to.have.been.calledWith('Prerender - No URLs found for comparison. baseUrl=https://example.com, siteId=test-site-id');
});

// processContentAndGenerateOpportunities falls back to building a URL list when
// scrapeResultPaths is empty; for Slack-triggered runs that fallback must not
// fetch agentic URLs from Athena.
it('should not fetch agentic URLs in fallback path when triggered from Slack', async () => {
const athenaStub = sandbox.stub().resolves(['https://example.com/agentic-1']);
const mockHandler = await esmock('../../../src/prerender/handler.js', {
'../../../src/utils/agentic-urls.js': {
getTopAgenticUrlsFromAthena: athenaStub,
getPreferredBaseUrl: () => 'https://example.com',
},
});

const context = {
site: {
getId: () => 'test-site-id',
getBaseURL: () => 'https://example.com',
getConfig: () => ({ getIncludedURLs: () => [] }),
},
audit: { getId: () => 'audit-id' },
dataAccess: {
SiteTopPage: {
allBySiteIdAndSourceAndGeo: sandbox.stub().resolves([
{ getUrl: () => 'https://example.com/organic-1', getTraffic: () => 100 },
]),
},
Opportunity: { allBySiteIdAndStatus: sandbox.stub().resolves([]) },
LatestAudit: { updateByKeys: sandbox.stub().resolves() },
},
log: {
info: sandbox.stub(),
debug: sandbox.stub(),
warn: sandbox.stub(),
error: sandbox.stub(),
},
scrapeResultPaths: new Map(), // No scrape results → triggers fallback path
// S3 cache miss forces the fallback to rely solely on its URL sources.
s3Client: { send: sandbox.stub().rejects(new Error('No S3 data')) },
env: { S3_SCRAPER_BUCKET_NAME: 'test-bucket' },
auditContext: {
scrapeJobId: 'test-job-id',
// channelId marks this run as Slack-triggered.
slackContext: { channelId: 'C123', threadTs: '1.0' },
},
};

await mockHandler.processContentAndGenerateOpportunities(context);

expect(athenaStub).to.not.have.been.called;
});

it('should trigger opportunity processing path when prerender is detected', async () => {
// This test covers line 341 by ensuring the full opportunity processing flow executes
const mockOpportunity = {
Expand Down
Loading