diff --git a/src/canonical/handler.js b/src/canonical/handler.js index 6d68d423d2..0d2cb46bde 100644 --- a/src/canonical/handler.js +++ b/src/canonical/handler.js @@ -364,6 +364,58 @@ export function isSelfReferencing(url1, url2) { return normalizePath(url1) === normalizePath(url2); } +/** + * Validates the structure and types of canonical metadata returned by the scraper. + * Returns false (and logs a warning) for any of these conditions: + * - metadata is missing or not a plain object + * - any of exists / count / inHead keys are absent + * - exists or inHead are not booleans + * - count is not a number + * - href key is absent when exists is true + * + * @param {*} metadata - The canonical metadata object from the scrape result. + * @param {object} log - Logger. + * @param {string} key - S3 key (used in log messages). + * @returns {boolean} True if the metadata is structurally valid, false otherwise. + */ +export function isValidCanonicalMetadata(metadata, log, key) { + if (!metadata || typeof metadata !== 'object') { + log.warn(`[canonical] No canonical metadata in S3 object: ${key}, skipping page`); + return false; + } + + const requiredFields = ['exists', 'count', 'inHead']; + for (const field of requiredFields) { + if (!(field in metadata)) { + log.warn(`[canonical] Canonical metadata missing required field '${field}' in ${key}, skipping page`); + return false; + } + } + + if (typeof metadata.exists !== 'boolean') { + log.warn(`[canonical] Canonical metadata 'exists' is not a boolean in ${key}, skipping page`); + return false; + } + + if (typeof metadata.count !== 'number') { + log.warn(`[canonical] Canonical metadata 'count' is not a number in ${key}, skipping page`); + return false; + } + + if (typeof metadata.inHead !== 'boolean') { + log.warn(`[canonical] Canonical metadata 'inHead' is not a boolean in ${key}, skipping page`); + return false; + } + + // href is only required when exists is true; a page with no canonical tag won't have one + if (metadata.exists && !('href' in metadata)) { + log.warn(`[canonical] Canonical metadata 'href' is absent when exists=true in ${key}, skipping page`); + return false; + } + + return true; +} + /** * Generates a suggestion for fixing a canonical issue based on the check type. * @@ -447,16 +499,9 @@ export async function processScrapedContent(context) { return null; } - if (!scrapedObject?.scrapeResult?.canonical) { - log.warn(`[canonical] No canonical metadata in S3 object: ${key}`); - return { - url: finalUrl, - checks: [{ - check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check, - success: false, - explanation: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.explanation, - }], - }; + const canonicalMetadata = scrapedObject?.scrapeResult?.canonical; + if (!isValidCanonicalMetadata(canonicalMetadata, log, key)) { + return null; } // Filter out scraped pages that redirected to auth/login pages or PDFs @@ -475,9 +520,6 @@ export async function processScrapedContent(context) { log.info(`[canonical] Skipping ${finalUrl} - disallowed by robots.txt`); return null; } - - // Use canonical metadata already extracted by the scraper (Puppeteer) - const canonicalMetadata = scrapedObject.scrapeResult.canonical; const canonicalUrl = canonicalMetadata.href || null; const canonicalTagChecks = []; diff --git a/test/audits/canonical.test.js b/test/audits/canonical.test.js index a7c95eaf58..d8fa78a89c 100755 --- a/test/audits/canonical.test.js +++ b/test/audits/canonical.test.js @@ -27,6 +27,7 @@ import canonicalAudit, { processScrapedContent, getPreviewAuthOptions, isSelfReferencing, + isValidCanonicalMetadata, } from '../../src/canonical/handler.js'; import { getTopPagesForSiteId } from '../../src/utils/data-access.js'; import { CANONICAL_CHECKS } from '../../src/canonical/constants.js'; @@ -1439,6 +1440,76 @@ describe('Canonical URL Tests', () => { }); }); + describe('isValidCanonicalMetadata', () => { + const log = { warn: sinon.stub() }; + const key = 'scrapes/test/page/scrape.json'; + const validMetadata = { + exists: true, count: 1, href: 'https://example.com/', inHead: true, + }; + + beforeEach(() => log.warn.resetHistory()); + + it('should return false for null metadata', () => { + expect(isValidCanonicalMetadata(null, log, key)).to.be.false; + expect(log.warn).to.have.been.calledWithMatch(/No canonical metadata/); + }); + + it('should return false for non-object metadata', () => { + expect(isValidCanonicalMetadata('string', log, key)).to.be.false; + }); + + it('should return false when exists key is absent', () => { + const { exists: _, ...meta } = validMetadata; + expect(isValidCanonicalMetadata(meta, log, key)).to.be.false; + expect(log.warn).to.have.been.calledWithMatch(/missing required field 'exists'/); + }); + + it('should return false when count key is absent', () => { + const { count: _, ...meta } = validMetadata; + expect(isValidCanonicalMetadata(meta, log, key)).to.be.false; + expect(log.warn).to.have.been.calledWithMatch(/missing required field 'count'/); + }); + + it('should return false when inHead key is absent', () => { + const { inHead: _, ...meta } = validMetadata; + expect(isValidCanonicalMetadata(meta, log, key)).to.be.false; + expect(log.warn).to.have.been.calledWithMatch(/missing required field 'inHead'/); + }); + + it('should return false when exists is not a boolean', () => { + expect(isValidCanonicalMetadata({ ...validMetadata, exists: '' }, log, key)).to.be.false; + expect(log.warn).to.have.been.calledWithMatch(/'exists' is not a boolean/); + }); + + it('should return false when count is not a number', () => { + expect(isValidCanonicalMetadata({ ...validMetadata, count: '' }, log, key)).to.be.false; + expect(log.warn).to.have.been.calledWithMatch(/'count' is not a number/); + }); + + it('should return false when inHead is not a boolean', () => { + expect(isValidCanonicalMetadata({ ...validMetadata, inHead: '' }, log, key)).to.be.false; + expect(log.warn).to.have.been.calledWithMatch(/'inHead' is not a boolean/); + }); + + it('should return false when exists is true but href key is absent', () => { + const { href: _, ...meta } = validMetadata; + expect(isValidCanonicalMetadata(meta, log, key)).to.be.false; + expect(log.warn).to.have.been.calledWithMatch(/'href' is absent when exists=true/); + }); + + it('should return true for valid metadata with exists=true', () => { + expect(isValidCanonicalMetadata(validMetadata, log, key)).to.be.true; + expect(log.warn).to.not.have.been.called; + }); + + it('should return true for valid metadata with exists=false and no href', () => { + expect(isValidCanonicalMetadata({ + exists: false, count: 0, inHead: false, + }, log, key)).to.be.true; + expect(log.warn).to.not.have.been.called; + }); + }); + describe('createOpportunityData', () => { it('should return canonical opportunity data with correct structure', () => { const result = createOpportunityData();