Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 55 additions & 13 deletions src/canonical/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,58 @@ export function isSelfReferencing(url1, url2) {
return normalizePath(url1) === normalizePath(url2);
}

/**
* Validates the structure and types of canonical metadata returned by the scraper.
* Returns false (and logs a warning) for any of these conditions:
* - metadata is missing or not a plain object
* - any of exists / count / inHead keys are absent
* - exists or inHead are not booleans
* - count is not a number
* - href key is absent when exists is true
*
* @param {*} metadata - The canonical metadata object from the scrape result.
* @param {object} log - Logger.
* @param {string} key - S3 key (used in log messages).
* @returns {boolean} True if the metadata is structurally valid, false otherwise.
*/
export function isValidCanonicalMetadata(metadata, log, key) {
if (!metadata || typeof metadata !== 'object') {
log.warn(`[canonical] No canonical metadata in S3 object: ${key}, skipping page`);
return false;
}

const requiredFields = ['exists', 'count', 'inHead'];
for (const field of requiredFields) {
if (!(field in metadata)) {
log.warn(`[canonical] Canonical metadata missing required field '${field}' in ${key}, skipping page`);
return false;
}
}

if (typeof metadata.exists !== 'boolean') {
log.warn(`[canonical] Canonical metadata 'exists' is not a boolean in ${key}, skipping page`);
return false;
}

if (typeof metadata.count !== 'number') {
log.warn(`[canonical] Canonical metadata 'count' is not a number in ${key}, skipping page`);
return false;
}

if (typeof metadata.inHead !== 'boolean') {
log.warn(`[canonical] Canonical metadata 'inHead' is not a boolean in ${key}, skipping page`);
return false;
}

// href is only required when exists is true; a page with no canonical tag won't have one
if (metadata.exists && !('href' in metadata)) {
log.warn(`[canonical] Canonical metadata 'href' is absent when exists=true in ${key}, skipping page`);
return false;
}

return true;
}

/**
* Generates a suggestion for fixing a canonical issue based on the check type.
*
Expand Down Expand Up @@ -447,16 +499,9 @@ export async function processScrapedContent(context) {
return null;
}

if (!scrapedObject?.scrapeResult?.canonical) {
log.warn(`[canonical] No canonical metadata in S3 object: ${key}`);
return {
url: finalUrl,
checks: [{
check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check,
success: false,
explanation: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.explanation,
}],
};
const canonicalMetadata = scrapedObject?.scrapeResult?.canonical;
if (!isValidCanonicalMetadata(canonicalMetadata, log, key)) {
return null;
}

// Filter out scraped pages that redirected to auth/login pages or PDFs
Expand All @@ -475,9 +520,6 @@ export async function processScrapedContent(context) {
log.info(`[canonical] Skipping ${finalUrl} - disallowed by robots.txt`);
return null;
}

// Use canonical metadata already extracted by the scraper (Puppeteer)
const canonicalMetadata = scrapedObject.scrapeResult.canonical;
const canonicalUrl = canonicalMetadata.href || null;
const canonicalTagChecks = [];

Expand Down
71 changes: 71 additions & 0 deletions test/audits/canonical.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import canonicalAudit, {
processScrapedContent,
getPreviewAuthOptions,
isSelfReferencing,
isValidCanonicalMetadata,
} from '../../src/canonical/handler.js';
import { getTopPagesForSiteId } from '../../src/utils/data-access.js';
import { CANONICAL_CHECKS } from '../../src/canonical/constants.js';
Expand Down Expand Up @@ -1439,6 +1440,76 @@ describe('Canonical URL Tests', () => {
});
});

describe('isValidCanonicalMetadata', () => {
const log = { warn: sinon.stub() };
const key = 'scrapes/test/page/scrape.json';
const validMetadata = {
exists: true, count: 1, href: 'https://example.com/', inHead: true,
};

beforeEach(() => log.warn.resetHistory());

it('should return false for null metadata', () => {
expect(isValidCanonicalMetadata(null, log, key)).to.be.false;
expect(log.warn).to.have.been.calledWithMatch(/No canonical metadata/);
});

it('should return false for non-object metadata', () => {
expect(isValidCanonicalMetadata('string', log, key)).to.be.false;
});

it('should return false when exists key is absent', () => {
const { exists: _, ...meta } = validMetadata;
expect(isValidCanonicalMetadata(meta, log, key)).to.be.false;
expect(log.warn).to.have.been.calledWithMatch(/missing required field 'exists'/);
});

it('should return false when count key is absent', () => {
const { count: _, ...meta } = validMetadata;
expect(isValidCanonicalMetadata(meta, log, key)).to.be.false;
expect(log.warn).to.have.been.calledWithMatch(/missing required field 'count'/);
});

it('should return false when inHead key is absent', () => {
const { inHead: _, ...meta } = validMetadata;
expect(isValidCanonicalMetadata(meta, log, key)).to.be.false;
expect(log.warn).to.have.been.calledWithMatch(/missing required field 'inHead'/);
});

it('should return false when exists is not a boolean', () => {
expect(isValidCanonicalMetadata({ ...validMetadata, exists: '' }, log, key)).to.be.false;
expect(log.warn).to.have.been.calledWithMatch(/'exists' is not a boolean/);
});

it('should return false when count is not a number', () => {
expect(isValidCanonicalMetadata({ ...validMetadata, count: '' }, log, key)).to.be.false;
expect(log.warn).to.have.been.calledWithMatch(/'count' is not a number/);
});

it('should return false when inHead is not a boolean', () => {
expect(isValidCanonicalMetadata({ ...validMetadata, inHead: '' }, log, key)).to.be.false;
expect(log.warn).to.have.been.calledWithMatch(/'inHead' is not a boolean/);
});

it('should return false when exists is true but href key is absent', () => {
const { href: _, ...meta } = validMetadata;
expect(isValidCanonicalMetadata(meta, log, key)).to.be.false;
expect(log.warn).to.have.been.calledWithMatch(/'href' is absent when exists=true/);
});

it('should return true for valid metadata with exists=true', () => {
expect(isValidCanonicalMetadata(validMetadata, log, key)).to.be.true;
expect(log.warn).to.not.have.been.called;
});

it('should return true for valid metadata with exists=false and no href', () => {
expect(isValidCanonicalMetadata({
exists: false, count: 0, inHead: false,
}, log, key)).to.be.true;
expect(log.warn).to.not.have.been.called;
});
});

describe('createOpportunityData', () => {
it('should return canonical opportunity data with correct structure', () => {
const result = createOpportunityData();
Expand Down
Loading