From 6f6a8f7b3328c62f978519a1ce04a991fcf9eecb Mon Sep 17 00:00:00 2001 From: Loki Coyote Date: Sat, 23 Sep 2023 09:52:27 +0000 Subject: [PATCH] adding support for optionally configuring the preservation of shape field data in _source --- Dockerfile | 3 + mappings/document.js | 423 ++++++++++++++++++++++--------------------- schema.js | 6 +- test/document.js | 31 +++- 4 files changed, 253 insertions(+), 210 deletions(-) diff --git a/Dockerfile b/Dockerfile index cee131c5..508f2d96 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,5 +12,8 @@ RUN npm install # add code from local checkout to image ADD . ${WORKDIR} +# run tests +RUN npm test + # run as pelias user USER pelias diff --git a/mappings/document.js b/mappings/document.js index da22b396..7f9cf081 100644 --- a/mappings/document.js +++ b/mappings/document.js @@ -1,3 +1,5 @@ +const _ = require('lodash'); +const peliasConfig = require('pelias-config'); const admin = require('./partial/admin'); const countryAbbreviation = require('./partial/countryAbbreviation'); const postalcode = require('./partial/postalcode'); @@ -6,211 +8,222 @@ const multiplier = require('./partial/multiplier'); const keyword = require('./partial/keyword'); const keyword_with_doc_values = require('./partial/keyword_with_doc_values'); -var schema = { - properties: { - - // data partitioning - source: keyword_with_doc_values, - layer: keyword_with_doc_values, - - // place name (ngram analysis) - name: hash, - - // place name (phrase analysis) - phrase: hash, - - // address data - address_parts: { - type: 'object', - dynamic: 'strict', - properties: { - name: { - type: 'text', - analyzer: 'keyword', - search_analyzer: 'keyword', - similarity: 'peliasDefaultSimilarity' - }, - unit: { - type: 'text', - analyzer: 'peliasUnit', - search_analyzer: 'peliasUnit', - similarity: 'peliasDefaultSimilarity' - }, - number: { - type: 'text', - analyzer: 'peliasHousenumber', - search_analyzer: 'peliasHousenumber', - similarity: 'peliasDefaultSimilarity' - }, - 
street: { - type: 'text', - analyzer: 'peliasStreet', - search_analyzer: 'peliasQuery', - similarity: 'peliasDefaultSimilarity' - }, - cross_street: { - type: 'text', - analyzer: 'peliasStreet', - search_analyzer: 'peliasQuery', - similarity: 'peliasDefaultSimilarity' - }, - zip: { - type: 'text', - analyzer: 'peliasZip', - search_analyzer: 'peliasZip', - similarity: 'peliasDefaultSimilarity' - }, - } +function generate(config) { + var mappings = { + properties: { + + // data partitioning + source: keyword_with_doc_values, + layer: keyword_with_doc_values, + + // place name (ngram analysis) + name: hash, + + // place name (phrase analysis) + phrase: hash, + + // address data + address_parts: { + type: 'object', + dynamic: 'strict', + properties: { + name: { + type: 'text', + analyzer: 'keyword', + search_analyzer: 'keyword', + similarity: 'peliasDefaultSimilarity' + }, + unit: { + type: 'text', + analyzer: 'peliasUnit', + search_analyzer: 'peliasUnit', + similarity: 'peliasDefaultSimilarity' + }, + number: { + type: 'text', + analyzer: 'peliasHousenumber', + search_analyzer: 'peliasHousenumber', + similarity: 'peliasDefaultSimilarity' + }, + street: { + type: 'text', + analyzer: 'peliasStreet', + search_analyzer: 'peliasQuery', + similarity: 'peliasDefaultSimilarity' + }, + cross_street: { + type: 'text', + analyzer: 'peliasStreet', + search_analyzer: 'peliasQuery', + similarity: 'peliasDefaultSimilarity' + }, + zip: { + type: 'text', + analyzer: 'peliasZip', + search_analyzer: 'peliasZip', + similarity: 'peliasDefaultSimilarity' + }, + } + }, + + // hierarchy + parent: { + type: 'object', + dynamic: 'strict', + properties: { + // https://github.com/whosonfirst/whosonfirst-placetypes#continent + continent: admin, + continent_a: admin, + continent_id: keyword, + continent_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#ocean + ocean: admin, + ocean_a: admin, + ocean_id: keyword, + ocean_source: keyword, + + // 
https://github.com/whosonfirst/whosonfirst-placetypes#empire + empire: admin, + empire_a: admin, + empire_id: keyword, + empire_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#country + country: admin, + country_a: countryAbbreviation, + country_id: keyword, + country_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#dependency + dependency: admin, + dependency_a: admin, + dependency_id: keyword, + dependency_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#marinearea + marinearea: admin, + marinearea_a: admin, + marinearea_id: keyword, + marinearea_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#macroregion + macroregion: admin, + macroregion_a: admin, + macroregion_id: keyword, + macroregion_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#region + region: admin, + region_a: admin, + region_id: keyword, + region_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#macrocounty + macrocounty: admin, + macrocounty_a: admin, + macrocounty_id: keyword, + macrocounty_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#county + county: admin, + county_a: admin, + county_id: keyword, + county_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#locality + locality: admin, + locality_a: admin, + locality_id: keyword, + locality_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#borough + borough: admin, + borough_a: admin, + borough_id: keyword, + borough_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#localadmin + localadmin: admin, + localadmin_a: admin, + localadmin_id: keyword, + localadmin_source: keyword, + + // https://github.com/whosonfirst/whosonfirst-placetypes#neighbourhood + neighbourhood: admin, + neighbourhood_a: admin, + neighbourhood_id: keyword, + neighbourhood_source: keyword, + + 
// https://github.com/whosonfirst/whosonfirst-placetypes#postalcode + postalcode: postalcode, + postalcode_a: postalcode, + postalcode_id: keyword, + postalcode_source: keyword + } + }, + + // geography + center_point: require('./partial/centroid'), + shape: require('./partial/shape'), + bounding_box: require('./partial/boundingbox'), + + // meta info + source_id: keyword, + category: keyword, + population: multiplier, + popularity: multiplier, + + // addendum (non-indexed supplementary data) + addendum: hash }, - - // hierarchy - parent: { - type: 'object', - dynamic: 'strict', - properties: { - // https://github.com/whosonfirst/whosonfirst-placetypes#continent - continent: admin, - continent_a: admin, - continent_id: keyword, - continent_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#ocean - ocean: admin, - ocean_a: admin, - ocean_id: keyword, - ocean_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#empire - empire: admin, - empire_a: admin, - empire_id: keyword, - empire_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#country - country: admin, - country_a: countryAbbreviation, - country_id: keyword, - country_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#dependency - dependency: admin, - dependency_a: admin, - dependency_id: keyword, - dependency_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#marinearea - marinearea: admin, - marinearea_a: admin, - marinearea_id: keyword, - marinearea_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#macroregion - macroregion: admin, - macroregion_a: admin, - macroregion_id: keyword, - macroregion_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#region - region: admin, - region_a: admin, - region_id: keyword, - region_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#macrocounty - macrocounty: 
admin, - macrocounty_a: admin, - macrocounty_id: keyword, - macrocounty_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#county - county: admin, - county_a: admin, - county_id: keyword, - county_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#locality - locality: admin, - locality_a: admin, - locality_id: keyword, - locality_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#borough - borough: admin, - borough_a: admin, - borough_id: keyword, - borough_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#localadmin - localadmin: admin, - localadmin_a: admin, - localadmin_id: keyword, - localadmin_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#neighbourhood - neighbourhood: admin, - neighbourhood_a: admin, - neighbourhood_id: keyword, - neighbourhood_source: keyword, - - // https://github.com/whosonfirst/whosonfirst-placetypes#postalcode - postalcode: postalcode, - postalcode_a: postalcode, - postalcode_id: keyword, - postalcode_source: keyword - } - }, - - // geography - center_point: require('./partial/centroid'), - shape: require('./partial/shape'), - bounding_box: require('./partial/boundingbox'), - - // meta info - source_id: keyword, - category: keyword, - population: multiplier, - popularity: multiplier, - - // addendum (non-indexed supplimentary data) - addendum: hash - }, - dynamic_templates: [{ - nameGram: { - path_match: 'name.*', - match_mapping_type: 'string', - mapping: { - type: 'text', - analyzer: 'peliasIndexOneEdgeGram', - search_analyzer: 'peliasQuery', - similarity: 'peliasDefaultSimilarity' + dynamic_templates: [ + { + nameGram: { + path_match: 'name.*', + match_mapping_type: 'string', + mapping: { + type: 'text', + analyzer: 'peliasIndexOneEdgeGram', + search_analyzer: 'peliasQuery', + similarity: 'peliasDefaultSimilarity' + } + }, + }, + { + phrase: { + path_match: 'phrase.*', + match_mapping_type: 
'string', + mapping: { + type: 'text', + analyzer: 'peliasPhrase', + search_analyzer: 'peliasQuery', + similarity: 'peliasDefaultSimilarity' + } + } + }, + { + addendum: { + path_match: 'addendum.*', + match_mapping_type: 'string', + mapping: { + type: 'keyword', + index: false, + doc_values: false + } + } } + ], + _source: { + excludes: ['shape', 'phrase'] }, - },{ - phrase: { - path_match: 'phrase.*', - match_mapping_type: 'string', - mapping: { - type: 'text', - analyzer: 'peliasPhrase', - search_analyzer: 'peliasQuery', - similarity: 'peliasDefaultSimilarity' - } - } - },{ - addendum: { - path_match: 'addendum.*', - match_mapping_type: 'string', - mapping: { - type: 'keyword', - index: false, - doc_values: false - } - } - }], - _source: { - excludes : ['shape','phrase'] - }, - dynamic: 'strict' -}; - -module.exports = schema; + dynamic: 'strict' + }; + // Merge settings from pelias/config + // if the item is an array, overwrite it entirely. This allows replacing the _source excludes + mappings = _.mergeWith({}, mappings, _.get(config, 'elasticsearch.mappings', {}), + (obj,src) => _.isArray(src)? 
src : undefined + ); + return mappings; +} +module.exports = generate; diff --git a/schema.js b/schema.js index 5ff23128..73508134 100644 --- a/schema.js +++ b/schema.js @@ -1,6 +1,10 @@ +const peliasConfig = require('pelias-config'); +const config = peliasConfig.generate() +require('./configValidation').validate(config); + const schema = { settings: require('./settings')(), - mappings: require('./mappings/document'), + mappings: require('./mappings/document')(config), }; module.exports = schema; diff --git a/test/document.js b/test/document.js index 81aed4aa..0d94b29f 100644 --- a/test/document.js +++ b/test/document.js @@ -1,5 +1,9 @@ const _ = require('lodash'); -const schema = require('../mappings/document'); +const peliasConfig = require('pelias-config'); +const config = peliasConfig.generate() +require('../configValidation').validate(config); + +const schema = require('../mappings/document')(config); module.exports.tests = {}; @@ -242,9 +246,9 @@ module.exports.tests.dynamic_disabled = function(test, common) { }); }; -// shape field should be exluded from _source because it's massive -module.exports.tests._source = function(test, common) { - test('_source', function(t) { +// shape field should be excluded from _source by default because it's massive +module.exports.tests._source_excludes = function(test, common) { + test('_source shape excludes', function(t) { t.ok(Array.isArray(schema._source.excludes), 'exclusions specified'); t.equal(schema._source.excludes[0], 'shape', 'exclude shape'); t.equal(schema._source.excludes[1], 'phrase', 'exclude phrase'); @@ -252,6 +256,25 @@ module.exports.tests._source = function(test, common) { }); }; +// shape field should be included in _source when explicitly configured +module.exports.tests._source_excludes_override = function(test, common) { + test('_source excludes override', function(t) { + var tmp_config = { + elasticsearch: { + mappings: { + _source: { + excludes: ['phrase'] + } + } + } + }; + var tmp_schema = 
require('../mappings/document')(tmp_config); + t.ok(Array.isArray(tmp_schema._source.excludes), 'exclusions specified'); + t.equal(tmp_schema._source.excludes[0], 'phrase', 'exclude phrase'); + t.end(); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) {