From cc61511096f9f69c1d1ec57ffba0040844855c69 Mon Sep 17 00:00:00 2001 From: voetberg Date: Thu, 7 Mar 2024 13:44:05 -0600 Subject: [PATCH 1/5] Common: Update probe to use prometheus pusher, sqla 2.0 syntax and data model. #127 Changes: - Change text-only queries to poll the data model (rucio.db.sqla.models) - Push results to a remote (See documentation of probes for descriptions). Names: locked_expired_rules.(rse), locked_expired_rules.dids.(rse) --- common/check_expired_locked_rules | 139 ++++++++++++++++++++++-------- 1 file changed, 102 insertions(+), 37 deletions(-) diff --git a/common/check_expired_locked_rules b/common/check_expired_locked_rules index 55e139a1..a3f3cfa7 100755 --- a/common/check_expired_locked_rules +++ b/common/check_expired_locked_rules @@ -1,56 +1,121 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 +#!/usr/bin/env python3 +# Copyright 2012-2024 CERN # # Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# # Authors: # - Cedric Serfon, , 2015 +# - Donata Mielaikaite, , 2020 +# - Eric Vaandering, , 2020 +# - Maggie Voetberg , 2024 + ''' Probe to check the locked expired rules or datasets with locked rules ''' import sys -from rucio.db.sqla.session import get_session +import traceback +from sqlalchemy import select, and_ +from sqlalchemy.sql import functions + +from rucio.db.sqla import models, session +from utils.common import PrometheusPusher # Exit statuses OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 +if __name__ == '__main__': -def main(): - ''' - Probe to check the locked expired rules or datasets with locked rules - ''' status = OK - session = get_session() - try: - query = "select rawtohex(id), scope, name, rse_expression from atlas_rucio.rules where locked=1 and expires_at Date: Mon, 5 Aug 2024 10:19:08 -0500 Subject: [PATCH 2/5] Common: Reformat queries and execution of queries, change where statements to use true() and null() options, use default dictionary as a way to collect results --- common/check_expired_locked_rules | 90 +++++++++++++++---------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/common/check_expired_locked_rules b/common/check_expired_locked_rules index a3f3cfa7..b5c2a5d4 100755 --- a/common/check_expired_locked_rules +++ b/common/check_expired_locked_rules @@ -24,10 +24,12 @@ Probe to check the locked expired rules or datasets with locked rules ''' +from collections import defaultdict +import datetime import sys import traceback from sqlalchemy import select, and_ -from sqlalchemy.sql import functions +from sqlalchemy.sql import true, null from rucio.db.sqla import models, session from utils.common import PrometheusPusher @@ -39,31 +41,36 @@ if __name__ == '__main__': status = OK session = session.get_session() - with PrometheusPusher() as manager: - try: - statement = select( - models.ReplicationRule.id, - models.ReplicationRule.scope, - models.ReplicationRule.name, - models.ReplicationRule.rse_expression - 
).where( + + # Select statement used for both metrics + base_statement = select( + models.ReplicationRule, + models.DataIdentifier.name, + models.DataIdentifier.scope, + models.ReplicationRule.rse_expression, + ) + + # Use prometheus pusher to send results to a remote service + with PrometheusPusher() as manager: + try: + statement = base_statement.where( and_( - models.ReplicationRule.locked == '1', - models.ReplicationRule.expires_at Date: Mon, 5 Aug 2024 10:20:33 -0500 Subject: [PATCH 3/5] Common: Format of call of prometheus pusher --- common/check_expired_locked_rules | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/common/check_expired_locked_rules b/common/check_expired_locked_rules index b5c2a5d4..af4dff5c 100755 --- a/common/check_expired_locked_rules +++ b/common/check_expired_locked_rules @@ -72,12 +72,13 @@ if __name__ == '__main__': # Add a summary entry so when there are no result from the query there is metric continuity rule_counts["All"] = sum(rule_counts.values()) + # Send to Prometheus pusher for rse_expression, count in rule_counts.items(): - manager.gauge('locked_expired_rules.{rse_expression}', - documentation='Number of rules that are locked and expired, by RSE.' 
- ).labels( - rse_expression=rse_expression - ).set(count) + (manager.gauge( + "locked_expired_rules.{rse_expression}", + documentation="Number of rules that are locked and expired, by RSE expression.") + .labels(rse_expression=rse_expression) + .set(count)) except Exception as error: print(traceback.format_exc()) @@ -105,12 +106,13 @@ if __name__ == '__main__': status = CRITICAL datasets_count[row.rse_expression] += 1 + rule_counts["All"] = sum(rule_counts.values()) for rse_expression, dids in datasets_count.items(): - manager.gauge('locked_expired_rules.dids.{rse_expression}', - documentation='Number of expired DIDs with locked rules, by RSE' - ).labels( - rse_expression=rse_expression - ).set(dids) + (manager.gauge( + "locked_expired_rules.dids.{rse_expression}", + documentation="Number of expired DIDs with locked rules, by RSE expression") + .labels(rse_expression=rse_expression) + .set(dids)) except: print(traceback.format_exc()) From 68ccd17b695293a4019ff7a751706686b9c84506 Mon Sep 17 00:00:00 2001 From: voetberg Date: Mon, 5 Aug 2024 10:21:42 -0500 Subject: [PATCH 4/5] Common: Correct header to most recent version, sort imports --- common/check_expired_locked_rules | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/common/check_expired_locked_rules b/common/check_expired_locked_rules index af4dff5c..bd49bcdb 100755 --- a/common/check_expired_locked_rules +++ b/common/check_expired_locked_rules @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright 2012-2024 CERN +# Copyright European Organization for Nuclear Research (CERN) since 2012 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,26 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# -# Authors: -# - Cedric Serfon, , 2015 -# - Donata Mielaikaite, , 2020 -# - Eric Vaandering, , 2020 -# - Maggie Voetberg , 2024 -''' +""" Probe to check the locked expired rules or datasets with locked rules -''' +""" -from collections import defaultdict import datetime import sys import traceback -from sqlalchemy import select, and_ -from sqlalchemy.sql import true, null +from collections import defaultdict + +from sqlalchemy import and_, select +from sqlalchemy.sql import null, true from rucio.db.sqla import models, session + from utils.common import PrometheusPusher # Exit statuses @@ -114,7 +110,7 @@ if __name__ == '__main__': .labels(rse_expression=rse_expression) .set(dids)) - except: + except Exception: print(traceback.format_exc()) sys.exit(UNKNOWN) From b52e9eb427feb35c5b620cfc5bcc0097a2cad8a1 Mon Sep 17 00:00:00 2001 From: voetberg Date: Tue, 6 Aug 2024 08:37:24 -0500 Subject: [PATCH 5/5] Common: Address review comments --- common/check_expired_locked_rules | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/common/check_expired_locked_rules b/common/check_expired_locked_rules index bd49bcdb..6119ff85 100755 --- a/common/check_expired_locked_rules +++ b/common/check_expired_locked_rules @@ -18,31 +18,32 @@ Probe to check the locked expired rules or datasets with locked rules """ -import datetime import sys import traceback from collections import defaultdict +from datetime import datetime from sqlalchemy import and_, select from sqlalchemy.sql import null, true -from rucio.db.sqla import models, session +from rucio.db.sqla import models +from rucio.db.sqla.session import get_session from utils.common import PrometheusPusher # Exit statuses OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 -if __name__ == '__main__': +if __name__ == "__main__": status = OK - session = session.get_session() + session = get_session() # Select statement used for both metrics base_statement = select( - models.ReplicationRule, - 
models.DataIdentifier.name, - models.DataIdentifier.scope, + models.ReplicationRule.id, + models.ReplicationRule.name, + models.ReplicationRule.scope, models.ReplicationRule.rse_expression, ) @@ -60,7 +61,7 @@ if __name__ == '__main__': # Print rules for nagios monitoring print("Locked expired rules") for row in session.execute(statement): - print(row.rule_id, row.scope, row.name, row.rse_expression) + print(row.id, row.scope, row.name, row.rse_expression) status = CRITICAL # Keep track of the counts rule_counts[row.rse_expression] += 1 @@ -98,11 +99,11 @@ if __name__ == '__main__': print("Datasets expired with locked rules") for row in session.execute(statement): - print(row.rule_id, row.scope, row.name, row.rse_expression) + print(row.id, row.scope, row.name, row.rse_expression) status = CRITICAL datasets_count[row.rse_expression] += 1 - rule_counts["All"] = sum(rule_counts.values()) + datasets_count["All"] = sum(datasets_count.values()) for rse_expression, dids in datasets_count.items(): (manager.gauge( "locked_expired_rules.dids.{rse_expression}",