diff --git a/coldfront/core/resource/management/commands/add_resource_defaults.py b/coldfront/core/resource/management/commands/add_resource_defaults.py index 1d60141c62..534c2659b9 100644 --- a/coldfront/core/resource/management/commands/add_resource_defaults.py +++ b/coldfront/core/resource/management/commands/add_resource_defaults.py @@ -33,6 +33,7 @@ def handle(self, *args, **options): ('GPU Count', 'Int'), ('Features', 'Text'), ('slurm_integration', 'Text'), + ('NodeType', 'Text'), # UBCCR ('Core Count', 'Int'), # ('expiry_time', 'Int'), @@ -63,6 +64,9 @@ def handle(self, *args, **options): ) for resource_type, description in ( + # FASRC + ('Supergroup', 'Compute Supergroup'), + # UBCCR ('Storage', 'Network Storage'), ('Storage Tier', 'Storage Tier',), ('Cloud', 'Cloud Computing'), diff --git a/coldfront/core/resource/templates/resource_detail.html b/coldfront/core/resource/templates/resource_detail.html index c369312c38..ec88171628 100644 --- a/coldfront/core/resource/templates/resource_detail.html +++ b/coldfront/core/resource/templates/resource_detail.html @@ -205,7 +205,7 @@

Resource Al {# {% if user_sync_dt %}#} {# Last Sync: {{user_sync_dt}}#} {# {% endif %}#} - {% if 'Cluster' in resource.resource_type.name %} + {% if 'Cluster' in resource.resource_type.name or 'Supergroup' in resource.resource_type.name %}
Edit Resource Allocations @@ -222,7 +222,7 @@

Resource Al Project Users - {% if 'Cluster' in resource.resource_type.name %} + {% if 'Cluster' in resource.resource_type.name or 'Supergroup' in resource.resource_type.name %} CPU Hours Percent Usage RawShare @@ -256,7 +256,7 @@

Resource Al {{ allocation.user_count }} - {% if 'Cluster' in resource.resource_type.name %} + {% if 'Cluster' in resource.resource_type.name or 'Supergroup' in resource.resource_type.name %} {{ allocation.usage}} diff --git a/coldfront/plugins/ldap/signals.py b/coldfront/plugins/ldap/signals.py index c91d54dcaf..3a3d50efa1 100644 --- a/coldfront/plugins/ldap/signals.py +++ b/coldfront/plugins/ldap/signals.py @@ -12,11 +12,13 @@ project_reactivate_projectuser, ) from coldfront.core.project.models import ( + Project, + ProjectUser, ProjectUserRoleChoice, ProjectUserStatusChoice, - ProjectUser, ) -from coldfront.plugins.ldap.utils import LDAPConn +from coldfront.core.resource.models import Resource +from coldfront.plugins.ldap.utils import LDAPConn, LDAPException if 'coldfront.plugins.sftocf' in import_from_settings('INSTALLED_APPS', []): from coldfront.plugins.sftocf.signals import ( @@ -115,6 +117,83 @@ def update_new_project(sender, **kwargs): extra={ 'category': 'database_change:ProjectUser', 'status': 'success' } ) +# @receiver(slurmrest_supergroup_membership_update) +def update_supergroup_membership(sender, **kwargs): + """Update ColdFront Supergroup allocation list based on AD group membership + Supergroups are ColdFront Resources that correspond to slurm accounts and to AD groups. + They need to be linked to the ColdFront cluster Allocations that belong to the Projects that + correspond to the Supergroup's AD group members. + """ + # get supergroup name from kwargs + supergroup_name = kwargs['supergroup_name'] + # get corresponding AD group and supergroup Resource + try: + ad_conn = LDAPConn() + members = ad_conn.return_group_group_members(supergroup_name) + except Exception as e: + logger.exception( + "error encountered retrieving members and manager for Supergroup %s: %s", + supergroup_name, e + ) + raise LDAPException(f"ldap error: {e}") from e + + supergroup_obj = Resource.objects.get(name=supergroup_name) + # collect Projects corresponding to AD group membership + for member in members: + if not ad_conn.check_group_validity(member): + logger.warning( + "skipping invalid Supergroup %s member %s", + supergroup_name, member['sAMAccountName'][0] + ) + continue + try: + project_obj = Project.objects.get( + title=member['sAMAccountName'][0], + ) + except Exception as e: + logger.warning( + "issue retrieving Project for Supergroup %s member %s: %s", + supergroup_name, member['sAMAccountName'][0], e + ) + continue + # get allocation corresponding to the Project and supergroup's parent resource + try: + allocation_obj = project_obj.allocation_set.get( + resources=supergroup_obj.parent_resource, + ) + except Exception as e: + logger.warning( + "issue retrieving Allocation for Supergroup %s member Project %s: %s", + supergroup_name, project_obj.title, e + ) + continue + # link allocation to supergroup if not already linked + if supergroup_obj not in allocation_obj.resources.all(): + allocation_obj.resources.add(supergroup_obj) + logger.info( + "linked Supergroup %s to Allocation for Project %s", + supergroup_name, project_obj.title, + extra={ 'category': 'ldap:Supergroup', 'status': 'success' } + ) + + # remove allocations no longer linked to AD group members + for allocation in supergroup_obj.allocation_set.all(): + project_title = allocation.project.title + keep_linked = True + if not ad_conn.check_group_validity({'sAMAccountName':[project_title]}): + keep_linked = False + # check if project_title is in membership + member_usernames = [m['sAMAccountName'][0] for m in members] + if project_title not in member_usernames: + keep_linked = False + if not keep_linked: + allocation.resources.remove(supergroup_obj) + logger.info( + "removed Supergroup %s from Allocation for Project %s", + supergroup_name, project_title, + extra={ 'category': 'ldap:Supergroup', 'status': 'success' } + ) + @receiver(project_filter_users_to_remove) def filter_project_users_to_remove(sender, **kwargs): users_to_remove = kwargs['users_to_remove'] diff --git a/coldfront/plugins/ldap/utils.py b/coldfront/plugins/ldap/utils.py index 128b2a571d..a313e4c28b 100644 --- a/coldfront/plugins/ldap/utils.py +++ b/coldfront/plugins/ldap/utils.py @@ -88,6 +88,25 @@ def member_in_group(self, member_dn, group_dn): group = self.search_groups({'distinguishedName': group_dn, 'member': member_dn}) return bool(group) + def check_group_validity(self, group_entry): + """Check if group entry is valid. + + A valid group has a valid manager. + """ + # run checks on the groups to ensure they have valid managers and aren't disabled + try: + _, manager = self.return_group_members_manager(group_entry['sAMAccountName'][0]) + except LDAPException as e: + logger.warning('group %s invalid: %s', group_entry['sAMAccountName'][0], e) + return False + if not manager: + return False + if not user_valid(manager): + return False + return True + + + def search(self, attr_search_dict, search_base, attributes=ALL_ATTRIBUTES): """Run an LDAP search. @@ -172,14 +191,16 @@ def return_user_by_name(self, username, return_as='dict', attributes=ALL_ATTRIBU raise ValueError(f"no users returned for username {username}") return user[0] - def return_group_by_name(self, groupname, return_as='dict', attributes=ALL_ATTRIBUTES): + def return_group_by_name(self, samaccountname, return_as='dict', attributes=ALL_ATTRIBUTES): group = self.search_groups( - {"sAMAccountName": groupname}, return_as=return_as, attributes=attributes + {"sAMAccountName": samaccountname}, return_as=return_as, attributes=attributes ) if len(group) > 1: + logger.error('multiple groups with same sAMAccountName: %s', samaccountname) raise ValueError("too many groups in value returned") if not group: - raise ValueError("no groups returned") + logger.error('no groups found with sAMAccountName: %s', samaccountname) + raise ValueError("no matching groups returned") return group[0] def add_user_to_group(self, user_name, group_name): @@ -324,12 +345,15 @@ def return_group_members_manager(self, samaccountname): ) if len(group_entries) > 1: logger.error('multiple groups with same sAMAccountName: %s', samaccountname) - return 'multiple groups with same sAMAccountName' + raise LDAPException('multiple groups with same sAMAccountName') if not group_entries: logger.error('no groups found with sAMAccountName: %s', samaccountname) - return 'no matching groups found' + raise LDAPException('no matching groups found') group_entry = group_entries[0] - return self.manager_members_from_group(group_entry) + try: + return self.manager_members_from_group(group_entry) + except LDAPException as e: + raise def manager_members_from_group(self, group_entry): group_dn = group_entry['distinguishedName'][0] @@ -343,7 +367,7 @@ def manager_members_from_group(self, group_entry): group_manager_dn = group_entry['managedBy'][0] except Exception as e: logger.error('no manager specified for group %s', group_dn) - return 'no manager specified' + raise LDAPException('no manager specified') from e manager_attr_list = user_attr_list + ['memberOf'] group_manager = self.search_users( {'distinguishedName': group_manager_dn}, attributes=manager_attr_list @@ -351,9 +375,18 @@ def manager_members_from_group(self, group_entry): logger.debug('group_manager:\n%s', group_manager) if not group_manager: logger.error('no ADUser manager found for group %s', group_dn) - return 'no ADUser manager found' + raise LDAPException('no ADUser manager found') return (group_members, group_manager[0]) + def return_group_group_members(self, samaccountname): + """return group entries that are members of the specified group.""" + logger.debug('return_group_group_members for Group %s', samaccountname) + group = self.return_group_by_name(samaccountname) + group_dn = group['distinguishedName'][0] + group_members = self.search_groups({'memberOf': group_dn}) + logger.debug('group_members:\n%s', group_members) + return group_members + def user_valid(user): return user['userAccountControl'][0] in [512, 66048] diff --git a/coldfront/plugins/slurmrest/associations.py b/coldfront/plugins/slurmrest/associations.py index 2dd3587995..f1a0028198 100644 --- a/coldfront/plugins/slurmrest/associations.py +++ b/coldfront/plugins/slurmrest/associations.py @@ -49,9 +49,12 @@ def __init__(self, cluster_name): self.owner_attribute_type = ResourceAttributeType.objects.get(name='Owner') self.features_attribute_type = ResourceAttributeType.objects.get(name='Features') + self.nodetype_attribute_type = ResourceAttributeType.objects.get(name='NodeType') self.gpu_count_attribute_type = ResourceAttributeType.objects.get(name='GPU Count') self.core_count_attribute_type = ResourceAttributeType.objects.get(name='Core Count') + self.cpu_count_attribute_type = ResourceAttributeType.objects.get(name='CPU Count') self.service_end_attribute_type = ResourceAttributeType.objects.get(name='ServiceEnd') + self.rawshare_attribute_type = ResourceAttributeType.objects.get(name='Rawshare') self.slurm_specs_resourceattribute_type = ResourceAttributeType.objects.get( name=SLURMREST_SPECS_ATTRIBUTE_NAME) self.account_attribute_type = AllocationAttributeType.objects.get( @@ -170,6 +173,80 @@ def id_partition_projects(self, partition_data): return partition_account_names + + ### Supergroup methods ### + def update_supergroups(self): + supergroup_resources = Resource.objects.filter( + parent_resource=self.cluster_resource, + resource_type__name='Supergroup' + ) + for supergroup in supergroup_resources: + fairshare = self.calculate_supergroup_fairshare(supergroup.name) + logger.info( + "Updated supergroup %s fairshare to %s", supergroup.name, fairshare + ) + + def calculate_supergroup_fairshare(self, supergroup_name): + """Calculate fairshare for a given supergroup based on its node cores and types. + equation is: + """ + supergroup_resource = Resource.objects.get( + resource_type__name='Supergroup', + name=supergroup_name + ) + supergroup_nodes = Resource.objects.filter( + resource_type__name='Compute Node', + linked_resources=supergroup_resource, + is_available=True, + ) + node_multiplier_dict = { + 'h200': 546.9, + 'h100': 546.9, + 'a100': 209.1, + 'a100-mig': 209.1, + 'v100': 75, + 'a40': 10, + 'rtxa6000': 10, + 'icelake': 1.15, + 'cascadelake': 1, + 'sapphirerapids': 0.6, + 'genoa': 0.6, + 'milan': 0.5, + 'skylake': 0.5, + 'haswell': 0.4, + 'broadwell': 0.4, + } + total_fairshare = 0 + for node in supergroup_nodes: + nodetype_attr = node.resourceattribute_set.get( + resource_attribute_type__name='NodeType' + ) + nodetype_dict = dict( + [item.split(':') for item in nodetype_attr.value.split(',')] + ) + + cpu_count_attr = node.resourceattribute_set.get( + resource_attribute_type__name='CPU Count') + cpu_type = nodetype_dict['cpu_type'] + multiplier = node_multiplier_dict[cpu_type] + cpu_value = int(cpu_count_attr.value) * multiplier + total_fairshare += cpu_value + + if 'gpu_type' in nodetype_dict: + gpu_count_attr = node.resourceattribute_set.get( + resource_attribute_type__name='GPU Count') + gpu_type = nodetype_dict['gpu_type'] + gpu_multiplier = node_multiplier_dict[gpu_type] + gpu_value = int(gpu_count_attr.value) * gpu_multiplier + total_fairshare += gpu_value + # create or update the supergroup's Rawshare attribute + supergroup_resource.resourceattribute_set.update_or_create( + resource_attribute_type=self.rawshare_attribute_type, + defaults={'value': str(int(total_fairshare))} + ) + return str(int(total_fairshare)) + + ### Node import methods ### def import_node_data(self): """Import Slurm nodes as Coldfront Resources. @@ -182,7 +259,8 @@ def import_node_data(self): resource_type=self.node_resource_type, is_available=True, ).exclude(name__in=[n['name'] for n in self.nodes]): - logger.info("Node resource %s no longer exists in Slurm cluster %s; marking unavailable.", + logger.info( + "Node resource %s no longer exists in Slurm cluster %s; marking unavailable.", resource_to_delete.name, self.cluster_name ) resource_to_delete.is_available = False @@ -193,6 +271,84 @@ def import_node_data(self): defaults={'value':timezone.now()} ) + def determine_node_owner(self, node_data): + features = node_data.get('features', []) + joined = ','.join(features) + if 'o_s_' in joined or 'o_g_' in joined: + for feature in features: + if feature.startswith('o_s_') or feature.startswith('o_g_'): + return feature[4:] + node_partitions = node_data.get('partitions', []) + for partition_name in ['serial_requeue', 'gpu_requeue']: + if partition_name in node_partitions: + node_partitions.remove(partition_name) + group_list = [] + for partition_name in node_partitions: + partition = next( + (p for p in self.partitions if p['name'] == partition_name), None + ) + partition_groups = partition['groups'].get('allowed', '').split(',') + if 'seas' in partition_groups: + return 'seas' + # add all groups that aren't slurm_admin to group_list + group_list.extend( + [g for g in partition_groups if '-admin' not in g] + ) + group_list = list(set(group_list)) + if group_list in (['cluster_users_2', 'cluster_users'], ['fasse_users']): + return 'fasrc' + if len(group_list) == 1: + return group_list[0] + for group in group_list: + if 'kempner' in group: + return 'kempner_partition' + if 'iaifi' in group: + return 'iaifi_partition' + try: + Project.objects.get(title=group) + except Project.DoesNotExist: + group_list = [g for g in group_list if g != group] + if len(group_list) == 1: + return group_list[0] + logger.warning( + "can't determine owner of node_name %s. Partitions: %s Group list: %s", + node_data['name'], node_partitions, set(group_list) + ) + return '' + + def determine_node_type(self, node_data): + result = {} + features = node_data.get('features', []) + if 'gpu' in features: + for node_type in ['h200', 'h100', 'a100', 'a100-mig', 'v100', 'a40', 'rtxa6000']: + if node_type in features: + result['gpu_type'] = node_type + continue + for node_type in ['icelake', 'cascadelake', 'sapphirerapids', 'genoa', 'milan', 'skylake']: + if node_type in features: + result['cpu_type'] = node_type + continue + return result + + def process_tres(self, node_data): + tres = node_data.get('tres', 0) + tres_dict = {} + if tres != 0: + tres_dict = dict([i.split('=') for i in tres.split(',')]) + return tres_dict + + def update_node_supergroups(self, node_resource, node_owner): + """If a supergroup exists for the node owner, link it to the node resource.""" + try: + supergroup_resource = Resource.objects.get( + resource_type__name='Supergroup', + name=node_owner + ) + if supergroup_resource not in node_resource.linked_resources.all(): + node_resource.linked_resources.add(supergroup_resource) + except Resource.DoesNotExist: + pass + def create_update_node_resource(self, node_data): """Create or update a Coldfront Resource for a Slurm node.""" # create or get the node resource @@ -209,19 +365,38 @@ def create_update_node_resource(self, node_data): resource_attribute_type=self.features_attribute_type, defaults={'value': ','.join(node_data['features'])} ) + tres_dict = self.process_tres(node_data) + if tres_dict: + gpu_count = tres_dict.get('gres/gpu', 0) + node_resource.resourceattribute_set.update_or_create( + resource_attribute_type=self.gpu_count_attribute_type, + defaults={'value': str(gpu_count)} + ) + node_resource.resourceattribute_set.update_or_create( - resource_attribute_type=self.gpu_count_attribute_type, - defaults={'value': str(node_data.get('gpus', 0))} + resource_attribute_type=self.cpu_count_attribute_type, + defaults={'value': str(node_data.get('cpus', 0))} ) node_resource.resourceattribute_set.update_or_create( resource_attribute_type=self.core_count_attribute_type, defaults={'value': str(node_data.get('cores', 0))} ) - # owner sometimes needs to be set manually, so we don't update it if it exists - node_resource.resourceattribute_set.get_or_create( + + owner = self.determine_node_owner(node_data) + node_resource.resourceattribute_set.update_or_create( resource_attribute_type=self.owner_attribute_type, - defaults={'value': node_data.get('owner', 'unknown')} + defaults={'value': owner} + ) + + node_type = self.determine_node_type(node_data) + node_type_string = ','.join([f"{k}:{v}" for k,v in node_type.items()]) + node_resource.resourceattribute_set.update_or_create( + resource_attribute_type=self.nodetype_attribute_type, + defaults={'value': node_type_string} ) + + self.update_node_supergroups(node_resource, owner) + if created: logger.info("Created new node resource: %s", node_resource.name) else: