From bc8817fc5feebff0918b492413881f8d4fee1fff Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 2 Dec 2023 13:06:32 -0500 Subject: [PATCH 1/4] Added speed tests. Related to https://github.com/TranslatorSRI/NameResolution/issues/113 --- tests/nameres/test_nameres_from_gsheet.py | 61 +++++++++++++++++++++-- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/tests/nameres/test_nameres_from_gsheet.py b/tests/nameres/test_nameres_from_gsheet.py index 5b24ad7..baea458 100644 --- a/tests/nameres/test_nameres_from_gsheet.py +++ b/tests/nameres/test_nameres_from_gsheet.py @@ -1,3 +1,5 @@ +import logging +import time import urllib.parse import requests import pytest @@ -9,6 +11,16 @@ @pytest.mark.parametrize("test_row", gsheet.test_rows) def test_label(target_info, test_row, test_category): + """ + :param target_info: The target_info object (really a config object). + :param test_row: A test row to be tested. + :param test_category: A function that can be called with a category name to determine whether or not a particular + category should be tested. + :return: The number of queries generated. + """ + + count_queries = 0 + nameres_url = target_info['NameResURL'] limit = target_info['NameResLimit'] nameres_xfail_if_in_top = int(target_info['NameResXFailIfInTop']) @@ -54,6 +66,7 @@ def test_label(target_info, test_row, test_category): test_summary = f"querying {nameres_url_lookup} with label '{label}' and biolink_type {biolink_class}" response = requests.get(nameres_url_lookup, request) + count_queries += 1 assert response.ok, f"Could not send request {request} to GET {nameres_url_lookup}: {response}" results = response.json() @@ -74,7 +87,7 @@ def test_label(target_info, test_row, test_category): else: assert expected_id not in all_curies, f"Negative test {test_summary} did not find expected ID {expected_id} in top {limit} results." 
- return + return count_queries # There are three possible responses: if not results: @@ -84,22 +97,62 @@ def test_label(target_info, test_row, test_category): pytest.fail(f"No expected CURIE for {test_summary} from {source_info}: best result is {results[0]}") elif results[0]['curie'] == expected_id: top_result = results[0] - assert top_result['curie'] == expected_id,\ + assert top_result['curie'] == expected_id, \ f"{test_summary} returned expected ID {expected_id} as top result" # Additionally, test the biolink_class_exclude field if there is one. if biolink_class_exclude: - assert biolink_class_exclude not in top_result['types'],\ + assert biolink_class_exclude not in top_result['types'], \ f"Biolink types for {top_result['curie']} are {top_result['types']}, which includes {biolink_class_exclude} which should be excluded." elif expected_id in all_curies: expected_index = all_curies.index(expected_id) fail_message = f"{test_summary} returns {results[0]['curie']} ('{results[0]['label']}') as the " \ - f"top result, but {expected_id} is at {expected_index} index." + f"top result, but {expected_id} is at {expected_index} index." if expected_index <= nameres_xfail_if_in_top: pytest.xfail(fail_message) else: pytest.fail(fail_message) else: pytest.fail(f"{test_summary} but expected result {expected_id} not found: {results}") + + return count_queries + + +@pytest.mark.parametrize("category_and_expected_times", [ + # We expect unit tests to run in less than half a second each query and name. + {'category': 'Unit Tests', 'expected_time_per_query': 0.5}, +]) +def test_query_rates(target_info, category_and_expected_times): + """ + This is being done in service of https://github.com/TranslatorSRI/NameResolution/issues/113 + + To ensure that we can handle 20 simultaneous queries within 10 seconds, we will run a set of + rows from the Google Sheet, and measure the rate at which we process those queries. + + :param target_info: The target_info object (really a config object). 
+ """ + + category = category_and_expected_times['category'] + rows_to_test = list(filter(lambda row: row.Category == category, gsheet.test_rows)) + assert len(rows_to_test) > 0, f"Category '{category}' not found in Google Sheet {gsheet}." + + time_started = time.time_ns() + count_queries = 0 + for row in rows_to_test: + count_queries += test_label(target_info, row, lambda cat: True) + time_ended = time.time_ns() + time_taken = time_ended - time_started + time_taken_secs = float(time_taken) / 1e+9 + + time_per_test_row = time_taken_secs / len(rows_to_test) + time_per_query = time_taken_secs / count_queries + print(f"NameRes took {time_taken_secs:.3f} seconds to process {len(rows_to_test)} test rows " + + f"({time_per_test_row:.3f} seconds/test row, {time_per_query:.3f} seconds/query) on {target_info}") + + assert len(rows_to_test) > 20, f"Categories with fewer than twenty test rows are not likely to be representative." + assert count_queries > 20, f"Categories with fewer than twenty queries are not likely to be representative." + + if 'expected_time_per_query' in category_and_expected_times: + assert time_per_query < category_and_expected_times['expected_time_per_query'] From 1973e6deac7823b90a5a9a69cad55811f0b670d5 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 2 Dec 2023 13:17:21 -0500 Subject: [PATCH 2/4] Added NodeNorm normalization rate testing. 
--- tests/nodenorm/test_nodenorm_from_gsheet.py | 54 ++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tests/nodenorm/test_nodenorm_from_gsheet.py b/tests/nodenorm/test_nodenorm_from_gsheet.py index f7dcf2d..058af39 100644 --- a/tests/nodenorm/test_nodenorm_from_gsheet.py +++ b/tests/nodenorm/test_nodenorm_from_gsheet.py @@ -1,4 +1,5 @@ import itertools +import time import urllib.parse import requests import pytest @@ -10,6 +11,16 @@ @pytest.mark.parametrize("test_row", gsheet.test_rows) def test_normalization(target_info, test_row, test_category): + """ + Test normalization on NodeNorm. + + :param target_info: The target information to test. + :param test_row: The TestRow to test. + :param test_category: A function that accepts a category name and returns whether that category should be tested. + :return: The number of queries executed. + """ + count_queries = 0 + nodenorm_url = target_info['NodeNormURL'] category = test_row.Category @@ -49,6 +60,7 @@ def test_normalization(target_info, test_row, test_category): test_summary = f"Queried {query_id} ({preferred_label}) on {nodenorm_url_lookup}" response = requests.get(nodenorm_url_lookup, request) + count_queries += 1 assert response.ok, f"Could not send request {request} to GET {nodenorm_url_lookup}: {response}" results = response.json() @@ -85,4 +97,44 @@ def test_normalization(target_info, test_row, test_category): f"found in types: {biolink_types}") else: assert biolink_type in set(biolink_types), (f"{test_summary} biolink type {biolink_type} not found in " - f"types: {biolink_types}") \ No newline at end of file + f"types: {biolink_types}") + + return count_queries + + +@pytest.mark.parametrize("category_and_expected_times", [ + # We expect unit tests to take less than 0.2 seconds per query.
+ {'category': 'Unit Tests', 'expected_time_per_query': 0.2}, +]) +def test_normalization_rates(target_info, category_and_expected_times): + """ + This is being done in service of https://github.com/TranslatorSRI/NodeNormalization/issues/205 + + To ensure that we can handle 20 simultaneous queries within 10 seconds, we will run a set of + rows from the Google Sheet, and measure the rate at which we process those queries. + + :param target_info: The target_info object (really a config object). + """ + + category = category_and_expected_times['category'] + rows_to_test = list(filter(lambda row: row.Category == category, gsheet.test_rows)) + assert len(rows_to_test) > 0, f"Category '{category}' not found in Google Sheet {gsheet}." + + time_started = time.time_ns() + count_queries = 0 + for row in rows_to_test: + count_queries += test_normalization(target_info, row, lambda cat: True) + time_ended = time.time_ns() + time_taken = time_ended - time_started + time_taken_secs = float(time_taken) / 1e+9 + + time_per_test_row = time_taken_secs / len(rows_to_test) + time_per_query = time_taken_secs / count_queries + print(f"NodeNorm took {time_taken_secs:.3f} seconds to process {len(rows_to_test)} test rows " + + f"({time_per_test_row:.3f} seconds/test row, {time_per_query:.3f} seconds/query) on {target_info}") + + assert len(rows_to_test) > 20, f"Categories with fewer than twenty test rows are not likely to be representative." + assert count_queries > 20, f"Categories with fewer than twenty queries are not likely to be representative." + + if 'expected_time_per_query' in category_and_expected_times: + assert time_per_query < category_and_expected_times['expected_time_per_query'] From aaacd768cf5a9642094851103dbba71d9c7ebcd9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 2 Dec 2023 13:51:11 -0500 Subject: [PATCH 3/4] Added Slow Tests for NameRes. 
--- tests/nameres/test_nameres_from_gsheet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/nameres/test_nameres_from_gsheet.py b/tests/nameres/test_nameres_from_gsheet.py index baea458..bf1c73e 100644 --- a/tests/nameres/test_nameres_from_gsheet.py +++ b/tests/nameres/test_nameres_from_gsheet.py @@ -123,6 +123,7 @@ def test_label(target_info, test_row, test_category): @pytest.mark.parametrize("category_and_expected_times", [ # We expect unit tests to run in less than half a second each query and name. {'category': 'Unit Tests', 'expected_time_per_query': 0.5}, + {'category': 'Slow Tests', 'expected_time_per_query': 1}, ]) def test_query_rates(target_info, category_and_expected_times): """ From 15e2784d2be5d665ce5cb664b59cc08860e30127 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 3 Dec 2023 00:26:22 -0500 Subject: [PATCH 4/4] Commented out slow tests until we can fix them. --- tests/nameres/test_nameres_from_gsheet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nameres/test_nameres_from_gsheet.py b/tests/nameres/test_nameres_from_gsheet.py index bf1c73e..d1be851 100644 --- a/tests/nameres/test_nameres_from_gsheet.py +++ b/tests/nameres/test_nameres_from_gsheet.py @@ -123,7 +123,7 @@ def test_label(target_info, test_row, test_category): @pytest.mark.parametrize("category_and_expected_times", [ # We expect unit tests to run in less than half a second each query and name. {'category': 'Unit Tests', 'expected_time_per_query': 0.5}, - {'category': 'Slow Tests', 'expected_time_per_query': 1}, + # {'category': 'Slow Tests', 'expected_time_per_query': 1}, ]) def test_query_rates(target_info, category_and_expected_times): """