diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index a9e12b3be..7337b2858 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -6,9 +6,11 @@ # Standard import copy +import functools import json import os import pathlib +import sys from unittest import mock # Third Party @@ -29,6 +31,10 @@ ) from mellea.formatters.granite.base import util as base_util from mellea.formatters.granite.intrinsics import json_util, util as intrinsics_util +from mellea.formatters.granite.intrinsics.constants import ( + BASE_MODEL_TO_CANONICAL_NAME, + OLD_LAYOUT_REPOS, +) def _read_file(name): @@ -130,6 +136,13 @@ class YamlJsonCombo(pydantic.BaseModel): base_model_id: str = _DEFAULT_BASE_MODEL """Base model on which the target adapter was trained. Should be small enough to run on the CI server.""" + last_validated_commit: str | None = None + """Hugging Face commit SHA of the adapter subpath under which the canned outputs + for this case were last validated. Used by the per-test drift-aware xfail logic and + by ``test_adapter_versions_unchanged``. This is NOT the revision used to download + adapters — that remains the ``revision`` field (default ``"main"``). When the + adapter's subpath on Hugging Face moves past this SHA and the canned-output test + fails, the failure is converted to xfail. 
``None`` means drift checks are skipped.""" def _resolve_yaml(self): """ @@ -152,6 +165,7 @@ def _resolve_yaml(self): short_name="answerability_simple", inputs_file=_INPUT_JSON_DIR / "simple.json", task="answerability", + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( short_name="answerability_extra_params", @@ -163,17 +177,20 @@ def _resolve_yaml(self): short_name="answerability_answerable", inputs_file=_INPUT_JSON_DIR / "answerable.json", task="answerability", + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( short_name="answerability_answerable_alora", inputs_file=_INPUT_JSON_DIR / "answerable.json", task="answerability", is_alora=True, + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( short_name="answerability_unanswerable", inputs_file=_INPUT_JSON_DIR / "unanswerable.json", task="answerability", + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( short_name="instruction", @@ -186,16 +203,19 @@ def _resolve_yaml(self): short_name="hallucination_detection", inputs_file=_INPUT_JSON_DIR / "hallucination_detection.json", task="hallucination_detection", + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( short_name="query_clarification", inputs_file=_INPUT_JSON_DIR / "query_clarification.json", task="query_clarification", + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( short_name="query_rewrite", inputs_file=_INPUT_JSON_DIR / "query_rewrite.json", task="query_rewrite", + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( short_name="context_relevance", @@ -204,6 +224,7 @@ def _resolve_yaml(self): task="context_relevance", # No Granite 4.1 version of this adapter base_model_id="ibm-granite/granite-4.0-micro", + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( 
short_name="context_relevance_alora", @@ -213,11 +234,13 @@ def _resolve_yaml(self): is_alora=True, # No Granite 4.1 version of this adapter base_model_id="ibm-granite/granite-4.0-micro", + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( short_name="citations", inputs_file=_INPUT_JSON_DIR / "citations.json", task="citations", + last_validated_commit="450f37fe89519a7b39eb8bf4acab51f022164ac5", ), YamlJsonCombo( short_name="context-attribution", @@ -227,6 +250,37 @@ def _resolve_yaml(self): revision="c9c189f5ad0b2890660397070613fda46d6ceb80", # No Granite 4.1 version of this adapter at the selected Git commit base_model_id="ibm-granite/granite-4.0-micro", + last_validated_commit="065a365c8dae0a32b360e68353df3f8f8f1dbf8e", + ), + YamlJsonCombo( + short_name="requirement_check", + inputs_file=_INPUT_JSON_DIR / "requirement_check.json", + task="requirement-check", + repo_id="ibm-granite/granitelib-core-r1.0", + last_validated_commit="6b9a42d5e23364b3aca0ae334fbbea57c510623a", + ), + YamlJsonCombo( + short_name="requirement_check_alora", + inputs_file=_INPUT_JSON_DIR / "requirement_check.json", + task="requirement-check", + is_alora=True, + repo_id="ibm-granite/granitelib-core-r1.0", + last_validated_commit="6b9a42d5e23364b3aca0ae334fbbea57c510623a", + ), + YamlJsonCombo( + short_name="uncertainty", + inputs_file=_INPUT_JSON_DIR / "uncertainty.json", + task="uncertainty", + repo_id="ibm-granite/granitelib-core-r1.0", + last_validated_commit="6b9a42d5e23364b3aca0ae334fbbea57c510623a", + ), + YamlJsonCombo( + short_name="uncertainty_alora", + inputs_file=_INPUT_JSON_DIR / "uncertainty.json", + task="uncertainty", + is_alora=True, + repo_id="ibm-granite/granitelib-core-r1.0", + last_validated_commit="6b9a42d5e23364b3aca0ae334fbbea57c510623a", ), # gpt-oss-20b intrinsics (canned output tests only, no inference) YamlJsonCombo( @@ -235,6 +289,7 @@ def _resolve_yaml(self): task="answerability", 
def _adapter_subpath(cfg: YamlJsonCombo) -> str:
    """Build the path inside the Hugging Face repo that holds ``cfg``'s adapter.

    The path is made of three components: the task name, the canonical name of
    the base model, and the adapter kind (``"lora"`` or ``"alora"``). Repos
    listed in ``OLD_LAYOUT_REPOS`` use ``task/kind/model``; all other repos use
    ``task/model/kind``.

    This mirrors the layout logic in
    ``mellea.formatters.granite.intrinsics.util.obtain_lora()``.
    """
    kind = "lora"
    if cfg.is_alora:
        kind = "alora"

    # Base models without a registered canonical name are used verbatim.
    canonical = BASE_MODEL_TO_CANONICAL_NAME.get(cfg.base_model_id, cfg.base_model_id)

    if cfg.repo_id in OLD_LAYOUT_REPOS:
        middle, leaf = kind, canonical
    else:
        middle, leaf = canonical, kind
    return f"{cfg.task}/{middle}/{leaf}"
Tests that want to override this +# function (see ``test_xfail_if_drifted``) monkeypatch the *module attribute*, +# which rebinds the name to a stub — calls from ``_xfail_if_drifted`` then resolve +# to the stub regardless of any cached entries on the original. This works in +# practice but the interaction is subtle; don't rely on cache invalidation, rely +# on rebinding. +@functools.cache +def _last_commit_for_subpath(repo_id: str, subpath: str, revision: str) -> str | None: + """Return the SHA of the most recent commit on ``revision`` of ``repo_id`` that + modified any file under ``subpath``. + + Returns ``None`` if the lookup fails for any reason (network down, auth missing, + repo private, path not found) so drift checks degrade gracefully offline rather + than producing false failures. + """ + try: + # Third Party + import huggingface_hub + + info = huggingface_hub.HfApi().get_paths_info( + repo_id=repo_id, paths=[subpath], revision=revision, expand=True + ) + except Exception: + return None + if not info: + return None + last_commit = getattr(info[0], "last_commit", None) + if last_commit is None: + return None + return getattr(last_commit, "oid", None) + + +def _xfail_if_drifted(cfg: YamlJsonCombo) -> None: + """If ``cfg``'s adapter subpath has drifted from its recorded SHA, mark the + current test as xfail before any assertions run. + + No-op when ``cfg.last_validated_commit`` is ``None``, ``cfg.task`` is ``None``, + or the upstream SHA cannot be resolved (offline / auth missing) — without a + confirmed mismatch we let the test run normally. + + Drift detection alerts on *any* file under the adapter subpath changing — not + just adapter weights or ``io.yaml``. A README typo fix or a new image asset + will move the SHA and trip this guard even though no test would actually + fail. A drifted test that still passes will surface as XPASS. + """ + # ``cfg.task is None`` covers fake configs that don't correspond to a real + # adapter on Hugging Face (e.g. 
def _xfail_if_drifted(cfg: YamlJsonCombo) -> None:
    """Abort the calling test as xfail when ``cfg``'s adapter subpath has drifted
    from its recorded SHA.

    Call this at the top of a test, before any assertions. It is a no-op when
    ``cfg.last_validated_commit`` is ``None``, when ``cfg.task`` is ``None``, or
    when the upstream SHA cannot be resolved (offline / auth missing): without a
    confirmed mismatch the test runs normally.

    On a confirmed mismatch this calls ``pytest.xfail()``, which raises
    immediately. The rest of the calling test is therefore NOT executed and the
    outcome is always XFAIL; it can never be XPASS. To find out whether a drifted
    adapter still satisfies the canned outputs, bump ``last_validated_commit`` to
    the new SHA (the guard then becomes a no-op) and re-run the test.

    Drift detection is path-level: *any* file changing under the adapter subpath
    moves the SHA, not just adapter weights or ``io.yaml``. A README typo fix or a
    new image asset will trip this guard even though no test would actually fail.

    Args:
        cfg: Test case whose adapter location and recorded SHA are checked.

    Raises:
        pytest.xfail.Exception: When the upstream SHA was resolved and differs
            from ``cfg.last_validated_commit``.
    """
    # ``cfg.task is None`` covers fake configs that don't correspond to a real
    # adapter on Hugging Face (e.g. the ``instruction`` combo): there is nothing
    # to check drift against, so skip rather than error.
    if cfg.last_validated_commit is None or cfg.task is None:
        return

    adapter_subpath = _adapter_subpath(cfg)
    current = _last_commit_for_subpath(cfg.repo_id, adapter_subpath, cfg.revision)
    # ``None`` means "could not find out"; only a confirmed mismatch counts as drift.
    if current is None or current == cfg.last_validated_commit:
        return

    # Imperative xfail: raises here, so nothing after this call runs in the test.
    pytest.xfail(
        f"Adapter at {cfg.repo_id}/{adapter_subpath} drifted from "
        f"recorded {cfg.last_validated_commit[:8]} to {current[:8]}. The change "
        f"may be nonfunctional (e.g. a README edit). This test was NOT run: to "
        f"check, bump `last_validated_commit` to the new SHA and re-run. If it "
        f"passes, keep the bump. Otherwise refresh the canned outputs once the "
        f"new adapter is verified."
    )
@pytest.mark.skipif(
    int(os.environ.get("CICD", 0)) == 1,
    reason="Don't cause CICD pipelines to fail due to adapter version changes alone.",
)
@pytest.mark.huggingface
def test_adapter_versions_unchanged():
    """Fail when any tracked adapter subpath on Hugging Face has moved past the SHA
    recorded in ``YamlJsonCombo.last_validated_commit``.

    A failure here means the upstream intrinsic adapter directory has been touched.
    Detection is path-level: *any* file under the subpath bumps the SHA, including
    nonfunctional changes like README edits, image assets, or new metadata files.

    Triage (the tests guarded by ``_xfail_if_drifted`` are reported as xfail
    without running while the recorded SHA is stale, so the SHA must be bumped
    first):

    1. Update each drifted entry's ``last_validated_commit`` to the new SHA.
    2. Re-run the canned-output tests.
    3. If they pass, the change was nonfunctional: keep the new SHA.
    4. If they fail, regenerate the canned outputs and commit them together with
       the new SHA.

    Skips entries where ``last_validated_commit`` or ``task`` is ``None`` (e.g.
    fake configs with no real adapter). Silently tolerates network/auth failures
    (the lookup returns ``None``) so this test doesn't false-fail when run offline.
    """
    # One row per drifted combo: (short_name, repo_id, subpath, recorded, current).
    drifts: list[tuple[str, str, str, str, str]] = []
    for cfg in _YAML_JSON_COMBOS_LIST:
        if cfg.last_validated_commit is None or cfg.task is None:
            continue
        subpath = _adapter_subpath(cfg)
        current = _last_commit_for_subpath(cfg.repo_id, subpath, cfg.revision)
        # ``None`` = upstream unknown (offline, auth missing); not evidence of drift.
        if current is None or current == cfg.last_validated_commit:
            continue
        drifts.append(
            (cfg.short_name, cfg.repo_id, subpath, cfg.last_validated_commit, current)
        )

    if not drifts:
        return

    msg_lines = ["Adapter versions have drifted:"]
    msg_lines.extend(
        f"  {name}: {repo}/{sub} {old[:8]} -> {new[:8]}"
        for name, repo, sub, old, new in drifts
    )
    msg_lines.append(
        "\nDetection is path-level, so a drift may be nonfunctional (README "
        "edit, new image asset, etc.). To triage, update ``last_validated_commit`` "
        "to the new SHA and re-run the canned-output tests (while the recorded "
        "SHA is stale they are reported as xfail without running). If they pass, "
        "keep the new SHA. If they fail, the change was functional (e.g. in the "
        "adapter or the io.yaml): regenerate the canned outputs and commit them "
        "together with the new ``last_validated_commit``."
    )
    pytest.fail("\n".join(msg_lines))
# Placeholder 40-character SHAs; only their (in)equality matters to the test below.
_FAKE_RECORDED_SHA = "a" * 40
_FAKE_UPSTREAM_SHA = "b" * 40


@pytest.mark.parametrize(
    ("recorded", "upstream", "expect_xfail"),
    [
        # Recorded and upstream differ: the guard must xfail, and the reason has
        # to show both SHAs so the log tells a maintainer what changed.
        pytest.param(_FAKE_RECORDED_SHA, _FAKE_UPSTREAM_SHA, True, id="drift"),
        # Recorded equals upstream: the guard stays out of the way, so genuine
        # regressions show up as genuine failures.
        pytest.param(_FAKE_RECORDED_SHA, _FAKE_RECORDED_SHA, False, id="no_drift"),
        # Nothing recorded: the guard returns without looking upstream at all
        # (the stub below raises if it is ever called in this case).
        pytest.param(None, "should-not-query", False, id="none_recorded"),
        # Upstream lookup yields ``None`` (offline, auth missing): run normally.
        pytest.param(_FAKE_RECORDED_SHA, None, False, id="network_down"),
    ],
)
def test_xfail_if_drifted(monkeypatch, recorded, upstream, expect_xfail):
    """Check that ``_xfail_if_drifted`` raises xfail only for a confirmed mismatch.

    The upstream lookup is replaced with a stub returning ``upstream``; the guard
    must xfail when that value is resolved AND differs from ``recorded``, and must
    simply return in every other case.
    """

    def _fake_lookup(*_args, **_kwargs):
        if recorded is None:
            raise AssertionError(
                "_last_commit_for_subpath must not be queried when "
                "last_validated_commit is None"
            )
        return upstream

    # Rebind the module-level name so the guard's call-time lookup hits the stub.
    monkeypatch.setattr(
        sys.modules[__name__], "_last_commit_for_subpath", _fake_lookup
    )

    cfg = YamlJsonCombo(
        short_name="fake_combo",
        inputs_file=_INPUT_JSON_DIR / "simple.json",
        task="answerability",
        last_validated_commit=recorded,
    )

    if not expect_xfail:
        _xfail_if_drifted(cfg)  # must not raise
        return

    with pytest.raises(pytest.xfail.Exception) as raised:
        _xfail_if_drifted(cfg)
    reason = str(raised.value)
    # Both SHAs are non-None whenever xfail is reachable today; the ``None`` check
    # keeps future parametrize cases from crashing on the slice.
    for sha in (recorded, upstream):
        if sha is not None:
            assert sha[:8] in reason
Respond with a json {\"score\": \"yes\"} if the constraints are satisfied or respond with {\"score\": \"no\"} if the constraints are not satisfied.", + "role": "user" + } + ], + "extra_body": { + "documents": [ + { + "doc_id": "1", + "text": "\nHere's a list of math problems that are on the homework assignment:\n\n1) 12+5=17\n2) 20-8=12\n3) 6*7=42\n4) 45/9=5\n5) 13*2=26\n6) 99+11=110\n7) 100-35=65\n\n" + } + ] + } +} diff --git a/test/formatters/granite/testdata/input_json/uncertainty.json b/test/formatters/granite/testdata/input_json/uncertainty.json new file mode 100644 index 000000000..80da39296 --- /dev/null +++ b/test/formatters/granite/testdata/input_json/uncertainty.json @@ -0,0 +1,28 @@ +{ + "messages": [ + { + "role": "assistant", + "content": "Welcome to pet questions!" + }, + { + "role": "user", + "content": "Which of my pets have fleas?" + }, + { + "role": "assistant", + "content": "Based on the documents, your dog has fleas but your cat does not." + } + ], + "extra_body": { + "documents": [ + { + "doc_id": "1", + "text": "My dog has fleas." + }, + { + "doc_id": "2", + "text": "My cat does not have fleas." + } + ] + } +} diff --git a/test/formatters/granite/testdata/test_canned_input/requirement_check.json b/test/formatters/granite/testdata/test_canned_input/requirement_check.json new file mode 100644 index 000000000..a9647ae4e --- /dev/null +++ b/test/formatters/granite/testdata/test_canned_input/requirement_check.json @@ -0,0 +1,50 @@ +{ + "messages": [ + { + "content": "Hello there, welcome to math questions!", + "role": "assistant" + }, + { + "content": "What is the square root of 4?", + "role": "user" + }, + { + "content": "The square root of 4 is 2.", + "role": "assistant" + }, + { + "content": ": The user's question is not one of the homework questions given in the provided documents.\nPlease verify if the assistant's generation satisfies the user's requirements or not and reply with a binary label accordingly. 
Respond with a json {\"score\": \"yes\"} if the constraints are satisfied or respond with {\"score\": \"no\"} if the constraints are not satisfied.", + "role": "user" + } + ], + "extra_body": { + "documents": [ + { + "text": "\nHere's a list of math problems that are on the homework assignment:\n\n1) 12+5=17\n2) 20-8=12\n3) 6*7=42\n4) 45/9=5\n5) 13*2=26\n6) 99+11=110\n7) 100-35=65\n\n", + "doc_id": "1" + } + ], + "structured_outputs": { + "json": { + "type": "object", + "properties": { + "score": { + "type": "string", + "enum": [ + "yes", + "no" + ] + } + }, + "required": [ + "score" + ], + "additionalProperties": false + } + } + }, + "max_completion_tokens": 15, + "temperature": 0.0, + "logprobs": true, + "top_logprobs": 10 +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/uncertainty.json b/test/formatters/granite/testdata/test_canned_input/uncertainty.json new file mode 100644 index 000000000..58cb9e17b --- /dev/null +++ b/test/formatters/granite/testdata/test_canned_input/uncertainty.json @@ -0,0 +1,58 @@ +{ + "messages": [ + { + "content": "Welcome to pet questions!", + "role": "assistant" + }, + { + "content": "Which of my pets have fleas?", + "role": "user" + }, + { + "content": "Based on the documents, your dog has fleas but your cat does not.", + "role": "assistant" + } + ], + "extra_body": { + "documents": [ + { + "text": "My dog has fleas.", + "doc_id": "1" + }, + { + "text": "My cat does not have fleas.", + "doc_id": "2" + } + ], + "structured_outputs": { + "json": { + "type": "object", + "properties": { + "score": { + "type": "string", + "enum": [ + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9" + ] + } + }, + "required": [ + "score" + ], + "additionalProperties": false + } + } + }, + "max_completion_tokens": 15, + "temperature": 0.0, + "logprobs": true, + "top_logprobs": 10 +} \ No newline at end of file diff --git 
a/test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json b/test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json new file mode 100644 index 000000000..f36db8afa --- /dev/null +++ b/test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"requirement_check\": {\"score\": 0.4206108287116171}}", + "role": "assistant" + } + } + ] +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json b/test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json new file mode 100644 index 000000000..3571f5028 --- /dev/null +++ b/test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"certainty\": 0.8550706654633036}", + "role": "assistant" + } + } + ] +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json b/test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json new file mode 100644 index 000000000..f4b1e3494 --- /dev/null +++ b/test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json @@ -0,0 +1,554 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"score\": \"no\"}", + "role": "assistant" + }, + "logprobs": { + "content": [ + { + "token": "{\"", + "logprob": 0.0, + "bytes": [ + 123, + 34 + ], + "top_logprobs": [ + { + "token": "{\"", + "logprob": 0.0, + "bytes": [ + 123, + 34 + ] + }, + { + "token": "{\n", + "logprob": -21.467071533203125, + "bytes": [ + 123, + 10 + ] + }, + { + "token": "{", + "logprob": -21.603395462036133, + "bytes": [ + 123 + ] + }, + { + "token": "{\n\n", + "logprob": -26.860279083251953, + "bytes": [ + 123, + 
10, + 10 + ] + }, + { + "token": "{\n\n\n", + "logprob": -36.29634475708008, + "bytes": [ + 123, + 10, + 10, + 10 + ] + }, + { + "token": "'", + "logprob": -3.4028234663852886e38, + "bytes": [ + 39 + ] + }, + { + "token": "#", + "logprob": -3.4028234663852886e38, + "bytes": [ + 35 + ] + }, + { + "token": "&", + "logprob": -3.4028234663852886e38, + "bytes": [ + 38 + ] + }, + { + "token": "%", + "logprob": -3.4028234663852886e38, + "bytes": [ + 37 + ] + }, + { + "token": "*", + "logprob": -3.4028234663852886e38, + "bytes": [ + 42 + ] + } + ] + }, + { + "token": "score", + "logprob": 0.0, + "bytes": [ + 115, + 99, + 111, + 114, + 101 + ], + "top_logprobs": [ + { + "token": "score", + "logprob": 0.0, + "bytes": [ + 115, + 99, + 111, + 114, + 101 + ] + }, + { + "token": "sc", + "logprob": -20.93582534790039, + "bytes": [ + 115, + 99 + ] + }, + { + "token": "s", + "logprob": -27.3807315826416, + "bytes": [ + 115 + ] + }, + { + "token": "'", + "logprob": -3.4028234663852886e38, + "bytes": [ + 39 + ] + }, + { + "token": "#", + "logprob": -3.4028234663852886e38, + "bytes": [ + 35 + ] + }, + { + "token": "&", + "logprob": -3.4028234663852886e38, + "bytes": [ + 38 + ] + }, + { + "token": "%", + "logprob": -3.4028234663852886e38, + "bytes": [ + 37 + ] + }, + { + "token": "*", + "logprob": -3.4028234663852886e38, + "bytes": [ + 42 + ] + }, + { + "token": ")", + "logprob": -3.4028234663852886e38, + "bytes": [ + 41 + ] + }, + { + "token": "!", + "logprob": -3.4028234663852886e38, + "bytes": [ + 33 + ] + } + ] + }, + { + "token": "\":", + "logprob": 0.0, + "bytes": [ + 34, + 58 + ], + "top_logprobs": [ + { + "token": "\":", + "logprob": 0.0, + "bytes": [ + 34, + 58 + ] + }, + { + "token": "\":\"", + "logprob": -26.95718765258789, + "bytes": [ + 34, + 58, + 34 + ] + }, + { + "token": "\"", + "logprob": -28.592464447021484, + "bytes": [ + 34 + ] + }, + { + "token": "\":\n", + "logprob": -34.68193054199219, + "bytes": [ + 34, + 58, + 10 + ] + }, + { + "token": "\"\n\n", + "logprob": 
-35.96494674682617, + "bytes": [ + 34, + 10, + 10 + ] + }, + { + "token": "\"\n", + "logprob": -36.166988372802734, + "bytes": [ + 34, + 10 + ] + }, + { + "token": "\":\n\n", + "logprob": -37.52782440185547, + "bytes": [ + 34, + 58, + 10, + 10 + ] + }, + { + "token": "\"\n\n\n", + "logprob": -44.714290618896484, + "bytes": [ + 34, + 10, + 10, + 10 + ] + }, + { + "token": "\"\n\n\n\n", + "logprob": -45.71239471435547, + "bytes": [ + 34, + 10, + 10, + 10, + 10 + ] + }, + { + "token": "'", + "logprob": -3.4028234663852886e38, + "bytes": [ + 39 + ] + } + ] + }, + { + "token": " \"", + "logprob": 0.0, + "bytes": [ + 32, + 34 + ], + "top_logprobs": [ + { + "token": " \"", + "logprob": 0.0, + "bytes": [ + 32, + 34 + ] + }, + { + "token": " ", + "logprob": -22.438066482543945, + "bytes": [ + 32 + ] + }, + { + "token": "\"", + "logprob": -29.785236358642578, + "bytes": [ + 34 + ] + }, + { + "token": "\t", + "logprob": -32.06733703613281, + "bytes": [ + 9 + ] + }, + { + "token": " ", + "logprob": -32.360626220703125, + "bytes": [ + 32, + 32 + ] + }, + { + "token": " \n", + "logprob": -35.26121520996094, + "bytes": [ + 32, + 10 + ] + }, + { + "token": " ", + "logprob": -35.70950698852539, + "bytes": [ + 32, + 32, + 32 + ] + }, + { + "token": " ", + "logprob": -36.709312438964844, + "bytes": [ + 32, + 32, + 32, + 32 + ] + }, + { + "token": "\n\n", + "logprob": -37.2135009765625, + "bytes": [ + 10, + 10 + ] + }, + { + "token": " \n\n", + "logprob": -37.86414337158203, + "bytes": [ + 32, + 10, + 10 + ] + } + ] + }, + { + "token": "no", + "logprob": -0.5457810759544373, + "bytes": [ + 110, + 111 + ], + "top_logprobs": [ + { + "token": "no", + "logprob": -0.5457810759544373, + "bytes": [ + 110, + 111 + ] + }, + { + "token": "yes", + "logprob": -0.8660477995872498, + "bytes": [ + 121, + 101, + 115 + ] + }, + { + "token": "ye", + "logprob": -15.616496086120605, + "bytes": [ + 121, + 101 + ] + }, + { + "token": "n", + "logprob": -15.969169616699219, + "bytes": [ + 110 + ] + }, + { + 
"token": "y", + "logprob": -16.61600112915039, + "bytes": [ + 121 + ] + }, + { + "token": "'", + "logprob": -3.4028234663852886e38, + "bytes": [ + 39 + ] + }, + { + "token": "#", + "logprob": -3.4028234663852886e38, + "bytes": [ + 35 + ] + }, + { + "token": "&", + "logprob": -3.4028234663852886e38, + "bytes": [ + 38 + ] + }, + { + "token": "%", + "logprob": -3.4028234663852886e38, + "bytes": [ + 37 + ] + }, + { + "token": "*", + "logprob": -3.4028234663852886e38, + "bytes": [ + 42 + ] + } + ] + }, + { + "token": "\"}", + "logprob": 0.0, + "bytes": [ + 34, + 125 + ], + "top_logprobs": [ + { + "token": "\"}", + "logprob": 0.0, + "bytes": [ + 34, + 125 + ] + }, + { + "token": "\"", + "logprob": -24.70654296875, + "bytes": [ + 34 + ] + }, + { + "token": "\"\n", + "logprob": -31.599437713623047, + "bytes": [ + 34, + 10 + ] + }, + { + "token": "\"\n\n", + "logprob": -35.08838653564453, + "bytes": [ + 34, + 10, + 10 + ] + }, + { + "token": "\"\n\n\n", + "logprob": -38.5650634765625, + "bytes": [ + 34, + 10, + 10, + 10 + ] + }, + { + "token": "\"\n\n\n\n", + "logprob": -39.870243072509766, + "bytes": [ + 34, + 10, + 10, + 10, + 10 + ] + }, + { + "token": "'", + "logprob": -3.4028234663852886e38, + "bytes": [ + 39 + ] + }, + { + "token": "#", + "logprob": -3.4028234663852886e38, + "bytes": [ + 35 + ] + }, + { + "token": "&", + "logprob": -3.4028234663852886e38, + "bytes": [ + 38 + ] + }, + { + "token": "%", + "logprob": -3.4028234663852886e38, + "bytes": [ + 37 + ] + } + ] + } + ] + } + } + ] +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json b/test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json new file mode 100644 index 000000000..16f33279a --- /dev/null +++ b/test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json @@ -0,0 +1,553 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"score\": \"8\"}", + "role": "assistant" + }, 
+ "logprobs": { + "content": [ + { + "token": "{\"", + "logprob": 0.0, + "bytes": [ + 123, + 34 + ], + "top_logprobs": [ + { + "token": "{\"", + "logprob": 0.0, + "bytes": [ + 123, + 34 + ] + }, + { + "token": "{\n", + "logprob": -17.71022605895996, + "bytes": [ + 123, + 10 + ] + }, + { + "token": "{", + "logprob": -20.982749938964844, + "bytes": [ + 123 + ] + }, + { + "token": "{\n\n", + "logprob": -24.666601181030273, + "bytes": [ + 123, + 10, + 10 + ] + }, + { + "token": "{\n\n\n", + "logprob": -30.7917537689209, + "bytes": [ + 123, + 10, + 10, + 10 + ] + }, + { + "token": "'", + "logprob": -3.4028234663852886e38, + "bytes": [ + 39 + ] + }, + { + "token": "#", + "logprob": -3.4028234663852886e38, + "bytes": [ + 35 + ] + }, + { + "token": "&", + "logprob": -3.4028234663852886e38, + "bytes": [ + 38 + ] + }, + { + "token": "%", + "logprob": -3.4028234663852886e38, + "bytes": [ + 37 + ] + }, + { + "token": "*", + "logprob": -3.4028234663852886e38, + "bytes": [ + 42 + ] + } + ] + }, + { + "token": "score", + "logprob": 0.0, + "bytes": [ + 115, + 99, + 111, + 114, + 101 + ], + "top_logprobs": [ + { + "token": "score", + "logprob": 0.0, + "bytes": [ + 115, + 99, + 111, + 114, + 101 + ] + }, + { + "token": "s", + "logprob": -20.803585052490234, + "bytes": [ + 115 + ] + }, + { + "token": "sc", + "logprob": -23.3030948638916, + "bytes": [ + 115, + 99 + ] + }, + { + "token": "'", + "logprob": -3.4028234663852886e38, + "bytes": [ + 39 + ] + }, + { + "token": "#", + "logprob": -3.4028234663852886e38, + "bytes": [ + 35 + ] + }, + { + "token": "&", + "logprob": -3.4028234663852886e38, + "bytes": [ + 38 + ] + }, + { + "token": "%", + "logprob": -3.4028234663852886e38, + "bytes": [ + 37 + ] + }, + { + "token": "*", + "logprob": -3.4028234663852886e38, + "bytes": [ + 42 + ] + }, + { + "token": ")", + "logprob": -3.4028234663852886e38, + "bytes": [ + 41 + ] + }, + { + "token": "!", + "logprob": -3.4028234663852886e38, + "bytes": [ + 33 + ] + } + ] + }, + { + "token": "\":", + 
"logprob": 0.0, + "bytes": [ + 34, + 58 + ], + "top_logprobs": [ + { + "token": "\":", + "logprob": 0.0, + "bytes": [ + 34, + 58 + ] + }, + { + "token": "\":\"", + "logprob": -16.800575256347656, + "bytes": [ + 34, + 58, + 34 + ] + }, + { + "token": "\"", + "logprob": -19.040285110473633, + "bytes": [ + 34 + ] + }, + { + "token": "\":\n", + "logprob": -24.836816787719727, + "bytes": [ + 34, + 58, + 10 + ] + }, + { + "token": "\"\n", + "logprob": -27.759153366088867, + "bytes": [ + 34, + 10 + ] + }, + { + "token": "\":\n\n", + "logprob": -28.046525955200195, + "bytes": [ + 34, + 58, + 10, + 10 + ] + }, + { + "token": "\"\n\n", + "logprob": -28.138172149658203, + "bytes": [ + 34, + 10, + 10 + ] + }, + { + "token": "\"\n\n\n", + "logprob": -35.56932830810547, + "bytes": [ + 34, + 10, + 10, + 10 + ] + }, + { + "token": "\"\n\n\n\n", + "logprob": -38.376564025878906, + "bytes": [ + 34, + 10, + 10, + 10, + 10 + ] + }, + { + "token": "'", + "logprob": -3.4028234663852886e38, + "bytes": [ + 39 + ] + } + ] + }, + { + "token": " \"", + "logprob": 0.0, + "bytes": [ + 32, + 34 + ], + "top_logprobs": [ + { + "token": " \"", + "logprob": 0.0, + "bytes": [ + 32, + 34 + ] + }, + { + "token": " ", + "logprob": -24.842153549194336, + "bytes": [ + 32 + ] + }, + { + "token": "\"", + "logprob": -25.94455909729004, + "bytes": [ + 34 + ] + }, + { + "token": " \n", + "logprob": -27.855976104736328, + "bytes": [ + 32, + 10 + ] + }, + { + "token": " \n\n", + "logprob": -28.620742797851562, + "bytes": [ + 32, + 10, + 10 + ] + }, + { + "token": "\t", + "logprob": -29.597455978393555, + "bytes": [ + 9 + ] + }, + { + "token": " ", + "logprob": -31.807470321655273, + "bytes": [ + 32, + 32 + ] + }, + { + "token": " \n", + "logprob": -32.43643569946289, + "bytes": [ + 32, + 32, + 10 + ] + }, + { + "token": " ", + "logprob": -32.538841247558594, + "bytes": [ + 32, + 32, + 32 + ] + }, + { + "token": " ", + "logprob": -32.98236846923828, + "bytes": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32 + ] + } + 
] + }, + { + "token": "8", + "logprob": -0.06208917871117592, + "bytes": [ + 56 + ], + "top_logprobs": [ + { + "token": "8", + "logprob": -0.06208917871117592, + "bytes": [ + 56 + ] + }, + { + "token": "9", + "logprob": -2.8483803272247314, + "bytes": [ + 57 + ] + }, + { + "token": "4", + "logprob": -6.404694557189941, + "bytes": [ + 52 + ] + }, + { + "token": "7", + "logprob": -7.414227485656738, + "bytes": [ + 55 + ] + }, + { + "token": "6", + "logprob": -12.079367637634277, + "bytes": [ + 54 + ] + }, + { + "token": "5", + "logprob": -14.447489738464355, + "bytes": [ + 53 + ] + }, + { + "token": "3", + "logprob": -15.777976989746094, + "bytes": [ + 51 + ] + }, + { + "token": "2", + "logprob": -18.722196578979492, + "bytes": [ + 50 + ] + }, + { + "token": "0", + "logprob": -19.81536865234375, + "bytes": [ + 48 + ] + }, + { + "token": "1", + "logprob": -21.15589714050293, + "bytes": [ + 49 + ] + } + ] + }, + { + "token": "\"}", + "logprob": 0.0, + "bytes": [ + 34, + 125 + ], + "top_logprobs": [ + { + "token": "\"}", + "logprob": 0.0, + "bytes": [ + 34, + 125 + ] + }, + { + "token": "\"", + "logprob": -26.619150161743164, + "bytes": [ + 34 + ] + }, + { + "token": "\"\n", + "logprob": -38.56539535522461, + "bytes": [ + 34, + 10 + ] + }, + { + "token": "\"\n\n", + "logprob": -41.54514694213867, + "bytes": [ + 34, + 10, + 10 + ] + }, + { + "token": "\"\n\n\n\n", + "logprob": -47.137237548828125, + "bytes": [ + 34, + 10, + 10, + 10, + 10 + ] + }, + { + "token": "\"\n\n\n", + "logprob": -48.34684753417969, + "bytes": [ + 34, + 10, + 10, + 10 + ] + }, + { + "token": "'", + "logprob": -3.4028234663852886e38, + "bytes": [ + 39 + ] + }, + { + "token": "#", + "logprob": -3.4028234663852886e38, + "bytes": [ + 35 + ] + }, + { + "token": "&", + "logprob": -3.4028234663852886e38, + "bytes": [ + 38 + ] + }, + { + "token": "%", + "logprob": -3.4028234663852886e38, + "bytes": [ + 37 + ] + } + ] + } + ] + } + } + ] +} \ No newline at end of file diff --git 
a/test/formatters/granite/testdata/test_run_transformers/requirement_check.json b/test/formatters/granite/testdata/test_run_transformers/requirement_check.json new file mode 100644 index 000000000..f36db8afa --- /dev/null +++ b/test/formatters/granite/testdata/test_run_transformers/requirement_check.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"requirement_check\": {\"score\": 0.4206108287116171}}", + "role": "assistant" + } + } + ] +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json b/test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json new file mode 100644 index 000000000..267490083 --- /dev/null +++ b/test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"requirement_check\": {\"score\": 0.2185103906492881}}", + "role": "assistant" + } + } + ] +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/uncertainty.json b/test/formatters/granite/testdata/test_run_transformers/uncertainty.json new file mode 100644 index 000000000..3571f5028 --- /dev/null +++ b/test/formatters/granite/testdata/test_run_transformers/uncertainty.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"certainty\": 0.8550706654633036}", + "role": "assistant" + } + } + ] +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json b/test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json new file mode 100644 index 000000000..d2c0e5e26 --- /dev/null +++ b/test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"certainty\": 0.07628673620716085}", + "role": "assistant" + 
} + } + ] +} \ No newline at end of file