From 09a535f3efcac2d717d79f8c0e61ac69ed64e061 Mon Sep 17 00:00:00 2001 From: encordrob Date: Mon, 25 May 2026 11:19:48 +0100 Subject: [PATCH 1/2] Add per-segment transcript support for audio objects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Audio objects use a text attribute (display name containing "#transcript") whose content is split into per-segment actions with frame ranges, even though the ontology marks the attribute dynamic=false. The SDK previously routed every transcript action through set_answer_from_list, which overwrote the single static slot for the attribute — so get_answer(transcript_attr) returned whichever segment was processed last and there was no public way to read per-segment text with timestamps. This change: - Stashes raw transcript actions (identified by the "#transcript" marker in the attribute name) on the ObjectInstance instead of folding them into the static answer map. - Adds ObjectInstance.get_transcripts(attribute=None) returning a sorted list of TranscriptSegment(range, text, feature_hash, attribute_name) records. - Makes get_answer(transcript_attr) reconstruct the joined transcript text from the stashed actions on demand, so the SDK does not depend on the backend pre-joining transcripts into object_answers.classifications. - Re-emits stashed transcript actions verbatim in _to_object_actions so a read → modify → save round-trip preserves per-segment data. - Makes set_answer_from_list silently skip transcript entries, as a defensive guard against any other parse path inadvertently clobbering the static map.
--- encord/objects/__init__.py | 2 + encord/objects/ontology_labels_impl.py | 55 +-- encord/objects/ontology_object_instance.py | 75 ++++ encord/objects/transcript.py | 47 +++ .../data/all_types_ontology_structure.py | 52 +++ tests/objects/test_transcripts.py | 324 ++++++++++++++++++ 6 files changed, 534 insertions(+), 21 deletions(-) create mode 100644 encord/objects/transcript.py create mode 100644 tests/objects/test_transcripts.py diff --git a/encord/objects/__init__.py b/encord/objects/__init__.py index 3e6ff5881..379a1af6c 100644 --- a/encord/objects/__init__.py +++ b/encord/objects/__init__.py @@ -8,6 +8,7 @@ from encord.objects.ontology_object_instance import AnswerForFrames, ObjectInstance from encord.objects.ontology_structure import OntologyStructure from encord.objects.options import FlatOption, NestableOption, Option +from encord.objects.transcript import TranscriptSegment __all__ = [ "AnswerForFrames", @@ -26,4 +27,5 @@ "RadioAttribute", "Shape", "TextAttribute", + "TranscriptSegment", ] diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py index e4a688537..d1f3377a7 100644 --- a/encord/objects/ontology_labels_impl.py +++ b/encord/objects/ontology_labels_impl.py @@ -100,6 +100,7 @@ from encord.objects.metadata import DataGroupMetadata, DICOMSeriesMetadata, DICOMSliceMetadata from encord.objects.ontology_object import Object from encord.objects.ontology_object_instance import ObjectInstance +from encord.objects.transcript import is_transcript_raw_entry from encord.objects.ontology_structure import OntologyStructure from encord.objects.spaces.annotation.base_annotation import ( _AnnotationMetadata, @@ -2190,11 +2191,12 @@ def _to_object_answers(self) -> Dict[str, Any]: def _to_object_actions(self) -> Dict[str, ObjectAction]: ret: Dict[str, Any] = {} for obj in self._objects_map.values(): - all_static_answers = self._dynamic_answers_to_encord_dict(obj) - if len(all_static_answers) == 0: + actions = 
list(reversed(self._dynamic_answers_to_encord_dict(obj))) + actions.extend(obj._transcript_actions) + if not actions: continue ret[obj.object_hash] = { - "actions": list(reversed(all_static_answers)), + "actions": actions, "objectHash": obj.object_hash, } @@ -2202,18 +2204,19 @@ def _to_object_actions(self) -> Dict[str, ObjectAction]: for obj in space._objects_map.values(): # Currently, dynamic attributes only available for VideoSpace if isinstance(space, VideoSpace): - all_static_answers = self._dynamic_answers_to_encord_dict(obj) - if len(all_static_answers) == 0: - continue - - if obj.object_hash in ret: - # The same object might still exist across object hashes - continue - else: - ret[obj.object_hash] = { - "actions": list(all_static_answers), - "objectHash": obj.object_hash, - } + actions = list(self._dynamic_answers_to_encord_dict(obj)) + else: + actions = [] + actions.extend(obj._transcript_actions) + if not actions: + continue + if obj.object_hash in ret: + # The same object might still exist across object hashes + continue + ret[obj.object_hash] = { + "actions": actions, + "objectHash": obj.object_hash, + } return ret def _to_classification_answers(self) -> Dict[str, ClassificationAnswer]: @@ -3059,19 +3062,29 @@ def _add_objects_answers(self, object_answers: dict): def _add_action_answers(self, label_row_dict: dict): for answer in label_row_dict["object_actions"].values(): object_hash = answer["objectHash"] - object_instance = self._objects_map.get(object_hash) answer_list = answer["actions"] - if object_instance is not None: - object_instance.set_answer_from_list(answer_list) - else: + + target_object = self._objects_map.get(object_hash) + if target_object is None: # Not great that we're looping through spaces, but usually not that many spaces on a label row - answer_list = answer["actions"] for space in self._space_map.values(): object_on_space = space._objects_map.get(object_hash) if object_on_space is not None: - 
object_on_space.set_answer_from_list(answers_list=answer_list) + target_object = object_on_space break + if target_object is None: + continue + + non_transcript_actions: List[Any] = [] + for raw_action in answer_list: + if is_transcript_raw_entry(raw_action): + target_object._add_transcript_action(raw_action) + else: + non_transcript_actions.append(raw_action) + if non_transcript_actions: + target_object.set_answer_from_list(non_transcript_actions) + def _create_new_object_instance(self, frame_object_label: FrameObject, frame: int) -> ObjectInstance: ontology = self._ontology.structure feature_hash = frame_object_label["featureHash"] diff --git a/encord/objects/ontology_object_instance.py b/encord/objects/ontology_object_instance.py index 8f717b64c..2a17acc36 100644 --- a/encord/objects/ontology_object_instance.py +++ b/encord/objects/ontology_object_instance.py @@ -65,6 +65,7 @@ from encord.objects.spaces.annotation.base_annotation import _AnnotationData, _AnnotationMetadata, _ObjectAnnotation from encord.objects.spaces.annotation.geometric_annotation import _GeometricAnnotationData from encord.objects.spaces.annotation.range_annotation import _RangeObjectAnnotationData +from encord.objects.transcript import TranscriptSegment, is_transcript_attribute, is_transcript_raw_entry from encord.objects.types import ( AnswerDict, AttributeDict, @@ -91,6 +92,11 @@ def __init__(self, ontology_object: Object, *, object_hash: Optional[str] = None self._dynamic_answer_manager = DynamicAnswerManager(self) + # Raw transcript action dicts as they were parsed from the label row. + # Kept opaque so we can round-trip them on save without losing fields like + # trackHash, manualAnnotation, etc. See encord.objects.transcript. 
+ self._transcript_actions: List[Dict[str, Any]] = [] + # Only used for non-frame entities self._non_geometric = ontology_object.shape in (Shape.AUDIO, Shape.TEXT) @@ -249,6 +255,17 @@ def get_answer( if attribute.dynamic: return self._dynamic_answer_manager.get_answer(attribute, filter_answer, filter_frame) + if is_transcript_attribute(attribute): + # Transcripts are not stored in _static_answer_map; reconstruct the + # joined string from the per-segment actions on demand. + segments = [ + s for s in self._iter_transcript_segments() if s.feature_hash == attribute.feature_node_hash + ] + if not segments: + return None + segments.sort(key=lambda s: s.range[0]) + return "\n".join(s.text for s in segments) + static_answer = self._static_answer_map[attribute.feature_node_hash] if not static_answer.is_answered(): @@ -324,6 +341,12 @@ def set_answer_from_list(self, answers_list: List[AttributeDict]) -> None: grouped_answers = defaultdict(list) for answer_dict in answers_list: + # Transcript entries (text + range) don't fit the static-answer model; they + # are managed via _transcript_actions and get_transcripts() / get_answer(). + # Silently skip them here so any caller (parse path, space parser, customer + # code) can't accidentally clobber the static answer map. + if is_transcript_raw_entry(answer_dict): + continue feature_hash = answer_dict["featureHash"] attribute = _get_attribute_by_hash(feature_hash, self._ontology_object.attributes) if attribute is None: @@ -353,6 +376,57 @@ def set_answer_from_list(self, answers_list: List[AttributeDict]) -> None: assert attribute # we already checked that attribute is not null above. So just silencing this for now self._set_answer_from_grouped_list(attribute, answers_list) + def get_transcripts( + self, + attribute: Optional[TextAttribute] = None, + ) -> List[TranscriptSegment]: + """Return per-segment transcripts for this object, sorted by start frame. + + Each entry is one frame-range within one transcript action. 
An action with + multiple sub-ranges produces multiple ``TranscriptSegment`` entries that share + the same text. + + Args: + attribute: If provided, return only segments for this transcript attribute. + If ``None``, return all transcript segments across all transcript + attributes on this object. + + Returns: + A list of ``TranscriptSegment`` ordered by ``range[0]``. + """ + segments = list(self._iter_transcript_segments()) + if attribute is not None: + segments = [s for s in segments if s.feature_hash == attribute.feature_node_hash] + segments.sort(key=lambda s: s.range[0]) + return segments + + def _iter_transcript_segments(self) -> Iterable[TranscriptSegment]: + for raw in self._transcript_actions: + text = raw.get("answers") + if not isinstance(text, str): + continue + feature_hash = raw.get("featureHash") + name = raw.get("name", "") + if not isinstance(feature_hash, str): + continue + for sub_range in raw.get("range", []) or []: + if not isinstance(sub_range, (list, tuple)) or len(sub_range) < 2: + continue + yield TranscriptSegment( + range=(int(sub_range[0]), int(sub_range[1])), + text=text, + feature_hash=feature_hash, + attribute_name=name, + ) + + def _add_transcript_action(self, raw: Dict[str, Any]) -> None: + """Stash a raw transcript action dict for later read and serialisation. + + This is an internal hook used by the label row parser; customers should not + call it directly. 
+ """ + self._transcript_actions.append(raw) + @staticmethod def _merge_answers_to_non_overlapping_ranges(ranges: List[Tuple[Range, Set[str]]]) -> List[Tuple[Range, Set[str]]]: ranges.sort(key=lambda x: x[0].start) @@ -678,6 +752,7 @@ def copy(self) -> ObjectInstance: ret._frames_to_instance_data = deepcopy(self._frames_to_instance_data) ret._static_answer_map = deepcopy(self._static_answer_map) ret._dynamic_answer_manager = self._dynamic_answer_manager.copy() + ret._transcript_actions = deepcopy(self._transcript_actions) return ret def get_annotations(self) -> List[Annotation]: diff --git a/encord/objects/transcript.py b/encord/objects/transcript.py new file mode 100644 index 000000000..b4ced4f9a --- /dev/null +++ b/encord/objects/transcript.py @@ -0,0 +1,47 @@ +"""--- +title: "Objects - Transcripts" +slug: "sdk-ref-objects-transcript" +hidden: false +metadata: + title: "Objects - Transcripts" + description: "Encord SDK Objects - Transcript segment data class." +category: "64e481b57b6027003f20aaa0" +--- +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Mapping, Tuple + +from encord.objects.attributes import Attribute + +# Marker substring used in the ontology attribute display name to identify +# transcript-bearing text attributes. There is no schema flag for this yet; +# the marker is the only signal the FE and SDK agree on. +TRANSCRIPT_NAME_MARKER = "#transcript" + + +@dataclass(frozen=True) +class TranscriptSegment: + """One playback segment of a transcript on an ObjectInstance. + + A transcript-bearing attribute on an object may have multiple segments, + each with its own frame range and text. A single segment with multiple + sub-ranges in the underlying label is unrolled into one TranscriptSegment + per sub-range, all sharing the same text. 
+ """ + + range: Tuple[int, int] + text: str + feature_hash: str + attribute_name: str + + +def is_transcript_attribute(attribute: Attribute) -> bool: + return TRANSCRIPT_NAME_MARKER in attribute.name + + +def is_transcript_raw_entry(entry: Mapping[str, Any]) -> bool: + name = entry.get("name") + return isinstance(name, str) and TRANSCRIPT_NAME_MARKER in name diff --git a/tests/objects/data/all_types_ontology_structure.py b/tests/objects/data/all_types_ontology_structure.py index 3257edf05..a49e8950f 100644 --- a/tests/objects/data/all_types_ontology_structure.py +++ b/tests/objects/data/all_types_ontology_structure.py @@ -151,6 +151,57 @@ attributes=[], ) +AUDIO_TRANSCRIPT_OBJECT = Object( + uid=14, + name="audio transcript object", + color="#A4FF00", + shape=Shape.AUDIO, + feature_node_hash="audioTranscriptObjectHash", + archived=False, + attributes=[ + TextAttribute( + uid=[14, 1], + feature_node_hash="captionTranscriptHash", + name="Caption #transcript", + required=False, + dynamic=False, + archived=False, + ), + TextAttribute( + uid=[14, 2], + feature_node_hash="speakerTranscriptHash", + name="Speaker #transcript", + required=False, + dynamic=False, + archived=False, + ), + ChecklistAttribute( + uid=[14, 3], + feature_node_hash="audioMoodChecklistHash", + name="Mood", + required=False, + dynamic=True, + archived=False, + options=[ + FlatOption( + uid=[14, 3, 1], + feature_node_hash="audioMoodHappy", + label="Happy", + value="happy", + archived=False, + ), + FlatOption( + uid=[14, 3, 2], + feature_node_hash="audioMoodSad", + label="Sad", + value="sad", + archived=False, + ), + ], + ), + ], +) + all_types_structure = OntologyStructure( objects=[ Object( @@ -416,6 +467,7 @@ AUDIO_OBJECT_1, AUDIO_OBJECT_2, AUDIO_OBJECT_3, + AUDIO_TRANSCRIPT_OBJECT, TEXT_OBJECT, CUBOID_2D_OBJECT, Object( diff --git a/tests/objects/test_transcripts.py b/tests/objects/test_transcripts.py new file mode 100644 index 000000000..4e3ec0c23 --- /dev/null +++ 
b/tests/objects/test_transcripts.py @@ -0,0 +1,324 @@ +from copy import deepcopy +from dataclasses import asdict +from unittest.mock import Mock + +import pytest + +from encord.objects import ( + LabelRowV2, + ObjectInstance, + TranscriptSegment, +) +from encord.objects.attributes import ChecklistAttribute, TextAttribute +from encord.objects.ontology_object import Object +from encord.objects.options import Option +from encord.objects.utils import _lower_snake_case +from encord.orm.label_row import LabelRowMetadata +from tests.objects.common import BASE_LABEL_ROW_METADATA +from tests.objects.data.all_types_ontology_structure import AUDIO_TRANSCRIPT_OBJECT, all_types_structure +from tests.objects.objects_test_utils import validate_label_row_serialisation + +audio_transcript_object: Object = all_types_structure.get_child_by_hash( + AUDIO_TRANSCRIPT_OBJECT.feature_node_hash, type_=Object +) +caption_attr: TextAttribute = all_types_structure.get_child_by_hash("captionTranscriptHash", type_=TextAttribute) +speaker_attr: TextAttribute = all_types_structure.get_child_by_hash("speakerTranscriptHash", type_=TextAttribute) +mood_attr: ChecklistAttribute = all_types_structure.get_child_by_hash( + "audioMoodChecklistHash", type_=ChecklistAttribute +) +mood_happy: Option = all_types_structure.get_child_by_hash("audioMoodHappy", type_=Option) + + +OBJECT_HASH = "TrAnSc1234" + + +def _action(feature_hash: str, name: str, text: str, ranges): + """Construct a raw object_action dict mimicking what the backend serves.""" + return { + "name": name, + "value": _lower_snake_case(name), + "answers": text, + "featureHash": feature_hash, + "manualAnnotation": True, + "dynamic": True, + "range": [list(r) for r in ranges], + "shouldPropagate": False, + "trackHash": f"track-{text}", + } + + +def _audio_label_dict_with_actions(actions, joined_classifications=None): + """Build an audio label_row dict carrying transcript actions for a single object.""" + classifications = 
list(joined_classifications) if joined_classifications else [] + return { + "label_hash": "0aea5ac7-cbc0-4451-a242-e22445d2c9fa", + "branch_name": "main", + "created_at": "Thu, 09 Feb 2023 14:12:03 UTC", + "last_edited_at": "Thu, 09 Feb 2023 14:12:03 UTC", + "data_hash": "aaa6bc82-9f89-4545-adbb-f271bf28cf99", + "dataset_hash": "b02ba3d9-883b-4c5e-ba09-751072ccfc57", + "dataset_title": "Audio Dataset", + "data_title": "sample-audio.mp3", + "data_type": "audio", + "annotation_task_status": "QUEUED", + "is_shadow_data": False, + "object_answers": { + OBJECT_HASH: { + "classifications": classifications, + "objectHash": OBJECT_HASH, + "range": [[0, 300]], + "createdBy": "user1", + "createdAt": "Thu, 05 Dec 2024 15:24:19 UTC", + "lastEditedBy": "user1", + "lastEditedAt": "Thu, 05 Dec 2024 15:24:44 UTC", + "manualAnnotation": True, + "featureHash": audio_transcript_object.feature_node_hash, + "name": audio_transcript_object.name, + "color": audio_transcript_object.color, + "shape": "audio", + "value": _lower_snake_case(audio_transcript_object.name), + } + }, + "classification_answers": {}, + "object_actions": { + OBJECT_HASH: { + "actions": list(actions), + "objectHash": OBJECT_HASH, + } + }, + "label_status": "LABELLED", + "spaces": {}, + "data_units": { + "cd53f484-c9ab-4fd1-9c14-5b34d4e42ba2": { + "data_hash": "cd53f484-c9ab-4fd1-9c14-5b34d4e42ba2", + "data_title": "sample-audio.mp3", + "data_link": "audio-link", + "data_type": "audio/mpeg", + "data_sequence": 0, + "audio_codec": "mp3", + "audio_sample_rate": 44100, + "audio_bit_depth": 8, + "audio_num_channels": 2, + "labels": {}, + "data_duration": 100, + } + }, + } + + +def _audio_label_row(all_types_ontology) -> LabelRowV2: + metadata = asdict(BASE_LABEL_ROW_METADATA) + metadata["frames_per_second"] = 1000 + metadata["data_type"] = "AUDIO" + metadata["number_of_frames"] = metadata["duration"] * metadata["frames_per_second"] + return LabelRowV2(LabelRowMetadata(**metadata), Mock(), all_types_ontology) + + +def 
_get_obj(label_row: LabelRowV2) -> ObjectInstance: + objects = label_row.get_object_instances() + assert len(objects) == 1 + return objects[0] + + +def test_get_answer_returns_joined_transcript_from_actions(all_types_ontology): + """Even when classifications is empty, get_answer joins the per-segment actions.""" + label_dict = _audio_label_dict_with_actions( + actions=[ + _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]), + ] + ) + label_row = _audio_label_row(all_types_ontology) + label_row.from_labels_dict(label_dict) + + obj = _get_obj(label_row) + assert obj.get_answer(caption_attr) == "Hello world\nHow are you\nGoodbye" + + +def test_get_answer_is_not_overwritten_by_last_action(all_types_ontology): + """Actions delivered in any order yield the same playback-ordered joined string.""" + label_dict = _audio_label_dict_with_actions( + actions=[ + _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]), + ] + ) + label_row = _audio_label_row(all_types_ontology) + label_row.from_labels_dict(label_dict) + + obj = _get_obj(label_row) + assert obj.get_answer(caption_attr) == "Hello world\nHow are you\nGoodbye" + + +def test_get_answer_ignores_backend_joined_classification(all_types_ontology): + """The classifications mirror is informational; actions are the source of truth.""" + label_dict = _audio_label_dict_with_actions( + actions=[ + _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]), + ], + joined_classifications=[ + { + "name": 
caption_attr.name, + "value": _lower_snake_case(caption_attr.name), + # Deliberately wrong / stale joined value so we catch any code path + # that reads from classifications instead of from actions. + "answers": "this should be ignored", + "featureHash": caption_attr.feature_node_hash, + "manualAnnotation": True, + } + ], + ) + label_row = _audio_label_row(all_types_ontology) + label_row.from_labels_dict(label_dict) + + obj = _get_obj(label_row) + assert obj.get_answer(caption_attr) == "Hello world\nGoodbye" + + +def test_get_transcripts_returns_segments_in_playback_order(all_types_ontology): + label_dict = _audio_label_dict_with_actions( + actions=[ + _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]), + ] + ) + label_row = _audio_label_row(all_types_ontology) + label_row.from_labels_dict(label_dict) + + obj = _get_obj(label_row) + segments = obj.get_transcripts() + assert [s.text for s in segments] == ["Hello world", "How are you", "Goodbye"] + assert [s.range for s in segments] == [(0, 50), (51, 100), (200, 250)] + assert all(isinstance(s, TranscriptSegment) for s in segments) + assert all(s.feature_hash == caption_attr.feature_node_hash for s in segments) + assert all(s.attribute_name == caption_attr.name for s in segments) + + +def test_get_transcripts_filters_by_attribute(all_types_ontology): + label_dict = _audio_label_dict_with_actions( + actions=[ + _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]), + _action(speaker_attr.feature_node_hash, speaker_attr.name, "Alice", [(0, 50)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]), + _action(speaker_attr.feature_node_hash, speaker_attr.name, "Bob", [(51, 100)]), + ] + ) + label_row = _audio_label_row(all_types_ontology) + 
label_row.from_labels_dict(label_dict) + + obj = _get_obj(label_row) + captions = obj.get_transcripts(caption_attr) + speakers = obj.get_transcripts(speaker_attr) + assert [s.text for s in captions] == ["Hello world", "How are you"] + assert [s.text for s in speakers] == ["Alice", "Bob"] + + # No filter returns all segments across both attributes. + all_segments = obj.get_transcripts() + assert len(all_segments) == 4 + + # And per-attribute get_answer joins are independent. + assert obj.get_answer(caption_attr) == "Hello world\nHow are you" + assert obj.get_answer(speaker_attr) == "Alice\nBob" + + +def test_get_transcripts_returns_empty_when_no_actions(all_types_ontology): + label_dict = _audio_label_dict_with_actions(actions=[]) + label_row = _audio_label_row(all_types_ontology) + label_row.from_labels_dict(label_dict) + + obj = _get_obj(label_row) + assert obj.get_transcripts() == [] + assert obj.get_answer(caption_attr) is None + + +def test_get_transcripts_unrolls_multi_sub_range_action(all_types_ontology): + """One action with two sub-ranges produces two TranscriptSegment entries with the same text.""" + label_dict = _audio_label_dict_with_actions( + actions=[ + _action(caption_attr.feature_node_hash, caption_attr.name, "repeated", [(0, 50), (100, 150)]), + ] + ) + label_row = _audio_label_row(all_types_ontology) + label_row.from_labels_dict(label_dict) + + obj = _get_obj(label_row) + segments = obj.get_transcripts() + assert len(segments) == 2 + assert [s.range for s in segments] == [(0, 50), (100, 150)] + assert all(s.text == "repeated" for s in segments) + + +def test_non_transcript_dynamic_attributes_still_work_on_transcript_object(all_types_ontology): + """A non-transcript dynamic attribute on the same object should be unaffected by the partition.""" + mood_action = { + "name": mood_attr.name, + "value": _lower_snake_case(mood_attr.name), + "answers": [ + { + "name": mood_happy.label, + "value": mood_happy.value, + "featureHash": 
mood_happy.feature_node_hash, + } + ], + "featureHash": mood_attr.feature_node_hash, + "manualAnnotation": True, + "dynamic": True, + "range": [[0, 100]], + "shouldPropagate": False, + "trackHash": "mood-track", + } + label_dict = _audio_label_dict_with_actions( + actions=[ + _action(caption_attr.feature_node_hash, caption_attr.name, "Hello", [(0, 100)]), + mood_action, + ] + ) + label_row = _audio_label_row(all_types_ontology) + label_row.from_labels_dict(label_dict) + + obj = _get_obj(label_row) + assert obj.get_answer(caption_attr) == "Hello" + + mood_answers = obj.get_answer(mood_attr) + # Dynamic checklist returns AnswersForFrames; check the one we set. + assert len(mood_answers) == 1 + assert mood_answers[0].answer == [mood_happy] + + +def test_round_trip_preserves_transcript_actions(all_types_ontology): + original_actions = [ + _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]), + _action(speaker_attr.feature_node_hash, speaker_attr.name, "Alice", [(0, 50)]), + ] + label_dict = _audio_label_dict_with_actions(actions=deepcopy(original_actions)) + label_row = _audio_label_row(all_types_ontology) + label_row.from_labels_dict(label_dict) + + serialised = label_row.to_encord_dict() + out_actions = serialised["object_actions"][OBJECT_HASH]["actions"] + + # Order is not guaranteed (we append in stash order), so compare as sorted by text+range. 
+ def key(a): + return (a["featureHash"], a["answers"], tuple(tuple(r) for r in a["range"])) + + assert sorted(out_actions, key=key) == sorted(original_actions, key=key) + + +def test_full_serialisation_roundtrip_with_transcripts(all_types_ontology): + """End-to-end: load → save → reload → save again gives identical output.""" + label_dict = _audio_label_dict_with_actions( + actions=[ + _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]), + _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]), + _action(speaker_attr.feature_node_hash, speaker_attr.name, "Alice", [(0, 250)]), + ] + ) + label_row = _audio_label_row(all_types_ontology) + label_row.from_labels_dict(label_dict) + + validate_label_row_serialisation(label_row) From 13b1f70e8d68207f2c5c55ddcbd946693b2565a5 Mon Sep 17 00:00:00 2001 From: encordrob Date: Mon, 25 May 2026 11:25:11 +0100 Subject: [PATCH 2/2] Fix lint and format on transcript changes --- encord/objects/ontology_labels_impl.py | 4 ++-- encord/objects/ontology_object_instance.py | 4 +--- tests/objects/test_transcripts.py | 2 -- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py index d1f3377a7..2729a9a3d 100644 --- a/encord/objects/ontology_labels_impl.py +++ b/encord/objects/ontology_labels_impl.py @@ -100,7 +100,6 @@ from encord.objects.metadata import DataGroupMetadata, DICOMSeriesMetadata, DICOMSliceMetadata from encord.objects.ontology_object import Object from encord.objects.ontology_object_instance import ObjectInstance -from encord.objects.transcript import is_transcript_raw_entry from encord.objects.ontology_structure import OntologyStructure from encord.objects.spaces.annotation.base_annotation import ( _AnnotationMetadata, @@ -117,6 +116,7 @@ from encord.objects.spaces.range_space.point_cloud_space import PointCloudFileSpace from encord.objects.spaces.range_space.text_space import 
TextSpace from encord.objects.spaces.types import ChildInfo, SpaceInfo +from encord.objects.transcript import is_transcript_raw_entry from encord.objects.types import ( AttributeDict, BaseFrameObject, @@ -2191,7 +2191,7 @@ def _to_object_answers(self) -> Dict[str, Any]: def _to_object_actions(self) -> Dict[str, ObjectAction]: ret: Dict[str, Any] = {} for obj in self._objects_map.values(): - actions = list(reversed(self._dynamic_answers_to_encord_dict(obj))) + actions: List[Any] = list(reversed(self._dynamic_answers_to_encord_dict(obj))) actions.extend(obj._transcript_actions) if not actions: continue diff --git a/encord/objects/ontology_object_instance.py b/encord/objects/ontology_object_instance.py index 2a17acc36..7e36ebaff 100644 --- a/encord/objects/ontology_object_instance.py +++ b/encord/objects/ontology_object_instance.py @@ -258,9 +258,7 @@ def get_answer( if is_transcript_attribute(attribute): # Transcripts are not stored in _static_answer_map; reconstruct the # joined string from the per-segment actions on demand. - segments = [ - s for s in self._iter_transcript_segments() if s.feature_hash == attribute.feature_node_hash - ] + segments = [s for s in self._iter_transcript_segments() if s.feature_hash == attribute.feature_node_hash] if not segments: return None segments.sort(key=lambda s: s.range[0]) diff --git a/tests/objects/test_transcripts.py b/tests/objects/test_transcripts.py index 4e3ec0c23..f1d07c65f 100644 --- a/tests/objects/test_transcripts.py +++ b/tests/objects/test_transcripts.py @@ -2,8 +2,6 @@ from dataclasses import asdict from unittest.mock import Mock -import pytest - from encord.objects import ( LabelRowV2, ObjectInstance,