From 09a535f3efcac2d717d79f8c0e61ac69ed64e061 Mon Sep 17 00:00:00 2001
From: encordrob <robert@encord.com>
Date: Mon, 25 May 2026 11:19:48 +0100
Subject: [PATCH 1/2] Add per-segment transcript support for audio objects
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Audio objects use a text attribute (display name containing "#transcript") whose
content is split into per-segment actions with frame ranges, even though the
ontology marks the attribute dynamic=false. The SDK previously routed every
transcript action through set_answer_from_list, which overwrote the single static
slot for the attribute — so get_answer(transcript_attr) returned whichever
segment was processed last and there was no public way to read per-segment text
with timestamps.

This change:

- Stashes raw transcript actions on the ObjectInstance instead of folding them
  into the static answer map, identified by the "#transcript" marker in the
  attribute name.
- Adds ObjectInstance.get_transcripts(attribute=None), which returns a list of
  TranscriptSegment(range, text, feature_hash, attribute_name) records sorted by
  each segment's start (range[0]).
- Makes get_answer(transcript_attr) reconstruct the joined transcript text from
  the stashed actions on demand, so the SDK does not depend on the backend
  pre-joining transcripts into object_answers.classifications.
- Re-emits stashed transcript actions verbatim in _to_object_actions so a read →
  modify → save round-trip preserves per-segment data.
- Makes set_answer_from_list silently skip transcript entries, as a defensive
  guard against any other parse path inadvertently clobbering the static map.
---
 encord/objects/__init__.py                    |   2 +
 encord/objects/ontology_labels_impl.py        |  55 +--
 encord/objects/ontology_object_instance.py    |  75 ++++
 encord/objects/transcript.py                  |  47 +++
 .../data/all_types_ontology_structure.py      |  52 +++
 tests/objects/test_transcripts.py             | 324 ++++++++++++++++++
 6 files changed, 534 insertions(+), 21 deletions(-)
 create mode 100644 encord/objects/transcript.py
 create mode 100644 tests/objects/test_transcripts.py

diff --git a/encord/objects/__init__.py b/encord/objects/__init__.py
index 3e6ff5881..379a1af6c 100644
--- a/encord/objects/__init__.py
+++ b/encord/objects/__init__.py
@@ -8,6 +8,7 @@
 from encord.objects.ontology_object_instance import AnswerForFrames, ObjectInstance
 from encord.objects.ontology_structure import OntologyStructure
 from encord.objects.options import FlatOption, NestableOption, Option
+from encord.objects.transcript import TranscriptSegment
 
 __all__ = [
     "AnswerForFrames",
@@ -26,4 +27,5 @@
     "RadioAttribute",
     "Shape",
     "TextAttribute",
+    "TranscriptSegment",
 ]
diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py
index e4a688537..d1f3377a7 100644
--- a/encord/objects/ontology_labels_impl.py
+++ b/encord/objects/ontology_labels_impl.py
@@ -100,6 +100,7 @@
 from encord.objects.metadata import DataGroupMetadata, DICOMSeriesMetadata, DICOMSliceMetadata
 from encord.objects.ontology_object import Object
 from encord.objects.ontology_object_instance import ObjectInstance
+from encord.objects.transcript import is_transcript_raw_entry
 from encord.objects.ontology_structure import OntologyStructure
 from encord.objects.spaces.annotation.base_annotation import (
     _AnnotationMetadata,
@@ -2190,11 +2191,12 @@ def _to_object_answers(self) -> Dict[str, Any]:
     def _to_object_actions(self) -> Dict[str, ObjectAction]:
         ret: Dict[str, Any] = {}
         for obj in self._objects_map.values():
-            all_static_answers = self._dynamic_answers_to_encord_dict(obj)
-            if len(all_static_answers) == 0:
+            actions = list(reversed(self._dynamic_answers_to_encord_dict(obj)))
+            actions.extend(obj._transcript_actions)
+            if not actions:
                 continue
             ret[obj.object_hash] = {
-                "actions": list(reversed(all_static_answers)),
+                "actions": actions,
                 "objectHash": obj.object_hash,
             }
 
@@ -2202,18 +2204,19 @@ def _to_object_actions(self) -> Dict[str, ObjectAction]:
             for obj in space._objects_map.values():
                 # Currently, dynamic attributes only available for VideoSpace
                 if isinstance(space, VideoSpace):
-                    all_static_answers = self._dynamic_answers_to_encord_dict(obj)
-                    if len(all_static_answers) == 0:
-                        continue
-
-                    if obj.object_hash in ret:
-                        # The same object might still exist across object hashes
-                        continue
-                    else:
-                        ret[obj.object_hash] = {
-                            "actions": list(all_static_answers),
-                            "objectHash": obj.object_hash,
-                        }
+                    actions = list(self._dynamic_answers_to_encord_dict(obj))
+                else:
+                    actions = []
+                actions.extend(obj._transcript_actions)
+                if not actions:
+                    continue
+                if obj.object_hash in ret:
+                    # The same object might still exist across object hashes
+                    continue
+                ret[obj.object_hash] = {
+                    "actions": actions,
+                    "objectHash": obj.object_hash,
+                }
         return ret
 
     def _to_classification_answers(self) -> Dict[str, ClassificationAnswer]:
@@ -3059,19 +3062,29 @@ def _add_objects_answers(self, object_answers: dict):
     def _add_action_answers(self, label_row_dict: dict):
         for answer in label_row_dict["object_actions"].values():
             object_hash = answer["objectHash"]
-            object_instance = self._objects_map.get(object_hash)
             answer_list = answer["actions"]
-            if object_instance is not None:
-                object_instance.set_answer_from_list(answer_list)
-            else:
+
+            target_object = self._objects_map.get(object_hash)
+            if target_object is None:
                 # Not great that we're looping through spaces, but usually not that many spaces on a label row
-                answer_list = answer["actions"]
                 for space in self._space_map.values():
                     object_on_space = space._objects_map.get(object_hash)
                     if object_on_space is not None:
-                        object_on_space.set_answer_from_list(answers_list=answer_list)
+                        target_object = object_on_space
                         break
 
+            if target_object is None:
+                continue
+
+            non_transcript_actions: List[Any] = []
+            for raw_action in answer_list:
+                if is_transcript_raw_entry(raw_action):
+                    target_object._add_transcript_action(raw_action)
+                else:
+                    non_transcript_actions.append(raw_action)
+            if non_transcript_actions:
+                target_object.set_answer_from_list(non_transcript_actions)
+
     def _create_new_object_instance(self, frame_object_label: FrameObject, frame: int) -> ObjectInstance:
         ontology = self._ontology.structure
         feature_hash = frame_object_label["featureHash"]
diff --git a/encord/objects/ontology_object_instance.py b/encord/objects/ontology_object_instance.py
index 8f717b64c..2a17acc36 100644
--- a/encord/objects/ontology_object_instance.py
+++ b/encord/objects/ontology_object_instance.py
@@ -65,6 +65,7 @@
 from encord.objects.spaces.annotation.base_annotation import _AnnotationData, _AnnotationMetadata, _ObjectAnnotation
 from encord.objects.spaces.annotation.geometric_annotation import _GeometricAnnotationData
 from encord.objects.spaces.annotation.range_annotation import _RangeObjectAnnotationData
+from encord.objects.transcript import TranscriptSegment, is_transcript_attribute, is_transcript_raw_entry
 from encord.objects.types import (
     AnswerDict,
     AttributeDict,
@@ -91,6 +92,11 @@ def __init__(self, ontology_object: Object, *, object_hash: Optional[str] = None
 
         self._dynamic_answer_manager = DynamicAnswerManager(self)
 
+        # Raw transcript action dicts as they were parsed from the label row.
+        # Kept opaque so we can round-trip them on save without losing fields like
+        # trackHash, manualAnnotation, etc. See encord.objects.transcript.
+        self._transcript_actions: List[Dict[str, Any]] = []
+
         # Only used for non-frame entities
         self._non_geometric = ontology_object.shape in (Shape.AUDIO, Shape.TEXT)
 
@@ -249,6 +255,17 @@ def get_answer(
         if attribute.dynamic:
             return self._dynamic_answer_manager.get_answer(attribute, filter_answer, filter_frame)
 
+        if is_transcript_attribute(attribute):
+            # Transcripts are not stored in _static_answer_map; reconstruct the
+            # joined string from the per-segment actions on demand.
+            segments = [
+                s for s in self._iter_transcript_segments() if s.feature_hash == attribute.feature_node_hash
+            ]
+            if not segments:
+                return None
+            segments.sort(key=lambda s: s.range[0])
+            return "\n".join(s.text for s in segments)
+
         static_answer = self._static_answer_map[attribute.feature_node_hash]
 
         if not static_answer.is_answered():
@@ -324,6 +341,12 @@ def set_answer_from_list(self, answers_list: List[AttributeDict]) -> None:
         grouped_answers = defaultdict(list)
 
         for answer_dict in answers_list:
+            # Transcript entries (text + range) don't fit the static-answer model; they
+            # are managed via _transcript_actions and get_transcripts() / get_answer().
+            # Silently skip them here so any caller (parse path, space parser, customer
+            # code) can't accidentally clobber the static answer map.
+            if is_transcript_raw_entry(answer_dict):
+                continue
             feature_hash = answer_dict["featureHash"]
             attribute = _get_attribute_by_hash(feature_hash, self._ontology_object.attributes)
             if attribute is None:
@@ -353,6 +376,57 @@ def set_answer_from_list(self, answers_list: List[AttributeDict]) -> None:
             assert attribute  # we already checked that attribute is not null above. So just silencing this for now
             self._set_answer_from_grouped_list(attribute, answers_list)
 
+    def get_transcripts(
+        self,
+        attribute: Optional[TextAttribute] = None,
+    ) -> List[TranscriptSegment]:
+        """List the transcript segments stashed on this object, ordered by ``range[0]``.
+
+        A stashed transcript action yields one ``TranscriptSegment`` per sub-range in
+        its ``range`` list, so an action spanning several sub-ranges appears several
+        times with identical text.
+
+        Args:
+            attribute: Optional transcript attribute to filter on. When given, only
+                segments whose ``feature_hash`` equals ``attribute.feature_node_hash``
+                are kept; when ``None``, segments of every transcript attribute are kept.
+
+        Returns:
+            A new list of ``TranscriptSegment``, sorted ascending by ``range[0]``.
+        """
+        found: Iterable[TranscriptSegment] = self._iter_transcript_segments()
+        if attribute is not None:
+            wanted_hash = attribute.feature_node_hash
+            found = (seg for seg in found if seg.feature_hash == wanted_hash)
+        return sorted(found, key=lambda seg: seg.range[0])
+
+    def _iter_transcript_segments(self) -> Iterable[TranscriptSegment]:
+        """Yield a ``TranscriptSegment`` per sub-range of each stashed action, skipping malformed entries."""
+        for action in self._transcript_actions:
+            answer, node_hash = action.get("answers"), action.get("featureHash")
+            if not (isinstance(answer, str) and isinstance(node_hash, str)):
+                continue
+            label = action.get("name", "")
+            # A missing or null "range" is treated as having no sub-ranges.
+            for bounds in action.get("range") or []:
+                if not isinstance(bounds, (list, tuple)) or len(bounds) < 2:
+                    continue
+                start, end = int(bounds[0]), int(bounds[1])
+                yield TranscriptSegment(
+                    range=(start, end),
+                    text=answer,
+                    feature_hash=node_hash,
+                    attribute_name=label,
+                )
+
+    def _add_transcript_action(self, raw: Dict[str, Any]) -> None:
+        """Stash a raw transcript action dict for later read and serialisation.
+
+        The dict is appended by reference (not copied or validated). This is an
+        internal hook used by the label row parser; customers should not call it.
+        """
+        self._transcript_actions.append(raw)
+
     @staticmethod
     def _merge_answers_to_non_overlapping_ranges(ranges: List[Tuple[Range, Set[str]]]) -> List[Tuple[Range, Set[str]]]:
         ranges.sort(key=lambda x: x[0].start)
@@ -678,6 +752,7 @@ def copy(self) -> ObjectInstance:
         ret._frames_to_instance_data = deepcopy(self._frames_to_instance_data)
         ret._static_answer_map = deepcopy(self._static_answer_map)
         ret._dynamic_answer_manager = self._dynamic_answer_manager.copy()
+        ret._transcript_actions = deepcopy(self._transcript_actions)
         return ret
 
     def get_annotations(self) -> List[Annotation]:
diff --git a/encord/objects/transcript.py b/encord/objects/transcript.py
new file mode 100644
index 000000000..b4ced4f9a
--- /dev/null
+++ b/encord/objects/transcript.py
@@ -0,0 +1,47 @@
+"""---
+title: "Objects - Transcripts"
+slug: "sdk-ref-objects-transcript"
+hidden: false
+metadata:
+  title: "Objects - Transcripts"
+  description: "Encord SDK Objects - Transcript segment data class."
+category: "64e481b57b6027003f20aaa0"
+---
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Mapping, Tuple
+
+from encord.objects.attributes import Attribute
+
+# Marker substring used in the ontology attribute display name to identify
+# transcript-bearing text attributes. There is no schema flag for this yet;
+# the marker is the only signal the front end and the SDK agree on.
+TRANSCRIPT_NAME_MARKER = "#transcript"
+
+
+@dataclass(frozen=True)
+class TranscriptSegment:
+    """One playback segment of a transcript on an ObjectInstance.
+
+    A transcript-bearing attribute on an object may have multiple segments,
+    each with its own frame range and text. A single action with multiple
+    sub-ranges in the underlying label is unrolled into one TranscriptSegment
+    per sub-range, all sharing the same text.
+    """
+
+    range: Tuple[int, int]  # first two values of one sub-range, cast to int
+    text: str  # the action's "answers" string
+    feature_hash: str  # the action's "featureHash"
+    attribute_name: str  # the action's "name" ("" when absent)
+
+
+def is_transcript_attribute(attribute: Attribute) -> bool:
+    return TRANSCRIPT_NAME_MARKER in attribute.name  # substring match on the name; attribute type is not checked
+
+
+def is_transcript_raw_entry(entry: Mapping[str, Any]) -> bool:
+    name = entry.get("name")  # a missing or non-string name never matches
+    return isinstance(name, str) and TRANSCRIPT_NAME_MARKER in name
diff --git a/tests/objects/data/all_types_ontology_structure.py b/tests/objects/data/all_types_ontology_structure.py
index 3257edf05..a49e8950f 100644
--- a/tests/objects/data/all_types_ontology_structure.py
+++ b/tests/objects/data/all_types_ontology_structure.py
@@ -151,6 +151,57 @@
     attributes=[],
 )
 
+AUDIO_TRANSCRIPT_OBJECT = Object(
+    uid=14,
+    name="audio transcript object",
+    color="#A4FF00",
+    shape=Shape.AUDIO,
+    feature_node_hash="audioTranscriptObjectHash",
+    archived=False,
+    attributes=[
+        TextAttribute(
+            uid=[14, 1],
+            feature_node_hash="captionTranscriptHash",
+            name="Caption #transcript",  # "#transcript" in the name marks a transcript attribute
+            required=False,
+            dynamic=False,  # static in the ontology even though its actions carry ranges
+            archived=False,
+        ),
+        TextAttribute(
+            uid=[14, 2],
+            feature_node_hash="speakerTranscriptHash",
+            name="Speaker #transcript",  # second transcript attribute, used to test per-attribute filtering
+            required=False,
+            dynamic=False,
+            archived=False,
+        ),
+        ChecklistAttribute(
+            uid=[14, 3],
+            feature_node_hash="audioMoodChecklistHash",
+            name="Mood",  # no marker: an ordinary (non-transcript) attribute
+            required=False,
+            dynamic=True,
+            archived=False,
+            options=[
+                FlatOption(
+                    uid=[14, 3, 1],
+                    feature_node_hash="audioMoodHappy",
+                    label="Happy",
+                    value="happy",
+                    archived=False,
+                ),
+                FlatOption(
+                    uid=[14, 3, 2],
+                    feature_node_hash="audioMoodSad",
+                    label="Sad",
+                    value="sad",
+                    archived=False,
+                ),
+            ],
+        ),
+    ],
+)
+
 all_types_structure = OntologyStructure(
     objects=[
         Object(
@@ -416,6 +467,7 @@
         AUDIO_OBJECT_1,
         AUDIO_OBJECT_2,
         AUDIO_OBJECT_3,
+        AUDIO_TRANSCRIPT_OBJECT,
         TEXT_OBJECT,
         CUBOID_2D_OBJECT,
         Object(
diff --git a/tests/objects/test_transcripts.py b/tests/objects/test_transcripts.py
new file mode 100644
index 000000000..4e3ec0c23
--- /dev/null
+++ b/tests/objects/test_transcripts.py
@@ -0,0 +1,324 @@
+from copy import deepcopy
+from dataclasses import asdict
+from unittest.mock import Mock
+
+import pytest
+
+from encord.objects import (
+    LabelRowV2,
+    ObjectInstance,
+    TranscriptSegment,
+)
+from encord.objects.attributes import ChecklistAttribute, TextAttribute
+from encord.objects.ontology_object import Object
+from encord.objects.options import Option
+from encord.objects.utils import _lower_snake_case
+from encord.orm.label_row import LabelRowMetadata
+from tests.objects.common import BASE_LABEL_ROW_METADATA
+from tests.objects.data.all_types_ontology_structure import AUDIO_TRANSCRIPT_OBJECT, all_types_structure
+from tests.objects.objects_test_utils import validate_label_row_serialisation
+
+audio_transcript_object: Object = all_types_structure.get_child_by_hash(
+    AUDIO_TRANSCRIPT_OBJECT.feature_node_hash, type_=Object
+)
+caption_attr: TextAttribute = all_types_structure.get_child_by_hash("captionTranscriptHash", type_=TextAttribute)
+speaker_attr: TextAttribute = all_types_structure.get_child_by_hash("speakerTranscriptHash", type_=TextAttribute)
+mood_attr: ChecklistAttribute = all_types_structure.get_child_by_hash(
+    "audioMoodChecklistHash", type_=ChecklistAttribute
+)
+mood_happy: Option = all_types_structure.get_child_by_hash("audioMoodHappy", type_=Option)
+
+
+OBJECT_HASH = "TrAnSc1234"
+
+
+def _action(feature_hash: str, name: str, text: str, ranges):
+    """Construct a raw object_action dict (``text`` over ``ranges``) mimicking what the backend serves."""
+    return {
+        "name": name,
+        "value": _lower_snake_case(name),
+        "answers": text,
+        "featureHash": feature_hash,
+        "manualAnnotation": True,
+        "dynamic": True,  # set on the action even for attributes the ontology declares dynamic=False
+        "range": [list(r) for r in ranges],  # each pair is stored as a list
+        "shouldPropagate": False,
+        "trackHash": f"track-{text}",
+    }
+
+
+def _audio_label_dict_with_actions(actions, joined_classifications=None):
+    """Build an audio label_row dict for one object carrying ``actions`` plus optional ``joined_classifications``."""
+    classifications = list(joined_classifications) if joined_classifications else []
+    return {
+        "label_hash": "0aea5ac7-cbc0-4451-a242-e22445d2c9fa",
+        "branch_name": "main",
+        "created_at": "Thu, 09 Feb 2023 14:12:03 UTC",
+        "last_edited_at": "Thu, 09 Feb 2023 14:12:03 UTC",
+        "data_hash": "aaa6bc82-9f89-4545-adbb-f271bf28cf99",
+        "dataset_hash": "b02ba3d9-883b-4c5e-ba09-751072ccfc57",
+        "dataset_title": "Audio Dataset",
+        "data_title": "sample-audio.mp3",
+        "data_type": "audio",
+        "annotation_task_status": "QUEUED",
+        "is_shadow_data": False,
+        "object_answers": {
+            OBJECT_HASH: {
+                "classifications": classifications,  # empty unless joined_classifications is given
+                "objectHash": OBJECT_HASH,
+                "range": [[0, 300]],
+                "createdBy": "user1",
+                "createdAt": "Thu, 05 Dec 2024 15:24:19 UTC",
+                "lastEditedBy": "user1",
+                "lastEditedAt": "Thu, 05 Dec 2024 15:24:44 UTC",
+                "manualAnnotation": True,
+                "featureHash": audio_transcript_object.feature_node_hash,
+                "name": audio_transcript_object.name,
+                "color": audio_transcript_object.color,
+                "shape": "audio",
+                "value": _lower_snake_case(audio_transcript_object.name),
+            }
+        },
+        "classification_answers": {},
+        "object_actions": {
+            OBJECT_HASH: {
+                "actions": list(actions),  # shallow copy of the caller's list
+                "objectHash": OBJECT_HASH,
+            }
+        },
+        "label_status": "LABELLED",
+        "spaces": {},
+        "data_units": {
+            "cd53f484-c9ab-4fd1-9c14-5b34d4e42ba2": {
+                "data_hash": "cd53f484-c9ab-4fd1-9c14-5b34d4e42ba2",
+                "data_title": "sample-audio.mp3",
+                "data_link": "audio-link",
+                "data_type": "audio/mpeg",
+                "data_sequence": 0,
+                "audio_codec": "mp3",
+                "audio_sample_rate": 44100,
+                "audio_bit_depth": 8,
+                "audio_num_channels": 2,
+                "labels": {},
+                "data_duration": 100,
+            }
+        },
+    }
+
+
+def _audio_label_row(all_types_ontology) -> LabelRowV2:
+    metadata = asdict(BASE_LABEL_ROW_METADATA)  # plain-dict copy of the shared base metadata
+    metadata["frames_per_second"] = 1000
+    metadata["data_type"] = "AUDIO"
+    metadata["number_of_frames"] = metadata["duration"] * metadata["frames_per_second"]  # duration x fps
+    return LabelRowV2(LabelRowMetadata(**metadata), Mock(), all_types_ontology)
+
+
+def _get_obj(label_row: LabelRowV2) -> ObjectInstance:
+    objects = label_row.get_object_instances()
+    assert len(objects) == 1  # the label dicts built in this module carry exactly one object
+    return objects[0]
+
+
+def test_get_answer_returns_joined_transcript_from_actions(all_types_ontology):
+    """With empty classifications, get_answer returns the per-segment texts joined with newlines, in range order."""
+    label_dict = _audio_label_dict_with_actions(
+        actions=[
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]),
+            _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]),
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]),
+        ]
+    )
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    obj = _get_obj(label_row)
+    assert obj.get_answer(caption_attr) == "Hello world\nHow are you\nGoodbye"
+
+
+def test_get_answer_is_not_overwritten_by_last_action(all_types_ontology):
+    """Actions delivered out of order are still joined in range[0] order, not left as the last action's text."""
+    label_dict = _audio_label_dict_with_actions(
+        actions=[
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]),
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]),
+            _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]),
+        ]
+    )
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    obj = _get_obj(label_row)
+    assert obj.get_answer(caption_attr) == "Hello world\nHow are you\nGoodbye"
+
+
+def test_get_answer_ignores_backend_joined_classification(all_types_ontology):
+    """A joined value in the object's classifications is ignored; get_answer is rebuilt from the actions."""
+    label_dict = _audio_label_dict_with_actions(
+        actions=[
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]),
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]),
+        ],
+        joined_classifications=[
+            {
+                "name": caption_attr.name,
+                "value": _lower_snake_case(caption_attr.name),
+                # Deliberately wrong / stale joined value so we catch any code path
+                # that reads from classifications instead of from actions.
+                "answers": "this should be ignored",
+                "featureHash": caption_attr.feature_node_hash,
+                "manualAnnotation": True,
+            }
+        ],
+    )
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    obj = _get_obj(label_row)
+    assert obj.get_answer(caption_attr) == "Hello world\nGoodbye"
+
+
+def test_get_transcripts_returns_segments_in_playback_order(all_types_ontology):
+    label_dict = _audio_label_dict_with_actions(
+        actions=[  # deliberately not in playback order
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]),
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]),
+            _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]),
+        ]
+    )
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    obj = _get_obj(label_row)
+    segments = obj.get_transcripts()  # no attribute filter; result is sorted by range[0]
+    assert [s.text for s in segments] == ["Hello world", "How are you", "Goodbye"]
+    assert [s.range for s in segments] == [(0, 50), (51, 100), (200, 250)]  # ranges come back as tuples
+    assert all(isinstance(s, TranscriptSegment) for s in segments)
+    assert all(s.feature_hash == caption_attr.feature_node_hash for s in segments)
+    assert all(s.attribute_name == caption_attr.name for s in segments)
+
+
+def test_get_transcripts_filters_by_attribute(all_types_ontology):
+    label_dict = _audio_label_dict_with_actions(
+        actions=[  # caption and speaker actions interleaved over the same ranges
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]),
+            _action(speaker_attr.feature_node_hash, speaker_attr.name, "Alice", [(0, 50)]),
+            _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]),
+            _action(speaker_attr.feature_node_hash, speaker_attr.name, "Bob", [(51, 100)]),
+        ]
+    )
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    obj = _get_obj(label_row)
+    captions = obj.get_transcripts(caption_attr)
+    speakers = obj.get_transcripts(speaker_attr)
+    assert [s.text for s in captions] == ["Hello world", "How are you"]
+    assert [s.text for s in speakers] == ["Alice", "Bob"]
+
+    # Without a filter, the segments of both attributes are returned.
+    all_segments = obj.get_transcripts()
+    assert len(all_segments) == 4
+
+    # get_answer joins each attribute's segments separately.
+    assert obj.get_answer(caption_attr) == "Hello world\nHow are you"
+    assert obj.get_answer(speaker_attr) == "Alice\nBob"
+
+
+def test_get_transcripts_returns_empty_when_no_actions(all_types_ontology):
+    label_dict = _audio_label_dict_with_actions(actions=[])
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    obj = _get_obj(label_row)
+    assert obj.get_transcripts() == []
+    assert obj.get_answer(caption_attr) is None  # no segments gives None, not an empty string
+
+
+def test_get_transcripts_unrolls_multi_sub_range_action(all_types_ontology):
+    """An action whose ``range`` holds two sub-ranges yields two TranscriptSegment entries sharing its text."""
+    label_dict = _audio_label_dict_with_actions(
+        actions=[
+            _action(caption_attr.feature_node_hash, caption_attr.name, "repeated", [(0, 50), (100, 150)]),
+        ]
+    )
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    obj = _get_obj(label_row)
+    segments = obj.get_transcripts()
+    assert len(segments) == 2
+    assert [s.range for s in segments] == [(0, 50), (100, 150)]
+    assert all(s.text == "repeated" for s in segments)
+
+
+def test_non_transcript_dynamic_attributes_still_work_on_transcript_object(all_types_ontology):
+    """A dynamic checklist action mixed with transcript actions on one object is still readable via get_answer."""
+    mood_action = {
+        "name": mood_attr.name,
+        "value": _lower_snake_case(mood_attr.name),
+        "answers": [
+            {
+                "name": mood_happy.label,
+                "value": mood_happy.value,
+                "featureHash": mood_happy.feature_node_hash,
+            }
+        ],
+        "featureHash": mood_attr.feature_node_hash,
+        "manualAnnotation": True,
+        "dynamic": True,
+        "range": [[0, 100]],
+        "shouldPropagate": False,
+        "trackHash": "mood-track",
+    }
+    label_dict = _audio_label_dict_with_actions(
+        actions=[
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Hello", [(0, 100)]),
+            mood_action,
+        ]
+    )
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    obj = _get_obj(label_row)
+    assert obj.get_answer(caption_attr) == "Hello"
+
+    mood_answers = obj.get_answer(mood_attr)
+    # The dynamic checklist answer comes back as a list; check the single entry we set.
+    assert len(mood_answers) == 1
+    assert mood_answers[0].answer == [mood_happy]
+
+
+def test_round_trip_preserves_transcript_actions(all_types_ontology):
+    original_actions = [
+        _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]),
+        _action(caption_attr.feature_node_hash, caption_attr.name, "How are you", [(51, 100)]),
+        _action(speaker_attr.feature_node_hash, speaker_attr.name, "Alice", [(0, 50)]),
+    ]
+    label_dict = _audio_label_dict_with_actions(actions=deepcopy(original_actions))
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    serialised = label_row.to_encord_dict()
+    out_actions = serialised["object_actions"][OBJECT_HASH]["actions"]
+
+    # Order is not guaranteed (we append in stash order), so compare sorted by (featureHash, answers, range).
+    def key(a):
+        return (a["featureHash"], a["answers"], tuple(tuple(r) for r in a["range"]))
+
+    assert sorted(out_actions, key=key) == sorted(original_actions, key=key)
+
+
+def test_full_serialisation_roundtrip_with_transcripts(all_types_ontology):
+    """End-to-end: load → save → reload → save again gives identical output."""
+    label_dict = _audio_label_dict_with_actions(
+        actions=[
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Hello world", [(0, 50)]),
+            _action(caption_attr.feature_node_hash, caption_attr.name, "Goodbye", [(200, 250)]),
+            _action(speaker_attr.feature_node_hash, speaker_attr.name, "Alice", [(0, 250)]),  # spans both captions
+        ]
+    )
+    label_row = _audio_label_row(all_types_ontology)
+    label_row.from_labels_dict(label_dict)
+
+    validate_label_row_serialisation(label_row)

From 13b1f70e8d68207f2c5c55ddcbd946693b2565a5 Mon Sep 17 00:00:00 2001
From: encordrob <robert@encord.com>
Date: Mon, 25 May 2026 11:25:11 +0100
Subject: [PATCH 2/2] Fix lint and format on transcript changes

---
 encord/objects/ontology_labels_impl.py     | 4 ++--
 encord/objects/ontology_object_instance.py | 4 +---
 tests/objects/test_transcripts.py          | 2 --
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py
index d1f3377a7..2729a9a3d 100644
--- a/encord/objects/ontology_labels_impl.py
+++ b/encord/objects/ontology_labels_impl.py
@@ -100,7 +100,6 @@
 from encord.objects.metadata import DataGroupMetadata, DICOMSeriesMetadata, DICOMSliceMetadata
 from encord.objects.ontology_object import Object
 from encord.objects.ontology_object_instance import ObjectInstance
-from encord.objects.transcript import is_transcript_raw_entry
 from encord.objects.ontology_structure import OntologyStructure
 from encord.objects.spaces.annotation.base_annotation import (
     _AnnotationMetadata,
@@ -117,6 +116,7 @@
 from encord.objects.spaces.range_space.point_cloud_space import PointCloudFileSpace
 from encord.objects.spaces.range_space.text_space import TextSpace
 from encord.objects.spaces.types import ChildInfo, SpaceInfo
+from encord.objects.transcript import is_transcript_raw_entry
 from encord.objects.types import (
     AttributeDict,
     BaseFrameObject,
@@ -2191,7 +2191,7 @@ def _to_object_answers(self) -> Dict[str, Any]:
     def _to_object_actions(self) -> Dict[str, ObjectAction]:
         ret: Dict[str, Any] = {}
         for obj in self._objects_map.values():
-            actions = list(reversed(self._dynamic_answers_to_encord_dict(obj)))
+            actions: List[Any] = list(reversed(self._dynamic_answers_to_encord_dict(obj)))
             actions.extend(obj._transcript_actions)
             if not actions:
                 continue
diff --git a/encord/objects/ontology_object_instance.py b/encord/objects/ontology_object_instance.py
index 2a17acc36..7e36ebaff 100644
--- a/encord/objects/ontology_object_instance.py
+++ b/encord/objects/ontology_object_instance.py
@@ -258,9 +258,7 @@ def get_answer(
         if is_transcript_attribute(attribute):
             # Transcripts are not stored in _static_answer_map; reconstruct the
             # joined string from the per-segment actions on demand.
-            segments = [
-                s for s in self._iter_transcript_segments() if s.feature_hash == attribute.feature_node_hash
-            ]
+            segments = [s for s in self._iter_transcript_segments() if s.feature_hash == attribute.feature_node_hash]
             if not segments:
                 return None
             segments.sort(key=lambda s: s.range[0])
diff --git a/tests/objects/test_transcripts.py b/tests/objects/test_transcripts.py
index 4e3ec0c23..f1d07c65f 100644
--- a/tests/objects/test_transcripts.py
+++ b/tests/objects/test_transcripts.py
@@ -2,8 +2,6 @@
 from dataclasses import asdict
 from unittest.mock import Mock
 
-import pytest
-
 from encord.objects import (
     LabelRowV2,
     ObjectInstance,