encord-team · encordrob · May 25, 2026 · May 25, 2026 · gemini-code-assist · May 25, 2026
diff --git a/encord/objects/__init__.py b/encord/objects/__init__.py
@@ -8,6 +8,7 @@
 from encord.objects.ontology_object_instance import AnswerForFrames, ObjectInstance
 from encord.objects.ontology_structure import OntologyStructure
 from encord.objects.options import FlatOption, NestableOption, Option
+from encord.objects.transcript import TranscriptSegment
 
 __all__ = [
     "AnswerForFrames",
@@ -26,4 +27,5 @@
     "RadioAttribute",
     "Shape",
     "TextAttribute",
+    "TranscriptSegment",
 ]
diff --git a/encord/objects/ontology_labels_impl.py b/encord/objects/ontology_labels_impl.py
@@ -100,6 +100,7 @@
 from encord.objects.metadata import DataGroupMetadata, DICOMSeriesMetadata, DICOMSliceMetadata
 from encord.objects.ontology_object import Object
 from encord.objects.ontology_object_instance import ObjectInstance
+from encord.objects.transcript import is_transcript_raw_entry
 from encord.objects.ontology_structure import OntologyStructure
 from encord.objects.spaces.annotation.base_annotation import (
     _AnnotationMetadata,
@@ -2190,30 +2191,32 @@ def _to_object_answers(self) -> Dict[str, Any]:
     def _to_object_actions(self) -> Dict[str, ObjectAction]:
         ret: Dict[str, Any] = {}
         for obj in self._objects_map.values():
-            all_static_answers = self._dynamic_answers_to_encord_dict(obj)
-            if len(all_static_answers) == 0:
+            actions = list(reversed(self._dynamic_answers_to_encord_dict(obj)))
+            actions.extend(obj._transcript_actions)  # raw transcript dicts follow the dynamic answers
+            if not actions:
                 continue
             ret[obj.object_hash] = {
-                "actions": list(reversed(all_static_answers)),
+                "actions": actions,
                 "objectHash": obj.object_hash,
             }
 
         for space in self._space_map.values():
             for obj in space._objects_map.values():
                 # Currently, dynamic attributes only available for VideoSpace
                 if isinstance(space, VideoSpace):
-                    all_static_answers = self._dynamic_answers_to_encord_dict(obj)
-                    if len(all_static_answers) == 0:
-                        continue
-
-                    if obj.object_hash in ret:
-                        # The same object might still exist across object hashes
-                        continue
-                    else:
-                        ret[obj.object_hash] = {
-                            "actions": list(all_static_answers),
-                            "objectHash": obj.object_hash,
-                        }
+                    actions = list(self._dynamic_answers_to_encord_dict(obj))
+                else:
+                    actions = []
+                actions.extend(obj._transcript_actions)
+                if not actions:
+                    continue
+                if obj.object_hash in ret:
+                    # Already emitted from the label-row objects or an earlier space; keep that entry.
+                    continue
+                ret[obj.object_hash] = {
+                    "actions": actions,
+                    "objectHash": obj.object_hash,
+                }
         return ret
 
     def _to_classification_answers(self) -> Dict[str, ClassificationAnswer]:
@@ -3059,19 +3062,29 @@ def _add_objects_answers(self, object_answers: dict):
     def _add_action_answers(self, label_row_dict: dict):
         for answer in label_row_dict["object_actions"].values():
             object_hash = answer["objectHash"]
-            object_instance = self._objects_map.get(object_hash)
             answer_list = answer["actions"]
-            if object_instance is not None:
-                object_instance.set_answer_from_list(answer_list)
-            else:
+
+            target_object = self._objects_map.get(object_hash)
+            if target_object is None:
                 # Not great that we're looping through spaces, but usually not that many spaces on a label row
-                answer_list = answer["actions"]
                 for space in self._space_map.values():
                     object_on_space = space._objects_map.get(object_hash)
                     if object_on_space is not None:
-                        object_on_space.set_answer_from_list(answers_list=answer_list)
+                        target_object = object_on_space  # first space holding this hash wins
                         break
 
+            if target_object is None:
+                continue  # hash found neither on the label row nor on any space
+
+            non_transcript_actions: List[Any] = []
+            for raw_action in answer_list:
+                if is_transcript_raw_entry(raw_action):
+                    target_object._add_transcript_action(raw_action)  # stored raw, not parsed into answers
+                else:
+                    non_transcript_actions.append(raw_action)
+            if non_transcript_actions:  # no call at all when every action was a transcript entry
+                target_object.set_answer_from_list(non_transcript_actions)
+
     def _create_new_object_instance(self, frame_object_label: FrameObject, frame: int) -> ObjectInstance:
         ontology = self._ontology.structure
         feature_hash = frame_object_label["featureHash"]

diff --git a/encord/objects/ontology_object_instance.py b/encord/objects/ontology_object_instance.py
@@ -65,6 +65,7 @@
 from encord.objects.spaces.annotation.base_annotation import _AnnotationData, _AnnotationMetadata, _ObjectAnnotation
 from encord.objects.spaces.annotation.geometric_annotation import _GeometricAnnotationData
 from encord.objects.spaces.annotation.range_annotation import _RangeObjectAnnotationData
+from encord.objects.transcript import TranscriptSegment, is_transcript_attribute, is_transcript_raw_entry
 from encord.objects.types import (
     AnswerDict,
     AttributeDict,
@@ -91,6 +92,11 @@ def __init__(self, ontology_object: Object, *, object_hash: Optional[str] = None
 
         self._dynamic_answer_manager = DynamicAnswerManager(self)
 
+        # Raw transcript action dicts as they were parsed from the label row.
+        # Kept opaque so we can round-trip them on save without losing fields like
+        # trackHash, manualAnnotation, etc. See encord.objects.transcript.
+        self._transcript_actions: List[Dict[str, Any]] = []
+
         # Only used for non-frame entities
         self._non_geometric = ontology_object.shape in (Shape.AUDIO, Shape.TEXT)
 
@@ -249,6 +255,17 @@ def get_answer(
         if attribute.dynamic:
             return self._dynamic_answer_manager.get_answer(attribute, filter_answer, filter_frame)
 
+        if is_transcript_attribute(attribute):
+            # Transcripts are not stored in _static_answer_map; reconstruct the
+            # joined string from the per-segment actions on demand.
+            segments = [
+                s for s in self._iter_transcript_segments() if s.feature_hash == attribute.feature_node_hash
+            ]
+            if not segments:
+                return None
+            segments.sort(key=lambda s: s.range[0])
+            return "\n".join(s.text for s in segments)
+
         static_answer = self._static_answer_map[attribute.feature_node_hash]
 
         if not static_answer.is_answered():
@@ -324,6 +341,12 @@ def set_answer_from_list(self, answers_list: List[AttributeDict]) -> None:
         grouped_answers = defaultdict(list)
 
         for answer_dict in answers_list:
+            # Transcript entries (text + range) don't fit the static-answer model; they
+            # are managed via _transcript_actions and get_transcripts() / get_answer().
+            # Silently skip them here so any caller (parse path, space parser, customer
+            # code) can't accidentally clobber the static answer map.
+            if is_transcript_raw_entry(answer_dict):
+                continue
             feature_hash = answer_dict["featureHash"]
             attribute = _get_attribute_by_hash(feature_hash, self._ontology_object.attributes)
             if attribute is None:
@@ -353,6 +376,57 @@ def set_answer_from_list(self, answers_list: List[AttributeDict]) -> None:
             assert attribute  # we already checked that attribute is not null above. So just silencing this for now
             self._set_answer_from_grouped_list(attribute, answers_list)
 
+    def get_transcripts(
+        self,
+        attribute: Optional[TextAttribute] = None,
+    ) -> List[TranscriptSegment]:
+        """Return per-segment transcripts for this object, sorted by start frame.
+
+        Each entry is one frame-range within one transcript action. An action with
+        multiple sub-ranges produces multiple ``TranscriptSegment`` entries that share
+        the same text.
+
+        Args:
+            attribute: If provided, return only segments for this transcript attribute.
+                If ``None``, return all transcript segments across all transcript
+                attributes on this object.
+
+        Returns:
+            A list of ``TranscriptSegment`` ordered by ``range[0]``.
+        """
+        segments = list(self._iter_transcript_segments())
+        if attribute is not None:
+            segments = [s for s in segments if s.feature_hash == attribute.feature_node_hash]
+        segments.sort(key=lambda s: s.range[0])
+        return segments
+
+    def _iter_transcript_segments(self) -> Iterable[TranscriptSegment]:
+        for raw in self._transcript_actions:
+            text = raw.get("answers")
+            if not isinstance(text, str):
+                continue
+            feature_hash = raw.get("featureHash")
+            name = raw.get("name", "")
+            if not isinstance(feature_hash, str):
+                continue
+            for sub_range in raw.get("range", []) or []:
+                if not isinstance(sub_range, (list, tuple)) or len(sub_range) < 2:
+                    continue
+                yield TranscriptSegment(
+                    range=(int(sub_range[0]), int(sub_range[1])),
+                    text=text,
+                    feature_hash=feature_hash,
+                    attribute_name=name,
+                )
+
+    def _add_transcript_action(self, raw: Dict[str, Any]) -> None:
+        """Append a raw transcript action dict to this instance's ``_transcript_actions``.
+
+        Internal hook for the label row parser; customers should not call it directly.
+        The dict is stored as-is (no copy is made), so it stays shared with the caller.
+        """
+        self._transcript_actions.append(raw)
+
     @staticmethod
     def _merge_answers_to_non_overlapping_ranges(ranges: List[Tuple[Range, Set[str]]]) -> List[Tuple[Range, Set[str]]]:
         ranges.sort(key=lambda x: x[0].start)
@@ -678,6 +752,7 @@ def copy(self) -> ObjectInstance:
         ret._frames_to_instance_data = deepcopy(self._frames_to_instance_data)
         ret._static_answer_map = deepcopy(self._static_answer_map)
         ret._dynamic_answer_manager = self._dynamic_answer_manager.copy()
+        ret._transcript_actions = deepcopy(self._transcript_actions)
         return ret
 
     def get_annotations(self) -> List[Annotation]:

diff --git a/encord/objects/transcript.py b/encord/objects/transcript.py
@@ -0,0 +1,47 @@
+"""---
+title: "Objects - Transcripts"
+slug: "sdk-ref-objects-transcript"
+hidden: false
+metadata:
+  title: "Objects - Transcripts"
+  description: "Encord SDK Objects - Transcript segment data class."
+category: "64e481b57b6027003f20aaa0"
+---
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Mapping, Tuple
+
+from encord.objects.attributes import Attribute
+
+# Marker substring used in the ontology attribute display name to identify
+# transcript-bearing text attributes. There is no schema flag for this yet;
+# the marker is the only signal the front end (FE) and the SDK agree on.
+TRANSCRIPT_NAME_MARKER = "#transcript"
+
+
+@dataclass(frozen=True)
+class TranscriptSegment:
+    """One playback segment of a transcript on an ObjectInstance.
+
+    A transcript-bearing attribute on an object may have multiple segments,
+    each with its own frame range and text. A single transcript action that
+    lists several sub-ranges in the underlying label is unrolled into one
+    TranscriptSegment per sub-range, all sharing the same text.
+    """
+
+    range: Tuple[int, int]  # (start, end) of one sub-range
+    text: str  # transcript text shared by every sub-range of the same action
+    feature_hash: str  # feature hash of the transcript attribute
+    attribute_name: str  # display name of the transcript attribute
+
+
+def is_transcript_attribute(attribute: Attribute) -> bool:
+    return TRANSCRIPT_NAME_MARKER in attribute.name  # substring match anywhere in the display name
+
+
+def is_transcript_raw_entry(entry: Mapping[str, Any]) -> bool:
+    name = entry.get("name")
+    return isinstance(name, str) and TRANSCRIPT_NAME_MARKER in name
diff --git a/tests/objects/data/all_types_ontology_structure.py b/tests/objects/data/all_types_ontology_structure.py
@@ -151,6 +151,57 @@
     attributes=[],
 )
 
+AUDIO_TRANSCRIPT_OBJECT = Object(
+    uid=14,
+    name="audio transcript object",
+    color="#A4FF00",
+    shape=Shape.AUDIO,
+    feature_node_hash="audioTranscriptObjectHash",
+    archived=False,
+    attributes=[
+        TextAttribute(
+            uid=[14, 1],
+            feature_node_hash="captionTranscriptHash",
+            name="Caption #transcript",  # "#transcript" in the name marks a transcript attribute
+            required=False,
+            dynamic=False,
+            archived=False,
+        ),
+        TextAttribute(
+            uid=[14, 2],
+            feature_node_hash="speakerTranscriptHash",
+            name="Speaker #transcript",
+            required=False,
+            dynamic=False,
+            archived=False,
+        ),
+        ChecklistAttribute(
+            uid=[14, 3],
+            feature_node_hash="audioMoodChecklistHash",
+            name="Mood",  # no marker: a regular, non-transcript attribute
+            required=False,
+            dynamic=True,  # the only dynamic attribute on this fixture object
+            archived=False,
+            options=[
+                FlatOption(
+                    uid=[14, 3, 1],
+                    feature_node_hash="audioMoodHappy",
+                    label="Happy",
+                    value="happy",
+                    archived=False,
+                ),
+                FlatOption(
+                    uid=[14, 3, 2],
+                    feature_node_hash="audioMoodSad",
+                    label="Sad",
+                    value="sad",
+                    archived=False,
+                ),
+            ],
+        ),
+    ],
+)
+
 all_types_structure = OntologyStructure(
     objects=[
         Object(
@@ -416,6 +467,7 @@
         AUDIO_OBJECT_1,
         AUDIO_OBJECT_2,
         AUDIO_OBJECT_3,
+        AUDIO_TRANSCRIPT_OBJECT,
         TEXT_OBJECT,
         CUBOID_2D_OBJECT,
         Object(