Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions encord/objects/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from encord.objects.ontology_object_instance import AnswerForFrames, ObjectInstance
from encord.objects.ontology_structure import OntologyStructure
from encord.objects.options import FlatOption, NestableOption, Option
from encord.objects.transcript import TranscriptSegment

__all__ = [
"AnswerForFrames",
Expand All @@ -26,4 +27,5 @@
"RadioAttribute",
"Shape",
"TextAttribute",
"TranscriptSegment",
]
55 changes: 34 additions & 21 deletions encord/objects/ontology_labels_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
from encord.objects.metadata import DataGroupMetadata, DICOMSeriesMetadata, DICOMSliceMetadata
from encord.objects.ontology_object import Object
from encord.objects.ontology_object_instance import ObjectInstance
from encord.objects.transcript import is_transcript_raw_entry
from encord.objects.ontology_structure import OntologyStructure
from encord.objects.spaces.annotation.base_annotation import (
_AnnotationMetadata,
Expand Down Expand Up @@ -2190,30 +2191,32 @@ def _to_object_answers(self) -> Dict[str, Any]:
def _to_object_actions(self) -> Dict[str, ObjectAction]:
ret: Dict[str, Any] = {}
for obj in self._objects_map.values():
all_static_answers = self._dynamic_answers_to_encord_dict(obj)
if len(all_static_answers) == 0:
actions = list(reversed(self._dynamic_answers_to_encord_dict(obj)))
actions.extend(obj._transcript_actions)
if not actions:
continue
ret[obj.object_hash] = {
"actions": list(reversed(all_static_answers)),
"actions": actions,
"objectHash": obj.object_hash,
}

for space in self._space_map.values():
for obj in space._objects_map.values():
# Currently, dynamic attributes only available for VideoSpace
if isinstance(space, VideoSpace):
all_static_answers = self._dynamic_answers_to_encord_dict(obj)
if len(all_static_answers) == 0:
continue

if obj.object_hash in ret:
# The same object might still exist across object hashes
continue
else:
ret[obj.object_hash] = {
"actions": list(all_static_answers),
"objectHash": obj.object_hash,
}
actions = list(self._dynamic_answers_to_encord_dict(obj))
else:
actions = []
actions.extend(obj._transcript_actions)
if not actions:
continue
if obj.object_hash in ret:
# The same object might still exist across object hashes
continue
ret[obj.object_hash] = {
"actions": actions,
"objectHash": obj.object_hash,
}
return ret

def _to_classification_answers(self) -> Dict[str, ClassificationAnswer]:
Expand Down Expand Up @@ -3059,19 +3062,29 @@ def _add_objects_answers(self, object_answers: dict):
def _add_action_answers(self, label_row_dict: dict):
for answer in label_row_dict["object_actions"].values():
object_hash = answer["objectHash"]
object_instance = self._objects_map.get(object_hash)
answer_list = answer["actions"]
if object_instance is not None:
object_instance.set_answer_from_list(answer_list)
else:

target_object = self._objects_map.get(object_hash)
if target_object is None:
# Not great that we're looping through spaces, but usually not that many spaces on a label row
answer_list = answer["actions"]
for space in self._space_map.values():
object_on_space = space._objects_map.get(object_hash)
if object_on_space is not None:
object_on_space.set_answer_from_list(answers_list=answer_list)
target_object = object_on_space
break

if target_object is None:
continue

non_transcript_actions: List[Any] = []
for raw_action in answer_list:
if is_transcript_raw_entry(raw_action):
target_object._add_transcript_action(raw_action)
else:
non_transcript_actions.append(raw_action)
if non_transcript_actions:
target_object.set_answer_from_list(non_transcript_actions)

def _create_new_object_instance(self, frame_object_label: FrameObject, frame: int) -> ObjectInstance:
ontology = self._ontology.structure
feature_hash = frame_object_label["featureHash"]
Expand Down
75 changes: 75 additions & 0 deletions encord/objects/ontology_object_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
from encord.objects.spaces.annotation.base_annotation import _AnnotationData, _AnnotationMetadata, _ObjectAnnotation
from encord.objects.spaces.annotation.geometric_annotation import _GeometricAnnotationData
from encord.objects.spaces.annotation.range_annotation import _RangeObjectAnnotationData
from encord.objects.transcript import TranscriptSegment, is_transcript_attribute, is_transcript_raw_entry
from encord.objects.types import (
AnswerDict,
AttributeDict,
Expand All @@ -91,6 +92,11 @@ def __init__(self, ontology_object: Object, *, object_hash: Optional[str] = None

self._dynamic_answer_manager = DynamicAnswerManager(self)

# Raw transcript action dicts as they were parsed from the label row.
# Kept opaque so we can round-trip them on save without losing fields like
# trackHash, manualAnnotation, etc. See encord.objects.transcript.
self._transcript_actions: List[Dict[str, Any]] = []

# Only used for non-frame entities
self._non_geometric = ontology_object.shape in (Shape.AUDIO, Shape.TEXT)

Expand Down Expand Up @@ -249,6 +255,17 @@ def get_answer(
if attribute.dynamic:
return self._dynamic_answer_manager.get_answer(attribute, filter_answer, filter_frame)

if is_transcript_attribute(attribute):
# Transcripts are not stored in _static_answer_map; reconstruct the
# joined string from the per-segment actions on demand.
segments = [
s for s in self._iter_transcript_segments() if s.feature_hash == attribute.feature_node_hash
]
if not segments:
return None
segments.sort(key=lambda s: s.range[0])
return "\n".join(s.text for s in segments)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic for filtering, sorting, and joining transcript segments is duplicated between get_answer and get_transcripts. Refactoring get_answer to call get_transcripts would improve maintainability and ensure consistent behavior across the SDK.

            segments = self.get_transcripts(cast(TextAttribute, attribute))
            return "\n".join(s.text for s in segments) if segments else None


static_answer = self._static_answer_map[attribute.feature_node_hash]

if not static_answer.is_answered():
Expand Down Expand Up @@ -324,6 +341,12 @@ def set_answer_from_list(self, answers_list: List[AttributeDict]) -> None:
grouped_answers = defaultdict(list)

for answer_dict in answers_list:
# Transcript entries (text + range) don't fit the static-answer model; they
# are managed via _transcript_actions and get_transcripts() / get_answer().
# Silently skip them here so any caller (parse path, space parser, customer
# code) can't accidentally clobber the static answer map.
if is_transcript_raw_entry(answer_dict):
continue
feature_hash = answer_dict["featureHash"]
attribute = _get_attribute_by_hash(feature_hash, self._ontology_object.attributes)
if attribute is None:
Expand Down Expand Up @@ -353,6 +376,57 @@ def set_answer_from_list(self, answers_list: List[AttributeDict]) -> None:
assert attribute # we already checked that attribute is not null above. So just silencing this for now
self._set_answer_from_grouped_list(attribute, answers_list)

def get_transcripts(
    self,
    attribute: Optional[TextAttribute] = None,
) -> List[TranscriptSegment]:
    """List this object's transcript segments in ascending start order.

    One ``TranscriptSegment`` is produced per sub-range of each stored
    transcript action, so an action covering several sub-ranges contributes
    several segments carrying identical text.

    Args:
        attribute: Restrict the result to segments whose ``feature_hash`` equals
            this attribute's ``feature_node_hash``. When ``None`` (the default),
            segments of every transcript attribute on the object are returned.

    Returns:
        A new list of ``TranscriptSegment`` sorted by ``range[0]``; segments with
        the same start keep the order in which they were produced.
    """
    if attribute is None:
        candidates = self._iter_transcript_segments()
    else:
        candidates = (
            segment
            for segment in self._iter_transcript_segments()
            if segment.feature_hash == attribute.feature_node_hash
        )
    # ``sorted`` is stable, matching an in-place ``list.sort`` on the same key.
    return sorted(candidates, key=lambda segment: segment.range[0])

def _iter_transcript_segments(self) -> Iterable[TranscriptSegment]:
for raw in self._transcript_actions:
text = raw.get("answers")
if not isinstance(text, str):
continue
feature_hash = raw.get("featureHash")
name = raw.get("name", "")
if not isinstance(feature_hash, str):
continue
for sub_range in raw.get("range", []) or []:
if not isinstance(sub_range, (list, tuple)) or len(sub_range) < 2:
continue
yield TranscriptSegment(
range=(int(sub_range[0]), int(sub_range[1])),
text=text,
feature_hash=feature_hash,
attribute_name=name,
)

def _add_transcript_action(self, raw: Dict[str, Any]) -> None:
"""Stash a raw transcript action dict for later read and serialisation.

This is an internal hook used by the label row parser; customers should not
call it directly.
"""
self._transcript_actions.append(raw)

@staticmethod
def _merge_answers_to_non_overlapping_ranges(ranges: List[Tuple[Range, Set[str]]]) -> List[Tuple[Range, Set[str]]]:
ranges.sort(key=lambda x: x[0].start)
Expand Down Expand Up @@ -678,6 +752,7 @@ def copy(self) -> ObjectInstance:
ret._frames_to_instance_data = deepcopy(self._frames_to_instance_data)
ret._static_answer_map = deepcopy(self._static_answer_map)
ret._dynamic_answer_manager = self._dynamic_answer_manager.copy()
ret._transcript_actions = deepcopy(self._transcript_actions)
return ret

def get_annotations(self) -> List[Annotation]:
Expand Down
47 changes: 47 additions & 0 deletions encord/objects/transcript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""---
title: "Objects - Transcripts"
slug: "sdk-ref-objects-transcript"
hidden: false
metadata:
title: "Objects - Transcripts"
description: "Encord SDK Objects - Transcript segment data class."
category: "64e481b57b6027003f20aaa0"
---
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Mapping, Tuple

from encord.objects.attributes import Attribute

# Marker substring used in the ontology attribute display name to identify
# transcript-bearing text attributes. There is no schema flag for this yet;
# the marker is the only signal the FE and SDK agree on.
# The helpers in this module test for it with a plain, case-sensitive
# substring check (``MARKER in name``), so it may appear anywhere in the name.
TRANSCRIPT_NAME_MARKER = "#transcript"


@dataclass(frozen=True)
class TranscriptSegment:
    """A single frame range of transcript text attached to an ObjectInstance.

    One transcript-bearing attribute can hold many segments, each pairing a
    frame range with its text. When a segment in the underlying label lists
    several sub-ranges, it is flattened into one ``TranscriptSegment`` per
    sub-range, and those instances all carry the same text.

    Instances are immutable (the dataclass is frozen).

    Attributes:
        range: Pair of integers delimiting the segment, start first.
        text: The transcript text for this segment.
        feature_hash: Feature hash of the attribute the segment belongs to.
        attribute_name: Display name of that attribute.
    """

    range: Tuple[int, int]
    text: str
    feature_hash: str
    attribute_name: str


def is_transcript_attribute(attribute: Attribute) -> bool:
    """Tell whether an ontology attribute is marked as transcript-bearing.

    The decision is based solely on the attribute's ``name``: it must contain
    ``TRANSCRIPT_NAME_MARKER`` as a substring.
    """
    attribute_name = attribute.name
    return TRANSCRIPT_NAME_MARKER in attribute_name


def is_transcript_raw_entry(entry: Mapping[str, Any]) -> bool:
    """Tell whether a raw action/answer dict belongs to a transcript attribute.

    Returns ``True`` only when the entry has a string ``"name"`` value that
    contains ``TRANSCRIPT_NAME_MARKER``; a missing or non-string name yields
    ``False``.
    """
    candidate = entry.get("name")
    if not isinstance(candidate, str):
        return False
    return TRANSCRIPT_NAME_MARKER in candidate
52 changes: 52 additions & 0 deletions tests/objects/data/all_types_ontology_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,57 @@
attributes=[],
)

# Audio-shaped fixture object carrying two static text attributes whose names
# contain the "#transcript" marker, plus one dynamic checklist attribute.
AUDIO_TRANSCRIPT_OBJECT = Object(
    uid=14,
    name="audio transcript object",
    color="#A4FF00",
    shape=Shape.AUDIO,
    feature_node_hash="audioTranscriptObjectHash",
    archived=False,
    attributes=[
        # First transcript-marked text attribute (non-dynamic).
        TextAttribute(
            uid=[14, 1],
            feature_node_hash="captionTranscriptHash",
            name="Caption #transcript",
            required=False,
            dynamic=False,
            archived=False,
        ),
        # Second transcript-marked text attribute on the same object.
        TextAttribute(
            uid=[14, 2],
            feature_node_hash="speakerTranscriptHash",
            name="Speaker #transcript",
            required=False,
            dynamic=False,
            archived=False,
        ),
        # Dynamic checklist without the marker in its name, sitting alongside
        # the transcript attributes.
        ChecklistAttribute(
            uid=[14, 3],
            feature_node_hash="audioMoodChecklistHash",
            name="Mood",
            required=False,
            dynamic=True,
            archived=False,
            options=[
                FlatOption(
                    uid=[14, 3, 1],
                    feature_node_hash="audioMoodHappy",
                    label="Happy",
                    value="happy",
                    archived=False,
                ),
                FlatOption(
                    uid=[14, 3, 2],
                    feature_node_hash="audioMoodSad",
                    label="Sad",
                    value="sad",
                    archived=False,
                ),
            ],
        ),
    ],
)

all_types_structure = OntologyStructure(
objects=[
Object(
Expand Down Expand Up @@ -416,6 +467,7 @@
AUDIO_OBJECT_1,
AUDIO_OBJECT_2,
AUDIO_OBJECT_3,
AUDIO_TRANSCRIPT_OBJECT,
TEXT_OBJECT,
CUBOID_2D_OBJECT,
Object(
Expand Down
Loading
Loading