Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions marimo/_convert/common/dom_traversal.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

if TYPE_CHECKING:
from collections.abc import Callable
from pathlib import Path

LOGGER = _loggers.marimo_logger()

Expand Down Expand Up @@ -345,3 +346,114 @@ def replacer(value: str) -> str | None:
)

return processed_html, replaced_files


# Public folder file pattern: public/{path} or ./public/{path}
# The pattern is anchored at both ends, so only values that *start* with
# `public/` (optionally preceded by `./`) match; values such as
# `/public/x` or `https://host/public/x` do not. Group 1 captures the
# non-empty `{path}` portion that follows the `public/` prefix.
_PUBLIC_FILE_PATTERN = re.compile(r"^(?:\./)?public/(.+)$")


def _resolve_public_file(public_dir: Path, relpath: str) -> Path | None:
"""Resolve a `public/`-prefixed path against the public dir.

Returns the resolved path if it points to an existing regular file
strictly inside `public_dir`, or None otherwise. Rejects path traversal
and symlinks that escape the public directory.
"""
try:
Comment thread
mscolnick marked this conversation as resolved.
# `strict=True` ensures the file exists.
candidate = (public_dir / relpath).resolve(strict=True)
public_resolved = public_dir.resolve(strict=True)
except (OSError, ValueError):
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
Outdated
return None

# Containment check: the resolved file must live under the resolved
# public directory (catches path traversal and symlink escapes).
try:
candidate.relative_to(public_resolved)
except ValueError:
return None

if not candidate.is_file():
return None

return candidate


def replace_public_files_with_data_uris(
    html: str,
    public_dir: Path,
    *,
    allowed_tags: set[str] | None = None,
    allowed_attributes: set[str] | None = None,
    max_inline_bytes: int | None = None,
) -> tuple[str, set[str]]:
    """Inline `public/`-prefixed file references as data URIs.

    Scans `html` for media tag attributes (e.g. `<img src="public/...">`),
    reads the referenced file from the notebook's `public/` folder, and
    replaces the attribute value with a base64-encoded data URI so the
    HTML can be served standalone. Paths that escape `public_dir` (via
    `..` segments or symlinks) are rejected and left unchanged, as are
    files that cannot be read or that exceed `max_inline_bytes`.

    Args:
        html: The HTML string to process.
        public_dir: Path to the notebook's `public/` directory.
        allowed_tags: Tags to scan. Defaults to {"img", "audio", "video",
            "source"}.
        allowed_attributes: Attributes to scan. Defaults to {"src"}.
        max_inline_bytes: Maximum file size to inline. Larger files are
            left as-is. None means no limit.

    Returns:
        Tuple of (processed_html, replaced_paths) where `replaced_paths`
        is the set of attribute values that were successfully inlined.
    """
    if allowed_tags is None:
        allowed_tags = {"img", "audio", "video", "source"}
    if allowed_attributes is None:
        allowed_attributes = {"src"}

    replaced: set[str] = set()

    # If the public directory does not exist, there is nothing to inline.
    if not public_dir.exists():
        return html, replaced

    def exceeds_limit(value: str, size: int) -> bool:
        """Return True (and log) when `size` is over the inline limit."""
        if max_inline_bytes is None or size <= max_inline_bytes:
            return False
        LOGGER.info(
            "Skipping public file %s (%d bytes exceeds %d byte inline"
            " limit)",
            value,
            size,
            max_inline_bytes,
        )
        return True

    def replacer(value: str) -> str | None:
        # Returning None leaves the attribute value untouched.
        match = _PUBLIC_FILE_PATTERN.match(value)
        if not match:
            return None
        relpath = match.group(1)
        resolved = _resolve_public_file(public_dir, relpath)
        if resolved is None:
            return None
        try:
            # Check the on-disk size first so that oversized files are
            # never loaded into memory just to be skipped.
            if exceeds_limit(value, resolved.stat().st_size):
                return None
            file_bytes = resolved.read_bytes()
        except OSError as e:
            LOGGER.warning(
                "Failed to read public file %s during export: %s", value, e
            )
            return None
        # Guard against the file having grown between stat() and read.
        if exceeds_limit(value, len(file_bytes)):
            return None
        # Unknown extensions fall back to text/plain.
        mime_type = mimetypes.guess_type(resolved.name)[0] or "text/plain"
        replaced.add(value)
        return build_data_url(
            cast(KnownMimeType, mime_type),
            base64.b64encode(file_bytes),
        )

    processed_html = replace_html_attributes(
        html=html,
        allowed_tags=allowed_tags,
        allowed_attributes=allowed_attributes,
        replacer_fn=replacer,
    )
    return processed_html, replaced
79 changes: 59 additions & 20 deletions marimo/_server/export/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from marimo._config.settings import GLOBAL_SETTINGS
from marimo._config.utils import deep_copy
from marimo._convert.common.dom_traversal import (
replace_public_files_with_data_uris,
replace_virtual_files_with_data_uris,
)
from marimo._convert.common.filename import (
Expand Down Expand Up @@ -55,7 +56,7 @@
from marimo._version import __version__

if TYPE_CHECKING:
from collections.abc import Mapping
from collections.abc import Iterator, Mapping

from traitlets.config import Config

Expand Down Expand Up @@ -121,6 +122,13 @@ def export_as_html(
session_snapshot
)

# Inline references to files in the notebook's `public/` folder so
# the exported HTML is self-contained. Without this, `mo.md` images
# like `![alt](public/image.png)` break when the HTML is opened
# outside the notebook's directory.
public_dir = Path(filename).resolve().parent / "public"
self._inline_public_files(session_snapshot, public_dir)

app_code = app.to_py()

# Prepare code for export
Expand Down Expand Up @@ -168,6 +176,19 @@ def _prepare_display_config(
config["display"] = display_config
return cast(MarimoConfig, config)

@staticmethod
def _iter_html_data_strings(
session_snapshot: NotebookSessionV1,
) -> Iterator[tuple[dict[str, Any], str, str]]:
Comment thread
mscolnick marked this conversation as resolved.
"""Yield (output_data_dict, mime_type, data) for each string output."""
for cell in session_snapshot["cells"]:
for output in cell["outputs"]:
if output["type"] != "data":
continue
for mime_type, data in output["data"].items():
if isinstance(data, str):
Comment thread
mscolnick marked this conversation as resolved.
Outdated
yield output["data"], mime_type, data

def _inline_virtual_files(
self, session_snapshot: NotebookSessionV1
) -> tuple[NotebookSessionV1, set[str]]:
Expand All @@ -178,28 +199,46 @@ def _inline_virtual_files(
"""
replaced_files: set[str] = set()

for cell in session_snapshot["cells"]:
for output in cell["outputs"]:
if output["type"] != "data":
continue

for mime_type, data in output["data"].items():
if not isinstance(data, str):
continue
if self._VIRTUAL_FILE_PATTERN not in data:
continue

processed, files = replace_virtual_files_with_data_uris(
data,
allowed_tags=VIRTUAL_FILE_ALLOWED_TAGS,
allowed_attributes=VIRTUAL_FILE_ALLOWED_ATTRIBUTES,
max_inline_bytes=MAX_VIRTUAL_FILE_INLINE_BYTES,
)
replaced_files.update(files)
output["data"][mime_type] = processed
for data_dict, mime_type, data in self._iter_html_data_strings(
session_snapshot
):
if self._VIRTUAL_FILE_PATTERN not in data:
continue
processed, files = replace_virtual_files_with_data_uris(
data,
allowed_tags=VIRTUAL_FILE_ALLOWED_TAGS,
allowed_attributes=VIRTUAL_FILE_ALLOWED_ATTRIBUTES,
max_inline_bytes=MAX_VIRTUAL_FILE_INLINE_BYTES,
)
replaced_files.update(files)
data_dict[mime_type] = processed

return session_snapshot, replaced_files

def _inline_public_files(
self,
session_snapshot: NotebookSessionV1,
public_dir: Path,
) -> None:
"""Replace `public/`-prefixed file paths in HTML outputs with data URIs.

Mutates `session_snapshot` in-place.
"""
if not public_dir.exists():
return

for data_dict, mime_type, data in self._iter_html_data_strings(
session_snapshot
):
if "public/" not in data:
continue
processed, _ = replace_public_files_with_data_uris(
data,
public_dir=public_dir,
max_inline_bytes=MAX_VIRTUAL_FILE_INLINE_BYTES,
)
data_dict[mime_type] = processed

def _prepare_code(
self,
include_code: bool,
Expand Down
Loading
Loading