diff --git a/.gitignore b/.gitignore
index 1694325cf45..4b515442720 100644
--- a/.gitignore
+++ b/.gitignore
@@ -229,4 +229,4 @@ src/native/target*
 .analysis/
 
 # file created when running scripts/lint
-uv.lock
\ No newline at end of file
+uv.lock
diff --git a/.riot/requirements/1059304.txt b/.riot/requirements/1059304.txt
new file mode 100644
index 00000000000..d9c2b106d89
--- /dev/null
+++ b/.riot/requirements/1059304.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1059304.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+exceptiongroup==1.3.1
+filelock==3.29.1
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.4.2
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.3.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/116b0b8.txt b/.riot/requirements/116b0b8.txt
new file mode 100644
index 00000000000..c428ec61c6b
--- /dev/null
+++ b/.riot/requirements/116b0b8.txt
@@ -0,0 +1,28 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/116b0b8.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.1
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.3.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/1346e9d.txt b/.riot/requirements/1346e9d.txt
new file mode 100644
index 00000000000..14b0ff6eccf
--- /dev/null
+++ b/.riot/requirements/1346e9d.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1346e9d.in
+#
+attrs==26.1.0
+coverage[toml]==7.10.7
+exceptiongroup==1.3.1
+filelock==3.19.1
+fsspec==2025.10.0
+hypothesis==6.45.0
+iniconfig==2.1.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.2.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==8.4.2
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.13.1
+tomli==2.4.1
+torch==2.5.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/1351aca.txt b/.riot/requirements/1351aca.txt
new file mode 100644
index 00000000000..2a5f81ce7a3
--- /dev/null
+++ b/.riot/requirements/1351aca.txt
@@ -0,0 +1,31 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1351aca.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.11.0
+typing-extensions==4.15.0
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==81.0.0
diff --git a/.riot/requirements/139b6b2.txt b/.riot/requirements/139b6b2.txt
new file mode 100644
index 00000000000..156bee5d715
--- /dev/null
+++ b/.riot/requirements/139b6b2.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/139b6b2.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+exceptiongroup==1.3.1
+filelock==3.29.1
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.4.2
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.1.2
+typing-extensions==4.15.0
diff --git a/.riot/requirements/16e767e.txt b/.riot/requirements/16e767e.txt
new file mode 100644
index 00000000000..6053f8be5f3
--- /dev/null
+++ b/.riot/requirements/16e767e.txt
@@ -0,0 +1,28 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/16e767e.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.4.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/173555b.txt b/.riot/requirements/173555b.txt
new file mode 100644
index 00000000000..8806dadf055
--- /dev/null
+++ b/.riot/requirements/173555b.txt
@@ -0,0 +1,31 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/173555b.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.12.0
+typing-extensions==4.15.0
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==81.0.0
diff --git a/.riot/requirements/177b157.txt b/.riot/requirements/177b157.txt
new file mode 100644
index 00000000000..6db2893b99d
--- /dev/null
+++ b/.riot/requirements/177b157.txt
@@ -0,0 +1,27 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/177b157.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.1
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.0.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/179c655.txt b/.riot/requirements/179c655.txt
new file mode 100644
index 00000000000..8b96803664e
--- /dev/null
+++ b/.riot/requirements/179c655.txt
@@ -0,0 +1,31 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate --output-file=.riot/requirements/179c655.txt .riot/requirements/179c655.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.13.1
+torch==2.5.1
+typing-extensions==4.15.0
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==82.0.1
diff --git a/.riot/requirements/17a8226.txt b/.riot/requirements/17a8226.txt
new file mode 100644
index 00000000000..67efb17aff2
--- /dev/null
+++ b/.riot/requirements/17a8226.txt
@@ -0,0 +1,28 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/17a8226.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.1
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.1.2
+typing-extensions==4.15.0
diff --git a/.riot/requirements/181e2d5.txt b/.riot/requirements/181e2d5.txt
new file mode 100644
index 00000000000..ae157b66f35
--- /dev/null
+++ b/.riot/requirements/181e2d5.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/181e2d5.in
+#
+attrs==26.1.0
+coverage[toml]==7.10.7
+exceptiongroup==1.3.1
+filelock==3.19.1
+fsspec==2025.10.0
+hypothesis==6.45.0
+iniconfig==2.1.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.2.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==8.4.2
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.1.2
+typing-extensions==4.15.0
diff --git a/.riot/requirements/1989fbc.txt b/.riot/requirements/1989fbc.txt
new file mode 100644
index 00000000000..b1229d837c6
--- /dev/null
+++ b/.riot/requirements/1989fbc.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1989fbc.in
+#
+attrs==26.1.0
+coverage[toml]==7.10.7
+exceptiongroup==1.3.1
+filelock==3.19.1
+fsspec==2025.10.0
+hypothesis==6.45.0
+iniconfig==2.1.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.2.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==8.4.2
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.4.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/19ca09f.txt b/.riot/requirements/19ca09f.txt
new file mode 100644
index 00000000000..d5b1044b891
--- /dev/null
+++ b/.riot/requirements/19ca09f.txt
@@ -0,0 +1,31 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/19ca09f.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.8.0
+typing-extensions==4.15.0
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==82.0.1
diff --git a/.riot/requirements/1a4c54d.txt b/.riot/requirements/1a4c54d.txt
new file mode 100644
index 00000000000..fc009b490fd
--- /dev/null
+++ b/.riot/requirements/1a4c54d.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1a4c54d.in
+#
+attrs==26.1.0
+coverage[toml]==7.10.7
+exceptiongroup==1.3.1
+filelock==3.19.1
+fsspec==2025.10.0
+hypothesis==6.45.0
+iniconfig==2.1.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.2.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==8.4.2
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.3.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/1a9e432.txt b/.riot/requirements/1a9e432.txt
new file mode 100644
index 00000000000..09e6848ed2c
--- /dev/null
+++ b/.riot/requirements/1a9e432.txt
@@ -0,0 +1,28 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1a9e432.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.1
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.2.2
+typing-extensions==4.15.0
diff --git a/.riot/requirements/1b254f8.txt b/.riot/requirements/1b254f8.txt
new file mode 100644
index 00000000000..bfee11f94c0
--- /dev/null
+++ b/.riot/requirements/1b254f8.txt
@@ -0,0 +1,28 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1b254f8.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.1
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.3.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/1d55347.txt b/.riot/requirements/1d55347.txt
new file mode 100644
index 00000000000..9c1ec80cd8e
--- /dev/null
+++ b/.riot/requirements/1d55347.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1d55347.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+exceptiongroup==1.3.1
+filelock==3.29.1
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.4.2
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.2.2
+typing-extensions==4.15.0
diff --git a/.riot/requirements/1d6137c.txt b/.riot/requirements/1d6137c.txt
new file mode 100644
index 00000000000..ff3d58d89e9
--- /dev/null
+++ b/.riot/requirements/1d6137c.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1d6137c.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+exceptiongroup==1.3.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.4.2
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.4.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/1e9ae39.txt b/.riot/requirements/1e9ae39.txt
new file mode 100644
index 00000000000..c652dc05312
--- /dev/null
+++ b/.riot/requirements/1e9ae39.txt
@@ -0,0 +1,28 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1e9ae39.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.13.1
+torch==2.6.0
+typing-extensions==4.15.0
diff --git a/.riot/requirements/1ea7124.txt b/.riot/requirements/1ea7124.txt
new file mode 100644
index 00000000000..400d8084475
--- /dev/null
+++ b/.riot/requirements/1ea7124.txt
@@ -0,0 +1,31 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1ea7124.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.4.1
+typing-extensions==4.15.0
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==82.0.1
diff --git a/.riot/requirements/1efcde5.txt b/.riot/requirements/1efcde5.txt
new file mode 100644
index 00000000000..94ad685f2d5
--- /dev/null
+++ b/.riot/requirements/1efcde5.txt
@@ -0,0 +1,31 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate --output-file=.riot/requirements/1efcde5.txt .riot/requirements/1efcde5.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.13.1
+torch==2.6.0
+typing-extensions==4.15.0
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==82.0.1
diff --git a/.riot/requirements/21226ae.txt b/.riot/requirements/21226ae.txt
new file mode 100644
index 00000000000..a62fe680e93
--- /dev/null
+++ b/.riot/requirements/21226ae.txt
@@ -0,0 +1,31 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate --output-file=.riot/requirements/21226ae.txt .riot/requirements/21226ae.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.7.1
+typing-extensions==4.15.0
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==82.0.1
diff --git a/.riot/requirements/2dde9bb.txt b/.riot/requirements/2dde9bb.txt
new file mode 100644
index 00000000000..db8a451d890
--- /dev/null
+++ b/.riot/requirements/2dde9bb.txt
@@ -0,0 +1,31 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/2dde9bb.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.9.1
+typing-extensions==4.15.0
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==82.0.1
diff --git a/.riot/requirements/34517c6.txt b/.riot/requirements/34517c6.txt
new file mode 100644
index 00000000000..3a238ca594f
--- /dev/null
+++ b/.riot/requirements/34517c6.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/34517c6.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+exceptiongroup==1.3.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.4.2
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.13.1
+tomli==2.4.1
+torch==2.5.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/6444f67.txt b/.riot/requirements/6444f67.txt
new file mode 100644
index 00000000000..a06c7d66b04
--- /dev/null
+++ b/.riot/requirements/6444f67.txt
@@ -0,0 +1,29 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/6444f67.in
+#
+attrs==26.1.0
+coverage[toml]==7.10.7
+exceptiongroup==1.3.1
+filelock==3.19.1
+hypothesis==6.45.0
+iniconfig==2.1.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.2.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==8.4.2
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.0.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/6fb24b4.txt b/.riot/requirements/6fb24b4.txt
new file mode 100644
index 00000000000..5b1f15e607d
--- /dev/null
+++ b/.riot/requirements/6fb24b4.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/6fb24b4.in
+#
+attrs==26.1.0
+coverage[toml]==7.10.7
+exceptiongroup==1.3.1
+filelock==3.19.1
+fsspec==2025.10.0
+hypothesis==6.45.0
+iniconfig==2.1.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.2.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==8.4.2
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.13.1
+tomli==2.4.1
+torch==2.6.0
+typing-extensions==4.15.0
diff --git a/.riot/requirements/7878a79.txt b/.riot/requirements/7878a79.txt
new file mode 100644
index 00000000000..d8530ff6e9f
--- /dev/null
+++ b/.riot/requirements/7878a79.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/7878a79.in
+#
+attrs==26.1.0
+coverage[toml]==7.10.7
+exceptiongroup==1.3.1
+filelock==3.19.1
+fsspec==2025.10.0
+hypothesis==6.45.0
+iniconfig==2.1.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.2.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==8.4.2
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.7.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/a9c7746.txt b/.riot/requirements/a9c7746.txt
new file mode 100644
index 00000000000..29922b16883
--- /dev/null
+++ b/.riot/requirements/a9c7746.txt
@@ -0,0 +1,28 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate --output-file=.riot/requirements/a9c7746.txt .riot/requirements/a9c7746.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.1
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.2.2
+typing-extensions==4.15.0
diff --git a/.riot/requirements/afdf8ce.txt b/.riot/requirements/afdf8ce.txt
new file mode 100644
index 00000000000..bd5643910cb
--- /dev/null
+++ b/.riot/requirements/afdf8ce.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/afdf8ce.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+exceptiongroup==1.3.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.4.2
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.13.1
+tomli==2.4.1
+torch==2.6.0
+typing-extensions==4.15.0
diff --git a/.riot/requirements/b77de6a.txt b/.riot/requirements/b77de6a.txt
new file mode 100644
index 00000000000..d90b034e1ab
--- /dev/null
+++ b/.riot/requirements/b77de6a.txt
@@ -0,0 +1,29 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/b77de6a.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+exceptiongroup==1.3.1
+filelock==3.29.1
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.4.2
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.0.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/d300b85.txt b/.riot/requirements/d300b85.txt
new file mode 100644
index 00000000000..b40dd8fd361
--- /dev/null
+++ b/.riot/requirements/d300b85.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/d300b85.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+exceptiongroup==1.3.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.4.2
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.7.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/d598449.txt b/.riot/requirements/d598449.txt
new file mode 100644
index 00000000000..7bfe1c31637
--- /dev/null
+++ b/.riot/requirements/d598449.txt
@@ -0,0 +1,31 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/d598449.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.10.0
+typing-extensions==4.15.0
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==82.0.1
diff --git a/.riot/requirements/dc250d4.txt b/.riot/requirements/dc250d4.txt
new file mode 100644
index 00000000000..7d82d707c06
--- /dev/null
+++ b/.riot/requirements/dc250d4.txt
@@ -0,0 +1,28 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/dc250d4.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.13.1
+torch==2.5.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/e321c89.txt b/.riot/requirements/e321c89.txt
new file mode 100644
index 00000000000..1157c900764
--- /dev/null
+++ b/.riot/requirements/e321c89.txt
@@ -0,0 +1,28 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/e321c89.in
+#
+attrs==26.1.0
+coverage[toml]==7.14.1
+filelock==3.29.3
+fsspec==2026.4.0
+hypothesis==6.45.0
+iniconfig==2.3.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.6.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+torch==2.7.1
+typing-extensions==4.15.0
diff --git a/.riot/requirements/efc40e8.txt b/.riot/requirements/efc40e8.txt
new file mode 100644
index 00000000000..b394baf2d6c
--- /dev/null
+++ b/.riot/requirements/efc40e8.txt
@@ -0,0 +1,30 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/efc40e8.in
+#
+attrs==26.1.0
+coverage[toml]==7.10.7
+exceptiongroup==1.3.1
+filelock==3.19.1
+fsspec==2025.10.0
+hypothesis==6.45.0
+iniconfig==2.1.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mock==5.2.0
+mpmath==1.3.0
+networkx==3.2.1
+opentracing==2.4.0
+packaging==26.2
+pluggy==1.6.0
+pygments==2.20.0
+pytest==8.4.2
+pytest-cov==7.1.0
+pytest-mock==3.15.1
+sortedcontainers==2.4.0
+sympy==1.14.0
+tomli==2.4.1
+torch==2.2.2
+typing-extensions==4.15.0
diff --git a/ddtrace/_monkey.py b/ddtrace/_monkey.py
index 180cc194d87..f672fb48b82 100644
--- a/ddtrace/_monkey.py
+++ b/ddtrace/_monkey.py
@@ -109,6 +109,7 @@
     "anthropic": True,
     "crewai": True,
     "pydantic_ai": True,
+    "pytorch": False,
     "vllm": True,
     "mlflow": config._model_lab_enabled,
     "subprocess": True,
@@ -175,6 +176,7 @@
         "langgraph.prebuilt",
     ),
     "openai_agents": ("agents",),
+    "pytorch": ("torch",),
 }
 
 _NOT_PATCHABLE_VIA_ENVVAR = {"ddtrace_api"}
diff --git a/ddtrace/contrib/internal/pytorch/__init__.py b/ddtrace/contrib/internal/pytorch/__init__.py
new file mode 100644
index 00000000000..e07b96bd3d0
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/__init__.py
@@ -0,0 +1,46 @@
+"""
+The pytorch integration traces PyTorch distributed training jobs.
+
+Always-on once the integration is enabled: a single long-lived ``pytorch.rank`` span is emitted per rank.
+Tags: ``rank``, ``world_size``, ``framework`` (DDP / FSDP / DeepSpeed),
+``launcher``, ``torch.distributed.backend``, ``training_job_id``
+(auto-resolved from ``RAY_JOB_ID``, ``TORCHELASTIC_RUN_ID``,
+``KUBEFLOW_TRAINING_JOB_ID``, ``SLURM_JOB_ID``, or a per-rank UUID),
+and Ray Train run context when running under Ray Train.
+
+
+Enabling
+~~~~~~~~
+
+The PyTorch integration is **opt-in**. Enable explicitly via::
+
+    DD_PATCH_MODULES=pytorch:true
+
+or programmatically::
+
+    import ddtrace
+    ddtrace.patch(pytorch=True)
+
+
+Global configuration
+~~~~~~~~~~~~~~~~~~~~
+
+.. py:data:: ddtrace.config.pytorch["service"]
+
+   The service name reported by default for pytorch spans.
+
+   This option can also be set with the ``DD_PYTORCH_SERVICE`` environment variable.
+
+   Default: ``"pytorch"``
+
+"""
+
+from ddtrace import config
+
+
+config._add(  # type: ignore[no-untyped-call]
+    "pytorch",
+    {
+        "_default_service": "pytorch",  # default service name for pytorch spans (see module docstring)
+    },
+)
diff --git a/ddtrace/contrib/internal/pytorch/_c_tracer.py b/ddtrace/contrib/internal/pytorch/_c_tracer.py
new file mode 100644
index 00000000000..3fa4e718a14
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/_c_tracer.py
@@ -0,0 +1,148 @@
+"""ctypes bridge to the dd-trace-c global parent context API.
+
+The C tracer is injected into the process via LD_PRELOAD by the Datadog
+injection layer — dd-trace-py does not load it. All public functions here
+are silent no-ops when the C tracer is not present.
+"""
+
+import ctypes
+from typing import Any
+from typing import Callable
+from typing import Optional
+
+from ddtrace.internal.logger import get_logger
+
+
+log = get_logger(__name__)
+
+_lib: Optional[ctypes.CDLL] = None
+_loaded: bool = False
+_set_fn: Optional[Callable[..., None]] = None
+_clear_fn: Optional[Callable[[], None]] = None
+_step_begin_fn: Optional[Callable[[], None]] = None
+_step_end_fn: Optional[Callable[[], None]] = None
+
+
+def _load() -> bool:
+    """Bind once to C tracer symbols already in the process namespace; True when set/clear are bound."""
+    global _lib, _loaded, _set_fn, _clear_fn, _step_begin_fn, _step_end_fn
+    if _loaded:
+        return _lib is not None
+    _loaded = True  # one-shot: a failed lookup is never retried
+
+    try:
+        # ctypes.CDLL(None) opens the global symbol table, which includes any
+        # library injected via LD_PRELOAD — no explicit library loading needed.
+        lib = ctypes.CDLL(None)
+        set_fn = lib.dd_set_global_parent_context
+        set_fn.restype = None
+        set_fn.argtypes = [
+            ctypes.c_uint64,  # trace_id (low 64 bits)
+            ctypes.c_uint64,  # trace_id_hi (high 64 bits)
+            ctypes.c_uint64,  # span_id
+            ctypes.c_bool,  # has_sampling_priority
+            ctypes.c_int,  # sampling_priority
+            ctypes.POINTER(ctypes.c_char_p),  # keys
+            ctypes.POINTER(ctypes.c_char_p),  # values
+            ctypes.c_size_t,  # count
+        ]
+
+        clear_fn = lib.dd_clear_global_parent_context
+        clear_fn.restype = None
+        clear_fn.argtypes = []
+    except (AttributeError, OSError, TypeError):
+        # AttributeError: C tracer not present. OSError/TypeError: the global symbol
+        # table cannot be opened on platforms where ctypes.CDLL(None) is unsupported.
+        return False
+    _set_fn, _clear_fn = set_fn, clear_fn  # publish only after both symbols resolved
+
+    # Step signals — only available in C tracer builds that include training.c.
+    # Silently absent means the heuristic NCCL-group-marker fallback activates.
+    try:
+        begin_fn = lib.dd_training_step_begin
+        begin_fn.restype = None
+        begin_fn.argtypes = []
+        end_fn = lib.dd_training_step_end
+        end_fn.restype = None
+        end_fn.argtypes = []
+        # Bind as a pair: a build exporting only one symbol must not emit unbalanced signals.
+        _step_begin_fn, _step_end_fn = begin_fn, end_fn
+    except AttributeError:
+        pass
+
+    _lib = lib
+    return True
+
+
+def set_parent_context(span: Any, open_kwargs: dict[str, Any]) -> None:
+    """Register *span* as the process-wide parent for all C-tracer root spans.
+
+    No-op when the C tracer is not present. Never raises.
+    """
+    if not _load() or _set_fn is None:
+        return
+    try:
+        trace_id = span.trace_id
+        span_id = ctypes.c_uint64(span.span_id)
+        trace_id_lo = ctypes.c_uint64(trace_id & 0xFFFFFFFFFFFFFFFF)
+        trace_id_hi = ctypes.c_uint64((trace_id >> 64) & 0xFFFFFFFFFFFFFFFF)
+
+        priority = getattr(getattr(span, "context", None), "sampling_priority", None)
+        has_priority = ctypes.c_bool(priority is not None)
+        c_priority = ctypes.c_int(int(priority) if priority is not None else 0)
+
+        # C API uses underscore-separated keys; Python span tags use dot-separated
+        # (e.g. "training_job.id"). These are intentionally different namespaces.
+        tags = {
+            "training_job_id": str(open_kwargs.get("training_job_id") or ""),
+            "rank": str(open_kwargs.get("rank", 0)),
+            "world_size": str(open_kwargs.get("world_size", 1)),
+            "framework": str(open_kwargs.get("framework") or "none"),
+            "service": str(getattr(span, "service", None) or ""),
+        }
+        keys_enc = [k.encode() for k in tags]
+        vals_enc = [v.encode() for v in tags.values()]
+        ArrType = ctypes.c_char_p * len(tags)
+
+        _set_fn(
+            trace_id_lo,
+            trace_id_hi,
+            span_id,
+            has_priority,
+            c_priority,
+            ArrType(*keys_enc),
+            ArrType(*vals_enc),
+            ctypes.c_size_t(len(tags)),
+        )
+    except Exception:
+        log.debug("pytorch: dd_set_global_parent_context failed", exc_info=True)
+
+
+def clear_parent_context() -> None:
+    """Clear the process-wide parent context. No-op when C tracer is absent. Never raises."""
+    if not _load() or _clear_fn is None:
+        return
+    try:
+        _clear_fn()
+    except Exception:
+        log.debug("pytorch: dd_clear_global_parent_context failed", exc_info=True)
+
+
+def step_begin() -> None:
+    """Signal start of a training step (forward pass begins). No-op when C tracer absent. Never raises."""
+    if not _load() or _step_begin_fn is None:
+        return
+    try:
+        _step_begin_fn()
+    except Exception:
+        log.debug("pytorch: dd_training_step_begin failed", exc_info=True)
+
+
+def step_end() -> None:
+    """Signal end of a training step (optimizer step complete). No-op when C tracer absent. Never raises."""
+    if not _load() or _step_end_fn is None:
+        return
+    try:
+        _step_end_fn()
+    except Exception:
+        log.debug("pytorch: dd_training_step_end failed", exc_info=True)
diff --git a/ddtrace/contrib/internal/pytorch/_device.py b/ddtrace/contrib/internal/pytorch/_device.py
new file mode 100644
index 00000000000..0f62e7137bd
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/_device.py
@@ -0,0 +1,313 @@
+"""Device-id discovery for pytorch.rank span tagging.
+
+Device id is a fleet-stable dimension (GPU UUID, not rank). Custom-metric
+cardinality stays bounded by physical fleet size regardless of job count;
+job attribution lives on the rank-root span via training_job.id and rank.
+"""
+
+import os
+import socket
+import threading
+from typing import Any
+from typing import NamedTuple
+from typing import Optional
+
+from ddtrace.internal.settings import env
+
+
+class DeviceInfo(NamedTuple):
+    device_id: str
+    device_index: Optional[int]
+    kind: str  # "cuda" | "cpu"
+    hostname: str
+    # GPU detail fields — Optional because discovery is best-effort: a property
+    # that cannot be read stays None, and the CPU path leaves all of them None.
+    gpu_name: Optional[str] = None
+    gpu_compute_capability: Optional[str] = None  # e.g. "8.0"
+    gpu_sm_count: Optional[int] = None
+    gpu_total_memory_bytes: Optional[int] = None
+    gpu_driver_version: Optional[str] = None
+
+
+_cache: Optional[DeviceInfo] = None
+_lock = threading.Lock()
+
+
+def _reset_child_state() -> None:
+    global _cache, _lock
+    _cache = None
+    _lock = threading.Lock()
+
+
+if hasattr(os, "register_at_fork"):
+    os.register_at_fork(after_in_child=_reset_child_state)
+
+
+def _cuda_is_available() -> bool:
+    try:
+        import torch
+
+        return bool(torch.cuda.is_available())
+    except Exception:
+        return False
+
+
+def _cuda_index(local_rank: int) -> Optional[int]:
+    # current_device() is unreliable at bootstrap (returns 0 for all ranks before
+    # set_device runs). Priority: LOCAL_RANK env → Ray Train API → current_device().
+    try:
+        env_local = env.get("LOCAL_RANK")
+        if env_local is not None and env_local != "":
+            return int(env_local)
+    except Exception:  # nosec B110
+        pass
+    try:
+        import ray.train
+
+        ctx = ray.train.get_context()
+        return int(ctx.get_local_rank())
+    except Exception:  # nosec B110
+        pass
+    try:
+        import torch
+
+        return int(torch.cuda.current_device())
+    except Exception:
+        return None
+
+
+def _cuda_visible_to_physical(visible_idx: int) -> int:
+    """Map a CUDA-visible device index to the physical NVML index.
+
+    When CUDA_VISIBLE_DEVICES remaps or subsets GPUs, the CUDA-visible index
+    (used by LOCAL_RANK / torch.cuda.current_device) differs from the physical
+    GPU index that NVML requires. Returns ``visible_idx`` unchanged when the
+    variable is unset or the entry at that position is not a plain integer.
+    """
+    raw = env.get("CUDA_VISIBLE_DEVICES") or ""
+    if not raw or raw == "NoDevFiles":
+        return visible_idx
+    # Index positionally: dropping UUID entries first would shift every later
+    # ordinal of a mixed list (e.g. "0,GPU-abc,2") onto the wrong device.
+    entries = [x.strip() for x in raw.split(",")]
+    # isdecimal() also rejects "-1": a negative entry is not a device ordinal.
+    if 0 <= visible_idx < len(entries) and entries[visible_idx].isdecimal():
+        return int(entries[visible_idx])
+    return visible_idx
+
+
+def _cuda_visible_uuid_at(visible_idx: int) -> Optional[str]:
+    """Return the CUDA_VISIBLE_DEVICES entry at position visible_idx if it is a GPU-/MIG- UUID, else None."""
+    raw = env.get("CUDA_VISIBLE_DEVICES") or ""
+    if not raw or raw == "NoDevFiles":
+        return None
+    # Index the full list: filtering to UUIDs first would shift positions in a mixed
+    # list (e.g. "0,GPU-abc"), and a negative index would wrap around to the tail.
+    entries = [x.strip() for x in raw.split(",")]
+    entry = entries[visible_idx] if 0 <= visible_idx < len(entries) else ""
+    return entry if entry.upper().startswith(("GPU-", "MIG-")) else None
+
+
+def _query_cuda_uuid(idx: int) -> Optional[str]:
+    # UUID-format CUDA_VISIBLE_DEVICES (k8s NVIDIA device plugin): use UUID handle
+    # directly — passing the visible ordinal to nvmlDeviceGetHandleByIndex returns
+    # the wrong physical device.
+    env_uuid = _cuda_visible_uuid_at(idx)
+    if env_uuid is not None:
+        try:
+            import pynvml
+
+            pynvml.nvmlInit()
+            try:
+                encoded = env_uuid.encode() if isinstance(env_uuid, str) else env_uuid
+                handle = pynvml.nvmlDeviceGetHandleByUUID(encoded)
+                raw = pynvml.nvmlDeviceGetUUID(handle)
+                return raw.decode() if isinstance(raw, bytes) else str(raw)
+            finally:
+                try:
+                    pynvml.nvmlShutdown()
+                except Exception:  # nosec B110
+                    pass
+        except Exception:  # nosec B110
+            pass
+        return env_uuid
+
+    # Prefer pynvml (stable across torch versions); fall back to torch device-properties UUID (2.0+).
+    try:
+        import pynvml
+
+        pynvml.nvmlInit()
+        try:
+            handle = pynvml.nvmlDeviceGetHandleByIndex(_cuda_visible_to_physical(idx))
+            raw = pynvml.nvmlDeviceGetUUID(handle)
+            return raw.decode() if isinstance(raw, bytes) else str(raw)
+        finally:
+            try:
+                pynvml.nvmlShutdown()
+            except Exception:  # nosec B110
+                pass
+    except Exception:  # nosec B110
+        pass
+    try:
+        import torch
+
+        props = torch.cuda.get_device_properties(idx)
+        uuid = getattr(props, "uuid", None)
+        if uuid is not None:
+            return str(uuid)
+    except Exception:  # nosec B110
+        pass
+    return None
+
+
+def _query_cuda_props(idx: int) -> dict[str, Any]:
+    """Best-effort fetch of additional device fields from
+    `torch.cuda.get_device_properties(idx)`. Returns a dict with only the
+    fields we managed to read; missing fields are omitted.
+    """
+    out: dict[str, Any] = {}
+    try:
+        import torch  # noqa: PLC0415
+
+        props = torch.cuda.get_device_properties(idx)
+    except Exception:
+        return out
+    name = getattr(props, "name", None)
+    if name:
+        out["gpu_name"] = str(name)
+    major = getattr(props, "major", None)
+    minor = getattr(props, "minor", None)
+    if major is not None and minor is not None:
+        out["gpu_compute_capability"] = f"{int(major)}.{int(minor)}"
+    sm = getattr(props, "multi_processor_count", None)
+    if sm is not None:
+        try:
+            out["gpu_sm_count"] = int(sm)
+        except Exception:  # nosec B110
+            pass
+    total = getattr(props, "total_memory", None)
+    if total is not None:
+        try:
+            out["gpu_total_memory_bytes"] = int(total)
+        except Exception:  # nosec B110
+            pass
+    return out
+
+
+def _query_cuda_driver_version() -> Optional[str]:
+    try:
+        import pynvml  # noqa: PLC0415
+
+        pynvml.nvmlInit()
+        try:
+            raw = pynvml.nvmlSystemGetDriverVersion()
+            return raw.decode() if isinstance(raw, bytes) else str(raw)
+        finally:
+            try:
+                pynvml.nvmlShutdown()
+            except Exception:  # nosec B110
+                pass
+    except Exception:
+        return None
+
+
+def _hostname() -> str:
+    try:
+        return socket.gethostname()
+    except Exception:
+        return "unknown-host"
+
+
+def discover(local_rank: int) -> DeviceInfo:
+    """Resolve and cache the device id. Idempotent — second call returns the cached value."""
+    global _cache
+    with _lock:
+        if _cache is not None:
+            return _cache
+        host = _hostname()
+        if _cuda_is_available():
+            idx = _cuda_index(local_rank)
+            if idx is None:
+                # Can't reliably map local_rank → physical device; skip UUID lookup.
+                _cache = DeviceInfo(
+                    device_id="%s:cuda:unknown" % host,
+                    device_index=None,
+                    kind="cuda",
+                    hostname=host,
+                )
+                return _cache
+            uuid = _query_cuda_uuid(idx)
+            device_id = uuid if uuid else "%s:cuda:%d" % (host, idx)
+            props = _query_cuda_props(idx)
+            driver_v = _query_cuda_driver_version()
+            _cache = DeviceInfo(
+                device_id=device_id,
+                device_index=idx,
+                kind="cuda",
+                hostname=host,
+                gpu_name=props.get("gpu_name"),
+                gpu_compute_capability=props.get("gpu_compute_capability"),
+                gpu_sm_count=props.get("gpu_sm_count"),
+                gpu_total_memory_bytes=props.get("gpu_total_memory_bytes"),
+                gpu_driver_version=driver_v,
+            )
+        else:
+            # CPU: one logical device per host for cardinality bounding.
+            _cache = DeviceInfo(
+                device_id="%s:cpu" % host,
+                device_index=None,
+                kind="cpu",
+                hostname=host,
+            )
+        return _cache
+
+
+def get() -> Optional[DeviceInfo]:
+    """Return the cached DeviceInfo, or None if `discover` has not yet run."""
+    return _cache
+
+
+# Per-GPU peak FLOPs by dtype, in FLOPS (not TFLOPS).
+# Maintenance: add new GPUs here as needed. Values from official datasheets.
+_PEAK_FLOPS_TABLE: dict[tuple[str, str], float] = {
+    # NVIDIA H100 SXM5 / PCIe — figures for tensor cores
+    ("H100", "bfloat16"): 989e12,
+    ("H100", "float16"): 989e12,
+    ("H100", "tf32"): 495e12,
+    ("H100", "float32"): 67e12,
+    # NVIDIA A100 SXM4 / PCIe
+    ("A100", "bfloat16"): 312e12,
+    ("A100", "float16"): 312e12,
+    ("A100", "tf32"): 156e12,
+    ("A100", "float32"): 19.5e12,
+    # NVIDIA L40 / L4 — Ada Lovelace. fp16 shares bf16 tensor-core path;
+    # fp32 is the non-tensor ALU peak per datasheet.
+    ("L40", "bfloat16"): 181e12,
+    ("L40", "float16"): 181e12,
+    ("L40", "float32"): 90.5e12,
+    ("L4", "bfloat16"): 121e12,
+    ("L4", "float16"): 121e12,
+    ("L4", "float32"): 30.3e12,
+    # NVIDIA V100
+    ("V100", "float16"): 125e12,
+    ("V100", "float32"): 15.7e12,
+    # NVIDIA T4
+    ("T4", "float16"): 65e12,
+    ("T4", "float32"): 8.1e12,
+    # AMD MI300X — CDNA3 matrix peaks per datasheet; fp32 is the vector ALU peak.
+    ("MI300", "bfloat16"): 1300e12,
+    ("MI300", "float16"): 1300e12,
+    ("MI300", "float32"): 163.4e12,
+}
+
+
+def lookup_peak_flops(gpu_name: Optional[str], dtype: str) -> Optional[float]:
+    """Best-effort peak-FLOPS lookup: the longest table model key contained in `gpu_name` wins.
+
+    Returns None when no model key matches or the matched model has no `dtype` entry.
+    """
+    if not gpu_name:
+        return None
+    # Pick the model before the dtype so "L40" is never answered from the "L4" rows.
+    models = [model for model, _ in _PEAK_FLOPS_TABLE if model in gpu_name]
+    return _PEAK_FLOPS_TABLE.get((max(models, key=len), dtype)) if models else None
diff --git a/ddtrace/contrib/internal/pytorch/_distributed.py b/ddtrace/contrib/internal/pytorch/_distributed.py
new file mode 100644
index 00000000000..cf3d8f7a12d
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/_distributed.py
@@ -0,0 +1,517 @@
+"""Distributed-training bootstrap: wraps init/destroy_process_group to
+open and close the pytorch.rank lifetime span.
+"""
+
+import contextvars
+import threading
+from typing import Any
+from typing import Optional
+
+import torch
+
+from ddtrace.contrib.internal.pytorch._utils import get_cached_job_id
+from ddtrace.contrib.internal.pytorch._utils import job_id_env_set
+from ddtrace.contrib.internal.pytorch._utils import resolve_job_id_from_env
+from ddtrace.contrib.internal.pytorch._utils import set_cached_job_id
+from ddtrace.contrib.internal.trace_utils import unwrap as _unwrap
+from ddtrace.contrib.internal.trace_utils import wrap as _wrap
+from ddtrace.internal import core
+from ddtrace.internal import forksafe
+from ddtrace.internal.logger import get_logger
+from ddtrace.internal.settings import env
+
+
+log = get_logger(__name__)
+
+_no_env_job_id_warned: bool = False
+_installed: bool = False
+_optimizer_wrapped: bool = False
+_fsdp_hook_registered: bool = False
+_deepspeed_hook_registered: bool = False
+
+# Tracks the active ExecutionContext for the current distributed training session.
+# Presence (non-None) doubles as the "bootstrapped" flag.
+# AIDEV-NOTE: ContextVar is per-thread — safe because init/destroy_process_group always run on the same thread in DDP.
+_rank_ctx: contextvars.ContextVar[Optional[core.ExecutionContext[Any]]] = contextvars.ContextVar(
+    "pytorch_rank_ctx", default=None
+)
+
+_cached_distributed_backend: Optional[str] = None
+
+
+def _step_profiling_enabled() -> bool:
+    return env.get("DD_TRAINING_STEP_PROFILING", "false").lower() in ("true", "1")
+
+
+# Wire-format env var names set by the Ray contrib on worker processes.
+# AIDEV-NOTE: duplicated from ddtrace.contrib.internal.ray intentionally —
+# contrib-to-contrib imports break isolation (ray contrib may not be installed).
+# If these names ever change, update both sides.
+_RAY_SUBMISSION_ID_ENV = "_RAY_SUBMISSION_ID"
+_RAY_JOB_NAME_ENV = "_RAY_JOB_NAME"
+_RAY_RUN_METADATA_ENV = "_DD_RAY_RUN_METADATA"
+
+
+def _reset_child_state() -> None:
+    global \
+        _no_env_job_id_warned, \
+        _cached_distributed_backend, \
+        _fsdp_hook_registered, \
+        _deepspeed_hook_registered, \
+        _optimizer_wrapped
+    ctx = _rank_ctx.get()
+    if ctx is not None:
+        _rank_ctx.set(None)
+        # AIDEV-NOTE: Deferred imports + manual reset — import system may be unsafe post-fork.
+        try:
+            from ddtrace.internal.core import _CURRENT_CONTEXT  # noqa: PLC0415
+            from ddtrace.internal.core import ROOT_CONTEXT_ID  # noqa: PLC0415
+            from ddtrace.internal.core import ExecutionContext  # noqa: PLC0415
+
+            _CURRENT_CONTEXT.set(ExecutionContext(ROOT_CONTEXT_ID))
+        except Exception:  # nosec B110
+            pass
+    _no_env_job_id_warned = False
+    _cached_distributed_backend = None
+    _fsdp_hook_registered = False
+    _deepspeed_hook_registered = False
+    _optimizer_wrapped = False
+
+
+forksafe.register(_reset_child_state)
+
+
+def _distributed_available() -> bool:
+    try:
+        return bool(torch.distributed.is_available())
+    except Exception:
+        return False
+
+
+def _get_cached_backend() -> Optional[str]:
+    """One-shot lookup of ``torch.distributed.get_backend()``. Caches the
+    result on first successful call. The backend (nccl/gloo/mpi) does not
+    change during the lifetime of a process group.
+    """
+    global _cached_distributed_backend
+    if _cached_distributed_backend is not None:
+        return _cached_distributed_backend
+    try:
+        if _distributed_available() and torch.distributed.is_initialized():
+            _cached_distributed_backend = str(torch.distributed.get_backend())
+    except Exception:
+        return None
+    return _cached_distributed_backend
+
+
+def _populate_ray_run_metadata() -> None:
+    """Read Ray-set env vars into the run-metadata cache so _tag_ray_run_context can find them."""
+    sub, rn = env.get(_RAY_SUBMISSION_ID_ENV), env.get(_RAY_JOB_NAME_ENV)
+    md_json = env.get(_RAY_RUN_METADATA_ENV)
+    metadata: dict[str, Any] = {}
+    if md_json:
+        try:
+            import json  # noqa: PLC0415
+
+            parsed = json.loads(md_json)
+            metadata = parsed if isinstance(parsed, dict) else {}  # drop non-object JSON
+        except Exception:  # nosec B110
+            pass
+    if sub or rn or metadata:
+        from ddtrace.contrib.internal.pytorch._utils import set_cached_run_metadata  # noqa: PLC0415
+
+        set_cached_run_metadata(submission_id=sub, run_name=rn, metadata=metadata or None)
+
+
+def _detect_launcher() -> Optional[str]:
+    """Guess the launcher from well-known env vars; the first non-empty one wins, else None."""
+    # Probed in this order: torchrun, Ray, SLURM, Kubeflow.
+    probes = (
+        ("TORCHELASTIC_RUN_ID", "torchrun"),
+        ("RAY_JOB_ID", "ray"),
+        ("SLURM_JOB_ID", "slurm"),
+        ("KUBEFLOW_TRAINING_JOB_ID", "kubeflow"),
+    )
+    # The generator keeps lookups lazy: later variables are not read after a hit.
+    return next((name for var, name in probes if env.get(var)), None)
+
+
+def _bootstrap_distributed() -> None:
+    """Capture rank/world_size and open the pytorch.rank span.
+
+    Cross-rank correlation requires an env-supplied id (RAY_JOB_ID,
+    TORCHELASTIC_RUN_ID, KUBEFLOW_TRAINING_JOB_ID, SLURM_JOB_ID). When none
+    is resolved, training_job.id is left unset so missing correlation is visible.
+    """
+    global _no_env_job_id_warned
+
+    cached = get_cached_job_id()
+    env_id_present = job_id_env_set()
+    job_id = cached or resolve_job_id_from_env()
+
+    rank: int = 0
+    world_size: int = 1
+    try:
+        if _distributed_available() and torch.distributed.is_initialized():
+            rank = torch.distributed.get_rank()
+            world_size = torch.distributed.get_world_size()
+    except Exception:
+        log.exception("pytorch: failed to capture rank/world_size; defaulting to single-rank")
+
+    publishable_job_id: Optional[str] = job_id
+    if not cached and not env_id_present:
+        publishable_job_id = None
+        if not _no_env_job_id_warned:
+            log.warning(
+                "pytorch: no shared training job id resolved from env "
+                "(DD_PYTORCH_JOB_ID, RAY_JOB_ID, TORCHELASTIC_RUN_ID, "
+                "KUBEFLOW_TRAINING_JOB_ID, SLURM_JOB_ID). Cross-rank trace "
+                "correlation will be DISABLED for this run — spans will not "
+                "carry the training_job.id tag."
+            )
+            _no_env_job_id_warned = True
+
+    if publishable_job_id is not None:
+        set_cached_job_id(publishable_job_id, is_default=True)
+
+    _populate_ray_run_metadata()
+
+    from ddtrace.contrib.internal.pytorch import _device  # noqa: PLC0415
+    from ddtrace.contrib.internal.pytorch import _rank_root  # noqa: PLC0415
+
+    try:
+        _device.discover(local_rank=rank)
+    except Exception:
+        log.exception("pytorch: device discovery failed")
+
+    try:
+        _rank_root.open_rank_span(
+            rank=rank,
+            world_size=world_size,
+            framework="none",
+            training_job_id=publishable_job_id,
+        )
+    except Exception:
+        log.exception("pytorch: rank-root span open failed")
+
+    if _step_profiling_enabled():
+        try:
+            from ddtrace.contrib.internal.pytorch import _c_tracer  # noqa: PLC0415
+
+            _c_tracer.step_begin()
+        except Exception:
+            log.debug("pytorch: step_begin after bootstrap failed", exc_info=True)
+
+
+def _wrapped_init_process_group(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
+    already = _rank_ctx.get() is not None
+
+    result = wrapped(*args, **kwargs)  # let exceptions propagate; do NOT open context yet
+
+    if not already:
+        ctx = core.context_with_data("pytorch.rank", _dispatch_end_event=False)  # type: ignore[no-untyped-call]
+        # AIDEV-NOTE: __enter__() updates _CURRENT_CONTEXT so child spans are parented here; _dispatch_end_event=False
+        # defers the ended event — dispatch_ended_event() + __exit__() are called in _wrapped_destroy_process_group.
+        ctx.__enter__()
+        _rank_ctx.set(ctx)
+        try:
+            _bootstrap_distributed()
+        except Exception:
+            log.exception("pytorch: bootstrap failed inside init_process_group wrapper")
+    return result
+
+
+def _is_world_group(group: Any) -> bool:
+    """Return True if group is the default WORLD process group.
+
+    Handles both the no-arg (None) and the explicit group=torch.distributed.group.WORLD forms.
+    """
+    if group is None:
+        return True
+    try:
+        return group is torch.distributed.group.WORLD
+    except AttributeError:
+        return False
+
+
+def _wrapped_destroy_process_group(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
+    """Call destroy_process_group, then tear down rank state if the WORLD group was destroyed."""
+    global _cached_distributed_backend
+    group = kwargs.get("group", args[0] if args else None)
+    try:
+        return wrapped(*args, **kwargs)
+    finally:
+        # Subgroup destroys must not end the span. Teardown is guarded: raising here would mask the call.
+        if _is_world_group(group):
+            if _step_profiling_enabled():
+                try:
+                    from ddtrace.contrib.internal.pytorch import _c_tracer  # noqa: PLC0415
+
+                    _c_tracer.step_end()
+                except Exception:
+                    log.debug("pytorch: step_end before rank-root close failed", exc_info=True)
+            try:
+                from ddtrace.contrib.internal.pytorch import _rank_root  # noqa: PLC0415
+
+                _rank_root.close()
+            except Exception:
+                log.debug("pytorch: rank-root close raised", exc_info=True)
+            ctx = _rank_ctx.get()
+            if ctx is not None:
+                try:
+                    ctx.dispatch_ended_event()
+                    ctx.__exit__(None, None, None)
+                except Exception:
+                    log.debug("pytorch: rank context teardown raised", exc_info=True)
+                _rank_ctx.set(None)  # always clear, so the next init_process_group bootstraps again
+            _cached_distributed_backend = None
+            try:
+                set_cached_job_id(None, is_default=True)
+            except Exception:  # nosec B110
+                pass
+
+
+def _wrapped_ddp_init(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
+    result = wrapped(*args, **kwargs)
+    try:
+        from ddtrace.contrib.internal.pytorch import _rank_root  # noqa: PLC0415
+
+        _rank_root.set_framework("ddp")
+    except Exception:
+        log.debug("pytorch: failed to update rank-root framework tag", exc_info=True)
+    return result
+
+
+def _install_ddp() -> None:
+    try:
+        import torch.nn.parallel.distributed  # noqa: F401
+    except Exception:
+        return
+    if not hasattr(torch.nn.parallel.distributed, "DistributedDataParallel"):
+        return
+    _wrap(
+        "torch.nn.parallel.distributed",
+        "DistributedDataParallel.__init__",
+        _wrapped_ddp_init,
+    )
+
+
+def _uninstall_ddp() -> None:
+    try:
+        import torch.nn.parallel.distributed  # noqa: F401
+    except Exception:
+        return
+    if not hasattr(torch.nn.parallel.distributed, "DistributedDataParallel"):
+        return
+    try:
+        _unwrap(torch.nn.parallel.distributed.DistributedDataParallel, "__init__")
+    except Exception:
+        log.debug("pytorch: failed to unwrap DDP.__init__", exc_info=True)
+
+
+def _wrapped_fsdp_init(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
+    result = wrapped(*args, **kwargs)
+    try:
+        from ddtrace.contrib.internal.pytorch import _rank_root  # noqa: PLC0415
+
+        _rank_root.set_framework("fsdp")
+    except Exception:
+        log.debug("pytorch: failed to update rank-root framework tag", exc_info=True)
+    return result
+
+
+def _install_fsdp() -> None:
+    # AIDEV-NOTE: defer the import of torch.distributed.fsdp until the user
+    # actually imports it. Eagerly importing it pulls _dynamo + sympy (~1.3s
+    # startup cost) for every DDP workload that never uses FSDP.
+    global _fsdp_hook_registered
+    if _fsdp_hook_registered:
+        return
+    from wrapt import register_post_import_hook
+
+    def _do_install(_module: object) -> None:
+        if not _installed:
+            return
+        try:
+            import torch.distributed.fsdp as _fsdp  # noqa: PLC0415
+
+            if hasattr(_fsdp.FullyShardedDataParallel.__init__, "__wrapped__"):
+                return
+            _wrap(
+                "torch.distributed.fsdp",
+                "FullyShardedDataParallel.__init__",
+                _wrapped_fsdp_init,
+            )
+        except Exception:
+            log.exception("pytorch: failed to install FSDP wrapper")
+
+    register_post_import_hook(_do_install, "torch.distributed.fsdp")
+    _fsdp_hook_registered = True
+
+
+def _uninstall_fsdp() -> None:
+    """Unwrap FSDP.__init__ if fsdp is already loaded; never import it here (costly, see _install_fsdp)."""
+    import sys  # noqa: PLC0415
+
+    if "torch.distributed.fsdp" in sys.modules:
+        try:
+            _unwrap(sys.modules["torch.distributed.fsdp"].FullyShardedDataParallel, "__init__")
+        except Exception:
+            log.debug("pytorch: failed to unwrap FSDP.__init__", exc_info=True)
+
+
+def _wrapped_deepspeed_init(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
+    result = wrapped(*args, **kwargs)
+    try:
+        from ddtrace.contrib.internal.pytorch import _rank_root  # noqa: PLC0415
+
+        _rank_root.set_framework("deepspeed")
+    except Exception:
+        log.debug("pytorch: failed to update rank-root framework tag", exc_info=True)
+    return result
+
+
+def _install_deepspeed() -> None:
+    global _deepspeed_hook_registered
+    if _deepspeed_hook_registered:
+        return
+    from wrapt import register_post_import_hook
+
+    def _do_install(deepspeed: object) -> None:
+        if not _installed:
+            return
+        if not hasattr(deepspeed, "initialize"):
+            return
+        if hasattr(deepspeed.initialize, "__wrapped__"):
+            return
+        try:
+            _wrap("deepspeed", "initialize", _wrapped_deepspeed_init)
+        except Exception:
+            log.exception("pytorch: failed to install deepspeed wrapper")
+
+    register_post_import_hook(_do_install, "deepspeed")
+    _deepspeed_hook_registered = True
+
+
+def _uninstall_deepspeed() -> None:
+    """Unwrap deepspeed.initialize if deepspeed is already loaded; never import it just to unwrap."""
+    import sys  # noqa: PLC0415
+
+    if "deepspeed" in sys.modules:
+        try:
+            _unwrap(sys.modules["deepspeed"], "initialize")
+        except Exception:
+            log.debug("pytorch: failed to unwrap deepspeed.initialize", exc_info=True)
+
+
+def _install_optimizer_step() -> None:
+    global _optimizer_wrapped
+    if _optimizer_wrapped or not _step_profiling_enabled():
+        return
+    if not hasattr(torch.optim, "Optimizer"):
+        return
+    try:
+        _wrap("torch.optim", "Optimizer.step", _wrapped_optimizer_step)
+        _optimizer_wrapped = True
+    except Exception:
+        log.debug("pytorch: failed to wrap Optimizer.step", exc_info=True)
+
+
+def _uninstall_optimizer_step() -> None:
+    global _optimizer_wrapped
+    if not _optimizer_wrapped:
+        return
+    if not hasattr(torch.optim, "Optimizer"):
+        return
+    try:
+        _unwrap(torch.optim.Optimizer, "step")
+    except Exception:
+        log.debug("pytorch: failed to unwrap Optimizer.step", exc_info=True)
+    _optimizer_wrapped = False
+
+
+def _wrapped_optimizer_step(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
+    from ddtrace.contrib.internal.pytorch import _c_tracer  # noqa: PLC0415
+
+    _c_tracer.step_end()  # close step N; NOTE(review): fires before the update runs — confirm intended
+    result = wrapped(*args, **kwargs)
+    _c_tracer.step_begin()  # open step N+1 once the update returns (skipped if it raises)
+    return result
+
+
+def install() -> None:
+    global _installed
+    if _installed:
+        return
+    _installed = True
+    if _distributed_available() and hasattr(torch.distributed, "init_process_group"):
+        _wrap("torch.distributed", "init_process_group", _wrapped_init_process_group)
+    if _distributed_available() and hasattr(torch.distributed, "destroy_process_group"):
+        _wrap("torch.distributed", "destroy_process_group", _wrapped_destroy_process_group)
+    _install_ddp()
+    _install_fsdp()
+    _install_deepspeed()
+    _install_optimizer_step()
+    # Late-patch bootstrap: if init_process_group was called before patch(),
+    # our wrapper will never fire. Run bootstrap now.
+    if _distributed_available():
+        try:
+            if torch.distributed.is_initialized() and _rank_ctx.get() is None:
+                ctx = core.context_with_data("pytorch.rank", _dispatch_end_event=False)  # type: ignore[no-untyped-call]
+                ctx.__enter__()
+                _rank_ctx.set(ctx)
+                _bootstrap_distributed()
+        except Exception:
+            log.exception("pytorch: late-patch bootstrap failed")
+
+
+def uninstall() -> None:
+    global _installed, _fsdp_hook_registered, _deepspeed_hook_registered
+    if _installed:
+        _installed = False
+        if _distributed_available():
+            for fn in ("destroy_process_group", "init_process_group"):
+                if hasattr(torch.distributed, fn):
+                    try:
+                        _unwrap(torch.distributed, fn)
+                    except Exception:
+                        log.debug("pytorch: failed to unwrap torch.distributed.%s", fn, exc_info=True)
+        _uninstall_ddp()
+        _uninstall_fsdp()
+        _fsdp_hook_registered = False
+        _uninstall_deepspeed()
+        _deepspeed_hook_registered = False
+        _uninstall_optimizer_step()
+    try:
+        from ddtrace.contrib.internal.pytorch import _device as _device_mod  # noqa: PLC0415
+
+        _device_mod._cache = None
+    except Exception:  # nosec B110
+        pass
+    try:
+        from ddtrace.contrib.internal.pytorch._utils import clear_cached_run_metadata  # noqa: PLC0415
+
+        clear_cached_run_metadata()
+    except Exception:  # nosec B110
+        pass
+    try:
+        from ddtrace.contrib.internal.pytorch import _rank_root  # noqa: PLC0415
+
+        _rank_root.close()
+    except Exception:
+        log.debug("pytorch: rank-root close raised in uninstall", exc_info=True)
+    ctx = _rank_ctx.get()
+    if ctx is not None:
+        ctx.dispatch_ended_event()
+        ctx.__exit__(None, None, None)
+        _rank_ctx.set(None)
+    try:
+        from ddtrace.contrib.internal.pytorch import _utils as _utils_mod  # noqa: PLC0415
+
+        _utils_mod._default_job_id = None
+        _utils_mod._tls_job_id = threading.local()
+    except Exception:
+        log.debug("pytorch: failed to reset cached job id on uninstall", exc_info=True)
+    global _no_env_job_id_warned, _cached_distributed_backend
+    _no_env_job_id_warned = False
+    _cached_distributed_backend = None
diff --git a/ddtrace/contrib/internal/pytorch/_rank_root.py b/ddtrace/contrib/internal/pytorch/_rank_root.py
new file mode 100644
index 00000000000..f40845f9c79
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/_rank_root.py
@@ -0,0 +1,393 @@
+"""Per-rank lifetime span for PyTorch distributed training.
+
+Emits one ``pytorch.rank`` span per rank, open for the lifetime of the
+distributed process group. Carries ``rank``, ``world_size``, ``framework``,
+``training_job.id``, and device tags.
+
+The span is rotated every ``_rotation_interval_s`` seconds (default 600)
+so partial data is visible during long runs. Rotated spans carry
+``_dd.was_long_running=1``.
+"""
+
+import atexit
+import threading
+from typing import Any
+from typing import Optional
+
+from ddtrace import config
+from ddtrace import tracer
+from ddtrace.contrib.internal.pytorch import _c_tracer
+from ddtrace.contrib.internal.pytorch import _device
+from ddtrace.contrib.internal.trace_utils import int_service
+from ddtrace.internal import forksafe
+from ddtrace.internal.logger import get_logger
+from ddtrace.internal.settings import env
+from ddtrace.internal.threads import Lock
+
+
+log = get_logger(__name__)
+
+_lock = Lock()
+_span: Optional[Any] = None
+_atexit_registered = False
+_rotation_interval_s: int = 600
+_rotation_timer: Optional[threading.Timer] = None
+_open_kwargs: dict[str, Any] = {}
+
+
+def _build_span(kwargs: dict[str, Any]) -> Optional[Any]:  # None only when start_span raises; tagging errors are logged
+    rank = kwargs["rank"]
+    world_size = kwargs["world_size"]
+    framework = kwargs["framework"]
+    training_job_id = kwargs["training_job_id"]
+    try:
+        span = tracer.start_span(
+            "pytorch.rank",
+            service=int_service(None, config.pytorch, default="pytorch"),
+            child_of=tracer.current_span() if framework == "ray" else None,
+            activate=False,
+        )
+    except Exception:
+        log.debug("pytorch: failed to open pytorch.rank span", exc_info=True)
+        return None
+    try:
+        span._set_attribute("rank", int(rank))
+        span._set_attribute("world_size", int(world_size))
+        span.set_tag("framework", framework or "none")
+        span.set_tag("component", "pytorch")
+        span.set_tag("debug.level", "0")
+        if training_job_id:
+            span.set_tag("training_job.id", training_job_id)
+            span.set_tag("job_id", training_job_id)
+        # Force-keep: losing this per-rank anchor to base sampling permanently
+        # breaks workload attribution via the span's time range.
+        span.set_tag("manual.keep")
+        info = _device.get()
+        if info is not None:
+            span.set_tag("device.id", info.device_id)
+            span.set_tag("device.kind", info.kind)
+            span.set_tag("host", info.hostname)
+            if info.device_index is not None:
+                span._set_attribute("device.index", info.device_index)
+            if info.gpu_name:
+                span.set_tag("device.gpu.name", info.gpu_name)
+            if info.gpu_compute_capability:
+                span.set_tag("device.gpu.compute_capability", info.gpu_compute_capability)
+            if info.gpu_sm_count is not None:
+                span._set_attribute("device.gpu.sm_count", info.gpu_sm_count)
+            if info.gpu_total_memory_bytes is not None:
+                span._set_attribute("device.gpu.total_memory_bytes", info.gpu_total_memory_bytes)
+            if info.gpu_driver_version:
+                span.set_tag("device.gpu.driver_version", info.gpu_driver_version)
+        # Torch build/runtime invariants; most probes have their own try/except so one failure keeps the rest.
+        try:
+            import torch  # noqa: PLC0415
+
+            torch_ver = getattr(torch, "__version__", "") or ""
+            if torch_ver:
+                span.set_tag("torch.version", str(torch_ver))
+            cuda_ver = getattr(getattr(torch, "version", None), "cuda", None)
+            if cuda_ver:
+                span.set_tag("torch.cuda.version", str(cuda_ver))
+            hip_ver = getattr(getattr(torch, "version", None), "hip", None)
+            if hip_ver:
+                span.set_tag("torch.cuda.hip_version", str(hip_ver))
+            try:
+                nccl_ver = torch.cuda.nccl.version()
+                if isinstance(nccl_ver, tuple) and nccl_ver:
+                    span.set_tag("torch.cuda.nccl_version", ".".join(str(p) for p in nccl_ver))
+            except Exception:  # nosec B110
+                pass
+            cudnn = getattr(getattr(torch, "backends", None), "cudnn", None)
+            if cudnn is not None:
+                try:
+                    span.set_tag("torch.cudnn.enabled", "true" if bool(cudnn.enabled) else "false")
+                except Exception:  # nosec B110
+                    pass
+                try:
+                    span.set_tag("torch.cudnn.benchmark", "true" if bool(cudnn.benchmark) else "false")
+                except Exception:  # nosec B110
+                    pass
+                try:
+                    span.set_tag(
+                        "torch.cudnn.deterministic",
+                        "true" if bool(cudnn.deterministic) else "false",
+                    )
+                except Exception:  # nosec B110
+                    pass
+                try:
+                    v = cudnn.version()
+                    if isinstance(v, int):
+                        span._set_attribute("torch.cudnn.version", v)
+                except Exception:  # nosec B110
+                    pass
+            try:
+                prec = torch.get_float32_matmul_precision()
+                if prec:
+                    span.set_tag("torch.float32_matmul_precision", str(prec))
+            except Exception:  # nosec B110
+                pass
+            try:
+                if torch.backends.mps.is_available():
+                    span.set_tag("torch.mps.available", "true")
+            except Exception:  # nosec B110
+                pass
+        except Exception:
+            log.debug("pytorch.rank: torch invariants tagging failed", exc_info=True)
+        # Selected NCCL/launcher env vars: string-valued tags first, then integer-valued attributes.
+        try:
+            for envvar, tag in (
+                ("NCCL_DEBUG", "nccl.debug"),
+                ("NCCL_SOCKET_IFNAME", "nccl.socket_ifname"),
+                ("NCCL_IB_DISABLE", "nccl.ib_disable"),
+                ("NCCL_P2P_DISABLE", "nccl.p2p_disable"),
+                ("NCCL_ALGO", "nccl.algo"),
+                ("NCCL_PROTO", "nccl.proto"),
+                ("TORCH_NCCL_ASYNC_ERROR_HANDLING", "nccl.async_error_handling"),
+                ("CUDA_VISIBLE_DEVICES", "device.cuda.visible_devices"),
+                ("MASTER_ADDR", "pytorch.master_addr"),
+            ):
+                val = env.get(envvar)
+                if val:
+                    span.set_tag(tag, str(val))
+            for envvar, facet in (
+                ("LOCAL_RANK", "pytorch.local_rank"),
+                ("LOCAL_WORLD_SIZE", "pytorch.local_world_size"),
+                ("GROUP_RANK", "pytorch.group_rank"),
+                ("GROUP_WORLD_SIZE", "pytorch.group_world_size"),
+                ("MASTER_PORT", "pytorch.master_port"),
+            ):
+                val = env.get(envvar)
+                if val:
+                    try:
+                        span._set_attribute(facet, int(val))
+                    except Exception:  # nosec B110
+                        pass
+        except Exception:
+            log.debug("pytorch.rank: env-signal tagging failed", exc_info=True)
+        # Launcher and backend are looked up through _distributed, imported lazily inside this function.
+        try:
+            from ddtrace.contrib.internal.pytorch._distributed import _detect_launcher  # noqa: PLC0415
+            from ddtrace.contrib.internal.pytorch._distributed import _get_cached_backend  # noqa: PLC0415
+
+            launcher = _detect_launcher()
+            if launcher:
+                span.set_tag("launcher", launcher)
+            backend = _get_cached_backend()
+            if backend:
+                span.set_tag("torch.distributed.backend", backend)
+        except Exception:
+            log.debug("pytorch.rank: launcher/backend tagging failed", exc_info=True)
+        # Ray Train run-context tags (run name, submission id, metadata) from the run-metadata cache.
+        _tag_ray_run_context(span)
+    except Exception:
+        log.debug("pytorch: failed to tag pytorch.rank span", exc_info=True)
+    return span
+
+
+def _tag_ray_run_context(span: Any) -> None:
+    """Copy the cached Ray Train run name, submission id and metadata onto ``span``; failures are debug-logged."""
+    try:
+        from ddtrace.contrib.internal.pytorch._utils import get_cached_run_metadata  # noqa: PLC0415
+
+        cached = get_cached_run_metadata()
+        run_name = cached.get("run_name")
+        submission_id = cached.get("submission_id")
+        extra = cached.get("metadata") or {}
+        if run_name:
+            span.set_tag("ray.train.run_name", run_name)
+        if submission_id:
+            span.set_tag("ray.submission_id", submission_id)
+        for key, value in extra.items():
+            try:
+                span.set_tag(f"ray.metadata.{key}", str(value))
+            except Exception:
+                log.debug("pytorch.rank: failed to set metadata tag %s", key, exc_info=True)
+    except Exception:
+        log.debug("pytorch.rank: failed to apply Ray run metadata", exc_info=True)
+
+
+def retag_ray_run_context() -> None:
+    """Re-apply Ray Train run-context tags to the currently-open ``pytorch.rank`` span.
+
+    Intended to run right after the pytorch-utils run-metadata cache is populated, so
+    the long-running rank span carries ``ray.submission_id`` for its whole lifetime
+    rather than only at close (when the cache may already have been cleared).
+    No-op when no rank span is open.
+    """
+    # Take the reference under the lock; tag outside it.
+    with _lock:
+        current = _span
+    if current is not None:
+        try:
+            _tag_ray_run_context(current)
+        except Exception:
+            log.debug("pytorch.rank: retag_ray_run_context failed", exc_info=True)
+
+
+def _schedule_rotation() -> None:
+    """Arm a one-shot daemon timer that invokes ``_rotate_span``. Caller is expected to hold ``_lock``."""
+    global _rotation_timer
+    timer = threading.Timer(_rotation_interval_s, _rotate_span)
+    timer.name = "dd-pytorch-rank-rotation"
+    timer.daemon = True
+    timer.start()
+    _rotation_timer = timer
+
+
+def _rotate_span() -> None:
+    """Finish the current rank span and open a fresh one. Called by the rotation timer.
+
+    On build failure the old span is still finished, rotation stops, and the C parent is cleared.
+    """
+    global _span, _rotation_timer
+    with _lock:
+        old_span = _span
+        if old_span is None:
+            return
+    # Built outside the lock; close() or open_rank_span() may run in the meantime.
+    new_span = _build_span(_open_kwargs)
+    with _lock:
+        if _span is not old_span:
+            # Span was closed or replaced while we were building — discard.
+            if new_span is not None:
+                try:
+                    new_span.finish()
+                except Exception:  # nosec B110
+                    pass
+            return
+        _span = new_span
+        if new_span is not None:
+            _schedule_rotation()
+        else:
+            _rotation_timer = None  # build failed: nothing left to rotate
+    # Repoint the C tracer BEFORE finishing the old span so GPU-level root spans have
+    # no coverage gap; with no replacement, drop the pointer to the span being finished.
+    if new_span is not None:
+        _c_tracer.set_parent_context(new_span, _open_kwargs)
+    else:
+        _c_tracer.clear_parent_context()
+    try:
+        old_span.set_tag("_dd.was_long_running", 1)
+        _tag_ray_run_context(old_span)
+        old_span.finish()
+    except Exception:
+        log.debug("pytorch: span rotation finish failed", exc_info=True)
+    _safe_flush(tracer)  # logs and swallows flush errors itself
+
+
+def open_rank_span(rank: int, world_size: int, framework: str, training_job_id: Optional[str]) -> None:
+    """Open the per-rank lifetime span. Idempotent — second call is a no-op.
+
+    When the span cannot be built, nothing is stored and no rotation timer is started.
+    """
+    global _span, _atexit_registered, _open_kwargs
+    kwargs: dict[str, Any] = {
+        "rank": rank,
+        "world_size": world_size,
+        "framework": framework,
+        "training_job_id": training_job_id,
+    }
+    with _lock:
+        if _span is not None:
+            return
+        if not _atexit_registered:
+            atexit.register(close)
+            _atexit_registered = True
+        # Publish under the lock so set_framework() and the rotation timer see a consistent snapshot.
+        _open_kwargs = kwargs
+    new_span = _build_span(kwargs)
+    if new_span is None:
+        return  # build failed: arming the rotation timer would leave a timer with nothing to rotate
+    with _lock:
+        won_race = _span is None
+        if won_race:
+            _span = new_span
+            _open_kwargs = kwargs  # a concurrent losing caller may have overwritten it
+            _schedule_rotation()
+    if won_race:
+        _c_tracer.set_parent_context(new_span, kwargs)
+        return
+    try:  # lost the race to a concurrent open_rank_span() — discard our span
+        new_span.finish()
+    except Exception:  # nosec B110
+        pass
+
+
+def set_framework(name: str) -> None:
+    """Update the ``framework`` tag on the open ``pytorch.rank`` span; empty names are ignored.
+    The name is recorded in ``_open_kwargs`` even when no span is open.
+    """
+    if name:
+        with _lock:
+            _open_kwargs["framework"] = name  # keep rotation in sync
+            active = _span
+        if active is not None:
+            try:
+                active.set_tag("framework", name)
+            except Exception:
+                log.debug("pytorch: failed to set framework tag", exc_info=True)
+            _c_tracer.set_parent_context(active, _open_kwargs)
+
+
+def close() -> None:
+    """Finish the per-rank span and flush the tracer. Safe to call when no span is open."""
+    global _span, _atexit_registered, _rotation_timer
+    with _lock:
+        span = _span
+        _span = None
+        timer = _rotation_timer
+        _rotation_timer = None
+        if _atexit_registered:
+            try:
+                atexit.unregister(close)
+            except Exception:  # nosec B110
+                pass
+            _atexit_registered = False
+    if timer is not None:
+        timer.cancel()
+
+    if span is None:
+        return
+    try:
+        _tag_ray_run_context(span)
+        span.finish()
+        try:
+            # Flush in a daemon thread so close() never stalls the caller (e.g. destroy_process_group).
+            threading.Thread(
+                target=lambda: _safe_flush(tracer),
+                name="dd-pytorch-rank-root-flush",
+                daemon=True,
+            ).start()
+        except RuntimeError:
+            # Python 3.12+ refuses to start threads from atexit handlers; flush inline instead.
+            _safe_flush(tracer)
+    except Exception:
+        log.exception("pytorch: rank-root span close failed")
+    finally:
+        _c_tracer.clear_parent_context()
+
+
+def _safe_flush(_tracer: Any) -> None:  # calls _tracer.flush(); any exception is debug-logged, not raised
+    try:
+        _tracer.flush()
+    except Exception:
+        log.debug("pytorch: tracer.flush during rank-root close raised", exc_info=True)
+
+
+def _reset_child_state() -> None:
+    # Fork hook (registered with forksafe below): clear inherited state; timer threads do not survive fork.
+    global _span, _lock, _atexit_registered, _rotation_timer, _open_kwargs
+    _span = None
+    _lock = Lock()  # fresh lock: the inherited one may have been held at fork time
+    _atexit_registered = False
+    _rotation_timer = None  # only the reference is dropped; cancel() is not called
+    _open_kwargs = {}
+    # Clear C tracer parent pointer — child must not inherit a dangling span ref.
+    try:
+        _c_tracer.clear_parent_context()
+    except Exception:  # nosec B110
+        pass
+
+
+forksafe.register(_reset_child_state)
diff --git a/ddtrace/contrib/internal/pytorch/_test_helpers.py b/ddtrace/contrib/internal/pytorch/_test_helpers.py
new file mode 100644
index 00000000000..a41b126cc57
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/_test_helpers.py
@@ -0,0 +1,56 @@
+"""Test-only helpers for the PyTorch integration.
+
+Previously exposed as ``_*_for_tests`` symbols on the production submodules;
+moved here so the production modules don't carry test-only API surface.
+Import as::
+
+    from ddtrace.contrib.internal.pytorch import _test_helpers as th
+"""
+
+from typing import Any
+from typing import Optional
+
+
+def reset_metrics_state() -> None:
+    pass  # NOTE(review): currently a no-op — confirm whether any metrics state still needs resetting
+
+
+def current_rank_span() -> Optional[Any]:  # returns _rank_root._span as-is, read without taking _rank_root._lock
+    from ddtrace.contrib.internal.pytorch import _rank_root
+
+    return _rank_root._span
+
+
+def close_rank_root() -> None:
+    """Force-close the rank-root span and reset module state (test isolation)."""
+    from ddtrace.contrib.internal.pytorch import _rank_root
+
+    with _rank_root._lock:
+        span = _rank_root._span
+        _rank_root._span = None
+        _rank_root._atexit_registered = False  # flag only: no atexit.unregister() call is made here
+        timer = _rank_root._rotation_timer
+        _rank_root._rotation_timer = None
+        _rank_root._open_kwargs = {}
+    if timer is not None:
+        try:
+            timer.cancel()
+        except Exception:  # nosec B110
+            pass
+    if span is not None:
+        try:
+            span.finish()  # NOTE(review): unlike _rank_root.close(), no flush and no _c_tracer.clear_parent_context()
+        except Exception:  # nosec B110
+            pass
+
+
+def set_atexit_registered(value: bool) -> None:  # overwrites the _rank_root flag only; atexit itself is untouched
+    from ddtrace.contrib.internal.pytorch import _rank_root
+
+    _rank_root._atexit_registered = value
+
+
+def reset_device_cache() -> None:  # sets _device._cache to None; presumably forces re-detection — verify in _device
+    from ddtrace.contrib.internal.pytorch import _device
+
+    _device._cache = None
diff --git a/ddtrace/contrib/internal/pytorch/_utils.py b/ddtrace/contrib/internal/pytorch/_utils.py
new file mode 100644
index 00000000000..028546c244b
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/_utils.py
@@ -0,0 +1,223 @@
+import os
+import threading
+import types as _types_mp
+from typing import Any
+from typing import Optional
+import uuid
+
+from ddtrace.internal.logger import get_logger
+from ddtrace.internal.settings import env
+
+
+log = get_logger(__name__)
+
+_JOB_ID_ENV_CHAIN = (
+    "DD_PYTORCH_JOB_ID",  # explicit user override — wins over all launchers
+    "RAY_JOB_ID",  # Ray Train / Tune — preferred so Ray-driven traces are consistent
+    "TORCHELASTIC_RUN_ID",  # torch.distributed.elastic / torchrun
+    "KUBEFLOW_TRAINING_JOB_ID",  # Kubeflow Training Operator
+    "SLURM_JOB_ID",  # SLURM
+)
+
+# Generous limit avoids silent intake truncation; strip whitespace from scheduler IDs.
+_JOB_ID_MAX_LEN = 200
+
+# `job_id` is a legacy alias kept for dashboard back-compat.
+TRAINING_JOB_ID_TAG = "training_job.id"
+
+# Process-wide job-id cache; thread-local overrides take precedence.
+_default_job_id: Optional[str] = None
+
+# Thread-local override for multi-worker isolation.
+_tls_job_id = threading.local()
+
+
+def set_cached_job_id(value: Optional[str], *, is_default: bool = False) -> None:
+    """Remember ``value`` as the training job id for the calling thread.
+
+    The value always goes into the thread-local slot, so concurrent Ray Train workers with
+    different job ids don't overwrite each other. With ``is_default=True`` (meant only for
+    ``_distributed._bootstrap_distributed``) it additionally becomes the process-wide default.
+    """
+    global _default_job_id
+    if is_default:
+        _default_job_id = value
+    _tls_job_id.value = value
+
+
+def get_cached_job_id() -> Optional[str]:
+    """Return this thread's job-id override if one is set, else the process-wide default."""
+    override: Optional[str] = getattr(_tls_job_id, "value", None)
+    # A missing or None override falls through to the default.
+    return _default_job_id if override is None else override
+
+
+def get_rank() -> int:
+    """Return the current process rank parsed from the ``RANK`` env var.
+    Falls back to 0 when the variable is unset, empty, or cannot be read or parsed.
+    """
+    try:
+        raw = env.get("RANK")
+        return int(raw) if raw else 0
+    except Exception:  # nosec B110
+        return 0
+
+
+def set_training_job_id_tag(span: Any) -> None:
+    """Stamp ``span`` with the cached job id (``training_job.id`` and ``job_id``), ``manual.keep``,
+    and the Ray ``submission_id`` / ``job_name`` tags when cached.
+
+    Each group is applied independently; a failure is debug-logged and the rest still run.
+    """
+    cached_id = get_cached_job_id()
+    if cached_id:
+        try:
+            for tag in (TRAINING_JOB_ID_TAG, "job_id"):
+                span.set_tag(tag, cached_id)
+        except Exception:
+            log.debug("pytorch: failed to set training_job.id tag", exc_info=True)
+    # Applied even when no job id is cached.
+    try:
+        span.set_tag("manual.keep")
+    except Exception:
+        log.debug("pytorch: failed to set manual.keep tag", exc_info=True)
+    try:
+        run_meta = get_cached_run_metadata()
+        submission_id = run_meta.get("submission_id")
+        if submission_id:
+            span.set_tag("ray.submission_id", submission_id)
+        job_name = (run_meta.get("metadata") or {}).get("job_name")
+        if job_name:
+            span.set_tag("ray.metadata.job_name", str(job_name))
+    except Exception:
+        log.debug("pytorch: failed to propagate ray run metadata to step span", exc_info=True)
+
+
+def resolve_job_id_from_env() -> str:
+    """Resolve the job id from the environment (DD_PYTORCH_JOB_ID → RAY_JOB_ID → … → UUID fallback).
+
+    The first variable in ``_JOB_ID_ENV_CHAIN`` with a non-blank value wins; the value is stripped
+    and capped at ``_JOB_ID_MAX_LEN`` characters. A random UUID4 string is returned otherwise.
+    """
+    for name in _JOB_ID_ENV_CHAIN:
+        candidate = (env.get(name) or "").strip()
+        if candidate:
+            return str(candidate[:_JOB_ID_MAX_LEN])
+    return str(uuid.uuid4())
+
+
+def job_id_env_set() -> bool:
+    """Report whether any variable in ``_JOB_ID_ENV_CHAIN`` holds a non-blank value.
+
+    Whitespace-only values count as unset, matching ``resolve_job_id_from_env``.
+    """
+    values = (env.get(var) for var in _JOB_ID_ENV_CHAIN)
+    return any(raw and raw.strip() for raw in values)
+
+
+# Ray Train run-context cache. Writers use _run_metadata_lock; readers use the lock-free view.
+_run_metadata: dict[str, Any] = {}
+_run_metadata_lock = threading.Lock()
+_run_metadata_view: _types_mp.MappingProxyType[str, Any] = _types_mp.MappingProxyType[str, Any]({})
+
+
+def _publish_view_locked() -> None:
+    """Rebuild `_run_metadata_view` from `_run_metadata` and replace it with a single assignment.
+
+    Caller MUST hold `_run_metadata_lock`. Keys whose value is None, and empty metadata,
+    are left out of the published view.
+    """
+    global _run_metadata_view
+    snapshot: dict[str, Any] = {
+        key: _run_metadata[key] for key in ("run_name", "submission_id") if _run_metadata.get(key) is not None
+    }
+    nested = _run_metadata.get("metadata")
+    if nested:
+        # Copy, then wrap read-only, so later writes to the cached dict
+        # do not show through the published view.
+        snapshot["metadata"] = _types_mp.MappingProxyType[str, Any](dict(nested))
+    _run_metadata_view = _types_mp.MappingProxyType[str, Any](snapshot)
+
+
+def set_cached_run_metadata(
+    *,
+    run_name: Optional[str] = None,
+    submission_id: Optional[str] = None,
+    metadata: Optional[dict[str, Any]] = None,
+) -> None:
+    """Merge the given fields into the run-metadata cache and republish the read-only view.
+
+    Arguments left as None keep their cached value; ``metadata`` is stored as a copy.
+    """
+    with _run_metadata_lock:
+        for key, value in (("run_name", run_name), ("submission_id", submission_id)):
+            if value is not None:
+                _run_metadata[key] = value
+        if metadata is not None:
+            _run_metadata["metadata"] = dict(metadata)
+        _publish_view_locked()
+
+
+def get_cached_run_metadata() -> "_types_mp.MappingProxyType[str, Any]":
+    """Lock-free read of the latest published snapshot.
+
+    Returns a read-only `MappingProxyType[str, Any]`; a nested ``metadata`` entry, when present, is
+    read-only too. Callers that need a mutable dict should `dict(get_cached_run_metadata())`.
+    """
+    return _run_metadata_view
+
+
+def clear_cached_run_metadata() -> None:
+    """Empty the run-metadata cache and publish an empty view, under `_run_metadata_lock`.
+    Public helper for tests and worker-restore paths that need a full reset.
+    """
+    with _run_metadata_lock:
+        _run_metadata.clear()
+        _publish_view_locked()
+
+
+def get_run_metadata_snapshot() -> dict[str, Any]:
+    """Return a snapshot suitable for later restore. The nested `metadata` dict is copied
+    one level deep (a shallow copy), so callers cannot add or remove live cache keys.
+    """
+    with _run_metadata_lock:
+        return {
+            "run_name": _run_metadata.get("run_name"),
+            "submission_id": _run_metadata.get("submission_id"),
+            "metadata": dict(_run_metadata.get("metadata") or {}),
+        }
+
+
+def restore_run_metadata_snapshot(snapshot: dict[str, Any]) -> None:
+    """Overwrite the cache with ``snapshot`` (as returned by `get_run_metadata_snapshot`).
+
+    Unlike `set_cached_*`, fields that are None or missing end up cleared.
+    """
+    with _run_metadata_lock:
+        _run_metadata.clear()
+        # Scalars first, then the nested metadata mapping (stored as a copy).
+        for key in ("run_name", "submission_id"):
+            value = snapshot.get(key)
+            if value is not None:
+                _run_metadata[key] = value
+        nested = snapshot.get("metadata")
+        if nested is not None:
+            _run_metadata["metadata"] = dict(nested)
+        _publish_view_locked()
+
+
+def _reset_child_state() -> None:
+    """Post-fork hook for the child: drop the inherited run-metadata and job-id caches."""
+    global _default_job_id, _run_metadata, _run_metadata_lock, _run_metadata_view
+    _run_metadata_lock = threading.Lock()
+    _run_metadata = {}
+    _run_metadata_view = _types_mp.MappingProxyType[str, Any]({})
+    _default_job_id = None
+    try:
+        _tls_job_id.value = None
+    except AttributeError:
+        pass
+
+
+if hasattr(os, "register_at_fork"):
+    os.register_at_fork(after_in_child=_reset_child_state)
diff --git a/ddtrace/contrib/internal/pytorch/patch.py b/ddtrace/contrib/internal/pytorch/patch.py
new file mode 100644
index 00000000000..ad24000d73a
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/patch.py
@@ -0,0 +1,45 @@
+import torch
+
+from ddtrace.internal.logger import get_logger
+from ddtrace.internal.utils.version import parse_version
+
+
+log = get_logger(__name__)
+
+TORCH_VERSION = parse_version(str(getattr(torch, "__version__", "")))
+
+
+def get_version() -> str:
+    """Return the installed torch version as a plain ``str``: ``torch.__version__`` is a `TorchVersion`
+    (a str subclass) and the contrib test harness checks ``type(version) == str``."""
+    return str(getattr(torch, "__version__", ""))
+
+
+def _supported_versions() -> dict[str, str]:
+    return {"torch": ">=2.0"}  # NOTE(review): patch() also rejects torch >=3.0 — confirm whether "<3.0" belongs here
+
+
+def patch() -> None:
+    """Install the PyTorch instrumentation once, for torch >=2.0,<3.0 only."""
+    if getattr(torch, "_datadog_patch", False):
+        return
+    if not ((2, 0, 0) <= TORCH_VERSION < (3, 0, 0)):
+        message = "pytorch: torch version %s is not supported (supported: >=2.0,<3.0); skipping instrumentation"
+        log.warning(message, torch.__version__)
+        return
+    # The marker makes repeated patch() calls no-ops.
+    torch._datadog_patch = True
+    # Deferred import: keeps `torch.distributed.*` symbols from being pulled in when this
+    # module itself is imported.
+    from ddtrace.contrib.internal.pytorch import _distributed
+
+    _distributed.install()
+
+
+def unpatch() -> None:
+    """Undo ``patch()``: clear the marker on ``torch`` and uninstall the hooks. No-op when not patched."""
+    if getattr(torch, "_datadog_patch", False):
+        torch._datadog_patch = False
+        from ddtrace.contrib.internal.pytorch import _distributed
+
+        _distributed.uninstall()
diff --git a/ddtrace/internal/settings/_config.py b/ddtrace/internal/settings/_config.py
index 0b3701da402..1aa0cb6fb21 100644
--- a/ddtrace/internal/settings/_config.py
+++ b/ddtrace/internal/settings/_config.py
@@ -204,6 +204,7 @@
         "openai_agents",
         "mcp",
         "mlflow",
+        "pytorch",
         "ray",
         "aiokafka",
         "google_cloud_pubsub",
diff --git a/ddtrace/internal/settings/_supported_configurations.py b/ddtrace/internal/settings/_supported_configurations.py
index 8912cb27a09..78ec618b542 100644
--- a/ddtrace/internal/settings/_supported_configurations.py
+++ b/ddtrace/internal/settings/_supported_configurations.py
@@ -405,6 +405,8 @@
         "DD_PYTEST_SERVICE",
         "DD_PYTEST_USE_NEW_PLUGIN",
         "DD_PYTEST_USE_NEW_PLUGIN_BETA",
+        "DD_PYTORCH_JOB_ID",
+        "DD_PYTORCH_SERVICE",
         "DD_RAY_SERVICE",
         "DD_REDISCLUSTER_CMD_MAX_LENGTH",
         "DD_REDISCLUSTER_SERVICE",
@@ -609,6 +611,7 @@
         "DD_TRACE_PYRAMID_ENABLED",
         "DD_TRACE_PYTEST_BDD_ENABLED",
         "DD_TRACE_PYTEST_ENABLED",
+        "DD_TRACE_PYTORCH_ENABLED",
         "DD_TRACE_RATE_LIMIT",
         "DD_TRACE_RAY_ARGS_KWARGS",
         "DD_TRACE_RAY_CORE_API",
@@ -661,6 +664,7 @@
         "DD_TRACE_WSGI_ENABLED",
         "DD_TRACE_X_DATADOG_TAGS_MAX_LENGTH",
         "DD_TRACE_YAAREDIS_ENABLED",
+        "DD_TRAINING_STEP_PROFILING",
         "DD_UNITTEST_OPERATION_NAME",
         "DD_UNITTEST_SERVICE",
         "DD_UNLOAD_MODULES_FROM_SITECUSTOMIZE",
@@ -778,6 +782,7 @@
         "_DD_PROFILING_STACK_MAX_THREADS",
         "_DD_PYTEST_XDIST_INFERRED_SERVICE",
         "_DD_PY_SSI_INJECT",
+        "_DD_RAY_RUN_METADATA",
         "_DD_REMOTE_CONFIGURATION_ADDITIONAL_HEADERS",
         "_DD_REMOTE_CONFIGURATION_LOG_PAYLOADS",
         "_DD_REMOTE_CONFIGURATION_SKIP_SHUTDOWN",
@@ -896,6 +901,7 @@
     "DD_PYRAMID_SERVICE": ["DD_PYRAMID_SERVICE_NAME"],
     "DD_PYTEST_BDD_SERVICE": ["DD_PYTEST_BDD_SERVICE_NAME"],
     "DD_PYTEST_SERVICE": ["DD_PYTEST_SERVICE_NAME"],
+    "DD_PYTORCH_SERVICE": ["DD_PYTORCH_SERVICE_NAME"],
     "DD_RAY_SERVICE": ["DD_RAY_SERVICE_NAME"],
     "DD_REDISCLUSTER_SERVICE": ["DD_REDISCLUSTER_SERVICE_NAME"],
     "DD_REDIS_SERVICE": ["DD_REDIS_SERVICE_NAME"],
diff --git a/releasenotes/notes/pytorch-rank-span-8c74d5227b0d2953.yaml b/releasenotes/notes/pytorch-rank-span-8c74d5227b0d2953.yaml
new file mode 100644
index 00000000000..17e46fbb5b6
--- /dev/null
+++ b/releasenotes/notes/pytorch-rank-span-8c74d5227b0d2953.yaml
@@ -0,0 +1,11 @@
+---
+features:
+  - |
+    pytorch: Adds a ``pytorch.rank`` lifetime span for PyTorch distributed training.
+    The span opens at ``init_process_group`` and closes at ``destroy_process_group``
+    or process exit. Tags include ``rank``, ``world_size``, ``framework``
+    (DDP / FSDP / DeepSpeed), ``launcher``, ``torch.distributed.backend``, and
+    ``training_job.id`` (resolved from launcher environment variables).
+    When running under Ray Train, ``ray.train.run_name``, ``ray.submission_id``,
+    and ``ray.metadata.*`` are also applied.
+    Enable with ``DD_PATCH_MODULES=pytorch:true``.
diff --git a/riotfile.py b/riotfile.py
index 36785a001ae..621ccb4bf56 100644
--- a/riotfile.py
+++ b/riotfile.py
@@ -3151,6 +3151,36 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT
                 ),
             ],
         ),
+        Venv(
+            name="pytorch",
+            command="pytest {cmdargs} tests/contrib/pytorch",
+            venvs=[
+                Venv(
+                    pys=select_pys(min_version="3.9", max_version="3.11"),
+                    pkgs={
+                        "torch": ["~=2.0.0", "~=2.1.0"],
+                    },
+                ),
+                Venv(
+                    pys=select_pys(min_version="3.9", max_version="3.12"),
+                    pkgs={
+                        "torch": ["~=2.2.0", "~=2.3.0"],
+                    },
+                ),
+                Venv(
+                    pys=select_pys(min_version="3.9", max_version="3.12"),
+                    pkgs={
+                        "torch": ["~=2.4.0", "~=2.5.0", "~=2.6.0", "~=2.7.0"],
+                    },
+                ),
+                Venv(
+                    pys=select_pys(min_version="3.12", max_version="3.12"),
+                    pkgs={
+                        "torch": ["~=2.8.0", "~=2.9.0", "~=2.10.0", "~=2.11.0", "~=2.12.0"],
+                    },
+                ),
+            ],
+        ),
         Venv(
             name="vertexai",
             command="pytest {cmdargs} tests/contrib/vertexai",
diff --git a/scripts/integration_registry/registry.yaml b/scripts/integration_registry/registry.yaml
index 06b55ed67aa..93828a2d123 100644
--- a/scripts/integration_registry/registry.yaml
+++ b/scripts/integration_registry/registry.yaml
@@ -847,6 +847,16 @@ integrations:
   dependency_names:
   - pytest_benchmark
 
+- integration_name: pytorch
+  is_external_package: true
+  is_tested: true
+  dependency_names:
+  - torch
+  tested_versions_by_dependency:
+    torch:
+      min: 2.0.1
+      max: 2.12.0
+
 - integration_name: ray
   is_external_package: true
   is_tested: false
diff --git a/supported-configurations.json b/supported-configurations.json
index 001dbfffe7c..999a6cfbbeb 100644
--- a/supported-configurations.json
+++ b/supported-configurations.json
@@ -3003,6 +3003,16 @@
         "default": "false"
       }
     ],
+    "DD_PYTORCH_SERVICE": [
+      {
+        "implementation": "A",
+        "type": "string",
+        "default": "",
+        "aliases": [
+          "DD_PYTORCH_SERVICE_NAME"
+        ]
+      }
+    ],
     "DD_PYNAMODB_SERVICE": [
       {
         "implementation": "A",
@@ -4610,6 +4620,13 @@
         "default": "false"
       }
     ],
+    "DD_TRACE_PYTORCH_ENABLED": [
+      {
+        "implementation": "A",
+        "type": "boolean",
+        "default": "false"
+      }
+    ],
     "DD_TRACE_RAY_ENABLED": [
       {
         "implementation": "A",
@@ -5979,7 +5996,31 @@
         "version": "B",
         "type": "boolean",
         "default": "false",
-        "propertyKeys": ["trace_stats_computation_experimental_client_obfuscation_enabled"]
+        "propertyKeys": [
+          "trace_stats_computation_experimental_client_obfuscation_enabled"
+        ]
+      }
+    ],
+    "DD_PYTORCH_JOB_ID": [
+      {
+        "implementation": "A",
+        "type": "string",
+        "default": null
+      }
+    ],
+    "_DD_RAY_RUN_METADATA": [
+      {
+        "implementation": "A",
+        "type": "json",
+        "default": null,
+        "internal": true
+      }
+    ],
+    "DD_TRAINING_STEP_PROFILING": [
+      {
+        "implementation": "A",
+        "type": "boolean",
+        "default": "false"
       }
     ]
   }
diff --git a/supported_versions_output.json b/supported_versions_output.json
index 9cc34d8b69b..df815217cd5 100644
--- a/supported_versions_output.json
+++ b/supported_versions_output.json
@@ -6,6 +6,13 @@
         "max_tracer_supported": "3.1.0",
         "auto-instrumented": false
     },
+    {
+        "dependency": "botocore",
+        "integration": "aiobotocore",
+        "minimum_tracer_supported": "1.15.32",
+        "max_tracer_supported": "1.42.19",
+        "auto-instrumented": false
+    },
     {
         "dependency": "aiohttp",
         "integration": "aiohttp",
@@ -171,8 +178,8 @@
     {
         "dependency": "botocore",
         "integration": "botocore",
-        "minimum_tracer_supported": "1.34.49",
-        "max_tracer_supported": "1.38.26",
+        "minimum_tracer_supported": "1.15.32",
+        "max_tracer_supported": "1.42.19",
         "pinned": "true",
         "auto-instrumented": true
     },
@@ -656,6 +663,14 @@
         "pinned": "true",
         "auto-instrumented": false
     },
+    {
+        "dependency": "torch",
+        "integration": "pytorch",
+        "minimum_tracer_supported": "2.0.1",
+        "max_tracer_supported": "2.12.0",
+        "pinned": "true",
+        "auto-instrumented": false
+    },
     {
         "dependency": "ray",
         "integration": "ray",
diff --git a/supported_versions_table.csv b/supported_versions_table.csv
index eb4ab4ed9db..ce241443f7c 100644
--- a/supported_versions_table.csv
+++ b/supported_versions_table.csv
@@ -1,5 +1,6 @@
 dependency,integration,minimum_tracer_supported,max_tracer_supported,auto-instrumented
 aiobotocore,aiobotocore,1.0.7,3.1.0,False
+botocore,aiobotocore,1.15.32,1.42.19,False
 aiohttp,aiohttp,3.7.4.post0,3.14.0,True
 aiohttp-jinja2,aiohttp_jinja2,1.5.1,1.6,True
 aiohttp_jinja2,aiohttp_jinja2,1.5.1,1.6,True
@@ -22,7 +23,7 @@ azure-eventhub,azure_eventhubs *,5.12.2,5.15.0,True
 azure-functions,azure_functions *,1.10.1,2.0.0,True
 azure-servicebus,azure_servicebus *,7.14.2,7.14.2,True
 boto3,botocore *,1.34.49,1.38.26,True
-botocore,botocore *,1.34.49,1.38.26,True
+botocore,botocore *,1.15.32,1.42.19,True
 bottle,bottle,0.12.25,0.13.4,True
 celery,celery,5.5.3,5.5.3,True
 cherrypy,cherrypy,17.0.0,18.10.0,False
@@ -91,6 +92,7 @@ pyodbc,pyodbc,4.0.39,5.3.0,True
 pyramid,pyramid,1.10.8,2.0.2,True
 pytest,pytest,6.2.5,9.0.3,False
 pytest-bdd,pytest_bdd *,4.1.0,6.0.1,False
+torch,pytorch *,2.0.1,2.12.0,False
 ray,ray *,2.46.0,2.49.2,False
 redis,redis,4.6.0,6.4.0,True
 redis-py-cluster,rediscluster,2.0.0,2.1.3,True
diff --git a/tests/contrib/pytorch/__init__.py b/tests/contrib/pytorch/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/contrib/pytorch/conftest.py b/tests/contrib/pytorch/conftest.py
new file mode 100644
index 00000000000..201ffb140d7
--- /dev/null
+++ b/tests/contrib/pytorch/conftest.py
@@ -0,0 +1,47 @@
+"""Shared fixtures for the pytorch integration test suite."""
+
+from unittest import mock
+
+import pytest
+
+from ddtrace.contrib.internal.pytorch import _device
+from ddtrace.contrib.internal.pytorch import _distributed
+from ddtrace.contrib.internal.pytorch import _test_helpers as _th
+
+
+# Non-deterministic tags to ignore in snapshot tests (follow Ray's pattern).
+PYTORCH_SNAPSHOT_IGNORES = [
+    "meta.tracer_version",
+    "meta.runtime-id",
+    "metrics._dd.top_level",
+    "metrics._dd.tracer_kr",
+    "metrics._sampling_priority_v1",
+    "metrics.process_id",
+    "name",
+    "resource",
+    "service",
+    "start",
+    "duration",
+]
+
+
+@pytest.fixture
+def pytorch_clean_state():
+    """Reset rank-root span, device cache, and distributed context.
+
+    Compose into autouse fixtures in each test module. Resets _rank_ctx so
+    tests that call into _distributed directly don't leak ExecutionContext
+    across test boundaries.
+    """
+    _distributed._rank_ctx.set(None)
+    _th.reset_device_cache()
+    _th.close_rank_root()
+    with (
+        mock.patch.object(_device, "_cuda_is_available", return_value=False),
+        mock.patch.object(_device, "_hostname", return_value="h-9"),
+    ):
+        _device.discover(local_rank=0)
+    yield
+    _th.close_rank_root()
+    _th.reset_device_cache()
+    _distributed._rank_ctx.set(None)
diff --git a/tests/contrib/pytorch/test_c_tracer.py b/tests/contrib/pytorch/test_c_tracer.py
new file mode 100644
index 00000000000..6703c706d3e
--- /dev/null
+++ b/tests/contrib/pytorch/test_c_tracer.py
@@ -0,0 +1,321 @@
+import sys
+from unittest import mock
+
+import pytest
+
+from ddtrace.contrib.internal.pytorch import _rank_root
+
+
+def _fresh_module():
+    """Import _c_tracer with a clean module cache so _loaded resets."""
+    sys.modules.pop("ddtrace.contrib.internal.pytorch._c_tracer", None)
+    from ddtrace.contrib.internal.pytorch import _c_tracer
+
+    return _c_tracer
+
+
+def _make_fake_lib():
+    lib = mock.MagicMock()
+    lib.dd_set_global_parent_context = mock.MagicMock()
+    lib.dd_set_global_parent_context.restype = None
+    lib.dd_clear_global_parent_context = mock.MagicMock()
+    lib.dd_clear_global_parent_context.restype = None
+    return lib
+
+
+def _make_absent_lib():
+    """A lib handle where the C tracer symbols are not present."""
+    lib = mock.MagicMock()
+    type(lib).dd_set_global_parent_context = mock.PropertyMock(side_effect=AttributeError)
+    return lib
+
+
+def _make_fake_span(trace_id=0xDEADBEEF00000001, span_id=0xCAFE, sampling_priority=1, service="pytorch"):
+    span = mock.Mock()
+    span.trace_id = trace_id
+    span.span_id = span_id
+    span.service = service
+    span.context.sampling_priority = sampling_priority
+    return span
+
+
+# ---------------------------------------------------------------------------
+# _load() — symbol presence determines no-op vs. active path
+# ---------------------------------------------------------------------------
+
+
+def test_set_parent_context_no_op_when_library_absent():
+    mod = _fresh_module()
+    mod._loaded = False
+    with mock.patch("ctypes.CDLL", return_value=_make_absent_lib()):
+        fake_span = mock.Mock()
+        fake_span.trace_id = 0xABC
+        fake_span.span_id = 0x123
+        fake_span.context.sampling_priority = None
+        mod.set_parent_context(fake_span, {"rank": 0, "world_size": 1, "framework": "ddp", "training_job_id": "j1"})
+
+
+def test_clear_parent_context_no_op_when_library_absent():
+    mod = _fresh_module()
+    mod._loaded = False
+    with mock.patch("ctypes.CDLL", return_value=_make_absent_lib()):
+        mod.clear_parent_context()
+
+
+def test_load_uses_global_symbol_table():
+    """_load() calls ctypes.CDLL(None) — no library path, no discovery."""
+    mod = _fresh_module()
+    mod._loaded = False
+    fake_lib = _make_fake_lib()
+    with mock.patch("ctypes.CDLL", return_value=fake_lib) as mock_cdll:
+        mod._load()
+    mock_cdll.assert_called_once_with(None)
+
+
+# ---------------------------------------------------------------------------
+# Correct C function dispatch
+# ---------------------------------------------------------------------------
+
+
+def test_set_parent_context_calls_c_function():
+    mod = _fresh_module()
+    mod._loaded = False
+    fake_lib = _make_fake_lib()
+    with mock.patch("ctypes.CDLL", return_value=fake_lib):
+        span = _make_fake_span(trace_id=0x00000001_DEADBEEF, span_id=0xCAFE, sampling_priority=2)
+        mod.set_parent_context(span, {"rank": 3, "world_size": 8, "framework": "ddp", "training_job_id": "job-xyz"})
+        assert fake_lib.dd_set_global_parent_context.called
+
+
+def test_set_parent_context_128bit_trace_id_split():
+    """High 64 bits are correctly separated from low 64 bits."""
+    mod = _fresh_module()
+    mod._loaded = False
+    fake_lib = _make_fake_lib()
+    captured = {}
+
+    def capture(*args, **kwargs):
+        captured["args"] = args
+
+    fake_lib.dd_set_global_parent_context.side_effect = capture
+
+    with mock.patch("ctypes.CDLL", return_value=fake_lib):
+        trace_id = (0xAAAA << 64) | 0xBBBB
+        span = _make_fake_span(trace_id=trace_id, span_id=0x1111)
+        mod.set_parent_context(span, {"rank": 0, "world_size": 1, "framework": "none", "training_job_id": ""})
+        lo = captured["args"][0].value
+        hi = captured["args"][1].value
+        assert lo == 0xBBBB
+        assert hi == 0xAAAA
+
+
+def test_clear_parent_context_calls_c_function():
+    mod = _fresh_module()
+    mod._loaded = False
+    fake_lib = _make_fake_lib()
+    with mock.patch("ctypes.CDLL", return_value=fake_lib):
+        mod._load()
+        mod.clear_parent_context()
+        assert fake_lib.dd_clear_global_parent_context.called
+
+
+def test_set_parent_context_tag_payload():
+    """Verify the 5 expected tags (4 context tags plus service) are sent with correct values."""
+    mod = _fresh_module()
+    mod._loaded = False
+    fake_lib = _make_fake_lib()
+    captured = {}
+
+    def capture(*args, **kwargs):
+        captured["args"] = args
+
+    fake_lib.dd_set_global_parent_context.side_effect = capture
+
+    with mock.patch("ctypes.CDLL", return_value=fake_lib):
+        span = _make_fake_span(trace_id=1, span_id=2, sampling_priority=1)
+        mod.set_parent_context(
+            span,
+            {
+                "training_job_id": "job-abc",
+                "rank": 3,
+                "world_size": 8,
+                "framework": "fsdp",
+            },
+        )
+
+    args = captured["args"]
+    count = args[7].value  # c_size_t
+    assert count == 5
+
+    keys = [args[5][i].decode() for i in range(count)]
+    vals = [args[6][i].decode() for i in range(count)]
+    tag_map = dict(zip(keys, vals))
+
+    assert tag_map["training_job_id"] == "job-abc"
+    assert tag_map["rank"] == "3"
+    assert tag_map["world_size"] == "8"
+    assert tag_map["framework"] == "fsdp"
+    assert tag_map["service"] == "pytorch"
+
+
+def test_set_parent_context_swallows_exception():
+    mod = _fresh_module()
+    mod._loaded = True
+    mod._lib = object()
+    mod._set_fn = mock.Mock(side_effect=RuntimeError("boom"))
+    mod.set_parent_context(_make_fake_span(), {})
+
+
+def test_clear_parent_context_swallows_exception():
+    mod = _fresh_module()
+    mod._loaded = True
+    mod._lib = object()
+    mod._clear_fn = mock.Mock(side_effect=RuntimeError("boom"))
+    mod.clear_parent_context()
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle integration: _rank_root calls _c_tracer at the right moments.
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(autouse=False)
+def _fresh_rank_root():
+    """Reset _rank_root module state before and after each test."""
+    import threading
+
+    _rank_root._span = None
+    _rank_root._lock = threading.Lock()
+    _rank_root._atexit_registered = False
+    if _rank_root._rotation_timer is not None:
+        _rank_root._rotation_timer.cancel()
+    _rank_root._rotation_timer = None
+    _rank_root._open_kwargs = {}
+    yield
+    if _rank_root._rotation_timer is not None:
+        _rank_root._rotation_timer.cancel()
+    _rank_root._span = None
+    _rank_root._rotation_timer = None
+
+
+def test_open_rank_span_calls_set_parent_context(_fresh_rank_root):
+    with (
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc,
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._build_span", return_value=_make_fake_span()),
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._schedule_rotation"),
+    ):
+        _rank_root.open_rank_span(rank=0, world_size=4, framework="ddp", training_job_id="job-1")
+        assert mc.set_parent_context.called
+        args = mc.set_parent_context.call_args[0]
+        assert args[1]["framework"] == "ddp"
+        assert args[1]["rank"] == 0
+
+
+def test_close_calls_clear_parent_context(_fresh_rank_root):
+    fake_span = _make_fake_span()
+    fake_span.finish = mock.Mock()
+    _rank_root._span = fake_span
+    with (
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc,
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._safe_flush"),
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._tag_ray_run_context"),
+    ):
+        _rank_root.close()
+        assert mc.clear_parent_context.called
+
+
+def test_rotate_span_updates_context_before_finishing_old(_fresh_rank_root):
+    """set_parent_context(new) must be called BEFORE old_span.finish()."""
+    call_order = []
+    new_span = _make_fake_span(trace_id=2, span_id=200)
+    old_span = _make_fake_span(trace_id=1, span_id=100)
+    old_span.finish = mock.Mock(side_effect=lambda: call_order.append("finish"))
+    old_span.set_tag = mock.Mock()
+
+    _rank_root._span = old_span
+    _rank_root._open_kwargs = {"rank": 0, "world_size": 1, "framework": "ddp", "training_job_id": "j"}
+
+    def fake_set(span, kwargs):
+        call_order.append(("set", span.span_id))
+
+    with (
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._build_span", return_value=new_span),
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._schedule_rotation"),
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._safe_flush"),
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._tag_ray_run_context"),
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc,
+    ):
+        mc.set_parent_context.side_effect = fake_set
+        _rank_root._rotate_span()
+
+    set_idx = next(i for i, x in enumerate(call_order) if isinstance(x, tuple) and x[0] == "set")
+    fin_idx = call_order.index("finish")
+    assert set_idx < fin_idx, f"set must precede finish; order={call_order}"
+
+
+def test_close_clears_c_tracer_even_when_finish_raises(_fresh_rank_root):
+    """clear_parent_context must be called even if span.finish() raises."""
+    fake_span = _make_fake_span()
+    fake_span.finish = mock.Mock(side_effect=RuntimeError("finish failed"))
+    _rank_root._span = fake_span
+    with (
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc,
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._safe_flush"),
+        mock.patch("ddtrace.contrib.internal.pytorch._rank_root._tag_ray_run_context"),
+    ):
+        _rank_root.close()  # must not raise
+        assert mc.clear_parent_context.called, "clear must fire even when finish raises"
+
+
+def test_set_framework_updates_c_tracer_context(_fresh_rank_root):
+    fake_span = _make_fake_span()
+    _rank_root._span = fake_span
+    _rank_root._open_kwargs = {"rank": 0, "world_size": 1, "framework": "none", "training_job_id": "j"}
+    with mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc:
+        _rank_root.set_framework("fsdp")
+        assert mc.set_parent_context.called
+        args = mc.set_parent_context.call_args[0]
+        assert args[1]["framework"] == "fsdp"
+
+
+# ---------------------------------------------------------------------------
+# Step signals: step_begin / step_end
+# ---------------------------------------------------------------------------
+
+
+def test_step_begin_noop_when_symbol_absent():
+    """step_begin() is silent when the C symbol was not bound at load time."""
+    mod = _fresh_module()
+    mod._loaded = True
+    mod._lib = object()  # truthy — looks loaded
+    mod._step_begin_fn = None
+    mod.step_begin()  # must not raise
+
+
+def test_step_end_noop_when_symbol_absent():
+    """step_end() is silent when the C symbol was not bound at load time."""
+    mod = _fresh_module()
+    mod._loaded = True
+    mod._lib = object()
+    mod._step_end_fn = None
+    mod.step_end()  # must not raise
+
+
+def test_step_begin_calls_c_symbol():
+    mod = _fresh_module()
+    fn = mock.Mock()
+    mod._loaded = True
+    mod._lib = object()
+    mod._step_begin_fn = fn
+    mod.step_begin()
+    assert fn.called
+
+
+def test_step_end_calls_c_symbol():
+    mod = _fresh_module()
+    fn = mock.Mock()
+    mod._loaded = True
+    mod._lib = object()
+    mod._step_end_fn = fn
+    mod.step_end()
+    assert fn.called
diff --git a/tests/contrib/pytorch/test_device.py b/tests/contrib/pytorch/test_device.py
new file mode 100644
index 00000000000..a7db96bbff0
--- /dev/null
+++ b/tests/contrib/pytorch/test_device.py
@@ -0,0 +1,29 @@
+"""Tests for _cuda_visible_to_physical device index remapping."""
+
+import os
+from unittest import mock
+
+from ddtrace.contrib.internal.pytorch import _device
+
+
+def test_cuda_visible_to_physical_no_remapping():
+    with mock.patch.dict(os.environ, {}, clear=False):
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+        assert _device._cuda_visible_to_physical(1) == 1
+
+
+def test_cuda_visible_to_physical_remapping():
+    with mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "2,4,6"}):
+        assert _device._cuda_visible_to_physical(0) == 2
+        assert _device._cuda_visible_to_physical(1) == 4
+
+
+def test_cuda_visible_to_physical_no_dev_files():
+    with mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "NoDevFiles"}):
+        assert _device._cuda_visible_to_physical(0) == 0
+
+
+def test_cuda_visible_to_physical_uuid_falls_back():
+    with mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "GPU-abc,GPU-def"}):
+        # UUID entries are not integers; function falls back to visible_idx
+        assert _device._cuda_visible_to_physical(1) == 1
diff --git a/tests/contrib/pytorch/test_fork_safety.py b/tests/contrib/pytorch/test_fork_safety.py
new file mode 100644
index 00000000000..a1401b3b9b7
--- /dev/null
+++ b/tests/contrib/pytorch/test_fork_safety.py
@@ -0,0 +1,92 @@
+"""pytorch.rank state is reset in ``fork``-ed children so the child can
+bootstrap its own rank span without inheriting parent state.
+
+The checks verify that the rank-root span reference, the distributed bootstrap
+state, the device cache, and the cached run metadata are cleared across fork.
+"""
+
+import multiprocessing as mp
+import os
+from unittest import mock
+
+import pytest
+
+from ddtrace.contrib.internal.pytorch import _device
+from ddtrace.contrib.internal.pytorch import _distributed
+from ddtrace.contrib.internal.pytorch import _rank_root
+from ddtrace.contrib.internal.pytorch import _test_helpers as _th
+
+
+def _child_assert_fresh(q):
+    # Verify parent's rank span and distributed bootstrap state were reset.
+    try:
+        assert _th.current_rank_span() is None, "rank span leaked into child"
+        assert _distributed._rank_ctx.get() is None, "_rank_ctx leaked into child"
+        assert _device._cache is None, "_device._cache leaked into child"
+        q.put("ok")
+    except AssertionError as e:
+        q.put(str(e))
+
+
+@pytest.mark.skipif(os.name != "posix", reason="fork is POSIX-only")
+def test_fork_resets_rank_root_and_bootstrap_state():
+    _th.reset_device_cache()
+    _th.close_rank_root()
+    with (
+        mock.patch.object(_device, "_cuda_is_available", return_value=False),
+        mock.patch.object(_device, "_hostname", return_value="h-parent"),
+    ):
+        _device.discover(local_rank=0)
+    # Open a rank span and mark distributed as bootstrapped in the parent.
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X")
+    from ddtrace.internal import core
+
+    fake_ctx = core.context_with_data("pytorch.rank", _dispatch_end_event=False)
+    fake_ctx.__enter__()
+    _distributed._rank_ctx.set(fake_ctx)
+
+    ctx = mp.get_context("fork")
+    q = ctx.Queue()
+    p = ctx.Process(target=_child_assert_fresh, args=(q,))
+    p.start()
+    p.join(timeout=10)
+    result = q.get(timeout=1)
+
+    _rank_root.close()
+    # Restore _distributed._rank_ctx so other tests are not affected.
+    fake_ctx.dispatch_ended_event()
+    fake_ctx.__exit__(None, None, None)
+    _distributed._rank_ctx.set(None)
+    assert result == "ok", result
+
+
+@pytest.mark.skipif(os.name != "posix", reason="fork is POSIX-only")
+def test_run_metadata_cleared_after_fork(tmp_path):
+    """Use a file marker rather than multiprocessing.Queue: Queue's
+    feeder thread is fork-unsafe and os._exit skips flush.
+    """
+    import multiprocessing
+
+    from ddtrace.contrib.internal.pytorch import _utils
+    from ddtrace.contrib.internal.pytorch._utils import get_cached_run_metadata
+    from ddtrace.contrib.internal.pytorch._utils import set_cached_run_metadata
+
+    set_cached_run_metadata(run_name="parent-run", submission_id="parent-sub", metadata={"k": "v"})
+
+    marker = tmp_path / "child_metadata.txt"
+
+    def child(path):
+        snap = get_cached_run_metadata()
+        path.write_text("EMPTY" if len(snap) == 0 else "STALE:" + repr(dict(snap)))
+        os._exit(0)
+
+    try:
+        ctx = multiprocessing.get_context("fork")
+        p = ctx.Process(target=child, args=(marker,))
+        p.start()
+        p.join(timeout=5)
+        assert p.exitcode == 0, f"child exited with code {p.exitcode}"
+        content = marker.read_text()
+        assert content == "EMPTY", f"child saw stale metadata: {content}"
+    finally:
+        _utils.clear_cached_run_metadata()
diff --git a/tests/contrib/pytorch/test_long_running_span.py b/tests/contrib/pytorch/test_long_running_span.py
new file mode 100644
index 00000000000..55c4aa04645
--- /dev/null
+++ b/tests/contrib/pytorch/test_long_running_span.py
@@ -0,0 +1,64 @@
+"""Tests for the pytorch.rank span rotation / long-running lifecycle.
+
+Follows the same pattern as tests/contrib/ray/test_long_running_span.py:
+rotation interval is patched to 0 (fires immediately) so tests run
+without real 600-second waits.
+"""
+
+import time
+from unittest import mock
+
+import pytest
+
+from ddtrace.contrib.internal.pytorch import _distributed
+import ddtrace.contrib.internal.pytorch._rank_root as rr
+
+
+@pytest.fixture(autouse=True)
+def _reset(tracer, pytorch_clean_state):  # noqa: F811
+    """Autouse wrapper: pulls in the shared pytorch_clean_state fixture."""
+
+
+def test_rotation_fires_and_replaces_span(_reset):
+    """After _rotation_interval_s elapses the span is replaced."""
+    with mock.patch.object(rr, "_rotation_interval_s", 0):  # fire immediately
+        rr.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-1")
+        first_span = rr._span
+        time.sleep(0.2)
+
+    second_span = rr._span
+    assert second_span is not first_span, "span was not rotated"
+    assert first_span.finished, "old span should be finished after rotation"
+    assert second_span is not None
+
+
+def test_rotation_tags_old_span_was_long_running(_reset):
+    """Rotated spans carry _dd.was_long_running=1."""
+    with mock.patch.object(rr, "_rotation_interval_s", 0):
+        rr.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-1")
+        first_span = rr._span
+        time.sleep(0.2)
+
+    assert first_span.get_metric("_dd.was_long_running") == 1
+
+
+def test_close_cancels_rotation_timer(_reset):
+    """close() cancels the pending rotation timer."""
+    rr.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-1")
+    assert rr._rotation_timer is not None
+    rr.close()
+    assert rr._rotation_timer is None
+
+
+def test_subgroup_destroy_does_not_close_rank_span(_reset):
+    """Destroying a subgroup must not close the pytorch.rank span."""
+    rr.open_rank_span(rank=0, world_size=2, framework="ddp", training_job_id="job-1")
+    original_span = rr._span
+
+    fake_group = object()
+    with mock.patch("torch.distributed.destroy_process_group") as mock_destroy:
+        mock_destroy.return_value = None
+        _distributed._wrapped_destroy_process_group(mock_destroy, None, (fake_group,), {})
+
+    assert rr._span is original_span, "subgroup destroy must not close the rank span"
+    assert not original_span.finished
diff --git a/tests/contrib/pytorch/test_profiler_interaction.py b/tests/contrib/pytorch/test_profiler_interaction.py
new file mode 100644
index 00000000000..204f4d83dea
--- /dev/null
+++ b/tests/contrib/pytorch/test_profiler_interaction.py
@@ -0,0 +1,83 @@
+"""
+Verify our pytorch integration doesn't interfere with torch.profiler.profile().
+Tests:
+  1. profiler runs and captures ops while integration is active
+  2. profiler schedule + step_num callback fires correctly
+  3. profiler works normally after unpatch
+"""
+
+import pytest
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+import ddtrace
+
+
+@pytest.fixture(autouse=True)
+def _patch_pytorch():
+    ddtrace.patch(pytorch=True)
+    yield
+    ddtrace.patch(pytorch=False)
+
+
+@pytest.fixture()
+def _simple_model():
+    model = nn.Linear(10, 5)
+    optimizer = optim.SGD(model.parameters(), lr=0.01)
+    x = torch.randn(4, 10)
+    y = torch.randn(4, 5)
+    return model, optimizer, x, y
+
+
+def test_profiler_captures_ops_while_integration_active(_simple_model):
+    model, optimizer, x, y = _simple_model
+    with torch.profiler.profile(
+        activities=[torch.profiler.ProfilerActivity.CPU],
+        record_shapes=True,
+    ) as prof:
+        out = model(x)
+        loss = nn.functional.mse_loss(out, y)
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+    events = prof.key_averages()
+    assert len(events) > 0, "profiler captured no events while integration was active"
+
+
+def test_profiler_schedule_fires_while_integration_active(_simple_model):
+    model, optimizer, x, y = _simple_model
+    steps_seen = []
+
+    def trace_handler(p):
+        steps_seen.append(p.step_num)
+
+    with torch.profiler.profile(
+        activities=[torch.profiler.ProfilerActivity.CPU],
+        schedule=torch.profiler.schedule(wait=0, warmup=0, active=2),
+        on_trace_ready=trace_handler,
+    ) as prof:
+        for _ in range(4):
+            out = model(x)
+            loss = nn.functional.mse_loss(out, y)
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+            prof.step()
+
+    assert len(steps_seen) > 0, "on_trace_ready never called while integration was active"
+
+
+def test_profiler_works_after_unpatch(_simple_model):
+    model, optimizer, x, y = _simple_model
+    ddtrace.patch(pytorch=False)  # early unpatch, repeated by fixture; NOTE(review): confirm False really unpatches
+
+    with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU]) as prof:
+        out = model(x)
+        loss = nn.functional.mse_loss(out, y)
+        loss.backward()
+        optimizer.step()
+
+    events = prof.key_averages()
+    assert len(events) > 0, "profiler captured no events after unpatch"
diff --git a/tests/contrib/pytorch/test_pytorch.py b/tests/contrib/pytorch/test_pytorch.py
new file mode 100644
index 00000000000..40919ace734
--- /dev/null
+++ b/tests/contrib/pytorch/test_pytorch.py
@@ -0,0 +1,180 @@
+"""Integration tests for the pytorch.rank lifetime span.
+
+These tests exercise the real patch/unpatch cycle with a CPU-only gloo
+process group and assert that a ``pytorch.rank`` span is emitted with the
+expected tags (rank, world_size, framework, training_job_id).
+"""
+
+import os
+import sys
+
+import pytest
+import torch
+
+
+# torch.distributed.init_process_group cannot be called more than once per
+# process on torch < 2.1; re-init hangs indefinitely with the gloo backend.
+pytestmark = pytest.mark.skipif(
+    tuple(int(x) for x in torch.__version__.split(".")[:2]) < (2, 1),
+    reason="distributed re-init hangs on torch<2.1",
+)
+
+
+@pytest.fixture(autouse=True)
+def _isolated(monkeypatch):
+    """Reset integration state before each test."""
+    from ddtrace.contrib.internal.pytorch import _distributed
+    from ddtrace.contrib.internal.pytorch import _test_helpers as _th
+    from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch
+
+    _th.close_rank_root()
+    _th.reset_device_cache()
+    _distributed._installed = False
+    _distributed._rank_ctx.set(None)
+    setattr(__import__("torch"), "_datadog_patch", False)
+    yield
+    try:
+        pt_unpatch()
+    except Exception:
+        pass
+    _th.close_rank_root()
+    _th.reset_device_cache()
+    _distributed._rank_ctx.set(None)
+
+
+def _setup_single_rank_gloo():
+    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
+    os.environ.setdefault("MASTER_PORT", "29555")
+    os.environ.setdefault("RANK", "0")
+    os.environ.setdefault("WORLD_SIZE", "1")
+    if not torch.distributed.is_initialized():
+        torch.distributed.init_process_group(backend="gloo", rank=0, world_size=1)
+
+
+def _teardown_gloo():
+    if torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
+
+
+def test_rank_span_emitted_on_init_process_group(monkeypatch, test_spans):
+    """patch() + init_process_group emits a ``pytorch.rank`` span with the
+    correct rank, world_size, framework, and training_job_id tags.
+    The span is closed by destroy_process_group (the wrapped version) or
+    unpatch(), so we tear down before inspecting spans.
+    """
+    monkeypatch.setenv("TORCHELASTIC_RUN_ID", "test-run-123")
+
+    from ddtrace.contrib.internal.pytorch.patch import patch as pt_patch
+    from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch
+
+    pt_patch()
+    _setup_single_rank_gloo()
+    try:
+        _teardown_gloo()
+    finally:
+        pt_unpatch()
+
+    spans = test_spans.pop()
+    rank_spans = [s for s in spans if s.name == "pytorch.rank"]
+    assert rank_spans, "no pytorch.rank span emitted"
+    span = rank_spans[0]
+    assert span.get_metric("rank") == 0
+    assert span.get_metric("world_size") == 1
+    assert span.get_tag("framework") is not None
+    assert span.get_tag("training_job.id") is not None
+
+
+def test_rank_span_job_id_from_torchelastic_env(monkeypatch, test_spans):
+    """When TORCHELASTIC_RUN_ID is set the rank span carries that value as
+    training_job.id.
+    """
+    monkeypatch.setenv("TORCHELASTIC_RUN_ID", "elastic-run-99")
+    monkeypatch.delenv("DD_PYTORCH_JOB_ID", raising=False)
+
+    from ddtrace.contrib.internal.pytorch.patch import patch as pt_patch
+    from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch
+
+    pt_patch()
+    _setup_single_rank_gloo()
+    try:
+        _teardown_gloo()
+    finally:
+        pt_unpatch()
+
+    spans = test_spans.pop()
+    rank_spans = [s for s in spans if s.name == "pytorch.rank"]
+    assert rank_spans, "no pytorch.rank span emitted"
+    span = rank_spans[0]
+    assert span.get_tag("training_job.id") == "elastic-run-99"
+
+
+def test_fsdp_not_eagerly_imported():
+    """patch(pytorch=True) must NOT cause torch.distributed.fsdp to land in
+    sys.modules. Eagerly importing it pulls _dynamo + sympy (~1.3 s startup
+    overhead) for every DDP workload that never touches FSDP.
+    """
+    for _key in list(sys.modules):
+        if _key == "torch.distributed.fsdp" or _key.startswith("torch.distributed.fsdp."):
+            sys.modules.pop(_key)
+
+    from ddtrace.contrib.internal.pytorch.patch import patch as pt_patch
+    from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch
+
+    try:
+        pt_patch()
+        assert "torch.distributed.fsdp" not in sys.modules, (
+            "_install_fsdp() imported torch.distributed.fsdp eagerly — convert it to register_post_import_hook"
+        )
+    finally:
+        pt_unpatch()
+
+
+@pytest.mark.skipif(
+    tuple(int(x) for x in torch.__version__.split(".")[:2]) >= (2, 6),
+    reason="torch>=2.6 raises on double FSDP operator registration when fsdp is removed "
+    "from sys.modules; the hook itself still works (verified by test_fsdp_not_eagerly_imported)",
+)
+def test_fsdp_wrapper_installed_on_import():
+    """After patch(), importing torch.distributed.fsdp should trigger the
+    post-import hook and wrap FullyShardedDataParallel.__init__.
+    """
+    for _key in list(sys.modules):
+        if _key == "torch.distributed.fsdp" or _key.startswith("torch.distributed.fsdp."):
+            sys.modules.pop(_key)
+
+    from ddtrace.contrib.internal.pytorch.patch import patch as pt_patch
+    from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch
+
+    try:
+        pt_patch()
+        # Trigger the hook by importing the module.
+        try:
+            from torch.distributed.fsdp import FullyShardedDataParallel
+        except ImportError as e:
+            pytest.skip(f"torch.distributed.fsdp not importable in this environment: {e}")
+
+        assert hasattr(FullyShardedDataParallel.__init__, "__wrapped__"), (
+            "FSDP.__init__ was not wrapped after post-import hook fired"
+        )
+    finally:
+        pt_unpatch()
+
+
+def test_bootstrap_reads_ray_env_vars(monkeypatch):
+    """_bootstrap_distributed() must populate the run-metadata cache from
+    Ray-set env vars so that pytorch.rank spans carry ray.submission_id and
+    ray.train.run_name tags.
+    """
+    monkeypatch.setenv("_RAY_SUBMISSION_ID", "raysubmit_xyz")
+    monkeypatch.setenv("_RAY_JOB_NAME", "my-experiment")
+    monkeypatch.setenv("RAY_JOB_ID", "33000000")
+
+    from ddtrace.contrib.internal.pytorch import _utils
+    from ddtrace.contrib.internal.pytorch._distributed import _populate_ray_run_metadata
+
+    _utils.clear_cached_run_metadata()
+    _populate_ray_run_metadata()
+
+    rm = _utils.get_cached_run_metadata()
+    assert rm.get("submission_id") == "raysubmit_xyz"
+    assert rm.get("run_name") == "my-experiment"
diff --git a/tests/contrib/pytorch/test_pytorch_patch.py b/tests/contrib/pytorch/test_pytorch_patch.py
new file mode 100644
index 00000000000..47cc4fca1c6
--- /dev/null
+++ b/tests/contrib/pytorch/test_pytorch_patch.py
@@ -0,0 +1,93 @@
+import pytest
+
+import ddtrace.contrib.internal.pytorch.patch as pytorch_patch
+from ddtrace.contrib.internal.pytorch.patch import get_version
+from ddtrace.contrib.internal.pytorch.patch import patch
+from ddtrace.contrib.internal.pytorch.patch import unpatch
+from tests.contrib.patch import PatchTestCase
+
+
+class TestPyTorchPatch(PatchTestCase.Base):  # plugs the pytorch patch functions into the shared PatchTestCase suite
+    __integration_name__ = "pytorch"
+    __module_name__ = "torch"
+    __patch_func__ = patch
+    __unpatch_func__ = unpatch
+    __get_version__ = get_version
+
+    def assert_module_patched(self, torch):  # "patched" means torch._datadog_patch is True
+        assert getattr(torch, "_datadog_patch", False) is True
+
+    def assert_not_module_patched(self, torch):  # flag absent or False
+        assert getattr(torch, "_datadog_patch", False) is False
+
+    def assert_not_module_double_patched(self, torch):  # only checks the flag, not wrapper depth
+        assert getattr(torch, "_datadog_patch", False) is True
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_patch_all_does_not_enable_pytorch_by_default(monkeypatch):  # NOTE(review): monkeypatch is unused
+    """Pytorch is opt-in: a plain patch_all() must not flip torch._datadog_patch."""
+    import torch
+
+    from ddtrace._monkey import PATCH_MODULES
+
+    assert PATCH_MODULES.get("pytorch") is False  # registered, but off by default
+
+    if getattr(torch, "_datadog_patch", False):
+        from ddtrace.contrib.internal.pytorch.patch import unpatch
+
+        unpatch()  # start from an unpatched torch
+
+    from ddtrace._monkey import patch_all
+
+    patch_all()
+    assert getattr(torch, "_datadog_patch", False) is False
+
+
+def test_explicit_patch_pytorch_true_still_works():
+    import torch
+
+    from ddtrace._monkey import patch  # the generic entry point, not the pytorch-specific patch()
+    from ddtrace.contrib.internal.pytorch.patch import unpatch
+
+    if getattr(torch, "_datadog_patch", False):
+        unpatch()  # start from an unpatched torch
+
+    patch(pytorch=True)  # explicit opt-in
+    try:
+        assert getattr(torch, "_datadog_patch", False) is True
+    finally:
+        unpatch()  # undo the patch whether or not the assertion held
+
+
+@pytest.mark.parametrize("bad_version", [(1, 9, 0), (3, 0, 0)])
+def test_patch_skipped_for_unsupported_torch_version(monkeypatch, bad_version):
+    import torch
+
+    if getattr(torch, "_datadog_patch", False):
+        unpatch()  # start from an unpatched torch
+
+    monkeypatch.setattr(pytorch_patch, "TORCH_VERSION", bad_version)  # presumably what patch() checks - confirm
+    patch()
+    assert getattr(torch, "_datadog_patch", False) is False  # patch() must have declined to patch
+
+
+def test_install_runs_unconditionally(monkeypatch):
+    import torch
+
+    from ddtrace.contrib.internal.pytorch import _distributed
+    from ddtrace.contrib.internal.pytorch.patch import patch
+    from ddtrace.contrib.internal.pytorch.patch import unpatch
+
+    monkeypatch.delenv("RANK", raising=False)  # no RANK / WORLD_SIZE in the environment ...
+    monkeypatch.delenv("WORLD_SIZE", raising=False)
+    monkeypatch.setattr(torch.distributed, "is_initialized", lambda: False)  # ... and no initialized process group
+
+    if getattr(torch, "_datadog_patch", False):
+        unpatch()
+
+    patch()
+    try:
+        assert _distributed._installed is True  # install still happened
+    finally:
+        unpatch()
diff --git a/tests/contrib/pytorch/test_rank_root.py b/tests/contrib/pytorch/test_rank_root.py
new file mode 100644
index 00000000000..a5ec9a06f5f
--- /dev/null
+++ b/tests/contrib/pytorch/test_rank_root.py
@@ -0,0 +1,546 @@
+"""Tests for the pytorch.rank lifetime span."""
+
+import pytest
+
+from ddtrace.contrib.internal.pytorch import _device
+from ddtrace.contrib.internal.pytorch import _rank_root
+from ddtrace.contrib.internal.pytorch import _test_helpers as _th
+
+
+@pytest.fixture(autouse=True)
+def _reset(tracer, pytorch_clean_state):  # noqa: F811
+    """Autouse wrapper: every test in this module runs with the tracer and pytorch_clean_state fixtures."""
+
+
+def test_open_creates_span_with_required_tags(tracer):
+    _rank_root.open_rank_span(rank=3, world_size=8, framework="ddp", training_job_id="job-X")
+    span = _th.current_rank_span()
+    assert span is not None
+    assert span.name == "pytorch.rank"
+    assert span.get_tag("training_job.id") == "job-X"
+    assert span.get_metric("rank") == 3  # rank and world_size are read back as metrics, not tags
+    assert span.get_metric("world_size") == 8
+    assert span.get_tag("framework") == "ddp"
+    assert span.get_tag("device.id") == "h-9:cpu"  # NOTE(review): value presumably set up by pytorch_clean_state
+
+
+def test_open_is_idempotent(tracer):
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X")
+    first = _th.current_rank_span()
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X")
+    second = _th.current_rank_span()
+    assert first is second  # the second open kept the very same span object
+
+
+def test_close_finishes_span(tracer):
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X")
+    span = _th.current_rank_span()  # keep a reference; the helper returns None after close
+    _rank_root.close()
+    assert span.finished
+    assert _th.current_rank_span() is None
+
+
+def test_close_without_open_is_safe(tracer):
+    _rank_root.close()  # must not raise when no open_rank_span() call preceded it in this test
+
+
+def test_open_registers_atexit_handler(tracer, monkeypatch):
+    """Many users never call `unpatch()` (a `ddtrace-run` process just
+    exits). We register `close` as an atexit hook so the rank span is
+    finished cleanly on normal interpreter shutdown.
+    """
+    handlers = []  # callables handed to atexit.register during this test
+    real_register = _rank_root.atexit.register
+
+    def capture(fn, *a, **kw):
+        handlers.append(fn)
+        return real_register(fn, *a, **kw)  # still forward to the real atexit.register
+
+    monkeypatch.setattr(_rank_root.atexit, "register", capture)
+    _th.set_atexit_registered(False)  # presumably clears an "already registered" guard - confirm in _test_helpers
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X")
+    assert _rank_root.close in handlers
+
+
+def test_atexit_register_unregister_balanced_across_cycles(tracer, monkeypatch):
+    """``close()`` must ``atexit.unregister`` so multiple open/close cycles
+    don't accumulate handlers in the atexit list — only one ``close``
+    callback should be live between cycles.
+    """
+    registered = 0
+    unregistered = 0
+    real_register = _rank_root.atexit.register
+    real_unregister = _rank_root.atexit.unregister
+
+    def capture_register(fn, *a, **kw):
+        nonlocal registered
+        if fn is _rank_root.close:  # count only the rank-root handler
+            registered += 1
+        return real_register(fn, *a, **kw)
+
+    def capture_unregister(fn):
+        nonlocal unregistered
+        if fn is _rank_root.close:  # count only the rank-root handler
+            unregistered += 1
+        return real_unregister(fn)
+
+    monkeypatch.setattr(_rank_root.atexit, "register", capture_register)
+    monkeypatch.setattr(_rank_root.atexit, "unregister", capture_unregister)
+    _th.set_atexit_registered(False)
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X")
+    _rank_root.close()
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X")
+    _rank_root.close()
+    assert registered == 2  # two open/close cycles -> two registrations ...
+    assert unregistered == 2  # ... matched by two unregistrations
+
+
+def test_set_framework_updates_open_span_tag(tracer):
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X")
+    span = _th.current_rank_span()
+    assert span.get_tag("framework") == "none"
+    _rank_root.set_framework("ddp")  # retags the span that is already open
+    assert span.get_tag("framework") == "ddp"
+
+
+def test_set_framework_noop_without_open_span(tracer):
+    _rank_root.set_framework("ddp")  # must not raise when the test has opened no rank span
+
+
+def test_set_framework_noop_for_empty_string(tracer):
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X")
+    _rank_root.set_framework("")  # an empty value must not overwrite the existing tag
+    assert _th.current_rank_span().get_tag("framework") == "none"
+
+
+def test_ray_run_context_tagged_at_open_when_cache_populated_early(tracer):
+    """Driver-side path: the Ray Train fit wrapper populates the cache
+    before ``init_process_group`` fires, so the tags land at open.
+    """
+    from ddtrace.contrib.internal.pytorch import _utils
+
+    _utils.set_cached_run_metadata(
+        submission_id="raysubmit_early",  # expected as the ray.submission_id tag
+        metadata={"job_name": "early.job"},  # expected as the ray.metadata.job_name tag
+        run_name="run-early",  # expected as the ray.train.run_name tag
+    )
+    try:
+        _rank_root.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-X")
+        span = _th.current_rank_span()
+        assert span.get_tag("ray.submission_id") == "raysubmit_early"
+        assert span.get_tag("ray.metadata.job_name") == "early.job"
+        assert span.get_tag("ray.train.run_name") == "run-early"
+    finally:
+        _utils.clear_cached_run_metadata()  # always reset the cache
+
+
+def test_ray_run_context_backfilled_at_close_when_cache_populated_late(tracer):
+    """Worker-side path: Ray Train calls ``init_process_group`` itself
+    *before* invoking the wrapped train function, so the cache is empty
+    when the rank span opens. The wrapper populates the cache later,
+    and ``close()`` must backfill the tags before finishing the span.
+    """
+    from ddtrace.contrib.internal.pytorch import _utils
+
+    # Cache empty at open time.
+    _utils.clear_cached_run_metadata()
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-X")
+    span = _th.current_rank_span()
+    assert span.get_tag("ray.submission_id") is None
+    assert span.get_tag("ray.metadata.job_name") is None
+
+    # Wrapper fires after the rank span is already open.
+    _utils.set_cached_run_metadata(
+        submission_id="raysubmit_late",
+        metadata={"job_name": "late.job"},
+        run_name="run-late",
+    )
+    try:
+        _rank_root.close()  # expected to backfill the ray.* tags before finishing the span
+        assert span.get_tag("ray.submission_id") == "raysubmit_late"
+        assert span.get_tag("ray.metadata.job_name") == "late.job"
+        assert span.get_tag("ray.train.run_name") == "run-late"
+    finally:
+        _utils.clear_cached_run_metadata()  # always reset the cache
+
+
+def test_retag_ray_run_context_tags_live_rank_span(tracer):
+    """Regression: ``ray.submission_id`` was missing on ``pytorch.rank``
+    in live verification because ``_run_train_func_in_worker`` restores
+    the cache to empty before ``_rank_root.close()`` runs at exit. The
+    new ``retag_ray_run_context()`` entrypoint is called by the worker
+    wrap immediately after populating the cache so the tag lands on the
+    live span (not at close, which sees an empty cache).
+    """
+    from ddtrace.contrib.internal.pytorch import _utils
+
+    _utils.clear_cached_run_metadata()  # cache is empty when the span opens
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-X")
+    span = _th.current_rank_span()
+    assert span.get_tag("ray.submission_id") is None
+
+    # Worker wrap populates the cache, then immediately calls retag.
+    _utils.set_cached_run_metadata(
+        submission_id="raysubmit_eager",
+        metadata={"job_name": "eager.job"},
+        run_name="run-eager",
+    )
+    try:
+        _rank_root.retag_ray_run_context()
+        assert span.get_tag("ray.submission_id") == "raysubmit_eager"
+        assert span.get_tag("ray.metadata.job_name") == "eager.job"
+        assert span.get_tag("ray.train.run_name") == "run-eager"
+
+        # Simulate the worker wrap's finally clearing the cache (restore
+        # to empty). The tags must stay on the live span — they were
+        # written eagerly, not pulled at close.
+        _utils.clear_cached_run_metadata()
+        assert span.get_tag("ray.submission_id") == "raysubmit_eager"
+    finally:
+        _utils.clear_cached_run_metadata()
+        _rank_root.close()  # finish the span opened above
+
+
+def test_retag_ray_run_context_noop_when_no_span_open(tracer):
+    """retag_ray_run_context() must not crash when called with no rank
+    span open (e.g., installed but workers never reach init_process_group).
+    """
+    from ddtrace.contrib.internal.pytorch import _utils
+
+    # Ensure no span is open.
+    try:
+        _rank_root.close()
+    except Exception:  # NOTE(review): likely redundant - test_close_without_open_is_safe expects no raise
+        pass
+
+    _utils.set_cached_run_metadata(submission_id="x", metadata={}, run_name="r")  # cache populated, span absent
+    try:
+        # Must not raise.
+        _rank_root.retag_ray_run_context()
+    finally:
+        _utils.clear_cached_run_metadata()  # always reset the cache
+
+
+def test_rank_root_nests_under_active_ray_worker_span(tracer):
+    """When a `ray.train.worker` span is currently active, the
+    `pytorch.rank` span should become its child (not a new trace root).
+    """
+    ray_worker = tracer.start_span("ray.train.worker", service="ray")
+    tracer.context_provider.activate(ray_worker)  # make the ray span the active one before opening the rank span
+    try:
+        _rank_root.open_rank_span(rank=0, world_size=1, framework="ray", training_job_id="job-Y")
+        rank_span = _th.current_rank_span()
+        assert rank_span is not None
+        # The rank-root span should share a trace_id with the ray worker.
+        assert rank_span.trace_id == ray_worker.trace_id
+        # And its parent_id should reference the ray worker's span_id.
+        assert rank_span.parent_id == ray_worker.span_id
+    finally:
+        _rank_root.close()  # close the rank span first, then finish the ray span
+        ray_worker.finish()
+        tracer.context_provider.activate(None)  # leave no active span behind
+
+
+def test_rank_root_close_flush_is_bounded(monkeypatch):
+    """A slow tracer.flush() must not extend rank-root close beyond a bounded
+    timeout: close() joins the flush thread with a 2.0s timeout, plus a margin.
+    """
+    import threading
+    import time
+
+    from ddtrace import tracer
+
+    block = threading.Event()
+
+    def slow_flush(*args, **kwargs):
+        block.wait(timeout=10)
+
+    monkeypatch.setattr(tracer, "flush", slow_flush)
+
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1")
+    start = time.monotonic()
+    try:
+        _rank_root.close()
+        elapsed = time.monotonic() - start
+    finally:
+        block.set()  # always release the blocked flush, even if close() raised
+    assert elapsed < 3.0, f"close took {elapsed:.2f}s; expected bounded < 3s"
+
+
+# ---------------------------------------------------------------------------
+# Task 3: torch / cudnn / nccl / env / launcher / GPU invariant tagging
+# ---------------------------------------------------------------------------
+
+
+def test_detect_launcher_torchrun(monkeypatch):
+    from ddtrace.contrib.internal.pytorch import _distributed
+
+    monkeypatch.setenv("TORCHELASTIC_RUN_ID", "tr-123")  # the only launcher marker left in the environment
+    monkeypatch.delenv("RAY_JOB_ID", raising=False)
+    monkeypatch.delenv("SLURM_JOB_ID", raising=False)
+    monkeypatch.delenv("KUBEFLOW_TRAINING_JOB_ID", raising=False)
+    assert _distributed._detect_launcher() == "torchrun"
+
+
+def test_detect_launcher_ray(monkeypatch):
+    from ddtrace.contrib.internal.pytorch import _distributed
+
+    monkeypatch.delenv("TORCHELASTIC_RUN_ID", raising=False)
+    monkeypatch.setenv("RAY_JOB_ID", "rayjob-99")  # the only launcher marker left in the environment
+    monkeypatch.delenv("SLURM_JOB_ID", raising=False)
+    monkeypatch.delenv("KUBEFLOW_TRAINING_JOB_ID", raising=False)
+    assert _distributed._detect_launcher() == "ray"
+
+
+def test_detect_launcher_slurm(monkeypatch):
+    from ddtrace.contrib.internal.pytorch import _distributed
+
+    monkeypatch.delenv("TORCHELASTIC_RUN_ID", raising=False)
+    monkeypatch.delenv("RAY_JOB_ID", raising=False)
+    monkeypatch.setenv("SLURM_JOB_ID", "slurm-42")  # the only launcher marker left in the environment
+    monkeypatch.delenv("KUBEFLOW_TRAINING_JOB_ID", raising=False)
+    assert _distributed._detect_launcher() == "slurm"
+
+
+def test_detect_launcher_kubeflow(monkeypatch):
+    from ddtrace.contrib.internal.pytorch import _distributed
+
+    monkeypatch.delenv("TORCHELASTIC_RUN_ID", raising=False)
+    monkeypatch.delenv("RAY_JOB_ID", raising=False)
+    monkeypatch.delenv("SLURM_JOB_ID", raising=False)
+    monkeypatch.setenv("KUBEFLOW_TRAINING_JOB_ID", "kf-job-1")  # the only launcher marker left in the environment
+    assert _distributed._detect_launcher() == "kubeflow"
+
+
+def test_detect_launcher_none(monkeypatch):
+    from ddtrace.contrib.internal.pytorch import _distributed
+
+    for var in (
+        "TORCHELASTIC_RUN_ID",
+        "RAY_JOB_ID",
+        "SLURM_JOB_ID",
+        "KUBEFLOW_TRAINING_JOB_ID",
+    ):
+        monkeypatch.delenv(var, raising=False)
+    assert _distributed._detect_launcher() is None  # none of the four launcher markers is present
+
+
+def test_get_cached_backend_caches_result(monkeypatch):
+    from ddtrace.contrib.internal.pytorch import _distributed
+
+    _distributed._cached_distributed_backend = None  # reset the cache
+    try:
+        monkeypatch.setattr(
+            "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.is_available",
+            lambda: True,
+        )
+        monkeypatch.setattr(
+            "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.is_initialized",
+            lambda: True,
+        )
+        monkeypatch.setattr(
+            "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.get_backend",
+            lambda: "nccl",
+        )
+        result1 = _distributed._get_cached_backend()
+        assert result1 == "nccl"
+        # Second call should return cached value without calling get_backend again.
+        monkeypatch.setattr(
+            "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.get_backend",
+            lambda: "SHOULD_NOT_BE_CALLED",
+        )
+        result2 = _distributed._get_cached_backend()
+        assert result2 == "nccl"
+    finally:
+        _distributed._cached_distributed_backend = None  # clean up even when an assertion fails
+
+
+def test_get_cached_backend_returns_none_when_not_initialized(monkeypatch):
+    from ddtrace.contrib.internal.pytorch import _distributed
+
+    _distributed._cached_distributed_backend = None  # start with an empty cache
+    monkeypatch.setattr(
+        "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.is_available",
+        lambda: True,
+    )
+    monkeypatch.setattr(
+        "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.is_initialized",
+        lambda: False,
+    )
+    assert _distributed._get_cached_backend() is None  # is_initialized() is False -> no backend reported
+    _distributed._cached_distributed_backend = None  # NOTE(review): skipped if the assertion above fails
+
+
+def test_rank_span_carries_torch_invariants(monkeypatch):
+    """pytorch.rank span must carry torch version and cuDNN settings."""
+    captured = {}  # every tag/attribute written to the fake span, keyed by name
+
+    class FakeSpan:
+        def __init__(self):
+            self.context = type("C", (), {"sampling_priority": 1})()  # stub context with sampling_priority only
+
+        def set_tag(self, k, v=None):
+            captured[k] = v
+
+        def _set_attribute(self, k, v):
+            captured[k] = v
+
+        def finish(self):
+            pass
+
+    fake = FakeSpan()
+    from ddtrace import tracer
+
+    monkeypatch.setattr(tracer, "start_span", lambda *a, **kw: fake)  # every start_span() returns the fake
+
+    # Drop all NCCL / env vars so only torch/cudnn tags appear.
+    for v in (
+        "NCCL_DEBUG",
+        "NCCL_SOCKET_IFNAME",
+        "NCCL_IB_DISABLE",
+        "NCCL_P2P_DISABLE",
+        "NCCL_ALGO",
+        "NCCL_PROTO",
+        "TORCH_NCCL_ASYNC_ERROR_HANDLING",
+        "CUDA_VISIBLE_DEVICES",
+        "MASTER_ADDR",
+        "LOCAL_RANK",
+        "LOCAL_WORLD_SIZE",
+        "GROUP_RANK",
+        "GROUP_WORLD_SIZE",
+        "MASTER_PORT",
+        "TORCHELASTIC_RUN_ID",
+        "RAY_JOB_ID",
+        "SLURM_JOB_ID",
+        "KUBEFLOW_TRAINING_JOB_ID",
+    ):
+        monkeypatch.delenv(v, raising=False)
+
+    _rank_root._span = None  # forget any span left open so open_rank_span() does not reuse it
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1")
+
+    # torch.__version__ is always populated; cudnn.{enabled,benchmark,deterministic} too.
+    assert "torch.version" in captured
+    assert "torch.cudnn.enabled" in captured  # only `enabled` is checked out of the three cudnn settings
+
+    _rank_root.close()
+
+
+def test_rank_span_carries_env_signals(monkeypatch):
+    """pytorch.rank span must carry NCCL/distributed env vars as tags/facets."""
+    from ddtrace import tracer
+
+    captured = {}  # every tag/attribute written to the fake span, keyed by name
+
+    class FakeSpan:
+        def __init__(self):
+            self.context = type("C", (), {"sampling_priority": 1})()  # stub context with sampling_priority only
+
+        def set_tag(self, k, v=None):
+            captured[k] = v
+
+        def _set_attribute(self, k, v):
+            captured[k] = v
+
+        def finish(self):
+            pass
+
+    monkeypatch.setattr(tracer, "start_span", lambda *a, **kw: FakeSpan())  # every start_span() returns a fake
+    monkeypatch.setenv("NCCL_DEBUG", "INFO")
+    monkeypatch.setenv("LOCAL_RANK", "3")
+    monkeypatch.setenv("MASTER_ADDR", "10.0.0.5")
+    monkeypatch.setenv("MASTER_PORT", "29500")
+
+    _rank_root._span = None  # forget any span left open so open_rank_span() does not reuse it
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1")
+    assert captured.get("nccl.debug") == "INFO"
+    assert captured.get("pytorch.local_rank") == 3  # numeric env values are expected as ints, not strings
+    assert captured.get("pytorch.master_addr") == "10.0.0.5"
+    assert captured.get("pytorch.master_port") == 29500
+    _rank_root.close()
+
+
+def test_rank_span_carries_launcher_tag(monkeypatch):
+    """pytorch.rank span must carry the `launcher` tag when a launcher env var is set."""
+    from ddtrace import tracer
+
+    captured = {}  # every tag/attribute written to the fake span, keyed by name
+
+    class FakeSpan:
+        def __init__(self):
+            self.context = type("C", (), {"sampling_priority": 1})()  # stub context with sampling_priority only
+
+        def set_tag(self, k, v=None):
+            captured[k] = v
+
+        def _set_attribute(self, k, v):
+            captured[k] = v
+
+        def finish(self):
+            pass
+
+    monkeypatch.setattr(tracer, "start_span", lambda *a, **kw: FakeSpan())  # every start_span() returns a fake
+    # Clear all other launcher vars so only torchrun fires.
+    for v in ("RAY_JOB_ID", "SLURM_JOB_ID", "KUBEFLOW_TRAINING_JOB_ID"):
+        monkeypatch.delenv(v, raising=False)
+    monkeypatch.setenv("TORCHELASTIC_RUN_ID", "elastic-run-1")
+
+    _rank_root._span = None  # forget any span left open so open_rank_span() does not reuse it
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1")
+    assert captured.get("launcher") == "torchrun"
+    _rank_root.close()
+
+
+def test_rank_span_uses_default_pytorch_service(_reset):  # NOTE(review): does not clear DD_PYTORCH_SERVICE itself
+    """pytorch.rank spans use 'pytorch' as service when DD_PYTORCH_SERVICE is unset."""
+    import ddtrace.contrib.internal.pytorch._rank_root as rr
+
+    rr.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-1")
+    span = rr._span  # read the module-level span directly instead of going through the test helper
+    assert span.service == "pytorch", f"Expected 'pytorch', got {span.service!r}"
+
+
+def test_rank_span_carries_new_device_gpu_fields(monkeypatch):
+    """pytorch.rank span must expose GPU DeviceInfo fields when populated."""
+    from ddtrace import tracer
+    from ddtrace.contrib.internal.pytorch._device import DeviceInfo
+
+    captured = {}  # every tag/attribute written to the fake span, keyed by name
+
+    class FakeSpan:
+        def __init__(self):
+            self.context = type("C", (), {"sampling_priority": 1})()  # stub context with sampling_priority only
+
+        def set_tag(self, k, v=None):
+            captured[k] = v
+
+        def _set_attribute(self, k, v):
+            captured[k] = v
+
+        def finish(self):
+            pass
+
+    monkeypatch.setattr(tracer, "start_span", lambda *a, **kw: FakeSpan())  # every start_span() returns a fake
+
+    # Inject a fake DeviceInfo with GPU fields.
+    fake_info = DeviceInfo(
+        device_id="gpu-uuid-abc",
+        device_index=0,
+        kind="cuda",
+        hostname="node-1",
+        gpu_name="NVIDIA A100",
+        gpu_compute_capability="8.0",
+        gpu_sm_count=108,
+        gpu_total_memory_bytes=85899345920,
+        gpu_driver_version="525.85.12",
+    )
+    monkeypatch.setattr(_device, "get", lambda: fake_info)  # _device.get() now returns the fake DeviceInfo
+
+    _rank_root._span = None  # forget any span left open so open_rank_span() does not reuse it
+    _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1")
+    assert captured.get("device.gpu.name") == "NVIDIA A100"
+    assert captured.get("device.gpu.compute_capability") == "8.0"
+    assert captured.get("device.gpu.sm_count") == 108
+    assert captured.get("device.gpu.total_memory_bytes") == 85899345920
+    assert captured.get("device.gpu.driver_version") == "525.85.12"
+    _rank_root.close()
diff --git a/tests/contrib/pytorch/test_repatch_and_exception_paths.py b/tests/contrib/pytorch/test_repatch_and_exception_paths.py
new file mode 100644
index 00000000000..576eb7cb3fe
--- /dev/null
+++ b/tests/contrib/pytorch/test_repatch_and_exception_paths.py
@@ -0,0 +1,124 @@
+"""Regression tests for PyTorch integration edge cases:
+
+* ``install()`` / ``uninstall()`` must be idempotent (no wrapper stacking).
+* A full patch / unpatch / patch cycle must leave exactly one wrapper layer.
+* Exceptions raised inside ``_bootstrap_distributed`` / ``_wrapped_destroy_process_group``
+  must not leave the integration in a broken state.
+"""
+
+import pytest
+import torch
+
+from ddtrace.contrib.internal.pytorch import _distributed
+from ddtrace.contrib.internal.pytorch import _test_helpers as _th
+from ddtrace.contrib.internal.pytorch import patch as pytorch_patch
+
+
+def _force_clean_wraps() -> None:
+    """Defensively remove any pytorch wraps left by earlier tests in this
+    session. Earlier tests (e.g. ``test_layer_one_gating``) call
+    ``_distributed.install()`` directly, bypassing the
+    ``torch._datadog_patch`` flag, so the high-level ``unpatch()`` returns
+    early and leaves wrappers attached. Force ``_installed = True`` and call
+    ``uninstall()`` to walk the canonical teardown path.
+    """
+    setattr(torch, "_datadog_patch", False)  # mark torch as unpatched regardless of its previous state
+    _distributed._installed = True  # so uninstall() does not return early
+    try:
+        _distributed.uninstall()
+    except Exception:  # best-effort cleanup: a failing uninstall() is deliberately ignored
+        pass
+
+
+@pytest.fixture
+def _clean_state(monkeypatch):  # monkeypatch is requested but not used in the body
+    _force_clean_wraps()
+    _th.reset_device_cache()
+    _th.close_rank_root()
+    yield  # the test runs here; the same three cleanup steps repeat afterwards
+    _force_clean_wraps()
+    _th.reset_device_cache()
+    _th.close_rank_root()
+
+
+def _dd_wrapper_depth(fn) -> int:
+    """Count only ``wrapt``-added layers.
+
+    torch itself decorates some distributed functions with ``functools.wraps``,
+    which also sets ``__wrapped__``; we only count layers that are ``wrapt``
+    ``FunctionWrapper`` instances so torch's own decorators are excluded.
+    """
+    import wrapt
+
+    depth = 0
+    f = fn
+    while isinstance(f, wrapt.FunctionWrapper):  # stops at the first object that is not a wrapt wrapper
+        depth += 1
+        f = f.__wrapped__  # step inward to the wrapped callable
+    return depth
+
+
+def _dd_wrapper_depth_ipg() -> int:
+    """Wrapper depth on ``torch.distributed.init_process_group``.
+
+    The integration wraps ``init_process_group`` and ``destroy_process_group`` (not
+    collectives); this helper measures only ``init_process_group`` of the two.
+    """
+    return _dd_wrapper_depth(torch.distributed.init_process_group)
+
+
+def test_install_is_idempotent_no_wrapper_stacking(_clean_state):
+    """Calling install() twice must not stack wrappers on torch.distributed."""
+    assert _dd_wrapper_depth_ipg() == 0  # precondition: no wrapt layer before install()
+    _distributed.install()
+    depth_after_first = _dd_wrapper_depth_ipg()
+    assert depth_after_first == 1
+    _distributed.install()  # must be a no-op
+    assert _dd_wrapper_depth_ipg() == depth_after_first
+    _distributed.uninstall()
+    assert _dd_wrapper_depth_ipg() == 0  # uninstall() removed the single layer
+
+
+def test_patch_unpatch_patch_cycle_is_clean(_clean_state):
+    """A full patch/unpatch/patch cycle must leave exactly one wrapper layer."""
+    pytorch_patch.patch()
+    depth_after_first = _dd_wrapper_depth_ipg()
+    pytorch_patch.unpatch()
+    assert _dd_wrapper_depth_ipg() == 0
+    pytorch_patch.patch()
+    assert _dd_wrapper_depth_ipg() == depth_after_first  # same depth as the first patch (not pinned to 1 here)
+
+
+def test_uninstall_is_idempotent(_clean_state):
+    """uninstall() without a prior install() is a no-op."""
+    _distributed.uninstall()
+    _distributed.uninstall()  # a repeated call must not raise either
+
+
+def test_exception_in_bootstrap_does_not_corrupt_install_state(monkeypatch, _clean_state):
+    """If _bootstrap_distributed raises, install() state is still usable."""
+    monkeypatch.setattr(_distributed, "_bootstrap_distributed", lambda: (_ for _ in ()).throw(RuntimeError("boom")))
+    pytorch_patch.patch()
+    # init_process_group wrapper is in place even after a bootstrap failure
+    assert _distributed._installed  # NOTE(review): checks the flag only, not the wrapper itself
+    pytorch_patch.unpatch()
+    assert not _distributed._installed  # teardown still resets the flag
+
+
+def test_exception_in_destroy_still_closes_rank_span(monkeypatch, _clean_state):
+    """_rank_root.close() is called even when destroy_process_group raises."""
+    closed = []  # gets one entry per call to the stubbed _rank_root.close
+    monkeypatch.setattr(
+        "ddtrace.contrib.internal.pytorch._rank_root.close",
+        lambda: closed.append(True),
+    )
+
+    def raising_destroy(*a, **kw):
+        raise RuntimeError("destroy failed")
+
+    monkeypatch.setattr(torch.distributed, "destroy_process_group", raising_destroy)  # stubbed before patch() runs
+    pytorch_patch.patch()
+    with pytest.raises(RuntimeError, match="destroy failed"):  # the original error must still propagate
+        torch.distributed.destroy_process_group()
+    assert closed, "_rank_root.close() was not called despite try/finally"
+    pytorch_patch.unpatch()
diff --git a/tests/contrib/pytorch/test_utils.py b/tests/contrib/pytorch/test_utils.py
new file mode 100644
index 00000000000..b5d593d2b38
--- /dev/null
+++ b/tests/contrib/pytorch/test_utils.py
@@ -0,0 +1,218 @@
+from ddtrace.contrib.internal.pytorch._utils import TRAINING_JOB_ID_TAG
+from ddtrace.contrib.internal.pytorch._utils import job_id_env_set
+from ddtrace.contrib.internal.pytorch._utils import resolve_job_id_from_env
+from ddtrace.contrib.internal.pytorch._utils import set_training_job_id_tag
+
+
+def test_resolve_job_id_falls_back_to_torchelastic(monkeypatch):
+    for cleared in ("DD_PYTORCH_JOB_ID", "RAY_JOB_ID"):
+        monkeypatch.delenv(cleared, raising=False)
+    monkeypatch.setenv("TORCHELASTIC_RUN_ID", "elastic-id")
+    monkeypatch.setenv("KUBEFLOW_TRAINING_JOB_ID", "kf-id")
+    monkeypatch.setenv("SLURM_JOB_ID", "slurm-id")
+    assert resolve_job_id_from_env() == "elastic-id"  # torchelastic is picked ahead of kubeflow and slurm
+
+
+def test_resolve_job_id_falls_back_to_kubeflow(monkeypatch):
+    """With the DD, Ray and torchelastic ids cleared, the Kubeflow id is chosen over Slurm."""
+    for cleared in ("DD_PYTORCH_JOB_ID", "RAY_JOB_ID", "TORCHELASTIC_RUN_ID"):
+        monkeypatch.delenv(cleared, raising=False)
+    monkeypatch.setenv("KUBEFLOW_TRAINING_JOB_ID", "kf-id")
+    monkeypatch.setenv("SLURM_JOB_ID", "slurm-id")
+    assert resolve_job_id_from_env() == "kf-id"
+
+
+def test_resolve_job_id_falls_back_to_slurm(monkeypatch):
+    for cleared in ("DD_PYTORCH_JOB_ID", "RAY_JOB_ID", "TORCHELASTIC_RUN_ID", "KUBEFLOW_TRAINING_JOB_ID"):
+        monkeypatch.delenv(cleared, raising=False)
+    monkeypatch.setenv("SLURM_JOB_ID", "slurm-id")
+    assert resolve_job_id_from_env() == "slurm-id"  # Slurm is the only source left set by this test
+
+
+def test_resolve_job_id_generates_uuid_when_unset(monkeypatch):
+    """With every job-id variable cleared, the resolver must return a UUID-shaped string.
+
+    Only the textual shape is checked (overall length and hyphen count);
+    the UUID version is not verified.
+    """
+    for name in ("DD_PYTORCH_JOB_ID", "RAY_JOB_ID", "TORCHELASTIC_RUN_ID", "KUBEFLOW_TRAINING_JOB_ID", "SLURM_JOB_ID"):
+        monkeypatch.delenv(name, raising=False)
+    generated = resolve_job_id_from_env()
+    # UUID4 form has 36 chars including hyphens.
+    assert len(generated) == 36
+    assert generated.count("-") == 4
+
+
+def test_resolve_job_id_empty_string_falls_through(monkeypatch):  # "empty" here is a whitespace-only value
+    monkeypatch.setenv("DD_PYTORCH_JOB_ID", "   ")  # whitespace-only treated as unset
+    monkeypatch.setenv("TORCHELASTIC_RUN_ID", "elastic-id")  # NOTE(review): RAY_JOB_ID is not cleared first
+    assert resolve_job_id_from_env() == "elastic-id"
+
+
+def test_dd_pytorch_job_id_wins_over_ray_job_id(monkeypatch):
+    monkeypatch.setenv("RAY_JOB_ID", "33000000")
+    monkeypatch.setenv("DD_PYTORCH_JOB_ID", "user-supplied-id")
+    assert resolve_job_id_from_env() == "user-supplied-id"  # the explicit DD value beats the Ray id
+
+
+def test_resolve_job_id_prefers_ray_over_torchelastic(monkeypatch):
+    """Without DD_PYTORCH_JOB_ID, the Ray id wins even if torchelastic, Kubeflow and Slurm ids are set."""
+    monkeypatch.delenv("DD_PYTORCH_JOB_ID", raising=False)
+    monkeypatch.setenv("SLURM_JOB_ID", "slurm-789")
+    monkeypatch.setenv("KUBEFLOW_TRAINING_JOB_ID", "kf-456")
+    monkeypatch.setenv("TORCHELASTIC_RUN_ID", "te-xyz")
+    monkeypatch.setenv("RAY_JOB_ID", "ray-abc")
+    assert resolve_job_id_from_env() == "ray-abc"
+
+
+def test_job_id_env_set_false_when_all_unset(monkeypatch):
+    for name in ("DD_PYTORCH_JOB_ID", "RAY_JOB_ID", "TORCHELASTIC_RUN_ID", "KUBEFLOW_TRAINING_JOB_ID", "SLURM_JOB_ID"):
+        monkeypatch.delenv(name, raising=False)
+    assert job_id_env_set() is False  # identity check: a merely falsy return value would not pass
+
+
+def test_job_id_env_set_treats_whitespace_as_unset(monkeypatch):
+    for name in ("DD_PYTORCH_JOB_ID", "RAY_JOB_ID", "TORCHELASTIC_RUN_ID", "KUBEFLOW_TRAINING_JOB_ID", "SLURM_JOB_ID"):
+        monkeypatch.delenv(name, raising=False)
+    monkeypatch.setenv("DD_PYTORCH_JOB_ID", "   \t\n")  # spaces, a tab and a newline only
+    assert job_id_env_set() is False
+
+
+def test_set_training_job_id_tag_sets_both_keys(monkeypatch):
+    """The module-level default job id must be written under both TRAINING_JOB_ID_TAG and "job_id"."""
+    from ddtrace.contrib.internal.pytorch import _utils
+
+    monkeypatch.setattr(_utils, "_default_job_id", "ray-abc-123", raising=False)
+    monkeypatch.setattr(_utils._tls_job_id, "value", None, raising=False)
+
+    class _RecordingSpan:
+        def __init__(self):
+            self._tags = {}
+
+        def set_tag(self, key, value=None):
+            self._tags[key] = value
+
+    recorder = _RecordingSpan()
+    set_training_job_id_tag(recorder)
+    assert recorder._tags[TRAINING_JOB_ID_TAG] == "ray-abc-123"
+    assert recorder._tags["job_id"] == "ray-abc-123"
+
+
+def test_set_training_job_id_tag_noop_when_id_unset(monkeypatch):  # "noop" covers the job-id tag only
+    from ddtrace.contrib.internal.pytorch import _utils
+
+    monkeypatch.setattr(_utils._tls_job_id, "value", None, raising=False)
+    monkeypatch.setattr(_utils, "_default_job_id", None, raising=False)
+
+    class _RecordingSpan:
+        def __init__(self):
+            self._tags = {}
+
+        def set_tag(self, key, value=None):
+            self._tags[key] = value
+
+    recorder = _RecordingSpan()
+    set_training_job_id_tag(recorder)
+    assert TRAINING_JOB_ID_TAG not in recorder._tags
+    assert "manual.keep" in recorder._tags  # still expected even though no job id is available
+
+
+def test_set_training_job_id_tag_does_not_acquire_lock(monkeypatch):
+    """Tagging a span must never acquire `_run_metadata_lock`: the lock is swapped
+    for a recording proxy and 100 tagging calls must leave the recording empty.
+    """
+    from ddtrace.contrib.internal.pytorch import _utils
+
+    _utils.set_cached_run_metadata(run_name="rn", submission_id="sub", metadata={"k": "v"})
+    _utils.set_cached_job_id("training-abc")  # NOTE(review): nothing in this test resets these cached values
+
+    acquired = []
+    real_lock = _utils._run_metadata_lock  # the proxy below delegates to the genuine lock
+
+    class WatchingLock:  # records every acquire/release, including those made through `with`
+        def acquire(self, *a, **kw):
+            acquired.append("acquire")
+            return real_lock.acquire(*a, **kw)
+
+        def release(self):
+            acquired.append("release")
+            return real_lock.release()
+
+        def __enter__(self):
+            self.acquire()
+            return self
+
+        def __exit__(self, *a):
+            self.release()
+
+    monkeypatch.setattr(_utils, "_run_metadata_lock", WatchingLock())  # installed only after the cache is primed
+
+    class FakeSpan:
+        def __init__(self):
+            self.tags = {}
+
+        def set_tag(self, k, v=None):
+            self.tags[k] = v
+
+    for _ in range(100):
+        s = FakeSpan()
+        _utils.set_training_job_id_tag(s)
+        assert s.tags.get("training_job.id") == "training-abc"  # presumably TRAINING_JOB_ID_TAG; TODO confirm
+
+    assert acquired == [], f"hot-path span tagging took the lock: {acquired}"
+
+
+def test_get_cached_run_metadata_is_immutable():
+    """The mapping from get_cached_run_metadata() must raise TypeError on top-level and nested writes."""
+    import pytest
+
+    from ddtrace.contrib.internal.pytorch import _utils
+
+    _utils.set_cached_run_metadata(run_name="rn", submission_id="sub", metadata={"k": "v"})
+    view = _utils.get_cached_run_metadata()
+
+    with pytest.raises(TypeError):
+        view["run_name"] = "mutated"
+    with pytest.raises(TypeError):
+        view["metadata"]["k"] = "mutated"
+
+
+def test_run_metadata_view_consistent_under_writer_load():
+    """Four reader threads poll the cached run metadata while the main thread rewrites it;
+    in every snapshot run_name, the submission_id prefix and metadata["k"] must all agree.
+    """
+    import threading
+
+    from ddtrace.contrib.internal.pytorch import _utils
+
+    _utils.set_cached_run_metadata(run_name="A", submission_id="A-sub", metadata={"k": "A"})
+
+    ready = threading.Barrier(5)  # 4 readers + main
+    stop = threading.Event()
+    seen_inconsistent = []
+
+    def reader():
+        ready.wait(timeout=5)  # line up with the writer before polling starts
+        while not stop.is_set():
+            snap = _utils.get_cached_run_metadata()
+            rn = snap.get("run_name")
+            sub = snap.get("submission_id")
+            md = (snap.get("metadata") or {}).get("k")
+            if rn is None or sub is None or md is None:  # a snapshot lacking any field counts as inconsistent
+                seen_inconsistent.append(("missing", rn, sub, md))
+                continue
+            if not (rn == sub.split("-")[0] == md):  # all three must carry the same label
+                seen_inconsistent.append((rn, sub, md))
+
+    threads = [threading.Thread(target=reader) for _ in range(4)]
+    for t in threads:
+        t.start()
+    try:
+        ready.wait(timeout=5)
+        for label in ("B", "C", "D", "E"):
+            _utils.set_cached_run_metadata(run_name=label, submission_id=f"{label}-sub", metadata={"k": label})
+    finally:
+        stop.set()  # always release the readers, even if the writer side failed
+        for t in threads:
+            t.join(timeout=2)  # bounded wait; a reader still alive afterwards is not reported
+    assert seen_inconsistent == [], f"saw torn reads: {seen_inconsistent[:5]}"
diff --git a/tests/contrib/suitespec.yml b/tests/contrib/suitespec.yml
index 812a28aa550..35b61fbb265 100644
--- a/tests/contrib/suitespec.yml
+++ b/tests/contrib/suitespec.yml
@@ -167,6 +167,8 @@ components:
     - ddtrace/ext/memcached.py
   pynamodb:
     - ddtrace/contrib/internal/pynamodb/*
+  pytorch:
+    - ddtrace/contrib/internal/pytorch/*
   pyodbc:
     - ddtrace/contrib/internal/pyodbc/*
   pyramid:
@@ -1107,6 +1109,18 @@ suites:
       - '@pynamodb'
       - tests/contrib/pynamodb/*
     snapshot: true
+  pytorch:
+    venvs_per_job: 1
+    skip_venv_artifacts: true
+    skip_pip_cache: true
+    paths:
+      - '@bootstrap'
+      - '@core'
+      - '@contrib'
+      - '@tracing'
+      - '@pytorch'
+      - tests/contrib/pytorch/*
+    snapshot: true
   pyodbc:
     parallelism: 1
     paths: