diff --git a/.gitignore b/.gitignore index 1694325cf45..4b515442720 100644 --- a/.gitignore +++ b/.gitignore @@ -229,4 +229,4 @@ src/native/target* .analysis/ # file created when running scripts/lint -uv.lock \ No newline at end of file +uv.lock diff --git a/.riot/requirements/1059304.txt b/.riot/requirements/1059304.txt new file mode 100644 index 00000000000..d9c2b106d89 --- /dev/null +++ b/.riot/requirements/1059304.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1059304.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +exceptiongroup==1.3.1 +filelock==3.29.1 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.4.2 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.3.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/116b0b8.txt b/.riot/requirements/116b0b8.txt new file mode 100644 index 00000000000..c428ec61c6b --- /dev/null +++ b/.riot/requirements/116b0b8.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/116b0b8.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.1 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.3.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/1346e9d.txt b/.riot/requirements/1346e9d.txt new file mode 100644 index 00000000000..14b0ff6eccf --- /dev/null +++ 
b/.riot/requirements/1346e9d.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1346e9d.in +# +attrs==26.1.0 +coverage[toml]==7.10.7 +exceptiongroup==1.3.1 +filelock==3.19.1 +fsspec==2025.10.0 +hypothesis==6.45.0 +iniconfig==2.1.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.2.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==8.4.2 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.13.1 +tomli==2.4.1 +torch==2.5.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/1351aca.txt b/.riot/requirements/1351aca.txt new file mode 100644 index 00000000000..2a5f81ce7a3 --- /dev/null +++ b/.riot/requirements/1351aca.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1351aca.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.11.0 +typing-extensions==4.15.0 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==81.0.0 diff --git a/.riot/requirements/139b6b2.txt b/.riot/requirements/139b6b2.txt new file mode 100644 index 00000000000..156bee5d715 --- /dev/null +++ b/.riot/requirements/139b6b2.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/139b6b2.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +exceptiongroup==1.3.1 +filelock==3.29.1 +fsspec==2026.4.0 
+hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.4.2 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.1.2 +typing-extensions==4.15.0 diff --git a/.riot/requirements/16e767e.txt b/.riot/requirements/16e767e.txt new file mode 100644 index 00000000000..6053f8be5f3 --- /dev/null +++ b/.riot/requirements/16e767e.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/16e767e.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.4.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/173555b.txt b/.riot/requirements/173555b.txt new file mode 100644 index 00000000000..8806dadf055 --- /dev/null +++ b/.riot/requirements/173555b.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/173555b.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.12.0 +typing-extensions==4.15.0 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==81.0.0 diff --git 
a/.riot/requirements/177b157.txt b/.riot/requirements/177b157.txt new file mode 100644 index 00000000000..6db2893b99d --- /dev/null +++ b/.riot/requirements/177b157.txt @@ -0,0 +1,27 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/177b157.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.1 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.0.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/179c655.txt b/.riot/requirements/179c655.txt new file mode 100644 index 00000000000..8b96803664e --- /dev/null +++ b/.riot/requirements/179c655.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate --output-file=.riot/requirements/179c655.txt .riot/requirements/179c655.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.13.1 +torch==2.5.1 +typing-extensions==4.15.0 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==82.0.1 diff --git a/.riot/requirements/17a8226.txt b/.riot/requirements/17a8226.txt new file mode 100644 index 00000000000..67efb17aff2 --- /dev/null +++ b/.riot/requirements/17a8226.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate 
.riot/requirements/17a8226.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.1 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.1.2 +typing-extensions==4.15.0 diff --git a/.riot/requirements/181e2d5.txt b/.riot/requirements/181e2d5.txt new file mode 100644 index 00000000000..ae157b66f35 --- /dev/null +++ b/.riot/requirements/181e2d5.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/181e2d5.in +# +attrs==26.1.0 +coverage[toml]==7.10.7 +exceptiongroup==1.3.1 +filelock==3.19.1 +fsspec==2025.10.0 +hypothesis==6.45.0 +iniconfig==2.1.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.2.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==8.4.2 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.1.2 +typing-extensions==4.15.0 diff --git a/.riot/requirements/1989fbc.txt b/.riot/requirements/1989fbc.txt new file mode 100644 index 00000000000..b1229d837c6 --- /dev/null +++ b/.riot/requirements/1989fbc.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1989fbc.in +# +attrs==26.1.0 +coverage[toml]==7.10.7 +exceptiongroup==1.3.1 +filelock==3.19.1 +fsspec==2025.10.0 +hypothesis==6.45.0 +iniconfig==2.1.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.2.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==8.4.2 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 
+tomli==2.4.1 +torch==2.4.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/19ca09f.txt b/.riot/requirements/19ca09f.txt new file mode 100644 index 00000000000..d5b1044b891 --- /dev/null +++ b/.riot/requirements/19ca09f.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/19ca09f.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.8.0 +typing-extensions==4.15.0 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==82.0.1 diff --git a/.riot/requirements/1a4c54d.txt b/.riot/requirements/1a4c54d.txt new file mode 100644 index 00000000000..fc009b490fd --- /dev/null +++ b/.riot/requirements/1a4c54d.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1a4c54d.in +# +attrs==26.1.0 +coverage[toml]==7.10.7 +exceptiongroup==1.3.1 +filelock==3.19.1 +fsspec==2025.10.0 +hypothesis==6.45.0 +iniconfig==2.1.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.2.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==8.4.2 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.3.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/1a9e432.txt b/.riot/requirements/1a9e432.txt new file mode 100644 index 00000000000..09e6848ed2c --- /dev/null +++ b/.riot/requirements/1a9e432.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by 
the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1a9e432.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.1 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.2.2 +typing-extensions==4.15.0 diff --git a/.riot/requirements/1b254f8.txt b/.riot/requirements/1b254f8.txt new file mode 100644 index 00000000000..bfee11f94c0 --- /dev/null +++ b/.riot/requirements/1b254f8.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1b254f8.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.1 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.3.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/1d55347.txt b/.riot/requirements/1d55347.txt new file mode 100644 index 00000000000..9c1ec80cd8e --- /dev/null +++ b/.riot/requirements/1d55347.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1d55347.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +exceptiongroup==1.3.1 +filelock==3.29.1 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.4.2 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 
+sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.2.2 +typing-extensions==4.15.0 diff --git a/.riot/requirements/1d6137c.txt b/.riot/requirements/1d6137c.txt new file mode 100644 index 00000000000..ff3d58d89e9 --- /dev/null +++ b/.riot/requirements/1d6137c.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1d6137c.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +exceptiongroup==1.3.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.4.2 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.4.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/1e9ae39.txt b/.riot/requirements/1e9ae39.txt new file mode 100644 index 00000000000..c652dc05312 --- /dev/null +++ b/.riot/requirements/1e9ae39.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/1e9ae39.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.13.1 +torch==2.6.0 +typing-extensions==4.15.0 diff --git a/.riot/requirements/1ea7124.txt b/.riot/requirements/1ea7124.txt new file mode 100644 index 00000000000..400d8084475 --- /dev/null +++ b/.riot/requirements/1ea7124.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe 
--no-annotate .riot/requirements/1ea7124.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.4.1 +typing-extensions==4.15.0 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==82.0.1 diff --git a/.riot/requirements/1efcde5.txt b/.riot/requirements/1efcde5.txt new file mode 100644 index 00000000000..94ad685f2d5 --- /dev/null +++ b/.riot/requirements/1efcde5.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate --output-file=.riot/requirements/1efcde5.txt .riot/requirements/1efcde5.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.13.1 +torch==2.6.0 +typing-extensions==4.15.0 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==82.0.1 diff --git a/.riot/requirements/21226ae.txt b/.riot/requirements/21226ae.txt new file mode 100644 index 00000000000..a62fe680e93 --- /dev/null +++ b/.riot/requirements/21226ae.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate --output-file=.riot/requirements/21226ae.txt .riot/requirements/21226ae.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 
+jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.7.1 +typing-extensions==4.15.0 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==82.0.1 diff --git a/.riot/requirements/2dde9bb.txt b/.riot/requirements/2dde9bb.txt new file mode 100644 index 00000000000..db8a451d890 --- /dev/null +++ b/.riot/requirements/2dde9bb.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/2dde9bb.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.9.1 +typing-extensions==4.15.0 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==82.0.1 diff --git a/.riot/requirements/34517c6.txt b/.riot/requirements/34517c6.txt new file mode 100644 index 00000000000..3a238ca594f --- /dev/null +++ b/.riot/requirements/34517c6.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/34517c6.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +exceptiongroup==1.3.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.4.2 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 
+sympy==1.13.1 +tomli==2.4.1 +torch==2.5.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/6444f67.txt b/.riot/requirements/6444f67.txt new file mode 100644 index 00000000000..a06c7d66b04 --- /dev/null +++ b/.riot/requirements/6444f67.txt @@ -0,0 +1,29 @@ +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/6444f67.in +# +attrs==26.1.0 +coverage[toml]==7.10.7 +exceptiongroup==1.3.1 +filelock==3.19.1 +hypothesis==6.45.0 +iniconfig==2.1.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.2.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==8.4.2 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.0.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/6fb24b4.txt b/.riot/requirements/6fb24b4.txt new file mode 100644 index 00000000000..5b1f15e607d --- /dev/null +++ b/.riot/requirements/6fb24b4.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/6fb24b4.in +# +attrs==26.1.0 +coverage[toml]==7.10.7 +exceptiongroup==1.3.1 +filelock==3.19.1 +fsspec==2025.10.0 +hypothesis==6.45.0 +iniconfig==2.1.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.2.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==8.4.2 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.13.1 +tomli==2.4.1 +torch==2.6.0 +typing-extensions==4.15.0 diff --git a/.riot/requirements/7878a79.txt b/.riot/requirements/7878a79.txt new file mode 100644 index 00000000000..d8530ff6e9f --- /dev/null +++ b/.riot/requirements/7878a79.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate 
.riot/requirements/7878a79.in +# +attrs==26.1.0 +coverage[toml]==7.10.7 +exceptiongroup==1.3.1 +filelock==3.19.1 +fsspec==2025.10.0 +hypothesis==6.45.0 +iniconfig==2.1.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.2.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==8.4.2 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.7.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/a9c7746.txt b/.riot/requirements/a9c7746.txt new file mode 100644 index 00000000000..29922b16883 --- /dev/null +++ b/.riot/requirements/a9c7746.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate --output-file=.riot/requirements/a9c7746.txt .riot/requirements/a9c7746.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.1 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.2.2 +typing-extensions==4.15.0 diff --git a/.riot/requirements/afdf8ce.txt b/.riot/requirements/afdf8ce.txt new file mode 100644 index 00000000000..bd5643910cb --- /dev/null +++ b/.riot/requirements/afdf8ce.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/afdf8ce.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +exceptiongroup==1.3.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.4.2 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 
+sortedcontainers==2.4.0 +sympy==1.13.1 +tomli==2.4.1 +torch==2.6.0 +typing-extensions==4.15.0 diff --git a/.riot/requirements/b77de6a.txt b/.riot/requirements/b77de6a.txt new file mode 100644 index 00000000000..d90b034e1ab --- /dev/null +++ b/.riot/requirements/b77de6a.txt @@ -0,0 +1,29 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/b77de6a.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +exceptiongroup==1.3.1 +filelock==3.29.1 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.4.2 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.0.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/d300b85.txt b/.riot/requirements/d300b85.txt new file mode 100644 index 00000000000..b40dd8fd361 --- /dev/null +++ b/.riot/requirements/d300b85.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/d300b85.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +exceptiongroup==1.3.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.4.2 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.7.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/d598449.txt b/.riot/requirements/d598449.txt new file mode 100644 index 00000000000..7bfe1c31637 --- /dev/null +++ b/.riot/requirements/d598449.txt @@ -0,0 +1,31 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile 
--allow-unsafe --no-annotate .riot/requirements/d598449.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.10.0 +typing-extensions==4.15.0 + +# The following packages are considered to be unsafe in a requirements file: +setuptools==82.0.1 diff --git a/.riot/requirements/dc250d4.txt b/.riot/requirements/dc250d4.txt new file mode 100644 index 00000000000..7d82d707c06 --- /dev/null +++ b/.riot/requirements/dc250d4.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/dc250d4.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.13.1 +torch==2.5.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/e321c89.txt b/.riot/requirements/e321c89.txt new file mode 100644 index 00000000000..1157c900764 --- /dev/null +++ b/.riot/requirements/e321c89.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/e321c89.in +# +attrs==26.1.0 +coverage[toml]==7.14.1 +filelock==3.29.3 +fsspec==2026.4.0 +hypothesis==6.45.0 +iniconfig==2.3.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.6.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==9.0.3 +pytest-cov==7.1.0 
+pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +torch==2.7.1 +typing-extensions==4.15.0 diff --git a/.riot/requirements/efc40e8.txt b/.riot/requirements/efc40e8.txt new file mode 100644 index 00000000000..b394baf2d6c --- /dev/null +++ b/.riot/requirements/efc40e8.txt @@ -0,0 +1,30 @@ +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile --allow-unsafe --no-annotate .riot/requirements/efc40e8.in +# +attrs==26.1.0 +coverage[toml]==7.10.7 +exceptiongroup==1.3.1 +filelock==3.19.1 +fsspec==2025.10.0 +hypothesis==6.45.0 +iniconfig==2.1.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mock==5.2.0 +mpmath==1.3.0 +networkx==3.2.1 +opentracing==2.4.0 +packaging==26.2 +pluggy==1.6.0 +pygments==2.20.0 +pytest==8.4.2 +pytest-cov==7.1.0 +pytest-mock==3.15.1 +sortedcontainers==2.4.0 +sympy==1.14.0 +tomli==2.4.1 +torch==2.2.2 +typing-extensions==4.15.0 diff --git a/ddtrace/_monkey.py b/ddtrace/_monkey.py index 180cc194d87..f672fb48b82 100644 --- a/ddtrace/_monkey.py +++ b/ddtrace/_monkey.py @@ -109,6 +109,7 @@ "anthropic": True, "crewai": True, "pydantic_ai": True, + "pytorch": False, "vllm": True, "mlflow": config._model_lab_enabled, "subprocess": True, @@ -175,6 +176,7 @@ "langgraph.prebuilt", ), "openai_agents": ("agents",), + "pytorch": ("torch",), } _NOT_PATCHABLE_VIA_ENVVAR = {"ddtrace_api"} diff --git a/ddtrace/contrib/internal/pytorch/__init__.py b/ddtrace/contrib/internal/pytorch/__init__.py new file mode 100644 index 00000000000..e07b96bd3d0 --- /dev/null +++ b/ddtrace/contrib/internal/pytorch/__init__.py @@ -0,0 +1,46 @@ +""" +The pytorch integration traces PyTorch distributed training jobs. + +Always-on: a single long-lived ``pytorch.rank`` span is emitted per rank. 
+Tags: ``rank``, ``world_size``, ``framework`` (DDP / FSDP / DeepSpeed),
+``launcher``, ``torch.distributed.backend``, ``training_job_id``
+(auto-resolved from ``RAY_JOB_ID``, ``TORCHELASTIC_RUN_ID``,
+``KUBEFLOW_TRAINING_JOB_ID``, ``SLURM_JOB_ID``, or a per-rank UUID),
+and Ray Train run context when running under Ray Train.
+
+
+Enabling
+~~~~~~~~
+
+The PyTorch integration is **opt-in**. Enable explicitly via::
+
+    DD_PATCH_MODULES=pytorch:true
+
+or programmatically::
+
+    import ddtrace
+    ddtrace.patch(pytorch=True)
+
+
+Global configuration
+~~~~~~~~~~~~~~~~~~~~
+
+.. py:data:: ddtrace.config.pytorch["service"]
+
+   The service name reported by default for pytorch spans.
+
+   This option can also be set with the ``DD_PYTORCH_SERVICE`` environment variable.
+
+   Default: ``"pytorch"``
+
+"""
+
+from ddtrace import config
+
+
+config._add(  # type: ignore[no-untyped-call]
+    "pytorch",
+    {
+        "_default_service": "pytorch",
+    },
+)
diff --git a/ddtrace/contrib/internal/pytorch/_c_tracer.py b/ddtrace/contrib/internal/pytorch/_c_tracer.py
new file mode 100644
index 00000000000..3fa4e718a14
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/_c_tracer.py
@@ -0,0 +1,169 @@
+"""ctypes bridge to the dd-trace-c global parent context API.
+
+The C tracer is injected into the process via LD_PRELOAD by the Datadog
+injection layer — dd-trace-py does not load it. All public functions here
+are silent no-ops when the C tracer is not present.
+"""
+
+import ctypes
+from typing import Any
+from typing import Callable
+from typing import Optional
+
+from ddtrace.internal.logger import get_logger
+
+
+log = get_logger(__name__)
+
+_lib: Optional[ctypes.CDLL] = None
+_loaded: bool = False
+_set_fn: Optional[Callable[..., None]] = None
+_clear_fn: Optional[Callable[[], None]] = None
+_step_begin_fn: Optional[Callable[[], None]] = None
+_step_end_fn: Optional[Callable[[], None]] = None
+
+
+def _bind_void(lib: ctypes.CDLL, name: str, argtypes: list[Any]) -> Callable[..., None]:
+    """Look up *name* in *lib* and declare it as a ``void`` function taking *argtypes*.
+
+    Raises ``AttributeError`` when *lib* does not export the symbol.
+    """
+    fn = getattr(lib, name)
+    fn.restype = None
+    fn.argtypes = argtypes
+    return fn
+
+
+def _load() -> bool:
+    """Bind to C tracer symbols already in the process namespace.
+
+    The lookup is attempted once per process; later calls return the cached outcome.
+
+    Returns True when the parent-context API was bound, False otherwise.
+    """
+    global _lib, _loaded, _set_fn, _clear_fn, _step_begin_fn, _step_end_fn
+    if _loaded:
+        return _lib is not None
+    _loaded = True
+
+    try:
+        # ctypes.CDLL(None) opens the global symbol table, which includes any
+        # library injected via LD_PRELOAD — no explicit library loading needed.
+        lib = ctypes.CDLL(None)
+        set_fn = _bind_void(
+            lib,
+            "dd_set_global_parent_context",
+            [
+                ctypes.c_uint64,  # trace_id (low 64 bits)
+                ctypes.c_uint64,  # trace_id_hi (high 64 bits)
+                ctypes.c_uint64,  # span_id
+                ctypes.c_bool,  # has_sampling_priority
+                ctypes.c_int,  # sampling_priority
+                ctypes.POINTER(ctypes.c_char_p),  # keys
+                ctypes.POINTER(ctypes.c_char_p),  # values
+                ctypes.c_size_t,  # count
+            ],
+        )
+        clear_fn = _bind_void(lib, "dd_clear_global_parent_context", [])
+    except (AttributeError, OSError, TypeError):
+        # AttributeError: C tracer not present in this process — no-op path.
+        # OSError / TypeError: the global symbol table cannot be opened on this
+        # platform (e.g. Windows rejects ctypes.CDLL(None)). Public callers are
+        # documented as never raising, so treat it as "C tracer absent" too.
+        return False
+
+    # Publish only once every required symbol has resolved, so the module
+    # never holds a half-bound parent-context API.
+    _set_fn, _clear_fn = set_fn, clear_fn
+
+    # Step signals — only available in C tracer builds that include training.c.
+    # Silently absent means the heuristic NCCL-group-marker fallback activates.
+    try:
+        begin_fn = _bind_void(lib, "dd_training_step_begin", [])
+        end_fn = _bind_void(lib, "dd_training_step_end", [])
+    except AttributeError:
+        pass
+    else:
+        # Begin/end only make sense as a pair: bind both or neither.
+        _step_begin_fn, _step_end_fn = begin_fn, end_fn
+
+    _lib = lib
+    return True
+
+
+def set_parent_context(span: Any, open_kwargs: dict[str, Any]) -> None:
+    """Register *span* as the process-wide parent for all C-tracer root spans.
+
+    *open_kwargs* supplies the ``training_job_id``, ``rank``, ``world_size`` and
+    ``framework`` values forwarded to the C tracer as string tags; missing keys
+    fall back to ``""``, ``0``, ``1`` and ``"none"`` respectively.
+
+    No-op when the C tracer is not present. Never raises.
+    """
+    if not _load() or _set_fn is None:
+        return
+    try:
+        trace_id = span.trace_id
+        span_id = ctypes.c_uint64(span.span_id)
+        trace_id_lo = ctypes.c_uint64(trace_id & 0xFFFFFFFFFFFFFFFF)
+        trace_id_hi = ctypes.c_uint64((trace_id >> 64) & 0xFFFFFFFFFFFFFFFF)
+
+        priority = getattr(getattr(span, "context", None), "sampling_priority", None)
+        has_priority = ctypes.c_bool(priority is not None)
+        c_priority = ctypes.c_int(int(priority) if priority is not None else 0)
+
+        # C API uses underscore-separated keys; Python span tags use dot-separated
+        # (e.g. "training_job.id"). These are intentionally different namespaces.
+        tags = {
+            "training_job_id": str(open_kwargs.get("training_job_id") or ""),
+            "rank": str(open_kwargs.get("rank", 0)),
+            "world_size": str(open_kwargs.get("world_size", 1)),
+            "framework": str(open_kwargs.get("framework") or "none"),
+            "service": str(getattr(span, "service", None) or ""),
+        }
+        keys_enc = [k.encode() for k in tags]
+        vals_enc = [v.encode() for v in tags.values()]
+        ArrType = ctypes.c_char_p * len(tags)
+
+        _set_fn(
+            trace_id_lo,
+            trace_id_hi,
+            span_id,
+            has_priority,
+            c_priority,
+            ArrType(*keys_enc),
+            ArrType(*vals_enc),
+            ctypes.c_size_t(len(tags)),
+        )
+    except Exception:
+        log.debug("pytorch: dd_set_global_parent_context failed", exc_info=True)
+
+
+def clear_parent_context() -> None:
+    """Clear the process-wide parent context. No-op when C tracer is absent. Never raises."""
+    if not _load() or _clear_fn is None:
+        return
+    try:
+        _clear_fn()
+    except Exception:
+        log.debug("pytorch: dd_clear_global_parent_context failed", exc_info=True)
+
+
+def step_begin() -> None:
+    """Signal start of a training step (forward pass begins). No-op when C tracer absent. Never raises."""
+    if not _load() or _step_begin_fn is None:
+        return
+    try:
+        _step_begin_fn()
+    except Exception:
+        log.debug("pytorch: dd_training_step_begin failed", exc_info=True)
+
+
+def step_end() -> None:
+    """Signal end of a training step (optimizer step complete). No-op when C tracer absent. Never raises."""
+    if not _load() or _step_end_fn is None:
+        return
+    try:
+        _step_end_fn()
+    except Exception:
+        log.debug("pytorch: dd_training_step_end failed", exc_info=True)
diff --git a/ddtrace/contrib/internal/pytorch/_device.py b/ddtrace/contrib/internal/pytorch/_device.py
new file mode 100644
index 00000000000..0f62e7137bd
--- /dev/null
+++ b/ddtrace/contrib/internal/pytorch/_device.py
@@ -0,0 +1,313 @@
+"""Device-id discovery for pytorch.rank span tagging.
+
+Device id is a fleet-stable dimension (GPU UUID, not rank). Custom-metric
+cardinality stays bounded by physical fleet size regardless of job count;
+job attribution lives on the rank-root span via training_job.id and rank.
+"""
+
+import os
+import socket
+import threading
+from typing import Any
+from typing import NamedTuple
+from typing import Optional
+
+from ddtrace.internal.settings import env
+
+
+class DeviceInfo(NamedTuple):
+    device_id: str
+    device_index: Optional[int]
+    kind: str  # "cuda" | "cpu"
+    hostname: str
+    # New fields — defensive Optional[...] because older torch versions may
+    # not expose all of them, and CPU-only hosts return None for all.
+    gpu_name: Optional[str] = None
+    gpu_compute_capability: Optional[str] = None  # e.g.
def _cuda_visible_entries() -> list[str]:
    """Return the non-empty, stripped entries of CUDA_VISIBLE_DEVICES in order.

    Returns an empty list when the variable is unset, empty, or set to the
    ``NoDevFiles`` sentinel. Entry positions are preserved so a CUDA-visible
    ordinal can be used directly as an index into the result.
    """
    raw = env.get("CUDA_VISIBLE_DEVICES") or ""
    if not raw or raw == "NoDevFiles":
        return []
    return [item.strip() for item in raw.split(",") if item.strip()]


def _cuda_visible_to_physical(visible_idx: int) -> int:
    """Map a CUDA-visible device index to the physical NVML index.

    When CUDA_VISIBLE_DEVICES remaps or subsets GPUs, the CUDA-visible index
    (used by LOCAL_RANK / torch.cuda.current_device) differs from the physical
    GPU index that NVML requires.

    The entry *at* ``visible_idx`` is inspected — entries are not filtered
    first — so a mixed list such as ``"GPU-abc,3"`` keeps ordinal 1 mapped to
    physical index 3 instead of shifting it to ordinal 0.

    Args:
        visible_idx: Device ordinal as seen by CUDA in this process.

    Returns:
        The physical index listed at that position, or ``visible_idx``
        unchanged when the variable is unset, the ordinal is out of range
        (including negative), or the entry is not a non-negative integer
        (e.g. a ``GPU-...`` UUID).
    """
    entries = _cuda_visible_entries()
    if 0 <= visible_idx < len(entries):
        entry = entries[visible_idx]
        # isascii() guards against Unicode digits that isdigit() accepts but
        # int() rejects.
        if entry.isascii() and entry.isdigit():
            return int(entry)
    return visible_idx


def _cuda_visible_uuid_at(visible_idx: int) -> Optional[str]:
    """Return the UUID at visible_idx in CUDA_VISIBLE_DEVICES (GPU-... / MIG-... format), or None.

    The entry at position ``visible_idx`` must itself be a UUID; integer
    entries elsewhere in the list do not shift the lookup. Out-of-range and
    negative ordinals return None.
    """
    entries = _cuda_visible_entries()
    if 0 <= visible_idx < len(entries):
        entry = entries[visible_idx]
        if entry.upper().startswith(("GPU-", "MIG-")):
            return entry
    return None
+ try: + import pynvml + + pynvml.nvmlInit() + try: + handle = pynvml.nvmlDeviceGetHandleByIndex(_cuda_visible_to_physical(idx)) + raw = pynvml.nvmlDeviceGetUUID(handle) + return raw.decode() if isinstance(raw, bytes) else str(raw) + finally: + try: + pynvml.nvmlShutdown() + except Exception: # nosec B110 + pass + except Exception: # nosec B110 + pass + try: + import torch + + props = torch.cuda.get_device_properties(idx) + uuid = getattr(props, "uuid", None) + if uuid is not None: + return str(uuid) + except Exception: # nosec B110 + pass + return None + + +def _query_cuda_props(idx: int) -> dict[str, Any]: + """Best-effort fetch of additional device fields from + `torch.cuda.get_device_properties(idx)`. Returns a dict with only the + fields we managed to read; missing fields are omitted. + """ + out: dict[str, Any] = {} + try: + import torch # noqa: PLC0415 + + props = torch.cuda.get_device_properties(idx) + except Exception: + return out + name = getattr(props, "name", None) + if name: + out["gpu_name"] = str(name) + major = getattr(props, "major", None) + minor = getattr(props, "minor", None) + if major is not None and minor is not None: + out["gpu_compute_capability"] = f"{int(major)}.{int(minor)}" + sm = getattr(props, "multi_processor_count", None) + if sm is not None: + try: + out["gpu_sm_count"] = int(sm) + except Exception: # nosec B110 + pass + total = getattr(props, "total_memory", None) + if total is not None: + try: + out["gpu_total_memory_bytes"] = int(total) + except Exception: # nosec B110 + pass + return out + + +def _query_cuda_driver_version() -> Optional[str]: + try: + import pynvml # noqa: PLC0415 + + pynvml.nvmlInit() + try: + raw = pynvml.nvmlSystemGetDriverVersion() + return raw.decode() if isinstance(raw, bytes) else str(raw) + finally: + try: + pynvml.nvmlShutdown() + except Exception: # nosec B110 + pass + except Exception: + return None + + +def _hostname() -> str: + try: + return socket.gethostname() + except Exception: + return 
# Per-GPU peak FLOPs by dtype, in FLOPS (not TFLOPS).
# Maintenance: add new GPUs here as needed. Values from official datasheets.
_PEAK_FLOPS_TABLE: dict[tuple[str, str], float] = {
    # NVIDIA H100 SXM5 / PCIe — figures for tensor cores
    ("H100", "bfloat16"): 989e12,
    ("H100", "float16"): 989e12,
    ("H100", "tf32"): 495e12,
    ("H100", "float32"): 67e12,
    # NVIDIA A100 SXM4 / PCIe
    ("A100", "bfloat16"): 312e12,
    ("A100", "float16"): 312e12,
    ("A100", "tf32"): 156e12,
    ("A100", "float32"): 19.5e12,
    # NVIDIA L40 / L4 — Ada Lovelace. fp16 shares bf16 tensor-core path;
    # fp32 is the non-tensor ALU peak per datasheet.
    ("L40", "bfloat16"): 181e12,
    ("L40", "float16"): 181e12,
    ("L40", "float32"): 90.5e12,
    ("L4", "bfloat16"): 121e12,
    ("L4", "float16"): 121e12,
    ("L4", "float32"): 30.3e12,
    # NVIDIA V100
    ("V100", "float16"): 125e12,
    ("V100", "float32"): 15.7e12,
    # NVIDIA T4
    ("T4", "float16"): 65e12,
    ("T4", "float32"): 8.1e12,
    # AMD MI300X — CDNA3 matrix peaks per datasheet; fp32 is the vector ALU peak.
    ("MI300", "bfloat16"): 1300e12,
    ("MI300", "float16"): 1300e12,
    ("MI300", "float32"): 163.4e12,
}


def lookup_peak_flops(gpu_name: Optional[str], dtype: str) -> Optional[float]:
    """Best-effort lookup of the peak FLOPS for a GPU model and dtype.

    Each table key is matched as a model token inside ``gpu_name``: it must
    not be preceded by a letter or digit and must not be followed by a digit.
    A trailing letter or separator is allowed, so ``"L40S"``, ``"MI300X"`` and
    ``"A100-SXM4-80GB"`` still match, while ``"RTX A1000"`` no longer matches
    ``A100`` and ``"T400"`` no longer matches ``T4``. The result also no
    longer depends on ``L40`` being listed before ``L4``.

    Args:
        gpu_name: Device name as reported by the driver/framework, or None.
        dtype: Table dtype key, e.g. ``"bfloat16"``, ``"float16"``, ``"tf32"``.

    Returns:
        Peak FLOPS as a float, or None when the name is empty or no table
        entry matches both the model and the dtype.
    """
    if not gpu_name:
        return None
    import re  # noqa: PLC0415

    for (model, table_dtype), peak in _PEAK_FLOPS_TABLE.items():
        if table_dtype != dtype:
            continue
        pattern = r"(?<![A-Za-z0-9])" + re.escape(model) + r"(?![0-9])"
        if re.search(pattern, gpu_name):
            return peak
    return None
+""" + +import contextvars +import threading +from typing import Any +from typing import Optional + +import torch + +from ddtrace.contrib.internal.pytorch._utils import get_cached_job_id +from ddtrace.contrib.internal.pytorch._utils import job_id_env_set +from ddtrace.contrib.internal.pytorch._utils import resolve_job_id_from_env +from ddtrace.contrib.internal.pytorch._utils import set_cached_job_id +from ddtrace.contrib.internal.trace_utils import unwrap as _unwrap +from ddtrace.contrib.internal.trace_utils import wrap as _wrap +from ddtrace.internal import core +from ddtrace.internal import forksafe +from ddtrace.internal.logger import get_logger +from ddtrace.internal.settings import env + + +log = get_logger(__name__) + +_no_env_job_id_warned: bool = False +_installed: bool = False +_optimizer_wrapped: bool = False +_fsdp_hook_registered: bool = False +_deepspeed_hook_registered: bool = False + +# Tracks the active ExecutionContext for the current distributed training session. +# Presence (non-None) doubles as the "bootstrapped" flag. +# AIDEV-NOTE: ContextVar is per-thread — safe because init/destroy_process_group always run on the same thread in DDP. +_rank_ctx: contextvars.ContextVar[Optional[core.ExecutionContext[Any]]] = contextvars.ContextVar( + "pytorch_rank_ctx", default=None +) + +_cached_distributed_backend: Optional[str] = None + + +def _step_profiling_enabled() -> bool: + return env.get("DD_TRAINING_STEP_PROFILING", "false").lower() in ("true", "1") + + +# Wire-format env var names set by the Ray contrib on worker processes. +# AIDEV-NOTE: duplicated from ddtrace.contrib.internal.ray intentionally — +# contrib-to-contrib imports break isolation (ray contrib may not be installed). +# If these names ever change, update both sides. 
+_RAY_SUBMISSION_ID_ENV = "_RAY_SUBMISSION_ID" +_RAY_JOB_NAME_ENV = "_RAY_JOB_NAME" +_RAY_RUN_METADATA_ENV = "_DD_RAY_RUN_METADATA" + + +def _reset_child_state() -> None: + global \ + _no_env_job_id_warned, \ + _cached_distributed_backend, \ + _fsdp_hook_registered, \ + _deepspeed_hook_registered, \ + _optimizer_wrapped + ctx = _rank_ctx.get() + if ctx is not None: + _rank_ctx.set(None) + # AIDEV-NOTE: Deferred imports + manual reset — import system may be unsafe post-fork. + try: + from ddtrace.internal.core import _CURRENT_CONTEXT # noqa: PLC0415 + from ddtrace.internal.core import ROOT_CONTEXT_ID # noqa: PLC0415 + from ddtrace.internal.core import ExecutionContext # noqa: PLC0415 + + _CURRENT_CONTEXT.set(ExecutionContext(ROOT_CONTEXT_ID)) + except Exception: # nosec B110 + pass + _no_env_job_id_warned = False + _cached_distributed_backend = None + _fsdp_hook_registered = False + _deepspeed_hook_registered = False + _optimizer_wrapped = False + + +forksafe.register(_reset_child_state) + + +def _distributed_available() -> bool: + try: + return bool(torch.distributed.is_available()) + except Exception: + return False + + +def _get_cached_backend() -> Optional[str]: + """One-shot lookup of ``torch.distributed.get_backend()``. Caches the + result on first successful call. The backend (nccl/gloo/mpi) does not + change during the lifetime of a process group. 
def _populate_ray_run_metadata() -> None:
    """Read Ray-set env vars into the run-metadata cache so _tag_ray_run_context can find them.

    Reads the submission id, the job name and a JSON metadata blob from the
    environment. The blob is used only when it decodes to a JSON object:
    the consumer iterates it with ``.items()``, so lists, strings or numbers
    are discarded rather than cached. Decode failures are logged at debug
    level and otherwise ignored. The cache is left untouched when none of
    the three values is present.
    """
    sub = env.get(_RAY_SUBMISSION_ID_ENV)
    rn = env.get(_RAY_JOB_NAME_ENV)
    md_json = env.get(_RAY_RUN_METADATA_ENV)
    metadata: dict[str, Any] = {}
    if md_json:
        try:
            import json  # noqa: PLC0415

            decoded = json.loads(md_json)
        except Exception:
            log.debug("pytorch: failed to decode %s", _RAY_RUN_METADATA_ENV, exc_info=True)
        else:
            if isinstance(decoded, dict):
                metadata = decoded
            else:
                log.debug(
                    "pytorch: ignoring %s: expected a JSON object, got %s",
                    _RAY_RUN_METADATA_ENV,
                    type(decoded).__name__,
                )
    if sub or rn or metadata:
        from ddtrace.contrib.internal.pytorch._utils import set_cached_run_metadata  # noqa: PLC0415

        set_cached_run_metadata(submission_id=sub, run_name=rn, metadata=metadata or None)
+ """ + global _no_env_job_id_warned + + cached = get_cached_job_id() + env_id_present = job_id_env_set() + job_id = cached or resolve_job_id_from_env() + + rank: int = 0 + world_size: int = 1 + try: + if _distributed_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + except Exception: + log.exception("pytorch: failed to capture rank/world_size; defaulting to single-rank") + + publishable_job_id: Optional[str] = job_id + if not cached and not env_id_present: + publishable_job_id = None + if not _no_env_job_id_warned: + log.warning( + "pytorch: no shared training job id resolved from env " + "(DD_PYTORCH_JOB_ID, RAY_JOB_ID, TORCHELASTIC_RUN_ID, " + "KUBEFLOW_TRAINING_JOB_ID, SLURM_JOB_ID). Cross-rank trace " + "correlation will be DISABLED for this run — spans will not " + "carry the training_job.id tag." + ) + _no_env_job_id_warned = True + + if publishable_job_id is not None: + set_cached_job_id(publishable_job_id, is_default=True) + + _populate_ray_run_metadata() + + from ddtrace.contrib.internal.pytorch import _device # noqa: PLC0415 + from ddtrace.contrib.internal.pytorch import _rank_root # noqa: PLC0415 + + try: + _device.discover(local_rank=rank) + except Exception: + log.exception("pytorch: device discovery failed") + + try: + _rank_root.open_rank_span( + rank=rank, + world_size=world_size, + framework="none", + training_job_id=publishable_job_id, + ) + except Exception: + log.exception("pytorch: rank-root span open failed") + + if _step_profiling_enabled(): + try: + from ddtrace.contrib.internal.pytorch import _c_tracer # noqa: PLC0415 + + _c_tracer.step_begin() + except Exception: + log.debug("pytorch: step_begin after bootstrap failed", exc_info=True) + + +def _wrapped_init_process_group(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any: + already = _rank_ctx.get() is not None + + result = wrapped(*args, **kwargs) # let exceptions propagate; do NOT open 
def _wrapped_destroy_process_group(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
    """Wrap ``torch.distributed.destroy_process_group`` to close the rank span.

    The wrapped function always runs and its return value or exception is
    passed through unchanged. When the default (WORLD) group is the one being
    destroyed, the rank-root span and the ``pytorch.rank`` execution context
    are closed and the cached backend / job id are reset. Subgroup destroys
    leave everything open. Teardown failures are logged at debug level and
    never replace the wrapped call's outcome.
    """
    global _cached_distributed_backend

    group = kwargs.get("group", args[0] if args else None)
    # Classify the target BEFORE destroying it. NOTE(review): in current
    # PyTorch, ``torch.distributed.group.WORLD`` resolves to the live default
    # process group, so after the default group is torn down an explicitly
    # passed WORLD handle would no longer be identical to it and the rank
    # span would never be closed — confirm against the supported torch range.
    closes_world = _is_world_group(group)
    try:
        return wrapped(*args, **kwargs)
    finally:
        # Close the rank span only when the default (WORLD) process group is
        # destroyed. Subgroup destroys must not end the span.
        if closes_world:
            if _step_profiling_enabled():
                try:
                    from ddtrace.contrib.internal.pytorch import _c_tracer  # noqa: PLC0415

                    _c_tracer.step_end()
                except Exception:
                    log.debug("pytorch: step_end before rank-root close failed", exc_info=True)
            try:
                from ddtrace.contrib.internal.pytorch import _rank_root  # noqa: PLC0415

                _rank_root.close()
            except Exception:
                log.debug("pytorch: rank-root close raised", exc_info=True)
            ctx = _rank_ctx.get()
            if ctx is not None:
                try:
                    ctx.dispatch_ended_event()
                    ctx.__exit__(None, None, None)
                except Exception:
                    # Raising from this ``finally`` would mask the wrapped
                    # call's own exception, so only log.
                    log.debug("pytorch: rank context teardown raised", exc_info=True)
                finally:
                    # Always drop the reference so a later
                    # init_process_group can bootstrap a fresh context.
                    _rank_ctx.set(None)
            _cached_distributed_backend = None
            try:
                from ddtrace.contrib.internal.pytorch._utils import set_cached_job_id  # noqa: PLC0415

                set_cached_job_id(None, is_default=True)
            except Exception:  # nosec B110
                pass
**kwargs) + try: + from ddtrace.contrib.internal.pytorch import _rank_root # noqa: PLC0415 + + _rank_root.set_framework("fsdp") + except Exception: + log.debug("pytorch: failed to update rank-root framework tag", exc_info=True) + return result + + +def _install_fsdp() -> None: + # AIDEV-NOTE: defer the import of torch.distributed.fsdp until the user + # actually imports it. Eagerly importing it pulls _dynamo + sympy (~1.3s + # startup cost) for every DDP workload that never uses FSDP. + global _fsdp_hook_registered + if _fsdp_hook_registered: + return + from wrapt import register_post_import_hook + + def _do_install(_module: object) -> None: + if not _installed: + return + try: + import torch.distributed.fsdp as _fsdp # noqa: PLC0415 + + if hasattr(_fsdp.FullyShardedDataParallel.__init__, "__wrapped__"): + return + _wrap( + "torch.distributed.fsdp", + "FullyShardedDataParallel.__init__", + _wrapped_fsdp_init, + ) + except Exception: + log.exception("pytorch: failed to install FSDP wrapper") + + register_post_import_hook(_do_install, "torch.distributed.fsdp") + _fsdp_hook_registered = True + + +def _uninstall_fsdp() -> None: + try: + from torch.distributed.fsdp import FullyShardedDataParallel + except Exception: + return + try: + _unwrap(FullyShardedDataParallel, "__init__") + except Exception: + log.debug("pytorch: failed to unwrap FSDP.__init__", exc_info=True) + + +def _wrapped_deepspeed_init(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any: + result = wrapped(*args, **kwargs) + try: + from ddtrace.contrib.internal.pytorch import _rank_root # noqa: PLC0415 + + _rank_root.set_framework("deepspeed") + except Exception: + log.debug("pytorch: failed to update rank-root framework tag", exc_info=True) + return result + + +def _install_deepspeed() -> None: + global _deepspeed_hook_registered + if _deepspeed_hook_registered: + return + from wrapt import register_post_import_hook + + def _do_install(deepspeed: object) -> None: + if not _installed: + return + if 
def _wrapped_optimizer_step(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
    """Bracket an optimizer step with C-tracer step signals.

    Emits ``step_end`` (step N's optimizer phase ends) before the wrapped
    call and ``step_begin`` (step N+1's forward starts) after it. The begin
    signal sits in a ``finally`` so it is re-issued even when the wrapped
    step raises; otherwise a caller that catches the error and keeps training
    would send the next ``step_end`` without a matching begin. The wrapped
    call's return value or exception is passed through unchanged.

    NOTE(review): this wraps ``torch.optim.Optimizer.step``; concrete
    optimizers usually define their own ``step``, so verify the wrapper
    actually fires for built-in optimizers (global optimizer step hooks may
    be the more reliable attachment point).
    """
    try:
        from ddtrace.contrib.internal.pytorch import _c_tracer  # noqa: PLC0415
    except Exception:
        # Instrumentation must never break the user's optimizer step.
        log.debug("pytorch: C tracer bridge unavailable in Optimizer.step wrapper", exc_info=True)
        return wrapped(*args, **kwargs)

    _c_tracer.step_end()  # close step N: optimizer phase ends
    try:
        return wrapped(*args, **kwargs)
    finally:
        _c_tracer.step_begin()  # open step N+1: forward starts
_wrap("torch.distributed", "destroy_process_group", _wrapped_destroy_process_group) + _install_ddp() + _install_fsdp() + _install_deepspeed() + _install_optimizer_step() + # Late-patch bootstrap: if init_process_group was called before patch(), + # our wrapper will never fire. Run bootstrap now. + if _distributed_available(): + try: + if torch.distributed.is_initialized() and _rank_ctx.get() is None: + ctx = core.context_with_data("pytorch.rank", _dispatch_end_event=False) # type: ignore[no-untyped-call] + ctx.__enter__() + _rank_ctx.set(ctx) + _bootstrap_distributed() + except Exception: + log.exception("pytorch: late-patch bootstrap failed") + + +def uninstall() -> None: + global _installed, _fsdp_hook_registered, _deepspeed_hook_registered + if _installed: + _installed = False + if _distributed_available(): + for fn in ("destroy_process_group", "init_process_group"): + if hasattr(torch.distributed, fn): + try: + _unwrap(torch.distributed, fn) + except Exception: + log.debug("pytorch: failed to unwrap torch.distributed.%s", fn, exc_info=True) + _uninstall_ddp() + _uninstall_fsdp() + _fsdp_hook_registered = False + _uninstall_deepspeed() + _deepspeed_hook_registered = False + _uninstall_optimizer_step() + try: + from ddtrace.contrib.internal.pytorch import _device as _device_mod # noqa: PLC0415 + + _device_mod._cache = None + except Exception: # nosec B110 + pass + try: + from ddtrace.contrib.internal.pytorch._utils import clear_cached_run_metadata # noqa: PLC0415 + + clear_cached_run_metadata() + except Exception: # nosec B110 + pass + try: + from ddtrace.contrib.internal.pytorch import _rank_root # noqa: PLC0415 + + _rank_root.close() + except Exception: + log.debug("pytorch: rank-root close raised in uninstall", exc_info=True) + ctx = _rank_ctx.get() + if ctx is not None: + ctx.dispatch_ended_event() + ctx.__exit__(None, None, None) + _rank_ctx.set(None) + try: + from ddtrace.contrib.internal.pytorch import _utils as _utils_mod # noqa: PLC0415 + + 
_utils_mod._default_job_id = None + _utils_mod._tls_job_id = threading.local() + except Exception: + log.debug("pytorch: failed to reset cached job id on uninstall", exc_info=True) + global _no_env_job_id_warned, _cached_distributed_backend + _no_env_job_id_warned = False + _cached_distributed_backend = None diff --git a/ddtrace/contrib/internal/pytorch/_rank_root.py b/ddtrace/contrib/internal/pytorch/_rank_root.py new file mode 100644 index 00000000000..f40845f9c79 --- /dev/null +++ b/ddtrace/contrib/internal/pytorch/_rank_root.py @@ -0,0 +1,393 @@ +"""Per-rank lifetime span for PyTorch distributed training. + +Emits one ``pytorch.rank`` span per rank, open for the lifetime of the +distributed process group. Carries ``rank``, ``world_size``, ``framework``, +``training_job.id``, and device tags. + +The span is rotated every ``_rotation_interval_s`` seconds (default 600) +so partial data is visible during long runs. Rotated spans carry +``_dd.was_long_running=1``. +""" + +import atexit +import threading +from typing import Any +from typing import Optional + +from ddtrace import config +from ddtrace import tracer +from ddtrace.contrib.internal.pytorch import _c_tracer +from ddtrace.contrib.internal.pytorch import _device +from ddtrace.contrib.internal.trace_utils import int_service +from ddtrace.internal import forksafe +from ddtrace.internal.logger import get_logger +from ddtrace.internal.settings import env +from ddtrace.internal.threads import Lock + + +log = get_logger(__name__) + +_lock = Lock() +_span: Optional[Any] = None +_atexit_registered = False +_rotation_interval_s: int = 600 +_rotation_timer: Optional[threading.Timer] = None +_open_kwargs: dict[str, Any] = {} + + +def _build_span(kwargs: dict[str, Any]) -> Optional[Any]: + rank = kwargs["rank"] + world_size = kwargs["world_size"] + framework = kwargs["framework"] + training_job_id = kwargs["training_job_id"] + try: + span = tracer.start_span( + "pytorch.rank", + service=int_service(None, config.pytorch, 
default="pytorch"), + child_of=tracer.current_span() if framework == "ray" else None, + activate=False, + ) + except Exception: + log.debug("pytorch: failed to open pytorch.rank span", exc_info=True) + return None + try: + span._set_attribute("rank", int(rank)) + span._set_attribute("world_size", int(world_size)) + span.set_tag("framework", framework or "none") + span.set_tag("component", "pytorch") + span.set_tag("debug.level", "0") + if training_job_id: + span.set_tag("training_job.id", training_job_id) + span.set_tag("job_id", training_job_id) + # Force-keep: losing this per-rank anchor to base sampling permanently + # breaks workload attribution via the span's time range. + span.set_tag("manual.keep") + info = _device.get() + if info is not None: + span.set_tag("device.id", info.device_id) + span.set_tag("device.kind", info.kind) + span.set_tag("host", info.hostname) + if info.device_index is not None: + span._set_attribute("device.index", info.device_index) + if info.gpu_name: + span.set_tag("device.gpu.name", info.gpu_name) + if info.gpu_compute_capability: + span.set_tag("device.gpu.compute_capability", info.gpu_compute_capability) + if info.gpu_sm_count is not None: + span._set_attribute("device.gpu.sm_count", info.gpu_sm_count) + if info.gpu_total_memory_bytes is not None: + span._set_attribute("device.gpu.total_memory_bytes", info.gpu_total_memory_bytes) + if info.gpu_driver_version: + span.set_tag("device.gpu.driver_version", info.gpu_driver_version) + + try: + import torch # noqa: PLC0415 + + torch_ver = getattr(torch, "__version__", "") or "" + if torch_ver: + span.set_tag("torch.version", str(torch_ver)) + cuda_ver = getattr(getattr(torch, "version", None), "cuda", None) + if cuda_ver: + span.set_tag("torch.cuda.version", str(cuda_ver)) + hip_ver = getattr(getattr(torch, "version", None), "hip", None) + if hip_ver: + span.set_tag("torch.cuda.hip_version", str(hip_ver)) + try: + nccl_ver = torch.cuda.nccl.version() + if isinstance(nccl_ver, tuple) and 
nccl_ver: + span.set_tag("torch.cuda.nccl_version", ".".join(str(p) for p in nccl_ver)) + except Exception: # nosec B110 + pass + cudnn = getattr(getattr(torch, "backends", None), "cudnn", None) + if cudnn is not None: + try: + span.set_tag("torch.cudnn.enabled", "true" if bool(cudnn.enabled) else "false") + except Exception: # nosec B110 + pass + try: + span.set_tag("torch.cudnn.benchmark", "true" if bool(cudnn.benchmark) else "false") + except Exception: # nosec B110 + pass + try: + span.set_tag( + "torch.cudnn.deterministic", + "true" if bool(cudnn.deterministic) else "false", + ) + except Exception: # nosec B110 + pass + try: + v = cudnn.version() + if isinstance(v, int): + span._set_attribute("torch.cudnn.version", v) + except Exception: # nosec B110 + pass + try: + prec = torch.get_float32_matmul_precision() + if prec: + span.set_tag("torch.float32_matmul_precision", str(prec)) + except Exception: # nosec B110 + pass + try: + if torch.backends.mps.is_available(): + span.set_tag("torch.mps.available", "true") + except Exception: # nosec B110 + pass + except Exception: + log.debug("pytorch.rank: torch invariants tagging failed", exc_info=True) + + try: + for envvar, tag in ( + ("NCCL_DEBUG", "nccl.debug"), + ("NCCL_SOCKET_IFNAME", "nccl.socket_ifname"), + ("NCCL_IB_DISABLE", "nccl.ib_disable"), + ("NCCL_P2P_DISABLE", "nccl.p2p_disable"), + ("NCCL_ALGO", "nccl.algo"), + ("NCCL_PROTO", "nccl.proto"), + ("TORCH_NCCL_ASYNC_ERROR_HANDLING", "nccl.async_error_handling"), + ("CUDA_VISIBLE_DEVICES", "device.cuda.visible_devices"), + ("MASTER_ADDR", "pytorch.master_addr"), + ): + val = env.get(envvar) + if val: + span.set_tag(tag, str(val)) + for envvar, facet in ( + ("LOCAL_RANK", "pytorch.local_rank"), + ("LOCAL_WORLD_SIZE", "pytorch.local_world_size"), + ("GROUP_RANK", "pytorch.group_rank"), + ("GROUP_WORLD_SIZE", "pytorch.group_world_size"), + ("MASTER_PORT", "pytorch.master_port"), + ): + val = env.get(envvar) + if val: + try: + span._set_attribute(facet, int(val)) 
+ except Exception: # nosec B110 + pass + except Exception: + log.debug("pytorch.rank: env-signal tagging failed", exc_info=True) + + try: + from ddtrace.contrib.internal.pytorch._distributed import _detect_launcher # noqa: PLC0415 + from ddtrace.contrib.internal.pytorch._distributed import _get_cached_backend # noqa: PLC0415 + + launcher = _detect_launcher() + if launcher: + span.set_tag("launcher", launcher) + backend = _get_cached_backend() + if backend: + span.set_tag("torch.distributed.backend", backend) + except Exception: + log.debug("pytorch.rank: launcher/backend tagging failed", exc_info=True) + + _tag_ray_run_context(span) + except Exception: + log.debug("pytorch: failed to tag pytorch.rank span", exc_info=True) + return span + + +def _tag_ray_run_context(span: Any) -> None: + """Apply Ray Train run-context tags from the pytorch-utils cache. Best-effort and idempotent.""" + try: + from ddtrace.contrib.internal.pytorch._utils import get_cached_run_metadata # noqa: PLC0415 + + rm = get_cached_run_metadata() + rn = rm.get("run_name") + sub = rm.get("submission_id") + md = rm.get("metadata") or {} + if rn: + span.set_tag("ray.train.run_name", rn) + if sub: + span.set_tag("ray.submission_id", sub) + for k, v in md.items(): + try: + span.set_tag(f"ray.metadata.{k}", str(v)) + except Exception: + log.debug("pytorch.rank: failed to set metadata tag %s", k, exc_info=True) + except Exception: + log.debug("pytorch.rank: failed to apply Ray run metadata", exc_info=True) + + +def retag_ray_run_context() -> None: + """Re-apply Ray Train run-context tags to the currently-open ``pytorch.rank`` span. + + Called immediately after the pytorch-utils run-metadata cache is populated + so the long-running rank span carries ``ray.submission_id`` for its full + lifetime rather than only at close (when the cache may have been cleared). + No-op when no rank span is open. 
+ """ + with _lock: + span = _span + if span is None: + return + try: + _tag_ray_run_context(span) + except Exception: + log.debug("pytorch.rank: retag_ray_run_context failed", exc_info=True) + + +def _schedule_rotation() -> None: + """Start the next rotation timer. Must be called while holding _lock.""" + global _rotation_timer + t = threading.Timer(_rotation_interval_s, _rotate_span) + t.daemon = True + t.name = "dd-pytorch-rank-rotation" + t.start() + _rotation_timer = t + + +def _rotate_span() -> None: + """Finish the current rank span and open a fresh one. Called by the rotation timer.""" + global _span, _rotation_timer + + with _lock: + old_span = _span + if old_span is None: + return + + new_span = _build_span(_open_kwargs) + + with _lock: + if _span is not old_span: + # Span was closed or replaced while we were building — discard. + if new_span is not None: + try: + new_span.finish() + except Exception: # nosec B110 + pass + return + _span = new_span + _schedule_rotation() + + # Point C tracer at the new span BEFORE finishing the old one — + # ensures no gap in coverage for GPU-level root spans. + if new_span is not None: + _c_tracer.set_parent_context(new_span, _open_kwargs) + + try: + old_span.set_tag("_dd.was_long_running", 1) + _tag_ray_run_context(old_span) + old_span.finish() + except Exception: + log.debug("pytorch: span rotation finish failed", exc_info=True) + + try: + _safe_flush(tracer) + except Exception: + log.debug("pytorch: span rotation flush failed", exc_info=True) + + +def open_rank_span(rank: int, world_size: int, framework: str, training_job_id: Optional[str]) -> None: + """Open the per-rank lifetime span. 
Idempotent — second call is a no-op.""" + global _span, _atexit_registered, _open_kwargs + with _lock: + if _span is not None: + return + if not _atexit_registered: + atexit.register(close) + _atexit_registered = True + # Set _open_kwargs under lock so the rotation timer always sees a + # consistent snapshot — _rotate_span reads it outside the lock. + _open_kwargs = { + "rank": rank, + "world_size": world_size, + "framework": framework, + "training_job_id": training_job_id, + } + + new_span = _build_span(_open_kwargs) + + won_race = False + with _lock: + if _span is None: + _span = new_span + won_race = True + _schedule_rotation() + else: + # Lost the race to another concurrent open_rank_span() — discard. + if new_span is not None: # type: ignore[unreachable] + try: + new_span.finish() + except Exception: # nosec B110 + pass + + if won_race and new_span is not None: + _c_tracer.set_parent_context(new_span, _open_kwargs) + + +def set_framework(name: str) -> None: + """Update the ``framework`` tag on the open ``pytorch.rank`` span.""" + if not name: + return + with _lock: + span = _span + _open_kwargs["framework"] = name # keep rotation in sync + if span is None: + return + try: + span.set_tag("framework", name) + except Exception: + log.debug("pytorch: failed to set framework tag", exc_info=True) + _c_tracer.set_parent_context(span, _open_kwargs) + + +def close() -> None: + """Finish the per-rank span. Safe to call when no span is open.""" + global _span, _atexit_registered, _rotation_timer + with _lock: + span = _span + _span = None + timer = _rotation_timer + _rotation_timer = None + if _atexit_registered: + try: + atexit.unregister(close) + except Exception: # nosec B110 + pass + _atexit_registered = False + + if timer is not None: + timer.cancel() + + if span is None: + return + try: + _tag_ray_run_context(span) + span.finish() + # Flush in a daemon thread so close() never stalls the caller + # (e.g. destroy_process_group). 
The thread is best-effort: it is a + # daemon and is never joined, so when close() runs from atexit the + # interpreter may shut down before the flush completes. + threading.Thread( + target=lambda: _safe_flush(tracer), + name="dd-pytorch-rank-root-flush", + daemon=True, + ).start() + except Exception: + log.exception("pytorch: rank-root span close failed") + finally: + _c_tracer.clear_parent_context() + + +def _safe_flush(_tracer: Any) -> None: + try: + _tracer.flush() + except Exception: + log.debug("pytorch: tracer.flush during rank-root close raised", exc_info=True) + + +def _reset_child_state() -> None: + # Clear inherited state; timer threads do not survive fork. + global _span, _lock, _atexit_registered, _rotation_timer, _open_kwargs + _span = None + _lock = Lock() + _atexit_registered = False + _rotation_timer = None + _open_kwargs = {} + # Clear C tracer parent pointer — child must not inherit a dangling span ref. + try: + _c_tracer.clear_parent_context() + except Exception: # nosec B110 + pass + + +forksafe.register(_reset_child_state) diff --git a/ddtrace/contrib/internal/pytorch/_test_helpers.py b/ddtrace/contrib/internal/pytorch/_test_helpers.py new file mode 100644 index 00000000000..a41b126cc57 --- /dev/null +++ b/ddtrace/contrib/internal/pytorch/_test_helpers.py @@ -0,0 +1,56 @@ +"""Test-only helpers for the PyTorch integration. + +Previously exposed as ``_*_for_tests`` symbols on the production submodules; +moved here so the production modules don't carry test-only API surface. 
+Import as:: + + from ddtrace.contrib.internal.pytorch import _test_helpers as th +""" + +from typing import Any +from typing import Optional + + +def reset_metrics_state() -> None: + pass + + +def current_rank_span() -> Optional[Any]: + from ddtrace.contrib.internal.pytorch import _rank_root + + return _rank_root._span + + +def close_rank_root() -> None: + """Force-close the rank-root span and reset module state (test isolation).""" + from ddtrace.contrib.internal.pytorch import _rank_root + + with _rank_root._lock: + span = _rank_root._span + _rank_root._span = None + _rank_root._atexit_registered = False + timer = _rank_root._rotation_timer + _rank_root._rotation_timer = None + _rank_root._open_kwargs = {} + if timer is not None: + try: + timer.cancel() + except Exception: # nosec B110 + pass + if span is not None: + try: + span.finish() + except Exception: # nosec B110 + pass + + +def set_atexit_registered(value: bool) -> None: + from ddtrace.contrib.internal.pytorch import _rank_root + + _rank_root._atexit_registered = value + + +def reset_device_cache() -> None: + from ddtrace.contrib.internal.pytorch import _device + + _device._cache = None diff --git a/ddtrace/contrib/internal/pytorch/_utils.py b/ddtrace/contrib/internal/pytorch/_utils.py new file mode 100644 index 00000000000..028546c244b --- /dev/null +++ b/ddtrace/contrib/internal/pytorch/_utils.py @@ -0,0 +1,223 @@ +import os +import threading +import types as _types_mp +from typing import Any +from typing import Optional +import uuid + +from ddtrace.internal.logger import get_logger +from ddtrace.internal.settings import env + + +log = get_logger(__name__) + +_JOB_ID_ENV_CHAIN = ( + "DD_PYTORCH_JOB_ID", # explicit user override — wins over all launchers + "RAY_JOB_ID", # Ray Train / Tune — preferred so Ray-driven traces are consistent + "TORCHELASTIC_RUN_ID", # torch.distributed.elastic / torchrun + "KUBEFLOW_TRAINING_JOB_ID", # Kubeflow Training Operator + "SLURM_JOB_ID", # SLURM +) + +# Generous 
limit avoids silent intake truncation; strip whitespace from scheduler IDs. +_JOB_ID_MAX_LEN = 200 + +# `job_id` is a legacy alias kept for dashboard back-compat. +TRAINING_JOB_ID_TAG = "training_job.id" + +# Process-wide job-id cache; thread-local overrides take precedence. +_default_job_id: Optional[str] = None + +# Thread-local override for multi-worker isolation. +_tls_job_id = threading.local() + + +def set_cached_job_id(value: Optional[str], *, is_default: bool = False) -> None: + """Cache the resolved training job id so non-distributed emitters can tag spans. + + Pass ``is_default=True`` only from ``_distributed._bootstrap_distributed`` to seed + the process-wide default. Other callers write only to a thread-local override so + concurrent Ray Train workers with different job ids don't trample each other. + """ + global _default_job_id + _tls_job_id.value = value + if is_default: + _default_job_id = value + + +def get_cached_job_id() -> Optional[str]: + val: Optional[str] = getattr(_tls_job_id, "value", None) + if val is not None: + return val + return _default_job_id + + +def get_rank() -> int: + """Return the current process rank from the RANK env var; falls back to 0.""" + try: + rank = env.get("RANK") + if rank: + return int(rank) + except Exception: # nosec B110 + pass + return 0 + + +def set_training_job_id_tag(span: Any) -> None: + """Tag ``span`` with training_job.id/job_id, manual.keep, and Ray run-context tags. + + Never raises; tag-setting failures are swallowed because instrumentation + must not crash user code. 
+ """ + job_id = get_cached_job_id() + if job_id: + try: + span.set_tag(TRAINING_JOB_ID_TAG, job_id) + span.set_tag("job_id", job_id) + except Exception: + log.debug("pytorch: failed to set training_job.id tag", exc_info=True) + try: + span.set_tag("manual.keep") + except Exception: + log.debug("pytorch: failed to set manual.keep tag", exc_info=True) + try: + rm = get_cached_run_metadata() + sub = rm.get("submission_id") + if sub: + span.set_tag("ray.submission_id", sub) + md = rm.get("metadata") or {} + job_name = md.get("job_name") + if job_name: + span.set_tag("ray.metadata.job_name", str(job_name)) + except Exception: + log.debug("pytorch: failed to propagate ray run metadata to step span", exc_info=True) + + +def resolve_job_id_from_env() -> str: + """Walk the job-id env-var chain (DD_PYTORCH_JOB_ID → RAY_JOB_ID → … → UUID fallback).""" + for var in _JOB_ID_ENV_CHAIN: + raw = env.get(var) + if not raw: + continue + value = raw.strip() + if not value: + continue + return str(value[:_JOB_ID_MAX_LEN]) + return str(uuid.uuid4()) + + +def job_id_env_set() -> bool: + """True iff at least one env var in ``_JOB_ID_ENV_CHAIN`` is set to a non-empty value.""" + for var in _JOB_ID_ENV_CHAIN: + raw = env.get(var) + if raw and raw.strip(): + return True + return False + + +# Ray Train run-context cache. Writers use _run_metadata_lock; readers use the lock-free view. +_run_metadata: dict[str, Any] = {} +_run_metadata_lock = threading.Lock() +_run_metadata_view: _types_mp.MappingProxyType[str, Any] = _types_mp.MappingProxyType[str, Any]({}) + + +def _publish_view_locked() -> None: + """Rebuild and atomically replace `_run_metadata_view`. Caller MUST + hold `_run_metadata_lock`. 
+ """ + global _run_metadata_view + raw: dict[str, Any] = {} + rn = _run_metadata.get("run_name") + sub = _run_metadata.get("submission_id") + md = _run_metadata.get("metadata") or {} + if rn is not None: + raw["run_name"] = rn + if sub is not None: + raw["submission_id"] = sub + if md: + raw["metadata"] = _types_mp.MappingProxyType[str, Any](dict(md)) + _run_metadata_view = _types_mp.MappingProxyType[str, Any](raw) + + +def set_cached_run_metadata( + *, + run_name: Optional[str] = None, + submission_id: Optional[str] = None, + metadata: Optional[dict[str, Any]] = None, +) -> None: + """Update the run-metadata cache. None values leave the existing + entry intact (partial updates compose). + """ + with _run_metadata_lock: + if run_name is not None: + _run_metadata["run_name"] = run_name + if submission_id is not None: + _run_metadata["submission_id"] = submission_id + if metadata is not None: + _run_metadata["metadata"] = dict(metadata) + _publish_view_locked() + + +def get_cached_run_metadata() -> "_types_mp.MappingProxyType[str, Any]": + """Lock-free read of the latest published snapshot. + + Returns a `MappingProxyType[str, Any]` — read-only at runtime. Callers that + need a mutable dict should `dict(get_cached_run_metadata())`. + """ + return _run_metadata_view + + +def clear_cached_run_metadata() -> None: + """Public helper for tests and worker-restore paths that need to + fully reset the cache. + """ + with _run_metadata_lock: + _run_metadata.clear() + _publish_view_locked() + + +def get_run_metadata_snapshot() -> dict[str, Any]: + """Return a snapshot suitable for later restore. Copies the + nested `metadata` dict (shallow copy) so callers cannot add or remove keys in the live cache. 
+ """ + with _run_metadata_lock: + return { + "run_name": _run_metadata.get("run_name"), + "submission_id": _run_metadata.get("submission_id"), + "metadata": dict(_run_metadata.get("metadata") or {}), + } + + +def restore_run_metadata_snapshot(snapshot: dict[str, Any]) -> None: + """Replace the cache with a previously taken snapshot. None fields + are cleared (unconditional overwrite — unlike `set_cached_*`). + """ + with _run_metadata_lock: + _run_metadata.clear() + rn = snapshot.get("run_name") + sub = snapshot.get("submission_id") + md = snapshot.get("metadata") + if rn is not None: + _run_metadata["run_name"] = rn + if sub is not None: + _run_metadata["submission_id"] = sub + if md is not None: + _run_metadata["metadata"] = dict(md) + _publish_view_locked() + + +def _reset_child_state() -> None: + global _run_metadata, _run_metadata_lock, _run_metadata_view + _run_metadata = {} + _run_metadata_lock = threading.Lock() + _run_metadata_view = _types_mp.MappingProxyType[str, Any]({}) + global _default_job_id + _default_job_id = None + try: + _tls_job_id.value = None + except AttributeError: + pass + + +if hasattr(os, "register_at_fork"): + os.register_at_fork(after_in_child=_reset_child_state) diff --git a/ddtrace/contrib/internal/pytorch/patch.py b/ddtrace/contrib/internal/pytorch/patch.py new file mode 100644 index 00000000000..ad24000d73a --- /dev/null +++ b/ddtrace/contrib/internal/pytorch/patch.py @@ -0,0 +1,45 @@ +import torch + +from ddtrace.internal.logger import get_logger +from ddtrace.internal.utils.version import parse_version + + +log = get_logger(__name__) + +TORCH_VERSION = parse_version(str(getattr(torch, "__version__", ""))) + + +def get_version() -> str: + # torch.__version__ is a `TorchVersion` (a str subclass); the contrib test + # harness checks `type(version) == str`, so cast to a plain str here. 
+ return str(getattr(torch, "__version__", "")) + + +def _supported_versions() -> dict[str, str]: + return {"torch": ">=2.0"} + + +def patch() -> None: + if getattr(torch, "_datadog_patch", False): + return + if TORCH_VERSION < (2, 0, 0) or TORCH_VERSION >= (3, 0, 0): + log.warning( + "pytorch: torch version %s is not supported (supported: >=2.0,<3.0); skipping instrumentation", + torch.__version__, + ) + return + torch._datadog_patch = True + # Imported inside patch() so the module-level import of `_distributed` + # doesn't pull in `torch.distributed.*` symbols at module import time. + from ddtrace.contrib.internal.pytorch import _distributed + + _distributed.install() + + +def unpatch() -> None: + if not getattr(torch, "_datadog_patch", False): + return + torch._datadog_patch = False + from ddtrace.contrib.internal.pytorch import _distributed + + _distributed.uninstall() diff --git a/ddtrace/internal/settings/_config.py b/ddtrace/internal/settings/_config.py index 0b3701da402..1aa0cb6fb21 100644 --- a/ddtrace/internal/settings/_config.py +++ b/ddtrace/internal/settings/_config.py @@ -204,6 +204,7 @@ "openai_agents", "mcp", "mlflow", + "pytorch", "ray", "aiokafka", "google_cloud_pubsub", diff --git a/ddtrace/internal/settings/_supported_configurations.py b/ddtrace/internal/settings/_supported_configurations.py index 8912cb27a09..78ec618b542 100644 --- a/ddtrace/internal/settings/_supported_configurations.py +++ b/ddtrace/internal/settings/_supported_configurations.py @@ -405,6 +405,8 @@ "DD_PYTEST_SERVICE", "DD_PYTEST_USE_NEW_PLUGIN", "DD_PYTEST_USE_NEW_PLUGIN_BETA", + "DD_PYTORCH_JOB_ID", + "DD_PYTORCH_SERVICE", "DD_RAY_SERVICE", "DD_REDISCLUSTER_CMD_MAX_LENGTH", "DD_REDISCLUSTER_SERVICE", @@ -609,6 +611,7 @@ "DD_TRACE_PYRAMID_ENABLED", "DD_TRACE_PYTEST_BDD_ENABLED", "DD_TRACE_PYTEST_ENABLED", + "DD_TRACE_PYTORCH_ENABLED", "DD_TRACE_RATE_LIMIT", "DD_TRACE_RAY_ARGS_KWARGS", "DD_TRACE_RAY_CORE_API", @@ -661,6 +664,7 @@ "DD_TRACE_WSGI_ENABLED", 
"DD_TRACE_X_DATADOG_TAGS_MAX_LENGTH", "DD_TRACE_YAAREDIS_ENABLED", + "DD_TRAINING_STEP_PROFILING", "DD_UNITTEST_OPERATION_NAME", "DD_UNITTEST_SERVICE", "DD_UNLOAD_MODULES_FROM_SITECUSTOMIZE", @@ -778,6 +782,7 @@ "_DD_PROFILING_STACK_MAX_THREADS", "_DD_PYTEST_XDIST_INFERRED_SERVICE", "_DD_PY_SSI_INJECT", + "_DD_RAY_RUN_METADATA", "_DD_REMOTE_CONFIGURATION_ADDITIONAL_HEADERS", "_DD_REMOTE_CONFIGURATION_LOG_PAYLOADS", "_DD_REMOTE_CONFIGURATION_SKIP_SHUTDOWN", @@ -896,6 +901,7 @@ "DD_PYRAMID_SERVICE": ["DD_PYRAMID_SERVICE_NAME"], "DD_PYTEST_BDD_SERVICE": ["DD_PYTEST_BDD_SERVICE_NAME"], "DD_PYTEST_SERVICE": ["DD_PYTEST_SERVICE_NAME"], + "DD_PYTORCH_SERVICE": ["DD_PYTORCH_SERVICE_NAME"], "DD_RAY_SERVICE": ["DD_RAY_SERVICE_NAME"], "DD_REDISCLUSTER_SERVICE": ["DD_REDISCLUSTER_SERVICE_NAME"], "DD_REDIS_SERVICE": ["DD_REDIS_SERVICE_NAME"], diff --git a/releasenotes/notes/pytorch-rank-span-8c74d5227b0d2953.yaml b/releasenotes/notes/pytorch-rank-span-8c74d5227b0d2953.yaml new file mode 100644 index 00000000000..17e46fbb5b6 --- /dev/null +++ b/releasenotes/notes/pytorch-rank-span-8c74d5227b0d2953.yaml @@ -0,0 +1,11 @@ +--- +features: + - | + pytorch: Adds a ``pytorch.rank`` lifetime span for PyTorch distributed training. + The span opens at ``init_process_group`` and closes at ``destroy_process_group`` + or process exit. Tags include ``rank``, ``world_size``, ``framework`` + (DDP / FSDP / DeepSpeed), ``launcher``, ``torch.distributed.backend``, and + ``training_job_id`` (resolved from launcher environment variables). + When running under Ray Train, ``ray.train.run_name``, ``ray.submission_id``, + and ``ray.metadata.*`` are also applied. + Enable with ``DD_PATCH_MODULES=pytorch:true``. 
diff --git a/riotfile.py b/riotfile.py index 36785a001ae..621ccb4bf56 100644 --- a/riotfile.py +++ b/riotfile.py @@ -3151,6 +3151,36 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT ), ], ), + Venv( + name="pytorch", + command="pytest {cmdargs} tests/contrib/pytorch", + venvs=[ + Venv( + pys=select_pys(min_version="3.9", max_version="3.11"), + pkgs={ + "torch": ["~=2.0.0", "~=2.1.0"], + }, + ), + Venv( + pys=select_pys(min_version="3.9", max_version="3.12"), + pkgs={ + "torch": ["~=2.2.0", "~=2.3.0"], + }, + ), + Venv( + pys=select_pys(min_version="3.9", max_version="3.12"), + pkgs={ + "torch": ["~=2.4.0", "~=2.5.0", "~=2.6.0", "~=2.7.0"], + }, + ), + Venv( + pys=select_pys(min_version="3.12", max_version="3.12"), + pkgs={ + "torch": ["~=2.8.0", "~=2.9.0", "~=2.10.0", "~=2.11.0", "~=2.12.0"], + }, + ), + ], + ), Venv( name="vertexai", command="pytest {cmdargs} tests/contrib/vertexai", diff --git a/scripts/integration_registry/registry.yaml b/scripts/integration_registry/registry.yaml index 06b55ed67aa..93828a2d123 100644 --- a/scripts/integration_registry/registry.yaml +++ b/scripts/integration_registry/registry.yaml @@ -847,6 +847,16 @@ integrations: dependency_names: - pytest_benchmark +- integration_name: pytorch + is_external_package: true + is_tested: true + dependency_names: + - torch + tested_versions_by_dependency: + torch: + min: 2.0.1 + max: 2.12.0 + - integration_name: ray is_external_package: true is_tested: false diff --git a/supported-configurations.json b/supported-configurations.json index 001dbfffe7c..999a6cfbbeb 100644 --- a/supported-configurations.json +++ b/supported-configurations.json @@ -3003,6 +3003,16 @@ "default": "false" } ], + "DD_PYTORCH_SERVICE": [ + { + "implementation": "A", + "type": "string", + "default": "", + "aliases": [ + "DD_PYTORCH_SERVICE_NAME" + ] + } + ], "DD_PYNAMODB_SERVICE": [ { "implementation": "A", @@ -4610,6 +4620,13 @@ "default": "false" } ], + "DD_TRACE_PYTORCH_ENABLED": [ + { + 
"implementation": "A", + "type": "boolean", + "default": "false" + } + ], "DD_TRACE_RAY_ENABLED": [ { "implementation": "A", @@ -5979,7 +5996,31 @@ "version": "B", "type": "boolean", "default": "false", - "propertyKeys": ["trace_stats_computation_experimental_client_obfuscation_enabled"] + "propertyKeys": [ + "trace_stats_computation_experimental_client_obfuscation_enabled" + ] + } + ], + "DD_PYTORCH_JOB_ID": [ + { + "implementation": "A", + "type": "string", + "default": null + } + ], + "_DD_RAY_RUN_METADATA": [ + { + "implementation": "A", + "type": "json", + "default": null, + "internal": true + } + ], + "DD_TRAINING_STEP_PROFILING": [ + { + "implementation": "A", + "type": "boolean", + "default": "false" } ] } diff --git a/supported_versions_output.json b/supported_versions_output.json index 9cc34d8b69b..df815217cd5 100644 --- a/supported_versions_output.json +++ b/supported_versions_output.json @@ -6,6 +6,13 @@ "max_tracer_supported": "3.1.0", "auto-instrumented": false }, + { + "dependency": "botocore", + "integration": "aiobotocore", + "minimum_tracer_supported": "1.15.32", + "max_tracer_supported": "1.42.19", + "auto-instrumented": false + }, { "dependency": "aiohttp", "integration": "aiohttp", @@ -171,8 +178,8 @@ { "dependency": "botocore", "integration": "botocore", - "minimum_tracer_supported": "1.34.49", - "max_tracer_supported": "1.38.26", + "minimum_tracer_supported": "1.15.32", + "max_tracer_supported": "1.42.19", "pinned": "true", "auto-instrumented": true }, @@ -656,6 +663,14 @@ "pinned": "true", "auto-instrumented": false }, + { + "dependency": "torch", + "integration": "pytorch", + "minimum_tracer_supported": "2.0.1", + "max_tracer_supported": "2.12.0", + "pinned": "true", + "auto-instrumented": false + }, { "dependency": "ray", "integration": "ray", diff --git a/supported_versions_table.csv b/supported_versions_table.csv index eb4ab4ed9db..ce241443f7c 100644 --- a/supported_versions_table.csv +++ b/supported_versions_table.csv @@ -1,5 +1,6 @@ 
dependency,integration,minimum_tracer_supported,max_tracer_supported,auto-instrumented aiobotocore,aiobotocore,1.0.7,3.1.0,False +botocore,aiobotocore,1.15.32,1.42.19,False aiohttp,aiohttp,3.7.4.post0,3.14.0,True aiohttp-jinja2,aiohttp_jinja2,1.5.1,1.6,True aiohttp_jinja2,aiohttp_jinja2,1.5.1,1.6,True @@ -22,7 +23,7 @@ azure-eventhub,azure_eventhubs *,5.12.2,5.15.0,True azure-functions,azure_functions *,1.10.1,2.0.0,True azure-servicebus,azure_servicebus *,7.14.2,7.14.2,True boto3,botocore *,1.34.49,1.38.26,True -botocore,botocore *,1.34.49,1.38.26,True +botocore,botocore *,1.15.32,1.42.19,True bottle,bottle,0.12.25,0.13.4,True celery,celery,5.5.3,5.5.3,True cherrypy,cherrypy,17.0.0,18.10.0,False @@ -91,6 +92,7 @@ pyodbc,pyodbc,4.0.39,5.3.0,True pyramid,pyramid,1.10.8,2.0.2,True pytest,pytest,6.2.5,9.0.3,False pytest-bdd,pytest_bdd *,4.1.0,6.0.1,False +torch,pytorch *,2.0.1,2.12.0,False ray,ray *,2.46.0,2.49.2,False redis,redis,4.6.0,6.4.0,True redis-py-cluster,rediscluster,2.0.0,2.1.3,True diff --git a/tests/contrib/pytorch/__init__.py b/tests/contrib/pytorch/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/contrib/pytorch/conftest.py b/tests/contrib/pytorch/conftest.py new file mode 100644 index 00000000000..201ffb140d7 --- /dev/null +++ b/tests/contrib/pytorch/conftest.py @@ -0,0 +1,47 @@ +"""Shared fixtures for the pytorch integration test suite.""" + +from unittest import mock + +import pytest + +from ddtrace.contrib.internal.pytorch import _device +from ddtrace.contrib.internal.pytorch import _distributed +from ddtrace.contrib.internal.pytorch import _test_helpers as _th + + +# Non-deterministic tags to ignore in snapshot tests (follow Ray's pattern). 
+PYTORCH_SNAPSHOT_IGNORES = [ + "meta.tracer_version", + "meta.runtime-id", + "metrics._dd.top_level", + "metrics._dd.tracer_kr", + "metrics._sampling_priority_v1", + "metrics.process_id", + "name", + "resource", + "service", + "start", + "duration", +] + + +@pytest.fixture +def pytorch_clean_state(): + """Reset rank-root span, device cache, and distributed context. + + Compose into autouse fixtures in each test module. Resets _rank_ctx so + tests that call into _distributed directly don't leak ExecutionContext + across test boundaries. + """ + _distributed._rank_ctx.set(None) + _th.reset_device_cache() + _th.close_rank_root() + with ( + mock.patch.object(_device, "_cuda_is_available", return_value=False), + mock.patch.object(_device, "_hostname", return_value="h-9"), + ): + _device.discover(local_rank=0) + yield + _th.close_rank_root() + _th.reset_device_cache() + _distributed._rank_ctx.set(None) diff --git a/tests/contrib/pytorch/test_c_tracer.py b/tests/contrib/pytorch/test_c_tracer.py new file mode 100644 index 00000000000..6703c706d3e --- /dev/null +++ b/tests/contrib/pytorch/test_c_tracer.py @@ -0,0 +1,321 @@ +import sys +from unittest import mock + +import pytest + +from ddtrace.contrib.internal.pytorch import _rank_root + + +def _fresh_module(): + """Import _c_tracer with a clean module cache so _loaded resets.""" + sys.modules.pop("ddtrace.contrib.internal.pytorch._c_tracer", None) + from ddtrace.contrib.internal.pytorch import _c_tracer + + return _c_tracer + + +def _make_fake_lib(): + lib = mock.MagicMock() + lib.dd_set_global_parent_context = mock.MagicMock() + lib.dd_set_global_parent_context.restype = None + lib.dd_clear_global_parent_context = mock.MagicMock() + lib.dd_clear_global_parent_context.restype = None + return lib + + +def _make_absent_lib(): + """A lib handle where the C tracer symbols are not present.""" + lib = mock.MagicMock() + type(lib).dd_set_global_parent_context = mock.PropertyMock(side_effect=AttributeError) + return lib + + +def 
_make_fake_span(trace_id=0xDEADBEEF00000001, span_id=0xCAFE, sampling_priority=1, service="pytorch"): + span = mock.Mock() + span.trace_id = trace_id + span.span_id = span_id + span.service = service + span.context.sampling_priority = sampling_priority + return span + + +# --------------------------------------------------------------------------- +# _load() — symbol presence determines no-op vs. active path +# --------------------------------------------------------------------------- + + +def test_set_parent_context_no_op_when_library_absent(): + mod = _fresh_module() + mod._loaded = False + with mock.patch("ctypes.CDLL", return_value=_make_absent_lib()): + fake_span = mock.Mock() + fake_span.trace_id = 0xABC + fake_span.span_id = 0x123 + fake_span.context.sampling_priority = None + mod.set_parent_context(fake_span, {"rank": 0, "world_size": 1, "framework": "ddp", "training_job_id": "j1"}) + + +def test_clear_parent_context_no_op_when_library_absent(): + mod = _fresh_module() + mod._loaded = False + with mock.patch("ctypes.CDLL", return_value=_make_absent_lib()): + mod.clear_parent_context() + + +def test_load_uses_global_symbol_table(): + """_load() calls ctypes.CDLL(None) — no library path, no discovery.""" + mod = _fresh_module() + mod._loaded = False + fake_lib = _make_fake_lib() + with mock.patch("ctypes.CDLL", return_value=fake_lib) as mock_cdll: + mod._load() + mock_cdll.assert_called_once_with(None) + + +# --------------------------------------------------------------------------- +# Correct C function dispatch +# --------------------------------------------------------------------------- + + +def test_set_parent_context_calls_c_function(): + mod = _fresh_module() + mod._loaded = False + fake_lib = _make_fake_lib() + with mock.patch("ctypes.CDLL", return_value=fake_lib): + span = _make_fake_span(trace_id=0x00000001_DEADBEEF, span_id=0xCAFE, sampling_priority=2) + mod.set_parent_context(span, {"rank": 3, "world_size": 8, "framework": "ddp", 
"training_job_id": "job-xyz"}) + assert fake_lib.dd_set_global_parent_context.called + + +def test_set_parent_context_128bit_trace_id_split(): + """High 64 bits are correctly separated from low 64 bits.""" + mod = _fresh_module() + mod._loaded = False + fake_lib = _make_fake_lib() + captured = {} + + def capture(*args, **kwargs): + captured["args"] = args + + fake_lib.dd_set_global_parent_context.side_effect = capture + + with mock.patch("ctypes.CDLL", return_value=fake_lib): + trace_id = (0xAAAA << 64) | 0xBBBB + span = _make_fake_span(trace_id=trace_id, span_id=0x1111) + mod.set_parent_context(span, {"rank": 0, "world_size": 1, "framework": "none", "training_job_id": ""}) + lo = captured["args"][0].value + hi = captured["args"][1].value + assert lo == 0xBBBB + assert hi == 0xAAAA + + +def test_clear_parent_context_calls_c_function(): + mod = _fresh_module() + mod._loaded = False + fake_lib = _make_fake_lib() + with mock.patch("ctypes.CDLL", return_value=fake_lib): + mod._load() + mod.clear_parent_context() + assert fake_lib.dd_clear_global_parent_context.called + + +def test_set_parent_context_tag_payload(): + """Verify the 5 expected tags are sent with correct values.""" + mod = _fresh_module() + mod._loaded = False + fake_lib = _make_fake_lib() + captured = {} + + def capture(*args, **kwargs): + captured["args"] = args + + fake_lib.dd_set_global_parent_context.side_effect = capture + + with mock.patch("ctypes.CDLL", return_value=fake_lib): + span = _make_fake_span(trace_id=1, span_id=2, sampling_priority=1) + mod.set_parent_context( + span, + { + "training_job_id": "job-abc", + "rank": 3, + "world_size": 8, + "framework": "fsdp", + }, + ) + + args = captured["args"] + count = args[7].value # c_size_t + assert count == 5 + + keys = [args[5][i].decode() for i in range(count)] + vals = [args[6][i].decode() for i in range(count)] + tag_map = dict(zip(keys, vals)) + + assert tag_map["training_job_id"] == "job-abc" + assert tag_map["rank"] == "3" + assert 
tag_map["world_size"] == "8" + assert tag_map["framework"] == "fsdp" + assert tag_map["service"] == "pytorch" + + +def test_set_parent_context_swallows_exception(): + mod = _fresh_module() + mod._loaded = True + mod._lib = object() + mod._set_fn = mock.Mock(side_effect=RuntimeError("boom")) + mod.set_parent_context(_make_fake_span(), {}) + + +def test_clear_parent_context_swallows_exception(): + mod = _fresh_module() + mod._loaded = True + mod._lib = object() + mod._clear_fn = mock.Mock(side_effect=RuntimeError("boom")) + mod.clear_parent_context() + + +# --------------------------------------------------------------------------- +# Lifecycle integration: _rank_root calls _c_tracer at the right moments. +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=False) +def _fresh_rank_root(): + """Reset _rank_root module state before and after each test.""" + import threading + + _rank_root._span = None + _rank_root._lock = threading.Lock() + _rank_root._atexit_registered = False + if _rank_root._rotation_timer is not None: + _rank_root._rotation_timer.cancel() + _rank_root._rotation_timer = None + _rank_root._open_kwargs = {} + yield + if _rank_root._rotation_timer is not None: + _rank_root._rotation_timer.cancel() + _rank_root._span = None + _rank_root._rotation_timer = None + + +def test_open_rank_span_calls_set_parent_context(_fresh_rank_root): + with ( + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc, + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._build_span", return_value=_make_fake_span()), + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._schedule_rotation"), + ): + _rank_root.open_rank_span(rank=0, world_size=4, framework="ddp", training_job_id="job-1") + assert mc.set_parent_context.called + args = mc.set_parent_context.call_args[0] + assert args[1]["framework"] == "ddp" + assert args[1]["rank"] == 0 + + +def 
test_close_calls_clear_parent_context(_fresh_rank_root): + fake_span = _make_fake_span() + fake_span.finish = mock.Mock() + _rank_root._span = fake_span + with ( + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc, + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._safe_flush"), + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._tag_ray_run_context"), + ): + _rank_root.close() + assert mc.clear_parent_context.called + + +def test_rotate_span_updates_context_before_finishing_old(_fresh_rank_root): + """set_parent_context(new) must be called BEFORE old_span.finish().""" + call_order = [] + new_span = _make_fake_span(trace_id=2, span_id=200) + old_span = _make_fake_span(trace_id=1, span_id=100) + old_span.finish = mock.Mock(side_effect=lambda: call_order.append("finish")) + old_span.set_tag = mock.Mock() + + _rank_root._span = old_span + _rank_root._open_kwargs = {"rank": 0, "world_size": 1, "framework": "ddp", "training_job_id": "j"} + + def fake_set(span, kwargs): + call_order.append(("set", span.span_id)) + + with ( + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._build_span", return_value=new_span), + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._schedule_rotation"), + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._safe_flush"), + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._tag_ray_run_context"), + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc, + ): + mc.set_parent_context.side_effect = fake_set + _rank_root._rotate_span() + + set_idx = next(i for i, x in enumerate(call_order) if isinstance(x, tuple) and x[0] == "set") + fin_idx = call_order.index("finish") + assert set_idx < fin_idx, f"set must precede finish; order={call_order}" + + +def test_close_clears_c_tracer_even_when_finish_raises(_fresh_rank_root): + """clear_parent_context must be called even if span.finish() raises.""" + fake_span = _make_fake_span() + fake_span.finish = 
mock.Mock(side_effect=RuntimeError("finish failed")) + _rank_root._span = fake_span + with ( + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc, + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._safe_flush"), + mock.patch("ddtrace.contrib.internal.pytorch._rank_root._tag_ray_run_context"), + ): + _rank_root.close() # must not raise + assert mc.clear_parent_context.called, "clear must fire even when finish raises" + + +def test_set_framework_updates_c_tracer_context(_fresh_rank_root): + fake_span = _make_fake_span() + _rank_root._span = fake_span + _rank_root._open_kwargs = {"rank": 0, "world_size": 1, "framework": "none", "training_job_id": "j"} + with mock.patch("ddtrace.contrib.internal.pytorch._rank_root._c_tracer") as mc: + _rank_root.set_framework("fsdp") + assert mc.set_parent_context.called + args = mc.set_parent_context.call_args[0] + assert args[1]["framework"] == "fsdp" + + +# --------------------------------------------------------------------------- +# Step signals: step_begin / step_end +# --------------------------------------------------------------------------- + + +def test_step_begin_noop_when_symbol_absent(): + """step_begin() is silent when the C symbol was not bound at load time.""" + mod = _fresh_module() + mod._loaded = True + mod._lib = object() # truthy — looks loaded + mod._step_begin_fn = None + mod.step_begin() # must not raise + + +def test_step_end_noop_when_symbol_absent(): + """step_end() is silent when the C symbol was not bound at load time.""" + mod = _fresh_module() + mod._loaded = True + mod._lib = object() + mod._step_end_fn = None + mod.step_end() # must not raise + + +def test_step_begin_calls_c_symbol(): + mod = _fresh_module() + fn = mock.Mock() + mod._loaded = True + mod._lib = object() + mod._step_begin_fn = fn + mod.step_begin() + assert fn.called + + +def test_step_end_calls_c_symbol(): + mod = _fresh_module() + fn = mock.Mock() + mod._loaded = True + mod._lib = object() + 
mod._step_end_fn = fn + mod.step_end() + assert fn.called diff --git a/tests/contrib/pytorch/test_device.py b/tests/contrib/pytorch/test_device.py new file mode 100644 index 00000000000..a7db96bbff0 --- /dev/null +++ b/tests/contrib/pytorch/test_device.py @@ -0,0 +1,29 @@ +"""Tests for _cuda_visible_to_physical device index remapping.""" + +import os +from unittest import mock + +from ddtrace.contrib.internal.pytorch import _device + + +def test_cuda_visible_to_physical_no_remapping(): + with mock.patch.dict(os.environ, {}, clear=False): + os.environ.pop("CUDA_VISIBLE_DEVICES", None) + assert _device._cuda_visible_to_physical(1) == 1 + + +def test_cuda_visible_to_physical_remapping(): + with mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "2,4,6"}): + assert _device._cuda_visible_to_physical(0) == 2 + assert _device._cuda_visible_to_physical(1) == 4 + + +def test_cuda_visible_to_physical_no_dev_files(): + with mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "NoDevFiles"}): + assert _device._cuda_visible_to_physical(0) == 0 + + +def test_cuda_visible_to_physical_uuid_falls_back(): + with mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "GPU-abc,GPU-def"}): + # UUID entries are not integers; function falls back to visible_idx + assert _device._cuda_visible_to_physical(1) == 1 diff --git a/tests/contrib/pytorch/test_fork_safety.py b/tests/contrib/pytorch/test_fork_safety.py new file mode 100644 index 00000000000..a1401b3b9b7 --- /dev/null +++ b/tests/contrib/pytorch/test_fork_safety.py @@ -0,0 +1,92 @@ +"""pytorch.rank state is reset in ``fork``-ed children so the child can +bootstrap its own rank span without inheriting parent state. + +The remaining checks verify that the rank-root span reference and the +distributed bootstrap state are properly cleared across fork. 
+""" + +import multiprocessing as mp +import os +from unittest import mock + +import pytest + +from ddtrace.contrib.internal.pytorch import _device +from ddtrace.contrib.internal.pytorch import _distributed +from ddtrace.contrib.internal.pytorch import _rank_root +from ddtrace.contrib.internal.pytorch import _test_helpers as _th + + +def _child_assert_fresh(q): + # Verify parent's rank span and distributed bootstrap state were reset. + try: + assert _th.current_rank_span() is None, "rank span leaked into child" + assert _distributed._rank_ctx.get() is None, "_rank_ctx leaked into child" + assert _device._cache is None, "_device._cache leaked into child" + q.put("ok") + except AssertionError as e: + q.put(str(e)) + + +@pytest.mark.skipif(os.name != "posix", reason="fork is POSIX-only") +def test_fork_resets_rank_root_and_bootstrap_state(): + _th.reset_device_cache() + _th.close_rank_root() + with ( + mock.patch.object(_device, "_cuda_is_available", return_value=False), + mock.patch.object(_device, "_hostname", return_value="h-parent"), + ): + _device.discover(local_rank=0) + # Open a rank span and mark distributed as bootstrapped in the parent. + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X") + from ddtrace.internal import core + + fake_ctx = core.context_with_data("pytorch.rank", _dispatch_end_event=False) + fake_ctx.__enter__() + _distributed._rank_ctx.set(fake_ctx) + + ctx = mp.get_context("fork") + q = ctx.Queue() + p = ctx.Process(target=_child_assert_fresh, args=(q,)) + p.start() + p.join(timeout=10) + result = q.get(timeout=1) + + _rank_root.close() + # Restore _distributed._rank_ctx so other tests are not affected. 
+ fake_ctx.dispatch_ended_event() + fake_ctx.__exit__(None, None, None) + _distributed._rank_ctx.set(None) + assert result == "ok", result + + +@pytest.mark.skipif(os.name != "posix", reason="fork is POSIX-only") +def test_run_metadata_cleared_after_fork(tmp_path): + """Use a file marker rather than multiprocessing.Queue: Queue's + feeder thread is fork-unsafe and os._exit skips flush. + """ + import multiprocessing + + from ddtrace.contrib.internal.pytorch import _utils + from ddtrace.contrib.internal.pytorch._utils import get_cached_run_metadata + from ddtrace.contrib.internal.pytorch._utils import set_cached_run_metadata + + set_cached_run_metadata(run_name="parent-run", submission_id="parent-sub", metadata={"k": "v"}) + + marker = tmp_path / "child_metadata.txt" + + def child(path): + snap = get_cached_run_metadata() + path.write_text("EMPTY" if len(snap) == 0 else "STALE:" + repr(dict(snap))) + os._exit(0) + + try: + ctx = multiprocessing.get_context("fork") + p = ctx.Process(target=child, args=(marker,)) + p.start() + p.join(timeout=5) + assert p.exitcode == 0, f"child exited with code {p.exitcode}" + content = marker.read_text() + assert content == "EMPTY", f"child saw stale metadata: {content}" + finally: + _utils.clear_cached_run_metadata() diff --git a/tests/contrib/pytorch/test_long_running_span.py b/tests/contrib/pytorch/test_long_running_span.py new file mode 100644 index 00000000000..55c4aa04645 --- /dev/null +++ b/tests/contrib/pytorch/test_long_running_span.py @@ -0,0 +1,64 @@ +"""Tests for the pytorch.rank span rotation / long-running lifecycle. + +Follows the same pattern as tests/contrib/ray/test_long_running_span.py: +rotation interval is patched to 0 (fires immediately) so tests run +without real 600-second waits. 
+""" + +import time +from unittest import mock + +import pytest + +from ddtrace.contrib.internal.pytorch import _distributed +import ddtrace.contrib.internal.pytorch._rank_root as rr + + +@pytest.fixture(autouse=True) +def _reset(tracer, pytorch_clean_state): # noqa: F811 + """Autouse wrapper: pulls in the shared pytorch_clean_state fixture.""" + + +def test_rotation_fires_and_replaces_span(_reset): + """After _rotation_interval_s elapses the span is replaced.""" + with mock.patch.object(rr, "_rotation_interval_s", 0): # fire immediately + rr.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-1") + first_span = rr._span + time.sleep(0.2) + + second_span = rr._span + assert second_span is not first_span, "span was not rotated" + assert first_span.finished, "old span should be finished after rotation" + assert second_span is not None + + +def test_rotation_tags_old_span_was_long_running(_reset): + """Rotated spans carry _dd.was_long_running=1.""" + with mock.patch.object(rr, "_rotation_interval_s", 0): + rr.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-1") + first_span = rr._span + time.sleep(0.2) + + assert first_span.get_metric("_dd.was_long_running") == 1 + + +def test_close_cancels_rotation_timer(_reset): + """close() cancels the pending rotation timer.""" + rr.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-1") + assert rr._rotation_timer is not None + rr.close() + assert rr._rotation_timer is None + + +def test_subgroup_destroy_does_not_close_rank_span(_reset): + """Destroying a subgroup must not close the pytorch.rank span.""" + rr.open_rank_span(rank=0, world_size=2, framework="ddp", training_job_id="job-1") + original_span = rr._span + + fake_group = object() + with mock.patch("torch.distributed.destroy_process_group") as mock_destroy: + mock_destroy.return_value = None + _distributed._wrapped_destroy_process_group(mock_destroy, None, (fake_group,), {}) + + assert rr._span is 
original_span, "subgroup destroy must not close the rank span" + assert not original_span.finished diff --git a/tests/contrib/pytorch/test_profiler_interaction.py b/tests/contrib/pytorch/test_profiler_interaction.py new file mode 100644 index 00000000000..204f4d83dea --- /dev/null +++ b/tests/contrib/pytorch/test_profiler_interaction.py @@ -0,0 +1,83 @@ +""" +Verify our pytorch integration doesn't interfere with torch.profiler.profile(). +Tests: + 1. profiler runs and captures ops while integration is active + 2. profiler schedule + step_num callback fires correctly + 3. profiler works normally after unpatch +""" + +import pytest +import torch +import torch.nn as nn +import torch.optim as optim + +import ddtrace + + +@pytest.fixture(autouse=True) +def _patch_pytorch(): + ddtrace.patch(pytorch=True) + yield + ddtrace.patch(pytorch=False) + + +@pytest.fixture() +def _simple_model(): + model = nn.Linear(10, 5) + optimizer = optim.SGD(model.parameters(), lr=0.01) + x = torch.randn(4, 10) + y = torch.randn(4, 5) + return model, optimizer, x, y + + +def test_profiler_captures_ops_while_integration_active(_simple_model): + model, optimizer, x, y = _simple_model + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU], + record_shapes=True, + ) as prof: + out = model(x) + loss = nn.functional.mse_loss(out, y) + loss.backward() + optimizer.step() + optimizer.zero_grad() + + events = prof.key_averages() + assert len(events) > 0, "profiler captured no events while integration was active" + + +def test_profiler_schedule_fires_while_integration_active(_simple_model): + model, optimizer, x, y = _simple_model + steps_seen = [] + + def trace_handler(p): + steps_seen.append(p.step_num) + + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU], + schedule=torch.profiler.schedule(wait=0, warmup=0, active=2), + on_trace_ready=trace_handler, + ) as prof: + for _ in range(4): + out = model(x) + loss = nn.functional.mse_loss(out, y) + 
loss.backward() + optimizer.step() + optimizer.zero_grad() + prof.step() + + assert len(steps_seen) > 0, "on_trace_ready never called while integration was active" + + +def test_profiler_works_after_unpatch(_simple_model): + model, optimizer, x, y = _simple_model + ddtrace.patch(pytorch=False) # unpatch early (fixture will also unpatch — idempotent) + + with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU]) as prof: + out = model(x) + loss = nn.functional.mse_loss(out, y) + loss.backward() + optimizer.step() + + events = prof.key_averages() + assert len(events) > 0, "profiler captured no events after unpatch" diff --git a/tests/contrib/pytorch/test_pytorch.py b/tests/contrib/pytorch/test_pytorch.py new file mode 100644 index 00000000000..40919ace734 --- /dev/null +++ b/tests/contrib/pytorch/test_pytorch.py @@ -0,0 +1,180 @@ +"""Integration tests for the pytorch.rank lifetime span. + +These tests exercise the real patch/unpatch cycle with a CPU-only gloo +process group and assert that a ``pytorch.rank`` span is emitted with the +expected tags (rank, world_size, framework, training_job_id). +""" + +import os +import sys + +import pytest +import torch + + +# torch.distributed.init_process_group cannot be called more than once per +# process on torch < 2.1; re-init hangs indefinitely with the gloo backend. 
+pytestmark = pytest.mark.skipif( + tuple(int(x) for x in torch.__version__.split(".")[:2]) < (2, 1), + reason="distributed re-init hangs on torch<2.1", +) + + +@pytest.fixture(autouse=True) +def _isolated(monkeypatch): + """Reset integration state before each test.""" + from ddtrace.contrib.internal.pytorch import _distributed + from ddtrace.contrib.internal.pytorch import _test_helpers as _th + from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch + + _th.close_rank_root() + _th.reset_device_cache() + _distributed._installed = False + _distributed._rank_ctx.set(None) + setattr(__import__("torch"), "_datadog_patch", False) + yield + try: + pt_unpatch() + except Exception: + pass + _th.close_rank_root() + _th.reset_device_cache() + _distributed._rank_ctx.set(None) + + +def _setup_single_rank_gloo(): + os.environ.setdefault("MASTER_ADDR", "127.0.0.1") + os.environ.setdefault("MASTER_PORT", "29555") + os.environ.setdefault("RANK", "0") + os.environ.setdefault("WORLD_SIZE", "1") + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="gloo", rank=0, world_size=1) + + +def _teardown_gloo(): + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + +def test_rank_span_emitted_on_init_process_group(monkeypatch, test_spans): + """patch() + init_process_group emits a ``pytorch.rank`` span with the + correct rank, world_size, framework, and training_job_id tags. + The span is closed by destroy_process_group (the wrapped version) or + unpatch(), so we tear down before inspecting spans. 
+ """ + monkeypatch.setenv("TORCHELASTIC_RUN_ID", "test-run-123") + + from ddtrace.contrib.internal.pytorch.patch import patch as pt_patch + from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch + + pt_patch() + _setup_single_rank_gloo() + try: + _teardown_gloo() + finally: + pt_unpatch() + + spans = test_spans.pop() + rank_spans = [s for s in spans if s.name == "pytorch.rank"] + assert rank_spans, "no pytorch.rank span emitted" + span = rank_spans[0] + assert span.get_metric("rank") == 0 + assert span.get_metric("world_size") == 1 + assert span.get_tag("framework") is not None + assert span.get_tag("training_job.id") is not None + + +def test_rank_span_job_id_from_torchelastic_env(monkeypatch, test_spans): + """When TORCHELASTIC_RUN_ID is set the rank span carries that value as + training_job.id. + """ + monkeypatch.setenv("TORCHELASTIC_RUN_ID", "elastic-run-99") + monkeypatch.delenv("DD_PYTORCH_JOB_ID", raising=False) + + from ddtrace.contrib.internal.pytorch.patch import patch as pt_patch + from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch + + pt_patch() + _setup_single_rank_gloo() + try: + _teardown_gloo() + finally: + pt_unpatch() + + spans = test_spans.pop() + rank_spans = [s for s in spans if s.name == "pytorch.rank"] + assert rank_spans, "no pytorch.rank span emitted" + span = rank_spans[0] + assert span.get_tag("training_job.id") == "elastic-run-99" + + +def test_fsdp_not_eagerly_imported(): + """patch(pytorch=True) must NOT cause torch.distributed.fsdp to land in + sys.modules. Eagerly importing it pulls _dynamo + sympy (~1.3 s startup + overhead) for every DDP workload that never touches FSDP. 
+ """ + for _key in list(sys.modules): + if _key == "torch.distributed.fsdp" or _key.startswith("torch.distributed.fsdp."): + sys.modules.pop(_key) + + from ddtrace.contrib.internal.pytorch.patch import patch as pt_patch + from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch + + try: + pt_patch() + assert "torch.distributed.fsdp" not in sys.modules, ( + "_install_fsdp() imported torch.distributed.fsdp eagerly — convert it to register_post_import_hook" + ) + finally: + pt_unpatch() + + +@pytest.mark.skipif( + tuple(int(x) for x in torch.__version__.split(".")[:2]) >= (2, 6), + reason="torch>=2.6 raises on double FSDP operator registration when fsdp is removed " + "from sys.modules; the hook itself still works (verified by test_fsdp_not_eagerly_imported)", +) +def test_fsdp_wrapper_installed_on_import(): + """After patch(), importing torch.distributed.fsdp should trigger the + post-import hook and wrap FullyShardedDataParallel.__init__. + """ + for _key in list(sys.modules): + if _key == "torch.distributed.fsdp" or _key.startswith("torch.distributed.fsdp."): + sys.modules.pop(_key) + + from ddtrace.contrib.internal.pytorch.patch import patch as pt_patch + from ddtrace.contrib.internal.pytorch.patch import unpatch as pt_unpatch + + try: + pt_patch() + # Trigger the hook by importing the module. + try: + from torch.distributed.fsdp import FullyShardedDataParallel + except ImportError as e: + pytest.skip(f"torch.distributed.fsdp not importable in this environment: {e}") + + assert hasattr(FullyShardedDataParallel.__init__, "__wrapped__"), ( + "FSDP.__init__ was not wrapped after post-import hook fired" + ) + finally: + pt_unpatch() + + +def test_bootstrap_reads_ray_env_vars(monkeypatch): + """_bootstrap_distributed() must populate the run-metadata cache from + Ray-set env vars so that pytorch.rank spans carry ray.submission_id and + ray.train.run_name tags. 
+ """ + monkeypatch.setenv("_RAY_SUBMISSION_ID", "raysubmit_xyz") + monkeypatch.setenv("_RAY_JOB_NAME", "my-experiment") + monkeypatch.setenv("RAY_JOB_ID", "33000000") + + from ddtrace.contrib.internal.pytorch import _utils + from ddtrace.contrib.internal.pytorch._distributed import _populate_ray_run_metadata + + _utils.clear_cached_run_metadata() + _populate_ray_run_metadata() + + rm = _utils.get_cached_run_metadata() + assert rm.get("submission_id") == "raysubmit_xyz" + assert rm.get("run_name") == "my-experiment" diff --git a/tests/contrib/pytorch/test_pytorch_patch.py b/tests/contrib/pytorch/test_pytorch_patch.py new file mode 100644 index 00000000000..47cc4fca1c6 --- /dev/null +++ b/tests/contrib/pytorch/test_pytorch_patch.py @@ -0,0 +1,93 @@ +import pytest + +import ddtrace.contrib.internal.pytorch.patch as pytorch_patch +from ddtrace.contrib.internal.pytorch.patch import get_version +from ddtrace.contrib.internal.pytorch.patch import patch +from ddtrace.contrib.internal.pytorch.patch import unpatch +from tests.contrib.patch import PatchTestCase + + +class TestPyTorchPatch(PatchTestCase.Base): + __integration_name__ = "pytorch" + __module_name__ = "torch" + __patch_func__ = patch + __unpatch_func__ = unpatch + __get_version__ = get_version + + def assert_module_patched(self, torch): + assert getattr(torch, "_datadog_patch", False) is True + + def assert_not_module_patched(self, torch): + assert getattr(torch, "_datadog_patch", False) is False + + def assert_not_module_double_patched(self, torch): + assert getattr(torch, "_datadog_patch", False) is True + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_patch_all_does_not_enable_pytorch_by_default(monkeypatch): + """Pytorch is opt-in: a plain patch_all() must not flip torch._datadog_patch.""" + import torch + + from ddtrace._monkey import PATCH_MODULES + + assert PATCH_MODULES.get("pytorch") is False + + if getattr(torch, "_datadog_patch", False): + from 
ddtrace.contrib.internal.pytorch.patch import unpatch + + unpatch() + + from ddtrace._monkey import patch_all + + patch_all() + assert getattr(torch, "_datadog_patch", False) is False + + +def test_explicit_patch_pytorch_true_still_works(): + import torch + + from ddtrace._monkey import patch + from ddtrace.contrib.internal.pytorch.patch import unpatch + + if getattr(torch, "_datadog_patch", False): + unpatch() + + patch(pytorch=True) + try: + assert getattr(torch, "_datadog_patch", False) is True + finally: + unpatch() + + +@pytest.mark.parametrize("bad_version", [(1, 9, 0), (3, 0, 0)]) +def test_patch_skipped_for_unsupported_torch_version(monkeypatch, bad_version): + import torch + + if getattr(torch, "_datadog_patch", False): + unpatch() + + monkeypatch.setattr(pytorch_patch, "TORCH_VERSION", bad_version) + patch() + assert getattr(torch, "_datadog_patch", False) is False + + +def test_install_runs_unconditionally(monkeypatch): + import torch + + from ddtrace.contrib.internal.pytorch import _distributed + from ddtrace.contrib.internal.pytorch.patch import patch + from ddtrace.contrib.internal.pytorch.patch import unpatch + + monkeypatch.delenv("RANK", raising=False) + monkeypatch.delenv("WORLD_SIZE", raising=False) + monkeypatch.setattr(torch.distributed, "is_initialized", lambda: False) + + if getattr(torch, "_datadog_patch", False): + unpatch() + + patch() + try: + assert _distributed._installed is True + finally: + unpatch() diff --git a/tests/contrib/pytorch/test_rank_root.py b/tests/contrib/pytorch/test_rank_root.py new file mode 100644 index 00000000000..a5ec9a06f5f --- /dev/null +++ b/tests/contrib/pytorch/test_rank_root.py @@ -0,0 +1,546 @@ +"""Tests for the pytorch.rank lifetime span.""" + +import pytest + +from ddtrace.contrib.internal.pytorch import _device +from ddtrace.contrib.internal.pytorch import _rank_root +from ddtrace.contrib.internal.pytorch import _test_helpers as _th + + +@pytest.fixture(autouse=True) +def _reset(tracer, 
pytorch_clean_state): # noqa: F811 + """Autouse wrapper: pulls in the shared pytorch_clean_state fixture.""" + + +def test_open_creates_span_with_required_tags(tracer): + _rank_root.open_rank_span(rank=3, world_size=8, framework="ddp", training_job_id="job-X") + span = _th.current_rank_span() + assert span is not None + assert span.name == "pytorch.rank" + assert span.get_tag("training_job.id") == "job-X" + assert span.get_metric("rank") == 3 + assert span.get_metric("world_size") == 8 + assert span.get_tag("framework") == "ddp" + assert span.get_tag("device.id") == "h-9:cpu" + + +def test_open_is_idempotent(tracer): + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X") + first = _th.current_rank_span() + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X") + second = _th.current_rank_span() + assert first is second + + +def test_close_finishes_span(tracer): + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X") + span = _th.current_rank_span() + _rank_root.close() + assert span.finished + assert _th.current_rank_span() is None + + +def test_close_without_open_is_safe(tracer): + _rank_root.close() # no error + + +def test_open_registers_atexit_handler(tracer, monkeypatch): + """Many users never call `unpatch()` (a `ddtrace-run` process just + exits). We register `close` as an atexit hook so the rank span is + finished cleanly on normal interpreter shutdown. 
+ """ + handlers = [] + real_register = _rank_root.atexit.register + + def capture(fn, *a, **kw): + handlers.append(fn) + return real_register(fn, *a, **kw) + + monkeypatch.setattr(_rank_root.atexit, "register", capture) + _th.set_atexit_registered(False) + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X") + assert _rank_root.close in handlers + + +def test_atexit_register_unregister_balanced_across_cycles(tracer, monkeypatch): + """``close()`` must ``atexit.unregister`` so multiple open/close cycles + don't accumulate handlers in the atexit list — only one ``close`` + callback should be live between cycles. + """ + registered = 0 + unregistered = 0 + real_register = _rank_root.atexit.register + real_unregister = _rank_root.atexit.unregister + + def capture_register(fn, *a, **kw): + nonlocal registered + if fn is _rank_root.close: + registered += 1 + return real_register(fn, *a, **kw) + + def capture_unregister(fn): + nonlocal unregistered + if fn is _rank_root.close: + unregistered += 1 + return real_unregister(fn) + + monkeypatch.setattr(_rank_root.atexit, "register", capture_register) + monkeypatch.setattr(_rank_root.atexit, "unregister", capture_unregister) + _th.set_atexit_registered(False) + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X") + _rank_root.close() + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X") + _rank_root.close() + assert registered == 2 + assert unregistered == 2 + + +def test_set_framework_updates_open_span_tag(tracer): + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X") + span = _th.current_rank_span() + assert span.get_tag("framework") == "none" + _rank_root.set_framework("ddp") + assert span.get_tag("framework") == "ddp" + + +def test_set_framework_noop_without_open_span(tracer): + _rank_root.set_framework("ddp") # no error + + +def 
test_set_framework_noop_for_empty_string(tracer): + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="job-X") + _rank_root.set_framework("") + assert _th.current_rank_span().get_tag("framework") == "none" + + +def test_ray_run_context_tagged_at_open_when_cache_populated_early(tracer): + """Driver-side path: the Ray Train fit wrapper populates the cache + before ``init_process_group`` fires, so the tags land at open. + """ + from ddtrace.contrib.internal.pytorch import _utils + + _utils.set_cached_run_metadata( + submission_id="raysubmit_early", + metadata={"job_name": "early.job"}, + run_name="run-early", + ) + try: + _rank_root.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-X") + span = _th.current_rank_span() + assert span.get_tag("ray.submission_id") == "raysubmit_early" + assert span.get_tag("ray.metadata.job_name") == "early.job" + assert span.get_tag("ray.train.run_name") == "run-early" + finally: + _utils.clear_cached_run_metadata() + + +def test_ray_run_context_backfilled_at_close_when_cache_populated_late(tracer): + """Worker-side path: Ray Train calls ``init_process_group`` itself + *before* invoking the wrapped train function, so the cache is empty + when the rank span opens. The wrapper populates the cache later, + and ``close()`` must backfill the tags before finishing the span. + """ + from ddtrace.contrib.internal.pytorch import _utils + + # Cache empty at open time. + _utils.clear_cached_run_metadata() + _rank_root.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-X") + span = _th.current_rank_span() + assert span.get_tag("ray.submission_id") is None + assert span.get_tag("ray.metadata.job_name") is None + + # Wrapper fires after the rank span is already open. 
+ _utils.set_cached_run_metadata( + submission_id="raysubmit_late", + metadata={"job_name": "late.job"}, + run_name="run-late", + ) + try: + _rank_root.close() + assert span.get_tag("ray.submission_id") == "raysubmit_late" + assert span.get_tag("ray.metadata.job_name") == "late.job" + assert span.get_tag("ray.train.run_name") == "run-late" + finally: + _utils.clear_cached_run_metadata() + + +def test_retag_ray_run_context_tags_live_rank_span(tracer): + """Regression: ``ray.submission_id`` was missing on ``pytorch.rank`` + in live verification because ``_run_train_func_in_worker`` restores + the cache to empty before ``_rank_root.close()`` runs at exit. The + new ``retag_ray_run_context()`` entrypoint is called by the worker + wrap immediately after populating the cache so the tag lands on the + live span (not at close, which sees an empty cache). + """ + from ddtrace.contrib.internal.pytorch import _utils + + _utils.clear_cached_run_metadata() + _rank_root.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-X") + span = _th.current_rank_span() + assert span.get_tag("ray.submission_id") is None + + # Worker wrap populates the cache, then immediately calls retag. + _utils.set_cached_run_metadata( + submission_id="raysubmit_eager", + metadata={"job_name": "eager.job"}, + run_name="run-eager", + ) + try: + _rank_root.retag_ray_run_context() + assert span.get_tag("ray.submission_id") == "raysubmit_eager" + assert span.get_tag("ray.metadata.job_name") == "eager.job" + assert span.get_tag("ray.train.run_name") == "run-eager" + + # Simulate the worker wrap's finally clearing the cache (restore + # to empty). The tags must stay on the live span — they were + # written eagerly, not pulled at close. 
+ _utils.clear_cached_run_metadata() + assert span.get_tag("ray.submission_id") == "raysubmit_eager" + finally: + _utils.clear_cached_run_metadata() + _rank_root.close() + + +def test_retag_ray_run_context_noop_when_no_span_open(tracer): + """retag_ray_run_context() must not crash when called with no rank + span open (e.g., installed but workers never reach init_process_group). + """ + from ddtrace.contrib.internal.pytorch import _utils + + # Ensure no span is open. + try: + _rank_root.close() + except Exception: + pass + + _utils.set_cached_run_metadata(submission_id="x", metadata={}, run_name="r") + try: + # Must not raise. + _rank_root.retag_ray_run_context() + finally: + _utils.clear_cached_run_metadata() + + +def test_rank_root_nests_under_active_ray_worker_span(tracer): + """When a `ray.train.worker` span is currently active, the + `pytorch.rank` span should become its child (not a new trace root). + """ + ray_worker = tracer.start_span("ray.train.worker", service="ray") + tracer.context_provider.activate(ray_worker) + try: + _rank_root.open_rank_span(rank=0, world_size=1, framework="ray", training_job_id="job-Y") + rank_span = _th.current_rank_span() + assert rank_span is not None + # The rank-root span should share a trace_id with the ray worker. + assert rank_span.trace_id == ray_worker.trace_id + # And its parent_id should reference the ray worker's span_id. + assert rank_span.parent_id == ray_worker.span_id + finally: + _rank_root.close() + ray_worker.finish() + tracer.context_provider.activate(None) + + +def test_rank_root_close_flush_is_bounded(monkeypatch): + """A slow tracer.flush() must not extend rank-root close beyond a + bounded timeout. 
+ """ + import threading + import time + + from ddtrace import tracer + from ddtrace.contrib.internal.pytorch import _rank_root + + block = threading.Event() + + def slow_flush(*args, **kwargs): + block.wait(timeout=10) + + monkeypatch.setattr(tracer, "flush", slow_flush) + + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1") + start = time.monotonic() + _rank_root.close() + elapsed = time.monotonic() - start + block.set() + # close() joins the flush thread with a 2.0s timeout; allow a small margin above that. + assert elapsed < 3.0, f"close took {elapsed:.2f}s; expected bounded < 3s" + + +# --------------------------------------------------------------------------- +# Task 3: torch / cudnn / nccl / env / launcher / GPU invariant tagging +# --------------------------------------------------------------------------- + + +def test_detect_launcher_torchrun(monkeypatch): + from ddtrace.contrib.internal.pytorch import _distributed + + monkeypatch.setenv("TORCHELASTIC_RUN_ID", "tr-123") + monkeypatch.delenv("RAY_JOB_ID", raising=False) + monkeypatch.delenv("SLURM_JOB_ID", raising=False) + monkeypatch.delenv("KUBEFLOW_TRAINING_JOB_ID", raising=False) + assert _distributed._detect_launcher() == "torchrun" + + +def test_detect_launcher_ray(monkeypatch): + from ddtrace.contrib.internal.pytorch import _distributed + + monkeypatch.delenv("TORCHELASTIC_RUN_ID", raising=False) + monkeypatch.setenv("RAY_JOB_ID", "rayjob-99") + monkeypatch.delenv("SLURM_JOB_ID", raising=False) + monkeypatch.delenv("KUBEFLOW_TRAINING_JOB_ID", raising=False) + assert _distributed._detect_launcher() == "ray" + + +def test_detect_launcher_slurm(monkeypatch): + from ddtrace.contrib.internal.pytorch import _distributed + + monkeypatch.delenv("TORCHELASTIC_RUN_ID", raising=False) + monkeypatch.delenv("RAY_JOB_ID", raising=False) + monkeypatch.setenv("SLURM_JOB_ID", "slurm-42") + monkeypatch.delenv("KUBEFLOW_TRAINING_JOB_ID", raising=False) + assert 
_distributed._detect_launcher() == "slurm" + + +def test_detect_launcher_kubeflow(monkeypatch): + from ddtrace.contrib.internal.pytorch import _distributed + + monkeypatch.delenv("TORCHELASTIC_RUN_ID", raising=False) + monkeypatch.delenv("RAY_JOB_ID", raising=False) + monkeypatch.delenv("SLURM_JOB_ID", raising=False) + monkeypatch.setenv("KUBEFLOW_TRAINING_JOB_ID", "kf-job-1") + assert _distributed._detect_launcher() == "kubeflow" + + +def test_detect_launcher_none(monkeypatch): + from ddtrace.contrib.internal.pytorch import _distributed + + for var in ( + "TORCHELASTIC_RUN_ID", + "RAY_JOB_ID", + "SLURM_JOB_ID", + "KUBEFLOW_TRAINING_JOB_ID", + ): + monkeypatch.delenv(var, raising=False) + assert _distributed._detect_launcher() is None + + +def test_get_cached_backend_caches_result(monkeypatch): + from ddtrace.contrib.internal.pytorch import _distributed + + # Reset the cache via monkeypatch so the prior value is restored at teardown, even if an assert fails. + monkeypatch.setattr(_distributed, "_cached_distributed_backend", None, raising=False) + monkeypatch.setattr( + "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.is_available", + lambda: True, + ) + monkeypatch.setattr( + "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.is_initialized", + lambda: True, + ) + monkeypatch.setattr( + "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.get_backend", + lambda: "nccl", + ) + result1 = _distributed._get_cached_backend() + assert result1 == "nccl" + # Second call should return cached value without calling get_backend again. + monkeypatch.setattr( + "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.get_backend", + lambda: "SHOULD_NOT_BE_CALLED", + ) + result2 = _distributed._get_cached_backend() + assert result2 == "nccl" + # Clean up.
+ _distributed._cached_distributed_backend = None + + +def test_get_cached_backend_returns_none_when_not_initialized(monkeypatch): + from ddtrace.contrib.internal.pytorch import _distributed + + monkeypatch.setattr(_distributed, "_cached_distributed_backend", None, raising=False) + monkeypatch.setattr( + "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.is_available", + lambda: True, + ) + monkeypatch.setattr( + "ddtrace.contrib.internal.pytorch._distributed.torch.distributed.is_initialized", + lambda: False, + ) + assert _distributed._get_cached_backend() is None + _distributed._cached_distributed_backend = None + + +def test_rank_span_carries_torch_invariants(monkeypatch): + """pytorch.rank span must carry torch version and cuDNN settings.""" + captured = {} + + class FakeSpan: + def __init__(self): + self.context = type("C", (), {"sampling_priority": 1})() + + def set_tag(self, k, v=None): + captured[k] = v + + def _set_attribute(self, k, v): + captured[k] = v + + def finish(self): + pass + + fake = FakeSpan() + from ddtrace import tracer + + monkeypatch.setattr(tracer, "start_span", lambda *a, **kw: fake) + + # Drop all NCCL / env vars so only torch/cudnn tags appear. + for v in ( + "NCCL_DEBUG", + "NCCL_SOCKET_IFNAME", + "NCCL_IB_DISABLE", + "NCCL_P2P_DISABLE", + "NCCL_ALGO", + "NCCL_PROTO", + "TORCH_NCCL_ASYNC_ERROR_HANDLING", + "CUDA_VISIBLE_DEVICES", + "MASTER_ADDR", + "LOCAL_RANK", + "LOCAL_WORLD_SIZE", + "GROUP_RANK", + "GROUP_WORLD_SIZE", + "MASTER_PORT", + "TORCHELASTIC_RUN_ID", + "RAY_JOB_ID", + "SLURM_JOB_ID", + "KUBEFLOW_TRAINING_JOB_ID", + ): + monkeypatch.delenv(v, raising=False) + + _rank_root._span = None + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1") + + # torch.__version__ is always populated; cudnn.{enabled,benchmark,deterministic} too.
+ assert "torch.version" in captured + assert "torch.cudnn.enabled" in captured + + _rank_root.close() + + +def test_rank_span_carries_env_signals(monkeypatch): + """pytorch.rank span must carry NCCL/distributed env vars as tags/facets.""" + from ddtrace import tracer + + captured = {} + + class FakeSpan: + def __init__(self): + self.context = type("C", (), {"sampling_priority": 1})() + + def set_tag(self, k, v=None): + captured[k] = v + + def _set_attribute(self, k, v): + captured[k] = v + + def finish(self): + pass + + monkeypatch.setattr(tracer, "start_span", lambda *a, **kw: FakeSpan()) + monkeypatch.setenv("NCCL_DEBUG", "INFO") + monkeypatch.setenv("LOCAL_RANK", "3") + monkeypatch.setenv("MASTER_ADDR", "10.0.0.5") + monkeypatch.setenv("MASTER_PORT", "29500") + + _rank_root._span = None + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1") + assert captured.get("nccl.debug") == "INFO" + assert captured.get("pytorch.local_rank") == 3 + assert captured.get("pytorch.master_addr") == "10.0.0.5" + assert captured.get("pytorch.master_port") == 29500 + _rank_root.close() + + +def test_rank_span_carries_launcher_tag(monkeypatch): + """pytorch.rank span must carry the `launcher` tag when a launcher env var is set.""" + from ddtrace import tracer + + captured = {} + + class FakeSpan: + def __init__(self): + self.context = type("C", (), {"sampling_priority": 1})() + + def set_tag(self, k, v=None): + captured[k] = v + + def _set_attribute(self, k, v): + captured[k] = v + + def finish(self): + pass + + monkeypatch.setattr(tracer, "start_span", lambda *a, **kw: FakeSpan()) + # Clear all other launcher vars so only torchrun fires. 
+ for v in ("RAY_JOB_ID", "SLURM_JOB_ID", "KUBEFLOW_TRAINING_JOB_ID"): + monkeypatch.delenv(v, raising=False) + monkeypatch.setenv("TORCHELASTIC_RUN_ID", "elastic-run-1") + + _rank_root._span = None + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1") + assert captured.get("launcher") == "torchrun" + _rank_root.close() + + +def test_rank_span_uses_default_pytorch_service(_reset): + """pytorch.rank spans use 'pytorch' as service when DD_PYTORCH_SERVICE is unset.""" + import ddtrace.contrib.internal.pytorch._rank_root as rr + + rr.open_rank_span(rank=0, world_size=1, framework="ddp", training_job_id="job-1") + span = rr._span + assert span.service == "pytorch", f"Expected 'pytorch', got {span.service!r}" + + +def test_rank_span_carries_new_device_gpu_fields(monkeypatch): + """pytorch.rank span must expose GPU DeviceInfo fields when populated.""" + from ddtrace import tracer + from ddtrace.contrib.internal.pytorch._device import DeviceInfo + + captured = {} + + class FakeSpan: + def __init__(self): + self.context = type("C", (), {"sampling_priority": 1})() + + def set_tag(self, k, v=None): + captured[k] = v + + def _set_attribute(self, k, v): + captured[k] = v + + def finish(self): + pass + + monkeypatch.setattr(tracer, "start_span", lambda *a, **kw: FakeSpan()) + + # Inject a fake DeviceInfo with GPU fields. 
+ fake_info = DeviceInfo( + device_id="gpu-uuid-abc", + device_index=0, + kind="cuda", + hostname="node-1", + gpu_name="NVIDIA A100", + gpu_compute_capability="8.0", + gpu_sm_count=108, + gpu_total_memory_bytes=85899345920, + gpu_driver_version="525.85.12", + ) + monkeypatch.setattr(_device, "get", lambda: fake_info) + + _rank_root._span = None + _rank_root.open_rank_span(rank=0, world_size=1, framework="none", training_job_id="t1") + assert captured.get("device.gpu.name") == "NVIDIA A100" + assert captured.get("device.gpu.compute_capability") == "8.0" + assert captured.get("device.gpu.sm_count") == 108 + assert captured.get("device.gpu.total_memory_bytes") == 85899345920 + assert captured.get("device.gpu.driver_version") == "525.85.12" + _rank_root.close() diff --git a/tests/contrib/pytorch/test_repatch_and_exception_paths.py b/tests/contrib/pytorch/test_repatch_and_exception_paths.py new file mode 100644 index 00000000000..576eb7cb3fe --- /dev/null +++ b/tests/contrib/pytorch/test_repatch_and_exception_paths.py @@ -0,0 +1,124 @@ +"""Regression tests for PyTorch integration edge cases: + +* ``install()`` / ``uninstall()`` must be idempotent (no wrapper stacking). +* A full patch / unpatch / patch cycle must leave exactly one wrapper layer. +* Exceptions raised inside ``_bootstrap_distributed`` / ``_wrapped_destroy_process_group`` + must not leave the integration in a broken state. +""" + +import pytest +import torch + +from ddtrace.contrib.internal.pytorch import _distributed +from ddtrace.contrib.internal.pytorch import _test_helpers as _th +from ddtrace.contrib.internal.pytorch import patch as pytorch_patch + + +def _force_clean_wraps() -> None: + """Defensively remove any pytorch wraps left by earlier tests in this + session. Earlier tests (e.g. ``test_layer_one_gating``) call + ``_distributed.install()`` directly, bypassing the + ``torch._datadog_patch`` flag, so the high-level ``unpatch()`` returns + early and leaves wrappers attached. 
Force ``_installed = True`` and call + ``uninstall()`` to walk the canonical teardown path. + """ + setattr(torch, "_datadog_patch", False) + _distributed._installed = True + try: + _distributed.uninstall() + except Exception: + pass + + +@pytest.fixture +def _clean_state(monkeypatch): + _force_clean_wraps() + _th.reset_device_cache() + _th.close_rank_root() + yield + _force_clean_wraps() + _th.reset_device_cache() + _th.close_rank_root() + + +def _dd_wrapper_depth(fn) -> int: + """Count only ``wrapt``-added layers. + + torch itself decorates some distributed functions with ``functools.wraps``, + which also sets ``__wrapped__``; we only count layers that are ``wrapt`` + ``FunctionWrapper`` instances so torch's own decorators are excluded. + """ + import wrapt + + depth = 0 + f = fn + while isinstance(f, wrapt.FunctionWrapper): + depth += 1 + f = f.__wrapped__ + return depth + + +def _dd_wrapper_depth_ipg() -> int: + """Wrapper depth on ``torch.distributed.init_process_group``. + + The integration wraps ``init_process_group`` and ``destroy_process_group`` (not + collectives), so we measure idempotency on those two functions. 
+ """ + return _dd_wrapper_depth(torch.distributed.init_process_group) + + +def test_install_is_idempotent_no_wrapper_stacking(_clean_state): + """Calling install() twice must not stack wrappers on torch.distributed.""" + assert _dd_wrapper_depth_ipg() == 0 + _distributed.install() + depth_after_first = _dd_wrapper_depth_ipg() + assert depth_after_first == 1 + _distributed.install() # must be a no-op + assert _dd_wrapper_depth_ipg() == depth_after_first + _distributed.uninstall() + assert _dd_wrapper_depth_ipg() == 0 + + +def test_patch_unpatch_patch_cycle_is_clean(_clean_state): + """A full patch/unpatch/patch cycle must leave exactly one wrapper layer.""" + pytorch_patch.patch() + depth_after_first = _dd_wrapper_depth_ipg() + pytorch_patch.unpatch() + assert _dd_wrapper_depth_ipg() == 0 + pytorch_patch.patch() + assert _dd_wrapper_depth_ipg() == depth_after_first + + +def test_uninstall_is_idempotent(_clean_state): + """uninstall() without a prior install() is a no-op.""" + _distributed.uninstall() + _distributed.uninstall() + + +def test_exception_in_bootstrap_does_not_corrupt_install_state(monkeypatch, _clean_state): + """If _bootstrap_distributed raises, install() state is still usable.""" + monkeypatch.setattr(_distributed, "_bootstrap_distributed", lambda: (_ for _ in ()).throw(RuntimeError("boom"))) + pytorch_patch.patch() + # init_process_group wrapper is in place even after a bootstrap failure + assert _distributed._installed + pytorch_patch.unpatch() + assert not _distributed._installed + + +def test_exception_in_destroy_still_closes_rank_span(monkeypatch, _clean_state): + """_rank_root.close() is called even when destroy_process_group raises.""" + closed = [] + monkeypatch.setattr( + "ddtrace.contrib.internal.pytorch._rank_root.close", + lambda: closed.append(True), + ) + + def raising_destroy(*a, **kw): + raise RuntimeError("destroy failed") + + monkeypatch.setattr(torch.distributed, "destroy_process_group", raising_destroy) + pytorch_patch.patch() + 
with pytest.raises(RuntimeError, match="destroy failed"): + torch.distributed.destroy_process_group() + assert closed, "_rank_root.close() was not called despite try/finally" + pytorch_patch.unpatch() diff --git a/tests/contrib/pytorch/test_utils.py b/tests/contrib/pytorch/test_utils.py new file mode 100644 index 00000000000..b5d593d2b38 --- /dev/null +++ b/tests/contrib/pytorch/test_utils.py @@ -0,0 +1,219 @@ +from ddtrace.contrib.internal.pytorch._utils import TRAINING_JOB_ID_TAG +from ddtrace.contrib.internal.pytorch._utils import job_id_env_set +from ddtrace.contrib.internal.pytorch._utils import resolve_job_id_from_env +from ddtrace.contrib.internal.pytorch._utils import set_training_job_id_tag + + +def test_resolve_job_id_falls_back_to_torchelastic(monkeypatch): + monkeypatch.delenv("DD_PYTORCH_JOB_ID", raising=False) + monkeypatch.delenv("RAY_JOB_ID", raising=False) + monkeypatch.setenv("TORCHELASTIC_RUN_ID", "elastic-id") + monkeypatch.setenv("KUBEFLOW_TRAINING_JOB_ID", "kf-id") + monkeypatch.setenv("SLURM_JOB_ID", "slurm-id") + assert resolve_job_id_from_env() == "elastic-id" + + +def test_resolve_job_id_falls_back_to_kubeflow(monkeypatch): + monkeypatch.delenv("DD_PYTORCH_JOB_ID", raising=False) + monkeypatch.delenv("RAY_JOB_ID", raising=False) + monkeypatch.delenv("TORCHELASTIC_RUN_ID", raising=False) + monkeypatch.setenv("KUBEFLOW_TRAINING_JOB_ID", "kf-id") + monkeypatch.setenv("SLURM_JOB_ID", "slurm-id") + assert resolve_job_id_from_env() == "kf-id" + + +def test_resolve_job_id_falls_back_to_slurm(monkeypatch): + for v in ("DD_PYTORCH_JOB_ID", "TORCHELASTIC_RUN_ID", "KUBEFLOW_TRAINING_JOB_ID", "RAY_JOB_ID"): + monkeypatch.delenv(v, raising=False) + monkeypatch.setenv("SLURM_JOB_ID", "slurm-id") + assert resolve_job_id_from_env() == "slurm-id" + + +def test_resolve_job_id_generates_uuid_when_unset(monkeypatch): + for v in ( + "DD_PYTORCH_JOB_ID", + "TORCHELASTIC_RUN_ID", + "KUBEFLOW_TRAINING_JOB_ID", + "RAY_JOB_ID", + "SLURM_JOB_ID", + ): +
monkeypatch.delenv(v, raising=False) + job_id = resolve_job_id_from_env() + # UUID4 form has 36 chars including hyphens. + assert len(job_id) == 36 and job_id.count("-") == 4 + + +def test_resolve_job_id_empty_string_falls_through(monkeypatch): + monkeypatch.setenv("DD_PYTORCH_JOB_ID", " ") # whitespace-only treated as unset + monkeypatch.delenv("RAY_JOB_ID", raising=False) # an ambient RAY_JOB_ID would outrank TORCHELASTIC_RUN_ID + monkeypatch.setenv("TORCHELASTIC_RUN_ID", "elastic-id") + assert resolve_job_id_from_env() == "elastic-id" + + +def test_dd_pytorch_job_id_wins_over_ray_job_id(monkeypatch): + monkeypatch.setenv("DD_PYTORCH_JOB_ID", "user-supplied-id") + monkeypatch.setenv("RAY_JOB_ID", "33000000") + assert resolve_job_id_from_env() == "user-supplied-id" + + +def test_resolve_job_id_prefers_ray_over_torchelastic(monkeypatch): + monkeypatch.delenv("DD_PYTORCH_JOB_ID", raising=False) + monkeypatch.setenv("RAY_JOB_ID", "ray-abc") + monkeypatch.setenv("TORCHELASTIC_RUN_ID", "te-xyz") + monkeypatch.setenv("KUBEFLOW_TRAINING_JOB_ID", "kf-456") + monkeypatch.setenv("SLURM_JOB_ID", "slurm-789") + + assert resolve_job_id_from_env() == "ray-abc" + + +def test_job_id_env_set_false_when_all_unset(monkeypatch): + for v in ("DD_PYTORCH_JOB_ID", "RAY_JOB_ID", "TORCHELASTIC_RUN_ID", "KUBEFLOW_TRAINING_JOB_ID", "SLURM_JOB_ID"): + monkeypatch.delenv(v, raising=False) + assert job_id_env_set() is False + + +def test_job_id_env_set_treats_whitespace_as_unset(monkeypatch): + for v in ("DD_PYTORCH_JOB_ID", "RAY_JOB_ID", "TORCHELASTIC_RUN_ID", "KUBEFLOW_TRAINING_JOB_ID", "SLURM_JOB_ID"): + monkeypatch.delenv(v, raising=False) + monkeypatch.setenv("DD_PYTORCH_JOB_ID", " \t\n") + assert job_id_env_set() is False + + +def test_set_training_job_id_tag_sets_both_keys(monkeypatch): + from ddtrace.contrib.internal.pytorch import _utils + + monkeypatch.setattr(_utils, "_default_job_id", "ray-abc-123", raising=False) + monkeypatch.setattr(_utils._tls_job_id, "value", None, raising=False) + + class _FakeSpan: + def __init__(self): + self._tags = {} + + def set_tag(self, key, value=None): +
self._tags[key] = value + + span = _FakeSpan() + set_training_job_id_tag(span) + + assert span._tags[TRAINING_JOB_ID_TAG] == "ray-abc-123" + assert span._tags["job_id"] == "ray-abc-123" + + +def test_set_training_job_id_tag_noop_when_id_unset(monkeypatch): + from ddtrace.contrib.internal.pytorch import _utils + + monkeypatch.setattr(_utils, "_default_job_id", None, raising=False) + monkeypatch.setattr(_utils._tls_job_id, "value", None, raising=False) + + class _FakeSpan: + def __init__(self): + self._tags = {} + + def set_tag(self, key, value=None): + self._tags[key] = value + + span = _FakeSpan() + set_training_job_id_tag(span) + assert "manual.keep" in span._tags + assert TRAINING_JOB_ID_TAG not in span._tags + + +def test_set_training_job_id_tag_does_not_acquire_lock(monkeypatch): + """A6: this function runs per span on the hot path. The reads it + performs must not take `_run_metadata_lock`. + """ + from ddtrace.contrib.internal.pytorch import _utils + + _utils.set_cached_run_metadata(run_name="rn", submission_id="sub", metadata={"k": "v"}) + _utils.set_cached_job_id("training-abc") + + acquired = [] + real_lock = _utils._run_metadata_lock + + class WatchingLock: + def acquire(self, *a, **kw): + acquired.append("acquire") + return real_lock.acquire(*a, **kw) + + def release(self): + acquired.append("release") + return real_lock.release() + + def __enter__(self): + self.acquire() + return self + + def __exit__(self, *a): + self.release() + + monkeypatch.setattr(_utils, "_run_metadata_lock", WatchingLock()) + + class FakeSpan: + def __init__(self): + self.tags = {} + + def set_tag(self, k, v=None): + self.tags[k] = v + + for _ in range(100): + s = FakeSpan() + _utils.set_training_job_id_tag(s) + assert s.tags.get("training_job.id") == "training-abc" + + assert acquired == [], f"hot-path span tagging took the lock: {acquired}" + + +def test_get_cached_run_metadata_is_immutable(): + """NB4: the published view must reject mutation.""" + import pytest + + from 
ddtrace.contrib.internal.pytorch import _utils + + _utils.set_cached_run_metadata(run_name="rn", submission_id="sub", metadata={"k": "v"}) + snap = _utils.get_cached_run_metadata() + + with pytest.raises(TypeError): + snap["run_name"] = "mutated" + with pytest.raises(TypeError): + snap["metadata"]["k"] = "mutated" + + +def test_run_metadata_view_consistent_under_writer_load(): + """The view is replaced atomically; concurrent readers see either + the old snapshot or the new one — never a torn intermediate state. + """ + import threading + + from ddtrace.contrib.internal.pytorch import _utils + + _utils.set_cached_run_metadata(run_name="A", submission_id="A-sub", metadata={"k": "A"}) + + ready = threading.Barrier(5) # 4 readers + main + stop = threading.Event() + seen_inconsistent = [] + + def reader(): + ready.wait(timeout=5) + while not stop.is_set(): + snap = _utils.get_cached_run_metadata() + rn = snap.get("run_name") + sub = snap.get("submission_id") + md = (snap.get("metadata") or {}).get("k") + if rn is None or sub is None or md is None: + seen_inconsistent.append(("missing", rn, sub, md)) + continue + if not (rn == sub.split("-")[0] == md): + seen_inconsistent.append((rn, sub, md)) + + threads = [threading.Thread(target=reader) for _ in range(4)] + for t in threads: + t.start() + try: + ready.wait(timeout=5) + for label in ("B", "C", "D", "E"): + _utils.set_cached_run_metadata(run_name=label, submission_id=f"{label}-sub", metadata={"k": label}) + finally: + stop.set() + for t in threads: + t.join(timeout=2) + assert seen_inconsistent == [], f"saw torn reads: {seen_inconsistent[:5]}" diff --git a/tests/contrib/suitespec.yml b/tests/contrib/suitespec.yml index 812a28aa550..35b61fbb265 100644 --- a/tests/contrib/suitespec.yml +++ b/tests/contrib/suitespec.yml @@ -167,6 +167,8 @@ components: - ddtrace/ext/memcached.py pynamodb: - ddtrace/contrib/internal/pynamodb/* + pytorch: + - ddtrace/contrib/internal/pytorch/* pyodbc: - ddtrace/contrib/internal/pyodbc/* 
pyramid: @@ -1107,6 +1109,18 @@ suites: - '@pynamodb' - tests/contrib/pynamodb/* snapshot: true + pytorch: + venvs_per_job: 1 + skip_venv_artifacts: true + skip_pip_cache: true + paths: + - '@bootstrap' + - '@core' + - '@contrib' + - '@tracing' + - '@pytorch' + - tests/contrib/pytorch/* + snapshot: true pyodbc: parallelism: 1 paths: