Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ cran-comments.md
man-roxygen
data-raw
docs
^.*vtune.*$
revdep
\.covrignore
^\.git$
Expand Down
7 changes: 7 additions & 0 deletions .claude/settings.local.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"permissions": {
"allow": [
"Bash(Rscript *)"
]
}
}
2 changes: 0 additions & 2 deletions .github/workflows/R-CMD-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,6 @@ jobs:
uses: r-lib/actions/setup-r-dependencies@v2
with:
dependencies: '"hard"'
extra-packages: |
github::ms609/TreeTools
needs: benchmark

- name: Benchmark PR
Expand Down
162 changes: 162 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag.
# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions
on:
schedule:
- cron: "24 06 * * 1"
workflow_dispatch:
push:
branches: ["*"]
paths-ignore:
- "Meta**"
- "memcheck**"
- "docs**"
- "**.git"
- "**.json"
- "**.md"
- "**.yaml"
- "**.yml"
- "!**R-CMD-check.yml"
- "**.R[dD]ata"
- "**.Rpro*"
pull_request:
branches: ["*"]
paths-ignore:
- "Meta**"
- "memcheck**"
- "docs**"
- "**.git"
- "**.json"
- "**.md"
- "**.yaml"
- "**.yml"
- "!**R-CMD-check.yml"
- "**.R[dD]ata"
- "**.Rpro*"

name: R-CMD-check

jobs:
benchmark:
runs-on: ubuntu-latest
name: benchmark

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
RSPM: "https://packagemanager.posit.co/cran/__linux__/noble/latest"
_R_CHECK_BUILD_VIGNETTES_: false
_R_CHECK_CRAN_INCOMING_: false
_R_CHECK_FORCE_SUGGESTS_: false
R_REMOTES_STANDALONE: true
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
R_REALLY_FORCE_SYMBOLS: true
PKG_CXXFLAGS: "-O3"

steps:
- name: Checkout PR branch
uses: actions/checkout@v6
with:
path: pr

- name: Checkout target branch
uses: actions/checkout@v6
with:
ref: ${{ github.event.pull_request.base.ref || 'main' }}
path: main

- name: Set up R
uses: r-lib/actions/setup-r@v2
with:
extra-repositories: https://ms609.github.io/packages/

- name: Install R dependencies
uses: r-lib/actions/setup-r-dependencies@v2
with:
dependencies: '"hard"'
extra-packages: |
TreeTools@2.2.0
needs: benchmark

- name: Benchmark PR
uses: ms609/actions/benchmark-step@main
with:
path: pr
output: pr

- name: Benchmark target
uses: ms609/actions/benchmark-step@main
with:
path: main
output: main

- name: Benchmark PR again
uses: ms609/actions/benchmark-step@main
with:
path: pr
output: pr2

- run: dir pr-benchmark-results
working-directory: pr
- run: dir main-benchmark-results
working-directory: pr

- name: Compare benchmarks
id: compare_results
working-directory: pr
run: |
Rscript benchmark/_compare_results.R
shell: bash

- name: Upload PR benchmark results
if: failure()
uses: actions/upload-artifact@v6
with:
name: pr-benchmark-results
path: pr/pr-benchmark-results/*.bench.Rds

- name: Upload main benchmark results
if: failure()
uses: actions/upload-artifact@v6
with:
name: main-benchmark-results
path: pr/main-benchmark-results/*.bench.Rds

- name: Comment on PR
if: always() && github.event_name == 'pull_request' && steps.compare_results.outputs.report != ''
uses: actions/github-script@v7
env:
BENCHMARK_MESSAGE: ${{ steps.compare_results.outputs.report }}
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const benchmarkIdentifier = '<!-- benchmark-comment -->';
const outdatedPrefix = '> **⚠️ This benchmark result is outdated. See the latest comment below.**\n\n';

const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number
});

const previousBenchmarkComments = comments.data.filter(comment =>
comment.user.type === 'Bot' &&
comment.body.includes(benchmarkIdentifier) &&
!comment.body.startsWith('> **⚠️ This benchmark result is outdated.')
);

for (const comment of previousBenchmarkComments) {
await github.rest.issues.updateComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: comment.id,
body: outdatedPrefix + comment.body
});
}

const newCommentBody = benchmarkIdentifier + '\n' + process.env.BENCHMARK_MESSAGE;

await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: newCommentBody
});

Check warning

Code scanning / CodeQL

Workflow does not contain permissions Medium

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {contents: read}
Comment thread
github-advanced-security[bot] marked this conversation as resolved.
Fixed
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,16 @@ VignetteBuilder: knitr
Config/Needs/app/optional: uwot
Config/Needs/check: rcmdcheck
Config/Needs/coverage: covr
Config/Needs/memcheck: devtools
Config/Needs/memcheck: pkgdown, testthat
Config/Needs/metadata: codemetar
Config/Needs/revdeps: revdepcheck
Config/Needs/website: openssl, pkgdown, remotes, shinylive
Config/roxygen2/version: 8.0.0
Config/testthat/parallel: false
Config/testthat/edition: 3
SystemRequirements: C++17, pandoc-citeproc
ByteCompile: true
Encoding: UTF-8
Language: en-GB
X-schema.org-keywords: phylogenetics, tree-distance
RoxygenNote: 7.3.3
Roxygen: list(markdown = TRUE)
11 changes: 11 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,17 @@
removing a hard dependency on the compile-time `SL_MAX_SPLITS` constant.
TreeDist now supports trees of any size permitted by TreeTools.

- **Large-tree support (requires TreeTools ≥ 2.3.0):** all distance
functions now accept trees with up to 32 767 tips (previously limited
to `SL_MAX_TIPS`, 2048 with TreeTools ≤ 2.2.0). The R-level tip-count
guard (`.CheckMaxTips()`) detects the TreeTools version at load time and
unlocks the higher ceiling automatically; no code changes are needed.
All integer counters in the C++ hot paths have been widened from `int16`
to `split_int` (`int32`) to handle split counts above 32 767 without
overflow. Direct `lg2[]` table accesses have been replaced with
`lg2_lookup()` fallback helpers so that trees with more tips than
`SL_MAX_TIPS` are computed correctly via `std::log2` / `std::lgamma`.

## Performance

- `RobinsonFoulds()` now uses a fast C++ batch path for cross-distance
Expand Down
17 changes: 17 additions & 0 deletions R/tree_distance.R
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,23 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer,
g[lower.tri(g)]
}

# Floor sub-noise distances to zero before normalization.
# Two sources of numerical noise scale with treesIndependentInfo:
# (1) LAP int64 cost-matrix quantization in *Splits scoring; per-cell
# truncation of up to (max_possible / BIG) bits, summed over n_splits.
# (2) Float-accumulation drift between independently-built tables (e.g.
# InfoRobinsonFoulds vs cpp_splitwise_info_batch sum the same per-split
# info contributions, but using different lookup-table constructions).
# Both grow with the magnitude of the answer, so an absolute sqrt(eps)
# tolerance becomes too tight beyond a few thousand tips. Scaling by
# treesIndependentInfo self-adjusts; pmax(1, ·) preserves the original
# tolerance for tiny trees where these errors are negligible anyway.
.FloorNumericalNoise <- function(ret, treesIndependentInfo) {
tol <- pmax(1, treesIndependentInfo) * .Machine[["double.eps"]] ^ 0.5
ret[ret < tol] <- 0
ret
}

.AllTipsSame <- function(x, y) {
if (is.list(x)) {
xPrime <- x[[1]]
Expand Down
42 changes: 17 additions & 25 deletions R/tree_distance_info.R
Original file line number Diff line number Diff line change
Expand Up @@ -249,16 +249,15 @@ DifferentPhylogeneticInfo <- function(tree1, tree2 = NULL, normalize = FALSE,
if (!is.null(fast)) {
spi <- fast[["info"]]
treesIndependentInfo <- .PairwiseSums(fast[["entropies"]])
ret <- treesIndependentInfo - spi - spi

ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = SplitwiseInfo, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
attributes(ret) <- attributes(spi)
return(ret)
}

# Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits()
fast_many <- .FastManyManyPath(tree1, tree2, reportMatching,
cpp_shared_phylo_cross_pairs,
Expand All @@ -268,25 +267,22 @@ DifferentPhylogeneticInfo <- function(tree1, tree2 = NULL, normalize = FALSE,
info1 <- fast_many[["info1"]]
info2 <- fast_many[["info2"]]
treesIndependentInfo <- outer(info1, info2, "+")
ret <- treesIndependentInfo - spi - spi

ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = SplitwiseInfo, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
return(ret)
}

spi <- SharedPhylogeneticInfo(tree1, tree2, normalize = FALSE, diag = FALSE,
reportMatching = reportMatching)
treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo)
ret <- treesIndependentInfo - spi - spi
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,

ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = SplitwiseInfo, Combine = "+")

ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 # Catch floating point inaccuracy
attributes(ret) <- attributes(spi)

# Return:
Expand All @@ -310,16 +306,15 @@ ClusteringInfoDistance <- function(tree1, tree2 = NULL, normalize = FALSE,
if (!is.null(fast)) {
mci <- fast[["info"]]
treesIndependentInfo <- .PairwiseSums(fast[["entropies"]])
ret <- treesIndependentInfo - mci - mci

ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = ClusteringEntropy, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
attributes(ret) <- attributes(mci)
return(ret)
}

# Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits()
fast_many <- .FastManyManyPath(tree1, tree2, reportMatching,
cpp_mutual_clustering_cross_pairs,
Expand All @@ -329,25 +324,22 @@ ClusteringInfoDistance <- function(tree1, tree2 = NULL, normalize = FALSE,
info1 <- fast_many[["info1"]]
info2 <- fast_many[["info2"]]
treesIndependentInfo <- outer(info1, info2, "+")
ret <- treesIndependentInfo - mci - mci

ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = ClusteringEntropy, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
return(ret)
}

mci <- MutualClusteringInfo(tree1, tree2, normalize = FALSE, diag = FALSE,
reportMatching = reportMatching)
treesIndependentInfo <- .MaxValue(tree1, tree2, ClusteringEntropy)
ret <- treesIndependentInfo - mci - mci

ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = ClusteringEntropy, Combine = "+")

ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 # Handle floating point inaccuracy
attributes(ret) <- attributes(mci)

# Return:
Expand Down
Loading
Loading