diff --git a/.zenodo.json b/.zenodo.json index 972c61305..cd297bfd7 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -101,6 +101,12 @@ "affiliation": "Inserm", "orcid": "0000-0003-3109-9720", "type": "Researcher" + }, + { + "name": "Afolabi, Samuel Timileyin", + "affiliation": "University of Lagos", + "orcid": "0009-0002-7325-6405", + "type": "Researcher" } ] } \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 5dd0b6974..e4b3d998c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,6 +8,9 @@ Authors@R: c(person("Vincent T","van Hees",role=c("aut","cre"), person("Jairo H","Migueles",role="aut", email="jairo@jhmigueles.com", comment = c(ORCID = "0000-0003-0366-6935")), + person("Samuel T","Afolabi",role="ctb", + email="samuelafolabimails@gmail.com", + comment = c(ORCID = "0009-0002-7325-6405")), person("Severine","Sabia",role="ctb"), person("Matthew R","Patterson",role="ctb"), person("Zhou","Fang",role="ctb"), @@ -38,6 +41,6 @@ URL: https://github.com/wadpac/GGIR/, https://wadpac.github.io/GGIR/ BugReports: https://github.com/wadpac/GGIR/issues License: Apache License (== 2.0) | file LICENSE Suggests: testthat, covr, knitr, rmarkdown, actilifecounts, readxl -Imports: data.table, foreach, doParallel, signal, zoo, unisensR, ineq, methods, psych, irr, lubridate, GGIRread, ActCR, read.gt3x +Imports: data.table, foreach, doParallel, signal, zoo, unisensR, ineq, methods, psych, irr, lubridate, GGIRread, ActCR, read.gt3x, arrow Depends: stats, utils, R (>= 3.5) VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index a9e8632c7..e854dfcc7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,7 +45,8 @@ export(g.analyse, g.calibrate, check_log, g.report.part5_dictionary, DFA, ABI, SSP, visualReport, splitRecords, getSplitNames, addSplitNames, getPart1BasicInfo, markerButtonForRest, aggregateEvent, inspect_binFile_brand, - g.part3_correct_guider, g.part3_alignIndexVectors, filterNonwearNight) + g.part3_correct_guider, g.part3_alignIndexVectors, 
filterNonwearNight, + write_dashboard_parquet, write_epoch_parquet, build_epoch_table) importFrom("grDevices", "colors", "dev.off", "pdf", "rgb", "rainbow", "palette", "adjustcolor", "gray.colors", "dev.list", "png") diff --git a/R/GGIR.R b/R/GGIR.R index 964d3c8f5..a7b89b08b 100644 --- a/R/GGIR.R +++ b/R/GGIR.R @@ -503,4 +503,12 @@ GGIR = function(mode = 1:5, datadir = c(), outputdir = c(), } } } + if (isTRUE(params_output[["save_dashboard_parquet"]])) { + if (verbose == TRUE) print_console_header("Dashboard Parquet export") + write_dashboard_parquet(metadatadir = metadatadir, + params_output = params_output, + params_general = params_general, + params_phyact = params_phyact, + verbose = verbose) + } } diff --git a/R/check_params.R b/R/check_params.R index fc91ebdd1..e0c79e919 100644 --- a/R/check_params.R +++ b/R/check_params.R @@ -134,7 +134,8 @@ check_params = function(params_sleep = c(), params_metrics = c(), boolean_params = c("epochvalues2csv", "save_ms5rawlevels", "save_ms5raw_without_invalid", "storefolderstructure", "dofirstpage", "visualreport", "week_weekend_aggregate.part5", "do.part3.pdf", "outliers.only", "do.visual", "do.sibreport", "visualreport_without_invalid", - "do.part2.pdf", "do.part2.png", "old_visualreport", "require_complete_lastnight_part5") + "do.part2.pdf", "do.part2.png", "old_visualreport", "require_complete_lastnight_part5", + "save_dashboard_parquet") character_params = c("save_ms5raw_format", "timewindow", "sep_reports", "sep_config", "dec_reports", "dec_config", "visualreport_focus", "method_research_vars") diff --git a/R/load_params.R b/R/load_params.R index d32d0bbc1..cb1cf67df 100644 --- a/R/load_params.R +++ b/R/load_params.R @@ -147,7 +147,8 @@ load_params = function(topic = c("sleep", "metrics", "rawdata", old_visualreport = FALSE, visualreport_hrsPerRow = 36, visualreport_focus = "day", visualreport_validcrit = 0, require_complete_lastnight_part5 = FALSE, - method_research_vars = NULL) + method_research_vars = NULL, + 
save_dashboard_parquet = FALSE) } if ("general" %in% topic) { diff --git a/R/write_parquet.R b/R/write_parquet.R new file mode 100644 index 000000000..dc97d3d9f --- /dev/null +++ b/R/write_parquet.R @@ -0,0 +1,918 @@ +write_dashboard_parquet = function(metadatadir = c(), + params_output = c(), + params_general = c(), + params_phyact = c(), + verbose = TRUE) { + # Produce a single consolidated Parquet file from the CSV reports + + # generated by GGIR, ready for consumption by a DuckDB-WASM dashboard. + # + # This function is called at the end of GGIR() when + + # params_output$save_dashboard_parquet is TRUE. + # It reads the already-generated CSVs, joins them, cleans column names, + # casts types, embeds the variable dictionary as Parquet key-value metadata, + # and writes results/ggir_results.parquet. + + results_dir = paste0(metadatadir, "/results") + if (!dir.exists(results_dir)) { + warning("\nNo results directory found. Skipping Parquet export.", call. = FALSE) + return(invisible(NULL)) + } + + # --------------------------------------------------------------- + # Helper: clean column names to be SQL-friendly + # --------------------------------------------------------------- + clean_colnames = function(x) { + x = tolower(x) + x = gsub("[^a-z0-9_]", "_", x) # replace special chars with _ + x = gsub("_+", "_", x) # collapse multiple underscores + x = gsub("^_|_$", "", x) # trim leading/trailing _ + x + } + + # --------------------------------------------------------------- + # Helper: safely read a CSV, return NULL if not found + # --------------------------------------------------------------- + safe_read_csv = function(filepath) { + if (length(filepath) == 0 || !file.exists(filepath[1])) return(NULL) + tryCatch( + data.table::fread(filepath[1], data.table = FALSE), + error = function(e) { + if (verbose) warning(paste0("\nCould not read: ", filepath[1], " - ", e$message), call. 
= FALSE) + NULL + } + ) + } + + # --------------------------------------------------------------- + # 1. Read Part 5 day summary (the most granular level) + # Glob for all threshold combos (e.g. part5_daysummary_full_MM_*.csv) + # --------------------------------------------------------------- + p5_files = list.files(paste0(results_dir, "/QC"), + pattern = "^part5_daysummary_full_.*\\.csv$", + full.names = TRUE) + if (length(p5_files) == 0) { + warning("\nNo Part 5 day summary CSVs found. Skipping Parquet export.", call. = FALSE) + return(invisible(NULL)) + } + + # Read and row-bind all Part 5 day summary files + p5_list = lapply(p5_files, function(f) { + df = safe_read_csv(f) + if (!is.null(df) && nrow(df) > 0) df else NULL + }) + p5_list = p5_list[!vapply(p5_list, is.null, logical(1))] + if (length(p5_list) == 0) { + warning("\nPart 5 day summary CSVs are empty. Skipping Parquet export.", call. = FALSE) + return(invisible(NULL)) + } + p5 = do.call(rbind, p5_list) + + # --------------------------------------------------------------- + # 2. Read Part 4 night summary (per-night sleep data) + # --------------------------------------------------------------- + p4_file = paste0(results_dir, "/part4_nightsummary_sleep_cleaned.csv") + p4 = safe_read_csv(p4_file) + + # --------------------------------------------------------------- + # 3. Read Part 2 day summary (daily activity L5/M5/MVPA) + # --------------------------------------------------------------- + p2day_file = paste0(results_dir, "/part2_daysummary.csv") + p2day = safe_read_csv(p2day_file) + + # --------------------------------------------------------------- + # 4. Read Part 2 person summary (recording metadata) + # --------------------------------------------------------------- + p2_file = paste0(results_dir, "/part2_summary.csv") + p2 = safe_read_csv(p2_file) + + # --------------------------------------------------------------- + # 5. 
Read data quality report + # --------------------------------------------------------------- + qc_file = paste0(results_dir, "/QC/data_quality_report.csv") + qc = safe_read_csv(qc_file) + + # --------------------------------------------------------------- + # 6. Read variable dictionary for Parquet metadata + # --------------------------------------------------------------- + dict_file = paste0(results_dir, "/variableDictionary/part5_dictionary_daysummary_full.csv") + dict = safe_read_csv(dict_file) + + # --------------------------------------------------------------- + # 7. Build the consolidated data frame + # Start with Part 5 (most granular), then left-join others + # --------------------------------------------------------------- + consolidated = p5 + + # Join Part 4 sleep data by ID + calendar_date + if (!is.null(p4) && nrow(p4) > 0) { + # Identify Part 4 columns that are NOT already in Part 5 + p4_unique_cols = setdiff(names(p4), names(consolidated)) + join_keys = c("ID", "calendar_date") + p4_subset = p4[, c(intersect(join_keys, names(p4)), p4_unique_cols), drop = FALSE] + if (all(join_keys %in% names(p4_subset)) && all(join_keys %in% names(consolidated))) { + consolidated = merge(consolidated, p4_subset, + by = join_keys, all.x = TRUE, suffixes = c("", "_p4")) + } + } + + # Join Part 2 day-level data by ID + calendar_date + if (!is.null(p2day) && nrow(p2day) > 0) { + p2day_unique_cols = setdiff(names(p2day), names(consolidated)) + join_keys = c("ID", "calendar_date") + p2day_subset = p2day[, c(intersect(join_keys, names(p2day)), p2day_unique_cols), drop = FALSE] + if (all(join_keys %in% names(p2day_subset)) && all(join_keys %in% names(consolidated))) { + consolidated = merge(consolidated, p2day_subset, + by = join_keys, all.x = TRUE, suffixes = c("", "_p2day")) + } + } + + # Join Part 2 person-level metadata by ID (one row per person, repeated) + if (!is.null(p2) && nrow(p2) > 0) { + p2_unique_cols = setdiff(names(p2), names(consolidated)) + join_key = 
"ID" + p2_subset = p2[, c(join_key, p2_unique_cols), drop = FALSE] + if (join_key %in% names(p2_subset) && join_key %in% names(consolidated)) { + consolidated = merge(consolidated, p2_subset, + by = join_key, all.x = TRUE, suffixes = c("", "_p2")) + } + } + + # Join QC data by filename + if (!is.null(qc) && nrow(qc) > 0) { + qc_unique_cols = setdiff(names(qc), names(consolidated)) + join_key = "filename" + qc_subset = qc[, c(join_key, qc_unique_cols), drop = FALSE] + if (join_key %in% names(qc_subset) && join_key %in% names(consolidated)) { + consolidated = merge(consolidated, qc_subset, + by = join_key, all.x = TRUE, suffixes = c("", "_qc")) + } + } + + if (nrow(consolidated) == 0) { + warning("\nConsolidated data is empty. Skipping Parquet export.", call. = FALSE) + return(invisible(NULL)) + } + + # --------------------------------------------------------------- + # 8. Attach nested epoch-level time series as list-column + # (must happen before clean_colnames so join keys match) + # --------------------------------------------------------------- + epoch_by_day = tryCatch( + build_epoch_lists_by_day(metadatadir = metadatadir, + params_general = params_general, + params_phyact = params_phyact, + results_dir = results_dir, + verbose = verbose), + error = function(e) NULL + ) + if (!is.null(epoch_by_day) && nrow(epoch_by_day) > 0) { + join_keys = intersect(c("ID", "calendar_date"), names(consolidated)) + if (all(c("ID", "calendar_date") %in% join_keys) && + all(c("ID", "calendar_date") %in% names(epoch_by_day))) { + consolidated = merge(consolidated, epoch_by_day, + by = c("ID", "calendar_date"), + all.x = TRUE, sort = FALSE) + } + } + + # Arrow expects every element of a list column to be a data.frame. + # After a left merge, unmatched rows can carry NA in `epochs`; replace them + # with a typed empty data.frame to keep nested serialization valid. 
+ if ("epochs" %in% names(consolidated)) { + first_df_idx = which(vapply(consolidated$epochs, is.data.frame, logical(1))) + if (length(first_df_idx) > 0) { + empty_epoch_struct = consolidated$epochs[[first_df_idx[1]]][0, , drop = FALSE] + } else if (!is.null(epoch_by_day) && + nrow(epoch_by_day) > 0 && + "epochs" %in% names(epoch_by_day) && + any(vapply(epoch_by_day$epochs, is.data.frame, logical(1)))) { + first_epoch_idx = which(vapply(epoch_by_day$epochs, is.data.frame, logical(1)))[1] + empty_epoch_struct = epoch_by_day$epochs[[first_epoch_idx]][0, , drop = FALSE] + } else { + empty_epoch_struct = data.frame( + timenum = numeric(0), + acc = numeric(0), + class_id = integer(0), + spt = logical(0), + invalid = logical(0), + window = integer(0) + ) + } + + bad_epoch_cells = which(!vapply(consolidated$epochs, is.data.frame, logical(1))) + if (length(bad_epoch_cells) > 0) { + consolidated$epochs[bad_epoch_cells] = replicate( + length(bad_epoch_cells), + empty_epoch_struct, + simplify = FALSE + ) + if (verbose) { + warning( + paste0( + "\nReplaced ", + length(bad_epoch_cells), + " unmatched epoch rows with empty nested epoch tables." + ), + call. = FALSE + ) + } + } + } + + # --------------------------------------------------------------- + # 9. Clean column names for SQL-friendly access + # --------------------------------------------------------------- + names(consolidated) = clean_colnames(names(consolidated)) + + # Deduplicate any identical column names that may have arisen from merges + dupes = duplicated(names(consolidated)) + if (any(dupes)) { + names(consolidated)[dupes] = paste0(names(consolidated)[dupes], "_dup", + seq_len(sum(dupes))) + } + + # --------------------------------------------------------------- + # 10. 
Cast types for Parquet + # --------------------------------------------------------------- + + # calendar_date -> Date + if ("calendar_date" %in% names(consolidated)) { + consolidated$calendar_date = tryCatch( + as.Date(consolidated$calendar_date), + error = function(e) consolidated$calendar_date + ) + } + + # Boolean columns + bool_cols = c("daysleeper", "sleeplog_used", "acc_available", + "file_corrupt", "file_too_short", "use_temperature") + for (col in bool_cols) { + if (col %in% names(consolidated)) { + consolidated[[col]] = as.logical(as.numeric(consolidated[[col]])) + } + } + + # Integer columns (counts, codes, nights, bouts) + int_patterns = c("^n_", "^nbouts_", "^nblocks_", "cleaning_?code", + "night_number", "window_number", "measurement_?day", + "page", "n_hours_ignored") + for (pat in int_patterns) { + matching = grep(pat, names(consolidated), value = TRUE) + for (col in matching) { + if (is.character(consolidated[[col]]) || is.numeric(consolidated[[col]])) { + consolidated[[col]] = tryCatch( + as.integer(consolidated[[col]]), + warning = function(w) consolidated[[col]], + error = function(e) consolidated[[col]] + ) + } + } + } + + # --------------------------------------------------------------- + # 11. Build Parquet key-value metadata from dictionary + # --------------------------------------------------------------- + kv_metadata = list( + ggir_export = "nested_dashboard", + created_at = as.character(Sys.time()) + ) + if (!is.null(dict) && nrow(dict) > 0 && + "Variable" %in% names(dict) && "Definition" %in% names(dict)) { + for (i in seq_len(nrow(dict))) { + clean_name = clean_colnames(dict$Variable[i]) + kv_metadata[[clean_name]] = dict$Definition[i] + } + } + + # --------------------------------------------------------------- + # 11b. 
Embed behavioral codes (class_id <-> class_name mapping) + # --------------------------------------------------------------- + ms5_root = paste0(metadatadir, "/meta/ms5.outraw") + if (dir.exists(ms5_root)) { + bc_files = list.files(ms5_root, pattern = "^behavioralcodes.*\\.csv$", + full.names = TRUE) + if (length(bc_files) > 0) { + bc = tryCatch( + data.table::fread(bc_files[length(bc_files)], data.table = FALSE), + error = function(e) NULL + ) + if (!is.null(bc) && nrow(bc) > 0 && + all(c("class_name", "class_id") %in% names(bc))) { + bc_json = paste0("{", + paste(vapply(seq_len(nrow(bc)), function(i) { + paste0("\"", bc$class_id[i], "\":\"", bc$class_name[i], "\"") + }, character(1)), collapse = ","), + "}") + kv_metadata[["behavioral_codes"]] = bc_json + } + } + } + + # --------------------------------------------------------------- + # 11c. Embed threshold configuration and acc metric + # --------------------------------------------------------------- + threshold_lig = params_phyact[["threshold.lig"]] + threshold_mod = params_phyact[["threshold.mod"]] + threshold_vig = params_phyact[["threshold.vig"]] + if (!is.null(threshold_lig)) { + kv_metadata[["threshold_lig"]] = as.character(threshold_lig) + } + if (!is.null(threshold_mod)) { + kv_metadata[["threshold_mod"]] = as.character(threshold_mod) + } + if (!is.null(threshold_vig)) { + kv_metadata[["threshold_vig"]] = as.character(threshold_vig) + } + if (!is.null(params_phyact[["part6_threshold_combi"]])) { + kv_metadata[["threshold_combi"]] = params_phyact[["part6_threshold_combi"]] + } + if (!is.null(params_general[["acc.metric"]])) { + kv_metadata[["acc_metric"]] = params_general[["acc.metric"]] + } + + # Estimate epoch length from the nested epoch time series + if (!is.null(epoch_by_day) && nrow(epoch_by_day) > 0 && + "epochs" %in% names(epoch_by_day)) { + first_epoch = epoch_by_day$epochs[[1]] + if (!is.null(first_epoch) && is.data.frame(first_epoch) && + nrow(first_epoch) > 1 && "timenum" %in% 
names(first_epoch)) { + diffs = diff(first_epoch$timenum[1:min(nrow(first_epoch), 100)]) + diffs = diffs[is.finite(diffs) & diffs > 0] + if (length(diffs) > 0) { + kv_metadata[["epoch_length_seconds"]] = as.character(stats::median(diffs)) + } + } + } + + # --------------------------------------------------------------- + # 12. Write Parquet file + # --------------------------------------------------------------- + # Name parquet after participant ID when possible. + # If multiple IDs exist, use a stable fallback filename. + parquet_basename = "ggir_results" + if ("id" %in% names(consolidated)) { + ids = unique(as.character(consolidated$id)) + } else if ("ID" %in% names(consolidated)) { + ids = unique(as.character(consolidated$ID)) + } else { + ids = character(0) + } + ids = ids[!is.na(ids) & nzchar(ids)] + if (length(ids) == 1) { + parquet_basename = ids[1] + } else if (length(ids) > 1) { + parquet_basename = "multiple_participants" + if (verbose) { + warning("\nMultiple participant IDs found. Using 'multiple_participants.parquet'.", call. = FALSE) + } + } + parquet_basename = gsub("[^A-Za-z0-9._-]", "_", parquet_basename) + parquet_path = paste0(results_dir, "/", parquet_basename, ".parquet") + + # Convert to Arrow table so we can attach key-value metadata + tbl = arrow::arrow_table(consolidated) + tbl$metadata = kv_metadata + + arrow::write_parquet(tbl, parquet_path) + + if (verbose) { + cat(paste0("\n Parquet file written: ", parquet_path)) + cat(paste0("\n Rows: ", nrow(consolidated), + ", Columns: ", ncol(consolidated), "\n")) + } + + invisible(parquet_path) +} + +build_epoch_table = function(metadatadir = c(), + params_general = c(), + params_phyact = c(), + results_dir = NULL, + verbose = TRUE) { + # Build a flat epoch-level data.frame from ms5.outraw time series. 
+ # + # Returns a list with: + # epoch_df - data.frame with one row per epoch, including ID, filename, + # timenum, ACC, SleepPeriodTime, invalidepoch, class_id, + # guider, window, calendar_date, and weekday. + # config_name - basename of the ms5.outraw configuration used. + # Returns NULL if no usable data is found. + + if (is.null(results_dir) || !length(results_dir)) { + results_dir = paste0(metadatadir, "/results") + } + + ms5_root = paste0(metadatadir, "/meta/ms5.outraw") + if (!dir.exists(ms5_root)) { + if (verbose) warning("\nNo ms5.outraw directory found. Cannot build epoch table.", call. = FALSE) + return(NULL) + } + + # Determine which ms5.outraw configuration folder to use + config_dir = character(0) + if (!is.null(params_phyact[["part6_threshold_combi"]]) && + length(params_phyact[["part6_threshold_combi"]]) > 0 && + nchar(params_phyact[["part6_threshold_combi"]][1]) > 0) { + cfg_name = params_phyact[["part6_threshold_combi"]][1] + cfg_path = paste0(ms5_root, "/", cfg_name) + if (dir.exists(cfg_path)) { + config_dir = cfg_path + } + } + if (!length(config_dir)) { + subdirs = list.dirs(ms5_root, recursive = FALSE, full.names = TRUE) + if (length(subdirs) > 0) { + has_ts = vapply( + subdirs, + function(d) { + any(file.exists(list.files(d, pattern = "\\.(csv|RData)$", full.names = TRUE))) + }, + logical(1) + ) + subdirs = subdirs[has_ts] + } + if (length(subdirs) > 0) { + config_dir = subdirs[1] + if (verbose) { + cat(paste0("\n Using ms5.outraw configuration: ", basename(config_dir))) + } + } + } + if (!length(config_dir) || !dir.exists(config_dir)) { + if (verbose) warning("\nNo ms5.outraw configuration with time series found.", call. 
= FALSE) + return(NULL) + } + + # Prefer CSV over RData when both are available + csv_files = list.files(config_dir, pattern = "[.]csv$", full.names = TRUE) + rda_files = list.files(config_dir, pattern = "[.]RData$", full.names = TRUE) + files = if (length(csv_files) > 0) csv_files else rda_files + if (length(files) == 0) { + if (verbose) warning("\nNo ms5.outraw time series files found in configuration folder.", call. = FALSE) + return(NULL) + } + + # Map filename -> ID from part2_summary.csv if available + id_map = NULL + p2_file = paste0(results_dir, "/part2_summary.csv") + if (file.exists(p2_file)) { + p2 = tryCatch( + data.table::fread(p2_file, data.table = FALSE), + error = function(e) NULL + ) + if (!is.null(p2) && all(c("filename", "ID") %in% names(p2))) { + id_map = unique(p2[, c("filename", "ID")]) + } + } + + desiredtz_global = "" + if (!is.null(params_general[["desiredtz"]])) desiredtz_global = params_general[["desiredtz"]] + + epoch_list = vector("list", length(files)) + + for (i in seq_along(files)) { + f = files[i] + df = NULL + filename_val = NA_character_ + tz_file = desiredtz_global + + if (grepl("[.]csv$", f, ignore.case = TRUE)) { + df = tryCatch( + data.table::fread(f, data.table = FALSE), + error = function(e) { + if (verbose) warning(paste0("\nCould not read ms5 CSV: ", f, " - ", e$message), call. = FALSE) + NULL + } + ) + } else { + env = new.env() + ok = tryCatch( + { + load(f, envir = env) + TRUE + }, + error = function(e) { + if (verbose) warning(paste0("\nCould not load ms5 RData: ", f, " - ", e$message), call. 
= FALSE) + FALSE + } + ) + if (ok && exists("mdat", envir = env, inherits = FALSE)) { + df = get("mdat", envir = env) + } + if (exists("desiredtz_part1", envir = env, inherits = FALSE)) { + tz_candidate = get("desiredtz_part1", envir = env) + if (!is.null(tz_candidate) && nchar(tz_candidate) > 0) { + tz_file = tz_candidate + } + } + if (exists("filename", envir = env, inherits = FALSE)) { + filename_val = as.character(get("filename", envir = env)) + } + } + + if (is.null(df) || nrow(df) == 0) next + if (!("timenum" %in% names(df))) next + + # Derive filename if not present on data + if ("filename" %in% names(df)) { + filename_val = as.character(df$filename[1]) + } else { + if (is.na(filename_val)) { + filename_val = basename(f) + } + df$filename = filename_val + } + + # Attach ID via filename if mapping is available + if (!("ID" %in% names(df)) && !is.null(id_map) && !is.na(filename_val)) { + id_match = id_map[id_map$filename == filename_val, , drop = FALSE] + if (nrow(id_match) > 0) { + df$ID = id_match$ID[1] + } + } + + # Derive calendar_date and weekday from timenum and timezone + if (is.null(tz_file) || nchar(tz_file) == 0) tz_file = "UTC" + ts_posix = tryCatch( + as.POSIXct(df$timenum, origin = "1970-01-01", tz = tz_file), + error = function(e) NULL + ) + if (!is.null(ts_posix)) { + df$calendar_date = as.Date(ts_posix) + df$weekday = weekdays(ts_posix) + } + + epoch_list[[i]] = df + } + + epoch_list = epoch_list[!vapply(epoch_list, is.null, logical(1))] + if (length(epoch_list) == 0) { + if (verbose) warning("\nNo valid epoch-level time series found.", call. 
= FALSE) + return(NULL) + } + + # Enforce consistent column set across recordings (fill missing with NA) + all_cols = unique(unlist(lapply(epoch_list, names))) + epoch_list = lapply(epoch_list, function(df) { + missing = setdiff(all_cols, names(df)) + for (col in missing) df[[col]] = NA + df[, all_cols, drop = FALSE] + }) + + epoch_df = do.call(rbind, epoch_list) + + list(epoch_df = epoch_df, config_name = basename(config_dir)) +} + +write_epoch_parquet = function(metadatadir = c(), + params_output = c(), + params_general = c(), + params_phyact = c(), + verbose = TRUE) { + # Produce an epoch-level Parquet file from the ms5.outraw time series. + # The resulting table is flat with one row per epoch. + + results_dir = paste0(metadatadir, "/results") + if (!dir.exists(results_dir)) { + warning("\nNo results directory found. Skipping epoch Parquet export.", call. = FALSE) + return(invisible(NULL)) + } + + result = build_epoch_table(metadatadir = metadatadir, + params_general = params_general, + params_phyact = params_phyact, + results_dir = results_dir, + verbose = verbose) + if (is.null(result)) { + return(invisible(NULL)) + } + + epoch_df = result$epoch_df + config_name = result$config_name + + # --------------------------------------------------------------- + # Type coercions + # --------------------------------------------------------------- + if ("calendar_date" %in% names(epoch_df)) { + epoch_df$calendar_date = as.Date(epoch_df$calendar_date) + } + + bool_cols = c("SleepPeriodTime", "invalidepoch") + for (col in bool_cols) { + if (col %in% names(epoch_df)) { + epoch_df[[col]] = as.logical(epoch_df[[col]]) + } + } + + int_cols = c("class_id", "window", "guider") + for (col in int_cols) { + if (col %in% names(epoch_df)) { + epoch_df[[col]] = tryCatch( + as.integer(epoch_df[[col]]), + warning = function(w) epoch_df[[col]], + error = function(e) epoch_df[[col]] + ) + } + } + + # --------------------------------------------------------------- + # Build Arrow table and 
attach Parquet key-value metadata + # --------------------------------------------------------------- + parquet_path = paste0(results_dir, "/ggir_epochs.parquet") + + tbl = arrow::arrow_table(epoch_df) + + kv_metadata = list( + ggir_export = "epoch_timeseries", + created_at = as.character(Sys.time()), + ms5_configuration = config_name + ) + + if (!is.null(params_general[["acc.metric"]])) { + kv_metadata[["acc_metric"]] = params_general[["acc.metric"]] + } + + # Estimate epoch length (in seconds) from timenum differences + if ("timenum" %in% names(epoch_df)) { + ord = order(epoch_df$timenum) + max_n = min(length(ord), 1001L) + if (max_n > 1L) { + diffs = diff(epoch_df$timenum[ord][seq_len(max_n)]) + diffs = diffs[is.finite(diffs) & diffs > 0] + if (length(diffs) > 0) { + epoch_len = stats::median(diffs) + kv_metadata[["epoch_length_seconds"]] = as.character(epoch_len) + } + } + } + + tbl$metadata = kv_metadata + + arrow::write_parquet(tbl, parquet_path) + + if (verbose) { + cat(paste0("\n Epoch Parquet file written: ", parquet_path)) + cat(paste0("\n Rows: ", nrow(epoch_df), + ", Columns: ", ncol(epoch_df), "\n")) + } + + invisible(parquet_path) +} + +build_epoch_lists_by_day = function(metadatadir = c(), + params_general = c(), + params_phyact = c(), + results_dir = c(), + verbose = TRUE) { + # Helper to construct a per-day list-column of epoch-level time series + # from ms5.outraw time series. Returns a data.frame with columns: + # ID, calendar_date, epochs (list of data.frames). + + if (!length(results_dir)) { + results_dir = paste0(metadatadir, "/results") + } + ms5_root = paste0(metadatadir, "/meta/ms5.outraw") + if (!dir.exists(ms5_root)) { + if (verbose == TRUE) { + warning("\nNo ms5.outraw directory found. Cannot build epoch lists.", call. 
= FALSE) + } + return(NULL) + } + + # Determine which ms5.outraw configuration folder to use + config_dir = character(0) + if (!is.null(params_phyact[["part6_threshold_combi"]]) && + length(params_phyact[["part6_threshold_combi"]]) > 0 && + nchar(params_phyact[["part6_threshold_combi"]][1]) > 0) { + cfg_name = params_phyact[["part6_threshold_combi"]][1] + cfg_path = paste0(ms5_root, "/", cfg_name) + if (dir.exists(cfg_path)) { + config_dir = cfg_path + } + } + if (!length(config_dir)) { + subdirs = list.dirs(ms5_root, recursive = FALSE, full.names = TRUE) + if (length(subdirs) > 0) { + has_ts = vapply( + subdirs, + function(d) { + any(file.exists(list.files(d, pattern = "\\.(csv|RData)$", full.names = TRUE))) + }, + logical(1) + ) + subdirs = subdirs[has_ts] + } + if (length(subdirs) > 0) { + config_dir = subdirs[1] + if (verbose == TRUE) { + cat(paste0("\n Using ms5.outraw configuration for nesting: ", basename(config_dir))) + } + } + } + if (!length(config_dir) || !dir.exists(config_dir)) { + if (verbose == TRUE) { + warning("\nNo ms5.outraw configuration with time series found. Cannot build epoch lists.", call. = FALSE) + } + return(NULL) + } + + # Prefer CSV over RData when both are available + csv_files = list.files(config_dir, pattern = "[.]csv$", full.names = TRUE) + rda_files = list.files(config_dir, pattern = "[.]RData$", full.names = TRUE) + files = if (length(csv_files) > 0) csv_files else rda_files + if (length(files) == 0) { + if (verbose == TRUE) { + warning("\nNo ms5.outraw time series files found in configuration folder. Cannot build epoch lists.", call. 
= FALSE) + } + return(NULL) + } + + # Map filename -> ID from part2_summary.csv if available + id_map = NULL + normalize_filename = function(x) { + x = tolower(as.character(x)) + x = gsub("\\\\", "/", x) + basename(x) + } + p2_file = paste0(results_dir, "/part2_summary.csv") + if (file.exists(p2_file)) { + p2 = tryCatch( + data.table::fread(p2_file, data.table = FALSE), + error = function(e) NULL + ) + if (!is.null(p2) && all(c("filename", "ID") %in% names(p2))) { + id_map = unique(p2[, c("filename", "ID")]) + id_map$filename_norm = normalize_filename(id_map$filename) + } + } + + desiredtz_global = "" + if (!is.null(params_general[["desiredtz"]])) desiredtz_global = params_general[["desiredtz"]] + + epoch_list = vector("list", length(files)) + + for (i in seq_along(files)) { + f = files[i] + df = NULL + filename_val = NA_character_ + tz_file = desiredtz_global + + if (grepl("[.]csv$", f, ignore.case = TRUE)) { + df = tryCatch( + data.table::fread(f, data.table = FALSE), + error = function(e) { + if (verbose == TRUE) warning(paste0("\nCould not read ms5 CSV: ", f, " - ", e$message), call. = FALSE) + NULL + } + ) + } else { + env = new.env() + ok = tryCatch( + { + load(f, envir = env) + TRUE + }, + error = function(e) { + if (verbose == TRUE) warning(paste0("\nCould not load ms5 RData: ", f, " - ", e$message), call. 
= FALSE) + FALSE + } + ) + if (ok && exists("mdat", envir = env, inherits = FALSE)) { + df = get("mdat", envir = env) + } + if (exists("desiredtz_part1", envir = env, inherits = FALSE)) { + tz_candidate = get("desiredtz_part1", envir = env) + if (!is.null(tz_candidate) && nchar(tz_candidate) > 0) { + tz_file = tz_candidate + } + } + if (exists("filename", envir = env, inherits = FALSE)) { + filename_val = as.character(get("filename", envir = env)) + } + } + + if (is.null(df) || nrow(df) == 0) next + if (!("timenum" %in% names(df))) next + + # Ensure filename column + if ("filename" %in% names(df)) { + filename_val = as.character(df$filename[1]) + } else { + if (is.na(filename_val)) { + filename_val = basename(f) + } + df$filename = filename_val + } + + # Attach ID via filename if mapping is available + if (!("ID" %in% names(df)) && !is.null(id_map) && !is.na(filename_val)) { + filename_norm = normalize_filename(filename_val) + id_match = id_map[id_map$filename_norm == filename_norm, , drop = FALSE] + if (nrow(id_match) > 0) { + df$ID = id_match$ID[1] + } + } + + # Derive calendar_date from timenum and timezone + if (is.null(tz_file) || nchar(tz_file) == 0) tz_file = "UTC" + ts_posix = tryCatch( + as.POSIXct(df$timenum, origin = "1970-01-01", tz = tz_file), + error = function(e) NULL + ) + if (!is.null(ts_posix)) { + df$calendar_date = as.Date(ts_posix) + } + + epoch_list[[i]] = df + } + + epoch_list = epoch_list[!vapply(epoch_list, is.null, logical(1))] + if (length(epoch_list) == 0) { + if (verbose == TRUE) { + warning("\nNo valid epoch-level time series found for nesting.", call. = FALSE) + } + return(NULL) + } + + epoch_df = do.call(rbind, epoch_list) + + # Ensure join keys and ordering columns exist + if (!all(c("ID", "calendar_date", "timenum") %in% names(epoch_df))) { + if (verbose == TRUE) { + warning("\nEpoch data lacks ID, calendar_date or timenum. Cannot build nested epochs.", call. 
= FALSE) + } + return(NULL) + } + + # Decide optional fields globally + has_acc = "ACC" %in% names(epoch_df) + has_anglez = "anglez" %in% names(epoch_df) + has_lux = "lux" %in% names(epoch_df) + has_temperature = "temperature" %in% names(epoch_df) + has_steps = "steps" %in% names(epoch_df) + has_sibdetection = "sibdetection" %in% names(epoch_df) + has_guider = "guider" %in% names(epoch_df) + + keys = unique(epoch_df[, c("ID", "calendar_date")]) + keys = keys[order(keys$ID, keys$calendar_date), , drop = FALSE] + epochs_col = vector("list", nrow(keys)) + + for (i in seq_len(nrow(keys))) { + sel = which(epoch_df$ID == keys$ID[i] & + epoch_df$calendar_date == keys$calendar_date[i]) + sub = epoch_df[sel, , drop = FALSE] + if (nrow(sub) == 0) { + epochs_col[[i]] = data.frame() + next + } + sub = sub[order(sub$timenum), , drop = FALSE] + + acc_col = if (has_acc) as.numeric(sub$ACC) else rep(NA_real_, nrow(sub)) + class_col = if ("class_id" %in% names(sub)) as.integer(sub$class_id) else rep(NA_integer_, nrow(sub)) + spt_col = if ("SleepPeriodTime" %in% names(sub)) as.logical(sub$SleepPeriodTime) else rep(NA, nrow(sub)) + invalid_col = if ("invalidepoch" %in% names(sub)) as.logical(sub$invalidepoch) else rep(NA, nrow(sub)) + window_col = if ("window" %in% names(sub)) as.integer(sub$window) else rep(NA_integer_, nrow(sub)) + + struct_df = data.frame( + timenum = as.numeric(sub$timenum), + acc = acc_col, + class_id = class_col, + spt = spt_col, + invalid = invalid_col, + window = window_col + ) + + if (has_anglez) { + anglez_col = if ("anglez" %in% names(sub)) as.numeric(sub$anglez) else rep(NA_real_, nrow(sub)) + struct_df$anglez = anglez_col + } + if (has_lux) { + lux_col = if ("lux" %in% names(sub)) as.numeric(sub$lux) else rep(NA_real_, nrow(sub)) + struct_df$lux = lux_col + } + if (has_temperature) { + temp_col = if ("temperature" %in% names(sub)) as.numeric(sub$temperature) else rep(NA_real_, nrow(sub)) + struct_df$temperature = temp_col + } + if (has_steps) { + 
steps_col = if ("steps" %in% names(sub)) as.integer(sub$steps) else rep(NA_integer_, nrow(sub)) + struct_df$steps = steps_col + } + if (has_sibdetection) { + sib_col = if ("sibdetection" %in% names(sub)) as.integer(sub$sibdetection) else rep(NA_integer_, nrow(sub)) + struct_df$sibdetection = sib_col + } + if (has_guider) { + guider_col = if ("guider" %in% names(sub)) as.integer(sub$guider) else rep(NA_integer_, nrow(sub)) + struct_df$guider = guider_col + } + + epochs_col[[i]] = struct_df + } + + out = keys + out$epochs = epochs_col + out +} diff --git a/inst/CITATION b/inst/CITATION index ab9893990..6be643918 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -27,6 +27,9 @@ bibentry(bibtype = "Manual", person("Victor","Barreto Mesquita", role="ctb", email="victormesquita40@hotmail.com"), person("Gaia","Segantin",role="ctb"), + person("Samuel Timileyin","Afolabi", role="ctb", + email="samuelafolabimails@gmail.com", + comment = c(ORCID = "0009-0002-7325-6405")), person("Medical Research Council UK", role = c("cph", "fnd")), person("Accelting", role = c("cph", "fnd")), person("French National Research Agency", role = c("cph", "fnd"))), diff --git a/man/GGIR.Rd b/man/GGIR.Rd index a5eb46156..c24a87adc 100644 --- a/man/GGIR.Rd +++ b/man/GGIR.Rd @@ -1941,6 +1941,20 @@ GGIR(mode = 1:5, output. The variables are intended for methodological research only and are by default turned off. } + + \item{save_dashboard_parquet}{ + Boolean (default = FALSE). + If TRUE, GGIR will export a consolidated Parquet file named + \file{ggir_results.parquet} to the \file{results/} subfolder of the + output directory after all requested GGIR parts have completed. + The file merges the Part 5 day summary, Part 4 night summary, + Part 2 day and person summaries, and the data quality report into + a single row-per-day table, and embeds per-day epoch-level time series + as a nested list-column (\code{epochs}). 
+ Key-value Parquet metadata is attached to record the variable dictionary, + activity threshold configuration, and accelerometer metric used. + This output is intended for use with DuckDB-WASM powered dashboards. + Requires the \pkg{arrow} package to be installed.} } } } diff --git a/man/build_epoch_table.Rd b/man/build_epoch_table.Rd new file mode 100644 index 000000000..43342d7b3 --- /dev/null +++ b/man/build_epoch_table.Rd @@ -0,0 +1,84 @@ +\name{build_epoch_table} +\alias{build_epoch_table} +\title{ + Build a flat epoch table from ms5.outraw time series +} +\description{ + Reads epoch-level files from \code{meta/ms5.outraw} and constructs a single + flat data frame with one row per epoch. +} +\usage{ +build_epoch_table( + metadatadir = c(), + params_general = c(), + params_phyact = c(), + results_dir = NULL, + verbose = TRUE +) +} +\arguments{ + \item{metadatadir}{ + Directory that holds a folder 'meta' and inside this a folder 'basic' + which contains the milestone data produced by \link{g.part1}. The folder structure + is normally created by \link{g.part1} and \link{GGIR} will recognise what + the value of metadatadir is. + } + \item{params_general}{ + See details in \link{GGIR}. + } + \item{params_phyact}{ + See details in \link{GGIR}. + } + \item{results_dir}{ + Optional character scalar. Path to \code{results}. If \code{NULL}, it is + derived from \code{metadatadir}. + } + \item{verbose}{ + See details in \link{GGIR}. + } +} +\details{ + The function: + \enumerate{ + \item Selects an ms5 configuration folder (prefer requested threshold + combination if available). + \item Reads time-series files (CSV preferred over RData). + \item Infers missing \code{filename} and maps \code{filename -> ID} + from \code{results/part2_summary.csv} when needed. + \item Derives \code{calendar_date} and \code{weekday} from epoch + \code{timenum} using timezone information. + \item Harmonizes column sets across recordings by filling missing columns + with \code{NA}.
+ } +} +\value{ + Either \code{NULL} (if no usable epoch data were found), or a list with: + \itemize{ + \item \code{epoch_df}: data frame with one row per epoch. + \item \code{config_name}: selected ms5 configuration folder name. + } +} +\seealso{ + \code{\link{write_epoch_parquet}}, + \code{\link{write_dashboard_parquet}} +} +\examples{ +\dontrun{ +res <- build_epoch_table( + metadatadir = "path/to/output_run/output_test_run", + params_general = list(desiredtz = "UTC"), + params_phyact = list(part6_threshold_combi = "WW_L40M100V400_T5A5"), + verbose = TRUE +) + +if (!is.null(res)) { + names(res) + nrow(res$epoch_df) + res$config_name +} +} +} +\author{ + Samuel Timileyin Afolabi +} +\keyword{internal} diff --git a/man/write_dashboard_parquet.Rd b/man/write_dashboard_parquet.Rd new file mode 100644 index 000000000..b6988d233 --- /dev/null +++ b/man/write_dashboard_parquet.Rd @@ -0,0 +1,89 @@ +\name{write_dashboard_parquet} +\alias{write_dashboard_parquet} +\title{ + Write consolidated dashboard Parquet output +} +\description{ + Creates a single dashboard-ready Parquet file by combining GGIR summary outputs + and attaching nested epoch-level time series by day. +} +\usage{ +write_dashboard_parquet( + metadatadir = c(), + params_output = c(), + params_general = c(), + params_phyact = c(), + verbose = TRUE +) +} +\arguments{ + \item{metadatadir}{ + Directory that holds a folder 'meta' and inside this a folder 'basic' + which contains the milestone data produced by \link{g.part1}. The folder structure + is normally created by \link{g.part1} and \link{GGIR} will recognise what + the value of metadatadir is. + } + \item{params_output}{ + See details in \link{GGIR}. + } + \item{params_general}{ + See details in \link{GGIR}. + } + \item{params_phyact}{ + See details in \link{GGIR}. + } + \item{verbose}{ + See details in \link{GGIR}. 
+ } +} +\details{ + The function reads already-generated CSV reports, starts from Part 5 day summary + as the base table, and left-joins additional fields from: + \itemize{ + \item Part 4 night summary + \item Part 2 day summary + \item Part 2 participant summary + \item quality-control report + } + + It then calls \code{build_epoch_lists_by_day} to construct a nested + \code{epochs} list-column and merges that into the day-level table by + \code{ID} and \code{calendar_date}. + + Column names are cleaned for SQL-friendly use, common columns are type-cast, + and Parquet key-value metadata is embedded (variable definitions, thresholds, + behavioral-code mapping, acceleration metric, and estimated epoch length). + + Output filename is ID-based for single-participant exports and falls back to a + multi-participant name otherwise. + + The generated Parquet file can be opened directly in the GGIR Web Dashboard + (\url{https://samuelafolabi.github.io/ggir-web-dashboard/}), where processing + happens entirely in the browser. +} +\value{ + Invisibly returns the written Parquet file path, or \code{NULL} (invisibly) if + export is skipped because required data are missing.
The generated Parquet file + can be opened directly in the GGIR Web Dashboard: + \url{https://samuelafolabi.github.io/ggir-web-dashboard/} +} +\seealso{ + \code{\link{GGIR}}, + \code{\link{write_epoch_parquet}}, + \code{\link{build_epoch_table}} +} +\examples{ +\dontrun{ +write_dashboard_parquet( + metadatadir = "path/to/output_run/output_test_run", + params_output = list(save_dashboard_parquet = TRUE), + params_general = list(desiredtz = "UTC", acc.metric = "ENMO"), + params_phyact = list(part6_threshold_combi = "WW_L40M100V400_T5A5"), + verbose = TRUE +) +} +} +\author{ + Samuel Timileyin Afolabi +} +\keyword{IO} diff --git a/man/write_epoch_parquet.Rd b/man/write_epoch_parquet.Rd new file mode 100644 index 000000000..6a1dec027 --- /dev/null +++ b/man/write_epoch_parquet.Rd @@ -0,0 +1,75 @@ +\name{write_epoch_parquet} +\alias{write_epoch_parquet} +\title{ + Write flat epoch-level Parquet output +} +\description{ + Builds a flat epoch-level table from \code{meta/ms5.outraw} time-series data + and writes it as a Parquet file. This is a legacy/compatibility export pathway; + the preferred dashboard export is \code{\link{write_dashboard_parquet}}. +} +\usage{ +write_epoch_parquet( + metadatadir = c(), + params_output = c(), + params_general = c(), + params_phyact = c(), + verbose = TRUE +) +} +\arguments{ + \item{metadatadir}{ + Directory that holds a folder 'meta' and inside this a folder 'basic' + which contains the milestone data produced by \link{g.part1}. The folder structure + is normally created by \link{g.part1} and \link{GGIR} will recognise what + the value of metadatadir is. + } + \item{params_output}{ + See details in \link{GGIR}. + } + \item{params_general}{ + See details in \link{GGIR}. + } + \item{params_phyact}{ + See details in \link{GGIR}. + } + \item{verbose}{ + See details in \link{GGIR}. 
+ } +} +\details{ + This function uses \code{\link{build_epoch_table}} to read and harmonize + epoch-level data (one row per epoch), then applies type coercions and writes + \code{results/ggir_epochs.parquet}. + + Metadata stored in the Parquet footer includes export type, creation timestamp, + selected ms5 configuration name, acceleration metric, and estimated epoch + length in seconds. + + This export path is mainly useful for workflows that prefer a separate flat + epoch table rather than nested epochs inside the dashboard parquet. +} +\value{ + Invisibly returns the written Parquet file path, or \code{NULL} (invisibly) + when export is skipped. +} +\seealso{ + \code{\link{GGIR}}, + \code{\link{build_epoch_table}}, + \code{\link{write_dashboard_parquet}} +} +\examples{ +\dontrun{ +write_epoch_parquet( + metadatadir = "path/to/output_run/output_test_run", + params_output = list(), + params_general = list(desiredtz = "UTC", acc.metric = "ENMO"), + params_phyact = list(part6_threshold_combi = "WW_L40M100V400_T5A5"), + verbose = TRUE +) +} +} +\author{ + Samuel Timileyin Afolabi +} +\keyword{internal} diff --git a/tests/testthat/test_load_check_params.R b/tests/testthat/test_load_check_params.R index 65e231372..098393309 100644 --- a/tests/testthat/test_load_check_params.R +++ b/tests/testthat/test_load_check_params.R @@ -17,7 +17,7 @@ test_that("load_params can load parameters", { expect_equal(length(params$params_247), 25) expect_equal(length(params$params_cleaning), 28) expect_equal(length(params$params_phyact), 14) - expect_equal(length(params$params_output), 28) + expect_equal(length(params$params_output), 29) expect_equal(length(params$params_general), 22) params_sleep = params$params_sleep diff --git a/tests/testthat/test_write_parquet.R b/tests/testthat/test_write_parquet.R new file mode 100644 index 000000000..43299e07a --- /dev/null +++ b/tests/testthat/test_write_parquet.R @@ -0,0 +1,191 @@ +library(GGIR) +context("write_dashboard_parquet") + 
+make_fake_metadatadir = function(include_optional = TRUE) { + td = tempfile("ggir_parquet_test_") + dir.create(file.path(td, "results", "QC"), recursive = TRUE) + + # --- Part 5 day summary (required) --- + p5 = data.frame( + ID = c("P01", "P01", "P02"), + filename = c("P01.csv", "P01.csv", "P02.csv"), + calendar_date = c("2024-01-01", "2024-01-02", "2024-01-01"), + ENMO_mean = c(0.032, 0.041, 0.028), + stringsAsFactors = FALSE + ) + write.csv(p5, + file.path(td, "results", "QC", + "part5_daysummary_full_MM_L40M100V400_T5A5.csv"), + row.names = FALSE) + + if (include_optional) { + # --- Part 4 night summary (optional) --- + p4 = data.frame( + ID = c("P01", "P02"), + calendar_date = c("2024-01-01", "2024-01-01"), + sleeponset_ts = c("2024-01-01 23:00:00", "2024-01-01 22:30:00"), + stringsAsFactors = FALSE + ) + write.csv(p4, + file.path(td, "results", + "part4_nightsummary_sleep_cleaned.csv"), + row.names = FALSE) + + # --- Part 2 day summary (optional) --- + p2day = data.frame( + ID = c("P01", "P01", "P02"), + calendar_date = c("2024-01-01", "2024-01-02", "2024-01-01"), + L5 = c(0.010, 0.012, 0.009), + stringsAsFactors = FALSE + ) + write.csv(p2day, + file.path(td, "results", "part2_daysummary.csv"), + row.names = FALSE) + + # --- Part 2 person summary (optional) --- + p2 = data.frame( + ID = c("P01", "P02"), + filename = c("P01.csv", "P02.csv"), + n_valid_days = c(2L, 1L), + stringsAsFactors = FALSE + ) + write.csv(p2, + file.path(td, "results", "part2_summary.csv"), + row.names = FALSE) + + # --- Data quality report (optional, joined by filename) --- + qc = data.frame( + filename = c("P01.csv", "P02.csv"), + qc_flag = c(0L, 1L), + stringsAsFactors = FALSE + ) + write.csv(qc, + file.path(td, "results", "QC", "data_quality_report.csv"), + row.names = FALSE) + } + + td +} + +# Minimal params objects (only fields actually used by the function) +fake_params_output = list(save_dashboard_parquet = TRUE) +fake_params_general = list(desiredtz = "UTC", acc.metric = 
"ENMO") +fake_params_phyact = list(threshold.lig = 40, threshold.mod = 100, + threshold.vig = 400, + part6_threshold_combi = NULL) + +# --------------------------------------------------------------------------- +# Test 1: happy path – Parquet file is created with the right shape +# --------------------------------------------------------------------------- +test_that("write_dashboard_parquet creates a Parquet file with correct rows and columns", { + skip_on_cran() + skip_if_not_installed("arrow") + + td = make_fake_metadatadir(include_optional = TRUE) + on.exit(unlink(td, recursive = TRUE), add = TRUE) + + out = write_dashboard_parquet( + metadatadir = td, + params_output = fake_params_output, + params_general = fake_params_general, + params_phyact = fake_params_phyact, + verbose = FALSE + ) + + parquet_path = file.path(td, "results", "ggir_results.parquet") + + # File must exist and the function must return its path + expect_true(file.exists(parquet_path)) + expect_equal(out, parquet_path) + + # Read back and verify shape + result = arrow::read_parquet(parquet_path) + + # Should have one row per Part 5 day-summary row (3 rows in our mock) + expect_equal(nrow(result), 3) + + # Key columns from Part 5 must be present (after clean_colnames lowercasing) + expect_true("id" %in% names(result)) + expect_true("calendar_date" %in% names(result)) + expect_true("enmo_mean" %in% names(result)) + + # Optional Part 4 column should have been joined in + expect_true("sleeponset_ts" %in% names(result)) + + # Optional Part 2 day-summary column should have been joined in + expect_true("l5" %in% names(result)) +}) + +# --------------------------------------------------------------------------- +# Test 2: missing optional CSVs – function still succeeds (graceful degradation) +# --------------------------------------------------------------------------- +test_that("write_dashboard_parquet succeeds when optional CSVs are absent", { + skip_on_cran() + skip_if_not_installed("arrow") + + 
td = make_fake_metadatadir(include_optional = FALSE) + on.exit(unlink(td, recursive = TRUE), add = TRUE) + + # Should produce no warnings about missing files crashing the export + out = write_dashboard_parquet( + metadatadir = td, + params_output = fake_params_output, + params_general = fake_params_general, + params_phyact = fake_params_phyact, + verbose = FALSE + ) + + parquet_path = file.path(td, "results", "ggir_results.parquet") + expect_true(file.exists(parquet_path)) + + result = arrow::read_parquet(parquet_path) + expect_equal(nrow(result), 3) # still 3 rows from Part 5 +}) + +# --------------------------------------------------------------------------- +# Test 3: no results directory – function warns and returns NULL +# --------------------------------------------------------------------------- +test_that("write_dashboard_parquet returns NULL with a warning when results dir is missing", { + skip_on_cran() + skip_if_not_installed("arrow") + + td = tempfile("ggir_empty_") + dir.create(td) + on.exit(unlink(td, recursive = TRUE), add = TRUE) + + expect_warning( + out <- write_dashboard_parquet( + metadatadir = td, + params_output = fake_params_output, + params_general = fake_params_general, + params_phyact = fake_params_phyact, + verbose = FALSE + ), + regexp = "No results directory" + ) + expect_null(out) +}) + +# --------------------------------------------------------------------------- +# Test 4: no Part 5 CSVs present – function warns and returns NULL +# --------------------------------------------------------------------------- +test_that("write_dashboard_parquet returns NULL with a warning when Part 5 CSVs are missing", { + skip_on_cran() + skip_if_not_installed("arrow") + + td = tempfile("ggir_nop5_") + dir.create(file.path(td, "results", "QC"), recursive = TRUE) + on.exit(unlink(td, recursive = TRUE), add = TRUE) + + expect_warning( + out <- write_dashboard_parquet( + metadatadir = td, + params_output = fake_params_output, + params_general = 
fake_params_general, + params_phyact = fake_params_phyact, + verbose = FALSE + ), + regexp = "No Part 5 day summary" + ) + expect_null(out) +}) diff --git a/vignettes/GGIRParameters.Rmd b/vignettes/GGIRParameters.Rmd index 0cb5b63aa..24d0860e7 100644 --- a/vignettes/GGIRParameters.Rmd +++ b/vignettes/GGIRParameters.Rmd @@ -255,6 +255,7 @@ find a description and default value for all the arguments. | dofirstpage | visualreport | params_output | | visualreport | visualreport | params_output | | viewingwindow | visualreport | params_output | +| save_dashboard_parquet | end of GGIR run | params_output | # Arguments/parameters description {#default-argument-values}