diff --git a/NAMESPACE b/NAMESPACE index 484df87..2d7a0f8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,11 +6,13 @@ S3method(glance,dbreg) S3method(plot,dbbinsreg) S3method(predict,dbreg) S3method(print,dbbinsreg) +S3method(print,dbivreg) S3method(print,dbreg) S3method(tidy,dbreg) S3method(tinyplot,dbbinsreg) S3method(vcov,dbreg) export(dbbinsreg) +export(dbivreg) export(dbreg) export(glance) export(gof) diff --git a/R/dbivreg.R b/R/dbivreg.R new file mode 100644 index 0000000..0983bf5 --- /dev/null +++ b/R/dbivreg.R @@ -0,0 +1,1940 @@ +#' Instrumental-variables regression on a database backend +#' +#' @md +#' @description +#' Estimates a linear IV model via exact sufficient statistics on a database +#' backend. The current implementation supports three exact strategies: +#' +#' - `strategy = "moments"` for models without absorbed fixed effects. +#' - `strategy = "compress"` for repeated no-FE design rows. +#' - `strategy = "demean"` for one or more absorbed fixed effects. +#' +#' Multi-way fixed effects use exact alternating projections when there are +#' more than two fixed effects, or when two-way fixed effects are unbalanced or +#' weights are supplied. +#' +#' @param fml A fixest-style IV formula. Use `y ~ x1 + x2 | d ~ z` for models +#' without absorbed fixed effects, and `y ~ x1 + x2 | fe1 + fe2 + fe3 | d ~ z` +#' when fixed effects are present. The IV block must always be the final +#' pipe-separated block. +#' @param conn Database connection, or `NULL` to create an ephemeral DuckDB +#' connection. +#' @param table,data,path Mutually exclusive data source arguments with +#' precedence `table > data > path`. +#' @param weights Optional character string naming an observation-weight column. +#' @param vcov Character string or one-sided clustering formula. Character +#' options are `"iid"` and `"hc1"`. +#' @param strategy Character string selecting the IV backend. Supported values +#' are `"auto"` (default), `"moments"`, `"demean"` (alias `"within"`), +#' `"compress"`, and `"mundlak"`. The latter currently errors because it is +#' not yet implemented for IV. +#' @param cluster Optional clustering variable, supplied as a one-sided formula +#' or character string. +#' @param sql_only Logical indicating whether to return only the underlying SQL +#' query. +#' @param data_only Logical indicating whether to return only the aggregated +#' sufficient statistics. +#' @param drop_missings Logical indicating whether incomplete cases should be +#' dropped. +#' @param verbose Logical indicating whether progress messages are printed. +#' @param ... Additional unused arguments. +#' +#' @return A list of class `"dbivreg"` and `"dbreg"`. +#' @export +dbivreg = function( + fml, + conn = NULL, + table = NULL, + data = NULL, + path = NULL, + weights = NULL, + vcov = c("iid", "hc1"), + strategy = c("auto", "moments", "demean", "within", "compress", "mundlak"), + cluster = NULL, + sql_only = FALSE, + data_only = FALSE, + drop_missings = TRUE, + verbose = getOption("dbreg.verbose", FALSE), + ... +) { + verbose = isTRUE(verbose) + strategy = match.arg(strategy) + if (strategy == "within") { + strategy = "demean" + } + dots = list(...) + if (length(dots)) { + dot_names = names(dots) + if (is.null(dot_names)) { + dot_names = rep("", length(dots)) + } + legacy_args = intersect(dot_names, c("endog", "instruments")) + if (length(legacy_args) > 0) { + stop( + "`endog` and `instruments` are no longer supported. ", + "Use fixest-style IV formulas like `y ~ x | d ~ z` or `y ~ x | fe1 + fe2 | d ~ z`." + ) + } + stop( + "Unused arguments: ", + paste(ifelse(nzchar(dot_names), paste0("`", dot_names, "`"), ""), collapse = ", ") + ) + } + + vcov_parsed = parse_vcov_args(vcov, cluster, valid_types = c("iid", "hc1")) + inputs = process_dbivreg_inputs( + fml = fml, + conn = conn, + table = table, + data = data, + path = path, + weights = weights, + vcov = vcov_parsed[["vcov_type"]], + cluster = vcov_parsed[["cluster_var"]], + strategy = strategy, + sql_only = sql_only, + data_only = data_only, + drop_missings = drop_missings, + verbose = verbose + ) + + cleanup_conn = function() { + if (isTRUE(inputs[["own_conn"]]) && dbIsValid(inputs[["conn"]])) { + dbDisconnect(inputs[["conn"]], shutdown = TRUE) + } + } + on.exit(cleanup_conn(), add = TRUE) + + cleanup_registered = function() { + if (!is.null(inputs[["registered_table"]]) && dbIsValid(inputs[["conn"]])) { + try(duckdb_unregister(inputs[["conn"]], inputs[["registered_table"]]), silent = TRUE) + } + } + on.exit(cleanup_registered(), add = TRUE) + + chosen_strategy = choose_dbivreg_strategy(inputs) + + result = switch( + chosen_strategy, + "moments" = execute_iv_moments_strategy(inputs), + "compress" = execute_iv_compress_strategy(inputs), + "demean" = execute_iv_demean_strategy(inputs), + stop("Unknown IV strategy: ", chosen_strategy) + ) + + if (inputs[["sql_only"]]) { + cat(result) + return(invisible(result)) + } + if (inputs[["data_only"]]) { + return(result) + } + + result[["strategy"]] = chosen_strategy + class(result) = c("dbivreg", "dbreg") + result +} + +#' Process and validate dbivreg inputs +#' +#' Returns an environment so downstream strategy helpers follow the post-#65 +#' dbreg convention of reading shared state with `inputs[["..."]]`. +#' +#' @keywords internal +process_dbivreg_inputs = function( + fml, + conn, + table, + data, + path, + weights, + vcov, + cluster, + strategy, + sql_only, + data_only, + drop_missings, + verbose +) { + vcov_type_req = vcov + cluster_var = cluster + + parsed_fml = dbivreg_parse_formula(fml) + yvar = parsed_fml[["yvar"]] + fe = parsed_fml[["fe"]] + + db_setup = setup_db_connection(conn, table, data, path, caller = "dbivreg") + conn = db_setup[["conn"]] + own_conn = db_setup[["own_conn"]] + from_statement = db_setup[["from_statement"]] + registered_table = db_setup[["registered_table"]] + + if (!is.null(weights)) { + if (!is.character(weights) || length(weights) != 1) { + stop("`weights` must be a single character string (column name) or NULL.") + } + if (!is.null(data)) { + if (!weights %in% names(data)) { + stop("Weight column '", weights, "' not found in data.") + } + wv = data[[weights]] + if (any(!is.finite(wv) & !is.na(wv))) { + stop("Weights must be finite.") + } + if (any(wv < 0, na.rm = TRUE)) { + stop("Weights must be non-negative.") + } + } else { + neg_sql = glue( + "SELECT 1 FROM (SELECT * {from_statement}) t WHERE {weights} < 0 LIMIT 1" + ) + has_neg = tryCatch(nrow(dbGetQuery(conn, neg_sql)) > 0, error = function(e) FALSE) + if (isTRUE(has_neg)) { + stop("Weights must be non-negative.") + } + } + } + + if (isTRUE(drop_missings) || !is.null(weights)) { + if (grepl("WHERE|LIMIT|ORDER\\s+BY|GROUP\\s+BY|HAVING", from_statement, ignore.case = TRUE)) { + from_statement = glue("FROM (SELECT * {from_statement}) AS subq") + } + where_clauses = character(0) + if (isTRUE(drop_missings)) { + miss_vars = unique(c( + yvar, + parsed_fml[["exog_vars"]], + parsed_fml[["endog_vars"]], + parsed_fml[["instr_vars"]], + fe, + cluster_var + )) + miss_vars = miss_vars[!is.na(miss_vars) & nzchar(miss_vars)] + where_clauses = c(where_clauses, paste0(miss_vars, " IS NOT NULL")) + } + if (!is.null(weights)) { + where_clauses = c(where_clauses, paste0(weights, " > 0")) + } + if (length(where_clauses) > 0) { + from_statement = glue(" + {from_statement} + WHERE {paste(where_clauses, collapse = ' AND ')} + ") + } + } + + design_source = sub("^FROM\\s+", "", from_statement, ignore.case = TRUE) + exog_design = dbivreg_expand_block(parsed_fml[["exog_rhs"]], conn, design_source, fe) + endog_design = dbivreg_expand_block(parsed_fml[["endog_rhs"]], conn, design_source, fe) + instr_design = dbivreg_expand_block(parsed_fml[["instr_rhs"]], conn, design_source, fe) + + if (!length(endog_design[["names"]])) { + stop("At least one endogenous regressor is required in the IV block.") + } + if (!length(instr_design[["names"]])) { + stop("At least one excluded instrument is required in the IV block.") + } + + overlap_x_endog = intersect(exog_design[["names"]], endog_design[["names"]]) + if (length(overlap_x_endog) > 0) { + stop( + "Endogenous regressors cannot also appear as included exogenous regressors. ", + "Overlapping term(s): ", + paste(standardize_coef_names(overlap_x_endog), collapse = ", ") + ) + } + overlap_endog_iv = intersect(endog_design[["names"]], instr_design[["names"]]) + if (length(overlap_endog_iv) > 0) { + stop( + "Excluded instruments cannot also appear as endogenous regressors. ", + "Overlapping term(s): ", + paste(standardize_coef_names(overlap_endog_iv), collapse = ", ") + ) + } + overlap_exog_iv = intersect(exog_design[["names"]], instr_design[["names"]]) + if (length(overlap_exog_iv) > 0) { + warning( + "Included exogenous regressors are already valid instruments. ", + "Dropping duplicate entries from the excluded-instrument block: ", + paste(standardize_coef_names(overlap_exog_iv), collapse = ", ") + ) + keep_iv = !instr_design[["names"]] %in% overlap_exog_iv + instr_design[["sql"]] = instr_design[["sql"]][keep_iv] + instr_design[["names"]] = instr_design[["names"]][keep_iv] + } + if (length(instr_design[["names"]]) < length(endog_design[["names"]])) { + stop("Need at least as many excluded instrument columns as endogenous regressor columns.") + } + + x_all_sql = c(exog_design[["sql"]], endog_design[["sql"]]) + x_all_names = c(exog_design[["names"]], endog_design[["names"]]) + z_all_names = c(exog_design[["names"]], instr_design[["names"]]) + + union_map = c( + stats::setNames(x_all_sql, x_all_names), + stats::setNames(instr_design[["sql"]], instr_design[["names"]]) + ) + union_map = union_map[!duplicated(names(union_map))] + union_names = names(union_map) + union_sql = unname(union_map) + + list2env(list( + fml = fml, + structural_fml = dbivreg_structural_formula( + yvar = yvar, + exog_rhs = parsed_fml[["exog_rhs_text"]], + endog_rhs = parsed_fml[["endog_rhs_text"]], + fe_rhs = parsed_fml[["fe_rhs_text"]], + env = parsed_fml[["env"]] + ), + yvar = yvar, + fe = fe, + exog_names = exog_design[["names"]], + exog_sql = exog_design[["sql"]], + endog_names = endog_design[["names"]], + endog_sql = endog_design[["sql"]], + instr_names = instr_design[["names"]], + instr_sql = instr_design[["sql"]], + endog_vars = standardize_coef_names(endog_design[["names"]]), + instr_vars = standardize_coef_names(instr_design[["names"]]), + x_all_names = x_all_names, + z_all_names = z_all_names, + union_names = union_names, + union_sql = union_sql, + conn = conn, + from_statement = from_statement, + weights = weights, + vcov_type_req = vcov_type_req, + cluster_var = cluster_var, + strategy = strategy, + sql_only = sql_only, + data_only = data_only, + verbose = verbose, + own_conn = own_conn, + registered_table = registered_table + ), parent = emptyenv()) +} + +#' @keywords internal +choose_dbivreg_strategy = function(inputs) { + strategy = inputs[["strategy"]] + fe = inputs[["fe"]] + verbose = inputs[["verbose"]] + + if (strategy == "auto") { + chosen_strategy = if (length(fe) == 0) "moments" else "demean" + if (verbose) { + message("[dbivreg] Auto strategy: ", chosen_strategy) + } + } else { + chosen_strategy = strategy + if (verbose) { + message("[dbivreg] Using strategy: ", chosen_strategy) + } + } + + if (chosen_strategy == "mundlak") { + stop( + "dbivreg() does not yet implement strategy = '", + chosen_strategy, + "'. Supported IV strategies are 'moments', 'compress', and 'demean'." + ) + } + if (chosen_strategy == "compress" && length(fe) > 0) { + stop( + "strategy = 'compress' is only available without fixed effects in dbivreg(). ", + "Use strategy = 'demean' or strategy = 'auto' when fixed effects are present." + ) + } + if (chosen_strategy == "moments" && length(fe) > 0) { + stop( + "strategy = 'moments' is only available without fixed effects. ", + "Use strategy = 'demean' or strategy = 'auto'." + ) + } + if (chosen_strategy == "demean" && length(fe) < 1) { + stop( + "strategy = 'demean' requires at least one fixed effect. ", + "Use strategy = 'moments' or strategy = 'auto' for models without fixed effects." + ) + } + + chosen_strategy +} + +#' @keywords internal +execute_iv_moments_strategy = function(inputs) { + weight_alias = "__dbivreg_weight" + base_select = c(sprintf("%s AS %s", inputs[["yvar"]], inputs[["yvar"]])) + for (i in seq_along(inputs[["union_names"]])) { + if (identical(inputs[["union_sql"]][i], inputs[["union_names"]][i])) { + base_select = c(base_select, inputs[["union_names"]][i]) + } else { + base_select = c(base_select, sprintf("%s AS %s", inputs[["union_sql"]][i], inputs[["union_names"]][i])) + } + } + if (!is.null(inputs[["weights"]])) { + base_select = c(base_select, sprintf("%s AS %s", inputs[["weights"]], weight_alias)) + } + if (!is.null(inputs[["cluster_var"]]) && !inputs[["cluster_var"]] %in% c(inputs[["yvar"]], inputs[["union_names"]], weight_alias)) { + base_select = c(base_select, inputs[["cluster_var"]]) + } + + weights_expr = if (is.null(inputs[["weights"]])) NULL else sql_weight_expr(weight_alias) + moment_terms = dbivreg_union_moment_terms( + conn = inputs[["conn"]], + union_names = inputs[["union_names"]], + y_expr = inputs[["yvar"]], + weights_expr = weights_expr, + has_intercept = TRUE + ) + + cte_sql = paste0( + "WITH base AS (\n SELECT ", + paste(base_select, collapse = ", "), + " ", + inputs[["from_statement"]], + "\n)" + ) + moments_sql = paste0( + cte_sql, + "\nSELECT\n ", + paste(moment_terms, collapse = ",\n "), + "\nFROM base" + ) + + if (inputs[["sql_only"]]) { + return(moments_sql) + } + if (inputs[["verbose"]]) { + message(if (!is.null(inputs[["weights"]])) "[dbivreg] Executing weighted IV moments SQL\n" else "[dbivreg] Executing IV moments SQL\n") + } + + moments_df = dbGetQuery(inputs[["conn"]], moments_sql) + if (inputs[["data_only"]]) { + return(moments_df) + } + + dbivreg_finalize_fit( + inputs = inputs, + moment_df = moments_df, + query_string = moments_sql, + cte_sql = cte_sql, + cte_name = "base", + x_names_full = c("(Intercept)", inputs[["x_all_names"]]), + z_names_full = c("(Intercept)", inputs[["z_all_names"]]), + has_intercept = TRUE, + weights_expr = weights_expr, + x_vars_sql = inputs[["x_all_names"]], + z_vars_sql = inputs[["z_all_names"]], + yvar_sql = inputs[["yvar"]] + ) +} + +#' @keywords internal +execute_iv_compress_strategy = function(inputs) { + if (length(inputs[["fe"]]) > 0) { + stop( + "strategy = 'compress' is only available without fixed effects in dbivreg(). ", + "Use strategy = 'demean' or strategy = 'auto' when fixed effects are present." + ) + } + + yvar = inputs[["yvar"]] + weight_alias = "__dbivreg_weight" + base_select = c(sprintf("%s AS %s", yvar, yvar)) + for (i in seq_along(inputs[["union_names"]])) { + if (identical(inputs[["union_sql"]][i], inputs[["union_names"]][i])) { + base_select = c(base_select, inputs[["union_names"]][i]) + } else { + base_select = c(base_select, sprintf("%s AS %s", inputs[["union_sql"]][i], inputs[["union_names"]][i])) + } + } + if (!is.null(inputs[["weights"]])) { + base_select = c(base_select, sprintf("%s AS %s", inputs[["weights"]], weight_alias)) + } + if (!is.null(inputs[["cluster_var"]]) && !inputs[["cluster_var"]] %in% c(yvar, inputs[["union_names"]], weight_alias)) { + base_select = c(base_select, inputs[["cluster_var"]]) + } + + weights_expr = if (is.null(inputs[["weights"]])) NULL else sql_weight_expr(weight_alias) + compressed_terms = c( + sql_count(inputs[["conn"]], "n"), + if (is.null(weights_expr)) sql_count(inputs[["conn"]], "sum_w") else glue("SUM({weights_expr}) AS sum_w"), + sql_weighted_sum(yvar, weights_expr, "sum_wy"), + sql_weighted_sum(glue("({yvar}) * ({yvar})"), weights_expr, "sum_wy_sq") + ) + compressed_moment_terms = dbivreg_compressed_moment_terms( + conn = inputs[["conn"]], + union_names = inputs[["union_names"]], + has_intercept = TRUE + ) + group_cols_sql = paste(inputs[["union_names"]], collapse = ", ") + + cte_sql = paste0( + "WITH base AS (\n SELECT ", + paste(base_select, collapse = ", "), + " ", + inputs[["from_statement"]], + "\n)" + ) + compress_sql = paste0( + cte_sql, + ",\ncompressed AS (\n SELECT\n ", + group_cols_sql, + ",\n ", + paste(compressed_terms, collapse = ",\n "), + "\n FROM base\n GROUP BY ", + group_cols_sql, + "\n),\nmoments AS (\n SELECT\n ", + paste(compressed_moment_terms, collapse = ",\n "), + "\n FROM compressed\n)\nSELECT * FROM moments" + ) + + if (inputs[["sql_only"]]) { + return(compress_sql) + } + if (inputs[["verbose"]]) { + message(if (!is.null(inputs[["weights"]])) "[dbivreg] Executing weighted compressed IV SQL\n" else "[dbivreg] Executing compressed IV SQL\n") + } + + moments_df = dbGetQuery(inputs[["conn"]], compress_sql) + if (inputs[["data_only"]]) { + return(moments_df) + } + + compression_ratio = moments_df[["n_compressed"]] / max(moments_df[["n_total"]], 1) + if (inputs[["verbose"]] && compression_ratio > 0.8) { + warning(paste0( + sprintf("[dbivreg] compression ineffective (%.1f%% of original rows). ", 100 * compression_ratio), + "Consider strategy = 'moments'." + )) + } + + result = dbivreg_finalize_fit( + inputs = inputs, + moment_df = moments_df, + query_string = compress_sql, + cte_sql = cte_sql, + cte_name = "base", + x_names_full = c("(Intercept)", inputs[["x_all_names"]]), + z_names_full = c("(Intercept)", inputs[["z_all_names"]]), + has_intercept = TRUE, + weights_expr = weights_expr, + x_vars_sql = inputs[["x_all_names"]], + z_vars_sql = inputs[["z_all_names"]], + yvar_sql = yvar + ) + result[["nobs"]] = moments_df[["n_compressed"]] + result[["compression_ratio"]] = compression_ratio + result +} + +#' @keywords internal +execute_iv_demean_strategy = function(inputs) { + yvar = inputs[["yvar"]] + fe = inputs[["fe"]] + union_names = inputs[["union_names"]] + weights_expr_base = sql_weight_expr(inputs[["weights"]]) + weights_expr_demeaned = if (is.null(inputs[["weights"]])) NULL else sql_weight_expr("weights") + + all_var_names = c(yvar, union_names) + cluster_var = inputs[["cluster_var"]] + ap_tables = NULL + + use_ap = FALSE + if (length(fe) >= 2) { + if (length(fe) > 2) { + use_ap = TRUE + } else { + is_balanced = dbreg_is_balanced_panel(inputs[["conn"]], inputs[["from_statement"]], fe) + use_ap = !is.null(inputs[["weights"]]) || !isTRUE(is_balanced) + } + if (isTRUE(use_ap) && inputs[["verbose"]]) { + message("[dbivreg] Using alternating projections for FE demeaning") + } + if (isTRUE(use_ap) && inputs[["sql_only"]]) { + stop("[dbivreg] sql_only is not supported for alternating projections.", call. = FALSE) + } + } + + if (length(fe) == 1) { + fe1 = fe[1] + base_select = c(fe1, yvar) + for (i in seq_along(union_names)) { + base_select = c(base_select, sprintf("%s AS %s", inputs[["union_sql"]][i], union_names[i])) + } + if (!is.null(inputs[["weights"]]) && !inputs[["weights"]] %in% c(fe1, yvar, union_names)) { + base_select = c(base_select, inputs[["weights"]]) + } + if (!is.null(cluster_var) && !cluster_var %in% c(fe1, yvar, union_names, inputs[["weights"]])) { + base_select = c(base_select, cluster_var) + } + + means_cols = paste( + vapply( + all_var_names, + function(v) sql_weighted_mean(v, weights_expr_base, paste0(v, "_mean")), + character(1) + ), + collapse = ", " + ) + tilde_exprs = paste( + sprintf("(b.%s - gm.%s_mean) AS %s_tilde", all_var_names, all_var_names, all_var_names), + collapse = ",\n " + ) + if (!is.null(inputs[["weights"]])) { + tilde_exprs = paste( + tilde_exprs, + sprintf("b.%s AS weights", inputs[["weights"]]), + sep = ",\n " + ) + } + if (!is.null(cluster_var) && !cluster_var %in% fe1) { + tilde_exprs = paste( + tilde_exprs, + sprintf("b.%s AS %s", cluster_var, cluster_var), + sep = ",\n " + ) + } + + cte_sql = paste0( + "WITH base AS ( + SELECT ", paste(base_select, collapse = ", "), " ", + inputs[["from_statement"]], + " + ), + group_means AS ( + SELECT ", + fe1, + ", ", + means_cols, + " FROM base GROUP BY ", + fe1, + " + ), + demeaned AS ( + SELECT + b.", + fe1, + ", + ", + tilde_exprs, + " + FROM base b + JOIN group_means gm ON b.", + fe1, + " = gm.", + fe1, + " + )" + ) + } else { + fe1 = fe[1] + fe2 = fe[2] + + if (isTRUE(use_ap)) { + ap_res = dbreg_alternating_projections( + conn = inputs[["conn"]], + from_statement = inputs[["from_statement"]], + fe = fe, + yvar = yvar, + xvars_sql = inputs[["union_sql"]], + xvar_names = union_names, + weights = inputs[["weights"]], + cluster_var = cluster_var, + verbose = inputs[["verbose"]] + ) + ap_tables = c(ap_res[["table"]], ap_res[["base_table"]]) + weights_expr_demeaned = "__w" + cte_sql = paste0("WITH demeaned AS (SELECT * FROM ", ap_res[["table"]], ")") + } else { + base_select = c(fe1, fe2, yvar) + for (i in seq_along(union_names)) { + base_select = c(base_select, sprintf("%s AS %s", inputs[["union_sql"]][i], union_names[i])) + } + if (!is.null(inputs[["weights"]]) && !inputs[["weights"]] %in% c(fe1, fe2, yvar, union_names)) { + base_select = c(base_select, inputs[["weights"]]) + } + if (!is.null(cluster_var) && !cluster_var %in% c(fe1, fe2, yvar, union_names, inputs[["weights"]])) { + base_select = c(base_select, cluster_var) + } + + unit_means_cols = paste( + vapply( + all_var_names, + function(v) sql_weighted_mean(v, weights_expr_base, paste0(v, "_u")), + character(1) + ), + collapse = ", " + ) + time_means_cols = paste( + vapply( + all_var_names, + function(v) sql_weighted_mean(v, weights_expr_base, paste0(v, "_t")), + character(1) + ), + collapse = ", " + ) + overall_cols = paste( + vapply( + all_var_names, + function(v) sql_weighted_mean(v, weights_expr_base, paste0(v, "_o")), + character(1) + ), + collapse = ", " + ) + tilde_exprs = paste( + sprintf( + "(b.%s - um.%s_u - tm.%s_t + o.%s_o) AS %s_tilde", + all_var_names, + all_var_names, + all_var_names, + all_var_names, + all_var_names + ), + collapse = ",\n " + ) + if (!is.null(cluster_var) && !cluster_var %in% c(fe1, fe2)) { + tilde_exprs = paste( + tilde_exprs, + sprintf("b.%s AS %s", cluster_var, cluster_var), + sep = ",\n " + ) + } + if (!is.null(inputs[["weights"]])) { + tilde_exprs = paste( + tilde_exprs, + sprintf("b.%s AS weights", inputs[["weights"]]), + sep = ",\n " + ) + } + + cte_sql = paste0( + "WITH base AS ( + SELECT ", paste(base_select, collapse = ", "), " ", + inputs[["from_statement"]], + " + ), + unit_means AS ( + SELECT ", + fe1, + ", ", + unit_means_cols, + " FROM base GROUP BY ", + fe1, + " + ), + time_means AS ( + SELECT ", + fe2, + ", ", + time_means_cols, + " FROM base GROUP BY ", + fe2, + " + ), + overall AS ( + SELECT ", + overall_cols, + " FROM base + ), + demeaned AS ( + SELECT + b.", + fe1, + ", + b.", + fe2, + ", + ", + tilde_exprs, + " + FROM base b + JOIN unit_means um ON b.", + fe1, + " = um.", + fe1, + " + JOIN time_means tm ON b.", + fe2, + " = tm.", + fe2, + " + CROSS JOIN overall o + )" + ) + } + } + + fe_count_terms = vapply(seq_along(fe), function(k) { + sql_count(inputs[["conn"]], sprintf("n_fe%d", k), fe[k], distinct = TRUE) + }, character(1)) + if (length(fe) == 1) { + fe_count_terms = c(fe_count_terms, "1 AS n_fe2") + } + moment_terms = c( + sql_count(inputs[["conn"]], "n_total"), + fe_count_terms, + sql_weighted_sum( + glue("CAST({yvar}_tilde AS FLOAT) * CAST({yvar}_tilde AS FLOAT)"), + weights_expr_demeaned, + "sum_y_sq" + ) + ) + for (i in seq_along(union_names)) { + moment_terms = c( + moment_terms, + sql_weighted_sum( + glue("CAST({union_names[i]}_tilde AS FLOAT) * CAST({yvar}_tilde AS FLOAT)"), + weights_expr_demeaned, + sprintf("sum_u%d_y", i) + ), + sql_weighted_sum( + glue("CAST({union_names[i]}_tilde AS FLOAT) * CAST({union_names[i]}_tilde AS FLOAT)"), + weights_expr_demeaned, + sprintf("sum_u%d_%d", i, i) + ) + ) + } + if (length(union_names) > 1) { + for (i in seq_along(union_names)) { + if (i == 1) { + next + } + for (j in seq_len(i - 1)) { + moment_terms = c( + moment_terms, + sql_weighted_sum( + glue("CAST({union_names[j]}_tilde AS FLOAT) * CAST({union_names[i]}_tilde AS FLOAT)"), + weights_expr_demeaned, + sprintf("sum_u%d_%d", j, i) + ) + ) + } + } + } + + demean_sql = paste0( + cte_sql, + ", + moments AS ( + SELECT + ", + paste(moment_terms, collapse = ",\n "), + " + FROM demeaned + ) + SELECT * FROM moments" + ) + + if (inherits(inputs[["conn"]], "AthenaConnection")) { + demean_sql = gsub("FLOAT", "REAL", demean_sql, fixed = TRUE) + } + if (inputs[["sql_only"]]) { + return(demean_sql) + } + if (inputs[["verbose"]]) { + message(if (!is.null(inputs[["weights"]])) "[dbivreg] Executing weighted demeaned IV SQL\n" else "[dbivreg] Executing demeaned IV SQL\n") + } + + ap_cleanup = function() { + if (!is.null(ap_tables)) { + backend = detect_backend(inputs[["conn"]])[["name"]] + for (tbl in ap_tables) { + drop_table_if_exists(inputs[["conn"]], tbl, backend) + } + } + } + on.exit(ap_cleanup(), add = TRUE) + + demean_df = dbGetQuery(inputs[["conn"]], demean_sql) + if (inputs[["data_only"]]) { + return(demean_df) + } + + dbivreg_finalize_fit( + inputs = inputs, + moment_df = demean_df, + query_string = demean_sql, + cte_sql = cte_sql, + cte_name = "demeaned", + x_names_full = inputs[["x_all_names"]], + z_names_full = inputs[["z_all_names"]], + has_intercept = FALSE, + weights_expr = weights_expr_demeaned, + x_vars_sql = paste0(inputs[["x_all_names"]], "_tilde"), + z_vars_sql = paste0(inputs[["z_all_names"]], "_tilde"), + yvar_sql = paste0(inputs[["yvar"]], "_tilde") + ) +} + +#' @keywords internal +dbivreg_finalize_fit = function( + inputs, + moment_df, + query_string, + cte_sql, + cte_name, + x_names_full, + z_names_full, + has_intercept, + weights_expr, + x_vars_sql, + z_vars_sql, + yvar_sql +) { + moms = dbivreg_reconstruct_moments(moment_df, inputs[["union_names"]], has_intercept) + + S_xx = moms$S_uu[x_names_full, x_names_full, drop = FALSE] + S_xy = moms$S_uy[x_names_full, , drop = FALSE] + S_zz = moms$S_uu[z_names_full, z_names_full, drop = FALSE] + S_zx = moms$S_uu[z_names_full, x_names_full, drop = FALSE] + S_zy = moms$S_uy[z_names_full, , drop = FALSE] + + if (qr(S_zz, tol = 1e-10)$rank < ncol(S_zz)) { + stop("Instrument matrix is rank deficient.") + } + z_inv = solve_with_fallback(S_zz, diag(ncol(S_zz)))$XtX_inv + A = Matrix::t(S_zx) %*% z_inv %*% S_zx + if (qr(A, tol = 1e-10)$rank < ncol(A)) { + stop("IV system is not identified or regressors are collinear after instrumentation.") + } + rhs = Matrix::t(S_zx) %*% z_inv %*% S_zy + solve_result = solve_with_fallback(A, rhs) + betahat = solve_result$betahat + A_inv = solve_result$XtX_inv + rownames(betahat) = x_names_full + + rss = as.numeric( + moms$sum_y_sq - + 2 * Matrix::t(betahat) %*% S_xy + + Matrix::t(betahat) %*% S_xx %*% betahat + ) + + n_fe = length(inputs[["fe"]]) + n_fe_levels = if (n_fe == 0) { + numeric(0) + } else { + stats::setNames( + vapply(seq_len(n_fe), function(k) { + val = moment_df[[sprintf("n_fe%d", k)]] + if (is.null(val)) 1 else as.numeric(val) + }, numeric(1)), + inputs[["fe"]] + ) + } + df_fe = if (n_fe == 0) 0 else sum(n_fe_levels) - (n_fe - 1) + df_res = max(moment_df$n_total - length(x_names_full) - df_fe, 1) + n_params = length(x_names_full) + df_fe + + tss = if (has_intercept) { + moms$sum_y_sq - (moms$sum_wy^2 / moms$sum_w) + } else { + moms$sum_y_sq + } + + meat = NULL + is_athena = inherits(inputs[["conn"]], "AthenaConnection") + if (inputs[["vcov_type_req"]] == "hc1") { + meat = compute_iv_meat_sql( + conn = inputs[["conn"]], + cte_sql = cte_sql, + x_vars = x_names_full[!x_names_full %in% "(Intercept)"], + z_vars = z_names_full[!z_names_full %in% "(Intercept)"], + yvar = inputs[["yvar"]], + betahat = betahat, + weights_expr = weights_expr, + cte_name = cte_name, + has_intercept = has_intercept, + is_athena = is_athena, + x_vars_sql = x_vars_sql, + z_vars_sql = z_vars_sql, + yvar_sql = yvar_sql + ) + } else if (inputs[["vcov_type_req"]] == "cluster") { + meat = compute_iv_meat_cluster_sql( + conn = inputs[["conn"]], + cte_sql = cte_sql, + x_vars = x_names_full[!x_names_full %in% "(Intercept)"], + z_vars = z_names_full[!z_names_full %in% "(Intercept)"], + yvar = inputs[["yvar"]], + betahat = betahat, + cluster_var = inputs[["cluster_var"]], + weights_expr = weights_expr, + cte_name = cte_name, + has_intercept = has_intercept, + is_athena = is_athena, + x_vars_sql = x_vars_sql, + z_vars_sql = z_vars_sql, + yvar_sql = yvar_sql + ) + } + + vcov_mat = compute_iv_vcov( + vcov_type = inputs[["vcov_type_req"]], + A_inv = A_inv, + ZtZ_inv = z_inv, + ZtX = S_zx, + rss = rss, + df_res = df_res, + nobs_orig = moment_df$n_total, + n_params = n_params, + meat = meat + ) + attr(vcov_mat, "rss") = rss + attr(vcov_mat, "tss") = tss + + coeftable = gen_coeftable(betahat, vcov_mat, df_res) + iv_diagnostics = compute_dbivreg_diagnostics( + inputs = inputs, + moms = moms, + cte_sql = cte_sql, + cte_name = cte_name, + has_intercept = has_intercept, + weights_expr = weights_expr, + n_total = moment_df$n_total, + df_fe = df_fe, + rss = rss, + betahat = betahat, + S_zz = S_zz, + S_zx = S_zx, + S_zy = S_zy, + meat = meat + ) + + out = list( + coeftable = coeftable, + vcov = vcov_mat, + fml = inputs[["structural_fml"]], + yvar = inputs[["yvar"]], + xvars = standardize_coef_names(x_names_full[x_names_full != "(Intercept)"]), + fe = inputs[["fe"]], + weights = inputs[["weights"]], + query_string = query_string, + nobs = 1L, + nobs_orig = moment_df$n_total, + estimator = "2SLS", + endogenous = inputs[["endog_vars"]], + instruments = inputs[["instr_vars"]], + diagnostics = iv_diagnostics, + df_residual = df_res, + n_fe_levels = if (n_fe > 0) n_fe_levels else NULL, + n_fe1 = if ("n_fe1" %in% names(moment_df)) moment_df$n_fe1 else NULL, + n_fe2 = if ("n_fe2" %in% names(moment_df)) moment_df$n_fe2 else NULL + ) + if (n_fe > 2) { + for (k in seq.int(3L, n_fe)) { + out[[sprintf("n_fe%d", k)]] = unname(n_fe_levels[k]) + } + } + out +} + +#' @keywords internal +dbivreg_reconstruct_moments = function(moment_df, union_names, has_intercept) { + if (has_intercept) { + union_all = c("(Intercept)", union_names) + S_uu = matrix(0, length(union_all), length(union_all), dimnames = list(union_all, union_all)) + S_uy = matrix(0, length(union_all), 1, dimnames = list(union_all, "")) + + S_uu["(Intercept)", "(Intercept)"] = moment_df$sum_w + S_uy["(Intercept)", ] = moment_df$sum_wy + for (i in seq_along(union_names)) { + S_uu["(Intercept)", union_names[i]] = S_uu[union_names[i], "(Intercept)"] = moment_df[[sprintf("sum_u%d", i)]] + S_uu[union_names[i], union_names[i]] = moment_df[[sprintf("sum_u%d_%d", i, i)]] + S_uy[union_names[i], ] = moment_df[[sprintf("sum_u%d_y", i)]] + } + if (length(union_names) > 1) { + for (i in seq_along(union_names)) { + if (i == 1) { + next + } + for (j in seq_len(i - 1)) { + S_uu[union_names[i], union_names[j]] = S_uu[union_names[j], union_names[i]] = moment_df[[sprintf("sum_u%d_%d", j, i)]] + } + } + } + list( + S_uu = S_uu, + S_uy = S_uy, + sum_w = moment_df$sum_w, + sum_wy = moment_df$sum_wy, + sum_y_sq = moment_df$sum_wy_sq + ) + } else { + S_uu = matrix(0, length(union_names), length(union_names), dimnames = list(union_names, union_names)) + S_uy = matrix(0, length(union_names), 1, dimnames = list(union_names, "")) + + for (i in seq_along(union_names)) { + S_uu[i, i] = moment_df[[sprintf("sum_u%d_%d", i, i)]] + S_uy[i, ] = moment_df[[sprintf("sum_u%d_y", i)]] + } + if (length(union_names) > 1) { + for (i in seq_along(union_names)) { + if (i == 1) { + next + } + for (j in seq_len(i - 1)) { + S_uu[i, j] = S_uu[j, i] = moment_df[[sprintf("sum_u%d_%d", j, i)]] + } + } + } + list( + S_uu = S_uu, + S_uy = S_uy, + sum_w = NA_real_, + sum_wy = NA_real_, + sum_y_sq = moment_df$sum_y_sq + ) + } +} + +#' @keywords internal +dbivreg_union_moment_terms = function(conn, union_names, y_expr, weights_expr, has_intercept) { + terms = c( + sql_count(conn, "n_total"), + if (is.null(weights_expr)) sql_count(conn, "sum_w") else glue("SUM({weights_expr}) AS sum_w"), + sql_weighted_sum(y_expr, weights_expr, "sum_wy"), + sql_weighted_sum(glue("({y_expr}) * ({y_expr})"), weights_expr, "sum_wy_sq") + ) + for (i in seq_along(union_names)) { + if (has_intercept) { + terms = c(terms, sql_weighted_sum(union_names[i], weights_expr, sprintf("sum_u%d", i))) + } + terms = c( + terms, + sql_weighted_sum(glue("({union_names[i]}) * ({y_expr})"), weights_expr, sprintf("sum_u%d_y", i)), + sql_weighted_sum(glue("({union_names[i]}) * ({union_names[i]})"), weights_expr, sprintf("sum_u%d_%d", i, i)) + ) + } + if (length(union_names) > 1) { + for (i in seq_along(union_names)) { + if (i == 1) { + next + } + for (j in seq_len(i - 1)) { + terms = c( + terms, + sql_weighted_sum(glue("({union_names[j]}) * ({union_names[i]})"), weights_expr, sprintf("sum_u%d_%d", j, i)) + ) + } + } + } + terms +} + +#' @keywords internal +dbivreg_compressed_moment_terms = function(conn, union_names, has_intercept) { + terms = c( + "SUM(n) AS n_total", + sql_count(conn, "n_compressed"), + "SUM(sum_w) AS sum_w", + "SUM(sum_wy) AS sum_wy", + "SUM(sum_wy_sq) AS sum_wy_sq" + ) + for (i in seq_along(union_names)) { + if (has_intercept) { + terms = c( + terms, + glue("SUM(sum_w * ({union_names[i]})) AS sum_u{i}") + ) + } + terms = c( + terms, + glue("SUM(sum_wy * ({union_names[i]})) AS sum_u{i}_y"), + glue("SUM(sum_w * ({union_names[i]}) * ({union_names[i]})) AS sum_u{i}_{i}") + ) + } + if (length(union_names) > 1) { + for (i in seq_along(union_names)) { + if (i == 1) { + next + } + for (j in seq_len(i - 1)) { + terms = c( + terms, + glue("SUM(sum_w * ({union_names[j]}) * ({union_names[i]})) AS sum_u{j}_{i}") + ) + } + } + } + terms +} + +#' @keywords internal +dbivreg_structural_formula = function(yvar, exog_rhs, endog_rhs, fe_rhs = NULL, env = parent.frame()) { + rhs_parts = c(dbivreg_trim_ws(exog_rhs), dbivreg_trim_ws(endog_rhs)) + rhs_parts = rhs_parts[nzchar(rhs_parts)] + rhs = paste(rhs_parts, collapse = " + ") + if (is.null(fe_rhs) || !nzchar(dbivreg_trim_ws(fe_rhs))) { + return(Formula(stats::as.formula(paste(yvar, "~", rhs), env = env))) + } + Formula(stats::as.formula( + paste(yvar, "~", rhs, "|", dbivreg_trim_ws(fe_rhs)), + env = env + )) +} + +#' @keywords internal +dbivreg_parse_formula = function(fml) { + if (inherits(fml, "Formula")) { + fml = formula(fml) + } + if (!inherits(fml, "formula")) { + stop("`fml` must be a formula.") + } + + env = environment(fml) + fml_text = paste(deparse(fml, width.cutoff = 500L), collapse = " ") + main_tilde = dbivreg_find_top_level(fml_text, "~") + if (is.na(main_tilde)) { + stop( + "dbivreg() requires a fixest-style IV formula: ", + "`y ~ x | d ~ z` or `y ~ x | fe1 + fe2 | d ~ z`." + ) + } + + lhs_text = dbivreg_trim_ws(substr(fml_text, 1L, main_tilde - 1L)) + rhs_text = dbivreg_trim_ws(substr(fml_text, main_tilde + 1L, nchar(fml_text))) + pipe_parts = dbivreg_split_top_level(rhs_text, "|") + if (!(length(pipe_parts) %in% c(2L, 3L))) { + stop( + "dbivreg() requires a fixest-style IV formula with the IV block last: ", + "`y ~ x | d ~ z` or `y ~ x | fe1 + fe2 | d ~ z`." + ) + } + + exog_rhs_text = pipe_parts[1] + fe_rhs_text = if (length(pipe_parts) == 3L) pipe_parts[2] else NULL + iv_rhs_text = pipe_parts[length(pipe_parts)] + + iv_tilde = dbivreg_find_top_level(iv_rhs_text, "~") + if (is.na(iv_tilde)) { + stop( + "The final pipe-separated block must be the IV specification, e.g. `d ~ z`." + ) + } + + endog_rhs_text = dbivreg_trim_ws(substr(iv_rhs_text, 1L, iv_tilde - 1L)) + instr_rhs_text = dbivreg_trim_ws(substr(iv_rhs_text, iv_tilde + 1L, nchar(iv_rhs_text))) + if (!nzchar(endog_rhs_text) || !nzchar(instr_rhs_text)) { + stop("The IV block must contain both endogenous regressors and excluded instruments.") + } + + y_formula = stats::as.formula(paste(lhs_text, "~ 1"), env = env) + yvar = all.vars(y_formula) + if (length(yvar) != 1L) { + stop("Exactly one outcome variable required.") + } + + exog_rhs = stats::as.formula(paste("~", exog_rhs_text), env = env) + endog_rhs = stats::as.formula(paste("~", endog_rhs_text), env = env) + instr_rhs = stats::as.formula(paste("~", instr_rhs_text), env = env) + + fe = NULL + if (!is.null(fe_rhs_text)) { + fe_rhs = stats::as.formula(paste("~", fe_rhs_text), env = env) + fe_terms = attr(terms(fe_rhs), "term.labels") + fe_vars = all.vars(fe_rhs) + if (!length(fe_terms)) { + stop("Fixed-effects block cannot be empty.") + } + if (length(fe_terms) != length(fe_vars) || !all(fe_terms %in% fe_vars)) { + stop("Fixed effects must be specified as simple additive variable names.") + } + fe = unique(fe_terms) + } + + list( + fml = fml, + env = env, + yvar = yvar, + exog_rhs = exog_rhs, + exog_rhs_text = exog_rhs_text, + exog_vars = all.vars(exog_rhs), + endog_rhs = endog_rhs, + endog_rhs_text = endog_rhs_text, + endog_vars = all.vars(endog_rhs), + instr_rhs = instr_rhs, + instr_rhs_text = instr_rhs_text, + instr_vars = all.vars(instr_rhs), + fe = fe, + fe_rhs_text = fe_rhs_text + ) +} + +#' @keywords internal +#' Expands the right-hand side of the structural formula into SQL expressions for the design matrix, including fixed effects if specified. +dbivreg_expand_block = function(rhs_formula, conn, table, fe_vars = character()) { + if (is.null(fe_vars)) { + fe_vars = character() + } + sql_design = sql_model_matrix( + rhs_formula, + conn, + table, + expand = "all", + fe_vars = fe_vars + ) + if (anyDuplicated(sql_design$col_names) > 0) { + dupes = unique(sql_design$col_names[duplicated(sql_design$col_names)]) + stop( + "IV design matrix contains duplicate column names: ", + paste(standardize_coef_names(dupes), collapse = ", ") + ) + } + list( + sql = sql_design$select_exprs, + names = sql_design$col_names, + vars = all.vars(rhs_formula), + term_labels = attr(terms(rhs_formula), "term.labels") + ) +} + +#' @keywords internal +dbivreg_find_top_level = function(x, delim) { + chars = strsplit(x, "", fixed = TRUE)[[1]] + if (!length(chars)) { + return(NA_integer_) + } + + depth_paren = 0L + depth_bracket = 0L + depth_brace = 0L + in_single = FALSE + in_double = FALSE + in_backtick = FALSE + + for (i in seq_along(chars)) { + ch = chars[i] + prev = if (i > 1L) chars[i - 1L] else "" + + if (in_single) { + if (ch == "'" && prev != "\\") in_single = FALSE + next + } + if (in_double) { + if (ch == "\"" && prev != "\\") in_double = FALSE + next + } + if (in_backtick) { + if (ch == "`") in_backtick = FALSE + next + } + + if (ch == "'") { + in_single = TRUE + next + } + if (ch == "\"") { + in_double = TRUE + next + } + if (ch == "`") { + in_backtick = TRUE + next + } + + if (ch == "(") { + depth_paren = depth_paren + 1L + next + } + if (ch == ")") { + depth_paren = max(depth_paren - 1L, 0L) + next + } + if (ch == "[") { + depth_bracket = depth_bracket + 1L + next + } + if (ch == "]") { + depth_bracket = max(depth_bracket - 1L, 0L) + next + } + if (ch == "{") { + depth_brace = depth_brace + 1L + next + } + if (ch == "}") { + depth_brace = max(depth_brace - 1L, 0L) + next + } + + if ( + ch == delim && + depth_paren == 0L && + depth_bracket == 0L && + depth_brace == 0L + ) { + return(i) + } + } + + NA_integer_ +} + +#' @keywords internal +dbivreg_split_top_level = function(x, delim) { + chars = strsplit(x, "", fixed = TRUE)[[1]] + if (!length(chars)) { + return(character(0)) + } + + starts = 1L + parts = character(0) + repeat { + subx = paste(chars[starts:length(chars)], collapse = "") + pos = dbivreg_find_top_level(subx, delim) + if (is.na(pos)) { + parts = c(parts, dbivreg_trim_ws(subx)) + break + } + end = starts + pos - 2L + parts = c(parts, dbivreg_trim_ws(paste(chars[starts:end], collapse = ""))) + starts = starts + pos + } + parts +} + +#' @keywords internal +dbivreg_trim_ws = function(x) { + gsub("^\\s+|\\s+$", "", x) +} + +#' @keywords internal +compute_iv_vcov = function( + vcov_type = "iid", + A_inv, + ZtZ_inv, + ZtX, + rss, + df_res, + nobs_orig, + n_params, + meat = NULL +) { + if (vcov_type == "iid") { + sigma2 = rss / df_res + vcov_mat = sigma2 * A_inv + attr(vcov_mat, "type") = "iid" + } else { + if (is.null(meat)) { + stop("Robust IV variance requires a meat matrix.") + } + middle = Matrix::t(ZtX) %*% ZtZ_inv %*% meat %*% ZtZ_inv %*% ZtX + if (vcov_type == "hc1") { + scale_hc1 = nobs_orig / df_res + vcov_mat = scale_hc1 * (A_inv %*% middle %*% A_inv) + attr(vcov_mat, "type") = "hc1" + } else if (vcov_type == "cluster") { + n_clusters = attr(meat, "n_clusters") + if (is.null(n_clusters)) { + stop("Clustered IV meat matrix missing n_clusters attribute.") + } + scale_cr1 = (n_clusters / (n_clusters - 1)) * ((nobs_orig - 1) / (nobs_orig - n_params)) + vcov_mat = scale_cr1 * (A_inv %*% middle %*% A_inv) + attr(vcov_mat, "type") = "cluster" + attr(vcov_mat, "n_clusters") = n_clusters + } else { + stop("Unsupported vcov type: ", vcov_type) + } + } + dimnames(vcov_mat) = dimnames(A_inv) + vcov_mat +} + +#' @keywords internal +compute_dbivreg_diagnostics = function( + inputs, + moms, + cte_sql, + cte_name, + has_intercept, + weights_expr, + n_total, + df_fe, + rss, + betahat, + S_zz, + S_zx, + S_zy, + meat +) { + overid = compute_dbivreg_overid( + inputs = inputs, + betahat = betahat, + S_zz = S_zz, + S_zx = S_zx, + S_zy = S_zy, + rss = rss, + n_total = n_total, + meat = meat + ) + list( + first_stage_f = compute_dbivreg_first_stage_f( + inputs = inputs, + moms = moms, + n_total = n_total, + df_fe = df_fe, + has_intercept = has_intercept + ), + first_stage_wald = compute_dbivreg_first_stage_wald( + inputs = inputs, + moms = moms, + cte_sql = cte_sql, + cte_name = cte_name, + has_intercept = has_intercept, + weights_expr = weights_expr, + n_total = n_total, + df_fe = df_fe + ), + overid = overid, + sargan = if (identical(overid$test, "Sargan")) overid else NULL, + hansen_j = if (identical(overid$test, "Hansen J")) overid else NULL + ) +} + +#' @keywords internal +compute_dbivreg_first_stage_f = function(inputs, moms, n_total, df_fe, has_intercept) { + excluded = inputs[["instr_names"]] + endogenous = inputs[["endog_names"]] + if (!length(excluded) || !length(endogenous)) { + return(NULL) + } + + unrestricted = c(if (has_intercept) "(Intercept)", inputs[["exog_names"]], excluded) + restricted = c(if (has_intercept) "(Intercept)", inputs[["exog_names"]]) + q = length(excluded) + out = vector("list", length(endogenous)) + names(out) = standardize_coef_names(endogenous) + + for (j in seq_along(endogenous)) { + y_name = endogenous[j] + S_uu = moms$S_uu[unrestricted, unrestricted, drop = FALSE] + S_uy = moms$S_uu[unrestricted, y_name, drop = FALSE] + fit_u = solve_with_fallback(S_uu, S_uy) + beta_u = fit_u$betahat + yy = moms$S_uu[y_name, y_name] + rss_u = as.numeric(yy - 2 * Matrix::t(beta_u) %*% S_uy + Matrix::t(beta_u) %*% S_uu %*% beta_u) + + if (length(restricted) == 0) { + rss_r = yy + } else { + S_rr = moms$S_uu[restricted, restricted, drop = FALSE] + S_ry = moms$S_uu[restricted, y_name, drop = FALSE] + fit_r = solve_with_fallback(S_rr, S_ry) + beta_r = fit_r$betahat + rss_r = as.numeric(yy - 2 * Matrix::t(beta_r) %*% S_ry + Matrix::t(beta_r) %*% S_rr %*% beta_r) + } + + df2 = max(n_total - length(unrestricted) - df_fe, 1) + stat = ((rss_r - rss_u) / q) / (rss_u / df2) + stat = max(as.numeric(stat), 0) + out[[j]] = list( + stat = stat, + p = stats::pf(stat, q, df2, lower.tail = FALSE), + df1 = q, + df2 = df2 + ) + } + + out +} + +#' @keywords internal +compute_dbivreg_first_stage_wald = function( + inputs, + moms, + cte_sql, + cte_name, + has_intercept, + weights_expr, + n_total, + df_fe +) { + excluded = inputs[["instr_names"]] + endogenous = inputs[["endog_names"]] + if (!length(excluded) || !length(endogenous)) { + return(NULL) + } + + unrestricted = c(if (has_intercept) "(Intercept)", inputs[["exog_names"]], excluded) + regressors = c(inputs[["exog_names"]], excluded) + is_athena = inherits(inputs[["conn"]], "AthenaConnection") + q = length(excluded) + vcov_label = format_vcov_label(inputs[["vcov_type_req"]], NULL) + out = vector("list", length(endogenous)) + names(out) = standardize_coef_names(endogenous) + + for (j in seq_along(endogenous)) { + y_name = endogenous[j] + S_uu = moms$S_uu[unrestricted, unrestricted, drop = FALSE] + S_uy = moms$S_uu[unrestricted, y_name, drop = FALSE] + fit_u = solve_with_fallback(S_uu, S_uy) + beta_u = fit_u$betahat + XtX_inv = fit_u$XtX_inv + rownames(beta_u) = unrestricted + + yy = moms$S_uu[y_name, y_name] + rss_u = as.numeric(yy - 2 * Matrix::t(beta_u) %*% S_uy + Matrix::t(beta_u) %*% S_uu %*% beta_u) + df2 = max(n_total - length(unrestricted) - df_fe, 1) + + if (inputs[["vcov_type_req"]] == "iid") { + vcov_u = (rss_u / df2) * XtX_inv + attr(vcov_u, "type") = "iid" + } else if (inputs[["vcov_type_req"]] == "hc1") { + meat = compute_meat_sql( + conn = inputs[["conn"]], + cte_sql = cte_sql, + vars = regressors, + yvar = y_name, + betahat = beta_u, + is_athena = is_athena, + var_suffix = if (has_intercept) "" else "_tilde", + cte_name = cte_name, + has_intercept = has_intercept, + vars_sql = if (has_intercept) regressors else paste0(regressors, "_tilde"), + weights_expr = weights_expr + ) + vcov_u = compute_vcov( + vcov_type = "hc1", + strategy = if (has_intercept) "moments" else "demean", + XtX_inv = XtX_inv, + rss = rss_u, + df_res = df2, + nobs_orig = n_total, + n_params = length(unrestricted) + df_fe, + meat = meat + ) + vcov_label = format_vcov_label("hc1", NULL) + } else if (inputs[["vcov_type_req"]] == "cluster") { + meat = compute_meat_cluster_sql( + conn = inputs[["conn"]], + cte_sql = cte_sql, + vars = regressors, + yvar = y_name, + betahat = beta_u, + cluster_var = inputs[["cluster_var"]], + is_athena = is_athena, + var_suffix = if (has_intercept) "" else "_tilde", + cte_name = cte_name, + has_intercept = has_intercept, + vars_sql = if (has_intercept) regressors else paste0(regressors, "_tilde"), + weights_expr = weights_expr + ) + vcov_u = compute_vcov( + vcov_type = "cluster", + strategy = if (has_intercept) "moments" else "demean", + XtX_inv = XtX_inv, + rss = rss_u, + df_res = df2, + nobs_orig = n_total, + n_params = length(unrestricted) + df_fe, + meat = meat + ) + n_clusters = attr(vcov_u, "n_clusters") + vcov_u = vcov_u * ((n_total - 1) / n_total) + attr(vcov_u, "type") = "cluster" + attr(vcov_u, "n_clusters") = n_clusters + vcov_label = format_vcov_label("cluster", attr(vcov_u, "n_clusters")) + } else { + stop("Unsupported vcov type for first-stage diagnostics: ", inputs[["vcov_type_req"]]) + } + + beta_q = beta_u[excluded, , drop = FALSE] + vcov_q = vcov_u[excluded, excluded, drop = FALSE] + inv_vcov_q = solve_with_fallback(vcov_q, beta_q)$betahat + stat = as.numeric(Matrix::t(beta_q) %*% inv_vcov_q) / q + stat = max(stat, 0) + out[[j]] = list( + stat = stat, + p = stats::pf(stat, q, df2, lower.tail = FALSE), + df1 = q, + df2 = df2, + vcov = vcov_label + ) + } + + out +} + +#' @keywords internal +compute_dbivreg_overid = function(inputs, betahat, S_zz, S_zx, S_zy, rss, n_total, meat = NULL) { + df = length(inputs[["instr_names"]]) - length(inputs[["endog_names"]]) + if (df <= 0) { + test_name = if (identical(inputs[["vcov_type_req"]], "iid")) "Sargan" else "Hansen J" + return(list(stat = NA_real_, p = NA_real_, df = max(df, 0L), test = test_name)) + } + + moments = S_zy - S_zx %*% betahat + if (identical(inputs[["vcov_type_req"]], "iid")) { + z_inv = solve_with_fallback(S_zz, moments)$betahat + stat = as.numeric(Matrix::t(moments) %*% z_inv) / (rss / n_total) + test_name = "Sargan" + } else { + if (is.null(meat)) { + return(list(stat = NA_real_, p = NA_real_, df = df, test = "Hansen J")) + } + omega_inv_m = solve_with_fallback(meat, moments)$betahat + stat = as.numeric(Matrix::t(moments) %*% omega_inv_m) + test_name = "Hansen J" + } + stat = max(stat, 0) + + list( + stat = stat, + p = stats::pchisq(stat, df = df, lower.tail = FALSE), + df = df, + test = test_name + ) +} + +#' @keywords internal +compute_iv_meat_sql = function( + conn, + cte_sql, + x_vars, + z_vars, + yvar, + betahat, + weights_expr = NULL, + cte_name = "base", + has_intercept = TRUE, + is_athena = FALSE, + x_vars_sql = x_vars, + z_vars_sql = z_vars, + yvar_sql = yvar +) { + beta_vals = as.numeric(betahat[c(if (has_intercept) "(Intercept)" else NULL, x_vars), 1]) + if (has_intercept) { + intercept_val = beta_vals[1] + beta_vals = beta_vals[-1] + } + + weight_sql = if (is.null(weights_expr)) NULL else sprintf("CAST((%s) * (%s) AS FLOAT)", weights_expr, weights_expr) + build_sum = function(parts, alias) { + parts = parts[!vapply(parts, is.null, logical(1))] + sprintf("SUM(%s) AS %s", paste(parts, collapse = " * "), alias) + } + + beta_terms = if (length(x_vars_sql)) { + paste(sprintf("%.15g * %s", beta_vals, x_vars_sql), collapse = " + ") + } else { + "0" + } + if (has_intercept) { + resid_expr = sprintf("(%s - %.15g - (%s))", yvar_sql, intercept_val, beta_terms) + } else { + resid_expr = sprintf("(%s - (%s))", yvar_sql, beta_terms) + } + + meat_terms = character(0) + if (has_intercept) { + meat_terms = c(meat_terms, build_sum(c( + sprintf("CAST(%s AS FLOAT)", resid_expr), + sprintf("CAST(%s AS FLOAT)", resid_expr), + weight_sql + ), "meat_0_0")) + for (j in seq_along(z_vars_sql)) { + meat_terms = c(meat_terms, build_sum(c( + sprintf("CAST(%s AS FLOAT)", resid_expr), + sprintf("CAST(%s AS FLOAT)", resid_expr), + sprintf("CAST(%s AS FLOAT)", z_vars_sql[j]), + weight_sql + ), sprintf("meat_0_%d", j))) + } + } + for (i in seq_along(z_vars_sql)) { + for (j in i:length(z_vars_sql)) { + meat_terms = c(meat_terms, build_sum(c( + sprintf("CAST(%s AS FLOAT)", resid_expr), + sprintf("CAST(%s AS FLOAT)", resid_expr), + sprintf("CAST(%s AS FLOAT)", z_vars_sql[i]), + sprintf("CAST(%s AS FLOAT)", z_vars_sql[j]), + weight_sql + ), sprintf("meat_%d_%d", i, j))) + } + } + + meat_sql = paste0( + cte_sql, + ",\nmeat AS (SELECT ", + paste(meat_terms, collapse = ", "), + " FROM ", + cte_name, + ")\nSELECT * FROM meat" + ) + if (is_athena) { + meat_sql = gsub("FLOAT", "REAL", meat_sql, fixed = TRUE) + } + + meat_df = dbGetQuery(conn, meat_sql) + vars_all = if (has_intercept) c("(Intercept)", z_vars) else z_vars + meat_mat = matrix(0, length(vars_all), length(vars_all), dimnames = list(vars_all, vars_all)) + + if (has_intercept) { + meat_mat[1, 1] = meat_df$meat_0_0 + for (j in seq_along(z_vars)) { + val = meat_df[[sprintf("meat_0_%d", j)]] + meat_mat[1, j + 1] = meat_mat[j + 1, 1] = val + } + } + for (i in seq_along(z_vars)) { + for (j in i:length(z_vars)) { + val = meat_df[[sprintf("meat_%d_%d", i, j)]] + idx_i = if (has_intercept) i + 1 else i + idx_j = if (has_intercept) j + 1 else j + meat_mat[idx_i, idx_j] = meat_mat[idx_j, idx_i] = val + } + } + meat_mat +} + +#' @keywords internal +compute_iv_meat_cluster_sql = function( + conn, + cte_sql, + x_vars, + z_vars, + yvar, + betahat, + cluster_var, + weights_expr = NULL, + cte_name = "base", + has_intercept = TRUE, + is_athena = FALSE, + x_vars_sql = x_vars, + z_vars_sql = z_vars, + yvar_sql = yvar +) { + beta_vals = as.numeric(betahat[c(if (has_intercept) "(Intercept)" else NULL, x_vars), 1]) + if (has_intercept) { + intercept_val = beta_vals[1] + beta_vals = beta_vals[-1] + } + + weight_sql = if (is.null(weights_expr)) NULL else sprintf("CAST(%s AS FLOAT)", weights_expr) + build_sum = function(parts, alias) { + parts = parts[!vapply(parts, is.null, logical(1))] + sprintf("SUM(%s) AS %s", paste(parts, collapse = " * "), alias) + } + + beta_terms = if (length(x_vars_sql)) { + paste(sprintf("%.15g * %s", beta_vals, x_vars_sql), collapse = " + ") + } else { + "0" + } + if (has_intercept) { + resid_expr = sprintf("(%s - %.15g - (%s))", yvar_sql, intercept_val, beta_terms) + } else { + resid_expr = sprintf("(%s - (%s))", yvar_sql, beta_terms) + } + + score_terms = character(0) + if (has_intercept) { + score_terms = c(score_terms, build_sum(c( + sprintf("CAST(%s AS FLOAT)", resid_expr), + weight_sql + ), "score_0")) + } + for (j in seq_along(z_vars_sql)) { + score_terms = c(score_terms, build_sum(c( + sprintf("CAST(%s AS FLOAT)", resid_expr), + sprintf("CAST(%s AS FLOAT)", z_vars_sql[j]), + weight_sql + ), sprintf("score_%d", j))) + } + + cluster_sql = paste0( + cte_sql, + ",\ncluster_scores AS (\n SELECT ", + cluster_var, + ", ", + paste(score_terms, collapse = ", "), + "\n FROM ", + cte_name, + "\n GROUP BY ", + cluster_var, + "\n)\nSELECT * FROM cluster_scores" + ) + if (is_athena) { + cluster_sql = gsub("FLOAT", "REAL", cluster_sql, fixed = TRUE) + } + + cluster_df = dbGetQuery(conn, cluster_sql) + vars_all = if (has_intercept) c("(Intercept)", z_vars) else z_vars + meat_mat = matrix(0, length(vars_all), length(vars_all), dimnames = list(vars_all, vars_all)) + score_cols = if (has_intercept) c("score_0", paste0("score_", seq_along(z_vars))) else paste0("score_", seq_along(z_vars)) + + for (i in seq_len(nrow(cluster_df))) { + s_g = as.numeric(cluster_df[i, score_cols]) + meat_mat = meat_mat + tcrossprod(s_g) + } + attr(meat_mat, "n_clusters") = nrow(cluster_df) + meat_mat +} + +#' Print method for dbivreg objects +#' +#' @param x A `dbivreg` object. +#' @param ... Additional unused arguments. +#' @export +print.dbivreg = function(x, ...) { + ct = x[["coeftable"]] + colnames(ct) = c("Estimate", "Std. Error", "t value", "Pr(>|t|)") + + se_type = attr(x$vcov, "type") + n_clusters = attr(x$vcov, "n_clusters") + se_type = format_vcov_label(se_type, n_clusters) + + if (x$strategy == "moments") { + cat("Moments-based 2SLS estimation, Dep. Var.:", x$yvar, "\n") + cat("Observations.:", prettyNum(x$nobs_orig, big.mark = ","), "\n") + } else if (x$strategy == "demean") { + mstring = if (length(x$fe) == 1) { + "Demeaned" + } else if (length(x$fe) == 2) { + "Double Demeaned" + } else { + "Multi-way Demeaned" + } + cat(mstring, "2SLS estimation, Dep. Var.:", x$yvar, "\n") + cat("Observations.:", prettyNum(x$nobs_orig, big.mark = ","), "\n") + } + cat("Endogenous:", paste(x$endogenous, collapse = ", "), "\n") + cat("Excluded instruments:", paste(x$instruments, collapse = ", "), "\n") + cat("Standard Errors:", se_type, "\n") + if (!is.null(x$diagnostics$first_stage_f)) { + fst_lines = vapply( + names(x$diagnostics$first_stage_f), + function(nm) { + diag = x$diagnostics$first_stage_f[[nm]] + sprintf("%s = %.3f (p=%.3g)", nm, diag$stat, diag$p) + }, + character(1) + ) + cat("First-stage F:", paste(fst_lines, collapse = ", "), "\n") + } + if (!identical(se_type, "IID") && !is.null(x$diagnostics$first_stage_wald)) { + wald_lines = vapply( + names(x$diagnostics$first_stage_wald), + function(nm) { + diag = x$diagnostics$first_stage_wald[[nm]] + sprintf("%s = %.3f (p=%.3g)", nm, diag$stat, diag$p) + }, + character(1) + ) + cat("First-stage Wald:", paste(wald_lines, collapse = ", "), "\n") + } + if (!is.null(x$diagnostics$overid) && is.finite(x$diagnostics$overid$stat)) { + cat( + paste0("Over-ID (", x$diagnostics$overid$test, "):"), + sprintf("%.3f (p=%.3g, df=%d)", x$diagnostics$overid$stat, x$diagnostics$overid$p, x$diagnostics$overid$df), + "\n" + ) + } + + print_coeftable(ct, gof_vals = gof(x), has_fes = !is.null(x$fe)) + invisible(ct) +} diff --git a/R/print.R b/R/print.R index ea6f358..1619b6e 100644 --- a/R/print.R +++ b/R/print.R @@ -36,16 +36,7 @@ print.dbreg = function(x, fe = FALSE, ...) { } se_type = attr(x[["vcov"]], "type") n_clusters = attr(x[["vcov"]], "n_clusters") - se_type = switch( - se_type, - "iid" = "IID", - "hc1" = "Heteroskedasticity-robust", - "cluster" = if (!is.null(n_clusters)) { - sprintf("Clustered (%d clusters)", n_clusters) - } else { - "Clustered" - } - ) + se_type = format_vcov_label(se_type, n_clusters) if (x[["strategy"]] == "compress") { cat("Compressed OLS estimation, Dep. Var.:", x[["yvar"]], "\n") cat( @@ -236,4 +227,3 @@ print.dbbinsreg = function(x, ...) { invisible(x) } - diff --git a/R/utils.R b/R/utils.R index d7b9137..8546711 100644 --- a/R/utils.R +++ b/R/utils.R @@ -20,6 +20,18 @@ env2env = function(source, target, keys = NULL) { #' @keywords internal standardize_coef_names = function(x) gsub("_x_", ":", x, fixed = TRUE) +#' Format variance-covariance type labels for printing +#' @keywords internal +format_vcov_label = function(vcov_type, n_clusters = NULL) { + switch( + vcov_type, + "iid" = "IID", + "hc1" = "Heteroskedasticity-robust", + "cluster" = if (!is.null(n_clusters)) sprintf("Clustered (%d clusters)", n_clusters) else "Clustered", + vcov_type + ) +} + #' Generate coefficient table from estimates and vcov matrix #' @keywords internal gen_coeftable = function(betahat, vcov_mat, df_residual) { diff --git a/inst/tinytest/test_dbivreg.R b/inst/tinytest/test_dbivreg.R new file mode 100644 index 0000000..8faf0c7 --- /dev/null +++ b/inst/tinytest/test_dbivreg.R @@ -0,0 +1,534 @@ +library(dbreg) +library(fixest) + +rename_fixest_iv = function(x) { + names(x) = sub("^fit_", "", names(x)) + x +} + +extract_fixest_ivstat = function(stats, type, term) { + key = paste0(type, "::", term) + if (!is.null(stats[[key]])) { + return(stats[[key]]) + } + if (all(c("stat", "p", "df1", "df2") %in% names(stats))) { + return(stats) + } + NULL +} + +expect_iv_match = function(db_fit, fx_fit, tol_coef = 1e-6, tol_se = 1e-6, label = "IV") { + fx_coef = rename_fixest_iv(coef(fx_fit)) + expect_true( + max(abs(db_fit$coeftable[names(fx_coef), "estimate"] - fx_coef)) < tol_coef, + info = paste(label, "coefficients match fixest") + ) + + fx_se = rename_fixest_iv(se(fx_fit)) + expect_true( + max(abs(db_fit$coeftable[names(fx_se), "std.error"] - fx_se)) < tol_se, + info = paste(label, "standard errors match fixest") + ) +} + +manual_hansen_j = function(Z, u, cluster = NULL, weights = NULL) { + if (is.null(weights)) { + weights = rep(1, length(u)) + } + scores = Z * as.numeric(u * weights) + g = colSums(scores) + + if (is.null(cluster)) { + omega = crossprod(scores) + } else { + omega = matrix(0, ncol(scores), ncol(scores)) + for (cl in unique(cluster)) { + s_g = colSums(scores[cluster == cl, , drop = FALSE]) + omega = omega + tcrossprod(s_g) + } + } + + as.numeric(t(g) %*% qr.solve(omega, g)) +} + +set.seed(2026) + +tol = 1e-6 +tol_cluster = 1e-5 + +n = 600L +cluster = factor(sample(1:60, n, replace = TRUE)) +z = rnorm(n) +x = rnorm(n) +u = rnorm(n) +d = 0.9 * z + 0.5 * x + rnorm(n) +y = 1 + 2 * d + 0.3 * x + u +w = runif(n, min = 0.4, max = 2.2) + +dat = data.frame(y = y, d = d, x = x, z = z, w = w, cluster = cluster) + +db_iid = dbivreg( + y ~ x | d ~ z, + data = dat, + weights = "w", + vcov = "iid", + strategy = "moments" +) +fx_iid = feols(y ~ x | d ~ z, data = dat, weights = ~w, vcov = "iid") +expect_iv_match(db_iid, fx_iid, tol_coef = tol, tol_se = tol, label = "No-FE IID") +fs_iid = fitstat(fx_iid, ~ ivf1 + sargan, simplify = TRUE) +expect_equal( + db_iid$diagnostics$first_stage_f$d$stat, + extract_fixest_ivstat(fs_iid, "ivf1", "d")$stat, + tolerance = tol, + info = "No-FE IID first-stage F matches fixest" +) +expect_true( + is.na(db_iid$diagnostics$overid$stat) && is.na(fs_iid$sargan), + info = "Exactly identified models report no Sargan statistic" +) + +db_hc1 = dbivreg( + y ~ x | d ~ z, + data = dat, + weights = "w", + vcov = "hc1", + strategy = "moments" +) +fx_hc1 = feols(y ~ x | d ~ z, data = dat, weights = ~w, vcov = "hc1") +expect_iv_match(db_hc1, fx_hc1, tol_coef = tol, tol_se = tol, label = "No-FE HC1") + +db_cl = dbivreg( + y ~ x | d ~ z, + data = dat, + weights = "w", + vcov = ~cluster, + strategy = "moments" +) +fx_cl = feols(y ~ x | d ~ z, data = dat, weights = ~w, vcov = ~cluster) +expect_iv_match(db_cl, fx_cl, tol_coef = tol, tol_se = tol_cluster, label = "No-FE cluster") +fs_cl = fitstat(fx_cl, ~ ivf1 + ivwald1, simplify = TRUE) +expect_equal( + db_cl$diagnostics$first_stage_wald$d$stat, + extract_fixest_ivstat(fs_cl, "ivwald1", "d")$stat, + tolerance = tol_cluster, + info = "No-FE cluster first-stage Wald matches fixest" +) + +expect_true( + dbivreg( + y ~ x | d ~ z, + data = dat, + weights = "w", + vcov = "iid", + strategy = "auto" + )$strategy == "moments", + info = "Auto strategy selects moments when no fixed effects are present" +) + +set.seed(2034) + +n_comp_cells = 80L +comp_reps = sample(2:6, n_comp_cells, replace = TRUE) +dat_comp_cell = data.frame( + cell = seq_len(n_comp_cells), + cluster = factor(sample(1:20, n_comp_cells, replace = TRUE)) +) +dat_comp_cell$x = round(rnorm(n_comp_cells), 2) +dat_comp_cell$z = round(rnorm(n_comp_cells), 2) +v_comp_cell = rnorm(n_comp_cells) +dat_comp_cell$d = round(0.8 * dat_comp_cell$z + 0.4 * dat_comp_cell$x + v_comp_cell, 2) + +dat_comp = dat_comp_cell[rep(seq_len(n_comp_cells), comp_reps), ] +rownames(dat_comp) = NULL +dat_comp$w = runif(nrow(dat_comp), min = 0.5, max = 2.0) +dat_comp$y = 1.0 + 2.0 * dat_comp$d + 0.3 * dat_comp$x + + 0.6 * v_comp_cell[dat_comp$cell] + rnorm(nrow(dat_comp)) + +db_comp_hc1 = dbivreg( + y ~ x | d ~ z, + data = dat_comp, + weights = "w", + vcov = "hc1", + strategy = "compress" +) +fx_comp_hc1 = feols(y ~ x | d ~ z, data = dat_comp, weights = ~w, vcov = "hc1") +expect_iv_match(db_comp_hc1, fx_comp_hc1, tol_coef = tol, tol_se = tol, label = "Compressed IV HC1") +expect_true( + db_comp_hc1$nobs < db_comp_hc1$nobs_orig && db_comp_hc1$compression_ratio < 1, + info = "Compressed IV records fewer design rows than original observations" +) +fs_comp_hc1 = fitstat(fx_comp_hc1, ~ ivwald1, simplify = TRUE) +expect_equal( + db_comp_hc1$diagnostics$first_stage_wald$d$stat, + extract_fixest_ivstat(fs_comp_hc1, "ivwald1", "d")$stat, + tolerance = tol, + info = "Compressed IV first-stage Wald matches fixest" +) + +db_comp_cl = dbivreg( + y ~ x | d ~ z, + data = dat_comp, + weights = "w", + vcov = ~cluster, + strategy = "compress" +) +fx_comp_cl = feols(y ~ x | d ~ z, data = dat_comp, weights = ~w, vcov = ~cluster) +expect_iv_match(db_comp_cl, fx_comp_cl, tol_coef = tol, tol_se = tol_cluster, label = "Compressed IV cluster") + +sql_comp = invisible(capture.output( + dbivreg( + y ~ x | d ~ z, + data = dat_comp, + strategy = "compress", + sql_only = TRUE + ) +)) +expect_true(any(grepl("compressed AS", sql_comp)), info = "Compressed IV sql_only exposes compressed CTE") + +set.seed(2027) + +n_fe = 700L +dat_fe1 = data.frame( + fe1 = factor(sample(1:25, n_fe, replace = TRUE)), + cluster = factor(sample(1:35, n_fe, replace = TRUE)) +) + +alpha = rnorm(25)[dat_fe1$fe1] +dat_fe1$z = rnorm(n_fe) +dat_fe1$x = rnorm(n_fe) +v = rnorm(n_fe) +dat_fe1$d = 0.8 * dat_fe1$z + 0.4 * dat_fe1$x + alpha + v +dat_fe1$y = 2.0 * dat_fe1$d + 0.3 * dat_fe1$x + alpha + 0.6 * v + rnorm(n_fe) +dat_fe1$w = runif(n_fe, min = 0.5, max = 2.0) + +db_fe1 = dbivreg( + y ~ x | fe1 | d ~ z, + data = dat_fe1, + weights = "w", + vcov = "hc1", + strategy = "demean" +) +fx_fe1 = feols(y ~ x | fe1 | d ~ z, data = dat_fe1, weights = ~w, vcov = "hc1") +expect_iv_match(db_fe1, fx_fe1, tol_coef = tol, tol_se = tol, label = "One-way FE HC1") + +db_fe1_within = dbivreg( + y ~ x | fe1 | d ~ z, + data = dat_fe1, + weights = "w", + vcov = "iid", + strategy = "within" +) +fx_fe1_iid = feols(y ~ x | fe1 | d ~ z, data = dat_fe1, weights = ~w, vcov = "iid") +expect_iv_match(db_fe1_within, fx_fe1_iid, tol_coef = tol, tol_se = tol, label = "Within alias") +expect_true(db_fe1_within$strategy == "demean", info = "Within alias normalizes to demean") + +expect_true( + dbivreg( + y ~ x | fe1 | d ~ z, + data = dat_fe1, + weights = "w", + vcov = "iid", + strategy = "auto" + )$strategy == "demean", + info = "Auto strategy selects demean when fixed effects are present" +) + +set.seed(2028) + +n_units = 20L +n_time = 5L +dat_fe2 = expand.grid(unit = 1:n_units, time = 1:n_time) +dat_fe2$fe1 = factor(dat_fe2$unit) +dat_fe2$fe2 = factor(dat_fe2$time) + +alpha_2 = rnorm(n_units)[dat_fe2$unit] +tau_2 = rnorm(n_time)[dat_fe2$time] +dat_fe2$z = rnorm(nrow(dat_fe2)) +dat_fe2$x = rnorm(nrow(dat_fe2)) +v_2 = rnorm(nrow(dat_fe2)) +dat_fe2$d = 0.9 * dat_fe2$z + 0.4 * dat_fe2$x + alpha_2 + tau_2 + v_2 +dat_fe2$y = 2.0 * dat_fe2$d + 0.3 * dat_fe2$x + alpha_2 + tau_2 + 0.5 * v_2 + rnorm(nrow(dat_fe2)) + +db_fe2 = dbivreg( + y ~ x | fe1 + fe2 | d ~ z, + data = dat_fe2, + vcov = "iid", + strategy = "demean" +) +fx_fe2 = feols(y ~ x | fe1 + fe2 | d ~ z, data = dat_fe2, vcov = "iid") +expect_iv_match(db_fe2, fx_fe2, tol_coef = tol, tol_se = tol, label = "Two-way FE balanced IID") +fs_fe2 = fitstat(fx_fe2, ~ ivf1 + ivwald1, simplify = TRUE) +expect_equal( + db_fe2$diagnostics$first_stage_f$d$stat, + extract_fixest_ivstat(fs_fe2, "ivf1", "d")$stat, + tolerance = tol, + info = "Two-way FE first-stage F matches fixest" +) + +set.seed(2029) + +n_units_ap = 30L +n_time_ap = 6L +dat_ap = expand.grid(unit = 1:n_units_ap, time = 1:n_time_ap) +dat_ap$fe1 = factor(dat_ap$unit) +dat_ap$fe2 = factor(dat_ap$time) +dat_ap$cluster = factor(sample(1:30, nrow(dat_ap), replace = TRUE)) + +alpha_ap = rnorm(n_units_ap)[dat_ap$unit] +tau_ap = rnorm(n_time_ap)[dat_ap$time] +dat_ap$z = rnorm(nrow(dat_ap)) +dat_ap$x = rnorm(nrow(dat_ap)) +v_ap = rnorm(nrow(dat_ap)) +dat_ap$d = 0.8 * dat_ap$z + 0.5 * dat_ap$x + alpha_ap + tau_ap + v_ap +dat_ap$y = 2.0 * dat_ap$d + 0.3 * dat_ap$x + alpha_ap + tau_ap + 0.6 * v_ap + rnorm(nrow(dat_ap)) +dat_ap$w = runif(nrow(dat_ap), min = 0.5, max = 2.0) + +drop_idx = sample(seq_len(nrow(dat_ap)), size = round(0.2 * nrow(dat_ap))) +dat_ap = dat_ap[-drop_idx, ] + +db_ap = dbivreg( + y ~ x | fe1 + fe2 | d ~ z, + data = dat_ap, + weights = "w", + vcov = ~cluster, + strategy = "demean" +) +fx_ap = feols(y ~ x | fe1 + fe2 | d ~ z, data = dat_ap, weights = ~w, vcov = ~cluster) +expect_iv_match(db_ap, fx_ap, tol_coef = tol, tol_se = tol_cluster, label = "Two-way FE AP cluster") +fs_ap = fitstat(fx_ap, ~ ivf1 + ivwald1, simplify = TRUE) +expect_equal( + db_ap$diagnostics$first_stage_wald$d$stat, + extract_fixest_ivstat(fs_ap, "ivwald1", "d")$stat, + tolerance = tol_cluster, + info = "Two-way FE cluster first-stage Wald matches fixest" +) + +set.seed(2033) + +n_units_fe3 = 18L +n_time_fe3 = 4L +n_market_fe3 = 3L +dat_fe3 = expand.grid(unit = 1:n_units_fe3, time = 1:n_time_fe3, market = 1:n_market_fe3) +dat_fe3$fe1 = factor(dat_fe3$unit) +dat_fe3$fe2 = factor(dat_fe3$time) +dat_fe3$fe3 = factor(dat_fe3$market) + +alpha_3 = rnorm(n_units_fe3)[dat_fe3$unit] +tau_3 = rnorm(n_time_fe3)[dat_fe3$time] +gamma_3 = rnorm(n_market_fe3)[dat_fe3$market] +dat_fe3$z = rnorm(nrow(dat_fe3)) +dat_fe3$x = rnorm(nrow(dat_fe3)) +v_3 = rnorm(nrow(dat_fe3)) +dat_fe3$d = 0.8 * dat_fe3$z + 0.4 * dat_fe3$x + alpha_3 + tau_3 + gamma_3 + v_3 +dat_fe3$y = 1.9 * dat_fe3$d + 0.25 * dat_fe3$x + alpha_3 + tau_3 + gamma_3 + + 0.5 * v_3 + rnorm(nrow(dat_fe3)) + +db_fe3 = dbivreg( + y ~ x | fe1 + fe2 + fe3 | d ~ z, + data = dat_fe3, + vcov = "iid", + strategy = "demean" +) +fx_fe3 = feols(y ~ x | fe1 + fe2 + fe3 | d ~ z, data = dat_fe3, vcov = "iid") +expect_iv_match(db_fe3, fx_fe3, tol_coef = tol, tol_se = tol, label = "Three-way FE AP IID") +fs_fe3 = fitstat(fx_fe3, ~ ivf1, simplify = TRUE) +expect_equal( + db_fe3$diagnostics$first_stage_f$d$stat, + extract_fixest_ivstat(fs_fe3, "ivf1", "d")$stat, + tolerance = tol, + info = "Three-way FE first-stage F matches fixest" +) +expect_equal( + db_fe3$n_fe_levels, + c(fe1 = n_units_fe3, fe2 = n_time_fe3, fe3 = n_market_fe3), + info = "Three-way FE levels are retained in the fitted object" +) +expect_equal( + db_fe3$n_fe3, + n_market_fe3, + info = "Third fixed-effect count is available by position" +) + +set.seed(2032) + +n_over = 700L +dat_over = data.frame( + x = rnorm(n_over), + z1 = rnorm(n_over), + z2 = rnorm(n_over), + cluster = factor(sample(1:50, n_over, replace = TRUE)) +) +v_over = rnorm(n_over) +dat_over$d = 0.8 * dat_over$z1 + 0.3 * dat_over$z2 + 0.4 * dat_over$x + v_over +dat_over$y = 1.0 + 2.0 * dat_over$d + 0.4 * dat_over$x + 0.6 * v_over + rnorm(n_over) + +db_over = dbivreg( + y ~ x | d ~ z1 + z2, + data = dat_over, + vcov = "iid", + strategy = "moments" +) +fx_over = feols(y ~ x | d ~ z1 + z2, data = dat_over, vcov = "iid") +expect_iv_match(db_over, fx_over, tol_coef = tol, tol_se = tol, label = "Overidentified IID") +fs_over = fitstat(fx_over, ~ ivf1 + sargan, simplify = TRUE) +expect_equal( + db_over$diagnostics$first_stage_f$d$stat, + extract_fixest_ivstat(fs_over, "ivf1", "d")$stat, + tolerance = tol, + info = "Overidentified first-stage F matches fixest" +) +expect_equal( + db_over$diagnostics$overid$stat, + fs_over$sargan$stat, + tolerance = tol, + info = "Sargan statistic matches fixest" +) +expect_equal( + db_over$diagnostics$overid$p, + fs_over$sargan$p, + tolerance = tol, + info = "Sargan p-value matches fixest" +) +expect_true( + identical(db_over$diagnostics$overid$test, "Sargan"), + info = "IID overidentification test is labeled Sargan" +) + +db_over_hc1 = dbivreg( + y ~ x | d ~ z1 + z2, + data = dat_over, + vcov = "hc1", + strategy = "moments" +) +Z_over = cbind("(Intercept)" = 1, x = dat_over$x, z1 = dat_over$z1, z2 = dat_over$z2) +X_over = cbind("(Intercept)" = 1, x = dat_over$x, d = dat_over$d) +u_over_hc1 = as.numeric(dat_over$y - X_over %*% coef(db_over_hc1)[colnames(X_over)]) +expect_equal( + db_over_hc1$diagnostics$overid$stat, + manual_hansen_j(Z_over, u_over_hc1), + tolerance = tol, + info = "HC1 Hansen J matches manual calculation" +) +expect_true( + identical(db_over_hc1$diagnostics$overid$test, "Hansen J"), + info = "HC1 overidentification test is labeled Hansen J" +) + +db_over_cl = dbivreg( + y ~ x | d ~ z1 + z2, + data = dat_over, + vcov = ~cluster, + strategy = "moments" +) +u_over_cl = as.numeric(dat_over$y - X_over %*% coef(db_over_cl)[colnames(X_over)]) +expect_equal( + db_over_cl$diagnostics$overid$stat, + manual_hansen_j(Z_over, u_over_cl, cluster = dat_over$cluster), + tolerance = tol_cluster, + info = "Cluster-robust Hansen J matches manual calculation" +) +expect_true( + identical(db_over_cl$diagnostics$overid$test, "Hansen J"), + info = "Cluster overidentification test is labeled Hansen J" +) + +set.seed(2030) + +n_fac = 900L +dat_fac = data.frame( + g = factor(sample(letters[1:3], n_fac, replace = TRUE)) +) +g_b = as.numeric(dat_fac$g == "b") +g_c = as.numeric(dat_fac$g == "c") +dat_fac$x = rnorm(n_fac) +dat_fac$z = rnorm(n_fac) +v_fac = rnorm(n_fac) +dat_fac$d = 0.8 * dat_fac$z + 0.4 * dat_fac$x + 0.6 * g_b - 0.3 * g_c + + 0.5 * dat_fac$z * g_b - 0.4 * dat_fac$z * g_c + v_fac +dat_fac$y = 1.0 + 1.8 * dat_fac$d + 0.9 * dat_fac$d * g_b - 0.7 * dat_fac$d * g_c + + 0.3 * dat_fac$x + 0.5 * g_b - 0.2 * g_c + 0.7 * v_fac + rnorm(n_fac) + +db_fac = dbivreg( + y ~ x + g | d + d:g ~ z + z:g, + data = dat_fac, + vcov = "hc1", + strategy = "moments" +) +fx_fac = feols(y ~ x + g | d + d:g ~ z + z:g, data = dat_fac, vcov = "hc1") +expect_iv_match(db_fac, fx_fac, tol_coef = tol, tol_se = tol, label = "Factor-expanded IV moments") +fs_fac = fitstat(fx_fac, ~ ivf1 + ivwald1, simplify = TRUE) +for (nm in names(db_fac$diagnostics$first_stage_wald)) { + expect_equal( + db_fac$diagnostics$first_stage_wald[[nm]]$stat, + extract_fixest_ivstat(fs_fac, "ivwald1", nm)$stat, + tolerance = tol_cluster, + info = paste("Factor-expanded first-stage Wald matches fixest for", nm) + ) +} + +sql_fac = invisible(capture.output( + dbivreg( + y ~ x + g | d + d:g ~ z + z:g, + data = dat_fac, + sql_only = TRUE + ) +)) +expect_true(any(grepl("CASE WHEN", sql_fac)), info = "sql_only expands factor IV terms in SQL") + +set.seed(2031) + +n_fac_fe = 850L +dat_fac_fe = data.frame( + g = factor(sample(letters[1:3], n_fac_fe, replace = TRUE)), + fe = factor(sample(1:30, n_fac_fe, replace = TRUE)) +) +gfe_b = as.numeric(dat_fac_fe$g == "b") +gfe_c = as.numeric(dat_fac_fe$g == "c") +alpha_fe = rnorm(30)[dat_fac_fe$fe] +dat_fac_fe$x = rnorm(n_fac_fe) +dat_fac_fe$z = rnorm(n_fac_fe) +v_fac_fe = rnorm(n_fac_fe) +dat_fac_fe$d = 0.7 * dat_fac_fe$z + 0.3 * dat_fac_fe$x + 0.5 * gfe_b - 0.2 * gfe_c + + 0.4 * dat_fac_fe$z * gfe_b - 0.3 * dat_fac_fe$z * gfe_c + alpha_fe + v_fac_fe +dat_fac_fe$y = 1.5 * dat_fac_fe$d + 0.7 * dat_fac_fe$d * gfe_b - 0.5 * dat_fac_fe$d * gfe_c + + 0.4 * dat_fac_fe$x + 0.4 * gfe_b - 0.1 * gfe_c + alpha_fe + 0.6 * v_fac_fe + rnorm(n_fac_fe) + +db_fac_fe = dbivreg( + y ~ x + g | fe | d + d:g ~ z + z:g, + data = dat_fac_fe, + vcov = "iid", + strategy = "demean" +) +fx_fac_fe = feols(y ~ x + g | fe | d + d:g ~ z + z:g, data = dat_fac_fe, vcov = "iid") +expect_iv_match(db_fac_fe, fx_fac_fe, tol_coef = tol, tol_se = tol, label = "Factor-expanded IV demean") + +expect_error( + dbivreg(y ~ x | d ~ z, data = dat, endog = ~d, instruments = ~z), + "no longer supported" +) + +expect_error( + dbivreg(y ~ x | fe1, data = dat_fe1), + "fixest-style IV formula|IV specification|IV block" +) + +expect_error( + dbivreg(y ~ x | d ~ z, data = dat, strategy = "mundlak"), + "does not yet implement strategy = 'mundlak'" +) + +expect_error( + dbivreg(y ~ x | fe1 | d ~ z, data = dat_fe1, strategy = "compress"), + "only available without fixed effects" +) + +expect_error( + dbivreg(y ~ x | fe1 | d ~ z, data = dat_fe1, strategy = "moments"), + "only available without fixed effects" +) + +expect_error( + dbivreg(y ~ x | d ~ z, data = dat, strategy = "demean"), + "requires at least one fixed effect" +) diff --git a/inst/tinytest/test_dbivreg_nyc.R b/inst/tinytest/test_dbivreg_nyc.R new file mode 100644 index 0000000..693d939 --- /dev/null +++ b/inst/tinytest/test_dbivreg_nyc.R @@ -0,0 +1,112 @@ +# NYC Taxi tests (large data smoke + backend validation) +# ----------------------------------------------------------------------------- + +if (!tolower(Sys.getenv("DBREG_TEST_NYC")) %in% c("true", "1")) { + exit_file("Run `Sys.setenv(DBREG_TEST_NYC = 'TRUE')` to enable NYC taxi tests") +} + +nyc_path = here::here("nyc-taxi/year=2012") +if (!dir.exists(nyc_path)) { + exit_file("NYC taxi data not found at nyc-taxi/year=2012") +} + +library(dbreg) +library(DBI) +library(duckdb) +library(fixest) + +rename_fixest_iv = function(x) { + names(x) = sub("^fit_", "", names(x)) + x +} + +con = dbConnect(duckdb::duckdb(), shutdown = TRUE) + +dbExecute(con, sprintf(" + CREATE VIEW nyc_iv AS + SELECT + trip_distance AS x, + fare_amount / 10.0 AS z, + passenger_count AS fe1, + dayofweek(dropoff_datetime) AS fe2, + 5.0 * (fare_amount / 10.0) + 0.5 * trip_distance + + 0.8 * passenger_count + 0.2 * dayofweek(dropoff_datetime) + tip_amount / 10.0 AS d, + 1.0 + 2.0 * ( + 5.0 * (fare_amount / 10.0) + 0.5 * trip_distance + + 0.8 * passenger_count + 0.2 * dayofweek(dropoff_datetime) + tip_amount / 10.0 + ) + + 0.3 * trip_distance + 0.8 * passenger_count + + 0.2 * dayofweek(dropoff_datetime) + tip_amount / 20.0 AS y, + 1.0 + passenger_count / 10.0 AS w + FROM read_parquet('%s/month=1/*.parquet') + WHERE trip_distance > 0 AND trip_distance < 20 + AND fare_amount > 0 AND fare_amount < 100 + AND passenger_count > 0 + LIMIT 30000 +", nyc_path)) + +nyc_local = dbGetQuery(con, "SELECT * FROM nyc_iv") + +nyc_iv = dbivreg( + y ~ x | d ~ z, + conn = con, + table = "nyc_iv", + weights = "w", + vcov = "hc1" +) + +expect_true( + inherits(nyc_iv, "dbivreg"), + info = "dbivreg returns dbivreg object on NYC backend data" +) +expect_true( + all(is.finite(nyc_iv$coeftable[, c("estimate", "std.error")])), + info = "dbivreg returns finite coefficients and standard errors on NYC backend data" +) + +fx_iv = feols(y ~ x | d ~ z, data = nyc_local, weights = ~w, vcov = "hc1") +fx_coef = rename_fixest_iv(coef(fx_iv)) +expect_true( + max(abs(nyc_iv$coeftable[names(fx_coef), "estimate"] - fx_coef)) < 1e-6, + info = "dbivreg coefficients match fixest on NYC backend view" +) + +fx_se = rename_fixest_iv(se(fx_iv)) +expect_true( + max(abs(nyc_iv$coeftable[names(fx_se), "std.error"] - fx_se)) < 1e-6, + info = "dbivreg HC1 standard errors match fixest on NYC backend view" +) + +nyc_iv_fe = dbivreg( + y ~ x | fe1 + fe2 | d ~ z, + conn = con, + table = "nyc_iv", + weights = "w", + vcov = "hc1", + strategy = "auto" +) + +expect_true( + nyc_iv_fe$strategy == "demean", + info = "dbivreg auto strategy selects demean on NYC FE-IV backend data" +) +expect_true( + all(is.finite(nyc_iv_fe$coeftable[, c("estimate", "std.error")])), + info = "dbivreg FE-IV returns finite coefficients and standard errors on NYC backend data" +) + +fx_iv_fe = feols(y ~ x | fe1 + fe2 | d ~ z, data = nyc_local, weights = ~w, vcov = "hc1") +fx_fe_coef = rename_fixest_iv(coef(fx_iv_fe)) +expect_true( + max(abs(nyc_iv_fe$coeftable[names(fx_fe_coef), "estimate"] - fx_fe_coef)) < 1e-6, + info = "dbivreg FE-IV coefficients match fixest on NYC backend view" +) + +fx_fe_se = rename_fixest_iv(se(fx_iv_fe)) +expect_true( + max(abs(nyc_iv_fe$coeftable[names(fx_fe_se), "std.error"] - fx_fe_se)) < 1e-6, + info = "dbivreg FE-IV HC1 standard errors match fixest on NYC backend view" +) + +dbDisconnect(con) +rm(con) diff --git a/man/dbivreg.Rd b/man/dbivreg.Rd new file mode 100644 index 0000000..a886ee2 --- /dev/null +++ b/man/dbivreg.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dbivreg.R +\name{dbivreg} +\alias{dbivreg} +\title{Instrumental-variables regression on a database backend} +\usage{ +dbivreg( + fml, + conn = NULL, + table = NULL, + data = NULL, + path = NULL, + weights = NULL, + vcov = c("iid", "hc1"), + strategy = c("auto", "moments", "demean", "within", "compress", "mundlak"), + cluster = NULL, + sql_only = FALSE, + data_only = FALSE, + drop_missings = TRUE, + verbose = getOption("dbreg.verbose", FALSE), + ... +) +} +\arguments{ +\item{fml}{A fixest-style IV formula. Use \code{y ~ x1 + x2 | d ~ z} for models +without absorbed fixed effects, and \code{y ~ x1 + x2 | fe1 + fe2 + fe3 | d ~ z} +when fixed effects are present. The IV block must always be the final +pipe-separated block.} + +\item{conn}{Database connection, or \code{NULL} to create an ephemeral DuckDB +connection.} + +\item{table, data, path}{Mutually exclusive data source arguments with +precedence \verb{table > data > path}.} + +\item{weights}{Optional character string naming an observation-weight column.} + +\item{vcov}{Character string or one-sided clustering formula. Character +options are \code{"iid"} and \code{"hc1"}.} + +\item{strategy}{Character string selecting the IV backend. Supported values +are \code{"auto"} (default), \code{"moments"}, \code{"demean"} (alias +\code{"within"}), \code{"compress"}, and \code{"mundlak"}. The latter currently +errors because it is not yet implemented for IV.} + +\item{cluster}{Optional clustering variable, supplied as a one-sided formula +or character string.} + +\item{sql_only}{Logical indicating whether to return only the underlying SQL +query.} + +\item{data_only}{Logical indicating whether to return only the aggregated +sufficient statistics.} + +\item{drop_missings}{Logical indicating whether incomplete cases should be +dropped.} + +\item{verbose}{Logical indicating whether progress messages are printed.} + +\item{...}{Additional unused arguments. Passing legacy \code{endog} or +\code{instruments} arguments now errors.} +} +\value{ +A list of class \code{"dbivreg"} and \code{"dbreg"}. +} +\description{ +Estimates a linear IV model via exact sufficient statistics on a database +backend. The current implementation supports three exact strategies: +\itemize{ +\item \code{strategy = "moments"} for models without absorbed fixed effects. +\item \code{strategy = "compress"} for repeated no-FE design rows. +\item \code{strategy = "demean"} for one or more absorbed fixed effects. +} + +Multi-way fixed effects use exact alternating projections when there are +more than two fixed effects, or when two-way fixed effects are unbalanced or +weights are supplied. +} diff --git a/man/print.dbivreg.Rd b/man/print.dbivreg.Rd new file mode 100644 index 0000000..48fb4ce --- /dev/null +++ b/man/print.dbivreg.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dbivreg.R +\name{print.dbivreg} +\alias{print.dbivreg} +\title{Print method for dbivreg objects} +\usage{ +\method{print}{dbivreg}(x, ...) +} +\arguments{ +\item{x}{A \code{dbivreg} object.} + +\item{...}{Additional unused arguments.} +} +\description{ +Print method for \code{dbivreg} objects. +}