Can standardize by either 1) log()-transforming and then applying scale()
(mean-centering and scaling by the standard deviation), or 2) if regressed_on
variables are given, log-transforming, running a linear regression to obtain
the stats::residuals(), and finally scaling those residuals. Use regressed_on
to try to remove the influence of potential confounding.
nc_standardize(data, cols = everything(), regressed_on = NULL)
data: Data frame.
cols: Metabolic variables that will make up the network.
regressed_on: Optional. A character vector of variables to regress the metabolic variables on. Use if you want to standardize the metabolic variables on variables that are known to influence them, e.g. sex or age. Calculates the residuals from a linear regression model.
Outputs a tibble object, with the original metabolic variables now standardized.
See nc_estimate_links for more detailed examples or the vignette("NetCoupler").
# Don't regress on any variable
simulated_data %>%
nc_standardize(starts_with("metabolite_"))
#> # A tibble: 2,000 × 18
#> metabolite_1 metabolite_2 metabolite_3 metabolite_4 metabolite_5 metabolite_6
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.318 0.105 1.30 NA -0.181 -0.441
#> 2 -1.06 0.622 -1.04 -0.113 -2.14 -0.774
#> 3 -0.467 1.04 1.13 0.888 0.910 -0.280
#> 4 1.16 0.162 0.115 -0.664 0.730 -0.202
#> 5 0.336 -2.80 1.19 1.66 1.02 1.25
#> 6 0.114 0.319 -0.0494 -1.34 -0.192 0.442
#> 7 0.586 0.232 0.533 1.84 0.822 -1.58
#> 8 0.959 -0.763 -0.375 0.523 1.29 1.24
#> 9 -0.589 -1.01 -0.241 -1.28 -0.789 -0.0464
#> 10 -0.516 -0.575 0.484 -0.369 -0.432 1.48
#> # … with 1,990 more rows, and 12 more variables: metabolite_7 <dbl>,
#> # metabolite_8 <dbl>, metabolite_9 <dbl>, metabolite_10 <dbl>,
#> # outcome_continuous <dbl>, metabolite_12 <dbl>, metabolite_11 <dbl>,
#> # exposure <dbl>, age <dbl>, id <int>, outcome_event_time <dbl>,
#> # outcome_binary <int>
# Extract residuals by regressing on a variable
simulated_data %>%
nc_standardize(starts_with("metabolite_"), "age")
#> # A tibble: 2,000 × 18
#> metabolite_1 metabolite_2 metabolite_3 metabolite_4 metabolite_5 metabolite_6
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.296 0.121 1.28 NA -0.198 -0.457
#> 2 -1.07 0.626 -1.04 -0.0980 -2.16 -0.781
#> 3 -0.483 1.05 1.11 0.920 0.902 -0.289
#> 4 1.18 0.147 0.137 -0.714 0.745 -0.191
#> 5 0.339 -2.80 1.19 1.66 1.02 1.25
#> 6 0.108 0.322 -0.0530 -1.33 -0.197 0.440
#> 7 0.584 0.234 0.532 1.84 0.822 -1.59
#> 8 0.971 -0.770 -0.362 0.497 1.30 1.25
#> 9 -0.616 -0.994 -0.264 -1.22 -0.809 -0.0623
#> 10 -0.528 -0.568 0.476 -0.346 -0.440 1.48
#> # … with 1,990 more rows, and 12 more variables: metabolite_7 <dbl>,
#> # metabolite_8 <dbl>, metabolite_9 <dbl>, metabolite_10 <dbl>,
#> # outcome_continuous <dbl>, metabolite_12 <dbl>, metabolite_11 <dbl>,
#> # exposure <dbl>, age <dbl>, id <int>, outcome_event_time <dbl>,
#> # outcome_binary <int>
# Works with factors too
simulated_data %>%
dplyr::mutate(Sex = as.factor(sample(rep(c("F", "M"), times = nrow(.) / 2)))) %>%
nc_standardize(starts_with("metabolite_"), c("age", "Sex"))
#> # A tibble: 2,000 × 19
#> metabolite_1 metabolite_2 metabolite_3 metabolite_4 metabolite_5 metabolite_6
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.310 0.108 1.26 NA -0.199 -0.487
#> 2 -1.05 0.613 -1.06 -0.103 -2.16 -0.811
#> 3 -0.496 1.07 1.13 0.925 0.903 -0.259
#> 4 1.17 0.160 0.151 -0.709 0.746 -0.161
#> 5 0.325 -2.79 1.21 1.66 1.02 1.28
#> 6 0.122 0.309 -0.0668 -1.33 -0.198 0.410
#> 7 0.598 0.220 0.519 1.84 0.821 -1.62
#> 8 0.985 -0.783 -0.375 0.492 1.30 1.22
#> 9 -0.630 -0.981 -0.251 -1.22 -0.808 -0.0322
#> 10 -0.542 -0.555 0.490 -0.341 -0.440 1.51
#> # … with 1,990 more rows, and 13 more variables: metabolite_7 <dbl>,
#> # metabolite_8 <dbl>, metabolite_9 <dbl>, metabolite_10 <dbl>,
#> # outcome_continuous <dbl>, metabolite_12 <dbl>, metabolite_11 <dbl>,
#> # exposure <dbl>, age <dbl>, id <int>, outcome_event_time <dbl>,
#> # outcome_binary <int>, Sex <fct>