Data description
Related pages: Data visualization
Resources¶
Useful packages¶
§1. Quick checks¶
Sample size and variables¶
Using skimr
Using summarytools
# Build a per-variable summary table of `dataset_xxx` with summarytools.
# `dataset_xxx` and `if_label` must already exist in the calling environment.
# NOTE(review): `if_label` is presumably a TRUE/FALSE flag saying whether the
# data carry variable labels -- confirm where it is defined.
data_summary <- summarytools::dfSummary(
  dataset_xxx,
  varnumbers = TRUE,
  labels.col = if_label,
  graph.magnif = 1,
  valid.col = FALSE,
  na.col = TRUE,
  style = "grid",
  plain.ascii = FALSE,
  max.string.width = 25,
  # Spelled out in full: the original `split.table` only worked through
  # partial argument matching against `split.tables`.
  split.tables = 30,
  tmp.img.dir = "/tmp"
)

# Render the summary to "<name>_summary.html" inside `path_html`, without a
# footnote. `path_html` and `name` must already be defined by the caller.
summarytools::view(
  data_summary,
  footnote = NA,
  file = file.path(path_html, paste0(name, "_summary.html"))
)
Missing values¶
Continuous variables¶
Quantiles¶
Histogram¶
Draw a histogram:
CDF¶
# Empirical CDF of one continuous variable.
# Template placeholders: replace DATA with the data frame, VAR with the column
# to plot, and "[VARIABLE NAME]" with the x-axis label.
ggplot() +
  stat_ecdf(data = DATA, mapping = aes(x = VAR), linewidth = 0.8) +
  # Probability axis: fixed to [0, 1] with a tick every 0.1.
  scale_y_continuous(
    limits = c(0, 1),
    breaks = seq(0, 1, 0.1),
    expand = c(0.02, 0)
  ) +
  scale_x_continuous(
    # Optional: uncomment and adapt to pin the x-axis range and tick marks.
    # limits = c(-1, 1),
    # breaks = seq(-1, 1, 0.1),
    expand = c(0.02, 0)
  ) +
  labs(x = "[VARIABLE NAME]", y = "Cumulative Probability") +
  theme_minimal() +
  theme(
    # Grid: light major lines in both directions, no minor lines.
    panel.grid.major.x = element_line(color = "gray90", linewidth = 0.3),
    panel.grid.major.y = element_line(color = "gray90", linewidth = 0.3),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    # Frame drawn around the plotting area.
    panel.border = element_rect(color = "black", fill = NA),
    # Axes: titles, tick labels, tick marks, and a zero-width axis line.
    axis.title = element_text(size = 10, color = "black"),
    axis.title.x = element_text(margin = margin(t = 8)),
    axis.title.y = element_text(margin = margin(r = 8)),
    axis.text = element_text(size = 9, color = "black"),
    axis.ticks.x = element_line(color = "black", linewidth = 0.3),
    axis.ticks.y = element_line(color = "black", linewidth = 0.3),
    axis.line = element_line(color = "black", linewidth = 0),
    # Legend settings: placed on top, untitled, with compact keys.
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 9, color = "black"),
    legend.key.width = unit(10, "pt"),
    legend.key.height = unit(10, "pt"),
    legend.margin = margin(b = -5)
  )
Categorical variables¶
§2. Other patterns¶
TBD
Subgroup comparisons¶
TBD
Time trends¶
TBD
Case studies¶
TBD