Skip to content

Data description

Related pages: Data visualization

Resources

Useful packages

§1. Quick checks

Sample size and variables

Using skimr

# R template: supply the data frame as the first argument,
# e.g. skimr::skim(DATA).
skimr::skim()

Using summarytools

# Build a per-variable summary table of `dataset_xxx` with summarytools.
# `if_label` is a placeholder for a logical flag defined by the caller.
data_summary <- summarytools::dfSummary(
  dataset_xxx,
  # Which columns the summary table contains
  varnumbers = TRUE,
  labels.col = if_label,
  valid.col = FALSE,
  na.col = TRUE,
  # How the table is rendered
  style = "grid",
  plain.ascii = FALSE,
  graph.magnif = 1,
  max.string.width = 25,
  split.table = 30,
  tmp.img.dir = "/tmp"
)

# Render the summary to "<name>_summary.html" inside `path_html`;
# `footnote = NA` is passed so no footnote is requested.
summarytools::view(
  data_summary,
  file = file.path(path_html, paste0(name, "_summary.html")),
  footnote = NA
)
// Stata: abbreviated forms of the describe command
d // equivalent to `describe`
d, s // equivalent to `describe, short`
# Julia (presumably DataFrames.jl — confirm): summary of each column of `df`
describe(df)
# Same call, passing :all as the second argument to request every statistic
describe(df, :all)

Missing values

# R template: supply the data frame, e.g. finalfit::missing_plot(DATA).
finalfit::missing_plot()
// Stata. NOTE(review): misstable normally takes a subcommand
// (e.g. `misstable summarize`) — confirm the intended form.
misstable

Continuous variables

Quantiles

# General form: `x` is the numeric vector, `probs` the probabilities to
# evaluate.
quantile(x, probs)
# Example: probabilities 0, 0.25, 0.5, 0.75 and 1, i.e. the minimum, the
# quartiles and the maximum. quantile() stops with an error when `x`
# contains NA unless `na.rm = TRUE` is also passed.
quantile(x, probs = seq(0, 1, 0.25))

Histogram

Draw a histogram:

# Base R: quick histogram of a numeric vector.
hist(VECTOR_NAME)

# ggplot2: histogram of one column of DATA using a fixed bin width of 0.1,
# light gray bars with black outlines.
ggplot(data = DATA) +
  geom_histogram(
    mapping = aes(x = VARIABLE),
    binwidth = 0.1,
    fill = "lightgray",
    color = "black"
  ) +
  labs(x = "[VAR]", y = "Count") +
  theme_minimal()
// Stata: replace <var> with the variable name
su <var> // abbreviation of `summarize`
su <var>, d // equivalent to `summarize, detail`
codebook <var>
histogram <var>
histogram() # Julia. NOTE(review): DataFrames.jl does not provide histogram(); this is presumably Plots.jl / StatsPlots.jl — confirm

CDF

# Empirical CDF of column VAR in DATA, drawn as a step line.
# DATA and VAR are placeholders to be replaced by the caller.
ggplot() +
  stat_ecdf(
    data = DATA,
    mapping = aes(x = VAR),
    linewidth = 0.8
  ) +
  # Cumulative probability axis: fixed to [0, 1] with a tick every 0.1.
  scale_y_continuous(
    limits = c(0, 1),
    expand = c(0.02, 0),
    breaks = seq(0, 1, 0.1)
  ) +
  scale_x_continuous(
    # Optional: uncomment to pin the x range and tick positions. Each
    # optional line carries its own comma, so the call never ends in an
    # empty argument whether or not these lines are active.
    # limits = c(-1, 1),
    # breaks = seq(-1, 1, 0.1),
    expand = c(0.02, 0)
  ) +
  labs(
    x = "[VARIABLE NAME]",
    y = "Cumulative Probability"
  ) +
  theme_minimal() +
  theme(
    # Grid: light major lines in both directions, no minor lines.
    panel.grid.major.y = element_line(color = "gray90", linewidth = 0.3),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray90", linewidth = 0.3),
    panel.grid.minor.x = element_blank(),
    # Axis titles, text and ticks.
    axis.title.x = element_text(margin = margin(t = 8)),
    axis.title.y = element_text(margin = margin(r = 8)),
    axis.title = element_text(size = 10, color = "black"),
    axis.text = element_text(size = 9, color = "black"),
    # Axis lines are given zero width; the frame is drawn by panel.border.
    axis.line = element_line(color = "black", linewidth = 0),
    axis.ticks.x = element_line(color = "black", linewidth = 0.3),
    axis.ticks.y = element_line(color = "black", linewidth = 0.3),
    # Legend settings only matter once a legend exists, e.g. after
    # mapping color to a grouping variable.
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 9, color = "black"),
    legend.key.height = unit(10, "pt"),
    legend.key.width = unit(10, "pt"),
    legend.margin = margin(b = -5),
    panel.border = element_rect(color = "black", fill = NA)
  )

Categorical variables

# Frequency-table templates: supply the vector or column to tabulate.
table()
# NOTE(review): these two presumably expect the data frame first, e.g.
# DATA |> dplyr::count(col_name) — confirm intended usage.
dplyr::count(col_name)
janitor::tabyl(col_name)

§2. Other patterns

TBD

Subgroup comparisons

TBD

TBD

Case studies

TBD