Skip to content

Commit 2a995eb

Browse files
committed
isolate chunks for set 1
1 parent 71dbe2a commit 2a995eb

8 files changed

+710
-611
lines changed

instructors/01-practical-tutors.qmd

Lines changed: 14 additions & 422 deletions
Large diffs are not rendered by default.
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# nolint start
2+
3+
# Practical 1
4+
# Activity 1
5+
6+
# Load packages ----------------------------------------------------------
7+
library(cleanepi)
8+
library(linelist)
9+
library(incidence2)
10+
library(tidyverse)
11+
12+
13+
# Adapt the data dictionary ----------------------------------------------
14+
15+
# Replace 'variable_name' when you have the information
16+
dat_dictionary <- tibble::tribble(
17+
~options, ~values, ~grp, ~orders,
18+
"1", "male", "sex_fem_2", 1L,
19+
"2", "female", "sex_fem_2", 2L
20+
)
21+
22+
dat_dictionary
23+
24+
25+
# Read raw data ----------------------------------------------------------
26+
dat_raw <- readr::read_csv(
27+
"https://epiverse-trace.github.io/tutorials-early/data/linelist-date_of_birth.csv"
28+
)
29+
30+
dat_raw
31+
32+
33+
# Clean and standardize data ---------------------------------------------
34+
35+
# How many cleanepi functions did you use to get clean data?
36+
dat_clean <- dat_raw %>%
37+
cleanepi::standardize_column_names() %>%
38+
cleanepi::standardize_dates(
39+
target_columns = c(
40+
"date_of_admission",
41+
"date_of_birth",
42+
"date_first_pcr_positive_test"
43+
)
44+
) %>%
45+
cleanepi::check_date_sequence(
46+
target_columns = c(
47+
"date_of_birth",
48+
"date_first_pcr_positive_test",
49+
"date_of_admission"
50+
)
51+
) %>%
52+
# clean_using_dictionary() needs missing entries coded as NA, so recode "-99" first
53+
cleanepi::replace_missing_values(
54+
target_columns = "sex_fem_2",
55+
na_strings = "-99"
56+
) %>%
57+
cleanepi::clean_using_dictionary(dictionary = dat_dictionary) %>%
58+
cleanepi::remove_constants() %>%
59+
cleanepi::remove_duplicates(
60+
target_columns = c("study_id", "date_of_birth")
61+
)
62+
63+
dat_clean
64+
65+
66+
# Create time span variable ----------------------------------------------
67+
68+
# What time span unit best describes the age, from 'date_of_birth' to today?
69+
dat_timespan <- dat_clean %>%
70+
cleanepi::timespan(
71+
target_column = "date_of_birth",
72+
end_date = Sys.Date(),
73+
span_unit = "years",
74+
span_column_name = "timespan_variable",
75+
span_remainder_unit = "months"
76+
) %>%
77+
# skimr::skim(timespan_variable)
78+
# Categorize the numerical time span variable (age in years)
79+
dplyr::mutate(
80+
timespan_category = base::cut(
81+
x = timespan_variable,
82+
breaks = c(0, 20, 35, 60, 80),
83+
include.lowest = TRUE,
84+
right = FALSE
85+
)
86+
)
87+
88+
dat_timespan
89+
90+
91+
# nolint end
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# nolint start
2+
3+
# Practical 1
4+
# Activity 1
5+
6+
# Load packages ----------------------------------------------------------
7+
library(cleanepi)
8+
library(linelist)
9+
library(incidence2)
10+
library(tidyverse)
11+
12+
13+
# Adapt the data dictionary ----------------------------------------------
14+
15+
# Replace 'variable_name' when you have the information
16+
dat_dictionary <- tibble::tribble(
17+
~options, ~values, ~grp, ~orders,
18+
"1", "male", "sex", 1L,
19+
"2", "female", "sex", 2L,
20+
"M", "male", "sex", 3L,
21+
"F", "female", "sex", 4L,
22+
"m", "male", "sex", 5L,
23+
"f", "female", "sex", 6L
24+
)
25+
26+
dat_dictionary
27+
28+
29+
# Read raw data ----------------------------------------------------------
30+
dat_raw <- readr::read_csv(
31+
"https://epiverse-trace.github.io/tutorials-early/data/covid_simulated_data.csv"
32+
)
33+
34+
dat_raw
35+
36+
37+
# Clean and standardize data ---------------------------------------------
38+
39+
# How many cleanepi functions did you use to get clean data?
40+
dat_clean <- dat_raw %>%
41+
cleanepi::standardize_column_names() %>%
42+
cleanepi::standardize_dates(
43+
target_columns = c(
44+
"date_onset",
45+
"date_admission",
46+
"date_outcome",
47+
"date_first_contact",
48+
"date_last_contact"
49+
)
50+
) %>%
51+
cleanepi::check_date_sequence(
52+
target_columns = c(
53+
"date_first_contact",
54+
"date_last_contact",
55+
"date_onset",
56+
"date_admission",
57+
"date_outcome"
58+
)
59+
) %>%
60+
cleanepi::convert_to_numeric(target_columns = "age") %>%
61+
# dplyr::count(sex)
62+
# clean_using_dictionary() needs missing entries coded as NA, so recode "-99" first
63+
cleanepi::replace_missing_values(
64+
target_columns = "sex",
65+
na_strings = "-99"
66+
) %>%
67+
cleanepi::clean_using_dictionary(dictionary = dat_dictionary) %>%
68+
cleanepi::remove_constants() %>%
69+
cleanepi::remove_duplicates(
70+
target_columns = c("case_id", "case_name")
71+
)
72+
73+
dat_clean
74+
75+
76+
# Create time span variable ----------------------------------------------
77+
78+
# What time span unit best describes the 'delay' from 'onset' to 'death'?
79+
dat_timespan <- dat_clean %>%
80+
cleanepi::timespan(
81+
target_column = "date_onset",
82+
end_date = "date_outcome",
83+
span_unit = "days",
84+
span_column_name = "timespan_variable",
85+
span_remainder_unit = NULL
86+
) %>%
87+
# skimr::skim(timespan_variable)
88+
# Categorize the delay numerical variable
89+
dplyr::mutate(
90+
timespan_category = base::cut(
91+
x = timespan_variable,
92+
breaks = c(0, 10, 15, 40),
93+
include.lowest = TRUE,
94+
right = FALSE
95+
)
96+
)
97+
98+
dat_timespan
99+
100+
101+
# nolint end
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# nolint start
2+
3+
# Practical 1
4+
# Activity 1
5+
6+
# Load packages ----------------------------------------------------------
7+
library(cleanepi)
8+
library(linelist)
9+
library(incidence2)
10+
library(tidyverse)
11+
12+
13+
# Adapt the data dictionary ----------------------------------------------
14+
15+
# Replace 'variable_name' when you have the information
16+
dat_dictionary <- tibble::tribble(
17+
~options, ~values, ~grp, ~orders,
18+
"1", "male", "variable_name", 1L,
19+
"2", "female", "variable_name", 2L,
20+
"M", "male", "variable_name", 3L,
21+
"F", "female", "variable_name", 4L,
22+
"m", "male", "variable_name", 5L,
23+
"f", "female", "variable_name", 6L
24+
)
25+
26+
dat_dictionary
27+
28+
29+
# Read raw data ----------------------------------------------------------
30+
dat_raw <- readr::read_csv(
31+
"https://epiverse-trace.github.io/tutorials-early/data/delta_full-messy.csv"
32+
)
33+
34+
dat_raw
35+
36+
37+
# Clean and standardize data ---------------------------------------------
38+
39+
# How many cleanepi functions did you use to get clean data?
40+
dat_clean <- dat_raw %>%
41+
cleanepi::standardize_column_names() %>%
42+
cleanepi::standardize_dates(target_columns = "date") %>% #
43+
cleanepi::convert_to_numeric(target_columns = "exp_num") %>%
44+
cleanepi::check_date_sequence(
45+
target_columns = c("last_exp_date", "date")
46+
)
47+
48+
dat_clean
49+
50+
51+
# Create time span variable ----------------------------------------------
52+
53+
# What time span unit best describes the 'delay' from 'last_exp_date' to 'date'?
54+
dat_timespan <- dat_clean %>%
55+
cleanepi::timespan(
56+
target_column = "last_exp_date",
57+
end_date = "date",
58+
span_unit = "days",
59+
span_column_name = "timespan_variable",
60+
span_remainder_unit = NULL
61+
) %>%
62+
# skimr::skim(timespan_variable)
63+
# Categorize the delay numerical variable
64+
dplyr::mutate(
65+
timespan_category = base::cut(
66+
x = timespan_variable,
67+
breaks = c(0, 30, 100, 600),
68+
include.lowest = TRUE,
69+
right = FALSE
70+
)
71+
)
72+
73+
dat_timespan
74+
75+
76+
# nolint end
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# nolint start
2+
3+
# Practical 1
4+
# Activity 2
5+
6+
# Validate linelist ------------------------------------------------------
7+
8+
# Activate error message
9+
linelist::lost_tags_action(action = "error")
10+
# linelist::lost_tags_action(action = "warning")
11+
12+
# Print tag types, names, and data to guide make_linelist
13+
linelist::tags_types()
14+
linelist::tags_names()
15+
dat_timespan
16+
17+
# Does the age variable pass the validation step?
18+
dat_validate <- dat_timespan %>%
19+
# Tag variables
20+
linelist::make_linelist(
21+
id = "study_id",
22+
date_reporting = "date_first_pcr_positive_test",
23+
gender = "sex_fem_2",
24+
# age = "timespan_category", # does not pass validation
25+
age = "timespan_variable",
26+
occupation = "timespan_category" # Categorical variable
27+
) %>%
28+
# Validate linelist
29+
linelist::validate_linelist() %>%
30+
# Test safeguard
31+
# dplyr::select(case_id, date_onset, sex)
32+
# INSTEAD
33+
linelist::tags_df()
34+
35+
36+
# Create incidence -------------------------------------------------------
37+
38+
# What is the most appropriate time-aggregate (days, months) to plot?
39+
dat_incidence <- dat_validate %>%
40+
# Transform from individual-level to time-aggregate
41+
incidence2::incidence(
42+
date_index = "date_reporting",
43+
groups = "occupation", # OR any categorical variable
44+
interval = "month",
45+
complete_dates = TRUE
46+
)
47+
48+
49+
# Plot epicurve ----------------------------------------------------------
50+
51+
# Do arguments like 'fill', 'show_cases', 'angle', 'n_breaks' improve the plot?
52+
dat_incidence %>%
53+
plot(
54+
fill = "occupation", # <KEEP OR DROP>
55+
show_cases = TRUE, # <KEEP OR DROP>
56+
angle = 45, # <KEEP OR DROP>
57+
n_breaks = 5 # <KEEP OR DROP>
58+
)
59+
60+
# Find plot() arguments at ?incidence2:::plot.incidence2
61+
62+
# nolint end
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# nolint start
2+
3+
# Practical 1
4+
# Activity 2
5+
6+
# Validate linelist ------------------------------------------------------
7+
8+
# Activate error message
9+
linelist::lost_tags_action(action = "error")
10+
# linelist::lost_tags_action(action = "warning")
11+
12+
# Print tag types, names, and data to guide make_linelist
13+
linelist::tags_types()
14+
linelist::tags_names()
15+
dat_timespan
16+
17+
# Does the age variable pass the validation step?
18+
dat_validate <- dat_timespan %>%
19+
# Tag variables
20+
linelist::make_linelist(
21+
id = "case_id",
22+
date_onset = "date_onset",
23+
gender = "sex",
24+
age = "age",
25+
outcome = "outcome",
26+
occupation = "timespan_category" # Categorical variable
27+
) %>%
28+
# Validate linelist
29+
linelist::validate_linelist() %>%
30+
# Test safeguard
31+
# dplyr::select(case_id, date_onset, sex)
32+
# INSTEAD
33+
linelist::tags_df()
34+
35+
36+
# Create incidence -------------------------------------------------------
37+
38+
# What is the most appropriate time-aggregate (days, months) to plot?
39+
dat_incidence <- dat_validate %>%
40+
# Transform from individual-level to time-aggregate
41+
incidence2::incidence(
42+
date_index = "date_onset",
43+
groups = "outcome", # OR any categorical variable
44+
interval = "day",
45+
complete_dates = TRUE
46+
)
47+
48+
49+
# Plot epicurve ----------------------------------------------------------
50+
51+
# Do arguments like 'fill', 'show_cases', 'angle', 'n_breaks' improve the plot?
52+
dat_incidence %>%
53+
plot(
54+
angle = 45, # <KEEP OR DROP>
55+
n_breaks = 5 # <KEEP OR DROP>
56+
)
57+
58+
# Find plot() arguments at ?incidence2:::plot.incidence2
59+
60+
# nolint end

0 commit comments

Comments
 (0)