epiverse-trace
diff --git a/‎instructors/01-practical-tutors.qmd‎
Lines changed: 14 additions & 422 deletions b/‎instructors/01-practical-tutors.qmd‎
Lines changed: 14 additions & 422 deletions
diff --git a/‎instructors/fig/01-practical-instructor-1-G1.R‎
Lines changed: 91 additions & 0 deletions b/‎instructors/fig/01-practical-instructor-1-G1.R‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎instructors/fig/01-practical-instructor-1-G2.R‎
Lines changed: 101 additions & 0 deletions b/‎instructors/fig/01-practical-instructor-1-G2.R‎
Lines changed: 101 additions & 0 deletions
diff --git a/‎instructors/fig/01-practical-instructor-1-G3.R‎
Lines changed: 76 additions & 0 deletions b/‎instructors/fig/01-practical-instructor-1-G3.R‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎instructors/fig/01-practical-instructor-2-G1.R‎
Lines changed: 62 additions & 0 deletions b/‎instructors/fig/01-practical-instructor-2-G1.R‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎instructors/fig/01-practical-instructor-2-G2.R‎
Lines changed: 60 additions & 0 deletions b/‎instructors/fig/01-practical-instructor-2-G2.R‎
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,91 @@
+# nolint start
+
+# Practical 1
+# Activity 1
+
+# Load packages ----------------------------------------------------------
+library(cleanepi)
+library(linelist)
+library(incidence2)
+library(tidyverse)
+
+
+# Adapt the data dictionary ----------------------------------------------
+
+# Dictionary mapping the coded 'sex_fem_2' entries to readable labels
+dat_dictionary <- tibble::tibble(
+  options = c("1", "2"),
+  values = c("male", "female"),
+  grp = "sex_fem_2",
+  orders = 1:2
+)
+dat_dictionary
+
+
+# Read raw data ----------------------------------------------------------
+# Import the raw line list straight from the tutorial website, then print it
+dat_raw <- readr::read_csv(
+  file = "https://epiverse-trace.github.io/tutorials-early/data/linelist-date_of_birth.csv"
+)
+dat_raw
+
+
+# Clean and standardize data ---------------------------------------------
+
+# How many cleanepi functions did you use to get clean data?
+# Each step below receives the output of the previous one.
+dat_clean <- dat_raw %>%
+  cleanepi::standardize_column_names() %>%
+  # Standardize the three date columns
+  cleanepi::standardize_dates(
+    target_columns = c(
+      "date_of_admission", "date_of_birth", "date_first_pcr_positive_test"
+    )
+  ) %>%
+  # Columns are listed in the order the events are expected to occur
+  cleanepi::check_date_sequence(
+    target_columns = c(
+      "date_of_birth", "date_first_pcr_positive_test", "date_of_admission"
+    )
+  ) %>%
+  # The dictionary step needs valid missing entries: recode "-99" first
+  cleanepi::replace_missing_values(
+    na_strings = "-99",
+    target_columns = "sex_fem_2"
+  ) %>%
+  cleanepi::clean_using_dictionary(dictionary = dat_dictionary) %>%
+  cleanepi::remove_constants() %>%
+  # Duplicates are identified from these two columns
+  cleanepi::remove_duplicates(
+    target_columns = c("study_id", "date_of_birth")
+  )
+
+dat_clean
+
+
+# Create time span variable ----------------------------------------------
+
+# What time span unit best describes the 'delay' from 'onset' to 'death'?
+# Here: time from 'date_of_birth' to today, in years plus remaining months
+dat_timespan <- dat_clean %>%
+  cleanepi::timespan(
+    target_column = "date_of_birth",
+    end_date = Sys.Date(),
+    span_column_name = "timespan_variable",
+    span_unit = "years",
+    span_remainder_unit = "months"
+  ) %>%
+  # Optional check: skimr::skim(timespan_variable)
+  # Bin the numeric span: intervals are closed on the left, the last one
+  # also includes its upper limit, and values outside the breaks become NA
+  dplyr::mutate(
+    timespan_category = base::cut(
+      timespan_variable,
+      breaks = c(0, 20, 35, 60, 80), right = FALSE, include.lowest = TRUE
+    )
+  )
+
+dat_timespan
+
+
+# nolint end
@@ -0,0 +1,101 @@
+# nolint start
+
+# Practical 1
+# Activity 1
+
+# Load packages ----------------------------------------------------------
+library(cleanepi)
+library(linelist)
+library(incidence2)
+library(tidyverse)
+
+
+# Adapt the data dictionary ----------------------------------------------
+
+# Dictionary mapping the coded 'sex' entries to readable labels.
+# Numeric codes and upper/lower-case initials all map to the same
+# two values.
+# ('grp' names the column the dictionary applies to.)
+dat_dictionary <- tibble::tibble(
+  options = c("1", "2", "M", "F", "m", "f"),
+  values = rep(c("male", "female"), times = 3),
+  grp = "sex",
+  orders = 1:6
+)
+
+dat_dictionary
+
+
+# Read raw data ----------------------------------------------------------
+# Import the raw line list straight from the tutorial website, then print it
+dat_raw <- readr::read_csv(
+  file = "https://epiverse-trace.github.io/tutorials-early/data/covid_simulated_data.csv"
+)
+dat_raw
+
+
+# Clean and standardize data ---------------------------------------------
+
+# How many cleanepi functions did you use to get clean data?
+# Each step below receives the output of the previous one.
+dat_clean <- dat_raw %>%
+  cleanepi::standardize_column_names() %>%
+  # Standardize the five date columns
+  cleanepi::standardize_dates(
+    target_columns = c(
+      "date_onset", "date_admission", "date_outcome",
+      "date_first_contact", "date_last_contact"
+    )
+  ) %>%
+  # Columns are listed in the order the events are expected to occur
+  cleanepi::check_date_sequence(
+    target_columns = c(
+      "date_first_contact", "date_last_contact",
+      "date_onset", "date_admission", "date_outcome"
+    )
+  ) %>%
+  # Convert 'age' to numeric
+  cleanepi::convert_to_numeric(target_columns = "age") %>%
+  # Optional check of the raw categories: dplyr::count(sex)
+  # The dictionary step needs valid missing entries: recode "-99" first
+  cleanepi::replace_missing_values(
+    na_strings = "-99",
+    target_columns = "sex"
+  ) %>%
+  cleanepi::clean_using_dictionary(dictionary = dat_dictionary) %>%
+  # Remove constant data (see ?cleanepi::remove_constants)
+  cleanepi::remove_constants() %>%
+  # Duplicates are identified from these two columns
+  cleanepi::remove_duplicates(
+    target_columns = c("case_id", "case_name")
+  )
+
+dat_clean
+
+
+# Create time span variable ----------------------------------------------
+
+# What time span unit best describes the 'delay' from 'onset' to 'death'?
+# Here: days elapsed from 'date_onset' to 'date_outcome', without remainder
+dat_timespan <- dat_clean %>%
+  cleanepi::timespan(
+    target_column = "date_onset",
+    end_date = "date_outcome",
+    span_column_name = "timespan_variable",
+    span_unit = "days",
+    span_remainder_unit = NULL
+  ) %>%
+  # Optional check: skimr::skim(timespan_variable)
+  # Bin the numeric span: intervals are closed on the left, the last one
+  # also includes its upper limit, and values outside the breaks become NA
+  dplyr::mutate(
+    timespan_category = base::cut(
+      timespan_variable,
+      breaks = c(0, 10, 15, 40), right = FALSE, include.lowest = TRUE
+    )
+  )
+
+dat_timespan
+
+
+# nolint end
@@ -0,0 +1,76 @@
+# nolint start
+
+# Practical 1
+# Activity 1
+
+# Load packages ----------------------------------------------------------
+library(cleanepi)
+library(linelist)
+library(incidence2)
+library(tidyverse)
+
+
+# Adapt the data dictionary ----------------------------------------------
+
+# Dictionary template: 'variable_name' is still a placeholder for the
+# column to recode. Replace it when you have the information.
+# Numeric codes and upper/lower-case initials all map to the same
+# two values.
+dat_dictionary <- tibble::tibble(
+  options = c("1", "2", "M", "F", "m", "f"),
+  values = rep(c("male", "female"), times = 3),
+  grp = "variable_name",
+  orders = 1:6
+)
+
+dat_dictionary
+
+
+# Read raw data ----------------------------------------------------------
+# Import the raw line list straight from the tutorial website, then print it
+dat_raw <- readr::read_csv(
+  file = "https://epiverse-trace.github.io/tutorials-early/data/delta_full-messy.csv"
+)
+dat_raw
+
+
+# Clean and standardize data ---------------------------------------------
+
+# How many cleanepi functions did you use to get clean data?
+# Each step below receives the output of the previous one.
+dat_clean <- dat_raw %>%
+  cleanepi::standardize_column_names() %>%
+  cleanepi::standardize_dates(target_columns = "date") %>%
+  cleanepi::convert_to_numeric(target_columns = "exp_num") %>%
+  # Columns are listed in the order the events are expected to occur
+  cleanepi::check_date_sequence(target_columns = c("last_exp_date", "date"))
+
+dat_clean
+
+
+# Create time span variable ----------------------------------------------
+
+# What time span unit best describes the 'delay' from 'onset' to 'death'?
+# Here: days elapsed from 'last_exp_date' to 'date', without remainder
+dat_timespan <- dat_clean %>%
+  cleanepi::timespan(
+    target_column = "last_exp_date",
+    end_date = "date",
+    span_column_name = "timespan_variable",
+    span_unit = "days",
+    span_remainder_unit = NULL
+  ) %>%
+  # Optional check: skimr::skim(timespan_variable)
+  # Bin the numeric span: intervals are closed on the left, the last one
+  # also includes its upper limit, and values outside the breaks become NA
+  dplyr::mutate(
+    timespan_category = base::cut(
+      timespan_variable,
+      breaks = c(0, 30, 100, 600), right = FALSE, include.lowest = TRUE
+    )
+  )
+
+dat_timespan
+
+
+# nolint end
@@ -0,0 +1,62 @@
+# nolint start
+
+# Practical 1
+# Activity 2
+
+# Validate linelist ------------------------------------------------------
+
+# Make the loss of a tagged variable stop with an error
+# (alternative: linelist::lost_tags_action(action = "warning"))
+linelist::lost_tags_action("error")
+
+# Inspect accepted tag types, tag names, and the data before tagging
+linelist::tags_types()
+linelist::tags_names()
+dat_timespan
+
+# Does the age variable pass the validation step?
+dat_validate <- dat_timespan %>%
+  # Step 1: tag the variables
+  linelist::make_linelist(
+    id = "study_id",
+    date_reporting = "date_first_pcr_positive_test",
+    gender = "sex_fem_2",
+    # Tagging "timespan_category" as age does not pass validation
+    age = "timespan_variable",
+    # Categorical variable kept under the 'occupation' tag
+    occupation = "timespan_category"
+  ) %>%
+  # Step 2: validate the tagged variables
+  linelist::validate_linelist() %>%
+  # Step 3: extract the tagged columns with tags_df(); to test the
+  # safeguard, try dplyr::select(case_id, date_onset, sex) instead
+  linelist::tags_df()
+
+
+# Create incidence -------------------------------------------------------
+
+# What is the most appropriate time-aggregate (days, months) to plot?
+dat_incidence <- dat_validate %>%
+  # From one row per individual to counts per time interval and group
+  incidence2::incidence(
+    date_index = "date_reporting",
+    interval = "month",
+    groups = "occupation", # or any other categorical variable
+    complete_dates = TRUE
+  )
+
+
+# Plot epicurve ----------------------------------------------------------
+
+# Do arguments like 'fill', 'show_cases', 'angle', 'n_breaks' improve the plot?
+# Try keeping or dropping each argument below.
+dat_incidence %>%
+  plot(
+    fill = "occupation",
+    show_cases = TRUE,
+    angle = 45, n_breaks = 5
+  )
+
+# Find plot() arguments at ?incidence2:::plot.incidence2
+
+# nolint end
@@ -0,0 +1,60 @@
+# nolint start
+
+# Practical 1
+# Activity 2
+
+# Validate linelist ------------------------------------------------------
+
+# Make the loss of a tagged variable stop with an error
+# (alternative: linelist::lost_tags_action(action = "warning"))
+linelist::lost_tags_action("error")
+
+# Inspect accepted tag types, tag names, and the data before tagging
+linelist::tags_types()
+linelist::tags_names()
+dat_timespan
+
+# Does the age variable pass the validation step?
+dat_validate <- dat_timespan %>%
+  # Step 1: tag the variables
+  linelist::make_linelist(
+    id = "case_id",
+    date_onset = "date_onset",
+    gender = "sex",
+    age = "age",
+    outcome = "outcome",
+    # Categorical variable kept under the 'occupation' tag
+    occupation = "timespan_category"
+  ) %>%
+  # Step 2: validate the tagged variables
+  linelist::validate_linelist() %>%
+  # Step 3: extract the tagged columns with tags_df(); to test the
+  # safeguard, try dplyr::select(case_id, date_onset, sex) instead
+  linelist::tags_df()
+
+
+# Create incidence -------------------------------------------------------
+
+# What is the most appropriate time-aggregate (days, months) to plot?
+dat_incidence <- dat_validate %>%
+  # From one row per individual to counts per time interval and group
+  incidence2::incidence(
+    date_index = "date_onset",
+    interval = "day",
+    groups = "outcome", # or any other categorical variable
+    complete_dates = TRUE
+  )
+
+
+# Plot epicurve ----------------------------------------------------------
+
+# Do arguments like 'fill', 'show_cases', 'angle', 'n_breaks' improve the plot?
+# Try keeping or dropping each argument below.
+dat_incidence %>%
+  plot(
+    angle = 45, n_breaks = 5
+  )
+
+# Find plot() arguments at ?incidence2:::plot.incidence2
+
+# nolint end