## ----child = "../setup.Rmd"-------------------------------------------

## ----setup, include=FALSE---------------------------------------------
# Session options: no version suffix in htmltools dependency dirs,
# six-row tibble previews, and a 65-character print width.
options(htmltools.dir.version = FALSE)
options(dplyr.print_min = 6, dplyr.print_max = 6)
options(tibble.width = 65, width = 65)

# Chunk defaults: echo code, hide messages, and fix figure geometry
# (width 8, aspect ratio 0.618, 300 dpi, centred at 60% width).
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  fig.width = 8,
  fig.asp = 0.618,
  dpi = 300,
  fig.align = "center",
  out.width = "60%"
)

# Default ggplot2 theme: theme_gray with a base font size of 16.
ggplot2::theme_set(new = ggplot2::theme_gray(base_size = 16))

# Fixed seed so random output is the same on every run.
set.seed(seed = 1234)

# Attach the Font Awesome HTML dependency.
htmltools::tagList(rmarkdown::html_dependency_font_awesome())
# magick
# Shadow dev.off(): close the current graphics device through
# grDevices and hand the result back invisibly.
dev.off <- function() {
  closed <- grDevices::dev.off()
  invisible(closed)
}
# conflicted: when `filter` is ambiguous, prefer the dplyr version
library(conflicted)
conflicted::conflict_prefer(name = "filter", winner = "dplyr")

# xaringanExtra: switch on panelsets
library(xaringanExtra)
xaringanExtra::use_panelset()
# output number of lines
# Wrap knitr's current "output" hook so a chunk can set `output.lines`
# to limit how much printed output is shown:
#   * a single number n   -> keep the first n lines, then "..."
#   * a vector of indices -> keep those lines, with "..." before and after
# Chunks without `output.lines` go straight to the original hook.
hook_output <- knitr::knit_hooks$get("output")

# Pick the requested lines out of `x` (character vector, one element per
# output line) and mark the omitted parts with `more`.
truncate_output_lines <- function(x, lines, more = "...") {
  if (length(lines) == 1) {
    # first n lines; the marker is only added when something was cut
    if (length(x) > lines) {
      x <- c(head(x, lines), more)
    }
    return(x)
  }
  picked <- x[lines]
  # Indices past the end of the output yield NA; drop them so the literal
  # text "NA" is never pasted into the rendered output.
  picked <- picked[!is.na(picked)]
  c(more, picked, more)
}

knitr::knit_hooks$set(output = function(x, options) {
  lines <- options$output.lines
  if (is.null(lines)) {
    return(hook_output(x, options))  # pass to default hook
  }
  x <- truncate_output_lines(unlist(strsplit(x, "\n")), lines)
  # paste these lines together (trailing "" gives a final newline)
  x <- paste(c(x, ""), collapse = "\n")
  hook_output(x, options)
})


## ----packages, echo=FALSE, message=FALSE, warning=FALSE---------------
library(tidyverse)
library(readxl)
library(skimr)
library(knitr)
library(DT)
library(here)


## ---------------------------------------------------------------------
# Show the current working directory
getwd()


## ---------------------------------------------------------------------
# Move to the home directory for the demonstration. setwd() returns the
# previous working directory, which is kept and restored afterwards so
# the relative "data/..." and "img/..." paths used below still resolve.
old_wd <- setwd("~")
getwd()
setwd(old_wd)


## ----eval = FALSE-----------------------------------------------------
## file_path <- "C:\\Users\\39349\\Documents\\teaching_2023\\ADPI\\slides\\datamanaging\\data"
## daily_show <- read.csv(file_path, ...)


## ----eval = FALSE-----------------------------------------------------
## daily_show <- read.csv("data/daily_show_guests.csv", ...)

## ----include = FALSE--------------------------------------------------
# Load the Daily Show guests file, skipping its first four lines
daily_show <- read.csv(file = "data/daily_show_guests.csv", skip = 4)


## ----echo = FALSE-----------------------------------------------------
# Table of directory-path shorthands and what each one refers to,
# rendered with the headers "Shorthand" and "Meaning".
dirpath_shortcuts <- data.frame(
  abbr = c(
    "`~`",
    "`.`",
    "`..`",
    "`../..`"
  ),
  meaning = c(
    "Home directory",
    "Current working directory",
    "One directory up from current working directory",
    "Two directories up from current working directory"
  )
)
knitr::kable(
  x = dirpath_shortcuts,
  col.names = c("Shorthand", "Meaning")
)


## ----eval=F-----------------------------------------------------------
## list.files()


## ---------------------------------------------------------------------
# paste() with its default separator
paste("Sunday", "Monday", "Tuesday")


## ---------------------------------------------------------------------
# same strings joined with an empty separator
paste("Sunday", "Monday", "Tuesday", sep = "")


## ----eval=F-----------------------------------------------------------
## paste0("Sunday", "Monday", "Tuesday")


## ----echo = FALSE-----------------------------------------------------
# One row per read.table-family function, with the field separator and
# decimal mark listed for it; rendered as a three-column table.
read.table.funcs <- data.frame(
  func = c(
    "`read.table`",
    "`read.csv`",
    "`read.csv2`",
    "`read.delim`",
    "`read.delim2`"
  ),
  sep = c(
    "white space",
    "comma",
    "semi-colon",
    "tab",
    "tab"
  ),
  dec = c(
    "period",
    "period",
    "comma",
    "period",
    "comma"
  )
)
knitr::kable(
  x = read.table.funcs,
  col.names = c("Function", "Separator", "Decimal point")
)


## ----echo = FALSE-----------------------------------------------------
# Reference table pairing each listed read.table option with the
# question it answers; rendered with its own column names as headers.
read.table_opts <- data.frame(
  Option = c(
    "`sep`",
    "`skip`",
    "`header`",
    "`as.is`",
    "`nrows`",
    "`na.strings`"
  ),
  Description = c(
    "What is the delimiter in the data?",
    "How many lines of the start of the file should you skip?",
    "Does the first line you read give column names?",
    "Should you bring in strings as characters, not factors?",
    "How many rows do you want to read in?",
    "How are missing values coded?"
  )
)
knitr::kable(x = read.table_opts)


## The `readr` package is part of the **tidyverse** (https://www.tidyverse.org/) collection of packages.

## ----tidyverse, echo = F, out.width = "55%"---------------------------
# tidyverse image
knitr::include_graphics(path = "img/tidyverse.png")


## ----echo=FALSE, out.width="80%"--------------------------------------
# readr image
knitr::include_graphics(path = "img/readr.png")


## ----echo=FALSE, out.width="80%"--------------------------------------
# readxl image
knitr::include_graphics(path = "img/readxl.png")


## ---------------------------------------------------------------------
# read data/nobel.csv into `nobel` and print the result
nobel <- read_csv(file = "data/nobel.csv")
nobel


## ----cache=TRUE-------------------------------------------------------
# small example data frame, written out as data/df.csv
df <- data.frame(
  x = 1:3, 
  y = letters[1:3]
)

write_csv(df, file = "data/df.csv")


## ----cache=TRUE-------------------------------------------------------
# same values entered row by row with tribble(); `df` is overwritten
# and written out as data/df_2.csv
df <- tribble(
  ~x, ~y,
  1,  "a",
  2,  "b",
  3,  "c"
)

write_csv(df, file = "data/df_2.csv")


## ---------------------------------------------------------------------
# read both files back in
read_csv("data/df.csv")
read_csv("data/df_2.csv")


## ----message=FALSE----------------------------------------------------
# read the file with default settings and list its column names
edibnb_badnames <- read_csv("data/edibnb-badnames.csv")
names(edibnb_badnames)


## ---------------------------------------------------------------------
# Option 1: supply replacement column names while reading.
# With a character `col_names`, read_csv() reads the first line of the
# file as data, so the file's own header line is skipped here; without
# `skip = 1` the old names would become the first data row.
edibnb_col_names <- read_csv("data/edibnb-badnames.csv",
                             col_names = c("id", "price",
                                           "neighbourhood", "accommodates",
                                           "bathroom", "bedroom",
                                           "bed", "review_scores_rating",
                                           "n_reviews", "url"),
                             skip = 1)

names(edibnb_col_names)


## ----warning=FALSE----------------------------------------------------
# Option 2: read with default settings, then pass the result through
# janitor::clean_names() and list the resulting column names
edibnb_clean_names <- read_csv("data/edibnb-badnames.csv") %>%
  janitor::clean_names()

names(edibnb_clean_names)


## ----echo=FALSE, out.width="100%"-------------------------------------
# df-na image
knitr::include_graphics(path = "img/df-na.png")


## ----eval=F-----------------------------------------------------------
## read_csv("data/df-na.csv")


## ----echo=FALSE-------------------------------------------------------
# default read, printing up to 10 rows
print(read_csv("data/df-na.csv"), n = 10)


## ----eval=FALSE-------------------------------------------------------
## read_csv("data/df-na.csv",
##          na = c("", "NA", ".", "9999", "Not applicable"))


## ----echo=FALSE, out.width="100%"-------------------------------------
# df-na image again
knitr::include_graphics(path = "img/df-na.png")


## ----echo=FALSE,message=F---------------------------------------------
# same read with the listed strings treated as missing values
print(
  read_csv(
    "data/df-na.csv",
    na = c("", "NA", ".", "9999", "Not applicable")
  ),
  n = 10
)


## ----eval=FALSE-------------------------------------------------------
## read_csv("data/df-na.csv", col_types = list(col_double(),
##                                             col_character(),
##                                             col_character()))


## ----echo=FALSE-------------------------------------------------------
# read with explicit column types (double, character, character) and
# print up to 10 rows
print(
  read_csv(
    "data/df-na.csv",
    col_types = list(
      col_double(),
      col_character(),
      col_character()
    )
  ),
  n = 10
)


## ----warning=FALSE----------------------------------------------------
# first column forced to col_double(); the result is printed, stored in
# `dat`, and then passed to problems()
dat <- read_csv("data/df-na.csv", col_types = list(col_double(), col_character(), col_character())) %>% print(n = 10)
problems(dat)


## ---------------------------------------------------------------------
# same read with col_number() for the first column
dat <- read_csv("data/df-na.csv", col_types = 
              list(col_number(), col_character(), col_character())) %>% print(n = 10)


## ----message=TRUE, output.lines=7, eval=T-----------------------------
# default read; this chunk sets output.lines=7 in its header
read_csv("data/df-na.csv")


## ----message=TRUE, eval=F---------------------------------------------
## read_csv("data/df-na.csv", show_col_types = F)


## ----echo=FALSE-------------------------------------------------------
# favourite-food image
knitr::include_graphics("img/fav-food/fav-food.png")


## ---------------------------------------------------------------------
# step 1: read the spreadsheet with default settings and print it
# (the #<< markers are kept exactly as they are)
fav_food <- read_excel("data/favourite-food.xlsx") #<<

fav_food


## ----echo=FALSE-------------------------------------------------------
knitr::include_graphics("img/fav-food/fav-food-names.png")


## ----warning=FALSE----------------------------------------------------
# step 2: same read, passed through janitor::clean_names()
fav_food <- read_excel("data/favourite-food.xlsx") %>%
  janitor::clean_names() #<<

fav_food 


## ----echo=FALSE-------------------------------------------------------
knitr::include_graphics("img/fav-food/fav-food-nas.png")


## ----warning=FALSE----------------------------------------------------
# step 3: additionally treat "N/A" and "99999" as missing values
fav_food <- read_excel("data/favourite-food.xlsx",
                       na = c("N/A", "99999")) %>% #<<
  janitor::clean_names()

fav_food 


## ----warning=FALSE----------------------------------------------------
# step 4: replace the text value "five" with "5", then convert age to
# numeric
fav_food <- fav_food %>%
  mutate( #<<
    age = if_else(age == "five", "5", age), #<<
    age = as.numeric(age) #<<
    ) #<<

glimpse(fav_food) 


## ----echo=FALSE-------------------------------------------------------
knitr::include_graphics("img/fav-food/fav-food-age.png")


## ---------------------------------------------------------------------
# count rows per ses value
fav_food %>%
  count(ses)


## ----echo=FALSE-------------------------------------------------------
knitr::include_graphics("img/fav-food/fav-food-ses.png")


## ----warning=FALSE----------------------------------------------------
# step 5: order the ses levels Low, Middle, High with fct_relevel(),
# then count again
fav_food <- fav_food %>%
  mutate(ses = fct_relevel(ses, "Low", "Middle", "High")) #<<

fav_food %>%
  count(ses)


## ---------------------------------------------------------------------
# the same level order set with factor() and an explicit levels vector
fav_food <- fav_food %>%
  mutate(ses = factor(ses, c("Low", "Middle", "High"))) #<<

fav_food %>%
  count(ses)


## ----warning=FALSE----------------------------------------------------
# all cleaning steps in one pipeline: read with NA codes, clean the
# names, fix and convert age, and order the ses levels
fav_food <- read_excel("data/favourite-food.xlsx", na = c("N/A", "99999")) %>%
  janitor::clean_names() %>%
  mutate(
    age = if_else(age == "five", "5", age), 
    age = as.numeric(age),
    ses = fct_relevel(ses, "Low", "Middle", "High")
  )

fav_food


## ---------------------------------------------------------------------
# write the cleaned data to CSV and read it back
write_csv(fav_food, file = "data/fav-food-clean.csv")

fav_food_clean <- read_csv("data/fav-food-clean.csv")


## ---------------------------------------------------------------------
# count ses in the data read back from CSV
fav_food_clean %>%
  count(ses)


## ----eval=FALSE-------------------------------------------------------
## read_rds(path)
## write_rds(x, path)


## ---------------------------------------------------------------------
# same round trip through an RDS file, followed by the same count
write_rds(fav_food, file = "data/fav-food-clean.rds")

fav_food_clean <- read_rds("data/fav-food-clean.rds")

fav_food_clean %>%
  count(ses)


## ----echo=FALSE-------------------------------------------------------
# Read the sales sheet, skipping three lines and naming the two
# columns "id" and "n"; then print it.
sales <- read_excel(
  path = "data/sales.xlsx",
  skip = 3,
  col_names = c("id", "n")
)
sales


## ----echo=FALSE-------------------------------------------------------
# Rows whose id contains "Brand" are flagged as brand rows. Their id is
# copied into `brand`, fill() fills the gaps, the flagged rows are
# dropped, and id and n are converted to numeric.
sales %>%
  mutate(is_brand_name = str_detect(id, "Brand")) %>%
  mutate(brand = if_else(is_brand_name, id, NA_character_)) %>%
  fill(brand) %>%
  filter(!is_brand_name) %>%
  select(brand, id, n) %>%
  mutate(across(c(id, n), as.numeric))