## ----child = "../setup.Rmd"-------------------------------------------

## ----setup, include=FALSE---------------------------------------------
# Shared slide setup: printing options, knitr chunk defaults, plot theme,
# package conflicts and a custom output hook that can truncate printed
# output.

# R options
options(
  htmltools.dir.version = FALSE,
  dplyr.print_min = 6,
  dplyr.print_max = 6,
  tibble.width = 65,
  width = 65
)

# figure height, width, dpi
knitr::opts_chunk$set(
  echo = TRUE,
  fig.width = 8,
  fig.asp = 0.618,
  out.width = "60%",
  fig.align = "center",
  dpi = 300,
  message = FALSE
)

# ggplot2
ggplot2::theme_set(ggplot2::theme_gray(base_size = 16))

# set seed
set.seed(1234)

# fontawesome
htmltools::tagList(rmarkdown::html_dependency_font_awesome())

# magick
# Masks dev.off() with a wrapper that hides the value returned by
# grDevices::dev.off().
dev.off <- function() {
  invisible(grDevices::dev.off())
}

# conflicted
library(conflicted)
conflict_prefer("filter", "dplyr")

# xaringanExtra
library(xaringanExtra)
xaringanExtra::use_panelset()

# output number of lines
# Wraps the default knitr output hook. When a chunk sets `output.lines`:
#   * a single number n keeps the first n lines and appends "..." if the
#     output was longer;
#   * a vector keeps exactly those line indices, with "..." before and after.
# Chunks without `output.lines` are passed to the default hook untouched.
hook_output <- knitr::knit_hooks$get("output")
knitr::knit_hooks$set(output = function(x, options) {
  lines <- options$output.lines
  if (is.null(lines)) {
    return(hook_output(x, options)) # pass to default hook
  }
  x <- unlist(strsplit(x, "\n"))
  more <- "..."
  if (length(lines) == 1) { # first n lines
    if (length(x) > lines) {
      # truncate the output, but add ....
      x <- c(head(x, lines), more)
    }
  } else {
    x <- c(more, x[lines], more)
  }
  # paste these lines together
  x <- paste(c(x, ""), collapse = "\n")
  hook_output(x, options)
})

## ----packages, echo=FALSE, message=FALSE, warning=FALSE---------------
library(tidyverse)
library(readxl)
library(skimr)
library(knitr)
library(DT)
library(here)

## ---------------------------------------------------------------------
getwd()

## ---------------------------------------------------------------------
# setwd() returns the previous working directory. Keep it and switch back
# after the demonstration so that relative paths such as "data/..." still
# resolve when this file is run as a plain script.
old_wd <- setwd("~")
getwd()
setwd(old_wd)

## ----eval = FALSE-----------------------------------------------------
## file_path <- "C:\\Users\\39349\\Documents\\teaching_2023\\ADPI\\slides\\datamanaging\\data"
## daily_show <- read.csv(file_path, ...)
## ----eval = FALSE-----------------------------------------------------
## daily_show <- read.csv("data/daily_show_guests.csv", ...)

## ----include = FALSE--------------------------------------------------
# Read the guests file, skipping the first 4 lines of the file.
daily_show <- read.csv("data/daily_show_guests.csv", skip = 4)

## ----echo = FALSE-----------------------------------------------------
# Reference table of directory path shorthands shown on the slide.
dirpath_shortcuts <- data.frame(
  abbr = c("`~`", "`.`", "`..`", "`../..`"),
  meaning = c(
    "Home directory",
    "Current working directory",
    "One directory up from current working directory",
    "Two directories up from current working directory"
  )
)
knitr::kable(dirpath_shortcuts, col.names = c("Shorthand", "Meaning"))

## ----eval=F-----------------------------------------------------------
## list.files()

## ---------------------------------------------------------------------
paste("Sunday", "Monday", "Tuesday")

## ---------------------------------------------------------------------
# Deliberately spelled with sep = "" to contrast with paste0() below.
paste("Sunday", "Monday", "Tuesday", sep = "")

## ----eval=F-----------------------------------------------------------
## paste0("Sunday", "Monday", "Tuesday")

## ----echo = FALSE-----------------------------------------------------
# Reference table: separator and decimal mark used by each read.table()
# variant.
read.table.funcs <- data.frame(
  func = c(
    "`read.table`",
    "`read.csv`",
    "`read.csv2`",
    "`read.delim`",
    "`read.delim2`"
  ),
  sep = c("white space", "comma", "semi-colon", "tab", "tab"),
  dec = c("period", "period", "comma", "period", "comma")
)
knitr::kable(
  read.table.funcs,
  col.names = c("Function", "Separator", "Decimal point")
)

## ----echo = FALSE-----------------------------------------------------
# Reference table: commonly used read.table() options.
read.table_opts <- data.frame(
  Option = c(
    "`sep`",
    "`skip`",
    "`header`",
    "`as.is`",
    "`nrows`",
    "`na.strings`"
  ),
  Description = c(
    "What is the delimiter in the data?",
    "How many lines of the start of the file should you skip?",
    "Does the first line you read give column names?",
    "Should you bring in strings as characters, not factors?",
    "How many rows do you want to read in?",
    "How are missing values coded?"
  )
)
knitr::kable(read.table_opts)
## The `readr` package is a member of the **tidyverse**
## (https://www.tidyverse.org/) collection of packages.

## ----tidyverse, echo = F, out.width = "55%"---------------------------
knitr::include_graphics("img/tidyverse.png")

## ----echo=FALSE, out.width="80%"--------------------------------------
knitr::include_graphics("img/readr.png")

## ----echo=FALSE, out.width="80%"--------------------------------------
knitr::include_graphics("img/readxl.png")

## ---------------------------------------------------------------------
nobel <- read_csv(file = "data/nobel.csv")
nobel

## ----cache=TRUE-------------------------------------------------------
# Write a small base data frame to CSV.
df <- data.frame(
  x = 1:3,
  y = letters[1:3]
)
write_csv(df, file = "data/df.csv")

## ----cache=TRUE-------------------------------------------------------
# Same data built row-wise with tribble(), written to a second file.
df <- tribble(
  ~x, ~y,
  1, "a",
  2, "b",
  3, "c"
)
write_csv(df, file = "data/df_2.csv")

## ---------------------------------------------------------------------
read_csv("data/df.csv")
read_csv("data/df_2.csv")

## ----message=FALSE----------------------------------------------------
edibnb_badnames <- read_csv("data/edibnb-badnames.csv")
names(edibnb_badnames)

## ---------------------------------------------------------------------
# Option 1: supply the column names ourselves. When `col_names` is a
# character vector, read_csv() treats the first line of the file as data,
# so the file's own header line is skipped explicitly with `skip = 1`.
edibnb_col_names <- read_csv(
  "data/edibnb-badnames.csv",
  col_names = c(
    "id", "price", "neighbourhood", "accommodates", "bathroom",
    "bedroom", "bed", "review_scores_rating", "n_reviews", "url"
  ),
  skip = 1
)
names(edibnb_col_names)

## ----warning=FALSE----------------------------------------------------
# Option 2: keep the file's header and normalise it with janitor.
edibnb_clean_names <- read_csv("data/edibnb-badnames.csv") %>%
  janitor::clean_names()
names(edibnb_clean_names)

## ----echo=FALSE, out.width="100%"-------------------------------------
knitr::include_graphics("img/df-na.png")

## ----eval=F-----------------------------------------------------------
## read_csv("data/df-na.csv")

## ----echo=FALSE-------------------------------------------------------
read_csv("data/df-na.csv") %>%
  print(n = 10)
## ----eval=FALSE-------------------------------------------------------
## read_csv("data/df-na.csv",
##          na = c("", "NA", ".", "9999", "Not applicable"))

## ----echo=FALSE, out.width="100%"-------------------------------------
knitr::include_graphics("img/df-na.png")

## ----echo=FALSE,message=F---------------------------------------------
# Treat the listed strings as missing values while reading.
read_csv(
  "data/df-na.csv",
  na = c("", "NA", ".", "9999", "Not applicable")
) %>%
  print(n = 10)

## ----eval=FALSE-------------------------------------------------------
## read_csv("data/df-na.csv", col_types = list(col_double(),
##                                             col_character(),
##                                             col_character()))

## ----echo=FALSE-------------------------------------------------------
# Force the column types instead of letting readr guess them.
read_csv(
  "data/df-na.csv",
  col_types = list(col_double(), col_character(), col_character())
) %>%
  print(n = 10)

## ----warning=FALSE----------------------------------------------------
# print() returns its input invisibly, so `dat` still holds the tibble and
# problems() can list the values that failed to parse.
dat <- read_csv(
  "data/df-na.csv",
  col_types = list(col_double(), col_character(), col_character())
) %>%
  print(n = 10)
problems(dat)

## ---------------------------------------------------------------------
# Same read, but with col_number() for the first column.
dat <- read_csv(
  "data/df-na.csv",
  col_types = list(col_number(), col_character(), col_character())
) %>%
  print(n = 10)

## ----message=TRUE, output.lines=7, eval=T-----------------------------
read_csv("data/df-na.csv")

## ----message=TRUE, eval=F---------------------------------------------
## read_csv("data/df-na.csv", show_col_types = FALSE)

## ----echo=FALSE-------------------------------------------------------
knitr::include_graphics("img/fav-food/fav-food.png")

## ---------------------------------------------------------------------
# Trailing `#<<` comments are kept at the end of the lines they follow in
# the source.
fav_food <- read_excel("data/favourite-food.xlsx") #<<
fav_food

## ----echo=FALSE-------------------------------------------------------
knitr::include_graphics("img/fav-food/fav-food-names.png")

## ----warning=FALSE----------------------------------------------------
fav_food <- read_excel("data/favourite-food.xlsx") %>%
  janitor::clean_names() #<<
fav_food
## ----echo=FALSE-------------------------------------------------------
knitr::include_graphics("img/fav-food/fav-food-nas.png")

## ----warning=FALSE----------------------------------------------------
# Read the spreadsheet, treating "N/A" and "99999" as missing values.
# Each trailing `#<<` comment must end its line; otherwise it comments out
# the rest of the pipeline.
fav_food <- read_excel("data/favourite-food.xlsx", na = c("N/A", "99999")) %>% #<<
  janitor::clean_names()
fav_food

## ----warning=FALSE----------------------------------------------------
# Replace the spelled-out age "five" with "5", then convert age to numeric.
fav_food <- fav_food %>%
  mutate( #<<
    age = if_else(age == "five", "5", age), #<<
    age = as.numeric(age) #<<
  ) #<<
glimpse(fav_food)

## ----echo=FALSE-------------------------------------------------------
knitr::include_graphics("img/fav-food/fav-food-age.png")

## ---------------------------------------------------------------------
fav_food %>%
  count(ses)

## ----echo=FALSE-------------------------------------------------------
knitr::include_graphics("img/fav-food/fav-food-ses.png")

## ----warning=FALSE----------------------------------------------------
# Put the ses levels in the order Low, Middle, High.
fav_food <- fav_food %>%
  mutate(ses = fct_relevel(ses, "Low", "Middle", "High")) #<<
fav_food %>%
  count(ses)

## ---------------------------------------------------------------------
# Base R equivalent; `levels` is named rather than passed by position.
fav_food <- fav_food %>%
  mutate(ses = factor(ses, levels = c("Low", "Middle", "High"))) #<<
fav_food %>%
  count(ses)

## ----warning=FALSE----------------------------------------------------
# The full cleaning pipeline in one go.
fav_food <- read_excel("data/favourite-food.xlsx", na = c("N/A", "99999")) %>%
  janitor::clean_names() %>%
  mutate(
    age = if_else(age == "five", "5", age),
    age = as.numeric(age),
    ses = fct_relevel(ses, "Low", "Middle", "High")
  )
fav_food

## ---------------------------------------------------------------------
# Round trip through CSV, then inspect `ses` in the re-read data.
write_csv(fav_food, file = "data/fav-food-clean.csv")
fav_food_clean <- read_csv("data/fav-food-clean.csv")

## ---------------------------------------------------------------------
fav_food_clean %>%
  count(ses)

## ----eval=FALSE-------------------------------------------------------
## read_rds(path)
## write_rds(x, path)
## ---------------------------------------------------------------------
# Round trip through RDS, then inspect `ses` in the re-read data.
write_rds(fav_food, file = "data/fav-food-clean.rds")
fav_food_clean <- read_rds("data/fav-food-clean.rds")
fav_food_clean %>%
  count(ses)

## ----echo=FALSE-------------------------------------------------------
# Skip the first 3 rows of the sheet and name the two columns ourselves.
sales <- read_excel(
  "data/sales.xlsx",
  skip = 3,
  col_names = c("id", "n")
)
sales

## ----echo=FALSE-------------------------------------------------------
# Rows whose id contains "Brand" are treated as brand header rows: copy
# the brand name into its own column, fill it downwards, drop the header
# rows, then convert id and n to numeric.
sales %>%
  mutate(
    is_brand_name = str_detect(id, "Brand"),
    brand = if_else(is_brand_name, id, NA_character_)
  ) %>%
  fill(brand) %>%
  filter(!is_brand_name) %>%
  select(brand, id, n) %>%
  mutate(
    id = as.numeric(id),
    n = as.numeric(n)
  )