## ----------------------------------------------------
# Daily Show guests: import, rename, select, arrange and filter.
# NOTE: the statements below were previously fused onto one physical line that
# began with `##`, so R parsed all of it as a comment and executed nothing.
library(tidyverse)

dir_path <- "../../slides/datamanaging/data/"
list.files(dir_path)
# dir_path already ends in "/", so paste0() yields a clean path.
file_path <- paste0(dir_path, "daily_show_guests.csv")

# Peek at the first 10 raw lines before parsing.
read_lines(file_path, n_max = 10)

# Base R reader; skip = 4 skips the first four lines of the file
# (presumably a non-tabular preamble -- confirm against the raw lines above).
daily_show <- read.csv(file_path, skip = 4)
str(daily_show)
head(daily_show)

# readr reader; this overwrites the data frame read above.
daily_show <- read_csv(file_path, skip = 4)
spec(daily_show)
problems(daily_show)
str(daily_show)
glimpse(daily_show)
head(daily_show)

# rename() takes new_name = old_name pairs.
daily_show <- rename(
  daily_show,
  year = YEAR,
  job = GoogleKnowlege_Occupation,
  date = Show,
  category = Group,
  guest_name = Raw_Guest_List
)

select(daily_show, job, date, category, guest_name)  # or select(daily_show, -year)
select(daily_show, last_col())  # or daily_show[, ncol(daily_show)]

arrange(daily_show, category)
# NOTE(review): if `date` was parsed as character, this sort is lexical rather
# than chronological -- confirm the column type in spec(daily_show).
arrange(daily_show, desc(date))
# arrange(daily_show, desc(year), desc(date))

# Sorted list of the distinct categories.
daily_show %>%
  select(category) %>%
  arrange(category) %>%
  unique()
# or daily_show %>% pull(category) %>% sort %>% unique
# or daily_show %>% distinct(category) %>% arrange(category)

scientists <- filter(daily_show, category == "Science")
head(scientists)
scientists %>%
  count(job, sort = TRUE)
# Both capitalizations of "surgeon" are listed because `job` is matched
# case-sensitively.
scientists <- scientists %>%
  filter(job %in% c("neurosurgeon", "surgeon", "Surgeon"))
head(scientists)

## ------------------------------------------------------
# Nobel prizes: import, split by category, write a subset, tabulate gender.
file_path <- paste0(dir_path, "nobel.csv")
read_lines(file_path, n_max = 10)
nobel <- read_csv(file_path)
glimpse(nobel)
head(nobel)

nobel %>%
  distinct(category) %>%
  arrange(category)

# Split the rows into the four listed categories and everything else.
nobel_stem <- nobel %>%
  filter(category %in% c("Physics", "Medicine", "Chemistry", "Economics"))
nobel_nonstem <- nobel %>%
  filter(!(category %in% c("Physics", "Medicine", "Chemistry", "Economics")))
# Sanity check: the two subsets together account for every row of `nobel`.
nrow(nobel_stem) + nrow(nobel_nonstem) == nrow(nobel)

# Writes the subset next to the input files in dir_path.
write_csv(nobel_stem, file = paste0(dir_path, "nobel-stem.csv"))

# Global option: print numbers with 3 significant digits from here on.
options(digits = 3)

# Count and percentage of rows per gender value.
nobel %>%
  count(gender) %>%
  mutate(prop = n / sum(n) * 100)
# alternatively
nobel %>%
  group_by(gender) %>%
  summarize(n = n())

# What is "org"?
# NOTE: the statements below were previously fused onto one physical line with
# no separators, which does not parse; they are restored one per line here.
# Rows of `nobel` whose gender is recorded as "org".
nobel %>%
  filter(gender == "org")

# Keep only the rows whose gender is not "org".
nobel_mf <- nobel %>%
  filter(gender != "org")

# What is the distribution of "share" between males and females?
nobel_mf %>%
  count(gender, share)
# count() on a grouped tibble keeps the gender grouping, so `prop` is the
# percentage within each gender.
nobel_mf %>%
  group_by(gender) %>%
  count(share) %>%
  mutate(prop = n / sum(n) * 100)

# What is the distribution of "category" between males and females?
# Rows are sorted by descending n, so cum_sum accumulates from the most
# frequent category within each gender.
nobel_mf %>%
  group_by(gender) %>%
  count(category, sort = TRUE) %>%
  mutate(prop = n / sum(n) * 100, cum_sum = cumsum(prop))

# Same comparison, collapsed to "stem" (the four listed categories) versus
# "nonstem" (everything else).
nobel_mf %>%
  mutate(
    stem = ifelse(
      category %in% c("Physics", "Medicine", "Chemistry", "Economics"),
      "stem",
      "nonstem"
    )
  ) %>%
  group_by(gender) %>%
  count(stem) %>%
  mutate(prop = n / sum(n) * 100)

# Do female nobel winners increase over time?
# Number of rows with gender "female" per year, sorted by ascending count.
nobel %>%
  filter(gender == "female") %>%
  count(year) %>%
  arrange(n)
# The same per-year counts, in year order.
nobel %>%
  filter(gender == "female") %>%
  group_by(year) %>%
  summarize(n = n())

# ... by age ...