diff --git a/README.md b/README.md
index 391ec8f6..56936ed5 100644
--- a/README.md
+++ b/README.md
@@ -2,4 +2,7 @@
 
 This repository contains _all_ programming exercises for the [_Programming Skills for Data Science_](https://programming-for-data-science.github.io/) book.
 
-Solutions can be found in the `solution` branch.
\ No newline at end of file
+Solutions can be found in the `solution` branch.
+
+### "Cidade de Deus" is a great film
+
diff --git a/chapter-05-exercises/exercise-1/.RData b/chapter-05-exercises/exercise-1/.RData
new file mode 100644
index 00000000..eae5b658
Binary files /dev/null and b/chapter-05-exercises/exercise-1/.RData differ
diff --git a/chapter-05-exercises/exercise-1/.Rhistory b/chapter-05-exercises/exercise-1/.Rhistory
new file mode 100644
index 00000000..d2857143
--- /dev/null
+++ b/chapter-05-exercises/exercise-1/.Rhistory
@@ -0,0 +1,2 @@
+too_expensive = FALES
+too_expensive = FALSE
diff --git a/chapter-05-exercises/exercise-1/exercise.R b/chapter-05-exercises/exercise-1/exercise.R
index 71eb3f2a..f88a199c 100644
--- a/chapter-05-exercises/exercise-1/exercise.R
+++ b/chapter-05-exercises/exercise-1/exercise.R
@@ -1,26 +1,26 @@
 # Exercise 1: practice with basic R syntax
 
 # Create a variable `hometown` that stores the city in which you were born
-
+hometown <- "Toledo"
 
 # Assign your name to the variable `my_name`
-
+my_name <- "Alejandro"
 
 # Assign your height (in inches) to a variable `my_height`
-
+my_height <- 1.74 / 0.0254 # 1.74 m expressed in inches, as the task asks
 
 # Create a variable `puppies` equal to the number of puppies you'd like to have
-
+puppies <- 0
 
 # Create a variable `puppy_price`, which is how much you think a puppy costs
-
+puppy_price <- 20
 
 # Create a variable `total_cost` that has the total cost of all of your puppies
-
+total_cost <- puppies * puppy_price # derived, so it follows the two inputs
 
 # Create a boolean variable `too_expensive`, set to TRUE if the cost is greater
 # than $1,000
-
+too_expensive <- total_cost > 1000 # the comparison already yields TRUE/FALSE
 
 # Create a variable `max_puppies`, which is the number of puppies
you can # afford for $1,000 diff --git a/chapter-06-exercises/exercise-1/.Rhistory b/chapter-06-exercises/exercise-1/.Rhistory new file mode 100644 index 00000000..e69de29b diff --git a/chapter-08-exercises/exercise-1/.RData b/chapter-08-exercises/exercise-1/.RData new file mode 100644 index 00000000..4fd3e210 Binary files /dev/null and b/chapter-08-exercises/exercise-1/.RData differ diff --git a/chapter-08-exercises/exercise-1/.Rhistory b/chapter-08-exercises/exercise-1/.Rhistory new file mode 100644 index 00000000..974c4346 --- /dev/null +++ b/chapter-08-exercises/exercise-1/.Rhistory @@ -0,0 +1,80 @@ +meals +# Create a vector `my_breakfast` of everything you ate for breakfast +my_breakfast <- c("leche", "galletas") +# Create a vector `my_lunch` of everything you ate (or will eat) for lunch +my_lunch <- c("tomate", "lechuga", "coliflor") +# Create a list `meals` that has contains your breakfast and lunch +meals <- my_lunch + my_breakfast +# Create a list `meals` that has contains your breakfast and lunch +meals <- my_lunch, my_breakfast +# Create a list `meals` that has contains your breakfast and lunch +meals <- c(my_lunch, my_breakfast) +meals +# Add a "dinner" element to your `meals` list that has what you plan to eat +# for dinner +dinner <- c("hamburguesa", "patatas fritas") +meals <- c(meals, dinner) +meals +# Use dollar notation to extract your `dinner` element from your list +# and save it in a vector called 'dinner' +dinner2 <- meals[2] +dinner2 +# Use dollar notation to extract your `dinner` element from your list +# and save it in a vector called 'dinner' +dinner2 <- meals[-1] +dinner2 +# Use dollar notation to extract your `dinner` element from your list +# and save it in a vector called 'dinner' +dinner2 <- meals[length(meals)] +dinner2 +meals +meals <- c(c(meals), c(dinner) +meals <- c(c(meals), c(dinner)) +# Use dollar notation to extract your `dinner` element from your list +# and save it in a vector called 'dinner' +dinner2 <- 
meals[length(meals)] +dinner2 +# Use double-bracket notation to extract your `lunch` element from your list +# and save it in your list as the element at index 5 (no reason beyond practice) +meals[1:4] +# Create a list that has the number of items you ate for each meal +# Hint: use the `lappy()` function to apply the `length()` function to each item +l <- list(breakfast : c("leche","cafe"), +dinner : c("hamburguesa", "patatas")) +# Create a list that has the number of items you ate for each meal +# Hint: use the `lappy()` function to apply the `length()` function to each item +l <- list(breakfast = c("leche","cafe"), +dinner = c("hamburguesa", "patatas")) +l +lapply(l, length()) +l[1] +lapply(l[1], length()) +length(l[1]) +length(l[2]) +lapply(l, length) +lapply(l, length) +return "pizza" +# Write a function `add_pizza` that adds pizza to a given meal vector, and +# returns the pizza-fied vector +add_pizza <- function(){ +return "pizza" +} +return p +return p; +return +return p +return p +return("pizza") +# Write a function `add_pizza` that adds pizza to a given meal vector, and +# returns the pizza-fied vector +add_pizza <- function(){ +return("pizza") +} +lapply(l, add_pizza) +# Write a function `add_pizza` that adds pizza to a given meal vector, and +# returns the pizza-fied vector +add_pizza <- function(ventrada, vsalida){ +vsalida <- c(ventrada, "pizza") +return(vsalida) +} +lapply(l, add_pizza) diff --git a/chapter-08-exercises/exercise-1/exercise.R b/chapter-08-exercises/exercise-1/exercise.R index 75f89c33..77b4476b 100644 --- a/chapter-08-exercises/exercise-1/exercise.R +++ b/chapter-08-exercises/exercise-1/exercise.R @@ -1,25 +1,29 @@ # Exercise 1: creating and accessing lists # Create a vector `my_breakfast` of everything you ate for breakfast - +my_breakfast <- c("leche", "galletas") # Create a vector `my_lunch` of everything you ate (or will eat) for lunch - +my_lunch <- c("tomate", "lechuga", "coliflor") # Create a list `meals` that has contains your 
breakfast and lunch
-
+meals <- list(breakfast = my_breakfast, lunch = my_lunch)
+meals
 
 # Add a "dinner" element to your `meals` list that has what you plan to eat
 # for dinner
-
+dinner_items <- c("hamburguesa", "patatas fritas")
+meals$dinner <- dinner_items
+meals
 
 # Use dollar notation to extract your `dinner` element from your list
 # and save it in a vector called 'dinner'
-
+dinner <- meals$dinner
+dinner
 
 # Use double-bracket notation to extract your `lunch` element from your list
 # and save it in your list as the element at index 5 (no reason beyond practice)
-
+meals[[5]] <- meals[["lunch"]] # index 4 is left as NULL by this assignment
 
 # Use single-bracket notation to extract your breakfast and lunch from your list
 # and save them to a list called `early_meals`
@@ -30,11 +34,16 @@
 
 # Create a list that has the number of items you ate for each meal
 # Hint: use the `lappy()` function to apply the `length()` function to each item
-
+items_per_meal <- lapply(meals, length)
+items_per_meal
 
 # Write a function `add_pizza` that adds pizza to a given meal vector, and
 # returns the pizza-fied vector
-
+add_pizza <- function(ventrada, vsalida = NULL) {
+  # `vsalida` is never read; it stays (now optional) so two-argument calls work
+  c(ventrada, "pizza")
+}
+lapply(meals, add_pizza)
 
 # Create a vector `better_meals` that is all your meals, but with pizza!
 
diff --git a/chapter-17-exercises/exercise-1/.RData b/chapter-17-exercises/exercise-1/.RData
new file mode 100644
index 00000000..16521446
Binary files /dev/null and b/chapter-17-exercises/exercise-1/.RData differ
diff --git a/chapter-17-exercises/exercise-1/.Rhistory b/chapter-17-exercises/exercise-1/.Rhistory
new file mode 100644
index 00000000..bf15c678
--- /dev/null
+++ b/chapter-17-exercises/exercise-1/.Rhistory
@@ -0,0 +1,512 @@
+#origin y dest
+#¿average speed? ¿distance / air_time ?
+city <- flights %>%
+group_by(dest) %>%
+summarize(h_speed = mean((distance/air_time), na.rm = TRUE))
+city
+# Which city was flown to with the highest average speed?
+#origin y dest
+#¿average speed? ¿distance / air_time ?
+city <- flights %>% +group_by(dest) %>% +summarize(h_speed = mean((distance/air_time), na.rm = TRUE)) %>% +head(1) +city +# Which city was flown to with the highest average speed? +#origin y dest +#¿average speed? ¿distance / air_time ? +city <- flights %>% +group_by(origin) %>% +summarize(h_speed = mean((distance/air_time), na.rm = TRUE)) %>% +head(1) +city +dep_delay_by_month +# What was the average departure delay in each month? +# Save this as a data frame `dep_delay_by_month` +# Hint: you'll have to perform a grouping operation then summarizing your data +#group by month y luego avg columa dep_delay +dep_delay_by_month <- flights %>% +group_by(month) %>% +summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) +dep_delay_by_month +dep_delay_by_month +# Which month had the greatest average departure delay? +max_avg <- dep_delay_by_month %>% +filter(avg_delay == max(avg_delay)) %>% +select(month) +max_avg +# Which month had the greatest average departure delay? +max_avg <- dep_delay_by_month %>% +filter(avg_delay == max(avg_delay)) +# select(month) +max_avg +# To which destinations were the average arrival delays the highest? +# Hint: you'll have to perform a grouping operation then summarize your data +# You can use the `head()` function to view just the first few rows +h_dest_delay <- flights %>% +group_by(dest) %>% +summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% +head(1) +h_dest_delay +by_dest %>% +rename(faa = dest)%>% +left_join(airports, by=c("dest" = "faa")) +by_dest %>% +rename(faa = dest)%>% +left_join(airports, by="faa") +by_dest %>% +group_by(dest) %>% +summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% +rename(faa = dest)%>% +left_join(airports, by="faa") +by_dest <- flights %>% +group_by(dest) %>% +summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% +rename(faa = dest)%>% +left_join(airports, by="faa") +by_dest +# Which city was flown to with the highest average speed? +#origin y dest +#¿average speed? ¿distance / air_time ? 
+city <- flights %>% +mutate(speed = (distance/air_time)) %>%speed +group_by(dest) %>% +summarize(h_speed = mean(speed, na.rm = TRUE)) %>% +left_join(airports, by="faa") +# Which city was flown to with the highest average speed? +#origin y dest +#¿average speed? ¿distance / air_time ? +city <- flights %>% +mutate(speed = (distance/air_time)) %>% +group_by(dest) %>% +summarize(h_speed = mean(speed, na.rm = TRUE)) %>% +left_join(airports, by="faa") +by_dest <- flights %>% +group_by(dest) %>% +summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% +rename(faa = dest)%>% +left_join(airports, by="faa") +by_dest +# To which destinations were the average arrival delays the highest? +# Hint: you'll have to perform a grouping operation then summarize your data +# You can use the `head()` function to view just the first few rows +h_dest_delay <- flights %>% +group_by(dest) %>% +summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% +head(1) +h_dest_delay +by_dest <- flights %>% +group_by(dest) %>% +summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% +rename(faa = dest)%>% +left_join(airports, by="faa") +by_dest +city +# You can look up these airports in the `airports` data frame! +View(airports) +library("nycflights13") +library("dplyr") +# What was the average departure delay in each month? +# Save this as a data frame `dep_delay_by_month` +# Hint: you'll have to perform a grouping operation then summarizing your data +#group by month y luego avg columa dep_delay +dep_delay_by_month <- flights %>% +group_by(month) %>% +summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) +dep_delay_by_month +# Which month had the greatest average departure delay? 
+max_avg <- dep_delay_by_month %>% +filter(avg_delay == max(avg_delay)) +# select(month) +max_avg +# If your above data frame contains just two columns (e.g., "month", and "delay" +# in that order), you can create a scatterplot by passing that data frame to the +# `plot()` function +plot(dep_delay_by_month) +# To which destinations were the average arrival delays the highest? +# Hint: you'll have to perform a grouping operation then summarize your data +# You can use the `head()` function to view just the first few rows +h_dest_delay <- flights %>% +group_by(dest) %>% +summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% +head(1) +h_dest_delay +by_dest <- flights %>% +group_by(dest) %>% +summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% +rename(faa = dest)%>% +left_join(airports, by="faa") +by_dest +city <- flights %>% +mutate(speed = (distance/air_time)) %>% +group_by(dest) %>% +summarize(h_speed = mean(speed, na.rm = TRUE)) %>% +left_join(airports, by="faa") +by_dest <- flights %>% +group_by(dest) %>% +summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>% +rename(faa = dest)%>% +left_join(airports, by="faa") +by_dest +library(midwest) +library("midwest") +# Change the color of each point based on the state it is in +ggplot(data = midwest) + +geom_point( +mapping = aes(x = percollege, y = percadultpoverty, color = state) +) + +scale_color_brewer(palette = "Set3") # use the "Set3" color palette +install.library(midwest) +install.packages(midwest) +install.packages("midwest") +# Load packages +library("dplyr") +library("ggmap") +library("ggplot2") +# Load data from GitHub +permit_data <- read.csv( +"https://raw.githubusercontent.com/programming-for-data-science/in-action/master/interactive-vis/data/Building_Permits.csv", +stringsAsFactors = F) +# Formatting and filtering data +samp <- permit_data %>% +filter(!is.na(Longitude), !is.na(Latitude), Longitude <0, Latitude < 100) %>% +filter(PermitTypeDesc == "New") %>% +mutate(year = 
as.numeric(substr(IssuedDate, 1, 4))) %>% +filter(year > 2010) +# Chart option 1: years as categories +qmplot( +data = samp, +x = Longitude, +y = Latitude, +color = as.character(year) +) + scale_color_discrete() + +labs(color = "Year") +# Chart option 2: years as continuous +qmplot( +data = samp, +x = Longitude, +y = Latitude, +color = year +) + scale_color_continuous() + +labs(color = "Year") +install.packages("ggmap") +library("ggmap") +# Chart option 1: years as categories +qmplot( +data = samp, +x = Longitude, +y = Latitude, +color = as.character(year) +) + scale_color_discrete() + +labs(color = "Year") +df <- data.frame( +label = c("A", "B", "C", "D"), +value = 1:4, +stringsAsFactors = F +) +ggplot(df) + +geom_col(mapping = aes(x = label, y = value, fill = "blue")) +ggplot(df) + +geom_col(mapping = aes(x = label, y = value, fill = blue)) +ggplot(df) + +geom_col(mapping = aes(x = label, y = value, fill = blue)) +# Install and load the `ggplot2` package +# You will also want to load `dplyr` +library(ggplot2) +?diamonds +# For this exercise you'll be working with the `diamonds` data set included in +# the ggplot2 library +# Use `?diamonds` to get more information about this data set (including the +# column descriptions. Also check the _column names_ and the _number of rows_ +# in the data set +nrows(diamonds) +# For this exercise you'll be working with the `diamonds` data set included in +# the ggplot2 library +# Use `?diamonds` to get more information about this data set (including the +# column descriptions. Also check the _column names_ and the _number of rows_ +# in the data set +nrows("diamonds") +diamonds +nrow(diamonds) +ncol(diamonds) +colnames(diamonds) +# This data set has A LOT of rows. To make things a bit more readable, +# use dplyr's `sample_n()` function to get a random 1000 rows from the data set +# Store this sample in a variable `diamonds_sample` +diamonds_sample <- sample_n(1000) +library(dplyr) +# This data set has A LOT of rows. 
To make things a bit more readable, +# use dplyr's `sample_n()` function to get a random 1000 rows from the data set +# Store this sample in a variable `diamonds_sample` +diamonds_sample <- sample_n(tbl = diamonds, size=1000) +nrow(diamonds_sample) +View(diamonds_sample) +# Start by making a new `ggplot` with the `diamonds_sample` as the data (no +# geometry yet) +# What do you see? (What did you expect?) +ggplot(data = diamonds_sample) +# Draw a scatter plot (with point geometry) with for the `diamonds_sample` set, +# with the `carat` mapped to the x-position and `price` mapped to the y-position. +scatter(x = "carat", y = "price" ) +# Draw a scatter plot (with point geometry) with for the `diamonds_sample` set, +# with the `carat` mapped to the x-position and `price` mapped to the y-position. +ggplot(data = diamonds_sample) + +geom_point(x = "carat", y = "price") +# Draw a scatter plot (with point geometry) with for the `diamonds_sample` set, +# with the `carat` mapped to the x-position and `price` mapped to the y-position. +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = "carat", y = "price")) +# Draw the same plot as above, but color each of the points based on their +# clarity. +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = "carat", y = "price", colour = "clarity")) +# Draw a scatter plot (with point geometry) with for the `diamonds_sample` set, +# with the `carat` mapped to the x-position and `price` mapped to the y-position. +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y =price)) +# Draw the same plot as above, but color each of the points based on their +# clarity. +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, colour = clarity)) +# Draw the same plot as above, but for the entire `diamonds` data set. Note this +# may take a few seconds to generate. 
+ggplot(data = diamonds) + +geom_point(mapping = aes(x = carat, y = price, colour = clarity)) +# Draw another scatter plot for `diamonds_sample` of price (y) by carat (x), +# but with all of the dots colored "blue". +# Hint: you'll need to set the color channel, not map a value to it! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, colour = )) + +scale_color_brewer(palette = "Set3") +# Draw another scatter plot for `diamonds_sample` of price (y) by carat (x), +# but with all of the dots colored "blue". +# Hint: you'll need to set the color channel, not map a value to it! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price)) + +scale_color_brewer(palette = "Set3") +# Draw another scatter plot for `diamonds_sample` of price (y) by carat (x), +# but with all of the dots colored "blue". +# Hint: you'll need to set the color channel, not map a value to it! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price)) + +scale_color_brewer(palette = "Set1") +# Draw another scatter plot for `diamonds_sample` of price (y) by carat (x), +# but with all of the dots colored "blue". +# Hint: you'll need to set the color channel, not map a value to it! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, colour = clarity)) + +scale_color_brewer(palette = "Set1") +# Draw another scatter plot for `diamonds_sample` of price (y) by carat (x), +# but with all of the dots colored "blue". +# Hint: you'll need to set the color channel, not map a value to it! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, colour = clarity)) + +scale_color_brewer(palette = "Set3") +# Draw another scatter plot for `diamonds_sample` of price (y) by carat (x), +# but with all of the dots colored "blue". +# Hint: you'll need to set the color channel, not map a value to it! 
+ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, colour = clarity)) + +scale_color_brewer(palette = "Blue") +# Draw another scatter plot for `diamonds_sample` of price (y) by carat (x), +# but with all of the dots colored "blue". +# Hint: you'll need to set the color channel, not map a value to it! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, colour = clarity)) + +scale_color_brewer(palette = "PuBu") +# Draw a scatter plot for `diamonds_sample` of `price` by `carat`, where each +# point has an aesthetic _shape_ based on the diamond's `cut`. +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, shape=4)) +# Draw a scatter plot for `diamonds_sample` of `price` by `carat`, where each +# point has an aesthetic _shape_ based on the diamond's `cut`. +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, shape = cut )) +# Draw a scatter plot for `diamonds_sample` of *`cut`* by `carat`, where each +# point has an aesthetic _size_ based on the diamond's *`price`* +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = cut, shape = price )) +# Draw a scatter plot for `diamonds_sample` of `price` by `carat`, where each +# point has an aesthetic _shape_ based on the diamond's `cut`. +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, shape = cut )) +# Draw a scatter plot for `diamonds_sample` of *`cut`* by `carat`, where each +# point has an aesthetic _size_ based on the diamond's *`price`* +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = cut, size = price )) +# Try coloring the above plot based on the diamond's price! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = cut, size = price, colour = price )) +# Try coloring the above plot based on the diamond's price! 
+ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = cut, size = price, colour = price )) + +scale_color_brewer(palette = "3") +# Try coloring the above plot based on the diamond's price! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = cut, size = price, colour = price )) + +scale_color_brewer(palette = "3-class") +# Try coloring the above plot based on the diamond's price! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = cut, size = price, colour = price )) + +scale_color_brewer(palette = "Set1") +# Try coloring the above plot based on the diamond's price! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = cut, size = price)) + +scale_color_brewer(palette = "Set1") +# Try coloring the above plot based on the diamond's price! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = cut, size = price)) + +scale_color_brewer(palette = "Set3") +# Try coloring the above plot based on the diamond's price! +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = cut, size = price)) + +scale_color_brewer(palette = "PuBu") +# Draw a scatter plot (with point geometry) with for the `diamonds_sample` set, +# with the `carat` mapped to the x-position and `price` mapped to the y-position. +ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y =price), color = "blue") +# Draw the same plot as above, but color each of the points based on their +# clarity. 
+ggplot(data = diamonds_sample) + +geom_point(mapping = aes(x = carat, y = price, colour = clarity)) +cd /home/alejandro/Descargas +%cd% "/home/alejandro/Descargas" +cd "/home/alejandro/Descargas" +df <- data.frame("/home/alejandro/Descargas/madrid_2001.csv.zip") +ncols(df) +ncol(df) +df <- data.frame("/home/alejandro/Descargas/madrid_2001.csv") +ncol(df) +ncol(df) +ncols(df) +colnames(df) +df <- data.frame("/home/alejandro/Descargas/madrid_2001.csv") +colnames(df) +df[1] +df <- read.csv("/home/alejandro/Descargas/madrid_2001.csv") +colnames(df) +nrow(df) +ncol(df) +nrow(df) +ncol(df), nrow(df) +ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2001.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2002.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2003.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2004.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2005.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2006.csv"); ncol(df); nrow(df) +f <- +df <- read.csv("/home/alejandro/Descargas/madrid_2007.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2008.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2009.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2010.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2011.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2012.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2013.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2014.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2015.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2016.csv"); ncol(df); nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2017.csv"); ncol(df); 
nrow(df) +df <- read.csv("/home/alejandro/Descargas/madrid_2018.csv"); ncol(df); nrow(df) +# Load necessary packages (`dplyr`, `ggplot2`, and `plotly`) +library(dplyr) +library(ggplot2) +library(plotly) +library(plotly) +# Exercise 1: Creating a grouped bar chart of cancer rates in King County, WA +# (using plotly) +library(dplyr) +library(ggplot2) +library(plotly) +# Load the `"data/IHME_WASHINGTON_MORTALITY_RATES_1980_2014.csv` file +# into a variable `mortality_rates` +# Make sure strings are *not* read in as factors +mortality_rates <- read.csv("data/IHME_WASHINGTON_MORTALITY_RATES_1980_2014.csv") +setwd("~/Dropbox/Data/UPM/1.DP/book-exercises/chapter-17-exercises/exercise-1") +# Load the `"data/IHME_WASHINGTON_MORTALITY_RATES_1980_2014.csv` file +# into a variable `mortality_rates` +# Make sure strings are *not* read in as factors +mortality_rates <- read.csv("data/IHME_WASHINGTON_MORTALITY_RATES_1980_2014.csv") +# This is actually a very large and rich dataset, but we will only focus on +# a small subset of it. Create a new data frame `plot_data` by filtering the +# `mortality_rates` data to the following: +# - The `location_name` is "King County" +# - The `sex` is *not* "Both" +# - The `cause_name` is "Neoplasms" +# - The `year_id` is greater than 2004 +# - Only keep the columns `sex`, `year_id`, and `mortality_rate` +plot_data <- mortality_rates %>% +filter(location_name == "King County") +# This is actually a very large and rich dataset, but we will only focus on +# a small subset of it. 
Create a new data frame `plot_data` by filtering the +# `mortality_rates` data to the following: +# - The `location_name` is "King County" +# - The `sex` is *not* "Both" +# - The `cause_name` is "Neoplasms" +# - The `year_id` is greater than 2004 +# - Only keep the columns `sex`, `year_id`, and `mortality_rate` +plot_data <- mortality_rates %>% +filter(location_name == "King County") %>% +filter(sex != "Both") +# This is actually a very large and rich dataset, but we will only focus on +# a small subset of it. Create a new data frame `plot_data` by filtering the +# `mortality_rates` data to the following: +# - The `location_name` is "King County" +# - The `sex` is *not* "Both" +# - The `cause_name` is "Neoplasms" +# - The `year_id` is greater than 2004 +# - Only keep the columns `sex`, `year_id`, and `mortality_rate` +plot_data <- mortality_rates %>% +filter(location_name == "King County") %>% +filter(sex != "Both") %>% +filter(cause_name == "Neoplasms") %>% +filter(year_id > 2004) %>% +select(sex, year_id, mortality_rate) +colnames(plot_data) +head(plot_data) +# Using ggplot2 (recall chapter 16), make a grouped ("dodge") bar chart of +# mortality rates each year, with different bars for each sex. +# Store this plot in a variable `mort_plot` +mort_plot <- ggplot(data = "plot_data") + +geom_bar(mapping = aes(x = year, y = sex )) +# Using ggplot2 (recall chapter 16), make a grouped ("dodge") bar chart of +# mortality rates each year, with different bars for each sex. +# Store this plot in a variable `mort_plot` +mort_plot <- ggplot(data = plot_data) + +geom_bar(mapping = aes(x = year, y = sex )) +# To make this plot interactive, pass `mort_plot` to the `ggplotly()` function +# (which is part of the `plotly` package). This will make your plot interactive! +ggplotly(mort_plot) +# Using ggplot2 (recall chapter 16), make a grouped ("dodge") bar chart of +# mortality rates each year, with different bars for each sex. 
+# Store this plot in a variable `mort_plot` +mort_plot <- ggplot(data = plot_data) + +geom_bar(mapping = aes(x = year_id, y = sex )) +# To make this plot interactive, pass `mort_plot` to the `ggplotly()` function +# (which is part of the `plotly` package). This will make your plot interactive! +ggplotly(mort_plot) +# Using ggplot2 (recall chapter 16), make a grouped ("dodge") bar chart of +# mortality rates each year, with different bars for each sex. +# Store this plot in a variable `mort_plot` +mort_plot <- ggplot(data = plot_data) + +geom_bar(mapping = aes(x = year_id, y = sex )) +# To make this plot interactive, pass `mort_plot` to the `ggplotly()` function +# (which is part of the `plotly` package). This will make your plot interactive! +ggplotly(mort_plot) +head(plot_data) +tail(plot_data) +# Using ggplot2 (recall chapter 16), make a grouped ("dodge") bar chart of +# mortality rates each year, with different bars for each sex. +# Store this plot in a variable `mort_plot` +mort_plot <- ggplot(data = plot_data) + +geom_bar(mapping = aes(x = year_id, y = mortality_rate )) +# To make this plot interactive, pass `mort_plot` to the `ggplotly()` function +# (which is part of the `plotly` package). This will make your plot interactive! +ggplotly(mort_plot) +# Using the `plot_ly()` function from the `plotly` package, pass in `plot_data` +# as the data, and specify `year_id` as the x variable, mortality_rate as +# the y variable, and `sex` as the color variable. +# (make sure to specify these as *formulas*) +# Also set the plot type to "bar". Store the result in a variable. 
+plot_ly(data=plot_data)
diff --git a/chapter-17-exercises/exercise-1/exercise.R b/chapter-17-exercises/exercise-1/exercise.R
index 48c73172..6597779f 100755
--- a/chapter-17-exercises/exercise-1/exercise.R
+++ b/chapter-17-exercises/exercise-1/exercise.R
@@ -1,7 +1,9 @@
 # Exercise 1: Creating a grouped bar chart of cancer rates in King County, WA
 # (using plotly)
-
 # Load necessary packages (`dplyr`, `ggplot2`, and `plotly`)
+library(dplyr)
+library(ggplot2)
+library(plotly)
 
 
 # Set your working directory using the RStudio menu:
@@ -10,7 +12,10 @@
 # Load the `"data/IHME_WASHINGTON_MORTALITY_RATES_1980_2014.csv` file
 # into a variable `mortality_rates`
 # Make sure strings are *not* read in as factors
-
+mortality_rates <- read.csv(
+  "data/IHME_WASHINGTON_MORTALITY_RATES_1980_2014.csv",
+  stringsAsFactors = FALSE
+)
 
 # This is actually a very large and rich dataset, but we will only focus on
 # a small subset of it. Create a new data frame `plot_data` by filtering the
@@ -20,26 +25,40 @@
 # - The `cause_name` is "Neoplasms"
 # - The `year_id` is greater than 2004
 # - Only keep the columns `sex`, `year_id`, and `mortality_rate`
+plot_data <- mortality_rates %>%
+  filter(location_name == "King County") %>%
+  filter(sex != "Both") %>%
+  filter(cause_name == "Neoplasms") %>%
+  filter(year_id > 2004) %>%
+  select(sex, year_id, mortality_rate)
+colnames(plot_data)
+tail(plot_data)
 
 
 # Using ggplot2 (recall chapter 16), make a grouped ("dodge") bar chart of
 # mortality rates each year, with different bars for each sex.
 # Store this plot in a variable `mort_plot`
-
+# geom_col() draws the supplied y values; geom_bar() would count rows instead
+mort_plot <- ggplot(data = plot_data) +
+  geom_col(
+    mapping = aes(x = year_id, y = mortality_rate, fill = sex),
+    position = "dodge"
+  )
 
 # To make this plot interactive, pass `mort_plot` to the `ggplotly()` function
 # (which is part of the `plotly` package). This will make your plot interactive!
-
+ggplotly(mort_plot)
 
 # As an alternative to making a ggplot chart interactive, we can build the same
 # plot using the plotly API directly
 
 # Using the `plot_ly()` function from the `plotly` package, pass in `plot_data`
 # as the data, and specify `year_id` as the x variable, mortality_rate as
 # the y variable, and `sex` as the color variable.
 # (make sure to specify these as *formulas*)
 # Also set the plot type to "bar". Store the result in a variable.
-
+mort_plotly <- plot_ly(data = plot_data, x = ~year_id, y = ~mortality_rate,
+                       color = ~sex, type = "bar")
 
 # You should see that the cancer mortaility rates for female and males