gap <- read.csv("data/gapminder_data.csv")
install.packages(c("ggplot2", "cowplot", "tidyr", "dplyr", 
    "knitr", "rmarkdown", "formatR"))
# Type in MICKLEY for the room name
x <- 1:4
x
x * 2
y <- 6:9
x + y
x > 2
a <- x < 2
b <- x > 3
a | b
log(x)
# The * operator etc. gives you element-wise multiplication.
# For matrix multiplication you need to use %*%
m <- matrix(1:12, nrow=3, ncol=4)
m * -1
m * m
m %*% matrix(1, nrow=4, ncol=1)
# Compare a single number against a threshold with if/else.
number <- 37
if (number > 100) {
    print("greater than 100")
} else {
    print("less than 100")
}
print("Finished checking")

# Comparison operators, each evaluated on its own line so the result prints.
number > 100
number < 100
number == 37
number != 37
if (number > 100) {
    print("greater than 100")
}
# Print 1 for a positive number, -1 for a negative number, and 0 otherwise.
number <- -3
if (number > 0) {
    print(1)
} else if (number < 0) {
    print(-1)
} else {
    print(0)
}
number1 <- 15
number2 <- 40
# `&&` and `||` are the scalar, short-circuiting operators intended for `if`
# conditions; `&` and `|` are the element-wise forms for vectors.
if (number1 >= 0 && number2 >= 0) {
    print("Both numbers are positive")
} else {
    print("At least one number is negative")
}
if (number1 >= 0 || number2 >= 0) {
    print("At least one number is not negative")
} else {
    print("Both numbers are negative")
}
# ---------- Socrative #1 ----------: Test whether number is between 45 and 50 (inclusive)
This is a tricky one, 3 correct answers
numbers <- c(1:10)
print(numbers)
for (number in numbers) {
     print(number)
}
for(i in 1:5){
    print(numbers[i])
}
# 1:10 returns a sequence
letter <- "z"
print(letter)
for (letter in c("a", "b", "c")) {
    print(letter)
}
print(letter)
# ---------- Socrative #2 ----------: For loop to calculate sum of vector
# Add up the elements of `numbers` one at a time with a for loop.
numbers <- c(4, 8, 15, 16, 23, 42)
running_sum <- 0
for (number in numbers) {
    # Accumulate the current element into the total
    running_sum <- running_sum + number
}
print(running_sum)
for(i in 1:10){
    print(gap$year[i])
}
# Instead of 10, I could use nrow(gap)
# First for loop
for(i in 1:3){
    # Second for loop
    for(j in c('a', 'b', 'c')){
        print(paste(i, j))
    } # End of second
} # End of first
# Gives me every combination. Notice the order of nesting matters
# ---------- Socrative #3 ----------: For loop to find years with life expectancy < 35
for (i in 1:10) {
     if (gap$lifeExp[i] < 35) {
          print(gap$year[i])
     }
}
# Print the country and year of every row whose life expectancy is below 35.
# seq_len() replaces 1:nrow(gap) so that a zero-row `gap` skips the loop
# instead of iterating over c(1, 0).
for (i in seq_len(nrow(gap))) {
     if (gap$lifeExp[i] < 35) {
          print(paste(gap$country[i], gap$year[i]))
     }
}
# Break R with a While loop, stop the script
z <- 1
while (z > 0.1) {
    print(z)
}
# runif() is used to pick a random number between zero and 1
runif(1)
z <- 1
while (z > 0.1) {
    z <- runif(1)
    print(z)
}
# QUESTION: How many of you end up doing the same thing more than once while analyzing your data?
# read.csv() is a function. We give it a filename and it gives us a data frame
# Convert a temperature from degrees Fahrenheit to Kelvin.
#   temp: numeric temperature(s) in Fahrenheit
# Returns the equivalent temperature(s) in Kelvin.
fahr_to_kelvin <- function(temp) {
    # Rescale the offset from freezing, then shift onto the Kelvin scale
    scaled <- (temp - 32) * (5 / 9)
    scaled + 273.15
}
# Without running previous code: 
fahr_to_kelvin(32)
fahr_to_kelvin(32)
fahr_to_kelvin(212)
# Convert a temperature from Kelvin to degrees Celsius.
#   temp: numeric temperature(s) in Kelvin
# Returns the equivalent temperature(s) in Celsius.
kelvin_to_celsius <- function(temp) {
    temp - 273.15
}
kelvin_to_celsius(0)
print(celsius)
# ---------- Socrative #4 ----------: Testing variable scope
# Both B and D are correct.
# Defensive programming: notice how we used a different name for the output of the function = better
# Convert a temperature from degrees Fahrenheit to degrees Celsius by
# chaining the two converters: Fahrenheit -> Kelvin -> Celsius.
#   temp: numeric temperature(s) in Fahrenheit
fahr_to_celsius <- function(temp) {
    kelvin_to_celsius(fahr_to_kelvin(temp))
}
# freezing point of water in Celsius
fahr_to_celsius(32.0)
kelvin_to_celsius(fahr_to_kelvin(32.0))
# ---------- Socrative #4 ----------: Write a C to F function
# Convert a temperature from degrees Celsius to degrees Fahrenheit.
#   temp: numeric temperature(s) in Celsius
# Returns the equivalent temperature(s) in Fahrenheit.
celsius_to_fahr <- function(temp) {
  scaled <- temp * 9 / 5
  scaled + 32
}
celsius_to_fahr(100)
# Now write a function that takes two arguments: one the temp to be converted, and another that says whether to convert from fahrenheit to celsius or celsius to fahrenheit. Using if...else, make the same function do both.
tempconvert(temp = 14, to = "fahrenheit")
# Converts a temperature from celsius to fahrenheit or from fahrenheit
# to celsius.
#   temp: numeric temperature to convert
#   to:   the desired unit, either "fahrenheit" or "celsius" (case-sensitive)
# Returns the converted temperature, or NA (after printing a message) when
# `to` is not one of the two recognised units.
tempconvert <- function(temp, to) {
     if (to == "fahrenheit") {
          converted <- temp * 9 / 5 + 32
     } else if (to == "celsius") {
          converted <- (temp - 32) * (5 / 9)
     } else {
          # Without this branch `converted` would never be assigned and the
          # return() below would fail with "object 'converted' not found".
          converted <- NA
          print("Error: 'to' was not fahrenheit or celsius")
     }
     return(converted)
}
tempconvert(100, "fahrenheit")
tempconvert(212, "celsius")
# "Celsius" (capital C) matches neither branch of the if/else chain
tempconvert(212, "Celsius")
# Adding an "else" is a good idea:
# else {
#     converted = NA
#     print("Error: 'to' was not fahrenheit or celsius")
# }
tempconvert <- function(temp, to = "celsius") {
    # Converts a temperature from celsius to fahrenheit
    #   or from fahrenheit to celsius.
    # Takes a temperature and the desired unit as arguments
    #   (the unit defaults to "celsius")
    # Returns the converted temperature, or NA if the unit is not recognised
    # Example: tempconvert(212, "celsius") => 100
    if (to == "fahrenheit") {
        converted <- temp * 9 / 5 + 32
    } else if (to == "celsius") {
        converted <- (temp - 32) * (5 / 9)
    } else {
        converted <- NA
        print("Error: 'to' was not fahrenheit or celsius")
    }
    return(converted)
}
tempconvert(212)
# Compute total GDP for each row of a dataset as the product of its
# population column (`pop`) and its GDP-per-capita column (`gdpPercap`).
# Returns the vector of products.
calcGDP <- function(dat) {
  dat$pop * dat$gdpPercap
}
# The data frame was loaded as `gap`; no object named `gapminder` exists here.
calcGDP(head(gap))
# Add a `gdp` column (pop * gdpPercap) to a dataset, optionally restricting
# the rows first.
#   dat:     data frame with `pop` and `gdpPercap` columns (plus `year` /
#            `country` when the matching filter is used)
#   year:    optional vector of years to keep; NULL keeps every year
#   country: optional vector of countries to keep; NULL keeps every country
# Returns the (possibly filtered) data frame with `gdp` appended.
calcGDP <- function(dat, year = NULL, country = NULL) {
  year_given <- !is.null(year)
  if (year_given) {
    rows_in_year <- dat$year %in% year
    dat <- dat[rows_in_year, ]
  }
  country_given <- !is.null(country)
  if (country_given) {
    rows_in_country <- dat$country %in% country
    dat <- dat[rows_in_country, ]
  }
  cbind(dat, gdp = dat$pop * dat$gdpPercap)
}
# The data frame was loaded as `gap`; no object named `gapminder` exists here.
head(calcGDP(gap, year=2007))
calcGDP(gap, country="Australia")
# We'll cover the first two, since the same stuff applies to everyone
# We're going to use dplyr and tidyr instead.
install.packages(c("tidyr", "dplyr", "knitr", "rmarkdown", "formatR"))
library(dplyr)
head(gap)
yr_country_gdp <- select(gap, year, country, gdpPercap)
# Inspect the object that was just created (it is yr_country_gdp, not
# year_country_gdp)
head(yr_country_gdp)
# Select lets us subset columns, but what if we want to subset rows? filter() does that
str(gap)
gap_eu <- filter(gap, continent == "Europe")
str(gap_eu)
# We can stack these functions to do both
# Nested form: filter() has to run first, while `continent` is still a
# column; selecting first would drop it and the filter would fail.
yr_country_gdp_eu <- select(filter(gap, continent == "Europe"),
    year, country, gdpPercap)
# Piped form of the plain select
yr_country_gdp <- gap %>% select(year, country, gdpPercap)
# Piped form of filter-then-select
yr_country_gdp_eu <- gap %>%
    filter(continent=="Europe") %>%
    select(year,country,gdpPercap)
# Walk through this example, showing the flow
---------- Socrative #1 ----------: Filter using 2 filters and select 3 cols
Write a command with pipes that filters the gapminder dataset to only include data from 2007 in Africa, and then select the year, country, and lifeExp columns.
How many rows are left in the resulting dataset? If you're not sure how to find the number of rows, discuss with your neighbors.
# Answer 1: two chained filter() calls, then select the three columns
africa_07_lifeExp <- gap %>%
    filter(continent == "Africa") %>% 
    filter(year == 2007) %>%
    select(year, country, lifeExp)
nrow(africa_07_lifeExp)
# Answer 2: both conditions in a single filter() call
africa_07_lifeExp <- gap %>%
    filter(continent == "Africa", year == 2007) %>% 
    select(year, country, lifeExp)
str(africa_07_lifeExp)
# Overall mean GDP per capita, via summarize() and via base R
mean_gdp <- gap %>% summarize(meanGDP = mean(gdpPercap))
mean(gap$gdpPercap)
gdp_by_cont <- gap %>%
    group_by(continent) %>%
    summarize(mean_gdp = mean(gdpPercap))
gdp_by_cont
# Here we're grouping by continent, which means we calculate a separate mean for each one
---------- Socrative #2 ----------: Avg lifeExp by year for Africa
Let's compute the average life expectancy across all African countries by year. In how many years did average African life expectancy decrease?
# Mean life expectancy across African countries, one row per year
africa_lifeExp_yr <- gap %>%
    filter(continent == "Africa") %>%
    group_by(year) %>%
    summarize(avg_life = mean(lifeExp))
africa_lifeExp_yr
# What if we wanted to create a new column for gdp per billion people w/o condensing our data down? Use mutate()
bill_gdp_country_07 <- gap %>%
    filter(year == 2007) %>%
    mutate(billion_gdp = gdpPercap * pop / 10^9) %>%
    select(continent, country, billion_gdp)
head(bill_gdp_country_07)
# We can group multiple variables and summarize multiple things
# Several summaries per continent/year group. se_gdp reuses the sd_gdp and
# sample_size columns defined earlier in the same summarize() call.
gdp_by_cont <- gap %>%
    group_by(continent, year) %>%
    summarize(mean_gdp = mean(gdpPercap), 
        sd_gdp = sd(gdpPercap), 
        mean_pop = mean(pop), 
        sample_size = n(),
        se_gdp = sd_gdp / sqrt(sample_size))
gdp_by_cont
# Convert to a plain data frame to print every row; the object created
# above is gdp_by_cont (gdp_by_continents does not exist).
gdp_by_cont %>% data.frame()
# We can combine dplyr and ggplot2
# ggplot2 has to be attached before ggplot() can be called
library(ggplot2)
gap %>% filter(continent == "Americas") %>%
    ggplot(aes(x = year, y = lifeExp, color = country)) + 
    geom_line() + 
    geom_point()
# select(), filter(), group_by(), summarize(), and mutate()
install.packages("tidyr")
library(tidyr)
# Wide format (one column per measurement):
# Genus   Weight  Length
# Ursus   122     82
# Felis   5       14
# Long format (one row per measurement):
# Genus   Measurement     Value
# Ursus   Weight          122
# Ursus   Length          82
# Felis   Weight          5
# Felis   Length          14
# ---------- Socrative #3 ----------: What format is the gap dataset
# Answer: intermediate
gap_wide <- read.csv("data/gapminder_wide.csv", stringsAsFactors = FALSE)
str(gap_wide)
We're going to convert it to long format with gather()
gap_long <- gap_wide %>%
     gather(obstype_year, obs_values, starts_with('pop'),
            starts_with('lifeExp'), starts_with('gdpPercap'))
str(gap_long)
head(gap_long)
# We could also just exclude the columns we don't want to gather
gap_long <- gap_wide %>% 
    gather(obstype_year, obs_values, -continent, -country)
str(gap_long)
gap_long <- gap_long %>% 
     separate(obstype_year, into = c('obs_type', 'year'), sep = "_") %>%
     mutate(year = as.integer(year))
# ---------- Socrative #4 ----------: Use dplyr on the long dataset
# Using gap_long, summarize the mean life expectancy by continent
gap_long %>%
    filter(obs_type == "lifeExp") %>%
    group_by(continent) %>%
    summarize(lifeExp = mean(obs_values))
gap_normal <- gap_long %>% 
    spread(obs_type, obs_values)
head(gap_normal)
dim(gap_normal)
dim(gap)
names(gap_normal)
names(gap)
gap_normal <- gap_normal %>%
    select(country, year, pop, continent, lifeExp, gdpPercap)
names(gap_normal)
# Use base all.equal() for both checks. dplyr's all_equal() is deprecated and
# ignores row order by default, so it would hide the row-order difference
# that arrange() below is there to fix.
all.equal(gap_normal, gap)
head(gap_normal)
head(gap)
gap_normal <- gap_normal %>% 
    arrange(country, continent, year)
all.equal(gap_normal, gap)
head(gap_normal)
head(gap)
# Defensive programming: It's a good idea to do checks all the time (like all.equal)
gap_wide_new <- gap_long %>% 
    unite(var_names, obs_type, year, sep = "_") %>%
    spread(var_names, obs_values)
str(gap_wide_new)
# gather(), spread(), separate(), unite()
# Question: How many of you have made a plot in R? How many of you have used ggplot?
# Base R scatterplot for comparison
plot(x = gap$gdpPercap, y = gap$lifeExp)
install.packages(c("tidyr", "dplyr", "knitr", "rmarkdown", "formatR"))
library(ggplot2)
# The same scatterplot with ggplot2
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) +
    geom_point()
# The same call without a geom layer
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp))
# ---------- Socrative #1 ----------: Modify a ggplot graph
ggplot(data = gap, aes(x=year, y=lifeExp)) + geom_point()
ggplot(data = gap, aes(x = year, y = lifeExp, 
    by = country, color = continent)) +
    geom_line()
ggplot(data = gap, aes(x = year, y = lifeExp, 
    by = country, color = continent)) +
    geom_line() + 
    geom_point()
ggplot(data = gap, aes(x = year, y = lifeExp, by = country)) +
    geom_line(aes(color = continent)) + 
    geom_point()
ggplot(data = gap, aes(x = year, y = lifeExp, by = country)) +
    geom_line(aes(color = continent)) + 
    geom_point(color = "blue")
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) +
    geom_point()
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) +
    geom_point(alpha = 0.5) + 
    scale_x_log10()
# Here we also use transparency so that overlapping points are easier to see. This works with any geometry. We could also set this to use a data column inside aes()
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) +
    geom_point() + scale_x_log10() + geom_smooth(method="lm")
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) +
    geom_point() + scale_x_log10() + geom_smooth(method="lm", size=1.5)
# ---------- Socrative #2 ----------: Color by continent & add separate trends
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) +
    geom_point(size = 1.5) +
    scale_x_log10() +
    geom_smooth(method="lm")
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) +
    geom_point(size = 2, aes(shape = continent)) + 
    scale_x_log10() + 
    geom_smooth(method="lm")
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) +
    geom_point(size = 2, aes(shape = continent)) + 
    scale_x_log10() + 
    geom_smooth(method="lm") + 
    scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 10)) + 
    theme_bw() + 
    labs(title = "Effects of per-capita GDP on Life Expectancy", 
         x = "GDP per Capita ($)", 
         y = "Life Expectancy (yrs)", 
         color = "Continents", 
         shape = "Continents")
ggsave(file = "life_expectancy.png")
ggsave(file = "life_expectancy.pdf")
# ---------- Socrative #3 ----------: Optional ggsave & ggplot variables - ggsave will overwrite the graph
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) + 
    facet_wrap(~ year) + 
    geom_point(size = 2, aes(shape = continent)) + 
    scale_x_log10() +
    geom_smooth(method = "lm")
L.countries <- gap %>% 
    filter(country %in% c("Lebanon", "Lesotho", "Liberia", "Libya"))
L.countries
ggplot(L.countries, aes(x = year, y = lifeExp, color = country)) + 
    geom_line() + facet_wrap( ~ country)
# Draws a panel for each unique value in that column
# Plot life expectancy over time for a chosen set of countries, with one
# facet panel per country.
#   data:      data frame with `country`, `year` and `lifeExp` columns
#   countries: vector of country names to keep
# Returns the ggplot object.
lifeExp_country <- function(data, countries) {
     # Keep only the rows belonging to the requested countries
     selected <- filter(data, country %in% countries)
     panels <- ggplot(selected, aes(x = year, y = lifeExp, color = country)) +
          geom_line() +
          facet_wrap( ~ country)
     panels
}
lifeExp_country(gap, c("Ethiopia", "Australia", "Canada"))
library(cowplot)
# Text size
theme(
    # Text size for axis ticks
    axis.text.y = element_text(size = 14),
    axis.text.x = element_text(size = 14),
    # Text size for axis labels
    # Also move them away from the axes a bit for more space
    axis.title.x = element_text(size = 18, face = "bold", vjust = -1),
    axis.title.y = element_text(size = 18, face = "bold", vjust = 1.5),
    # Plot title size, move away from plot
    plot.title = element_text(size = 20, face = "bold", vjust = 5)
    )
# Adjust the legend
theme(
    # Text size
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 16, face = "bold"),
    # Position
    legend.position = c(x = 0.8, y = 0.2)
    )
# Create a boxplot showing the spread of life expectancy for each continent
ggplot(data = gap, aes(x = continent, y = lifeExp)) + 
   geom_boxplot() + 
   geom_jitter(width = 0.2, alpha = 0.5, color = "tomato")
ggplot(data = gap, aes(x = continent, y = lifeExp)) + 
    geom_boxplot() + 
    geom_jitter(width = 0.2, alpha = 0.5, size = 2, 
        aes(color = factor(year)))
# Create a grouped barplot showing life expectancy by year for each continent
# With no y aesthetic, geom_bar() counts the rows in each continent
ggplot(data = gap, aes(x = continent)) + 
    geom_bar()
# Grouped bars of mean life expectancy per continent and year.
# `fun` replaces the deprecated `fun.y` argument of the summary stat.
ggplot(data = gap, aes(x = continent, y = lifeExp, fill = factor(year))) + 
    geom_bar(stat = "summary", fun = "mean", position = "dodge")
# Question: How many of you could just send your code to your advisor or collaborator?
Basically a fancy lab notebook!
head(cars, 30) and run it
### Heading
* Bullet or - Bullet
1. list
**bold**
*italic*
[Software Carpentry](http://software-carpentry.org/)
---------- Socrative ----------: R Notebook Challenge
Modify your R notebook to load the data from gap, show the first 10 lines, and display a graph.
Give each R code section a heading and short summary. - 
    ````
    ## Reading & Displaying Data
    This chunk reads in the gap dataset and shows the first 10 lines
    ```{r}
    gap <- read.csv("data/gapminder_data.csv")
    head(gap, 10)
    ```
    ## Graph Gap Dataset
    This chunk shows a graph
    ```{r}
    library(ggplot2)
    ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) + 
      geom_point()
    ```
    ````
There's a format to publish reports to github
github_document:
    toc: yes
    toc_depth: 5
You can name chunks, and control output for graphs
{r "test_graph", fig.height = 6, fig.width = 8, dpi = 300}
sessionInfo() in my R Notebooks so I know what package versions I used
packrat
Go to Tools > Terminal > New Terminal or Git > Gear > Shell
git remote add origin https://github.com/mickley/rstudio-test.git
git push -u origin master