# Read the gapminder CSV into the data frame `gap`, which the rest of these
# notes use.
gap <- read.csv("data/gapminder_data.csv")
# Install the listed packages.
install.packages(c("ggplot2", "cowplot", "tidyr", "dplyr",
"knitr", "rmarkdown", "formatR"))
Type in MICKLEY for the room name
# Vectorized operations: arithmetic and comparisons apply element by element.
x <- 1:4
x
# Multiply every element by 2
x * 2
y <- 6:9
# Element-wise sum of two vectors of the same length
x + y
# A comparison returns a logical vector, one value per element
x > 2
a <- x < 2
b <- x > 3
# Element-wise OR of the two logical vectors
a | b
# log() is applied to each element
log(x)
The `*` operator (like the other arithmetic operators) gives you element-wise multiplication. For matrix multiplication you need to use `%*%`.
# 3 x 4 matrix built from the integers 1 to 12
m <- matrix(1:12, nrow=3, ncol=4)
# Element-wise: negate every entry
m * -1
# Element-wise product of m with itself (not a matrix product)
m * m
# Matrix product of m (3 x 4) with a 4 x 1 column of ones
m %*% matrix(1, nrow=4, ncol=1)
number <- 37
# Two-way branch: exactly one of the two print() calls runs.
# Note that the else branch also covers number == 100.
if (number > 100) {
print("greater than 100")
} else {
print("less than 100")
}
# Runs regardless of which branch was taken
print("Finished checking")
# Each comparison returns a single TRUE/FALSE here
number > 100
number < 100
number == 37
number != 37
# An if without else: nothing is printed when the condition is FALSE
if (number > 100) {
print("greater than 100")
}
# Three-way branch on the sign of `number`: prints 1 when it is positive,
# -1 when it is negative and 0 otherwise.
# `<-` is used for assignment, as in the rest of these notes.
number <- -3
if (number > 0) {
  print(1)
} else if (number < 0) {
  print(-1)
} else {
  print(0)
}
# Combining two comparisons in an if() condition.
# `&&` and `||` are the scalar, short-circuiting operators meant for if();
# `&` and `|` are the element-wise forms meant for logical vectors.
number1 <- 15
number2 <- 40
# AND: the first branch runs only when both comparisons are TRUE
if (number1 >= 0 && number2 >= 0) {
  print("Both numbers are positive")
} else {
  print("At least one number is negative")
}
# OR: the first branch runs when at least one comparison is TRUE
if (number1 >= 0 || number2 >= 0) {
  print("At least one number is not negative")
} else {
  print("Both numbers are negative")
}
---------- Socrative #1 ----------: Test whether number is between 45 and 50 (inclusive)
This is a tricky one, 3 correct answers
# Vector of the integers 1 through 10
numbers <- c(1:10)
print(numbers)
# Loop directly over the values: `number` takes each element in turn
for (number in numbers) {
print(number)
}
# Loop over positions instead and index into the vector (first 5 elements)
for(i in 1:5){
print(numbers[i])
}
`1:10` returns a sequence
letter <- "z"
print(letter)
# The loop variable is an ordinary variable: it reuses the name `letter`
for (letter in c("a", "b", "c")) {
print(letter)
}
# After the loop `letter` holds the last value assigned by the loop
print(letter)
---------- Socrative #2 ----------: For loop to calculate sum of vector
# Sum the elements of a vector with an explicit accumulator loop.
# `<-` is used for assignment, as in the rest of these notes.
numbers <- c(4, 8, 15, 16, 23, 42)
running_sum <- 0
for (number in numbers) {
  # Add the current element to the total so far
  running_sum <- running_sum + number
}
print(running_sum)
# Print the `year` value of the first 10 rows of `gap`, one row at a time
for(i in 1:10){
print(gap$year[i])
}
Instead of 10, I could use nrow(gap)
# Nested loops: the inner loop runs to completion for every value of the
# outer loop, so each (i, j) pair is printed once.
# Outer loop over 1, 2, 3
for(i in 1:3){
# Inner loop over the three letters
for(j in c('a', 'b', 'c')){
print(paste(i, j))
} # End of inner loop
} # End of outer loop
Gives me every combination. Notice the order of nesting matters
---------- Socrative #3 ----------: For loop to find years with life expectancy < 35
# Among the first 10 rows of `gap`, print the year of every row whose
# life expectancy is below 35
for (i in 1:10) {
if (gap$lifeExp[i] < 35) {
print(gap$year[i])
}
}
# Scan every row of `gap` and print "country year" for each row whose life
# expectancy is below 35.
# seq_len() is used instead of 1:nrow(gap): for a data frame with no rows,
# 1:0 would iterate over 1 and 0, whereas seq_len(0) skips the loop.
for (i in seq_len(nrow(gap))) {
  if (gap$lifeExp[i] < 35) {
    print(paste(gap$country[i], gap$year[i]))
  }
}
Break R with a While loop, stop the script
# WARNING: this loop never terminates on purpose. z is not changed inside
# the loop body, so the condition z > 0.1 stays TRUE; interrupt R to stop it.
z <- 1
while (z > 0.1) {
print(z)
}
`runif()` is used to pick a random number between zero and 1
# Draw a single random number
runif(1)
z <- 1
# Keep drawing until a value of 0.1 or less comes up. Every draw is printed,
# including the final one that ends the loop.
while (z > 0.1) {
z <- runif(1)
print(z)
}
QUESTION: How many of you end up doing the same thing more than once while analyzing your data?
`read.csv()` is a function. We give it a filename and it gives us a dataframe
# Convert a temperature from degrees Fahrenheit to kelvin.
#   temp: numeric temperature(s) in Fahrenheit
# Returns the corresponding temperature(s) in kelvin.
fahr_to_kelvin <- function(temp) {
  degrees_above_freezing <- temp - 32
  rescaled <- degrees_above_freezing * (5 / 9)
  rescaled + 273.15
}
# Without running previous code:
# (a call made before the definition has been run fails, because the name
# fahr_to_kelvin does not exist yet)
fahr_to_kelvin(32)
# After the definition has been run: 32 F and 212 F converted to kelvin
fahr_to_kelvin(32)
fahr_to_kelvin(212)
# Convert a temperature from kelvin to degrees Celsius.
#   temp: numeric temperature(s) in kelvin
# Returns the corresponding temperature(s) in Celsius.
kelvin_to_celsius <- function(temp) {
  kelvin_offset <- 273.15
  temp - kelvin_offset
}
kelvin_to_celsius(0)
# `celsius` is assigned only inside kelvin_to_celsius(), so it is local to
# that function. Unless a variable of that name was created elsewhere, this
# print() fails because no object named `celsius` exists at top level.
print(celsius)
---------- Socrative #4 ----------: Testing variable scope
Both B and D are correct.
Defensive programming: notice how we used a different name for the output of the function = better
# Convert a temperature from degrees Fahrenheit to degrees Celsius by
# chaining the two existing converters (Fahrenheit -> kelvin -> Celsius).
#   temp: numeric temperature(s) in Fahrenheit
# Returns the corresponding temperature(s) in Celsius.
fahr_to_celsius <- function(temp) {
  kelvin_to_celsius(fahr_to_kelvin(temp))
}
# freezing point of water in Celsius
fahr_to_celsius(32.0)
# The same conversion without the wrapper: nest the two calls directly
kelvin_to_celsius(fahr_to_kelvin(32.0))
---------- Socrative #4 ----------: Write a C to F function
# Convert a temperature from degrees Celsius to degrees Fahrenheit.
#   temp: numeric temperature(s) in Celsius
# Returns the corresponding temperature(s) in Fahrenheit.
celsius_to_fahr <- function(temp){
  scaled <- temp * 9 / 5
  scaled + 32
}
# 100 * 9 / 5 + 32, i.e. 212
celsius_to_fahr(100)
Now write a function that takes two arguments: one the temp to be converted, and another that says whether to convert from fahrenheit to celsius or celsius to fahrenheit. Using if...then, make the same function do both.
tempconvert(temp = 14, to = "fahrenheit")
# Convert a temperature between Celsius and Fahrenheit.
#   temp: numeric temperature(s) to convert
#   to:   target unit, matched case-sensitively:
#         "fahrenheit" treats `temp` as Celsius and converts to Fahrenheit;
#         "celsius" treats `temp` as Fahrenheit and converts to Celsius.
# Returns the converted temperature(s). When `to` is neither of the two
# units, a warning is raised and NA is returned; previously `converted` was
# never assigned in that case and the function failed with an
# "object not found" error.
tempconvert <- function(temp, to) {
  if (to == "fahrenheit") {
    converted <- temp * 9 / 5 + 32
  } else if (to == "celsius") {
    converted <- (temp - 32) * (5 / 9)
  } else {
    warning("'to' was not fahrenheit or celsius", call. = FALSE)
    converted <- NA
  }
  return(converted)
}
# Convert 100 to Fahrenheit
tempconvert(100, "fahrenheit")
# Convert 212 to Celsius
tempconvert(212, "celsius")
# The comparison against `to` is case-sensitive, so "Celsius" matches
# neither the "fahrenheit" nor the "celsius" branch
tempconvert(212, "Celsius")
adding an "else" is a good idea:
else {
converted = NA
print("Error: 'to' was not fahrenheit or celsius")
}
tempconvert <- function(temp, to = "celsius") {
  # Converts a temperature from celsius to fahrenheit
  # or from fahrenheit to celsius.
  # Takes a temperature and the desired unit as arguments;
  # the unit defaults to "celsius" when it is omitted.
  # Returns the converted temperature, or NA with a warning when the
  # unit is neither "fahrenheit" nor "celsius".
  # Example: tempconvert(212, "celsius") => 100
  if (to == "fahrenheit") {
    converted <- temp * 9 / 5 + 32
  } else if (to == "celsius") {
    converted <- (temp - 32) * (5 / 9)
  } else {
    warning("'to' was not fahrenheit or celsius", call. = FALSE)
    converted <- NA
  }
  converted
}
# `to` is omitted here, so the function's default for it is used
tempconvert(212)
# Compute total GDP for every row of a dataset.
#   dat: data frame with columns `pop` and `gdpPercap`
# Returns a vector holding pop * gdpPercap, one value per row.
calcGDP <- function(dat) {
  population <- dat$pop
  per_capita <- dat$gdpPercap
  population * per_capita
}
# Total GDP for the first rows of the dataset. The data frame read in at
# the top of these notes is named `gap`; no object called `gapminder` is
# created in the visible code.
calcGDP(head(gap))
# Add a total-GDP column to a dataset, optionally restricted to some years
# and/or countries.
#   dat:     data frame with columns `year`, `country`, `pop`, `gdpPercap`
#   year:    optional vector of years to keep (NULL keeps every year)
#   country: optional vector of countries to keep (NULL keeps every country)
# Returns the (possibly filtered) rows of `dat` with an extra column `gdp`
# equal to pop * gdpPercap.
calcGDP <- function(dat, year=NULL, country=NULL) {
  restrict_years <- !is.null(year)
  restrict_countries <- !is.null(country)

  if (restrict_years) {
    dat <- dat[dat$year %in% year, ]
  }
  if (restrict_countries) {
    dat <- dat[dat$country %in% country, ]
  }

  cbind(dat, gdp = dat$pop * dat$gdpPercap)
}
# The data frame read in at the top of these notes is named `gap`; no
# object called `gapminder` is created in the visible code.
# First rows of the 2007 data with the added gdp column
head(calcGDP(gap, year = 2007))
# All rows for one country with the added gdp column
calcGDP(gap, country = "Australia")
We'll cover the first two, since the same stuff applies to everyone
We're going to use dplyr and tidyr instead.
install.packages(c("tidyr", "dplyr", "knitr", "rmarkdown", "formatR"))
library(dplyr)
# Preview the full dataset
head(gap)
# select() keeps only the named columns
yr_country_gdp <- select(gap, year, country, gdpPercap)
# Preview the result. The object created above is yr_country_gdp;
# the name year_country_gdp was never assigned.
head(yr_country_gdp)
Select lets us subset columns, but what if we want to subset rows? filter() does that
# Structure of the full dataset, for comparison
str(gap)
# filter() keeps only the rows for which the condition is TRUE
gap_eu <- filter(gap, continent == "Europe")
# Structure of the filtered result
str(gap_eu)
We can stack these functions to do both
# Nest the two verbs: keep only the European rows, then keep three columns.
# filter() has to run first because `continent` is not one of the selected
# columns, so it is no longer available once select() has been applied.
yr_country_gdp_eu <- select(filter(gap, continent == "Europe"),
                            year, country, gdpPercap)
# The pipe passes the left-hand side as the first argument of the call on
# the right, so this is select(gap, year, country, gdpPercap)
yr_country_gdp <- gap %>% select(year, country, gdpPercap)
# Chained steps read top to bottom: filter the rows first (while the
# continent column is still present), then keep three columns
yr_country_gdp_eu <- gap %>%
filter(continent=="Europe") %>%
select(year,country,gdpPercap)
Walk through this example, showing the flow
---------- Socrative #1 ----------: Filter using 2 filters and select 3 cols
Write a command with pipes that filters the gapminder dataset to only include data from 2007 in Africa, and then select the year, country, and lifeExp columns.
How many rows are left in the resulting dataset? If you're not sure how to find the number of rows, discuss with your neighbors.
# Exercise answer, version 1: two separate filter() steps.
# Keep the Africa rows, then the 2007 rows, then three columns.
africa_07_lifeExp <- gap %>%
  filter(continent == "Africa") %>%
  filter(year == 2007) %>%
  select(year, country, lifeExp)
# Number of rows left after filtering
nrow(africa_07_lifeExp)
# Exercise answer, version 2: both conditions in a single filter() call.
# A row is kept only when every condition is TRUE.
africa_07_lifeExp <- gap %>%
  filter(continent == "Africa", year == 2007) %>%
  select(year, country, lifeExp)
# str() also reports the number of rows
str(africa_07_lifeExp)
# summarize() without grouping: one overall mean of gdpPercap
mean_gdp <- gap %>% summarize(meanGDP = mean(gdpPercap))
# The same mean computed with base R
mean(gap$gdpPercap)
# With group_by(), summarize() computes the mean separately per continent
gdp_by_cont <- gap %>%
group_by(continent) %>%
summarize(mean_gdp = mean(gdpPercap))
gdp_by_cont
Here we're grouping by continent, which means we calculate a separate mean for each one
---------- Socrative #2 ----------: Avg lifeExp by year for Africa
Let's compute the average life expectancy across all African countries by year. In how many years did average African life expectancy decrease?
# Exercise answer: mean life expectancy over the Africa rows, per year
africa_lifeExp_yr <- gap %>%
filter(continent == "Africa") %>%
group_by(year) %>%
summarize(avg_life = mean(lifeExp))
africa_lifeExp_yr
What if we wanted to create a new column for gdp per billion people w/o condensing our data down? Use mutate()
# mutate() adds a column without collapsing rows: for the 2007 rows, compute
# gdpPercap * pop / 10^9 and keep it alongside continent and country
bill_gdp_country_07 <- gap %>%
filter(year == 2007) %>%
mutate(billion_gdp = gdpPercap * pop / 10^9) %>%
select(continent, country, billion_gdp)
head(bill_gdp_country_07)
We can group multiple variables and summarize multiple things
# Several summary statistics for every continent/year combination.
# se_gdp reuses sd_gdp and sample_size, which are created earlier in the
# same summarize() call; n() counts the rows in each group.
gdp_by_cont <- gap %>%
  group_by(continent, year) %>%
  summarize(mean_gdp = mean(gdpPercap),
            sd_gdp = sd(gdpPercap),
            mean_pop = mean(pop),
            sample_size = n(),
            se_gdp = sd_gdp / sqrt(sample_size))
gdp_by_cont
# Show the summary as a plain data frame. The object created above is
# gdp_by_cont; the name gdp_by_continents was never assigned.
gdp_by_cont %>% data.frame()
We can combine dplyr and ggplot2
# Pipe the filtered rows straight into ggplot() as its data argument, then
# add layers with `+`: life expectancy over time, one color per country.
# NOTE: needs ggplot2 to be loaded; in these notes library(ggplot2) only
# appears further down.
gap %>% filter(continent == "Americas") %>%
ggplot(aes(x = year, y = lifeExp, color = country)) +
geom_line() +
geom_point()
select(), filter(), group_by(), summarize(), and mutate()
install.packages("tidyr")
library(tidyr)
Genus Weight Length
Ursus 122 82
Felis 5 14
Genus Measurement Value
Ursus Weight 122
Ursus Length 82
Felis Weight 5
Felis Length 14
---------- Socrative #3 ----------: What format is the gap dataset
Answer: intermediate
# Read the wide-format version of the dataset, keeping text columns as
# character vectors rather than factors
gap_wide <- read.csv("data/gapminder_wide.csv", stringsAsFactors = FALSE)
str(gap_wide)
We're going to convert it to long format with gather()
# Wide -> long: gather every column whose name starts with pop, lifeExp or
# gdpPercap into two columns: obstype_year (the former column name) and
# obs_values (the value)
gap_long <- gap_wide %>%
gather(obstype_year, obs_values, starts_with('pop'),
starts_with('lifeExp'), starts_with('gdpPercap'))
str(gap_long)
head(gap_long)
We could also just exclude the columns we don't want to gather
# Alternative: name the columns NOT to gather (continent and country)
gap_long <- gap_wide %>%
gather(obstype_year, obs_values, -continent, -country)
str(gap_long)
# Split obstype_year at the underscore into obs_type and year, then convert
# year from text to integer
gap_long <- gap_long %>%
separate(obstype_year, into = c('obs_type', 'year'), sep = "_") %>%
mutate(year = as.integer(year))
---------- Socrative #4 ----------: Use dplyr on the long dataset
Using gap_long, summarize the mean life expectancy by continent
# Exercise answer: in long format every measurement shares the obs_values
# column, so keep only the lifeExp rows before averaging per continent
gap_long %>%
filter(obs_type == "lifeExp") %>%
group_by(continent) %>%
summarize(lifeExp = mean(obs_values))
# Long -> intermediate: spread obs_type back out into one column per
# measurement, filled from obs_values
gap_normal <- gap_long %>%
spread(obs_type, obs_values)
# Compare shape and column names with the original `gap`
head(gap_normal)
dim(gap_normal)
dim(gap)
names(gap_normal)
names(gap)
# Put the columns in a fixed order before comparing with `gap`
gap_normal <- gap_normal %>%
select(country, year, pop, continent, lifeExp, gdpPercap)
names(gap_normal)
# NOTE(review): all_equal() (underscore) is not base R; it presumably comes
# from dplyr and may be deprecated in recent versions. Confirm.
all_equal(gap_normal, gap)
head(gap_normal)
head(gap)
# Sort the rows before comparing again
gap_normal <- gap_normal %>%
arrange(country, continent, year)
# Base R all.equal() (dot) this time
all.equal(gap_normal, gap)
head(gap_normal)
head(gap)
Defensive programming: It's a good idea to do checks all the time (like all_equal)
# Long -> wide: glue obs_type and year back together with "_" into
# var_names, then spread that into one column per type/year combination
gap_wide_new <- gap_long %>%
unite(var_names, obs_type, year, sep = "_") %>%
spread(var_names, obs_values)
str(gap_wide_new)
gather(), spread(), separate(), unite()
Question: How many of you have made a plot in R? How many of you have used ggplot?
# Base R scatter plot: life expectancy against per-capita GDP
plot(x = gap$gdpPercap, y = gap$lifeExp)
# NOTE: ggplot2 is not in this install list; it is part of the
# install.packages() call at the top of these notes.
install.packages(c("tidyr", "dplyr", "knitr", "rmarkdown", "formatR"))
library(ggplot2)
# The same scatter plot with ggplot2: data + aesthetic mapping + a geom layer
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) +
geom_point()
# The same call without a geom layer, for comparison
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp))
---------- Socrative #1 ----------: Modify a ggplot graph
# Exercise answer: life expectancy over time as points
ggplot(data = gap, aes(x=year, y=lifeExp)) + geom_point()
# Lines instead of points, colored by continent.
# NOTE(review): `by` is not a documented ggplot2 aesthetic; `group = country`
# is the documented way to ask for one line per country. Confirm.
ggplot(data = gap, aes(x = year, y = lifeExp,
by = country, color = continent)) +
geom_line()
# Add a point layer on top of the lines; the mapping given in ggplot()
# applies to both layers
ggplot(data = gap, aes(x = year, y = lifeExp,
by = country, color = continent)) +
geom_line() +
geom_point()
# Color mapped inside geom_line() only, so it applies to the lines alone
ggplot(data = gap, aes(x = year, y = lifeExp, by = country)) +
geom_line(aes(color = continent)) +
geom_point()
# A color given outside aes() is a fixed value, not a mapping to a column
ggplot(data = gap, aes(x = year, y = lifeExp, by = country)) +
geom_line(aes(color = continent)) +
geom_point(color = "blue")
# Back to GDP vs. life expectancy, points colored by continent
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) +
geom_point()
# Semi-transparent points (alpha = 0.5) on a log10-scaled x axis
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) +
geom_point(alpha = 0.5) +
scale_x_log10()
Here we also use transparency so that overlapping points are easier to see. This works with any geometry. We could also set this to use a data column inside aes()
# Add a smoothing layer fitted with method "lm" on top of the points
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) +
geom_point() + scale_x_log10() + geom_smooth(method="lm")
# Same plot with size = 1.5 passed to the smoothing layer
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) +
geom_point() + scale_x_log10() + geom_smooth(method="lm", size=1.5)
---------- Socrative #2 ----------: Color by continent & add separate trends
# Exercise answer: color is mapped in ggplot(), so it applies to both the
# points and the smoothing layer
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) +
geom_point(size = 1.5) +
scale_x_log10() +
geom_smooth(method="lm")
# Additionally map point shape to continent (points layer only)
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) +
geom_point(size = 2, aes(shape = continent)) +
scale_x_log10() +
geom_smooth(method="lm")
# Same plot with a y axis fixed to 0-100 with breaks every 10, the bw theme,
# and custom title, axis and legend labels. color and shape are given the
# same legend title.
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) +
geom_point(size = 2, aes(shape = continent)) +
scale_x_log10() +
geom_smooth(method="lm") +
scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 10)) +
theme_bw() +
labs(title = "Effects of per-capita GDP on Life Expectancy",
x = "GDP per Capita ($)",
y = "Life Expectancy (yrs)",
color = "Continents",
shape = "Continents")
# Save a plot to disk as PNG and as PDF. No plot is passed, so ggsave()
# falls back to its default for which plot to save.
# The argument is spelled out as `filename`; `file =` only worked through
# partial argument matching.
ggsave(filename = "life_expectancy.png")
ggsave(filename = "life_expectancy.pdf")
---------- Socrative #3 ----------: Optional ggsave & ggplot variables - ggsave will overwrite the graph
# facet_wrap(~ year): the same plot split into panels by year
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp, color = continent)) +
facet_wrap(~ year) +
geom_point(size = 2, aes(shape = continent)) +
scale_x_log10() +
geom_smooth(method = "lm")
# Keep only the rows for four countries
L.countries <- gap %>%
filter(country %in% c("Lebanon", "Lesotho", "Liberia", "Libya"))
L.countries
# Life expectancy over time, split into panels by country
ggplot(L.countries, aes(x = year, y = lifeExp, color = country)) +
geom_line() + facet_wrap( ~ country)
Draws a panel for each unique value in that column
# Plot life expectancy over time for a chosen set of countries, faceted by
# country.
#   data:      data frame with columns `country`, `year` and `lifeExp`
#   countries: vector of country names to keep
# Returns the ggplot object (drawn when printed at top level).
lifeExp_country <- function(data, countries) {
  selected_rows <- filter(data, country %in% countries)
  base_plot <- ggplot(selected_rows,
                      aes(x = year, y = lifeExp, color = country))
  base_plot +
    geom_line() +
    facet_wrap( ~ country)
}
# Reuse the function for a different set of countries
lifeExp_country(gap, c("Ethiopia", "Australia", "Canada"))
# Load cowplot
library(cowplot)
Text size
# Theme settings for text size. As written the theme() result is not added
# to any plot; presumably it is meant to be appended to a ggplot with `+`.
theme(
# Text size for axis ticks
axis.text.y = element_text(size = 14),
axis.text.x = element_text(size = 14),
# Text size for axis labels
# Also move them away from the axes a bit for more space
axis.title.x = element_text(size = 18, face = "bold", vjust = -1),
axis.title.y = element_text(size = 18, face = "bold", vjust = 1.5),
# Plot title size, move away from plot
plot.title = element_text(size = 20, face = "bold", vjust = 5)
)
Adjust the legend
# Theme settings for the legend. As written the theme() result is not added
# to any plot; presumably it is meant to be appended to a ggplot with `+`.
theme(
# Text size
legend.text = element_text(size = 14),
legend.title = element_text(size = 16, face = "bold"),
# Position
# NOTE(review): presumably relative coordinates inside the plot area; confirm
legend.position = c(x = 0.8, y = 0.2)
)
Create a boxplot showing the spread of life expectancy for each continent
# Box plot per continent with the individual observations jittered on top
# in a single fixed color
ggplot(data = gap, aes(x = continent, y = lifeExp)) +
geom_boxplot() +
geom_jitter(width = 0.2, alpha = 0.5, color = "tomato")
# Same plot, but the jittered points are colored by year; factor() turns
# year into a categorical variable for the color mapping
ggplot(data = gap, aes(x = continent, y = lifeExp)) +
geom_boxplot() +
geom_jitter(width = 0.2, alpha = 0.5, size = 2,
aes(color = factor(year)))
Create a grouped barplot showing life expectancy by year for each continent
# geom_bar() with only an x mapping: one bar per continent
ggplot(data = gap, aes(x = continent)) +
  geom_bar()
# Grouped bars: mean life expectancy per continent, one bar per year placed
# side by side (position = "dodge"). The summary function is passed as
# `fun`; `fun.y` is the older, deprecated spelling of this argument.
ggplot(data = gap, aes(x = continent, y = lifeExp, fill = factor(year))) +
  geom_bar(stat = "summary", fun = "mean", position = "dodge")
Question: How many of you could just send your code to your advisor or collaborator?
Basically a fancy lab notebook!
head(cars, 30)
and run it
### Heading
* Bullet or - Bullet
1. list
**bold**
*italic*
[Software Carpentry](http://software-carpentry.org/)
---------- Socrative ----------: R Notebook Challenge
Modify your R notebook to load the data from gap, show the first 10 lines, and display a graph.
Give each R code section a heading and short summary. -
````
## Reading & Displaying Data
This chunk reads in the gap dataset and shows the first 10 lines
```{r}
gap <- read.csv("data/gapminder_data.csv")
head(gap, 10)
```
## Graph Gap Dataset
This chunk shows a graph
```{r}
library(ggplot2)
ggplot(data = gap, aes(x = gdpPercap, y = lifeExp)) +
geom_point()
```
````
There's a format to publish reports to github
github_document:
toc: yes
toc_depth: 5
You can name chunks, and control output for graphs
{r "test_graph", fig.height = 6, fig.width = 8, dpi = 300}
I include `sessionInfo()` in my R Notebooks so I know what package versions I used
packrat
Go to Tools > Terminal > New Terminal or Git > Gear > Shell
git remote add origin https://github.com/mickley/rstudio-test.git
git push -u origin master