Using tidyverse tools with Pew Research Center survey data in R

Nick Hatley
Jun 12
(Illustration by Selena Qian/Pew Research Center)

What is the “tidyverse”?

Packages

install.packages("tidyverse")
install.packages("haven")
library(tidyverse) 
#loads all "core" tidyverse packages like
#dplyr, tidyr, forcats, and ggplot2
library(haven)
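
library(tidyverse) attaches only the core packages listed above. If you want to see the full set of packages that install along with the tidyverse, the tidyverse_packages() helper (part of the tidyverse package) lists them:

tidyverse_packages()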

Pipes

mutate(arrange(filter(starwars, !is.na(height)), desc(height)), height = 0.393701 * height)
starwars %>% 
  filter(!is.na(height)) %>%
  arrange(desc(height)) %>%
  mutate(height = 0.393701 * height)
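
The pipe simply takes the object on its left and inserts it as the first argument of the function on its right, so each step reads left to right instead of inside out. As a minimal illustration (not from the original example), these two calls are equivalent:

# x %>% f(y) is shorthand for f(x, y)
filter(starwars, !is.na(height))
starwars %>% filter(!is.na(height))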

Loading the data into R

Apr17 <- read_sav("Apr17 public.sav", 
                  user_na = TRUE) %>%
  as_factor()
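
After importing, it is worth confirming that the labelled SPSS variables came through as factors. A quick check might look like this (the exact output depends on the questionnaire file):

glimpse(Apr17)   #one line per variable, showing its type and first values
class(Apr17$q1)  #should be "factor" after as_factor()
levels(Apr17$q1) #response options for the Trump approval question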

Adding variables with mutate

Apr17 <- Apr17 %>% 
  mutate(trump_approval = case_when(
    q1 == "Approve" & q1a == "Very strongly" ~ "Strongly approve",
    q1 == "Approve" & q1a != "Very strongly" ~ "Not strongly approve",
    q1 == "Disapprove" & q1a == "Very strongly" ~ "Strongly disapprove",
    q1 == "Disapprove" & q1a != "Very strongly" ~ "Not strongly disapprove",
    q1 == "Don't know/Refused (VOL.)" |
      q1a == "Don't know/Refused (VOL.)" ~ "Refused"
    ) %>% #this parenthesis closes the call to case_when and pipes its result into fct_relevel
    fct_relevel("Strongly approve",
                "Not strongly approve",
                "Not strongly disapprove",
                "Strongly disapprove",
                "Refused"
                ) #this parenthesis closes our call to fct_relevel
  ) #this parenthesis closes our call to mutate
table(Apr17$trump_approval, Apr17$q1)
##                          
##                           Approve Disapprove Don't know/Refused (VOL.)
##   Strongly approve            476          0                         0
##   Not strongly approve        130          0                         0
##   Not strongly disapprove       0        143                         0
##   Strongly disapprove           0        676                         0
##   Refused                       0          0                        76
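
The same kind of sanity check can be run against the strength-of-approval follow-up question (q1a); the layout of the resulting table will depend on the questionnaire's labels:

table(Apr17$trump_approval, Apr17$q1a, useNA = "ifany")
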
levels(Apr17$educ2)
## [1] "Less than high school (Grades 1-8 or no formal schooling)"
## [2] "High school incomplete (Grades 9-11 or Grade 12 with NO diploma)"
## [3] "High school graduate (Grade 12 with diploma or GED certificate)"
## [4] "Some college, no degree (includes some community college)"
## [5] "Two year associate degree from a college or university"
## [6] "Four year college or university degree/Bachelor's degree (e.g., BS, BA, AB)"
## [7] "Some postgraduate or professional schooling, no postgraduate degree"
## [8] "Postgraduate or professional degree, including master's, doctorate, medical or law degree"
## [9] "Don't know/Refused (VOL.)"
Apr17 <- Apr17 %>% 
  mutate(educ_cat = fct_collapse(educ2,
    "High school grad or less" = c(
      "Less than high school (Grades 1-8 or no formal schooling)",
      "High school incomplete (Grades 9-11 or Grade 12 with NO diploma)",
      "High school graduate (Grade 12 with diploma or GED certificate)"
      ),
    "Some college" = c(
      "Some college, no degree (includes some community college)",
      "Two year associate degree from a college or university"
      ),
    "College grad+" = c(
      "Four year college or university degree/Bachelor's degree (e.g., BS, BA, AB)",
      "Some postgraduate or professional schooling, no postgraduate degree",
      "Postgraduate or professional degree, including master's, doctorate, medical or law degree"
      )
    ) #this parenthesis closes our call to fct_collapse
  ) #this parenthesis closes our call to mutate
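
To confirm the collapse worked as intended, you can count respondents in each new category with forcats' fct_count(), or cross the old and new variables (the counts will, of course, depend on the data):

fct_count(Apr17$educ_cat)
table(Apr17$educ2, Apr17$educ_cat)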

Getting weighted estimates with group_by and summarise

trump_approval <- Apr17 %>% 
  group_by(trump_approval) %>%
  summarise(weighted_n = sum(weight))
trump_approval <- Apr17 %>% 
  ##group by trump_approval to calculate weighted totals
  ##by taking the sum of the weights
  group_by(trump_approval) %>%
  summarise(weighted_n = sum(weight)) %>%
  ##add the weighted_group_size to get the total weighted n and
  ##divide weighted_n by weighted_group_size to get the proportions
  mutate(weighted_group_size = sum(weighted_n),
         weighted_estimate = weighted_n / weighted_group_size
         )

trump_approval
## # A tibble: 5 x 4
##   trump_approval          weighted_n weighted_group_size weighted_estimate
##   <fct>                        <dbl>               <dbl>             <dbl>
## 1 Strongly approve             1293.               4319.            0.299 
## 2 Not strongly approve          408.               4319.            0.0945
## 3 Not strongly disapprove       458.               4319.            0.106 
## 4 Strongly disapprove          1884.               4319.            0.436 
## 5 Refused                       275.               4319.            0.0636
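
As an aside (not part of the original walkthrough), dplyr's count() can collapse the group_by()/summarise() step, because its wt argument produces weighted tallies. A sketch of the same calculation:

trump_approval_alt <- Apr17 %>% 
  #weighted tally of each answer category
  count(trump_approval, wt = weight, name = "weighted_n") %>%
  #convert the weighted totals into proportions
  mutate(weighted_estimate = weighted_n / sum(weighted_n))
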
trump_estimates_educ <- Apr17 %>% 
  ##group by educ_cat and trump_approval to get weighted n's per group
  group_by(educ_cat, trump_approval) %>%
  ##calculate the total number of people in each answer and education
  ##category using survey weights (weight)
  summarise(weighted_n = sum(weight)) %>%
  ##group by education to calculate education category size
  group_by(educ_cat) %>%
  ##add columns for total group size and the proportion
  mutate(weighted_group_size = sum(weighted_n),
         weighted_estimate = weighted_n/weighted_group_size)

trump_estimates_educ
## # A tibble: 17 x 5
## # Groups:   educ_cat [4]
##    educ_cat    trump_approval  weighted_n weighted_group_… weighted_estima…
##    <fct>       <fct>                <dbl>            <dbl>            <dbl>
##  1 High schoo… Strongly appro…     550.             1710.            0.322 
##  2 High schoo… Not strongly a…     207.             1710.            0.121 
##  3 High schoo… Not strongly d…     221.             1710.            0.129 
##  4 High schoo… Strongly disap…     593.             1710.            0.347 
##  5 High schoo… Refused             140.             1710.            0.0817
##  6 Some colle… Strongly appro…     404.             1337.            0.302 
##  7 Some colle… Not strongly a…     111.             1337.            0.0833
##  8 Some colle… Not strongly d…     128.             1337.            0.0959
##  9 Some colle… Strongly disap…     605.             1337.            0.453 
## 10 Some colle… Refused              88.1            1337.            0.0659
## 11 College gr… Strongly appro…     336.             1258.            0.267 
## 12 College gr… Not strongly a…      89.9            1258.            0.0715
## 13 College gr… Not strongly d…     109.             1258.            0.0870
## 14 College gr… Strongly disap…     676.             1258.            0.537 
## 15 College gr… Refused              47.0            1258.            0.0373
## 16 Don't know… Strongly appro…       2.97             13.3           0.224 
## 17 Don't know… Strongly disap…      10.3              13.3           0.776 
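
If a crosstab-style layout is easier to read (one row per education category, one column per answer), the estimates can be reshaped with tidyr. A rough sketch using pivot_wider(), the current replacement for spread():

trump_estimates_educ %>% 
  select(educ_cat, trump_approval, weighted_estimate) %>%
  pivot_wider(names_from = trump_approval, values_from = weighted_estimate)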

Rearranging data with gather()

Apr17 <- Apr17 %>% 
  select(resp_id = psraid,
         weight,
         trump_approval,
         educ_cat, racethn, gen5)
head(Apr17)
## # A tibble: 6 x 6
##   resp_id weight trump_approval     educ_cat         racethn     gen5      
##     <dbl>  <dbl> <fct>              <fct>            <fct>       <fct>     
## 1  100005   2.94 Strongly disappro… College grad+    Black, non… Silent (1…
## 2  100010   1.32 Not strongly appr… Some college     Hispanic    Boomer (1…
## 3  100021   1.24 Strongly disappro… College grad+    White, non… Silent (1…
## 4  100028   4.09 Strongly approve   Some college     White, non… Boomer (1…
## 5  100037   1.12 Refused            College grad+    White, non… Boomer (1…
## 6  100039   6.68 Strongly disappro… High school gra… Black, non… Boomer (1…
Apr17_long <- Apr17 %>% 
  gather(key = subgroup_variable, value = subgroup,
         educ_cat, racethn, gen5)
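
gather() still works, but it has since been superseded by pivot_longer() in tidyr 1.0.0 and later. Assuming the same column names, a roughly equivalent call would be:

Apr17_long <- Apr17 %>% 
  pivot_longer(cols = c(educ_cat, racethn, gen5),
               names_to = "subgroup_variable",
               values_to = "subgroup",
               #the three factors have different levels, so store the
               #stacked values as plain character strings
               values_transform = list(subgroup = as.character))
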
trump_estimates <- Apr17_long %>% 
  #group by subgroup_variable, subgroup, and trump_approval
  group_by(subgroup_variable, subgroup, trump_approval) %>%
  #calculate the total number of people in each answer and subgroup
  #category using survey weights (weight)
  summarise(weighted_n = sum(weight)) %>%
  #group by subgroup only to calculate subgroup category size
  group_by(subgroup) %>%
  #add columns for total group size and the proportion
  mutate(weighted_group_size = sum(weighted_n),
         weighted_estimate = weighted_n/weighted_group_size)
trump_estimates <- trump_estimates %>% 
  select(-weighted_n, -weighted_group_size)
trump_estimates
## # A tibble: 70 x 4
## # Groups:   subgroup [14]
##    subgroup_variable subgroup           trump_approval     weighted_estima…
##    <chr>             <chr>              <fct>                         <dbl>
##  1 educ_cat          College grad+      Strongly approve             0.267 
##  2 educ_cat          College grad+      Not strongly appr…           0.0715
##  3 educ_cat          College grad+      Not strongly disa…           0.0870
##  4 educ_cat          College grad+      Strongly disappro…           0.537 
##  5 educ_cat          College grad+      Refused                      0.0373
##  6 educ_cat          Don't know/Refuse… Strongly approve             0.0203
##  7 educ_cat          Don't know/Refuse… Strongly disappro…           0.0704
##  8 educ_cat          High school grad … Strongly approve             0.322 
##  9 educ_cat          High school grad … Not strongly appr…           0.121 
## 10 educ_cat          High school grad … Not strongly disa…           0.129 
## # … with 60 more rows
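
One caution worth adding here (an observation, not part of the original post): because the second group_by() uses subgroup alone, any category label that appears in more than one of the original variables (for example, a "Don't know/Refused (VOL.)" level in several of them) would be pooled into a single denominator. Grouping by both columns keeps each variable's categories separate:

trump_estimates <- Apr17_long %>% 
  group_by(subgroup_variable, subgroup, trump_approval) %>%
  summarise(weighted_n = sum(weight)) %>%
  #keep subgroup_variable in the grouping so identically named
  #categories from different variables are not combined
  group_by(subgroup_variable, subgroup) %>%
  mutate(weighted_group_size = sum(weighted_n),
         weighted_estimate = weighted_n / weighted_group_size) %>%
  select(-weighted_n, -weighted_group_size)

The chart below is unaffected either way, since the "Don't know/Refused (VOL.)" and "DK/Ref" subgroups are filtered out before plotting.
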
trump_estimates %>% 
  filter(trump_approval != "Refused") %>%
  filter(!(subgroup %in%
             c("Don't know/Refused (VOL.)", "DK/Ref"))) %>%
  ggplot(
    aes(
      x = weighted_estimate,
      y = subgroup
      )
    ) +
  geom_point() +
  scale_x_continuous(limits = c(0, .8),
                     breaks = seq(0, .6, by = .2),
                     labels = scales::percent(
                       seq(0, .6, by = .2), accuracy = 1)
                     ) +
  facet_grid(cols = vars(trump_approval),
             rows = vars(subgroup_variable),
             scales = "free_y",
             space = "free"
             ) +
  theme_bw() +
  theme(axis.title.y = element_blank())
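
To save the finished chart to disk, you could follow the plotting code with ggsave(); the file name and dimensions below are just placeholders:

ggsave("trump_approval_by_subgroup.png", width = 8, height = 5)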

Pew Research Center: Decoded

The "how" behind the numbers, facts and trends shaping your world.

Written by Nick Hatley

Research Analyst, Pew Research Center. Tweets about research methods, elections, R and the NBA. Views are my own.
