Using tidyverse tools with Pew Research Center survey data in R

Nick Hatley
Jun 12, 2019 · 16 min read
Image for post
Image for post
(Illustration by Selena Qian/Pew Research Center)

What is the “tidyverse”?

Packages

install.packages("tidyverse")
install.packages("haven")
library(tidyverse) 
#loads all "core" tidyverse packages like
#dplyr, tidyr, forcats, and ggplot2
library(haven)

Pipes

mutate(arrange(filter(starwars, !is.na(height)), desc(height)), height = 0.393701 * height)
starwars %>% 
filter(!is.na(height)) %>%
arrange(desc(height)) %>%
mutate(height = 0.393701 * height)

Loading the data into R

Apr17 <- read_sav("Apr17 public.sav", 
user_na = TRUE) %>%
as_factor()

Adding variables with mutate

Apr17 <- Apr17 %>% 
mutate(trump_approval = case_when(

q1 == "Approve" & q1a == "Very strongly" ~ "Strongly approve",
q1 == "Approve" & q1a != "Very strongly" ~ "Not strongly approve",
q1 == "Disapprove" & q1a == "Very strongly" ~ "Strongly disapprove",
q1 == "Disapprove" & q1a != "Very strongly" ~ "Not strongly disapprove",
q1 == "Don't know/Refused (VOL.)" |
q1a == "Don't know/Refused (VOL.)" ~ "Refused"
) #this parentheses closes call to
#case_when and sends it to
#fct_relevel with %>%
%>%
fct_relevel("Strongly approve",
"Not strongly approve",
"Not strongly disapprove",
"Strongly disapprove",
"Refused"
) #this parentheses closes our call to fct_relevel
) #this parentheses closes our call to mutate
table(Apr17$trump_approval, Apr17$q1)
##
## Approve Disapprove Don't know/
## Refused (VOL.)
##Strongly approve 476 0 0
##Not strongly approve 130 0 0
##Not strongly disapprove 0 143 0
##Strongly disapprove 0 676 0
##Refused 0 0 76
levels(Apr17$educ2)## [1] "Less than high school (Grades 1-8 or no formal schooling)"                                
## [2] "High school incomplete (Grades 9-11 or Grade 12 with NO diploma)"
## [3] "High school graduate (Grade 12 with diploma or GED certificate)"
## [4] "Some college, no degree (includes some community college)"
## [5] "Two year associate degree from a college or university"
## [6] "Four year college or university degree/Bachelor's degree (e.g., BS, BA, AB)"
## [7] "Some postgraduate or professional schooling, no postgraduate degree"
## [8] "Postgraduate or professional degree, including master's, doctorate, medical or law degree"
## [9] "Don't know/Refused (VOL.)"
Apr17 <- Apr17 %>% 
mutate(educ_cat = fct_collapse(educ2,
"High school grad or less" = c(
"Less than high school (Grades 1-8 or no formal schooling)",
"High school incomplete (Grades 9-11 or Grade 12 with NO diploma)",
"High school graduate (Grade 12 with diploma or GED certificate)"
),
"Some college" = c(
"Some college, no degree (includes some community college)",
"Two year associate degree from a college or university"
),
"College grad+" = c(
"Four year college or university degree/Bachelor's degree (e.g., BS, BA, AB)",
"Some postgraduate or professional schooling, no postgraduate degree",
"Postgraduate or professional degree, including master's, doctorate, medical or law degree"
)
) #this parentheses closes our call
#to fct_collapse
) #this parentheses closes our call to mutate

Getting weighted estimates with group_by and summarise

trump_approval <- Apr17 %>% 
group_by(trump_approval) %>%
summarise(weighted_n = sum(weight))
trump_approval <- Apr17 %>% 
##group by trump_approval to calculate weighted totals
##by taking the sum of the weights
group_by(trump_approval) %>%
summarise(weighted_n = sum(weight)) %>%
##add the weighted_group_size to get the total weighted n and
##divide weighted_n by weighted_group_size to get the proportions
mutate(weighted_group_size = sum(weighted_n),
weighted_estimate = weighted_n / weighted_group_size
)

trump_approval
## # A tibble: 5 x 4
##trump_approval weighted_n weighted_group_size weighted_estimate
## <fct> <dbl> <dbl> <dbl>
## 1 Strongly
## approve 1293. 4319. 0.299
## 2 Not strongly
## approve 408. 4319. 0.0945
## 3 Not strongly
## disapprove 458. 4319. 0.106
## 4 Strongly
##disapprove 1884. 4319. 0.436
## 5 Refused 275. 4319. 0.0636
trump_estimates_educ <- Apr17 %>% 
##group by educ and trump approval to get weighted n's per group
group_by(educ_cat, trump_approval) %>%
##calculate the total number of people in each answer and education category using survey weights (weight)
summarise(weighted_n = sum(weight)) %>%
##group by education to calculate education category size
group_by(educ_cat) %>%
##add columns for total group size and the proportion
mutate(weighted_group_size = sum(weighted_n),
weighted_estimate = weighted_n/weighted_group_size)

trump_estimates_educ
## # A tibble: 17 x 5
## # Groups: educ_cat [4]
## educ_cat trump_approval weighted_n weighted_group_… weighted_estima…
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 High schoo… Strongly appro… 550. 1710. 0.322
## 2 High schoo… Not strongly a… 207. 1710. 0.121
## 3 High schoo… Not strongly d… 221. 1710. 0.129
## 4 High schoo… Strongly disap… 593. 1710. 0.347
## 5 High schoo… Refused 140. 1710. 0.0817
## 6 Some colle… Strongly appro… 404. 1337. 0.302
## 7 Some colle… Not strongly a… 111. 1337. 0.0833
## 8 Some colle… Not strongly d… 128. 1337. 0.0959
## 9 Some colle… Strongly disap… 605. 1337. 0.453
## 10 Some colle… Refused 88.1 1337. 0.0659
## 11 College gr… Strongly appro… 336. 1258. 0.267
## 12 College gr… Not strongly a… 89.9 1258. 0.0715
## 13 College gr… Not strongly d… 109. 1258. 0.0870
## 14 College gr… Strongly disap… 676. 1258. 0.537
## 15 College gr… Refused 47.0 1258. 0.0373
## 16 Don't know… Strongly appro… 2.97 13.3 0.224
## 17 Don't know… Strongly disap… 10.3 13.3 0.776

Rearranging data with gather()

Apr17 <- Apr17 %>% 
select(resp_id = psraid,
weight,
trump_approval,
educ_cat, racethn, gen5)
head(Apr17)## # A tibble: 6 x 6
## resp_id weight trump_approval educ_cat racethn gen5
## <dbl> <dbl> <fct> <fct> <fct> <fct>
## 1 100005 2.94 Strongly disappro… College grad+ Black, non… Silent (1…
## 2 100010 1.32 Not strongly appr… Some college Hispanic Boomer (1…
## 3 100021 1.24 Strongly disappro… College grad+ White, non… Silent (1…
## 4 100028 4.09 Strongly approve Some college White, non… Boomer (1…
## 5 100037 1.12 Refused College grad+ White, non… Boomer (1…
## 6 100039 6.68 Strongly disappro… High school gra… Black, non… Boomer (1…
Apr17_long <- Apr17 %>% 
gather(key = subgroup_variable, value = subgroup,
educ_cat, racethn, gen5)
Image for post
Image for post
trump_estimates <- Apr17_long %>% 
#group by subgroup_variable, subgroup, and trump approval
group_by(subgroup_variable, subgroup, trump_approval) %>%
#calculate the total number of people in each answer and education #category using survey weights (weight)
summarise(weighted_n = sum(weight)) %>%
#group by subgroup only to calculate subgroup category size
group_by(subgroup) %>%
#add columns for total group size and the proportion
mutate(weighted_group_size = sum(weighted_n),
weighted_estimate = weighted_n/weighted_group_size)
trump_estimates <- trump_estimates %>% 
select(-weighted_n, -weighted_group_size)
trump_estimates
## # A tibble: 70 x 4
## # Groups: subgroup [14]
## subgroup_variable subgroup trump_approval weighted_estima…
## <chr> <chr> <fct> <dbl>
## 1 educ_cat College grad+ Strongly approve 0.267
## 2 educ_cat College grad+ Not strongly appr… 0.0715
## 3 educ_cat College grad+ Not strongly disa… 0.0870
## 4 educ_cat College grad+ Strongly disappro… 0.537
## 5 educ_cat College grad+ Refused 0.0373
## 6 educ_cat Don't know/Refuse… Strongly approve 0.0203
## 7 educ_cat Don't know/Refuse… Strongly disappro… 0.0704
## 8 educ_cat High school grad … Strongly approve 0.322
## 9 educ_cat High school grad … Not strongly appr… 0.121
## 10 educ_cat High school grad … Not strongly disa… 0.129
## # … with 60 more rows
trump_estimates %>% 
filter(trump_approval != "Refused") %>%
filter(!(subgroup %in%
c("Don't know/Refused (VOL.)", "DK/Ref"))) %>%
ggplot(
aes(
x = weighted_estimate,
y = subgroup
)
) +
geom_point() +
scale_x_continuous(limits = c(0, .8),
breaks = seq(0, .6, by = .2),
labels = scales::percent(
seq(0, .6, by = .2), accuracy = 1)
) +
facet_grid(cols = vars(trump_approval),
rows = vars(subgroup_variable),
scales = "free_y",
space = "free"
) +
theme_bw() +
theme(axis.title.y = element_blank())
Image for post

Pew Research Center: Decoded

The "how" behind the numbers, facts and trends shaping your…

Welcome to a place where words matter. On Medium, smart voices and original ideas take center stage - with no ads in sight. Watch

Follow all the topics you care about, and we’ll deliver the best stories for you to your homepage and inbox. Explore

Get unlimited access to the best stories on Medium — and support writers while you’re at it. Just $5/month. Upgrade

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store