resources

Here are a number of helpful resources to dig deeper into data wrangling and tidying using tidyverse packages:

UO Bootcamp modules

Cheat sheets

load packages

# Install pacman only when it is missing. requireNamespace() checks whether the
# package is available without attaching it; pacman is called below through
# `pacman::`, so it never needs to be attached.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# p_load() attaches each package, installing it first if it is not yet installed
pacman::p_load("tidyverse", "here", install = TRUE)

load the data file

First, check your current working directory

here()

## [1] "/Users/danicosme/Documents/code/cnlab/PLTV/pltv-seminar"

Modify the path as needed to where your week4_data.csv file is downloaded. Note, my path is a little different, but for you, it should be in the data/ directory in the same folder as this script.

Import the csv file and assign it to the variable data

# NOTE: as the glimpse() output below shows, the first two rows of this export
# hold the question text and `ImportId` metadata rather than responses, which is
# why every column is read in as character (<chr>)
data = read_csv(here("static", "labs", "data", "week4_data.csv"))

check the data file

Check the names of the variables using names()

names(data)

##   [1] "StartDate"             "EndDate"               "Status"               
##   [4] "Progress"              "Duration (in seconds)" "Finished"             
##   [7] "RecordedDate"          "ResponseId"            "DistributionChannel"  
##  [10] "UserLanguage"          "Q151_First Click"      "Q151_Last Click"      
##  [13] "Q151_Page Submit"      "Q151_Click Count"      "consent"              
##  [16] "vote_registered"       "behavior_mailin"       "behavior_voting"      
##  [19] "behavior_pollworker"   "attitude_1"            "attitude_2"           
##  [22] "sn_injunctive"         "sn_descriptive"        "CE_attitudes_1"       
##  [25] "CE_attitudes_2"        "CE_attitudes_3"        "CE_attitudes_4"       
##  [28] "CE_attitudes_5"        "CE_attitudes_6"        "CE_attitudes_7"       
##  [31] "CE_attitudes_8"        "CE_checklist_1"        "CE_checklist_2"       
##  [34] "CE_checklist_3"        "CE_checklist_4"        "CE_checklist_5"       
##  [37] "CE_checklist_6"        "CE_checklist_7"        "CE_checklist_8"       
##  [40] "CE_checklist_9"        "CE_checklist_10"       "CE_checklist_11"      
##  [43] "CE_checklist_12"       "CE_checklist_13"       "CE_checklist_14"      
##  [46] "CE_checklist_15"       "CE_checklist_16"       "CE_checklist_17"      
##  [49] "CE_voting"             "CE_awareness"          "CE_talk_politics"     
##  [52] "CE_politics_know1"     "CE_politics_know2"     "CE_know3"             
##  [55] "CE_media_1"            "CE_media_2"            "CE_media_3"           
##  [58] "CE_media_4"            "CE_media_5"            "CE_media_6"           
##  [61] "vote_motive_1"         "vote_motive_2"         "vote_motive_3"        
##  [64] "vote_motive_4"         "vote_motive_5"         "advantages"           
##  [67] "disadvantages"         "barriers"              "solutions"            
##  [70] "reasons_no"            "reasons_no_12_TEXT"    "reasons_yes"          
##  [73] "reasons_yes_19_TEXT"   "politics_party"        "politics_party_9_TEXT"
##  [76] "politics_ideology"     "pol_expr&contcreat_1"  "pol_expr&contcreat_2" 
##  [79] "pol_expr&contcreat_3"  "pol_eff_1"             "pol_eff_2"            
##  [82] "pol_eff_3"             "pol_eff_4"             "soc_trust1"           
##  [85] "soc_trust2"            "soc_trust3"            "pol_trust"            
##  [88] "IAF_1"                 "IAF_2"                 "IAF_3"                
##  [91] "IAF_4"                 "IAF_5"                 "IAF_6"                
##  [94] "IAF_7"                 "IAF_8"                 "IAF_9"                
##  [97] "IAF_10"                "gender"                "gender_4_TEXT"        
## [100] "hispanic_latinx"       "race"                  "race_self"            
## [103] "ses_subj_1"            "ses_subj_2"            "ses_subj_3"           
## [106] "ses_subj_4"            "ses_subj_5"            "ses_subj_6"           
## [109] "ses_subj_7"            "ses_subj_8"            "ses_subj_9"           
## [112] "ses_subj_10"           "ses_degree"            "ses_degree_father"    
## [115] "ses_degree_mother"

Check the data types using glimpse()

glimpse(data)

## Rows: 183
## Columns: 115
## $ StartDate               <chr> "Start Date", "{\"ImportId\":\"startDate\",\"t…
## $ EndDate                 <chr> "End Date", "{\"ImportId\":\"endDate\",\"timeZ…
## $ Status                  <chr> "Response Type", "{\"ImportId\":\"status\"}", …
## $ Progress                <chr> "Progress", "{\"ImportId\":\"progress\"}", "10…
## $ `Duration (in seconds)` <chr> "Duration (in seconds)", "{\"ImportId\":\"dura…
## $ Finished                <chr> "Finished", "{\"ImportId\":\"finished\"}", "1"…
## $ RecordedDate            <chr> "Recorded Date", "{\"ImportId\":\"recordedDate…
## $ ResponseId              <chr> "Response ID", "{\"ImportId\":\"_recordId\"}",…
## $ DistributionChannel     <chr> "Distribution Channel", "{\"ImportId\":\"distr…
## $ UserLanguage            <chr> "User Language", "{\"ImportId\":\"userLanguage…
## $ `Q151_First Click`      <chr> "Timing - First Click", "{\"ImportId\":\"QID14…
## $ `Q151_Last Click`       <chr> "Timing - Last Click", "{\"ImportId\":\"QID149…
## $ `Q151_Page Submit`      <chr> "Timing - Page Submit", "{\"ImportId\":\"QID14…
## $ `Q151_Click Count`      <chr> "Timing - Click Count", "{\"ImportId\":\"QID14…
## $ consent                 <chr> "By clicking the \"I agree\" button below, you…
## $ vote_registered         <chr> "Did you register to vote in the 2020 presiden…
## $ behavior_mailin         <chr> "Did you request a mail-in ballot to vote in t…
## $ behavior_voting         <chr> "Did you vote in the 2020 presidential electio…
## $ behavior_pollworker     <chr> "Did you serve as a poll worker for the 2020 p…
## $ attitude_1              <chr> "My voting in the 2020 presidential election w…
## $ attitude_2              <chr> "My voting in the 2020 presidential election w…
## $ sn_injunctive           <chr> "Most people who are important to me approve o…
## $ sn_descriptive          <chr> "Most people who are important to me voted in …
## $ CE_attitudes_1          <chr> "To what extent do you agree or disagree with …
## $ CE_attitudes_2          <chr> "To what extent do you agree or disagree with …
## $ CE_attitudes_3          <chr> "To what extent do you agree or disagree with …
## $ CE_attitudes_4          <chr> "To what extent do you agree or disagree with …
## $ CE_attitudes_5          <chr> "To what extent do you agree or disagree with …
## $ CE_attitudes_6          <chr> "To what extent do you agree or disagree with …
## $ CE_attitudes_7          <chr> "To what extent do you agree or disagree with …
## $ CE_attitudes_8          <chr> "To what extent do you agree or disagree with …
## $ CE_checklist_1          <chr> "For each of the following items, please indic…
## $ CE_checklist_2          <chr> "For each of the following items, please indic…
## $ CE_checklist_3          <chr> "For each of the following items, please indic…
## $ CE_checklist_4          <chr> "For each of the following items, please indic…
## $ CE_checklist_5          <chr> "For each of the following items, please indic…
## $ CE_checklist_6          <chr> "For each of the following items, please indic…
## $ CE_checklist_7          <chr> "For each of the following items, please indic…
## $ CE_checklist_8          <chr> "For each of the following items, please indic…
## $ CE_checklist_9          <chr> "For each of the following items, please indic…
## $ CE_checklist_10         <chr> "For each of the following items, please indic…
## $ CE_checklist_11         <chr> "For each of the following items, please indic…
## $ CE_checklist_12         <chr> "For each of the following items, please indic…
## $ CE_checklist_13         <chr> "For each of the following items, please indic…
## $ CE_checklist_14         <chr> "For each of the following items, please indic…
## $ CE_checklist_15         <chr> "For each of the following items, please indic…
## $ CE_checklist_16         <chr> "For each of the following items, please indic…
## $ CE_checklist_17         <chr> "For each of the following items, please indic…
## $ CE_voting               <chr> "We know that most people don't vote in all el…
## $ CE_awareness            <chr> "Some people seem to follow what's going on in…
## $ CE_talk_politics        <chr> "How often do you talk about current event or …
## $ CE_politics_know1       <chr> "Would you say that one of the parties is more…
## $ CE_politics_know2       <chr> "Which party is more conservative?", "{\"Impor…
## $ CE_know3                <chr> "How much of a majority is required for the U.…
## $ CE_media_1              <chr> "Below are some ways that people get news and …
## $ CE_media_2              <chr> "Below are some ways that people get news and …
## $ CE_media_3              <chr> "Below are some ways that people get news and …
## $ CE_media_4              <chr> "Below are some ways that people get news and …
## $ CE_media_5              <chr> "Below are some ways that people get news and …
## $ CE_media_6              <chr> "Below are some ways that people get news and …
## $ vote_motive_1           <chr> "Different people vote for different reasons. …
## $ vote_motive_2           <chr> "Different people vote for different reasons. …
## $ vote_motive_3           <chr> "Different people vote for different reasons. …
## $ vote_motive_4           <chr> "Different people vote for different reasons. …
## $ vote_motive_5           <chr> "Different people vote for different reasons. …
## $ advantages              <chr> "In your opinion, what are the greatest advant…
## $ disadvantages           <chr> "In your opinion, what do you see as the possi…
## $ barriers                <chr> "In your opinion, what are the barriers to vot…
## $ solutions               <chr> "In your opinion, how can students help each o…
## $ reasons_no              <chr> "Some people choose to vote, while others do n…
## $ reasons_no_12_TEXT      <chr> "Some people choose to vote, while others do n…
## $ reasons_yes             <chr> "Some people choose to vote, while others do n…
## $ reasons_yes_19_TEXT     <chr> "Some people choose to vote, while others do n…
## $ politics_party          <chr> "In politics today, would you consider yoursel…
## $ politics_party_9_TEXT   <chr> "In politics today, would you consider yoursel…
## $ politics_ideology       <chr> "We hear a lot of talk these days about libera…
## $ `pol_expr&contcreat_1`  <chr> "How often do you take part in each of the fol…
## $ `pol_expr&contcreat_2`  <chr> "How often do you take part in each of the fol…
## $ `pol_expr&contcreat_3`  <chr> "How often do you take part in each of the fol…
## $ pol_eff_1               <chr> "To what extent do you agree or disagree with …
## $ pol_eff_2               <chr> "To what extent do you agree or disagree with …
## $ pol_eff_3               <chr> "To what extent do you agree or disagree with …
## $ pol_eff_4               <chr> "To what extent do you agree or disagree with …
## $ soc_trust1              <chr> "Do you think most people would try to take ad…
## $ soc_trust2              <chr> "Would you say that most of the time people tr…
## $ soc_trust3              <chr> "Generally speaking, would you say that most p…
## $ pol_trust               <chr> "How much of the time do you think you can tru…
## $ IAF_1                   <chr> "Below is a collection of statements about you…
## $ IAF_2                   <chr> "Below is a collection of statements about you…
## $ IAF_3                   <chr> "Below is a collection of statements about you…
## $ IAF_4                   <chr> "Below is a collection of statements about you…
## $ IAF_5                   <chr> "Below is a collection of statements about you…
## $ IAF_6                   <chr> "Below is a collection of statements about you…
## $ IAF_7                   <chr> "Below is a collection of statements about you…
## $ IAF_8                   <chr> "Below is a collection of statements about you…
## $ IAF_9                   <chr> "Below is a collection of statements about you…
## $ IAF_10                  <chr> "Below is a collection of statements about you…
## $ gender                  <chr> "In the next section, you will be asked some q…
## $ gender_4_TEXT           <chr> "In the next section, you will be asked some q…
## $ hispanic_latinx         <chr> "Do you identify as Hispanic or Latinx?", "{\"…
## $ race                    <chr> "What race or races do you consider yourself t…
## $ race_self               <chr> "We use the categories presented above for sta…
## $ ses_subj_1              <chr> "Think of this ladder as representing where pe…
## $ ses_subj_2              <chr> "Think of this ladder as representing where pe…
## $ ses_subj_3              <chr> "Think of this ladder as representing where pe…
## $ ses_subj_4              <chr> "Think of this ladder as representing where pe…
## $ ses_subj_5              <chr> "Think of this ladder as representing where pe…
## $ ses_subj_6              <chr> "Think of this ladder as representing where pe…
## $ ses_subj_7              <chr> "Think of this ladder as representing where pe…
## $ ses_subj_8              <chr> "Think of this ladder as representing where pe…
## $ ses_subj_9              <chr> "Think of this ladder as representing where pe…
## $ ses_subj_10             <chr> "Think of this ladder as representing where pe…
## $ ses_degree              <chr> "What is the highest degree or level of school…
## $ ses_degree_father       <chr> "The next questions ask about your parents.\nI…
## $ ses_degree_mother       <chr> "What is the highest degree or level of school…

Check the number of rows and columns using nrow() and ncol()

nrow(data)

## [1] 183

ncol(data)

## [1] 115

Check the first 10 rows using head()

head(data, n = 10)

View the dataframe using View()

View(data)

rename columns

Rename the following columns using rename():

pol_expr&contcreat_1 –> political_expression_1
pol_expr&contcreat_2 –> political_expression_2
pol_expr&contcreat_3 –> political_expression_3

The %>% operator is called a pipe. You can think of it as meaning “and then”. So, we’re going to take the data variable and then rename three columns

# give the three political expression items descriptive names
# (the syntax is new_name = old_name; the old names are quoted because they
# contain "&", which is not allowed in a bare R name)
data_renamed = data %>%
  rename(
    political_expression_1 = "pol_expr&contcreat_1",
    political_expression_2 = "pol_expr&contcreat_2",
    political_expression_3 = "pol_expr&contcreat_3"
  )

# check the names
names(data_renamed)

##   [1] "StartDate"              "EndDate"                "Status"                
##   [4] "Progress"               "Duration (in seconds)"  "Finished"              
##   [7] "RecordedDate"           "ResponseId"             "DistributionChannel"   
##  [10] "UserLanguage"           "Q151_First Click"       "Q151_Last Click"       
##  [13] "Q151_Page Submit"       "Q151_Click Count"       "consent"               
##  [16] "vote_registered"        "behavior_mailin"        "behavior_voting"       
##  [19] "behavior_pollworker"    "attitude_1"             "attitude_2"            
##  [22] "sn_injunctive"          "sn_descriptive"         "CE_attitudes_1"        
##  [25] "CE_attitudes_2"         "CE_attitudes_3"         "CE_attitudes_4"        
##  [28] "CE_attitudes_5"         "CE_attitudes_6"         "CE_attitudes_7"        
##  [31] "CE_attitudes_8"         "CE_checklist_1"         "CE_checklist_2"        
##  [34] "CE_checklist_3"         "CE_checklist_4"         "CE_checklist_5"        
##  [37] "CE_checklist_6"         "CE_checklist_7"         "CE_checklist_8"        
##  [40] "CE_checklist_9"         "CE_checklist_10"        "CE_checklist_11"       
##  [43] "CE_checklist_12"        "CE_checklist_13"        "CE_checklist_14"       
##  [46] "CE_checklist_15"        "CE_checklist_16"        "CE_checklist_17"       
##  [49] "CE_voting"              "CE_awareness"           "CE_talk_politics"      
##  [52] "CE_politics_know1"      "CE_politics_know2"      "CE_know3"              
##  [55] "CE_media_1"             "CE_media_2"             "CE_media_3"            
##  [58] "CE_media_4"             "CE_media_5"             "CE_media_6"            
##  [61] "vote_motive_1"          "vote_motive_2"          "vote_motive_3"         
##  [64] "vote_motive_4"          "vote_motive_5"          "advantages"            
##  [67] "disadvantages"          "barriers"               "solutions"             
##  [70] "reasons_no"             "reasons_no_12_TEXT"     "reasons_yes"           
##  [73] "reasons_yes_19_TEXT"    "politics_party"         "politics_party_9_TEXT" 
##  [76] "politics_ideology"      "political_expression_1" "political_expression_2"
##  [79] "political_expression_3" "pol_eff_1"              "pol_eff_2"             
##  [82] "pol_eff_3"              "pol_eff_4"              "soc_trust1"            
##  [85] "soc_trust2"             "soc_trust3"             "pol_trust"             
##  [88] "IAF_1"                  "IAF_2"                  "IAF_3"                 
##  [91] "IAF_4"                  "IAF_5"                  "IAF_6"                 
##  [94] "IAF_7"                  "IAF_8"                  "IAF_9"                 
##  [97] "IAF_10"                 "gender"                 "gender_4_TEXT"         
## [100] "hispanic_latinx"        "race"                   "race_self"             
## [103] "ses_subj_1"             "ses_subj_2"             "ses_subj_3"            
## [106] "ses_subj_4"             "ses_subj_5"             "ses_subj_6"            
## [109] "ses_subj_7"             "ses_subj_8"             "ses_subj_9"            
## [112] "ses_subj_10"            "ses_degree"             "ses_degree_father"     
## [115] "ses_degree_mother"

# select only names that start with "political" using the grepl() function
# grepl() uses regular expressions to match patterns; the leading "^" anchors
# the pattern to the start of the name (without it, "political" would match
# anywhere in the name)

names(data_renamed)[grepl("^political", names(data_renamed))]

## [1] "political_expression_1" "political_expression_2" "political_expression_3"

filter out responses

Filter out the following responses using filter():

Remove test responses (DistributionChannel == "preview")
Incomplete responses (Finished == 0)
Participants who didn’t consent (consent == 0)

# keep only real, completed, consented responses
# (conditions separated by commas inside filter() must all be TRUE)
data_filtered = data_renamed %>%
  filter(
    DistributionChannel != "preview", # drop test (preview) responses
    Finished == 1,                    # drop incomplete responses
    consent == 1                      # drop participants who did not consent
  )

# check the number of rows
nrow(data_filtered)

## [1] 166

recode responses

The variable CE_voting had a response option outside of the 1-4 scale range; it was coded as 99 to flag people who were not eligible to vote in the previous election.

We want to recode that as missing data using NA.

Let’s do this using mutate() and ifelse().

ifelse() is a logical statement that means “if test is true, do X; otherwise (i.e. if test is false) do Y”

# check unique response values
unique(data_filtered$CE_voting)

## [1] "99" "2"  "3"  "4"  "1"

# recode
# CE_voting is stored as character, so compare against the string "99" rather
# than relying on R silently converting the number 99 to text, and use
# NA_character_ so the column stays character even if every row is recoded
data_recoded = data_filtered %>%
  mutate(CE_voting = ifelse(test = CE_voting == "99",
                            yes = NA_character_,
                            no = CE_voting))

# check unique response values again
unique(data_recoded$CE_voting)

## [1] NA  "2" "3" "4" "1"

# NOTE(review): this averages data_filtered (before the recode above), so the
# 99 codes are still included -- which is why the result falls far outside the
# 1-4 scale; use data_recoded to get the mean with 99 treated as missing
mean(as.numeric(data_filtered$CE_voting), na.rm = TRUE)

## [1] 45.45783

select a subset of columns

Select the following columns using select(), and contains() and starts_with() to match patterns of the column names:

ResponseId
behavior_voting
all columns that start with CE_attitudes
all columns that contain checklist

# keep the ID column, the voting outcome, and the two civic engagement scales
data_select = data_recoded %>%
  select(
    ResponseId,
    behavior_voting,
    starts_with("CE_attitudes"), # CE_attitudes_1 ... CE_attitudes_8
    contains("checklist")        # CE_checklist_1 ... CE_checklist_17
  )

data_select

convert from wide to long format

To more easily wrangle the data for multiple columns at once, we’re going to convert from the wide to the long format using pivot_longer().

Do this for all variables except ResponseId and behavior_voting by specifying these columns with the cols = argument.

Let’s also rename the name column (the default) to scale_name using names_to = "scale_name".

# stack every scale item into one row per person per item
data_long = data_select %>%
  pivot_longer(
    # pivot every column except the two identifying columns
    cols = !c(ResponseId, behavior_voting),
    # the old column names are stored here; the responses go to `value`
    names_to = "scale_name"
  )

data_long

extract item number from scale name

scale_name contains both the names of the scale (CE_attitudes or CE_checklist) as well as the item number.

Let’s use extract() to separate these components into two separate columns called scale_name and item.

The col = argument specifies the original column name, the into = argument is where we specify the names of the columns we want to create, and regex = specifies the regular expression pattern.

Regular expressions are SUPER helpful for wrangling and tidying. Learn more about them with this cheat sheet.

data_scale = data_long %>%
  # split e.g. "CE_attitudes_3" into the scale ("CE_attitudes") and the item
  # number ("3"); each set of parentheses in the regex fills one new column
  extract(
    col = scale_name,
    into = c("scale_name", "item"),
    regex = "(CE_attitudes|CE_checklist)_([0-9]+)"
  ) %>%
  # put the columns in a fixed order using `select()`
  select(ResponseId, behavior_voting, scale_name, item, value)

data_scale

convert response values to numeric

Currently the responses in value are character strings. To use them as numbers, we need to convert the column to numeric values using mutate() and as.numeric().

# check variable type
typeof(data_scale$value)

## [1] "character"

# convert to numeric
# across() applies as.numeric() to the `value` column and keeps its name
data_numeric = data_scale %>%
  mutate(across(value, as.numeric))

# check variable type
typeof(data_numeric$value)

## [1] "double"

reverse score variables

Let’s say CE_attitudes_1 is a reverse-scored item, so we want to flip the scale so it matches the other items. The scale ranges from 1 to 7, so we’ll flip the responses, e.g. 1 becomes 7 and 7 becomes 1.

We can do this using mutate() and ifelse().

# check range
# CE_attitudes_1 is still stored as character in data_select, so convert it
# before calling range(); on character vectors range() compares strings
# alphabetically (e.g. "10" sorts before "2"), which is not the numeric range
range(as.numeric(data_select$CE_attitudes_1), na.rm = TRUE)

## [1] 2 7

data_rev = data_numeric %>%
  mutate(
    value = ifelse(
      # only item 1 of the attitudes scale is reverse-scored
      # (`item` is a character column, hence the quoted "1")
      test = item == "1" & scale_name == "CE_attitudes",
      # flip the response: (scale max + 1) - value, so 1 <-> 7, 2 <-> 6, ...
      yes = (7 + 1) - value,
      no = value
    )
  )

data_rev

recode yes/no variables

CE_checklist is a series of yes/no questions that are currently coded as 1/2. We want to recode these to be 1/0 to make it easier to sum later on.

We’ll also recode behavior_voting from 1/2 to yes/no to make it clearer what these responses mean. Let’s try doing this using recode() instead of ifelse().

data_yn = data_rev %>%
  mutate(
    # checklist items: turn "no" (2) into 0 so that summing counts the "yes" (1)
    # answers; all other values, including the attitude ratings, are unchanged
    value = ifelse(value == 2 & scale_name == "CE_checklist", 0, value),
    # label the voting outcome: old value on the left, new label on the right
    behavior_voting = recode(behavior_voting, `1` = "yes", `2` = "no")
  )

data_yn

summarize

Now that we’ve wrangled and tidied our data, we can summarize it using summarize().

calculate mean civic engagement attitude across people

Because we just want to look at attitudes, we’ll first subset these items using filter().

Then, using group_by() we’ll state that we want to calculate a mean across all items in the filtered scale_name column.

To calculate the mean of value, we’ll use summarize() and mean(). Because we want to ignore any missing data (specified as NA in the dataframe), we’ll also use the na.rm = TRUE argument in the mean() function.

# keep only the attitude items, then average `value` over every remaining row
# (i.e. across all people and all CE_attitudes items)
data_yn %>% 
  filter(scale_name == "CE_attitudes") %>%
  group_by(scale_name) %>%
  summarize(mean = mean(value, na.rm = TRUE))

# this is what would happen if we didn't first filter out `CE_checklist`:
# group_by() returns one mean per scale, so the mean of the 0/1 checklist
# codes is reported alongside the attitude mean
data_yn %>% 
  # filter(scale_name == "CE_attitudes") %>%   (deliberately disabled)
  group_by(scale_name) %>%
  summarize(mean = mean(value, na.rm = TRUE))

calculate mean and SD civic engagement attitude by voting behavior

This time, let’s use group_by() to calculate means and standard deviations of the civic engagement attitudes separately for people who voted and didn’t vote, specified in behavior_voting.

# mean and SD of the attitude ratings, computed separately for each
# behavior_voting group
data_yn %>%
  filter(scale_name == "CE_attitudes") %>%
  group_by(scale_name, behavior_voting) %>%
  summarize(mean = mean(value, na.rm = TRUE),
            sd = sd(value, na.rm = TRUE),
            # return an ungrouped table; with two grouping variables summarize()
            # would otherwise keep the result grouped by scale_name and print a
            # message about it
            .groups = "drop")

calculate means and sums per person

This time, let’s calculate these stats for each person separately rather than across all data points using ResponseId as a grouping factor.

# one row per person per scale: the mean and the sum of that person's responses
data_yn %>%
  group_by(ResponseId, scale_name) %>%
  summarize(mean = mean(value, na.rm = TRUE),
            sum = sum(value, na.rm = TRUE),
            # return an ungrouped table; with two grouping variables summarize()
            # would otherwise keep the result grouped by ResponseId and print a
            # message about it
            .groups = "drop")

assignment

Now it’s your turn to apply what you’ve learned.

select

Select the following subset of columns from data_recoded:

ResponseId
vote_registered
variables that contain behavior
variables that start with attitude but not those that start with CE_attitudes

Assign this to assignment_data

# TODO(student): list the columns described above inside select()
assignment_data = data_recoded %>%
  select()

convert to long format

Assign this to assignment_data1

# TODO(student): tell pivot_longer() which columns to stack using `cols =`
assignment_data1 = assignment_data %>%
  pivot_longer()

recode

Recode the values in value as follows: * 1 = “yes” * 2 = “no”

Assign this to assignment_data2

# TODO(student): recode `value` inside mutate()
assignment_data2 = assignment_data1 %>%
  mutate()

convert from long to wide format

Use pivot_wider() to go from the long to the wide format. Here, we don’t need to specify any arguments. Just use %>% pivot_wider().

Assign this to assignment_data3

# no arguments needed here, provided the long data kept pivot_longer()'s
# default `name` and `value` column names -- those are the columns
# pivot_wider() uses by default
assignment_data3 = assignment_data2 %>%
  pivot_wider()

filter

From assignment_data3, filter only people who reported being registered to vote (vote_registered).

Assign this to assignment_data4.

convert to numeric

Convert attitude_1 to numeric.

Assign this to assignment_data5.

combine steps

Instead of doing each step separately and assigning them to new variables each time, use %>% to link the steps together.

Assign this to assignment_combined

summarize

From assignment_combined, calculate the mean of attitude_1 for yes and no responses to behavior_mailin separately using group_by() and summarize().

When using the mean() function, include na.rm = TRUE to ignore missing values.

This time, do the same thing but with mutate() instead of summarize(). What’s different?

Week 4: Data wrangling & tidying

Dani Cosme

2021-04-02

resources

load packages

load the data file

check the data file

rename columns

filter out responses

recode responses

select a subset of columns

convert from wide to long format

extract item number from scale name

convert response values to numeric

reverse score variables

recode yes/no variables

summarize

calculate mean civic engagement attitude across people

calculate mean and SD civic engagement attitude by voting behavior

calculate means and sums per person

assignment

select

convert to long format

recode

convert from long to wide format

filter

convert to numeric

combine steps

summarize