Skip to content

Commit 0400b4c

Browse files
Update plots.Rmd
1 parent 86513d1 commit 0400b4c

1 file changed

Lines changed: 77 additions & 90 deletions

File tree

vignettes/plots.Rmd

Lines changed: 77 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,14 @@ options(rmarkdown.html_vignette.check_title = FALSE)
2323
# Overview
2424

2525

26-
This vignette recreates the style of a figure in [Garrison and Rodgers (2016)](https://www.sciencedirect.com/science/article/pii/S0160289616300162) using `ggplot2` and example data from NLSY79 on SES and flu vaccinations.
26+
This vignette recreates the style of a figure in [Garrison and Rodgers (2016)](https://www.sciencedirect.com/science/article/pii/S0160289616300162) using `ggplot2` and synthetic data structured with `discord_data()`. The figure illustrates the patterns of relationships between sibling differences in socioeconomic status (SES) and sibling differences in flu vaccinations.
2727

2828

29-
# Data Preparation
29+
# Data Generation and Preparation
3030

3131
## Data Cleaning
3232

33-
This section reuses the data preparation pipeline developed in the regression vignette.
3433

35-
That vignette demonstrated how to set up data for discordant regression analysis by using discord data processing tools. Those tools facilitate the construction of kinship links, including identifying sibling pairs, merging sibling characteristics, and calculating pair-level variables.
36-
37-
Here, we reuse that same pipeline to prepare the data for plotting.
38-
Specifically, we apply the same kinship pairing, data merging, and cleaning procedures, but our focus is now on visualizing patterns rather than fitting regression models.
3934

4035
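The underlying datasets are the synthetic `mz_signif` and `mz_nonsignif` examples, whose two variables stand in for flu vaccination and socioeconomic status (SES) for kinship pairs.
4136
Unlike the regression vignette, which restricted the NLSY79 sample to individuals who are housemates and have a relatedness of 0.5, these data are already organized as pairs (columns suffixed `_1` and `_2`), so no kinship linking or filtering is needed.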
@@ -57,69 +52,61 @@ library(gridExtra)
5752
library(ggExtra)
5853
library(janitor)
5954
60-
# Load the data
61-
data(data_flu_ses)
62-
63-
# Get kinship links for individuals with the following variables:
64-
list_vars <- c(
65-
"FLU_total", "FLU_2008", "FLU_2010",
66-
"FLU_2012", "FLU_2014", "FLU_2016",
67-
"S00_H40", "RACE", "SEX"
68-
)
69-
70-
df_link_pairs <- Links79PairExpanded %>%
71-
filter(RelationshipPath == "Gen1Housemates", RFull == 0.5)
72-
55+
data(mz_signif)
56+
data(mz_nonsignif)
57+
set.seed(18)
58+
59+
discord_mz_sig <- discord_data(mz_signif,
60+
outcome = "y1",
61+
predictors = "y2",
62+
id = "id",
63+
sex = NULL,
64+
race = NULL,
65+
demographics = "none",
66+
pair_identifiers = c("_1", "_2"),
67+
fast = TRUE
68+
) %>%
69+
rename_with(~ str_replace_all(., c("y1" = "flu", "y2" = "SES")), everything())
70+
71+
discord_mz_nsig <- discord_data(mz_nonsignif,
72+
outcome = "y1",
73+
predictors = "y2",
74+
id = "id",
75+
sex = NULL,
76+
race = NULL,
77+
demographics = "none",
78+
pair_identifiers = c("_1", "_2"),
79+
fast = TRUE
80+
) %>%
81+
rename_with(~ str_replace_all(., c("y1" = "flu", "y2" = "SES")), everything())
82+
83+
df_mz_signif <- mz_signif %>%
84+
rename_with(~ str_replace_all(., c("y1" = "flu", "y2" = "SES")), everything())
85+
86+
df_mz_nonsignif <- mz_nonsignif %>%
87+
rename_with(~ str_replace_all(., c("y1" = "flu", "y2" = "SES")), everything())
7388
74-
df_link <- CreatePairLinksSingleEntered(
75-
outcomeDataset = data_flu_ses,
76-
linksPairDataset = df_link_pairs,
77-
outcomeNames = list_vars
78-
)
79-
80-
df_consistent_kin <- df_link %>%
81-
group_by(SubjectTag_S1, SubjectTag_S2) %>%
82-
count(
83-
FLU_2008_S1, FLU_2010_S1,
84-
FLU_2012_S1, FLU_2014_S1,
85-
FLU_2016_S1, FLU_2008_S2,
86-
FLU_2010_S2, FLU_2012_S2,
87-
FLU_2014_S2, FLU_2016_S2
88-
) %>%
89-
na.omit()
90-
91-
# Create the df_flu_modeling object with only consistent responders.
92-
# Clean the column names with the {janitor} package.
93-
df_flu_modeling <- semi_join(df_link,
94-
df_consistent_kin,
95-
by = c(
96-
"SubjectTag_S1",
97-
"SubjectTag_S2"
98-
)
99-
) %>%
100-
clean_names()
10189
```
10290
</details>
10391

10492
## Creating the Discord Data
10593

10694
With the data prepared, we restructure it using `discord_data()`.
10795

108-
```{r, warning=FALSE}
96+
```{r, warning=FALSE, eval=FALSE}
10997
library(tidyverse)
11098
library(ggplot2)
11199
112-
df_discord_flu <- discord_data(
113-
data = df_flu_modeling,
114-
outcome = "flu_total",
115-
predictors = "s00_h40",
116-
id = "extended_id",
117-
sex = "sex",
118-
race = "race",
119-
pair_identifiers = c("_s1", "_s2"),
120-
demographics = "both"
121-
) %>%
122-
filter(!is.na(s00_h40_mean), !is.na(flu_total_mean))
100+
df_discord_flu <- discord_data(df_mz_signif,
101+
outcome = "flu",
102+
predictors = "SES",
103+
id = "id",
104+
sex = NULL,
105+
race = NULL,
106+
demographics = "none",
107+
pair_identifiers = c("_1", "_2"),
108+
fast = TRUE
109+
)
123110
```
124111

125112
Because we are interested in differences between kin, we create a new variable, `ses_diff_group`, that classifies SES differences into three categories: "More Advantaged", "Equally Advantaged", and "Less Advantaged". This variable is later used to group observations in the marginal density plots, which help visualize how the distributions of mean SES and mean flu vaccinations differ across these SES difference categories.
@@ -129,9 +116,9 @@ df_discord_flu <- df_discord_flu %>%
129116
mutate(
130117
ses_mean_group = factor(
131118
case_when(
132-
as.numeric(scale(s00_h40_mean)) > 0.5 ~ "More Advantaged",
133-
as.numeric(scale(s00_h40_mean)) < -0.5 ~ "Less Advantaged",
134-
abs(as.numeric(scale(s00_h40_mean))) <= 0.5 ~ "Equally Advantaged"
119+
as.numeric(scale(SES_mean)) > 0.5 ~ "More Advantaged",
120+
as.numeric(scale(SES_mean)) < -0.5 ~ "Less Advantaged",
121+
abs(as.numeric(scale(SES_mean))) <= 0.5 ~ "Equally Advantaged"
135122
),
136123
levels = c(
137124
"Less Advantaged",
@@ -142,9 +129,9 @@ df_discord_flu <- df_discord_flu %>%
142129
# # Classify Difference Grouping
143130
ses_diff_group = factor(
144131
case_when(
145-
as.numeric(scale(s00_h40_diff)) > 0.5 ~ "More Advantaged",
146-
as.numeric(scale(s00_h40_diff)) < -0.5 ~ "Less Advantaged",
147-
abs(as.numeric(scale(s00_h40_diff))) <= 0.5 ~ "Equally Advantaged"
132+
as.numeric(scale(SES_diff)) > 0.5 ~ "More Advantaged",
133+
as.numeric(scale(SES_diff)) < -0.5 ~ "Less Advantaged",
134+
abs(as.numeric(scale(SES_diff))) <= 0.5 ~ "Equally Advantaged"
148135
),
149136
levels = c(
150137
"Less Advantaged",
@@ -171,7 +158,7 @@ color_na <- "#AD78B6" # purple for missing values
171158
color_shading_3 <- c(color_shading_4[2], color_na, color_shading_4[3])
172159
173160
# Determine the range of SES differences for color scaling
174-
max_val_scale <- max(abs(df_discord_flu$s00_h40_diff), na.rm = TRUE)
161+
max_val_scale <- max(abs(df_discord_flu$SES_diff), na.rm = TRUE)
175162
176163
# values <- seq(-max_val_scale, max_val_scale, length = length(color_shading_4))
177164
```
@@ -190,11 +177,11 @@ The first step is to create the base plot with sibling 1 data. In the next code
190177
```{r individual, echo=TRUE, message=FALSE}
191178
# Individual level plot
192179
plot_indiv <- plot_indiv_sib1 <- ggplot(
193-
df_flu_modeling,
180+
df_mz_signif,
194181
aes(
195-
x = s00_h40_s1,
196-
y = flu_total_s1,
197-
color = s00_h40_s1 - s00_h40_s2
182+
x = SES_1,
183+
y = flu_1,
184+
color = SES_1 - SES_2
198185
)
199186
) +
200187
geom_point(
@@ -232,9 +219,9 @@ plot_indiv <- plot_indiv +
232219
size = 0.8, alpha = 0.8, na.rm = TRUE,
233220
position = position_jitter(width = 0.2, height = 0.2),
234221
aes(
235-
x = s00_h40_s2,
236-
y = flu_total_s2,
237-
color = s00_h40_s2 - s00_h40_s1 # this reverses the color difference so sibling 2 points use the opposite color gradient compared to sibling 1, making it visually clear which sibling is being represented and how their SES difference is encoded
222+
x = SES_2,
223+
y = flu_2,
224+
color = SES_2 - SES_1 # this reverses the color difference so sibling 2 points use the opposite color gradient compared to sibling 1, making it visually clear which sibling is being represented and how their SES difference is encoded
238225
)
239226
) +
240227
scale_colour_gradientn(
@@ -260,11 +247,11 @@ The individual-level plot shows a positive association between SES and flu vacci
260247

261248
```{r}
262249
plot_indiv_s00 <- ggplot(
263-
df_flu_modeling,
250+
df_mz_signif,
264251
aes(
265-
x = s00_h40_s1,
266-
y = s00_h40_s2,
267-
color = s00_h40_s1 - s00_h40_s2
252+
x = SES_1,
253+
y = SES_2,
254+
color = SES_1 - SES_2
268255
)
269256
) +
270257
geom_point(
@@ -293,11 +280,11 @@ plot_indiv_s00 +
293280

294281
```{r}
295282
plot_indiv_flu <- ggplot(
296-
df_flu_modeling,
283+
df_mz_signif,
297284
aes(
298-
x = flu_total_s1,
299-
y = flu_total_s2,
300-
color = s00_h40_s1 - s00_h40_s2
285+
x = flu_1,
286+
y = flu_2,
287+
color = SES_1 - SES_2
301288
)
302289
) +
303290
geom_point(
@@ -334,8 +321,8 @@ This section creates a between-family plot that visualizes mean SES at age 40 ag
334321
```{r scatter, message=FALSE, include=FALSE, echo=TRUE}
335322
# Main scatter plot
336323
plot_btwn <- ggplot(df_discord_flu, aes(
337-
x = s00_h40_mean,
338-
y = flu_total_mean,
324+
x = SES_mean,
325+
y = flu_mean,
339326
color = ses_diff_group,
340327
)) +
341328
geom_point( # this layer creates invisible points to all the marginal plots to align correctly
@@ -347,7 +334,7 @@ plot_btwn <- ggplot(df_discord_flu, aes(
347334
size = 1.8, alpha = 0.8, na.rm = TRUE,
348335
shape = 21,
349336
aes(
350-
fill = s00_h40_diff,
337+
fill = SES_diff,
351338
colour = ses_diff_group
352339
),
353340
group = 1,
@@ -414,7 +401,7 @@ An alternative approach is to create marginal density plots separately and arran
414401
```{r plot-raw-data, message=FALSE}
415402
# Marginal X density (SES mean)
416403
plot_xdensity <- ggplot(df_discord_flu, aes(
417-
x = s00_h40_mean,
404+
x = SES_mean,
418405
group = ses_diff_group,
419406
color = ses_diff_group
420407
)) +
@@ -439,7 +426,7 @@ And for the Y density plot:
439426
```{r}
440427
# Marginal Y density (Flu mean)
441428
plot_ydensity <- ggplot(df_discord_flu, aes(
442-
x = flu_total_mean,
429+
x = flu_mean,
443430
group = ses_diff_group,
444431
color = ses_diff_group
445432
)) +
@@ -498,7 +485,7 @@ grid.arrange(
498485
widths = c(4, 1)
499486
),
500487
heights = c(1.5, 4),
501-
top = textGrob("Sibling Differences in SES and Flu Vaccinations",
488+
top = textGrob("Sibling Means in SES and Flu Vaccinations",
502489
gp = gpar(
503490
fontsize = 20,
504491
font = 3
@@ -535,8 +522,8 @@ This plot compares differences in SES at age 40 to differences in flu vaccinatio
535522
536523
# Main scatter plot
537524
plot_within <- ggplot(df_discord_flu, aes(
538-
x = s00_h40_diff,
539-
y = flu_total_diff,
525+
x = SES_diff,
526+
y = flu_diff,
540527
color = ses_diff_group
541528
)) +
542529
geom_point( # this layer creates invisible points to all the marginal plots to align correctly
@@ -548,7 +535,7 @@ plot_within <- ggplot(df_discord_flu, aes(
548535
size = 1.8, alpha = 0.9, na.rm = TRUE,
549536
shape = 21,
550537
aes(
551-
fill = s00_h40_diff,
538+
fill = SES_diff,
552539
colour = ses_diff_group
553540
),
554541
group = 1,

0 commit comments

Comments
 (0)