-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDescriptive_Statistics.rmd
321 lines (266 loc) · 11.5 KB
/
Descriptive_Statistics.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
---
title: "TIL Invasion Descriptive Statistics"
date: "`r Sys.Date()`"
output:
html_document:
toc: yes
toc_depth: 4
pdf_document:
toc: yes
toc_depth: 4
---
```{r setup, include=FALSE}
knitr::opts_knit$set(root.dir = "/data")
knitr::opts_chunk$set(collapse = T, warning = F, fig.width=10, fig.height=5, echo = F)
library(readr)
library(plyr)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(survival)
library(survminer)
```
\newpage
```{r load and parse, include = F}
df = as.data.frame(readr::read_csv(file = params$csvPath))
droppable = c(grep("*patch*", colnames(df)), ## *_ regex used to be flexible with subtype specific patch metrics
grep("patchRatio", colnames(df)),
grep("*percent_pos", colnames(df)),
grep("overlap", colnames(df)))
## For testing
#params = list(csvPath = "~/Desktop/tilalign/wsinfer_fullSamples/outputs/withSubtypes/output.csv",
# nVars = 2,
# survCensor = "censorA.0yes.1no",
# survTime = "survivalA")
## ==== Check for patients with NA invasion, could be 1) sampInfo given for WSI not predicted or 2) no cancer patches predicted. Notify and remove
missingTIL = factor(rep("OK",nrow(df)),
levels = c("OK", "not aligned", "no cancer"))
if(anyNA(df$scaled_PP)){
missingTIL[df$n_Canc_patch == 0] = "no cancer"
missingTIL[which(is.na(df$n_Canc_patch))] = "not aligned"
}
## If none of the above columns exist, don't run the drop
if(length(droppable) > 0){
df = df[,-droppable]
}
## Uncomment below if just want invasion and survival
# df = df[,-c(1,6,7)]
## Above is for testing
toFacet = colnames(df)[which(!colnames(df) %in% c("slideID","scaled_PP","TIL_Class",params$survCensor, params$survTime))]
if(length(toFacet) == 0){ # if there are no additional columns provided,
warning("No additional sample information provided beyond survival and TIL. Only univariable analyses will be performed")
runFacet = FALSE
} else {
runFacet = TRUE
}
## If both surv columns exist, run survival section
survExists = params$survTime %in% colnames(df) &
params$survCensor %in% colnames(df)
facetHeight = max(
c(length(toFacet)*2,
5
)
)
```
# Descriptive Statistics
## Overall Distribution
```{r overall dist, fig.height = 6}
palette = c("#00BFC4","#F8766D")
plots = list()
df$TIL_Class = factor(df$TIL_Class, levels = c("Low","High"))
plots[["hist"]] = ggpubr::gghistogram(data = df, x = "scaled_PP", fill = "TIL_Class", binwidth = .1, palette = palette)
df = df[which(missingTIL == "OK"),]
tmp = df %>% plyr::count("TIL_Class")
plots[["bar"]] = ggpubr::ggbarplot(tmp, x = "TIL_Class", y = "freq",
fill = "TIL_Class", order = c("Low","High"), palette = palette) +
labs(y = "Num of WSI per class")
ggpubr::ggarrange(plotlist = plots, ncol = 2, common.legend = T)
```
In the analyzed dataset, you have provided **`r nrow(df)` samples** to be analyzed and **`r length(toFacet)` columns** to subdivide the samples. The pipeline calls Low TIL vs High TIL around the **mean** of the dataset **(`r mean(df$scaled_PP, na.rm = T)`)**. This results in:
* `r sum(df$TIL_Class == "Low", na.rm=T)` Low invasion samples (`r signif((sum(df$TIL_Class == "Low", na.rm=T)/sum(!is.na(df$TIL_Class)))*100,digits = 4)`%) and
* `r sum(df$TIL_Class == "High", na.rm=T)` High invasion samples (`r signif((sum(df$TIL_Class == "High", na.rm=T)/sum(!is.na(df$TIL_Class)))*100,digits = 4)`%).
Values for invasion were not computed for **`r sum(is.na(df$scaled_PP))` samples**. Of the missing samples, **`r sum(missingTIL == "no cancer")`** were aligned but had no predicted cancer presence, while **`r sum(missingTIL == "not aligned")`** had sample information provided but no TIL invasion data. Both will be dropped prior to downstream analyses. Below we will compare invasion characteristics across your provided additional features (all additional columns beyond survival, scaled_PP, and TIL_Class) and, if possible, perform some clinical correlations with your outcome information.
\newpage
```{r class facetHeader, results='asis', eval=runFacet}
cat("## Faceted by variables of interest\n\n")
cat("> Note: This section is only run if extra information (e.g., Stage, Grade) are provided.\n\n")
cat("### Class of Invasion\n\n")
```
```{r class with facet, eval=runFacet, fig.height=facetHeight}
writeLines("Here we test if invasion class is independent of your variable of interest using a chi-squared test")
## Remove those with missing TIL info
df = df[which(missingTIL == "OK"),]
plots = list()
for(i in toFacet){
tmp = df[,c(i, "TIL_Class")]
chi = table(tmp[,i],tmp$TIL_Class)
if(any(chi==0)){
chi = "Not run, at least one strata had 0 entries"
tmp = tmp %>% plyr::count()
plots[[i]] = ggpubr::ggbarplot(tmp, x = i, y = "freq",
fill = "TIL_Class", palette = palette) +
labs(y = "Num of WSI",
caption = chi)
} else{
chi = chisq.test(chi)
#
tmp = tmp %>% plyr::count()
plots[[i]] = ggpubr::ggbarplot(tmp, x = i, y = "freq",
fill = "TIL_Class", palette = palette) +
labs(y = "Num of WSI",
caption = paste0("Chisq pval: ", signif(chi$p.value,3), "\n",
"Chi-stat: ", signif(chi$statistic,6)))
}
}
x = ggarrange(plotlist = plots, ncol = length(toFacet), common.legend = T)
x
```
\newpage
```{r cont facetHeader, results='asis', eval=runFacet}
cat("### Continuous Invasion\n\n")
cat("For continuous invasion, we use a non-parametric Wilcoxon Rank-Sum if your provided feature has 2 levels. If there are 3+ levels, ANOVA is used.\n\n")
```
```{r continuous with facet, eval=runFacet, fig.height=facetHeight}
plots = list()
for(i in toFacet){
if(length(levels(as.factor(df[,i]))) > 2){
plots[[i]] = ggpubr::ggviolin(df, x = i,
y = "scaled_PP",
fill = i, trim = T) +
stat_compare_means(method = "anova",
label.x.npc = "center")
} else {
plots[[i]] = ggpubr::ggviolin(df, x = i,
y = "scaled_PP",
fill = i, trim = T) +
stat_compare_means(method = "wilcox",
label.x.npc = "center")
}
}
ggarrange(plotlist = plots, ncol = length(toFacet))
```
\newpage
```{r surv Header, results='asis', eval=survExists}
cat("# Survival\n\n")
cat("> Note: This section is only run if survival information (i.e., 'Time to Event' and 'Is sample censored') are provided.\n\n")
cat("## Kaplan-Meier\n\n")
cat("For the Kaplan-Meier section, TIL Class is used to categorize samples and the log-rank test is used to compute p-values.\n\n")
cat("### Univariate\n\n")
```
```{r km univar, fig.height = 6, eval = survExists}
tmp.df = df
colnames(tmp.df)[which(colnames(tmp.df)==params$survTime)] = "survTime"
colnames(tmp.df)[which(colnames(tmp.df)==params$survCensor)] = "survCensor"
tmp.df$SurvObj <- with(tmp.df, Surv(time = survTime,
event = survCensor,type = "right"))
fit <- survfit(SurvObj ~ TIL_Class,
data = tmp.df,
type = "kaplan-meier")
km = ggsurvplot(fit,
data = tmp.df,
palette = palette,
risk.table = T,
pval = TRUE,
censor = TRUE,
surv.mean.line = "hv",
legend.title = "TIL Infiltration") + ggtitle("Survival by TIL Class")
km
```
\newpage
```{r surv KM facetHeader, results='asis', eval=survExists & runFacet}
cat("### By Vars of Interest\n\n")
```
```{r km facet, fig.height = 6, fig.width = 12, eval = survExists & runFacet}
plots = list()
for(i in toFacet){
km = ggsurvplot_facet(fit,
data = tmp.df,
risk.table = T,
facet.by = i,
palette = palette,
pval = TRUE,
censor = TRUE,
surv.mean.line = "hv",
legend.title = "TIL Infiltration",
xlab = "Time") + ggtitle(paste0("Survival by TIL Class + ", i))
plots[[i]] = km
}
ggarrange(plotlist = plots, common.legend = T)
```
\newpage
```{r cox Header, results='asis', eval=survExists}
cat("## Cox Regression\n\n")
cat("For the Cox regression section, both TIL Class (categorical) and scaled_PP (continuous) are used to categorize samples. While KM plots are limited to bivariate, this section will run all analyses in both bivariable and in a larger model using all included variables.\n\n")
cat("### Univariate Categorical\n\n")
```
```{r cox cat univar, fig.height = 3, fig.width = 8, eval = survExists}
tmp.df$TIL_Class = factor(tmp.df$TIL_Class, levels = c("Low","High"))
fit.coxph <- coxph(Surv(time = tmp.df$survTime,
event = tmp.df$survCensor,
type = "right") ~ TIL_Class, data = tmp.df)
confInts = summary(fit.coxph)$conf.int
if(anyNA(confInts) | any(is.infinite(confInts))){
message("\n . . . Cox confidence interval too wide to plot, returning summary of fit instead . . . \n")
print(summary(fit.coxph))
} else{
x = ggforest(fit.coxph, data = tmp.df, main = "TIL Class")
x
}
```
```{r cox cont Header, results='asis', eval=survExists}
cat("### Univariate Continuous\n\n")
```
```{r cox cont univar, fig.height = 2, fig.width = 8, eval = survExists}
fit.coxph <- coxph(Surv(time = tmp.df$survTime,
event = tmp.df$survCensor,
type = "right") ~ scaled_PP, data = tmp.df)
confInts = summary(fit.coxph)$conf.int
if(anyNA(confInts) | any(is.infinite(confInts))){
message(" . . . Cox confidence interval too wide to plot, returning summary of fit instead . . . ")
print(summary(fit.coxph))
} else{
x = ggforest(fit.coxph, data = tmp.df, main = "Continuous Invasion (scaled by SD)")
x
}
```
\newpage
```{r surv cox facetHeader, results='asis', eval=survExists & runFacet}
cat("### By Vars of Interest (one at a time)\n\n")
```
```{r cox bivar, fig.width = 8, eval = survExists & runFacet}
plots = list()
for(i in toFacet){
newTmp = tmp.df
#colnames(newTmp)[which(colnames(newTmp) == i)] = "Var"
fit.coxph <- SurvObj ~ .
toUse = paste0("scaled_PP+",i)
fit.coxph = reformulate(toUse, fit.coxph[[2]])
fit.coxph = coxph(fit.coxph, data = newTmp)
confInts = summary(fit.coxph)$conf.int
if(anyNA(confInts) | any(is.infinite(confInts))){ ## If plot would fail, skip it.
message(paste0("\n . . . Cox confidence interval too wide to plot for 'Invasion + ",i, "' returning summary of fit instead . . . \n"))
print(summary(fit.coxph))
} else{
plots[[i]] = ggforest(fit.coxph, data = newTmp, main = paste0("Bivariate with ", i))
}
}
if(length(plots) > 0){
ggarrange(plotlist = plots, nrow = length(toFacet))
}
```
\newpage
```{r multi cox facetHeader, results='asis', eval=survExists & (length(toFacet)>1)}
cat("### Multivariable Cox (all features)\n\n")
```
```{r cox multivar, fig.width = 8, eval = survExists & (length(toFacet)>1)}
newTmp = tmp.df[,c("scaled_PP", "SurvObj", toFacet)]
fit.coxph <- coxph(SurvObj ~ ., data = newTmp)
confInts = summary(fit.coxph)$conf.int
if(anyNA(confInts) | any(is.infinite(confInts))){ ## If plot would fail, skip it.
message("\n . . . Cox confidence interval too wide to plot for multivar, returning summary of fit instead . . . \n")
print(summary(fit.coxph))
} else{
x = ggforest(fit.coxph, data = tmp.df, main = "All Var")
x
}
```