-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline.R
70 lines (49 loc) · 1.94 KB
/
pipeline.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#source functions and set working directory ----
setwd("~/R/aRchiteutis/")
source("source_functions.R")
#preparing file ----
df <- get_counts(
path = "PATH/TO/KRAKEN/REPORTS",
pattern = "SUBSET_CHARACTERS",
legend = "PATH/TO/LEGEND",
trim_char = "CHARACTER_TO_SPLIT_FILENAMES")
#calculate number of reads, species and genus
summarise(df[df$clade == "R",], sum(N)) %>% unlist #number of classified reads
df$taxa[df$clade == "S"] %>% droplevels %>% levels %>% length #number of species
df$taxa[df$clade == "G"] %>% droplevels %>% levels %>% length #number of genus
#if you don't totally trust Kraken, drop all species levels
df <- df[df$clade != "S",]
#trim several taxa or chose clade to use (examples)
dfT <- df %>% df_taxa_trim(top_taxa = 12) #get any top taxa
dfO <- df %>% df_tidy_drop_unclassified %>% df_taxa_trim(top_taxa = 0)
dfO <- dfO[dfO$taxa != "other Bacteria",] %>% df_rescale #exclude "other" levels
dfC <- df %>% df_tidy_drop_unclassified %>% df_get_top_taxa("G", 50)
dfC <- df[df$taxa %in% dfC,] #get top genus taxa
#donuts
df2donut(dfT)
df2donut(dfT[dfT$taxa != "unclassified",], show.legend = F, color = "white") +
theme(text = element_text(size = 30),
legend.position = "bottom")
#profiles
df2composition(dfT)
df2composition(dfO)
df2composition(dfC)
#barplots
df2barplot(dfO) + coord_trans(x = "log") + theme(text = element_text(size = 20))
#base R heatmap
df2heatmap(df, "G", trim = 10,
scale = "column", Colv = NA)
#base cluster
df2cluster(dfC, k_means = 10)
#2D cluster by legend variables
df2clust2d(dfC, k_means = 10, legend_detect = "pupa")
#correlations
df2corrplot(dfO, log2_scale = T)
#chord. for big amount of data strongly recommend save plots without observing
df2chord(dfC, k_means = 10)
#tSNE for taxa
taxa <- df %>% df_tidy_drop_unclassified %>% df_get_top_taxa("S", 20)
df2tsne(df, clade_trim = "S", counts = F, log2df = F, taxa = taxa)
#PCA
df2pca_sample(df, "S", scale = T)
df2pca_sp(dfO, scale = T)