Skip to content

Commit

Permalink
GLOBI overhaul part 2
Browse files Browse the repository at this point in the history
Pipeline slightly reordered, but still some stuff to do (e.g., add the sp. and third-word-drops to PREDICT pipeline + a save-out-file). There's tons of errors in this version - e.g., "Pathogen" has made it through the whole workflow.
  • Loading branch information
Colin J. Carlson committed Apr 4, 2021
1 parent 152c7b3 commit debf141
Show file tree
Hide file tree
Showing 12 changed files with 257,602 additions and 77,553 deletions.
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,11 @@ GenBank_as_Edgelist.csv
GenBank-Taxized.csv
SRA_as_Edgelist.edges
.Rproj.user
Intermediate/clover.csv
Intermediate/clover.csv
*.zip
Intermediate/Virion-Temp.csv
Intermediate/Virion-Temp.csv
Intermediate/GLOBI-parsed.csv
Intermediate/GBTaxized.zip
Code_Dev/TaxonomyTempIn.csv
Intermediate/Virion-Temp.csv
1 change: 0 additions & 1 deletion Code/02_2_Merge_in_PREDICT.R
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,6 @@ predict %>% rename(Host_Original = "Host") %>% left_join(test) -> predict

test %>% filter(is.na(HostGenus))


##### Bind into VIRION


Expand Down
22 changes: 20 additions & 2 deletions Code_Dev/X_Digest GLOBI.R → Code/02_3_Digest GLOBI.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
library(tidyverse)
library(taxize)

if(!file.exists("Intermediate/GLOBI-parsed.csv")) {

globi <- read_csv('Source/GLOBI-raw.csv')

globi %>%
Expand Down Expand Up @@ -84,7 +86,7 @@ for (i in 1:nrow(ncbi.tax.virus)) {

ncbi.tax.virus$Virus[i] <- str_squish(str_replace(ncbi.tax.virus$Virus[i], " sp\\.", ""))

ncbi.num <- taxize::get_uid(ncbi.tax.virus$Virus[i], rank_query = )
ncbi.num <- taxize::get_uid(ncbi.tax.virus$Virus[i])
ncbi.high <- taxize::classification(ncbi.num, db = "ncbi")

match = 0
Expand Down Expand Up @@ -133,9 +135,12 @@ ncbi.tax.host <- data.frame(Host_Intermediate = ncbi.names,

for (i in 1:nrow(ncbi.tax.host)) {

match = 0
ncbi.tax.host$Host[i] <- str_squish(str_replace(ncbi.tax.host$Host[i], " sp\\.", ""))
ncbi.tax.host$Host[i] <- str_squish(str_replace(ncbi.tax.host$Host[i], " gen\\.", ""))
ncbi.tax.host$Host[i] = word(ncbi.tax.host$Host[i], start = 1, end = 2)
if(str_count(ncbi.tax.host$Host[i], " ")>1) {
ncbi.tax.host$Host[i] = word(ncbi.tax.host$Host[i], start = 1, end = 2)
}

ncbi.num <- taxize::get_uid(ncbi.tax.host$Host[i], rank_query = c("species"))
ncbi.high <- taxize::classification(ncbi.num, db = "ncbi")
Expand Down Expand Up @@ -186,8 +191,21 @@ for (i in 1:nrow(ncbi.tax.host)) {

# NEED TO ADD A STEP TO BIND IN THE TAXONOMY

globi %>%
left_join(ncbi.tax.host) %>%
left_join(ncbi.tax.virus) %>%
select(-Host_Intermediate) %>%
select(-Virus_Intermediate) %>%
unique() -> globi

write_csv(globi, "Intermediate/GLOBI-parsed.csv")

}

##### Bind into VIRION

globi <- read_csv("Intermediate/GLOBI-parsed.csv")

virion <- read_csv('Intermediate/Virion-Temp.csv')

virion <- bind_rows(virion, globi) %>%
Expand Down
30 changes: 22 additions & 8 deletions Code/03_Clean_SRA_Merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,22 @@ if(file.exists("Intermediate/clover.csv")){
# Mark the ones that are in CLOVER
clo %<>% mutate(Clover = 1)

sra.m %>% mutate(Virus = tolower(Virus),
Host = tolower(Host)) -> sra.m2

# Join CLOVER and SRA
clo %<>% right_join(sra.m)
clo %<>% right_join(sra.m2, by = c("Pathogen_Harmonised" = "Virus", "HostHarmonised_Taxize" = "Host"))

# Mark the ones that AREN'T in CLOVER
clo$Clover[is.na(clo$Clover)] <- 0

# Check out how score behaves

ggplot(clo, aes(x = factor(Clover), y = log(score))) +
geom_violin()

# Time to threshold this in such a way that we're sure of what we're working with ####

clo %>% mutate(rownum = c(1:nrow(clo)),
scaled = log(score)/max(log(score))) -> clo

clo %>%
select(Host, Virus, Clover, rownum, scaled) %>%
select(HostHarmonised_Taxize, Pathogen_Harmonised, Clover, rownum, scaled) %>%
unique() %>%
select(rownum, Clover, scaled) ->
clo.sub
Expand All @@ -75,6 +73,22 @@ th <- optimal.thresholds(clo.sub,
req.spec = 0.99,
na.rm = TRUE)

# Check out how score behaves

ggplot(clo, aes(x = factor(Clover), y = log(score)/max(log(clo$score)), fill = factor(Clover))) +
geom_violin() +
theme_bw() +
xlab("SRA association recorded in CLOVER?") + ylab("Maximum recorded virus hits in sample (log-transformed, rescaled)") +
geom_line(y = th$scaled[th$Method == 'MaxKappa'], col = 'red') +
geom_hline(aes(yintercept = th$scaled[th$Method == 'MaxKappa']), col = 'red', lty = 2) +
theme(axis.title.x = element_text(vjust = -1.5),
axis.title.y = element_text(vjust = 5)) +
scale_fill_manual(values = c("#00AFBB", "#00AFBB")) +
theme(plot.margin = unit(c(1,1,1,1),"cm"),
legend.position = 'n')

# Cut it off

clo %<>% filter(Clover == 1 | scaled > th$scaled[th$Method == 'MaxKappa'])

clo %>% count(Clover)
Expand All @@ -87,7 +101,7 @@ cut.score <- th$scaled[th$Method == 'MaxKappa']

virion <- read_csv("Intermediate/Virion-Temp.csv")

maxscore <- max(log(sra.m$score))
maxscore <- max(log(clo$score))

sra.v %>%
# sra.m %>%
Expand Down
13 changes: 0 additions & 13 deletions Code/04_Dissassemble_Virion.R

This file was deleted.

2 changes: 2 additions & 0 deletions Code/0X_Flag_Contaminants.R → Code/04_Flag_Contaminants.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,5 @@ problems <- c("Vesicular stomatitis virus",
virion %<>% mutate(VirusFlag = ifelse((Virus %in% problems) & (Database == 'SRA'), 'Yes', 'No'))

write_csv(virion, "Virion/Virion-Master.csv")

file.remove("Intermediate/Virion-Temp.csv")
File renamed without changes.
Loading

0 comments on commit debf141

Please sign in to comment.