Skip to content

Commit

Permalink
Adding SRA/GenBank/Clover standardisation
Browse files Browse the repository at this point in the history
  • Loading branch information
gfalbery committed Jan 19, 2021
1 parent c5337b9 commit bd5484b
Show file tree
Hide file tree
Showing 6 changed files with 38,498 additions and 37,481 deletions.
37 changes: 25 additions & 12 deletions Code/01 - Clean GenBank edgelist.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

# 01_Setup ####

library(tidyverse); library(taxize); library(magrittr)
library(tidyverse); library(taxize); library(magrittr); library(fs); library(zip)

# setwd("~/Github/virion")
# setwd(here::here())
Expand All @@ -24,17 +24,31 @@ if(!file.exists("Source/sequences.csv")){
gb <- data.table::fread("Source/sequences.csv") %>%
as_tibble

# Rename Release_Date to Publication_Date and reduce it to a calendar date:
# keep only the text before the first "T" (presumably an ISO 8601 timestamp
# such as "2020-01-19T00:00:00Z" -- confirm against Source/sequences.csv),
# then parse that text as year-month-day.
gb <- gb %>%
  rename(Publication_Date = Release_Date) %>%
  mutate(
    Publication_Date = Publication_Date %>%
      str_split("T") %>%   # Split each value at the date/time separator
      map_chr(1) %>%       # Keep the date component only
      lubridate::ymd()     # Parse as a Date
  )

if(0){ # Disabled: removing GenBank entries with only one word names?

  # NWords is the number of whitespace-separated words in each host name.
  # Counting spaces instead gives one less than the word count, so the
  # `NWords > 1` filter would also discard two-word binomials such as
  # "Homo sapiens" rather than only the one-word names.
  gb %>% mutate(NWords = str_count(Host, "\\S+")) %>%
    filter(NWords > 1) -> gb

}

hosts_vec <- unique(na.omit(gb$Host))
# Add import download step? ####
# https://cran.r-project.org/web/packages/biomartr/biomartr.pdf
# Or https://ropensci.org/blog/2020/11/10/coronaviruses-and-hosts/
# Or https://rdrr.io/cran/insect/man/searchGB.html


# Taxizing ####

hosts_vec <- unique(na.omit(gb$Host))

host.dictionary <- readRDS("Intermediate/HostDictionary.RDS")

NotInDictionary <- hosts_vec %>% # Identify host names not in the dictionary
Expand Down Expand Up @@ -110,33 +124,32 @@ gb2 %>% filter(Selected_class %in% c("Mammalia", # Selecting host taxa
select(Virus = Species, # Selecting and renaming columns
Host = Accepted_name,
Selected_family, Selected_order, Selected_class,
Publication_Date = Release_Date,
Publication_Date,
Collection_Date) %>%
unique() -> gb2

# Renaming to match other databases
gb2 %>%
mutate_at("Publication_Date", ~.x %>% # Modifying date column to make sense
str_split("T") %>% # Splitting at this midpoint
map_chr(1) %>% # Taking the first component
lubridate::ymd() # Coding as YMD (shouldn't throw errors)
) -> gb2
rename_all(~.x %>% str_replace_all("Selected_", "Host") %>%
str_replace_all(c("class" = "Class", "order" = "Order", "family" = "Family"))) ->

gb2

if(1){ # Toggle: keep only the first identification of each Host-Virus association

  # Rows are sorted by Publication_Date within each Host-Virus pair, so the
  # first row of every group is its earliest record (ties resolved by the
  # sorted order). arrange() puts NA dates last, so a missing date can no
  # longer remove a whole association: comparing against min() of a group
  # that contains NA yields NA for every row, and filter() drops them all.
  gb2 %>%
    arrange(Host, Virus, Publication_Date) %>%
    group_by(Host, Virus) %>%
    # dplyr::count() %>% pull(n) %>% table
    filter(row_number() == 1) %>%
    ungroup() -> gb2 # Drop the grouping so later steps see a plain tibble

}

data.table::fwrite(gb2, 'Intermediate/GenBank-Taxized.csv')

library(fs)

# Remove any archive left over from a previous run. Guarded because
# fs::file_delete() raises an error when the path does not exist
# (e.g. the first time the script is run).
if (file.exists("Intermediate/GBTaxized.zip")) {
  file_delete("Intermediate/GBTaxized.zip")
}

zip(zipfile = './Intermediate/GBTaxized.zip',
files = './Intermediate/GenBank-Taxized.csv')
zip(zipfile = 'Intermediate/GBTaxized.zip',
files = 'Intermediate/GenBank-Taxized.csv')
2 changes: 1 addition & 1 deletion Code/02 - Create VIRION from GenBank and CLOVER.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ if(file.exists("Intermediate/clover.csv")){
clo <-
read.csv("https://raw.githubusercontent.com/viralemergence/clover/main/output/Clover_v1.0_NBCIreconciled_20201218.csv")

write.csv(clo, file = "Intermediate/clover.csv")
# Cache the downloaded CLOVER table locally without a row-name column.
# `FALSE` is spelled out because `F` is an ordinary variable that can be reassigned.
write.csv(clo, file = "Intermediate/clover.csv", row.names = FALSE)

}

Expand Down
Loading

0 comments on commit bd5484b

Please sign in to comment.