Add extractAt() method for BSgenome objects

A cleaner/faster replacement for getSeq(). The plan is to deprecate getSeq() at some point.
Bioconductor · Nov 10, 2019 · 9726318 · 9726318
1 parent 3936c0c
commit 9726318
Show file tree

Hide file tree

Showing 5 changed files with 185 additions and 20 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: BSgenome
 Title: Software infrastructure for efficient representation of full genomes and their SNPs
 Description: Infrastructure shared by all the Biostrings-based genome data
 	packages.
-Version: 1.55.0
+Version: 1.55.1
 Encoding: UTF-8
 Author: Hervé Pagès
 Maintainer: H. Pagès <[email protected]>
@@ -42,6 +42,7 @@ Collate: utils.R
 	available.genomes.R
 	injectSNPs.R
 	getSeq-methods.R
+	extractAt-methods.R
 	bsapply.R
 	BSgenomeViews-class.R
 	BSgenome-utils.R

diff --git a/NAMESPACE b/NAMESPACE
@@ -78,7 +78,7 @@ exportMethods(
     granges,
 
     ## Methods for generics defined in the Biostrings package:
-    getSeq,
+    getSeq, extractAt,
     seqtype, alphabetFrequency, hasOnlyBaseLetters,
     uniqueLetters, letterFrequency,
     oligonucleotideFrequency, nucleotideFrequencyAt,

diff --git a/R/OnDiskNamedSequences-class.R b/R/OnDiskNamedSequences-class.R
@@ -281,23 +281,10 @@ setMethod("loadSubseqsFromLinearSequence", "TwobitNamedSequences",
     if (!is(ranges, "IntegerRanges"))
         stop("'ranges' must be an IntegerRanges object")
     seqlength <- .get_seqlength(x, seqname)
-    ranges_start <- start(ranges)
-    ranges_end <- end(ranges)
-    if (max(ranges_end - ranges_start) >= seqlength)
-        stop("loading regions that are longer than the whole circular ",
-             "sequence \"", seqname, "\" is not supported")
-    start0 <- ranges_start - 1L  # 0-based start
-    shift <- start0 %% seqlength - start0
-    L_start <- ranges_start + shift
-    end1 <- ranges_end + shift
-    L_end <- pmin(end1, seqlength)
-    R_start <- 1L
-    R_end <- pmax(end1, seqlength) - seqlength
-    L_ans <- loadSubseqsFromLinearSequence(x, seqname,
-                                           IRanges(L_start, L_end))
-    R_ans <- loadSubseqsFromLinearSequence(x, seqname,
-                                           IRanges(R_start, R_end))
-    xscat(L_ans, R_ans)
+    LRranges <- splitLRranges(ranges, seqlength, seqname)
+    Lans <- loadSubseqsFromLinearSequence(x, seqname, LRranges$L)
+    Rans <- loadSubseqsFromLinearSequence(x, seqname, LRranges$R)
+    xscat(Lans, Rans)
 }
 
 loadSubseqsFromStrandedSequence <- function(x, seqname, ranges, strand,

diff --git a/R/extractAt-methods.R b/R/extractAt-methods.R
@@ -0,0 +1,104 @@
+### =========================================================================
+### extractAt() methods
+### -------------------------------------------------------------------------
+
+
+.slow_extract_genome_sequences_at <- function(x, at)
+{
+    stopifnot(is(x, "BSgenome"), is(at, "GRanges"),
+              identical(seqinfo(x), seqinfo(at)))
+
+    seqlevels(at) <- seqlevelsInUse(at)
+    grl <- split(at, seqnames(at))
+    grl_seqlevels <- names(grl)
+
+    ## Loop over the sequence names and extract the ranges.
+    dnaset_list <- lapply(seq_along(grl),
+        function(i) {
+            gr <- grl[[i]]
+            if (length(gr) == 0L)
+                return(DNAStringSet())
+            seqlevel <- grl_seqlevels[i]
+            subject <- x[[seqlevel]]
+            masks(subject) <- NULL
+            is_circular <- isCircular(x)[[seqlevel]]
+            ## 'seqlevel' will only be used for display in case of error.
+            loadSubseqsFromStrandedSequence(subject, seqlevel,
+                                            ranges(gr), strand(gr),
+                                            is_circular)
+        }
+    )
+
+    ## "unsplit" 'dnaset_list'.
+    unsplit_list_of_XVectorList("DNAStringSet", dnaset_list,
+                                as.factor(seqnames(at)))
+}
+
+### Use fast import() method for TwoBitFile objects.
+.fast_extract_genome_sequences_at <- function(x, at)
+{
+    stopifnot(is(x, "BSgenome"), is(at, "GRanges"),
+              identical(seqinfo(x), seqinfo(at)))
+
+    ## import() doesn't seem safe. For example it accepts ranges with
+    ## a start < 1L and returns a sequence with stuff in it that looks
+    ## wrong! (but what would look right anyway?)
+    ## So we check that the ranges in 'at' are valid, just to be safe.
+    out_of_bound_idx <- GenomicRanges:::get_out_of_bound_index(at)
+    if (length(out_of_bound_idx) != 0L)
+        stop(wmsg("some ranges are out of bounds"))
+
+    seqlevels_map <- setNames(names(x@user_seqnames), x@user_seqnames)
+    seqlevels(at) <- seqlevels_map[seqlevels(at)]
+    seqlevels(at) <- seqlevelsInUse(at)
+
+    LRat <- splitLRgranges(at)
+    ## import() ignores the strand of the ranges in 'which'.
+    Lans <- import(x@single_sequences@twobitfile, which=LRat$L)
+    Rans <- import(x@single_sequences@twobitfile, which=LRat$R)
+    ans <- xscat(Lans, Rans)
+
+    idx <- which(strand(at) == "-")
+    ans[idx] <- reverseComplement(ans[idx])
+    ans
+}
+
+### Must handle user defined seqlevels, injected SNPs, circularity, and strand.
+.extract_genome_sequences_at <- function(x, at)
+{
+    stopifnot(is(x, "BSgenome"), is(at, "GRanges"))
+
+    invalid_seqlevels <- setdiff(seqlevels(at), seqlevels(x))
+    if (length(invalid_seqlevels) != 0L)
+        stop(wmsg("'at' contains invalid chromosome name(s): ",
+                  paste(invalid_seqlevels, sep=", ")))
+    si <- merge(seqinfo(x), seqinfo(at))
+    seqlevels(at) <- seqlevels(si)
+    seqinfo(x) <- seqinfo(at) <- si
+
+    if (is.null(SNPlocs_pkgname(x)) &&
+        is(x@single_sequences, "TwobitNamedSequences"))
+    {
+        FUN <- .fast_extract_genome_sequences_at
+    } else {
+        FUN <- .slow_extract_genome_sequences_at
+    }
+    ans <- FUN(x, at)
+    names(ans) <- names(at)
+    ans
+}
+
+.extractAt_BSgenome <- function(x, at)
+{
+    if (is(at, "GRanges"))
+        return(.extract_genome_sequences_at(x, at))
+    if (is(at, "GRangesList")) {
+        unlisted_at <- unlist(at, use.names=FALSE)
+        unlisted_ans <- .extract_genome_sequences_at(x, unlisted_at)
+        return(relist(unlisted_ans, at))
+    }
+    stop(wmsg("'at' must be a GRanges or GRangesList object"))
+}
+
+setMethod("extractAt", "BSgenome", .extractAt_BSgenome)
+
diff --git a/R/utils.R b/R/utils.R
@@ -1,6 +1,9 @@
 ### =========================================================================
-### Some low-level internal (i.e. non-exported) utilities
+### Miscellaneous low-level utils
 ### -------------------------------------------------------------------------
+###
+### Unless stated otherwise, nothing in this file is exported.
+###
 
 
 get_data_annotation_contrib_url <- function(type=getOption("pkgType"))
@@ -33,3 +36,73 @@ encode_letters_as_bytes <- function(x) charToRaw(paste(x, collapse=""))
 
 decode_bytes_as_letters <- function(x) rawToChar(x, multiple=TRUE)
 
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### Used in the context of sequence extraction from circular chromosomes
+###
+
+### Return 2 IRanges objects parallel to 'x'.
+splitLRranges <- function(x, seqlength, seqname)
+{
+    stopifnot(is(x, "IntegerRanges"),
+              isSingleInteger(seqlength),
+              seqlength >= 1L)
+
+    x_start <- start(x)
+    x_end <- end(x)
+    start0 <- x_start - 1L  # 0-based start
+    shift <- start0 %% seqlength - start0
+    end1 <- x_end + shift
+    L_start <- x_start + shift
+    L_end <- pmin(end1, seqlength)
+    R_start <- rep.int(1L, length(x))
+    R_end <- pmax(end1 - seqlength, 0L)
+    idx <- which(R_end > seqlength)
+    if (length(idx) != 0L)
+        stop(wmsg("A region to extract from circular ",
+                  "sequence \"", seqname, "\" crosses the end/start ",
+                  "of the circle more than once. ",
+                  "This is not supported at the moment, sorry!"))
+    Lans <- IRanges(L_start, L_end)
+    Rans <- IRanges(R_start, R_end)
+    list(L=Lans, R=Rans)
+}
+
+### Return 2 GRanges objects parallel to 'x'.
+splitLRgranges <- function(x)
+{
+    stopifnot(is(x, "GenomicRanges"))
+
+    x_seqnames_id <- as.integer(seqnames(x))
+    seqlengths <- unname(seqlengths(x))[x_seqnames_id]
+    if (any(width(x) != 0L & seqlengths == 0L))
+        stop(wmsg("cannot extract stuff from empty sequences"))
+    x_is_circ <- unname(isCircular(x)) %in% TRUE
+    split_idx <- which(x_is_circ[x_seqnames_id] & width(x) != 0L)
+
+    Lans <- Rans <- granges(x)
+    width(Rans) <- 0L
+    if (length(split_idx) == 0L)
+        return(list(L=Lans, R=Rans))
+
+    x_start <- start(x)[split_idx]
+    x_end <- end(x)[split_idx]
+    seqlengths <- seqlengths[split_idx]
+    start0 <- x_start - 1L  # 0-based start
+    shift <- start0 %% seqlengths - start0
+    end1 <- x_end + shift
+    L_start <- x_start + shift
+    L_end <- pmin(end1, seqlengths)
+    R_start <- rep.int(1L, length(split_idx))
+    R_end <- pmax(end1 - seqlengths, 0L)
+    idx <- which(R_end > seqlengths)
+    if (length(idx) != 0L)
+        stop(wmsg("A region to extract from a circular ",
+                  "sequence crosses the end/start ",
+                  "of the circle more than once. ",
+                  "This is not supported at the moment, sorry!"))
+    ranges(Lans)[split_idx] <- IRanges(L_start, L_end)
+    ranges(Rans)[split_idx] <- IRanges(R_start, R_end)
+    list(L=Lans, R=Rans)
+}
+