Skip to content

Commit

Permalink
Add extractAt() method for BSgenome objects
Browse files Browse the repository at this point in the history
A cleaner/faster replacement for getSeq(). The plan is to deprecate
getSeq() at some point.
  • Loading branch information
hpages committed Nov 10, 2019
1 parent 3936c0c commit 9726318
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 20 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Package: BSgenome
Title: Software infrastructure for efficient representation of full genomes and their SNPs
Description: Infrastructure shared by all the Biostrings-based genome data
packages.
Version: 1.55.0
Version: 1.55.1
Encoding: UTF-8
Author: Hervé Pagès
Maintainer: H. Pagès <[email protected]>
Expand Down Expand Up @@ -42,6 +42,7 @@ Collate: utils.R
available.genomes.R
injectSNPs.R
getSeq-methods.R
extractAt-methods.R
bsapply.R
BSgenomeViews-class.R
BSgenome-utils.R
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ exportMethods(
granges,

## Methods for generics defined in the Biostrings package:
getSeq,
getSeq, extractAt,
seqtype, alphabetFrequency, hasOnlyBaseLetters,
uniqueLetters, letterFrequency,
oligonucleotideFrequency, nucleotideFrequencyAt,
Expand Down
21 changes: 4 additions & 17 deletions R/OnDiskNamedSequences-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -281,23 +281,10 @@ setMethod("loadSubseqsFromLinearSequence", "TwobitNamedSequences",
if (!is(ranges, "IntegerRanges"))
stop("'ranges' must be an IntegerRanges object")
seqlength <- .get_seqlength(x, seqname)
ranges_start <- start(ranges)
ranges_end <- end(ranges)
if (max(ranges_end - ranges_start) >= seqlength)
stop("loading regions that are longer than the whole circular ",
"sequence \"", seqname, "\" is not supported")
start0 <- ranges_start - 1L # 0-based start
shift <- start0 %% seqlength - start0
L_start <- ranges_start + shift
end1 <- ranges_end + shift
L_end <- pmin(end1, seqlength)
R_start <- 1L
R_end <- pmax(end1, seqlength) - seqlength
L_ans <- loadSubseqsFromLinearSequence(x, seqname,
IRanges(L_start, L_end))
R_ans <- loadSubseqsFromLinearSequence(x, seqname,
IRanges(R_start, R_end))
xscat(L_ans, R_ans)
LRranges <- splitLRranges(ranges, seqlength, seqname)
Lans <- loadSubseqsFromLinearSequence(x, seqname, LRranges$L)
Rans <- loadSubseqsFromLinearSequence(x, seqname, LRranges$R)
xscat(Lans, Rans)
}

loadSubseqsFromStrandedSequence <- function(x, seqname, ranges, strand,
Expand Down
104 changes: 104 additions & 0 deletions R/extractAt-methods.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
### =========================================================================
### extractAt() methods
### -------------------------------------------------------------------------


.slow_extract_genome_sequences_at <- function(x, at)
{
stopifnot(is(x, "BSgenome"), is(at, "GRanges"),
identical(seqinfo(x), seqinfo(at)))

seqlevels(at) <- seqlevelsInUse(at)
grl <- split(at, seqnames(at))
grl_seqlevels <- names(grl)

## Loop over the sequence names and extract the ranges.
dnaset_list <- lapply(seq_along(grl),
function(i) {
gr <- grl[[i]]
if (length(gr) == 0L)
return(DNAStringSet())
seqlevel <- grl_seqlevels[i]
subject <- x[[seqlevel]]
masks(subject) <- NULL
is_circular <- isCircular(x)[[seqlevel]]
## 'seqlevel' will only be used for display in case of error.
loadSubseqsFromStrandedSequence(subject, seqlevel,
ranges(gr), strand(gr),
is_circular)
}
)

## "unsplit" 'dnaset_list'.
unsplit_list_of_XVectorList("DNAStringSet", dnaset_list,
as.factor(seqnames(at)))
}

### Use fast import() method for TwoBitFile objects.
.fast_extract_genome_sequences_at <- function(x, at)
{
stopifnot(is(x, "BSgenome"), is(at, "GRanges"),
identical(seqinfo(x), seqinfo(at)))

## import() doesn't seem safe. For example it accepts ranges with
## a start < 1L and returns a sequence with stuff in it that looks
## wrong! (but what would look right anyway?)
## So we check that the ranges in 'at' are valid, just to be safe.
out_of_bound_idx <- GenomicRanges:::get_out_of_bound_index(at)
if (length(out_of_bound_idx) != 0L)
stop(wmsg("some ranges are out of bounds"))

seqlevels_map <- setNames(names(x@user_seqnames), x@user_seqnames)
seqlevels(at) <- seqlevels_map[seqlevels(at)]
seqlevels(at) <- seqlevelsInUse(at)

LRat <- splitLRgranges(at)
## import() ignores the strand of the ranges in 'which'.
Lans <- import(x@single_sequences@twobitfile, which=LRat$L)
Rans <- import(x@single_sequences@twobitfile, which=LRat$R)
ans <- xscat(Lans, Rans)

idx <- which(strand(at) == "-")
ans[idx] <- reverseComplement(ans[idx])
ans
}

### Must handle user defined seqlevels, injected SNPs, circularity, and strand.
.extract_genome_sequences_at <- function(x, at)
{
stopifnot(is(x, "BSgenome"), is(at, "GRanges"))

invalid_seqlevels <- setdiff(seqlevels(at), seqlevels(x))
if (length(invalid_seqlevels) != 0L)
stop(wmsg("'at' contains invalid chromosome name(s): ",
paste(invalid_seqlevels, sep=", ")))
si <- merge(seqinfo(x), seqinfo(at))
seqlevels(at) <- seqlevels(si)
seqinfo(x) <- seqinfo(at) <- si

if (is.null(SNPlocs_pkgname(x)) &&
is(x@single_sequences, "TwobitNamedSequences"))
{
FUN <- .fast_extract_genome_sequences_at
} else {
FUN <- .slow_extract_genome_sequences_at
}
ans <- FUN(x, at)
names(ans) <- names(at)
ans
}

.extractAt_BSgenome <- function(x, at)
{
if (is(at, "GRanges"))
return(.extract_genome_sequences_at(x, at))
if (is(at, "GRangesList")) {
unlisted_at <- unlist(at, use.names=FALSE)
unlisted_ans <- .extract_genome_sequences_at(x, unlisted_at)
return(relist(unlisted_ans, at))
}
stop(wmsg("'at' must be a GRanges or GRangesList object"))
}

setMethod("extractAt", "BSgenome", .extractAt_BSgenome)

75 changes: 74 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
### =========================================================================
### Some low-level internal (i.e. non-exported) utilities
### Miscellaneous low-level utils
### -------------------------------------------------------------------------
###
### Unless stated otherwise, nothing in this file is exported.
###


get_data_annotation_contrib_url <- function(type=getOption("pkgType"))
Expand Down Expand Up @@ -33,3 +36,73 @@ encode_letters_as_bytes <- function(x) charToRaw(paste(x, collapse=""))

decode_bytes_as_letters <- function(x) rawToChar(x, multiple=TRUE)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Used in the context of sequence extraction from circular chromosomes
###

### Return 2 IRanges objects parallel to 'x'.
splitLRranges <- function(x, seqlength, seqname)
{
stopifnot(is(x, "IntegerRanges"),
isSingleInteger(seqlength),
seqlength >= 1L)

x_start <- start(x)
x_end <- end(x)
start0 <- x_start - 1L # 0-based start
shift <- start0 %% seqlength - start0
end1 <- x_end + shift
L_start <- x_start + shift
L_end <- pmin(end1, seqlength)
R_start <- rep.int(1L, length(x))
R_end <- pmax(end1 - seqlength, 0L)
idx <- which(R_end > seqlength)
if (length(idx) != 0L)
stop(wmsg("A region to extract from circular ",
"sequence \"", seqname, "\" crosses the end/start ",
"of the circle more than once. ",
"This is not supported at the moment, sorry!"))
Lans <- IRanges(L_start, L_end)
Rans <- IRanges(R_start, R_end)
list(L=Lans, R=Rans)
}

### Return 2 GRanges objects parallel to 'x'.
splitLRgranges <- function(x)
{
stopifnot(is(x, "GenomicRanges"))

x_seqnames_id <- as.integer(seqnames(x))
seqlengths <- unname(seqlengths(x))[x_seqnames_id]
if (any(width(x) != 0L & seqlengths == 0L))
stop(wmsg("cannot extract stuff from empty sequences"))
x_is_circ <- unname(isCircular(x)) %in% TRUE
split_idx <- which(x_is_circ[x_seqnames_id] & width(x) != 0L)

Lans <- Rans <- granges(x)
width(Rans) <- 0L
if (length(split_idx) == 0L)
return(list(L=Lans, R=Rans))

x_start <- start(x)[split_idx]
x_end <- end(x)[split_idx]
seqlengths <- seqlengths[split_idx]
start0 <- x_start - 1L # 0-based start
shift <- start0 %% seqlengths - start0
end1 <- x_end + shift
L_start <- x_start + shift
L_end <- pmin(end1, seqlengths)
R_start <- rep.int(1L, length(split_idx))
R_end <- pmax(end1 - seqlengths, 0L)
idx <- which(R_end > seqlengths)
if (length(idx) != 0L)
stop(wmsg("A region to extract from a circular ",
"sequence crosses the end/start ",
"of the circle more than once. ",
"This is not supported at the moment, sorry!"))
ranges(Lans)[split_idx] <- IRanges(L_start, L_end)
ranges(Rans)[split_idx] <- IRanges(R_start, R_end)
list(L=Lans, R=Rans)
}

0 comments on commit 9726318

Please sign in to comment.