# Benchmarks for SparseIntSet: filling, membership queries, pop/push cycles and iteration.
SUITE["SparseIntSet"] = BenchmarkGroup()

# Evaluated once when this file is loaded: seeds the RNG and defines the id vectors.
rand_setup = (
    Random.seed!(1234);
    ids1 = rand(1:30000, 1000);
    ids2 = rand(1:30000, 1000);
)

# Build a fresh set and insert every id of `ids1` into it, one push at a time.
function create_fill_packed(ids1)
    filled = SparseIntSet()
    for id in ids1
        push!(filled, id)
    end
    return nothing
end

SUITE["SparseIntSet"]["create_fill"] =
    @benchmarkable create_fill_packed(ids1) setup=rand_setup

SUITE["SparseIntSet"]["in while not in"] =
    @benchmarkable in(23, y) evals=1000 setup=(y = SparseIntSet();)
SUITE["SparseIntSet"]["in while in"] =
    @benchmarkable in(5199, y) evals=1000 setup=(y=SparseIntSet(); push!(y, 5199))

# Remove 5199 from `y` and insert it again; returns the set, as `push!` does.
function pop_push(y)
    pop!(y, 5199)
    return push!(y, 5199)
end

SUITE["SparseIntSet"]["pop push worst case"] = @benchmarkable pop_push(y) setup=(y=SparseIntSet(); push!(y, 5199))
SUITE["SparseIntSet"]["pop push"] = @benchmarkable pop_push(y) setup=(y=SparseIntSet(); push!(y, 5199); push!(y, 5200))

# Sum everything a single iterable yields.
function iterate_one_bench(x)
    total = 0
    for v in x
        total += v
    end
    return total
end

# Sum the pairs yielded by zipping `x` and `y`.
function iterate_two_bench(x,y)
    total = 0
    for (vx, vy) in zip(x, y)
        total += vx + vy
    end
    return total
end

# Same as `iterate_two_bench`, but entries also present in `z` are skipped by the zip.
function iterate_two_exclude_one_bench(x,y,z)
    total = 0
    for (vx, vy) in zip(x, y, exclude=(z,))
        total += vx + vy
    end
    return total
end
# DataStructures.SparseIntSet

Implementation of a __Sparse Integer Set__, for background see [Sparse Sets](https://www.computist.xyz/2018/06/sparse-sets.html).
Only positive non-zero `Int`s are allowed inside the set.
The idea is to have one **packed** `Vector` storing all the `Int`s contained in the set, so as to allow for fast iteration, and a sparse, paged **reverse** `Vector` holding the position of each `Int` inside the **packed** `Vector`. This allows for very fast iteration, insertion and deletion of indices.
Most behavior is similar to a normal `IntSet`; however, `collect`, `first` and `last` operate with respect to the **packed** vector, in which the ordering is not guaranteed.
The **reverse** `Vector` is paged, meaning that it is a `Vector{Vector{Int}}` where each of the `Vector{Int}`s has the length of one memory page of `Int`s. Every time an index is pushed that lies outside the range of the already present pages, a new page is created and added to **reverse**, allowing for dynamic growth.
Popping the last `Int` of a particular page will automatically clean up the memory of that page.
# Number of `Int`s that fit in one memory page; every allocated page of `reverse` has this length.
const INT_PER_PAGE = div(ccall(:jl_getpagesize, Clong, ()), sizeof(Int))
# we use this to mark pages not in use, it must never be written to.
const NULL_INT_PAGE = Vector{Int}()

"""
    SparseIntSet()
    SparseIntSet(indices)

Set of strictly positive `Int`s.

# Fields
- `packed`: the stored integers, in insertion order (reshuffled by `pop!(s, id)`).
- `reverse`: paged lookup table; `reverse[pageid][offset]` is the index of an integer
  inside `packed`, or `0` when the integer is absent. Unused pages are `NULL_INT_PAGE`.
- `counters`: number of stored integers per page of `reverse`.
"""
mutable struct SparseIntSet
    packed ::Vector{Int}
    reverse::Vector{Vector{Int}}
    counters::Vector{Int}  # counts the number of real elements in each page of reverse.
end

SparseIntSet() = SparseIntSet(Int[], Vector{Int}[], Int[])

SparseIntSet(indices) = union!(SparseIntSet(), indices)

eltype(::Type{SparseIntSet}) = Int

empty(::SparseIntSet) = SparseIntSet()

# Remove every element and every page; returns `s`.
function empty!(s::SparseIntSet)
    empty!(s.packed)
    empty!(s.reverse)
    empty!(s.counters)
    return s
end

isempty(s::SparseIntSet) = isempty(s.packed)

copy(s::SparseIntSet) = copy!(SparseIntSet(), s)

# Overwrite `to` with an independent copy of `from`; returns `to`.
function copy!(to::SparseIntSet, from::SparseIntSet)
    to.packed = copy(from.packed)
    # we want to keep the null pages === NULL_INT_PAGE, so they are shared, never copied
    resize!(to.reverse, length(from.reverse))
    for i in eachindex(from.reverse)
        page = from.reverse[i]
        to.reverse[i] = page === NULL_INT_PAGE ? NULL_INT_PAGE : copy(page)
    end
    to.counters = copy(from.counters)
    return to
end

# Map a positive integer `i` to (page index, 1-based offset inside that page).
# The mask form of the offset equals `mod(i - 1, INT_PER_PAGE) + 1` only when INT_PER_PAGE
# is a power of two. The result is meaningless for `i <= 0`; callers must reject those.
function pageid_offset(s::SparseIntSet, i)
    pageid = div(i - 1, INT_PER_PAGE) + 1
    return pageid, (i - 1) & (INT_PER_PAGE - 1) + 1
end

# Membership test. Non-positive values can never be stored, and `pageid_offset` would map
# them onto the slot of some positive integer (or onto a page index < 1), so they are
# rejected before any table lookup.
function in(i, s::SparseIntSet)
    i <= 0 && return false
    pageid, offset = pageid_offset(s, i)
    pageid > length(s.reverse) && return false
    page = @inbounds s.reverse[pageid]
    return page !== NULL_INT_PAGE && @inbounds page[offset] != 0
end

length(s::SparseIntSet) = length(s.packed)

# Insert `i`; a no-op when it is already present. Throws `DomainError` for `i <= 0`.
function push!(s::SparseIntSet, i::Integer)
    i <= 0 && throw(DomainError(i, "Only positive Ints allowed."))

    pageid, offset = pageid_offset(s, i)
    pages = s.reverse
    plen = length(pages)

    if pageid > plen
        # Create new null pages up to pageid and fresh (zero-filled) one at pageid
        sizehint!(pages, pageid)
        sizehint!(s.counters, pageid)
        for _ in 1:(pageid - plen - 1)
            push!(pages, NULL_INT_PAGE)
            push!(s.counters, 0)
        end
        push!(pages, zeros(Int, INT_PER_PAGE))
        push!(s.counters, 0)
    elseif pages[pageid] === NULL_INT_PAGE
        # assign a page to previous null page
        pages[pageid] = zeros(Int, INT_PER_PAGE)
    end
    page = pages[pageid]
    if page[offset] == 0
        @inbounds page[offset] = length(s) + 1
        @inbounds s.counters[pageid] += 1
        push!(s.packed, i)
    end
    return s
end

function push!(s::SparseIntSet, is::Integer...)
    for i in is
        push!(s, i)
    end
    return s
end

# Remove and return the last element of `packed`. Throws `ArgumentError` on an empty set.
Base.@propagate_inbounds function pop!(s::SparseIntSet)
    if isempty(s)
        throw(ArgumentError("Cannot pop an empty set."))
    end
    id = pop!(s.packed)
    pageid, offset = pageid_offset(s, id)
    @inbounds s.reverse[pageid][offset] = 0
    @inbounds s.counters[pageid] -= 1
    cleanup!(s, pageid)
    return id
end

# Remove `id` by moving the last packed element into its slot. Throws `ArgumentError` for
# negative `id` and `BoundsError` when `id` is absent; the membership check is a
# `@boundscheck`, so callers using `@inbounds` must guarantee that `id` is present.
Base.@propagate_inbounds function pop!(s::SparseIntSet, id::Integer)
    id < 0 && throw(ArgumentError("Int to pop needs to be positive."))

    @boundscheck if !in(id, s)
        throw(BoundsError(s, id))
    end
    @inbounds begin
        packed_endid = s.packed[end]
        from_page, from_offset = pageid_offset(s, id)
        to_page, to_offset = pageid_offset(s, packed_endid)

        packed_id = s.reverse[from_page][from_offset]
        # the last element takes over the packed slot of `id` ...
        s.packed[packed_id] = packed_endid
        s.reverse[to_page][to_offset] = packed_id
        # ... and only then is `id` cleared, which stays correct when id == packed_endid
        s.reverse[from_page][from_offset] = 0
        s.counters[from_page] -= 1
        pop!(s.packed)
    end
    cleanup!(s, from_page)
    return id
end

# Release the page at `pageid` once it no longer holds any element.
function cleanup!(s::SparseIntSet, pageid::Int)
    if s.counters[pageid] == 0
        s.reverse[pageid] = NULL_INT_PAGE
    end
end

# Like `pop!(s, id)`, but returns `default` instead of throwing when `id` is absent.
function pop!(s::SparseIntSet, id::Integer, default)
    id < 0 && throw(ArgumentError("Int to pop needs to be positive."))
    return in(id, s) ? (@inbounds pop!(s, id)) : default
end
popfirst!(s::SparseIntSet) = pop!(s, first(s))

# Iteration follows the order of `packed`.
iterate(set::SparseIntSet, args...) = iterate(set.packed, args...)

last(s::SparseIntSet) = isempty(s) ? throw(ArgumentError("Empty set has no last element.")) : last(s.packed)

union(s::SparseIntSet, ns) = union!(copy(s), ns)
function union!(s::SparseIntSet, ns)
    for n in ns
        push!(s, n)
    end
    return s
end

intersect(s1::SparseIntSet) = copy(s1)
intersect(s1::SparseIntSet, ss...) = intersect(s1, intersect(ss...))
# The result keeps the iteration order of `ns`.
function intersect(s1::SparseIntSet, ns)
    s = SparseIntSet()
    for n in ns
        n in s1 && push!(s, n)
    end
    return s
end

intersect!(s1::SparseIntSet, ss...) = intersect!(s1, intersect(ss...))

#Is there a more performant way to do this?
intersect!(s1::SparseIntSet, ns) = copy!(s1, intersect(s1, ns))

setdiff(s::SparseIntSet, ns) = setdiff!(copy(s), ns)
function setdiff!(s::SparseIntSet, ns)
    for n in ns
        pop!(s, n, nothing)
    end
    return s
end

# Two sets are equal when they hold the same integers, regardless of packed order.
function ==(s1::SparseIntSet, s2::SparseIntSet)
    length(s1) != length(s2) && return false
    return all(in(s1), s2)
end

# Order-independent hash so that sets comparing `==` also hash equally.
function Base.hash(s::SparseIntSet, h::UInt)
    acc = zero(UInt)
    for i in s.packed
        acc = xor(acc, hash(i))
    end
    return hash(acc, hash(:SparseIntSet, h))
end

# Subset test by direct membership, without building an intermediate set.
issubset(a::SparseIntSet, b::SparseIntSet) = length(a) <= length(b) && all(in(b), a)

<(a::SparseIntSet, b::SparseIntSet) = length(a) < length(b) && issubset(a, b)
<=(a::SparseIntSet, b::SparseIntSet) = issubset(a, b)

# Index of `i` inside `s.packed`, or 0 when `i` is absent. `i` must be positive.
function findfirst_packed_id(i, s::SparseIntSet)
    pageid, offset = pageid_offset(s, i)
    if pageid > length(s.counters) || s.counters[pageid] == 0
        return 0
    end
    @inbounds id = s.reverse[pageid][offset]
    return id
end

collect(s::SparseIntSet) = copy(s.packed)

"""
    ZippedSparseIntSetIterator(valid_sets::SparseIntSet...; exclude=())

Iterator over the integers present in every one of `valid_sets` and in none of the
`exclude` sets. Each iteration yields a tuple with, per valid set, the index of the
integer inside that set's `packed` vector. The shortest valid set drives the iteration.
"""
struct ZippedSparseIntSetIterator{VT,IT}
    valid_sets::VT
    shortest_set::SparseIntSet
    excluded_sets::IT
    function ZippedSparseIntSetIterator(valid_sets::SparseIntSet...; exclude::NTuple{N, SparseIntSet}=()) where{N}
        shortest = valid_sets[findmin(map(length, valid_sets))[2]]
        new{typeof(valid_sets), NTuple{N, SparseIntSet}}(valid_sets, shortest, exclude)
    end
end

# At least one SparseIntSet is required, so the zero-argument `zip()` of Base keeps its
# original method.
Base.zip(s1::SparseIntSet, ss::SparseIntSet...; kwargs...) =
    ZippedSparseIntSetIterator(s1, ss...; kwargs...)

# Upper bound only: entries missing from another valid set or present in an excluded set
# are skipped during iteration.
@inline length(it::ZippedSparseIntSetIterator) = length(it.shortest_set)

# The number of yielded items is not known up front, so generic code such as `collect`
# must not preallocate `length(it)` slots.
Base.IteratorSize(::Type{<:ZippedSparseIntSetIterator}) = Base.SizeUnknown()

# we know it is not in_excluded, as there are no excluded
in_excluded(id, it::ZippedSparseIntSetIterator{VT,Tuple{}}) where {VT} = false

function in_excluded(id, it::ZippedSparseIntSetIterator)
    for e in it.excluded_sets
        if id in e
            return true
        end
    end
    return false
end

# The integer at position `state` of the shortest set, and its packed index in every valid set.
@inline function id_tids(it, state)
    id = it.shortest_set.packed[state]
    return id, map(x -> findfirst_packed_id(id, x), it.valid_sets)
end

Base.@propagate_inbounds function iterate(it::ZippedSparseIntSetIterator, state=1)
    iterator_length = length(it)
    while state <= iterator_length
        id, tids = id_tids(it, state)
        # a zero index means `id` is missing from one of the valid sets
        if !any(iszero, tids) && !in_excluded(id, it)
            return tids, state + 1
        end
        state += 1
    end
    return nothing
end
collect(s) + @test all(map(d->in(d,data_out), data_in)) + @test length(data_out) == length(data_in) + end + + @testset "eltype, empty" begin + @test eltype(SparseIntSet()) == Int + @test eltype(typeof(SparseIntSet())) == Int + @test isequal(empty(SparseIntSet([1,2,3])), SparseIntSet()) + end + + @testset "Core Functionality" begin + s = SparseIntSet([1,2,10,20,200,300,1000,10000,10002]) + @test last(s) == 10002 + @test first(s) == 1 + @test length(s) == 9 + @test pop!(s) == 10002 + @test length(s) == 8 + @test popfirst!(s) == 1 + @test length(s) == 7 + @test !in(1,s) + @test !in(10002,s) + @test in(10000,s) + @test_throws ArgumentError first(SparseIntSet()) + @test_throws ArgumentError last(SparseIntSet()) + t = copy(s) + s = SparseIntSet() + push!(s, 1, 2, 100) + @test 1 in s + @test !(3 in s) + @test 2 in s + @test 100 in s + @test !(101 in s) + @test !(1000 in s) + @test first(s) == 1 + @test last(s) == 100 + @test s == SparseIntSet([1, 2, 100]) + push!(s, 1000) + @test [i for i in s] == [1, 2, 100, 1000] + @test pop!(s) == 1000 + @test s == SparseIntSet([1, 2, 100]) + push!(s, 5000) + push!(s, 2000) + pop!(s, 5000) + @test s.reverse[end] === DataStructures.NULL_INT_PAGE + b = 1:1000 + s = SparseIntSet(b) + @test collect(s) == collect(b) + @test length(s) == length(b) + @test pop!(s, 100) == 100 + @test_throws BoundsError pop!(s, 100) + @test pop!(s, 100, 1) == 1 + @test pop!(s, 99, 1) == 99 + @test !in(500000, s) + @test !in(99, s) + end + + @testset "setdiff / symdiff" begin + @test setdiff(SparseIntSet([1, 2, 3, 4]), SparseIntSet([2, 4, 5, 6])) == SparseIntSet([1, 3]) + end + + @testset "setdiff!" 
begin + s2 = SparseIntSet([1, 2, 3, 4]) + setdiff!(s2, SparseIntSet([2, 4, 5, 6])) + + @test s2 == SparseIntSet([1, 3]) + end + + @testset "issue #7851" begin + @test_throws DomainError SparseIntSet(-1) + @test !(-1 in SparseIntSet(1:10)) + end + @testset "Copy, copy!, empty" begin + s1 = SparseIntSet([1,2,3]) + s2 = empty(s1) + push!(s2, 10000) + @test !in(10000, s1) + copy!(s2, s1) + @test !in(10000, s2) + push!(s2, 10000) + @test !in(10000, s1) + s3 = copy(s2) + push!(s3, 1000) + @test !in(1000, s2) + pop!(s3, 1000) + pop!(s2, 10000) + @test in(10000, s3) + pop!(s3, 10000) + @test s3 == s2 == s1 + @test collect(s3) == collect(s2) == [1,2,3] + + + end + + @testset "Push, union" begin + # Push, union + s1 = SparseIntSet() + @test_throws DomainError push!(s1, -1) + push!(s1, 1, 10, 100, 1000) + @test collect(s1) == [1, 10, 100, 1000] + push!(s1, 606) + @test collect(s1) == [1, 10, 100, 1000, 606] + s2 = SparseIntSet() + @test s2 === union!(s2, s1) + s3 = SparseIntSet([1, 10, 100]) + union!(s3, [1, 606, 1000]) + s4 = union(SparseIntSet([1, 100, 1000]), SparseIntSet([10, 100, 606])) + @test s1 == s2 == s3 == s4 + end + + @testset "pop!, delete!" begin + s = SparseIntSet(1:2:10) + @test pop!(s, 1) == 1 + @test !(1 in s) + @test_throws BoundsError pop!(s, 1) + @test_throws ArgumentError pop!(s, -1) + @test_throws ArgumentError pop!(s, -1, 1) + @test pop!(s, 1, 0) == 0 + is = copy(s.packed) + for i in is; pop!(s, i); end + @test isempty(s) + push!(s, 1:2:10...) 
+ @test pop!(s) == 9 + @test pop!(s) == 7 + @test popfirst!(s) == 1 + @test popfirst!(s) == 5 + @test collect(s) == [3] + empty!(s) + @test isempty(s) + end + + @testset "Intersect" begin + @test isempty(intersect(SparseIntSet())) + @test isempty(intersect(SparseIntSet(1:10), SparseIntSet())) + @test isempty(intersect(SparseIntSet(), SparseIntSet(1:10))) + + @test intersect(SparseIntSet([1,2,3])) == SparseIntSet([1,2,3]) + + @test intersect(SparseIntSet(1:7), SparseIntSet(3:10)) == + intersect(SparseIntSet(3:10), SparseIntSet(1:7)) == SparseIntSet(3:7) + + @test intersect!(SparseIntSet(1:10), SparseIntSet(1:4), 1:5, [1,2,10]) == SparseIntSet(1:2) + end + + @testset "Setdiff" begin + s1 = SparseIntSet(1:100) + setdiff!(s1, SparseIntSet(1:2:100)) + s2 = setdiff(SparseIntSet(1:100), SparseIntSet(1:2:100)) + @test s1 == s2 == SparseIntSet(2:2:100) + + s1 = SparseIntSet(1:10) + s2 = SparseIntSet([1:2; 6:100]) + @test setdiff(s1, s2) == setdiff(s1, [1:2; 6:100]) == SparseIntSet(3:5) + end + + @testset "Subsets, equality" begin + @test SparseIntSet(2:2:10) < SparseIntSet(1:10) + @test !(SparseIntSet(2:2:10) < SparseIntSet(2:2:10)) + @test SparseIntSet(2:2:10) <= SparseIntSet(2:10) + @test SparseIntSet(2:2:10) <= SparseIntSet(2:2:10) + end + @testset "zip" begin + a = SparseIntSet([1,2,3,5, 6, 9, 12, 24]) + b = SparseIntSet([6, 12, 24, 1000, 2000, 3000]) + c = SparseIntSet(2:2:100) + d = SparseIntSet(6:3:100) + e = SparseIntSet((6, 12)) + s1 = 0 + it = zip(a, b, c, d, e) + for (ia, ib, ic, id, ie) in it + s1 += a.packed[ia] + b.packed[ib] + c.packed[ic] + d.packed[id] + e.packed[ie] + end + @test s1 == 5*(6+12) + s1 = 0 + it = zip(a, b, c, d, exclude=(e,)) + for (ia, ib, ic, id) in it + s1 += a.packed[ia] + b.packed[ib] + c.packed[ic] + d.packed[id] + end + @test s1 == 4*24 + end + +end