implemented SparseIntSet (#533)

* implemented SparseIntSet * added tests for coveralls, removed unnecessary added to README * implemented comments, not immutable yet * implemented SparseIntSet * added tests for coveralls, removed unnecessary added to README * implemented comments, not immutable yet * added SparseIntSet benchmarks * code cleanup, assure! comment, removed current_id * fixed test * made SparseIntSet immutable * added less worst case bench * Added auto cleanup! on vanilla pop!, dirty_pop! is without cleanup. * Apply suggestions from code review Co-Authored-By: Lyndon White <[email protected]> * mutable + cleanup! * only do cleanup when there is actually a zero counter * changed to use NULL_INT_PAGE, simplified cleanup! and push!! * docs * code cleanup * Apply suggestions from code review Co-Authored-By: Lyndon White <[email protected]> * corrected copy, in, code cleanup, removed complement * cleaned up imports * immutable zip iterator, semver bump, removed entity_id * Update Project.toml Co-Authored-By: Lyndon White <[email protected]> * length better length in iterator
JuliaCollections · Oct 4, 2019 · 0c70c9c · 0c70c9c · oxinabox · Oct 4, 2019
1 parent e0bd1a7
commit 0c70c9c
Show file tree

Hide file tree

Showing 8 changed files with 522 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -29,6 +29,7 @@ This package implements a variety of data structures, including
 -   DataStructures.IntSet
 -   Priority Queue
 -   Fenwick Tree
+-   SparseIntSet
 
 Resources
 ---------

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
@@ -39,3 +39,73 @@ SUITE[["heap","mutable", "min", "push"]] =
     @benchmarkable push_heap(h, $xs) setup=(h=MutableBinaryMinHeap{Float64}())
 SUITE[["heap","mutable", "min", "pop"]] =
     @benchmarkable pop_heap(h) setup=(h=MutableBinaryMinHeap{Float64}($xs))
+
+SUITE["SparseIntSet"] = BenchmarkGroup()
+
+# Seeds the RNG and draws two vectors of 1000 random ids in 1:30000.
+# NOTE(review): this parenthesized block is evaluated once, when the file is
+# loaded; `ids1` and `ids2` become globals and `rand_setup` holds the block's
+# final value. Confirm that passing `setup=rand_setup` is intended rather than
+# an inline setup expression that is re-run for every sample.
+rand_setup =  (
+	Random.seed!(1234);
+	ids1 = rand(1:30000, 1000);
+	ids2 = rand(1:30000, 1000);
+)
+
+# Build a fresh SparseIntSet and insert every id from `ids1` into it.
+# The set itself is discarded; only the construction cost is of interest.
+function create_fill_packed(ids1)
+	filled = SparseIntSet()
+	for id in ids1
+		push!(filled, id)
+	end
+	return nothing
+end
+
+# Time filling a new set with the ids in `ids1`.
+SUITE["SparseIntSet"]["create_fill"] =
+	@benchmarkable create_fill_packed(ids1) setup=rand_setup
+
+# Membership query against an empty set (the id is absent).
+SUITE["SparseIntSet"]["in while not in"] =
+	@benchmarkable in(23, y) evals=1000 setup=(y = SparseIntSet();)
+# Membership query for an id that was pushed during setup (the id is present).
+SUITE["SparseIntSet"]["in while in"] =
+	@benchmarkable in(5199, y) evals=1000 setup=(y=SparseIntSet(); push!(y, 5199))
+
+# Remove the id 5199 from `y` and immediately insert it again.
+# Returns whatever `push!` returns.
+function pop_push(y)
+	pop!(y, 5199)
+	return push!(y, 5199)
+end
+
+# "worst case": 5199 is the only element, so popping it leaves the set empty before the push.
+SUITE["SparseIntSet"]["pop push worst case"] = @benchmarkable pop_push(y) setup=(y=SparseIntSet(); push!(y, 5199))
+# Here 5200 stays in the set while 5199 is popped and pushed back.
+SUITE["SparseIntSet"]["pop push"] = @benchmarkable pop_push(y) setup=(y=SparseIntSet(); push!(y, 5199); push!(y, 5200))
+
+# Walk `x` through the iteration protocol and return the sum of its elements,
+# starting the accumulator at the integer 0.
+function iterate_one_bench(x)
+	total = 0
+	step = iterate(x)
+	while step !== nothing
+		element, state = step
+		total += element
+		step = iterate(x, state)
+	end
+	return total
+end
+# Iterate `zip(x, y)` and return the sum of the first two components of every
+# item it yields.
+function iterate_two_bench(x,y)
+	total = 0
+	for item in zip(x, y)
+		total += item[1] + item[2]
+	end
+	return total
+end
+# Same as `iterate_two_bench`, but `z` is handed to `zip` through the
+# `exclude` keyword as a one-element tuple.
+function iterate_two_exclude_one_bench(x,y,z)
+	total = 0
+	for item in zip(x, y, exclude=(z,))
+		total += item[1] + item[2]
+	end
+	return total
+end
+
+# Seeds the RNG and builds three sets, each from 1000 random ids in 1:30000.
+# NOTE(review): as with `rand_setup`, this block is evaluated once at load
+# time; `x`, `y` and `z` become globals shared by every benchmark that uses
+# `setup=x_y_z_setup` — confirm this is intended.
+x_y_z_setup = (
+	Random.seed!(1234);
+	x = SparseIntSet(rand(1:30000, 1000));
+	y = SparseIntSet(rand(1:30000, 1000));
+	z = SparseIntSet(rand(1:30000, 1000));
+)
+
+# Iterate a single set.
+SUITE["SparseIntSet"]["iterate one"] =
+	@benchmarkable iterate_one_bench(x) setup=x_y_z_setup
+
+# Iterate the zip of two sets.
+SUITE["SparseIntSet"]["iterate two"] =
+	@benchmarkable iterate_two_bench(x,y) setup=x_y_z_setup
+
+# Iterate the zip of two sets while passing a third one as `exclude`.
+SUITE["SparseIntSet"]["iterate two exclude one"] =
+	@benchmarkable iterate_two_exclude_one_bench(x,y,z) setup=x_y_z_setup
+
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -41,5 +41,6 @@ Pages = [
     "mutable_linked_list.md"
     "intset.md",
     "sorted_containers.md",
+    "sparse_int_set.md"
 ]
 ```
diff --git a/docs/src/sparse_int_set.md b/docs/src/sparse_int_set.md
@@ -0,0 +1,8 @@
+# DataStructures.SparseIntSet
+
+Implementation of a __Sparse Integer Set__, for background see [Sparse Sets](https://www.computist.xyz/2018/06/sparse-sets.html).
+Only positive non-zero `Int`s are allowed inside the set. 
+The idea is to have one **packed** `Vector` storing all the `Int`s contained in the set as to allow for fast iteration, and a sparse, paged **reverse** `Vector` with the position of a particular `Int` inside the **packed** `Vector`. This allows for very fast iteration, insertion and deletion of indices.
+Most behavior is similar to a normal `IntSet`; however, `collect`, `first` and `last` operate on the **packed** vector, in which the ordering is not guaranteed. 
+The **reverse** `Vector` is paged, meaning that it is a `Vector{Vector{Int}}` where each of the `Vector{Int}`s has the length of one memory page of `Int`s. Every time an index is pushed that is not yet in the range of the already present pages, a new page is created and added to the **reverse** `Vector`, allowing for dynamic growth. 
+Popping the last `Int` of a particular page will automatically clean up the memory of that page. 
diff --git a/src/DataStructures.jl b/src/DataStructures.jl
@@ -104,4 +104,6 @@ module DataStructures
     export PriorityQueue, peek
 
     include("priorityqueue.jl")
+    include("sparse_int_set.jl")
+    export SparseIntSet
 end
diff --git a/src/sparse_int_set.jl b/src/sparse_int_set.jl
@@ -0,0 +1,249 @@
+# Number of `Int`s that fit in one memory page: the page size reported by
+# `jl_getpagesize` divided by `sizeof(Int)`.
+# NOTE(review): `pageid_offset` masks with `INT_PER_PAGE - 1`, which presumably
+# relies on this value being a power of two — confirm.
+const INT_PER_PAGE = div(ccall(:jl_getpagesize, Clong, ()), sizeof(Int))
+# we use this to mark pages not in use, it must never be written to.
+# Pages are compared against it by identity (`===`), not by content.
+const NULL_INT_PAGE = Vector{Int}()
+
+"""
+    SparseIntSet
+
+Set of positive `Int`s backed by a packed vector and a paged reverse lookup.
+
+- `packed` holds the stored ints; iteration and `length` go through it.
+- `reverse` is a vector of pages. A used slot stores the position of the
+  corresponding int inside `packed`, a slot holding `0` means "not present",
+  and a page that is `=== NULL_INT_PAGE` is not in use.
+- `counters` has one entry per page in `reverse`.
+"""
+mutable struct SparseIntSet
+    packed ::Vector{Int}
+    reverse::Vector{Vector{Int}}
+    counters::Vector{Int}  # counts the number of real elements in each page of reverse. 
+end
+
+# Empty set: no packed ints, no reverse pages, no page counters.
+function SparseIntSet()
+    return SparseIntSet(Int[], Vector{Int}[], Int[])
+end
+
+# Set containing every element of the iterable `indices`.
+function SparseIntSet(indices)
+    fresh = SparseIntSet()
+    return union!(fresh, indices)
+end
+
+# Elements are always `Int`.
+function eltype(::Type{SparseIntSet})
+    return Int
+end
+
+# A new, empty set; the argument is only used for dispatch.
+function empty(::SparseIntSet)
+    return SparseIntSet()
+end
+
+# Remove every element from `s` by emptying all three backing vectors in
+# place, then return `s`.
+function empty!(s::SparseIntSet)
+    foreach(empty!, (s.packed, s.reverse, s.counters))
+    return s
+end
+
+# The set is empty exactly when its packed vector is empty.
+function isempty(s::SparseIntSet)
+    return isempty(s.packed)
+end
+
+# Copy `s` into a freshly constructed set.
+function copy(s::SparseIntSet)
+    fresh = SparseIntSet()
+    return copy!(fresh, s)
+end
+
+# Overwrite the contents of `to` with a copy of `from` and return `to`.
+# `packed` and `counters` are replaced by copies; `to.reverse` is resized in
+# place and refilled page by page.
+function copy!(to::SparseIntSet, from::SparseIntSet)
+    to.packed = copy(from.packed)
+    npages = length(from.reverse)
+    resize!(to.reverse, npages)
+    for (i, page) in enumerate(from.reverse)
+        # Unused pages must stay identical (===) to NULL_INT_PAGE, so they are
+        # shared rather than copied.
+        to.reverse[i] = page === NULL_INT_PAGE ? NULL_INT_PAGE : copy(page)
+    end
+    to.counters = copy(from.counters)
+    return to
+end
+
+# Map the int `i` to its location in the paged reverse vector: returns the
+# 1-based page index and the 1-based slot inside that page. `s` is not read.
+function pageid_offset(s::SparseIntSet, i)
+    zero_based = i - 1
+    page_index = div(zero_based, INT_PER_PAGE) + 1
+    slot = (zero_based & (INT_PER_PAGE - 1)) + 1
+    return page_index, slot
+end
+
+# Return `true` when `i` is stored in `s`.
+#
+# Only positive ints can be stored, so anything `<= 0` is rejected up front.
+# Without that guard `pageid_offset` maps non-positive values onto page 1
+# (`div` truncates toward zero and the bit mask wraps around), which made e.g.
+# `0 in s` report `true` whenever `INT_PER_PAGE` itself was in the set.
+function in(i, s::SparseIntSet)
+    i <= 0 && return false
+    pageid, offset = pageid_offset(s, i)
+    # A page index beyond the allocated pages means the int was never pushed.
+    pageid > length(s.reverse) && return false
+    page = @inbounds s.reverse[pageid]
+    # A slot value of 0 marks "not present"; NULL_INT_PAGE marks an unused page.
+    return page !== NULL_INT_PAGE && @inbounds(page[offset] != 0)
+end
+
+# Number of stored ints, i.e. the length of the packed vector.
+function length(s::SparseIntSet)
+    return length(s.packed)
+end
+
+# Insert the positive integer `i` into `s` and return `s`.
+# Pushing an int that is already present leaves the set unchanged.
+# Throws a `DomainError` carrying the offending value when `i <= 0`.
+function push!(s::SparseIntSet, i::Integer)
+    i <= 0 && throw(DomainError(i, "Only positive Ints allowed."))
+
+    pageid, offset = pageid_offset(s, i)
+    pages = s.reverse
+    plen = length(pages)
+
+    if pageid > plen
+        # Create new null pages up to pageid and fresh (zero-filled) one at pageid
+        sizehint!(pages, pageid)
+        sizehint!(s.counters, pageid)
+        for _ in 1:(pageid - plen - 1)
+            push!(pages, NULL_INT_PAGE)
+            push!(s.counters, 0)
+        end
+        push!(pages, zeros(Int, INT_PER_PAGE))
+        push!(s.counters, 0)
+    elseif pages[pageid] === NULL_INT_PAGE
+        # Assign a real page to a previously unused slot; NULL_INT_PAGE itself
+        # must never be written to.
+        pages[pageid] = zeros(Int, INT_PER_PAGE)
+    end
+
+    page = pages[pageid]
+    if page[offset] == 0
+        # Not present yet: record the position the int will take in `packed`.
+        @inbounds page[offset] = length(s) + 1
+        @inbounds s.counters[pageid] += 1
+        push!(s.packed, i)
+    end
+    return s
+end
+
+# Insert several integers one after another, in argument order; returns `s`.
+function push!(s::SparseIntSet, is::Integer...)
+    foreach(i -> push!(s, i), is)
+    return s
+end
+
+# Remove and return the last int of the packed vector.
+# Throws an `ArgumentError` when the set is empty.
+Base.@propagate_inbounds function pop!(s::SparseIntSet)
+    isempty(s) && throw(ArgumentError("Cannot pop an empty set."))
+    removed = pop!(s.packed)
+    page_index, slot = pageid_offset(s, removed)
+    @inbounds begin
+        # Mark the slot as "not present" and update the page's element count.
+        s.reverse[page_index][slot] = 0
+        s.counters[page_index] -= 1
+    end
+    # Releases the page if that was its last element.
+    cleanup!(s, page_index)
+    return removed
+end
+
+# Remove `id` from `s` and return it.
+#
+# The last element of `packed` is moved into the position `id` occupied, so the
+# order of the remaining elements can change.
+# Throws an `ArgumentError` for non-positive `id` (zero included, since only
+# positive ints can be stored) and, unless bounds checks are elided by the
+# caller, a `BoundsError` when `id` is not in the set.
+Base.@propagate_inbounds function pop!(s::SparseIntSet, id::Integer)
+    id <= 0 && throw(ArgumentError("Int to pop needs to be positive."))
+
+    @boundscheck if !in(id, s)
+        throw(BoundsError(s, id))
+    end
+    @inbounds begin
+        packed_endid = s.packed[end]
+        from_page, from_offset = pageid_offset(s, id)
+        to_page, to_offset = pageid_offset(s, packed_endid)
+
+        # Move the last packed element into the vacated position ...
+        packed_id = s.reverse[from_page][from_offset]
+        s.packed[packed_id] = packed_endid
+        s.reverse[to_page][to_offset] = packed_id
+        # ... and only then clear the slot of `id`; when `id` is itself the last
+        # element both slots coincide and this write must win.
+        s.reverse[from_page][from_offset] = 0
+        s.counters[from_page] -= 1
+        pop!(s.packed)
+    end
+    cleanup!(s, from_page)
+    return id
+end
+
+# Release page `pageid` when its element counter has dropped to zero by
+# replacing it with the shared NULL_INT_PAGE marker; otherwise do nothing.
+function cleanup!(s::SparseIntSet, pageid::Int)
+    s.counters[pageid] != 0 && return nothing
+    return s.reverse[pageid] = NULL_INT_PAGE
+end
+
+# Remove `id` from `s` and return it, or return `default` when `id` is not in
+# the set. Throws an `ArgumentError` for non-positive `id` (zero included,
+# since only positive ints can be stored).
+function pop!(s::SparseIntSet, id::Integer, default)
+    id <= 0 && throw(ArgumentError("Int to pop needs to be positive."))
+    # Membership was just checked, so the bounds check in the 2-arg pop! can go.
+    return in(id, s) ? (@inbounds pop!(s, id)) : default
+end
+# Remove and return the first element in iteration order.
+function popfirst!(s::SparseIntSet)
+    return pop!(s, first(s))
+end
+
+# Iteration is delegated to the packed vector.
+function iterate(set::SparseIntSet, args...)
+    return iterate(set.packed, args...)
+end
+
+# Last element of the packed vector; throws an `ArgumentError` on an empty set.
+function last(s::SparseIntSet)
+    isempty(s) && throw(ArgumentError("Empty set has no last element."))
+    return last(s.packed)
+end
+
+# Non-mutating union: copy `s`, then add every element of `ns` to the copy.
+function union(s::SparseIntSet, ns)
+    return union!(copy(s), ns)
+end
+
+# Add every element of the iterable `ns` to `s` and return `s`.
+function union!(s::SparseIntSet, ns)
+    foreach(n -> push!(s, n), ns)
+    return s
+end
+
+# Intersection of a single set is a copy of it.
+function intersect(s1::SparseIntSet)
+    return copy(s1)
+end
+
+# With more than two arguments, the trailing ones are intersected first.
+function intersect(s1::SparseIntSet, ss...)
+    return intersect(s1, intersect(ss...))
+end
+
+# New set holding the elements of `ns` that are also in `s1`.
+function intersect(s1::SparseIntSet, ns)
+    common = SparseIntSet()
+    for candidate in ns
+        if candidate in s1
+            push!(common, candidate)
+        end
+    end
+    return common
+end
+
+# With more than two arguments, the trailing ones are intersected first.
+function intersect!(s1::SparseIntSet, ss...)
+    return intersect!(s1, intersect(ss...))
+end
+
+# Computes the intersection into a temporary set and copies it back into `s1`.
+#Is there a more performant way to do this?
+function intersect!(s1::SparseIntSet, ns)
+    common = intersect(s1, ns)
+    return copy!(s1, common)
+end
+
+# Non-mutating difference: copy `s`, then remove the elements of `ns` from the copy.
+function setdiff(s::SparseIntSet, ns)
+    return setdiff!(copy(s), ns)
+end
+
+# Remove every element of `ns` from `s`; elements that are not in `s` are
+# skipped. Returns `s`.
+function setdiff!(s::SparseIntSet, ns)
+    foreach(n -> pop!(s, n, nothing), ns)
+    return s
+end
+
+# Two sets are equal when they hold the same ints, regardless of the order in
+# which those ints sit in `packed`.
+function ==(s1::SparseIntSet, s2::SparseIntSet)
+    length(s1) != length(s2) && return false
+    return all(in(s1), s2)
+end
+
+# Keep `hash` consistent with `==`: equal sets must hash equally even though
+# their packed order may differ, so the element hashes are combined with the
+# order-independent xor before being mixed with the length and the seed `h`.
+function Base.hash(s::SparseIntSet, h::UInt)
+    acc = zero(UInt)
+    for i in s.packed
+        acc = xor(acc, hash(i))
+    end
+    return hash(acc, hash(length(s), h))
+end
+
+# `a` is a subset of `b` when every element of `a` is in `b`. The length
+# comparison is a cheap early exit; the membership scan avoids building a
+# temporary intersection set.
+function issubset(a::SparseIntSet, b::SparseIntSet)
+    return length(a) <= length(b) && all(in(b), a)
+end
+
+# Strict subset: a subset that is not equal to `b`.
+<(a::SparseIntSet, b::SparseIntSet) = (a <= b) && !isequal(a, b)
+# `<=` on sets means "is a subset of".
+<=(a::SparseIntSet, b::SparseIntSet) = issubset(a, b)
+
+# Position of `i` inside `s.packed`, or 0 when the page that would hold `i`
+# does not exist or has no elements.
+function findfirst_packed_id(i, s::SparseIntSet)
+    page_index, slot = pageid_offset(s, i)
+    page_in_use = page_index <= length(s.counters) && s.counters[page_index] != 0
+    page_in_use || return 0
+    # Within a used page, a slot value of 0 still means "not present".
+    return @inbounds s.reverse[page_index][slot]
+end
+
+# Returns a copy of the packed vector, in its current (unsorted) order.
+function collect(s::SparseIntSet)
+    return copy(s.packed)
+end
+
+# Iterator over the ints shared by all `valid_sets` and absent from every set
+# in `excluded_sets`. `shortest_set` is the valid set with the fewest elements
+# (the first such set on ties); iteration walks its packed vector.
+struct ZippedSparseIntSetIterator{VT,IT}
+    valid_sets::VT
+    shortest_set::SparseIntSet
+    excluded_sets::IT
+    function ZippedSparseIntSetIterator(valid_sets::SparseIntSet...; exclude::NTuple{N, SparseIntSet}=()) where {N}
+        set_lengths = map(length, valid_sets)
+        _, shortest_index = findmin(set_lengths)
+        return new{typeof(valid_sets), NTuple{N, SparseIntSet}}(
+            valid_sets, valid_sets[shortest_index], exclude
+        )
+    end
+end
+
+# `zip` over one or more SparseIntSets yields, for every int present in all of
+# them (and in none of the `exclude` sets), the tuple of its positions in each
+# set's packed vector.
+# At least one set is required in the signature: a vararg-only method would
+# also capture the zero-argument call `zip()`, which involves no SparseIntSet
+# at all, and would then fail while looking for the shortest set.
+Base.zip(s1::SparseIntSet, s::SparseIntSet...; kwargs...) =
+    ZippedSparseIntSetIterator(s1, s...; kwargs...)
+
+# Reports the length of the shortest valid set. `iterate` skips ints that are
+# missing from another valid set or present in an excluded one, so this is an
+# upper bound on the number of items actually produced.
+@inline function length(it::ZippedSparseIntSetIterator)
+    return length(it.shortest_set)
+end
+
+# With an empty tuple of excluded sets nothing can be excluded.
+in_excluded(id, it::ZippedSparseIntSetIterator{VT,Tuple{}}) where {VT} = false
+
+# `true` as soon as one of the iterator's excluded sets contains `id`.
+function in_excluded(id, it)
+    return any(excluded -> id in excluded, it.excluded_sets)
+end
+
+# For position `state` in the shortest set's packed vector, return the int
+# stored there together with its packed position in every valid set
+# (0 where `findfirst_packed_id` does not find it).
+@inline function id_tids(it, state)
+    current = it.shortest_set.packed[state]
+    positions = map(set -> findfirst_packed_id(current, set), it.valid_sets)
+    return current, positions
+end
+
+# Advance through the shortest set starting at packed position `state` and
+# return the next tuple of packed positions together with the following state,
+# or `nothing` once the shortest set is exhausted.
+Base.@propagate_inbounds function iterate(it::ZippedSparseIntSetIterator, state=1)
+    limit = length(it)
+    while state <= limit
+        id, tids = id_tids(it, state)
+        # Accept the int only if every valid set has it and no excluded set does.
+        if !any(iszero, tids) && !in_excluded(id, it)
+            return tids, state + 1
+        end
+        state += 1
+    end
+    return nothing
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -8,6 +8,7 @@ import DataStructures: IntSet
 @test [] == detect_ambiguities(Base, Core, DataStructures)
 
 tests = ["int_set",
+         "sparse_int_set",
          "deque",
          "circ_deque",
          "sorted_containers",
@@ -28,7 +29,7 @@ tests = ["int_set",
          "sorting",
          "priority_queue", 
          "fenwick", 
-         "robin_dict"
+         "robin_dict",
         ]
 
 if length(ARGS) > 0