Skip to content

Commit

Permalink
implemented SparseIntSet (#533)
Browse files Browse the repository at this point in the history
* implemented SparseIntSet

* added tests for coveralls, removed unnecessary added to README

* implemented comments, not immutable yet

* implemented SparseIntSet

* added tests for coveralls, removed unnecessary added to README

* implemented comments, not immutable yet

* added SparseIntSet benchmarks

* code cleanup, assure! comment, removed current_id

* fixed test

* made SparseIntSet immutable

* added less worst case bench

* Added auto cleanup! on vanilla pop!, dirty_pop! is without cleanup.

* Apply suggestions from code review

Co-Authored-By: Lyndon White <[email protected]>

* mutable + cleanup!

* only do cleanup when there is actually a zero counter

* changed to use NULL_INT_PAGE, simplified cleanup! and push!!

* docs

* code cleanup

* Apply suggestions from code review

Co-Authored-By: Lyndon White <[email protected]>

* corrected copy, in, code cleanup, removed complement

* cleaned up imports

* immutable zip iterator, semver bump, removed entity_id

* Update Project.toml

Co-Authored-By: Lyndon White <[email protected]>

* length better length in iterator
  • Loading branch information
louisponet authored and oxinabox committed Oct 4, 2019
1 parent e0bd1a7 commit 0c70c9c
Show file tree
Hide file tree
Showing 8 changed files with 522 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ This package implements a variety of data structures, including
- DataStructures.IntSet
- Priority Queue
- Fenwick Tree
- SparseIntSet

Resources
---------
Expand Down
70 changes: 70 additions & 0 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,73 @@ SUITE[["heap","mutable", "min", "push"]] =
@benchmarkable push_heap(h, $xs) setup=(h=MutableBinaryMinHeap{Float64}())
SUITE[["heap","mutable", "min", "pop"]] =
@benchmarkable pop_heap(h) setup=(h=MutableBinaryMinHeap{Float64}($xs))

# Benchmark group that collects every SparseIntSet benchmark registered below.
SUITE["SparseIntSet"] = BenchmarkGroup()

# Seeds the RNG and draws two vectors of 1000 random ids from 1:30000.
# NOTE(review): a parenthesized `a; b; c;` block is evaluated right here, so
# `ids1`/`ids2` look like they are assigned once when this file is loaded and
# `rand_setup` only holds the block's value — confirm that `setup=rand_setup`
# really re-runs this per benchmark sample as intended.
rand_setup = (
Random.seed!(1234);
ids1 = rand(1:30000, 1000);
ids2 = rand(1:30000, 1000);
)

# Benchmark kernel: create an empty SparseIntSet and insert every id of `ids1`
# into it, one at a time. Returns `nothing`; only the insertion cost matters.
function create_fill_packed(ids1)
    set = SparseIntSet()
    for id in ids1
        push!(set, id)
    end
    return nothing
end

# Benchmark: fill a new set with the ids named `ids1` (see `rand_setup`).
SUITE["SparseIntSet"]["create_fill"] =
@benchmarkable create_fill_packed(ids1) setup=rand_setup

# Benchmark: membership query for 23 against a set that was just created empty.
SUITE["SparseIntSet"]["in while not in"] =
@benchmarkable in(23, y) evals=1000 setup=(y = SparseIntSet();)
# Benchmark: membership query for 5199 after the setup pushed 5199 into the set.
SUITE["SparseIntSet"]["in while in"] =
@benchmarkable in(5199, y) evals=1000 setup=(y=SparseIntSet(); push!(y, 5199))

# Benchmark kernel: remove the id 5199 from `y` and immediately insert it
# again. Returns whatever the final `push!` returns.
function pop_push(y)
    pop!(y, 5199)
    return push!(y, 5199)
end

# "worst case": 5199 is the only element of the set, so every pop!/push! pair
# removes and re-adds the sole element.
SUITE["SparseIntSet"]["pop push worst case"] = @benchmarkable pop_push(y) setup=(y=SparseIntSet(); push!(y, 5199))
# Same kernel, but the set also holds 5200, so it never becomes empty.
SUITE["SparseIntSet"]["pop push"] = @benchmarkable pop_push(y) setup=(y=SparseIntSet(); push!(y, 5199); push!(y, 5200))

# Benchmark kernel: iterate `x` once and return the sum of its elements,
# accumulated left-to-right starting from the integer 0.
function iterate_one_bench(x)
    return foldl(+, x; init=0)
end
# Benchmark kernel: iterate `zip(x, y)` and return the sum of both components
# of every yielded pair, accumulated starting from the integer 0.
function iterate_two_bench(x,y)
    add_pair(acc, pair) = acc + (pair[1] + pair[2])
    return foldl(add_pair, zip(x, y); init=0)
end
# Benchmark kernel: iterate `zip(x, y, exclude=(z,))` and return the sum of
# both components of every yielded pair, accumulated starting from 0.
function iterate_two_exclude_one_bench(x,y,z)
    add_pair(acc, pair) = acc + (pair[1] + pair[2])
    return foldl(add_pair, zip(x, y, exclude=(z,)); init=0)
end

# Seeds the RNG and builds three sets, each from 1000 random ids in 1:30000.
# NOTE(review): like any parenthesized `a; b; c;` block this is evaluated on
# the spot, so `x`, `y` and `z` look like they are built once at load time —
# confirm that `setup=x_y_z_setup` behaves as intended.
x_y_z_setup = (
Random.seed!(1234);
x = SparseIntSet(rand(1:30000, 1000));
y = SparseIntSet(rand(1:30000, 1000));
z = SparseIntSet(rand(1:30000, 1000));
)

# Benchmark: plain iteration over one set.
SUITE["SparseIntSet"]["iterate one"] =
@benchmarkable iterate_one_bench(x) setup=x_y_z_setup

# Benchmark: zipped iteration over two sets.
SUITE["SparseIntSet"]["iterate two"] =
@benchmarkable iterate_two_bench(x,y) setup=x_y_z_setup

# Benchmark: zipped iteration over two sets while excluding the ids of a third.
SUITE["SparseIntSet"]["iterate two exclude one"] =
@benchmarkable iterate_two_exclude_one_bench(x,y,z) setup=x_y_z_setup

1 change: 1 addition & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,6 @@ Pages = [
"mutable_linked_list.md",
"intset.md",
"sorted_containers.md",
"sparse_int_set.md"
]
```
8 changes: 8 additions & 0 deletions docs/src/sparse_int_set.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# DataStructures.SparseIntSet

Implementation of a __Sparse Integer Set__, for background see [Sparse Sets](https://www.computist.xyz/2018/06/sparse-sets.html).
Only positive non-zero `Int`s are allowed inside the set.
The idea is to have one **packed** `Vector` storing all the `Int`s contained in the set, so as to allow for fast iteration, and a sparse, paged **reverse** `Vector` storing the position of each `Int` inside the **packed** `Vector`. This allows for very fast iteration, insertion and deletion of indices.
Most behavior is similar to a normal `IntSet`; however, `collect`, `first` and `last` are with respect to the **packed** vector, in which the ordering is not guaranteed.
The **reverse** `Vector` is paged, meaning that it is a `Vector{Vector{Int}}` where each of the `Vector{Int}`s has the length of one memory page of `Int`s. Every time an index is added that is not yet in the range of the already present pages, a new page is created and added to the **reverse** `Vector`, allowing for dynamic growth.
Popping the last `Int` of a particular page will automatically clean up the memory of that page.
2 changes: 2 additions & 0 deletions src/DataStructures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,4 +104,6 @@ module DataStructures
export PriorityQueue, peek

include("priorityqueue.jl")
include("sparse_int_set.jl")
export SparseIntSet
end
249 changes: 249 additions & 0 deletions src/sparse_int_set.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
# Number of `Int`s that fit in one page: the size reported by `jl_getpagesize`
# divided by `sizeof(Int)`. Every in-use page of the reverse vector has this length.
const INT_PER_PAGE = div(ccall(:jl_getpagesize, Clong, ()), sizeof(Int))
# we use this to mark pages not in use, it must never be written to.
# Unused pages are recognised by identity (`===`) with this one shared empty vector.
const NULL_INT_PAGE = Vector{Int}()

"""
    SparseIntSet

Set of positive `Int`s backed by a dense `packed` vector of the stored values
and a paged `reverse` lookup that maps a value to its position in `packed`.
"""
mutable struct SparseIntSet
    # The stored values, densely packed; iteration walks this vector.
    packed::Vector{Int}
    # Paged lookup: for a stored value, the entry at its (page, offset) is its
    # index in `packed`; 0 means "not present".
    reverse::Vector{Vector{Int}}
    # counts the number of real elements in each page of reverse.
    counters::Vector{Int}
end

# Create an empty set: no packed values, no pages, no counters.
function SparseIntSet()
    return SparseIntSet(Int[], Vector{Int}[], Int[])
end

# Create a set containing every element of the iterable `indices`.
function SparseIntSet(indices)
    return union!(SparseIntSet(), indices)
end

# Elements of a SparseIntSet are always `Int`.
function eltype(::Type{SparseIntSet})
    return Int
end

# A new, empty set of the same kind as the argument.
function empty(::SparseIntSet)
    return SparseIntSet()
end

# Remove every element from `s` by emptying its three backing vectors in
# place; returns `s`.
function empty!(s::SparseIntSet)
    for storage in (s.packed, s.reverse, s.counters)
        empty!(storage)
    end
    return s
end

# A set is empty exactly when its packed vector holds no values.
function isempty(s::SparseIntSet)
    return isempty(s.packed)
end

# Independent copy of `s`, produced by copying into a fresh empty set.
function copy(s::SparseIntSet)
    return copy!(SparseIntSet(), s)
end

# Overwrite the contents of `to` with a copy of `from` and return `to`.
# `packed` and `counters` are replaced by copies; `to.reverse` is resized in
# place and refilled page by page.
function copy!(to::SparseIntSet, from::SparseIntSet)
    to.packed = copy(from.packed)

    src_pages = from.reverse
    dst_pages = to.reverse
    resize!(dst_pages, length(src_pages))
    for idx in eachindex(src_pages)
        src_page = src_pages[idx]
        # we want to keep the null pages === NULL_INT_PAGE, so those are
        # shared rather than copied.
        dst_pages[idx] = src_page === NULL_INT_PAGE ? NULL_INT_PAGE : copy(src_pages[idx])
    end

    to.counters = copy(from.counters)
    return to
end

# Translate the value `i` into its location in the paged reverse vector:
# returns `(pageid, offset)`, both 1-based. The offset is obtained by masking
# with `INT_PER_PAGE - 1`.
function pageid_offset(s::SparseIntSet, i)
    zero_based = i - 1
    offset = zero_based & (INT_PER_PAGE - 1) + 1
    return div(zero_based, INT_PER_PAGE) + 1, offset
end

# Return `true` when `i` is an element of `s`, `false` otherwise.
#
# Only positive values can ever be stored, so non-positive `i` is rejected up
# front. Without this guard `pageid_offset` maps e.g. 0 onto the last slot of
# page 1 (reporting a false positive whenever INT_PER_PAGE is in the set), and
# sufficiently negative values produce a page id <= 0 that would then be used
# for an unchecked (`@inbounds`) read.
function in(i, s::SparseIntSet)
    i <= 0 && return false

    pageid, offset = pageid_offset(s, i)
    # A page beyond the allocated range cannot contain anything.
    pageid > length(s.reverse) && return false

    page = @inbounds s.reverse[pageid]
    # An unused page is the shared NULL_INT_PAGE; in a real page a zero entry
    # means "not present".
    return page !== NULL_INT_PAGE && @inbounds page[offset] != 0
end

# Number of elements in the set, i.e. the length of the packed vector.
function length(s::SparseIntSet)
    return length(s.packed)
end

# Insert the positive integer `i` into `s` and return `s`.
# Inserting a value that is already present leaves the set unchanged.
#
# Throws a `DomainError` carrying the offending value when `i <= 0`.
function push!(s::SparseIntSet, i::Integer)
    i <= 0 && throw(DomainError(i, "Only positive Ints allowed."))

    pageid, offset = pageid_offset(s, i)
    pages = s.reverse
    plen = length(pages)

    if pageid > plen
        # Create new null pages up to pageid and fresh (zero-filled) one at pageid
        sizehint!(pages, pageid)
        sizehint!(s.counters, pageid)
        for _ in 1:(pageid - plen - 1)
            push!(pages, NULL_INT_PAGE)
            push!(s.counters, 0)
        end
        push!(pages, zeros(Int, INT_PER_PAGE))
        push!(s.counters, 0)
    elseif pages[pageid] === NULL_INT_PAGE
        # assign a page to previous null page; NULL_INT_PAGE itself must never
        # be written to.
        pages[pageid] = zeros(Int, INT_PER_PAGE)
    end

    page = pages[pageid]
    if page[offset] == 0
        # Not present yet: record the position the value is about to take in
        # `packed`, bump the page's element count, then append the value.
        @inbounds page[offset] = length(s) + 1
        @inbounds s.counters[pageid] += 1
        push!(s.packed, i)
    end
    return s
end

# Insert several integers into `s`, in argument order, and return `s`.
function push!(s::SparseIntSet, is::Integer...)
    foreach(i -> push!(s, i), is)
    return s
end

# Remove and return the value stored last in the packed vector.
# Throws an `ArgumentError` when the set is empty.
Base.@propagate_inbounds function pop!(s::SparseIntSet)
    isempty(s) && throw(ArgumentError("Cannot pop an empty set."))

    popped = pop!(s.packed)
    pageid, offset = pageid_offset(s, popped)
    @inbounds begin
        # Clear the reverse entry and account for the removed element.
        s.reverse[pageid][offset] = 0
        s.counters[pageid] -= 1
    end
    # Release the page if that was its last element.
    cleanup!(s, pageid)
    return popped
end

# Remove the value `id` from `s` and return it.
# The hole left in the packed vector is filled with the last packed value,
# after which the packed vector is shortened by one.
#
# Throws an `ArgumentError` for negative `id`, and (unless bounds checks are
# elided by the caller) a `BoundsError` when `id` is not in the set.
Base.@propagate_inbounds function pop!(s::SparseIntSet, id::Integer)
    id < 0 && throw(ArgumentError("Int to pop needs to be positive."))

    @boundscheck in(id, s) || throw(BoundsError(s, id))

    @inbounds begin
        last_value = s.packed[end]
        from_page, from_offset = pageid_offset(s, id)
        to_page, to_offset = pageid_offset(s, last_value)

        # Position in `packed` that `id` currently occupies.
        slot = s.reverse[from_page][from_offset]
        # Move the last packed value into that slot and repoint its reverse entry.
        s.packed[slot] = last_value
        s.reverse[to_page][to_offset] = s.reverse[from_page][from_offset]
        # Only now clear `id`'s own entry; when `id` is itself the last value
        # the line above was a self-assignment and this zeroing wins.
        s.reverse[from_page][from_offset] = 0
        s.counters[from_page] -= 1
        pop!(s.packed)
    end
    cleanup!(s, from_page)
    return id
end

# Replace page `pageid` of the reverse vector by the shared NULL_INT_PAGE once
# its element counter has dropped to zero.
function cleanup!(s::SparseIntSet, pageid::Int)
    page_is_unused = s.counters[pageid] == 0
    if page_is_unused
        s.reverse[pageid] = NULL_INT_PAGE
    end
end

# Remove `id` from `s` and return it when present; otherwise return `default`.
# Throws an `ArgumentError` for negative `id`.
function pop!(s::SparseIntSet, id::Integer, default)
    if id < 0
        throw(ArgumentError("Int to pop needs to be positive."))
    end
    if in(id, s)
        # Membership was just established, so the inner check can be skipped.
        return @inbounds pop!(s, id)
    end
    return default
end
# Remove and return the first value in iteration (packed) order.
function popfirst!(s::SparseIntSet)
    return pop!(s, first(s))
end

# Iterating a set walks its packed vector; the state is forwarded unchanged.
function iterate(set::SparseIntSet, args...)
    return iterate(set.packed, args...)
end

# Last value in packed order; throws an `ArgumentError` on an empty set.
function last(s::SparseIntSet)
    isempty(s) && throw(ArgumentError("Empty set has no last element."))
    return last(s.packed)
end

# New set holding the elements of `s` plus every element of `ns`.
function union(s::SparseIntSet, ns)
    return union!(copy(s), ns)
end

# Insert every element of the iterable `ns` into `s`; returns `s`.
function union!(s::SparseIntSet, ns)
    foreach(n -> push!(s, n), ns)
    return s
end

# Intersection of a single set is a copy of that set.
function intersect(s1::SparseIntSet)
    return copy(s1)
end

# Intersection with several collections: first intersect the rest, then
# intersect `s1` with that result.
function intersect(s1::SparseIntSet, ss...)
    return intersect(s1, intersect(ss...))
end

# New set holding the elements of the iterable `ns` that are also in `s1`.
function intersect(s1::SparseIntSet, ns)
    common = SparseIntSet()
    for n in ns
        if n in s1
            push!(common, n)
        end
    end
    return common
end

# In-place intersection with several collections: intersect the rest first,
# then intersect `s1` with that result.
function intersect!(s1::SparseIntSet, ss...)
    return intersect!(s1, intersect(ss...))
end

#Is there a more performant way to do this?
# The intersection is built as a new set and then copied back into `s1`.
function intersect!(s1::SparseIntSet, ns)
    return copy!(s1, intersect(s1, ns))
end

# New set holding the elements of `s` that do not occur in `ns`.
function setdiff(s::SparseIntSet, ns)
    return setdiff!(copy(s), ns)
end

# Remove from `s` every element of the iterable `ns` that it contains;
# elements of `ns` that are absent from `s` are ignored. Returns `s`.
function setdiff!(s::SparseIntSet, ns)
    foreach(n -> pop!(s, n, nothing), ns)
    return s
end

# Two sets are equal when they have the same number of elements and every
# element of `s2` is also in `s1`; storage order is irrelevant.
function ==(s1::SparseIntSet, s2::SparseIntSet)
    return length(s1) == length(s2) && all(in(s1), s2)
end

# `true` when every element of `a` is also an element of `b`.
# Checked directly with membership queries instead of materialising a
# temporary intersection set: a set with more elements than `b` cannot be a
# subset, otherwise each element of `a` must be found in `b`.
function issubset(a::SparseIntSet, b::SparseIntSet)
    return length(a) <= length(b) && all(in(b), a)
end

# Strict subset: `a` is a subset of `b` and the two are not equal.
function <(a::SparseIntSet, b::SparseIntSet)
    return (a <= b) && !isequal(a, b)
end

# Subset-or-equal is plain `issubset`.
function <=(a::SparseIntSet, b::SparseIntSet)
    return issubset(a, b)
end

# Position of the value `i` inside `s.packed`, or 0 when its page is outside
# the allocated range or currently holds no elements. For a page that is in
# use the stored reverse entry is returned as-is (0 meaning "not present").
function findfirst_packed_id(i, s::SparseIntSet)
    pageid, offset = pageid_offset(s, i)
    page_in_use = pageid <= length(s.counters) && s.counters[pageid] != 0
    page_in_use || return 0
    return @inbounds s.reverse[pageid][offset]
end

# All elements as a `Vector{Int}` in packed order; the result is a copy, so
# mutating it does not affect the set.
function collect(s::SparseIntSet)
    return copy(s.packed)
end

# Iterator over the ids shared by all `valid_sets` and absent from every set
# in `excluded_sets`. Iteration is driven by `shortest_set`, the valid set
# with the fewest elements.
struct ZippedSparseIntSetIterator{VT,IT}
    valid_sets::VT
    shortest_set::SparseIntSet
    excluded_sets::IT

    # `exclude` is a tuple of sets whose ids must be skipped during iteration.
    function ZippedSparseIntSetIterator(
        valid_sets::SparseIntSet...; exclude::NTuple{N, SparseIntSet}=()
    ) where {N}
        _, shortest_idx = findmin(map(length, valid_sets))
        return new{typeof(valid_sets), NTuple{N, SparseIntSet}}(
            valid_sets, valid_sets[shortest_idx], exclude
        )
    end
end

# Zip one or more SparseIntSets into a `ZippedSparseIntSetIterator`; keyword
# arguments (e.g. `exclude`) are forwarded to its constructor.
#
# At least one set is required in the signature on purpose: a bare
# `s::SparseIntSet...` also matches a call with zero arguments and would
# therefore take over Base's `zip()`.
function Base.zip(s::SparseIntSet, ss::SparseIntSet...; kwargs...)
    return ZippedSparseIntSetIterator(s, ss...; kwargs...)
end

# Length of the shortest valid set. Iteration can skip ids (missing from
# another valid set, or excluded), so this is an upper bound on the number of
# items actually produced.
@inline function length(it::ZippedSparseIntSetIterator)
    return length(it.shortest_set)
end

# With an empty tuple of excluded sets no id can be excluded, so the answer
# is known without looking at `id`.
function in_excluded(id, it::ZippedSparseIntSetIterator{VT,Tuple{}}) where {VT}
    return false
end

# `true` as soon as any of the iterator's excluded sets contains `id`.
function in_excluded(id, it)
    return any(excluded -> id in excluded, it.excluded_sets)
end

# For the `state`-th value of the shortest set, return that value together
# with its packed position in each valid set (0 where it is not found).
@inline function id_tids(it, state)
    id = it.shortest_set.packed[state]
    tids = map(set -> findfirst_packed_id(id, set), it.valid_sets)
    return id, tids
end

# Advance the zipped iterator. Starting at position `state` of the shortest
# set, skip every id that is missing from one of the valid sets or present in
# an excluded set. Yields the tuple of packed positions of the first
# acceptable id plus the next state, or `nothing` when the shortest set is
# exhausted.
Base.@propagate_inbounds function iterate(it::ZippedSparseIntSetIterator, state=1)
    last_state = length(it)
    cursor = state
    while cursor <= last_state
        id, tids = id_tids(it, cursor)
        if !any(iszero, tids) && !in_excluded(id, it)
            return tids, cursor + 1
        end
        cursor += 1
    end
    return nothing
end
end
3 changes: 2 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import DataStructures: IntSet
@test [] == detect_ambiguities(Base, Core, DataStructures)

tests = ["int_set",
"sparse_int_set",
"deque",
"circ_deque",
"sorted_containers",
Expand All @@ -28,7 +29,7 @@ tests = ["int_set",
"sorting",
"priority_queue",
"fenwick",
"robin_dict"
"robin_dict",
]

if length(ARGS) > 0
Expand Down
Loading

2 comments on commit 0c70c9c

@oxinabox
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error while trying to register: "Tag with name 0.17.1 already exists and points to a different commit"

Please sign in to comment.