add PersistentDict based on a HAMT (#51164)

The implementation is based on a [Hash Array Mapped Trie (HAMT)](https://en.wikipedia.org/wiki/Hash_array_mapped_trie) following [Bagwell (2000)](https://infoscience.epfl.ch/record/64398/files/idealhashtrees.pdf). A HAMT uses a fixed branching factor (commonly 32) together with each node being sparse. In order to search for an entry we take the hash of the key and chunk it up into blocks; with a branching factor of 32, each block is 5 bits. We use those 5 bits to calculate the index inside the node and use a bitmap within the node to keep track of whether an element is already set. This makes search a `log(32, n)` operation. Persistency is implemented by path-copying. When we insert/delete a value into the HAMT we copy each node along the path into a new HAMT; all other nodes are shared with the previous HAMT. A notable implementation choice is that I didn't add a (resizeable) root table. Normally this root table is dense and uses the first `t` bits to calculate an index within. This makes large HAMTs a bit cheaper since the root table effectively folds multiple lookup steps into one. It does hurt persistent use-cases since path-copying means that we also copy the root node/table. Importantly, the HAMT itself is not immutable/persistent; the use of it as part of the `PersistentDict` is. Direct mutation of the underlying data breaks the persistence invariants. One could use the HAMT to implement a non-persistent dictionary (or other data structures). As an interesting side-note, we could use a related data structure, the [Ctrie](https://lamp.epfl.ch/~prokopec/ctries-snapshot.pdf), to implement a concurrent lock-free dictionary. Ctries also support `O(1)` snapshotting, so we could replace the HAMT used here with a Ctrie.
JuliaLang · Sep 7, 2023 · 8599e2f · 8599e2f
1 parent 27fa5de
commit 8599e2f
Show file tree

Hide file tree

Showing 4 changed files with 527 additions and 0 deletions.
diff --git a/base/dict.jl b/base/dict.jl
@@ -869,3 +869,145 @@ empty(::ImmutableDict, ::Type{K}, ::Type{V}) where {K, V} = ImmutableDict{K,V}()
 _similar_for(c::AbstractDict, ::Type{Pair{K,V}}, itr, isz, len) where {K, V} = empty(c, K, V)
 _similar_for(c::AbstractDict, ::Type{T}, itr, isz, len) where {T} =
  throw(ArgumentError("for AbstractDicts, similar requires an element type of Pair;\n if calling map, consider a comprehension instead"))
+
+
+include("hamt.jl")
+using .HashArrayMappedTries
+const HAMT = HashArrayMappedTries
+
+struct PersistentDict{K,V} <: AbstractDict{K,V}
+ # Backing hash array mapped trie that stores the key/value leaves.
+ trie::HAMT.HAMT{K,V}
+end
+
+"""
+ PersistentDict
+
+`PersistentDict` is a dictionary implemented as a hash array mapped trie,
+which is optimal for situations where you need persistence: each operation
+returns a new dictionary separate from the previous one, but the underlying
+implementation is space-efficient and may share storage across multiple
+separate dictionaries.
+
+ PersistentDict(KV::Pair)
+
+# Examples
+
+```jldoctest
+julia> dict = Base.PersistentDict(:a=>1)
+Base.PersistentDict{Symbol, Int64} with 1 entry:
+ :a => 1
+
+julia> dict2 = Base.delete(dict, :a)
+Base.PersistentDict{Symbol, Int64}()
+
+julia> dict3 = Base.PersistentDict(dict, :a=>2)
+Base.PersistentDict{Symbol, Int64} with 1 entry:
+ :a => 2
+```
+"""
+PersistentDict
+
+# Build an empty dictionary with key type `K` and value type `V`.
+function PersistentDict{K,V}() where {K,V}
+    return PersistentDict(HAMT.HAMT{K,V}())
+end
+
+# Build a dictionary holding the single entry `KV`.
+function PersistentDict(KV::Pair{K,V}) where {K,V}
+    return PersistentDict(HAMT.HAMT(KV.first, KV.second))
+end
+
+# Build a new dictionary from `dict` with `pair` added; forwards to the
+# `(dict, key, val)` method.
+function PersistentDict(dict::PersistentDict, pair::Pair)
+    return PersistentDict(dict, pair.first, pair.second)
+end
+# Return a new dictionary containing `key => val` in addition to the entries of
+# `dict`. The lookup runs `HAMT.path` in persistent mode, and the trie it hands
+# back as `root` (presumably the root of the copied path) backs the result.
+function PersistentDict(dict::PersistentDict{K,V}, key::K, val::V) where {K,V}
+    found, present, node, idx, bitidx, root, hstate =
+        HAMT.path(dict.trie, key, hash(key), #=persistent=# true)
+    HAMT.insert!(found, present, node, idx, bitidx, hstate, val)
+    return PersistentDict(root)
+end
+
+# Build a dictionary from one or more pairs: start from the first pair, then
+# add the remaining pairs one at a time, in order.
+function PersistentDict(kv::Pair, rest::Pair...)
+    acc = PersistentDict(kv)
+    for (k, v) in rest
+        acc = PersistentDict(acc, k, v)
+    end
+    return acc
+end
+
+# Iterating a `PersistentDict{K,V}` is declared to produce `Pair{K,V}` elements.
+function eltype(::PersistentDict{K,V}) where {K,V}
+    return Pair{K,V}
+end
+
+# Test whether `dict` holds the key of `key_val` with a value that `valcmp`
+# considers equal to the value of `key_val`.
+function in(key_val::Pair{K,V}, dict::PersistentDict{K,V}, valcmp=(==)) where {K,V}
+    root = dict.trie
+    # An empty top level cannot contain anything.
+    HAMT.islevel_empty(root) && return false
+    k, v = key_val
+    found, present, node, idx = HAMT.path(root, k, hash(k))
+    if found && present
+        leaf = @inbounds node.data[idx]::HAMT.Leaf{K,V}
+        # `&&` requires a Bool, so a non-Bool result from `valcmp` still errors.
+        valcmp(v, leaf.val) && return true
+    end
+    return false
+end
+
+# Report whether `key` has an entry in `dict`.
+function haskey(dict::PersistentDict{K}, key::K) where K
+    # Only the first two results of `HAMT.path` are needed here.
+    found, present = HAMT.path(dict.trie, key, hash(key))
+    return found && present
+end
+
+# Return the value stored under `key`; throw `KeyError(key)` when `dict` is
+# empty or has no entry for `key`.
+function getindex(dict::PersistentDict{K,V}, key::K) where {K,V}
+    root = dict.trie
+    if !HAMT.islevel_empty(root)
+        found, present, node, idx = HAMT.path(root, key, hash(key))
+        if found && present
+            leaf = @inbounds node.data[idx]::HAMT.Leaf{K,V}
+            return leaf.val
+        end
+    end
+    throw(KeyError(key))
+end
+
+# Return the value stored under `key`, or `default` when `dict` is empty or
+# has no entry for `key`.
+function get(dict::PersistentDict{K,V}, key::K, default::V) where {K,V}
+    root = dict.trie
+    HAMT.islevel_empty(root) && return default
+    found, present, node, idx = HAMT.path(root, key, hash(key))
+    (found && present) || return default
+    leaf = @inbounds node.data[idx]::HAMT.Leaf{K,V}
+    return leaf.val
+end
+
+# Return the value stored under `key`, or the result of calling `default()`
+# when `dict` has no entry for `key`. `default` is only invoked on a miss.
+function get(default::Callable, dict::PersistentDict{K,V}, key::K) where {K,V}
+    trie = dict.trie
+    if HAMT.islevel_empty(trie)
+        # An empty trie is a miss like any other: evaluate the fallback rather
+        # than handing back the callable object itself.
+        return default()
+    end
+    h = hash(key)
+    found, present, trie, i, _, _, _ = HAMT.path(trie, key, h)
+    if found && present
+        leaf = @inbounds trie.data[i]::HAMT.Leaf{K,V}
+        return leaf.val
+    end
+    return default()
+end
+
+# Iteration is delegated to the backing trie; `state` is passed through as-is.
+function iterate(dict::PersistentDict, state=nothing)
+    return HAMT.iterate(dict.trie, state)
+end
+
+# Return a dictionary without an entry for `key`. The lookup runs `HAMT.path`
+# in persistent mode, and the trie it hands back as `root` backs the result
+# whether or not `key` was present.
+function delete(dict::PersistentDict{K}, key::K) where K
+    found, present, node, idx, bitidx, root, _ =
+        HAMT.path(dict.trie, key, hash(key), #=persistent=# true)
+    (found && present) || return PersistentDict(root)
+    # Drop the leaf first, then clear its bit in the node.
+    deleteat!(node.data, idx)
+    HAMT.unset!(node, bitidx)
+    return PersistentDict(root)
+end
+
+# Entry count, delegated to the backing trie.
+function length(dict::PersistentDict)
+    return HAMT.length(dict.trie)
+end
+
+# Emptiness check, delegated to the backing trie.
+function isempty(dict::PersistentDict)
+    return HAMT.isempty(dict.trie)
+end
+
+# Fresh empty dictionary with the requested key and value types.
+function empty(::PersistentDict, ::Type{K}, ::Type{V}) where {K, V}
+    return PersistentDict{K, V}()
+end