// bloomfilter_lock.hpp

/****************************************************************************************************
 * resource_lock:
 * A framework for scalable read/write locking.
 * Released under the terms of the MIT License: https://opensource.org/licenses/MIT
 ***************************************************************************************************/
#pragma once

#include <atomic>
#include <condition_variable>
#include <exception>
#include <iostream>
#include <mutex>
#include <pthread.h>
#include <sys/types.h>
#include <unordered_map>
#include <vector>
#include <queue>
#include <linux/futex.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <errno.h>
#include <bits/stdc++.h> 

namespace bloomfilter_lock
{

    // Forward declarations for types that refer to one another before they are defined.
    template <typename T>
    class BloomFilterLock;
    class _LockRecord;
    // LockIntention is defined as a struct further down; declare it with the same
    // class-key so the declarations agree (avoids -Wmismatched-tags).
    struct LockIntention;
    

    class Key
    {
    /*
     * Key:
     * Identifies a range of the key space to lock.
     * In this implementation, the key space is a 4 tuple
     * (0-63, 0-63, 0-63, 0-63) which is derived from the low 6 bits
     * of each byte of a uint32_t which can be passed in or generated at random.
     * The key space generated by this scheme is 2^24 which is sufficient to guarantee
     * fine grain locking for just about all applications.  If a larger key space is
     * required the technique can be extended easily at the expense of a bigger key object.
     * Note that the technique used in this example does not rely on independent (and computationally intensive)
     * hash functions as such schemes will either entail a time penalty to keep calculating the hash or a memory
     * penalty to save the result of the calculation.  The predicted max number of items concurrently in the filter
     * in this scheme is not expected to exceed 10. Because of the ability to lock entire swaths of locks with
     * the same key prefix with this convention, it is still possible to efficiently lock thousands of locks simultaneously
     * in this scheme.
     *
     * Key(0) is a special value. It indicates a null locking request. It will not result in any locks being obtained.
     * The constructor keeps only the low 6 bits of every byte (key & 0x3F3F3F3F); the two high bits of each
     * byte are discarded. Consequently any input whose masked value is zero (for example 0xC0C0C0C0) becomes
     * Key(0). A random number key generation scheme should bitwise or the result of random number generation
     * with 0x01 or some other bit which is present in 0x3F3F3F3F to guarantee the result is a valid, non-null key.
     */
    public:
        // Builds a key from the low 6 bits of each byte of `key`.
        // Intentionally not explicit: the constructor doubles as the uint32_t -> Key conversion.
        Key(uint32_t key):
        m_ui32(key & 0x3F3F3F3F)
        {
        }

        // Returns the stored 32 bit value. For a key produced by prefix_key() this
        // includes the 0x80 marker bits set on the prefix bytes.
        uint32_t value () const {return m_ui32;}

        /* prefix_key:
         * Return a Key which can be used to lock all keys sharing a prefix of prefix_length bytes with this key.
         * A prefix length of 0 will return a copy of this same key. A prefix length greater than 3 is equivalent to
         * a prefix length of 3.
         * Prefix bytes are counted in memory order (m_ui8[0] first), which on a little-endian host is the
         * least significant byte of the uint32_t.
         * The call does not modify this key, so it is usable on const keys.
         */
        Key prefix_key(uint8_t prefix_length = 1) const
        {
            return Key(*this, prefix_length);
        }

    private:

        template <typename T>
        friend class BloomFilterLock;

        friend class _LockRecord;
        friend class LockIntention;

        // Copies `input` and marks its first min(prefix_length, 3) bytes with 0x80.
        // The 0x80 bit lies outside the 0x3F mask applied by the public constructor,
        // so it can only be introduced through this path.
        Key(const Key& input, uint8_t prefix_length):
        m_ui32(input.m_ui32)
        {
            const int marked_bytes = prefix_length <= 3 ? prefix_length : 3;
            for(int i = 0; i < marked_bytes; ++i)
            {
                m_ui8[i] |= 0x80;
            }
        }

        // The same four bytes viewed either as one word or byte by byte.
        union
        {
            uint32_t m_ui32;
            uint8_t m_ui8[4];
        };
    };

    
    // Tracks intention to lock a set of resources in a series of bits.
    //
    // For each of the 4 key bytes there is one bitmap of read keys and one bitmap of
    // write keys; adding a key sets bit (byte & 0x3F) in the bitmap of every byte
    // position. Write keys are recorded in the read bitmaps as well. The "exclusive"
    // flags record, per byte position, whether a key added through set() carried the
    // 0x80 prefix marker on that byte (see Key::prefix_key).
    struct LockIntention
    {
        // Creates an empty intention: no bits set, no prefix flags, zero counts.
        LockIntention():
            m_read_indicators(),
            m_write_indicators(),
            m_exclusive_read_indicators(),
            m_exclusive_write_indicators(),
            m_min_reads(0),
            m_min_writes(0)
        {

        }

        // Creates an intention from two iterable collections of Key.
        template<typename T>
        LockIntention(const T& reads, const T& writes):
            LockIntention()
        {
            set(reads, writes);
        }

        // Same as above for brace-enclosed key lists, e.g. LockIntention({k1, k2}, {k3}).
        LockIntention(const std::initializer_list<Key>& reads, const std::initializer_list<Key>& writes):
            LockIntention()
        {
            set(reads, writes);
        }

        LockIntention(const LockIntention& rhs) = default;
        LockIntention& operator=(const LockIntention& rhs) = default;

        // Adds every non-null key in `reads` and `writes` to this intention.
        // Keys whose value is 0 are skipped. Existing bits and counts are kept; call
        // clear() first to start from scratch.
        template<typename T>
        void set(const T& reads, const T& writes)
        {
            for(const auto& key: reads)
            {
                if (key.m_ui32 == 0)
                    continue;

                m_min_reads += 1;
                for(int i = 0; i < 4; ++i)
                {
                    m_read_indicators[i] |= _indicator_bit(key.m_ui8[i]);
                    if (key.m_ui8[i] & 0x80)
                        m_exclusive_read_indicators[i] = true;
                }
            }

            for(const auto& key: writes)
            {
                if (key.m_ui32 == 0)
                    continue;

                // A write key is counted and recorded as a read as well.
                m_min_reads += 1;
                m_min_writes += 1;
                for(int i = 0; i < 4; ++i)
                {
                    const size_t bit = _indicator_bit(key.m_ui8[i]);
                    m_write_indicators[i] |= bit;
                    m_read_indicators[i] |= bit;
                    if (key.m_ui8[i] & 0x80)
                    {
                        m_exclusive_read_indicators[i] = true;
                        m_exclusive_write_indicators[i] = true;
                    }
                }
            }
        }

        // Resets the intention to the empty state produced by the default constructor.
        void clear()
        {
            for(int i = 0; i < 4; ++i)
            {
                m_read_indicators[i] = 0;
                m_write_indicators[i] = 0;
                m_exclusive_read_indicators[i] = false;
                m_exclusive_write_indicators[i] = false;
            }
            m_min_reads = 0;
            m_min_writes = 0;
        }

        // Decides whether two key sets (bitmaps plus prefix flags) can be held together.
        // Returns true when they are judged not to overlap, false when they may conflict.
        //
        // Without any prefix flag on byte 0 of either side, the sets are compatible as
        // soon as one byte position has no bit in common. With a prefix flag present, a
        // mismatch at byte i only counts as "compatible" if one side's prefix covers
        // byte i; otherwise the sets share a prefix and are reported as conflicting.
        // If all four byte positions overlap the result is false.
        //
        // NOTE(review): when both sides carry prefixes of different lengths, a mismatch
        // at a byte covered only by the longer prefix is reported as compatible because
        // the two flags are OR-ed. Confirm that this is intended.
        bool _prefix_compatibility_check(const size_t lhs_bits[4], const bool lhs_exclusive_indicators[4],
                                         const size_t rhs_bits[4], const bool rhs_exclusive_indicators[4]) const
        {
            const bool prefix_check_enabled = lhs_exclusive_indicators[0] || rhs_exclusive_indicators[0];

            for(int i = 0; i < 4; ++i)
            {
                if (not (lhs_bits[i] & rhs_bits[i]))
                {
                    if (not prefix_check_enabled)
                        return true;

                    if (lhs_exclusive_indicators[i] || rhs_exclusive_indicators[i])
                        return true;

                    // There's a non-empty prefix match between the lhs and rhs ending at index i - 1
                    // which indicates merge incompatibility.
                    return false;
                }

                if (prefix_check_enabled)
                {
                    // Byte i overlaps and one side's prefix ended at byte i - 1: every byte
                    // of that prefix matched, so the other side falls inside it.
                    if (i > 0 && (not rhs_exclusive_indicators[i] && rhs_exclusive_indicators[i-1]))
                        return false;

                    if (i > 0 && (not lhs_exclusive_indicators[i] and lhs_exclusive_indicators[i-1]))
                        return false;
                }
            }
            return false;
        }


        // Merges rhs into this intention.
        // Returns true if the passed in lock intention was successfully merged into this one.
        // Returns false, leaving this object untouched, when rhs is this object or when
        // the writes of one side may overlap the reads (which include the writes) of the other.
        bool merge(const LockIntention& rhs)
        {
            // Merging with self is an error.
            if (&rhs == this)
                return false;

            // An empty intention simply becomes a copy of rhs.
            if (not(m_min_reads || m_min_writes))
            {
                *this = rhs;
                return true;
            }

            if (not (_prefix_compatibility_check(m_write_indicators, m_exclusive_write_indicators,
                    rhs.m_read_indicators, rhs.m_exclusive_read_indicators)))
                return false;

            if (not _prefix_compatibility_check(m_read_indicators, m_exclusive_read_indicators,
                    rhs.m_write_indicators, rhs.m_exclusive_write_indicators))
                return false;

            for(int i = 0; i < 4; ++i)
            {
                m_write_indicators[i] |= rhs.m_write_indicators[i];
                m_read_indicators[i] |= rhs.m_read_indicators[i];
            }
            _merge_exclusive_indicators(m_exclusive_read_indicators, rhs.m_exclusive_read_indicators);
            _merge_exclusive_indicators(m_exclusive_write_indicators, rhs.m_exclusive_write_indicators);

            m_min_reads += rhs.m_min_reads;
            m_min_writes += rhs.m_min_writes;
            return true;
        }

        // Intention that reads exactly one key. Key(0) fills the write list and is ignored by set().
        static LockIntention from_read_key(Key key)
        {
            return LockIntention({key}, {Key(0)});
        }

        // Intention that writes exactly one key. Key(0) fills the read list and is ignored by set().
        static LockIntention from_write_key(Key key)
        {
            return LockIntention({Key(0)}, {key});
        }

        // Bitmap bit for one key byte. The low 6 bits select the bit; the count is reduced
        // modulo the width of size_t so the shift stays defined where size_t is narrower
        // than 64 bits (there, distinct byte values may share a bit, which can only add
        // overlaps). With a 64 bit size_t this is exactly 1 << (key_byte & 0x3F).
        static size_t _indicator_bit(uint8_t key_byte)
        {
            return size_t(1) << ((key_byte & 0x3F) % (sizeof(size_t) * 8));
        }

        // Folds the prefix flags `theirs` of a merged-in intention into `mine`.
        //  - theirs has no prefix (flag 0 clear): mine is left unchanged.
        //  - both have a prefix: keep only the byte positions flagged on both sides,
        //    i.e. the shorter prefix.
        //  - only theirs has a prefix: adopt it. AND-ing with the all-false flags of
        //    `mine` would silently discard the prefix, and the merged intention would
        //    then be treated as if it held no prefix lock at all.
        static void _merge_exclusive_indicators(bool mine[4], const bool theirs[4])
        {
            if (not theirs[0])
                return;

            const bool had_prefix = mine[0];
            for(int i = 0; i < 4; ++i)
                mine[i] = had_prefix ? (mine[i] && theirs[i]) : theirs[i];
        }

        size_t m_read_indicators[4];
        size_t m_write_indicators[4];
        bool m_exclusive_read_indicators[4];
        bool m_exclusive_write_indicators[4];

        // Min read and write counts based on number of keys at construction time.
        // merge adds values from merged element. Note that these are min bounds. The total number of
        // intended reads and writes can be higher
        size_t m_min_reads;
        size_t m_min_writes;
    };
    
    
    struct _FutexWrapper
    {
    /* _FutexWrapper
     * Wrapper around the FUTEX_WAIT and FUTEX_WAKE system calls. Will be
     * switched to the wrappers in boost.sync when c++ modules are available.
     *
     * The futex word is 0 until signal() sets it to 1; wait() returns once it
     * observes 1. reset() puts the word back to 0 so the object can be reused.
     * The word is a std::atomic because wait() reads it while another thread may
     * be writing it in signal().
     */
        _FutexWrapper():
            m_futex(0) {}

        // Re-arms the wrapper: subsequent wait() calls block until the next signal().
        void reset()
        {
            m_futex.store(0, std::memory_order_release);
        }

        // Blocks until the futex word is 1. Returns immediately if it already is.
        // Terminates the process on an unexpected futex error.
        void wait()
        {
            while(m_futex.load(std::memory_order_acquire) != 1)
            {
                // Sleeps only while the word still holds 0; the kernel re-checks the
                // value, so a signal() racing with this call cannot be missed.
                const long result = syscall(SYS_futex, _futex_word(), FUTEX_WAIT_PRIVATE, 0, nullptr, nullptr, 0);
                if (result == -1)
                {
                    // EAGAIN: the word was no longer 0 when the kernel looked at it.
                    // EINTR:  the sleep was interrupted by a signal handler.
                    // Both are normal; the loop condition decides whether to sleep again.
                    if (errno == EAGAIN || errno == EINTR)
                        continue;

                    std::cerr << "Unexpected errno " << errno << " from futex_wait" << std::endl;
                    std::terminate();
                }
                // result == 0: woken up (possibly spuriously); re-check the word.
            }
        }

        // Sets the futex word to 1 and wakes every thread sleeping in wait().
        // Terminates the process on an unexpected futex error.
        void signal()
        {
            while(1)
            {
                m_futex.store(1, std::memory_order_release);
                const long result = syscall(SYS_futex, _futex_word(), FUTEX_WAKE_PRIVATE, INT_MAX, nullptr, nullptr, 0);
                if (result >= 0)
                    return;
                if (result == -1 && errno == EAGAIN)
                {
                    errno = 0;
                    continue;
                }
                std::cerr << "Unexpected error code " << errno << " from futex_wake" << std::endl;
                std::terminate();
            }
        }

        std::atomic<int32_t> m_futex;

    private:
        // Address handed to the kernel, which operates on a plain 32 bit word.
        int32_t* _futex_word()
        {
            static_assert(sizeof(std::atomic<int32_t>) == sizeof(int32_t),
                          "std::atomic<int32_t> must have the layout of a bare futex word");
            return reinterpret_cast<int32_t*>(&m_futex);
        }
    };

        
    class _SpinLock
    {
    /* _SpinLock
     * Minimal busy-waiting lock built on a std::atomic<bool>: false means free,
     * true means held. It provides lock() and unlock(), so it can be used with
     * std::unique_lock and std::lock_guard. The boost.sync library has similar
     * functionality; this will be switched to the boost.sync version when c++
     * modules functionality is widely available.
     */
    public:
        _SpinLock() : m_lock(false) {}

        // Spins until the flag could be flipped from false to true.
        void lock()
        {
            for (;;)
            {
                // compare_exchange_weak overwrites its first argument on failure,
                // so start every attempt from a fresh "expected free" value.
                bool expected_free = false;
                if (m_lock.compare_exchange_weak(expected_free, true, std::memory_order_acquire))
                    return;
            }
        }

        // Marks the lock as free again.
        void unlock()
        {
            m_lock.store(false, std::memory_order_release);
        }

    private:
        std::atomic<bool> m_lock;
    };


    class _LockRecord
    {
    /* LockRecord:
     * The main structure used to track the series of resources to be locked in a locking batch via a BloomFilterLock
     *
     * Threads first register on a record with _latch(), then block in wait() until the
     * record is activated, and finally call release() when they are done. m_lock (a spin
     * lock) protects the active flag and the waiting/locking counters against the
     * threads that wait on and release the record.
     */
    public:

        enum RecordType
        {
            None = 0,
            ReadOnly = 1,
            ReadWrite = 2,
            Exclusive = 3
        };

        // Creates an inactive, untyped record with all counters at zero.
        // Initializers are listed in member declaration order, which is the order in
        // which they actually run.
        _LockRecord() :
            m_num_waiting(0),
            m_num_locking(0),
            m_active(false),
            m_num_requests(0),
            m_record_type(None)
        {
        }

        // Declared here; the definition is not part of this header section.
        bool merge_lock_request(const LockIntention& l);

        // Claims an untyped record for exclusive use.
        // Returns true if the record type was None and is now Exclusive, false otherwise.
        bool global_write_request()
        {
            // This is always called under the mutex in BloomFilterLock
            // Don't need to hold m_lock when updating.
            if (m_record_type != None)
                return false;

            m_record_type = RecordType::Exclusive;
            return true;
        }

        // Declared here; the definitions are not part of this header section.
        bool merge_read_lock_request(Key key);
        bool merge_write_lock_request(Key key);

        // Joins a global read request to this record.
        // Returns true if the record is (or has just become) ReadOnly, false for any other type.
        bool global_read_request()
        {
            if (m_record_type == None)
            {
                m_record_type = ReadOnly;
                return true;
            }

            return m_record_type == ReadOnly;
        }

        // Returns the record to its freshly constructed state so it can be reused.
        void clear()
        {
            // A lock is not needed on the clear step as the cleared record
            // is queued back into the resource pool under the mutex in
            // BloomFilterLock and then allocated and re-used under the same
            // mutex and this is sufficient to establish happens-before
            m_num_waiting = 0;
            m_num_locking = 0;
            m_active = false;
            m_record_type = None;
            m_num_requests = 0;
            m_lock_intention.clear();
            m_futex.reset();
        }

        RecordType record_type() const
        {
            return  m_record_type;
        }

        // Marks the record active and signals its futex. Calls after the first one have no effect.
        void activate()
        {
            // Holding this lock while signalling the futex establishes
            // happens-before on the state change m_futex = 0 -> 1 for the
            // receiver of the futex signal.
            std::lock_guard<_SpinLock> guard(m_lock);
            if (!m_active)
            {
                m_active = true;
                m_futex.signal();
            }
        }

        // Blocks until the record is active, then moves the caller from the "waiting"
        // count to the "locking" count.
        void _wait_impl()
        {
            std::unique_lock<_SpinLock> guard(m_lock);
            if (!m_active)
            {
                // Do not spin-hold m_lock while sleeping: activate() needs it.
                guard.unlock();
                m_futex.wait();
                guard.lock();
            }
            ++m_num_locking;
            --m_num_waiting;
        }

        // Registers one more thread that is going to call wait() on this record.
        // Does not take m_lock; the callers visible in BloomFilterLock invoke it
        // before releasing their own guard.
        void _latch()
        {
           ++m_num_waiting;
        }

        // Public entry point for _wait_impl().
        void wait()
        {
            _wait_impl();
        }

        // Drops the caller from the "locking" count.
        // Returns true if the caller to release is responsible for freeing this LockRecord and activating the
        // next record in the lock queue, i.e. when no thread is locking or waiting on it any more.
        bool release()
        {
            decltype(m_num_locking) num_locking = 0;
            decltype(m_num_waiting) num_waiting = 0;
            {
                // Snapshot both counters under the lock; decide outside of it.
                std::lock_guard<_SpinLock> guard(m_lock);
                num_locking = --m_num_locking;
                num_waiting = m_num_waiting;
            }
            return num_locking == 0 && num_waiting == 0;
        }

        // Reports (on std::cerr) a record that is still active; does not change any state.
        void close()
        {
            std::lock_guard<_SpinLock> guard(m_lock);
            if (m_active)
            {
                std::cerr << "close called while lock record is active" << std::endl;
            }
        }

    private:
        size_t m_num_waiting;       // threads latched on the record that have not passed wait() yet
        std::size_t m_num_locking;  // threads that passed wait() and have not called release() yet
        bool  m_active;             // set by activate(), cleared by clear()

        size_t m_num_requests;
        RecordType m_record_type;
        LockIntention m_lock_intention;

        _FutexWrapper m_futex;      // waiters sleep here until activate() signals it
        _SpinLock m_lock;           // guards m_active and the two counters
    };

    
    // The tracker only stores pointers to BloomFilterLock, so a declaration is all it needs.
    template <typename T>
    class BloomFilterLock;

    template <typename T>
    class _TLResourceTracker
    {
        // Keeps track of BloomFilterLocks owned by the current thread. This is to prevent recursive locks. This is
        // not allowed.
        //
        // The first m_count slots of m_locks hold the tracked locks in no particular
        // order; slots beyond that are spare capacity.
    public:
        // Starts with room for 16 locks and none tracked. Initializers follow the
        // member declaration order.
        _TLResourceTracker():
            m_locks(16),
            m_count(0)
        {}

        // Records `lock` as owned by this thread.
        // Calls std::terminate() if the lock is already tracked.
        void track(BloomFilterLock<T>* lock)
        {
            for (size_t n = 0; n < m_count; ++n)
            {
                if (m_locks[n] == lock)
                    std::terminate();
            }

            // Reuse a spare slot when there is one, grow the vector otherwise.
            if (m_count < m_locks.size())
                m_locks[m_count] = lock;
            else
                m_locks.push_back(lock);
            ++m_count;
        }


        // Forgets `lock`. Calls std::terminate() if the lock is not tracked.
        void untrack(BloomFilterLock<T> * lock)
        {
            for (size_t n = 0; n < m_count; ++n)
            {
                if (m_locks[n] == lock)
                {
                    // Fill the hole with the last tracked entry so the first
                    // m_count - 1 slots stay densely populated, then vacate that
                    // last slot. When n is already the last slot this just clears it.
                    const size_t last = m_count - 1;
                    m_locks[n] = m_locks[last];
                    m_locks[last] = nullptr;
                    --m_count;
                    return;
                }
            }
            std::terminate();
        }

    private:
        // Vector of resource locks owned by this thread
        std::vector<BloomFilterLock<T>*> m_locks;
        size_t m_count; // count of resource locks currently owned. The capacity of m_locks could be greater.
    };

    
    template <typename InternalLockType=std::mutex>        
    class BloomFilterLock
    {
    /* BloomFilterLock
     * Front end of the locking scheme. Lock requests are grouped into _LockRecord
     * batches that are kept in a FIFO queue (m_lock_queue); at most one record is
     * referenced as the active one (m_active_lock_record) at a time.
     * InternalLockType is the mutex type used for m_mutex and for the
     * std::unique_lock guards passed to the private wait helpers.
     * The object is neither copyable nor copy-assignable.
     * The public member functions are only declared here; presumably they are
     * defined in bloomfilter_lock_impl.hpp - verify there for their exact behavior.
     */
    public:
        
        BloomFilterLock();
        BloomFilterLock(const BloomFilterLock& rhs) = delete;
        BloomFilterLock& operator = (const BloomFilterLock& rhs) = delete;
        ~BloomFilterLock();
        void global_read_lock();
        void global_write_lock();
        
        // Lock requests for a set of read keys and a set of write keys, either as
        // two collections or as a prepared LockIntention.
        template <typename T>
        void multilock(const T& reads, const T& writes);
        void multilock(const LockIntention& l);
        void read_lock(Key readKey);
        void write_lock(Key writeKey);
        void unlock();

    private:

        _LockRecord *allocate_lock_record();

        // Joins the record at the front of the queue and blocks until it is active.
        // If no record is active yet, the front record is made the active one: it is
        // removed from the queue and activated, and a fresh record is queued when the
        // queue would otherwise be left empty.
        // `guard` is unlocked before blocking and is NOT re-locked on return.
        // NOTE(review): presumably `guard` holds m_mutex on entry - confirm in the impl.
        inline void wait_at_queue_front(std::unique_lock<InternalLockType>& guard)
        {
            _LockRecord * r = m_lock_queue.front();
            // Register as a waiter before the guard is released.
            r->_latch();            
                        
            if (!m_active_lock_record)
            {
                m_active_lock_record = r;                
                m_lock_queue.pop();
                r->activate();
                // The queue framework requires there to be a record in the queue at all times.
                if (m_lock_queue.empty())
                    m_lock_queue.push(allocate_lock_record());
            }

            guard.unlock();
            r->wait();            
        }

        // Appends `new_record` to the queue, joins it and blocks until it is active.
        // `guard` is unlocked before blocking and is NOT re-locked on return.
        inline void wait_at_queue_back(std::unique_lock<InternalLockType>& guard, _LockRecord * new_record)
        {
            // Register as a waiter before the record becomes visible in the queue.
            new_record->_latch();
            m_lock_queue.push(new_record);
            guard.unlock();
            new_record->wait();
            
        }
        
        // Joins the record currently at the back of the queue and blocks until it is active.
        // `guard` is unlocked before blocking and is NOT re-locked on return.
        inline void wait_at_queue_back(std::unique_lock<InternalLockType>& guard)
        {
            auto queue_back = m_lock_queue.back();
            queue_back->_latch();
            guard.unlock();
            queue_back->wait();
            
        }

        /* Track the set of resource locks held by each thread.  This is here to prevent an attempt to make a lock
         * request on a BloomFilterLock through which some resources are already locked.  That pattern is not permissible
         * via the resource_lock scheme.  An exception results if that occurs.  Consider a set of item Keys
         * which can all be locked collectively via their controlling Key instead in that case.
         * NOTE(review): _TLResourceTracker::track() calls std::terminate() when a lock is tracked twice, so
         * "exception" above may in fact mean process termination - confirm against the impl.
         */
        static thread_local _TLResourceTracker<InternalLockType> tl_existing_locks;

        // Record set by wait_at_queue_front() when no record was active.
        _LockRecord* m_active_lock_record;
        std::vector<_LockRecord*> m_record_pool;
        // Pending records in FIFO order; see the comment in wait_at_queue_front() about it never being empty.
        std::queue<_LockRecord*> m_lock_queue;
        InternalLockType m_mutex; // For locking recordPool and internal structures.
        bool m_closing; // Set to true during the destructor sequence.        
    };
}

#include "bloomfilter_lock_impl.hpp"