/*

current version:  may 9th, 2003 

 

this file contains a class template for elements stored in a B-tree, and a

class for a B-tree node, which provides all the methods needed to search,

insert, or delete.  a sample "main" function is also provided to show

how to use the B-tree.

to understand the identifiers and comments, visualize the tree as having

its root node at the top of the diagram, its leaf nodes at the bottom of

the diagram, and each node as containing an array oriented horizontally,

with the smallest element on the left and the largest element on the right.

the zeroth element of a node contains only a subtree pointer, no key value

or payload.

 

a b-tree grows "upward", by splitting the root node when the node's capacity

is exceeded.  conversely, the depth of the tree is always reduced by merging

the last remaining element of the root node with the elements of its two

child nodes, so the tree contracts "from the top".

 

this code may be freely copied.

programmer: toucan@textelectric.net

algorithm and pseudo-code found in:

"fundamentals of data structures in pascal", by Horowitz and Sahni

 

there is a java applet on the web that draws a b-tree diagram and allows the

user to perform insertions and deletions, so you can see how it grows and shrinks,

at: http://www.mars.dti.ne.jp/~torao/program/structure/btree.html

*/

 

 

#include "stdafx.h"

#include <iostream>

#include <string>

#include <vector>

#include <strstream>

using namespace std;

 

class Node;

Node* invalid_ptr = reinterpret_cast<Node*> (-1);

Node* null_ptr = reinterpret_cast<Node*> (0);

const int invalid_index = -1;

const int max_elements = 200;  // max elements in a node

// size limit for the array in a vector object.  best performance was

// at 800 bytes.

const int max_array_bytes = 800; 

 

template<class key, class payload> class Element {

// contains a key value, a payload, and a pointer toward the subtree

// containing key values greater than this->m_key but lower than the

// key value of the next element to the right

 

public:

    key m_key;

    payload m_payload;

    Node* mp_subtree;

public:

    bool operator>   (Element& other) const { return m_key >  other.m_key; }

    bool operator<   (Element& other) const { return m_key <  other.m_key; }

    bool operator>=  (Element& other) const { return m_key >= other.m_key; }

    bool operator<=  (Element& other) const { return m_key <= other.m_key; }

    bool operator==  (Element& other) const { return m_key == other.m_key; }

    bool valid () const { return mp_subtree != invalid_ptr; }

    void invalidate () { mp_subtree = invalid_ptr; }

    Element& operator= (const Element& other) {

        m_key = other.m_key;

        m_payload = other.m_payload;

        mp_subtree = other.mp_subtree;

        return *this;

    }

    Element () { mp_subtree = null_ptr; }

    void dump (); 

}; //______________________________________________________________________

 

template<class key, class payload> void Element<key, payload>::dump () {

    cout << "key=" << m_key << "sub=" << mp_subtree << ' ';

} //_______________________________________________________________________

 

typedef Element<string, string> Elem;

 

class RootTracker;

 

class Node {

protected:

    // locality of reference, beneficial to effective cache utilization,

    // is provided by a "vector" container rather than a "list"

    vector<Elem> m_vector;

    // number of elements currently in m_vector, including the zeroth element

    // which has only a subtree, no key value or payload.

    int m_count;

    Node* mp_parent;

    bool is_leaf();

    bool vector_insert (Elem& element);

    bool vector_insert_for_split (Elem& element);

    bool split_insert (Elem& element);

    bool vector_delete (Elem& target);

    bool vector_delete (int target_pos);

    void insert_zeroth_subtree (Node* subtree);

    void set_debug();

    int key_count () { return m_count-1; }

    Elem& largest_key () { return m_vector[m_count-1]; }

    Elem& smallest_key () { return m_vector[1]; }

    Elem& smallest_key_in_subtree();

    int index_has_subtree ();

    Node* right_sibling (int& parent_index_this);

    Node* left_sibling (int& parent_index_this);

    Node* rotate_from_left(int parent_index_this);

    Node* rotate_from_right(int parent_index_this);

    Node* merge_right (int parent_index_this);

    Node* merge_left (int parent_index_this);

    bool merge_into_root ();

    int minimum_keys ();

#ifdef _DEBUG

    Elem debug[8];

#endif

public:

    Elem& search (Elem& desired, Node*& last_visited); 

    bool tree_insert (Elem& element);

    bool delete_element (Elem& target);

    int delete_all_subtrees ();

    Node* find_root();

    // to return a reference when a search fails.

    static Elem m_failure;

    // the root of the tree may change.  this attribute keeps it accessible.

    RootTracker& m_root;

    Elem& operator[] (int i) { return m_vector[i]; }

    // node cannot be instantiated without a root tracker

    Node (RootTracker& root_track);

    void dump ();

   

}; //______________________________________________________________________

 

class RootTracker {

// all the node instances that belong to a given tree have a reference to one

// instance of RootTracker.  as the Node instance that is the root may change

// or the original root may be deleted, Node instances must access the root

// of the tree through this tracker, and update the root pointer when they

// perform insertions or deletions.  using a static attribute in the Node

// class to hold the root pointer would limit a program to just one B-tree.

protected:

    Node* mp_root;

public:

    RootTracker() { mp_root = null_ptr; }

    void set_root (Node* old_root, Node* new_root) {

        // ensure that the root is only being changed by a node belonging to the

        // same tree as the current root

        if (old_root != mp_root)

            throw "wrong old_root in RootTracker::set_root";

        else

            mp_root = new_root;

    }

    Node* get_root () { return mp_root; }

 

    ~RootTracker () {

        // safety measure

        if (mp_root) {

            mp_root->delete_all_subtrees();

            delete mp_root;

        }

    }

}; //_______________________________________________________________________

 

int Node::minimum_keys () {

    // minus 1 for the empty slot left for splitting the node

    int size = m_vector.size();

    int ceiling_func = (size-1)/2;

    if (ceiling_func*2 < size-1)

        ceiling_func++;

    return ceiling_func-1;  // for clarity, may increment then decrement

} //________________________________________________________________________

 

inline void Node::set_debug() {

#ifdef _DEBUG

// the contents of an STL vector are not visible in the visual C++ debugger,

// so this function copies up to eight elements from the STL vector into

// a simple C++ array.

    for (int i=0; i<m_count && i<8; i++) {

        debug[i] = m_vector[i];

        if (m_vector[i].mp_subtree)

            m_vector[i].mp_subtree->set_debug();

    }

    for ( ; i<8; i++)

        debug[i] = m_failure;

#endif

} //________________________________________________________________________

 

void Node::insert_zeroth_subtree (Node* subtree) {

    m_vector[0].mp_subtree = subtree;

    m_vector[0].m_key = "";

    m_count = 1;

    if (subtree)

        subtree->mp_parent = this;

} //_________________________________________________________________________

 

void Node::dump (){

// write out the keys in this node and all its subtrees, along with

// node adresses, for debugging purposes

        if (this == m_root.get_root())

            cout << "ROOT\n";

        cout << "\nthis=" << this << endl;

        cout << "parent=" << mp_parent << " count=" << m_count << endl;

        for (int i=0; i<m_count; i++)

            m_vector[i].dump();

        for (i=0; i<m_count; i++)

            if (m_vector[i].mp_subtree)

                m_vector[i].mp_subtree->dump();

        cout << endl;

} //________________________________________________________________________

 

Node::Node (RootTracker& root_track)  : m_root(root_track) {

// limit the size of the vector to 4 kilobytes max and 200 entries max.

        int num_elements = max_elements*sizeof(Elem)<=max_array_bytes ?

                           max_elements : max_array_bytes/sizeof(Elem);

        if (num_elements < 6)  // in case key or payload is really huge

            num_elements = 6;

        m_vector.resize (num_elements);

        m_count = 0;

        mp_parent = 0;

        insert_zeroth_subtree (0);

} //________________________________________________________________________

 

Node* Node::find_root () {

    Node* current = this;

    while (current->mp_parent)

        current = current->mp_parent;

    return current;

} //__________________________________________________________________________

 

bool Node::is_leaf () {

    return m_vector[0].mp_subtree==0;

} //________________________________________________________________________

 

int Node::delete_all_subtrees () {

// return the number of nodes deleted

    int count_deleted = 0;

    for (int i=0; i< m_count; i++) {

        if (!m_vector[i].mp_subtree)

            continue;

        else if (m_vector[i].mp_subtree->is_leaf()) {

            delete m_vector[i].mp_subtree;

            count_deleted++;

        }

        else

            count_deleted += m_vector[i].mp_subtree->delete_all_subtrees();

    }

    return count_deleted;

} //_______________________________________________________________________

 

bool Node::vector_insert (Elem& element) {

// this method merely tries to insert the argument into the current node.

// it does not concern itself with the entire tree.

// if the element can fit into m_vector, slide all the elements

// greater than the argument forward one position and copy the argument

// into the newly vacated slot, then return true.  otherwise return false.

// note: the tree_insert method will already have verified that the key

// value of the argument element is not present in the tree.

   

    if (m_count >= m_vector.size()-1) // leave an extra slot for split_insert

        return false;

    int i = m_count;

   

    while (i>0 && m_vector[i-1]>element) {

        m_vector[i] = m_vector[i-1];

        i--;

    }

    if (element.mp_subtree)

        element.mp_subtree->mp_parent = this;

    m_vector[i] = element;

    m_count++;

    return true;

} //__________________________________________________________________

 

bool Node::vector_delete (Elem& target) {

// delete a single element in the vector belonging to *this node.

// if the target is not found, do not look in subtrees, just return false.

 

    int target_pos = -1;

    int first = 1;

    int last = m_count-1;

    // perform binary search

    while (last-first > 1) {

        int mid = first+(last-first)/2;

        if (target>=m_vector[mid])

            first = mid;

        else

            last = mid;

    }

    if (m_vector[first]==target)

        target_pos = first;

    else if (m_vector[last]==target)

        target_pos = last;

    else

        return false;

    // the element's subtree, if it exists, is to be deleted or re-attached

    // in a different function.  not a concern here.  just shift all the

    // elements in positions greater than target_pos.

    for (int i=target_pos; i<m_count; i++)

        m_vector[i] = m_vector[i+1];

   

    m_count--;

    return true;

} //____________________________________________________________________

 

bool Node::vector_delete (int target_pos) {

// delete a single element in the vector belonging to *this node.

// the element is identified by position, not value.

 

    if (target_pos<0 || target_pos>=m_count)

        return false;

  

    // the element's subtree, if it exists, is to be deleted or re-attached

    // in a different function.  not a concern here.  just shift all the

    // elements in positions greater than target_pos.

    for (int i=target_pos; i<m_count; i++)

        m_vector[i] = m_vector[i+1];

   

    m_count--;

    return true;

} //____________________________________________________________________

 

bool Node::vector_insert_for_split (Elem& element) {

// this method insert an element that is in excess of the nominal capacity of

// the node, using the extra slot that always remains unused during normal

// insertions.  this method should only be called from split_insert()

 

    if (m_count >= m_vector.size()) // error

        return false;

    int i = m_count;

   

    while (i>0 && m_vector[i-1]>element) {

        m_vector[i] = m_vector[i-1];

        i--;

    }

    if (element.mp_subtree)

        element.mp_subtree->mp_parent = this;

    m_vector[i] = element;

    m_count++;

    return true;

} //__________________________________________________________________

 

bool Node::split_insert (Elem& element) {

 

    // split_insert should only be called if node is full

    if (m_count != m_vector.size()-1)

        throw "bad m_count in split_insert";

 

    vector_insert_for_split (element);

    int split_point = m_count/2;

    if (2*split_point < m_count)  // perform the "ceiling function"

        split_point++;

    // new node receives the rightmost half of elements in *this node

    Node* new_node = new Node(m_root);

    Elem upward_element = m_vector[split_point];

    new_node->insert_zeroth_subtree (upward_element.mp_subtree);

    upward_element.mp_subtree = new_node;

    // element that gets added to the parent of this node

    for (int i=1; i<m_count-split_point; i++)

        new_node->vector_insert(m_vector[split_point+i]);

    new_node->m_count = m_count-split_point;

    m_count = split_point;

    new_node->mp_parent = mp_parent;

 

    // now insert the new node into the parent, splitting it if necessary

    if (mp_parent && mp_parent->vector_insert(upward_element))

        return true;

    else if (mp_parent && mp_parent->split_insert(upward_element))

        return true;

    else if (!mp_parent) { // this node was the root

        Node* new_root = new Node(m_root);

        new_root->insert_zeroth_subtree(this);

        this->mp_parent = new_root;

        new_node->mp_parent = new_root;

        new_root->vector_insert (upward_element);

        m_root.set_root (m_root.get_root(),  new_root);

        new_root->mp_parent = 0;

    }

    return true;

   

}//__________________________________________________________________

 

bool Node::tree_insert (Elem& element) {

    Node* last_visited_ptr = this;

    if (search(element, last_visited_ptr).valid())  // element already in tree

        return false;

    if (last_visited_ptr->vector_insert(element))

        return true;

    return last_visited_ptr->split_insert(element);

} //__________________________________________________________________

 

bool Node::delete_element (Elem& target) {

// target is just a package for the key value.  the reference does not

// provide the address of the Elem instance to be deleted.

 

 

    // first find the node contain the Elem instance with the given key

    Node* node = 0;

    int parent_index_this = invalid_index;

    Elem& found = search (target, node);

    if (!found.valid())

        return false;

 

    if (node->is_leaf() && node->key_count() > node->minimum_keys())

        return node->vector_delete (target);

    else if (node->is_leaf()) {

        node->vector_delete (target);

        // loop invariant: if _node_ is not null_ptr, it points to a node

        // that has lost an element and needs to import one from a sibling

        // or merge with a sibling and import one from its parent.

        // after an iteration of the loop, _node_ may become null or

        // it may point to its parent if an element was imported from the

        // parent and this caused the parent to fall below the minimum

        // element count.

        while (node) {

            // NOTE: the "this" pointer may no longer be valid after the first

            // iteration of this loop!!!

            if (node==node->find_root() && node->is_leaf())

                break;

            if (node==node->find_root() && !node->is_leaf()) // sanity check

                throw "node should not be root in delete_element loop";

            // is an extra element available from the right sibling (if any)

            Node* right = node->right_sibling(parent_index_this);

            if (right && right->key_count() > right->minimum_keys())

                node = node->rotate_from_right(parent_index_this);

            else {

                // is an extra element available from the left sibling (if any)

                Node* left = node->left_sibling(parent_index_this);

                if (left && left->key_count() > left->minimum_keys())

                    node = node->rotate_from_left(parent_index_this);

                else if (right)

                    node = node->merge_right(parent_index_this);

                else if (left)

                    node = node->merge_left(parent_index_this);

            }

        }

    }

    else {

        Elem& smallest_in_subtree = found.mp_subtree->smallest_key_in_subtree();

        found.m_key = smallest_in_subtree.m_key;

        found.m_payload = smallest_in_subtree.m_payload;

        found.mp_subtree->delete_element (smallest_in_subtree);

    }

    return true;

} //___________________________________________________________________

 

Node* Node::rotate_from_right(int parent_index_this) {

    // new element to be added to this node

    Elem underflow_filler = (*mp_parent)[parent_index_this+1];

    // right sibling of this node

    Node* right_sib = (*mp_parent)[parent_index_this+1].mp_subtree;

    underflow_filler.mp_subtree = (*right_sib)[0].mp_subtree;

    // copy the entire element

    (*mp_parent)[parent_index_this+1] = (*right_sib)[1];

    // now restore correct pointer

    (*mp_parent)[parent_index_this+1].mp_subtree = right_sib;

    vector_insert (underflow_filler);

    right_sib->vector_delete(0);

    (*right_sib)[0].m_key = "";

    (*right_sib)[0].m_payload = "";

    return null_ptr; // parent node still has same element count

} //_______________________________________________________________________

 

Node* Node::rotate_from_left(int parent_index_this) {

    // new element to be added to this node

    Elem underflow_filler = (*mp_parent)[parent_index_this];

    // left sibling of this node

    Node* left_sib = (*mp_parent)[parent_index_this-1].mp_subtree;

    underflow_filler.mp_subtree = (*this)[0].mp_subtree;

    (*this)[0].mp_subtree = (*left_sib)[left_sib->m_count-1].mp_subtree;

    if ((*this)[0].mp_subtree)

        (*this)[0].mp_subtree->mp_parent = this;

    // copy the entire element

    (*mp_parent)[parent_index_this] = (*left_sib)[left_sib->m_count-1];

    // now restore correct pointer

    (*mp_parent)[parent_index_this].mp_subtree = this;

    vector_insert (underflow_filler);

    left_sib->vector_delete(left_sib->m_count-1);

    return null_ptr; // parent node still has same element count

} //_______________________________________________________________________

 

Node* Node::merge_right (int parent_index_this) {

// copy elements from the right sibling into this node, along with the

// element in the parent node vector that has the right sibling as it subtree.

// the right sibling and that parent element are then deleted

 

    Elem parent_elem = (*mp_parent)[parent_index_this+1];

    Node* right_sib = (*mp_parent)[parent_index_this+1].mp_subtree;

    parent_elem.mp_subtree = (*right_sib)[0].mp_subtree;

    vector_insert (parent_elem);

    for (int i=1; i<right_sib->m_count; i++)

        vector_insert ((*right_sib)[i]);

    mp_parent->vector_delete (parent_index_this+1);

    delete right_sib;

    if (mp_parent==find_root() && !mp_parent->key_count()) {

        m_root.set_root(m_root.get_root(), this);

        delete mp_parent;

        mp_parent = 0;

        return null_ptr;

    }

    else if (mp_parent==find_root() && mp_parent->key_count())

        return null_ptr;

    if (mp_parent&& mp_parent->key_count() >= mp_parent->minimum_keys())

        return null_ptr; // no need for parent to import an element

    return mp_parent; // parent must import an element

} //_______________________________________________________________________

 

Node* Node::merge_left (int parent_index_this) {

// copy all elements from this node into the left sibling, along with the

// element in the parent node vector that has this node as its subtree.

// this node and its parent element are then deleted.

 

    Elem parent_elem = (*mp_parent)[parent_index_this];

    parent_elem.mp_subtree = (*this)[0].mp_subtree;

    Node* left_sib = (*mp_parent)[parent_index_this-1].mp_subtree;

    left_sib->vector_insert (parent_elem);

    for (int i=1; i<m_count; i++)

        left_sib->vector_insert (m_vector[i]);

    mp_parent->vector_delete (parent_index_this);

    Node* parent_node = mp_parent;  // copy before deleting this node

    if (mp_parent==find_root() && !mp_parent->key_count()) {

        m_root.set_root(m_root.get_root(), left_sib);

        delete mp_parent;

        left_sib->mp_parent = null_ptr;

        delete this;

        return null_ptr;

    }

    else if (mp_parent==find_root() && mp_parent->key_count()) {

        delete this;

        return null_ptr;

    }

    delete this;

    if (parent_node->key_count() >= parent_node->minimum_keys())

        return null_ptr; // no need for parent to import an element

    return parent_node; // parent must import an element

 

} //_______________________________________________________________________

 

Node* Node::right_sibling (int& parent_index_this) {

    parent_index_this = index_has_subtree (); // for element with THIS as subtree

    if (parent_index_this == invalid_index)

        return 0;

    // now mp_parent is known not to be null

    if (parent_index_this >= mp_parent->m_count-1)

        return 0;  // no right sibling

    return mp_parent->m_vector[parent_index_this+1].mp_subtree;  // might be null

} //__________________________________________________________________________

 

Node* Node::left_sibling (int& parent_index_this) {

    parent_index_this = index_has_subtree (); // for element with THIS as subtree

    if (parent_index_this == invalid_index)

        return 0;

    // now mp_parent is known not to be null

    if (parent_index_this==0)

        return 0;  // no left sibling

    return mp_parent->m_vector[parent_index_this-1].mp_subtree;  // might be null

} //____________________________________________________________________________

 

int Node::index_has_subtree () {

// return the element in this node's parent that has the "this" pointer as its subtree

    if (!mp_parent)

        return invalid_index;

    int first = 0;

    int last = mp_parent->m_count-1;

    while (last-first > 1) {

        int mid = first+(last-first)/2;

        Elem& smallest = smallest_key();

        if (smallest_key()>=mp_parent->m_vector[mid])

            first = mid;

        else

            last = mid;

    }

    if (mp_parent->m_vector[first].mp_subtree == this)

        return first;

    else if (mp_parent->m_vector[last].mp_subtree == this)

        return last;

    else

        throw "error in index_has_subtree";

} //__________________________________________________________________

 

Elem& Node::smallest_key_in_subtree () {

    if (is_leaf())

        return m_vector[1];

    else

        return m_vector[0].mp_subtree->smallest_key_in_subtree();

} //___________________________________________________________________

 

Elem& Node::search (Elem& desired, Node*& last_visited_ptr) {

    // the zeroth element of the vector is a special case (no key value or

    // payload, just a subtree).  the seach starts at the *this node, not

    // at the root of the b-tree.

    Node* current = this;

    if (!key_count())

        current = 0;

    while (current) {

        last_visited_ptr = current;

        // if desired is less than all values in current node

        if (current->m_count>1 && desired<current->m_vector[1])

            current = current->m_vector[0].mp_subtree;

        // if desired is greater than all values in current node

        else if (desired>current->m_vector[current->m_count-1])

            current = current->m_vector[current->m_count-1].mp_subtree;

 

        else {

            // binary search of the node

            int first = 1;

            int last = current->m_count-1;

            while (last-first > 1) {

                int mid = first+(last-first)/2;

                if (desired>=current->m_vector[mid])

                    first = mid;

                else

                    last = mid;

            }

            if (current->m_vector[first]==desired)

                return current->m_vector[first];

            if (current->m_vector[last]==desired)

                return current->m_vector[last];

            else if (current->m_vector[last]>desired)

                current = current->m_vector[first].mp_subtree;

            else

                current = current->m_vector[last].mp_subtree;

        }

    }

 

    return m_failure;

 

} //_____________________________________________________________________

 

 

// initialize static data at file scope

Elem Node::m_failure = Elem();

 

// for the high-resolution timer

#include <windows.h>

 

int _tmain(int argc, _TCHAR* argv[])

{

// the main function is just some code to test the b-tree.  it inserts 100,000 elements,

// then searches for each of them, then deletes them in reverse order (also tested in

// forward order) and searches for all 100,000 elements after each deletion to ensure that

// all remaining elements remain accessible.

 

    __int64 frequency, start, end, total;

    QueryPerformanceFrequency( (LARGE_INTEGER *)&frequency );

 

    Node::m_failure.invalidate();

    Node::m_failure.m_key = "";

    Elem elem;

 

    RootTracker tracker;  // maintains a pointer to the current root of the b-tree

    Node* root_ptr = new Node(tracker);

    tracker.set_root (null_ptr, root_ptr);

   

    vector<string> search_vect;

    // prepare the key strings

    search_vect.resize (100000);

    int search_count = 0;

    for (int i=0; i<100000; i++) {

        strstream stm;

        stm << i;

        stm >> search_vect[search_count++];

    }

    string s;

    cout << "finished preparing key strings\n";

    QueryPerformanceCounter ( (LARGE_INTEGER *)&start);

    for (i=0; i<100000; i++) {

        elem.m_key = search_vect[i];

        elem.m_payload = search_vect[i]+" hi you";

        tracker.get_root()->tree_insert(elem);

    }

    cout << "finished inserting elements\n";

    Node * last;

    for (i=0; i<100000; i++) {

        Elem desired;

        desired.m_key = search_vect[i];

        Elem& result = tracker.get_root()->search (desired, last);

    }

    cout << "finished searching for elements\n";

    for (i=99999; i>=0; i--) {

        Elem target;

        target.m_key = search_vect[i];

        tracker.get_root()->delete_element (target);

        Node * last;

    }

 

    QueryPerformanceCounter ( (LARGE_INTEGER *)&end);

    total = (end-start)/(frequency/1000);

    cout << "total millisec for 100000 elements: " << (int)total << endl;

 

    cout << "after deletion" << endl;

    tracker.get_root()->dump();

    getchar();

    return 0;

}