#include #include #include "index.h" #include "io.h" #include "common.h" #include "bitmap.h" #include #include #include #define SWAP(type, x, y) do { type __tmp__; __tmp__ = y; y = x; x = __tmp__; } while (0) /* private functions */ void _btree_insert_into_node(btree_t *tree, btree_node_t *node, btree_entry_t *entry); void _btree_remove_from_node(btree_t *tree, btree_node_t *node, unsigned index); void _btree_optimize_if_needed(btree_t *tree) ; int _btree_save_header(btree_t *tree) { return file_write(tree->file, 0, &tree->header, sizeof(btree_header_t)); } page_t _btree_alloc(btree_t *tree) { page_t page = 1, bitmap_page = 1; long long bit; uint8_t bitmap[PAGE_SIZE]; do { file_read(tree->file, bitmap_page, bitmap, PAGE_SIZE); bit = bitmap_find_first(bitmap, PAGE_SIZE); if (bit == -1) { bitmap_page += PAGE_SIZE * 8 + 1; page += PAGE_SIZE * 8 + 1; } } while (bit == -1); bitmap_set(bitmap, bit); file_write(tree->file, bitmap_page, bitmap, PAGE_SIZE); page += bit + 1; printfv(VERBOSITY_VERBOSE, "[alloc] allocated page %zu.\n", page); return page; } void _btree_page_free(btree_t *tree, page_t page) { printfv(VERBOSITY_VERBOSE, "[alloc] freeing page %zu.\n", page); uint8_t bitmap[PAGE_SIZE]; file_read(tree->file, 1 + (page / (PAGE_SIZE * 8)) * PAGE_SIZE, bitmap, PAGE_SIZE); bitmap_unset(bitmap, page - 2); file_write(tree->file, 1, bitmap, PAGE_SIZE); } btree_node_t* _btree_load_node(btree_node_t *node, btree_t *tree, page_t page) { char buffer[PAGE_SIZE]; file_read(tree->file, page, buffer, PAGE_SIZE); memcpy(&node->header, buffer, sizeof(btree_node_header_t)); memcpy(node->entries, buffer + sizeof(btree_node_header_t), NODE_SIZE_MAX); node->page = page; return node; } void _btree_write_node(btree_node_t *node, btree_t *tree, page_t page) { char buffer[PAGE_SIZE] = {}; node->page = page; memcpy(buffer, &node->header, sizeof(btree_node_header_t)); memcpy(buffer + sizeof(btree_node_header_t), node->entries, NODE_SIZE(node) - sizeof(btree_node_header_t)); file_write(tree->file, page, buffer, NODE_SIZE(node)); } bool _btree_stack_empty(btree_t *tree) { return tree->trace.current < tree->trace.stack; } void _btree_stack_push(btree_t *tree, btree_node_t *node, unsigned n) { printfv(VERBOSITY_VERBOSE, "[btree] Pushing %zu page on parent stack, entry index %u\n", node->page, n); tree->trace.current++; tree->trace.current->node = *node; tree->trace.current->entry = btree_get_entry(&tree->trace.current->node, n); tree->trace.current->position = n; } btree_stack_elem_t* _btree_stack_pop(btree_t *tree) { if (_btree_stack_empty(tree)) { return NULL; } return tree->trace.current--; } void _btree_stack_clear(btree_t *tree) { tree->trace.current = tree->trace.stack - 1; } unsigned _btree_concat_entries(void* current, size_t na, void* added, size_t nb) { // allocate temporary buffer on a stack void* buffer = alloca(SIZEOF_ENTRIES(na + nb)); unsigned k = 0; page_t last = PAGE_NONE; for (unsigned i = 0, j = 0; i < na || j < nb; k++) { btree_entry_t *dest = NODE_ENTRY(buffer, k); if (i >= na) { *dest = *NODE_ENTRY(added, j++); } else if (j >= nb) { *dest = *NODE_ENTRY(current, i++); } else { btree_entry_t *a = NODE_ENTRY(current, i); btree_entry_t *b = NODE_ENTRY(added, j); if (a->key < b->key) { *dest = *a; i++; } else { *dest = *b; j++; } } if (last) { dest->left = last; } last = dest->right; } // copy from buffer to first operand memcpy(current, buffer, SIZEOF_ENTRIES(na + nb)); return k; } void _btree_node_insert_entry(btree_node_t* node, btree_entry_t entry) { printfv(VERBOSITY_VERBOSE, "[btree] inserting { %zu < %zu > %zu } on %zu\n", entry.left, entry.key, entry.right, node->page); _btree_concat_entries(node->entries, node->header.entries, &entry, 1); node->header.entries++; } void _btree_init(btree_t *tree, char* mode) { char path[PATH_MAX]; strcpy(path, tree->header.main); char *dir = dirname(path); sprintf(path, "%s/%s", dir, tree->header.main); _btree_stack_clear(tree); tree->main = tape_open(path, mode); } int btree_init(btree_t *tree, char* filename, size_t d) { char path[PATH_MAX]; strcpy(path, filename); char* ext = strrchr(path, '.'); if (ext) *ext = 0; sprintf(tree->header.main, "%s.dat", path); printfv(VERBOSITY_DEBUG, "[btree] Initializing btree in file %s d=%zu, main=%s.\n", filename, d, tree->header.main); tree->file = file_open(filename, "w+"); if (!tree->file) { fprintf(stderr, "Cannot open file %s.\n", filename); return BTREE_ERR_CANNOT_OPEN_FILE; } tree->header.d = d; tree->header.page_size = PAGE_SIZE; tree->header.root = _btree_alloc(tree); tree->header.records = 0; tree->header.changes = 0; _btree_save_header(tree); btree_node_t root; root.header.flags = NODE_IS_ROOT | NODE_IS_LEAF; root.header.entries = 0; _btree_write_node(&root, tree, tree->header.root); _btree_init(tree, "w"); return 0; } int btree_open(btree_t *tree, char* filename) { printfv(VERBOSITY_DEBUG, "[btree] Trying to open btree index in file %s\n", filename); tree->file = file_open(filename, "r+"); if (!tree->file) { fprintf(stderr, "Cannot open file %s.\n", filename); return BTREE_ERR_PAGE_SIZE_DIFFERENT; } file_read(tree->file, 0, &tree->header, sizeof(btree_header_t)); printfv(VERBOSITY_DEBUG, "[btree] Found BTREE d=%zu, ps=%zu, root=%" PRIu64 ", mainfile=%s\n", tree->header.d, tree->header.page_size, tree->header.root, tree->header.main); if (tree->header.page_size != PAGE_SIZE) { fprintf(stderr, "BTree page size mismatch, expecting %d got %zu, closing.\n", PAGE_SIZE, tree->header.page_size); file_close(tree->file); return BTREE_ERR_PAGE_SIZE_DIFFERENT; } _btree_init(tree, "rb+"); return 0; } btree_siblings_t _btree_get_siblings(btree_t *tree, page_t page) { btree_siblings_t result = { PAGE_NONE, PAGE_NONE, NULL, NULL }; if (_btree_stack_empty(tree)) { return result; } btree_node_t *parent = &tree->trace.current->node; size_t total = parent->header.entries + 1; for (unsigned i = 0; i < total; i++) { page_t current = *(page_t*)NODE_ENTRY(parent->entries, i); if (current == page) { if (i > 0) { result.left = *(page_t*)NODE_ENTRY(parent->entries, i - 1); result.left_entry = NODE_ENTRY(parent->entries, i - 1); } if (i < total) { result.right = *(page_t*)NODE_ENTRY(parent->entries, i + 1); result.right_entry = NODE_ENTRY(parent->entries, i); } break; } } return result; } void _btree_rebalance(btree_t *tree, btree_node_t *left, btree_node_t *right, btree_entry_t* parent) { btree_node_t *pnode = &tree->trace.current->node; size_t nl = left->header.entries; size_t nr = right->header.entries; size_t n = nl; btree_entry_t rotated = *parent; rotated.left = NODE_ENTRY(left->entries, nl - 1)->right; rotated.right = NODE_ENTRY(right->entries, 0)->left; void* buffer = alloca(SIZEOF_ENTRIES(nl + nr + 1)); memcpy(buffer, left->entries, SIZEOF_ENTRIES(nl)); n = _btree_concat_entries(buffer, n, right->entries, nr); n = _btree_concat_entries(buffer, n, &rotated, 1); size_t pivot = n / 2; left->header.entries = 0; right->header.entries = 0; for (unsigned i = 0; i < n; i++) { btree_entry_t* entry = NODE_ENTRY(buffer, i); if (i < pivot) { _btree_node_insert_entry(left, *entry); } else if (i == pivot) { parent->key = entry->key; parent->location = entry->location; } else { _btree_node_insert_entry(right, *entry); } } _btree_write_node(left, tree, left->page); _btree_write_node(right, tree, right->page); _btree_write_node(pnode, tree, pnode->page); } bool _btree_compensate_insert(btree_t *tree, btree_node_t *old, btree_entry_t *entry) { btree_siblings_t siblings = _btree_get_siblings(tree, old->page); btree_node_t other; if (siblings.left) { _btree_load_node(&other, tree, siblings.left); if (other.header.entries < 2*tree->header.d) { _btree_node_insert_entry(old, *entry); _btree_rebalance(tree, &other, old, siblings.left_entry); return true; } } if (siblings.right) { _btree_load_node(&other, tree, siblings.right); if (other.header.entries < 2*tree->header.d) { _btree_node_insert_entry(old, *entry); _btree_rebalance(tree, old, &other, siblings.right_entry); return true; } } return false; } void _btree_split_node(btree_t *tree, btree_node_t *old, btree_entry_t *entry) { btree_node_t *parent = NULL; size_t n = old->header.entries; size_t half = n / 2; void *buffer = alloca(SIZEOF_ENTRIES(n + 1)); memcpy(buffer, old->entries, SIZEOF_ENTRIES(n)); _btree_concat_entries(buffer, n, entry, 1); if (!_btree_stack_empty(tree)) { btree_stack_elem_t *trace = _btree_stack_pop(tree); parent = &trace->node; } btree_node_t left = { .header = { .flags = old->header.flags & NODE_IS_LEAF, .entries = 0, }, .page = old->page, }; btree_node_t right = { .header = { .flags = old->header.flags & NODE_IS_LEAF, .entries = 0, }, .page = _btree_alloc(tree), }; if (!parent) { parent = alloca(sizeof(btree_node_t)); parent->header = (btree_node_header_t){ .flags = NODE_IS_ROOT, .entries = 0, }; parent->page = _btree_alloc(tree); tree->header.root = parent->page; _btree_save_header(tree); printfv(VERBOSITY_DEBUG, "[btree] Designated new root %zu.\n", parent->page); }; printfv(VERBOSITY_DEBUG, "[btree] Spliting node %zu.\n", old->page); btree_entry_t parent_entry = *NODE_ENTRY(buffer, half); parent_entry.left = left.page; parent_entry.right = right.page; _btree_insert_into_node(tree, parent, &parent_entry); for (unsigned int i = 0; i < half; i++) { _btree_node_insert_entry(&left, *NODE_ENTRY(buffer, i)); _btree_node_insert_entry(&right, *NODE_ENTRY(buffer, half + i + 1)); } _btree_write_node(&left, tree, left.page); _btree_write_node(&right, tree, right.page); } void _btree_insert_into_node(btree_t *tree, btree_node_t *node, btree_entry_t *entry) { printfv(VERBOSITY_DEBUG, "[btree] Inserting onto page %zu.\n", node->page); if (node->header.entries < 2 * tree->header.d) { _btree_node_insert_entry(node, *entry); _btree_write_node(node, tree, node->page); return; } printfv(VERBOSITY_DEBUG, "[btree] %zu Overflow!\n", node->page); if (_btree_compensate_insert(tree, node, entry)) { return; } printfv(VERBOSITY_DEBUG, "[btree] Unable to compensate %zu!\n", node->page); _btree_split_node(tree, node, entry); } page_t btree_find(btree_t *tree, record_key_t key, btree_entry_t* dest, btree_node_t *node, unsigned *index) { if (!node) { node = alloca(sizeof(btree_node_t)); } page_t page = tree->header.root; btree_entry_t *entry; _btree_stack_clear(tree); do { _btree_load_node(node, tree, page); for(unsigned i = 0; i < node->header.entries; i++) { entry = btree_get_entry(node, i); if (entry->key == key) { if (dest) memcpy(dest, entry, sizeof(btree_entry_t)); if (index) *index = i; return page; } if (entry->key > key && entry->left) { page = entry->left; _btree_stack_push(tree, node, i); break; } } if (page == node->page && ~node->header.flags & NODE_IS_LEAF) { page = entry->right; _btree_stack_push(tree, node, node->header.entries - 1); } } while (page && ~node->header.flags & NODE_IS_LEAF); return PAGE_NONE; } page_t btree_insert(btree_t *tree, record_t record) { record_key_t key = record.key; btree_node_t node; if (btree_find(tree, key, NULL, &node, NULL) != PAGE_NONE) { printfv(VERBOSITY_DEBUG, "[btree] Record with key %zu already exists.\n", key); return PAGE_NONE; } offset_t offset = tape_append(tree->main, &record, sizeof(record)); btree_entry_t entry = { PAGE_NONE, key, offset, PAGE_NONE }; _btree_insert_into_node(tree, &node, &entry); tree->header.changes++; tree->header.records++; _btree_optimize_if_needed(tree); return node.page; } void _btree_merge(btree_t *tree, btree_node_t* left, btree_node_t* right) { printfv(VERBOSITY_DEBUG, "[btree] Merging %zu with %zu.\n", left->page, right->page); size_t nl = left->header.entries, nr = right->header.entries, n = 0; btree_stack_elem_t *parent = _btree_stack_pop(tree); btree_entry_t middle = *parent->entry; middle.left = NODE_ENTRY(left->entries, nl - 1)->right; middle.right = NODE_ENTRY(right->entries, 0)->left; void *buffer = alloca(SIZEOF_ENTRIES(nl + nr + 1)); n = _btree_concat_entries(buffer, n, left->entries, nl); n = _btree_concat_entries(buffer, n, &middle, 1); n = _btree_concat_entries(buffer, n, right->entries, nr); memcpy(left->entries, buffer, SIZEOF_ENTRIES(n)); left->header.entries = n; _btree_page_free(tree, right->page); parent->entry->left = left->page; parent->entry->right = left->page; _btree_remove_from_node(tree, &parent->node, parent->position); if (parent->node.header.flags & NODE_IS_ROOT && parent->node.header.entries == 0) { _btree_page_free(tree, parent->node.page); left->header.flags |= NODE_IS_ROOT; tree->header.root = left->page; printfv(VERBOSITY_DEBUG, "[btree] Designated new root %zu.\n", left->page); } else { _btree_write_node(&parent->node, tree, parent->node.page); } _btree_write_node(left, tree, left->page); } void _btree_fix_underflow(btree_t *tree, btree_node_t *node) { btree_node_t sibling; btree_siblings_t siblings = _btree_get_siblings(tree, node->page); btree_entry_t *entry; if (siblings.left != PAGE_NONE) { _btree_load_node(&sibling, tree, siblings.left); entry = siblings.left_entry; if (sibling.header.entries + node->header.entries >= 2*tree->header.d) { printfv(VERBOSITY_VERBOSE, "[btree] rebalancing with left sibling %zu.\n", siblings.left); _btree_rebalance(tree, &sibling, node, siblings.left_entry); return; } } if (siblings.right != PAGE_NONE) { _btree_load_node(&sibling, tree, siblings.right); entry = siblings.right_entry; if (sibling.header.entries + node->header.entries >= 2*tree->header.d) { printfv(VERBOSITY_VERBOSE, "[btree] rebalancing with right sibling %zu.\n", siblings.right); _btree_rebalance(tree, node, &sibling, siblings.right_entry); return; } } printfv(VERBOSITY_DEBUG, "[btree] Enable to rebalance.\n"); if (siblings.right != PAGE_NONE) { _btree_merge(tree, node, &sibling); } else { _btree_merge(tree, &sibling, node); } } void _btree_remove_from_node(btree_t *tree, btree_node_t *node, unsigned index) { printfv(VERBOSITY_DEBUG, "[btree] Removing %u entry from %zu.\n", index, node->page); size_t n = node->header.entries; void *buffer = alloca(SIZEOF_ENTRIES(n)); btree_entry_t* current; unsigned m = 0; for (unsigned i = 0; i < n; i++) { current = NODE_ENTRY(node->entries, i); if (i != index) { m = _btree_concat_entries(buffer, m, current, 1); } } memcpy(node->entries, buffer, SIZEOF_ENTRIES(node->header.entries)); node->header.entries = m; if ((~node->header.flags & NODE_IS_ROOT) && node->header.entries < tree->header.d) { printfv(VERBOSITY_DEBUG, "[btree] Underflow in %zu.\n", node->page); _btree_fix_underflow(tree, node); } else { _btree_write_node(node, tree, node->page); } } page_t btree_remove(btree_t *tree, record_key_t key) { btree_node_t node; btree_entry_t *entry; unsigned index; if (btree_find(tree, key, NULL, &node, &index) == PAGE_NONE) { printfv(VERBOSITY_DEBUG, "[btree] Record with key %zu does not exist.\n", key); // 404 return PAGE_NONE; } entry = btree_get_entry(&node, index); if (node.header.flags & NODE_IS_LEAF) { printfv(VERBOSITY_DEBUG, "[btree] Removing record with key %zu from leaf %zu.\n", entry->key, node.page); _btree_remove_from_node(tree, &node, index); _btree_write_node(&node, tree, node.page); } else { printfv(VERBOSITY_DEBUG, "[btree] Removing record with key %zu from node %zu.\n", entry->key, node.page); btree_node_t *replacement; btree_entry_t *replaced; _btree_stack_push(tree, &node, index); replacement = &tree->trace.current->node; _btree_load_node(&node, tree, entry->right); while (~node.header.flags & NODE_IS_LEAF) { _btree_stack_push(tree, &node, 0); replaced = NODE_ENTRY(node.entries, 0); _btree_load_node(&node, tree, replaced->left); } replaced = NODE_ENTRY(node.entries, 0); entry = btree_get_entry(replacement, index); printfv(VERBOSITY_DEBUG, "[btree] Exchanging %zu with %zu.\n", entry->key, replaced->key); SWAP(record_key_t, replaced->key, entry->key); SWAP(offset_t, replaced->location, entry->location); _btree_write_node(replacement, tree, replacement->page); _btree_remove_from_node(tree, &node, 0); } tree->header.changes++; tree->header.records--; _btree_optimize_if_needed(tree); return node.page; } bool btree_update(btree_t *tree, record_key_t key, record_t record) { if (record.key != key) { printfv(VERBOSITY_DEBUG, "[btree] Update key mismatch, removing %zu and adding %zu.\n", key, record.key); btree_remove(tree, key); return btree_insert(tree, record) != PAGE_NONE; } btree_entry_t entry; if (btree_find(tree, key, &entry, NULL, NULL) == PAGE_NONE) { printfv(VERBOSITY_DEBUG, "[btree] Record with key %zu not found.\n", key, record.key); return false; } tape_write(tree->main, entry.location, &record, sizeof(record)); return true; } btree_entry_t *btree_get_entry(btree_node_t* node, unsigned n) { if (n > node->header.entries) { return NULL; } return (btree_entry_t*)(node->entries + NODE_ENTRY_OFFSET(n)); } page_t btree_read_record(btree_t *tree, record_key_t key, record_t *record) { btree_entry_t entry; page_t page = btree_find(tree, key, &entry, NULL, NULL); if (page == PAGE_NONE) { return PAGE_NONE; } tape_read(tree->main, entry.location, record, sizeof(record_t)); return page; } void btree_close(btree_t* tree) { printfv(VERBOSITY_DEBUG, "[btree] Closing index %s.\n", tree->file->filename); _btree_save_header(tree); file_close(tree->file); tape_close(tree->main); } void btree_flush(btree_t* tree) { file_flush(tree->file); file_flush(tree->main->file); } void _btree_optimize_if_needed(btree_t *tree) { double ratio = (double)tree->header.changes / (double)tree->header.records; if (tree->header.records > BTREE_OPTIMIZE_RECORDS_MIN && ratio > BTREE_OPTIMIZE_THRESHOLD) { printfv(VERBOSITY_DEBUG, "[btree] Auto optimization triggered.\n"); btree_optimize(tree); } } void _btree_optimize_page(btree_t* tree, tape_t* tmp, page_t page) { if (!page) return; btree_node_t node; btree_entry_t *entry; record_t record; _btree_load_node(&node, tree, page); for (int i = 0; i < node.header.entries; i++) { entry = btree_get_entry(&node, i); _btree_optimize_page(tree, tmp, entry->left); tape_read(tree->main, entry->location, &record, sizeof(record_t)); entry->location = tape_append(tmp, &record, sizeof(record_t)); } _btree_write_node(&node, tree, node.page); _btree_optimize_page(tree, tmp, entry->right); } void btree_optimize(btree_t *tree) { char tmp[PATH_MAX], path[PATH_MAX - 10], backup[PATH_MAX]; strcpy(path, tree->main->file->filename); sprintf(tmp, "%s.tmp", path); sprintf(backup, "%s.bckp", path); printfv(VERBOSITY_DEBUG, "[btree] Optimizing btree using temp file %s.\n", tmp); tape_t *temp = tape_open(tmp, "wb"); _btree_optimize_page(tree, temp, tree->header.root); tape_close(temp); tape_close(tree->main); printfv(VERBOSITY_DEBUG, "[btree] Renaming %s to %s.\n", path, backup); rename(path, backup); printfv(VERBOSITY_DEBUG, "[btree] Renaming %s to %s.\n", tmp, path); if (rename(tmp, path) == 0) { tree->main = tape_open(path, "rb+"); tree->header.changes = 0; } }