From 8ec4f46f31b1d2f71d193952e595e2037fe7f22d Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Fri, 1 Mar 2019 21:53:13 -0800 Subject: trie: Revamp the API to support mappings --- eval.c | 13 ++-- trie.c | 217 +++++++++++++++++++++++++---------------------------------------- trie.h | 66 +++++++++++--------- 3 files changed, 127 insertions(+), 169 deletions(-) diff --git a/eval.c b/eval.c index 432adfc..d4097ef 100644 --- a/eval.c +++ b/eval.c @@ -1147,14 +1147,17 @@ static bool eval_file_unique(struct eval_state *state, struct trie *seen) { memcpy(id, &statbuf->dev, sizeof(statbuf->dev)); memcpy(id + sizeof(statbuf->dev), &statbuf->ino, sizeof(statbuf->ino)); - int ret = trie_meminsert(seen, id, sizeof(id)); - if (ret > 0) { - state->action = BFTW_SKIP_SUBTREE; - return false; - } else if (ret < 0) { + struct trie_leaf *leaf = trie_insert_mem(seen, id, sizeof(id)); + if (!leaf) { eval_report_error(state); return false; + } + + if (leaf->value) { + state->action = BFTW_SKIP_SUBTREE; + return false; } else { + leaf->value = leaf; return true; } } diff --git a/trie.c b/trie.c index 75acd22..fffa4ac 100644 --- a/trie.c +++ b/trie.c @@ -96,7 +96,7 @@ #define OFFSET_MAX (((size_t)1 << OFFSET_BITS) - 1) /** - * A node in the trie. + * An internal node of the trie. */ struct trie_node { /** @@ -126,13 +126,13 @@ static bool trie_is_leaf(uintptr_t ptr) { } /** Decode a pointer to a leaf. */ -static void *trie_decode_leaf(uintptr_t ptr) { +static struct trie_leaf *trie_decode_leaf(uintptr_t ptr) { assert(trie_is_leaf(ptr)); - return (void *)(ptr ^ 1); + return (struct trie_leaf *)(ptr ^ 1); } /** Encode a pointer to a leaf. */ -static uintptr_t trie_encode_leaf(const void *leaf) { +static uintptr_t trie_encode_leaf(const struct trie_leaf *leaf) { uintptr_t ptr = (uintptr_t)leaf ^ 1; assert(trie_is_leaf(ptr)); return ptr; @@ -186,8 +186,14 @@ static unsigned char trie_key_nibble(const void *key, size_t offset) { return (bytes[byte] >> shift) & 0xF; } -/** Find a leaf that might be the given key, if present. */ -static const void *trie_search(const struct trie *trie, const void *key, size_t size) { +/** + * Finds a leaf in the trie that matches the key at every branch. If the key + * exists in the trie, the representative will match the searched key. But + * since only branch points are tested, it can be different from the key. In + * that case, the first mismatch between the key and the representative will be + * the depth at which to make a new branch to insert the key. + */ +static struct trie_leaf *trie_representative(const struct trie *trie, const void *key, size_t length) { uintptr_t ptr = trie->root; if (!ptr) { return NULL; @@ -197,45 +203,44 @@ static const void *trie_search(const struct trie *trie, const void *key, size_t while (!trie_is_leaf(ptr)) { struct trie_node *node = trie_decode_node(ptr); offset += node->offset; - if ((offset >> 1) >= size) { - return NULL; - } - unsigned char nibble = trie_key_nibble(key, offset); + unsigned char nibble = 0; + if ((offset >> 1) < length) { + nibble = trie_key_nibble(key, offset); + } unsigned int bit = 1U << nibble; + unsigned int index = 0; if (node->bitmap & bit) { - unsigned int index = trie_popcount(node->bitmap & (bit - 1)); - ptr = node->children[index]; - } else { - return NULL; + index = trie_popcount(node->bitmap & (bit - 1)); } + ptr = node->children[index]; } return trie_decode_leaf(ptr); } -bool trie_strsearch(const struct trie *trie, const char *key) { - const char *rep = trie_search(trie, key, strlen(key) + 1); - return rep && strcmp(key, rep) == 0; +struct trie_leaf *trie_find_str(const struct trie *trie, const char *key) { + return trie_find_mem(trie, key, strlen(key) + 1); } -bool trie_strnsearch(const struct trie *trie, const char *key, size_t size) { - const char *rep = trie_search(trie, key, size); - return rep && strncmp(key, rep, size) == 0; -} - -bool trie_memsearch(const struct trie *trie, const void *key, size_t size) { - const void *rep = trie_search(trie, key, size); - return rep && memcmp(key, rep, size) == 0; +struct trie_leaf *trie_find_mem(const struct trie *trie, const void *key, size_t length) { + struct trie_leaf *rep = trie_representative(trie, key, length); + if (rep && rep->length == length && memcmp(rep->key, key, length) == 0) { + return rep; + } else { + return NULL; + } } /** Create a new leaf, holding a copy of the given key. */ -static uintptr_t new_trie_leaf(const void *key, size_t size) { - void *leaf = malloc(size); +static struct trie_leaf *new_trie_leaf(const void *key, size_t length) { + struct trie_leaf *leaf = malloc(sizeof(*leaf) + length); if (leaf) { - memcpy(leaf, key, size); + leaf->value = NULL; + leaf->length = length; + memcpy(leaf->key, key, length); } - return trie_encode_leaf(leaf); + return leaf; } /** Compute the size of a trie node with a certain number of children. */ @@ -248,50 +253,21 @@ static size_t trie_node_size(unsigned int size) { return sizeof(struct trie_node) + size*sizeof(uintptr_t); } -/** - * Finds a leaf in the trie that matches the key at every branch. If the key - * exists in the trie, the representative will match the searched key. But - * since only branch points are tested, it can be different from the key. In - * that case, the first mismatch between the key and the representative will be - * the depth at which to make a new branch to insert the key. - */ -static const void *trie_representative(const struct trie *trie, const void *key, size_t size) { - size_t offset = 0; - uintptr_t ptr = trie->root; - while (!trie_is_leaf(ptr)) { - struct trie_node *node = trie_decode_node(ptr); - offset += node->offset; - - unsigned char nibble = 0; - if ((offset >> 1) < size) { - nibble = trie_key_nibble(key, offset); - } - unsigned int bit = 1U << nibble; - unsigned int index = 0; - if (node->bitmap & bit) { - index = trie_popcount(node->bitmap & (bit - 1)); - } - ptr = node->children[index]; - } - - return trie_decode_leaf(ptr); -} - /** Find the offset of the first nibble that differs between two keys. */ -static size_t trie_key_mismatch(const void *key1, const void *key2, size_t size) { +static size_t trie_key_mismatch(const void *key1, const void *key2, size_t length) { const unsigned char *bytes1 = key1; const unsigned char *bytes2 = key2; size_t i = 0; size_t offset = 0; const size_t chunk = sizeof(size_t); - for (; i + chunk <= size; i += chunk) { + for (; i + chunk <= length; i += chunk) { if (memcmp(bytes1 + i, bytes2 + i, chunk) != 0) { break; } } - for (; i < size; ++i) { + for (; i < length; ++i) { unsigned char b1 = bytes1[i], b2 = bytes2[i]; if (b1 != b2) { offset = (b1 & 0xF) == (b2 & 0xF); @@ -325,7 +301,7 @@ static size_t trie_key_mismatch(const void *key1, const void *key2, size_t size) * | Z * +--->... */ -static int trie_node_insert(uintptr_t *ptr, const void *key, size_t key_size, size_t offset) { +static struct trie_leaf *trie_node_insert(uintptr_t *ptr, const void *key, size_t length, size_t offset) { struct trie_node *node = trie_decode_node(*ptr); unsigned int size = trie_popcount(node->bitmap); @@ -333,14 +309,14 @@ static int trie_node_insert(uintptr_t *ptr, const void *key, size_t key_size, si if ((size & (size - 1)) == 0) { node = realloc(node, trie_node_size(2*size)); if (!node) { - return -1; + return NULL; } *ptr = trie_encode_node(node); } - uintptr_t leaf = new_trie_leaf(key, key_size); + struct trie_leaf *leaf = new_trie_leaf(key, length); if (!leaf) { - return -1; + return NULL; } unsigned char nibble = trie_key_nibble(key, offset); @@ -355,8 +331,8 @@ static int trie_node_insert(uintptr_t *ptr, const void *key, size_t key_size, si if (index < size) { memmove(child + 1, child, (size - index)*sizeof(*child)); } - *child = leaf; - return 0; + *child = trie_encode_leaf(leaf); + return leaf; } /** @@ -370,21 +346,21 @@ static int trie_node_insert(uintptr_t *ptr, const void *key, size_t key_size, si * * and changes it to: * - * ptr - * | - * v - * *--->*--->rep + * ptr ret + * | | + * v v + * *--->*--->rep * * so that a new key can be inserted like: * - * ptr - * | - * v X - * *--->*--->rep - * | Y - * +--->key + * ptr ret + * | | + * v v X + * *--->*--->rep + * | Y + * +--->key */ -static uintptr_t *trie_jump(uintptr_t *ptr, const char *key, size_t size, size_t *offset) { +static uintptr_t *trie_jump(uintptr_t *ptr, const char *key, size_t *offset) { // We only ever need to jump to leaf nodes, since internal nodes are // guaranteed to be within OFFSET_MAX anyway assert(trie_is_leaf(*ptr)); @@ -422,20 +398,20 @@ static uintptr_t *trie_jump(uintptr_t *ptr, const char *key, size_t size, size_t * | Y * +--->key */ -static int trie_split(uintptr_t *ptr, const void *key, size_t size, const void *rep, size_t offset, size_t mismatch) { +static struct trie_leaf *trie_split(uintptr_t *ptr, const void *key, size_t length, struct trie_leaf *rep, size_t offset, size_t mismatch) { unsigned char key_nibble = trie_key_nibble(key, mismatch); - unsigned char rep_nibble = trie_key_nibble(rep, mismatch); + unsigned char rep_nibble = trie_key_nibble(rep->key, mismatch); assert(key_nibble != rep_nibble); struct trie_node *node = malloc(trie_node_size(2)); if (!node) { - return -1; + return NULL; } - uintptr_t leaf = new_trie_leaf(key, size); + struct trie_leaf *leaf = new_trie_leaf(key, length); if (!leaf) { free(node); - return -1; + return NULL; } node->bitmap = (1 << key_nibble) | (1 << rep_nibble); @@ -448,32 +424,30 @@ static int trie_split(uintptr_t *ptr, const void *key, size_t size, const void * node->offset = delta; unsigned int key_index = key_nibble > rep_nibble; - node->children[key_index] = leaf; + node->children[key_index] = trie_encode_leaf(leaf); node->children[key_index ^ 1] = *ptr; *ptr = trie_encode_node(node); - return 0; + return leaf; } -/** - * Insert a key into the trie. - * - * @param trie - * The trie to modify. - * @param key - * The key to insert. - * @param size - * The size of the key in bytes. - * @param rep - * The representative for the key. - * @param mismatch - * The index of the first mismatched nibble between the key and the - * representative. - * @return - * 0 on success, 1 if the key was already present, or -1 on error. - */ -static int trie_insert(struct trie *trie, const void *key, size_t size, const void *rep, size_t mismatch) { - if ((mismatch >> 1) >= size) { - return 1; +struct trie_leaf *trie_insert_str(struct trie *trie, const char *key) { + return trie_insert_mem(trie, key, strlen(key) + 1); +} + +struct trie_leaf *trie_insert_mem(struct trie *trie, const void *key, size_t length) { + struct trie_leaf *rep = trie_representative(trie, key, length); + if (!rep) { + struct trie_leaf *leaf = new_trie_leaf(key, length); + if (leaf) { + trie->root = trie_encode_leaf(leaf); + } + return leaf; + } + + size_t limit = length < rep->length ? length : rep->length; + size_t mismatch = trie_key_mismatch(key, rep->key, limit); + if ((mismatch >> 1) >= length) { + return rep; } size_t offset = 0; @@ -493,43 +467,18 @@ static int trie_insert(struct trie *trie, const void *key, size_t size, const vo ptr = node->children + index; } else { assert(offset == mismatch); - return trie_node_insert(ptr, key, size, offset); + return trie_node_insert(ptr, key, length, offset); } } while (mismatch - offset > OFFSET_MAX) { - ptr = trie_jump(ptr, key, size, &offset); + ptr = trie_jump(ptr, key, &offset); if (!ptr) { - return -1; + return NULL; } } - return trie_split(ptr, key, size, rep, offset, mismatch); -} - -int trie_strinsert(struct trie *trie, const char *key) { - size_t size = strlen(key) + 1; - - if (!trie->root) { - trie->root = new_trie_leaf(key, size); - return trie->root ? 0 : -1; - } - - const char *rep = trie_representative(trie, key, size); - size_t limit = strnlen(rep, size - 1) + 1; - size_t mismatch = trie_key_mismatch(key, rep, limit); - return trie_insert(trie, key, size, rep, mismatch); -} - -int trie_meminsert(struct trie *trie, const void *key, size_t size) { - if (!trie->root) { - trie->root = new_trie_leaf(key, size); - return trie->root ? 0 : -1; - } - - const void *rep = trie_representative(trie, key, size); - size_t mismatch = trie_key_mismatch(key, rep, size); - return trie_insert(trie, key, size, rep, mismatch); + return trie_split(ptr, key, length, rep, offset, mismatch); } /** Free an encoded pointer to a node. */ diff --git a/trie.h b/trie.h index e81f80e..fd91f0c 100644 --- a/trie.h +++ b/trie.h @@ -22,82 +22,88 @@ #include /** - * A trie that holds a set of fixed- or dynamic-length strings. + * A trie that holds a set of fixed- or variable-length strings. */ struct trie { uintptr_t root; }; /** - * Initialize an empty trie. + * A leaf of a trie. */ -void trie_init(struct trie *trie); +struct trie_leaf { + /** + * An arbitrary value associated with this leaf. + */ + const void *value; + + /** + * The length of the key in bytes. + */ + size_t length; + + /** + * The key itself, stored inline. + */ + char key[]; +}; /** - * Check if a NUL-terminated string key exists in a trie. - * - * @param trie - * The trie to search. - * @param key - * The key to search for. - * @return - * Whether the key was found. + * Initialize an empty trie. */ -bool trie_strsearch(const struct trie *trie, const char *key); +void trie_init(struct trie *trie); /** - * Check if a prefix of a NUL-terminated string key exists in a trie. + * Find the leaf for a string key. * * @param trie * The trie to search. * @param key - * The key to search for. - * @param size - * The length of the prefix to consider, in bytes. + * The key to look up. * @return - * Whether the key was found. + * The found leaf, or NULL if the key is not present. */ -bool trie_strnsearch(const struct trie *trie, const char *key, size_t size); +struct trie_leaf *trie_find_str(const struct trie *trie, const char *key); /** - * Check if a fixed-length key exists in a trie. + * Find the leaf for a fixed-size key. * * @param trie * The trie to search. * @param key - * The key to search for. - * @param size + * The key to look up. + * @param length * The length of the key in bytes. * @return - * Whether the key was found. + * The found leaf, or NULL if the key is not present. */ -bool trie_memsearch(const struct trie *trie, const void *key, size_t size); +struct trie_leaf *trie_find_mem(const struct trie *trie, const void *key, size_t length); /** - * Insert a NUL-terminated string key in the trie. + * Insert a string key into the trie. * * @param trie * The trie to modify. * @param key * The key to insert. * @return - * 0 on success, 1 if the key was already present, or -1 on error. + * The inserted leaf, or NULL on failure. */ -int trie_strinsert(struct trie *trie, const char *key); +struct trie_leaf *trie_insert_str(struct trie *trie, const char *key); /** - * Insert a fixed-length key in the trie. + * Insert a fixed-size key into the trie. * * @param trie * The trie to modify. * @param key * The key to insert. - * @param size + * @param length * The length of the key in bytes. * @return - * 0 on success, 1 if the key was already present, or -1 on error. + * The inserted leaf, or NULL on failure. */ -int trie_meminsert(struct trie *trie, const void *key, size_t size); +struct trie_leaf *trie_insert_mem(struct trie *trie, const void *key, size_t length); /** * Destroy a trie and its contents. -- cgit v1.2.3