ccdc6037fe
When parse_object is called, we do the following: 1. read the object data into a buffer via read_sha1_file 2. call parse_object_buffer, which then: a. calls the appropriate lookup_{commit,tree,blob,tag} to either create a new "struct object", or to find an existing one. We know the appropriate type from the lookup in step 1. b. calls the appropriate parse_{commit,tree,blob,tag} to parse the buffer for the new (or existing) object In step 2b, all of the called functions are no-ops for object "X" if "X->object.parsed" is set. I.e., when we have already parsed an object, we end up going to a lot of work just to find out at a low level that there is nothing left for us to do (and we throw away the data from read_sha1_file unread). We can optimize this by moving the check for "do we have an in-memory object" from 2a before the expensive call to read_sha1_file in step 1. This might seem circular, since step 2a uses the type information determined in step 1 to call the appropriate lookup function. However, we can notice that all of the lookup_* functions are backed by lookup_object. In other words, all of the objects are kept in a master hash table, and we don't actually need the type to do the "do we have it" part of the lookup, only to do the "and create it if it doesn't exist" part. This can save time whenever we call parse_object on the same sha1 twice in a single program. Some code paths already perform this optimization manually, with either: if (!obj->parsed) obj = parse_object(obj->sha1); if you already have a "struct object", or: struct object *obj = lookup_unknown_object(sha1); if (!obj || !obj->parsed) obj = parse_object(sha1); if you don't. This patch moves the optimization into parse_object itself. Most git operations won't notice any impact. Either they don't parse a lot of duplicate sha1s, or the calling code takes special care not to re-parse objects. 
I timed two code paths that do benefit (there may be more, but these two were immediately obvious and easy to time). The first is fast-export, which calls parse_object on each object it outputs, like this: object = parse_object(sha1); if (!object) die(...); if (object->flags & SHOWN) return; which means that just to realize we have already shown an object, we will read the whole object from disk! With this patch, my best-of-five time for "fast-export --all" on git.git dropped from 26.3s to 21.3s. The second case is upload-pack, which will call parse_object for each advertised ref (because it needs to peel tags to show "^{}" entries). This doesn't matter for most repositories, because they don't have a lot of refs pointing to the same objects. However, if you have a big alternates repository with a shared object db for a number of child repositories, then the alternates repository will have duplicated refs representing each of its children. For example, GitHub's alternates repository for git.git has ~120,000 refs, of which only ~3200 are unique. The time for upload-pack to print its list of advertised refs dropped from 3.4s to 0.76s. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
278 lines
5.9 KiB
C
278 lines
5.9 KiB
C
#include "cache.h"
|
|
#include "object.h"
|
|
#include "blob.h"
|
|
#include "tree.h"
|
|
#include "commit.h"
|
|
#include "tag.h"
|
|
|
|
/* Hash table of the objects registered through create_object(). */
static struct object **obj_hash;
/* Number of objects inserted into obj_hash so far. */
static int nr_objs;
/* Number of slots currently allocated for obj_hash. */
static int obj_hash_size;
|
|
|
|
unsigned int get_max_object_index(void)
|
|
{
|
|
return obj_hash_size;
|
|
}
|
|
|
|
struct object *get_indexed_object(unsigned int idx)
|
|
{
|
|
return obj_hash[idx];
|
|
}
|
|
|
|
/*
 * Printable name of each object type, indexed by the numeric type id.
 * Slot 0 (OBJ_NONE) deliberately has no name.
 */
static const char *object_type_strings[] = {
	NULL,		/* OBJ_NONE = 0 */
	"commit",	/* OBJ_COMMIT = 1 */
	"tree",		/* OBJ_TREE = 2 */
	"blob",		/* OBJ_BLOB = 3 */
	"tag",		/* OBJ_TAG = 4 */
};
|
|
|
|
const char *typename(unsigned int type)
|
|
{
|
|
if (type >= ARRAY_SIZE(object_type_strings))
|
|
return NULL;
|
|
return object_type_strings[type];
|
|
}
|
|
|
|
int type_from_string(const char *str)
|
|
{
|
|
int i;
|
|
|
|
for (i = 1; i < ARRAY_SIZE(object_type_strings); i++)
|
|
if (!strcmp(str, object_type_strings[i]))
|
|
return i;
|
|
die("invalid object type \"%s\"", str);
|
|
}
|
|
|
|
static unsigned int hash_obj(struct object *obj, unsigned int n)
|
|
{
|
|
unsigned int hash;
|
|
memcpy(&hash, obj->sha1, sizeof(unsigned int));
|
|
return hash % n;
|
|
}
|
|
|
|
/*
 * Store obj in the given table of `size` slots.  Starting from the slot
 * chosen by hash_obj(), walk forward (wrapping to slot 0 at the end) until
 * an empty slot is found.  The loop only terminates if the table has at
 * least one free slot.
 */
static void insert_obj_hash(struct object *obj, struct object **hash, unsigned int size)
{
	unsigned int slot;

	for (slot = hash_obj(obj, size); hash[slot]; ) {
		if (++slot >= size)
			slot = 0;
	}
	hash[slot] = obj;
}
|
|
|
|
static unsigned int hashtable_index(const unsigned char *sha1)
|
|
{
|
|
unsigned int i;
|
|
memcpy(&i, sha1, sizeof(unsigned int));
|
|
return i % obj_hash_size;
|
|
}
|
|
|
|
struct object *lookup_object(const unsigned char *sha1)
|
|
{
|
|
unsigned int i;
|
|
struct object *obj;
|
|
|
|
if (!obj_hash)
|
|
return NULL;
|
|
|
|
i = hashtable_index(sha1);
|
|
while ((obj = obj_hash[i]) != NULL) {
|
|
if (!hashcmp(sha1, obj->sha1))
|
|
break;
|
|
i++;
|
|
if (i == obj_hash_size)
|
|
i = 0;
|
|
}
|
|
return obj;
|
|
}
|
|
|
|
static void grow_object_hash(void)
|
|
{
|
|
int i;
|
|
int new_hash_size = obj_hash_size < 32 ? 32 : 2 * obj_hash_size;
|
|
struct object **new_hash;
|
|
|
|
new_hash = xcalloc(new_hash_size, sizeof(struct object *));
|
|
for (i = 0; i < obj_hash_size; i++) {
|
|
struct object *obj = obj_hash[i];
|
|
if (!obj)
|
|
continue;
|
|
insert_obj_hash(obj, new_hash, new_hash_size);
|
|
}
|
|
free(obj_hash);
|
|
obj_hash = new_hash;
|
|
obj_hash_size = new_hash_size;
|
|
}
|
|
|
|
void *create_object(const unsigned char *sha1, int type, void *o)
|
|
{
|
|
struct object *obj = o;
|
|
|
|
obj->parsed = 0;
|
|
obj->used = 0;
|
|
obj->type = type;
|
|
obj->flags = 0;
|
|
hashcpy(obj->sha1, sha1);
|
|
|
|
if (obj_hash_size - 1 <= nr_objs * 2)
|
|
grow_object_hash();
|
|
|
|
insert_obj_hash(obj, obj_hash, obj_hash_size);
|
|
nr_objs++;
|
|
return obj;
|
|
}
|
|
|
|
struct object *lookup_unknown_object(const unsigned char *sha1)
|
|
{
|
|
struct object *obj = lookup_object(sha1);
|
|
if (!obj)
|
|
obj = create_object(sha1, OBJ_NONE, alloc_object_node());
|
|
return obj;
|
|
}
|
|
|
|
struct object *parse_object_buffer(const unsigned char *sha1, enum object_type type, unsigned long size, void *buffer, int *eaten_p)
|
|
{
|
|
struct object *obj;
|
|
int eaten = 0;
|
|
|
|
obj = NULL;
|
|
if (type == OBJ_BLOB) {
|
|
struct blob *blob = lookup_blob(sha1);
|
|
if (blob) {
|
|
if (parse_blob_buffer(blob, buffer, size))
|
|
return NULL;
|
|
obj = &blob->object;
|
|
}
|
|
} else if (type == OBJ_TREE) {
|
|
struct tree *tree = lookup_tree(sha1);
|
|
if (tree) {
|
|
obj = &tree->object;
|
|
if (!tree->buffer)
|
|
tree->object.parsed = 0;
|
|
if (!tree->object.parsed) {
|
|
if (parse_tree_buffer(tree, buffer, size))
|
|
return NULL;
|
|
eaten = 1;
|
|
}
|
|
}
|
|
} else if (type == OBJ_COMMIT) {
|
|
struct commit *commit = lookup_commit(sha1);
|
|
if (commit) {
|
|
if (parse_commit_buffer(commit, buffer, size))
|
|
return NULL;
|
|
if (!commit->buffer) {
|
|
commit->buffer = buffer;
|
|
eaten = 1;
|
|
}
|
|
obj = &commit->object;
|
|
}
|
|
} else if (type == OBJ_TAG) {
|
|
struct tag *tag = lookup_tag(sha1);
|
|
if (tag) {
|
|
if (parse_tag_buffer(tag, buffer, size))
|
|
return NULL;
|
|
obj = &tag->object;
|
|
}
|
|
} else {
|
|
warning("object %s has unknown type id %d\n", sha1_to_hex(sha1), type);
|
|
obj = NULL;
|
|
}
|
|
if (obj && obj->type == OBJ_NONE)
|
|
obj->type = type;
|
|
*eaten_p = eaten;
|
|
return obj;
|
|
}
|
|
|
|
struct object *parse_object(const unsigned char *sha1)
|
|
{
|
|
unsigned long size;
|
|
enum object_type type;
|
|
int eaten;
|
|
const unsigned char *repl = lookup_replace_object(sha1);
|
|
void *buffer;
|
|
struct object *obj;
|
|
|
|
obj = lookup_object(sha1);
|
|
if (obj && obj->parsed)
|
|
return obj;
|
|
|
|
buffer = read_sha1_file(sha1, &type, &size);
|
|
if (buffer) {
|
|
if (check_sha1_signature(repl, buffer, size, typename(type)) < 0) {
|
|
free(buffer);
|
|
error("sha1 mismatch %s\n", sha1_to_hex(repl));
|
|
return NULL;
|
|
}
|
|
|
|
obj = parse_object_buffer(sha1, type, size, buffer, &eaten);
|
|
if (!eaten)
|
|
free(buffer);
|
|
return obj;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
struct object_list *object_list_insert(struct object *item,
|
|
struct object_list **list_p)
|
|
{
|
|
struct object_list *new_list = xmalloc(sizeof(struct object_list));
|
|
new_list->item = item;
|
|
new_list->next = *list_p;
|
|
*list_p = new_list;
|
|
return new_list;
|
|
}
|
|
|
|
int object_list_contains(struct object_list *list, struct object *obj)
|
|
{
|
|
while (list) {
|
|
if (list->item == obj)
|
|
return 1;
|
|
list = list->next;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void add_object_array(struct object *obj, const char *name, struct object_array *array)
|
|
{
|
|
add_object_array_with_mode(obj, name, array, S_IFINVALID);
|
|
}
|
|
|
|
void add_object_array_with_mode(struct object *obj, const char *name, struct object_array *array, unsigned mode)
|
|
{
|
|
unsigned nr = array->nr;
|
|
unsigned alloc = array->alloc;
|
|
struct object_array_entry *objects = array->objects;
|
|
|
|
if (nr >= alloc) {
|
|
alloc = (alloc + 32) * 2;
|
|
objects = xrealloc(objects, alloc * sizeof(*objects));
|
|
array->alloc = alloc;
|
|
array->objects = objects;
|
|
}
|
|
objects[nr].item = obj;
|
|
objects[nr].name = name;
|
|
objects[nr].mode = mode;
|
|
array->nr = ++nr;
|
|
}
|
|
|
|
void object_array_remove_duplicates(struct object_array *array)
|
|
{
|
|
unsigned int ref, src, dst;
|
|
struct object_array_entry *objects = array->objects;
|
|
|
|
for (ref = 0; ref + 1 < array->nr; ref++) {
|
|
for (src = ref + 1, dst = src;
|
|
src < array->nr;
|
|
src++) {
|
|
if (!strcmp(objects[ref].name, objects[src].name))
|
|
continue;
|
|
if (src != dst)
|
|
objects[dst] = objects[src];
|
|
dst++;
|
|
}
|
|
array->nr = dst;
|
|
}
|
|
}
|