From 7a51ed66f653c248993b3c4a61932e47933d835e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 14 Jan 2008 16:03:17 -0800 Subject: [PATCH 1/9] Make on-disk index representation separate from in-core one This converts the index explicitly on read and write to its on-disk format, allowing the in-core format to contain more flags, and be simpler. In particular, the in-core format is now host-endian (as opposed to the on-disk one that is network endian in order to be able to be shared across machines) and as a result we can dispense with all the htonl/ntohl on accesses to the cache_entry fields. This will make it easier to make use of various temporary flags that do not exist in the on-disk format. Signed-off-by: Linus Torvalds --- builtin-apply.c | 10 +-- builtin-blame.c | 2 +- builtin-commit.c | 2 +- builtin-fsck.c | 2 +- builtin-grep.c | 4 +- builtin-ls-files.c | 10 +-- builtin-read-tree.c | 3 +- builtin-rerere.c | 4 +- builtin-update-index.c | 18 ++-- cache-tree.c | 4 +- cache.h | 46 +++++++--- diff-lib.c | 32 +++---- dir.c | 2 +- entry.c | 6 +- merge-index.c | 2 +- merge-recursive.c | 2 +- reachable.c | 2 +- read-cache.c | 200 ++++++++++++++++++++++++----------------- sha1_name.c | 2 +- tree.c | 4 +- unpack-trees.c | 29 +++--- 21 files changed, 217 insertions(+), 169 deletions(-) diff --git a/builtin-apply.c b/builtin-apply.c index 15432b6782..30d86f2197 100644 --- a/builtin-apply.c +++ b/builtin-apply.c @@ -1946,7 +1946,7 @@ static int read_file_or_gitlink(struct cache_entry *ce, struct strbuf *buf) if (!ce) return 0; - if (S_ISGITLINK(ntohl(ce->ce_mode))) { + if (S_ISGITLINK(ce->ce_mode)) { strbuf_grow(buf, 100); strbuf_addf(buf, "Subproject commit %s\n", sha1_to_hex(ce->sha1)); } else { @@ -2023,7 +2023,7 @@ static int check_to_create_blob(const char *new_name, int ok_if_exists) static int verify_index_match(struct cache_entry *ce, struct stat *st) { - if (S_ISGITLINK(ntohl(ce->ce_mode))) { + if (S_ISGITLINK(ce->ce_mode)) { if (!S_ISDIR(st->st_mode)) return -1; return 0; @@ -2082,12 +2082,12 @@ static int check_patch(struct patch *patch, struct patch *prev_patch) return error("%s: does not match index", old_name); if (cached) - st_mode = ntohl(ce->ce_mode); + st_mode = ce->ce_mode; } else if (stat_ret < 0) return error("%s: %s", old_name, strerror(errno)); if (!cached) - st_mode = ntohl(ce_mode_from_stat(ce, st.st_mode)); + st_mode = ce_mode_from_stat(ce, st.st_mode); if (patch->is_new < 0) patch->is_new = 0; @@ -2388,7 +2388,7 @@ static void add_index_file(const char *path, unsigned mode, void *buf, unsigned ce = xcalloc(1, ce_size); memcpy(ce->name, path, namelen); ce->ce_mode = create_ce_mode(mode); - ce->ce_flags = htons(namelen); + ce->ce_flags = namelen; if (S_ISGITLINK(mode)) { const char *s = buf; diff --git a/builtin-blame.c b/builtin-blame.c index 9b4c02e87f..c7e68874e7 100644 --- a/builtin-blame.c +++ b/builtin-blame.c @@ -2092,7 +2092,7 @@ static struct commit *fake_working_tree_commit(const char *path, const char *con if (!mode) { int pos = cache_name_pos(path, len); if (0 <= pos) - mode = ntohl(active_cache[pos]->ce_mode); + mode = active_cache[pos]->ce_mode; else /* Let's not bother reading from HEAD tree */ mode = S_IFREG | 0644; diff --git a/builtin-commit.c b/builtin-commit.c index 02279360f7..c63ff826fc 100644 --- a/builtin-commit.c +++ b/builtin-commit.c @@ -156,7 +156,7 @@ static int list_paths(struct path_list *list, const char *with_tree, for (i = 0; i < active_nr; i++) { struct cache_entry *ce = active_cache[i]; - if (ce->ce_flags & htons(CE_UPDATE)) + if (ce->ce_flags & CE_UPDATE) continue; if (!pathspec_match(pattern, m, ce->name, 0)) continue; diff --git a/builtin-fsck.c b/builtin-fsck.c index 2a6e94deaf..6fc9525e04 100644 --- a/builtin-fsck.c +++ b/builtin-fsck.c @@ -765,7 +765,7 @@ int cmd_fsck(int argc, const char **argv, const char *prefix) struct blob *blob; struct object *obj; - mode = ntohl(active_cache[i]->ce_mode); + mode = active_cache[i]->ce_mode; if (S_ISGITLINK(mode)) continue; blob = lookup_blob(active_cache[i]->sha1); diff --git a/builtin-grep.c b/builtin-grep.c index 0d6cc7361f..9180b39e3f 100644 --- a/builtin-grep.c +++ b/builtin-grep.c @@ -331,7 +331,7 @@ static int external_grep(struct grep_opt *opt, const char **paths, int cached) struct cache_entry *ce = active_cache[i]; char *name; int kept; - if (!S_ISREG(ntohl(ce->ce_mode))) + if (!S_ISREG(ce->ce_mode)) continue; if (!pathspec_matches(paths, ce->name)) continue; @@ -387,7 +387,7 @@ static int grep_cache(struct grep_opt *opt, const char **paths, int cached) for (nr = 0; nr < active_nr; nr++) { struct cache_entry *ce = active_cache[nr]; - if (!S_ISREG(ntohl(ce->ce_mode))) + if (!S_ISREG(ce->ce_mode)) continue; if (!pathspec_matches(paths, ce->name)) continue; diff --git a/builtin-ls-files.c b/builtin-ls-files.c index 0f0ab2da16..d56e33e251 100644 --- a/builtin-ls-files.c +++ b/builtin-ls-files.c @@ -189,7 +189,7 @@ static void show_ce_entry(const char *tag, struct cache_entry *ce) return; if (tag && *tag && show_valid_bit && - (ce->ce_flags & htons(CE_VALID))) { + (ce->ce_flags & CE_VALID)) { static char alttag[4]; memcpy(alttag, tag, 3); if (isalpha(tag[0])) @@ -210,7 +210,7 @@ static void show_ce_entry(const char *tag, struct cache_entry *ce) } else { printf("%s%06o %s %d\t", tag, - ntohl(ce->ce_mode), + ce->ce_mode, abbrev ? find_unique_abbrev(ce->sha1,abbrev) : sha1_to_hex(ce->sha1), ce_stage(ce)); @@ -242,7 +242,7 @@ static void show_files(struct dir_struct *dir, const char *prefix) continue; if (show_unmerged && !ce_stage(ce)) continue; - if (ce->ce_flags & htons(CE_UPDATE)) + if (ce->ce_flags & CE_UPDATE) continue; show_ce_entry(ce_stage(ce) ? tag_unmerged : tag_cached, ce); } @@ -350,7 +350,7 @@ void overlay_tree_on_cache(const char *tree_name, const char *prefix) struct cache_entry *ce = active_cache[i]; if (!ce_stage(ce)) continue; - ce->ce_flags |= htons(CE_STAGEMASK); + ce->ce_flags |= CE_STAGEMASK; } if (prefix) { @@ -379,7 +379,7 @@ void overlay_tree_on_cache(const char *tree_name, const char *prefix) */ if (last_stage0 && !strcmp(last_stage0->name, ce->name)) - ce->ce_flags |= htons(CE_UPDATE); + ce->ce_flags |= CE_UPDATE; } } } diff --git a/builtin-read-tree.c b/builtin-read-tree.c index c0ea0342b7..5785401753 100644 --- a/builtin-read-tree.c +++ b/builtin-read-tree.c @@ -45,8 +45,7 @@ static int read_cache_unmerged(void) continue; cache_tree_invalidate_path(active_cache_tree, ce->name); last = ce; - ce->ce_mode = 0; - ce->ce_flags &= ~htons(CE_STAGEMASK); + ce->ce_flags |= CE_REMOVE; } *dst++ = ce; } diff --git a/builtin-rerere.c b/builtin-rerere.c index a9e3ebc137..b0c17bde87 100644 --- a/builtin-rerere.c +++ b/builtin-rerere.c @@ -149,8 +149,8 @@ static int find_conflict(struct path_list *conflict) if (ce_stage(e2) == 2 && ce_stage(e3) == 3 && ce_same_name(e2, e3) && - S_ISREG(ntohl(e2->ce_mode)) && - S_ISREG(ntohl(e3->ce_mode))) { + S_ISREG(e2->ce_mode) && + S_ISREG(e3->ce_mode)) { path_list_insert((const char *)e2->name, conflict); i++; /* skip over both #2 and #3 */ } diff --git a/builtin-update-index.c b/builtin-update-index.c index c3a14c74ed..a8795d3d5f 100644 --- a/builtin-update-index.c +++ b/builtin-update-index.c @@ -47,10 +47,10 @@ static int mark_valid(const char *path) if (0 <= pos) { switch (mark_valid_only) { case MARK_VALID: - active_cache[pos]->ce_flags |= htons(CE_VALID); + active_cache[pos]->ce_flags |= CE_VALID; break; case UNMARK_VALID: - active_cache[pos]->ce_flags &= ~htons(CE_VALID); + active_cache[pos]->ce_flags &= ~CE_VALID; break; } cache_tree_invalidate_path(active_cache_tree, path); @@ -95,7 +95,7 @@ static int add_one_path(struct cache_entry *old, const char *path, int len, stru size = cache_entry_size(len); ce = xcalloc(1, size); memcpy(ce->name, path, len); - ce->ce_flags = htons(len); + ce->ce_flags = len; fill_stat_cache_info(ce, st); ce->ce_mode = ce_mode_from_stat(old, st->st_mode); @@ -139,7 +139,7 @@ static int process_directory(const char *path, int len, struct stat *st) /* Exact match: file or existing gitlink */ if (pos >= 0) { struct cache_entry *ce = active_cache[pos]; - if (S_ISGITLINK(ntohl(ce->ce_mode))) { + if (S_ISGITLINK(ce->ce_mode)) { /* Do nothing to the index if there is no HEAD! */ if (resolve_gitlink_ref(path, "HEAD", sha1) < 0) @@ -183,7 +183,7 @@ static int process_file(const char *path, int len, struct stat *st) int pos = cache_name_pos(path, len); struct cache_entry *ce = pos < 0 ? NULL : active_cache[pos]; - if (ce && S_ISGITLINK(ntohl(ce->ce_mode))) + if (ce && S_ISGITLINK(ce->ce_mode)) return error("%s is already a gitlink, not replacing", path); return add_one_path(ce, path, len, st); @@ -226,7 +226,7 @@ static int add_cacheinfo(unsigned int mode, const unsigned char *sha1, ce->ce_flags = create_ce_flags(len, stage); ce->ce_mode = create_ce_mode(mode); if (assume_unchanged) - ce->ce_flags |= htons(CE_VALID); + ce->ce_flags |= CE_VALID; option = allow_add ? ADD_CACHE_OK_TO_ADD : 0; option |= allow_replace ? ADD_CACHE_OK_TO_REPLACE : 0; if (add_cache_entry(ce, option)) @@ -246,14 +246,14 @@ static void chmod_path(int flip, const char *path) if (pos < 0) goto fail; ce = active_cache[pos]; - mode = ntohl(ce->ce_mode); + mode = ce->ce_mode; if (!S_ISREG(mode)) goto fail; switch (flip) { case '+': - ce->ce_mode |= htonl(0111); break; + ce->ce_mode |= 0111; break; case '-': - ce->ce_mode &= htonl(~0111); break; + ce->ce_mode &= ~0111; break; default: goto fail; } diff --git a/cache-tree.c b/cache-tree.c index 50b35264fd..bfc95d2dc9 100644 --- a/cache-tree.c +++ b/cache-tree.c @@ -320,13 +320,13 @@ static int update_one(struct cache_tree *it, } else { sha1 = ce->sha1; - mode = ntohl(ce->ce_mode); + mode = ce->ce_mode; entlen = pathlen - baselen; } if (mode != S_IFGITLINK && !missing_ok && !has_sha1_file(sha1)) return error("invalid object %s", sha1_to_hex(sha1)); - if (!ce->ce_mode) + if (ce->ce_flags & CE_REMOVE) continue; /* entry being removed */ strbuf_grow(&buffer, entlen + 100); diff --git a/cache.h b/cache.h index 549f4bbac7..4a054c5402 100644 --- a/cache.h +++ b/cache.h @@ -94,48 +94,66 @@ struct cache_time { * We save the fields in big-endian order to allow using the * index file over NFS transparently. */ +struct ondisk_cache_entry { + struct cache_time ctime; + struct cache_time mtime; + unsigned int dev; + unsigned int ino; + unsigned int mode; + unsigned int uid; + unsigned int gid; + unsigned int size; + unsigned char sha1[20]; + unsigned short flags; + char name[FLEX_ARRAY]; /* more */ +}; + struct cache_entry { - struct cache_time ce_ctime; - struct cache_time ce_mtime; + unsigned int ce_ctime; + unsigned int ce_mtime; unsigned int ce_dev; unsigned int ce_ino; unsigned int ce_mode; unsigned int ce_uid; unsigned int ce_gid; unsigned int ce_size; + unsigned int ce_flags; unsigned char sha1[20]; - unsigned short ce_flags; char name[FLEX_ARRAY]; /* more */ }; #define CE_NAMEMASK (0x0fff) #define CE_STAGEMASK (0x3000) -#define CE_UPDATE (0x4000) #define CE_VALID (0x8000) #define CE_STAGESHIFT 12 -#define create_ce_flags(len, stage) htons((len) | ((stage) << CE_STAGESHIFT)) -#define ce_namelen(ce) (CE_NAMEMASK & ntohs((ce)->ce_flags)) +/* In-memory only */ +#define CE_UPDATE (0x10000) +#define CE_REMOVE (0x20000) + +#define create_ce_flags(len, stage) ((len) | ((stage) << CE_STAGESHIFT)) +#define ce_namelen(ce) (CE_NAMEMASK & (ce)->ce_flags) #define ce_size(ce) cache_entry_size(ce_namelen(ce)) -#define ce_stage(ce) ((CE_STAGEMASK & ntohs((ce)->ce_flags)) >> CE_STAGESHIFT) +#define ondisk_ce_size(ce) ondisk_cache_entry_size(ce_namelen(ce)) +#define ce_stage(ce) ((CE_STAGEMASK & (ce)->ce_flags) >> CE_STAGESHIFT) #define ce_permissions(mode) (((mode) & 0100) ? 0755 : 0644) static inline unsigned int create_ce_mode(unsigned int mode) { if (S_ISLNK(mode)) - return htonl(S_IFLNK); + return S_IFLNK; if (S_ISDIR(mode) || S_ISGITLINK(mode)) - return htonl(S_IFGITLINK); - return htonl(S_IFREG | ce_permissions(mode)); + return S_IFGITLINK; + return S_IFREG | ce_permissions(mode); } static inline unsigned int ce_mode_from_stat(struct cache_entry *ce, unsigned int mode) { extern int trust_executable_bit, has_symlinks; if (!has_symlinks && S_ISREG(mode) && - ce && S_ISLNK(ntohl(ce->ce_mode))) + ce && S_ISLNK(ce->ce_mode)) return ce->ce_mode; if (!trust_executable_bit && S_ISREG(mode)) { - if (ce && S_ISREG(ntohl(ce->ce_mode))) + if (ce && S_ISREG(ce->ce_mode)) return ce->ce_mode; return create_ce_mode(0666); } @@ -146,14 +164,14 @@ static inline unsigned int ce_mode_from_stat(struct cache_entry *ce, unsigned in S_ISLNK(mode) ? S_IFLNK : S_ISDIR(mode) ? S_IFDIR : S_IFGITLINK) #define cache_entry_size(len) ((offsetof(struct cache_entry,name) + (len) + 8) & ~7) +#define ondisk_cache_entry_size(len) ((offsetof(struct ondisk_cache_entry,name) + (len) + 8) & ~7) struct index_state { struct cache_entry **cache; unsigned int cache_nr, cache_alloc, cache_changed; struct cache_tree *cache_tree; time_t timestamp; - void *mmap; - size_t mmap_size; + void *alloc; }; extern struct index_state the_index; diff --git a/diff-lib.c b/diff-lib.c index d85d8f34ba..4a05b02cd0 100644 --- a/diff-lib.c +++ b/diff-lib.c @@ -37,7 +37,7 @@ static int get_mode(const char *path, int *mode) if (!path || !strcmp(path, "/dev/null")) *mode = 0; else if (!strcmp(path, "-")) - *mode = ntohl(create_ce_mode(0666)); + *mode = create_ce_mode(0666); else if (stat(path, &st)) return error("Could not access '%s'", path); else @@ -384,7 +384,7 @@ int run_diff_files(struct rev_info *revs, unsigned int option) continue; } else - dpath->mode = ntohl(ce_mode_from_stat(ce, st.st_mode)); + dpath->mode = ce_mode_from_stat(ce, st.st_mode); while (i < entries) { struct cache_entry *nce = active_cache[i]; @@ -398,10 +398,10 @@ int run_diff_files(struct rev_info *revs, unsigned int option) */ stage = ce_stage(nce); if (2 <= stage) { - int mode = ntohl(nce->ce_mode); + int mode = nce->ce_mode; num_compare_stages++; hashcpy(dpath->parent[stage-2].sha1, nce->sha1); - dpath->parent[stage-2].mode = ntohl(ce_mode_from_stat(nce, mode)); + dpath->parent[stage-2].mode = ce_mode_from_stat(nce, mode); dpath->parent[stage-2].status = DIFF_STATUS_MODIFIED; } @@ -442,15 +442,15 @@ int run_diff_files(struct rev_info *revs, unsigned int option) } if (silent_on_removed) continue; - diff_addremove(&revs->diffopt, '-', ntohl(ce->ce_mode), + diff_addremove(&revs->diffopt, '-', ce->ce_mode, ce->sha1, ce->name, NULL); continue; } changed = ce_match_stat(ce, &st, ce_option); if (!changed && !DIFF_OPT_TST(&revs->diffopt, FIND_COPIES_HARDER)) continue; - oldmode = ntohl(ce->ce_mode); - newmode = ntohl(ce_mode_from_stat(ce, st.st_mode)); + oldmode = ce->ce_mode; + newmode = ce_mode_from_stat(ce, st.st_mode); diff_change(&revs->diffopt, oldmode, newmode, ce->sha1, (changed ? null_sha1 : ce->sha1), ce->name, NULL); @@ -471,7 +471,7 @@ static void diff_index_show_file(struct rev_info *revs, struct cache_entry *ce, unsigned char *sha1, unsigned int mode) { - diff_addremove(&revs->diffopt, prefix[0], ntohl(mode), + diff_addremove(&revs->diffopt, prefix[0], mode, sha1, ce->name, NULL); } @@ -550,14 +550,14 @@ static int show_modified(struct rev_info *revs, p->len = pathlen; memcpy(p->path, new->name, pathlen); p->path[pathlen] = 0; - p->mode = ntohl(mode); + p->mode = mode; hashclr(p->sha1); memset(p->parent, 0, 2 * sizeof(struct combine_diff_parent)); p->parent[0].status = DIFF_STATUS_MODIFIED; - p->parent[0].mode = ntohl(new->ce_mode); + p->parent[0].mode = new->ce_mode; hashcpy(p->parent[0].sha1, new->sha1); p->parent[1].status = DIFF_STATUS_MODIFIED; - p->parent[1].mode = ntohl(old->ce_mode); + p->parent[1].mode = old->ce_mode; hashcpy(p->parent[1].sha1, old->sha1); show_combined_diff(p, 2, revs->dense_combined_merges, revs); free(p); @@ -569,9 +569,6 @@ static int show_modified(struct rev_info *revs, !DIFF_OPT_TST(&revs->diffopt, FIND_COPIES_HARDER)) return 0; - mode = ntohl(mode); - oldmode = ntohl(oldmode); - diff_change(&revs->diffopt, oldmode, mode, old->sha1, sha1, old->name, NULL); return 0; @@ -628,7 +625,7 @@ static int diff_cache(struct rev_info *revs, cached, match_missing)) break; diff_unmerge(&revs->diffopt, ce->name, - ntohl(ce->ce_mode), ce->sha1); + ce->ce_mode, ce->sha1); break; case 3: diff_unmerge(&revs->diffopt, ce->name, @@ -664,7 +661,7 @@ static void mark_merge_entries(void) struct cache_entry *ce = active_cache[i]; if (!ce_stage(ce)) continue; - ce->ce_flags |= htons(CE_STAGEMASK); + ce->ce_flags |= CE_STAGEMASK; } } @@ -722,8 +719,7 @@ int do_diff_cache(const unsigned char *tree_sha1, struct diff_options *opt) cache_tree_invalidate_path(active_cache_tree, ce->name); last = ce; - ce->ce_mode = 0; - ce->ce_flags &= ~htons(CE_STAGEMASK); + ce->ce_flags |= CE_REMOVE; } *dst++ = ce; } diff --git a/dir.c b/dir.c index 3e345c2fc5..1b9cc7a8a8 100644 --- a/dir.c +++ b/dir.c @@ -391,7 +391,7 @@ static enum exist_status directory_exists_in_index(const char *dirname, int len) break; if (endchar == '/') return index_directory; - if (!endchar && S_ISGITLINK(ntohl(ce->ce_mode))) + if (!endchar && S_ISGITLINK(ce->ce_mode)) return index_gitdir; } return index_nonexistent; diff --git a/entry.c b/entry.c index 257ab46e94..44f4b897d4 100644 --- a/entry.c +++ b/entry.c @@ -103,7 +103,7 @@ static int write_entry(struct cache_entry *ce, char *path, const struct checkout int fd; long wrote; - switch (ntohl(ce->ce_mode) & S_IFMT) { + switch (ce->ce_mode & S_IFMT) { char *new; struct strbuf buf; unsigned long size; @@ -129,7 +129,7 @@ static int write_entry(struct cache_entry *ce, char *path, const struct checkout strcpy(path, ".merge_file_XXXXXX"); fd = mkstemp(path); } else - fd = create_file(path, ntohl(ce->ce_mode)); + fd = create_file(path, ce->ce_mode); if (fd < 0) { free(new); return error("git-checkout-index: unable to create file %s (%s)", @@ -221,7 +221,7 @@ int checkout_entry(struct cache_entry *ce, const struct checkout *state, char *t unlink(path); if (S_ISDIR(st.st_mode)) { /* If it is a gitlink, leave it alone! */ - if (S_ISGITLINK(ntohl(ce->ce_mode))) + if (S_ISGITLINK(ce->ce_mode)) return 0; if (!state->force) return error("%s is a directory", path); diff --git a/merge-index.c b/merge-index.c index fa719cb0b1..bbb700b54e 100644 --- a/merge-index.c +++ b/merge-index.c @@ -48,7 +48,7 @@ static int merge_entry(int pos, const char *path) break; found++; strcpy(hexbuf[stage], sha1_to_hex(ce->sha1)); - sprintf(ownbuf[stage], "%o", ntohl(ce->ce_mode)); + sprintf(ownbuf[stage], "%o", ce->ce_mode); arguments[stage] = hexbuf[stage]; arguments[stage + 4] = ownbuf[stage]; } while (++pos < active_nr); diff --git a/merge-recursive.c b/merge-recursive.c index c292a77a81..bdf03b1f1f 100644 --- a/merge-recursive.c +++ b/merge-recursive.c @@ -333,7 +333,7 @@ static struct path_list *get_unmerged(void) item->util = xcalloc(1, sizeof(struct stage_data)); } e = item->util; - e->stages[ce_stage(ce)].mode = ntohl(ce->ce_mode); + e->stages[ce_stage(ce)].mode = ce->ce_mode; hashcpy(e->stages[ce_stage(ce)].sha, ce->sha1); } diff --git a/reachable.c b/reachable.c index 6383401e2d..00f289f2f4 100644 --- a/reachable.c +++ b/reachable.c @@ -176,7 +176,7 @@ static void add_cache_refs(struct rev_info *revs) * lookup_blob() on them, to avoid populating the hash table * with invalid information */ - if (S_ISGITLINK(ntohl(active_cache[i]->ce_mode))) + if (S_ISGITLINK(active_cache[i]->ce_mode)) continue; lookup_blob(active_cache[i]->sha1); diff --git a/read-cache.c b/read-cache.c index 7db55883d6..82a6238b77 100644 --- a/read-cache.c +++ b/read-cache.c @@ -30,20 +30,16 @@ struct index_state the_index; */ void fill_stat_cache_info(struct cache_entry *ce, struct stat *st) { - ce->ce_ctime.sec = htonl(st->st_ctime); - ce->ce_mtime.sec = htonl(st->st_mtime); -#ifdef USE_NSEC - ce->ce_ctime.nsec = htonl(st->st_ctim.tv_nsec); - ce->ce_mtime.nsec = htonl(st->st_mtim.tv_nsec); -#endif - ce->ce_dev = htonl(st->st_dev); - ce->ce_ino = htonl(st->st_ino); - ce->ce_uid = htonl(st->st_uid); - ce->ce_gid = htonl(st->st_gid); - ce->ce_size = htonl(st->st_size); + ce->ce_ctime = st->st_ctime; + ce->ce_mtime = st->st_mtime; + ce->ce_dev = st->st_dev; + ce->ce_ino = st->st_ino; + ce->ce_uid = st->st_uid; + ce->ce_gid = st->st_gid; + ce->ce_size = st->st_size; if (assume_unchanged) - ce->ce_flags |= htons(CE_VALID); + ce->ce_flags |= CE_VALID; } static int ce_compare_data(struct cache_entry *ce, struct stat *st) @@ -116,7 +112,7 @@ static int ce_modified_check_fs(struct cache_entry *ce, struct stat *st) return DATA_CHANGED; break; case S_IFDIR: - if (S_ISGITLINK(ntohl(ce->ce_mode))) + if (S_ISGITLINK(ce->ce_mode)) return 0; default: return TYPE_CHANGED; @@ -128,14 +124,17 @@ static int ce_match_stat_basic(struct cache_entry *ce, struct stat *st) { unsigned int changed = 0; - switch (ntohl(ce->ce_mode) & S_IFMT) { + if (ce->ce_flags & CE_REMOVE) + return MODE_CHANGED | DATA_CHANGED | TYPE_CHANGED; + + switch (ce->ce_mode & S_IFMT) { case S_IFREG: changed |= !S_ISREG(st->st_mode) ? TYPE_CHANGED : 0; /* We consider only the owner x bit to be relevant for * "mode changes" */ if (trust_executable_bit && - (0100 & (ntohl(ce->ce_mode) ^ st->st_mode))) + (0100 & (ce->ce_mode ^ st->st_mode))) changed |= MODE_CHANGED; break; case S_IFLNK: @@ -149,32 +148,18 @@ static int ce_match_stat_basic(struct cache_entry *ce, struct stat *st) else if (ce_compare_gitlink(ce)) changed |= DATA_CHANGED; return changed; - case 0: /* Special case: unmerged file in index */ - return MODE_CHANGED | DATA_CHANGED | TYPE_CHANGED; default: - die("internal error: ce_mode is %o", ntohl(ce->ce_mode)); + die("internal error: ce_mode is %o", ce->ce_mode); } - if (ce->ce_mtime.sec != htonl(st->st_mtime)) + if (ce->ce_mtime != (unsigned int) st->st_mtime) changed |= MTIME_CHANGED; - if (ce->ce_ctime.sec != htonl(st->st_ctime)) + if (ce->ce_ctime != (unsigned int) st->st_ctime) changed |= CTIME_CHANGED; -#ifdef USE_NSEC - /* - * nsec seems unreliable - not all filesystems support it, so - * as long as it is in the inode cache you get right nsec - * but after it gets flushed, you get zero nsec. - */ - if (ce->ce_mtime.nsec != htonl(st->st_mtim.tv_nsec)) - changed |= MTIME_CHANGED; - if (ce->ce_ctime.nsec != htonl(st->st_ctim.tv_nsec)) - changed |= CTIME_CHANGED; -#endif - - if (ce->ce_uid != htonl(st->st_uid) || - ce->ce_gid != htonl(st->st_gid)) + if (ce->ce_uid != (unsigned int) st->st_uid || + ce->ce_gid != (unsigned int) st->st_gid) changed |= OWNER_CHANGED; - if (ce->ce_ino != htonl(st->st_ino)) + if (ce->ce_ino != (unsigned int) st->st_ino) changed |= INODE_CHANGED; #ifdef USE_STDEV @@ -183,11 +168,11 @@ static int ce_match_stat_basic(struct cache_entry *ce, struct stat *st) * clients will have different views of what "device" * the filesystem is on */ - if (ce->ce_dev != htonl(st->st_dev)) + if (ce->ce_dev != (unsigned int) st->st_dev) changed |= INODE_CHANGED; #endif - if (ce->ce_size != htonl(st->st_size)) + if (ce->ce_size != (unsigned int) st->st_size) changed |= DATA_CHANGED; return changed; @@ -205,7 +190,7 @@ int ie_match_stat(struct index_state *istate, * If it's marked as always valid in the index, it's * valid whatever the checked-out copy says. */ - if (!ignore_valid && (ce->ce_flags & htons(CE_VALID))) + if (!ignore_valid && (ce->ce_flags & CE_VALID)) return 0; changed = ce_match_stat_basic(ce, st); @@ -228,7 +213,7 @@ int ie_match_stat(struct index_state *istate, */ if (!changed && istate->timestamp && - istate->timestamp <= ntohl(ce->ce_mtime.sec)) { + istate->timestamp <= ce->ce_mtime) { if (assume_racy_is_modified) changed |= DATA_CHANGED; else @@ -257,7 +242,7 @@ int ie_modified(struct index_state *istate, * the length field is zero. For other cases the ce_size * should match the SHA1 recorded in the index entry. */ - if ((changed & DATA_CHANGED) && ce->ce_size != htonl(0)) + if ((changed & DATA_CHANGED) && ce->ce_size != 0) return changed; changed_fs = ce_modified_check_fs(ce, st); @@ -320,7 +305,7 @@ int index_name_pos(struct index_state *istate, const char *name, int namelen) while (last > first) { int next = (last + first) >> 1; struct cache_entry *ce = istate->cache[next]; - int cmp = cache_name_compare(name, namelen, ce->name, ntohs(ce->ce_flags)); + int cmp = cache_name_compare(name, namelen, ce->name, ce->ce_flags); if (!cmp) return next; if (cmp < 0) { @@ -405,7 +390,7 @@ int add_file_to_index(struct index_state *istate, const char *path, int verbose) size = cache_entry_size(namelen); ce = xcalloc(1, size); memcpy(ce->name, path, namelen); - ce->ce_flags = htons(namelen); + ce->ce_flags = namelen; fill_stat_cache_info(ce, &st); if (trust_executable_bit && has_symlinks) @@ -583,7 +568,7 @@ static int has_file_name(struct index_state *istate, continue; if (p->name[len] != '/') continue; - if (!ce_stage(p) && !p->ce_mode) + if (p->ce_flags & CE_REMOVE) continue; retval = -1; if (!ok_to_replace) @@ -616,7 +601,7 @@ static int has_dir_name(struct index_state *istate, } len = slash - name; - pos = index_name_pos(istate, name, ntohs(create_ce_flags(len, stage))); + pos = index_name_pos(istate, name, create_ce_flags(len, stage)); if (pos >= 0) { /* * Found one, but not so fast. This could @@ -679,7 +664,7 @@ static int check_file_directory_conflict(struct index_state *istate, /* * When ce is an "I am going away" entry, we allow it to be added */ - if (!ce_stage(ce) && !ce->ce_mode) + if (ce->ce_flags & CE_REMOVE) return 0; /* @@ -704,7 +689,7 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e int skip_df_check = option & ADD_CACHE_SKIP_DFCHECK; cache_tree_invalidate_path(istate->cache_tree, ce->name); - pos = index_name_pos(istate, ce->name, ntohs(ce->ce_flags)); + pos = index_name_pos(istate, ce->name, ce->ce_flags); /* existing match? Just replace it. */ if (pos >= 0) { @@ -736,7 +721,7 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e if (!ok_to_replace) return error("'%s' appears as both a file and as a directory", ce->name); - pos = index_name_pos(istate, ce->name, ntohs(ce->ce_flags)); + pos = index_name_pos(istate, ce->name, ce->ce_flags); pos = -pos-1; } return pos + 1; @@ -810,7 +795,7 @@ static struct cache_entry *refresh_cache_ent(struct index_state *istate, * valid again, under "assume unchanged" mode. */ if (ignore_valid && assume_unchanged && - !(ce->ce_flags & htons(CE_VALID))) + !(ce->ce_flags & CE_VALID)) ; /* mark this one VALID again */ else return ce; @@ -826,7 +811,6 @@ static struct cache_entry *refresh_cache_ent(struct index_state *istate, updated = xmalloc(size); memcpy(updated, ce, size); fill_stat_cache_info(updated, &st); - /* * If ignore_valid is not set, we should leave CE_VALID bit * alone. Otherwise, paths marked with --no-assume-unchanged @@ -834,8 +818,8 @@ static struct cache_entry *refresh_cache_ent(struct index_state *istate, * automatically, which is not really what we want. */ if (!ignore_valid && assume_unchanged && - !(ce->ce_flags & htons(CE_VALID))) - updated->ce_flags &= ~htons(CE_VALID); + !(ce->ce_flags & CE_VALID)) + updated->ce_flags &= ~CE_VALID; return updated; } @@ -880,7 +864,7 @@ int refresh_index(struct index_state *istate, unsigned int flags, const char **p /* If we are doing --really-refresh that * means the index is not valid anymore. */ - ce->ce_flags &= ~htons(CE_VALID); + ce->ce_flags &= ~CE_VALID; istate->cache_changed = 1; } if (quiet) @@ -942,16 +926,34 @@ int read_index(struct index_state *istate) return read_index_from(istate, get_index_file()); } +static void convert_from_disk(struct ondisk_cache_entry *ondisk, struct cache_entry *ce) +{ + ce->ce_ctime = ntohl(ondisk->ctime.sec); + ce->ce_mtime = ntohl(ondisk->mtime.sec); + ce->ce_dev = ntohl(ondisk->dev); + ce->ce_ino = ntohl(ondisk->ino); + ce->ce_mode = ntohl(ondisk->mode); + ce->ce_uid = ntohl(ondisk->uid); + ce->ce_gid = ntohl(ondisk->gid); + ce->ce_size = ntohl(ondisk->size); + /* On-disk flags are just 16 bits */ + ce->ce_flags = ntohs(ondisk->flags); + hashcpy(ce->sha1, ondisk->sha1); + memcpy(ce->name, ondisk->name, ce_namelen(ce)+1); +} + /* remember to discard_cache() before reading a different cache! */ int read_index_from(struct index_state *istate, const char *path) { int fd, i; struct stat st; - unsigned long offset; + unsigned long src_offset, dst_offset; struct cache_header *hdr; + void *mmap; + size_t mmap_size; errno = EBUSY; - if (istate->mmap) + if (istate->alloc) return istate->cache_nr; errno = ENOENT; @@ -967,31 +969,47 @@ int read_index_from(struct index_state *istate, const char *path) die("cannot stat the open index (%s)", strerror(errno)); errno = EINVAL; - istate->mmap_size = xsize_t(st.st_size); - if (istate->mmap_size < sizeof(struct cache_header) + 20) + mmap_size = xsize_t(st.st_size); + if (mmap_size < sizeof(struct cache_header) + 20) die("index file smaller than expected"); - istate->mmap = xmmap(NULL, istate->mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + mmap = xmmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); close(fd); + if (mmap == MAP_FAILED) + die("unable to map index file"); - hdr = istate->mmap; - if (verify_hdr(hdr, istate->mmap_size) < 0) + hdr = mmap; + if (verify_hdr(hdr, mmap_size) < 0) goto unmap; istate->cache_nr = ntohl(hdr->hdr_entries); istate->cache_alloc = alloc_nr(istate->cache_nr); istate->cache = xcalloc(istate->cache_alloc, sizeof(struct cache_entry *)); - offset = sizeof(*hdr); + /* + * The disk format is actually larger than the in-memory format, + * due to space for nsec etc, so even though the in-memory one + * has room for a few more flags, we can allocate using the same + * index size + */ + istate->alloc = xmalloc(mmap_size); + + src_offset = sizeof(*hdr); + dst_offset = 0; for (i = 0; i < istate->cache_nr; i++) { + struct ondisk_cache_entry *disk_ce; struct cache_entry *ce; - ce = (struct cache_entry *)((char *)(istate->mmap) + offset); - offset = offset + ce_size(ce); + disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset); + ce = (struct cache_entry *)((char *)istate->alloc + dst_offset); + convert_from_disk(disk_ce, ce); istate->cache[i] = ce; + + src_offset += ondisk_ce_size(ce); + dst_offset += ce_size(ce); } istate->timestamp = st.st_mtime; - while (offset <= istate->mmap_size - 20 - 8) { + while (src_offset <= mmap_size - 20 - 8) { /* After an array of active_nr index entries, * there can be arbitrary number of extended * sections, each of which is prefixed with @@ -999,40 +1017,36 @@ int read_index_from(struct index_state *istate, const char *path) * in 4-byte network byte order. */ unsigned long extsize; - memcpy(&extsize, (char *)(istate->mmap) + offset + 4, 4); + memcpy(&extsize, (char *)mmap + src_offset + 4, 4); extsize = ntohl(extsize); if (read_index_extension(istate, - ((const char *) (istate->mmap)) + offset, - (char *) (istate->mmap) + offset + 8, + (const char *) mmap + src_offset, + (char *) mmap + src_offset + 8, extsize) < 0) goto unmap; - offset += 8; - offset += extsize; + src_offset += 8; + src_offset += extsize; } + munmap(mmap, mmap_size); return istate->cache_nr; unmap: - munmap(istate->mmap, istate->mmap_size); + munmap(mmap, mmap_size); errno = EINVAL; die("index file corrupt"); } int discard_index(struct index_state *istate) { - int ret; - istate->cache_nr = 0; istate->cache_changed = 0; istate->timestamp = 0; cache_tree_free(&(istate->cache_tree)); - if (istate->mmap == NULL) - return 0; - ret = munmap(istate->mmap, istate->mmap_size); - istate->mmap = NULL; - istate->mmap_size = 0; + free(istate->alloc); + istate->alloc = NULL; /* no need to throw away allocated active_cache */ - return ret; + return 0; } #define WRITE_BUFFER_SIZE 8192 @@ -1144,10 +1158,32 @@ static void ce_smudge_racily_clean_entry(struct cache_entry *ce) * file, and never calls us, so the cached size information * for "frotz" stays 6 which does not match the filesystem. */ - ce->ce_size = htonl(0); + ce->ce_size = 0; } } +static int ce_write_entry(SHA_CTX *c, int fd, struct cache_entry *ce) +{ + int size = ondisk_ce_size(ce); + struct ondisk_cache_entry *ondisk = xcalloc(1, size); + + ondisk->ctime.sec = htonl(ce->ce_ctime); + ondisk->ctime.nsec = 0; + ondisk->mtime.sec = htonl(ce->ce_mtime); + ondisk->mtime.nsec = 0; + ondisk->dev = htonl(ce->ce_dev); + ondisk->ino = htonl(ce->ce_ino); + ondisk->mode = htonl(ce->ce_mode); + ondisk->uid = htonl(ce->ce_uid); + ondisk->gid = htonl(ce->ce_gid); + ondisk->size = htonl(ce->ce_size); + hashcpy(ondisk->sha1, ce->sha1); + ondisk->flags = htons(ce->ce_flags); + memcpy(ondisk->name, ce->name, ce_namelen(ce)); + + return ce_write(c, fd, ondisk, size); +} + int write_index(struct index_state *istate, int newfd) { SHA_CTX c; @@ -1157,7 +1193,7 @@ int write_index(struct index_state *istate, int newfd) int entries = istate->cache_nr; for (i = removed = 0; i < entries; i++) - if (!cache[i]->ce_mode) + if (cache[i]->ce_flags & CE_REMOVE) removed++; hdr.hdr_signature = htonl(CACHE_SIGNATURE); @@ -1170,12 +1206,12 @@ int write_index(struct index_state *istate, int newfd) for (i = 0; i < entries; i++) { struct cache_entry *ce = cache[i]; - if (!ce->ce_mode) + if (ce->ce_flags & CE_REMOVE) continue; if (istate->timestamp && - istate->timestamp <= ntohl(ce->ce_mtime.sec)) + istate->timestamp <= ce->ce_mtime) ce_smudge_racily_clean_entry(ce); - if (ce_write(&c, newfd, ce, ce_size(ce)) < 0) + if (ce_write_entry(&c, newfd, ce) < 0) return -1; } diff --git a/sha1_name.c b/sha1_name.c index 13e11645e1..be8489e4e5 100644 --- a/sha1_name.c +++ b/sha1_name.c @@ -695,7 +695,7 @@ int get_sha1_with_mode(const char *name, unsigned char *sha1, unsigned *mode) break; if (ce_stage(ce) == stage) { hashcpy(sha1, ce->sha1); - *mode = ntohl(ce->ce_mode); + *mode = ce->ce_mode; return 0; } pos++; diff --git a/tree.c b/tree.c index 8c0819fa72..87708ef420 100644 --- a/tree.c +++ b/tree.c @@ -142,8 +142,8 @@ static int cmp_cache_name_compare(const void *a_, const void *b_) ce1 = *((const struct cache_entry **)a_); ce2 = *((const struct cache_entry **)b_); - return cache_name_compare(ce1->name, ntohs(ce1->ce_flags), - ce2->name, ntohs(ce2->ce_flags)); + return cache_name_compare(ce1->name, ce1->ce_flags, + ce2->name, ce2->ce_flags); } int read_tree(struct tree *tree, int stage, const char **match) diff --git a/unpack-trees.c b/unpack-trees.c index aa2513ed79..ff46fd62fd 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -289,7 +289,6 @@ static struct checkout state; static void check_updates(struct cache_entry **src, int nr, struct unpack_trees_options *o) { - unsigned short mask = htons(CE_UPDATE); unsigned cnt = 0, total = 0; struct progress *progress = NULL; char last_symlink[PATH_MAX]; @@ -297,7 +296,7 @@ static void check_updates(struct cache_entry **src, int nr, if (o->update && o->verbose_update) { for (total = cnt = 0; cnt < nr; cnt++) { struct cache_entry *ce = src[cnt]; - if (!ce->ce_mode || ce->ce_flags & mask) + if (ce->ce_flags & (CE_UPDATE | CE_REMOVE)) total++; } @@ -310,15 +309,15 @@ static void check_updates(struct cache_entry **src, int nr, while (nr--) { struct cache_entry *ce = *src++; - if (!ce->ce_mode || ce->ce_flags & mask) + if (ce->ce_flags & (CE_UPDATE | CE_REMOVE)) display_progress(progress, ++cnt); - if (!ce->ce_mode) { + if (ce->ce_flags & CE_REMOVE) { if (o->update) unlink_entry(ce->name, last_symlink); continue; } - if (ce->ce_flags & mask) { - ce->ce_flags &= ~mask; + if (ce->ce_flags & CE_UPDATE) { + ce->ce_flags &= ~CE_UPDATE; if (o->update) { checkout_entry(ce, &state, NULL); *last_symlink = '\0'; @@ -408,7 +407,7 @@ static void verify_uptodate(struct cache_entry *ce, * submodules that are marked to be automatically * checked out. */ - if (S_ISGITLINK(ntohl(ce->ce_mode))) + if (S_ISGITLINK(ce->ce_mode)) return; errno = 0; } @@ -450,7 +449,7 @@ static int verify_clean_subdirectory(struct cache_entry *ce, const char *action, int cnt = 0; unsigned char sha1[20]; - if (S_ISGITLINK(ntohl(ce->ce_mode)) && + if (S_ISGITLINK(ce->ce_mode) && resolve_gitlink_ref(ce->name, "HEAD", sha1) == 0) { /* If we are not going to update the submodule, then * we don't care. @@ -481,7 +480,7 @@ static int verify_clean_subdirectory(struct cache_entry *ce, const char *action, */ if (!ce_stage(ce)) { verify_uptodate(ce, o); - ce->ce_mode = 0; + ce->ce_flags |= CE_REMOVE; } cnt++; } @@ -568,7 +567,7 @@ static void verify_absent(struct cache_entry *ce, const char *action, cnt = cache_name_pos(ce->name, strlen(ce->name)); if (0 <= cnt) { struct cache_entry *ce = active_cache[cnt]; - if (!ce_stage(ce) && !ce->ce_mode) + if (ce->ce_flags & CE_REMOVE) return; } @@ -580,7 +579,7 @@ static void verify_absent(struct cache_entry *ce, const char *action, static int merged_entry(struct cache_entry *merge, struct cache_entry *old, struct unpack_trees_options *o) { - merge->ce_flags |= htons(CE_UPDATE); + merge->ce_flags |= CE_UPDATE; if (old) { /* * See if we can re-use the old CE directly? @@ -601,7 +600,7 @@ static int merged_entry(struct cache_entry *merge, struct cache_entry *old, invalidate_ce_path(merge); } - merge->ce_flags &= ~htons(CE_STAGEMASK); + merge->ce_flags &= ~CE_STAGEMASK; add_cache_entry(merge, ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE); return 1; } @@ -613,7 +612,7 @@ static int deleted_entry(struct cache_entry *ce, struct cache_entry *old, verify_uptodate(old, o); else verify_absent(ce, "removed", o); - ce->ce_mode = 0; + ce->ce_flags |= CE_REMOVE; add_cache_entry(ce, ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE); invalidate_ce_path(ce); return 1; @@ -634,7 +633,7 @@ static void show_stage_entry(FILE *o, else fprintf(o, "%s%06o %s %d\t%s\n", label, - ntohl(ce->ce_mode), + ce->ce_mode, sha1_to_hex(ce->sha1), ce_stage(ce), ce->name); @@ -920,7 +919,7 @@ int oneway_merge(struct cache_entry **src, struct stat st; if (lstat(old->name, &st) || ce_match_stat(old, &st, CE_MATCH_IGNORE_VALID)) - old->ce_flags |= htons(CE_UPDATE); + old->ce_flags |= CE_UPDATE; } return keep_entry(old, o); } From 7fec10b7f41fa32e71aa6377bd04cd7c6fb419e0 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 18 Jan 2008 23:42:00 -0800 Subject: [PATCH 2/9] index: be careful when handling long names We currently use lower 12-bit (masked with CE_NAMEMASK) in the ce_flags field to store the length of the name in cache_entry, without checking the length parameter given to create_ce_flags(). This can make us store incorrect length. Currently we are mostly protected by the fact that many codepaths first copy the path in a variable of size PATH_MAX, which typically is 4096 that happens to match the limit, but that feels like a bug waiting to happen. Besides, that would not allow us to shorten the width of CE_NAMEMASK to use the bits for new flags. This redefines the meaning of the name length stored in the cache_entry. A name that does not fit is represented by storing CE_NAMEMASK in the field, and the actual length needs to be computed by actually counting the bytes in the name[] field. This way, only the unusually long paths need to suffer. Signed-off-by: Junio C Hamano Signed-off-by: Linus Torvalds --- cache.h | 17 +++++++++++++++-- read-cache.c | 12 +++++++++++- t/t0000-basic.sh | 20 ++++++++++++++++++++ 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/cache.h b/cache.h index 4a054c5402..9eaffdefd0 100644 --- a/cache.h +++ b/cache.h @@ -131,8 +131,21 @@ struct cache_entry { #define CE_UPDATE (0x10000) #define CE_REMOVE (0x20000) -#define create_ce_flags(len, stage) ((len) | ((stage) << CE_STAGESHIFT)) -#define ce_namelen(ce) (CE_NAMEMASK & (ce)->ce_flags) +static inline unsigned create_ce_flags(size_t len, unsigned stage) +{ + if (len >= CE_NAMEMASK) + len = CE_NAMEMASK; + return (len | (stage << CE_STAGESHIFT)); +} + +static inline size_t ce_namelen(const struct cache_entry *ce) +{ + size_t len = ce->ce_flags & CE_NAMEMASK; + if (len < CE_NAMEMASK) + return len; + return strlen(ce->name + CE_NAMEMASK) + CE_NAMEMASK; +} + #define ce_size(ce) cache_entry_size(ce_namelen(ce)) #define ondisk_ce_size(ce) ondisk_cache_entry_size(ce_namelen(ce)) #define ce_stage(ce) ((CE_STAGEMASK & (ce)->ce_flags) >> CE_STAGESHIFT) diff --git a/read-cache.c b/read-cache.c index 82a6238b77..528f697f59 100644 --- a/read-cache.c +++ b/read-cache.c @@ -928,6 +928,8 @@ int read_index(struct index_state *istate) static void convert_from_disk(struct ondisk_cache_entry *ondisk, struct cache_entry *ce) { + size_t len; + ce->ce_ctime = ntohl(ondisk->ctime.sec); ce->ce_mtime = ntohl(ondisk->mtime.sec); ce->ce_dev = ntohl(ondisk->dev); @@ -939,7 +941,15 @@ static void convert_from_disk(struct ondisk_cache_entry *ondisk, struct cache_en /* On-disk flags are just 16 bits */ ce->ce_flags = ntohs(ondisk->flags); hashcpy(ce->sha1, ondisk->sha1); - memcpy(ce->name, ondisk->name, ce_namelen(ce)+1); + + len = ce->ce_flags & CE_NAMEMASK; + if (len == CE_NAMEMASK) + len = strlen(ondisk->name); + /* + * NEEDSWORK: If the original index is crafted, this copy could + * go unchecked. + */ + memcpy(ce->name, ondisk->name, len + 1); } /* remember to discard_cache() before reading a different cache! */ diff --git a/t/t0000-basic.sh b/t/t0000-basic.sh index 4e49d59065..9f84b8d3ac 100755 --- a/t/t0000-basic.sh +++ b/t/t0000-basic.sh @@ -297,4 +297,24 @@ test_expect_success 'absolute path works as expected' ' test "$sym" = "$(test-absolute-path $dir2/syml)" ' +test_expect_success 'very long name in the index handled sanely' ' + + a=a && # 1 + a=$a$a$a$a$a$a$a$a$a$a$a$a$a$a$a$a && # 16 + a=$a$a$a$a$a$a$a$a$a$a$a$a$a$a$a$a && # 256 + a=$a$a$a$a$a$a$a$a$a$a$a$a$a$a$a$a && # 4096 + a=${a}q && + + >path4 && + git update-index --add path4 && + ( + git ls-files -s path4 | + sed -e "s/ .*/ /" | + tr -d "\012" + echo "$a" + ) | git update-index --index-info && + len=$(git ls-files "a*" | wc -c) && + test $len = 4098 +' + test_done From eadb5831342bb2e756fa05c03642c4aa1929d4f5 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 18 Jan 2008 23:45:24 -0800 Subject: [PATCH 3/9] Avoid running lstat(2) on the same cache entry. Aside from the lstat(2) done for work tree files, there are quite many lstat(2) calls in refname dwimming codepath. This patch is not about reducing them. * It adds a new ce_flag, CE_UPTODATE, that is meant to mark the cache entries that record a regular file blob that is up to date in the work tree. If somebody later walks the index and wants to see if the work tree has changes, they do not have to be checked with lstat(2) again. * fill_stat_cache_info() marks the cache entry it just added with CE_UPTODATE. This has the effect of marking the paths we write out of the index and lstat(2) immediately as "no need to lstat -- we know it is up-to-date", from quite a lot fo callers: - git-apply --index - git-update-index - git-checkout-index - git-add (uses add_file_to_index()) - git-commit (ditto) - git-mv (ditto) * refresh_cache_ent() also marks the cache entry that are clean with CE_UPTODATE. * write_index is changed not to write CE_UPTODATE out to the index file, because CE_UPTODATE is meant to be transient only in core. For the same reason, CE_UPDATE is not written to prevent an accident from happening. Signed-off-by: Junio C Hamano Signed-off-by: Linus Torvalds --- cache.h | 3 +++ diff.c | 25 +++++++++++++++---------- read-cache.c | 16 +++++++++++++++- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/cache.h b/cache.h index 9eaffdefd0..3a47cdc9d2 100644 --- a/cache.h +++ b/cache.h @@ -130,6 +130,7 @@ struct cache_entry { /* In-memory only */ #define CE_UPDATE (0x10000) #define CE_REMOVE (0x20000) +#define CE_UPTODATE (0x40000) static inline unsigned create_ce_flags(size_t len, unsigned stage) { @@ -149,6 +150,8 @@ static inline size_t ce_namelen(const struct cache_entry *ce) #define ce_size(ce) cache_entry_size(ce_namelen(ce)) #define ondisk_ce_size(ce) ondisk_cache_entry_size(ce_namelen(ce)) #define ce_stage(ce) ((CE_STAGEMASK & (ce)->ce_flags) >> CE_STAGESHIFT) +#define ce_uptodate(ce) ((ce)->ce_flags & CE_UPTODATE) +#define ce_mark_uptodate(ce) ((ce)->ce_flags |= CE_UPTODATE) #define ce_permissions(mode) (((mode) & 0100) ? 0755 : 0644) static inline unsigned int create_ce_mode(unsigned int mode) diff --git a/diff.c b/diff.c index 5b8afdcb05..d464fe3b20 100644 --- a/diff.c +++ b/diff.c @@ -1510,17 +1510,22 @@ static int reuse_worktree_file(const char *name, const unsigned char *sha1, int if (pos < 0) return 0; ce = active_cache[pos]; - if ((lstat(name, &st) < 0) || - !S_ISREG(st.st_mode) || /* careful! */ - ce_match_stat(ce, &st, 0) || - hashcmp(sha1, ce->sha1)) - return 0; - /* we return 1 only when we can stat, it is a regular file, - * stat information matches, and sha1 recorded in the cache - * matches. I.e. we know the file in the work tree really is - * the same as the pair. + + /* + * This is not the sha1 we are looking for, or + * unreusable because it is not a regular file. */ - return 1; + if (hashcmp(sha1, ce->sha1) || !S_ISREG(ce->ce_mode)) + return 0; + + /* + * If ce matches the file in the work tree, we can reuse it. + */ + if (ce_uptodate(ce) || + (!lstat(name, &st) && !ce_match_stat(ce, &st, 0))) + return 1; + + return 0; } static int populate_from_stdin(struct diff_filespec *s) diff --git a/read-cache.c b/read-cache.c index 528f697f59..8ba8f0f88f 100644 --- a/read-cache.c +++ b/read-cache.c @@ -40,6 +40,9 @@ void fill_stat_cache_info(struct cache_entry *ce, struct stat *st) if (assume_unchanged) ce->ce_flags |= CE_VALID; + + if (S_ISREG(st->st_mode)) + ce_mark_uptodate(ce); } static int ce_compare_data(struct cache_entry *ce, struct stat *st) @@ -412,6 +415,7 @@ int add_file_to_index(struct index_state *istate, const char *path, int verbose) !ie_match_stat(istate, istate->cache[pos], &st, ce_option)) { /* Nothing changed, really */ free(ce); + ce_mark_uptodate(istate->cache[pos]); return 0; } @@ -779,6 +783,9 @@ static struct cache_entry *refresh_cache_ent(struct index_state *istate, int changed, size; int ignore_valid = options & CE_MATCH_IGNORE_VALID; + if (ce_uptodate(ce)) + return ce; + if (lstat(ce->name, &st) < 0) { if (err) *err = errno; @@ -797,8 +804,15 @@ static struct cache_entry *refresh_cache_ent(struct index_state *istate, if (ignore_valid && assume_unchanged && !(ce->ce_flags & CE_VALID)) ; /* mark this one VALID again */ - else + else { + /* + * We do not mark the index itself "modified" + * because CE_UPTODATE flag is in-core only; + * we are not going to write this change out. + */ + ce_mark_uptodate(ce); return ce; + } } if (ie_modified(istate, ce, &st, options)) { From d1f2d7e8ca65504722108e2db710788f66c34c6c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 19 Jan 2008 17:27:12 -0800 Subject: [PATCH 4/9] Make run_diff_index() use unpack_trees(), not read_tree() A plain "git commit" would still run lstat() a lot more than necessary, because wt_status_print() would cause the index to be repeatedly flushed and re-read by wt_read_cache(), and that would cause the CE_UPTODATE bit to be lost, resulting in the files in the index being lstat'ed three times each. The reason why wt-status.c ended up invalidating and re-reading the cache multiple times was that it uses "run_diff_index()", which in turn uses "read_tree()" to populate the index with *both* the old index and the tree we want to compare against. So this patch re-writes run_diff_index() to not use read_tree(), but instead use "unpack_trees()" to diff the index to a tree. That, in turn, means that we don't need to modify the index itself, which then means that we don't need to invalidate it and re-read it! This, together with the lstat() optimizations, means that "git commit" on the kernel tree really only needs to lstat() the index entries once. That noticeably cuts down on the cached timings. Best time before: [torvalds@woody linux]$ time git commit > /dev/null real 0m0.399s user 0m0.232s sys 0m0.164s Best time after: [torvalds@woody linux]$ time git commit > /dev/null real 0m0.254s user 0m0.140s sys 0m0.112s so it's a noticeable improvement in addition to being a nice conceptual cleanup (it's really not that pretty that "run_diff_index()" dirties the index!) Doing an "strace -c" on it also shows that as it cuts the number of lstat() calls by two thirds, it goes from being lstat()-limited to being limited by getdents() (which is the readdir system call): Before: % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 60.69 0.000704 0 69230 31 lstat 23.62 0.000274 0 5522 getdents 8.36 0.000097 0 5508 2638 open 2.59 0.000030 0 2869 close 2.50 0.000029 0 274 write 1.47 0.000017 0 2844 fstat After: % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 45.17 0.000276 0 5522 getdents 26.51 0.000162 0 23112 31 lstat 19.80 0.000121 0 5503 2638 open 4.91 0.000030 0 2864 close 1.48 0.000020 0 274 write 1.34 0.000018 0 2844 fstat ... It passes the test-suite for me, but this is another of one of those really core functions, and certainly pretty subtle, so.. NOTE! The Linux lstat() system call is really quite cheap when everything is cached, so the fact that this is quite noticeable on Linux is likely to mean that it is *much* more noticeable on other operating systems. I bet you'll see a much bigger performance improvement from this on Windows in particular. Signed-off-by: Linus Torvalds --- diff-lib.c | 151 ++++++++++++++++++++++++++++++++++++++++++++----- unpack-trees.h | 1 + wt-status.c | 10 ---- 3 files changed, 138 insertions(+), 24 deletions(-) diff --git a/diff-lib.c b/diff-lib.c index 4a05b02cd0..045f3cc58e 100644 --- a/diff-lib.c +++ b/diff-lib.c @@ -9,6 +9,7 @@ #include "revision.h" #include "cache-tree.h" #include "path-list.h" +#include "unpack-trees.h" /* * diff-files @@ -435,6 +436,8 @@ int run_diff_files(struct rev_info *revs, unsigned int option) continue; } + if (ce_uptodate(ce)) + continue; if (lstat(ce->name, &st) < 0) { if (errno != ENOENT && errno != ENOTDIR) { perror(ce->name); @@ -665,20 +668,133 @@ static void mark_merge_entries(void) } } -int run_diff_index(struct rev_info *revs, int cached) +/* + * This gets a mix of an existing index and a tree, one pathname entry + * at a time. The index entry may be a single stage-0 one, but it could + * also be multiple unmerged entries (in which case idx_pos/idx_nr will + * give you the position and number of entries in the index). + */ +static void do_oneway_diff(struct unpack_trees_options *o, + struct cache_entry *idx, + struct cache_entry *tree, + int idx_pos, int idx_nr) { - int ret; - struct object *ent; - struct tree *tree; - const char *tree_name; - int match_missing = 0; + struct rev_info *revs = o->unpack_data; + int match_missing, cached; /* * Backward compatibility wart - "diff-index -m" does - * not mean "do not ignore merges", but totally different. + * not mean "do not ignore merges", but "match_missing". + * + * But with the revision flag parsing, that's found in + * "!revs->ignore_merges". */ - if (!revs->ignore_merges) - match_missing = 1; + cached = o->index_only; + match_missing = !revs->ignore_merges; + + if (cached && idx && ce_stage(idx)) { + if (tree) + diff_unmerge(&revs->diffopt, idx->name, idx->ce_mode, idx->sha1); + return; + } + + /* + * Something added to the tree? + */ + if (!tree) { + show_new_file(revs, idx, cached, match_missing); + return; + } + + /* + * Something removed from the tree? + */ + if (!idx) { + diff_index_show_file(revs, "-", tree, tree->sha1, tree->ce_mode); + return; + } + + /* Show difference between old and new */ + show_modified(revs, tree, idx, 1, cached, match_missing); +} + +/* + * Count how many index entries go with the first one + */ +static inline int count_skip(const struct cache_entry *src, int pos) +{ + int skip = 1; + + /* We can only have multiple entries if the first one is not stage-0 */ + if (ce_stage(src)) { + struct cache_entry **p = active_cache + pos; + int namelen = ce_namelen(src); + + for (;;) { + const struct cache_entry *ce; + pos++; + if (pos >= active_nr) + break; + ce = *++p; + if (ce_namelen(ce) != namelen) + break; + if (memcmp(ce->name, src->name, namelen)) + break; + skip++; + } + } + return skip; +} + +/* + * The unpack_trees() interface is designed for merging, so + * the different source entries are designed primarily for + * the source trees, with the old index being really mainly + * used for being replaced by the result. + * + * For diffing, the index is more important, and we only have a + * single tree. + * + * We're supposed to return how many index entries we want to skip. + * + * This wrapper makes it all more readable, and takes care of all + * the fairly complex unpack_trees() semantic requirements, including + * the skipping, the path matching, the type conflict cases etc. + */ +static int oneway_diff(struct cache_entry **src, + struct unpack_trees_options *o, + int index_pos) +{ + int skip = 0; + struct cache_entry *idx = src[0]; + struct cache_entry *tree = src[1]; + struct rev_info *revs = o->unpack_data; + + if (index_pos >= 0) + skip = count_skip(idx, index_pos); + + /* + * Unpack-trees generates a DF/conflict entry if + * there was a directory in the index and a tree + * in the tree. From a diff standpoint, that's a + * delete of the tree and a create of the file. + */ + if (tree == o->df_conflict_entry) + tree = NULL; + + if (ce_path_match(idx ? idx : tree, revs->prune_data)) + do_oneway_diff(o, idx, tree, index_pos, skip); + + return skip; +} + +int run_diff_index(struct rev_info *revs, int cached) +{ + struct object *ent; + struct tree *tree; + const char *tree_name; + struct unpack_trees_options opts; + struct tree_desc t; mark_merge_entries(); @@ -687,13 +803,20 @@ int run_diff_index(struct rev_info *revs, int cached) tree = parse_tree_indirect(ent->sha1); if (!tree) return error("bad tree object %s", tree_name); - if (read_tree(tree, 1, revs->prune_data)) - return error("unable to read tree object %s", tree_name); - ret = diff_cache(revs, active_cache, active_nr, revs->prune_data, - cached, match_missing); + + memset(&opts, 0, sizeof(opts)); + opts.head_idx = 1; + opts.index_only = cached; + opts.merge = 1; + opts.fn = oneway_diff; + opts.unpack_data = revs; + + init_tree_desc(&t, tree->buffer, tree->size); + unpack_trees(1, &t, &opts); + diffcore_std(&revs->diffopt); diff_flush(&revs->diffopt); - return ret; + return 0; } int do_diff_cache(const unsigned char *tree_sha1, struct diff_options *opt) diff --git a/unpack-trees.h b/unpack-trees.h index 5517faafad..197a0044aa 100644 --- a/unpack-trees.h +++ b/unpack-trees.h @@ -25,6 +25,7 @@ struct unpack_trees_options { int merge_size; struct cache_entry *df_conflict_entry; + void *unpack_data; }; extern int unpack_trees(unsigned n, struct tree_desc *t, diff --git a/wt-status.c b/wt-status.c index c0c247243b..27b946d552 100644 --- a/wt-status.c +++ b/wt-status.c @@ -217,19 +217,12 @@ static void wt_status_print_changed_cb(struct diff_queue_struct *q, wt_status_print_trailer(s); } -static void wt_read_cache(struct wt_status *s) -{ - discard_cache(); - read_cache_from(s->index_file); -} - static void wt_status_print_initial(struct wt_status *s) { int i; struct strbuf buf; strbuf_init(&buf, 0); - wt_read_cache(s); if (active_nr) { s->commitable = 1; wt_status_print_cached_header(s); @@ -256,7 +249,6 @@ static void wt_status_print_updated(struct wt_status *s) rev.diffopt.detect_rename = 1; rev.diffopt.rename_limit = 100; rev.diffopt.break_opt = 0; - wt_read_cache(s); run_diff_index(&rev, 1); } @@ -268,7 +260,6 @@ static void wt_status_print_changed(struct wt_status *s) rev.diffopt.output_format |= DIFF_FORMAT_CALLBACK; rev.diffopt.format_callback = wt_status_print_changed_cb; rev.diffopt.format_callback_data = s; - wt_read_cache(s); run_diff_files(&rev, 0); } @@ -335,7 +326,6 @@ static void wt_status_print_verbose(struct wt_status *s) setup_revisions(0, NULL, &rev, s->reference); rev.diffopt.output_format |= DIFF_FORMAT_PATCH; rev.diffopt.detect_rename = 1; - wt_read_cache(s); run_diff_index(&rev, 1); fflush(stdout); From 204ce979a5ed6f182e56bea5282c7b4a2d91208a Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sun, 20 Jan 2008 15:19:56 +0000 Subject: [PATCH 5/9] Also use unpack_trees() in do_diff_cache() As in run_diff_index(), we call unpack_trees() with the oneway_diff() function in do_diff_cache() now. This makes the function diff_cache() obsolete. Signed-off-by: Johannes Schindelin Signed-off-by: Linus Torvalds --- diff-lib.c | 92 ++++++++---------------------------------------------- 1 file changed, 13 insertions(+), 79 deletions(-) diff --git a/diff-lib.c b/diff-lib.c index 045f3cc58e..03eaa7cef3 100644 --- a/diff-lib.c +++ b/diff-lib.c @@ -577,81 +577,6 @@ static int show_modified(struct rev_info *revs, return 0; } -static int diff_cache(struct rev_info *revs, - struct cache_entry **ac, int entries, - const char **pathspec, - int cached, int match_missing) -{ - while (entries) { - struct cache_entry *ce = *ac; - int same = (entries > 1) && ce_same_name(ce, ac[1]); - - if (DIFF_OPT_TST(&revs->diffopt, QUIET) && - DIFF_OPT_TST(&revs->diffopt, HAS_CHANGES)) - break; - - if (!ce_path_match(ce, pathspec)) - goto skip_entry; - - switch (ce_stage(ce)) { - case 0: - /* No stage 1 entry? That means it's a new file */ - if (!same) { - show_new_file(revs, ce, cached, match_missing); - break; - } - /* Show difference between old and new */ - show_modified(revs, ac[1], ce, 1, - cached, match_missing); - break; - case 1: - /* No stage 3 (merge) entry? - * That means it's been deleted. - */ - if (!same) { - diff_index_show_file(revs, "-", ce, - ce->sha1, ce->ce_mode); - break; - } - /* We come here with ce pointing at stage 1 - * (original tree) and ac[1] pointing at stage - * 3 (unmerged). show-modified with - * report-missing set to false does not say the - * file is deleted but reports true if work - * tree does not have it, in which case we - * fall through to report the unmerged state. - * Otherwise, we show the differences between - * the original tree and the work tree. - */ - if (!cached && - !show_modified(revs, ce, ac[1], 0, - cached, match_missing)) - break; - diff_unmerge(&revs->diffopt, ce->name, - ce->ce_mode, ce->sha1); - break; - case 3: - diff_unmerge(&revs->diffopt, ce->name, - 0, null_sha1); - break; - - default: - die("impossible cache entry stage"); - } - -skip_entry: - /* - * Ignore all the different stages for this file, - * we've handled the relevant cases now. - */ - do { - ac++; - entries--; - } while (entries && ce_same_name(ce, ac[0])); - } - return 0; -} - /* * This turns all merge entries into "stage 3". That guarantees that * when we read in the new tree (into "stage 1"), we won't lose sight @@ -826,6 +751,8 @@ int do_diff_cache(const unsigned char *tree_sha1, struct diff_options *opt) int i; struct cache_entry **dst; struct cache_entry *last = NULL; + struct unpack_trees_options opts; + struct tree_desc t; /* * This is used by git-blame to run diff-cache internally; @@ -853,8 +780,15 @@ int do_diff_cache(const unsigned char *tree_sha1, struct diff_options *opt) tree = parse_tree_indirect(tree_sha1); if (!tree) die("bad tree object %s", sha1_to_hex(tree_sha1)); - if (read_tree(tree, 1, opt->paths)) - return error("unable to read tree %s", sha1_to_hex(tree_sha1)); - return diff_cache(&revs, active_cache, active_nr, revs.prune_data, - 1, 0); + + memset(&opts, 0, sizeof(opts)); + opts.head_idx = 1; + opts.index_only = 1; + opts.merge = 1; + opts.fn = oneway_diff; + opts.unpack_data = &revs; + + init_tree_desc(&t, tree->buffer, tree->size); + unpack_trees(1, &t, &opts); + return 0; } From 077c48df8a72b046a2f562fedffa1c3d3a73a4e2 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 22 Jan 2008 21:24:21 -0800 Subject: [PATCH 6/9] read-cache.c: fix a couple more CE_REMOVE conversion It is a D/F conflict if you want to add "foo/bar" to the index when "foo" already exists. Also it is a conflict if you want to add a file "foo" when "foo/bar" exists. An exception is when the existing entry is there only to mark "I used to be here but I am being removed". This is needed for operations such as "git read-tree -m -u" that update the index and then reflect the result to the work tree --- we need to remember what to remove somewhere, and we use the index for that. In such a case, an existing file "foo" is being removed and we can create "foo/" directory and hang "bar" underneath it without any conflict. We used to use (ce->ce_mode == 0) to mark an entry that is being removed, but (CE_REMOVE & ce->ce_flags) is used for that purpose these days. An earlier commit forgot to convert the logic in the code that checks D/F conflict condition. The old code knew that "to be removed" entries cannot be at higher stage and actively checked that condition, but it was an unnecessary check. This patch removes the extra check as well. Acked-by: Linus Torvalds Signed-off-by: Junio C Hamano --- read-cache.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/read-cache.c b/read-cache.c index 8ba8f0f88f..8f5d02afaa 100644 --- a/read-cache.c +++ b/read-cache.c @@ -615,7 +615,7 @@ static int has_dir_name(struct index_state *istate, * it is Ok to have a directory at the same * path. */ - if (stage || istate->cache[pos]->ce_mode) { + if (!(istate->cache[pos]->ce_flags & CE_REMOVE)) { retval = -1; if (!ok_to_replace) break; @@ -637,8 +637,9 @@ static int has_dir_name(struct index_state *istate, (p->name[len] != '/') || memcmp(p->name, name, len)) break; /* not our subdirectory */ - if (ce_stage(p) == stage && (stage || p->ce_mode)) - /* p is at the same stage as our entry, and + if (ce_stage(p) == stage && !(p->ce_flags & CE_REMOVE)) + /* + * p is at the same stage as our entry, and * is a subdirectory of what we are looking * at, so we cannot have conflicts at our * level or anything shorter. From 6d91da6d3c170d026f2d7f79bbd7b657a6908cc8 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 21 Jan 2008 00:44:50 -0800 Subject: [PATCH 7/9] read-cache.c: introduce is_racy_timestamp() helper This moves a common boolean expression into a helper function, and makes the comparison between filesystem timestamp and index timestamp done in the function in line with the other places. st.st_mtime should be casted to (unsigned int) when compared to an index timestamp ce_mtime. Signed-off-by: Junio C Hamano --- read-cache.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/read-cache.c b/read-cache.c index 8f5d02afaa..07abd5d7eb 100644 --- a/read-cache.c +++ b/read-cache.c @@ -181,6 +181,12 @@ static int ce_match_stat_basic(struct cache_entry *ce, struct stat *st) return changed; } +static int is_racy_timestamp(struct index_state *istate, struct cache_entry *ce) +{ + return (istate->timestamp && + ((unsigned int)istate->timestamp) <= ce->ce_mtime); +} + int ie_match_stat(struct index_state *istate, struct cache_entry *ce, struct stat *st, unsigned int options) @@ -214,9 +220,7 @@ int ie_match_stat(struct index_state *istate, * whose mtime are the same as the index file timestamp more * carefully than others. */ - if (!changed && - istate->timestamp && - istate->timestamp <= ce->ce_mtime) { + if (!changed && is_racy_timestamp(istate, ce)) { if (assume_racy_is_modified) changed |= DATA_CHANGED; else @@ -1233,8 +1237,7 @@ int write_index(struct index_state *istate, int newfd) struct cache_entry *ce = cache[i]; if (ce->ce_flags & CE_REMOVE) continue; - if (istate->timestamp && - istate->timestamp <= ce->ce_mtime) + if (is_racy_timestamp(istate, ce)) ce_smudge_racily_clean_entry(ce); if (ce_write_entry(&c, newfd, ce) < 0) return -1; From cf558704fb68514a820e3666968967c900e0fd29 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 22 Jan 2008 18:41:14 -0800 Subject: [PATCH 8/9] Create pathname-based hash-table lookup into index This creates a hash index of every single file added to the index. Right now that hash index isn't actually used for much: I implemented a "cache_name_exists()" function that uses it to efficiently look up a filename in the index without having to do the O(logn) binary search, but quite frankly, that's not why this patch is interesting. No, the whole and only reason to create the hash of the filenames in the index is that by modifying the hash function, you can fairly easily do things like making it always hash equivalent names into the same bucket. That, in turn, means that suddenly questions like "does this name exist in the index under an _equivalent_ name?" becomes much much cheaper. Guiding principles behind this patch: - it shouldn't be too costly. In fact, my primary goal here was to actually speed up "git commit" with a fully populated kernel tree, by being faster at checking whether a file already existed in the index. I did succeed, but only barely: Best before: [torvalds@woody linux]$ time git commit > /dev/null real 0m0.255s user 0m0.168s sys 0m0.088s Best after: [torvalds@woody linux]$ time ~/git/git commit > /dev/null real 0m0.233s user 0m0.144s sys 0m0.088s so some things are actually faster (~8%). Caveat: that's really the best case. Other things are invariably going to be slightly slower, since we populate that index cache, and quite frankly, few things really use it to look things up. That said, the cost is really quite small. The worst case is probably doing a "git ls-files", which will do very little except puopulate the index, and never actually looks anything up in it, just lists it. Before: [torvalds@woody linux]$ time git ls-files > /dev/null real 0m0.016s user 0m0.016s sys 0m0.000s After: [torvalds@woody linux]$ time ~/git/git ls-files > /dev/null real 0m0.021s user 0m0.012s sys 0m0.008s and while the thing has really gotten relatively much slower, we're still talking about something almost unmeasurable (eg 5ms). And that really should be pretty much the worst case. So we lose 5ms on one "benchmark", but win 22ms on another. Pick your poison - this patch has the advantage that it will _likely_ speed up the cases that are complex and expensive more than it slows down the cases that are already so fast that nobody cares. But if you look at relative speedups/slowdowns, it doesn't look so good. - It should be simple and clean The code may be a bit subtle (the reasons I do hash removal the way I do etc), but it re-uses the existing hash.c files, so it really is fairly small and straightforward apart from a few odd details. Now, this patch on its own doesn't really do much, but I think it's worth looking at, if only because if done correctly, the name hashing really can make an improvement to the whole issue of "do we have a filename that looks like this in the index already". And at least it gets real testing by being used even by default (ie there is a real use-case for it even without any insane filesystems). NOTE NOTE NOTE! The current hash is a joke. I'm ashamed of it, I'm just not ashamed of it enough to really care. I took all the numbers out of my nether regions - I'm sure it's good enough that it works in practice, but the whole point was that you can make a really much fancier hash that hashes characters not directly, but by their upper-case value or something like that, and thus you get a case-insensitive hash, while still keeping the name and the index itself totally case sensitive. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- cache.h | 6 ++++ dir.c | 2 +- read-cache.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 95 insertions(+), 11 deletions(-) diff --git a/cache.h b/cache.h index 3a47cdc9d2..409738ca6b 100644 --- a/cache.h +++ b/cache.h @@ -3,6 +3,7 @@ #include "git-compat-util.h" #include "strbuf.h" +#include "hash.h" #include SHA1_HEADER #include @@ -109,6 +110,7 @@ struct ondisk_cache_entry { }; struct cache_entry { + struct cache_entry *next; unsigned int ce_ctime; unsigned int ce_mtime; unsigned int ce_dev; @@ -131,6 +133,7 @@ struct cache_entry { #define CE_UPDATE (0x10000) #define CE_REMOVE (0x20000) #define CE_UPTODATE (0x40000) +#define CE_UNHASHED (0x80000) static inline unsigned create_ce_flags(size_t len, unsigned stage) { @@ -188,6 +191,7 @@ struct index_state { struct cache_tree *cache_tree; time_t timestamp; void *alloc; + struct hash_table name_hash; }; extern struct index_state the_index; @@ -211,6 +215,7 @@ extern struct index_state the_index; #define refresh_cache(flags) refresh_index(&the_index, (flags), NULL, NULL) #define ce_match_stat(ce, st, options) ie_match_stat(&the_index, (ce), (st), (options)) #define ce_modified(ce, st, options) ie_modified(&the_index, (ce), (st), (options)) +#define cache_name_exists(name, namelen) index_name_exists(&the_index, (name), (namelen)) #endif enum object_type { @@ -297,6 +302,7 @@ extern int read_index_from(struct index_state *, const char *path); extern int write_index(struct index_state *, int newfd); extern int discard_index(struct index_state *); extern int verify_path(const char *path); +extern int index_name_exists(struct index_state *istate, const char *name, int namelen); extern int index_name_pos(struct index_state *, const char *name, int namelen); #define ADD_CACHE_OK_TO_ADD 1 /* Ok to add */ #define ADD_CACHE_OK_TO_REPLACE 2 /* Ok to replace file/directory */ diff --git a/dir.c b/dir.c index 1b9cc7a8a8..6543105b96 100644 --- a/dir.c +++ b/dir.c @@ -346,7 +346,7 @@ static struct dir_entry *dir_entry_new(const char *pathname, int len) struct dir_entry *dir_add_name(struct dir_struct *dir, const char *pathname, int len) { - if (cache_name_pos(pathname, len) >= 0) + if (cache_name_exists(pathname, len)) return NULL; ALLOC_GROW(dir->entries, dir->nr+1, dir->alloc); diff --git a/read-cache.c b/read-cache.c index 07abd5d7eb..9477c0b398 100644 --- a/read-cache.c +++ b/read-cache.c @@ -23,6 +23,70 @@ struct index_state the_index; +static unsigned int hash_name(const char *name, int namelen) +{ + unsigned int hash = 0x123; + + do { + unsigned char c = *name++; + hash = hash*101 + c; + } while (--namelen); + return hash; +} + +static void set_index_entry(struct index_state *istate, int nr, struct cache_entry *ce) +{ + void **pos; + unsigned int hash = hash_name(ce->name, ce_namelen(ce)); + + istate->cache[nr] = ce; + pos = insert_hash(hash, ce, &istate->name_hash); + if (pos) { + ce->next = *pos; + *pos = ce; + } +} + +/* + * We don't actually *remove* it, we can just mark it invalid so that + * we won't find it in lookups. + * + * Not only would we have to search the lists (simple enough), but + * we'd also have to rehash other hash buckets in case this makes the + * hash bucket empty (common). So it's much better to just mark + * it. + */ +static void remove_hash_entry(struct index_state *istate, struct cache_entry *ce) +{ + ce->ce_flags |= CE_UNHASHED; +} + +static void replace_index_entry(struct index_state *istate, int nr, struct cache_entry *ce) +{ + struct cache_entry *old = istate->cache[nr]; + + if (ce != old) { + remove_hash_entry(istate, old); + set_index_entry(istate, nr, ce); + } + istate->cache_changed = 1; +} + +int index_name_exists(struct index_state *istate, const char *name, int namelen) +{ + unsigned int hash = hash_name(name, namelen); + struct cache_entry *ce = lookup_hash(hash, &istate->name_hash); + + while (ce) { + if (!(ce->ce_flags & CE_UNHASHED)) { + if (!cache_name_compare(name, namelen, ce->name, ce->ce_flags)) + return 1; + } + ce = ce->next; + } + return 0; +} + /* * This only updates the "non-critical" parts of the directory * cache, ie the parts that aren't tracked by GIT, and only used @@ -327,6 +391,9 @@ int index_name_pos(struct index_state *istate, const char *name, int namelen) /* Remove entry, return true if there are more entries to go.. */ int remove_index_entry_at(struct index_state *istate, int pos) { + struct cache_entry *ce = istate->cache[pos]; + + remove_hash_entry(istate, ce); istate->cache_changed = 1; istate->cache_nr--; if (pos >= istate->cache_nr) @@ -702,8 +769,7 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e /* existing match? Just replace it. */ if (pos >= 0) { - istate->cache_changed = 1; - istate->cache[pos] = ce; + replace_index_entry(istate, pos, ce); return 0; } pos = -pos-1; @@ -763,7 +829,7 @@ int add_index_entry(struct index_state *istate, struct cache_entry *ce, int opti memmove(istate->cache + pos + 1, istate->cache + pos, (istate->cache_nr - pos - 1) * sizeof(ce)); - istate->cache[pos] = ce; + set_index_entry(istate, pos, ce); istate->cache_changed = 1; return 0; } @@ -892,11 +958,8 @@ int refresh_index(struct index_state *istate, unsigned int flags, const char **p has_errors = 1; continue; } - istate->cache_changed = 1; - /* You can NOT just free istate->cache[i] here, since it - * might not be necessarily malloc()ed but can also come - * from mmap(). */ - istate->cache[i] = new; + + replace_index_entry(istate, i, new); } return has_errors; } @@ -971,6 +1034,20 @@ static void convert_from_disk(struct ondisk_cache_entry *ondisk, struct cache_en memcpy(ce->name, ondisk->name, len + 1); } +static inline size_t estimate_cache_size(size_t ondisk_size, unsigned int entries) +{ + long per_entry; + + per_entry = sizeof(struct cache_entry) - sizeof(struct ondisk_cache_entry); + + /* + * Alignment can cause differences. This should be "alignof", but + * since that's a gcc'ism, just use the size of a pointer. + */ + per_entry += sizeof(void *); + return ondisk_size + entries*per_entry; +} + /* remember to discard_cache() before reading a different cache! */ int read_index_from(struct index_state *istate, const char *path) { @@ -1021,7 +1098,7 @@ int read_index_from(struct index_state *istate, const char *path) * has room for a few more flags, we can allocate using the same * index size */ - istate->alloc = xmalloc(mmap_size); + istate->alloc = xmalloc(estimate_cache_size(mmap_size, istate->cache_nr)); src_offset = sizeof(*hdr); dst_offset = 0; @@ -1032,7 +1109,7 @@ int read_index_from(struct index_state *istate, const char *path) disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset); ce = (struct cache_entry *)((char *)istate->alloc + dst_offset); convert_from_disk(disk_ce, ce); - istate->cache[i] = ce; + set_index_entry(istate, i, ce); src_offset += ondisk_ce_size(ce); dst_offset += ce_size(ce); @@ -1070,6 +1147,7 @@ int discard_index(struct index_state *istate) istate->cache_nr = 0; istate->cache_changed = 0; istate->timestamp = 0; + free_hash(&istate->name_hash); cache_tree_free(&(istate->cache_tree)); free(istate->alloc); istate->alloc = NULL; From 9cb76b8cdc8ac62a77080595f6443613fd64bab3 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 22 Jan 2008 23:01:13 -0800 Subject: [PATCH 9/9] lazy index hashing This delays the hashing of index names until it becomes necessary for the first time. Signed-off-by: Junio C Hamano --- cache.h | 1 + read-cache.c | 26 +++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/cache.h b/cache.h index 409738ca6b..e4aeff07d1 100644 --- a/cache.h +++ b/cache.h @@ -191,6 +191,7 @@ struct index_state { struct cache_tree *cache_tree; time_t timestamp; void *alloc; + unsigned name_hash_initialized : 1; struct hash_table name_hash; }; diff --git a/read-cache.c b/read-cache.c index 9477c0b398..e45f4b3d61 100644 --- a/read-cache.c +++ b/read-cache.c @@ -34,12 +34,11 @@ static unsigned int hash_name(const char *name, int namelen) return hash; } -static void set_index_entry(struct index_state *istate, int nr, struct cache_entry *ce) +static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) { void **pos; unsigned int hash = hash_name(ce->name, ce_namelen(ce)); - istate->cache[nr] = ce; pos = insert_hash(hash, ce, &istate->name_hash); if (pos) { ce->next = *pos; @@ -47,6 +46,24 @@ static void set_index_entry(struct index_state *istate, int nr, struct cache_ent } } +static void lazy_init_name_hash(struct index_state *istate) +{ + int nr; + + if (istate->name_hash_initialized) + return; + for (nr = 0; nr < istate->cache_nr; nr++) + hash_index_entry(istate, istate->cache[nr]); + istate->name_hash_initialized = 1; +} + +static void set_index_entry(struct index_state *istate, int nr, struct cache_entry *ce) +{ + istate->cache[nr] = ce; + if (istate->name_hash_initialized) + hash_index_entry(istate, ce); +} + /* * We don't actually *remove* it, we can just mark it invalid so that * we won't find it in lookups. @@ -75,7 +92,10 @@ static void replace_index_entry(struct index_state *istate, int nr, struct cache int index_name_exists(struct index_state *istate, const char *name, int namelen) { unsigned int hash = hash_name(name, namelen); - struct cache_entry *ce = lookup_hash(hash, &istate->name_hash); + struct cache_entry *ce; + + lazy_init_name_hash(istate); + ce = lookup_hash(hash, &istate->name_hash); while (ce) { if (!(ce->ce_flags & CE_UNHASHED)) {