-
Notifications
You must be signed in to change notification settings - Fork 106
midx: verify: group objects by packfile to speed up object verification #124
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| #include "midx.h" | ||
| #include "progress.h" | ||
| #include "run-command.h" | ||
| #include "trace2.h" | ||
|
|
||
| #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */ | ||
| #define MIDX_VERSION 1 | ||
|
|
@@ -165,6 +166,9 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local | |
| m->pack_names[i]); | ||
| } | ||
|
|
||
| trace2_data_intmax("midx", the_repository, "load/num_packs", m->num_packs); | ||
| trace2_data_intmax("midx", the_repository, "load/num_objects", m->num_objects); | ||
|
|
||
| return m; | ||
|
|
||
| cleanup_fail: | ||
|
|
@@ -1001,9 +1005,29 @@ static void midx_report(const char *fmt, ...) | |
| va_end(ap); | ||
| } | ||
|
|
||
| struct pair_pos_vs_id | ||
| { | ||
| uint32_t pos; | ||
| uint32_t pack_int_id; | ||
| }; | ||
|
|
||
| static int compare_pair_pos_vs_id(const void *_a, const void *_b) | ||
| { | ||
| struct pair_pos_vs_id *a = (struct pair_pos_vs_id *)_a; | ||
| struct pair_pos_vs_id *b = (struct pair_pos_vs_id *)_b; | ||
|
|
||
| if (a->pack_int_id < b->pack_int_id) | ||
| return -1; | ||
| if (a->pack_int_id > b->pack_int_id) | ||
| return 1; | ||
|
|
||
| return 0; | ||
| } | ||
|
|
||
| int verify_midx_file(const char *object_dir) | ||
| { | ||
| uint32_t i; | ||
| struct pair_pos_vs_id *pairs = NULL; | ||
| uint32_t i, k; | ||
| struct progress *progress; | ||
| struct multi_pack_index *m = load_multi_pack_index(object_dir, 1); | ||
| verify_midx_error = 0; | ||
|
|
@@ -1040,15 +1064,32 @@ int verify_midx_file(const char *object_dir) | |
| } | ||
|
|
||
| progress = start_progress(_("Verifying object offsets"), m->num_objects); | ||
|
|
||
| /* | ||
| * Create an array mapping each object to its packfile id. Sort it | ||
| * to group the objects by packfile. Using this permutation to visit | ||
| * each of the objects requires only 1 packfile to be open at a time. | ||
| */ | ||
| ALLOC_ARRAY(pairs, m->num_objects); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For very large repos, this could be asking for many GB, right? Should a failure to alloc cause a failure in midx validation?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Jeff and I talked offline and I misunderstood the number of objects in play. We should ask for less than 500MB, which should be serviceable. |
||
| for (i = 0; i < m->num_objects; i++) { | ||
| pairs[i].pos = i; | ||
| pairs[i].pack_int_id = nth_midxed_pack_int_id(m, i); | ||
| } | ||
| QSORT(pairs, m->num_objects, compare_pair_pos_vs_id); | ||
|
|
||
| for (k = 0; k < m->num_objects; k++) { | ||
| struct object_id oid; | ||
| struct pack_entry e; | ||
| off_t m_offset, p_offset; | ||
|
|
||
| nth_midxed_object_oid(&oid, m, i); | ||
| if (k > 0 && pairs[k-1].pack_int_id != pairs[k].pack_int_id && | ||
| m->packs[pairs[k-1].pack_int_id]) | ||
| close_pack_fd(m->packs[pairs[k-1].pack_int_id]); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I'm reading this correctly, this is an (optional) optimization that will keep the pack files open to a minimum. I'm assuming without it, they would start being closed transparently as you reached some max threshold. Since you know they are sorted, makes sense to do the optimization here.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, the problem with max file descriptors that I fixed yesterday said we were holding 2000+ packfiles open when we started running out. Fixing that caused us to still hold 2000+ open, but close and open packfiles as necessary to do the random access. So yeah, this fix kinda eliminates the need for the previous fix. But I'm keeping that one in for now since it is harmless and just seems like the correct thing to do. |
||
|
|
||
| nth_midxed_object_oid(&oid, m, pairs[k].pos); | ||
| if (!fill_midx_entry(&oid, &e, m)) { | ||
| midx_report(_("failed to load pack entry for oid[%d] = %s"), | ||
| i, oid_to_hex(&oid)); | ||
| pairs[k].pos, oid_to_hex(&oid)); | ||
| continue; | ||
| } | ||
|
|
||
|
|
@@ -1063,12 +1104,14 @@ int verify_midx_file(const char *object_dir) | |
|
|
||
| if (m_offset != p_offset) | ||
| midx_report(_("incorrect object offset for oid[%d] = %s: %"PRIx64" != %"PRIx64), | ||
| i, oid_to_hex(&oid), m_offset, p_offset); | ||
| pairs[k].pos, oid_to_hex(&oid), m_offset, p_offset); | ||
|
|
||
| display_progress(progress, i + 1); | ||
| display_progress(progress, k + 1); | ||
| } | ||
| stop_progress(&progress); | ||
|
|
||
| free(pairs); | ||
|
|
||
| return verify_midx_error; | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -309,7 +309,7 @@ void close_pack_windows(struct packed_git *p) | |
| } | ||
| } | ||
|
|
||
| static int close_pack_fd(struct packed_git *p) | ||
| int close_pack_fd(struct packed_git *p) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This wouldn't be needed without the optimization above but I don't see any problem making this public.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. right. the new verify loop completely verifies all objects in one packfile before moving to the next packfile (because of the sort). But when we hit 2000+ packfiles in the directory, visiting the next packfile requires us to free up a fd, and |
||
| { | ||
| if (p->pack_fd < 0) | ||
| return 0; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense, and follows a pattern that exists elsewhere in this file (ie midx_repack()). Clearly results in a huge perf win by avoiding opening/closing pack files all the time.