Skip to content

Commit 6364d70

Browse files
committed
Merge pull request #421: Sparse index: integrate with the sparse-checkout builtin
This integrates the `sparse-checkout` builtin with the sparse index. The tricky part here is that we need to partially expand the index when we are modifying the sparse-checkout definition.

Note that we modify the pattern list in a careful way: we create a `struct pattern_list` in-memory in `builtin/sparse-checkout.c`, then apply those patterns to the index before writing the patterns to the sparse-checkout file. The `update_sparsity()` method does the work to assign the `SKIP_WORKTREE` bit appropriately, but this doesn't work if the files that are within the new sparse-checkout cone are still hidden behind a sparse directory.

The new `expand_to_pattern_list()` method does the hard work of expanding the sparse directories that are now within the new patterns. This expands only as far as needed, possibly creating new sparse directory entries. This method does not contract existing files to sparse directories, and a big reason is the check for ignored files as we delete those directories. The `clean_tracked_sparse_directories()` method is called after `update_sparsity()`, but we need to read the `A/B/.gitignore` file (or lack thereof) before we can delete `A/B/`. If we convert to sparse too quickly, then we lose this information and cause a full expansion.

Most of the correctness is handled by existing tests in `t1092`, but I add checks for `ensure_not_expanded` in some hopefully interesting cases.

As for performance, `git sparse-checkout set` can be slow if it needs to move a lot of files. However, a no-op `git sparse-checkout set` (i.e. set the sparse-checkout cone to only include files at root, and do this on repeat) has these performance results on Linux in a monorepo with 2+ million files at `HEAD`:

```
Benchmark #1: baseline
  Time (mean ± σ):     10.465 s ±  0.018 s    [User: 9.885 s, System: 0.573 s]
  Range (min … max):   10.450 s … 10.497 s    5 runs

Benchmark #2: new code
  Time (mean ± σ):      68.9 ms ±   2.9 ms    [User: 45.8 ms, System: 17.1 ms]
  Range (min … max):    63.4 ms …  74.0 ms    41 runs

Summary
  'new code' ran
  151.89 ± 6.30 times faster than 'baseline'
```
2 parents 039e2a1 + c901f8f commit 6364d70

11 files changed

Lines changed: 255 additions & 64 deletions

builtin/sparse-checkout.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ static void clean_tracked_sparse_directories(struct repository *r)
131131
* sparse index will not delete directories that contain
132132
* conflicted entries or submodules.
133133
*/
134-
if (!r->index->sparse_index) {
134+
if (r->index->sparse_index == COMPLETELY_FULL) {
135135
/*
136136
* If something, such as a merge conflict or other concern,
137137
* prevents us from converting to a sparse index, then do
@@ -443,6 +443,9 @@ static int sparse_checkout_init(int argc, const char **argv)
443443
/* force an index rewrite */
444444
repo_read_index(the_repository);
445445
the_repository->index->updated_workdir = 1;
446+
447+
if (!init_opts.sparse_index)
448+
ensure_full_index(the_repository->index);
446449
}
447450

448451
core_apply_sparse_checkout = 1;
@@ -759,6 +762,9 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
759762

760763
git_config(git_default_config, NULL);
761764

765+
prepare_repo_settings(the_repository);
766+
the_repository->settings.command_requires_full_index = 0;
767+
762768
if (argc > 0) {
763769
if (!strcmp(argv[0], "list"))
764770
return sparse_checkout_list(argc, argv);

cache-tree.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,31 @@ struct cache_tree_sub *cache_tree_sub(struct cache_tree *it, const char *path)
101101
return find_subtree(it, path, pathlen, 1);
102102
}
103103

104+
struct cache_tree *cache_tree_find_path(struct cache_tree *it, const char *path)
105+
{
106+
const char *slash;
107+
int namelen;
108+
struct cache_tree_sub *down;
109+
110+
if (!it)
111+
return NULL;
112+
slash = strchrnul(path, '/');
113+
namelen = slash - path;
114+
it->entry_count = -1;
115+
if (!*slash) {
116+
int pos;
117+
pos = cache_tree_subtree_pos(it, path, namelen);
118+
if (0 <= pos) {
119+
return it->down[pos]->cache_tree;
120+
}
121+
return NULL;
122+
}
123+
down = find_subtree(it, path, namelen, 0);
124+
if (down)
125+
return cache_tree_find_path(down->cache_tree, slash + 1);
126+
return NULL;
127+
}
128+
104129
static int do_invalidate_path(struct cache_tree *it, const char *path)
105130
{
106131
/* a/b/c

cache-tree.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ struct cache_tree_sub *cache_tree_sub(struct cache_tree *, const char *);
2929

3030
int cache_tree_subtree_pos(struct cache_tree *it, const char *path, int pathlen);
3131

32+
struct cache_tree *cache_tree_find_path(struct cache_tree *it, const char *path);
33+
3234
void cache_tree_write(struct strbuf *, struct cache_tree *root);
3335
struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);
3436

cache.h

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,28 @@ struct untracked_cache;
311311
struct progress;
312312
struct pattern_list;
313313

314+
/*
 * Describes how far an index has been collapsed into sparse directory
 * entries. The zero value is COMPLETELY_FULL, so a zero-initialized
 * value of this type reads as "no sparse directories".
 */
enum sparse_index_mode {
	/* The index contains no sparse directory entries at all. */
	COMPLETELY_FULL = 0,

	/*
	 * The index has already been collapsed to sparse directory
	 * entries wherever possible.
	 */
	COLLAPSED = 1,

	/*
	 * Every sparse directory entry that exists lies outside the
	 * sparse-checkout boundary, but some file entries could still
	 * be collapsed into sparse directory entries.
	 */
	PARTIALLY_SPARSE = 2,
};
335+
314336
struct index_state {
315337
struct cache_entry **cache;
316338
unsigned int version;
@@ -324,14 +346,8 @@ struct index_state {
324346
drop_cache_tree : 1,
325347
updated_workdir : 1,
326348
updated_skipworktree : 1,
327-
fsmonitor_has_run_once : 1,
328-
329-
/*
330-
* sparse_index == 1 when sparse-directory
331-
* entries exist. Requires sparse-checkout
332-
* in cone mode.
333-
*/
334-
sparse_index : 1;
349+
fsmonitor_has_run_once : 1;
350+
enum sparse_index_mode sparse_index;
335351
struct hashmap name_hash;
336352
struct hashmap dir_hash;
337353
struct object_id oid;

dir.c

Lines changed: 44 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,46 +1446,16 @@ static struct path_pattern *last_matching_pattern_from_list(const char *pathname
14461446
return res;
14471447
}
14481448

1449-
/*
1450-
* Scan the list of patterns to determine if the ordered list
1451-
* of patterns matches on 'pathname'.
1452-
*
1453-
* Return 1 for a match, 0 for not matched and -1 for undecided.
1454-
*/
1455-
enum pattern_match_result path_matches_pattern_list(
1449+
enum pattern_match_result path_matches_cone_mode_pattern_list(
14561450
const char *pathname, int pathlen,
1457-
const char *basename, int *dtype,
1458-
struct pattern_list *pl,
1459-
struct index_state *istate)
1451+
struct pattern_list *pl)
14601452
{
1461-
struct path_pattern *pattern;
14621453
struct strbuf parent_pathname = STRBUF_INIT;
14631454
int result = NOT_MATCHED;
14641455
size_t slash_pos;
14651456

1466-
/*
1467-
* The virtual file system data is used to prevent git from traversing
1468-
* any part of the tree that is not in the virtual file system. Return
1469-
* 1 to exclude the entry if it is not found in the virtual file system,
1470-
* else fall through to the regular excludes logic as it may further exclude.
1471-
*/
1472-
if (*dtype == DT_UNKNOWN)
1473-
*dtype = resolve_dtype(DT_UNKNOWN, istate, pathname, pathlen);
1474-
if (is_excluded_from_virtualfilesystem(pathname, pathlen, *dtype) > 0)
1475-
return 1;
1476-
1477-
if (!pl->use_cone_patterns) {
1478-
pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
1479-
dtype, pl, istate);
1480-
if (pattern) {
1481-
if (pattern->flags & PATTERN_FLAG_NEGATIVE)
1482-
return NOT_MATCHED;
1483-
else
1484-
return MATCHED;
1485-
}
1486-
1487-
return UNDECIDED;
1488-
}
1457+
if (!pl->use_cone_patterns)
1458+
BUG("path_matches_cone_mode_pattern_list requires cone mode patterns");
14891459

14901460
if (pl->full_cone)
14911461
return MATCHED;
@@ -1538,6 +1508,46 @@ enum pattern_match_result path_matches_pattern_list(
15381508
return result;
15391509
}
15401510

1511+
/*
1512+
* Scan the list of patterns to determine if the ordered list
1513+
* of patterns matches on 'pathname'.
1514+
*
1515+
* Return 1 for a match, 0 for not matched and -1 for undecided.
1516+
*/
1517+
enum pattern_match_result path_matches_pattern_list(
1518+
const char *pathname, int pathlen,
1519+
const char *basename, int *dtype,
1520+
struct pattern_list *pl,
1521+
struct index_state *istate)
1522+
{
1523+
/*
1524+
* The virtual file system data is used to prevent git from traversing
1525+
* any part of the tree that is not in the virtual file system. Return
1526+
* 1 to exclude the entry if it is not found in the virtual file system,
1527+
* else fall through to the regular excludes logic as it may further exclude.
1528+
*/
1529+
if (*dtype == DT_UNKNOWN)
1530+
*dtype = resolve_dtype(DT_UNKNOWN, istate, pathname, pathlen);
1531+
if (is_excluded_from_virtualfilesystem(pathname, pathlen, *dtype) > 0)
1532+
return 1;
1533+
1534+
if (!pl->use_cone_patterns) {
1535+
struct path_pattern *pattern = last_matching_pattern_from_list(
1536+
pathname, pathlen, basename,
1537+
dtype, pl, istate);
1538+
if (pattern) {
1539+
if (pattern->flags & PATTERN_FLAG_NEGATIVE)
1540+
return NOT_MATCHED;
1541+
else
1542+
return MATCHED;
1543+
}
1544+
1545+
return UNDECIDED;
1546+
}
1547+
1548+
return path_matches_cone_mode_pattern_list(pathname, pathlen, pl);
1549+
}
1550+
15411551
int init_sparse_checkout_patterns(struct index_state *istate)
15421552
{
15431553
if (!core_apply_sparse_checkout)

dir.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,15 @@ enum pattern_match_result {
383383
MATCHED_RECURSIVE = 2,
384384
};
385385

386+
/*
387+
* Test if a given path is contained in the given pattern list.
388+
*
389+
* The given pattern list _must_ use cone mode patterns.
390+
*/
391+
enum pattern_match_result path_matches_cone_mode_pattern_list(
392+
const char *pathname, int pathlen,
393+
struct pattern_list *pl);
394+
386395
/*
387396
* Scan the list of patterns to determine if the ordered list
388397
* of patterns matches on 'pathname'.

read-cache.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ static const char *alternate_index_output;
108108
static void set_index_entry(struct index_state *istate, int nr, struct cache_entry *ce)
109109
{
110110
if (S_ISSPARSEDIR(ce->ce_mode))
111-
istate->sparse_index = 1;
111+
istate->sparse_index = COLLAPSED;
112112

113113
istate->cache[nr] = ce;
114114
add_name_hash(istate, ce);
@@ -1843,7 +1843,7 @@ static int read_index_extension(struct index_state *istate,
18431843
break;
18441844
case CACHE_EXT_SPARSE_DIRECTORIES:
18451845
/* no content, only an indicator */
1846-
istate->sparse_index = 1;
1846+
istate->sparse_index = COLLAPSED;
18471847
break;
18481848
default:
18491849
if (*ext < 'A' || 'Z' < *ext)
@@ -3142,7 +3142,7 @@ static int do_write_locked_index(struct index_state *istate, struct lock_file *l
31423142
unsigned flags)
31433143
{
31443144
int ret;
3145-
int was_full = !istate->sparse_index;
3145+
int was_full = istate->sparse_index == COMPLETELY_FULL;
31463146

31473147
ret = convert_to_sparse(istate, 0);
31483148

0 commit comments

Comments
 (0)