Skip to content

Commit ce1e4a1

Browse files
derrickstolee authored and gitster committed
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by their modified time. Second, walk those pack-files from oldest to newest, compute their expected size, and add the packs to a list if they are smaller than the given batch-size. Stop when the total expected size is at least the batch size. If the batch size is zero, select all packs in the multi-pack-index. Finally, collect the objects from the multi-pack-index that are in the selected packs and send them to 'git pack-objects'. Write a new multi-pack-index that includes the new pack. Using a batch size of zero is very similar to a standard 'git repack' command, except that we do not delete the old packs and instead rely on the new multi-pack-index to prevent new processes from reading the old packs. This does not disrupt other Git processes that are currently reading the old packs based on the old multi-pack-index. While first designing a 'git multi-pack-index repack' operation, I started by collecting the batches based on the actual size of the objects instead of the size of the pack-files. This allows repacking a large pack-file that has very few referenced objects. However, this came at a significant cost of parsing pack-files instead of simply reading the multi-pack-index and getting the file information for the pack-files. The "expected size" version provides similar behavior, but could skip a pack-file if the average object size is much larger than the actual size of the referenced objects, or can create a large pack if the actual size of the referenced objects is larger than the expected size. Signed-off-by: Derrick Stolee <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 2af890b commit ce1e4a1

File tree

2 files changed

+178
-1
lines changed

2 files changed

+178
-1
lines changed

midx.c

Lines changed: 150 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "midx.h"
1010
#include "progress.h"
1111
#include "trace2.h"
12+
#include "run-command.h"
1213

1314
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
1415
#define MIDX_VERSION 1
@@ -1227,7 +1228,155 @@ int expire_midx_packs(struct repository *r, const char *object_dir)
12271228
return result;
12281229
}
12291230

1230-
int midx_repack(struct repository *r, const char *object_dir, size_t batch_size)
1231+
struct repack_info {
1232+
timestamp_t mtime;
1233+
uint32_t referenced_objects;
1234+
uint32_t pack_int_id;
1235+
};
1236+
1237+
static int compare_by_mtime(const void *a_, const void *b_)
12311238
{
1239+
const struct repack_info *a, *b;
1240+
1241+
a = (const struct repack_info *)a_;
1242+
b = (const struct repack_info *)b_;
1243+
1244+
if (a->mtime < b->mtime)
1245+
return -1;
1246+
if (a->mtime > b->mtime)
1247+
return 1;
1248+
return 0;
1249+
}
1250+
1251+
static int fill_included_packs_all(struct multi_pack_index *m,
1252+
unsigned char *include_pack)
1253+
{
1254+
uint32_t i;
1255+
1256+
for (i = 0; i < m->num_packs; i++)
1257+
include_pack[i] = 1;
1258+
1259+
return m->num_packs < 2;
1260+
}
1261+
1262+
static int fill_included_packs_batch(struct repository *r,
1263+
struct multi_pack_index *m,
1264+
unsigned char *include_pack,
1265+
size_t batch_size)
1266+
{
1267+
uint32_t i, packs_to_repack;
1268+
size_t total_size;
1269+
struct repack_info *pack_info = xcalloc(m->num_packs, sizeof(struct repack_info));
1270+
1271+
for (i = 0; i < m->num_packs; i++) {
1272+
pack_info[i].pack_int_id = i;
1273+
1274+
if (prepare_midx_pack(r, m, i))
1275+
continue;
1276+
1277+
pack_info[i].mtime = m->packs[i]->mtime;
1278+
}
1279+
1280+
for (i = 0; batch_size && i < m->num_objects; i++) {
1281+
uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
1282+
pack_info[pack_int_id].referenced_objects++;
1283+
}
1284+
1285+
QSORT(pack_info, m->num_packs, compare_by_mtime);
1286+
1287+
total_size = 0;
1288+
packs_to_repack = 0;
1289+
for (i = 0; total_size < batch_size && i < m->num_packs; i++) {
1290+
int pack_int_id = pack_info[i].pack_int_id;
1291+
struct packed_git *p = m->packs[pack_int_id];
1292+
size_t expected_size;
1293+
1294+
if (!p)
1295+
continue;
1296+
if (open_pack_index(p) || !p->num_objects)
1297+
continue;
1298+
1299+
expected_size = (size_t)(p->pack_size
1300+
* pack_info[i].referenced_objects);
1301+
expected_size /= p->num_objects;
1302+
1303+
if (expected_size >= batch_size)
1304+
continue;
1305+
1306+
packs_to_repack++;
1307+
total_size += expected_size;
1308+
include_pack[pack_int_id] = 1;
1309+
}
1310+
1311+
free(pack_info);
1312+
1313+
if (total_size < batch_size || packs_to_repack < 2)
1314+
return 1;
1315+
12321316
return 0;
12331317
}
1318+
1319+
int midx_repack(struct repository *r, const char *object_dir, size_t batch_size)
1320+
{
1321+
int result = 0;
1322+
uint32_t i;
1323+
unsigned char *include_pack;
1324+
struct child_process cmd = CHILD_PROCESS_INIT;
1325+
struct strbuf base_name = STRBUF_INIT;
1326+
struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
1327+
1328+
if (!m)
1329+
return 0;
1330+
1331+
include_pack = xcalloc(m->num_packs, sizeof(unsigned char));
1332+
1333+
if (batch_size) {
1334+
if (fill_included_packs_batch(r, m, include_pack, batch_size))
1335+
goto cleanup;
1336+
} else if (fill_included_packs_all(m, include_pack))
1337+
goto cleanup;
1338+
1339+
argv_array_push(&cmd.args, "pack-objects");
1340+
1341+
strbuf_addstr(&base_name, object_dir);
1342+
strbuf_addstr(&base_name, "/pack/pack");
1343+
argv_array_push(&cmd.args, base_name.buf);
1344+
strbuf_release(&base_name);
1345+
1346+
cmd.git_cmd = 1;
1347+
cmd.in = cmd.out = -1;
1348+
1349+
if (start_command(&cmd)) {
1350+
error(_("could not start pack-objects"));
1351+
result = 1;
1352+
goto cleanup;
1353+
}
1354+
1355+
for (i = 0; i < m->num_objects; i++) {
1356+
struct object_id oid;
1357+
uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
1358+
1359+
if (!include_pack[pack_int_id])
1360+
continue;
1361+
1362+
nth_midxed_object_oid(&oid, m, i);
1363+
xwrite(cmd.in, oid_to_hex(&oid), the_hash_algo->hexsz);
1364+
xwrite(cmd.in, "\n", 1);
1365+
}
1366+
close(cmd.in);
1367+
1368+
if (finish_command(&cmd)) {
1369+
error(_("could not finish pack-objects"));
1370+
result = 1;
1371+
goto cleanup;
1372+
}
1373+
1374+
result = write_midx_internal(object_dir, m, NULL);
1375+
m = NULL;
1376+
1377+
cleanup:
1378+
if (m)
1379+
close_midx(m);
1380+
free(include_pack);
1381+
return result;
1382+
}

t/t5319-multi-pack-index.sh

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,4 +450,32 @@ test_expect_success 'repack with minimum size does not alter existing packs' '
450450
)
451451
'
452452

453+
# Inside the "dup" repository: start from 5 pack indexes, pick a batch
# size one byte larger than the third-smallest pack-file (sizes are
# taken from column 5 of "ls -l"), run "git multi-pack-index repack"
# with that batch size, then check that one more pack index exists on
# disk (6) and that the multi-pack-index lists 6 idx entries as well.
test_expect_success 'repack creates a new pack' '
454+
(
455+
cd dup &&
456+
ls .git/objects/pack/*idx >idx-list &&
457+
test_line_count = 5 idx-list &&
458+
THIRD_SMALLEST_SIZE=$(ls -l .git/objects/pack/*pack | awk "{print \$5;}" | sort -n | head -n 3 | tail -n 1) &&
459+
BATCH_SIZE=$(($THIRD_SMALLEST_SIZE + 1)) &&
460+
git multi-pack-index repack --batch-size=$BATCH_SIZE &&
461+
ls .git/objects/pack/*idx >idx-list &&
462+
test_line_count = 6 idx-list &&
463+
test-tool read-midx .git/objects | grep idx >midx-list &&
464+
test_line_count = 6 midx-list
465+
)
466+
'
467+
468+
# Inside the "dup" repository: record the first four pack-files in
# "ls -S" (size) order as the expected survivors, run
# "git multi-pack-index expire", then check that exactly those
# pack-files remain on disk and that the multi-pack-index lists 4 idx
# entries.
test_expect_success 'expire removes repacked packs' '
469+
(
470+
cd dup &&
471+
ls -al .git/objects/pack/*pack &&
472+
ls -S .git/objects/pack/*pack | head -n 4 >expect &&
473+
git multi-pack-index expire &&
474+
ls -S .git/objects/pack/*pack >actual &&
475+
test_cmp expect actual &&
476+
test-tool read-midx .git/objects | grep idx >midx-list &&
477+
test_line_count = 4 midx-list
478+
)
479+
'
480+
453481
test_done

0 commit comments

Comments
 (0)