Skip to content

Commit 645ba23

Browse files
committed
Add cron method to gc LFS MetaObjects
This PR adds a task to the cron service to allow garbage collection of LFS meta objects. As repositories may have a large number of LFSMetaObjects, an updated column is added to this table and it is used to perform a generational GC to attempt to reduce the amount of work. (There may need to be a bit more work here but this is probably enough for the moment.) Fix #7045 Signed-off-by: Andrew Thornton <[email protected]>
1 parent a357143 commit 645ba23

File tree

8 files changed

+187
-22
lines changed

8 files changed

+187
-22
lines changed

custom/conf/app.example.ini

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2172,6 +2172,23 @@ ROUTER = console
21722172
;SCHEDULE = @every 168h
21732173
;OLDER_THAN = 8760h
21742174

2175+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2176+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2177+
;; Garbage collect LFS pointers in repositories
2178+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2179+
;[cron.gc_lfs]
2180+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2181+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2182+
;ENABLED = false
2183+
;; Whether to always run at least once at start up time (if ENABLED) (default false)
2184+
;RUN_AT_START = false
2185+
;; Interval as a duration between each gc run (default every 24h)
2186+
;SCHEDULE = @every 24h
2187+
;; Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
2188+
;OLDER_THAN = 168h
2189+
;; Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
2190+
;LAST_UPDATED_MORE_THAN_AGO = 72h
2191+
21752192
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
21762193
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
21772194
;; Git Operation timeout in seconds

docs/content/doc/advanced/config-cheat-sheet.en-us.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,6 +1025,14 @@ Default templates for project boards:
10251025
- `SCHEDULE`: **@every 168h**: Cron syntax to set how often to check.
10261026
- `OLDER_THAN`: **8760h**: any system notice older than this duration will be deleted from the database.
10271027

1028+
#### Cron - Garbage collect LFS pointers in repositories (`cron.gc_lfs`)
1029+
1030+
- `ENABLED`: **false**: Enable service.
1031+
- `RUN_AT_START`: **false**: Run tasks at start up time (if ENABLED).
1032+
- `SCHEDULE`: **@every 24h**: Cron syntax to set how often to check.
1033+
- `OLDER_THAN`: **168h**: Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
1034+
- `LAST_UPDATED_MORE_THAN_AGO`: **72h**: Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
1035+
10281036
## Git (`git`)
10291037

10301038
- `PATH`: **""**: The path of Git executable. If empty, Gitea searches through the PATH environment.

models/git/lfs.go

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ type LFSMetaObject struct {
115115
RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
116116
Existing bool `xorm:"-"`
117117
CreatedUnix timeutil.TimeStamp `xorm:"created"`
118+
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
118119
}
119120

120121
func init() {
@@ -335,7 +336,8 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) {
335336
}
336337

337338
type IterateLFSMetaObjectsForRepoOptions struct {
338-
OlderThan time.Time
339+
OlderThan time.Time
340+
UpdatedLessRecentlyThan time.Time
339341
}
340342

341343
// IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo
@@ -348,6 +350,8 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont
348350
LFSMetaObject
349351
}
350352

353+
id := int64(0)
354+
351355
for {
352356
beans := make([]*CountLFSMetaObject, 0, batchSize)
353357
// SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? GROUP BY lfs_meta_object.id
@@ -357,7 +361,12 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont
357361
if !opts.OlderThan.IsZero() {
358362
sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan)
359363
}
364+
if !opts.UpdatedLessRecentlyThan.IsZero() {
365+
sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan)
366+
}
367+
sess.And("`lfs_meta_object`.id > ?", id)
360368
sess.GroupBy("`lfs_meta_object`.id")
369+
sess.OrderBy("`lfs_meta_object`.id ASC")
361370
if err := sess.Limit(batchSize, start).Find(&beans); err != nil {
362371
return err
363372
}
@@ -371,5 +380,18 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont
371380
return err
372381
}
373382
}
383+
id = beans[len(beans)-1].ID
384+
}
385+
}
386+
387+
// MarkLFSMetaObject updates the updated time for the provided LFSMetaObject
388+
func MarkLFSMetaObject(ctx context.Context, id int64) error {
389+
obj := &LFSMetaObject{
390+
UpdatedUnix: timeutil.TimeStampNow(),
391+
}
392+
count, err := db.GetEngine(ctx).ID(id).Update(obj)
393+
if count != 1 {
394+
log.Error("Unexpectedly updated %d LFSMetaObjects with ID: %d", count, id)
374395
}
396+
return err
375397
}

models/migrations/migrations.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,9 @@ var migrations = []Migration{
432432
NewMigration("Update counts of all open milestones", v1_18.UpdateOpenMilestoneCounts),
433433
// v230 -> v231
434434
NewMigration("Add ConfidentialClient column (default true) to OAuth2Application table", v1_18.AddConfidentialClientColumnToOAuth2ApplicationTable),
435+
436+
// Gitea 1.18.0 ends at v231
437+
435438
// v231 -> v232
436439
NewMigration("Add index for hook_task", v1_19.AddIndexForHookTask),
437440
// v232 -> v233
@@ -446,6 +449,8 @@ var migrations = []Migration{
446449
NewMigration("Create secrets table", v1_19.CreateSecretsTable),
447450
// v237 -> v238
448451
NewMigration("Drop ForeignReference table", v1_19.DropForeignReferenceTable),
452+
// v238 -> v239
453+
NewMigration("Add updated unix to LFSMetaObject", v1_19.AddUpdatedUnixToLFSMetaObject),
449454
}
450455

451456
// GetCurrentDBVersion returns the current db version

models/migrations/v1_19/v238.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
package v1_19 //nolint
5+
6+
import (
7+
"code.gitea.io/gitea/modules/timeutil"
8+
"xorm.io/xorm"
9+
)
10+
11+
// AddUpdatedUnixToLFSMetaObject adds an updated column to the LFSMetaObject to allow for garbage collection
12+
func AddUpdatedUnixToLFSMetaObject(x *xorm.Engine) error {
13+
// Drop the table introduced in `v211`, it's considered badly designed and doesn't look like to be used.
14+
// See: https://github.com/go-gitea/gitea/issues/21086#issuecomment-1318217453
15+
// LFSMetaObject stores metadata for LFS tracked files.
16+
type LFSMetaObject struct {
17+
ID int64 `xorm:"pk autoincr"`
18+
Oid string `json:"oid" xorm:"UNIQUE(s) INDEX NOT NULL"`
19+
Size int64 `json:"size" xorm:"NOT NULL"`
20+
RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
21+
CreatedUnix timeutil.TimeStamp `xorm:"created"`
22+
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
23+
}
24+
25+
return x.Sync(new(LFSMetaObject))
26+
}

modules/doctor/lfs.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package doctor
66
import (
77
"context"
88
"fmt"
9+
"time"
910

1011
"code.gitea.io/gitea/modules/log"
1112
"code.gitea.io/gitea/modules/setting"
@@ -29,7 +30,20 @@ func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool
2930
return fmt.Errorf("LFS support is disabled")
3031
}
3132

32-
if err := repository.GarbageCollectLFSMetaObjects(ctx, logger, autofix); err != nil {
33+
if err := repository.GarbageCollectLFSMetaObjects(ctx, repository.GarbageCollectLFSMetaObjectsOptions{
34+
Logger: logger,
35+
AutoFix: autofix,
36+
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
37+
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
38+
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
39+
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
40+
// objects.
41+
//
42+
// It is likely that a week is potentially excessive but it should definitely be enough that any
43+
// unassociated LFS object is genuinely unassociated.
44+
OlderThan: 24 * time.Hour * 7,
45+
// We don't set the UpdatedLessRecentlyThan because we want to do a full GC
46+
}); err != nil {
3347
return err
3448
}
3549

services/cron/tasks_extended.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,44 @@ func registerDeleteOldSystemNotices() {
175175
})
176176
}
177177

178+
func registerGCLFS() {
179+
if !setting.LFS.StartServer {
180+
return
181+
}
182+
type GCLFSConfig struct {
183+
OlderThanConfig
184+
LastUpdatedMoreThanAgo time.Duration
185+
}
186+
187+
RegisterTaskFatal("delete_old_system_notices", &GCLFSConfig{
188+
OlderThanConfig: OlderThanConfig{
189+
BaseConfig: BaseConfig{
190+
Enabled: false,
191+
RunAtStart: false,
192+
Schedule: "@every 24h",
193+
},
194+
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
195+
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
196+
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
197+
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
198+
// objects.
199+
//
200+
// It is likely that a week is potentially excessive but it should definitely be enough that any
201+
// unassociated LFS object is genuinely unassociated.
202+
OlderThan: 24 * time.Hour * 7,
203+
},
204+
// Only GC things that haven't been looked at in the past 3 days
205+
LastUpdatedMoreThanAgo: 24 * time.Hour * 3,
206+
}, func(ctx context.Context, _ *user_model.User, config Config) error {
207+
gcLFSConfig := config.(*GCLFSConfig)
208+
return repo_service.GarbageCollectLFSMetaObjects(ctx, repo_service.GarbageCollectLFSMetaObjectsOptions{
209+
AutoFix: true,
210+
OlderThan: gcLFSConfig.OlderThan,
211+
LastUpdatedMoreThanAgo: gcLFSConfig.LastUpdatedMoreThanAgo,
212+
})
213+
})
214+
}
215+
178216
func initExtendedTasks() {
179217
registerDeleteInactiveUsers()
180218
registerDeleteRepositoryArchives()
@@ -188,4 +226,5 @@ func initExtendedTasks() {
188226
registerDeleteOldActions()
189227
registerUpdateGiteaChecker()
190228
registerDeleteOldSystemNotices()
229+
registerGCLFS()
191230
}

services/repository/lfs.go

Lines changed: 54 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,40 +14,63 @@ import (
1414
"code.gitea.io/gitea/modules/git"
1515
"code.gitea.io/gitea/modules/lfs"
1616
"code.gitea.io/gitea/modules/log"
17+
"code.gitea.io/gitea/modules/setting"
1718

1819
"xorm.io/builder"
1920
)
2021

21-
func GarbageCollectLFSMetaObjects(ctx context.Context, logger log.Logger, autofix bool) error {
22+
// GarbageCollectLFSMetaObjectsOptions provides options for GarbageCollectLFSMetaObjects function
23+
type GarbageCollectLFSMetaObjectsOptions struct {
24+
RepoID int64
25+
Logger log.Logger
26+
AutoFix bool
27+
OlderThan time.Duration
28+
LastUpdatedMoreThanAgo time.Duration
29+
}
30+
31+
// GarbageCollectLFSMetaObjects garbage collects LFS objects for all repositories
32+
func GarbageCollectLFSMetaObjects(ctx context.Context, opts GarbageCollectLFSMetaObjectsOptions) error {
2233
log.Trace("Doing: GarbageCollectLFSMetaObjects")
34+
defer log.Trace("Finished: GarbageCollectLFSMetaObjects")
35+
36+
if !setting.LFS.StartServer {
37+
if opts.Logger != nil {
38+
opts.Logger.Info("LFS support is disabled")
39+
}
40+
return nil
41+
}
42+
43+
if opts.RepoID == 0 {
44+
repo, err := repo_model.GetRepositoryByID(ctx, opts.RepoID)
45+
if err != nil {
46+
return err
47+
}
48+
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts)
49+
}
2350

24-
if err := db.Iterate(
51+
return db.Iterate(
2552
ctx,
2653
builder.And(builder.Gt{"id": 0}),
2754
func(ctx context.Context, repo *repo_model.Repository) error {
28-
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, logger, autofix)
55+
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts)
2956
},
30-
); err != nil {
31-
return err
32-
}
33-
34-
log.Trace("Finished: GarbageCollectLFSMetaObjects")
35-
return nil
57+
)
3658
}
3759

38-
func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, logger log.Logger, autofix bool) error {
39-
if logger != nil {
40-
logger.Info("Checking %-v", repo)
60+
// GarbageCollectLFSMetaObjectsForRepo garbage collects LFS objects for a specific repository
61+
func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, opts GarbageCollectLFSMetaObjectsOptions) error {
62+
if opts.Logger != nil {
63+
opts.Logger.Info("Checking %-v", repo)
4164
}
4265
total, orphaned, collected, deleted := 0, 0, 0, 0
43-
if logger != nil {
66+
if opts.Logger != nil {
4467
defer func() {
4568
if orphaned == 0 {
46-
logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
47-
} else if !autofix {
48-
logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
69+
opts.Logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
70+
} else if !opts.AutoFix {
71+
opts.Logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
4972
} else {
50-
logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
73+
opts.Logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
5174
}
5275
}()
5376
}
@@ -61,16 +84,26 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R
6184

6285
store := lfs.NewContentStore()
6386

87+
var olderThan time.Time
88+
var updatedLessRecentlyThan time.Time
89+
90+
if opts.OlderThan > 0 {
91+
olderThan = time.Now().Add(opts.OlderThan)
92+
}
93+
if opts.LastUpdatedMoreThanAgo > 0 {
94+
updatedLessRecentlyThan = time.Now().Add(opts.LastUpdatedMoreThanAgo)
95+
}
96+
6497
return git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
6598
total++
6699
pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent()))
67100

68101
if gitRepo.IsObjectExist(pointerSha.String()) {
69-
return nil
102+
return git_model.MarkLFSMetaObject(ctx, metaObject.ID)
70103
}
71104
orphaned++
72105

73-
if !autofix {
106+
if !opts.AutoFix {
74107
return nil
75108
}
76109
// Non-existent pointer file
@@ -100,6 +133,7 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R
100133
//
101134
// It is likely that a week is potentially excessive but it should definitely be enough that any
102135
// unassociated LFS object is genuinely unassociated.
103-
OlderThan: time.Now().Add(-24 * 7 * time.Hour),
136+
OlderThan: olderThan,
137+
UpdatedLessRecentlyThan: updatedLessRecentlyThan,
104138
})
105139
}

0 commit comments

Comments
 (0)