Refactor LFS GC functions

- Remove options that currently aren't set
on `GarbageCollectLFSMetaObjectsOptions` and
`IterateLFSMetaObjectsForRepoOptions`.
- Simplify `IterateRepositoryIDsWithLFSMetaObjects` and
`IterateLFSMetaObjectsForRepo`.
- `IterateLFSMetaObjectsForRepo` could previously get stuck in a
loop (the `gc-lfs` doctor check was able to reproduce this) because the
code expected the callback to update the records so they would no longer
match the SQL query, but that wasn't always the case. Instead, only
records with an `id` higher than the latest `id` from the previous
iteration are selected (see the sketch after this list).
- For the `gc-lfs` doctor check this happened because the
`UpdatedLessRecentlyThan` option was not set, so records that had just
been marked as active during the iteration were not filtered out.
- Add unit tests
- Most likely a regression from 2cc3a6381c.
- The bug with `gc-lfs` was found on Codeberg.
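
A minimal sketch (not the actual patch) of the keyset-pagination pattern the
fix applies; `lfsgc`, `Row`, `fetchBatch` and `process` are hypothetical
stand-ins for the real XORM query and callback:

package lfsgc // illustrative only

// Row stands in for a record with an auto-incrementing id column.
type Row struct{ ID int64 }

// iterateByID sketches the fix: each batch only selects rows whose id is
// strictly greater than the last id seen, so rows the callback deletes or
// updates can never be returned again and the loop always terminates.
func iterateByID(batchSize int, fetchBatch func(lastID int64, limit int) ([]Row, error), process func(Row) error) error {
	lastID := int64(0)
	for {
		// Equivalent SQL: ... WHERE id > lastID ORDER BY id ASC LIMIT batchSize
		batch, err := fetchBatch(lastID, batchSize)
		if err != nil {
			return err
		}
		if len(batch) == 0 {
			return nil // nothing left to process
		}
		for _, row := range batch {
			if err := process(row); err != nil {
				return err
			}
		}
		lastID = batch[len(batch)-1].ID // advance the cursor past this batch
	}
}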

(cherry picked from commit 7ffa7f5bcef12a0cc05aff660cb5217dc7e4c886)
Gusted 2024-04-05 11:48:03 +02:00 committed by GitHub
parent 7e5fed7e29
commit aa7346c007
5 changed files with 132 additions and 56 deletions

@@ -0,0 +1,7 @@
-
  id: 1000
  oid: 9d172e5c64b4f0024b9901ec6afe9ea052f3c9b6ff9f4b07956d8c48c86fca82
  size: 25
  repository_id: 1
  created_unix: 1712309123

@@ -337,32 +337,29 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) {
func IterateRepositoryIDsWithLFSMetaObjects(ctx context.Context, f func(ctx context.Context, repoID, count int64) error) error {
batchSize := setting.Database.IterateBufferSize
sess := db.GetEngine(ctx)
id := int64(0)
var start int
type RepositoryCount struct {
RepositoryID int64
Count int64
}
for {
counts := make([]*RepositoryCount, 0, batchSize)
sess.Select("repository_id, COUNT(id) AS count").
if err := sess.Select("repository_id, COUNT(id) AS count").
Table("lfs_meta_object").
Where("repository_id > ?", id).
GroupBy("repository_id").
OrderBy("repository_id ASC")
if err := sess.Limit(batchSize, 0).Find(&counts); err != nil {
OrderBy("repository_id ASC").Limit(batchSize, start).Find(&counts); err != nil {
return err
}
if len(counts) == 0 {
return nil
}
start += len(counts)
for _, count := range counts {
if err := f(ctx, count.RepositoryID, count.Count); err != nil {
return err
}
}
id = counts[len(counts)-1].RepositoryID
}
}
@@ -370,25 +367,17 @@ func IterateRepositoryIDsWithLFSMetaObjects(ctx context.Context, f func(ctx cont
type IterateLFSMetaObjectsForRepoOptions struct {
OlderThan timeutil.TimeStamp
UpdatedLessRecentlyThan timeutil.TimeStamp
OrderByUpdated bool
LoopFunctionAlwaysUpdates bool
}
// IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo
func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(context.Context, *LFSMetaObject, int64) error, opts *IterateLFSMetaObjectsForRepoOptions) error {
var start int
func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(context.Context, *LFSMetaObject) error, opts *IterateLFSMetaObjectsForRepoOptions) error {
batchSize := setting.Database.IterateBufferSize
engine := db.GetEngine(ctx)
type CountLFSMetaObject struct {
Count int64
LFSMetaObject `xorm:"extends"`
}
id := int64(0)
for {
beans := make([]*CountLFSMetaObject, 0, batchSize)
sess := engine.Table("lfs_meta_object").Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`").
beans := make([]*LFSMetaObject, 0, batchSize)
sess := engine.Table("lfs_meta_object").Select("`lfs_meta_object`.*").
Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid").
Where("`lfs_meta_object`.repository_id = ?", repoID)
if !opts.OlderThan.IsZero() {
@@ -397,25 +386,19 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont
if !opts.UpdatedLessRecentlyThan.IsZero() {
sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan)
}
sess.GroupBy("`lfs_meta_object`.id")
if opts.OrderByUpdated {
sess.OrderBy("`lfs_meta_object`.updated_unix ASC")
} else {
sess.And("`lfs_meta_object`.id > ?", id)
sess.OrderBy("`lfs_meta_object`.id ASC")
}
if err := sess.Limit(batchSize, start).Find(&beans); err != nil {
sess.GroupBy("`lfs_meta_object`.id").
And("`lfs_meta_object`.id > ?", id).
OrderBy("`lfs_meta_object`.id ASC")
if err := sess.Limit(batchSize, 0).Find(&beans); err != nil {
return err
}
if len(beans) == 0 {
return nil
}
if !opts.LoopFunctionAlwaysUpdates {
start += len(beans)
}
for _, bean := range beans {
if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil {
if err := f(ctx, bean); err != nil {
return err
}
}

models/git/lfs_test.go (new file, 101 lines)

@@ -0,0 +1,101 @@
// Copyright 2024 The Forgejo Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package git

import (
	"context"
	"path/filepath"
	"testing"

	"code.gitea.io/gitea/models/db"
	"code.gitea.io/gitea/models/unittest"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/test"

	"github.com/stretchr/testify/assert"
)

func TestIterateRepositoryIDsWithLFSMetaObjects(t *testing.T) {
	defer unittest.OverrideFixtures(
		unittest.FixturesOptions{
			Dir:  filepath.Join(setting.AppWorkPath, "models/fixtures/"),
			Base: setting.AppWorkPath,
			Dirs: []string{"models/git/TestIterateRepositoryIDsWithLFSMetaObjects/"},
		},
	)()
	assert.NoError(t, unittest.PrepareTestDatabase())

	type repocount struct {
		repoid int64
		count  int64
	}
	expected := []repocount{{1, 1}, {54, 4}}

	t.Run("Normal batch size", func(t *testing.T) {
		defer test.MockVariableValue(&setting.Database.IterateBufferSize, 20)()
		cases := []repocount{}

		err := IterateRepositoryIDsWithLFSMetaObjects(db.DefaultContext, func(ctx context.Context, repoID, count int64) error {
			cases = append(cases, repocount{repoID, count})
			return nil
		})
		assert.NoError(t, err)
		assert.EqualValues(t, expected, cases)
	})

	t.Run("Low batch size", func(t *testing.T) {
		defer test.MockVariableValue(&setting.Database.IterateBufferSize, 1)()
		cases := []repocount{}

		err := IterateRepositoryIDsWithLFSMetaObjects(db.DefaultContext, func(ctx context.Context, repoID, count int64) error {
			cases = append(cases, repocount{repoID, count})
			return nil
		})
		assert.NoError(t, err)
		assert.EqualValues(t, expected, cases)
	})
}

func TestIterateLFSMetaObjectsForRepo(t *testing.T) {
	assert.NoError(t, unittest.PrepareTestDatabase())

	expectedIDs := []int64{1, 2, 3, 4}

	t.Run("Normal batch size", func(t *testing.T) {
		defer test.MockVariableValue(&setting.Database.IterateBufferSize, 20)()
		actualIDs := []int64{}

		err := IterateLFSMetaObjectsForRepo(db.DefaultContext, 54, func(ctx context.Context, lo *LFSMetaObject) error {
			actualIDs = append(actualIDs, lo.ID)
			return nil
		}, &IterateLFSMetaObjectsForRepoOptions{})
		assert.NoError(t, err)
		assert.EqualValues(t, expectedIDs, actualIDs)
	})

	t.Run("Low batch size", func(t *testing.T) {
		defer test.MockVariableValue(&setting.Database.IterateBufferSize, 1)()
		actualIDs := []int64{}

		err := IterateLFSMetaObjectsForRepo(db.DefaultContext, 54, func(ctx context.Context, lo *LFSMetaObject) error {
			actualIDs = append(actualIDs, lo.ID)
			return nil
		}, &IterateLFSMetaObjectsForRepoOptions{})
		assert.NoError(t, err)
		assert.EqualValues(t, expectedIDs, actualIDs)

		t.Run("Batch handles updates", func(t *testing.T) {
			actualIDs := []int64{}

			err := IterateLFSMetaObjectsForRepo(db.DefaultContext, 54, func(ctx context.Context, lo *LFSMetaObject) error {
				actualIDs = append(actualIDs, lo.ID)
				_, err := db.DeleteByID[LFSMetaObject](ctx, lo.ID)
				assert.NoError(t, err)
				return nil
			}, &IterateLFSMetaObjectsForRepoOptions{})
			assert.NoError(t, err)
			assert.EqualValues(t, expectedIDs, actualIDs)
		})
	})
}

@@ -44,6 +44,7 @@ func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool
OlderThan: time.Now().Add(-24 * time.Hour * 7),
// We don't set the UpdatedLessRecentlyThan because we want to do a full GC
}); err != nil {
logger.Error("Couldn't garbage collect LFS objects: %v", err)
return err
}

@@ -5,7 +5,6 @@ package repository
import (
"context"
"errors"
"fmt"
"time"
@@ -25,8 +24,6 @@ type GarbageCollectLFSMetaObjectsOptions struct {
AutoFix bool
OlderThan time.Time
UpdatedLessRecentlyThan time.Time
NumberToCheckPerRepo int64
ProportionToCheckPerRepo float64
}
// GarbageCollectLFSMetaObjects garbage collects LFS objects for all repositories
@@ -49,9 +46,6 @@ func GarbageCollectLFSMetaObjects(ctx context.Context, opts GarbageCollectLFSMet
return err
}
if newMinimum := int64(float64(count) * opts.ProportionToCheckPerRepo); newMinimum > opts.NumberToCheckPerRepo && opts.NumberToCheckPerRepo != 0 {
opts.NumberToCheckPerRepo = newMinimum
}
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts)
})
}
@@ -78,13 +72,9 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R
defer gitRepo.Close()
store := lfs.NewContentStore()
errStop := errors.New("STOPERR")
objectFormat := git.ObjectFormatFromName(repo.ObjectFormatName)
err = git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
if opts.NumberToCheckPerRepo > 0 && total > opts.NumberToCheckPerRepo {
return errStop
}
err = git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject) error {
total++
pointerSha := git.ComputeBlobHash(objectFormat, []byte(metaObject.Pointer.StringContent()))
@@ -125,14 +115,8 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R
// unassociated LFS object is genuinely unassociated.
OlderThan: timeutil.TimeStamp(opts.OlderThan.Unix()),
UpdatedLessRecentlyThan: timeutil.TimeStamp(opts.UpdatedLessRecentlyThan.Unix()),
OrderByUpdated: true,
LoopFunctionAlwaysUpdates: true,
})
if err == errStop {
opts.LogDetail("Processing stopped at %d total LFSMetaObjects in %-v", total, repo)
return nil
} else if err != nil {
if err != nil {
return err
}
return nil
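
For orientation, a minimal usage sketch of the slimmed-down entry point after
this refactor. The import path and the wrapper name runFullLFSGC are
assumptions, not part of the change; only the option fields visible in this
diff are set, and any remaining fields (such as logging hooks) are assumed to
have usable zero values:

package lfsgc // illustrative only

import (
	"context"
	"time"

	repo_service "code.gitea.io/gitea/services/repository" // assumed location of the GC code
)

// runFullLFSGC drives the refactored garbage collection the same way the
// gc-lfs doctor check does: OlderThan is set, while UpdatedLessRecentlyThan
// is left at its zero value so a full GC is performed.
func runFullLFSGC(ctx context.Context) error {
	return repo_service.GarbageCollectLFSMetaObjects(ctx, repo_service.GarbageCollectLFSMetaObjectsOptions{
		AutoFix:   true,
		OlderThan: time.Now().Add(-24 * time.Hour * 7),
	})
}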