Skip to content

Automated Test: db-cleanup-optimized #342

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 91 additions & 38 deletions pkg/services/annotations/annotationsimpl/cleanup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package annotationsimpl

import (
"context"
"errors"
"testing"
"time"

Expand All @@ -14,31 +15,30 @@ import (
"github.com/grafana/grafana/pkg/setting"
)

func TestAnnotationCleanUp(t *testing.T) {
fakeSQL := db.InitTestDB(t)

t.Cleanup(func() {
err := fakeSQL.WithDbSession(context.Background(), func(session *db.Session) error {
_, err := session.Exec("DELETE FROM annotation")
return err
})
assert.NoError(t, err)
})
func TestIntegrationAnnotationCleanUp(t *testing.T) {
if testing.Short() {
t.Skip("Skipping integration test")
}

createTestAnnotations(t, fakeSQL, 21, 6)
assertAnnotationCount(t, fakeSQL, "", 21)
assertAnnotationTagCount(t, fakeSQL, 42)
fakeSQL := db.InitTestDB(t)

tests := []struct {
name string
cfg *setting.Cfg
alertAnnotationCount int64
dashboardAnnotationCount int64
APIAnnotationCount int64
affectedAnnotations int64
name string
createAnnotationsNum int
createOldAnnotationsNum int

cfg *setting.Cfg
alertAnnotationCount int64
annotationCleanupJobBatchSize int
dashboardAnnotationCount int64
APIAnnotationCount int64
affectedAnnotations int64
}{
{
name: "default settings should not delete any annotations",
name: "default settings should not delete any annotations",
createAnnotationsNum: 21,
createOldAnnotationsNum: 6,
annotationCleanupJobBatchSize: 1,
cfg: &setting.Cfg{
AlertingAnnotationCleanupSetting: settingsFn(0, 0),
DashboardAnnotationCleanupSettings: settingsFn(0, 0),
Expand All @@ -50,7 +50,10 @@ func TestAnnotationCleanUp(t *testing.T) {
affectedAnnotations: 0,
},
{
name: "should remove annotations created before cut off point",
name: "should remove annotations created before cut off point",
createAnnotationsNum: 21,
createOldAnnotationsNum: 6,
annotationCleanupJobBatchSize: 1,
cfg: &setting.Cfg{
AlertingAnnotationCleanupSetting: settingsFn(time.Hour*48, 0),
DashboardAnnotationCleanupSettings: settingsFn(time.Hour*48, 0),
Expand All @@ -62,7 +65,10 @@ func TestAnnotationCleanUp(t *testing.T) {
affectedAnnotations: 6,
},
{
name: "should only keep three annotations",
name: "should only keep three annotations",
createAnnotationsNum: 15,
createOldAnnotationsNum: 6,
annotationCleanupJobBatchSize: 1,
cfg: &setting.Cfg{
AlertingAnnotationCleanupSetting: settingsFn(0, 3),
DashboardAnnotationCleanupSettings: settingsFn(0, 3),
Expand All @@ -74,7 +80,10 @@ func TestAnnotationCleanUp(t *testing.T) {
affectedAnnotations: 6,
},
{
name: "running the max count delete again should not remove any annotations",
name: "running the max count delete again should not remove any annotations",
createAnnotationsNum: 9,
createOldAnnotationsNum: 6,
annotationCleanupJobBatchSize: 1,
cfg: &setting.Cfg{
AlertingAnnotationCleanupSetting: settingsFn(0, 3),
DashboardAnnotationCleanupSettings: settingsFn(0, 3),
Expand All @@ -85,12 +94,40 @@ func TestAnnotationCleanUp(t *testing.T) {
APIAnnotationCount: 3,
affectedAnnotations: 0,
},
{
name: "should not fail if batch size is larger than SQLITE_MAX_VARIABLE_NUMBER for SQLite >= 3.32.0",
createAnnotationsNum: 40003,
createOldAnnotationsNum: 0,
annotationCleanupJobBatchSize: 32767,
cfg: &setting.Cfg{
AlertingAnnotationCleanupSetting: settingsFn(0, 1),
DashboardAnnotationCleanupSettings: settingsFn(0, 1),
APIAnnotationCleanupSettings: settingsFn(0, 1),
},
alertAnnotationCount: 1,
dashboardAnnotationCount: 1,
APIAnnotationCount: 1,
affectedAnnotations: 40000,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
createTestAnnotations(t, fakeSQL, test.createAnnotationsNum, test.createOldAnnotationsNum)
assertAnnotationCount(t, fakeSQL, "", int64(test.createAnnotationsNum))
assertAnnotationTagCount(t, fakeSQL, 2*int64(test.createAnnotationsNum))

t.Cleanup(func() {
err := fakeSQL.WithDbSession(context.Background(), func(session *db.Session) error {
_, deleteAnnotationErr := session.Exec("DELETE FROM annotation")
_, deleteAnnotationTagErr := session.Exec("DELETE FROM annotation_tag")
return errors.Join(deleteAnnotationErr, deleteAnnotationTagErr)
})
assert.NoError(t, err)
})

cfg := setting.NewCfg()
cfg.AnnotationCleanupJobBatchSize = 1
cfg.AnnotationCleanupJobBatchSize = int64(test.annotationCleanupJobBatchSize)
cleaner := ProvideCleanupService(fakeSQL, cfg)
affectedAnnotations, affectedAnnotationTags, err := cleaner.Run(context.Background(), test.cfg)
require.NoError(t, err)
Expand All @@ -111,7 +148,11 @@ func TestAnnotationCleanUp(t *testing.T) {
}
}

func TestOldAnnotationsAreDeletedFirst(t *testing.T) {
func TestIntegrationOldAnnotationsAreDeletedFirst(t *testing.T) {
if testing.Short() {
t.Skip("Skipping integration test")
}

fakeSQL := db.InitTestDB(t)

t.Cleanup(func() {
Expand Down Expand Up @@ -193,8 +234,11 @@ func createTestAnnotations(t *testing.T, store db.DB, expectedCount int, oldAnno

cutoffDate := time.Now()

newAnnotations := make([]*annotations.Item, 0, expectedCount)
newAnnotationTags := make([]*annotationTag, 0, 2*expectedCount)
for i := 0; i < expectedCount; i++ {
a := &annotations.Item{
ID: int64(i + 1),
DashboardID: 1,
OrgID: 1,
UserID: 1,
Expand Down Expand Up @@ -222,20 +266,29 @@ func createTestAnnotations(t *testing.T, store db.DB, expectedCount int, oldAnno
a.Created = cutoffDate.AddDate(-10, 0, -10).UnixNano() / int64(time.Millisecond)
}

err := store.WithDbSession(context.Background(), func(sess *db.Session) error {
_, err := sess.Insert(a)
require.NoError(t, err, "should be able to save annotation", err)

// mimic the SQL annotation Save logic by writing records to the annotation_tag table
// we need to ensure they get deleted when we clean up annotations
for tagID := range []int{1, 2} {
_, err = sess.Exec("INSERT INTO annotation_tag (annotation_id, tag_id) VALUES(?,?)", a.ID, tagID)
require.NoError(t, err, "should be able to save annotation tag ID", err)
}
return err
})
require.NoError(t, err)
newAnnotations = append(newAnnotations, a)
newAnnotationTags = append(newAnnotationTags, &annotationTag{AnnotationID: a.ID, TagID: 1}, &annotationTag{AnnotationID: a.ID, TagID: 2})
}

err := store.WithDbSession(context.Background(), func(sess *db.Session) error {
batchsize := 500
for i := 0; i < len(newAnnotations); i += batchsize {
_, err := sess.InsertMulti(newAnnotations[i:min(i+batchsize, len(newAnnotations))])
require.NoError(t, err)
}
return nil
})
require.NoError(t, err)

err = store.WithDbSession(context.Background(), func(sess *db.Session) error {
batchsize := 500
for i := 0; i < len(newAnnotationTags); i += batchsize {
_, err := sess.InsertMulti(newAnnotationTags[i:min(i+batchsize, len(newAnnotationTags))])
require.NoError(t, err)
}
return nil
})
require.NoError(t, err)
}

func settingsFn(maxAge time.Duration, maxCount int64) setting.AnnotationCleanupSettings {
Expand Down
130 changes: 107 additions & 23 deletions pkg/services/annotations/annotationsimpl/xorm_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"time"

"github.com/grafana/grafana/pkg/services/annotations/accesscontrol"
"github.com/grafana/grafana/pkg/services/sqlstore/migrator"

"github.com/grafana/grafana/pkg/infra/db"
"github.com/grafana/grafana/pkg/infra/log"
Expand Down Expand Up @@ -519,52 +520,135 @@ func (r *xormRepositoryImpl) CleanAnnotations(ctx context.Context, cfg setting.A
var totalAffected int64
if cfg.MaxAge > 0 {
cutoffDate := timeNow().Add(-cfg.MaxAge).UnixNano() / int64(time.Millisecond)
deleteQuery := `DELETE FROM annotation WHERE id IN (SELECT id FROM (SELECT id FROM annotation WHERE %s AND created < %v ORDER BY id DESC %s) a)`
sql := fmt.Sprintf(deleteQuery, annotationType, cutoffDate, r.db.GetDialect().Limit(r.cfg.AnnotationCleanupJobBatchSize))
// Single-statement approaches, specifically ones using batched sub-queries, seem to deadlock with concurrent inserts on MySQL.
// We have a bounded batch size, so work around this by first loading the IDs into memory and allowing any locks to flush inside each batch.
// This may under-delete when concurrent inserts happen, but any such annotations will simply be cleaned on the next cycle.
//
// We execute the following batched operation repeatedly until either we run out of objects, the context is cancelled, or there is an error.
affected, err := untilDoneOrCancelled(ctx, func() (int64, error) {
cond := fmt.Sprintf(`%s AND created < %v ORDER BY id DESC %s`, annotationType, cutoffDate, r.db.GetDialect().Limit(r.cfg.AnnotationCleanupJobBatchSize))
ids, err := r.fetchIDs(ctx, "annotation", cond)
if err != nil {
return 0, err
}
r.log.Error("Annotations to clean by time", "count", len(ids), "ids", ids, "cond", cond, "err", err)

affected, err := r.executeUntilDoneOrCancelled(ctx, sql)
x, y := r.deleteByIDs(ctx, "annotation", ids)
r.log.Error("cleaned annotations by time", "count", len(ids), "affected", x, "err", y)
return x, y
Comment on lines +534 to +538

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Error-level logs used for routine operational messages; terse variable names.

Lines 534, 537, 554, 557, 576, 579 all use r.log.Error(...) for what are clearly informational/debug messages (e.g., "Annotations to clean by time"). In production this will fire alerts and pollute error dashboards. Additionally, logging the full ids slice at any level will be extremely verbose for large batches (up to 32 767 entries).

The variables x, y on Lines 536–538 (and 556–558, 578–579) should use descriptive names.

Proposed fix (showing the MaxAge block; apply the same pattern to MaxCount and orphaned tags)
-			r.log.Error("Annotations to clean by time", "count", len(ids), "ids", ids, "cond", cond, "err", err)
+			r.log.Debug("Annotations to clean by time", "count", len(ids))

-			x, y := r.deleteByIDs(ctx, "annotation", ids)
-			r.log.Error("cleaned annotations by time", "count", len(ids), "affected", x, "err", y)
-			return x, y
+			affected, err := r.deleteByIDs(ctx, "annotation", ids)
+			r.log.Debug("cleaned annotations by time", "count", len(ids), "affected", affected, "err", err)
+			return affected, err
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
r.log.Error("Annotations to clean by time", "count", len(ids), "ids", ids, "cond", cond, "err", err)
affected, err := r.executeUntilDoneOrCancelled(ctx, sql)
x, y := r.deleteByIDs(ctx, "annotation", ids)
r.log.Error("cleaned annotations by time", "count", len(ids), "affected", x, "err", y)
return x, y
r.log.Debug("Annotations to clean by time", "count", len(ids))
affected, err := r.deleteByIDs(ctx, "annotation", ids)
r.log.Debug("cleaned annotations by time", "count", len(ids), "affected", affected, "err", err)
return affected, err
🤖 Prompt for AI Agents
In `@pkg/services/annotations/annotationsimpl/xorm_store.go` around lines 534 -
538, Change the routine-use r.log.Error calls (e.g., where message is
"Annotations to clean by time" and "cleaned annotations by time") to non-error
levels like Info or Debug, stop logging the full ids slice (log its length and
at most the first N IDs or a summarized sample), and replace terse variables x,y
returned from deleteByIDs with descriptive names (e.g., deletedCount, deleteErr
or affected, err) so the call and subsequent r.log.* usage reads clearly; apply
the same pattern to the MaxCount and orphaned-tags blocks and any other similar
r.log.Error usages.

})
totalAffected += affected
if err != nil {
return totalAffected, err
}
}

if cfg.MaxCount > 0 {
deleteQuery := `DELETE FROM annotation WHERE id IN (SELECT id FROM (SELECT id FROM annotation WHERE %s ORDER BY id DESC %s) a)`
sql := fmt.Sprintf(deleteQuery, annotationType, r.db.GetDialect().LimitOffset(r.cfg.AnnotationCleanupJobBatchSize, cfg.MaxCount))
affected, err := r.executeUntilDoneOrCancelled(ctx, sql)
// Similar strategy as the above cleanup process, to avoid deadlocks.
affected, err := untilDoneOrCancelled(ctx, func() (int64, error) {
cond := fmt.Sprintf(`%s ORDER BY id DESC %s`, annotationType, r.db.GetDialect().LimitOffset(r.cfg.AnnotationCleanupJobBatchSize, cfg.MaxCount))
ids, err := r.fetchIDs(ctx, "annotation", cond)
if err != nil {
return 0, err
}
r.log.Error("Annotations to clean by count", "count", len(ids), "ids", ids, "cond", cond, "err", err)

x, y := r.deleteByIDs(ctx, "annotation", ids)
r.log.Error("cleaned annotations by count", "count", len(ids), "affected", x, "err", y)
return x, y
})
totalAffected += affected
return totalAffected, err
if err != nil {
return totalAffected, err
}
}

return totalAffected, nil
}

func (r *xormRepositoryImpl) CleanOrphanedAnnotationTags(ctx context.Context) (int64, error) {
deleteQuery := `DELETE FROM annotation_tag WHERE id IN ( SELECT id FROM (SELECT id FROM annotation_tag WHERE NOT EXISTS (SELECT 1 FROM annotation a WHERE annotation_id = a.id) %s) a)`
sql := fmt.Sprintf(deleteQuery, r.db.GetDialect().Limit(r.cfg.AnnotationCleanupJobBatchSize))
return r.executeUntilDoneOrCancelled(ctx, sql)
return untilDoneOrCancelled(ctx, func() (int64, error) {
cond := fmt.Sprintf(`NOT EXISTS (SELECT 1 FROM annotation a WHERE annotation_id = a.id) %s`, r.db.GetDialect().Limit(r.cfg.AnnotationCleanupJobBatchSize))
ids, err := r.fetchIDs(ctx, "annotation_tag", cond)
if err != nil {
return 0, err
}
r.log.Error("Tags to clean", "count", len(ids), "ids", ids, "cond", cond, "err", err)

x, y := r.deleteByIDs(ctx, "annotation_tag", ids)
r.log.Error("cleaned tags", "count", len(ids), "affected", x, "err", y)
return x, y
})
}
Comment on lines 569 to +582

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# First, find migration files related to annotation_tag
find . -type f \( -name "*.sql" -o -name "*.go" \) | grep -i migr | head -20

Repository: admin-coderabbit/coderabbit-grafana-mirror

Length of output: 1194


🏁 Script executed:

#!/bin/bash
# Search for annotationTag struct definition
rg -n "type.*annotationTag" --type=go

Repository: admin-coderabbit/coderabbit-grafana-mirror

Length of output: 265


🏁 Script executed:

#!/bin/bash
# Look for annotation_tag table creation in migrations
rg -n "annotation_tag" -A5 -B5 --type=go | grep -A5 -B5 "CREATE\|AddTable\|table" | head -50

Repository: admin-coderabbit/coderabbit-grafana-mirror

Length of output: 4561


🏁 Script executed:

#!/bin/bash
# Search for the specific struct definition around line 663-666 mentioned in the review
rg -n "annotation_id|tag_id" --type=go -A2 -B2 | grep -A5 -B5 "type"

Repository: admin-coderabbit/coderabbit-grafana-mirror

Length of output: 1856


🏁 Script executed:

#!/bin/bash
# Get full annotation_tag table definition from migration
rg -A20 "annotationTagTableV3 := Table{" pkg/services/sqlstore/migrations/annotation_mig.go

Repository: admin-coderabbit/coderabbit-grafana-mirror

Length of output: 209


🏁 Script executed:

#!/bin/bash
# Also check the v2 definition
rg -B5 -A15 "Create annotation_tag table v2" pkg/services/sqlstore/migrations/annotation_mig.go

Repository: admin-coderabbit/coderabbit-grafana-mirror

Length of output: 901


🏁 Script executed:

#!/bin/bash
# Look for Column definitions in the migration
rg -A30 'annotationTagTable := Table' pkg/services/sqlstore/migrations/annotation_mig.go | head -60

Repository: admin-coderabbit/coderabbit-grafana-mirror

Length of output: 1197


Add missing id field to annotationTag struct.

The annotation_tag table schema (v3 migration) includes an id column as the primary key with autoincrement, but the production struct at lines 663–666 omits it. The test struct at lines 591–595 correctly includes `ID int64` with the tag `xorm:"pk autoincr 'id'"`. Without this field, calls to `fetchIDs()` (line 572) and `deleteByIDs()` (line 578) will fail at runtime because xorm cannot map the `id` column from query results.

Current struct definition
type annotationTag struct {
	AnnotationID int64 `xorm:"annotation_id"`
	TagID        int64 `xorm:"tag_id"`
}

Update the struct to match the schema:

type annotationTag struct {
	ID           int64 `xorm:"pk autoincr 'id'"`
	AnnotationID int64 `xorm:"annotation_id"`
	TagID        int64 `xorm:"tag_id"`
}
🤖 Prompt for AI Agents
In `@pkg/services/annotations/annotationsimpl/xorm_store.go` around lines 569 -
582, The annotationTag struct is missing the primary key field causing xorm
mapping failures for fetchIDs and deleteByIDs on the annotation_tag table;
update the annotationTag type to include the ID field with the xorm tag for
primary/autoincrement (e.g., add ID int64 with `xorm:"pk autoincr 'id'"`) so the
struct matches the v3 migration schema and allows fetchIDs/deleteByIDs to map
and delete rows correctly.


// fetchIDs returns the id column of every row in table that matches condition.
// A condition is mandatory; refusing to run unfiltered guards against
// accidentally loading an entire table's IDs into memory.
func (r *xormRepositoryImpl) fetchIDs(ctx context.Context, table, condition string) ([]int64, error) {
	if condition == "" {
		return nil, fmt.Errorf("condition must be supplied; cannot fetch IDs from entire table")
	}

	query := fmt.Sprintf(`SELECT id FROM %s WHERE %s`, table, condition)
	ids := make([]int64, 0)
	err := r.db.WithDbSession(ctx, func(session *db.Session) error {
		return session.SQL(query).Find(&ids)
	})
	return ids, err
}

func (r *xormRepositoryImpl) executeUntilDoneOrCancelled(ctx context.Context, sql string) (int64, error) {
// deleteByIDs deletes the rows of table whose id is in ids and returns the
// number of rows affected. An empty ids slice is a no-op and returns (0, nil).
func (r *xormRepositoryImpl) deleteByIDs(ctx context.Context, table string, ids []int64) (int64, error) {
	if len(ids) == 0 {
		return 0, nil
	}

	sql := ""
	args := make([]any, 0)

	// SQLite has a parameter limit of 999.
	// If the batch size is bigger than that, and we're on SQLite, we have to put the IDs directly into the statement.
	const sqliteParameterLimit = 999
	if r.db.GetDBType() == migrator.SQLite && r.cfg.AnnotationCleanupJobBatchSize > sqliteParameterLimit {
		// Build the IN-list with a strings.Builder: the previous
		// fmt.Sprintf-in-a-loop approach re-copied the accumulated string on
		// every iteration, i.e. O(n^2) for batches of up to 32767 IDs.
		var values strings.Builder
		for i, v := range ids {
			if i > 0 {
				values.WriteString(", ")
			}
			fmt.Fprintf(&values, "%d", v)
		}
		sql = fmt.Sprintf(`DELETE FROM %s WHERE id IN (%s)`, table, values.String())
	} else {
		// Parameterized form: one "?" placeholder per ID.
		placeholders := "?" + strings.Repeat(",?", len(ids)-1)
		sql = fmt.Sprintf(`DELETE FROM %s WHERE id IN (%s)`, table, placeholders)
		args = asAny(ids)
	}

	var affected int64
	err := r.db.WithDbSession(ctx, func(session *db.Session) error {
		res, err := session.Exec(append([]any{sql}, args...)...)
		if err != nil {
			return err
		}
		affected, err = res.RowsAffected()
		return err
	})
	return affected, err
}

// asAny converts a slice of int64 into a slice of any, preserving order,
// so the values can be passed as variadic SQL arguments.
func asAny(vs []int64) []any {
	out := make([]any, 0, len(vs))
	for _, v := range vs {
		out = append(out, v)
	}
	return out
}

// untilDoneOrCancelled repeatedly executes batched work until that work is either done (i.e., returns zero affected objects),
// a batch produces an error, or the provided context is cancelled.
// The work to be done is given as a callback that returns the number of affected objects for each batch, plus that batch's errors.
func untilDoneOrCancelled(ctx context.Context, batchWork func() (int64, error)) (int64, error) {
var totalAffected int64
for {
select {
case <-ctx.Done():
return totalAffected, ctx.Err()
default:
var affected int64
err := r.db.WithDbSession(ctx, func(session *db.Session) error {
res, err := session.Exec(sql)
if err != nil {
return err
}

affected, err = res.RowsAffected()
totalAffected += affected

return err
})
affected, err := batchWork()
totalAffected += affected
if err != nil {
return totalAffected, err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/services/cleanup/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func (j cleanUpJob) String() string {
func (srv *CleanUpService) Run(ctx context.Context) error {
srv.cleanUpTmpFiles(ctx)

ticker := time.NewTicker(time.Minute * 10)
ticker := time.NewTicker(time.Minute * 1)
for {
select {
case <-ticker.C:
Expand Down