Skip to content

Automated Test: authz-service-improve-caching-pr #323

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions pkg/services/authz/rbac.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,11 @@ func ProvideAuthZClient(
return ctx, nil
}))
authzv1.RegisterAuthzServiceServer(channel, server)
rbacClient := newRBACClient(channel, tracer)
rbacClient := authzlib.NewClient(
channel,
authzlib.WithCacheClientOption(&NoopCache{}),
authzlib.WithTracerClientOption(tracer),
)

if features.IsEnabledGlobally(featuremgmt.FlagZanzana) {
return zanzana.WithShadowClient(rbacClient, zanzanaClient, reg)
Expand Down Expand Up @@ -153,18 +157,16 @@ func newRemoteRBACClient(clientCfg *authzClientSettings, tracer trace.Tracer) (a
return nil, fmt.Errorf("failed to create authz client to remote server: %w", err)
}

return newRBACClient(conn, tracer), nil
}

func newRBACClient(conn grpc.ClientConnInterface, tracer trace.Tracer) authlib.AccessClient {
return authzlib.NewClient(
client := authzlib.NewClient(
conn,
authzlib.WithCacheClientOption(cache.NewLocalCache(cache.Config{
Expiry: 30 * time.Second,
CleanupInterval: 2 * time.Minute,
})),
authzlib.WithTracerClientOption(tracer),
)

return client, nil
}

func RegisterRBACAuthZService(
Expand Down Expand Up @@ -233,3 +235,17 @@ func (t tokenExhangeRoundTripper) RoundTrip(r *http.Request) (*http.Response, er
r.Header.Set("X-Access-Token", "Bearer "+res.Token)
return t.rt.RoundTrip(r)
}

type NoopCache struct{}

func (lc *NoopCache) Get(ctx context.Context, key string) ([]byte, error) {
return nil, cache.ErrNotFound
}

func (lc *NoopCache) Set(ctx context.Context, key string, data []byte, exp time.Duration) error {
return nil
}

func (lc *NoopCache) Delete(ctx context.Context, key string) error {
return nil
}
4 changes: 4 additions & 0 deletions pkg/services/authz/rbac/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ func userPermCacheKey(namespace, userUID, action string) string {
return namespace + ".perm_" + userUID + "_" + action
}

func userPermDenialCacheKey(namespace, userUID, action, name, parent string) string {
return namespace + ".perm_" + userUID + "_" + action + "_" + name + "_" + parent
}

func userBasicRoleCacheKey(namespace, userUID string) string {
return namespace + ".basic_role_" + userUID
}
Expand Down
93 changes: 74 additions & 19 deletions pkg/services/authz/rbac/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,12 @@ type Service struct {
sf *singleflight.Group

// Cache for user permissions, user team memberships and user basic roles
idCache *cacheWrap[store.UserIdentifiers]
permCache *cacheWrap[map[string]bool]
teamCache *cacheWrap[[]int64]
basicRoleCache *cacheWrap[store.BasicRole]
folderCache *cacheWrap[folderTree]
idCache *cacheWrap[store.UserIdentifiers]
permCache *cacheWrap[map[string]bool]
permDenialCache *cacheWrap[bool]
teamCache *cacheWrap[[]int64]
basicRoleCache *cacheWrap[store.BasicRole]
folderCache *cacheWrap[folderTree]
}

func NewService(
Expand All @@ -81,6 +82,7 @@ func NewService(
mapper: newMapper(),
idCache: newCacheWrap[store.UserIdentifiers](cache, logger, longCacheTTL),
permCache: newCacheWrap[map[string]bool](cache, logger, shortCacheTTL),
permDenialCache: newCacheWrap[bool](cache, logger, shortCacheTTL),
teamCache: newCacheWrap[[]int64](cache, logger, shortCacheTTL),
basicRoleCache: newCacheWrap[store.BasicRole](cache, logger, shortCacheTTL),
folderCache: newCacheWrap[folderTree](cache, logger, shortCacheTTL),
Expand Down Expand Up @@ -111,6 +113,29 @@ func (s *Service) Check(ctx context.Context, req *authzv1.CheckRequest) (*authzv
attribute.String("folder", checkReq.ParentFolder),
)

permDenialKey := userPermDenialCacheKey(checkReq.Namespace.Value, checkReq.UserUID, checkReq.Action, checkReq.Name, checkReq.ParentFolder)
if _, ok := s.permDenialCache.Get(ctx, permDenialKey); ok {
s.metrics.permissionCacheUsage.WithLabelValues("true", checkReq.Action).Inc()
s.metrics.requestCount.WithLabelValues("false", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return &authzv1.CheckResponse{Allowed: false}, nil
}
Comment on lines +116 to +121

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Denial cache takes absolute precedence — could serve stale denials after permission grants.

The denial cache short-circuits before even checking the permission cache. If an admin grants a user access, the denial cache entry (up to 30s TTL) will continue to reject requests even though the permission cache would have expired and the DB would return the updated permissions.

Since the permission cache and denial cache share the same TTL but are keyed differently (perm cache by action only, denial cache by action+name+parent), there's no coordinated invalidation. This is likely an acceptable trade-off given the short TTL, but it should be documented and considered if TTLs are ever increased.

🤖 Prompt for AI Agents
In `@pkg/services/authz/rbac/service.go` around lines 116 - 121, The denial cache
(permDenialCache / permDenialKey) is checked before the permission cache which
can cause stale denials to override newly granted permissions; change the logic
to consult the permission cache (e.g., s.permCache.Get or the existing
permission lookup path) before short-circuiting on s.permDenialCache.Get, or if
keeping the denial-first order, validate the denial entry against the permission
cache/DB (invalidate denial if a positive permission exists) so grants are not
blocked while perm cache TTLs expire.


cachedPerms, err := s.getCachedIdentityPermissions(ctx, checkReq.Namespace, checkReq.IdentityType, checkReq.UserUID, checkReq.Action)
if err == nil {
allowed, err := s.checkPermission(ctx, cachedPerms, checkReq)
if err != nil {
ctxLogger.Error("could not check permission", "error", err)
s.metrics.requestCount.WithLabelValues("true", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return deny, err
}
if allowed {
s.metrics.permissionCacheUsage.WithLabelValues("true", checkReq.Action).Inc()
s.metrics.requestCount.WithLabelValues("false", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return &authzv1.CheckResponse{Allowed: allowed}, nil
}
}
s.metrics.permissionCacheUsage.WithLabelValues("false", checkReq.Action).Inc()

permissions, err := s.getIdentityPermissions(ctx, checkReq.Namespace, checkReq.IdentityType, checkReq.UserUID, checkReq.Action)
if err != nil {
ctxLogger.Error("could not get user permissions", "subject", req.GetSubject(), "error", err)
Expand All @@ -125,6 +150,10 @@ func (s *Service) Check(ctx context.Context, req *authzv1.CheckRequest) (*authzv
return deny, err
}

if !allowed {
s.permDenialCache.Set(ctx, permDenialKey, true)
}

s.metrics.requestCount.WithLabelValues("false", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return &authzv1.CheckResponse{Allowed: allowed}, nil
}
Expand All @@ -148,11 +177,18 @@ func (s *Service) List(ctx context.Context, req *authzv1.ListRequest) (*authzv1.
attribute.String("action", listReq.Action),
)

permissions, err := s.getIdentityPermissions(ctx, listReq.Namespace, listReq.IdentityType, listReq.UserUID, listReq.Action)
if err != nil {
ctxLogger.Error("could not get user permissions", "subject", req.GetSubject(), "error", err)
s.metrics.requestCount.WithLabelValues("true", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return nil, err
permissions, err := s.getCachedIdentityPermissions(ctx, listReq.Namespace, listReq.IdentityType, listReq.UserUID, listReq.Action)
if err == nil {
s.metrics.permissionCacheUsage.WithLabelValues("true", listReq.Action).Inc()
} else {
s.metrics.permissionCacheUsage.WithLabelValues("false", listReq.Action).Inc()

permissions, err = s.getIdentityPermissions(ctx, listReq.Namespace, listReq.IdentityType, listReq.UserUID, listReq.Action)
if err != nil {
ctxLogger.Error("could not get user permissions", "subject", req.GetSubject(), "error", err)
s.metrics.requestCount.WithLabelValues("true", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return nil, err
}
}
Comment on lines +180 to 192

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Inconsistent caching strategy between Check and List.

Check (lines 123–136) falls through to the database when cached permissions deny access, handling the "outdated cache" case. List, however, directly uses cached permissions without any fallthrough. This means:

  • Check on resource X: cache says no → DB fetch → finds new permission → allows
  • List: cache says no X → returns stale list without X

This could lead to a brief inconsistency where Check allows access to a resource that List doesn't include. Consider applying the same fallthrough logic or documenting this as intentional.

🤖 Prompt for AI Agents
In `@pkg/services/authz/rbac/service.go` around lines 180 - 192, List currently
uses getCachedIdentityPermissions and accepts its result as authoritative,
unlike Check which falls through to getIdentityPermissions when cache indicates
denial; update the List function to mirror Check's behavior by detecting a
cached "deny" or stale/empty permission result from
s.getCachedIdentityPermissions(ctx, listReq.Namespace, listReq.IdentityType,
listReq.UserUID, listReq.Action) and then calling s.getIdentityPermissions(ctx,
listReq.Namespace, listReq.IdentityType, listReq.UserUID, listReq.Action) to
refresh from the DB; ensure you update the metrics labels the same way Check
does and preserve the existing error logging (ctxLogger.Error with subject and
error) and return semantics when the DB fetch fails.


resp, err := s.listPermission(ctx, permissions, listReq)
Expand Down Expand Up @@ -303,6 +339,34 @@ func (s *Service) getIdentityPermissions(ctx context.Context, ns types.Namespace
}
}

func (s *Service) getCachedIdentityPermissions(ctx context.Context, ns types.NamespaceInfo, idType types.IdentityType, userID, action string) (map[string]bool, error) {
ctx, span := s.tracer.Start(ctx, "authz_direct_db.service.getCachedIdentityPermissions")
defer span.End()

switch idType {
case types.TypeAnonymous:
anonPermKey := anonymousPermCacheKey(ns.Value, action)
if cached, ok := s.permCache.Get(ctx, anonPermKey); ok {
return cached, nil
}
return nil, cache.ErrNotFound
case types.TypeRenderService:
return nil, cache.ErrNotFound
case types.TypeUser, types.TypeServiceAccount:
userIdentifiers, err := s.GetUserIdentifiers(ctx, ns, userID)
if err != nil {
return nil, err
}
userPermKey := userPermCacheKey(ns.Value, userIdentifiers.UID, action)
if cached, ok := s.permCache.Get(ctx, userPermKey); ok {
return cached, nil
}
return nil, cache.ErrNotFound
default:
return nil, fmt.Errorf("unsupported identity type: %s", idType)
}
}

func (s *Service) getUserPermissions(ctx context.Context, ns types.NamespaceInfo, userID, action string, actionSets []string) (map[string]bool, error) {
ctx, span := s.tracer.Start(ctx, "authz_direct_db.service.getUserPermissions")
defer span.End()
Expand All @@ -313,12 +377,6 @@ func (s *Service) getUserPermissions(ctx context.Context, ns types.NamespaceInfo
}

userPermKey := userPermCacheKey(ns.Value, userIdentifiers.UID, action)
if cached, ok := s.permCache.Get(ctx, userPermKey); ok {
s.metrics.permissionCacheUsage.WithLabelValues("true", action).Inc()
return cached, nil
}
s.metrics.permissionCacheUsage.WithLabelValues("false", action).Inc()

res, err, _ := s.sf.Do(userPermKey+"_getUserPermissions", func() (interface{}, error) {
basicRoles, err := s.getUserBasicRole(ctx, ns, userIdentifiers)
if err != nil {
Expand Down Expand Up @@ -363,9 +421,6 @@ func (s *Service) getAnonymousPermissions(ctx context.Context, ns types.Namespac
defer span.End()

anonPermKey := anonymousPermCacheKey(ns.Value, action)
if cached, ok := s.permCache.Get(ctx, anonPermKey); ok {
return cached, nil
}
res, err, _ := s.sf.Do(anonPermKey+"_getAnonymousPermissions", func() (interface{}, error) {
permissions, err := s.permissionStore.GetUserPermissions(ctx, ns, store.PermissionsQuery{Action: action, ActionSets: actionSets, Role: "Viewer"})
if err != nil {
Expand Down
148 changes: 140 additions & 8 deletions pkg/services/authz/rbac/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -339,14 +339,6 @@ func TestService_getUserPermissions(t *testing.T) {
}

testCases := []testCase{
{
name: "should return permissions from cache if available",
permissions: []accesscontrol.Permission{
{Action: "dashboards:read", Scope: "dashboards:uid:some_dashboard"},
},
cacheHit: true,
expectedPerms: map[string]bool{"dashboards:uid:some_dashboard": true},
},
{
name: "should return permissions from store if not in cache",
permissions: []accesscontrol.Permission{
Expand Down Expand Up @@ -898,6 +890,111 @@ func TestService_Check(t *testing.T) {
})
}

func TestService_CacheCheck(t *testing.T) {
callingService := authn.NewAccessTokenAuthInfo(authn.Claims[authn.AccessTokenClaims]{
Claims: jwt.Claims{
Subject: types.NewTypeID(types.TypeAccessPolicy, "some-service"),
Audience: []string{"authzservice"},
},
Rest: authn.AccessTokenClaims{Namespace: "org-12"},
})

ctx := types.WithAuthInfo(context.Background(), callingService)
userID := &store.UserIdentifiers{UID: "test-uid", ID: 1}

t.Run("Allow based on cached permissions", func(t *testing.T) {
s := setupService()

s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)
s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": true})

resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash1",
})
require.NoError(t, err)
assert.True(t, resp.Allowed)
})
t.Run("Fallback to the database on cache miss", func(t *testing.T) {
s := setupService()

// Populate database permission but not the cache
store := &fakeStore{
userID: userID,
userPermissions: []accesscontrol.Permission{{Action: "dashboards:read", Scope: "dashboards:uid:dash2"}},
}

s.store = store
s.permissionStore = store

s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)

resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash2",
})
require.NoError(t, err)
assert.True(t, resp.Allowed)
})
t.Run("Fallback to the database on outdated cache", func(t *testing.T) {
s := setupService()

store := &fakeStore{
userID: userID,
userPermissions: []accesscontrol.Permission{{Action: "dashboards:read", Scope: "dashboards:uid:dash2"}},
}

s.store = store
s.permissionStore = store

s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)
// The cache does not have the permission for dash2 (outdated)
s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": true})

resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash2",
})
require.NoError(t, err)
assert.True(t, resp.Allowed)
})
t.Run("Should deny on explicit cache deny entry", func(t *testing.T) {
s := setupService()

s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)

// Explicitly deny access to the dashboard
s.permDenialCache.Set(ctx, userPermDenialCacheKey("org-12", "test-uid", "dashboards:read", "dash1", "fold1"), true)

// Allow access to the dashboard to prove this is not checked
s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": false})

resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash1",
Folder: "fold1",
})
require.NoError(t, err)
assert.False(t, resp.Allowed)
})
Comment on lines +973 to +995

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Test comment is misleading — perm cache value false doesn't actually "allow" access.

Line 982 sets map[string]bool{"dashboards:uid:dash1": false}, with the comment "Allow access to the dashboard to prove this is not checked." However, a false value in the scope map would deny access anyway. To truly prove that the denial cache short-circuits before the permission cache is consulted, set the value to true:

🧪 Suggested fix for a more convincing test
-		// Allow access to the dashboard to prove this is not checked
-		s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": false})
+		// Set permission cache to allow access — proves denial cache takes precedence
+		s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": true})
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
t.Run("Should deny on explicit cache deny entry", func(t *testing.T) {
s := setupService()
s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)
// Explicitly deny access to the dashboard
s.permDenialCache.Set(ctx, userPermDenialCacheKey("org-12", "test-uid", "dashboards:read", "dash1", "fold1"), true)
// Allow access to the dashboard to prove this is not checked
s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": false})
resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash1",
Folder: "fold1",
})
require.NoError(t, err)
assert.False(t, resp.Allowed)
})
t.Run("Should deny on explicit cache deny entry", func(t *testing.T) {
s := setupService()
s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)
// Explicitly deny access to the dashboard
s.permDenialCache.Set(ctx, userPermDenialCacheKey("org-12", "test-uid", "dashboards:read", "dash1", "fold1"), true)
// Set permission cache to allow access — proves denial cache takes precedence
s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": true})
resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash1",
Folder: "fold1",
})
require.NoError(t, err)
assert.False(t, resp.Allowed)
})
🤖 Prompt for AI Agents
In `@pkg/services/authz/rbac/service_test.go` around lines 973 - 995, The test's
claim that the perm cache "allows" access is wrong because it sets
map[string]bool{"dashboards:uid:dash1": false}; update the test to set the
permission entry to true so it actually grants access in the perm cache and thus
demonstrates that the explicit denial in s.permDenialCache
(userPermDenialCacheKey) short-circuits Check; locate the test block around func
t.Run "Should deny on explicit cache deny entry" and change the s.permCache.Set
call (using userPermCacheKey) to store a map with the dashboard key set to true
so the final require/assert still expects resp.Allowed == false.

}

func TestService_List(t *testing.T) {
callingService := authn.NewAccessTokenAuthInfo(authn.Claims[authn.AccessTokenClaims]{
Claims: jwt.Claims{
Expand Down Expand Up @@ -1191,6 +1288,40 @@ func TestService_List(t *testing.T) {
})
}

func TestService_CacheList(t *testing.T) {
callingService := authn.NewAccessTokenAuthInfo(authn.Claims[authn.AccessTokenClaims]{
Claims: jwt.Claims{
Subject: types.NewTypeID(types.TypeAccessPolicy, "some-service"),
Audience: []string{"authzservice"},
},
Rest: authn.AccessTokenClaims{Namespace: "org-12"},
})

t.Run("List based on cached permissions", func(t *testing.T) {
s := setupService()
ctx := types.WithAuthInfo(context.Background(), callingService)
userID := &store.UserIdentifiers{UID: "test-uid", ID: 1}
s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)
s.permCache.Set(ctx,
userPermCacheKey("org-12", "test-uid", "dashboards:read"),
map[string]bool{"dashboards:uid:dash1": true, "dashboards:uid:dash2": true, "folders:uid:fold1": true},
)
s.identityStore = &fakeIdentityStore{}

resp, err := s.List(ctx, &authzv1.ListRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "list",
})

require.NoError(t, err)
require.ElementsMatch(t, resp.Items, []string{"dash1", "dash2"})
require.ElementsMatch(t, resp.Folders, []string{"fold1"})
})
}

func setupService() *Service {
cache := cache.NewLocalCache(cache.Config{Expiry: 5 * time.Minute, CleanupInterval: 5 * time.Minute})
logger := log.New("authz-rbac-service")
Expand All @@ -1202,6 +1333,7 @@ func setupService() *Service {
metrics: newMetrics(nil),
idCache: newCacheWrap[store.UserIdentifiers](cache, logger, longCacheTTL),
permCache: newCacheWrap[map[string]bool](cache, logger, shortCacheTTL),
permDenialCache: newCacheWrap[bool](cache, logger, shortCacheTTL),
teamCache: newCacheWrap[[]int64](cache, logger, shortCacheTTL),
basicRoleCache: newCacheWrap[store.BasicRole](cache, logger, longCacheTTL),
folderCache: newCacheWrap[folderTree](cache, logger, shortCacheTTL),
Expand Down