Skip to content

Automated Test: authz-service-improve-caching-pr #341

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions pkg/services/authz/rbac.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,11 @@ func ProvideAuthZClient(
return ctx, nil
}))
authzv1.RegisterAuthzServiceServer(channel, server)
rbacClient := newRBACClient(channel, tracer)
rbacClient := authzlib.NewClient(
channel,
authzlib.WithCacheClientOption(&NoopCache{}),
authzlib.WithTracerClientOption(tracer),
)

if features.IsEnabledGlobally(featuremgmt.FlagZanzana) {
return zanzana.WithShadowClient(rbacClient, zanzanaClient, reg)
Expand Down Expand Up @@ -153,18 +157,16 @@ func newRemoteRBACClient(clientCfg *authzClientSettings, tracer trace.Tracer) (a
return nil, fmt.Errorf("failed to create authz client to remote server: %w", err)
}

return newRBACClient(conn, tracer), nil
}

func newRBACClient(conn grpc.ClientConnInterface, tracer trace.Tracer) authlib.AccessClient {
return authzlib.NewClient(
client := authzlib.NewClient(
conn,
authzlib.WithCacheClientOption(cache.NewLocalCache(cache.Config{
Expiry: 30 * time.Second,
CleanupInterval: 2 * time.Minute,
})),
authzlib.WithTracerClientOption(tracer),
)

return client, nil
}

func RegisterRBACAuthZService(
Expand Down Expand Up @@ -233,3 +235,17 @@ func (t tokenExhangeRoundTripper) RoundTrip(r *http.Request) (*http.Response, er
r.Header.Set("X-Access-Token", "Bearer "+res.Token)
return t.rt.RoundTrip(r)
}

type NoopCache struct{}

func (lc *NoopCache) Get(ctx context.Context, key string) ([]byte, error) {
return nil, cache.ErrNotFound
}

func (lc *NoopCache) Set(ctx context.Context, key string, data []byte, exp time.Duration) error {
return nil
}

func (lc *NoopCache) Delete(ctx context.Context, key string) error {
return nil
}
4 changes: 4 additions & 0 deletions pkg/services/authz/rbac/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ func userPermCacheKey(namespace, userUID, action string) string {
return namespace + ".perm_" + userUID + "_" + action
}

func userPermDenialCacheKey(namespace, userUID, action, name, parent string) string {
return namespace + ".perm_" + userUID + "_" + action + "_" + name + "_" + parent
}

func userBasicRoleCacheKey(namespace, userUID string) string {
return namespace + ".basic_role_" + userUID
}
Expand Down
93 changes: 74 additions & 19 deletions pkg/services/authz/rbac/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,12 @@ type Service struct {
sf *singleflight.Group

// Cache for user permissions, user team memberships and user basic roles
idCache *cacheWrap[store.UserIdentifiers]
permCache *cacheWrap[map[string]bool]
teamCache *cacheWrap[[]int64]
basicRoleCache *cacheWrap[store.BasicRole]
folderCache *cacheWrap[folderTree]
idCache *cacheWrap[store.UserIdentifiers]
permCache *cacheWrap[map[string]bool]
permDenialCache *cacheWrap[bool]
teamCache *cacheWrap[[]int64]
basicRoleCache *cacheWrap[store.BasicRole]
folderCache *cacheWrap[folderTree]
}

func NewService(
Expand All @@ -81,6 +82,7 @@ func NewService(
mapper: newMapper(),
idCache: newCacheWrap[store.UserIdentifiers](cache, logger, longCacheTTL),
permCache: newCacheWrap[map[string]bool](cache, logger, shortCacheTTL),
permDenialCache: newCacheWrap[bool](cache, logger, shortCacheTTL),
teamCache: newCacheWrap[[]int64](cache, logger, shortCacheTTL),
basicRoleCache: newCacheWrap[store.BasicRole](cache, logger, shortCacheTTL),
folderCache: newCacheWrap[folderTree](cache, logger, shortCacheTTL),
Expand Down Expand Up @@ -111,6 +113,29 @@ func (s *Service) Check(ctx context.Context, req *authzv1.CheckRequest) (*authzv
attribute.String("folder", checkReq.ParentFolder),
)

permDenialKey := userPermDenialCacheKey(checkReq.Namespace.Value, checkReq.UserUID, checkReq.Action, checkReq.Name, checkReq.ParentFolder)
if _, ok := s.permDenialCache.Get(ctx, permDenialKey); ok {
s.metrics.permissionCacheUsage.WithLabelValues("true", checkReq.Action).Inc()
s.metrics.requestCount.WithLabelValues("false", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return &authzv1.CheckResponse{Allowed: false}, nil
}

cachedPerms, err := s.getCachedIdentityPermissions(ctx, checkReq.Namespace, checkReq.IdentityType, checkReq.UserUID, checkReq.Action)
if err == nil {
allowed, err := s.checkPermission(ctx, cachedPerms, checkReq)
if err != nil {
ctxLogger.Error("could not check permission", "error", err)
s.metrics.requestCount.WithLabelValues("true", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return deny, err
}
if allowed {
s.metrics.permissionCacheUsage.WithLabelValues("true", checkReq.Action).Inc()
s.metrics.requestCount.WithLabelValues("false", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return &authzv1.CheckResponse{Allowed: allowed}, nil
}
}
s.metrics.permissionCacheUsage.WithLabelValues("false", checkReq.Action).Inc()
Comment on lines +116 to +137

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Misleading cache-usage metric when cache hits but check denies.

When getCachedIdentityPermissions returns cached permissions (cache hit) but checkPermission returns allowed=false, execution falls through to line 137 which unconditionally records a cache miss ("false"). The cache was found and consulted; the data simply didn't produce an allow. This inflates the cache-miss rate and deflates the cache-hit rate in monitoring, making it harder to assess actual cache effectiveness.

Consider recording the metric inside the if err == nil branch to reflect that the cache was hit, or use a distinct label (e.g., "hit_denied") to differentiate this path from a true cache miss.

💡 Proposed fix
 	cachedPerms, err := s.getCachedIdentityPermissions(ctx, checkReq.Namespace, checkReq.IdentityType, checkReq.UserUID, checkReq.Action)
 	if err == nil {
 		allowed, err := s.checkPermission(ctx, cachedPerms, checkReq)
 		if err != nil {
 			ctxLogger.Error("could not check permission", "error", err)
 			s.metrics.requestCount.WithLabelValues("true", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
 			return deny, err
 		}
 		if allowed {
 			s.metrics.permissionCacheUsage.WithLabelValues("true", checkReq.Action).Inc()
 			s.metrics.requestCount.WithLabelValues("false", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
 			return &authzv1.CheckResponse{Allowed: allowed}, nil
 		}
+		// Cache was present but didn't grant access — still a cache hit (just not conclusive for allow)
+		s.metrics.permissionCacheUsage.WithLabelValues("false", checkReq.Action).Inc()
+	} else {
+		s.metrics.permissionCacheUsage.WithLabelValues("false", checkReq.Action).Inc()
 	}
-	s.metrics.permissionCacheUsage.WithLabelValues("false", checkReq.Action).Inc()
🤖 Prompt for AI Agents
In `@pkg/services/authz/rbac/service.go` around lines 116 - 137, The permission
cache metric is being recorded as a miss even when
getCachedIdentityPermissions(...) returns a cache hit but checkPermission(...)
yields allowed=false; update the logic around permDenialKey,
getCachedIdentityPermissions, and checkPermission so that
metrics.permissionCacheUsage reflects a cache hit in the err==nil branch (e.g.,
call s.metrics.permissionCacheUsage.WithLabelValues("true",
checkReq.Action).Inc() when cachedPerms were found regardless of allowed result,
or add a distinct label like "hit_denied") and only record "false" for real
cache misses outside that branch.


permissions, err := s.getIdentityPermissions(ctx, checkReq.Namespace, checkReq.IdentityType, checkReq.UserUID, checkReq.Action)
if err != nil {
ctxLogger.Error("could not get user permissions", "subject", req.GetSubject(), "error", err)
Expand All @@ -125,6 +150,10 @@ func (s *Service) Check(ctx context.Context, req *authzv1.CheckRequest) (*authzv
return deny, err
}

if !allowed {
s.permDenialCache.Set(ctx, permDenialKey, true)
}

s.metrics.requestCount.WithLabelValues("false", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return &authzv1.CheckResponse{Allowed: allowed}, nil
}
Expand All @@ -148,11 +177,18 @@ func (s *Service) List(ctx context.Context, req *authzv1.ListRequest) (*authzv1.
attribute.String("action", listReq.Action),
)

permissions, err := s.getIdentityPermissions(ctx, listReq.Namespace, listReq.IdentityType, listReq.UserUID, listReq.Action)
if err != nil {
ctxLogger.Error("could not get user permissions", "subject", req.GetSubject(), "error", err)
s.metrics.requestCount.WithLabelValues("true", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return nil, err
permissions, err := s.getCachedIdentityPermissions(ctx, listReq.Namespace, listReq.IdentityType, listReq.UserUID, listReq.Action)
if err == nil {
s.metrics.permissionCacheUsage.WithLabelValues("true", listReq.Action).Inc()
} else {
s.metrics.permissionCacheUsage.WithLabelValues("false", listReq.Action).Inc()

permissions, err = s.getIdentityPermissions(ctx, listReq.Namespace, listReq.IdentityType, listReq.UserUID, listReq.Action)
if err != nil {
ctxLogger.Error("could not get user permissions", "subject", req.GetSubject(), "error", err)
s.metrics.requestCount.WithLabelValues("true", "true", req.GetVerb(), req.GetGroup(), req.GetResource()).Inc()
return nil, err
}
}

resp, err := s.listPermission(ctx, permissions, listReq)
Expand Down Expand Up @@ -303,6 +339,34 @@ func (s *Service) getIdentityPermissions(ctx context.Context, ns types.Namespace
}
}

func (s *Service) getCachedIdentityPermissions(ctx context.Context, ns types.NamespaceInfo, idType types.IdentityType, userID, action string) (map[string]bool, error) {
ctx, span := s.tracer.Start(ctx, "authz_direct_db.service.getCachedIdentityPermissions")
defer span.End()

switch idType {
case types.TypeAnonymous:
anonPermKey := anonymousPermCacheKey(ns.Value, action)
if cached, ok := s.permCache.Get(ctx, anonPermKey); ok {
return cached, nil
}
return nil, cache.ErrNotFound
case types.TypeRenderService:
return nil, cache.ErrNotFound
case types.TypeUser, types.TypeServiceAccount:
userIdentifiers, err := s.GetUserIdentifiers(ctx, ns, userID)
if err != nil {
return nil, err
}
userPermKey := userPermCacheKey(ns.Value, userIdentifiers.UID, action)
if cached, ok := s.permCache.Get(ctx, userPermKey); ok {
return cached, nil
}
return nil, cache.ErrNotFound
default:
return nil, fmt.Errorf("unsupported identity type: %s", idType)
}
}

func (s *Service) getUserPermissions(ctx context.Context, ns types.NamespaceInfo, userID, action string, actionSets []string) (map[string]bool, error) {
ctx, span := s.tracer.Start(ctx, "authz_direct_db.service.getUserPermissions")
defer span.End()
Expand All @@ -313,12 +377,6 @@ func (s *Service) getUserPermissions(ctx context.Context, ns types.NamespaceInfo
}

userPermKey := userPermCacheKey(ns.Value, userIdentifiers.UID, action)
if cached, ok := s.permCache.Get(ctx, userPermKey); ok {
s.metrics.permissionCacheUsage.WithLabelValues("true", action).Inc()
return cached, nil
}
s.metrics.permissionCacheUsage.WithLabelValues("false", action).Inc()

res, err, _ := s.sf.Do(userPermKey+"_getUserPermissions", func() (interface{}, error) {
basicRoles, err := s.getUserBasicRole(ctx, ns, userIdentifiers)
if err != nil {
Expand Down Expand Up @@ -363,9 +421,6 @@ func (s *Service) getAnonymousPermissions(ctx context.Context, ns types.Namespac
defer span.End()

anonPermKey := anonymousPermCacheKey(ns.Value, action)
if cached, ok := s.permCache.Get(ctx, anonPermKey); ok {
return cached, nil
}
res, err, _ := s.sf.Do(anonPermKey+"_getAnonymousPermissions", func() (interface{}, error) {
permissions, err := s.permissionStore.GetUserPermissions(ctx, ns, store.PermissionsQuery{Action: action, ActionSets: actionSets, Role: "Viewer"})
if err != nil {
Expand Down
148 changes: 140 additions & 8 deletions pkg/services/authz/rbac/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -339,14 +339,6 @@ func TestService_getUserPermissions(t *testing.T) {
}

testCases := []testCase{
{
name: "should return permissions from cache if available",
permissions: []accesscontrol.Permission{
{Action: "dashboards:read", Scope: "dashboards:uid:some_dashboard"},
},
cacheHit: true,
expectedPerms: map[string]bool{"dashboards:uid:some_dashboard": true},
},
{
name: "should return permissions from store if not in cache",
permissions: []accesscontrol.Permission{
Expand Down Expand Up @@ -898,6 +890,111 @@ func TestService_Check(t *testing.T) {
})
}

func TestService_CacheCheck(t *testing.T) {
callingService := authn.NewAccessTokenAuthInfo(authn.Claims[authn.AccessTokenClaims]{
Claims: jwt.Claims{
Subject: types.NewTypeID(types.TypeAccessPolicy, "some-service"),
Audience: []string{"authzservice"},
},
Rest: authn.AccessTokenClaims{Namespace: "org-12"},
})

ctx := types.WithAuthInfo(context.Background(), callingService)
userID := &store.UserIdentifiers{UID: "test-uid", ID: 1}

t.Run("Allow based on cached permissions", func(t *testing.T) {
s := setupService()

s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)
s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": true})

resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash1",
})
require.NoError(t, err)
assert.True(t, resp.Allowed)
})
t.Run("Fallback to the database on cache miss", func(t *testing.T) {
s := setupService()

// Populate database permission but not the cache
store := &fakeStore{
userID: userID,
userPermissions: []accesscontrol.Permission{{Action: "dashboards:read", Scope: "dashboards:uid:dash2"}},
}

s.store = store
s.permissionStore = store

s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)

resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash2",
})
require.NoError(t, err)
assert.True(t, resp.Allowed)
})
t.Run("Fallback to the database on outdated cache", func(t *testing.T) {
s := setupService()

store := &fakeStore{
userID: userID,
userPermissions: []accesscontrol.Permission{{Action: "dashboards:read", Scope: "dashboards:uid:dash2"}},
}

s.store = store
s.permissionStore = store

s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)
// The cache does not have the permission for dash2 (outdated)
s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": true})

resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash2",
})
require.NoError(t, err)
assert.True(t, resp.Allowed)
})
t.Run("Should deny on explicit cache deny entry", func(t *testing.T) {
s := setupService()

s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)

// Explicitly deny access to the dashboard
s.permDenialCache.Set(ctx, userPermDenialCacheKey("org-12", "test-uid", "dashboards:read", "dash1", "fold1"), true)

// Allow access to the dashboard to prove this is not checked
s.permCache.Set(ctx, userPermCacheKey("org-12", "test-uid", "dashboards:read"), map[string]bool{"dashboards:uid:dash1": false})

resp, err := s.Check(ctx, &authzv1.CheckRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "get",
Name: "dash1",
Folder: "fold1",
})
require.NoError(t, err)
assert.False(t, resp.Allowed)
})
}

func TestService_List(t *testing.T) {
callingService := authn.NewAccessTokenAuthInfo(authn.Claims[authn.AccessTokenClaims]{
Claims: jwt.Claims{
Expand Down Expand Up @@ -1191,6 +1288,40 @@ func TestService_List(t *testing.T) {
})
}

func TestService_CacheList(t *testing.T) {
callingService := authn.NewAccessTokenAuthInfo(authn.Claims[authn.AccessTokenClaims]{
Claims: jwt.Claims{
Subject: types.NewTypeID(types.TypeAccessPolicy, "some-service"),
Audience: []string{"authzservice"},
},
Rest: authn.AccessTokenClaims{Namespace: "org-12"},
})

t.Run("List based on cached permissions", func(t *testing.T) {
s := setupService()
ctx := types.WithAuthInfo(context.Background(), callingService)
userID := &store.UserIdentifiers{UID: "test-uid", ID: 1}
s.idCache.Set(ctx, userIdentifierCacheKey("org-12", "test-uid"), *userID)
s.permCache.Set(ctx,
userPermCacheKey("org-12", "test-uid", "dashboards:read"),
map[string]bool{"dashboards:uid:dash1": true, "dashboards:uid:dash2": true, "folders:uid:fold1": true},
)
s.identityStore = &fakeIdentityStore{}

resp, err := s.List(ctx, &authzv1.ListRequest{
Namespace: "org-12",
Subject: "user:test-uid",
Group: "dashboard.grafana.app",
Resource: "dashboards",
Verb: "list",
})

require.NoError(t, err)
require.ElementsMatch(t, resp.Items, []string{"dash1", "dash2"})
require.ElementsMatch(t, resp.Folders, []string{"fold1"})
})
}

func setupService() *Service {
cache := cache.NewLocalCache(cache.Config{Expiry: 5 * time.Minute, CleanupInterval: 5 * time.Minute})
logger := log.New("authz-rbac-service")
Expand All @@ -1202,6 +1333,7 @@ func setupService() *Service {
metrics: newMetrics(nil),
idCache: newCacheWrap[store.UserIdentifiers](cache, logger, longCacheTTL),
permCache: newCacheWrap[map[string]bool](cache, logger, shortCacheTTL),
permDenialCache: newCacheWrap[bool](cache, logger, shortCacheTTL),
teamCache: newCacheWrap[[]int64](cache, logger, shortCacheTTL),
basicRoleCache: newCacheWrap[store.BasicRole](cache, logger, longCacheTTL),
folderCache: newCacheWrap[folderTree](cache, logger, shortCacheTTL),
Expand Down