Okay, my fix for the memory leaks on the MDS went fine. Now I've got a problem on the DS:
> ::findleaks
CACHE LEAKED BUFCTL CALLER
ffffff01c682a860 1 ffffff01d49a0068 dserv_mds_do_reportavail+0x210
ffffff01c68262e0 520364 ffffff01ea054458 dserv_nnode_from_fh_ds+0x5b
ffffff01c68262e0 1258470 ffffff01f3ed5db0 dserv_nnode_from_fh_ds+0x5b
ffffff01c68262e0 1343031 ffffff01f3ed5e88 dserv_nnode_from_fh_ds+0x77
ffffff01c68262e0 435756 ffffff01ee4b65f8 dserv_nnode_from_fh_ds+0x77
ffffff01c68262e0 1 ffffff01de76e558 mds_compound+0x54
ffffff01c68262e0 3 ffffff01e83df668 mds_compound+0x54
ffffff01c682b2e0 4 ffffff01d99cea30 mds_compound+0x193
ffffff01c6828020 2 ffffff01dcd9f8f8 mds_get_server_impl_id+0x30
ffffff01c68262e0 2 ffffff01dd4350d8 mds_get_server_impl_id+0x58
ffffff01c6826b20 2 ffffff01d7153290 mds_get_server_impl_id+0x8a
ffffff01c6828860 1 ffffff01d8805120 modinstall+0x129
ffffff01c6828860 1 ffffff01d7fe7a98 modinstall+0x129
ffffff01c682b2e0 1 ffffff01ddd5a870 modinstall+0x129
ffffff01c6828020 1 ffffff0f82dd4b68 rpc_init_taglist+0x25
ffffff01c6828020 20145 ffffff020b8337a0 rpc_init_taglist+0x25
ffffff01c6828020 1 ffffff01d613e538 rpc_init_taglist+0x25
ffffff01c6828020 11460 ffffff01e20b1de8 rpc_init_taglist+0x25
ffffff01c6828020 1 ffffff0f82de32e0 rpc_init_taglist+0x25
ffffff01c6828020 1 ffffff0f82de1708 rpc_init_taglist+0x25
ffffff01c6828020 2 ffffff01ddda3018 rpc_init_taglist+0x25
ffffff01c68265a0 2 ffffff01dc8f41b0 tohex+0x32
ffffff01c6828020 3 ffffff01dcda06c0 xdr_array+0xae
ffffff01c6828020 1 ffffff01df04e7f8 xdr_array+0xae
ffffff01c68262e0 1083945 ffffff01edde37b0 xdr_bytes+0x70
ffffff01c68282e0 1 ffffff01f31f9e60 xdr_bytes+0x70
ffffff01c68285a0 1 ffffff01e1798598 xdr_bytes+0x70
ffffff01c68262e0 694948 ffffff01f3ed5c00 xdr_bytes+0x70
------------------------------------------------------------------------
Total 5368151 buffers, 86904136 bytes
Even though we still see a high number of leaks in xdr_bytes(), which we expect to indicate that the mds_sid is not getting cleaned up, I'll argue that this is to be expected with the high number of leaks in dserv_nnode_from_fh_ds(). I.e., if the file handle is not being nuked, we can't expect the mds_sid component to also be zapped.
> ffffff01ea054458$<bufctl_audit
ADDR BUFADDR TIMESTAMP THREAD
CACHE LASTLOG CONTENTS
ffffff01ea054458 ffffff01e9f71d50 89f9dc8f5e7 ffffff01d7aa93a0
ffffff01c68262e0 ffffff01c747f9c0 ffffff01cfd78cd8
kmem_cache_alloc_debug+0x283
kmem_cache_alloc+0xa9
kmem_alloc+0xa3
dserv_nnode_from_fh_ds+0x5b
nnode_from_fh_v41+0x45
mds_op_putfh+0xe3
rfs41_op_dispatch+0x72
mds_compound+0x1d4
rfs41_dispatch+0x17c
dispatch_dserv_nfsv41+0x93
svc_getreq+0x20d
svc_run+0x197
svc_do_run+0x81
nfssys+0xa0e
The issue is actually very simple to understand - prior to my changes, the dnk_sid field was not being used. But that field is a mds_sid, which requires a memory allocation to use:
426 static nnode_error_t
427 dserv_nnode_from_fh_ds(nnode_t **npp, mds_ds_fh *fhp)
428 {
429 dserv_nnode_key_t dskey;
430 nnode_key_t key;
431 uint32_t hash;
432
433 if (fhp->vers < 1)
434 return (ESTALE); /* XXX badhandle */
435 if (fhp->fh.v1.mds_fid.len < 8) /* XXX stupid */
436 return (ESTALE);
437 if (fhp->fh.v1.mds_fid.len > DS_MAXFIDSZ)
438 return (ESTALE);
439 /* XXX cannot do sid until mds_sid is sane */
440 dskey.dnk_sid = NULL;
441 dskey.dnk_fid = &fhp->fh.v1.mds_fid;
442
443 hash = dserv_nnode_hash(&dskey);
444
445 key.nk_keydata = &dskey;
446 key.nk_compare = dserv_nnode_compare;
447
448 return (nnode_find_or_create(npp, &key, hash, fhp, dserv_nnode_build));
449 }
I changed this to be:
425 static nnode_error_t
426 dserv_nnode_from_fh_ds(nnode_t **npp, mds_ds_fh *fhp)
427 {
428 dserv_nnode_key_t dskey;
429 nnode_key_t key;
430 uint32_t hash;
431
432 if (fhp->vers < 1)
433 return (ESTALE); /* XXX badhandle */
434 if (fhp->fh.v1.mds_fid.len < 8) /* XXX stupid */
435 return (ESTALE);
436 if (fhp->fh.v1.mds_fid.len > DS_MAXFIDSZ)
437 return (ESTALE);
438
439 /*
440 * XXX: Should we be mallocing in the middle of a
441 * cache? Cache entry does not have enough space.
442 * What is the use of the mds_sid here?
443 */
444 dskey.dnk_sid = kmem_alloc(sizeof (mds_sid), KM_SLEEP);
445 dskey.dnk_sid->len = fhp->fh.v1.mds_sid.len;
446 dskey.dnk_sid->val = kmem_alloc(dskey.dnk_sid->len,
447 KM_SLEEP);
448
449 bcopy(fhp->fh.v1.mds_sid.val, dskey.dnk_sid->val,
450 dskey.dnk_sid->len);
451
452 dskey.dnk_fid = &fhp->fh.v1.mds_fid;
453
454 hash = dserv_nnode_hash(&dskey);
455
456 key.nk_keydata = &dskey;
457 key.nk_compare = dserv_nnode_compare;
458
459 return (nnode_find_or_create(npp, &key, hash, fhp, dserv_nnode_build));
460 }
The problem here is that both dskey and key are pulled off of the stack. We don't expect them to have a life after dserv_nnode_from_fh_ds() and before my changes, there was no need to release any memory.
The key is only used here in the find case:
637 rw_enter(&bucket->nb_lock, rw);
638 found = avl_find(&bucket->nb_tree, &key, &where);
639 if (found) {
And indeed, we can easily follow where the compare function has to be from line 457 above:
310 static int
311 dserv_nnode_compare(const void *va, const void *vb)
312 {
313 const dserv_nnode_key_t *a = va;
314 const dserv_nnode_key_t *b = vb;
315 int rc;
316
317 NFS_AVL_COMPARE(a->dnk_sid->len, b->dnk_sid->len);
318 rc = memcmp(a->dnk_sid->val, b->dnk_sid->val, a->dnk_sid->len);
319 NFS_AVL_RETURN(rc);
320
321 NFS_AVL_COMPARE(a->dnk_fid->len, b->dnk_fid->len);
322 rc = memcmp(a->dnk_fid->val, b->dnk_fid->val, a->dnk_fid->len);
323 NFS_AVL_RETURN(rc);
324
325 return (0);
326 }
And I added that code on 317-319!
Edit Note: And I just realized, I'm supplying only one of the two keys. Where does the other come from? It turns out that it does get created in:
static nnode_error_t
dserv_nnode_build(nnode_seed_t *seed, void *vfh)
{
dserv_nnode_key_t *key = NULL;
...
key->dnk_sid = kmem_alloc(sizeof (mds_sid), KM_SLEEP);
key->dnk_sid->len = fhp->fh.v1.mds_sid.len;
key->dnk_sid->val = kmem_alloc(key->dnk_sid->len,
KM_SLEEP);
And that code does take care of releasing the memory correctly with dserv_nnode_key_free()!
It is not used in the create case:
674 rc = nnode_build(npp, data, nnbuild); 675 if (rc == 0) 676 avl_insert(&bucket->nb_tree, *npp, where);
So the fix will be:
457 ne = nnode_find_or_create(npp, &key, hash, fhp, dserv_nnode_build); 458 459 kmem_free(dskey.dnk_sid->val, dskey.dnk_sid->len); 460 kmem_free(dskey.dnk_sid, sizeof (mds_sid)); 461 462 return (ne); 463 }
Ouch, I thought I was done! Remember earlier I stated that the xdr_bytes would be from this code? Well I decided to challenge that statement:
> ffffff01edde37b0$<bufctl_audit
ADDR BUFADDR TIMESTAMP THREAD
CACHE LASTLOG CONTENTS
ffffff01edde37b0 ffffff01edc60168 89f9c4733bc ffffff01d7aa93a0
ffffff01c68262e0 ffffff01c9a25b80 ffffff01cca20030
kmem_cache_alloc_debug+0x283
kmem_cache_alloc+0xa9
kmem_alloc+0xa3
xdr_bytes+0x70
xdr_mds_sid+0x21
xdr_ds_fh_v1+0x68
xdr_ds_fh+0x3f
xdr_decode_nfs41_fh+0xdd
xdr_snfs_argop4+0x5e
xdr_COMPOUND4args_srv+0xf4
svc_authany_wrap+0x22
svc_cots_kgetargs+0x41
dispatch_dserv_nfsv41+0x5d
svc_getreq+0x20d
svc_run+0x197
In retrospect, "D'oh!", we can see we are not decoding the xdr here. But then, in my defense, I made that statement before we looked at the code!
Ahh, looks like we have some hand written XDR code:
4034 static bool_t
4035 xdr_snfs_argop4_free(XDR *xdrs, nfs_argop4 **arrayp, int len)
4036 {
...
4056 case OP_PUTFH:
4057 if (array[i].nfs_argop4_u.opputfh.object.nfs_fh4_val !=
4058 NULL) {
4059 kmem_free(array[i].nfs_argop4_u.opputfh.object.
4060 nfs_fh4_val,
4061 array[i].nfs_argop4_u.opputfh.object.
4062 nfs_fh4_len);
4063 }
Which does not free the mds_sid portion of the file handle. This hand encoding is a nightmare to unravel. I'm going to have to pass in the minorversion:
5022 (void) xdr_snfs_argop4_free(xdrs, &objp->array, 5023 objp->array_len, objp->minorversion);
to determine whether or not to check for the mds_sid. If it is a minorversion, then I'll need to check whether it is a normal NFSv41 filehandle or a DS filehandle:
4057 case OP_PUTFH:
4058 if (array[i].nfs_argop4_u.opputfh.object.nfs_fh4_val ==
4059 NULL)
4060 continue;
4061
4062 if (minorversion != 0) {
4063 struct mds_ds_fh *dsfh =
4064 (struct mds_ds_fh *)array[i].nfs_argop4_u.
4065 opputfh.object.nfs_fh4_val;
4066
4067 /*
4068 * Is it really a DS filehandle?
4069 */
4070 if (fh4->type == FH41_TYPE_DMU_DS) {
4071 mds_sid *sid = &dsfh->fh.v1.mds_sid;
4072 if (sid->val) {
4073 kmem_free(sid->val, sid->len);
4074 }
4075 }
4076 }
4077
4078 kmem_free(array[i].nfs_argop4_u.opputfh.object.
4079 nfs_fh4_val,
4080 array[i].nfs_argop4_u.opputfh.object.
4081 nfs_fh4_len);
The key to understanding this code (and the headache) is to look at the definitions of our filehandles:
typedef struct {
nfs41_fh_type_t type;
nfs41_fh_vers_t vers;
union {
nfs41_fh_v1_t v1;
/* new versions will be added here */
} fh;
} nfs41_fh_fmt_t;
struct mds_ds_fh {
nfs41_fh_type_t type; /* MDS or DS? */
ds_fh_version vers; /* OTW version number */
union {
ds_fh_v1 v1;
/* new versions will be added here */
} fh;
};
typedef struct mds_ds_fh mds_ds_fh;
You can typecast nfs_fh4_val in several ways, i.e.:
4063 struct mds_ds_fh *dsfh = 4064 (struct mds_ds_fh *)array[i].nfs_argop4_u. 4065 opputfh.object.nfs_fh4_val;
but you always have to check the type to make sure it is valid. The key to understanding this comes out of nfs41_filehandle_xdr.c:
132 bool_t
133 xdr_decode_nfs41_fh(XDR *xdrs, nfs_fh4 *objp)
134 {
135
136 uint_t otw_len;
137 uint_t type;
138
139 ASSERT(xdrs->x_op == XDR_DECODE);
140
141 objp->nfs_fh4_val = NULL;
142 objp->nfs_fh4_len = 0;
...
150 /* Get the filehandle type */
151 if (!xdr_enum(xdrs, (enum_t *)&type))
152 return (FALSE);
153
154 switch (type) {
155 case FH41_TYPE_NFS: {
156 nfs41_fh_fmt_t *nfhp = NULL;
157
158 nfhp = kmem_zalloc(sizeof (nfs41_fh_fmt_t), KM_SLEEP);
159 nfhp->type = FH41_TYPE_NFS;
...
170 case FH41_TYPE_DMU_DS: {
171 struct mds_ds_fh *dfhp = NULL;
172
173 dfhp = kmem_zalloc(sizeof (struct mds_ds_fh), KM_SLEEP);
174 dfhp->type = FH41_TYPE_DMU_DS;
Since both structures have the type as the first field, after the typecast, we will be able to determine if that was valid to do.
So the first memory leak I fixed was one I had added. The second one was a pre-existing one. Running findleaks has saved my bacon here.
And of course, I'm going to have to check one more time to make sure that my fixes plugged the leaks!