星期三 四月 12, 2006

An R/W lock deadlock in the aggregation code of GLD

I ran into this problem while running heavy traffic over an aggregation and, at the same time, adding interfaces to and removing interfaces from that aggregation.

This is a good example of a reader/writer lock deadlock.

First, let's explain how the deadlock happens.

1. When an incoming TCP packet triggers an interrupt, it follows the call sequence below:
    driver_xxx_intr -> mac_rx -> (a series of TCP functions) -> aggr_m_tx

This call sequence acquires the rw locks in the following order:
(1) mac_rx -> mi_rx_lock (as RW_READER) mac.c, LINE 1145
(2) aggr_m_tx -> lg_lock (as RW_READER) aggr_send.c, LINE 220

/* See below code */
http://cvs.opensolaris.org/source/xref/on/usr/src/uts/common/io/mac/mac.c , line 1136 - 1167

   1136 void
1137 mac_rx(mac_t *mp, mac_resource_handle_t mrh, mblk_t *bp)
1138 {
1139 mac_impl_t *mip = mp->m_impl;
1140 mac_rx_fn_t *mrfp;
1141
1142 /*
1143 * Call all registered receive functions.
1144 */

1145 rw_enter(&mip->mi_rx_lock, RW_READER);
1146 mrfp = mip->mi_mrfp;
1147 if (mrfp == NULL) {
1148 /* There are no registered receive functions. */
1149 freemsgchain(bp);
1150 rw_exit(&mip->mi_rx_lock);
1151 return;
1152 }
1153 do {
1154 mblk_t *recv_bp;
1155
1156 if (mrfp->mrf_nextp != NULL) {
1157 /* XXX Do we bump a counter if copymsgchain() fails? */
1158 recv_bp = copymsgchain(bp);
1159 } else {
1160 recv_bp = bp;
1161 }
1162 if (recv_bp != NULL)
1163 mrfp->mrf_fn(mrfp->mrf_arg, mrh, recv_bp);
1164 mrfp = mrfp->mrf_nextp;
1165 } while (mrfp != NULL);
1166 rw_exit(&mip->mi_rx_lock);
1167 }


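When a packet arrives at the interface, the interrupt handler calls into mac_rx.
At line 1145, mip->mi_rx_lock is acquired as RW_READER. The lock stays held while the registered receive functions are called at line 1163, and is not released until line 1166.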

Meanwhile, in the aggr code (http://cvs.opensolaris.org/source/xref/on/usr/src/uts/common/io/aggr/aggr_send.c), at line 220, aggr_m_tx tries to acquire grp->lg_lock as RW_READER.

    212 mblk_t *
213 aggr_m_tx(void *arg, mblk_t *mp)
214 {
215 aggr_grp_t *grp = arg;
216 aggr_port_t *port;
217 mblk_t *nextp;
218 const mac_txinfo_t *mtp;
219
220 rw_enter(&grp->lg_lock, RW_READER);
221
222 if (grp->lg_ntx_ports == 0) {
223 /*
224 * We could have returned from aggr_m_start() before
225 * the ports were actually attached. Drop the chain.
226 */

227 rw_exit(&grp->lg_lock);
228
229 freemsgchain(mp);
230 return (NULL);
231 }
232
233 for (;;) {
234 nextp = mp->b_next;
235 mp->b_next = NULL;
236
237 port = grp->lg_tx_ports[aggr_send_port(grp, mp)];
238 ASSERT(port->lp_state == AGGR_PORT_STATE_ATTACHED);
239
240 rw_exit(&grp->lg_lock);
241
242 /*
243 * We store the transmit info pointer locally in case it
244 * changes between loading mt_fn and mt_arg.
245 */

246 mtp = port->lp_txinfo;
247 if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
248 mp->b_next = nextp;
249 goto done;
250 }
251
252 if ((mp = nextp) == NULL)
253 goto done;
254
255 rw_enter(&grp->lg_lock, RW_READER);
256 }
257
258 done:
259 return (mp);
260 }


2. When an administrator uses dladm to remove an interface from the current aggregation (with dladm remove-aggr), it follows the call sequence below:
     aggr_ioctl -> aggr_ioc_remove -> aggr_grp_rem_ports -> aggr_grp_rem_port -> aggr_port_delete -> mac_rx_remove

So lg_lock and mi_rx_lock are acquired in the following order (the reverse of the order used on the receive path above):
(1) aggr_grp_rem_ports -> acquire "lg_lock" (as RW_WRITER), aggr_grp.c LINE 861
(2) mac_rx_remove -> acquire "mi_rx_lock" (as RW_WRITER), mac.c LINE 941

/* See below code */
http://cvs.opensolaris.org/source/xref/on/usr/src/uts/common/io/aggr/aggr_grp.c,  LINE 842 - 909,

    842 int
843 aggr_grp_rem_ports(uint32_t key, uint_t nports, laioc_port_t *ports)
844 {
845 int rc = 0, i;
846 aggr_grp_t *grp = NULL;
847 aggr_port_t *port;
848 boolean_t notify = B_FALSE, grp_mac_addr_changed;
849
850 /* get group corresponding to key */
851 rw_enter(&aggr_grp_lock, RW_READER);
852 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
853 (mod_hash_val_t *)&grp) != 0) {
854 rw_exit(&aggr_grp_lock);
855 return (ENOENT);
856 }
857 AGGR_GRP_REFHOLD(grp);
858 rw_exit(&aggr_grp_lock);
859
860 AGGR_LACP_LOCK(grp);
861 rw_enter(&grp->lg_lock, RW_WRITER);
862
863 /* we need to keep at least one port per group */
864 if (nports >= grp->lg_nports) {
865 rc = EINVAL;
866 goto bail;
867 }
868
869 /* first verify that all the groups are valid */
870 for (i = 0; i < nports; i++) {
871 if (aggr_grp_port_lookup(grp, ports[i].lp_devname,
872 ports[i].lp_port) == NULL) {
873 /* port not found */
874 rc = ENOENT;
875 goto bail;
876 }
877 }
878
879 /* remove the specified ports from group */
880 for (i = 0; i < nports && !grp->lg_closing; i++) {
881 /* lookup port */
882 port = aggr_grp_port_lookup(grp, ports[i].lp_devname,
883 ports[i].lp_port);
884 ASSERT(port != NULL);
885
886 /* stop port if group has already been started */
887 if (grp->lg_started) {
888 rw_enter(&port->lp_lock, RW_WRITER);
889 aggr_port_stop(port);
890 rw_exit(&port->lp_lock);
891 }
892
893 /* remove port from group */
894 rc = aggr_grp_rem_port(grp, port, &grp_mac_addr_changed);
895 ASSERT(rc == 0);
896 notify = notify || grp_mac_addr_changed;
897 }
898
899 bail:
900 rw_exit(&grp->lg_lock);
901 AGGR_LACP_UNLOCK(grp);
902 if (notify && !grp->lg_closing)
903 mac_unicst_update(&grp->lg_mac, grp->lg_addr);
904 if (rc == 0 && !grp->lg_closing)
905 mac_resource_update(&grp->lg_mac);
906 AGGR_GRP_REFRELE(grp);
907
908 return (rc);
909 }

http://cvs.opensolaris.org/source/xref/on/usr/src/uts/common/io/mac/mac.c, LINE 930 - 953,

    930 void
931 mac_rx_remove(mac_handle_t mh, mac_rx_handle_t mrh)
932 {
933 mac_impl_t *mip = (mac_impl_t *)mh;
934 mac_rx_fn_t *mrfp = (mac_rx_fn_t *)mrh;
935 mac_rx_fn_t **pp;
936 mac_rx_fn_t *p;
937
938 /*
939 * Search the 'rx' callback list for the function closure.
940 */

941 rw_enter(&(mip->mi_rx_lock), RW_WRITER);
942 for (pp = &(mip->mi_mrfp); (p = *pp) != NULL; pp = &(p->mrf_nextp)) {
943 if (p == mrfp)
944 break;
945 }
946 ASSERT(p != NULL);
947
948 /* Remove it from the list. */
949 *pp = p->mrf_nextp;
950 kmem_free(mrfp, sizeof (mac_rx_fn_t));
951 rw_exit(&(mip->mi_rx_lock));
952 }

3. The deadlock happens in the following scenario:
(1) thread 1, dladm calls into "aggr_grp_rem_ports" and acquires "lg_lock" as RW_WRITER
(2) thread 2, a packet arrives at the aggregation and the interrupt handler calls "mac_rx" and acquires "mi_rx_lock" as RW_READER
(3) thread 2, the call chain from mac_rx reaches "aggr_m_tx" and tries to acquire "lg_lock" as RW_READER, but it is currently held by thread 1 as RW_WRITER (step 1), so thread 2 blocks
(4) thread 1, the call chain from aggr_grp_rem_ports reaches "mac_rx_remove" and tries to acquire "mi_rx_lock" as RW_WRITER, but it is currently held by thread 2 as RW_READER (step 2), so thread 1 blocks
(5) Each thread is now waiting for a lock the other holds, so the deadlock happens

Technorati Tags:

评论:

发表一条评论:
  • HTML语法: 禁用