MFC 197395: Improve mxge watchdog routine's ability to reliably reset a failed NIC

Approved by: re (kib)
This commit is contained in:
Andrew Gallatin 2009-09-28 23:48:16 +00:00
parent fe36e02918
commit 264d14d30d

View file

@ -143,7 +143,7 @@ MODULE_DEPEND(mxge, zlib, 1, 1, 1);
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
@ -1305,8 +1305,7 @@ mxge_reset(mxge_softc_t *sc, int interrupts_setup)
ss->lro_queued = 0;
ss->lro_flushed = 0;
if (ss->fw_stats != NULL) {
ss->fw_stats->valid = 0;
ss->fw_stats->send_done_count = 0;
bzero(ss->fw_stats, sizeof *ss->fw_stats);
}
}
sc->rdma_tags_available = 15;
@ -1379,7 +1378,7 @@ mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
ifp->if_capenable |= IFCAP_LRO;
sc->lro_cnt = lro_cnt;
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
mxge_close(sc);
mxge_close(sc, 0);
err = mxge_open(sc);
}
return err;
@ -1495,6 +1494,10 @@ mxge_add_sysctls(mxge_softc_t *sc)
"read_write_dma_MBs",
CTLFLAG_RD, &sc->read_write_dma,
0, "DMA concurrent Read/Write speed in MB/s");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"watchdog_resets",
CTLFLAG_RD, &sc->watchdog_resets,
0, "Number of times NIC was reset");
/* performance related tunables */
@ -3600,7 +3603,7 @@ abort:
}
static int
mxge_close(mxge_softc_t *sc)
mxge_close(mxge_softc_t *sc, int down)
{
mxge_cmd_t cmd;
int err, old_down_cnt;
@ -3617,21 +3620,23 @@ mxge_close(mxge_softc_t *sc)
}
#endif
sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
old_down_cnt = sc->down_cnt;
wmb();
err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
if (err) {
device_printf(sc->dev, "Couldn't bring down link\n");
if (!down) {
old_down_cnt = sc->down_cnt;
wmb();
err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
if (err) {
device_printf(sc->dev,
"Couldn't bring down link\n");
}
if (old_down_cnt == sc->down_cnt) {
/* wait for down irq */
DELAY(10 * sc->intr_coal_delay);
}
wmb();
if (old_down_cnt == sc->down_cnt) {
device_printf(sc->dev, "never got down irq\n");
}
}
if (old_down_cnt == sc->down_cnt) {
/* wait for down irq */
DELAY(10 * sc->intr_coal_delay);
}
wmb();
if (old_down_cnt == sc->down_cnt) {
device_printf(sc->dev, "never got down irq\n");
}
mxge_free_mbufs(sc);
return 0;
@ -3684,8 +3689,9 @@ static int
mxge_watchdog_reset(mxge_softc_t *sc, int slice)
{
struct pci_devinfo *dinfo;
struct mxge_slice_state *ss;
mxge_tx_ring_t *tx;
int err;
int err, running, s, num_tx_slices = 1;
uint32_t reboot;
uint16_t cmd;
@ -3719,6 +3725,30 @@ mxge_watchdog_reset(mxge_softc_t *sc, int slice)
reboot = mxge_read_reboot(sc);
device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
reboot);
running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
if (running) {
/*
* quiesce NIC so that TX routines will not try to
* xmit after restoration of BAR
*/
/* Mark the link as down */
if (sc->link_state) {
sc->link_state = 0;
if_link_state_change(sc->ifp,
LINK_STATE_DOWN);
}
#ifdef IFNET_BUF_RING
num_tx_slices = sc->num_slices;
#endif
/* grab all TX locks to ensure no tx */
for (s = 0; s < num_tx_slices; s++) {
ss = &sc->ss[s];
mtx_lock(&ss->tx.mtx);
}
mxge_close(sc, 1);
}
/* restore PCI configuration space */
dinfo = device_get_ivars(sc->dev);
pci_cfg_restore(sc->dev, dinfo);
@ -3726,10 +3756,22 @@ mxge_watchdog_reset(mxge_softc_t *sc, int slice)
/* and redo any changes we made to our config space */
mxge_setup_cfg_space(sc);
if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
mxge_close(sc);
err = mxge_open(sc);
/* reload f/w */
err = mxge_load_firmware(sc, 0);
if (err) {
device_printf(sc->dev,
"Unable to re-load f/w\n");
}
if (running) {
if (!err)
err = mxge_open(sc);
/* release all TX locks */
for (s = 0; s < num_tx_slices; s++) {
ss = &sc->ss[s];
mtx_unlock(&ss->tx.mtx);
}
}
sc->watchdog_resets++;
} else {
tx = &sc->ss[slice].tx;
device_printf(sc->dev,
@ -3745,6 +3787,9 @@ mxge_watchdog_reset(mxge_softc_t *sc, int slice)
be32toh(sc->ss->fw_stats->send_done_count));
device_printf(sc->dev, "not resetting\n");
}
if (err)
device_printf(sc->dev, "watchdog reset failed\n");
return (err);
}
@ -3860,11 +3905,11 @@ mxge_change_mtu(mxge_softc_t *sc, int mtu)
old_mtu = ifp->if_mtu;
ifp->if_mtu = mtu;
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
mxge_close(sc);
mxge_close(sc, 0);
err = mxge_open(sc);
if (err != 0) {
ifp->if_mtu = old_mtu;
mxge_close(sc);
mxge_close(sc, 0);
(void) mxge_open(sc);
}
}
@ -3922,7 +3967,7 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
}
} else {
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
mxge_close(sc);
mxge_close(sc, 0);
}
}
mtx_unlock(&sc->driver_mtx);
@ -4645,7 +4690,7 @@ mxge_detach(device_t dev)
mtx_lock(&sc->driver_mtx);
sc->dying = 1;
if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
mxge_close(sc);
mxge_close(sc, 0);
mtx_unlock(&sc->driver_mtx);
ether_ifdetach(sc->ifp);
callout_drain(&sc->co_hdl);