diff --git a/share/man/man4/ena.4 b/share/man/man4/ena.4 index aacf7956c9f..089457fd487 100644 --- a/share/man/man4/ena.4 +++ b/share/man/man4/ena.4 @@ -71,6 +71,11 @@ is advertised by the device via the Admin Queue), a dedicated MSI-X interrupt vector per Tx/Rx queue pair, and CPU cacheline optimized data placement. .Pp +When RSS is enabled, each Tx/Rx queue pair is bound to a corresponding +CPU core and its NUMA domain. The order of those bindings is based on +the RSS bucket mapping. For builds with RSS support disabled, the +CPU and NUMA management is left to the kernel. +.Pp The .Nm driver supports industry standard TCP/IP offload features such diff --git a/sys/contrib/ena-com/ena_plat.h b/sys/contrib/ena-com/ena_plat.h index 274f795950c..9287532b847 100644 --- a/sys/contrib/ena-com/ena_plat.h +++ b/sys/contrib/ena-com/ena_plat.h @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -170,6 +171,8 @@ static inline long PTR_ERR(const void *ptr) #define ENA_COM_TIMER_EXPIRED ETIMEDOUT #define ENA_COM_EIO EIO +#define ENA_NODE_ANY (-1) + #define ENA_MSLEEP(x) pause_sbt("ena", SBT_1MS * (x), SBT_1MS, 0) #define ENA_USLEEP(x) pause_sbt("ena", SBT_1US * (x), SBT_1US, 0) #define ENA_UDELAY(x) DELAY(x) @@ -277,7 +280,7 @@ typedef struct ifnet ena_netdev; void ena_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error); int ena_dma_alloc(device_t dmadev, bus_size_t size, ena_mem_handle_t *dma, - int mapflags, bus_size_t alignment); + int mapflags, bus_size_t alignment, int domain); static inline uint32_t ena_reg_read32(struct ena_bus *bus, bus_size_t offset) @@ -299,16 +302,27 @@ ena_reg_read32(struct ena_bus *bus, bus_size_t offset) } while (0) #define ENA_MEM_ALLOC(dmadev, size) malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO) -#define ENA_MEM_ALLOC_NODE(dmadev, size, virt, node, dev_node) (virt = NULL) + +#define ENA_MEM_ALLOC_NODE(dmadev, size, virt, node, dev_node) \ + do { \ + (virt) = malloc_domainset((size), M_DEVBUF, \ + (node) < 0 ? DOMAINSET_RR() : DOMAINSET_PREF(node), \ + M_NOWAIT | M_ZERO); \ + (void)(dev_node); \ + } while (0) + #define ENA_MEM_FREE(dmadev, ptr, size) \ do { \ (void)(size); \ free(ptr, M_DEVBUF); \ } while (0) #define ENA_MEM_ALLOC_COHERENT_NODE_ALIGNED(dmadev, size, virt, phys, \ - handle, node, dev_node, alignment) \ + dma, node, dev_node, alignment) \ do { \ - ((virt) = NULL); \ + ena_dma_alloc((dmadev), (size), &(dma), 0, (alignment), \ + (node)); \ + (virt) = (void *)(dma).vaddr; \ + (phys) = (dma).paddr; \ (void)(dev_node); \ } while (0) @@ -320,7 +334,8 @@ ena_reg_read32(struct ena_bus *bus, bus_size_t offset) #define ENA_MEM_ALLOC_COHERENT_ALIGNED(dmadev, size, virt, phys, dma, \ alignment) \ do { \ - ena_dma_alloc((dmadev), (size), &(dma), 0, alignment); \ + ena_dma_alloc((dmadev), (size), &(dma), 0, (alignment), \ + ENA_NODE_ANY); \ (virt) = (void *)(dma).vaddr; \ (phys) = (dma).paddr; \ } while (0) @@ -366,7 +381,6 @@ ena_reg_read32(struct ena_bus *bus, bus_size_t offset) #define time_after(a,b) ((long)((unsigned long)(b) - (unsigned long)(a)) < 0) #define VLAN_HLEN sizeof(struct ether_vlan_header) -#define CSUM_OFFLOAD (CSUM_IP|CSUM_TCP|CSUM_UDP) #define prefetch(x) (void)(x) #define prefetchw(x) (void)(x) diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c index 84ef234cd93..63b4598a935 100644 --- a/sys/dev/ena/ena.c +++ b/sys/dev/ena/ena.c @@ -198,7 +198,7 @@ ena_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) int ena_dma_alloc(device_t dmadev, bus_size_t size, - ena_mem_handle_t *dma, int mapflags, bus_size_t alignment) + ena_mem_handle_t *dma, int mapflags, bus_size_t alignment, int domain) { struct ena_adapter* adapter = device_get_softc(dmadev); device_t pdev = adapter->pdev; @@ -229,6 +229,13 @@ ena_dma_alloc(device_t dmadev, bus_size_t size, goto fail_tag; } + error = bus_dma_tag_set_domain(dma->tag, domain); + if (unlikely(error != 0)) { + ena_log(pdev, ERR, "bus_dma_tag_set_domain failed: %d\n", + error); + goto fail_map_create; + } + error = bus_dmamem_alloc(dma->tag, (void**) &dma->vaddr, BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->map); if (unlikely(error != 0)) { @@ -1445,6 +1452,8 @@ ena_create_io_queues(struct ena_adapter *adapter) ctx.queue_size = adapter->requested_tx_ring_size; ctx.msix_vector = msix_vector; ctx.qid = ena_qid; + ctx.numa_node = adapter->que[i].domain; + rc = ena_com_create_io_queue(ena_dev, &ctx); if (rc != 0) { ena_log(adapter->pdev, ERR, @@ -1462,6 +1471,11 @@ ena_create_io_queues(struct ena_adapter *adapter) ena_com_destroy_io_queue(ena_dev, ena_qid); goto err_tx; } + + if (ctx.numa_node >= 0) { + ena_com_update_numa_node(ring->ena_com_io_cq, + ctx.numa_node); + } } /* Create RX queues */ @@ -1473,6 +1487,8 @@ ena_create_io_queues(struct ena_adapter *adapter) ctx.queue_size = adapter->requested_rx_ring_size; ctx.msix_vector = msix_vector; ctx.qid = ena_qid; + ctx.numa_node = adapter->que[i].domain; + rc = ena_com_create_io_queue(ena_dev, &ctx); if (unlikely(rc != 0)) { ena_log(adapter->pdev, ERR, @@ -1491,6 +1507,11 @@ ena_create_io_queues(struct ena_adapter *adapter) ena_com_destroy_io_queue(ena_dev, ena_qid); goto err_rx; } + + if (ctx.numa_node >= 0) { + ena_com_update_numa_node(ring->ena_com_io_cq, + ctx.numa_node); + } } for (i = 0; i < adapter->num_io_queues; i++) { @@ -1646,12 +1667,22 @@ ena_setup_io_intr(struct ena_adapter *adapter) #ifdef RSS int num_buckets = rss_getnumbuckets(); static int last_bind = 0; + int cur_bind; + int idx; #endif int irq_idx; if (adapter->msix_entries == NULL) return (EINVAL); +#ifdef RSS + if (adapter->first_bind < 0) { + adapter->first_bind = last_bind; + last_bind = (last_bind + adapter->num_io_queues) % num_buckets; + } + cur_bind = adapter->first_bind; +#endif + for (int i = 0; i < adapter->num_io_queues; i++) { irq_idx = ENA_IO_IRQ_IDX(i); @@ -1666,9 +1697,17 @@ ena_setup_io_intr(struct ena_adapter *adapter) #ifdef RSS adapter->que[i].cpu = adapter->irq_tbl[irq_idx].cpu = - rss_getcpu(last_bind); - last_bind = (last_bind + 1) % num_buckets; + rss_getcpu(cur_bind); + cur_bind = (cur_bind + 1) % num_buckets; CPU_SETOF(adapter->que[i].cpu, &adapter->que[i].cpu_mask); + + for (idx = 0; idx < MAXMEMDOM; ++idx) { + if (CPU_ISSET(adapter->que[i].cpu, &cpuset_domain[idx])) + break; + } + adapter->que[i].domain = idx; +#else + adapter->que[i].domain = -1; #endif } @@ -3459,6 +3498,7 @@ ena_attach(device_t pdev) adapter = device_get_softc(pdev); adapter->pdev = pdev; + adapter->first_bind = -1; /* * Set up the timer service - driver is responsible for avoiding diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h index f559f9127c1..260c2648289 100644 --- a/sys/dev/ena/ena.h +++ b/sys/dev/ena/ena.h @@ -222,6 +222,7 @@ struct ena_que { int cpu; cpuset_t cpu_mask; #endif + int domain; struct sysctl_oid *oid; }; @@ -439,6 +440,7 @@ struct ena_adapter { uint32_t buf_ring_size; /* RSS*/ + int first_bind; struct ena_indir *rss_indir; uint8_t mac_addr[ETHER_ADDR_LEN]; diff --git a/sys/dev/ena/ena_datapath.h b/sys/dev/ena/ena_datapath.h index 4886ff1e639..8da6a2a0edc 100644 --- a/sys/dev/ena/ena_datapath.h +++ b/sys/dev/ena/ena_datapath.h @@ -39,4 +39,6 @@ void ena_qflush(if_t ifp); int ena_mq_start(if_t ifp, struct mbuf *m); void ena_deferred_mq_start(void *arg, int pending); +#define CSUM_OFFLOAD (CSUM_IP|CSUM_TCP|CSUM_UDP) + #endif /* ENA_TXRX_H */ diff --git a/sys/dev/ena/ena_sysctl.c b/sys/dev/ena/ena_sysctl.c index 7337f6578e6..f523bdbdbe8 100644 --- a/sys/dev/ena/ena_sysctl.c +++ b/sys/dev/ena/ena_sysctl.c @@ -208,6 +208,14 @@ ena_sysctl_add_stats(struct ena_adapter *adapter) adapter->que[i].oid = queue_node; +#ifdef RSS + /* Common stats */ + SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "cpu", + CTLFLAG_RD, &adapter->que[i].cpu, 0, "CPU affinity"); + SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "domain", + CTLFLAG_RD, &adapter->que[i].domain, 0, "NUMA domain"); +#endif + /* TX specific stats */ tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, "tx_ring", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX ring");