From 2e97b962e7b843fe4908ce87fb60dd479036792c Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Mon, 15 Aug 2016 14:21:10 +0000 Subject: [PATCH 01/41] 7090 zfs should improve allocation order and throttle allocations illumos/illumos-gate@0f7643c7376dd69a08acbfc9d1d7d548b10c846a https://github.com/illumos/illumos-gate/commit/0f7643c7376dd69a08acbfc9d1d7d548b10c846a https://www.illumos.org/issues/7090 When write I/Os are issued, they are issued in block order but the ZIO pipeline will drive them asynchronously through the allocation stage which can result in blocks being allocated out-of-order. It would be nice to preserve as much of the logical order as possible. In addition, the allocations are equally scattered across all top-level VDEVs but not all top-level VDEVs are created equally. The pipeline should be able to detect devices that are more capable of handling allocations and should allocate more blocks to those devices. This allows for dynamic allocation distribution when devices are imbalanced as fuller devices will tend to be slower than empty devices. The change includes a new pool-wide allocation queue which would throttle and order allocations in the ZIO pipeline. The queue would be ordered by issued time and offset and would provide an initial amount of allocation of work to each top-level vdev. The allocation logic utilizes a reservation system to reserve allocations that will be performed by the allocator. Once an allocation is successfully completed it's scheduled on a given top-level vdev. Each top- level vdev maintains a maximum number of allocations that it can handle (mg_alloc_queue_depth). The pool-wide reserved allocations (top-levels * mg_alloc_queue_depth) are distributed across the top-level vdevs metaslab groups and round robin across all eligible metaslab groups to distribute the work. As top-levels complete their work, they receive additional work from the pool-wide allocation queue until the allocation queue is emptied. Reviewed by: Adam Leventhal Reviewed by: Alex Reece Reviewed by: Christopher Siden Reviewed by: Dan Kimmel Reviewed by: Matthew Ahrens Reviewed by: Paul Dagnelie Reviewed by: Prakash Surya Reviewed by: Sebastien Roy Approved by: Robert Mustacchi Author: George Wilson --- uts/common/fs/zfs/metaslab.c | 329 ++++++++++++--- uts/common/fs/zfs/refcount.c | 65 ++- uts/common/fs/zfs/spa.c | 47 ++- uts/common/fs/zfs/spa_misc.c | 6 + uts/common/fs/zfs/sys/metaslab.h | 21 +- uts/common/fs/zfs/sys/metaslab_impl.h | 63 ++- uts/common/fs/zfs/sys/refcount.h | 8 +- uts/common/fs/zfs/sys/spa_impl.h | 2 + uts/common/fs/zfs/sys/vdev_impl.h | 16 +- uts/common/fs/zfs/sys/zio.h | 42 +- uts/common/fs/zfs/sys/zio_impl.h | 51 ++- uts/common/fs/zfs/vdev.c | 8 +- uts/common/fs/zfs/vdev_cache.c | 5 +- uts/common/fs/zfs/vdev_mirror.c | 5 +- uts/common/fs/zfs/vdev_queue.c | 23 +- uts/common/fs/zfs/zio.c | 551 +++++++++++++++++++++----- uts/common/sys/fs/zfs.h | 3 +- 17 files changed, 1024 insertions(+), 221 deletions(-) diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index 6d919b3cab3..e7624f040ed 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -35,17 +35,8 @@ #include #include -/* - * Allow allocations to switch to gang blocks quickly. We do this to - * avoid having to load lots of space_maps in a given txg. There are, - * however, some cases where we want to avoid "fast" ganging and instead - * we want to do an exhaustive search of all metaslabs on this device. - * Currently we don't allow any gang, slog, or dump device related allocations - * to "fast" gang. - */ -#define CAN_FASTGANG(flags) \ - (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ - METASLAB_GANG_AVOID))) +#define GANG_ALLOCATION(flags) \ + ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) @@ -196,6 +187,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; + mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); + refcount_create_tracked(&mc->mc_alloc_slots); return (mc); } @@ -209,6 +202,8 @@ metaslab_class_destroy(metaslab_class_t *mc) ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); + refcount_destroy(&mc->mc_alloc_slots); + mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -425,9 +420,10 @@ metaslab_compare(const void *x1, const void *x2) /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below - * the zfs_mg_noalloc_threshold. If a metaslab group transitions - * from allocatable to non-allocatable or vice versa then the metaslab - * group's class is updated to reflect the transition. + * the zfs_mg_noalloc_threshold or has a fragmentation value that is + * greater than zfs_mg_fragmentation_threshold. If a metaslab group + * transitions from allocatable to non-allocatable or vice versa then the + * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) @@ -436,22 +432,45 @@ metaslab_group_alloc_update(metaslab_group_t *mg) metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; + boolean_t was_initialized; ASSERT(vd == vd->vdev_top); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; + was_initialized = mg->mg_initialized; mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); + mutex_enter(&mc->mc_lock); + + /* + * If the metaslab group was just added then it won't + * have any space until we finish syncing out this txg. + * At that point we will consider it initialized and available + * for allocations. We also don't consider non-activated + * metaslab groups (e.g. vdevs that are in the middle of being removed) + * to be initialized, because they can't be used for allocation. + */ + mg->mg_initialized = metaslab_group_initialized(mg); + if (!was_initialized && mg->mg_initialized) { + mc->mc_groups++; + } else if (was_initialized && !mg->mg_initialized) { + ASSERT3U(mc->mc_groups, >, 0); + mc->mc_groups--; + } + if (mg->mg_initialized) + mg->mg_no_free_space = B_FALSE; + /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). */ - mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && + mg->mg_allocatable = (mg->mg_activation_count > 0 && + mg->mg_free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); @@ -474,6 +493,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; + mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } @@ -490,6 +510,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; + mg->mg_initialized = B_FALSE; + mg->mg_no_free_space = B_TRUE; + refcount_create_tracked(&mg->mg_alloc_queue_depth); mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); @@ -512,6 +535,7 @@ metaslab_group_destroy(metaslab_group_t *mg) taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); + refcount_destroy(&mg->mg_alloc_queue_depth); kmem_free(mg, sizeof (metaslab_group_t)); } @@ -581,6 +605,15 @@ metaslab_group_passivate(metaslab_group_t *mg) mg->mg_next = NULL; } +boolean_t +metaslab_group_initialized(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + vdev_stat_t *vs = &vd->vdev_stat; + + return (vs->vs_space != 0 && mg->mg_activation_count > 0); +} + uint64_t metaslab_group_get_space(metaslab_group_t *mg) { @@ -750,30 +783,97 @@ metaslab_group_fragmentation(metaslab_group_t *mg) * group should avoid allocations if its free capacity is less than the * zfs_mg_noalloc_threshold or its fragmentation metric is greater than * zfs_mg_fragmentation_threshold and there is at least one metaslab group - * that can still handle allocations. + * that can still handle allocations. If the allocation throttle is enabled + * then we skip allocations to devices that have reached their maximum + * allocation queue depth unless the selected metaslab group is the only + * eligible group remaining. */ static boolean_t -metaslab_group_allocatable(metaslab_group_t *mg) +metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, + uint64_t psize) { - vdev_t *vd = mg->mg_vd; - spa_t *spa = vd->vdev_spa; + spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; /* - * We use two key metrics to determine if a metaslab group is - * considered allocatable -- free space and fragmentation. If - * the free space is greater than the free space threshold and - * the fragmentation is less than the fragmentation threshold then - * consider the group allocatable. There are two case when we will - * not consider these key metrics. The first is if the group is - * associated with a slog device and the second is if all groups - * in this metaslab class have already been consider ineligible + * We can only consider skipping this metaslab group if it's + * in the normal metaslab class and there are other metaslab + * groups to select from. Otherwise, we always consider it eligible * for allocations. */ - return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && - (mg->mg_fragmentation == ZFS_FRAG_INVALID || - mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || - mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); + if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) + return (B_TRUE); + + /* + * If the metaslab group's mg_allocatable flag is set (see comments + * in metaslab_group_alloc_update() for more information) and + * the allocation throttle is disabled then allow allocations to this + * device. However, if the allocation throttle is enabled then + * check if we have reached our allocation limit (mg_alloc_queue_depth) + * to determine if we should allow allocations to this metaslab group. + * If all metaslab groups are no longer considered allocatable + * (mc_alloc_groups == 0) or we're trying to allocate the smallest + * gang block size then we allow allocations on this metaslab group + * regardless of the mg_allocatable or throttle settings. + */ + if (mg->mg_allocatable) { + metaslab_group_t *mgp; + int64_t qdepth; + uint64_t qmax = mg->mg_max_alloc_queue_depth; + + if (!mc->mc_alloc_throttle_enabled) + return (B_TRUE); + + /* + * If this metaslab group does not have any free space, then + * there is no point in looking further. + */ + if (mg->mg_no_free_space) + return (B_FALSE); + + qdepth = refcount_count(&mg->mg_alloc_queue_depth); + + /* + * If this metaslab group is below its qmax or it's + * the only allocatable metasable group, then attempt + * to allocate from it. + */ + if (qdepth < qmax || mc->mc_alloc_groups == 1) + return (B_TRUE); + ASSERT3U(mc->mc_alloc_groups, >, 1); + + /* + * Since this metaslab group is at or over its qmax, we + * need to determine if there are metaslab groups after this + * one that might be able to handle this allocation. This is + * racy since we can't hold the locks for all metaslab + * groups at the same time when we make this check. + */ + for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { + qmax = mgp->mg_max_alloc_queue_depth; + + qdepth = refcount_count(&mgp->mg_alloc_queue_depth); + + /* + * If there is another metaslab group that + * might be able to handle the allocation, then + * we return false so that we skip this group. + */ + if (qdepth < qmax && !mgp->mg_no_free_space) + return (B_FALSE); + } + + /* + * We didn't find another group to handle the allocation + * so we can't skip this metaslab group even though + * we are at or over our qmax. + */ + return (B_TRUE); + + } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { + return (B_TRUE); + } + return (B_FALSE); } /* @@ -2041,8 +2141,57 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) return (0); } +/* + * ========================================================================== + * Metaslab block operations + * ========================================================================== + */ + +static void +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + flags & METASLAB_DONT_THROTTLE) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_add(&mg->mg_alloc_queue_depth, tag); +} + +void +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + flags & METASLAB_DONT_THROTTLE) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); +} + +void +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) +{ +#ifdef ZFS_DEBUG + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + + for (int d = 0; d < ndvas; d++) { + uint64_t vdev = DVA_GET_VDEV(&dva[d]); + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); + } +#endif +} + static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, +metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) { spa_t *spa = mg->mg_vd->vdev_spa; @@ -2069,10 +2218,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, if (msp->ms_weight < asize) { spa_dbgmsg(spa, "%s: failed to meet weight " "requirement: vdev %llu, txg %llu, mg %p, " - "msp %p, psize %llu, asize %llu, " + "msp %p, asize %llu, " "weight %llu", spa_name(spa), mg->mg_vd->vdev_id, txg, - mg, msp, psize, asize, msp->ms_weight); + mg, msp, asize, msp->ms_weight); mutex_exit(&mg->mg_lock); return (-1ULL); } @@ -2154,7 +2303,6 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, msp->ms_access_txg = txg + metaslab_unload_delay; mutex_exit(&msp->ms_lock); - return (offset); } @@ -2171,7 +2319,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, int all_zero; int zio_lock = B_FALSE; boolean_t allocatable; - uint64_t offset = -1ULL; uint64_t asize; uint64_t distance; @@ -2241,7 +2388,6 @@ top: all_zero = B_TRUE; do { ASSERT(mg->mg_activation_count == 1); - vd = mg->mg_vd; /* @@ -2257,24 +2403,23 @@ top: /* * Determine if the selected metaslab group is eligible - * for allocations. If we're ganging or have requested - * an allocation for the smallest gang block size - * then we don't want to avoid allocating to the this - * metaslab group. If we're in this condition we should - * try to allocate from any device possible so that we - * don't inadvertently return ENOSPC and suspend the pool + * for allocations. If we're ganging then don't allow + * this metaslab group to skip allocations since that would + * inadvertently return ENOSPC and suspend the pool * even though space is still available. */ - if (allocatable && CAN_FASTGANG(flags) && - psize > SPA_GANGBLOCKSIZE) - allocatable = metaslab_group_allocatable(mg); + if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) { + allocatable = metaslab_group_allocatable(mg, rotor, + psize); + } if (!allocatable) goto next; + ASSERT(mg->mg_initialized); + /* - * Avoid writing single-copy data to a failing vdev - * unless the user instructs us that it is okay. + * Avoid writing single-copy data to a failing vdev. */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && @@ -2294,8 +2439,32 @@ top: asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, psize, asize, txg, distance, - dva, d); + uint64_t offset = metaslab_group_alloc(mg, asize, txg, + distance, dva, d); + + mutex_enter(&mg->mg_lock); + if (offset == -1ULL) { + mg->mg_failed_allocations++; + if (asize == SPA_GANGBLOCKSIZE) { + /* + * This metaslab group was unable to allocate + * the minimum gang block size so it must be + * out of space. We must notify the allocation + * throttle to start skipping allocation + * attempts to this metaslab group until more + * space becomes available. + * + * Note: this failure cannot be caused by the + * allocation throttle since the allocation + * throttle is only responsible for skipping + * devices and not failing block allocations. + */ + mg->mg_no_free_space = B_TRUE; + } + } + mg->mg_allocations++; + mutex_exit(&mg->mg_lock); + if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -2476,9 +2645,57 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) return (0); } +/* + * Reserve some allocation slots. The reservation system must be called + * before we call into the allocator. If there aren't any available slots + * then the I/O will be throttled until an I/O completes and its slots are + * freed up. The function returns true if it was successful in placing + * the reservation. + */ +boolean_t +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, + int flags) +{ + uint64_t available_slots = 0; + boolean_t slot_reserved = B_FALSE; + + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + + uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); + if (reserved_slots < mc->mc_alloc_max_slots) + available_slots = mc->mc_alloc_max_slots - reserved_slots; + + if (slots <= available_slots || GANG_ALLOCATION(flags)) { + /* + * We reserve the slots individually so that we can unreserve + * them individually when an I/O completes. + */ + for (int d = 0; d < slots; d++) { + reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); + } + zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; + slot_reserved = B_TRUE; + } + + mutex_exit(&mc->mc_lock); + return (slot_reserved); +} + +void +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) +{ + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + for (int d = 0; d < slots; d++) { + (void) refcount_remove(&mc->mc_alloc_slots, zio); + } + mutex_exit(&mc->mc_lock); +} + int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -2504,11 +2721,21 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, if (error != 0) { for (d--; d >= 0; d--) { metaslab_free_dva(spa, &dva[d], txg, B_TRUE); + metaslab_group_alloc_decrement(spa, + DVA_GET_VDEV(&dva[d]), zio, flags); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); + } else { + /* + * Update the metaslab group's queue depth + * based on the newly allocated dva. + */ + metaslab_group_alloc_increment(spa, + DVA_GET_VDEV(&dva[d]), zio, flags); } + } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); diff --git a/uts/common/fs/zfs/refcount.c b/uts/common/fs/zfs/refcount.c index 1e6229877bd..8176cfd849d 100644 --- a/uts/common/fs/zfs/refcount.c +++ b/uts/common/fs/zfs/refcount.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include @@ -68,6 +68,13 @@ refcount_create(refcount_t *rc) rc->rc_tracked = reference_tracking_enable; } +void +refcount_create_tracked(refcount_t *rc) +{ + refcount_create(rc); + rc->rc_tracked = B_TRUE; +} + void refcount_create_untracked(refcount_t *rc) { @@ -251,4 +258,60 @@ refcount_transfer_ownership(refcount_t *rc, void *current_holder, ASSERT(found); mutex_exit(&rc->rc_mtx); } + +/* + * If tracking is enabled, return true if a reference exists that matches + * the "holder" tag. If tracking is disabled, then return true if a reference + * might be held. + */ +boolean_t +refcount_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (rc->rc_count > 0); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_FALSE); +} + +/* + * If tracking is enabled, return true if a reference does not exist that + * matches the "holder" tag. If tracking is disabled, always return true + * since the reference might not be held. + */ +boolean_t +refcount_not_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_FALSE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_TRUE); +} #endif /* ZFS_DEBUG */ diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 07abe37f573..25b4714ea68 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -1275,7 +1275,6 @@ spa_unload(spa_t *spa) ddt_unload(spa); - /* * Drop and purge level 2 cache */ @@ -3634,6 +3633,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; + spa->spa_load_state = SPA_LOAD_CREATE; /* * Create "The Godfather" zio to hold all async IOs @@ -3818,6 +3818,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); + spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); @@ -5321,7 +5322,7 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) + nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; @@ -6483,6 +6484,8 @@ spa_sync(spa_t *spa, uint64_t txg) vdev_t *vd; dmu_tx_t *tx; int error; + uint32_t max_queue_depth = zfs_vdev_async_write_max_active * + zfs_vdev_queue_depth_pct / 100; VERIFY(spa_writeable(spa)); @@ -6494,6 +6497,10 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; + mutex_enter(&spa->spa_alloc_lock); + VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); + mutex_exit(&spa->spa_alloc_lock); + /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. @@ -6545,6 +6552,38 @@ spa_sync(spa_t *spa, uint64_t txg) } } + /* + * Set the top-level vdev's max queue depth. Evaluate each + * top-level's async write queue depth in case it changed. + * The max queue depth will not change in the middle of syncing + * out this txg. + */ + uint64_t queue_depth_total = 0; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (mg == NULL || mg->mg_class != spa_normal_class(spa) || + !metaslab_group_initialized(mg)) + continue; + + /* + * It is safe to do a lock-free check here because only async + * allocations look at mg_max_alloc_queue_depth, and async + * allocations all happen from spa_sync(). + */ + ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); + mg->mg_max_alloc_queue_depth = max_queue_depth; + queue_depth_total += mg->mg_max_alloc_queue_depth; + } + metaslab_class_t *mc = spa_normal_class(spa); + ASSERT0(refcount_count(&mc->mc_alloc_slots)); + mc->mc_alloc_max_slots = queue_depth_total; + mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + + ASSERT3U(mc->mc_alloc_max_slots, <=, + max_queue_depth * rvd->vdev_children); + /* * Iterate to convergence. */ @@ -6696,6 +6735,10 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); + mutex_enter(&spa->spa_alloc_lock); + VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); + mutex_exit(&spa->spa_alloc_lock); + /* * Update usable space statistics. */ diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index a674eea6b04..7e00d9f42a1 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -570,6 +570,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -619,6 +620,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_active_count++; } + avl_create(&spa->spa_alloc_tree, zio_timestamp_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + /* * Every pool starts with the default cachefile */ @@ -704,6 +708,7 @@ spa_remove(spa_t *spa) kmem_free(dp, sizeof (spa_config_dirent_t)); } + avl_destroy(&spa->spa_alloc_tree); list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); @@ -734,6 +739,7 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); + mutex_destroy(&spa->spa_alloc_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h index b3b9374c779..f7271f08ad4 100644 --- a/uts/common/fs/zfs/sys/metaslab.h +++ b/uts/common/fs/zfs/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -55,14 +55,15 @@ void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); uint64_t metaslab_block_maxsize(metaslab_t *); -#define METASLAB_HINTBP_FAVOR 0x0 -#define METASLAB_HINTBP_AVOID 0x1 -#define METASLAB_GANG_HEADER 0x2 -#define METASLAB_GANG_CHILD 0x4 -#define METASLAB_GANG_AVOID 0x8 +#define METASLAB_HINTBP_FAVOR 0x0 +#define METASLAB_HINTBP_AVOID 0x1 +#define METASLAB_GANG_HEADER 0x2 +#define METASLAB_GANG_CHILD 0x4 +#define METASLAB_ASYNC_ALLOC 0x8 +#define METASLAB_DONT_THROTTLE 0x10 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int); + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); @@ -73,6 +74,9 @@ int metaslab_class_validate(metaslab_class_t *); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); +boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, + zio_t *, int); +void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, int64_t, int64_t); @@ -85,10 +89,13 @@ metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); +boolean_t metaslab_group_initialized(metaslab_group_t *); uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); +void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int); +void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *); #ifdef __cplusplus } diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h index 27a53b515fb..1c8993aca55 100644 --- a/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/uts/common/fs/zfs/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -59,11 +59,42 @@ extern "C" { * to use a block allocator that best suits that class. */ struct metaslab_class { + kmutex_t mc_lock; spa_t *mc_spa; metaslab_group_t *mc_rotor; metaslab_ops_t *mc_ops; uint64_t mc_aliquot; + + /* + * Track the number of metaslab groups that have been initialized + * and can accept allocations. An initialized metaslab group is + * one has been completely added to the config (i.e. we have + * updated the MOS config and the space has been added to the pool). + */ + uint64_t mc_groups; + + /* + * Toggle to enable/disable the allocation throttle. + */ + boolean_t mc_alloc_throttle_enabled; + + /* + * The allocation throttle works on a reservation system. Whenever + * an asynchronous zio wants to perform an allocation it must + * first reserve the number of blocks that it wants to allocate. + * If there aren't sufficient slots available for the pending zio + * then that I/O is throttled until more slots free up. The current + * number of reserved allocations is maintained by the mc_alloc_slots + * refcount. The mc_alloc_max_slots value determines the maximum + * number of allocations that the system allows. Gang blocks are + * allowed to reserve slots even if we've reached the maximum + * number of allocations allowed. + */ + uint64_t mc_alloc_max_slots; + refcount_t mc_alloc_slots; + uint64_t mc_alloc_groups; /* # of allocatable groups */ + uint64_t mc_alloc; /* total allocated space */ uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ @@ -85,6 +116,15 @@ struct metaslab_group { avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? */ + + /* + * A metaslab group is considered to be initialized only after + * we have updated the MOS config and added the space to the pool. + * We only allow allocation attempts to a metaslab group if it + * has been initialized. + */ + boolean_t mg_initialized; + uint64_t mg_free_capacity; /* percentage free */ int64_t mg_bias; int64_t mg_activation_count; @@ -93,6 +133,27 @@ struct metaslab_group { taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; + + /* + * Each metaslab group can handle mg_max_alloc_queue_depth allocations + * which are tracked by mg_alloc_queue_depth. It's possible for a + * metaslab group to handle more allocations than its max. This + * can occur when gang blocks are required or when other groups + * are unable to handle their share of allocations. + */ + uint64_t mg_max_alloc_queue_depth; + refcount_t mg_alloc_queue_depth; + + /* + * A metalab group that can no longer allocate the minimum block + * size will set mg_no_free_space. Once a metaslab group is out + * of space then its share of work must be distributed to other + * groups. + */ + boolean_t mg_no_free_space; + + uint64_t mg_allocations; + uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; }; diff --git a/uts/common/fs/zfs/sys/refcount.h b/uts/common/fs/zfs/sys/refcount.h index d13a87ea86e..345d42aa285 100644 --- a/uts/common/fs/zfs/sys/refcount.h +++ b/uts/common/fs/zfs/sys/refcount.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_REFCOUNT_H @@ -62,6 +62,7 @@ typedef struct refcount { void refcount_create(refcount_t *rc); void refcount_create_untracked(refcount_t *rc); +void refcount_create_tracked(refcount_t *rc); void refcount_destroy(refcount_t *rc); void refcount_destroy_many(refcount_t *rc, uint64_t number); int refcount_is_zero(refcount_t *rc); @@ -72,6 +73,8 @@ int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); void refcount_transfer(refcount_t *dst, refcount_t *src); void refcount_transfer_ownership(refcount_t *, void *, void *); +boolean_t refcount_held(refcount_t *, void *); +boolean_t refcount_not_held(refcount_t *, void *); void refcount_init(void); void refcount_fini(void); @@ -84,6 +87,7 @@ typedef struct refcount { #define refcount_create(rc) ((rc)->rc_count = 0) #define refcount_create_untracked(rc) ((rc)->rc_count = 0) +#define refcount_create_tracked(rc) ((rc)->rc_count = 0) #define refcount_destroy(rc) ((rc)->rc_count = 0) #define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) #define refcount_is_zero(rc) ((rc)->rc_count == 0) @@ -100,6 +104,8 @@ typedef struct refcount { atomic_add_64(&(dst)->rc_count, __tmp); \ } #define refcount_transfer_ownership(rc, current_holder, new_holder) +#define refcount_held(rc, holder) ((rc)->rc_count > 0) +#define refcount_not_held(rc, holder) (B_TRUE) #define refcount_init() #define refcount_fini() diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h index b49507b22ec..b19863c4d89 100644 --- a/uts/common/fs/zfs/sys/spa_impl.h +++ b/uts/common/fs/zfs/sys/spa_impl.h @@ -164,6 +164,8 @@ struct spa { uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ + kmutex_t spa_alloc_lock; + avl_tree_t spa_alloc_tree; spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h index 3f1b7d8a54f..ff14aa80c87 100644 --- a/uts/common/fs/zfs/sys/vdev_impl.h +++ b/uts/common/fs/zfs/sys/vdev_impl.h @@ -53,6 +53,9 @@ typedef struct vdev_queue vdev_queue_t; typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache_entry vdev_cache_entry_t; +extern int zfs_vdev_queue_depth_pct; +extern uint32_t zfs_vdev_async_write_max_active; + /* * Virtual device operations */ @@ -171,9 +174,20 @@ struct vdev { uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_removing; /* device is being removed? */ - boolean_t vdev_ishole; /* is a hole in the namespace */ + boolean_t vdev_ishole; /* is a hole in the namespace */ + kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; + /* + * The queue depth parameters determine how many async writes are + * still pending (i.e. allocated by net yet issued to disk) per + * top-level (vdev_async_write_queue_depth) and the maximum allowed + * (vdev_max_async_write_queue_depth). These values only apply to + * top-level vdevs. + */ + uint64_t vdev_async_write_queue_depth; + uint64_t vdev_max_async_write_queue_depth; + /* * Leaf vdev state. */ diff --git a/uts/common/fs/zfs/sys/zio.h b/uts/common/fs/zfs/sys/zio.h index 8061c81e0b4..25bb167147d 100644 --- a/uts/common/fs/zfs/sys/zio.h +++ b/uts/common/fs/zfs/sys/zio.h @@ -173,6 +173,7 @@ enum zio_flag { ZIO_FLAG_DONT_CACHE = 1 << 11, ZIO_FLAG_NODATA = 1 << 12, ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, + ZIO_FLAG_IO_ALLOCATING = 1 << 14, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -180,27 +181,27 @@ enum zio_flag { /* * Flags inherited by vdev children. */ - ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1 << 15, - ZIO_FLAG_TRYHARD = 1 << 16, - ZIO_FLAG_OPTIONAL = 1 << 17, + ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 16, + ZIO_FLAG_TRYHARD = 1 << 17, + ZIO_FLAG_OPTIONAL = 1 << 18, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ - ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1 << 19, - ZIO_FLAG_IO_BYPASS = 1 << 20, - ZIO_FLAG_IO_REWRITE = 1 << 21, - ZIO_FLAG_RAW = 1 << 22, - ZIO_FLAG_GANG_CHILD = 1 << 23, - ZIO_FLAG_DDT_CHILD = 1 << 24, - ZIO_FLAG_GODFATHER = 1 << 25, - ZIO_FLAG_NOPWRITE = 1 << 26, - ZIO_FLAG_REEXECUTED = 1 << 27, - ZIO_FLAG_DELEGATED = 1 << 28, + ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1 << 20, + ZIO_FLAG_IO_BYPASS = 1 << 21, + ZIO_FLAG_IO_REWRITE = 1 << 22, + ZIO_FLAG_RAW = 1 << 23, + ZIO_FLAG_GANG_CHILD = 1 << 24, + ZIO_FLAG_DDT_CHILD = 1 << 25, + ZIO_FLAG_GODFATHER = 1 << 26, + ZIO_FLAG_NOPWRITE = 1 << 27, + ZIO_FLAG_REEXECUTED = 1 << 28, + ZIO_FLAG_DELEGATED = 1 << 29, }; #define ZIO_FLAG_MUSTSUCCEED 0 @@ -240,6 +241,7 @@ enum zio_wait_type { typedef void zio_done_func_t(zio_t *zio); +extern boolean_t zio_dva_throttle_enabled; extern const char *zio_type_name[ZIO_TYPES]; /* @@ -394,7 +396,6 @@ struct zio { blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; - zio_link_t *io_walk_link; zio_t *io_logical; zio_transform_t *io_transform_stack; @@ -420,9 +421,11 @@ struct zio { uint64_t io_offset; hrtime_t io_timestamp; + hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; avl_node_t io_queue_node; avl_node_t io_offset_node; + avl_node_t io_alloc_node; /* Internal pipeline state */ enum zio_flag io_flags; @@ -431,6 +434,7 @@ struct zio { enum zio_flag io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; + enum zio_stage io_pipeline_trace; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; @@ -453,6 +457,8 @@ struct zio { taskq_ent_t io_tqent; }; +extern int zio_timestamp_compare(const void *, const void *); + extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags); @@ -512,8 +518,8 @@ extern void zio_interrupt(zio_t *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); -extern zio_t *zio_walk_parents(zio_t *cio); -extern zio_t *zio_walk_children(zio_t *pio); +extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); +extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); diff --git a/uts/common/fs/zfs/sys/zio_impl.h b/uts/common/fs/zfs/sys/zio_impl.h index 08f820103e8..a36749a308d 100644 --- a/uts/common/fs/zfs/sys/zio_impl.h +++ b/uts/common/fs/zfs/sys/zio_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _ZIO_IMPL_H @@ -108,35 +108,37 @@ enum zio_stage { ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ - ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */ - ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */ - ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ + ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */ - ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */ + ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */ - ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */ + ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */ - ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */ + ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */ - ZIO_STAGE_READY = 1 << 16, /* RWFCI */ + ZIO_STAGE_READY = 1 << 18, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RW--I */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */ - ZIO_STAGE_DONE = 1 << 21 /* RWFCI */ + ZIO_STAGE_DONE = 1 << 23 /* RWFCI */ }; #define ZIO_INTERLOCK_STAGES \ @@ -187,22 +189,27 @@ enum zio_stage { #define ZIO_REWRITE_PIPELINE \ (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_COMPRESS | \ ZIO_STAGE_WRITE_BP_INIT) #define ZIO_WRITE_PIPELINE \ (ZIO_WRITE_COMMON_STAGES | \ ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_DVA_THROTTLE | \ ZIO_STAGE_DVA_ALLOCATE) #define ZIO_DDT_CHILD_WRITE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DVA_THROTTLE | \ ZIO_STAGE_DVA_ALLOCATE) #define ZIO_DDT_WRITE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_ISSUE_ASYNC | \ ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_WRITE_COMPRESS | \ ZIO_STAGE_CHECKSUM_GENERATE | \ ZIO_STAGE_DDT_WRITE) diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c index f92409962cf..d57c1b31f9c 100644 --- a/uts/common/fs/zfs/vdev.c +++ b/uts/common/fs/zfs/vdev.c @@ -347,6 +347,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL, &vd->vdev_dtl_lock); @@ -676,6 +677,7 @@ vdev_free(vdev_t *vd) } mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); @@ -988,7 +990,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); - while ((pio = zio_walk_parents(zio)) != NULL) + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); @@ -2703,7 +2706,8 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && !vd->vdev_ishole); + !vd->vdev_cant_write && !vd->vdev_ishole && + vd->vdev_mg->mg_initialized); } boolean_t diff --git a/uts/common/fs/zfs/vdev_cache.c b/uts/common/fs/zfs/vdev_cache.c index 0b188dbc166..a6d6cfa61b4 100644 --- a/uts/common/fs/zfs/vdev_cache.c +++ b/uts/common/fs/zfs/vdev_cache.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include @@ -238,7 +238,8 @@ vdev_cache_fill(zio_t *fio) * any reads that were queued up before the missed update are still * valid, so we can satisfy them from this line before we evict it. */ - while ((pio = zio_walk_parents(fio)) != NULL) + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(fio, &zl)) != NULL) vdev_cache_hit(vc, ve, pio); if (fio->io_error || ve->ve_missed_update) diff --git a/uts/common/fs/zfs/vdev_mirror.c b/uts/common/fs/zfs/vdev_mirror.c index 8749e539f46..b038ef6f67c 100644 --- a/uts/common/fs/zfs/vdev_mirror.c +++ b/uts/common/fs/zfs/vdev_mirror.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include @@ -190,9 +190,10 @@ vdev_mirror_scrub_done(zio_t *zio) if (zio->io_error == 0) { zio_t *pio; + zio_link_t *zl = NULL; mutex_enter(&zio->io_lock); - while ((pio = zio_walk_parents(zio)) != NULL) { + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); bcopy(zio->io_data, pio->io_data, pio->io_size); diff --git a/uts/common/fs/zfs/vdev_queue.c b/uts/common/fs/zfs/vdev_queue.c index 4917cc92841..ac586c879fd 100644 --- a/uts/common/fs/zfs/vdev_queue.c +++ b/uts/common/fs/zfs/vdev_queue.c @@ -34,6 +34,7 @@ #include #include #include +#include /* * ZFS I/O Scheduler @@ -167,6 +168,23 @@ int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; +/* + * Define the queue depth percentage for each top-level. This percentage is + * used in conjunction with zfs_vdev_async_max_active to determine how many + * allocations a specific top-level vdev should handle. Once the queue depth + * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 + * then allocator will stop allocating blocks on that top-level device. + * The default kernel setting is 1000% which will yield 100 allocations per + * device. For userland testing, the default setting is 300% which equates + * to 30 allocations per device. + */ +#ifdef _KERNEL +int zfs_vdev_queue_depth_pct = 1000; +#else +int zfs_vdev_queue_depth_pct = 300; +#endif + + int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -274,6 +292,7 @@ static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -289,6 +308,7 @@ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -349,7 +369,8 @@ vdev_queue_agg_io_done(zio_t *aio) { if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; - while ((pio = zio_walk_parents(aio)) != NULL) { + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(aio, &zl)) != NULL) { bcopy((char *)aio->io_data + (pio->io_offset - aio->io_offset), pio->io_data, pio->io_size); } diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c index 30798b6541f..be7087d4db4 100644 --- a/uts/common/fs/zfs/zio.c +++ b/uts/common/fs/zfs/zio.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * ========================================================================== @@ -51,6 +52,8 @@ const char *zio_type_name[ZIO_TYPES] = { "zio_ioctl" }; +boolean_t zio_dva_throttle_enabled = B_TRUE; + /* * ========================================================================== * I/O kmem caches @@ -101,6 +104,8 @@ int zio_buf_debug_limit = 16384; int zio_buf_debug_limit = 0; #endif +static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); + void zio_init(void) { @@ -333,52 +338,39 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ -/* - * NOTE - Callers to zio_walk_parents() and zio_walk_children must - * continue calling these functions until they return NULL. - * Otherwise, the next caller will pick up the list walk in - * some indeterminate state. (Otherwise every caller would - * have to pass in a cookie to keep the state represented by - * io_walk_link, which gets annoying.) - */ zio_t * -zio_walk_parents(zio_t *cio) +zio_walk_parents(zio_t *cio, zio_link_t **zl) { - zio_link_t *zl = cio->io_walk_link; list_t *pl = &cio->io_parent_list; - zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); - cio->io_walk_link = zl; - - if (zl == NULL) + *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); + if (*zl == NULL) return (NULL); - ASSERT(zl->zl_child == cio); - return (zl->zl_parent); + ASSERT((*zl)->zl_child == cio); + return ((*zl)->zl_parent); } zio_t * -zio_walk_children(zio_t *pio) +zio_walk_children(zio_t *pio, zio_link_t **zl) { - zio_link_t *zl = pio->io_walk_link; list_t *cl = &pio->io_child_list; - zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); - pio->io_walk_link = zl; - - if (zl == NULL) + *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); + if (*zl == NULL) return (NULL); - ASSERT(zl->zl_parent == pio); - return (zl->zl_child); + ASSERT((*zl)->zl_parent == pio); + return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { - zio_t *pio = zio_walk_parents(cio); + zio_link_t *zl = NULL; + zio_t *pio = zio_walk_parents(cio, &zl); - VERIFY(zio_walk_parents(cio) == NULL); + VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } @@ -447,6 +439,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) ASSERT(zio->io_stall == NULL); if (*countp != 0) { zio->io_stage >>= 1; + ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; } @@ -470,9 +463,18 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) (*countp)--; if (*countp == 0 && pio->io_stall == countp) { + zio_taskq_type_t type = + pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : + ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); - zio_execute(pio); + /* + * Dispatch the parent zio in its own taskq so that + * the child can continue to make progress. This also + * prevents overflowing the stack when we have deeply nested + * parent-child relationships. + */ + zio_taskq_dispatch(pio, type, B_FALSE); } else { mutex_exit(&pio->io_lock); } @@ -485,6 +487,30 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) zio->io_error = zio->io_child_error[c]; } +int +zio_timestamp_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_queued_timestamp < z2->io_queued_timestamp) + return (-1); + if (z1->io_queued_timestamp > z2->io_queued_timestamp) + return (1); + + if (z1->io_offset < z2->io_offset) + return (-1); + if (z1->io_offset > z2->io_offset) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) @@ -553,6 +579,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; + zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); @@ -750,7 +777,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); @@ -865,6 +892,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + ASSERT0(zio->io_queued_timestamp); return (zio); } @@ -984,9 +1012,30 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; + /* + * If we're creating a child I/O that is not associated with a + * top-level vdev, then the child zio is not an allocating I/O. + * If this is a retried I/O then we ignore it since we will + * have already processed the original allocating I/O. + */ + if (flags & ZIO_FLAG_IO_ALLOCATING && + (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { + metaslab_class_t *mc = spa_normal_class(pio->io_spa); + + ASSERT(mc->mc_alloc_throttle_enabled); + ASSERT(type == ZIO_TYPE_WRITE); + ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); + ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || + pio->io_child_type == ZIO_CHILD_GANG); + + flags &= ~ZIO_FLAG_IO_ALLOCATING; + } + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) @@ -1080,6 +1129,65 @@ zio_read_bp_init(zio_t *zio) static int zio_write_bp_init(zio_t *zio) +{ + if (!IO_IS_ALLOCATING(zio)) + return (ZIO_PIPELINE_CONTINUE); + + ASSERT(zio->io_child_type != ZIO_CHILD_DDT); + + if (zio->io_bp_override) { + blkptr_t *bp = zio->io_bp; + zio_prop_t *zp = &zio->io_prop; + + ASSERT(bp->blk_birth != zio->io_txg); + ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); + + *bp = *zio->io_bp_override; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + if (BP_IS_EMBEDDED(bp)) + return (ZIO_PIPELINE_CONTINUE); + + /* + * If we've been overridden and nopwrite is set then + * set the flag accordingly to indicate that a nopwrite + * has already occurred. + */ + if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { + ASSERT(!zp->zp_dedup); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); + zio->io_flags |= ZIO_FLAG_NOPWRITE; + return (ZIO_PIPELINE_CONTINUE); + } + + ASSERT(!zp->zp_nopwrite); + + if (BP_IS_HOLE(bp) || !zp->zp_dedup) + return (ZIO_PIPELINE_CONTINUE); + + ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); + + if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { + BP_SET_DEDUP(bp, 1); + zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; + return (ZIO_PIPELINE_CONTINUE); + } + + /* + * We were unable to handle this as an override bp, treat + * it as a regular write I/O. + */ + zio->io_bp_override = NULL; + *bp = zio->io_bp_orig; + zio->io_pipeline = zio->io_orig_pipeline; + } + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; @@ -1111,44 +1219,7 @@ zio_write_bp_init(zio_t *zio) } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); - - if (zio->io_bp_override) { - ASSERT(bp->blk_birth != zio->io_txg); - ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); - - *bp = *zio->io_bp_override; - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - if (BP_IS_EMBEDDED(bp)) - return (ZIO_PIPELINE_CONTINUE); - - /* - * If we've been overridden and nopwrite is set then - * set the flag accordingly to indicate that a nopwrite - * has already occurred. - */ - if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { - ASSERT(!zp->zp_dedup); - zio->io_flags |= ZIO_FLAG_NOPWRITE; - return (ZIO_PIPELINE_CONTINUE); - } - - ASSERT(!zp->zp_nopwrite); - - if (BP_IS_HOLE(bp) || !zp->zp_dedup) - return (ZIO_PIPELINE_CONTINUE); - - ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); - - if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { - BP_SET_DEDUP(bp, 1); - zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; - return (ZIO_PIPELINE_CONTINUE); - } - zio->io_bp_override = NULL; - BP_ZERO(bp); - } + ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* @@ -1217,6 +1288,14 @@ zio_write_bp_init(zio_t *zio) psize, lsize, NULL); } } + + /* + * We were unable to handle this as an override bp, treat + * it as a regular write I/O. + */ + zio->io_bp_override = NULL; + *bp = zio->io_bp_orig; + zio->io_pipeline = zio->io_orig_pipeline; } /* @@ -1269,7 +1348,6 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } - return (ZIO_PIPELINE_CONTINUE); } @@ -1440,6 +1518,8 @@ zio_execute(zio_t *zio) { zio->io_executor = curthread; + ASSERT3U(zio->io_queued_timestamp, >, 0); + while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; @@ -1473,6 +1553,7 @@ zio_execute(zio_t *zio) } zio->io_stage = stage; + zio->io_pipeline_trace |= zio->io_stage; rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) @@ -1496,6 +1577,8 @@ zio_wait(zio_t *zio) ASSERT(zio->io_executor == NULL); zio->io_waiter = curthread; + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); zio_execute(zio); @@ -1527,6 +1610,8 @@ zio_nowait(zio_t *zio) zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); } + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); zio_execute(zio); } @@ -1551,6 +1636,7 @@ zio_reexecute(zio_t *pio) pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; + pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; @@ -1567,8 +1653,9 @@ zio_reexecute(zio_t *pio) * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ - for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { - cio_next = zio_walk_children(pio); + zio_link_t *zl = NULL; + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; @@ -1581,8 +1668,10 @@ zio_reexecute(zio_t *pio) * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on him. */ - if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) + if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { + pio->io_queued_timestamp = gethrtime(); zio_execute(pio); + } } void @@ -1975,6 +2064,7 @@ static int zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; + metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -1988,10 +2078,43 @@ zio_write_gang_block(zio_t *pio) zio_prop_t zp; int error; - error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, - bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, - METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); + int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); + + flags |= METASLAB_ASYNC_ALLOC; + VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); + + /* + * The logical zio has already placed a reservation for + * 'copies' allocation slots but gang blocks may require + * additional copies. These additional copies + * (i.e. gbh_copies - copies) are guaranteed to succeed + * since metaslab_class_throttle_reserve() always allows + * additional reservations for gang blocks. + */ + VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, + pio, flags)); + } + + error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, + bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio); if (error) { + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); + + /* + * If we failed to allocate the gang block header then + * we remove any additional allocation reservations that + * we placed here. The original reservation will + * be removed when the logical I/O goes to the ready + * stage. + */ + metaslab_class_throttle_unreserve(mc, + gbh_copies - copies, pio); + } pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } @@ -2030,11 +2153,25 @@ zio_write_gang_block(zio_t *pio) zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; - zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], + zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL, &gn->gn_child[g], pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); + + /* + * Gang children won't throttle but we should + * account for their work, so reserve an allocation + * slot for them here. + */ + VERIFY(metaslab_class_throttle_reserve(mc, + zp.zp_copies, cio, flags)); + } + zio_nowait(cio); } /* @@ -2292,7 +2429,8 @@ zio_ddt_child_write_ready(zio_t *zio) ddt_phys_fill(ddp, zio->io_bp); - while ((pio = zio_walk_parents(zio)) != NULL) + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); @@ -2313,7 +2451,8 @@ zio_ddt_child_write_done(zio_t *zio) dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { - while (zio_walk_parents(zio) != NULL) + zio_link_t *zl = NULL; + while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); @@ -2491,6 +2630,97 @@ zio_ddt_free(zio_t *zio) * Allocate and free blocks * ========================================================================== */ + +static zio_t * +zio_io_to_allocate(spa_t *spa) +{ + zio_t *zio; + + ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); + + zio = avl_first(&spa->spa_alloc_tree); + if (zio == NULL) + return (NULL); + + ASSERT(IO_IS_ALLOCATING(zio)); + + /* + * Try to place a reservation for this zio. If we're unable to + * reserve then we throttle. + */ + if (!metaslab_class_throttle_reserve(spa_normal_class(spa), + zio->io_prop.zp_copies, zio, 0)) { + return (NULL); + } + + avl_remove(&spa->spa_alloc_tree, zio); + ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); + + return (zio); +} + +static int +zio_dva_throttle(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_t *nio; + + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || + !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || + zio->io_child_type == ZIO_CHILD_GANG || + zio->io_flags & ZIO_FLAG_NODATA) { + return (ZIO_PIPELINE_CONTINUE); + } + + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + ASSERT3U(zio->io_queued_timestamp, >, 0); + ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); + + mutex_enter(&spa->spa_alloc_lock); + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + avl_add(&spa->spa_alloc_tree, zio); + + nio = zio_io_to_allocate(zio->io_spa); + mutex_exit(&spa->spa_alloc_lock); + + if (nio == zio) + return (ZIO_PIPELINE_CONTINUE); + + if (nio != NULL) { + ASSERT3U(nio->io_queued_timestamp, <=, + zio->io_queued_timestamp); + ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); + /* + * We are passing control to a new zio so make sure that + * it is processed by a different thread. We do this to + * avoid stack overflows that can occur when parents are + * throttled and children are making progress. We allow + * it to go to the head of the taskq since it's already + * been waiting. + */ + zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); + } + return (ZIO_PIPELINE_STOP); +} + +void +zio_allocate_dispatch(spa_t *spa) +{ + zio_t *zio; + + mutex_enter(&spa->spa_alloc_lock); + zio = zio_io_to_allocate(spa); + mutex_exit(&spa->spa_alloc_lock); + if (zio == NULL) + return; + + ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); + ASSERT0(zio->io_error); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); +} + static int zio_dva_allocate(zio_t *zio) { @@ -2511,18 +2741,20 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - /* - * The dump device does not support gang blocks so allocation on - * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid - * the "fast" gang feature. - */ - flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; - flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? - METASLAB_GANG_CHILD : 0; - error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags); + if (zio->io_flags & ZIO_FLAG_NODATA) { + flags |= METASLAB_DONT_THROTTLE; + } + if (zio->io_flags & ZIO_FLAG_GANG_CHILD) { + flags |= METASLAB_GANG_CHILD; + } + if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) { + flags |= METASLAB_ASYNC_ALLOC; + } - if (error) { + error = metaslab_alloc(spa, mc, zio->io_size, bp, + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio); + + if (error != 0) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); @@ -2587,21 +2819,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, ASSERT(txg > spa_syncing_txg(spa)); - /* - * ZIL blocks are always contiguous (i.e. not gang blocks) so we - * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" - * when allocating them. - */ if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, old_bp, - METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, - METASLAB_HINTBP_AVOID); + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL); } if (error == 0) { @@ -2670,6 +2895,8 @@ zio_vdev_io_start(zio_t *zio) return (ZIO_PIPELINE_STOP); } + ASSERT3P(zio->io_logical, !=, zio); + /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care @@ -3043,6 +3270,7 @@ zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; + zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) @@ -3060,12 +3288,26 @@ zio_ready(zio_t *zio) if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; - if (zio->io_error) + if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + /* + * We were unable to allocate anything, unreserve and + * issue the next I/O to allocate. + */ + metaslab_class_throttle_unreserve( + spa_normal_class(zio->io_spa), + zio->io_prop.zp_copies, zio); + zio_allocate_dispatch(zio->io_spa); + } + } + mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; - pio = zio_walk_parents(zio); + pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* @@ -3076,7 +3318,7 @@ zio_ready(zio_t *zio) * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { - pio_next = zio_walk_parents(zio); + pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY); } @@ -3096,6 +3338,66 @@ zio_ready(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } +/* + * Update the allocation throttle accounting. + */ +static void +zio_dva_throttle_done(zio_t *zio) +{ + zio_t *lio = zio->io_logical; + zio_t *pio = zio_unique_parent(zio); + vdev_t *vd = zio->io_vd; + int flags = METASLAB_ASYNC_ALLOC; + + ASSERT3P(zio->io_bp, !=, NULL); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + ASSERT(vd != NULL); + ASSERT3P(vd, ==, vd->vdev_top); + ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); + ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); + ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); + + /* + * Parents of gang children can have two flavors -- ones that + * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) + * and ones that allocated the constituent blocks. The allocation + * throttle needs to know the allocating parent zio so we must find + * it here. + */ + if (pio->io_child_type == ZIO_CHILD_GANG) { + /* + * If our parent is a rewrite gang child then our grandparent + * would have been the one that performed the allocation. + */ + if (pio->io_flags & ZIO_FLAG_IO_REWRITE) + pio = zio_unique_parent(pio); + flags |= METASLAB_GANG_CHILD; + } + + ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT3P(zio, !=, zio->io_logical); + ASSERT(zio->io_logical != NULL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); + ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); + + mutex_enter(&pio->io_lock); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); + mutex_exit(&pio->io_lock); + + metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), + 1, pio); + + /* + * Call into the pipeline to see if there is more work that + * needs to be done. If there is work to be done it will be + * dispatched to another taskq thread. + */ + zio_allocate_dispatch(zio->io_spa); +} + static int zio_done(zio_t *zio) { @@ -3105,6 +3407,8 @@ zio_done(zio_t *zio) vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; zio_t *pio, *pio_next; + metaslab_class_t *mc = spa_normal_class(spa); + zio_link_t *zl = NULL; /* * If our children haven't all completed, @@ -3116,6 +3420,30 @@ zio_done(zio_t *zio) zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); + /* + * If the allocation throttle is enabled, then update the accounting. + * We only track child I/Os that are part of an allocating async + * write. We must do this since the allocation is performed + * by the logical I/O but the actual write is done by child I/Os. + */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && + zio->io_child_type == ZIO_CHILD_VDEV) { + ASSERT(mc->mc_alloc_throttle_enabled); + zio_dva_throttle_done(zio); + } + + /* + * If the allocation throttle is enabled, verify that + * we have decremented the refcounts for every I/O that was throttled. + */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(bp != NULL); + metaslab_group_alloc_verify(spa, zio->io_bp, zio); + VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio)); + } + for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); @@ -3285,13 +3613,15 @@ zio_done(zio_t *zio) * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ - for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { - zio_link_t *zl = zio->io_walk_link; - pio_next = zio_walk_parents(zio); + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; + pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { - zio_remove_child(pio, zio, zl); + zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } } @@ -3351,10 +3681,11 @@ zio_done(zio_t *zio) zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); - for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { - zio_link_t *zl = zio->io_walk_link; - pio_next = zio_walk_parents(zio); - zio_remove_child(pio, zio, zl); + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); + zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } @@ -3378,9 +3709,10 @@ zio_done(zio_t *zio) static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, + zio_write_bp_init, zio_free_bp_init, zio_issue_async, - zio_write_bp_init, + zio_write_compress, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, @@ -3389,6 +3721,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_ddt_free, zio_gang_assemble, zio_gang_issue, + zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h index 6282712b31d..62f0263c348 100644 --- a/uts/common/sys/fs/zfs.h +++ b/uts/common/sys/fs/zfs.h @@ -884,7 +884,8 @@ typedef enum { SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ - SPA_LOAD_ERROR /* load failed */ + SPA_LOAD_ERROR, /* load failed */ + SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; /* From 2a7aac5a99f6b26391593af61605fd5c7daf0c07 Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Mon, 15 Aug 2016 14:21:46 +0000 Subject: [PATCH 02/41] 7235 remove unused func dsl_dataset_set_blkptr illumos/illumos-gate@bd56f80007857b960e0981ed0797ad8ec844a96b https://github.com/illumos/illumos-gate/commit/bd56f80007857b960e0981ed0797ad8ec844a96b https://www.illumos.org/issues/7235 The function dsl_dataset_set_blkptr() is unused. We should remove it. Reviewed by: George Wilson Reviewed by: Alex Reece Reviewed by: Prakash Surya Reviewed by: Igor Kozhukhov Approved by: Robert Mustacchi Author: Matthew Ahrens --- uts/common/fs/zfs/dsl_dataset.c | 13 ------------- uts/common/fs/zfs/sys/dsl_dataset.h | 1 - 2 files changed, 14 deletions(-) diff --git a/uts/common/fs/zfs/dsl_dataset.c b/uts/common/fs/zfs/dsl_dataset.c index 4aa33b80988..8eef4960833 100644 --- a/uts/common/fs/zfs/dsl_dataset.c +++ b/uts/common/fs/zfs/dsl_dataset.c @@ -1007,19 +1007,6 @@ dsl_dataset_get_blkptr(dsl_dataset_t *ds) return (&dsl_dataset_phys(ds)->ds_bp); } -void -dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - /* If it's the meta-objset, set dp_meta_rootbp */ - if (ds == NULL) { - tx->tx_pool->dp_meta_rootbp = *bp; - } else { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_bp = *bp; - } -} - spa_t * dsl_dataset_get_spa(dsl_dataset_t *ds) { diff --git a/uts/common/fs/zfs/sys/dsl_dataset.h b/uts/common/fs/zfs/sys/dsl_dataset.h index 17d15d70c83..f8c53c41964 100644 --- a/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/uts/common/fs/zfs/sys/dsl_dataset.h @@ -272,7 +272,6 @@ int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, minor_t cleanup_minor, const char *htag); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); -void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); From cb0a9ee48f19eda063007c733ab434a9303846b5 Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Mon, 15 Aug 2016 14:22:12 +0000 Subject: [PATCH 03/41] 7230 add assertions to dmu_send_impl() to verify that stream includes BEGIN and END records illumos/illumos-gate@12b90ee2d3b10689fc45f4930d2392f5fe1d9cfa https://github.com/illumos/illumos-gate/commit/12b90ee2d3b10689fc45f4930d2392f5fe1d9cfa https://www.illumos.org/issues/7230 A test failure occurred where a send stream had only a BEGIN record. This should not be possible if the send returns without error. Prevented this from happening in the future by adding an assertion to dmu_send_impl() to verify that if the function returns 0 (success) both a BEGIN and END record are present. Did this by adding flags to dmu_sendarg_t (indicating whether BEGIN or END records sent), having dump_record() set flags appropriately, adding VERIFY statement to dmu_send_impl(). Reviewed by: Matthew Ahrens Reviewed by: Paul Dagnelie Reviewed by: Igor Kozhukhov Approved by: Robert Mustacchi Author: Matt Krantz --- uts/common/fs/zfs/dmu_send.c | 9 ++++++++- uts/common/fs/zfs/sys/dmu_impl.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c index c3f52d33e7e..f442baf390f 100644 --- a/uts/common/fs/zfs/dmu_send.c +++ b/uts/common/fs/zfs/dmu_send.c @@ -135,11 +135,16 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) fletcher_4_incremental_native(dsp->dsa_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dsp->dsa_zc); - if (dsp->dsa_drr->drr_type != DRR_BEGIN) { + if (dsp->dsa_drr->drr_type == DRR_BEGIN) { + dsp->dsa_sent_begin = B_TRUE; + } else { ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. drr_checksum.drr_checksum)); dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; } + if (dsp->dsa_drr->drr_type == DRR_END) { + dsp->dsa_sent_end = B_TRUE; + } fletcher_4_incremental_native(&dsp->dsa_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dsp->dsa_zc); @@ -881,6 +886,8 @@ out: list_remove(&to_ds->ds_sendstreams, dsp); mutex_exit(&to_ds->ds_sendstream_lock); + VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); + kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); diff --git a/uts/common/fs/zfs/sys/dmu_impl.h b/uts/common/fs/zfs/sys/dmu_impl.h index 8f3b27ff3fe..3adb914bf61 100644 --- a/uts/common/fs/zfs/sys/dmu_impl.h +++ b/uts/common/fs/zfs/sys/dmu_impl.h @@ -298,6 +298,8 @@ typedef struct dmu_sendarg { uint64_t dsa_last_data_offset; uint64_t dsa_resume_object; uint64_t dsa_resume_offset; + boolean_t dsa_sent_begin; + boolean_t dsa_sent_end; } dmu_sendarg_t; void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); From 4b498e5e9ddd97dce32d005d070c2dc5da8fe942 Mon Sep 17 00:00:00 2001 From: Andriy Gapon Date: Mon, 15 Aug 2016 14:23:50 +0000 Subject: [PATCH 04/41] 7136 ESC_VDEV_REMOVE_AUX ought to always include vdev information 7115 6922 generates ESC_ZFS_VDEV_REMOVE_AUX a bit too often illumos/illumos-gate@b72b6bb10ad55121a1b352c6f68ebdc8e20c9086 https://github.com/illumos/illumos-gate/commit/b72b6bb10ad55121a1b352c6f68ebdc8e20c9086 https://www.illumos.org/issues/7136 6922 added ESC_ZFS_VDEV_REMOVE_AUX and ESC_ZFS_VDEV_REMOVE_DEV sysevents whenever an aux device gets removed from a pool. However, those sysevents will be created without the vdev_guid and vdev_path fields. It would be better to always populate those fields. https://www.illumos.org/issues/7115 The addition of spa_event_notify in vdev removal code (see #6922) causes events to be generated even if the spare failed to be removed with EBUSY. Reviewed by: George Wilson Reviewed by: Josef 'Jeff' Sipek Approved by: Robert Mustacchi Author: Alan Somers --- uts/common/fs/zfs/spa.c | 57 +++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 25b4714ea68..bc15615a140 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -139,6 +139,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; +static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, const char *name); +static void spa_event_post(sysevent_t *ev); static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); @@ -5447,6 +5449,7 @@ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; + sysevent_t *ev = NULL; metaslab_group_t *mg; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; @@ -5470,6 +5473,9 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * in this pool. */ if (vd == NULL || unspare) { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); @@ -5477,7 +5483,6 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) } else { error = SET_ERROR(EBUSY); } - spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && @@ -5485,11 +5490,12 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) /* * Cache devices can always be removed. */ + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; - spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); ASSERT(vd == vd->vdev_top); @@ -5526,9 +5532,9 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) /* * Clean up the vdev namespace. */ + ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV); spa_vdev_remove_from_namespace(spa, vd); - spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV); } else if (vd != NULL) { /* * Normal vdevs cannot be removed (yet). @@ -5544,6 +5550,9 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) if (!locked) error = spa_vdev_exit(spa, NULL, txg, error); + if (ev) + spa_event_post(ev); + return (error); } @@ -6920,24 +6929,17 @@ spa_has_active_shared_spare(spa_t *spa) return (B_FALSE); } -/* - * Post a sysevent corresponding to the given event. The 'name' must be one of - * the event definitions in sys/sysevent/eventdefs.h. The payload will be - * filled in from the spa and (optionally) the vdev. This doesn't do anything - * in the userland libzpool, as we don't want consumers to misinterpret ztest - * or zdb as real changes. - */ -void -spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) +static sysevent_t * +spa_event_create(spa_t *spa, vdev_t *vd, const char *name) { + sysevent_t *ev = NULL; #ifdef _KERNEL - sysevent_t *ev; sysevent_attr_list_t *attr = NULL; sysevent_value_t value; - sysevent_id_t eid; ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", SE_SLEEP); + ASSERT(ev != NULL); value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = spa_name(spa); @@ -6969,11 +6971,34 @@ spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) goto done; attr = NULL; - (void) log_sysevent(ev, SE_SLEEP, &eid); - done: if (attr) sysevent_free_attr(attr); + +#endif + return (ev); +} + +static void +spa_event_post(sysevent_t *ev) +{ +#ifdef _KERNEL + sysevent_id_t eid; + + (void) log_sysevent(ev, SE_SLEEP, &eid); sysevent_free(ev); #endif } + +/* + * Post a sysevent corresponding to the given event. The 'name' must be one of + * the event definitions in sys/sysevent/eventdefs.h. The payload will be + * filled in from the spa and (optionally) the vdev. This doesn't do anything + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes. + */ +void +spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) +{ + spa_event_post(spa_event_create(spa, vd, name)); +} From 90adad104b4f1c3fb8644bdb843d0c641085787b Mon Sep 17 00:00:00 2001 From: Bruce Evans Date: Thu, 1 Sep 2016 19:18:26 +0000 Subject: [PATCH 05/41] The log message for the previous commit didn't mention the most the important detail that sc_cngetc() now opens and closes the keyboard on every call again. This was moved from sc_cngetc() to scn_cngrab/ ungrab() in r228644, but the change wasn't quite complete. After fixes for nesting in kbdd_poll() in ukbd and kbdmux, these opens and closes should have no significant effect if done while grabbed. They fix unusual cases when cngetc() is called while not grabbed. This commit is the main fix for screen locking in sc_cnputc(): detect deadlock or likely-deadlock and handle it by buffering the output atomically and printing it later if the deadlock condition clears (and sc_cnputc() is called). The most common deadlock is when the screen lock is held by ourself. Then it would be safe to acquire the lock recursively if the console driver is calling printf() in a safe context, but we don't know when that is. It is not safe to ignore the lock even in kdb or panic mode. But ignore it in panic mode. The only other known case of deadlock is when another thread holds the lock but is running on a stopped CPU. Detect that case approximately by using trylock and retrying for 1000 usec. On a 4 GHz CPU, 100 usec is almost long enough -- screen switches take slightly longer than that. Not retrying at all is good enough except for stress tests, and planned future versions will extend the timeout so that the stress tests work better. To see the behaviour when deadlock is detected, single step through sctty_outwakeup() (or sc_puts() to start with deadlock). Another (serial) console is needed to the buffered-only output, but the keyboard works in this context to continue or step out of the deadlocked region. The buffer is not large enough to hold all the output for this. --- sys/dev/syscons/syscons.c | 68 +++++++++++++++++++++++++++++++++++---- sys/dev/syscons/syscons.h | 2 ++ 2 files changed, 64 insertions(+), 6 deletions(-) diff --git a/sys/dev/syscons/syscons.c b/sys/dev/syscons/syscons.c index 7333b3c7c1c..30c7a1a1327 100644 --- a/sys/dev/syscons/syscons.c +++ b/sys/dev/syscons/syscons.c @@ -1678,13 +1678,39 @@ sccnkbdunlock(sc_softc_t *sc, struct sc_cnstate *sp) static void sccnscrlock(sc_softc_t *sc, struct sc_cnstate *sp) { - SC_VIDEO_LOCK(sc); + int retries; + + /** + * Locking method: + * - if kdb_active and video_mtx is not owned by anyone, then lock + * by kdb remaining active + * - if !kdb_active, try to acquire video_mtx without blocking or + * recursing; if we get it then it works normally. + * Note that video_mtx is especially unusable if we already own it, + * since then it is protecting something and syscons is not reentrant + * enough to ignore the protection even in the kdb_active case. + */ + if (kdb_active) { + sp->kdb_locked = sc->video_mtx.mtx_lock == MTX_UNOWNED || panicstr; + sp->mtx_locked = FALSE; + } else { + sp->kdb_locked = FALSE; + for (retries = 0; retries < 1000; retries++) { + sp->mtx_locked = mtx_trylock_spin_flags(&sc->video_mtx, + MTX_QUIET) != 0 || panicstr; + if (sp->mtx_locked) + break; + DELAY(1); + } + } } static void sccnscrunlock(sc_softc_t *sc, struct sc_cnstate *sp) { - SC_VIDEO_UNLOCK(sc); + if (sp->mtx_locked) + mtx_unlock_spin(&sc->video_mtx); + sp->mtx_locked = sp->kdb_locked = FALSE; } static void @@ -1721,6 +1747,8 @@ over_keyboard: ; /* The screen is opened iff locking it succeeds. */ sccnscrlock(sc, sp); + if (!sp->kdb_locked && !sp->mtx_locked) + return; sp->scr_opened = TRUE; /* The screen switch is optional. */ @@ -1797,6 +1825,10 @@ sc_cnungrab(struct consdev *cp) atomic_add_int(&sc->grab_level, -1); } +static char sc_cnputc_log[0x1000]; +static u_int sc_cnputc_loghead; +static u_int sc_cnputc_logtail; + static void sc_cnputc(struct consdev *cd, int c) { @@ -1808,12 +1840,28 @@ sc_cnputc(struct consdev *cd, int c) struct tty *tp; #endif #endif /* !SC_NO_HISTORY */ + u_int head; int s; /* assert(sc_console != NULL) */ sccnopen(scp->sc, &st, 0); + /* + * Log the output. + * + * In the unlocked case, the logging is intentionally only + * perfectly atomic for the indexes. + */ + head = atomic_fetchadd_int(&sc_cnputc_loghead, 1); + sc_cnputc_log[head % sizeof(sc_cnputc_log)] = c; + + /* + * If we couldn't open, return to defer output. + */ + if (!st.scr_opened) + return; + #ifndef SC_NO_HISTORY if (scp == scp->sc->cur_scp && scp->status & SLKED) { scp->status &= ~SLKED; @@ -1841,8 +1889,14 @@ sc_cnputc(struct consdev *cd, int c) } #endif /* !SC_NO_HISTORY */ - buf[0] = c; - sc_puts(scp, buf, 1, 1); + /* Play any output still in the log (our char may already be done). */ + while (sc_cnputc_logtail != atomic_load_acq_int(&sc_cnputc_loghead)) { + buf[0] = sc_cnputc_log[sc_cnputc_logtail++ % sizeof(sc_cnputc_log)]; + if (atomic_load_acq_int(&sc_cnputc_loghead) - sc_cnputc_logtail >= + sizeof(sc_cnputc_log)) + continue; + sc_puts(scp, buf, 1, 1); + } s = spltty(); /* block sckbdevent and scrn_timer */ sccnupdate(scp); @@ -1883,9 +1937,11 @@ sc_cngetc_locked(struct sc_cnstate *sp) * Stop the screen saver and update the screen if necessary. * What if we have been running in the screen saver code... XXX */ - sc_touch_scrn_saver(); + if (sp->scr_opened) + sc_touch_scrn_saver(); scp = sc_console->sc->cur_scp; /* XXX */ - sccnupdate(scp); + if (sp->scr_opened) + sccnupdate(scp); if (fkeycp < fkey.len) return fkey.str[fkeycp++]; diff --git a/sys/dev/syscons/syscons.h b/sys/dev/syscons/syscons.h index 8d8e5ca85f2..3454700c947 100644 --- a/sys/dev/syscons/syscons.h +++ b/sys/dev/syscons/syscons.h @@ -191,6 +191,8 @@ struct tty; struct sc_cnstate { u_char kbd_locked; + u_char kdb_locked; + u_char mtx_locked; u_char kbd_opened; u_char scr_opened; }; From 2d77e0ca0605bab108e58cf52a65989704e27a08 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 1 Sep 2016 19:51:35 +0000 Subject: [PATCH 06/41] Remove the digi(4) drivers. These drivers were never updated for the new TTY changes and have been disconnected from the build since 8.0. Ok'd by: imp, peterj --- share/man/man4/Makefile | 1 - share/man/man4/digi.4 | 377 ------------------------ sys/conf/files | 10 - sys/modules/digi/Makefile | 12 - sys/modules/digi/Makefile.inc | 3 - sys/modules/digi/digi/Makefile | 16 - sys/modules/digi/digi_CX/Makefile | 7 - sys/modules/digi/digi_CX_PCI/Makefile | 7 - sys/modules/digi/digi_EPCX/Makefile | 7 - sys/modules/digi/digi_EPCX_PCI/Makefile | 7 - sys/modules/digi/digi_Xe/Makefile | 7 - sys/modules/digi/digi_Xem/Makefile | 7 - sys/modules/digi/digi_Xr/Makefile | 7 - 13 files changed, 468 deletions(-) delete mode 100644 share/man/man4/digi.4 delete mode 100644 sys/modules/digi/Makefile delete mode 100644 sys/modules/digi/Makefile.inc delete mode 100644 sys/modules/digi/digi/Makefile delete mode 100644 sys/modules/digi/digi_CX/Makefile delete mode 100644 sys/modules/digi/digi_CX_PCI/Makefile delete mode 100644 sys/modules/digi/digi_EPCX/Makefile delete mode 100644 sys/modules/digi/digi_EPCX_PCI/Makefile delete mode 100644 sys/modules/digi/digi_Xe/Makefile delete mode 100644 sys/modules/digi/digi_Xem/Makefile delete mode 100644 sys/modules/digi/digi_Xr/Makefile diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile index 5cd4b7defcc..0f64f72c431 100644 --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -122,7 +122,6 @@ MAN= aac.4 \ ddb.4 \ de.4 \ devctl.4 \ - digi.4 \ disc.4 \ divert.4 \ ${_dpms.4} \ diff --git a/share/man/man4/digi.4 b/share/man/man4/digi.4 deleted file mode 100644 index 1e3411995f5..00000000000 --- a/share/man/man4/digi.4 +++ /dev/null @@ -1,377 +0,0 @@ -.\" Copyright (c) 1990, 1991 The Regents of the University of California. -.\" All rights reserved. -.\" -.\" This code is derived from software contributed to Berkeley by -.\" the Systems Programming Group of the University of Utah Computer -.\" Science Department. -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" from: @(#)dca.4 5.2 (Berkeley) 3/27/91 -.\" from: com.4,v 1.1 1993/08/06 11:19:07 cgd Exp -.\" from: sio.4,v 1.15 1994/12/06 20:14:30 bde Exp -.\" $FreeBSD$ -.\" -.Dd December 7, 2003 -.Dt DIGI 4 -.Os -.Sh NAME -.Nm digi -.Nd DigiBoard intelligent serial cards driver -.Sh SYNOPSIS -.Cd "device digi" -.Pp -This man page was originally written for the dgb driver, and should -likely be gone over with a fine tooth comb to reflect differences -with the digi driver. -.Pp -When not defined the number is computed: -.Bd -ragged -offset 4n -default -.Dv NDGBPORTS -= number_of_described_DigiBoard_cards * 16 -.Ed -.Pp -If it is less than the actual number of ports -the system will be able to use only the -first -.Dv NDGBPORTS -ports. -If it is greater then all ports will be usable -but some memory will be wasted. -.Pp -Meaning of -.Cm flags : -.Bl -tag -width indent -compact -.It 0x0001 -use alternate pinout (exchange DCD and DSR lines) -.It 0x0002 -do not use 8K window mode of PC/Xe -.El -.Pp -Device numbering: -.Bd -literal -compact -0b\fICC\fPmmmmmmmm\fIOLIPPPPP\fP - \fBCC\fPard number - \fRmmmmmmmm\fPajor number - call\fBO\fPut - \fBL\fPock - \fBI\fPnitial - \fBPPPPP\fPort number -.Ed -.Sh DESCRIPTION -The -.Nm -driver provides support for DigiBoard PC/Xe and PC/Xi series intelligent -serial multiport cards with asynchronous interfaces based on the -.Tn EIA -.Tn RS-232C -.Pf ( Tn CCITT -.Tn V.24 ) -standard. -.Pp -Input and output for each line may set to one of following baud rates; -50, 75, 110, 134.5, 150, 300, 600, 1200, 1800, 2400, 4800, 9600, -19200, 38400, 57600, or for newer versions of cards 115200. -.Pp -The driver does not use any interrupts, it is -.Dq polling-based . -This means that -it uses clock interrupts instead of interrupts generated by DigiBoard cards and -checks the state of cards 25 times per second. -This is practical because the -DigiBoard cards have large input and output buffers (more than 1Kbyte per -port) and hardware that allows efficiently finding the port that needs -attention. -The only problem seen with this policy is slower -SLIP and PPP response. -.Pp -Each line in the kernel configuration file describes one card, not one port -as in the -.Xr sio 4 -driver. -.Pp -The -.Cm flags -keyword may be used on each -.Dq Li "device dgb" -line in the kernel configuration file -to change the pinout of the interface or to use new PC/Xe cards -which can work with an 8K memory window in compatibility mode -(with a 64K memory window). -Note -that using 8K memory window does not mean shorter input/output buffers, it means -only that all buffers will be mapped to the same memory address and switched as -needed. -.Pp -The -.Cm port -value must be the same -as the -port -set on the card by jumpers. -For PC/Xi cards the same rule is applicable to the -.Cm iomem -value. -It must be the same as the memory address set on the card -by jumpers. -.\"Some documentation gives the address as a ``paragraph'' or ``segment''; -.\"you can get the value of address by adding the digit "0" at end of -.\"paragraph value, e.g., 0xfc000 -> 0xfc0000. -For PC/Xe cards there is no need to use jumpers for this purpose. -In fact there are no jumpers to do it. -Just -write the address you want as the -.Cm iomem -value in kernel config file and the card will be programmed -to use this address. -.Pp -The same range of memory addresses may be used -for all the DigiBoards installed -(but not for any other card or real memory). -DigiBoards -with a large amount of memory (256K or 512K and perhaps -even 128K) must be mapped -to memory addresses outside of the first megabyte. -If the computer -has more than 15 megabytes of memory then there is no free address space -outside of the first megabyte where such DigiBoards can be mapped. -In this case you -may need to reduce the amount of memory in the computer. -But many machines provide a better solution. -They have the ability to -.Dq "turn off" -the memory in the 16th megabyte (addresses 0xF00000 - 0xFFFFFF) -using the -BIOS setup. -Then the DigiBoard's address space can be set to this -.Dq hole . -.\" XXX the following should be true for all serial drivers and -.\" should not be repeated in the man pages for all serial drivers. -.\" It was copied from sio.4. The only changes were s/sio/dgb/g. -.Pp -Serial ports controlled by the -.Nm -driver can be used for both -.Dq callin -and -.Dq callout . -For each port there is a callin device and a callout device. -The minor number of the callout device is 128 higher -than that of the corresponding callin port. -The callin device is general purpose. -Processes opening it normally wait for carrier -and for the callout device to become inactive. -The callout device is used to steal the port from -processes waiting for carrier on the callin device. -Processes opening it do not wait for carrier -and put any processes waiting for carrier on the callin device into -a deeper sleep so that they do not conflict with the callout session. -The callout device is abused for handling programs that are supposed -to work on general ports and need to open the port without waiting -but are too stupid to do so. -.Pp -The -.Nm -driver also supports an initial-state and a lock-state control -device for each of the callin and the callout -.Dq data -devices. -The minor number of the initial-state device is 32 higher -than that of the corresponding data device. -The minor number of the lock-state device is 64 higher -than that of the corresponding data device. -The termios settings of a data device are copied -from those of the corresponding initial-state device -on first opens and are not inherited from previous opens. -Use -.Xr stty 1 -in the normal way on the initial-state devices to program -initial termios states suitable for your setup. -.Pp -The lock termios state acts as flags to disable changing -the termios state. -E.g., to lock a flag variable such as -.Dv CRTSCTS , -use -.Dq Li "stty crtscts" -on the lock-state device. -Speeds and special characters -may be locked by setting the corresponding value in the lock-state -device to any nonzero value. -.Pp -Correct programs talking to correctly wired external devices -.\" XXX change next line in other man pages too, and rewrite this paragraph. -work with almost arbitrary initial states and no locking, -but other setups may benefit from changing some of the default -initial state and locking the state. -In particular, the initial states for non (POSIX) standard flags -should be set to suit the devices attached and may need to be -locked to prevent buggy programs from changing them. -E.g., -.Dv CRTSCTS -should be locked on for devices that support -RTS/CTS handshaking at all times and off for devices that do not -support it at all. -.Dv CLOCAL -should be locked on for devices -that do not support carrier. -.Dv HUPCL -may be locked off if you do not -want to hang up for some reason. -In general, very bad things happen -if something is locked to the wrong state, and things should not -be locked for devices that support more than one setting. -The -.Dv CLOCAL -flag on callin ports should be locked off for logins -to avoid certain security holes, but this needs to be done by -getty if the callin port is used for anything else. -.Sh FILES -.Bl -tag -width /dev/ttyiD?? -compact -.It Pa /dev/ttyD?? -for callin ports -.It Pa /dev/ttyiD?? -.It Pa /dev/ttylD?? -corresponding callin initial-state and lock-state devices -.Pp -.It Pa /dev/cuaD?? -for callout ports -.It Pa /dev/cuaiD?? -.It Pa /dev/cualD?? -corresponding callout initial-state and lock-state devices -.El -.Pp -.Bl -tag -width /etc/rc.serial -compact -.It Pa /etc/rc.serial -examples of setting the initial-state and lock-state devices -.El -.Pp -The first question mark in these device names is short for the -card number -(a decimal number between 0 and 65535 inclusive). -The second question mark is short for the port number -(a letter in the range [0-9a-v]). -.Sh DIAGNOSTICS -You may enable extended diagnostics by defining DEBUG at the -start of the source file -.Pa dgb.c . -.Bl -diag -.It dgb\fIX\fP: warning: address \fIN\fP truncated to \fIM\fP -The memory address for the PC/Xe's 8K window is misaligned (it should be -on an 8K boundary) or outside of the first megabyte. -.It dgb\fIX\fP: 1st reset failed -Problems with accessing I/O port of the card, probably -the wrong -.Cm port -value is specified in the kernel config file. -.It dgb\fIX\fP: 2nd reset failed -Problems with hardware. -.It dgb\fIX\fP: \fIN\fP[st,nd,rd,th] memory test failed -Problems with accessing the memory of the card, probably -the wrong -.Cm iomem -value is specified in the kernel config file. -.It dgb\fIX\fP: BIOS start failed -Problems with starting the on-board BIOS. -Probably the memory addresses of the -DigiBoard overlap with some other device or with RAM. -.It dgb\fIX\fP: BIOS download failed -Problems with the on-board BIOS. -Probably the memory addresses of the -DigiBoard overlap with some other device or with RAM. -.It dgb\fIX\fP: FEP code download failed -Problems with downloading of the Front-End Processor's micro-OS. -Probably the memory addresses of the -DigiBoard overlap with some other device or with RAM. -.It dgb\fIX\fP: FEP/OS start failed -Problems with starting of the Front-End Processor's micro-OS. -Probably the memory addresses of the -DigiBoard overlap with some other device or with RAM. -.It dgb\fIX\fP: too many ports -This DigiBoard reports that it has more than 32 ports. -Perhaps a hardware problem or -the memory addresses of the -DigiBoard overlap with some other device or with RAM. -.It dgb\fIX\fP: only \fIN\fP ports are usable -The -.Dv NDGBPORTS -parameter is too small and there is only enough space allocated -for -.Ar N -ports on this card. -.It dgb\fIX\fP: port \fIY\fP is broken -The on-board diagnostic has reported that the specified port has hardware -problems. -.It dgb\fIX\fP: polling of disabled board stopped -Internal problems in the polling logic of driver. -.It dgb\fIX\fP: event queue's head or tail is wrong! -Internal problems in the driver or hardware. -.It dgb\fIX\fP: port \fIY\fP: got event on nonexisting port -Some status changed on a port that is physically present but is -unusable due to misconfiguration. -.It dgb\fIX\fP: port \fIY\fP: event \fIN\fP mstat \fIM\fP lstat \fIK\fP -The driver got a strange event from card. -Probably this means that you have a -newer card with an extended list of events or some other hardware problem. -.It dgb\fIX\fP: port \fIY\fP: overrun -Input buffer has filled up. -Problems in polling logic of driver. -.It dgb\fIX\fP: port \fIY\fP: FEP command on disabled port -Internal problems in driver. -.It dgb\fIX\fP: port \fIY\fP: timeout on FEP command -Problems in hardware. -.El -.Sh SEE ALSO -.Xr stty 1 , -.Xr termios 4 , -.Xr tty 4 , -.Xr comcontrol 8 -.\" XXX add next line to many other drivers. -.Sh HISTORY -The -.Nm -driver is derived from the -.Xr sio 4 -driver and the DigiBoard driver from -.Tn Linux -and is -.Ud -.Sh BUGS -The implementation of sending -.Dv BREAK -is broken. -.Dv BREAK -of fixed length of 1/4 s -is sent anyway. -.Pp -There was a bug in implementation of -.Xr select 2 . -It is fixed now but not widely tested yet. -.Pp -There is no ditty command. -Most of its functions (alternate pinout, -speed up to 115200 baud, etc.) are implemented in the driver itself. -Some -other functions are missing. diff --git a/sys/conf/files b/sys/conf/files index 56f1f94c029..1e6e19abc71 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1349,16 +1349,6 @@ dev/dcons/dcons.c optional dcons dev/dcons/dcons_crom.c optional dcons_crom dev/dcons/dcons_os.c optional dcons dev/de/if_de.c optional de pci -dev/digi/CX.c optional digi_CX -dev/digi/CX_PCI.c optional digi_CX_PCI -dev/digi/EPCX.c optional digi_EPCX -dev/digi/EPCX_PCI.c optional digi_EPCX_PCI -dev/digi/Xe.c optional digi_Xe -dev/digi/Xem.c optional digi_Xem -dev/digi/Xr.c optional digi_Xr -dev/digi/digi.c optional digi -dev/digi/digi_isa.c optional digi isa -dev/digi/digi_pci.c optional digi pci dev/dpt/dpt_eisa.c optional dpt eisa dev/dpt/dpt_pci.c optional dpt pci dev/dpt/dpt_scsi.c optional dpt diff --git a/sys/modules/digi/Makefile b/sys/modules/digi/Makefile deleted file mode 100644 index 9d9aea61da7..00000000000 --- a/sys/modules/digi/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# $FreeBSD$ - -SUBDIR= digi -SUBDIR+=digi_CX -SUBDIR+=digi_CX_PCI -SUBDIR+=digi_EPCX -SUBDIR+=digi_EPCX_PCI -SUBDIR+=digi_Xe -SUBDIR+=digi_Xem -SUBDIR+=digi_Xr - -.include diff --git a/sys/modules/digi/Makefile.inc b/sys/modules/digi/Makefile.inc deleted file mode 100644 index 265f86d1ed5..00000000000 --- a/sys/modules/digi/Makefile.inc +++ /dev/null @@ -1,3 +0,0 @@ -# $FreeBSD$ - -.include "../Makefile.inc" diff --git a/sys/modules/digi/digi/Makefile b/sys/modules/digi/digi/Makefile deleted file mode 100644 index 3f19f01b69b..00000000000 --- a/sys/modules/digi/digi/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# $FreeBSD$ - -.PATH: ${.CURDIR}/../../../dev/digi -KMOD= digi -SRCS= digi.c digi_pci.c digi_isa.c -SRCS+= digi.h digi_pci.h digireg.h digi_mod.h -SRCS+= bus_if.h pci_if.h device_if.h -SRCS+= opt_compat.h - -.if !defined(KERNBUILDDIR) -opt_compat.h: - echo "#define COMPAT_43 1" > ${.TARGET} - echo "#define COMPAT_FREEBSD6 1" >> ${.TARGET} -.endif - -.include diff --git a/sys/modules/digi/digi_CX/Makefile b/sys/modules/digi/digi_CX/Makefile deleted file mode 100644 index 2d833dec8ff..00000000000 --- a/sys/modules/digi/digi_CX/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# $FreeBSD$ - -.PATH: ${.CURDIR}/../../../dev/digi -KMOD= digi_CX -SRCS= CX.c CX.bios.h CX.fepos.h digi_mod.h - -.include diff --git a/sys/modules/digi/digi_CX_PCI/Makefile b/sys/modules/digi/digi_CX_PCI/Makefile deleted file mode 100644 index 65afb17f2d6..00000000000 --- a/sys/modules/digi/digi_CX_PCI/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# $FreeBSD$ - -.PATH: ${.CURDIR}/../../../dev/digi -KMOD= digi_CX_PCI -SRCS= CX_PCI.c CX_PCI.bios.h CX_PCI.fepos.h digi_mod.h - -.include diff --git a/sys/modules/digi/digi_EPCX/Makefile b/sys/modules/digi/digi_EPCX/Makefile deleted file mode 100644 index e9d95b01d14..00000000000 --- a/sys/modules/digi/digi_EPCX/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# $FreeBSD$ - -.PATH: ${.CURDIR}/../../../dev/digi -KMOD= digi_EPCX -SRCS= EPCX.c EPCX.bios.h EPCX.fepos.h digi_mod.h - -.include diff --git a/sys/modules/digi/digi_EPCX_PCI/Makefile b/sys/modules/digi/digi_EPCX_PCI/Makefile deleted file mode 100644 index d1119cc0bb5..00000000000 --- a/sys/modules/digi/digi_EPCX_PCI/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# $FreeBSD$ - -.PATH: ${.CURDIR}/../../../dev/digi -KMOD= digi_EPCX_PCI -SRCS= EPCX_PCI.c EPCX_PCI.bios.h EPCX_PCI.fepos.h digi_mod.h - -.include diff --git a/sys/modules/digi/digi_Xe/Makefile b/sys/modules/digi/digi_Xe/Makefile deleted file mode 100644 index 4c8f9e53c62..00000000000 --- a/sys/modules/digi/digi_Xe/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# $FreeBSD$ - -.PATH: ${.CURDIR}/../../../dev/digi -KMOD= digi_Xe -SRCS= Xe.c Xe.bios.h Xe.fepos.h digi_mod.h - -.include diff --git a/sys/modules/digi/digi_Xem/Makefile b/sys/modules/digi/digi_Xem/Makefile deleted file mode 100644 index 0a49ddd07f7..00000000000 --- a/sys/modules/digi/digi_Xem/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# $FreeBSD$ - -.PATH: ${.CURDIR}/../../../dev/digi -KMOD= digi_Xem -SRCS= Xem.c Xem.bios.h Xem.fepos.h digi_mod.h - -.include diff --git a/sys/modules/digi/digi_Xr/Makefile b/sys/modules/digi/digi_Xr/Makefile deleted file mode 100644 index 786ce79a1c6..00000000000 --- a/sys/modules/digi/digi_Xr/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# $FreeBSD$ - -.PATH: ${.CURDIR}/../../../dev/digi -KMOD= digi_Xr -SRCS= Xr.c Xr.bios.h Xr.fepos.h digi_mod.h - -.include From 7cba15b16eb22b565bac8fe126cbedf2af79a69c Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Thu, 1 Sep 2016 20:43:01 +0000 Subject: [PATCH 07/41] cxgbe/cxgbei: Retire all DDP related code from cxgbei and switch to routines available in t4_tom to manage the iSCSI DDP page pod region. This adds the ability to use multiple DDP page sizes to the iSCSI driver, among other improvements. Sponsored by: Chelsio Communications --- sys/dev/cxgbe/cxgbei/cxgbei.c | 484 ++----------------------- sys/dev/cxgbe/cxgbei/cxgbei.h | 57 +-- sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c | 417 --------------------- sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h | 217 ----------- sys/dev/cxgbe/cxgbei/icl_cxgbei.c | 248 +++++++++++-- sys/dev/cxgbe/offload.h | 1 - sys/dev/cxgbe/t4_main.c | 10 - sys/dev/cxgbe/t4_sge.c | 13 +- sys/modules/cxgbe/cxgbei/Makefile | 1 - 9 files changed, 257 insertions(+), 1191 deletions(-) delete mode 100644 sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c delete mode 100644 sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.c b/sys/dev/cxgbe/cxgbei/cxgbei.c index a246ff80dea..044056b0bf6 100644 --- a/sys/dev/cxgbe/cxgbei/cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/cxgbei.c @@ -90,7 +90,6 @@ __FBSDID("$FreeBSD$"); #include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */ #include "tom/t4_tom.h" #include "cxgbei.h" -#include "cxgbei_ulp2_ddp.h" static int worker_thread_count; static struct cxgbei_worker_thread_softc *cwt_softc; @@ -101,376 +100,6 @@ struct icl_pdu *icl_cxgbei_new_pdu(int); void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *); void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *); -/* - * Direct Data Placement - - * Directly place the iSCSI Data-In or Data-Out PDU's payload into pre-posted - * final destination host-memory buffers based on the Initiator Task Tag (ITT) - * in Data-In or Target Task Tag (TTT) in Data-Out PDUs. - * The host memory address is programmed into h/w in the format of pagepod - * entries. - * The location of the pagepod entry is encoded into ddp tag which is used as - * the base for ITT/TTT. - */ - -/* - * functions to program the pagepod in h/w - */ -static void inline -ppod_set(struct pagepod *ppod, - struct cxgbei_ulp2_pagepod_hdr *hdr, - struct cxgbei_ulp2_gather_list *gl, - unsigned int pidx) -{ - int i; - - memcpy(ppod, hdr, sizeof(*hdr)); - - for (i = 0; i < (PPOD_PAGES + 1); i++, pidx++) { - ppod->addr[i] = pidx < gl->nelem ? - cpu_to_be64(gl->dma_sg[pidx].phys_addr) : 0ULL; - } -} - -static void inline -ppod_clear(struct pagepod *ppod) -{ - memset(ppod, 0, sizeof(*ppod)); -} - -static inline void -ulp_mem_io_set_hdr(struct adapter *sc, int tid, struct ulp_mem_io *req, - unsigned int wr_len, unsigned int dlen, - unsigned int pm_addr) -{ - struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1); - - INIT_ULPTX_WR(req, wr_len, 0, 0); - req->cmd = cpu_to_be32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) | - V_ULP_MEMIO_ORDER(is_t4(sc)) | - V_T5_ULP_MEMIO_IMM(is_t5(sc))); - req->dlen = htonl(V_ULP_MEMIO_DATA_LEN(dlen >> 5)); - req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16) - | V_FW_WR_FLOWID(tid)); - req->lock_addr = htonl(V_ULP_MEMIO_ADDR(pm_addr >> 5)); - - idata->cmd_more = htonl(V_ULPTX_CMD(ULP_TX_SC_IMM)); - idata->len = htonl(dlen); -} - -#define ULPMEM_IDATA_MAX_NPPODS 1 /* 256/PPOD_SIZE */ -#define PCIE_MEMWIN_MAX_NPPODS 16 /* 1024/PPOD_SIZE */ - -static int -ppod_write_idata(struct cxgbei_data *ci, - struct cxgbei_ulp2_pagepod_hdr *hdr, - unsigned int idx, unsigned int npods, - struct cxgbei_ulp2_gather_list *gl, - unsigned int gl_pidx, struct toepcb *toep) -{ - u_int dlen = PPOD_SIZE * npods; - u_int pm_addr = idx * PPOD_SIZE + ci->llimit; - u_int wr_len = roundup(sizeof(struct ulp_mem_io) + - sizeof(struct ulptx_idata) + dlen, 16); - struct ulp_mem_io *req; - struct ulptx_idata *idata; - struct pagepod *ppod; - u_int i; - struct wrqe *wr; - struct adapter *sc = toep->vi->pi->adapter; - - wr = alloc_wrqe(wr_len, toep->ctrlq); - if (wr == NULL) { - CXGBE_UNIMPLEMENTED("ppod_write_idata: alloc_wrqe failure"); - return (ENOMEM); - } - - req = wrtod(wr); - memset(req, 0, wr_len); - ulp_mem_io_set_hdr(sc, toep->tid, req, wr_len, dlen, pm_addr); - idata = (struct ulptx_idata *)(req + 1); - - ppod = (struct pagepod *)(idata + 1); - for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES) { - if (!hdr) /* clear the pagepod */ - ppod_clear(ppod); - else /* set the pagepod */ - ppod_set(ppod, hdr, gl, gl_pidx); - } - - t4_wrq_tx(sc, wr); - return 0; -} - -int -t4_ddp_set_map(struct cxgbei_data *ci, void *iccp, - struct cxgbei_ulp2_pagepod_hdr *hdr, u_int idx, u_int npods, - struct cxgbei_ulp2_gather_list *gl, int reply) -{ - struct icl_cxgbei_conn *icc = (struct icl_cxgbei_conn *)iccp; - struct toepcb *toep = icc->toep; - int err; - unsigned int pidx = 0, w_npods = 0, cnt; - - /* - * on T4, if we use a mix of IMMD and DSGL with ULP_MEM_WRITE, - * the order would not be guaranteed, so we will stick with IMMD - */ - gl->tid = toep->tid; - gl->port_id = toep->vi->pi->port_id; - gl->egress_dev = (void *)toep->vi->ifp; - - /* send via immediate data */ - for (; w_npods < npods; idx += cnt, w_npods += cnt, - pidx += PPOD_PAGES) { - cnt = npods - w_npods; - if (cnt > ULPMEM_IDATA_MAX_NPPODS) - cnt = ULPMEM_IDATA_MAX_NPPODS; - err = ppod_write_idata(ci, hdr, idx, cnt, gl, pidx, toep); - if (err) { - printf("%s: ppod_write_idata failed\n", __func__); - break; - } - } - return err; -} - -void -t4_ddp_clear_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl, - u_int tag, u_int idx, u_int npods, struct icl_cxgbei_conn *icc) -{ - struct toepcb *toep = icc->toep; - int err = -1; - u_int pidx = 0; - u_int w_npods = 0; - u_int cnt; - - for (; w_npods < npods; idx += cnt, w_npods += cnt, - pidx += PPOD_PAGES) { - cnt = npods - w_npods; - if (cnt > ULPMEM_IDATA_MAX_NPPODS) - cnt = ULPMEM_IDATA_MAX_NPPODS; - err = ppod_write_idata(ci, NULL, idx, cnt, gl, 0, toep); - if (err) - break; - } -} - -static int -cxgbei_map_sg(struct cxgbei_sgl *sgl, struct ccb_scsiio *csio) -{ - unsigned int data_len = csio->dxfer_len; - unsigned int sgoffset = (uint64_t)csio->data_ptr & PAGE_MASK; - unsigned int nsge; - unsigned char *sgaddr = csio->data_ptr; - unsigned int len = 0; - - nsge = (csio->dxfer_len + sgoffset + PAGE_SIZE - 1) >> PAGE_SHIFT; - sgl->sg_addr = sgaddr; - sgl->sg_offset = sgoffset; - if (data_len < (PAGE_SIZE - sgoffset)) - len = data_len; - else - len = PAGE_SIZE - sgoffset; - - sgl->sg_length = len; - - data_len -= len; - sgaddr += len; - sgl = sgl+1; - - while (data_len > 0) { - sgl->sg_addr = sgaddr; - len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE; - sgl->sg_length = len; - sgaddr += len; - data_len -= len; - sgl = sgl + 1; - } - - return nsge; -} - -static int -cxgbei_map_sg_tgt(struct cxgbei_sgl *sgl, union ctl_io *io) -{ - unsigned int data_len, sgoffset, nsge; - unsigned char *sgaddr; - unsigned int len = 0, index = 0, ctl_sg_count, i; - struct ctl_sg_entry ctl_sg_entry, *ctl_sglist; - - if (io->scsiio.kern_sg_entries > 0) { - ctl_sglist = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr; - ctl_sg_count = io->scsiio.kern_sg_entries; - } else { - ctl_sglist = &ctl_sg_entry; - ctl_sglist->addr = io->scsiio.kern_data_ptr; - ctl_sglist->len = io->scsiio.kern_data_len; - ctl_sg_count = 1; - } - - sgaddr = sgl->sg_addr = ctl_sglist[index].addr; - sgoffset = sgl->sg_offset = (uint64_t)sgl->sg_addr & PAGE_MASK; - data_len = ctl_sglist[index].len; - - if (data_len < (PAGE_SIZE - sgoffset)) - len = data_len; - else - len = PAGE_SIZE - sgoffset; - - sgl->sg_length = len; - - data_len -= len; - sgaddr += len; - sgl = sgl+1; - - len = 0; - for (i = 0; i< ctl_sg_count; i++) - len += ctl_sglist[i].len; - nsge = (len + sgoffset + PAGE_SIZE -1) >> PAGE_SHIFT; - while (data_len > 0) { - sgl->sg_addr = sgaddr; - len = (data_len < PAGE_SIZE)? data_len: PAGE_SIZE; - sgl->sg_length = len; - sgaddr += len; - data_len -= len; - sgl = sgl + 1; - if (data_len == 0) { - if (index == ctl_sg_count - 1) - break; - index++; - sgaddr = ctl_sglist[index].addr; - data_len = ctl_sglist[index].len; - } - } - - return nsge; -} - -static int -t4_sk_ddp_tag_reserve(struct cxgbei_data *ci, struct icl_cxgbei_conn *icc, - u_int xferlen, struct cxgbei_sgl *sgl, u_int sgcnt, u_int *ddp_tag) -{ - struct cxgbei_ulp2_gather_list *gl; - int err = -EINVAL; - struct toepcb *toep = icc->toep; - - gl = cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(xferlen, sgl, sgcnt, ci, 0); - if (gl) { - err = cxgbei_ulp2_ddp_tag_reserve(ci, icc, toep->tid, - &ci->tag_format, ddp_tag, gl, 0, 0); - if (err) { - cxgbei_ulp2_ddp_release_gl(ci, gl); - } - } - - return err; -} - -static unsigned int -cxgbei_task_reserve_itt(struct icl_conn *ic, void **prv, - struct ccb_scsiio *scmd, unsigned int *itt) -{ - struct icl_cxgbei_conn *icc = ic_to_icc(ic); - int xferlen = scmd->dxfer_len; - struct cxgbei_task_data *tdata = NULL; - struct cxgbei_sgl *sge = NULL; - struct toepcb *toep = icc->toep; - struct adapter *sc = td_adapter(toep->td); - struct cxgbei_data *ci = sc->iscsi_ulp_softc; - int err = -1; - - MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); - - tdata = (struct cxgbei_task_data *)*prv; - if (xferlen == 0 || tdata == NULL) - goto out; - if (xferlen < DDP_THRESHOLD) - goto out; - - if ((scmd->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) { - tdata->nsge = cxgbei_map_sg(tdata->sgl, scmd); - if (tdata->nsge == 0) { - CTR1(KTR_CXGBE, "%s: map_sg failed", __func__); - return 0; - } - sge = tdata->sgl; - - tdata->sc_ddp_tag = *itt; - - CTR3(KTR_CXGBE, "%s: *itt:0x%x sc_ddp_tag:0x%x", - __func__, *itt, tdata->sc_ddp_tag); - if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, - tdata->sc_ddp_tag)) { - err = t4_sk_ddp_tag_reserve(ci, icc, scmd->dxfer_len, - sge, tdata->nsge, &tdata->sc_ddp_tag); - } else { - CTR3(KTR_CXGBE, - "%s: itt:0x%x sc_ddp_tag:0x%x not usable", - __func__, *itt, tdata->sc_ddp_tag); - } - } -out: - if (err < 0) - tdata->sc_ddp_tag = - cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *itt); - - return tdata->sc_ddp_tag; -} - -static unsigned int -cxgbei_task_reserve_ttt(struct icl_conn *ic, void **prv, union ctl_io *io, - unsigned int *ttt) -{ - struct icl_cxgbei_conn *icc = ic_to_icc(ic); - struct toepcb *toep = icc->toep; - struct adapter *sc = td_adapter(toep->td); - struct cxgbei_data *ci = sc->iscsi_ulp_softc; - struct cxgbei_task_data *tdata = NULL; - int xferlen, err = -1; - struct cxgbei_sgl *sge = NULL; - - MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); - - xferlen = (io->scsiio.kern_data_len - io->scsiio.ext_data_filled); - tdata = (struct cxgbei_task_data *)*prv; - if ((xferlen == 0) || (tdata == NULL)) - goto out; - if (xferlen < DDP_THRESHOLD) - goto out; - tdata->nsge = cxgbei_map_sg_tgt(tdata->sgl, io); - if (tdata->nsge == 0) { - CTR1(KTR_CXGBE, "%s: map_sg failed", __func__); - return 0; - } - sge = tdata->sgl; - - tdata->sc_ddp_tag = *ttt; - if (cxgbei_ulp2_sw_tag_usable(&ci->tag_format, tdata->sc_ddp_tag)) { - err = t4_sk_ddp_tag_reserve(ci, icc, xferlen, sge, - tdata->nsge, &tdata->sc_ddp_tag); - } else { - CTR2(KTR_CXGBE, "%s: sc_ddp_tag:0x%x not usable", - __func__, tdata->sc_ddp_tag); - } -out: - if (err < 0) - tdata->sc_ddp_tag = - cxgbei_ulp2_set_non_ddp_tag(&ci->tag_format, *ttt); - return tdata->sc_ddp_tag; -} - -static int -t4_sk_ddp_tag_release(struct icl_cxgbei_conn *icc, unsigned int ddp_tag) -{ - struct toepcb *toep = icc->toep; - struct adapter *sc = td_adapter(toep->td); - struct cxgbei_data *ci = sc->iscsi_ulp_softc; - - cxgbei_ulp2_ddp_tag_release(ci, ddp_tag, icc); - - return (0); -} - static void read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len, uint32_t *max_rx_pdu_len) @@ -504,58 +133,43 @@ read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len, static int cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) { - int nppods, bits, rc; - static const u_int pgsz_order[] = {0, 1, 2, 3}; + struct ppod_region *pr; + uint32_t r; + int rc; MPASS(sc->vres.iscsi.size > 0); + MPASS(ci != NULL); - ci->llimit = sc->vres.iscsi.start; - ci->ulimit = sc->vres.iscsi.start + sc->vres.iscsi.size - 1; read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len); - nppods = sc->vres.iscsi.size >> IPPOD_SIZE_SHIFT; - if (nppods <= 1024) - return (ENXIO); + ci->ddp_threshold = 2048; + pr = &ci->pr; - bits = fls(nppods); - if (bits > IPPOD_IDX_MAX_SIZE) - bits = IPPOD_IDX_MAX_SIZE; - nppods = (1 << (bits - 1)) - 1; - - rc = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR, - BUS_SPACE_MAXADDR, NULL, NULL, UINT32_MAX , 8, BUS_SPACE_MAXSIZE, - BUS_DMA_ALLOCNOW, NULL, NULL, &ci->ulp_ddp_tag); + r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ); + rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods"); if (rc != 0) { - device_printf(sc->dev, "%s: failed to create DMA tag: %u.\n", + device_printf(sc->dev, + "%s: failed to initialize the iSCSI page pod region: %u.\n", __func__, rc); return (rc); } - ci->colors = malloc(nppods * sizeof(char), M_CXGBE, M_NOWAIT | M_ZERO); - ci->gl_map = malloc(nppods * sizeof(struct cxgbei_ulp2_gather_list *), - M_CXGBE, M_NOWAIT | M_ZERO); - if (ci->colors == NULL || ci->gl_map == NULL) { - bus_dma_tag_destroy(ci->ulp_ddp_tag); - free(ci->colors, M_CXGBE); - free(ci->gl_map, M_CXGBE); - return (ENOMEM); + r = t4_read_reg(sc, A_ULP_RX_ISCSI_TAGMASK); + r &= V_ISCSITAGMASK(M_ISCSITAGMASK); + if (r != pr->pr_tag_mask) { + /* + * Recent firmwares are supposed to set up the the iSCSI tagmask + * but we'll do it ourselves it the computed value doesn't match + * what's in the register. + */ + device_printf(sc->dev, + "tagmask 0x%08x does not match computed mask 0x%08x.\n", r, + pr->pr_tag_mask); + t4_set_reg_field(sc, A_ULP_RX_ISCSI_TAGMASK, + V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask); } - mtx_init(&ci->map_lock, "ddp lock", NULL, MTX_DEF | MTX_DUPOK); - ci->nppods = nppods; - ci->idx_last = nppods; - ci->idx_bits = bits; - ci->idx_mask = (1 << bits) - 1; - ci->rsvd_tag_mask = (1 << (bits + IPPOD_IDX_SHIFT)) - 1; - - ci->tag_format.sw_bits = bits; - ci->tag_format.rsvd_bits = bits; - ci->tag_format.rsvd_shift = IPPOD_IDX_SHIFT; - ci->tag_format.rsvd_mask = ci->idx_mask; - - t4_iscsi_init(sc, ci->idx_mask << IPPOD_IDX_SHIFT, pgsz_order); - - return (rc); + return (0); } static int @@ -772,47 +386,6 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) return (0); } -/* initiator */ -void -cxgbei_conn_task_reserve_itt(void *conn, void **prv, - void *scmd, unsigned int *itt) -{ - unsigned int tag; - tag = cxgbei_task_reserve_itt(conn, prv, scmd, itt); - if (tag) - *itt = htonl(tag); - return; -} - -/* target */ -void -cxgbei_conn_transfer_reserve_ttt(void *conn, void **prv, - void *scmd, unsigned int *ttt) -{ - unsigned int tag; - tag = cxgbei_task_reserve_ttt(conn, prv, scmd, ttt); - if (tag) - *ttt = htonl(tag); - return; -} - -void -cxgbei_cleanup_task(void *conn, void *ofld_priv) -{ - struct icl_conn *ic = (struct icl_conn *)conn; - struct icl_cxgbei_conn *icc = ic_to_icc(ic); - struct cxgbei_task_data *tdata = ofld_priv; - struct adapter *sc = icc->sc; - struct cxgbei_data *ci = sc->iscsi_ulp_softc; - - MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); - MPASS(tdata != NULL); - - if (cxgbei_ulp2_is_ddp_tag(&ci->tag_format, tdata->sc_ddp_tag)) - t4_sk_ddp_tag_release(icc, tdata->sc_ddp_tag); - memset(tdata, 0, sizeof(*tdata)); -} - static int cxgbei_activate(struct adapter *sc) { @@ -834,7 +407,7 @@ cxgbei_activate(struct adapter *sc) } /* per-adapter softc for iSCSI */ - ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_NOWAIT); + ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK); if (ci == NULL) return (ENOMEM); @@ -852,12 +425,13 @@ cxgbei_activate(struct adapter *sc) static int cxgbei_deactivate(struct adapter *sc) { + struct cxgbei_data *ci = sc->iscsi_ulp_softc; ASSERT_SYNCHRONIZED_OP(sc); - if (sc->iscsi_ulp_softc != NULL) { - cxgbei_ddp_cleanup(sc->iscsi_ulp_softc); - free(sc->iscsi_ulp_softc, M_CXGBE); + if (ci != NULL) { + t4_free_ppod_region(&ci->pr); + free(ci, M_CXGBE); sc->iscsi_ulp_softc = NULL; } diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.h b/sys/dev/cxgbe/cxgbei/cxgbei.h index acd3c192940..a96499b482c 100644 --- a/sys/dev/cxgbe/cxgbei/cxgbei.h +++ b/sys/dev/cxgbe/cxgbei/cxgbei.h @@ -105,69 +105,18 @@ ip_to_icp(struct icl_pdu *ip) return (__containerof(ip, struct icl_cxgbei_pdu, ip)); } -struct cxgbei_sgl { - int sg_flag; - void *sg_addr; - void *sg_dma_addr; - size_t sg_offset; - size_t sg_length; -}; - -#define cxgbei_scsi_for_each_sg(_sgl, _sgel, _n, _i) \ - for (_i = 0, _sgel = (cxgbei_sgl*) (_sgl); _i < _n; _i++, \ - _sgel++) -#define sg_dma_addr(_sgel) _sgel->sg_dma_addr -#define sg_virt(_sgel) _sgel->sg_addr -#define sg_len(_sgel) _sgel->sg_length -#define sg_off(_sgel) _sgel->sg_offset -#define sg_next(_sgel) _sgel + 1 - -/* private data for each scsi task */ -struct cxgbei_task_data { - struct cxgbei_sgl sgl[256]; - u_int nsge; - u_int sc_ddp_tag; -}; - -struct cxgbei_ulp2_tag_format { - u_char sw_bits; - u_char rsvd_bits; - u_char rsvd_shift; - u_char filler[1]; - uint32_t rsvd_mask; -}; - struct cxgbei_data { - u_int llimit; - u_int ulimit; - u_int nppods; - u_int idx_last; - u_char idx_bits; - uint32_t idx_mask; - uint32_t rsvd_tag_mask; u_int max_tx_pdu_len; u_int max_rx_pdu_len; - struct mtx map_lock; - bus_dma_tag_t ulp_ddp_tag; - unsigned char *colors; - struct cxgbei_ulp2_gather_list **gl_map; + u_int ddp_threshold; + struct ppod_region pr; - struct cxgbei_ulp2_tag_format tag_format; }; -void cxgbei_conn_task_reserve_itt(void *, void **, void *, unsigned int *); -void cxgbei_conn_transfer_reserve_ttt(void *, void **, void *, unsigned int *); -void cxgbei_cleanup_task(void *, void *); +/* cxgbei.c */ u_int cxgbei_select_worker_thread(struct icl_cxgbei_conn *); -struct cxgbei_ulp2_pagepod_hdr; -int t4_ddp_set_map(struct cxgbei_data *, void *, - struct cxgbei_ulp2_pagepod_hdr *, u_int, u_int, - struct cxgbei_ulp2_gather_list *, int); -void t4_ddp_clear_map(struct cxgbei_data *, struct cxgbei_ulp2_gather_list *, - u_int, u_int, u_int, struct icl_cxgbei_conn *); - /* icl_cxgbei.c */ int icl_cxgbei_mod_load(void); int icl_cxgbei_mod_unload(void); diff --git a/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c b/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c deleted file mode 100644 index fd7cd4a74e1..00000000000 --- a/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.c +++ /dev/null @@ -1,417 +0,0 @@ -/*- - * Copyright (c) 2012 Chelsio Communications, Inc. - * All rights reserved. - * - * Chelsio T5xx iSCSI driver - * cxgbei_ulp2_ddp.c: Chelsio iSCSI DDP Manager. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include "opt_inet.h" -#include "opt_inet6.h" - -#ifdef TCP_OFFLOAD -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include "common/common.h" -#include "common/t4_msg.h" -#include "common/t4_regs.h" /* for PCIE_MEM_ACCESS */ -#include "tom/t4_tom.h" -#include "cxgbei.h" -#include "cxgbei_ulp2_ddp.h" - -/* - * Map a single buffer address. - */ -static void -ulp2_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error) -{ - bus_addr_t *ba = arg; - if (error) - return; - - KASSERT(nseg == 1, ("%s: %d segments returned!", __func__, nseg)); - - *ba = segs->ds_addr; -} - -/* - * iSCSI Direct Data Placement - * - * T4/5 ulp2 h/w can directly place the iSCSI Data-In or Data-Out PDU's - * payload into pre-posted final destination host-memory buffers based on the - * Initiator Task Tag (ITT) in Data-In or Target Task Tag (TTT) in Data-Out - * PDUs. - * - * The host memory address is programmed into h/w in the format of pagepod - * entries. - * The location of the pagepod entry is encoded into ddp tag which is used or - * is the base for ITT/TTT. - */ - - -static inline int -ddp_find_unused_entries(struct cxgbei_data *ci, u_int start, u_int max, - u_int count, u_int *idx, struct cxgbei_ulp2_gather_list *gl) -{ - unsigned int i, j, k; - - /* not enough entries */ - if (max - start < count) - return (EBUSY); - - max -= count; - mtx_lock(&ci->map_lock); - for (i = start; i < max;) { - for (j = 0, k = i; j < count; j++, k++) { - if (ci->gl_map[k]) - break; - } - if (j == count) { - for (j = 0, k = i; j < count; j++, k++) - ci->gl_map[k] = gl; - mtx_unlock(&ci->map_lock); - *idx = i; - return (0); - } - i += j + 1; - } - mtx_unlock(&ci->map_lock); - return (EBUSY); -} - -static inline void -ddp_unmark_entries(struct cxgbei_data *ci, u_int start, u_int count) -{ - - mtx_lock(&ci->map_lock); - memset(&ci->gl_map[start], 0, - count * sizeof(struct cxgbei_ulp2_gather_list *)); - mtx_unlock(&ci->map_lock); -} - -static inline void -ddp_gl_unmap(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl) -{ - int i; - - if (!gl->pages[0]) - return; - - for (i = 0; i < gl->nelem; i++) { - bus_dmamap_unload(ci->ulp_ddp_tag, gl->dma_sg[i].bus_map); - bus_dmamap_destroy(ci->ulp_ddp_tag, gl->dma_sg[i].bus_map); - } -} - -static inline int -ddp_gl_map(struct cxgbei_data *ci, struct cxgbei_ulp2_gather_list *gl) -{ - int i, rc; - bus_addr_t pa; - - MPASS(ci != NULL); - - mtx_lock(&ci->map_lock); - for (i = 0; i < gl->nelem; i++) { - rc = bus_dmamap_create(ci->ulp_ddp_tag, 0, - &gl->dma_sg[i].bus_map); - if (rc != 0) - goto unmap; - rc = bus_dmamap_load(ci->ulp_ddp_tag, gl->dma_sg[i].bus_map, - gl->pages[i], PAGE_SIZE, ulp2_dma_map_addr, - &pa, BUS_DMA_NOWAIT); - if (rc != 0) - goto unmap; - gl->dma_sg[i].phys_addr = pa; - } - mtx_unlock(&ci->map_lock); - - return (0); - -unmap: - if (i) { - u_int nelem = gl->nelem; - - gl->nelem = i; - ddp_gl_unmap(ci, gl); - gl->nelem = nelem; - } - return (ENOMEM); -} - -/** - * cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec - build ddp page buffer list - * @xferlen: total buffer length - * @sgl: page buffer scatter-gather list (struct cxgbei_sgl) - * @sgcnt: # of page buffers - * @gfp: allocation mode - * - * construct a ddp page buffer list from the scsi scattergather list. - * coalesce buffers as much as possible, and obtain dma addresses for - * each page. - * - * Return the cxgbei_ulp2_gather_list constructed from the page buffers if the - * memory can be used for ddp. Return NULL otherwise. - */ -struct cxgbei_ulp2_gather_list * -cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(u_int xferlen, struct cxgbei_sgl *sgl, - u_int sgcnt, struct cxgbei_data *ci, int gfp) -{ - struct cxgbei_ulp2_gather_list *gl; - struct cxgbei_sgl *sg = sgl; - void *sgpage = (void *)((u64)sg->sg_addr & (~PAGE_MASK)); - unsigned int sglen = sg->sg_length; - unsigned int sgoffset = (u64)sg->sg_addr & PAGE_MASK; - unsigned int npages = (xferlen + sgoffset + PAGE_SIZE - 1) >> - PAGE_SHIFT; - int i = 1, j = 0; - - if (xferlen <= DDP_THRESHOLD) { - CTR2(KTR_CXGBE, "xfer %u < threshold %u, no ddp.", - xferlen, DDP_THRESHOLD); - return NULL; - } - - gl = malloc(sizeof(struct cxgbei_ulp2_gather_list) + - npages * (sizeof(struct dma_segments) + sizeof(void *)), - M_DEVBUF, M_NOWAIT | M_ZERO); - if (gl == NULL) - return (NULL); - - gl->pages = (void **)&gl->dma_sg[npages]; - gl->length = xferlen; - gl->offset = sgoffset; - gl->pages[0] = sgpage; - CTR6(KTR_CXGBE, - "%s: xferlen:0x%x len:0x%x off:0x%x sg_addr:%p npages:%d", - __func__, xferlen, gl->length, gl->offset, sg->sg_addr, npages); - - for (i = 1, sg = sg_next(sg); i < sgcnt; i++, sg = sg_next(sg)) { - void *page = sg->sg_addr; - - if (sgpage == page && sg->sg_offset == sgoffset + sglen) - sglen += sg->sg_length; - else { - /* make sure the sgl is fit for ddp: - * each has the same page size, and - * all of the middle pages are used completely - */ - if ((j && sgoffset) || - ((i != sgcnt - 1) && - ((sglen + sgoffset) & ~CXGBEI_PAGE_MASK))){ - goto error_out; - } - - j++; - if (j == gl->nelem || sg->sg_offset) { - goto error_out; - } - gl->pages[j] = page; - sglen = sg->sg_length; - sgoffset = sg->sg_offset; - sgpage = page; - } - } - gl->nelem = ++j; - - if (ddp_gl_map(ci, gl) < 0) - goto error_out; - - return gl; - -error_out: - free(gl, M_DEVBUF); - return NULL; -} - -/** - * cxgbei_ulp2_ddp_release_gl - release a page buffer list - * @gl: a ddp page buffer list - * @pdev: pci_dev used for pci_unmap - * free a ddp page buffer list resulted from cxgbei_ulp2_ddp_make_gl(). - */ -void -cxgbei_ulp2_ddp_release_gl(struct cxgbei_data *ci, - struct cxgbei_ulp2_gather_list *gl) -{ - - ddp_gl_unmap(ci, gl); - free(gl, M_DEVBUF); -} - -/** - * cxgbei_ulp2_ddp_tag_reserve - set up ddp for a data transfer - * @ci: adapter's ddp info - * @tid: connection id - * @tformat: tag format - * @tagp: contains s/w tag initially, will be updated with ddp/hw tag - * @gl: the page momory list - * @gfp: allocation mode - * - * ddp setup for a given page buffer list and construct the ddp tag. - * return 0 if success, < 0 otherwise. - */ -int -cxgbei_ulp2_ddp_tag_reserve(struct cxgbei_data *ci, void *icc, u_int tid, - struct cxgbei_ulp2_tag_format *tformat, u32 *tagp, - struct cxgbei_ulp2_gather_list *gl, int gfp, int reply) -{ - struct cxgbei_ulp2_pagepod_hdr hdr; - u_int npods, idx; - int rc; - u32 sw_tag = *tagp; - u32 tag; - - MPASS(ci != NULL); - - if (!gl || !gl->nelem || gl->length < DDP_THRESHOLD) - return (EINVAL); - - npods = (gl->nelem + IPPOD_PAGES_MAX - 1) >> IPPOD_PAGES_SHIFT; - - if (ci->idx_last == ci->nppods) - rc = ddp_find_unused_entries(ci, 0, ci->nppods, npods, &idx, - gl); - else { - rc = ddp_find_unused_entries(ci, ci->idx_last + 1, - ci->nppods, npods, &idx, gl); - if (rc && ci->idx_last >= npods) { - rc = ddp_find_unused_entries(ci, 0, - min(ci->idx_last + npods, ci->nppods), - npods, &idx, gl); - } - } - if (rc) { - CTR3(KTR_CXGBE, "xferlen %u, gl %u, npods %u NO DDP.", - gl->length, gl->nelem, npods); - return (rc); - } - - tag = cxgbei_ulp2_ddp_tag_base(idx, ci->colors, tformat, sw_tag); - CTR4(KTR_CXGBE, "%s: sw_tag:0x%x idx:0x%x tag:0x%x", - __func__, sw_tag, idx, tag); - - hdr.rsvd = 0; - hdr.vld_tid = htonl(F_IPPOD_VALID | V_IPPOD_TID(tid)); - hdr.pgsz_tag_clr = htonl(tag & ci->rsvd_tag_mask); - hdr.maxoffset = htonl(gl->length); - hdr.pgoffset = htonl(gl->offset); - - rc = t4_ddp_set_map(ci, icc, &hdr, idx, npods, gl, reply); - if (rc < 0) - goto unmark_entries; - - ci->idx_last = idx; - *tagp = tag; - return (0); - -unmark_entries: - ddp_unmark_entries(ci, idx, npods); - return (rc); -} - -/** - * cxgbei_ulp2_ddp_tag_release - release a ddp tag - * @ci: adapter's ddp info - * @tag: ddp tag - * ddp cleanup for a given ddp tag and release all the resources held - */ -void -cxgbei_ulp2_ddp_tag_release(struct cxgbei_data *ci, uint32_t tag, - struct icl_cxgbei_conn *icc) -{ - uint32_t idx; - - MPASS(ci != NULL); - MPASS(icc != NULL); - - idx = (tag >> IPPOD_IDX_SHIFT) & ci->idx_mask; - CTR3(KTR_CXGBE, "tag:0x%x idx:0x%x nppods:0x%x", - tag, idx, ci->nppods); - if (idx < ci->nppods) { - struct cxgbei_ulp2_gather_list *gl = ci->gl_map[idx]; - unsigned int npods; - - if (!gl || !gl->nelem) { - CTR4(KTR_CXGBE, - "release 0x%x, idx 0x%x, gl 0x%p, %u.", - tag, idx, gl, gl ? gl->nelem : 0); - return; - } - npods = (gl->nelem + IPPOD_PAGES_MAX - 1) >> IPPOD_PAGES_SHIFT; - CTR3(KTR_CXGBE, "ddp tag 0x%x, release idx 0x%x, npods %u.", - tag, idx, npods); - t4_ddp_clear_map(ci, gl, tag, idx, npods, icc); - ddp_unmark_entries(ci, idx, npods); - cxgbei_ulp2_ddp_release_gl(ci, gl); - } else - CTR3(KTR_CXGBE, "ddp tag 0x%x, idx 0x%x > max 0x%x.", - tag, idx, ci->nppods); -} - -/** - * cxgbei_ddp_cleanup - release the adapter's ddp resources - */ -void -cxgbei_ddp_cleanup(struct cxgbei_data *ci) -{ - int i = 0; - - while (i < ci->nppods) { - struct cxgbei_ulp2_gather_list *gl = ci->gl_map[i]; - if (gl) { - int npods = (gl->nelem + IPPOD_PAGES_MAX - 1) - >> IPPOD_PAGES_SHIFT; - free(gl, M_DEVBUF); - i += npods; - } else - i++; - } - free(ci->colors, M_CXGBE); - free(ci->gl_map, M_CXGBE); -} -#endif diff --git a/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h b/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h deleted file mode 100644 index f069f09aa47..00000000000 --- a/sys/dev/cxgbe/cxgbei/cxgbei_ulp2_ddp.h +++ /dev/null @@ -1,217 +0,0 @@ -/*- - * Copyright (c) 2012 Chelsio Communications, Inc. - * All rights reserved. - * - * Chelsio T5xx iSCSI driver - * cxgbei_ulp2_ddp.c: Chelsio iSCSI DDP Manager. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - * - */ - -#ifndef __CXGBEI_ULP2_DDP_H__ -#define __CXGBEI_ULP2_DDP_H__ - -#define CXGBEI_PAGE_MASK (~(PAGE_SIZE-1)) -#define DDP_THRESHOLD 2048 - -/* - * cxgbei ddp tag are 32 bits, it consists of reserved bits used by h/w and - * non-reserved bits that can be used by the iscsi s/w. - * The reserved bits are identified by the rsvd_bits and rsvd_shift fields - * in struct cxgbei_ulp2_tag_format. - * - * The upper most reserved bit can be used to check if a tag is ddp tag or not: - * if the bit is 0, the tag is a valid ddp tag - */ - -/* - * cxgbei_ulp2_is_ddp_tag - check if a given tag is a hw/ddp tag - * @tformat: tag format information - * @tag: tag to be checked - * - * return true if the tag is a ddp tag, false otherwise. - */ -static inline int -cxgbei_ulp2_is_ddp_tag(struct cxgbei_ulp2_tag_format *tformat, uint32_t tag) -{ - - return (!(tag & (1 << (tformat->rsvd_bits + tformat->rsvd_shift - 1)))); -} - -/* - * cxgbei_ulp2_sw_tag_usable - check if s/w tag has enough bits left for hw bits - * @tformat: tag format information - * @sw_tag: s/w tag to be checked - * - * return true if the tag can be used for hw ddp tag, false otherwise. - */ -static inline int -cxgbei_ulp2_sw_tag_usable(struct cxgbei_ulp2_tag_format *tformat, - uint32_t sw_tag) -{ - - return (1); /* XXXNP: huh? */ - - sw_tag >>= (32 - tformat->rsvd_bits + tformat->rsvd_shift); - return !sw_tag; -} - -/* - * cxgbei_ulp2_set_non_ddp_tag - mark a given s/w tag as an invalid ddp tag - * @tformat: tag format information - * @sw_tag: s/w tag to be checked - * - * insert 1 at the upper most reserved bit to mark it as an invalid ddp tag. - */ -static inline uint32_t -cxgbei_ulp2_set_non_ddp_tag(struct cxgbei_ulp2_tag_format *tformat, - uint32_t sw_tag) -{ - uint32_t rsvd_bits = tformat->rsvd_bits + tformat->rsvd_shift; - if (sw_tag) { - u32 v1 = sw_tag & ((1 << (rsvd_bits - 1)) - 1); - u32 v2 = (sw_tag >> (rsvd_bits - 1)) << rsvd_bits; - return v2 | (1 << (rsvd_bits - 1)) | v1; - } - - return sw_tag | (1 << (rsvd_bits - 1)) ; -} - -struct dma_segments { - bus_dmamap_t bus_map; - bus_addr_t phys_addr; -}; -/* - * struct cxgbei_ulp2_gather_list - cxgbei direct data placement memory - * - * @tag: ddp tag - * @length: total data buffer length - * @offset: initial offset to the 1st page - * @nelem: # of pages - * @pages: page pointers - * @phys_addr: physical address - */ -struct cxgbei_ulp2_gather_list { - uint32_t tag; - uint32_t tid; - uint32_t port_id; - void *egress_dev; - unsigned int length; - unsigned int offset; - unsigned int nelem; - bus_size_t mapsize; - bus_dmamap_t bus_map; - bus_dma_segment_t *segments; - void **pages; - struct dma_segments dma_sg[0]; -}; - -#define IPPOD_SIZE sizeof(struct cxgbei_ulp2_pagepod) /* 64 */ -#define IPPOD_SIZE_SHIFT 6 - -#define IPPOD_COLOR_SHIFT 0 -#define IPPOD_COLOR_SIZE 6 -#define IPPOD_COLOR_MASK ((1 << IPPOD_COLOR_SIZE) - 1) - -#define IPPOD_IDX_SHIFT IPPOD_COLOR_SIZE -#define IPPOD_IDX_MAX_SIZE 24 - -#define S_IPPOD_TID 0 -#define M_IPPOD_TID 0xFFFFFF -#define V_IPPOD_TID(x) ((x) << S_IPPOD_TID) - -#define S_IPPOD_VALID 24 -#define V_IPPOD_VALID(x) ((x) << S_IPPOD_VALID) -#define F_IPPOD_VALID V_IPPOD_VALID(1U) - -#define S_IPPOD_COLOR 0 -#define M_IPPOD_COLOR 0x3F -#define V_IPPOD_COLOR(x) ((x) << S_IPPOD_COLOR) - -#define S_IPPOD_TAG 6 -#define M_IPPOD_TAG 0xFFFFFF -#define V_IPPOD_TAG(x) ((x) << S_IPPOD_TAG) - -#define S_IPPOD_PGSZ 30 -#define M_IPPOD_PGSZ 0x3 -#define V_IPPOD_PGSZ(x) ((x) << S_IPPOD_PGSZ) - -static inline uint32_t -cxgbei_ulp2_ddp_tag_base(u_int idx, u_char *colors, - struct cxgbei_ulp2_tag_format *tformat, uint32_t sw_tag) -{ - if (__predict_false(++colors[idx] == 1 << IPPOD_IDX_SHIFT)) - colors[idx] = 0; - - sw_tag <<= tformat->rsvd_bits + tformat->rsvd_shift; - - return (sw_tag | idx << IPPOD_IDX_SHIFT | colors[idx]); -} - -#define ISCSI_PDU_NONPAYLOAD_LEN 312 /* bhs(48) + ahs(256) + digest(8) */ - -/* - * align pdu size to multiple of 512 for better performance - */ -#define cxgbei_align_pdu_size(n) do { n = (n) & (~511); } while (0) - -#define ULP2_MAX_PKT_SIZE 16224 -#define ULP2_MAX_PDU_PAYLOAD (ULP2_MAX_PKT_SIZE - ISCSI_PDU_NONPAYLOAD_LEN) -#define IPPOD_PAGES_MAX 4 -#define IPPOD_PAGES_SHIFT 2 /* 4 pages per pod */ - -/* - * struct pagepod_hdr, pagepod - pagepod format - */ -struct cxgbei_ulp2_pagepod_hdr { - uint32_t vld_tid; - uint32_t pgsz_tag_clr; - uint32_t maxoffset; - uint32_t pgoffset; - uint64_t rsvd; -}; - -struct cxgbei_ulp2_pagepod { - struct cxgbei_ulp2_pagepod_hdr hdr; - uint64_t addr[IPPOD_PAGES_MAX + 1]; -}; - -int cxgbei_ulp2_ddp_tag_reserve(struct cxgbei_data *, void *, unsigned int, - struct cxgbei_ulp2_tag_format *, uint32_t *, - struct cxgbei_ulp2_gather_list *, int , int ); -void cxgbei_ulp2_ddp_tag_release(struct cxgbei_data *, uint32_t, - struct icl_cxgbei_conn *); - -struct cxgbei_ulp2_gather_list *cxgbei_ulp2_ddp_make_gl_from_iscsi_sgvec(u_int, - struct cxgbei_sgl *, u_int, struct cxgbei_data *, int); -void cxgbei_ulp2_ddp_release_gl(struct cxgbei_data *, - struct cxgbei_ulp2_gather_list *); - -int cxgbei_ulp2_ddp_find_page_index(u_long); -int cxgbei_ulp2_adapter_ddp_info(struct cxgbei_data *, - struct cxgbei_ulp2_tag_format *); - -void cxgbei_ddp_cleanup(struct cxgbei_data *); -#endif diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c index f0eef35de3b..6696f6dee33 100644 --- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c @@ -60,6 +60,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include #include @@ -70,6 +72,28 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "common/common.h" #include "common/t4_tcb.h" #include "tom/t4_tom.h" @@ -90,8 +114,7 @@ static int recvspace = 1048576; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN, &recvspace, 0, "Default receive socket buffer size"); -static uma_zone_t icl_transfer_zone; - +static uma_zone_t prsv_zone; static volatile u_int icl_cxgbei_ncons; #define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) @@ -242,12 +265,6 @@ icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic, return (icl_pdu_data_segment_length(request)); } -static uint32_t -icl_conn_build_tasktag(struct icl_conn *ic, uint32_t tag) -{ - return tag; -} - static struct mbuf * finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp) { @@ -776,55 +793,215 @@ icl_cxgbei_conn_close(struct icl_conn *ic) int icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip, - struct ccb_scsiio *csio, uint32_t *task_tagp, void **prvp) + struct ccb_scsiio *csio, uint32_t *ittp, void **arg) { - void *prv; + struct icl_cxgbei_conn *icc = ic_to_icc(ic); + struct toepcb *toep = icc->toep; + struct adapter *sc = icc->sc; + struct cxgbei_data *ci = sc->iscsi_ulp_softc; + struct ppod_region *pr = &ci->pr; + struct ppod_reservation *prsv; + uint32_t itt; + int rc = 0; - *task_tagp = icl_conn_build_tasktag(ic, *task_tagp); + /* This is for the offload driver's state. Must not be set already. */ + MPASS(arg != NULL); + MPASS(*arg == NULL); - prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO); - if (prv == NULL) - return (ENOMEM); + if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN || + csio->dxfer_len < ci->ddp_threshold) { +no_ddp: + /* + * No DDP for this I/O. Allocate an ITT (based on the one + * passed in) that cannot be a valid hardware DDP tag in the + * iSCSI region. + */ + itt = *ittp & M_PPOD_TAG; + itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit; + *ittp = htobe32(itt); + MPASS(*arg == NULL); /* State is maintained for DDP only. */ + return (0); + } - *prvp = prv; + /* + * Reserve resources for DDP, update the itt that should be used in the + * PDU, and save DDP specific state for this I/O in *arg. + */ - cxgbei_conn_task_reserve_itt(ic, prvp, csio, task_tagp); + prsv = uma_zalloc(prsv_zone, M_NOWAIT); + if (prsv == NULL) { + rc = ENOMEM; + goto no_ddp; + } + /* XXX add support for all CAM_DATA_ types */ + MPASS((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR); + rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr, + csio->dxfer_len, prsv); + if (rc != 0) { + uma_zfree(prsv_zone, prsv); + goto no_ddp; + } + + rc = t4_write_page_pods_for_buf(sc, toep->ofld_txq, toep->tid, prsv, + (vm_offset_t)csio->data_ptr, csio->dxfer_len); + if (rc != 0) { + t4_free_page_pods(prsv); + uma_zfree(prsv_zone, prsv); + goto no_ddp; + } + + *ittp = htobe32(prsv->prsv_tag); + *arg = prsv; return (0); } void -icl_cxgbei_conn_task_done(struct icl_conn *ic, void *prv) +icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg) { - cxgbei_cleanup_task(ic, prv); - uma_zfree(icl_transfer_zone, prv); + if (arg != NULL) { + struct ppod_reservation *prsv = arg; + + t4_free_page_pods(prsv); + uma_zfree(prsv_zone, prsv); + } } +/* XXXNP: PDU should be passed in as parameter, like on the initiator. */ +#define io_to_request_pdu(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr) +#define io_to_ppod_reservation(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr) + int icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io, - uint32_t *transfer_tag, void **prvp) + uint32_t *tttp, void **arg) { - void *prv; + struct icl_cxgbei_conn *icc = ic_to_icc(ic); + struct toepcb *toep = icc->toep; + struct ctl_scsiio *ctsio = &io->scsiio; + struct adapter *sc = icc->sc; + struct cxgbei_data *ci = sc->iscsi_ulp_softc; + struct ppod_region *pr = &ci->pr; + struct ppod_reservation *prsv; + uint32_t ttt; + int xferlen, rc = 0, alias; - *transfer_tag = icl_conn_build_tasktag(ic, *transfer_tag); + /* This is for the offload driver's state. Must not be set already. */ + MPASS(arg != NULL); + MPASS(*arg == NULL); - prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO); - if (prv == NULL) - return (ENOMEM); + if (ctsio->ext_data_filled == 0) { + int first_burst; + struct icl_pdu *ip = io_to_request_pdu(io); + vm_offset_t buf; +#ifdef INVARIANTS + struct icl_cxgbei_pdu *icp = ip_to_icp(ip); - *prvp = prv; + MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); + MPASS(ic == ip->ip_conn); + MPASS(ip->ip_bhs_mbuf != NULL); +#endif + first_burst = icl_pdu_data_segment_length(ip); - cxgbei_conn_transfer_reserve_ttt(ic, prvp, io, transfer_tag); + /* + * Note that ICL calls conn_transfer_setup even if the first + * burst had everything and there's nothing left to transfer. + */ + MPASS(ctsio->kern_data_len >= first_burst); + xferlen = ctsio->kern_data_len; + if (xferlen - first_burst < ci->ddp_threshold) { +no_ddp: + /* + * No DDP for this transfer. Allocate a TTT (based on + * the one passed in) that cannot be a valid hardware + * DDP tag in the iSCSI region. + */ + ttt = *tttp & M_PPOD_TAG; + ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit; + *tttp = htobe32(ttt); + MPASS(io_to_ppod_reservation(io) == NULL); + return (0); + } + + if (ctsio->kern_sg_entries == 0) + buf = (vm_offset_t)ctsio->kern_data_ptr; + else if (ctsio->kern_sg_entries == 1) { + struct ctl_sg_entry *sgl = (void *)ctsio->kern_data_ptr; + + MPASS(sgl->len == xferlen); + buf = (vm_offset_t)sgl->addr; + } else { + rc = EAGAIN; /* XXX implement */ + goto no_ddp; + } + + + /* + * Reserve resources for DDP, update the ttt that should be used + * in the PDU, and save DDP specific state for this I/O. + */ + + MPASS(io_to_ppod_reservation(io) == NULL); + prsv = uma_zalloc(prsv_zone, M_NOWAIT); + if (prsv == NULL) { + rc = ENOMEM; + goto no_ddp; + } + + rc = t4_alloc_page_pods_for_buf(pr, buf, xferlen, prsv); + if (rc != 0) { + uma_zfree(prsv_zone, prsv); + goto no_ddp; + } + + rc = t4_write_page_pods_for_buf(sc, toep->ofld_txq, toep->tid, + prsv, buf, xferlen); + if (rc != 0) { + t4_free_page_pods(prsv); + uma_zfree(prsv_zone, prsv); + goto no_ddp; + } + + *tttp = htobe32(prsv->prsv_tag); + io_to_ppod_reservation(io) = prsv; + *arg = ctsio; + return (0); + } + + /* + * In the middle of an I/O. A non-NULL page pod reservation indicates + * that a DDP buffer is being used for the I/O. + */ + + prsv = io_to_ppod_reservation(ctsio); + if (prsv == NULL) + goto no_ddp; + + alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift; + alias++; + prsv->prsv_tag &= ~pr->pr_alias_mask; + prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask; + + *tttp = htobe32(prsv->prsv_tag); + *arg = ctsio; return (0); } void -icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *prv) +icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg) { - cxgbei_cleanup_task(ic, prv); - uma_zfree(icl_transfer_zone, prv); + struct ctl_scsiio *ctsio = arg; + + if (ctsio != NULL && ctsio->kern_data_len == ctsio->ext_data_filled) { + struct ppod_reservation *prsv; + + prsv = io_to_ppod_reservation(ctsio); + MPASS(prsv != NULL); + + t4_free_page_pods(prsv); + uma_zfree(prsv_zone, prsv); + } } static void @@ -882,9 +1059,12 @@ icl_cxgbei_mod_load(void) { int rc; - icl_transfer_zone = uma_zcreate("icl_transfer", - 16 * 1024, NULL, NULL, NULL, NULL, - UMA_ALIGN_PTR, 0); + /* + * Space to track pagepod reservations. + */ + prsv_zone = uma_zcreate("Pagepod reservations", + sizeof(struct ppod_reservation), NULL, NULL, NULL, NULL, + CACHE_LINE_SIZE, 0); refcount_init(&icl_cxgbei_ncons, 0); @@ -903,7 +1083,7 @@ icl_cxgbei_mod_unload(void) icl_unregister("cxgbei", false); - uma_zdestroy(icl_transfer_zone); + uma_zdestroy(prsv_zone); return (0); } diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h index cb0006c00a2..e184c31a8b7 100644 --- a/sys/dev/cxgbe/offload.h +++ b/sys/dev/cxgbe/offload.h @@ -155,7 +155,6 @@ int t4_register_uld(struct uld_info *); int t4_unregister_uld(struct uld_info *); int t4_activate_uld(struct adapter *, int); int t4_deactivate_uld(struct adapter *, int); -void t4_iscsi_init(struct adapter *, u_int, const u_int *); int uld_active(struct adapter *, int); #endif #endif diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 8727c985b5a..b416603f7ef 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -8929,16 +8929,6 @@ t4_db_dropped(struct adapter *sc) } #ifdef TCP_OFFLOAD -void -t4_iscsi_init(struct adapter *sc, u_int tag_mask, const u_int *pgsz_order) -{ - - t4_write_reg(sc, A_ULP_RX_ISCSI_TAGMASK, tag_mask); - t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, V_HPZ0(pgsz_order[0]) | - V_HPZ1(pgsz_order[1]) | V_HPZ2(pgsz_order[2]) | - V_HPZ3(pgsz_order[3])); -} - static int toe_capability(struct vi_info *vi, int enable) { diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index bd949b097fa..923f1c16de9 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -575,11 +575,20 @@ t4_tweak_chip_settings(struct adapter *sc) V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5])); t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v); - /* 4K, 16K, 64K, 256K DDP "page sizes" */ + /* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */ v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v); - m = v = F_TDDPTAGTCB; + /* + * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP. These have been + * chosen with MAXPHYS = 128K in mind. The largest DDP buffer that we + * may have to deal with is MAXPHYS + 1 page. + */ + v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4); + t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v); + + /* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */ + m = v = F_TDDPTAGTCB | F_ISCSITAGTCB; t4_set_reg_field(sc, A_ULP_RX_CTL, m, v); m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | diff --git a/sys/modules/cxgbe/cxgbei/Makefile b/sys/modules/cxgbe/cxgbei/Makefile index 898cc3f00b1..bf577514bd7 100644 --- a/sys/modules/cxgbe/cxgbei/Makefile +++ b/sys/modules/cxgbe/cxgbei/Makefile @@ -6,7 +6,6 @@ CXGBE = ${.CURDIR}/../../../dev/cxgbe KMOD= cxgbei SRCS= cxgbei.c -SRCS+= cxgbei_ulp2_ddp.c SRCS+= icl_cxgbei.c SRCS+= bus_if.h SRCS+= device_if.h From 2fbab0097a29597a4afd625c1fddeac51a12461c Mon Sep 17 00:00:00 2001 From: "Andrey A. Chernov" Date: Thu, 1 Sep 2016 20:45:04 +0000 Subject: [PATCH 08/41] fgetwc(3) may set both __SEOF and __SERR at once (in case of incomplete sequence near EOF), so we can't just check for (wc == WEOF && !__sfeof(fp)) and must relay on __sferror(fp) with __SERR clearing/restoring. MFC after: 7 days --- lib/libc/stdio/fgetwln.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/libc/stdio/fgetwln.c b/lib/libc/stdio/fgetwln.c index 34a80a076b5..037657c224b 100644 --- a/lib/libc/stdio/fgetwln.c +++ b/lib/libc/stdio/fgetwln.c @@ -47,11 +47,16 @@ fgetwln_l(FILE * __restrict fp, size_t *lenp, locale_t locale) { wint_t wc; size_t len; + int savserr; + FIX_LOCALE(locale); FLOCKFILE(fp); ORIENT(fp, 1); + savserr = fp->_flags & __SERR; + fp->_flags &= ~__SERR; + len = 0; while ((wc = __fgetwc(fp, locale)) != WEOF) { #define GROW 512 @@ -64,7 +69,12 @@ fgetwln_l(FILE * __restrict fp, size_t *lenp, locale_t locale) if (wc == L'\n') break; } - if (len == 0 || (wc == WEOF && !__sfeof(fp))) + /* fgetwc(3) may set both __SEOF and __SERR at once. */ + if (__sferror(fp)) + goto error; + + fp->_flags |= savserr; + if (len == 0) goto error; FUNLOCKFILE(fp); From a9d0abd93faed9b8d8d35d9e69e9eb0887590b9d Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 1 Sep 2016 21:15:37 +0000 Subject: [PATCH 09/41] DIRDEPS_BUILD: Add missing crunchgen host tool. MFC after: 3 days Sponsored by: EMC / Isilon Storage Division --- targets/pseudo/hosttools/Makefile.depend.host | 1 + 1 file changed, 1 insertion(+) diff --git a/targets/pseudo/hosttools/Makefile.depend.host b/targets/pseudo/hosttools/Makefile.depend.host index 89d87f49716..5cce2d5efcf 100644 --- a/targets/pseudo/hosttools/Makefile.depend.host +++ b/targets/pseudo/hosttools/Makefile.depend.host @@ -18,6 +18,7 @@ DIRDEPS = \ usr.bin/xlint/xlint \ usr.bin/yacc \ usr.sbin/config \ + usr.sbin/crunch/crunchgen.host \ .if ${MK_KERBEROS} != "no" DIRDEPS+= \ From 8fc33c6776ad3a5a28f1d5dadb1e73686608bb27 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 1 Sep 2016 21:16:29 +0000 Subject: [PATCH 10/41] Remove the digi(4) manpage and digi from the release notes. --- ObsoleteFiles.inc | 2 ++ release/doc/en_US.ISO8859-1/hardware/article.xml | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ObsoleteFiles.inc b/ObsoleteFiles.inc index 27df19589a7..bc89fefcd6c 100644 --- a/ObsoleteFiles.inc +++ b/ObsoleteFiles.inc @@ -38,6 +38,8 @@ # xargs -n1 | sort | uniq -d; # done +# 20160901: Remove digi(4) +OLD_FILES+=usr/share/man/man4/digi.4.gz # 20160819: Remove ie(4) OLD_FILES+=usr/share/man/man4/i386/ie.4.gz # 20160819: Remove spic(4) diff --git a/release/doc/en_US.ISO8859-1/hardware/article.xml b/release/doc/en_US.ISO8859-1/hardware/article.xml index 7a2e00239fb..962b3e52b9f 100644 --- a/release/doc/en_US.ISO8859-1/hardware/article.xml +++ b/release/doc/en_US.ISO8859-1/hardware/article.xml @@ -1098,9 +1098,6 @@ [&arch.i386;] STB 4 port card using shared IRQ - [&arch.i386;] DigiBoard intelligent serial cards (digi - driver) - [&arch.amd64;, &arch.i386;] PCI-Based multi-port serial boards (&man.puc.4; driver) From e8ca6c4d83ab32ace3abe54840042f2e5c02c892 Mon Sep 17 00:00:00 2001 From: Jared McNeill Date: Thu, 1 Sep 2016 21:19:11 +0000 Subject: [PATCH 11/41] Add support for setting DCDC2 voltage. --- sys/arm/allwinner/axp81x.c | 101 +++++++++++++++++++++++++++++++++++-- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/sys/arm/allwinner/axp81x.c b/sys/arm/allwinner/axp81x.c index bbcc4c97872..9849517769c 100644 --- a/sys/arm/allwinner/axp81x.c +++ b/sys/arm/allwinner/axp81x.c @@ -61,8 +61,13 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_AXP81X_REG, "AXP81x regulator", "AXP81x power regulator"); #define AXP_ICTYPE 0x03 +#define AXP_POWERCTL1 0x10 +#define AXP_POWERCTL1_DCDC2 (1 << 1) #define AXP_POWERCTL2 0x12 #define AXP_POWERCTL2_DC1SW (1 << 7) +#define AXP_VOLTCTL_DCDC2 0x21 +#define AXP_VOLTCTL_STATUS (1 << 7) +#define AXP_VOLTCTL_MASK 0x7f #define AXP_POWERBAT 0x32 #define AXP_POWERBAT_SHUTDOWN (1 << 7) #define AXP_IRQEN1 0x40 @@ -109,10 +114,18 @@ struct axp81x_regdef { char *supply_name; uint8_t enable_reg; uint8_t enable_mask; + uint8_t voltage_reg; + int voltage_min; + int voltage_max; + int voltage_step1; + int voltage_nstep1; + int voltage_step2; + int voltage_nstep2; }; enum axp81x_reg_id { - AXP81X_REG_ID_DC1SW + AXP81X_REG_ID_DC1SW, + AXP81X_REG_ID_DCDC2, }; static struct axp81x_regdef axp81x_regdefs[] = { @@ -122,6 +135,19 @@ static struct axp81x_regdef axp81x_regdefs[] = { .enable_reg = AXP_POWERCTL2, .enable_mask = AXP_POWERCTL2_DC1SW, }, + { + .id = AXP81X_REG_ID_DCDC2, + .name = "dcdc2", + .enable_reg = AXP_POWERCTL1, + .enable_mask = AXP_POWERCTL1_DCDC2, + .voltage_reg = AXP_VOLTCTL_DCDC2, + .voltage_min = 500, + .voltage_max = 1300, + .voltage_step1 = 10, + .voltage_nstep1 = 70, + .voltage_step2 = 20, + .voltage_nstep2 = 5, + }, }; struct axp81x_softc; @@ -218,17 +244,82 @@ axp81x_regnode_enable(struct regnode *regnode, bool enable, int *udelay) return (0); } +static void +axp81x_regnode_reg_to_voltage(struct axp81x_reg_sc *sc, uint8_t val, int *uv) +{ + if (val < sc->def->voltage_nstep1) + *uv = sc->def->voltage_min + val * sc->def->voltage_step1; + else + *uv = sc->def->voltage_min + + (sc->def->voltage_nstep1 * sc->def->voltage_step1) + + ((val - sc->def->voltage_nstep1) * sc->def->voltage_step2); + *uv *= 1000; +} + +static int +axp81x_regnode_voltage_to_reg(struct axp81x_reg_sc *sc, int min_uvolt, + int max_uvolt, uint8_t *val) +{ + uint8_t nval; + int nstep, uvolt; + + nval = 0; + uvolt = sc->def->voltage_min * 1000; + + for (nstep = 0; nstep < sc->def->voltage_nstep1 && uvolt < min_uvolt; + nstep++) { + ++nval; + uvolt += (sc->def->voltage_step1 * 1000); + } + for (nstep = 0; nstep < sc->def->voltage_nstep2 && uvolt < min_uvolt; + nstep++) { + ++nval; + uvolt += (sc->def->voltage_step2 * 1000); + } + if (uvolt > max_uvolt) + return (EINVAL); + + *val = nval; + return (0); +} + static int axp81x_regnode_set_voltage(struct regnode *regnode, int min_uvolt, int max_uvolt, int *udelay) { - return (ENXIO); + struct axp81x_reg_sc *sc; + uint8_t val; + + sc = regnode_get_softc(regnode); + + if (!sc->def->voltage_step1 || !sc->def->voltage_step2) + return (ENXIO); + + if (axp81x_regnode_voltage_to_reg(sc, min_uvolt, max_uvolt, &val) != 0) + return (ERANGE); + + axp81x_write(sc->base_dev, sc->def->voltage_reg, val); + + *udelay = 0; + + return (0); } static int axp81x_regnode_get_voltage(struct regnode *regnode, int *uvolt) { - return (ENXIO); + struct axp81x_reg_sc *sc; + uint8_t val; + + sc = regnode_get_softc(regnode); + + if (!sc->def->voltage_step1 || !sc->def->voltage_step2) + return (ENXIO); + + axp81x_read(sc->base_dev, sc->def->voltage_reg, &val, 1); + axp81x_regnode_reg_to_voltage(sc, val & AXP_VOLTCTL_MASK, uvolt); + + return (0); } static regnode_method_t axp81x_regnode_methods[] = { @@ -519,6 +610,10 @@ axp81x_reg_attach(device_t dev, phandle_t node, memset(&initdef, 0, sizeof(initdef)); regulator_parse_ofw_stdparam(dev, node, &initdef); + if (initdef.std_param.min_uvolt == 0) + initdef.std_param.min_uvolt = def->voltage_min * 1000; + if (initdef.std_param.max_uvolt == 0) + initdef.std_param.max_uvolt = def->voltage_max * 1000; initdef.id = def->id; initdef.ofw_node = node; regnode = regnode_create(dev, &axp81x_regnode_class, &initdef); From 1396b52e5513c87da25af1c20e3f2414b09d1d43 Mon Sep 17 00:00:00 2001 From: Jared McNeill Date: Thu, 1 Sep 2016 21:20:07 +0000 Subject: [PATCH 12/41] Add support for changing A23 PLL1 frequency. --- sys/arm/allwinner/clk/aw_pll.c | 57 ++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/sys/arm/allwinner/clk/aw_pll.c b/sys/arm/allwinner/clk/aw_pll.c index 8386a480822..be6859c1be7 100644 --- a/sys/arm/allwinner/clk/aw_pll.c +++ b/sys/arm/allwinner/clk/aw_pll.c @@ -110,14 +110,14 @@ __FBSDID("$FreeBSD$"); #define A13_PLL2_PRE_DIV (0x1f << 0) #define A13_PLL2_PRE_DIV_SHIFT 0 +#define A23_PLL1_FACTOR_P (0x3 << 16) +#define A23_PLL1_FACTOR_P_SHIFT 16 #define A23_PLL1_FACTOR_N (0x1f << 8) #define A23_PLL1_FACTOR_N_SHIFT 8 #define A23_PLL1_FACTOR_K (0x3 << 4) #define A23_PLL1_FACTOR_K_SHIFT 4 #define A23_PLL1_FACTOR_M (0x3 << 0) #define A23_PLL1_FACTOR_M_SHIFT 0 -#define A23_PLL1_FACTOR_P (0x3 << 16) -#define A23_PLL1_FACTOR_P_SHIFT 16 #define A31_PLL1_LOCK (1 << 28) #define A31_PLL1_CPU_SIGMA_DELTA_EN (1 << 24) @@ -171,6 +171,24 @@ __FBSDID("$FreeBSD$"); #define CLKID_A31_PLL6 0 #define CLKID_A31_PLL6_X2 1 +struct aw_pll_factor { + unsigned int n; + unsigned int k; + unsigned int m; + unsigned int p; + uint64_t freq; +}; +#define PLLFACTOR(_n, _k, _m, _p, _freq) \ + { .n = (_n), .k = (_k), .m = (_m), .p = (_p), .freq = (_freq) } + +static struct aw_pll_factor aw_a23_pll1_factors[] = { + PLLFACTOR(16, 0, 0, 0, 408000000), + PLLFACTOR(26, 0, 0, 0, 648000000), + PLLFACTOR(16, 1, 0, 0, 816000000), + PLLFACTOR(20, 1, 0, 0, 1008000000), + PLLFACTOR(24, 1, 0, 0, 1200000000), +}; + enum aw_pll_type { AWPLL_A10_PLL1 = 1, AWPLL_A10_PLL2, @@ -556,6 +574,39 @@ a13_pll2_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout, return (0); } +static int +a23_pll1_set_freq(struct aw_pll_sc *sc, uint64_t fin, uint64_t *fout, + int flags) +{ + struct aw_pll_factor *f; + uint32_t val; + int n; + + f = NULL; + for (n = 0; n < nitems(aw_a23_pll1_factors); n++) { + if (aw_a23_pll1_factors[n].freq == *fout) { + f = &aw_a23_pll1_factors[n]; + break; + } + } + if (f == NULL) + return (EINVAL); + + DEVICE_LOCK(sc); + PLL_READ(sc, &val); + val &= ~(A23_PLL1_FACTOR_N|A23_PLL1_FACTOR_K|A23_PLL1_FACTOR_M| + A23_PLL1_FACTOR_P); + val |= (f->n << A23_PLL1_FACTOR_N_SHIFT); + val |= (f->k << A23_PLL1_FACTOR_K_SHIFT); + val |= (f->m << A23_PLL1_FACTOR_M_SHIFT); + val |= (f->p << A23_PLL1_FACTOR_P_SHIFT); + PLL_WRITE(sc, val); + DEVICE_UNLOCK(sc); + + return (0); + +} + static int a23_pll1_recalc(struct aw_pll_sc *sc, uint64_t *freq) { @@ -719,7 +770,7 @@ static struct aw_pll_funcs aw_pll_func[] = { PLL(AWPLL_A10_PLL5, a10_pll5_recalc, NULL, NULL), PLL(AWPLL_A10_PLL6, a10_pll6_recalc, a10_pll6_set_freq, a10_pll6_init), PLL(AWPLL_A13_PLL2, a13_pll2_recalc, a13_pll2_set_freq, NULL), - PLL(AWPLL_A23_PLL1, a23_pll1_recalc, NULL, NULL), + PLL(AWPLL_A23_PLL1, a23_pll1_recalc, a23_pll1_set_freq, NULL), PLL(AWPLL_A31_PLL1, a31_pll1_recalc, NULL, NULL), PLL(AWPLL_A31_PLL6, a31_pll6_recalc, NULL, a31_pll6_init), PLL(AWPLL_A80_PLL4, a80_pll4_recalc, NULL, NULL), From 72ac509e46c9ad518a1dd610f04d299877adbf3d Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 1 Sep 2016 21:30:12 +0000 Subject: [PATCH 13/41] Remove warning about pci_addr_t being different sizes. pci_addr_t has always been 64-bits since r163805. MFC after: 1 week --- share/man/man9/pci.9 | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/share/man/man9/pci.9 b/share/man/man9/pci.9 index 161f9aa1329..1a4c5deb8a9 100644 --- a/share/man/man9/pci.9 +++ b/share/man/man9/pci.9 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd August 3, 2016 +.Dd September 1, 2016 .Dt PCI 9 .Os .Sh NAME @@ -910,11 +910,6 @@ with one in the new distribution. The .Fn pci_remap_msix function will fail if this condition is not met. -.Sh IMPLEMENTATION NOTES -The -.Vt pci_addr_t -type varies according to the size of the PCI bus address -space on the target architecture. .Sh SEE ALSO .Xr pci 4 , .Xr pciconf 8 , From fec7f833421e921cb3db5371fe32ebe2970bb3b3 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Thu, 1 Sep 2016 22:40:55 +0000 Subject: [PATCH 14/41] cxgbe/cxgbei: Minor changes in the iSCSI CPL handlers. - Use m_copydata instead of bcopy. - Add new assertions. - Log some more information. Sponsored by: Chelsio Communications --- sys/dev/cxgbe/cxgbei/cxgbei.c | 45 +++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.c b/sys/dev/cxgbe/cxgbei/cxgbei.c index 044056b0bf6..0aa4b43cf42 100644 --- a/sys/dev/cxgbe/cxgbei/cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/cxgbei.c @@ -181,15 +181,18 @@ do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) struct toepcb *toep = lookup_tid(sc, tid); struct icl_pdu *ip; struct icl_cxgbei_pdu *icp; + uint16_t len_ddp = be16toh(cpl->pdu_len_ddp); + uint16_t len = be16toh(cpl->len); M_ASSERTPKTHDR(m); + MPASS(m->m_pkthdr.len == len + sizeof(*cpl)); ip = icl_cxgbei_new_pdu(M_NOWAIT); if (ip == NULL) CXGBE_UNIMPLEMENTED("PDU allocation failure"); + m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs); + ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len; icp = ip_to_icp(ip); - bcopy(mtod(m, caddr_t) + sizeof(*cpl), icp->ip.ip_bhs, sizeof(struct - iscsi_bhs)); icp->icp_seq = ntohl(cpl->seq); icp->icp_flags = ICPF_RX_HDR; @@ -198,8 +201,8 @@ do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) toep->ulpcb2 = icp; #if 0 - CTR4(KTR_CXGBE, "%s: tid %u, cpl->len hlen %u, m->m_len hlen %u", - __func__, tid, ntohs(cpl->len), m->m_len); + CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p", + __func__, tid, len, len_ddp, icp); #endif m_freem(m); @@ -216,22 +219,23 @@ do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m struct icl_cxgbei_pdu *icp = toep->ulpcb2; M_ASSERTPKTHDR(m); + MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl)); /* Must already have received the header (but not the data). */ MPASS(icp != NULL); MPASS(icp->icp_flags == ICPF_RX_HDR); MPASS(icp->ip.ip_data_mbuf == NULL); - MPASS(icp->ip.ip_data_len == 0); + m_adj(m, sizeof(*cpl)); + MPASS(icp->ip.ip_data_len == m->m_pkthdr.len); icp->icp_flags |= ICPF_RX_FLBUF; icp->ip.ip_data_mbuf = m; - icp->ip.ip_data_len = m->m_pkthdr.len; #if 0 - CTR4(KTR_CXGBE, "%s: tid %u, cpl->len dlen %u, m->m_len dlen %u", - __func__, tid, ntohs(cpl->len), m->m_len); + CTR3(KTR_CXGBE, "%s: tid %u, cpl->len %u", __func__, tid, + be16toh(cpl->len)); #endif return (0); @@ -259,20 +263,30 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) /* Must already be assembling a PDU. */ MPASS(icp != NULL); MPASS(icp->icp_flags & ICPF_RX_HDR); /* Data is optional. */ - ip = &icp->ip; + MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0); + + pdu_len = be16toh(cpl->len); /* includes everything. */ + val = be32toh(cpl->ddpvld); + +#if 0 + CTR4(KTR_CXGBE, + "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x", + __func__, tid, pdu_len, val, icp->icp_flags); +#endif + icp->icp_flags |= ICPF_RX_STATUS; - val = ntohl(cpl->ddpvld); + ip = &icp->ip; if (val & F_DDP_PADDING_ERR) icp->icp_flags |= ICPF_PAD_ERR; if (val & F_DDP_HDRCRC_ERR) icp->icp_flags |= ICPF_HCRC_ERR; if (val & F_DDP_DATACRC_ERR) icp->icp_flags |= ICPF_DCRC_ERR; - if (ip->ip_data_mbuf == NULL) { - /* XXXNP: what should ip->ip_data_len be, and why? */ + if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) { + MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0); + MPASS(ip->ip_data_len > 0); icp->icp_flags |= ICPF_RX_DDP; } - pdu_len = ntohs(cpl->len); /* includes everything. */ INP_WLOCK(inp); if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { @@ -358,11 +372,6 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) m_freem(m); } -#if 0 - CTR4(KTR_CXGBE, "%s: tid %u, pdu_len %u, pdu_flags 0x%x", - __func__, tid, pdu_len, icp->icp_flags); -#endif - STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next); if ((icc->rx_flags & RXF_ACTIVE) == 0) { struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt]; From 9954e8e61219605bffe2974a3b00c487b60ab7ff Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 1 Sep 2016 23:20:54 +0000 Subject: [PATCH 15/41] META_MODE: Don't expect .meta files for side-effect generated files. This is similar to r301285. MFC after: 1 week Sponsored by: EMC / Isilon Storage Division --- share/mk/bsd.crunchgen.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/share/mk/bsd.crunchgen.mk b/share/mk/bsd.crunchgen.mk index 2c93526ddcf..a548f2cdfce 100644 --- a/share/mk/bsd.crunchgen.mk +++ b/share/mk/bsd.crunchgen.mk @@ -112,6 +112,7 @@ CRUNCHENV+= MK_TESTS=no \ _RECURSING_CRUNCH=1 .ORDER: ${OUTPUTS} objs ${OUTPUTS:[1]}: .META +${OUTPUTS:[2..-1]}: .NOMETA ${OUTPUTS}: ${CONF} MAKE=${MAKE} ${CRUNCHENV:NMK_AUTO_OBJ=*} MAKEOBJDIRPREFIX=${CRUNCHOBJS} \ MK_AUTO_OBJ=${MK_AUTO_OBJ} \ From 5017231ad7340603a8319371361bb51b32741853 Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 1 Sep 2016 23:21:08 +0000 Subject: [PATCH 16/41] META_MODE/DIRDEPS_BUILD: Fix various issues with crunch builds. - DIRDEPS_BUILD: Fix crunchgen builds losing their library dependencies on a nop-rebuild. - META_MODE: Fix not rebuilding various crunch.mk targets if their .meta files warrant a rebuild. They were lacking .meta files previously. This adds .NOMETA to the crunch objects being used since they are already built. Bmake was forcing a rebuild on them since their .meta files were not in the expected place; there is no reason to rebuild them. MFC after: 2 weeks Sponsored by: EMC / Isilon Storage Division --- share/mk/bsd.crunchgen.mk | 7 +++++-- usr.sbin/crunch/crunchgen/crunchgen.c | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/share/mk/bsd.crunchgen.mk b/share/mk/bsd.crunchgen.mk index a548f2cdfce..d1d829b9be6 100644 --- a/share/mk/bsd.crunchgen.mk +++ b/share/mk/bsd.crunchgen.mk @@ -123,11 +123,13 @@ ${OUTPUTS}: ${CONF} # These 2 targets cannot use .MAKE since they depend on the generated # ${OUTMK} above. -${PROG}: ${OUTPUTS} objs .META +${PROG}: ${OUTPUTS} objs .NOMETA .PHONY ${CRUNCHENV} \ CC="${CC} ${CFLAGS} ${LDFLAGS}" \ CXX="${CXX} ${CXXFLAGS} ${LDFLAGS}" \ - ${MAKE} .MAKE.MODE=normal -f ${OUTMK} exe + ${MAKE} .MAKE.MODE="${.MAKE.MODE} curdirOk=yes" \ + .MAKE.META.IGNORE_PATHS="${.MAKE.META.IGNORE_PATHS}" \ + -f ${OUTMK} exe objs: ${OUTMK} .META ${CRUNCHENV} MAKEOBJDIRPREFIX=${CRUNCHOBJS} \ @@ -167,3 +169,4 @@ clean: fi META_XTRAS+= ${find ${CRUNCHOBJS}${SRCTOP} -name '*.meta' 2>/dev/null || true:L:sh} +META_XTRAS+= ${PROG}.meta diff --git a/usr.sbin/crunch/crunchgen/crunchgen.c b/usr.sbin/crunch/crunchgen/crunchgen.c index 7a63c915e8f..f328ea44df1 100644 --- a/usr.sbin/crunch/crunchgen/crunchgen.c +++ b/usr.sbin/crunch/crunchgen/crunchgen.c @@ -1064,6 +1064,7 @@ prog_makefile_rules(FILE *outmk, prog_t *p) } fprintf(outmk, "\n"); } + fprintf(outmk, "$(%s_OBJPATHS): .NOMETA\n", p->ident); if (p->srcdir && p->objs) { fprintf(outmk, "%s_SRCDIR=%s\n", p->ident, p->srcdir); From 12330f35fa56d2e1176db4939fa204c7dec4f137 Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 1 Sep 2016 23:21:12 +0000 Subject: [PATCH 17/41] DIRDEPS_BUILD: Fix 'make bootstrap-tools' not using the proper tblgen binaries. This was an incomplete item from r291561. The host {clang,llvm}-tblgen binaries were used, rather than the ones built into the host stagedir by normal Makefile.depend dependencies on tblgen. MFC after: 1 week Sponsored by: EMC / Isilon Storage Division --- targets/pseudo/bootstrap-tools/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/targets/pseudo/bootstrap-tools/Makefile b/targets/pseudo/bootstrap-tools/Makefile index 98064ff06b5..2259861ce24 100644 --- a/targets/pseudo/bootstrap-tools/Makefile +++ b/targets/pseudo/bootstrap-tools/Makefile @@ -78,6 +78,8 @@ BSTCENV= \ BSTCARGS= \ ${BSARGS:NDESTDIR=*:NOBJTOP=*:NOBJROOT=*:NMK_CROSS_COMPILER=*:NMK_CLANG=*:NMK_GCC=*} \ BUILD_DIRDEPS=yes \ + LLVM_TBLGEN=${TOOLSDIR}/usr/bin/llvm-tblgen \ + CLANG_TBLGEN=${TOOLSDIR}/usr/bin/clang-tblgen \ -DWITH_STAGING \ -DWITH_TOOLSDIR From fcc803dfce3ab92e0eeb730a3eb4eb5989157414 Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 1 Sep 2016 23:22:31 +0000 Subject: [PATCH 18/41] Bump __FreeBSD_version for crunchgen META_MODE fix in r305254. MFC after: 2 weeks Sponsored by: EMC / Isilon Storage Division --- Makefile.inc1 | 4 +++- sys/sys/param.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.inc1 b/Makefile.inc1 index d87a6dbdb96..29ec4262251 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -1628,8 +1628,10 @@ _crunchide= usr.sbin/crunch/crunchide # r285986 crunchen: use STRIPBIN rather than STRIP # 1100113: Support MK_AUTO_OBJ +# 1200006: META_MODE fixes .if ${BOOTSTRAPPING} < 1100078 || \ - (${MK_AUTO_OBJ} == "yes" && ${BOOTSTRAPPING} < 1100114) + (${MK_AUTO_OBJ} == "yes" && ${BOOTSTRAPPING} < 1100114) || \ + (${MK_META_MODE} == "yes" && ${BOOTSTRAPPING} < 1200006) _crunchgen= usr.sbin/crunch/crunchgen .endif diff --git a/sys/sys/param.h b/sys/sys/param.h index 1f70de4f985..b7c5878bbf8 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -58,7 +58,7 @@ * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1200005 /* Master, propagated to newvers */ +#define __FreeBSD_version 1200006 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, From 689a876e9618ffa59a3cc3b3e15e62942ba2f902 Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 1 Sep 2016 23:52:20 +0000 Subject: [PATCH 19/41] DIRDEPS_BUILD: Include crunched object meta files for gendirdeps. MFC after: 2 weeks Sponsored by: EMC / Isilon Storage Division --- share/mk/bsd.crunchgen.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/share/mk/bsd.crunchgen.mk b/share/mk/bsd.crunchgen.mk index d1d829b9be6..2ac489f5f45 100644 --- a/share/mk/bsd.crunchgen.mk +++ b/share/mk/bsd.crunchgen.mk @@ -169,4 +169,5 @@ clean: fi META_XTRAS+= ${find ${CRUNCHOBJS}${SRCTOP} -name '*.meta' 2>/dev/null || true:L:sh} +META_XTRAS+= ${echo ${CRUNCHOBJS}/*.lo.meta 2>/dev/null || true:L:sh} META_XTRAS+= ${PROG}.meta From b3cf0fa8749d00c78a80b9a8240c8f95df92b402 Mon Sep 17 00:00:00 2001 From: Bryan Drewery Date: Thu, 1 Sep 2016 23:52:25 +0000 Subject: [PATCH 20/41] DIRDEPS_BUILD: Build crunchide for the host. MFC after: 2 weeks Sponsored by: EMC / Isilon Storage Division --- rescue/rescue/Makefile.depend | 1 + targets/pseudo/hosttools/Makefile.depend.host | 4 +++- tools/bsdbox/Makefile.depend | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/rescue/rescue/Makefile.depend b/rescue/rescue/Makefile.depend index 1710daf9a39..38219b53ac8 100644 --- a/rescue/rescue/Makefile.depend +++ b/rescue/rescue/Makefile.depend @@ -52,6 +52,7 @@ DIRDEPS = \ usr.bin/lex/lib \ usr.bin/yacc.host \ usr.sbin/crunch/crunchgen.host \ + usr.sbin/crunch/crunchide.host \ .include diff --git a/targets/pseudo/hosttools/Makefile.depend.host b/targets/pseudo/hosttools/Makefile.depend.host index 5cce2d5efcf..12948cdde9a 100644 --- a/targets/pseudo/hosttools/Makefile.depend.host +++ b/targets/pseudo/hosttools/Makefile.depend.host @@ -18,7 +18,9 @@ DIRDEPS = \ usr.bin/xlint/xlint \ usr.bin/yacc \ usr.sbin/config \ - usr.sbin/crunch/crunchgen.host \ + usr.sbin/crunch/crunchgen \ + usr.sbin/crunch/crunchide \ + .if ${MK_KERBEROS} != "no" DIRDEPS+= \ diff --git a/tools/bsdbox/Makefile.depend b/tools/bsdbox/Makefile.depend index 6f26c0b273d..4ed702841d9 100644 --- a/tools/bsdbox/Makefile.depend +++ b/tools/bsdbox/Makefile.depend @@ -47,6 +47,9 @@ DIRDEPS = \ secure/lib/libcrypto \ secure/lib/libssl \ usr.bin/lex/lib \ + usr.bin/yacc.host \ + usr.sbin/crunch/crunchgen.host \ + usr.sbin/crunch/crunchide.host \ .include From 985efa77cc6e2e7ea7a02f6100dab39726969dd9 Mon Sep 17 00:00:00 2001 From: Conrad Meyer Date: Thu, 1 Sep 2016 23:56:02 +0000 Subject: [PATCH 21/41] ioat(4): Despam relatively common hardware reset messages Reported by: ngie@ --- sys/dev/ioat/ioat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/dev/ioat/ioat.c b/sys/dev/ioat/ioat.c index a8150956ff4..15a29e09a79 100644 --- a/sys/dev/ioat/ioat.c +++ b/sys/dev/ioat/ioat.c @@ -750,13 +750,13 @@ out: * Fatal programming error on this DMA channel. Flush any outstanding * work with error status and restart the engine. */ - ioat_log_message(0, "Channel halted due to fatal programming error\n"); mtx_lock(&ioat->submit_lock); mtx_lock(&ioat->cleanup_lock); ioat->quiescing = TRUE; chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); - ioat_halted_debug(ioat, chanerr); + if (1 <= g_ioat_debug_level) + ioat_halted_debug(ioat, chanerr); ioat->stats.last_halt_chanerr = chanerr; while (ioat_get_active(ioat) > 0) { From 22536e15566985c2b3d7e460930a9cec7dd9d076 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Thu, 1 Sep 2016 23:58:36 +0000 Subject: [PATCH 22/41] cxgbe/cxgbei: Count various events related to iSCSI operation and make these counters available in the sysctl MIB. Sponsored by: Chelsio Communications --- sys/dev/cxgbe/cxgbei/cxgbei.c | 88 +++++++++++++++++++++++++++++++ sys/dev/cxgbe/cxgbei/cxgbei.h | 7 +++ sys/dev/cxgbe/cxgbei/icl_cxgbei.c | 6 +++ 3 files changed, 101 insertions(+) diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.c b/sys/dev/cxgbe/cxgbei/cxgbei.c index 0aa4b43cf42..ec9c9f4dbc4 100644 --- a/sys/dev/cxgbe/cxgbei/cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/cxgbei.c @@ -100,6 +100,50 @@ struct icl_pdu *icl_cxgbei_new_pdu(int); void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *); void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *); +static void +free_ci_counters(struct cxgbei_data *ci) +{ + +#define FREE_CI_COUNTER(x) do { \ + if (ci->x != NULL) { \ + counter_u64_free(ci->x); \ + ci->x = NULL; \ + } \ +} while (0) + + FREE_CI_COUNTER(ddp_setup_ok); + FREE_CI_COUNTER(ddp_setup_error); + FREE_CI_COUNTER(ddp_bytes); + FREE_CI_COUNTER(ddp_pdus); + FREE_CI_COUNTER(fl_bytes); + FREE_CI_COUNTER(fl_pdus); +#undef FREE_CI_COUNTER +} + +static int +alloc_ci_counters(struct cxgbei_data *ci) +{ + +#define ALLOC_CI_COUNTER(x) do { \ + ci->x = counter_u64_alloc(M_WAITOK); \ + if (ci->x == NULL) \ + goto fail; \ +} while (0) + + ALLOC_CI_COUNTER(ddp_setup_ok); + ALLOC_CI_COUNTER(ddp_setup_error); + ALLOC_CI_COUNTER(ddp_bytes); + ALLOC_CI_COUNTER(ddp_pdus); + ALLOC_CI_COUNTER(fl_bytes); + ALLOC_CI_COUNTER(fl_pdus); +#undef ALLOC_CI_COUNTER + + return (0); +fail: + free_ci_counters(ci); + return (ENOMEM); +} + static void read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len, uint32_t *max_rx_pdu_len) @@ -133,6 +177,8 @@ read_pdu_limits(struct adapter *sc, uint32_t *max_tx_pdu_len, static int cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) { + struct sysctl_oid *oid; + struct sysctl_oid_list *children; struct ppod_region *pr; uint32_t r; int rc; @@ -140,6 +186,10 @@ cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) MPASS(sc->vres.iscsi.size > 0); MPASS(ci != NULL); + rc = alloc_ci_counters(ci); + if (rc != 0) + return (rc); + read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len); ci->ddp_threshold = 2048; @@ -151,6 +201,7 @@ cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) device_printf(sc->dev, "%s: failed to initialize the iSCSI page pod region: %u.\n", __func__, rc); + free_ci_counters(ci); return (rc); } @@ -169,6 +220,35 @@ cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) V_ISCSITAGMASK(M_ISCSITAGMASK), pr->pr_tag_mask); } + sysctl_ctx_init(&ci->ctx); + oid = device_get_sysctl_tree(sc->dev); /* dev.t5nex.X */ + children = SYSCTL_CHILDREN(oid); + + oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi", CTLFLAG_RD, + NULL, "iSCSI ULP statistics"); + children = SYSCTL_CHILDREN(oid); + + SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_ok", + CTLFLAG_RD, &ci->ddp_setup_ok, + "# of times DDP buffer was setup successfully."); + + SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_setup_error", + CTLFLAG_RD, &ci->ddp_setup_error, + "# of times DDP buffer setup failed."); + + SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_bytes", + CTLFLAG_RD, &ci->ddp_bytes, "# of bytes placed directly"); + + SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "ddp_pdus", + CTLFLAG_RD, &ci->ddp_pdus, "# of PDUs with data placed directly."); + + SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_bytes", + CTLFLAG_RD, &ci->fl_bytes, "# of data bytes delivered in freelist"); + + SYSCTL_ADD_COUNTER_U64(&ci->ctx, children, OID_AUTO, "fl_pdus", + CTLFLAG_RD, &ci->fl_pdus, + "# of PDUs with data delivered in freelist"); + return (0); } @@ -213,6 +293,7 @@ static int do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; + struct cxgbei_data *ci = sc->iscsi_ulp_softc; struct cpl_iscsi_data *cpl = mtod(m, struct cpl_iscsi_data *); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); @@ -232,6 +313,8 @@ do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m icp->icp_flags |= ICPF_RX_FLBUF; icp->ip.ip_data_mbuf = m; + counter_u64_add(ci->fl_pdus, 1); + counter_u64_add(ci->fl_bytes, m->m_pkthdr.len); #if 0 CTR3(KTR_CXGBE, "%s: tid %u, cpl->len %u", __func__, tid, @@ -245,6 +328,7 @@ static int do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; + struct cxgbei_data *ci = sc->iscsi_ulp_softc; const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); @@ -286,6 +370,8 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0); MPASS(ip->ip_data_len > 0); icp->icp_flags |= ICPF_RX_DDP; + counter_u64_add(ci->ddp_pdus, 1); + counter_u64_add(ci->ddp_bytes, ip->ip_data_len); } INP_WLOCK(inp); @@ -439,7 +525,9 @@ cxgbei_deactivate(struct adapter *sc) ASSERT_SYNCHRONIZED_OP(sc); if (ci != NULL) { + sysctl_ctx_free(&ci->ctx); t4_free_ppod_region(&ci->pr); + free_ci_counters(ci); free(ci, M_CXGBE); sc->iscsi_ulp_softc = NULL; } diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.h b/sys/dev/cxgbe/cxgbei/cxgbei.h index a96499b482c..798a1cf4966 100644 --- a/sys/dev/cxgbe/cxgbei/cxgbei.h +++ b/sys/dev/cxgbe/cxgbei/cxgbei.h @@ -112,6 +112,13 @@ struct cxgbei_data { u_int ddp_threshold; struct ppod_region pr; + struct sysctl_ctx_list ctx; /* from uld_activate to deactivate */ + counter_u64_t ddp_setup_ok; + counter_u64_t ddp_setup_error; + counter_u64_t ddp_bytes; + counter_u64_t ddp_pdus; + counter_u64_t fl_bytes; + counter_u64_t fl_pdus; }; /* cxgbei.c */ diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c index 6696f6dee33..991c190bf3b 100644 --- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c @@ -820,6 +820,8 @@ no_ddp: itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit; *ittp = htobe32(itt); MPASS(*arg == NULL); /* State is maintained for DDP only. */ + if (rc != 0) + counter_u64_add(ci->ddp_setup_error, 1); return (0); } @@ -853,6 +855,7 @@ no_ddp: *ittp = htobe32(prsv->prsv_tag); *arg = prsv; + counter_u64_add(ci->ddp_setup_ok, 1); return (0); } @@ -920,6 +923,8 @@ no_ddp: ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit; *tttp = htobe32(ttt); MPASS(io_to_ppod_reservation(io) == NULL); + if (rc != 0) + counter_u64_add(ci->ddp_setup_error, 1); return (0); } @@ -965,6 +970,7 @@ no_ddp: *tttp = htobe32(prsv->prsv_tag); io_to_ppod_reservation(io) = prsv; *arg = ctsio; + counter_u64_add(ci->ddp_setup_ok, 1); return (0); } From 466522c3acddc430311feccc231eb8fc6dbe2604 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 2 Sep 2016 00:16:19 +0000 Subject: [PATCH 23/41] Initialize lists of signals using C99 designators Reviewed by: jilles Sponsored by: DARPA, AFRL Differential Revision: https://reviews.freebsd.org/D7601 --- lib/libc/gen/siglist.c | 128 ++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/lib/libc/gen/siglist.c b/lib/libc/gen/siglist.c index d4abfaa8ea0..2a177bffcf1 100644 --- a/lib/libc/gen/siglist.c +++ b/lib/libc/gen/siglist.c @@ -36,72 +36,72 @@ __FBSDID("$FreeBSD$"); #include const char *const sys_signame[NSIG] = { - "Signal 0", - "HUP", /* SIGHUP */ - "INT", /* SIGINT */ - "QUIT", /* SIGQUIT */ - "ILL", /* SIGILL */ - "TRAP", /* SIGTRAP */ - "ABRT", /* SIGABRT */ - "EMT", /* SIGEMT */ - "FPE", /* SIGFPE */ - "KILL", /* SIGKILL */ - "BUS", /* SIGBUS */ - "SEGV", /* SIGSEGV */ - "SYS", /* SIGSYS */ - "PIPE", /* SIGPIPE */ - "ALRM", /* SIGALRM */ - "TERM", /* SIGTERM */ - "URG", /* SIGURG */ - "STOP", /* SIGSTOP */ - "TSTP", /* SIGTSTP */ - "CONT", /* SIGCONT */ - "CHLD", /* SIGCHLD */ - "TTIN", /* SIGTTIN */ - "TTOU", /* SIGTTOU */ - "IO", /* SIGIO */ - "XCPU", /* SIGXCPU */ - "XFSZ", /* SIGXFSZ */ - "VTALRM", /* SIGVTALRM */ - "PROF", /* SIGPROF */ - "WINCH", /* SIGWINCH */ - "INFO", /* SIGINFO */ - "USR1", /* SIGUSR1 */ - "USR2" /* SIGUSR2 */ + [0] = "Signal 0", + [SIGHUP] = "HUP", + [SIGINT] = "INT", + [SIGQUIT] = "QUIT", + [SIGILL] = "ILL", + [SIGTRAP] = "TRAP", + [SIGABRT] = "ABRT", + [SIGEMT] = "EMT", + [SIGFPE] = "FPE", + [SIGKILL] = "KILL", + [SIGBUS] = "BUS", + [SIGSEGV] = "SEGV", + [SIGSYS] = "SYS", + [SIGPIPE] = "PIPE", + [SIGALRM] = "ALRM", + [SIGTERM] = "TERM", + [SIGURG] = "URG", + [SIGSTOP] = "STOP", + [SIGTSTP] = "TSTP", + [SIGCONT] = "CONT", + [SIGCHLD] = "CHLD", + [SIGTTIN] = "TTIN", + [SIGTTOU] = "TTOU", + [SIGIO] = "IO", + [SIGXCPU] = "XCPU", + [SIGXFSZ] = "XFSZ", + [SIGVTALRM] = "VTALRM", + [SIGPROF] = "PROF", + [SIGWINCH] = "WINCH", + [SIGINFO] = "INFO", + [SIGUSR1] = "USR1", + [SIGUSR2] = "USR2", }; const char *const sys_siglist[NSIG] = { - "Signal 0", - "Hangup", /* SIGHUP */ - "Interrupt", /* SIGINT */ - "Quit", /* SIGQUIT */ - "Illegal instruction", /* SIGILL */ - "Trace/BPT trap", /* SIGTRAP */ - "Abort trap", /* SIGABRT */ - "EMT trap", /* SIGEMT */ - "Floating point exception", /* SIGFPE */ - "Killed", /* SIGKILL */ - "Bus error", /* SIGBUS */ - "Segmentation fault", /* SIGSEGV */ - "Bad system call", /* SIGSYS */ - "Broken pipe", /* SIGPIPE */ - "Alarm clock", /* SIGALRM */ - "Terminated", /* SIGTERM */ - "Urgent I/O condition", /* SIGURG */ - "Suspended (signal)", /* SIGSTOP */ - "Suspended", /* SIGTSTP */ - "Continued", /* SIGCONT */ - "Child exited", /* SIGCHLD */ - "Stopped (tty input)", /* SIGTTIN */ - "Stopped (tty output)", /* SIGTTOU */ - "I/O possible", /* SIGIO */ - "Cputime limit exceeded", /* SIGXCPU */ - "Filesize limit exceeded", /* SIGXFSZ */ - "Virtual timer expired", /* SIGVTALRM */ - "Profiling timer expired", /* SIGPROF */ - "Window size changes", /* SIGWINCH */ - "Information request", /* SIGINFO */ - "User defined signal 1", /* SIGUSR1 */ - "User defined signal 2" /* SIGUSR2 */ + [0] = "Signal 0", + [SIGHUP] = "Hangup", + [SIGINT] = "Interrupt", + [SIGQUIT] = "Quit", + [SIGILL] = "Illegal instruction", + [SIGTRAP] = "Trace/BPT trap", + [SIGABRT] = "Abort trap", + [SIGEMT] = "EMT trap", + [SIGFPE] = "Floating point exception", + [SIGKILL] = "Killed", + [SIGBUS] = "Bus error", + [SIGSEGV] = "Segmentation fault", + [SIGSYS] = "Bad system call", + [SIGPIPE] = "Broken pipe", + [SIGALRM] = "Alarm clock", + [SIGTERM] = "Terminated", + [SIGURG] = "Urgent I/O condition", + [SIGSTOP] = "Suspended (signal)", + [SIGTSTP] = "Suspended", + [SIGCONT] = "Continued", + [SIGCHLD] = "Child exited", + [SIGTTIN] = "Stopped (tty input)", + [SIGTTOU] = "Stopped (tty output)", + [SIGIO] = "I/O possible", + [SIGXCPU] = "Cputime limit exceeded", + [SIGXFSZ] = "Filesize limit exceeded", + [SIGVTALRM] = "Virtual timer expired", + [SIGPROF] = "Profiling timer expired", + [SIGWINCH] = "Window size changes", + [SIGINFO] = "Information request", + [SIGUSR1] = "User defined signal 1", + [SIGUSR2] = "User defined signal 2", }; const int sys_nsig = sizeof(sys_siglist) / sizeof(sys_siglist[0]); From af4251f3a174dfee30e076593e82c8a539afcd03 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Fri, 2 Sep 2016 00:21:24 +0000 Subject: [PATCH 24/41] cxgbe/cxgbei: Provide a knob to set the DDP threshold for iSCSI transfers. The Initiator and Target both perform zero copy receive for transfers greater than or equal to this threshold. Sponsored by: Chelsio Communications --- sys/dev/cxgbe/cxgbei/cxgbei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.c b/sys/dev/cxgbe/cxgbei/cxgbei.c index ec9c9f4dbc4..21d1e0f32bf 100644 --- a/sys/dev/cxgbe/cxgbei/cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/cxgbei.c @@ -192,9 +192,7 @@ cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) read_pdu_limits(sc, &ci->max_tx_pdu_len, &ci->max_rx_pdu_len); - ci->ddp_threshold = 2048; pr = &ci->pr; - r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ); rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods"); if (rc != 0) { @@ -249,6 +247,10 @@ cxgbei_init(struct adapter *sc, struct cxgbei_data *ci) CTLFLAG_RD, &ci->fl_pdus, "# of PDUs with data delivered in freelist"); + ci->ddp_threshold = 2048; + SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold", + CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold"); + return (0); } From 578fb84eb4d84662b28115432bb43e1a62c8820c Mon Sep 17 00:00:00 2001 From: Kevin Lo Date: Fri, 2 Sep 2016 00:27:27 +0000 Subject: [PATCH 25/41] Replace %m with %_m in date_fmt for Chinese locales. This is a fix for the problem mentioned in the PR. PR: 199441 --- tools/tools/locale/tools/cldr2def.pl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/tools/locale/tools/cldr2def.pl b/tools/tools/locale/tools/cldr2def.pl index 4c44f3d8437..1b90bb98fba 100755 --- a/tools/tools/locale/tools/cldr2def.pl +++ b/tools/tools/locale/tools/cldr2def.pl @@ -243,7 +243,10 @@ sub callback_dtformat { if ($nl eq 'ja_JP') { $s =~ s/(> )(%H)/$1%A $2/; - } elsif ($nl eq 'ko_KR' || $nl eq 'zh_TW') { + } elsif ($nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_TW') { + if ($nl ne 'ko_KR') { + $s =~ s/%m/%_m/; + } $s =~ s/(> )(%p)/$1%A $2/; } $s =~ s/\.,/\./; From 3c0283438943d79594bb77e300b771eb019588de Mon Sep 17 00:00:00 2001 From: Kevin Lo Date: Fri, 2 Sep 2016 00:43:03 +0000 Subject: [PATCH 26/41] Revert r304192 to fix short month names and replace %b with %_m in date_fmt for Chinese locales. As mentioned in the commit message of r289041, nl_langinfo(ABMON_*) only returned numbers when using a Chinese locale, this causes problems in applications that put the short month name and the day of the month together. Spotted by: Ting-Wei Lan --- share/timedef/zh_CN.GB2312.src | 26 +++++++++++++------------- share/timedef/zh_CN.GBK.src | 26 +++++++++++++------------- share/timedef/zh_CN.UTF-8.src | 26 +++++++++++++------------- share/timedef/zh_CN.eucCN.src | 26 +++++++++++++------------- share/timedef/zh_TW.Big5.src | 26 +++++++++++++------------- share/timedef/zh_TW.UTF-8.src | 26 +++++++++++++------------- 6 files changed, 78 insertions(+), 78 deletions(-) diff --git a/share/timedef/zh_CN.GB2312.src b/share/timedef/zh_CN.GB2312.src index e98dc5fc28f..ab5504114e8 100644 --- a/share/timedef/zh_CN.GB2312.src +++ b/share/timedef/zh_CN.GB2312.src @@ -4,18 +4,18 @@ # ----------------------------------------------------------------------------- # # Short month names - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -11 -12 +!!#1TB +!!#2TB +!!#3TB +!!#4TB +!!#5TB +!!#6TB +!!#7TB +!!#8TB +!!#9TB +#1#0TB +#1#1TB +#1#2TB # # Long month names (as in a date) Ò»Ô @@ -63,7 +63,7 @@ ÏÂÎç # # date_fmt -%YÄê%bÔÂ%eÈÕ %A %X %Z +%YÄê%_mÔÂ%eÈÕ %A %X %Z # # Long month names (without case ending) Ò»Ô diff --git a/share/timedef/zh_CN.GBK.src b/share/timedef/zh_CN.GBK.src index e98dc5fc28f..7079ac09d15 100644 --- a/share/timedef/zh_CN.GBK.src +++ b/share/timedef/zh_CN.GBK.src @@ -4,18 +4,18 @@ # ----------------------------------------------------------------------------- # # Short month names - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -11 -12 + 1Ô + 2Ô + 3Ô + 4Ô + 5Ô + 6Ô + 7Ô + 8Ô + 9Ô +10Ô +11Ô +12Ô # # Long month names (as in a date) Ò»Ô @@ -63,7 +63,7 @@ ÏÂÎç # # date_fmt -%YÄê%bÔÂ%eÈÕ %A %X %Z +%YÄê%_mÔÂ%eÈÕ %A %X %Z # # Long month names (without case ending) Ò»Ô diff --git a/share/timedef/zh_CN.UTF-8.src b/share/timedef/zh_CN.UTF-8.src index 2460706f3bf..a99e12b185d 100644 --- a/share/timedef/zh_CN.UTF-8.src +++ b/share/timedef/zh_CN.UTF-8.src @@ -4,18 +4,18 @@ # ----------------------------------------------------------------------------- # # Short month names - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -11 -12 + 1月 + 2月 + 3月 + 4月 + 5月 + 6月 + 7月 + 8月 + 9月 +10月 +11月 +12月 # # Long month names (as in a date) 一月 @@ -63,7 +63,7 @@ ä¸‹åˆ # # date_fmt -%Yå¹´%b月%eæ—¥ %A %X %Z +%Yå¹´%_m月%eæ—¥ %A %X %Z # # Long month names (without case ending) 一月 diff --git a/share/timedef/zh_CN.eucCN.src b/share/timedef/zh_CN.eucCN.src index e98dc5fc28f..34803e40507 100644 --- a/share/timedef/zh_CN.eucCN.src +++ b/share/timedef/zh_CN.eucCN.src @@ -4,18 +4,18 @@ # ----------------------------------------------------------------------------- # # Short month names - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -11 -12 +¡¡£±Ô +¡¡£²Ô +¡¡£³Ô +¡¡£´Ô +¡¡£µÔ +¡¡£¶Ô +¡¡£·Ô +¡¡£¸Ô +¡¡£¹Ô +£±£°Ô +£±£±Ô +£±£²Ô # # Long month names (as in a date) Ò»Ô @@ -63,7 +63,7 @@ ÏÂÎç # # date_fmt -%YÄê%bÔÂ%eÈÕ %A %X %Z +%YÄê%_mÔÂ%eÈÕ %A %X %Z # # Long month names (without case ending) Ò»Ô diff --git a/share/timedef/zh_TW.Big5.src b/share/timedef/zh_TW.Big5.src index 4ed7cdeeda1..2930f5ffc9d 100644 --- a/share/timedef/zh_TW.Big5.src +++ b/share/timedef/zh_TW.Big5.src @@ -4,18 +4,18 @@ # ----------------------------------------------------------------------------- # # Short month names - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -11 -12 +¡@¢°¤ë +¡@¢±¤ë +¡@¢²¤ë +¡@¢³¤ë +¡@¢´¤ë +¡@¢µ¤ë +¡@¢¶¤ë +¡@¢·¤ë +¡@¢¸¤ë +¢°¢¯¤ë +¢°¢°¤ë +¢°¢±¤ë # # Long month names (as in a date) 1¤ë @@ -63,7 +63,7 @@ ¤U¤È # # date_fmt -%Y¦~%b¤ë%e¤é %A %X %Z +%Y¦~%_m¤ë%e¤é %A %X %Z # # Long month names (without case ending) 1¤ë diff --git a/share/timedef/zh_TW.UTF-8.src b/share/timedef/zh_TW.UTF-8.src index 3121d4a4f4f..3affab634a2 100644 --- a/share/timedef/zh_TW.UTF-8.src +++ b/share/timedef/zh_TW.UTF-8.src @@ -4,18 +4,18 @@ # ----------------------------------------------------------------------------- # # Short month names - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 -10 -11 -12 + 1月 + 2月 + 3月 + 4月 + 5月 + 6月 + 7月 + 8月 + 9月 +10月 +11月 +12月 # # Long month names (as in a date) 1月 @@ -63,7 +63,7 @@ ä¸‹åˆ # # date_fmt -%Yå¹´%b月%eæ—¥ %A %X %Z +%Yå¹´%_m月%eæ—¥ %A %X %Z # # Long month names (without case ending) 1月 From a19105157759c43629895b420d6179b3fe8ffb00 Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Fri, 2 Sep 2016 03:10:30 +0000 Subject: [PATCH 27/41] hyperv/hn: Rework RXCSUM related bits MFC after: 1 week Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D7735 --- sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c | 15 ++++++++------- sys/dev/hyperv/netvsc/hv_rndis_filter.c | 7 +++---- sys/dev/hyperv/netvsc/if_hnvar.h | 3 ++- sys/dev/hyperv/netvsc/ndis.h | 12 ++++++++++++ 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c index 5bfa85f5db7..56202eba78c 100644 --- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c +++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -1335,28 +1335,29 @@ netvsc_recv(struct hn_rx_ring *rxr, const void *data, int dlen, do_csum = 0; /* receive side checksum offload */ - if (info->csum_info != NULL) { + if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { /* IP csum offload */ - if (info->csum_info->receive.ip_csum_succeeded && do_csum) { + if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ - if ((info->csum_info->receive.tcp_csum_succeeded || - info->csum_info->receive.udp_csum_succeeded) && do_csum) { + if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | + NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; - if (info->csum_info->receive.tcp_csum_succeeded) + if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } - if (info->csum_info->receive.ip_csum_succeeded && - info->csum_info->receive.tcp_csum_succeeded) + if ((info->csum_info & + (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == + (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) do_lro = 1; } else { const struct ether_header *eh; diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c index ab85e41cc07..22e2a818c3a 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c @@ -157,7 +157,7 @@ hv_rf_find_recvinfo(const rndis_packet *rpkt, struct hn_recvinfo *info) uint32_t mask = 0, len; info->vlan_info = HN_NDIS_VLAN_INFO_INVALID; - info->csum_info = NULL; + info->csum_info = HN_NDIS_RXCSUM_INFO_INVALID; info->hash_info = NULL; info->hash_value = NULL; @@ -200,10 +200,9 @@ hv_rf_find_recvinfo(const rndis_packet *rpkt, struct hn_recvinfo *info) break; case tcpip_chksum_info: - if (__predict_false(dlen < - sizeof(rndis_tcp_ip_csum_info))) + if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) return (EINVAL); - info->csum_info = data; + info->csum_info = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_CSUM; break; diff --git a/sys/dev/hyperv/netvsc/if_hnvar.h b/sys/dev/hyperv/netvsc/if_hnvar.h index 99313628a9f..cae9bd1e652 100644 --- a/sys/dev/hyperv/netvsc/if_hnvar.h +++ b/sys/dev/hyperv/netvsc/if_hnvar.h @@ -56,10 +56,11 @@ struct ndis_8021q_info_; struct rndis_tcp_ip_csum_info_; #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff +#define HN_NDIS_RXCSUM_INFO_INVALID 0 struct hn_recvinfo { uint32_t vlan_info; - const struct rndis_tcp_ip_csum_info_ *csum_info; + uint32_t csum_info; const struct rndis_hash_info *hash_info; const struct rndis_hash_value *hash_value; }; diff --git a/sys/dev/hyperv/netvsc/ndis.h b/sys/dev/hyperv/netvsc/ndis.h index 923f3d9a6c7..9e65b4f5a6a 100644 --- a/sys/dev/hyperv/netvsc/ndis.h +++ b/sys/dev/hyperv/netvsc/ndis.h @@ -219,4 +219,16 @@ struct ndis_rssprm_toeplitz { #define NDIS_VLAN_INFO_CFI(inf) (((inf) & NDIS_VLAN_INFO_CFI_MASK) >> 3) #define NDIS_VLAN_INFO_PRI(inf) ((inf) & NDIS_VLAN_INFO_PRI_MASK) +/* Reception checksum */ +#define NDIS_RXCSUM_INFO_SIZE sizeof(uint32_t) +#define NDIS_RXCSUM_INFO_TCPCS_FAILED 0x0001 +#define NDIS_RXCSUM_INFO_UDPCS_FAILED 0x0002 +#define NDIS_RXCSUM_INFO_IPCS_FAILED 0x0004 +#define NDIS_RXCSUM_INFO_TCPCS_OK 0x0008 +#define NDIS_RXCSUM_INFO_UDPCS_OK 0x0010 +#define NDIS_RXCSUM_INFO_IPCS_OK 0x0020 +#define NDIS_RXCSUM_INFO_LOOPBACK 0x0040 +#define NDIS_RXCSUM_INFO_TCPCS_INVAL 0x0080 +#define NDIS_RXCSUM_INFO_IPCS_INVAL 0x0100 + #endif /* !_NET_NDIS_H_ */ From 5e5d4b233a27f6d6e345159ac68cc40449a2fd4a Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Fri, 2 Sep 2016 03:15:54 +0000 Subject: [PATCH 28/41] cron: use existing maximum username constant MAXLOGNAME Previously cron had its own maximum username length limit, which was smaller than the system's MAXLOGNAME. This could lead to crontab -u updating the wrong user's crontab (if the name was truncated, and matched another user). PR: 212305 Reported by: Andrii Kuzik Reviewed by: allanjude, jilles MFC after: 3 days Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D7747 --- usr.sbin/cron/cron/cron.h | 1 - usr.sbin/cron/crontab/crontab.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/usr.sbin/cron/cron/cron.h b/usr.sbin/cron/cron/cron.h index 1a814dfed83..60b2a90819e 100644 --- a/usr.sbin/cron/cron/cron.h +++ b/usr.sbin/cron/cron/cron.h @@ -73,7 +73,6 @@ #define MAX_COMMAND 1000 /* max length of internally generated cmd */ #define MAX_ENVSTR 1000 /* max length of envvar=value\0 strings */ #define MAX_TEMPSTR 100 /* obvious */ -#define MAX_UNAME 20 /* max length of username, should be overkill */ #define ROOT_UID 0 /* don't change this, it really must be root */ #define ROOT_USER "root" /* ditto */ #define SYS_NAME "*system*" /* magic owner name for system crontab */ diff --git a/usr.sbin/cron/crontab/crontab.c b/usr.sbin/cron/crontab/crontab.c index 2608be711ce..933450b9b4d 100644 --- a/usr.sbin/cron/crontab/crontab.c +++ b/usr.sbin/cron/crontab/crontab.c @@ -28,6 +28,7 @@ static const char rcsid[] = #define MAIN_PROGRAM +#include #include "cron.h" #include #include @@ -57,7 +58,7 @@ static char *Options[] = { "???", "list", "delete", "edit", "replace" }; static PID_T Pid; -static char User[MAX_UNAME], RealUser[MAX_UNAME]; +static char User[MAXLOGNAME], RealUser[MAXLOGNAME]; static char Filename[MAX_FNAME]; static FILE *NewCrontab; static int CheckErrorCount; From 4e835450fde0dffd57eceeba5f341d94e9200ada Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Fri, 2 Sep 2016 03:19:55 +0000 Subject: [PATCH 29/41] hyperv/hn: Simplify RX hash related bits. MFC after: 1 week Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D7736 --- sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c | 20 ++++++++----------- sys/dev/hyperv/netvsc/hv_rndis_filter.c | 20 +++++++++++-------- sys/dev/hyperv/netvsc/if_hnreg.h | 13 ++++++++++++ sys/dev/hyperv/netvsc/if_hnvar.h | 5 +++-- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c index 56202eba78c..76740eff9cd 100644 --- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c +++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -1289,7 +1289,7 @@ netvsc_recv(struct hn_rx_ring *rxr, const void *data, int dlen, struct ifnet *ifp = rxr->hn_ifp; struct mbuf *m_new; int size, do_lro = 0, do_csum = 1; - int hash_type = M_HASHTYPE_OPAQUE_HASH; + int hash_type; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return (0); @@ -1421,13 +1421,13 @@ skip: m_new->m_flags |= M_VLANTAG; } - if (info->hash_info != NULL && info->hash_value != NULL) { + if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { rxr->hn_rss_pkts++; - m_new->m_pkthdr.flowid = info->hash_value->hash_value; - if ((info->hash_info->hash_info & NDIS_HASH_FUNCTION_MASK) == + m_new->m_pkthdr.flowid = info->hash_value; + hash_type = M_HASHTYPE_OPAQUE_HASH; + if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { - uint32_t type = - (info->hash_info->hash_info & NDIS_HASH_TYPE_MASK); + uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); switch (type) { case NDIS_HASH_IPV4: @@ -1456,12 +1456,8 @@ skip: } } } else { - if (info->hash_value != NULL) { - m_new->m_pkthdr.flowid = info->hash_value->hash_value; - } else { - m_new->m_pkthdr.flowid = rxr->hn_rx_idx; - hash_type = M_HASHTYPE_OPAQUE; - } + m_new->m_pkthdr.flowid = rxr->hn_rx_idx; + hash_type = M_HASHTYPE_OPAQUE; } M_HASHTYPE_SET(m_new, hash_type); diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c index 22e2a818c3a..1731ceb38ba 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c @@ -158,8 +158,7 @@ hv_rf_find_recvinfo(const rndis_packet *rpkt, struct hn_recvinfo *info) info->vlan_info = HN_NDIS_VLAN_INFO_INVALID; info->csum_info = HN_NDIS_RXCSUM_INFO_INVALID; - info->hash_info = NULL; - info->hash_value = NULL; + info->hash_info = HN_NDIS_HASH_INFO_INVALID; if (rpkt->per_pkt_info_offset == 0) return (0); @@ -207,18 +206,16 @@ hv_rf_find_recvinfo(const rndis_packet *rpkt, struct hn_recvinfo *info) break; case nbl_hash_value: - if (__predict_false(dlen < - sizeof(struct rndis_hash_value))) + if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) return (EINVAL); - info->hash_value = data; + info->hash_value = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_HASHVAL; break; case nbl_hash_info: - if (__predict_false(dlen < - sizeof(struct rndis_hash_info))) + if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) return (EINVAL); - info->hash_info = data; + info->hash_info = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_HASHINF; break; @@ -234,6 +231,13 @@ next: pi = (const struct rndis_pktinfo *) ((const uint8_t *)pi + pi->rm_size); } + + /* + * Final fixup. + * - If there is no hash value, invalidate the hash info. + */ + if ((mask & HV_RF_RECVINFO_HASHVAL) == 0) + info->hash_info = HN_NDIS_HASH_INFO_INVALID; return (0); } diff --git a/sys/dev/hyperv/netvsc/if_hnreg.h b/sys/dev/hyperv/netvsc/if_hnreg.h index 467c5400011..5dfadbf5535 100644 --- a/sys/dev/hyperv/netvsc/if_hnreg.h +++ b/sys/dev/hyperv/netvsc/if_hnreg.h @@ -208,4 +208,17 @@ struct hn_nvs_rndis_ack { } __packed; CTASSERT(sizeof(struct hn_nvs_rndis_ack) >= HN_NVS_REQSIZE_MIN); +/* + * RNDIS extension + */ + +/* Per-packet hash info */ +#define HN_NDIS_HASH_INFO_SIZE sizeof(uint32_t) +#define HN_NDIS_PKTINFO_TYPE_HASHINF NDIS_PKTINFO_TYPE_ORIG_NBLIST +/* NDIS_HASH_ */ + +/* Per-packet hash value */ +#define HN_NDIS_HASH_VALUE_SIZE sizeof(uint32_t) +#define HN_NDIS_PKTINFO_TYPE_HASHVAL NDIS_PKTINFO_TYPE_PKT_CANCELID + #endif /* !_IF_HNREG_H_ */ diff --git a/sys/dev/hyperv/netvsc/if_hnvar.h b/sys/dev/hyperv/netvsc/if_hnvar.h index cae9bd1e652..00ef144273c 100644 --- a/sys/dev/hyperv/netvsc/if_hnvar.h +++ b/sys/dev/hyperv/netvsc/if_hnvar.h @@ -57,12 +57,13 @@ struct rndis_tcp_ip_csum_info_; #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff #define HN_NDIS_RXCSUM_INFO_INVALID 0 +#define HN_NDIS_HASH_INFO_INVALID 0 struct hn_recvinfo { uint32_t vlan_info; uint32_t csum_info; - const struct rndis_hash_info *hash_info; - const struct rndis_hash_value *hash_value; + uint32_t hash_info; + uint32_t hash_value; }; #define HN_SEND_CTX_INITIALIZER(cb, cbarg) \ From 2a469fff988ffed0f0bd794b5622473b4129e799 Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Fri, 2 Sep 2016 05:30:38 +0000 Subject: [PATCH 30/41] hyperv/hn: Use the per-packet-info types defined by net/rndis.h MFC after: 1 week Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D7737 --- sys/dev/hyperv/netvsc/hv_rndis_filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sys/dev/hyperv/netvsc/hv_rndis_filter.c b/sys/dev/hyperv/netvsc/hv_rndis_filter.c index 1731ceb38ba..a6f100f7866 100644 --- a/sys/dev/hyperv/netvsc/hv_rndis_filter.c +++ b/sys/dev/hyperv/netvsc/hv_rndis_filter.c @@ -191,28 +191,28 @@ hv_rf_find_recvinfo(const rndis_packet *rpkt, struct hn_recvinfo *info) data = pi->rm_data; switch (pi->rm_type) { - case ieee_8021q_info: + case NDIS_PKTINFO_TYPE_VLAN: if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) return (EINVAL); info->vlan_info = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_VLAN; break; - case tcpip_chksum_info: + case NDIS_PKTINFO_TYPE_CSUM: if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) return (EINVAL); info->csum_info = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_CSUM; break; - case nbl_hash_value: + case HN_NDIS_PKTINFO_TYPE_HASHVAL: if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) return (EINVAL); info->hash_value = *((const uint32_t *)data); mask |= HV_RF_RECVINFO_HASHVAL; break; - case nbl_hash_info: + case HN_NDIS_PKTINFO_TYPE_HASHINF: if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) return (EINVAL); info->hash_info = *((const uint32_t *)data); From 178228a10e8fe8045e4ecf7ead291eac2002cb56 Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Fri, 2 Sep 2016 05:49:38 +0000 Subject: [PATCH 31/41] net/rndis: Add comment for rndis_comp_hdr Reviewed by: hps MFC after: 1 week Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D7738 --- sys/net/rndis.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sys/net/rndis.h b/sys/net/rndis.h index 7e75ddeb131..3d823d3bb91 100644 --- a/sys/net/rndis.h +++ b/sys/net/rndis.h @@ -147,6 +147,12 @@ struct rndis_pktinfo { /* * RNDIS control messages */ + +/* + * Common header for RNDIS completion messages. + * + * NOTE: It does not apply to REMOTE_NDIS_RESET_CMPLT. + */ struct rndis_comp_hdr { uint32_t rm_type; uint32_t rm_len; From 772b86ba12208010106927ce57a966e7c31e0b7b Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Fri, 2 Sep 2016 05:57:13 +0000 Subject: [PATCH 32/41] net/rndis: Define common message header for RNDIS messages. And avoid RNDIS_HEADER_OFFSET hardcoding. Reviewed by: hps MFC after: 1 week Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D7739 --- sys/net/rndis.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sys/net/rndis.h b/sys/net/rndis.h index 3d823d3bb91..0477a65701e 100644 --- a/sys/net/rndis.h +++ b/sys/net/rndis.h @@ -92,6 +92,14 @@ #define RNDIS_DF_CONNECTIONLESS 0x00000001 #define RNDIS_DF_CONNECTION_ORIENTED 0x00000002 +/* + * Common RNDIS message header. + */ +struct rndis_msghdr { + uint32_t rm_type; + uint32_t rm_len; +}; + /* * RNDIS data message */ @@ -322,7 +330,7 @@ struct rndis_keepalive_comp { #define NDIS_PACKET_TYPE_MAC_FRAME 0x00008000 /* RNDIS offsets */ -#define RNDIS_HEADER_OFFSET 8 /* bytes */ +#define RNDIS_HEADER_OFFSET ((uint32_t)sizeof(struct rndis_msghdr)) #define RNDIS_DATA_OFFSET \ ((uint32_t)(sizeof(struct rndis_packet_msg) - RNDIS_HEADER_OFFSET)) From 9c040f41df3fe956a8dfbc81e71a2fe7db303444 Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Fri, 2 Sep 2016 06:08:08 +0000 Subject: [PATCH 33/41] hyperv/ic: Minor style fix. MFC after: 1 week Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D7743 --- sys/dev/hyperv/utilities/hv_heartbeat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/dev/hyperv/utilities/hv_heartbeat.c b/sys/dev/hyperv/utilities/hv_heartbeat.c index 5e91cd1a692..37867148b53 100644 --- a/sys/dev/hyperv/utilities/hv_heartbeat.c +++ b/sys/dev/hyperv/utilities/hv_heartbeat.c @@ -69,7 +69,7 @@ vmbus_heartbeat_cb(struct vmbus_channel *chan, void *xsc) if (error) return; - if (dlen < sizeof(struct vmbus_icmsg_hdr)) { + if (dlen < sizeof(*hdr)) { device_printf(sc->ic_dev, "invalid data len %d\n", dlen); return; } From 1feb13270d786b7896361948c4aea3ef14bae5b6 Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Fri, 2 Sep 2016 06:15:30 +0000 Subject: [PATCH 34/41] hyperv/ic: Cleanup shutdown channel callback. MFC after: 1 week Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D7744 --- sys/dev/hyperv/utilities/hv_shutdown.c | 135 +++++++++++++------------ sys/dev/hyperv/utilities/vmbus_icreg.h | 11 ++ 2 files changed, 79 insertions(+), 67 deletions(-) diff --git a/sys/dev/hyperv/utilities/hv_shutdown.c b/sys/dev/hyperv/utilities/hv_shutdown.c index abc82fa09e3..458009381b2 100644 --- a/sys/dev/hyperv/utilities/hv_shutdown.c +++ b/sys/dev/hyperv/utilities/hv_shutdown.c @@ -22,28 +22,23 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ */ -/* - * A common driver for all hyper-V util services. - */ +#include +__FBSDID("$FreeBSD$"); #include -#include #include -#include +#include #include #include -#include -#include #include #include #include -#include -#include "hv_util.h" +#include +#include + #include "vmbus_if.h" static const struct vmbus_ic_desc vmbus_shutdown_descs[] = { @@ -56,73 +51,78 @@ static const struct vmbus_ic_desc vmbus_shutdown_descs[] = { VMBUS_IC_DESC_END }; -/** - * Shutdown - */ static void -hv_shutdown_cb(struct vmbus_channel *channel, void *context) +vmbus_shutdown_cb(struct vmbus_channel *chan, void *xsc) { - uint8_t* buf; - uint8_t execute_shutdown = 0; - hv_vmbus_icmsg_hdr* icmsghdrp; - uint32_t recv_len; - uint64_t request_id; - int ret; - hv_vmbus_shutdown_msg_data* shutdown_msg; - hv_util_sc *softc; + struct hv_util_sc *sc = xsc; + struct vmbus_icmsg_hdr *hdr; + struct vmbus_icmsg_shutdown *msg; + int dlen, error, do_shutdown = 0; + uint64_t xactid; + void *data; - softc = (hv_util_sc*)context; - buf = softc->receive_buffer; + /* + * Receive request. + */ + data = sc->receive_buffer; + dlen = sc->ic_buflen; + error = vmbus_chan_recv(chan, data, &dlen, &xactid); + KASSERT(error != ENOBUFS, ("icbuf is not large enough")); + if (error) + return; - recv_len = softc->ic_buflen; - ret = vmbus_chan_recv(channel, buf, &recv_len, &request_id); - KASSERT(ret != ENOBUFS, ("hvshutdown recvbuf is not large enough")); - /* XXX check recv_len to make sure that it contains enough data */ + if (dlen < sizeof(*hdr)) { + device_printf(sc->ic_dev, "invalid data len %d\n", dlen); + return; + } + hdr = data; - if ((ret == 0) && recv_len > 0) { - - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) - &buf[sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - int error; - - error = vmbus_ic_negomsg(softc, buf, &recv_len); + /* + * Update request, which will be echoed back as response. + */ + switch (hdr->ic_type) { + case VMBUS_ICMSG_TYPE_NEGOTIATE: + error = vmbus_ic_negomsg(sc, data, &dlen); if (error) return; - } else { - shutdown_msg = - (struct hv_vmbus_shutdown_msg_data *) - &buf[sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; + break; - switch (shutdown_msg->flags) { - case 0: - case 1: - icmsghdrp->status = HV_S_OK; - execute_shutdown = 1; - if(bootverbose) - printf("Shutdown request received -" - " graceful shutdown initiated\n"); - break; - default: - icmsghdrp->status = HV_E_FAIL; - execute_shutdown = 0; - printf("Shutdown request received -" - " Invalid request\n"); - break; - } - } + case VMBUS_ICMSG_TYPE_SHUTDOWN: + if (dlen < VMBUS_ICMSG_SHUTDOWN_SIZE_MIN) { + device_printf(sc->ic_dev, "invalid shutdown len %d\n", + dlen); + return; + } + msg = data; - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | - HV_ICMSGHDRFLAG_RESPONSE; + /* XXX ic_flags definition? */ + if (msg->ic_haltflags == 0 || msg->ic_haltflags == 1) { + device_printf(sc->ic_dev, "shutdown requested\n"); + hdr->ic_status = VMBUS_ICMSG_STATUS_OK; + do_shutdown = 1; + } else { + device_printf(sc->ic_dev, "unknown shutdown flags " + "0x%08x\n", msg->ic_haltflags); + hdr->ic_status = VMBUS_ICMSG_STATUS_FAIL; + } + break; - vmbus_chan_send(channel, VMBUS_CHANPKT_TYPE_INBAND, 0, - buf, recv_len, request_id); + default: + device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type); + break; } - if (execute_shutdown) - shutdown_nice(RB_POWEROFF); + /* + * Send response by echoing the updated request back. + */ + hdr->ic_flags = VMBUS_ICMSG_FLAG_XACT | VMBUS_ICMSG_FLAG_RESP; + error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + data, dlen, xactid); + if (error) + device_printf(sc->ic_dev, "resp send failed: %d\n", error); + + if (do_shutdown) + shutdown_nice(RB_POWEROFF); } static int @@ -135,7 +135,8 @@ hv_shutdown_probe(device_t dev) static int hv_shutdown_attach(device_t dev) { - return hv_util_attach(dev, hv_shutdown_cb); + + return (hv_util_attach(dev, vmbus_shutdown_cb)); } static device_method_t shutdown_methods[] = { diff --git a/sys/dev/hyperv/utilities/vmbus_icreg.h b/sys/dev/hyperv/utilities/vmbus_icreg.h index 38643e3dcab..920b466a752 100644 --- a/sys/dev/hyperv/utilities/vmbus_icreg.h +++ b/sys/dev/hyperv/utilities/vmbus_icreg.h @@ -91,4 +91,15 @@ struct vmbus_icmsg_heartbeat { #define VMBUS_ICMSG_HEARTBEAT_SIZE_MIN \ __offsetof(struct vmbus_icmsg_heartbeat, ic_rsvd[0]) +struct vmbus_icmsg_shutdown { + struct vmbus_icmsg_hdr ic_hdr; + uint32_t ic_code; + uint32_t ic_timeo; + uint32_t ic_haltflags; + uint8_t ic_msg[2048]; +} __packed; + +#define VMBUS_ICMSG_SHUTDOWN_SIZE_MIN \ + __offsetof(struct vmbus_icmsg_shutdown, ic_msg[0]) + #endif /* !_VMBUS_ICREG_H_ */ From 225c6e916e69efcdda24b76dd77ef665cd3f3e53 Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Fri, 2 Sep 2016 06:23:28 +0000 Subject: [PATCH 35/41] hyperv/ic: Cleanup timesync channel callback. MFC after: 1 week Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D7745 --- sys/dev/hyperv/utilities/hv_timesync.c | 232 ++++++++++++------------- sys/dev/hyperv/utilities/vmbus_icreg.h | 20 +++ 2 files changed, 132 insertions(+), 120 deletions(-) diff --git a/sys/dev/hyperv/utilities/hv_timesync.c b/sys/dev/hyperv/utilities/hv_timesync.c index 93dee256216..2d440263ec0 100644 --- a/sys/dev/hyperv/utilities/hv_timesync.c +++ b/sys/dev/hyperv/utilities/hv_timesync.c @@ -22,37 +22,25 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ */ -/* - * A common driver for all hyper-V util services. - */ +#include +__FBSDID("$FreeBSD$"); #include -#include #include -#include +#include #include -#include -#include #include -#include #include +#include #include #include -#include -#include "hv_util.h" -#include "vmbus_if.h" +#include +#include -#define HV_WLTIMEDELTA 116444736000000000L /* in 100ns unit */ -#define HV_ICTIMESYNCFLAG_PROBE 0 -#define HV_ICTIMESYNCFLAG_SYNC 1 -#define HV_ICTIMESYNCFLAG_SAMPLE 2 -#define HV_NANO_SEC_PER_SEC 1000000000 -#define HV_NANO_SEC_PER_MILLI_SEC 1000000 +#include "vmbus_if.h" static const struct vmbus_ic_desc vmbus_timesync_descs[] = { { @@ -64,135 +52,144 @@ static const struct vmbus_ic_desc vmbus_timesync_descs[] = { VMBUS_IC_DESC_END }; -struct hv_ictimesync_data { - uint64_t parenttime; - uint64_t childtime; - uint64_t roundtriptime; - uint8_t flags; -} __packed; - -/* - * Globals - */ SYSCTL_NODE(_hw, OID_AUTO, hvtimesync, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Hyper-V timesync interface"); -/* Ignore the sync request when set to 1. */ -static int ignore_sync_req = 0; -SYSCTL_INT(_hw_hvtimesync, OID_AUTO, ignore_sync_req, CTLFLAG_RWTUN, - &ignore_sync_req, 0, - "Ignore the sync request when set to 1."); +static int vmbus_ts_ignore_sync = 0; +SYSCTL_INT(_hw_hvtimesync, OID_AUTO, ignore_sync, CTLFLAG_RWTUN, + &vmbus_ts_ignore_sync, 0, "Ignore the sync request."); /* * Trigger sample sync when drift exceeds threshold (ms). * Ignore the sample request when set to 0. */ -static int sample_drift = 100; -SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_drift, CTLFLAG_RWTUN, - &sample_drift, 0, - "Threshold that makes sample request trigger the sync."); +static int vmbus_ts_sample_thresh = 100; +SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_thresh, CTLFLAG_RWTUN, + &vmbus_ts_sample_thresh, 0, + "Threshold that makes sample request trigger the sync (unit: ms)."); -/** - * @brief Synchronize time with host after reboot, restore, etc. - * - * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM. - * After reboot the flag ICTIMESYNCFLAG_SYNC is included in the first time - * message after the timesync channel is opened. Since the hv_utils module is - * loaded after hv_vmbus, the first message is usually missed. The other - * thing is, systime is automatically set to emulated hardware clock which may - * not be UTC time or in the same time zone. So, to override these effects, we - * use the first 50 time samples for initial system time setting. - */ -static inline -void hv_adj_guesttime(hv_util_sc *sc, uint64_t hosttime, uint8_t flags) +static int vmbus_ts_sample_verbose = 0; +SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_verbose, CTLFLAG_RWTUN, + &vmbus_ts_sample_verbose, 0, "Increase sample request verbosity."); + +static void +vmbus_timesync(struct hv_util_sc *sc, uint64_t hvtime, uint8_t tsflags) { - struct timespec guest_ts, host_ts; - uint64_t host_tns, guest_tns; - int64_t diff; - int error; + struct timespec vm_ts; + uint64_t hv_ns, vm_ns; - host_tns = (hosttime - HV_WLTIMEDELTA) * 100; - host_ts.tv_sec = (time_t)(host_tns/HV_NANO_SEC_PER_SEC); - host_ts.tv_nsec = (long)(host_tns%HV_NANO_SEC_PER_SEC); + hv_ns = (hvtime - VMBUS_ICMSG_TS_BASE) * VMBUS_ICMSG_TS_FACTOR; + nanotime(&vm_ts); + vm_ns = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec; - nanotime(&guest_ts); - guest_tns = guest_ts.tv_sec * HV_NANO_SEC_PER_SEC + guest_ts.tv_nsec; + if ((tsflags & VMBUS_ICMSG_TS_FLAG_SYNC) && !vmbus_ts_ignore_sync) { + struct timespec hv_ts; - if ((flags & HV_ICTIMESYNCFLAG_SYNC) != 0 && ignore_sync_req == 0) { if (bootverbose) { - device_printf(sc->ic_dev, "handle sync request " - "{host: %ju, guest: %ju}\n", - (uintmax_t)host_tns, (uintmax_t)guest_tns); + device_printf(sc->ic_dev, "apply sync request, " + "hv: %ju, vm: %ju\n", + (uintmax_t)hv_ns, (uintmax_t)vm_ns); } - - error = kern_clock_settime(curthread, CLOCK_REALTIME, - &host_ts); + hv_ts.tv_sec = hv_ns / NANOSEC; + hv_ts.tv_nsec = hv_ns % NANOSEC; + kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts); + /* Done! */ return; } - if ((flags & HV_ICTIMESYNCFLAG_SAMPLE) != 0 && sample_drift != 0) { - if (bootverbose) { - device_printf(sc->ic_dev, "handle sample request " - "{host: %ju, guest: %ju}\n", - (uintmax_t)host_tns, (uintmax_t)guest_tns); + if ((tsflags & VMBUS_ICMSG_TS_FLAG_SAMPLE) && + vmbus_ts_sample_thresh > 0) { + int64_t diff; + + if (vmbus_ts_sample_verbose) { + device_printf(sc->ic_dev, "sample request, " + "hv: %ju, vm: %ju\n", + (uintmax_t)hv_ns, (uintmax_t)vm_ns); } - diff = (int64_t)(host_tns - guest_tns) / HV_NANO_SEC_PER_MILLI_SEC; - if (diff > sample_drift || diff < -sample_drift) { - error = kern_clock_settime(curthread, CLOCK_REALTIME, - &host_ts); - if (bootverbose) - device_printf(sc->ic_dev, "trigger sample sync"); + if (hv_ns > vm_ns) + diff = hv_ns - vm_ns; + else + diff = vm_ns - hv_ns; + /* nanosec -> millisec */ + diff /= 1000000; + + if (diff > vmbus_ts_sample_thresh) { + struct timespec hv_ts; + + if (bootverbose) { + device_printf(sc->ic_dev, + "apply sample request, hv: %ju, vm: %ju\n", + (uintmax_t)hv_ns, (uintmax_t)vm_ns); + } + hv_ts.tv_sec = hv_ns / NANOSEC; + hv_ts.tv_nsec = hv_ns % NANOSEC; + kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts); } + /* Done */ return; } } -/** - * Time Sync Channel message handler - */ static void -hv_timesync_cb(struct vmbus_channel *channel, void *context) +vmbus_timesync_cb(struct vmbus_channel *chan, void *xsc) { - hv_vmbus_icmsg_hdr* icmsghdrp; - uint32_t recvlen; - uint64_t requestId; - int ret; - uint8_t* time_buf; - struct hv_ictimesync_data* timedatap; - hv_util_sc *softc; + struct hv_util_sc *sc = xsc; + struct vmbus_icmsg_hdr *hdr; + const struct vmbus_icmsg_timesync *msg; + int dlen, error; + uint64_t xactid; + void *data; - softc = (hv_util_sc*)context; - time_buf = softc->receive_buffer; + /* + * Receive request. + */ + data = sc->receive_buffer; + dlen = sc->ic_buflen; + error = vmbus_chan_recv(chan, data, &dlen, &xactid); + KASSERT(error != ENOBUFS, ("icbuf is not large enough")); + if (error) + return; - recvlen = softc->ic_buflen; - ret = vmbus_chan_recv(channel, time_buf, &recvlen, &requestId); - KASSERT(ret != ENOBUFS, ("hvtimesync recvbuf is not large enough")); - /* XXX check recvlen to make sure that it contains enough data */ + if (dlen < sizeof(*hdr)) { + device_printf(sc->ic_dev, "invalid data len %d\n", dlen); + return; + } + hdr = data; - if ((ret == 0) && recvlen > 0) { - icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &time_buf[ - sizeof(struct hv_vmbus_pipe_hdr)]; - - if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { - int error; - - error = vmbus_ic_negomsg(softc, time_buf, &recvlen); + /* + * Update request, which will be echoed back as response. + */ + switch (hdr->ic_type) { + case VMBUS_ICMSG_TYPE_NEGOTIATE: + error = vmbus_ic_negomsg(sc, data, &dlen); if (error) return; - } else { - timedatap = (struct hv_ictimesync_data *) &time_buf[ - sizeof(struct hv_vmbus_pipe_hdr) + - sizeof(struct hv_vmbus_icmsg_hdr)]; - hv_adj_guesttime(softc, timedatap->parenttime, timedatap->flags); - } + break; - icmsghdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION - | HV_ICMSGHDRFLAG_RESPONSE; + case VMBUS_ICMSG_TYPE_TIMESYNC: + if (dlen < sizeof(*msg)) { + device_printf(sc->ic_dev, "invalid timesync len %d\n", + dlen); + return; + } + msg = data; + vmbus_timesync(sc, msg->ic_hvtime, msg->ic_tsflags); + break; - vmbus_chan_send(channel, VMBUS_CHANPKT_TYPE_INBAND, 0, - time_buf, recvlen, requestId); + default: + device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type); + break; } + + /* + * Send response by echoing the updated request back. + */ + hdr->ic_flags = VMBUS_ICMSG_FLAG_XACT | VMBUS_ICMSG_FLAG_RESP; + error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, 0, + data, dlen, xactid); + if (error) + device_printf(sc->ic_dev, "resp send failed: %d\n", error); } static int @@ -205,20 +202,15 @@ hv_timesync_probe(device_t dev) static int hv_timesync_attach(device_t dev) { - return hv_util_attach(dev, hv_timesync_cb); -} -static int -hv_timesync_detach(device_t dev) -{ - return hv_util_detach(dev); + return (hv_util_attach(dev, vmbus_timesync_cb)); } static device_method_t timesync_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hv_timesync_probe), DEVMETHOD(device_attach, hv_timesync_attach), - DEVMETHOD(device_detach, hv_timesync_detach), + DEVMETHOD(device_detach, hv_util_detach), { 0, 0 } }; diff --git a/sys/dev/hyperv/utilities/vmbus_icreg.h b/sys/dev/hyperv/utilities/vmbus_icreg.h index 920b466a752..683e2f83b59 100644 --- a/sys/dev/hyperv/utilities/vmbus_icreg.h +++ b/sys/dev/hyperv/utilities/vmbus_icreg.h @@ -91,6 +91,7 @@ struct vmbus_icmsg_heartbeat { #define VMBUS_ICMSG_HEARTBEAT_SIZE_MIN \ __offsetof(struct vmbus_icmsg_heartbeat, ic_rsvd[0]) +/* VMBUS_ICMSG_TYPE_SHUTDOWN */ struct vmbus_icmsg_shutdown { struct vmbus_icmsg_hdr ic_hdr; uint32_t ic_code; @@ -102,4 +103,23 @@ struct vmbus_icmsg_shutdown { #define VMBUS_ICMSG_SHUTDOWN_SIZE_MIN \ __offsetof(struct vmbus_icmsg_shutdown, ic_msg[0]) +/* VMBUS_ICMSG_TYPE_TIMESYNC */ +struct vmbus_icmsg_timesync { + struct vmbus_icmsg_hdr ic_hdr; + uint64_t ic_hvtime; + uint64_t ic_vmtime; + uint64_t ic_rtt; + uint8_t ic_tsflags; /* VMBUS_ICMSG_TS_FLAG_ */ +} __packed; + +#define VMBUS_ICMSG_TS_FLAG_SYNC 0x01 +#define VMBUS_ICMSG_TS_FLAG_SAMPLE 0x02 + +/* XXX consolidate w/ hyperv */ +#define VMBUS_ICMSG_TS_BASE 116444736000000000ULL +#define VMBUS_ICMSG_TS_FACTOR 100ULL +#ifndef NANOSEC +#define NANOSEC 1000000000ULL +#endif + #endif /* !_VMBUS_ICREG_H_ */ From 7a83ecc454aac6dfc05b2c9901b5c4d2bc44448d Mon Sep 17 00:00:00 2001 From: Enji Cooper Date: Fri, 2 Sep 2016 08:17:43 +0000 Subject: [PATCH 36/41] Skip :test_large on i386 More assertions are failing on ^/head now. PR: 205446 Sponsored by: EMC / Isilon Storage Division --- lib/msun/tests/ctrig_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/msun/tests/ctrig_test.c b/lib/msun/tests/ctrig_test.c index 475b6c5f8da..f997aaef46f 100644 --- a/lib/msun/tests/ctrig_test.c +++ b/lib/msun/tests/ctrig_test.c @@ -427,7 +427,6 @@ test_large(void) test_odd_tol(ctanh, z, CMPLXL(1.0, 8.95257245135025991216632140458264468e-309L), DBL_ULP()); -#if !defined(__i386__) z = CMPLXL(30, 0x1p1023L); test_odd_tol(ctanh, z, CMPLXL(1.0, -1.62994325413993477997492170229268382e-26L), @@ -437,7 +436,6 @@ test_large(void) CMPLXL(0.878606311888306869546254022621986509L, -0.225462792499754505792678258169527424L), DBL_ULP()); -#endif z = CMPLXL(710.6, 0.78539816339744830961566084581987572L); test_odd_tol(csinh, z, @@ -475,8 +473,12 @@ main(int argc, char *argv[]) test_small(); printf("ok 5 - ctrig small\n"); +#if defined(__i386__) + printf("ok 6 # SKIP ctrig large # fails on i386 because of bug 205446\n"); +#else test_large(); printf("ok 6 - ctrig large\n"); +#endif return (0); } From 5f51814803d36f2201fa05b95975671aaf6bf7d7 Mon Sep 17 00:00:00 2001 From: Hans Petter Selasky Date: Fri, 2 Sep 2016 08:44:14 +0000 Subject: [PATCH 37/41] Fix array size issue when using the pre-scaling feature for ISOCHRONOUS USB transfers. Make sure enough length and buffer pointers are allocated when setting up the libusb transfer structure to support the maximum number of frames the kernel can handle. MFC after: 1 week --- lib/libusb/libusb20.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/libusb/libusb20.c b/lib/libusb/libusb20.c index 2f4ee780cab..91ed9d797af 100644 --- a/lib/libusb/libusb20.c +++ b/lib/libusb/libusb20.c @@ -176,6 +176,12 @@ libusb20_tr_open_stream(struct libusb20_transfer *xfer, uint32_t MaxBufSize, return (LIBUSB20_ERROR_BUSY); if (MaxFrameCount & LIBUSB20_MAX_FRAME_PRE_SCALE) { MaxFrameCount &= ~LIBUSB20_MAX_FRAME_PRE_SCALE; + /* + * The kernel can setup 8 times more frames when + * pre-scaling ISOCHRONOUS transfers. Make sure the + * length and pointer buffers are big enough: + */ + MaxFrameCount *= 8; pre_scale = 1; } else { pre_scale = 0; @@ -200,8 +206,13 @@ libusb20_tr_open_stream(struct libusb20_transfer *xfer, uint32_t MaxBufSize, } memset(xfer->ppBuffer, 0, size); - error = xfer->pdev->methods->tr_open(xfer, MaxBufSize, - MaxFrameCount, ep_no, stream_id, pre_scale); + if (pre_scale) { + error = xfer->pdev->methods->tr_open(xfer, MaxBufSize, + MaxFrameCount / 8, ep_no, stream_id, 1); + } else { + error = xfer->pdev->methods->tr_open(xfer, MaxBufSize, + MaxFrameCount, ep_no, stream_id, 0); + } if (error) { free(xfer->ppBuffer); From af6936890a55657eb03b0eaf319e32667300eff9 Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Fri, 2 Sep 2016 10:13:51 +0000 Subject: [PATCH 38/41] Add a pc_clock pcpu field and use it to implement cpu_est_clockrate. This will allow drivers that manage the clock frequency to communicate this with the reset of the kernel. Reported by: jmcneill MFC after: 1 week Sponsored by: ABT Systems Ltd --- sys/arm64/arm64/machdep.c | 11 ++++++++++- sys/arm64/include/pcpu.h | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/sys/arm64/arm64/machdep.c b/sys/arm64/arm64/machdep.c index 2df33152133..3d2e6fb81b9 100644 --- a/sys/arm64/arm64/machdep.c +++ b/sys/arm64/arm64/machdep.c @@ -416,8 +416,17 @@ cpu_flush_dcache(void *ptr, size_t len) int cpu_est_clockrate(int cpu_id, uint64_t *rate) { + struct pcpu *pc; - panic("ARM64TODO: cpu_est_clockrate"); + pc = pcpu_find(cpu_id); + if (pc == NULL || rate == NULL) + return (EINVAL); + + if (pc->pc_clock == 0) + return (EOPNOTSUPP); + + *rate = pc->pc_clock; + return (0); } void diff --git a/sys/arm64/include/pcpu.h b/sys/arm64/include/pcpu.h index 21b58db5c48..53f4e7b9cce 100644 --- a/sys/arm64/include/pcpu.h +++ b/sys/arm64/include/pcpu.h @@ -38,7 +38,8 @@ #define PCPU_MD_FIELDS \ u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_midr; /* stored MIDR value */ \ - char __pad[121] + uint64_t pc_clock; \ + char __pad[113] #ifdef _KERNEL From fd50a7077036ac10648c114d1f0f77470b13b2fe Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 2 Sep 2016 18:22:56 +0000 Subject: [PATCH 39/41] Merge from CheriBSD: Rename sigprop-table constants to SIGPROP_ from SA_ to reduce the impression of a namespace collision. Submitted by: rwatson Reviewed by: jhb, kib (slightly different versions) Obtained from: CheriBSD (814ec5771cb1cb53deba317c561de62a91ae7684) Sponsored by: DARPA, AFRL Differential Revision: https://reviews.freebsd.org/D7616 --- sys/kern/kern_sig.c | 115 ++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index c183c8ff712..fc73f08fe51 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -189,46 +189,46 @@ SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, * The array below categorizes the signals and their default actions * according to the following properties: */ -#define SA_KILL 0x01 /* terminates process by default */ -#define SA_CORE 0x02 /* ditto and coredumps */ -#define SA_STOP 0x04 /* suspend process */ -#define SA_TTYSTOP 0x08 /* ditto, from tty */ -#define SA_IGNORE 0x10 /* ignore by default */ -#define SA_CONT 0x20 /* continue if suspended */ -#define SA_CANTMASK 0x40 /* non-maskable, catchable */ +#define SIGPROP_KILL 0x01 /* terminates process by default */ +#define SIGPROP_CORE 0x02 /* ditto and coredumps */ +#define SIGPROP_STOP 0x04 /* suspend process */ +#define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ +#define SIGPROP_IGNORE 0x10 /* ignore by default */ +#define SIGPROP_CONT 0x20 /* continue if suspended */ +#define SIGPROP_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { - SA_KILL, /* SIGHUP */ - SA_KILL, /* SIGINT */ - SA_KILL|SA_CORE, /* SIGQUIT */ - SA_KILL|SA_CORE, /* SIGILL */ - SA_KILL|SA_CORE, /* SIGTRAP */ - SA_KILL|SA_CORE, /* SIGABRT */ - SA_KILL|SA_CORE, /* SIGEMT */ - SA_KILL|SA_CORE, /* SIGFPE */ - SA_KILL, /* SIGKILL */ - SA_KILL|SA_CORE, /* SIGBUS */ - SA_KILL|SA_CORE, /* SIGSEGV */ - SA_KILL|SA_CORE, /* SIGSYS */ - SA_KILL, /* SIGPIPE */ - SA_KILL, /* SIGALRM */ - SA_KILL, /* SIGTERM */ - SA_IGNORE, /* SIGURG */ - SA_STOP, /* SIGSTOP */ - SA_STOP|SA_TTYSTOP, /* SIGTSTP */ - SA_IGNORE|SA_CONT, /* SIGCONT */ - SA_IGNORE, /* SIGCHLD */ - SA_STOP|SA_TTYSTOP, /* SIGTTIN */ - SA_STOP|SA_TTYSTOP, /* SIGTTOU */ - SA_IGNORE, /* SIGIO */ - SA_KILL, /* SIGXCPU */ - SA_KILL, /* SIGXFSZ */ - SA_KILL, /* SIGVTALRM */ - SA_KILL, /* SIGPROF */ - SA_IGNORE, /* SIGWINCH */ - SA_IGNORE, /* SIGINFO */ - SA_KILL, /* SIGUSR1 */ - SA_KILL, /* SIGUSR2 */ + SIGPROP_KILL, /* SIGHUP */ + SIGPROP_KILL, /* SIGINT */ + SIGPROP_KILL | SIGPROP_CORE, /* SIGQUIT */ + SIGPROP_KILL | SIGPROP_CORE, /* SIGILL */ + SIGPROP_KILL | SIGPROP_CORE, /* SIGTRAP */ + SIGPROP_KILL | SIGPROP_CORE, /* SIGABRT */ + SIGPROP_KILL | SIGPROP_CORE, /* SIGEMT */ + SIGPROP_KILL | SIGPROP_CORE, /* SIGFPE */ + SIGPROP_KILL, /* SIGKILL */ + SIGPROP_KILL | SIGPROP_CORE, /* SIGBUS */ + SIGPROP_KILL | SIGPROP_CORE, /* SIGSEGV */ + SIGPROP_KILL | SIGPROP_CORE, /* SIGSYS */ + SIGPROP_KILL, /* SIGPIPE */ + SIGPROP_KILL, /* SIGALRM */ + SIGPROP_KILL, /* SIGTERM */ + SIGPROP_IGNORE, /* SIGURG */ + SIGPROP_STOP, /* SIGSTOP */ + SIGPROP_STOP | SIGPROP_TTYSTOP, /* SIGTSTP */ + SIGPROP_IGNORE | SIGPROP_CONT, /* SIGCONT */ + SIGPROP_IGNORE, /* SIGCHLD */ + SIGPROP_STOP | SIGPROP_TTYSTOP, /* SIGTTIN */ + SIGPROP_STOP | SIGPROP_TTYSTOP, /* SIGTTOU */ + SIGPROP_IGNORE, /* SIGIO */ + SIGPROP_KILL, /* SIGXCPU */ + SIGPROP_KILL, /* SIGXFSZ */ + SIGPROP_KILL, /* SIGVTALRM */ + SIGPROP_KILL, /* SIGPROF */ + SIGPROP_IGNORE, /* SIGWINCH */ + SIGPROP_IGNORE, /* SIGINFO */ + SIGPROP_KILL, /* SIGUSR1 */ + SIGPROP_KILL, /* SIGUSR2 */ }; static void reschedule_signals(struct proc *p, sigset_t block, int flags); @@ -755,7 +755,7 @@ kern_sigaction(struct thread *td, int sig, const struct sigaction *act, * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || - (sigprop(sig) & SA_IGNORE && + (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); @@ -923,7 +923,7 @@ siginit(p) ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { - if (sigprop(i) & SA_IGNORE && i != SIGCONT) { + if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } @@ -940,7 +940,7 @@ sigdflt(struct sigacts *ps, int sig) mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); - if ((sigprop(sig) & SA_IGNORE) != 0 && sig != SIGCONT) + if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); @@ -969,7 +969,7 @@ execsigs(struct proc *p) while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); sigdflt(ps, sig); - if ((sigprop(sig) & SA_IGNORE) != 0) + if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } @@ -2154,16 +2154,16 @@ tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) intrval = ERESTART; mtx_unlock(&ps->ps_mtx); - if (prop & SA_CONT) + if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); - else if (prop & SA_STOP) { + else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ - if ((prop & SA_TTYSTOP) && + if ((prop & SIGPROP_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) @@ -2188,7 +2188,7 @@ tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && - !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG))) + !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* * SIGKILL: Remove procfs STOPEVENTs and ptrace events. @@ -2228,7 +2228,7 @@ tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) goto runfast; } - if (prop & SA_CONT) { + if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. @@ -2278,7 +2278,7 @@ tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) goto out; } - if (prop & SA_STOP) { + if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. @@ -2325,7 +2325,7 @@ tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) MPASS(action == SIG_DFL); - if (prop & SA_STOP) { + if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; @@ -2394,7 +2394,7 @@ tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) * priority of the idle thread, since we still allow to signal * kernel processes. */ - if (action == SIG_DFL && (prop & SA_KILL) != 0 && + if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { @@ -2411,7 +2411,7 @@ tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) * asleep, we are finished; the process should not * be awakened. */ - if ((prop & SA_CONT) && action == SIG_DFL) { + if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); @@ -2427,7 +2427,7 @@ tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ - if ((prop & SA_STOP) != 0 && (td->td_flags & (TDF_SBDRY | + if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; @@ -2858,10 +2858,10 @@ issignal(struct thread *td) * if process is member of an orphaned * process group, ignore tty stop signals. */ - if (prop & SA_STOP) { + if (prop & SIGPROP_STOP) { if (p->p_flag & (P_TRACED|P_WEXIT) || (p->p_pgrp->pg_jobc == 0 && - prop & SA_TTYSTOP)) + prop & SIGPROP_TTYSTOP)) break; /* == ignore */ if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, @@ -2881,7 +2881,7 @@ issignal(struct thread *td) PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); goto next; - } else if (prop & SA_IGNORE) { + } else if (prop & SIGPROP_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. @@ -2897,7 +2897,7 @@ issignal(struct thread *td) * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ - if ((prop & SA_CONT) == 0 && + if ((prop & SIGPROP_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ @@ -3059,7 +3059,8 @@ sigexit(td, sig) * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ - if ((sigprop(sig) & SA_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { + if ((sigprop(sig) & SIGPROP_CORE) && + thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps From 2279a9a428305ee43111f62f4addf6a9826eacde Mon Sep 17 00:00:00 2001 From: Hajimu UMEMOTO Date: Fri, 2 Sep 2016 18:28:14 +0000 Subject: [PATCH 40/41] When -n is specified, don't make bogus DNS queries. Instead, when -n is specified more than once, hostnames stored in utmp are attempted to resolve to display them as network addresses. PR: 212272 --- usr.bin/w/w.1 | 4 ++++ usr.bin/w/w.c | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/usr.bin/w/w.1 b/usr.bin/w/w.1 index a8c30be5950..715174b8c62 100644 --- a/usr.bin/w/w.1 +++ b/usr.bin/w/w.1 @@ -85,6 +85,10 @@ default Do not attempt to resolve network addresses (normally .Nm interprets addresses and attempts to display them as names). +When +.Fl n +is specified more than once, hostnames stored in utmp are attempted to +resolve to display them as network addresses. .El .Pp If one or more diff --git a/usr.bin/w/w.c b/usr.bin/w/w.c index 0c955acd8fe..d7743b4e781 100644 --- a/usr.bin/w/w.c +++ b/usr.bin/w/w.c @@ -180,7 +180,7 @@ main(int argc, char *argv[]) nlistf = optarg; break; case 'n': - nflag = 1; + nflag += 1; break; case 'f': case 'l': case 's': case 'u': case 'w': warnx("[-flsuw] no longer supported"); @@ -378,12 +378,12 @@ main(int argc, char *argv[]) lsin->sin_family = AF_INET; isaddr = 1; } - if (!nflag) { + if (nflag == 0) { /* Attempt to change an IP address into a name */ if (isaddr && realhostname_sa(fn, sizeof(fn), sa, sa->sa_len) == HOSTNAME_FOUND) p = fn; - } else if (!isaddr) { + } else if (!isaddr && nflag > 1) { /* * If a host has only one A/AAAA RR, change a * name into an IP address From ec21434933d775bd60d0fed62c978fb07fa9ccdf Mon Sep 17 00:00:00 2001 From: Kristof Provost Date: Fri, 2 Sep 2016 18:33:08 +0000 Subject: [PATCH 41/41] Renaming libifc to libifconfig in response to feedback on initial commit of this library. Sticking to 'libifconfig' (and 'ifconfig_' as function prefix) should reduce chances of namespace collisions, make it more clear what the library does, and be more in line with existing libraries. Submitted by: Marie Helene Kvello-Aune Differential Revision: https://reviews.freebsd.org/D7742 Reviewed by: cem, kp --- lib/Makefile | 2 +- lib/{libifc => libifconfig}/Makefile | 8 +- .../libifc.c => libifconfig/libifconfig.c} | 78 +++++++++---------- .../libifc.h => libifconfig/libifconfig.h} | 50 ++++++------ .../libifconfig_internal.c} | 14 ++-- .../libifconfig_internal.h} | 16 ++-- share/examples/libifc/Makefile | 9 --- share/examples/libifconfig/Makefile | 8 ++ .../{libifc => libifconfig}/ifcreate.c | 16 ++-- .../{libifc => libifconfig}/ifdestroy.c | 14 ++-- .../{libifc => libifconfig}/setdescription.c | 20 ++--- .../examples/{libifc => libifconfig}/setmtu.c | 18 ++--- share/mk/bsd.libnames.mk | 2 +- share/mk/src.libnames.mk | 2 +- 14 files changed, 128 insertions(+), 129 deletions(-) rename lib/{libifc => libifconfig}/Makefile (65%) rename lib/{libifc/libifc.c => libifconfig/libifconfig.c} (78%) rename lib/{libifc/libifc.h => libifconfig/libifconfig.h} (66%) rename lib/{libifc/libifc_internal.c => libifconfig/libifconfig_internal.c} (86%) rename lib/{libifc/libifc_internal.h => libifconfig/libifconfig_internal.h} (85%) delete mode 100644 share/examples/libifc/Makefile create mode 100644 share/examples/libifconfig/Makefile rename share/examples/{libifc => libifconfig}/ifcreate.c (89%) rename share/examples/{libifc => libifconfig}/ifdestroy.c (90%) rename share/examples/{libifc => libifconfig}/setdescription.c (85%) rename share/examples/{libifc => libifconfig}/setmtu.c (89%) diff --git a/lib/Makefile b/lib/Makefile index 3005f247e86..16f74680330 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -60,7 +60,7 @@ SUBDIR= ${SUBDIR_BOOTSTRAP} \ ${_libgssapi} \ ${_librpcsec_gss} \ ${_libiconv_modules} \ - libifc \ + libifconfig \ libipsec \ libjail \ libkiconv \ diff --git a/lib/libifc/Makefile b/lib/libifconfig/Makefile similarity index 65% rename from lib/libifc/Makefile rename to lib/libifconfig/Makefile index 49828d22a2e..924752fac9e 100644 --- a/lib/libifc/Makefile +++ b/lib/libifconfig/Makefile @@ -1,18 +1,18 @@ # $FreeBSD$ PACKAGE= lib${LIB} -LIB= ifc +LIB= ifconfig # Don't build shared library, for now. NO_PIC= SHLIBDIR?= /lib SHLIB_MAJOR= 1 -SRCS= libifc.c libifc_internal.c +SRCS= libifconfig.c libifconfig_internal.c INCSDIR= ${INCLUDEDIR} -INCS= libifc.h +INCS= libifconfig.h -#MAN= libifco.3 +#MAN= libifconfig.3 CFLAGS+= -I${.CURDIR} WARNS?=6 diff --git a/lib/libifc/libifc.c b/lib/libifconfig/libifconfig.c similarity index 78% rename from lib/libifc/libifc.c rename to lib/libifconfig/libifconfig.c index 0cd575d6f64..5acb56c845c 100644 --- a/lib/libifc/libifc.c +++ b/lib/libifconfig/libifconfig.c @@ -71,16 +71,16 @@ #include #include -#include "libifc.h" -#include "libifc_internal.h" +#include "libifconfig.h" +#include "libifconfig_internal.h" -libifc_handle_t * -libifc_open(void) +ifconfig_handle_t * +ifconfig_open(void) { - struct libifc_handle *h; + struct ifconfig_handle *h; - h = calloc(1, sizeof(struct libifc_handle)); + h = calloc(1, sizeof(struct ifconfig_handle)); for (int i = 0; i <= AF_MAX; i++) { h->sockets[i] = -1; @@ -91,7 +91,7 @@ libifc_open(void) void -libifc_close(libifc_handle_t *h) +ifconfig_close(ifconfig_handle_t *h) { for (int i = 0; i <= AF_MAX; i++) { if (h->sockets[i] != -1) { @@ -102,29 +102,29 @@ libifc_close(libifc_handle_t *h) } -libifc_errtype -libifc_err_errtype(libifc_handle_t *h) +ifconfig_errtype +ifconfig_err_errtype(ifconfig_handle_t *h) { return (h->error.errtype); } int -libifc_err_errno(libifc_handle_t *h) +ifconfig_err_errno(ifconfig_handle_t *h) { return (h->error.errcode); } unsigned long -libifc_err_ioctlreq(libifc_handle_t *h) +ifconfig_err_ioctlreq(ifconfig_handle_t *h) { return (h->error.ioctl_request); } int -libifc_get_description(libifc_handle_t *h, const char *name, char **description) +ifconfig_get_description(ifconfig_handle_t *h, const char *name, char **description) { struct ifreq ifr; char *descr = NULL; @@ -141,7 +141,7 @@ libifc_get_description(libifc_handle_t *h, const char *name, char **description) ifr.ifr_buffer.buffer = descr; ifr.ifr_buffer.length = descrlen; - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCGIFDESCR, + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCGIFDESCR, &ifr) != 0) { return (-1); } @@ -166,7 +166,7 @@ libifc_get_description(libifc_handle_t *h, const char *name, char **description) int -libifc_set_description(libifc_handle_t *h, const char *name, +ifconfig_set_description(ifconfig_handle_t *h, const char *name, const char *newdescription) { struct ifreq ifr; @@ -180,7 +180,7 @@ libifc_set_description(libifc_handle_t *h, const char *name, * TODO: Decide whether this should be an error condition instead. */ if (desclen == 0) { - return (libifc_unset_description(h, name)); + return (ifconfig_unset_description(h, name)); } (void)strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); @@ -193,7 +193,7 @@ libifc_set_description(libifc_handle_t *h, const char *name, return (-1); } - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCSIFDESCR, &ifr) != 0) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCSIFDESCR, &ifr) != 0) { free(ifr.ifr_buffer.buffer); return (-1); } @@ -202,7 +202,7 @@ libifc_set_description(libifc_handle_t *h, const char *name, } -int libifc_unset_description(libifc_handle_t *h, const char *name) +int ifconfig_unset_description(ifconfig_handle_t *h, const char *name) { struct ifreq ifr; @@ -211,14 +211,14 @@ int libifc_unset_description(libifc_handle_t *h, const char *name) ifr.ifr_buffer.length = 0; ifr.ifr_buffer.buffer = NULL; - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCSIFDESCR, &ifr) < 0) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCSIFDESCR, &ifr) < 0) { return (-1); } return (0); } -int libifc_set_name(libifc_handle_t *h, const char *name, const char *newname) +int ifconfig_set_name(ifconfig_handle_t *h, const char *name, const char *newname) { struct ifreq ifr; char *tmpname; @@ -234,7 +234,7 @@ int libifc_set_name(libifc_handle_t *h, const char *name, const char *newname) (void)strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_data = tmpname; - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCSIFNAME, &ifr) != 0) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCSIFNAME, &ifr) != 0) { free(tmpname); return (-1); } @@ -243,27 +243,27 @@ int libifc_set_name(libifc_handle_t *h, const char *name, const char *newname) } -int libifc_set_mtu(libifc_handle_t *h, const char *name, const int mtu) +int ifconfig_set_mtu(ifconfig_handle_t *h, const char *name, const int mtu) { struct ifreq ifr; memset(&ifr, 0, sizeof(struct ifreq)); (void)strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_mtu = mtu; - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCSIFMTU, &ifr) < 0) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCSIFMTU, &ifr) < 0) { return (-1); } return (0); } -int libifc_get_mtu(libifc_handle_t *h, const char *name, int *mtu) +int ifconfig_get_mtu(ifconfig_handle_t *h, const char *name, int *mtu) { struct ifreq ifr; memset(&ifr, 0, sizeof(struct ifreq)); (void)strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCGIFMTU, &ifr) == -1) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCGIFMTU, &ifr) == -1) { return (-1); } *mtu = ifr.ifr_mtu; @@ -271,27 +271,27 @@ int libifc_get_mtu(libifc_handle_t *h, const char *name, int *mtu) } -int libifc_set_metric(libifc_handle_t *h, const char *name, const int mtu) +int ifconfig_set_metric(ifconfig_handle_t *h, const char *name, const int mtu) { struct ifreq ifr; memset(&ifr, 0, sizeof(struct ifreq)); (void)strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_mtu = mtu; - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCSIFMETRIC, &ifr) < 0) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCSIFMETRIC, &ifr) < 0) { return (-1); } return (0); } -int libifc_get_metric(libifc_handle_t *h, const char *name, int *metric) +int ifconfig_get_metric(ifconfig_handle_t *h, const char *name, int *metric) { struct ifreq ifr; memset(&ifr, 0, sizeof(struct ifreq)); (void)strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCGIFMETRIC, &ifr) == -1) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCGIFMETRIC, &ifr) == -1) { return (-1); } *metric = ifr.ifr_metric; @@ -299,16 +299,16 @@ int libifc_get_metric(libifc_handle_t *h, const char *name, int *metric) } -int libifc_set_capability(libifc_handle_t *h, const char *name, +int ifconfig_set_capability(ifconfig_handle_t *h, const char *name, const int capability) { struct ifreq ifr; - struct libifc_capabilities ifcap; + struct ifconfig_capabilities ifcap; int flags; int value; memset(&ifr, 0, sizeof(struct ifreq)); - if (libifc_get_capability(h, name, &ifcap) != 0) { + if (ifconfig_get_capability(h, name, &ifcap) != 0) { return (-1); } @@ -329,22 +329,22 @@ int libifc_set_capability(libifc_handle_t *h, const char *name, * set for this request. */ ifr.ifr_reqcap = flags; - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCSIFCAP, &ifr) < 0) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCSIFCAP, &ifr) < 0) { return (-1); } return (0); } -int libifc_get_capability(libifc_handle_t *h, const char *name, - struct libifc_capabilities *capability) +int ifconfig_get_capability(ifconfig_handle_t *h, const char *name, + struct ifconfig_capabilities *capability) { struct ifreq ifr; memset(&ifr, 0, sizeof(struct ifreq)); (void)strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCGIFCAP, &ifr) < 0) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCGIFCAP, &ifr) < 0) { return (-1); } capability->curcap = ifr.ifr_curcap; @@ -353,21 +353,21 @@ int libifc_get_capability(libifc_handle_t *h, const char *name, } -int libifc_destroy_interface(libifc_handle_t *h, const char *name) +int ifconfig_destroy_interface(ifconfig_handle_t *h, const char *name) { struct ifreq ifr; memset(&ifr, 0, sizeof(struct ifreq)); (void)strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCIFDESTROY, &ifr) < 0) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCIFDESTROY, &ifr) < 0) { return (-1); } return (0); } -int libifc_create_interface(libifc_handle_t *h, const char *name, char **ifname) +int ifconfig_create_interface(ifconfig_handle_t *h, const char *name, char **ifname) { struct ifreq ifr; @@ -389,7 +389,7 @@ int libifc_create_interface(libifc_handle_t *h, const char *name, char **ifname) /* No special handling for this interface type. */ - if (libifc_ioctlwrap(h, AF_LOCAL, SIOCIFCREATE2, &ifr) < 0) { + if (ifconfig_ioctlwrap(h, AF_LOCAL, SIOCIFCREATE2, &ifr) < 0) { return (-1); } *ifname = strdup(ifr.ifr_name); diff --git a/lib/libifc/libifc.h b/lib/libifconfig/libifconfig.h similarity index 66% rename from lib/libifc/libifc.h rename to lib/libifconfig/libifconfig.h index 6b947f99a48..a005aa83b33 100644 --- a/lib/libifc/libifc.h +++ b/lib/libifconfig/libifconfig.h @@ -34,16 +34,16 @@ typedef enum { OTHER, IOCTL, SOCKET -} libifc_errtype; +} ifconfig_errtype; /* * Opaque definition so calling application can just pass a * pointer to it for library use. */ -struct libifc_handle; -typedef struct libifc_handle libifc_handle_t; +struct ifconfig_handle; +typedef struct ifconfig_handle ifconfig_handle_t; -struct libifc_capabilities { +struct ifconfig_capabilities { /** Current capabilities (ifconfig prints this as 'options')*/ int curcap; /** Requested capabilities (ifconfig prints this as 'capabilities')*/ @@ -55,57 +55,57 @@ struct libifc_capabilities { * Example usage: *{@code * // Create state object - * libifc_handle_t *lifh = libifc_open(); + * ifconfig_handle_t *lifh = ifconfig_open(); * * // Do stuff with it * * // Dispose of the state object - * libifc_close(lifh); + * ifconfig_close(lifh); * lifh = NULL; *} */ -libifc_handle_t *libifc_open(void); +ifconfig_handle_t *ifconfig_open(void); /** Frees resources held in the provided state object. * @param h The state object to close. - * @see #libifc_open(void) + * @see #ifconfig_open(void) */ -void libifc_close(libifc_handle_t *h); +void ifconfig_close(ifconfig_handle_t *h); /** Identifies what kind of error occured. */ -libifc_errtype libifc_err_errtype(libifc_handle_t *h); +ifconfig_errtype ifconfig_err_errtype(ifconfig_handle_t *h); /** Retrieves the errno associated with the error, if any. */ -int libifc_err_errno(libifc_handle_t *h); +int ifconfig_err_errno(ifconfig_handle_t *h); /** If error type was IOCTL, this identifies which request failed. */ -unsigned long libifc_err_ioctlreq(libifc_handle_t *h); +unsigned long ifconfig_err_ioctlreq(ifconfig_handle_t *h); -int libifc_get_description(libifc_handle_t *h, const char *name, +int ifconfig_get_description(ifconfig_handle_t *h, const char *name, char **description); -int libifc_set_description(libifc_handle_t *h, const char *name, +int ifconfig_set_description(ifconfig_handle_t *h, const char *name, const char *newdescription); -int libifc_unset_description(libifc_handle_t *h, const char *name); -int libifc_set_name(libifc_handle_t *h, const char *name, const char *newname); -int libifc_set_mtu(libifc_handle_t *h, const char *name, const int mtu); -int libifc_get_mtu(libifc_handle_t *h, const char *name, int *mtu); +int ifconfig_unset_description(ifconfig_handle_t *h, const char *name); +int ifconfig_set_name(ifconfig_handle_t *h, const char *name, const char *newname); +int ifconfig_set_mtu(ifconfig_handle_t *h, const char *name, const int mtu); +int ifconfig_get_mtu(ifconfig_handle_t *h, const char *name, int *mtu); -int libifc_set_metric(libifc_handle_t *h, const char *name, const int metric); -int libifc_get_metric(libifc_handle_t *h, const char *name, int *metric); +int ifconfig_set_metric(ifconfig_handle_t *h, const char *name, const int metric); +int ifconfig_get_metric(ifconfig_handle_t *h, const char *name, int *metric); -int libifc_set_capability(libifc_handle_t *h, const char *name, +int ifconfig_set_capability(ifconfig_handle_t *h, const char *name, const int capability); -int libifc_get_capability(libifc_handle_t *h, const char *name, - struct libifc_capabilities *capability); +int ifconfig_get_capability(ifconfig_handle_t *h, const char *name, + struct ifconfig_capabilities *capability); /** Destroy a virtual interface * @param name Interface to destroy */ -int libifc_destroy_interface(libifc_handle_t *h, const char *name); +int ifconfig_destroy_interface(ifconfig_handle_t *h, const char *name); /** Creates a (virtual) interface * @param name Name of interface to create. Example: bridge or bridge42 * @param name ifname Is set to actual name of created interface */ -int libifc_create_interface(libifc_handle_t *h, const char *name, +int ifconfig_create_interface(ifconfig_handle_t *h, const char *name, char **ifname); diff --git a/lib/libifc/libifc_internal.c b/lib/libifconfig/libifconfig_internal.c similarity index 86% rename from lib/libifc/libifc_internal.c rename to lib/libifconfig/libifconfig_internal.c index 341cec41932..304c2800e22 100644 --- a/lib/libifc/libifc_internal.c +++ b/lib/libifconfig/libifconfig_internal.c @@ -40,11 +40,11 @@ #include -#include "libifc.h" // Needed for libifc_errstate -#include "libifc_internal.h" +#include "libifconfig.h" // Needed for ifconfig_errstate +#include "libifconfig_internal.h" int -libifc_ioctlwrap_ret(libifc_handle_t *h, unsigned long request, int rcode) +ifconfig_ioctlwrap_ret(ifconfig_handle_t *h, unsigned long request, int rcode) { if (rcode != 0) { h->error.errtype = IOCTL; @@ -56,17 +56,17 @@ libifc_ioctlwrap_ret(libifc_handle_t *h, unsigned long request, int rcode) int -libifc_ioctlwrap(libifc_handle_t *h, const int addressfamily, +ifconfig_ioctlwrap(ifconfig_handle_t *h, const int addressfamily, unsigned long request, struct ifreq *ifr) { int s; - if (libifc_socket(h, addressfamily, &s) != 0) { + if (ifconfig_socket(h, addressfamily, &s) != 0) { return (-1); } int rcode = ioctl(s, request, ifr); - return (libifc_ioctlwrap_ret(h, request, rcode)); + return (ifconfig_ioctlwrap_ret(h, request, rcode)); } @@ -74,7 +74,7 @@ libifc_ioctlwrap(libifc_handle_t *h, const int addressfamily, * Function to get socket for the specified address family. * If the socket doesn't already exist, attempt to create it. */ -int libifc_socket(libifc_handle_t *h, const int addressfamily, int *s) +int ifconfig_socket(ifconfig_handle_t *h, const int addressfamily, int *s) { if (addressfamily > AF_MAX) { h->error.errtype = SOCKET; diff --git a/lib/libifc/libifc_internal.h b/lib/libifconfig/libifconfig_internal.h similarity index 85% rename from lib/libifc/libifc_internal.h rename to lib/libifconfig/libifconfig_internal.h index b68e9d22f27..2320ca7b376 100644 --- a/lib/libifc/libifc_internal.h +++ b/lib/libifconfig/libifconfig_internal.h @@ -32,14 +32,14 @@ #pragma once -#include "libifc.h" +#include "libifconfig.h" struct errstate { /** * Type of error. */ - libifc_errtype errtype; + ifconfig_errtype errtype; /** * The error occured in this ioctl() request. @@ -53,7 +53,7 @@ struct errstate { int errcode; }; -struct libifc_handle { +struct ifconfig_handle { struct errstate error; int sockets[AF_MAX + 1]; }; @@ -69,7 +69,7 @@ struct libifc_handle { * {@code * static void myfunc() \{ * int s; - * if (libifc_socket(AF_LOCAL, &s) != 0) \{ + * if (ifconfig_socket(AF_LOCAL, &s) != 0) \{ * // Handle error state here * \} * // user code here @@ -77,11 +77,11 @@ struct libifc_handle { * } * } */ -int libifc_socket(libifc_handle_t *h, const int addressfamily, int *s); +int ifconfig_socket(ifconfig_handle_t *h, const int addressfamily, int *s); /** Function used by other wrapper functions to populate _errstate when appropriate.*/ -int libifc_ioctlwrap_ret(libifc_handle_t *h, unsigned long request, int rcode); +int ifconfig_ioctlwrap_ret(ifconfig_handle_t *h, unsigned long request, int rcode); -/** Function to wrap ioctl() and automatically populate libifc_errstate when appropriate.*/ -int libifc_ioctlwrap(libifc_handle_t *h, const int addressfamily, +/** Function to wrap ioctl() and automatically populate ifconfig_errstate when appropriate.*/ +int ifconfig_ioctlwrap(ifconfig_handle_t *h, const int addressfamily, unsigned long request, struct ifreq *ifr); diff --git a/share/examples/libifc/Makefile b/share/examples/libifc/Makefile deleted file mode 100644 index 876c3caeac8..00000000000 --- a/share/examples/libifc/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# $FreeBSD$ - -default: - $(CC) -Wall -fPIC -lifc -g -o example_setdescription setdescription.c - $(CC) -Wall -fPIC -lifc -g -o example_setmtu setmtu.c - $(CC) -Wall -fPIC -lifc -g -o example_ifdestroy ifdestroy.c - $(CC) -Wall -fPIC -lifc -g -o example_ifcreate ifcreate.c -clean: - rm -f example_* diff --git a/share/examples/libifconfig/Makefile b/share/examples/libifconfig/Makefile new file mode 100644 index 00000000000..6926f65b1af --- /dev/null +++ b/share/examples/libifconfig/Makefile @@ -0,0 +1,8 @@ +# $FreeBSD$ +default: + $(CC) -Wall -fPIC -lifconfig -g -o example_setdescription setdescription.c + $(CC) -Wall -fPIC -lifconfig -g -o example_setmtu setmtu.c + $(CC) -Wall -fPIC -lifconfig -g -o example_ifdestroy ifdestroy.c + $(CC) -Wall -fPIC -lifconfig -g -o example_ifcreate ifcreate.c +clean: + rm -f example_* diff --git a/share/examples/libifc/ifcreate.c b/share/examples/libifconfig/ifcreate.c similarity index 89% rename from share/examples/libifc/ifcreate.c rename to share/examples/libifconfig/ifcreate.c index 0c461c8f572..8f5dde506d2 100644 --- a/share/examples/libifc/ifcreate.c +++ b/share/examples/libifconfig/ifcreate.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include int main(int argc, char *argv[]) @@ -55,21 +55,21 @@ int main(int argc, char *argv[]) printf("Requested interface name: %s\n", ifname); - libifc_handle_t *lifh = libifc_open(); - if (libifc_create_interface(lifh, ifname, &ifactualname) == 0) { + ifconfig_handle_t *lifh = ifconfig_open(); + if (ifconfig_create_interface(lifh, ifname, &ifactualname) == 0) { printf("Successfully created interface '%s'\n", ifactualname); - libifc_close(lifh); + ifconfig_close(lifh); lifh = NULL; free(ifname); free(ifactualname); return (0); } else { - switch (libifc_err_errtype(lifh)) { + switch (ifconfig_err_errtype(lifh)) { case SOCKET: warnx("couldn't create socket. This shouldn't happen.\n"); break; case IOCTL: - if (libifc_err_ioctlreq(lifh) == SIOCIFCREATE2) { + if (ifconfig_err_ioctlreq(lifh) == SIOCIFCREATE2) { warnx( "Failed to create interface (SIOCIFCREATE2)\n"); } @@ -79,12 +79,12 @@ int main(int argc, char *argv[]) "This is a thorough example accommodating for temporary" " 'not implemented yet' errors. That's likely what happened" " now. If not, your guess is as good as mine. ;)" - " Error code: %d\n", libifc_err_errno( + " Error code: %d\n", ifconfig_err_errno( lifh)); break; } - libifc_close(lifh); + ifconfig_close(lifh); lifh = NULL; free(ifname); free(ifactualname); diff --git a/share/examples/libifc/ifdestroy.c b/share/examples/libifconfig/ifdestroy.c similarity index 90% rename from share/examples/libifc/ifdestroy.c rename to share/examples/libifconfig/ifdestroy.c index bcfa11113f5..029e70a8d9b 100644 --- a/share/examples/libifc/ifdestroy.c +++ b/share/examples/libifconfig/ifdestroy.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include int main(int argc, char *argv[]) @@ -55,20 +55,20 @@ int main(int argc, char *argv[]) printf("Interface name: %s\n", ifname); - libifc_handle_t *lifh = libifc_open(); - if (libifc_destroy_interface(lifh, ifname) == 0) { + ifconfig_handle_t *lifh = ifconfig_open(); + if (ifconfig_destroy_interface(lifh, ifname) == 0) { printf("Successfully destroyed interface '%s'.", ifname); - libifc_close(lifh); + ifconfig_close(lifh); lifh = NULL; free(ifname); return (0); } else { - switch (libifc_err_errtype(lifh)) { + switch (ifconfig_err_errtype(lifh)) { case SOCKET: warnx("couldn't create socket. This shouldn't happen.\n"); break; case IOCTL: - if (libifc_err_ioctlreq(lifh) == SIOCIFDESTROY) { + if (ifconfig_err_ioctlreq(lifh) == SIOCIFDESTROY) { warnx( "Failed to destroy interface (SIOCIFDESTROY)\n"); } @@ -79,7 +79,7 @@ int main(int argc, char *argv[]) break; } - libifc_close(lifh); + ifconfig_close(lifh); lifh = NULL; free(ifname); return (-1); diff --git a/share/examples/libifc/setdescription.c b/share/examples/libifconfig/setdescription.c similarity index 85% rename from share/examples/libifc/setdescription.c rename to share/examples/libifconfig/setdescription.c index d66a883a0e6..656aba75976 100644 --- a/share/examples/libifc/setdescription.c +++ b/share/examples/libifconfig/setdescription.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include int main(int argc, char *argv[]) @@ -54,27 +54,27 @@ int main(int argc, char *argv[]) printf("Interface name: %s\n", ifname); - libifc_handle_t *lifh = libifc_open(); - if (libifc_get_description(lifh, ifname, &curdescr) == 0) { + ifconfig_handle_t *lifh = ifconfig_open(); + if (ifconfig_get_description(lifh, ifname, &curdescr) == 0) { printf("Old description: %s\n", curdescr); } printf("New description: %s\n\n", ifdescr); - if (libifc_set_description(lifh, ifname, ifdescr) == 0) { + if (ifconfig_set_description(lifh, ifname, ifdescr) == 0) { printf("New description successfully set.\n"); } else { - switch (libifc_err_errtype(lifh)) { + switch (ifconfig_err_errtype(lifh)) { case SOCKET: - err(libifc_err_errno(lifh), "Socket error"); + err(ifconfig_err_errno(lifh), "Socket error"); break; case IOCTL: - err(libifc_err_errno( + err(ifconfig_err_errno( lifh), "IOCTL(%lu) error", - libifc_err_ioctlreq(lifh)); + ifconfig_err_ioctlreq(lifh)); break; case OTHER: - err(libifc_err_errno(lifh), "Other error"); + err(ifconfig_err_errno(lifh), "Other error"); break; } } @@ -86,6 +86,6 @@ int main(int argc, char *argv[]) ifdescr = NULL; curdescr = NULL; - libifc_close(lifh); + ifconfig_close(lifh); return (0); } diff --git a/share/examples/libifc/setmtu.c b/share/examples/libifconfig/setmtu.c similarity index 89% rename from share/examples/libifc/setmtu.c rename to share/examples/libifconfig/setmtu.c index be63e18dafa..4d5a0f8d5f9 100644 --- a/share/examples/libifc/setmtu.c +++ b/share/examples/libifconfig/setmtu.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include int main(int argc, char *argv[]) @@ -58,26 +58,26 @@ int main(int argc, char *argv[]) printf("Interface name: %s\n", ifname); printf("New MTU: %d", mtu); - libifc_handle_t *lifh = libifc_open(); - if (libifc_set_mtu(lifh, ifname, mtu) == 0) { + ifconfig_handle_t *lifh = ifconfig_open(); + if (ifconfig_set_mtu(lifh, ifname, mtu) == 0) { printf("Successfully changed MTU of %s to %d\n", ifname, mtu); - libifc_close(lifh); + ifconfig_close(lifh); lifh = NULL; free(ifname); return (0); } else { - switch (libifc_err_errtype(lifh)) { + switch (ifconfig_err_errtype(lifh)) { case SOCKET: warnx("couldn't create socket. This shouldn't happen.\n"); break; case IOCTL: - if (libifc_err_ioctlreq(lifh) == SIOCSIFMTU) { + if (ifconfig_err_ioctlreq(lifh) == SIOCSIFMTU) { warnx("Failed to set MTU (SIOCSIFMTU)\n"); } else { warnx( "Failed to set MTU due to error in unexpected ioctl() call %lu. Error code: %i.\n", - libifc_err_ioctlreq(lifh), - libifc_err_errno(lifh)); + ifconfig_err_ioctlreq(lifh), + ifconfig_err_errno(lifh)); } break; default: @@ -86,7 +86,7 @@ int main(int argc, char *argv[]) break; } - libifc_close(lifh); + ifconfig_close(lifh); lifh = NULL; free(ifname); return (-1); diff --git a/share/mk/bsd.libnames.mk b/share/mk/bsd.libnames.mk index ff051f243f7..a3e6f2e0f62 100644 --- a/share/mk/bsd.libnames.mk +++ b/share/mk/bsd.libnames.mk @@ -82,7 +82,7 @@ LIBIBMAD?= ${DESTDIR}${LIBDIR}/libibmad.a LIBIBSDP?= ${DESTDIR}${LIBDIR}/libibsdp.a LIBIBUMAD?= ${DESTDIR}${LIBDIR}/libibumad.a LIBIBVERBS?= ${DESTDIR}${LIBDIR}/libibverbs.a -LIBIFC?= ${DESTDIR}${LIBDIR}/libifc.a +LIBIFCONFIG?= ${DESTDIR}${LIBDIR}/libifconfig.a LIBIPSEC?= ${DESTDIR}${LIBDIR}/libipsec.a LIBJAIL?= ${DESTDIR}${LIBDIR}/libjail.a LIBKADM5CLNT?= ${DESTDIR}${LIBDIR}/libkadm5clnt.a diff --git a/share/mk/src.libnames.mk b/share/mk/src.libnames.mk index 94b5d6190cd..f65ce01c008 100644 --- a/share/mk/src.libnames.mk +++ b/share/mk/src.libnames.mk @@ -105,7 +105,7 @@ _LIBRARIES= \ heimntlm \ heimsqlite \ hx509 \ - ifc \ + ifconfig \ ipsec \ jail \ kadm5clnt \