From 220666153d605eea8ffef0a9a6cee71550641856 Mon Sep 17 00:00:00 2001
From: Jung-uk Kim <jkim@FreeBSD.org>
Date: Tue, 19 Oct 2010 20:38:21 +0000
Subject: [PATCH 01/68] Remove undocumented and stale debug.acpi.do_powerstate
 tunable.  It was added with hw.pci.do_powerstate but the PCI version was
 split into two separate tunables later and now this is completely stale. 
 To make it worse, PCI devices enumerated in ACPI tree ignore this tunable as
 it is handled by a function in acpi_pci.c instead.

---
 sys/dev/acpica/acpi.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/sys/dev/acpica/acpi.c b/sys/dev/acpica/acpi.c
index c19748be375..24982fe7def 100644
--- a/sys/dev/acpica/acpi.c
+++ b/sys/dev/acpica/acpi.c
@@ -254,12 +254,6 @@ TUNABLE_INT("debug.acpi.interpreter_slack", &acpi_interpreter_slack);
 SYSCTL_INT(_debug_acpi, OID_AUTO, interpreter_slack, CTLFLAG_RDTUN,
     &acpi_interpreter_slack, 1, "Turn on interpreter slack mode.");
 
-/* Power devices off and on in suspend and resume.  XXX Remove once tested. */
-static int acpi_do_powerstate = 1;
-TUNABLE_INT("debug.acpi.do_powerstate", &acpi_do_powerstate);
-SYSCTL_INT(_debug_acpi, OID_AUTO, do_powerstate, CTLFLAG_RW,
-    &acpi_do_powerstate, 1, "Turn off devices when suspending.");
-
 /* Reset system clock while resuming.  XXX Remove once tested. */
 static int acpi_reset_clock = 1;
 TUNABLE_INT("debug.acpi.reset_clock", &acpi_reset_clock);
@@ -668,9 +662,6 @@ acpi_set_power_children(device_t dev, int state)
 	struct pci_devinfo *dinfo;
 	int dstate, i, numdevs;
 
-	if (!acpi_do_powerstate)
-		return;
-
 	if (device_get_children(dev, &devlist, &numdevs) != 0)
 		return;
 
@@ -1493,9 +1484,6 @@ acpi_set_powerstate(device_t child, int state)
     ACPI_HANDLE h;
     ACPI_STATUS status;
 
-    if (!acpi_do_powerstate)
-	return (0);
-
     h = acpi_get_handle(child);
     if (state < ACPI_STATE_D0 || state > ACPI_D_STATES_MAX)
 	return (EINVAL);

From ff662b5c9814043f1fb93ab7b16eaa0d77515959 Mon Sep 17 00:00:00 2001
From: "Justin T. Gibbs" <gibbs@FreeBSD.org>
Date: Tue, 19 Oct 2010 20:53:30 +0000
Subject: [PATCH 02/68] Improve the Xen para-virtualized device infrastructure
 of FreeBSD:

 o Add support for backend devices (e.g. blkback)
 o Implement extensions to the Xen para-virtualized block API to allow
   for larger and more outstanding I/Os.
 o Import a completely rewritten block back driver with support for fronting
   I/O to both raw devices and files.
 o General cleanup and documentation of the XenBus and XenStore support code.
 o Robustness and performance updates for the block front driver.
 o Fixes to the netfront driver.

Sponsored by: Spectra Logic Corporation

sys/xen/xenbus/init.txt:
	Deleted: This file explains the Linux method for XenBus device
	enumeration and thus does not apply to FreeBSD's NewBus approach.

sys/xen/xenbus/xenbus_probe_backend.c:
	Deleted: Linux version of backend XenBus service routines.  It
	was never ported to FreeBSD.  See xenbusb.c, xenbusb_if.m,
	xenbusb_front.c, and xenbusb_back.c for details of FreeBSD's XenBus
	support.

sys/xen/xenbus/xenbusvar.h:
sys/xen/xenbus/xenbus_xs.c:
sys/xen/xenbus/xenbus_comms.c:
sys/xen/xenbus/xenbus_comms.h:
sys/xen/xenstore/xenstorevar.h:
sys/xen/xenstore/xenstore.c:
	Split XenStore into its own tree.  XenBus is a software layer built
	on top of XenStore.  The old arrangement and the naming of some
	structures and functions blurred these lines making it difficult to
	discern what services are provided by which layer and at what times
	these services are available (e.g. during system startup and shutdown).

sys/xen/xenbus/xenbus_client.c:
sys/xen/xenbus/xenbus.c:
sys/xen/xenbus/xenbus_probe.c:
sys/xen/xenbus/xenbusb.c:
sys/xen/xenbus/xenbusb.h:
	Split up XenBus code into methods available for use by client
	drivers (xenbus.c) and code used by the XenBus "bus code" to
	enumerate, attach, detach, and service bus drivers.

sys/xen/reboot.c:
sys/dev/xen/control/control.c:
	Add a XenBus front driver for handling shutdown, reboot, suspend, and
	resume events published in the XenStore.  Move all PV suspend/reboot
	support from reboot.c into this driver.

sys/xen/blkif.h:
	New file from Xen vendor with macros and structures used by
	a block back driver to service requests from a VM running a
	different ABI (e.g. amd64 back with i386 front).

sys/conf/files:
	Adjust kernel build spec for new XenBus/XenStore layout and added
	Xen functionality.

sys/dev/xen/balloon/balloon.c:
sys/dev/xen/netfront/netfront.c:
sys/dev/xen/blkfront/blkfront.c:
sys/xen/xenbus/...
sys/xen/xenstore/...
	o Rename XenStore APIs and structures from xenbus_* to xs_*.
	o Adjust to use of M_XENBUS and M_XENSTORE malloc types for allocation
	  of objects returned by these APIs.
	o Adjust for changes in the bus interface for Xen drivers.

sys/xen/xenbus/...
sys/xen/xenstore/...
	Add Doxygen comments for these interfaces and the code that
	implements them.

sys/dev/xen/blkback/blkback.c:
	o Rewrite the Block Back driver to attach properly via newbus,
	  operate correctly in both PV and HVM mode regardless of domain
	  (e.g. can be in a DOM other than 0), and to deal with the latest
	  metadata available in XenStore for block devices.

	o Allow users to specify a file as a backend to blkback, in addition
	  to character devices.  Use the namei lookup of the backend path
	  to automatically configure, based on file type, the appropriate
	  backend method.

	The current implementation is limited to a single outstanding I/O
	at a time to file backed storage.

sys/dev/xen/blkback/blkback.c:
sys/xen/interface/io/blkif.h:
sys/xen/blkif.h:
sys/dev/xen/blkfront/blkfront.c:
sys/dev/xen/blkfront/block.h:
	Extend the Xen blkif API: Negotiable request size and number of
	requests.

	This change extends the information recorded in the XenStore
	allowing block front/back devices to negotiate for optimal I/O
	parameters.  This has been achieved without sacrificing backward
	compatibility with drivers that are unaware of these protocol
	enhancements.  The extensions center around the connection protocol
	which now includes these additions:

	o The back-end device publishes its maximum supported values for
	  request I/O size, the number of page segments that can be
	  associated with a request, the maximum number of requests that
	  can be concurrently active, and the maximum number of pages that
	  can be in the shared request ring.  These values are published
	  before the back-end enters the XenbusStateInitWait state.

	o The front-end waits for the back-end to enter either the InitWait
	  or Initialize state.  At this point, the front end limits its
	  own capabilities to the lesser of the values it finds published
	  by the backend, its own maximums, or, should any back-end data
	  be missing in the store, the values supported by the original
	  protocol.  It then initializes its internal data structures
	  including allocation of the shared ring, publishes its maximum
	  capabilities to the XenStore and transitions to the Initialized
	  state.

	o The back-end waits for the front-end to enter the Initialized
	  state.  At this point, the back end limits its own capabilities
	  to the lesser of the values it finds published by the frontend,
	  its own maximums, or, should any front-end data be missing in
	  the store, the values supported by the original protocol.  It
	  then initializes its internal data structures, attaches to the
	  shared ring and transitions to the Connected state.

	o The front-end waits for the back-end to enter the Connected
	  state, transitions itself to the connected state, and can
	  commence I/O.

	Although an updated front-end driver must be aware of the back-end's
	InitWait state, the back-end has been coded such that it can
	tolerate a front-end that skips this step and transitions directly
	to the Initialized state without waiting for the back-end.

sys/xen/interface/io/blkif.h:
	o Increase BLKIF_MAX_SEGMENTS_PER_REQUEST to 255.  This is
	  the maximum number possible without changing the blkif
	  request header structure (nr_segs is a uint8_t).

	o Add two new constants:
	  BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK, and
	  BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK.  These respectively
	  indicate the number of segments that can fit in the first
	  ring-buffer entry of a request, and for each subsequent
	  (sg element only) ring-buffer entry associated with the
          "header" ring-buffer entry of the request.

	o Add the blkif_request_segment_t typedef for segment
	  elements.

	o Add the BLKRING_GET_SG_REQUEST() macro which wraps the
	  RING_GET_REQUEST() macro and returns a properly cast
	  pointer to an array of blkif_request_segment_ts.

	o Add the BLKIF_SEGS_TO_BLOCKS() macro which calculates the
	  number of ring entries that will be consumed by a blkif
	  request with the given number of segments.

sys/xen/blkif.h:
	o Update for changes in interface/io/blkif.h macros.

	o Update the BLKIF_MAX_RING_REQUESTS() macro to take the
	  ring size as an argument to allow this calculation on
	  multi-page rings.

	o Add a companion macro to BLKIF_MAX_RING_REQUESTS(),
	  BLKIF_RING_PAGES().  This macro determines the number of
	  ring pages required in order to support a ring with the
	  supplied number of request blocks.

sys/dev/xen/blkback/blkback.c:
sys/dev/xen/blkfront/blkfront.c:
sys/dev/xen/blkfront/block.h:
	o Negotiate with the other-end with the following limits:
	      Request Size:   MAXPHYS
	      Max Segments:   (MAXPHYS/PAGE_SIZE) + 1
	      Max Requests:   256
	      Max Ring Pages: Sufficient to support Max Requests with
	                      Max Segments.

	o Dynamically allocate request pools and segments-per-request.

	o Update ring allocation/attachment code to support a
	  multi-page shared ring.

	o Update routines that access the shared ring to handle
	  multi-block requests.

sys/dev/xen/blkfront/blkfront.c:
	o Track blkfront allocations in a blkfront driver specific
	  malloc pool.

	o Strip out XenStore transaction retry logic in the
	  connection code.  Transactions only need to be used when
	  the update to multiple XenStore nodes must be atomic.
	  That is not the case here.

	o Fully disable blkif_resume() until it can be fixed
	  properly (it didn't work before this change).

	o Destroy bus-dma objects during device instance tear-down.

	o Properly handle backend devices with power-of-2 sector
	  sizes larger than 512b.

sys/dev/xen/blkback/blkback.c:
	Advertise support for and implement the BLKIF_OP_WRITE_BARRIER
	and BLKIF_OP_FLUSH_DISKCACHE blkif opcodes using BIO_FLUSH and
	the BIO_ORDERED attribute of bios.

sys/dev/xen/blkfront/blkfront.c:
sys/dev/xen/blkfront/block.h:
	Fix various bugs in blkfront.

       o gnttab_alloc_grant_references() returns 0 for success and
	 non-zero for failure.  The check for < 0 is a leftover
	 Linuxism.

       o When we negotiate with blkback and have to reduce some of our
	 capabilities, print out the original and reduced capability before
	 changing the local capability.  So the user now gets the correct
	 information.

	o Fix blkif_restart_queue_callback() formatting.  Make sure we hold
	  the mutex in that function before calling xb_startio().

	o Fix a couple of KASSERT()s.

        o Fix a check in the xb_remove_* macro to be a little more specific.

sys/xen/gnttab.h:
sys/xen/gnttab.c:
	Define GNTTAB_LIST_END publicly as GRANT_REF_INVALID.

sys/dev/xen/netfront/netfront.c:
	Use GRANT_REF_INVALID instead of driver private definitions of the
	same constant.

sys/xen/gnttab.h:
sys/xen/gnttab.c:
	Add the gnttab_end_foreign_access_references() API.

	This API allows a client to batch the release of an array of grant
	references, instead of coding a private for loop.  The implementation
	takes advantage of this batching to reduce lock overhead to one
	acquisition and release per-batch instead of per-freed grant reference.

	While here, reduce the duration the gnttab_list_lock is held during
	gnttab_free_grant_references() operations.  The search to find the
	tail of the incoming free list does not rely on global state and so
	can be performed without holding the lock.

sys/dev/xen/xenpci/evtchn.c:
sys/xen/evtchn/evtchn.c:
sys/xen/xen_intr.h:
	o Implement the bind_interdomain_evtchn_to_irqhandler API for HVM mode.
	  This allows an HVM domain to serve back end devices to other domains.
	  This API is already implemented for PV mode.

	o Synchronize the API between HVM and PV.

sys/dev/xen/xenpci/xenpci.c:
	o Scan the full region of CPUID space in which the Xen VMM interface
	  may be implemented.  On systems using SuSE as a Dom0 where the
	  Viridian API is also exported, the VMM interface is above the region
	  we used to search.

	o Pass through bus_alloc_resource() calls so that XenBus drivers
	  attaching on an HVM system can allocate unused physical address
	  space from the nexus.  The block back driver makes use of this
	  facility.

sys/i386/xen/xen_machdep.c:
	Use the correct type for accessing the statically mapped xenstore
	metadata.

sys/xen/interface/hvm/params.h:
sys/xen/xenstore/xenstore.c:
	Move hvm_get_parameter() to the correct global header file instead
	of as a private method to the XenStore.

sys/xen/interface/io/protocols.h:
	Sync with vendor.

sys/xen/interface/io/ring.h:
	Add macro for calculating the number of ring pages needed for an N
	deep ring.

	To avoid duplication within the macros, create and use the new
	__RING_HEADER_SIZE() macro.  This macro calculates the size of the
	ring book keeping struct (producer/consumer indexes, etc.) that
	resides at the head of the ring.

	Add the __RING_PAGES() macro which calculates the number of shared
	ring pages required to support a ring with the given number of
	requests.

	These APIs are used to support the multi-page ring version of the
	Xen block API.

sys/xen/interface/io/xenbus.h:
	Add Comments.

sys/xen/xenbus/...
	o Refactor the FreeBSD XenBus support code to allow for both front and
	  backend device attachments.

	o Make use of new config_intr_hook capabilities to allow front and back
	  devices to be probed/attached in parallel.

	o Fix bugs in probe/attach state machine that could cause the system to
	  hang when confronted with a failure either in the local domain or in
	  a remote domain to which one of our driver instances is attaching.

	o Publish all required state to the XenStore on device detach and
	  failure.  The majority of the missing functionality was for serving
	  as a back end since the typical "hot-plug" scripts in Dom0 don't
	  handle the case of cleaning up for a "service domain" that is not
	  itself.

	o Add dynamic sysctl nodes exposing the generic ivars of
	  XenBus devices.

	o Add doxygen style comments to the majority of the code.

	o Cleanup types, formatting, etc.

sys/xen/xenbus/xenbusb.c:
	Common code used by both front and back XenBus busses.

sys/xen/xenbus/xenbusb_if.m:
	Method definitions for a XenBus bus.

sys/xen/xenbus/xenbusb_front.c:
sys/xen/xenbus/xenbusb_back.c:
	XenBus bus specialization for front and back devices.

MFC after:	1 month
---
 sys/conf/files                                |   19 +-
 sys/dev/xen/balloon/balloon.c                 |   10 +-
 sys/dev/xen/blkback/blkback.c                 | 3833 ++++++++++++-----
 sys/dev/xen/blkfront/blkfront.c               |  600 ++-
 sys/dev/xen/blkfront/block.h                  |   64 +-
 sys/dev/xen/control/control.c                 |  493 +++
 sys/dev/xen/netfront/netfront.c               |   72 +-
 sys/dev/xen/xenpci/evtchn.c                   |   43 +
 sys/dev/xen/xenpci/xenpci.c                   |   81 +-
 sys/i386/xen/xen_machdep.c                    |    6 +-
 sys/xen/blkif.h                               |  145 +
 sys/xen/evtchn/evtchn.c                       |    6 +-
 sys/xen/gnttab.c                              |   80 +-
 sys/xen/gnttab.h                              |    9 +
 sys/xen/interface/grant_table.h               |    2 +
 sys/xen/interface/hvm/params.h                |   26 +
 sys/xen/interface/io/blkif.h                  |   31 +-
 sys/xen/interface/io/protocols.h              |    3 +
 sys/xen/interface/io/ring.h                   |   18 +-
 sys/xen/interface/io/xenbus.h                 |    9 +
 sys/xen/reboot.c                              |  266 --
 sys/xen/xen_intr.h                            |    2 +-
 sys/xen/xenbus/init.txt                       |   14 -
 sys/xen/xenbus/{xenbus_client.c => xenbus.c}  |  143 +-
 sys/xen/xenbus/xenbus_comms.c                 |  226 -
 sys/xen/xenbus/xenbus_comms.h                 |   48 -
 sys/xen/xenbus/xenbus_if.m                    |   14 +-
 sys/xen/xenbus/xenbus_probe.c                 |  602 ---
 sys/xen/xenbus/xenbus_probe_backend.c         |  308 --
 sys/xen/xenbus/xenbus_xs.c                    |  935 ----
 sys/xen/xenbus/xenbusb.c                      |  878 ++++
 sys/xen/xenbus/xenbusb.h                      |  272 ++
 sys/xen/xenbus/xenbusb_back.c                 |  295 ++
 sys/xen/xenbus/xenbusb_front.c                |  195 +
 sys/xen/xenbus/xenbusb_if.m                   |   78 +
 sys/xen/xenbus/xenbusvar.h                    |  365 +-
 sys/xen/xenstore/xenstore.c                   | 1654 +++++++
 .../xenbus_dev.c => xenstore/xenstore_dev.c}  |   93 +-
 sys/xen/xenstore/xenstore_internal.h          |   39 +
 sys/xen/xenstore/xenstorevar.h                |  338 ++
 40 files changed, 8206 insertions(+), 4109 deletions(-)
 create mode 100644 sys/dev/xen/control/control.c
 create mode 100644 sys/xen/blkif.h
 delete mode 100644 sys/xen/reboot.c
 delete mode 100644 sys/xen/xenbus/init.txt
 rename sys/xen/xenbus/{xenbus_client.c => xenbus.c} (65%)
 delete mode 100644 sys/xen/xenbus/xenbus_comms.c
 delete mode 100644 sys/xen/xenbus/xenbus_comms.h
 delete mode 100644 sys/xen/xenbus/xenbus_probe.c
 delete mode 100644 sys/xen/xenbus/xenbus_probe_backend.c
 delete mode 100644 sys/xen/xenbus/xenbus_xs.c
 create mode 100644 sys/xen/xenbus/xenbusb.c
 create mode 100644 sys/xen/xenbus/xenbusb.h
 create mode 100644 sys/xen/xenbus/xenbusb_back.c
 create mode 100644 sys/xen/xenbus/xenbusb_front.c
 create mode 100644 sys/xen/xenbus/xenbusb_if.m
 create mode 100644 sys/xen/xenstore/xenstore.c
 rename sys/xen/{xenbus/xenbus_dev.c => xenstore/xenstore_dev.c} (68%)
 create mode 100644 sys/xen/xenstore/xenstore_internal.h
 create mode 100644 sys/xen/xenstore/xenstorevar.h

diff --git a/sys/conf/files b/sys/conf/files
index 75f16e54913..74f25c1c7c9 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3008,19 +3008,20 @@ xen/gnttab.c			optional xen | xenhvm
 xen/features.c			optional xen | xenhvm
 xen/evtchn/evtchn.c		optional xen
 xen/evtchn/evtchn_dev.c		optional xen | xenhvm
-xen/reboot.c			optional xen
-xen/xenbus/xenbus_client.c	optional xen | xenhvm
-xen/xenbus/xenbus_comms.c	optional xen | xenhvm
-xen/xenbus/xenbus_dev.c		optional xen | xenhvm
 xen/xenbus/xenbus_if.m		optional xen | xenhvm
-xen/xenbus/xenbus_probe.c	optional xen | xenhvm
-#xen/xenbus/xenbus_probe_backend.c	optional xen
-xen/xenbus/xenbus_xs.c		optional xen | xenhvm
+xen/xenbus/xenbus.c		optional xen | xenhvm
+xen/xenbus/xenbusb_if.m		optional xen | xenhvm
+xen/xenbus/xenbusb.c		optional xen | xenhvm
+xen/xenbus/xenbusb_front.c	optional xen | xenhvm
+xen/xenbus/xenbusb_back.c	optional xen | xenhvm
+xen/xenstore/xenstore.c		optional xen | xenhvm
+xen/xenstore/xenstore_dev.c	optional xen | xenhvm
 dev/xen/balloon/balloon.c	optional xen | xenhvm
+dev/xen/blkfront/blkfront.c	optional xen | xenhvm
+dev/xen/blkback/blkback.c	optional xen | xenhvm
 dev/xen/console/console.c	optional xen
 dev/xen/console/xencons_ring.c	optional xen
-dev/xen/blkfront/blkfront.c	optional xen | xenhvm
+dev/xen/control/control.c	optional xen | xenhvm
 dev/xen/netfront/netfront.c	optional xen | xenhvm
 dev/xen/xenpci/xenpci.c		optional xenpci
 dev/xen/xenpci/evtchn.c         optional xenpci
-dev/xen/xenpci/machine_reboot.c optional xenpci
diff --git a/sys/dev/xen/balloon/balloon.c b/sys/dev/xen/balloon/balloon.c
index 6948173765a..eb55dfc7b4c 100644
--- a/sys/dev/xen/balloon/balloon.c
+++ b/sys/dev/xen/balloon/balloon.c
@@ -44,7 +44,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/xen/xenfunc.h>
 #include <machine/xen/xenvar.h>
 #include <xen/hypervisor.h>
-#include <xen/xenbus/xenbusvar.h>
+#include <xen/xenstore/xenstorevar.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
@@ -406,20 +406,20 @@ set_new_target(unsigned long target)
 	wakeup(balloon_process);
 }
 
-static struct xenbus_watch target_watch =
+static struct xs_watch target_watch =
 {
 	.node = "memory/target"
 };
 
 /* React to a change in the target key */
 static void 
-watch_target(struct xenbus_watch *watch,
+watch_target(struct xs_watch *watch,
 	     const char **vec, unsigned int len)
 {
 	unsigned long long new_target;
 	int err;
 
-	err = xenbus_scanf(XBT_NIL, "memory", "target", NULL,
+	err = xs_scanf(XST_NIL, "memory", "target", NULL,
 	    "%llu", &new_target);
 	if (err) {
 		/* This is ok (for domain0 at least) - so just return */
@@ -438,7 +438,7 @@ balloon_init_watcher(void *arg)
 {
 	int err;
 
-	err = register_xenbus_watch(&target_watch);
+	err = xs_register_watch(&target_watch);
 	if (err)
 		printf("Failed to set balloon watcher\n");
 
diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c
index 259f2f6c041..72087f5994c 100644
--- a/sys/dev/xen/blkback/blkback.c
+++ b/sys/dev/xen/blkback/blkback.c
@@ -1,1055 +1,1919 @@
-/*
- * Copyright (c) 2006, Cisco Systems, Inc.
+/*-
+ * Copyright (c) 2009-2010 Spectra Logic Corporation
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without 
- * modification, are permitted provided that the following conditions 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
  * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
  *
- * 1. Redistributions of source code must retain the above copyright 
- *    notice, this list of conditions and the following disclaimer. 
- * 2. Redistributions in binary form must reproduce the above copyright 
- *    notice, this list of conditions and the following disclaimer in the 
- *    documentation and/or other materials provided with the distribution. 
- * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors 
- *    may be used to endorse or promote products derived from this software 
- *    without specific prior written permission. 
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
- * POSSIBILITY OF SUCH DAMAGE.
+ * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
+ *          Ken Merry           (Spectra Logic Corporation)
  */
-
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+/**
+ * \file blkback.c
+ *
+ * \brief Device driver supporting the vending of block storage from
+ *        a FreeBSD domain to other domains.
+ */
+
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/mbuf.h>
-#include <sys/malloc.h>
 #include <sys/kernel.h>
-#include <sys/socket.h>
-#include <sys/queue.h>
-#include <sys/taskqueue.h>
+#include <sys/malloc.h>
+
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/devicestat.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/filedesc.h>
+#include <sys/kdb.h>
+#include <sys/module.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
-#include <sys/filedesc.h>
+#include <sys/rman.h>
+#include <sys/taskqueue.h>
+#include <sys/types.h>
 #include <sys/vnode.h>
-#include <sys/fcntl.h>
-#include <sys/disk.h>
-#include <sys/bio.h>
-
-#include <sys/module.h>
-#include <sys/bus.h>
-#include <sys/sysctl.h>
+#include <sys/mount.h>
 
 #include <geom/geom.h>
 
+#include <machine/_inttypes.h>
+#include <machine/xen/xen-os.h>
+
+#include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 
-#include <machine/xen-os.h>
-#include <machine/hypervisor.h>
-#include <machine/hypervisor-ifs.h>
-#include <machine/xen_intr.h>
-#include <machine/evtchn.h>
-#include <machine/xenbus.h>
-#include <machine/gnttab.h>
-#include <machine/xen-public/memory.h>
-#include <dev/xen/xenbus/xenbus_comms.h>
+#include <xen/blkif.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/xen_intr.h>
 
+#include <xen/interface/event_channel.h>
+#include <xen/interface/grant_table.h>
 
-#if XEN_BLKBACK_DEBUG
+#include <xen/xenbus/xenbusvar.h>
+
+/*--------------------------- Compile-time Tunables --------------------------*/
+/**
+ * The maximum number of outstanding request blocks (request headers plus
+ * additional segment blocks) we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define	XBB_MAX_REQUESTS	256
+
+/**
+ * \brief Define to force all I/O to be performed on memory owned by the
+ *        backend device, with a copy-in/out to the remote domain's memory.
+ *
+ * \note  This option is currently required when this driver's domain is
+ *        operating in HVM mode on a system using an IOMMU.
+ *
+ * This driver uses Xen's grant table API to gain access to the memory of
+ * the remote domains it serves.  When our domain is operating in PV mode,
+ * the grant table mechanism directly updates our domain's page table entries
+ * to point to the physical pages of the remote domain.  This scheme guarantees
+ * that blkback and the backing devices it uses can safely perform DMA
+ * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
+ * insure that our domain cannot DMA to pages owned by another domain.  As
+ * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
+ * table API.  For this reason, in HVM mode, we must bounce all requests into
+ * memory that is mapped into our domain at domain startup and thus has
+ * valid IOMMU mappings.
+ */
+#define XBB_USE_BOUNCE_BUFFERS
+
+/**
+ * \brief Define to enable rudimentary request logging to the console.
+ */
+#undef XBB_DEBUG
+
+/*---------------------------------- Macros ----------------------------------*/
+/**
+ * Custom malloc type for all driver allocations.
+ */
+MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
+
+#ifdef XBB_DEBUG
 #define DPRINTF(fmt, args...) \
-    printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
+    printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
 #else
-#define DPRINTF(fmt, args...) ((void)0)
+#define DPRINTF(fmt, args...) do {} while(0)
 #endif
 
-#define WPRINTF(fmt, args...) \
-    printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
+/**
+ * The maximum mapped region size per request we will allow in a negotiated
+ * block-front/back communication channel.
+ */
+#define	XBB_MAX_REQUEST_SIZE		\
+	MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
 
-#define BLKBACK_INVALID_HANDLE (~0)
+/**
+ * The maximum number of segments (within a request header and accompanying
+ * segment blocks) per request we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define	XBB_MAX_SEGMENTS_PER_REQUEST			\
+	(MIN(UIO_MAXIOV,				\
+	     MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,	\
+		 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
 
-struct ring_ref {
-	vm_offset_t va;
-	grant_handle_t handle;
-	uint64_t bus_addr;
+/**
+ * The maximum number of shared memory ring pages we will allow in a
+ * negotiated block-front/back communication channel.  Allow enough
+ * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
+ */
+#define	XBB_MAX_RING_PAGES						    \
+	BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
+		       * XBB_MAX_REQUESTS)
+
+/*--------------------------- Forward Declarations ---------------------------*/
+struct xbb_softc;
+
+static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
+			      ...) __attribute__((format(printf, 3, 4)));
+static int  xbb_shutdown(struct xbb_softc *xbb);
+static int  xbb_detach(device_t dev);
+
+/*------------------------------ Data Structures -----------------------------*/
+/**
+ * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
+ */
+struct xbb_xen_req {
+	/**
+	 * Linked list links used to aggregate idle request in the
+	 * request free pool (xbb->request_free_slist).
+	 */
+	SLIST_ENTRY(xbb_xen_req) links;
+
+	/**
+	 * Back reference to the parent block back instance for this
+	 * request.  Used during bio_done handling.
+	 */
+	struct xbb_softc        *xbb;
+
+	/**
+	 * The remote domain's identifier for this I/O request.
+	 */
+	uint64_t		 id;
+
+	/**
+	 * Kernel virtual address space reserved for this request
+	 * structure and used to map the remote domain's pages for
+	 * this I/O, into our domain's address space.
+	 */
+	uint8_t			*kva;
+
+#ifdef XBB_USE_BOUNCE_BUFFERS
+	/**
+	 * Pre-allocated domain local memory used to proxy remote
+	 * domain memory during I/O operations.
+	 */
+	uint8_t			*bounce;
+#endif
+
+	/**
+	 * Base, psuedo-physical address, corresponding to the start
+	 * of this request's kva region.
+	 */
+	uint64_t	 	 gnt_base;
+
+	/**
+	 * The number of pages currently mapped for this request.
+	 */
+	int			 nr_pages;
+
+	/**
+	 * The number of 512 byte sectors comprising this requests.
+	 */
+	int			 nr_512b_sectors;
+
+	/**
+	 * The number of struct bio requests still outstanding for this
+	 * request on the backend device.  This field is only used for	
+	 * device (rather than file) backed I/O.
+	 */
+	int			 pendcnt;
+
+	/**
+	 * BLKIF_OP code for this request.
+	 */
+	int			 operation;
+
+	/**
+	 * BLKIF_RSP status code for this request.
+	 *
+	 * This field allows an error status to be recorded even if the
+	 * delivery of this status must be deferred.  Deferred reporting
+	 * is necessary, for example, when an error is detected during
+	 * completion processing of one bio when other bios for this
+	 * request are still outstanding.
+	 */
+	int			 status;
+
+	/**
+	 * Device statistics request ordering type (ordered or simple).
+	 */
+	devstat_tag_type	 ds_tag_type;
+
+	/**
+	 * Device statistics request type (read, write, no_data).
+	 */
+	devstat_trans_flags	 ds_trans_type;
+
+	/**
+	 * The start time for this request.
+	 */
+	struct bintime		 ds_t0;
+
+	/**
+	 * Array of grant handles (one per page) used to map this request.
+	 */
+	grant_handle_t		*gnt_handles;
+};
+SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
+
+/**
+ * \brief Configuration data for the shared memory request ring
+ *        used to communicate with the front-end client of this
+ *        driver.
+ */
+struct xbb_ring_config {
+	/** KVA address where ring memory is mapped. */
+	vm_offset_t	va;
+
+	/** The pseudo-physical address where ring memory is mapped.*/
+	uint64_t	gnt_addr;
+
+	/**
+	 * Grant table handles, one per-ring page, returned by the
+	 * hypervisor upon mapping of the ring and required to
+	 * unmap it when a connection is torn down.
+	 */
+	grant_handle_t	handle[XBB_MAX_RING_PAGES];
+
+	/**
+	 * The device bus address returned by the hypervisor when
+	 * mapping the ring and required to unmap it when a connection
+	 * is torn down.
+	 */
+	uint64_t	bus_addr[XBB_MAX_RING_PAGES];
+
+	/** The number of ring pages mapped for the current connection. */
+	u_int		ring_pages;
+
+	/**
+	 * The grant references, one per-ring page, supplied by the
+	 * front-end, allowing us to reference the ring pages in the
+	 * front-end's domain and to map these pages into our own domain.
+	 */
+	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];
+
+	/** The interrupt driven event channel used to signal ring events. */
+	evtchn_port_t   evtchn;
 };
 
-typedef struct blkback_info {
+/**
+ * Per-instance connection state flags.
+ */
+typedef enum
+{
+	/**
+	 * The front-end requested a read-only mount of the
+	 * back-end device/file.
+	 */
+	XBBF_READ_ONLY         = 0x01,
 
-	/* Schedule lists */
-	STAILQ_ENTRY(blkback_info) next_req;
-	int on_req_sched_list;
+	/** Communication with the front-end has been established. */
+	XBBF_RING_CONNECTED    = 0x02,
 
-	struct xenbus_device *xdev;
-	XenbusState frontend_state;
+	/**
+	 * Front-end requests exist in the ring and are waiting for
+	 * xbb_xen_req objects to free up.
+	 */
+	XBBF_RESOURCE_SHORTAGE = 0x04,
 
-	domid_t domid;
+	/** Connection teardown in progress. */
+	XBBF_SHUTDOWN          = 0x08
+} xbb_flag_t;
 
-	int state;
-	int ring_connected;
-	struct ring_ref rr;
-	blkif_back_ring_t ring;
-	evtchn_port_t evtchn;
-	int irq;
-	void *irq_cookie;
+/** Backend device type.  */
+typedef enum {
+	/** Backend type unknown. */
+	XBB_TYPE_NONE		= 0x00,
 
-	int ref_cnt;
+	/**
+	 * Backend type disk (access via cdev switch
+	 * strategy routine).
+	 */
+	XBB_TYPE_DISK		= 0x01,
 
-	int handle;
-	char *mode;
-	char *type;
-	char *dev_name;
+	/** Backend type file (access via vnode operations). */
+	XBB_TYPE_FILE		= 0x02
+} xbb_type;
 
-	struct vnode *vn;
-	struct cdev *cdev;
+/**
+ * \brief Structure used to memoize information about a per-request
+ *        scatter-gather list.
+ *
+ * The chief benefit of using this data structure is it avoids having
+ * to reparse the possibly discontiguous S/G list in the original
+ * request.  Due to the way that the mapping of the memory backing an
+ * I/O transaction is handled by Xen, a second pass is unavoidable.
+ * At least this way the second walk is a simple array traversal.
+ *
+ * \note A single Scatter/Gather element in the block interface covers
+ *       at most 1 machine page.  In this context a sector (blkif
+ *       nomenclature, not what I'd choose) is a 512b aligned unit
+ *       of mapping within the machine page referenced by an S/G
+ *       element.
+ */
+struct xbb_sg {
+	/** The number of 512b data chunks mapped in this S/G element. */
+	int16_t nsect;
+
+	/**
+	 * The index (0 based) of the first 512b data chunk mapped
+	 * in this S/G element.
+	 */
+	uint8_t first_sect;
+
+	/**
+	 * The index (0 based) of the last 512b data chunk mapped
+	 * in this S/G element.
+	 */
+	uint8_t last_sect;
+};
+
+/**
+ * Character device backend specific configuration data.
+ */
+struct xbb_dev_data {
+	/** Cdev used for device backend access.  */
+	struct cdev   *cdev;
+
+	/** Cdev switch used for device backend access.  */
 	struct cdevsw *csw;
-	u_int sector_size;
-	int sector_size_shift;
-	off_t media_size;
-	u_int media_num_sectors;
-	int major;
-	int minor;
-	int read_only;
 
-	struct mtx blk_ring_lock;
+	/** Used to hold a reference on opened cdev backend devices. */
+	int	       dev_ref;
+};
 
-	device_t ndev;
-
-	/* Stats */
-	int st_rd_req;
-	int st_wr_req;
-	int st_oo_req;
-	int st_err_req;
-} blkif_t;
-
-/*
- * These are rather arbitrary. They are fairly large because adjacent requests
- * pulled from a communication ring are quite likely to end up being part of
- * the same scatter/gather request at the disc.
- * 
- * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
- * 
- * This will increase the chances of being able to write whole tracks.
- * 64 should be enough to keep us competitive with Linux.
+/**
+ * File backend specific configuration data.
  */
-static int blkif_reqs = 64;
-TUNABLE_INT("xen.vbd.blkif_reqs", &blkif_reqs);
+struct xbb_file_data {
+	/** Credentials to use for vnode backed (file based) I/O. */
+	struct ucred   *cred;
 
-static int mmap_pages;
+	/**
+	 * \brief Array of io vectors used to process file based I/O.
+	 *
+	 * Only a single file based request is outstanding per-xbb instance,
+	 * so we only need one of these.
+	 */
+	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+#ifdef XBB_USE_BOUNCE_BUFFERS
 
-/*
- * Each outstanding request that we've passed to the lower device layers has a 
- * 'pending_req' allocated to it. Each buffer_head that completes decrements 
- * the pendcnt towards zero. When it hits zero, the specified domain has a 
- * response queued for it, with the saved 'id' passed back.
+	/**
+	 * \brief Array of io vectors used to handle bouncing of file reads.
+	 *
+	 * Vnode operations are free to modify uio data during their
+	 * execution.  In the case of a read with bounce buffering active,
+	 * we need some of the data from the original uio in order to
+	 * bounce-out the read data.  This array serves as the temporary
+	 * storage for this saved data.
+	 */
+	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+
+	/**
+	 * \brief Array of memoized bounce buffer kva offsets used
+	 *        in the file based backend.
+	 *
+	 * Due to the way that the mapping of the memory backing an
+	 * I/O transaction is handled by Xen, a second pass through
+	 * the request sg elements is unavoidable. We memoize the computed
+	 * bounce address here to reduce the cost of the second walk.
+	 */
+	void		*xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQUEST];
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+};
+
+/**
+ * Collection of backend type specific data.
  */
-typedef struct pending_req {
-	blkif_t       *blkif;
-	uint64_t       id;
-	int            nr_pages;
-	int            pendcnt;
-	unsigned short operation;
-	int            status;
-	STAILQ_ENTRY(pending_req) free_list;
-} pending_req_t;
+union xbb_backend_data {
+	struct xbb_dev_data  dev;
+	struct xbb_file_data file;
+};
 
-static pending_req_t *pending_reqs;
-static STAILQ_HEAD(pending_reqs_list, pending_req) pending_free =
-	STAILQ_HEAD_INITIALIZER(pending_free);
-static struct mtx pending_free_lock;
+/**
+ * Function signature of backend specific I/O handlers.
+ */
+typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, blkif_request_t *ring_req,
+			      struct xbb_xen_req *req, int nseg,
+			      int operation, int flags);
 
-static STAILQ_HEAD(blkback_req_sched_list, blkback_info) req_sched_list =
-	STAILQ_HEAD_INITIALIZER(req_sched_list);
-static struct mtx req_sched_list_lock;
+/**
+ * Per-instance configuration data.
+ */
+struct xbb_softc {
 
-static unsigned long mmap_vstart;
-static unsigned long *pending_vaddrs;
-static grant_handle_t *pending_grant_handles;
+	/**
+	 * Task-queue used to process I/O requests.
+	 */
+	struct taskqueue	 *io_taskqueue;
 
-static struct task blk_req_task;
+	/**
+	 * Single "run the request queue" task enqueued
+	 * on io_taskqueue.
+	 */
+	struct task		  io_task;
 
-/* Protos */
-static void disconnect_ring(blkif_t *blkif);
-static int vbd_add_dev(struct xenbus_device *xdev);
+	/** Device type for this instance. */
+	xbb_type		  device_type;
 
-static inline int vaddr_pagenr(pending_req_t *req, int seg)
+	/** NewBus device corresponding to this instance. */
+	device_t		  dev;
+
+	/** Backend specific dispatch routine for this instance. */
+	xbb_dispatch_t		  dispatch_io;
+
+	/** The number of requests outstanding on the backend device/file. */
+	u_int			  active_request_count;
+
+	/** Free pool of request tracking structures. */
+	struct xbb_xen_req_slist  request_free_slist;
+
+	/** Array, sized at connection time, of request tracking structures. */
+	struct xbb_xen_req	 *requests;
+
+	/**
+	 * Global pool of kva used for mapping remote domain ring
+	 * and I/O transaction data.
+	 */
+	vm_offset_t		  kva;
+
+	/** Pseudo-physical address corresponding to kva. */
+	uint64_t		  gnt_base_addr;
+
+	/** The size of the global kva pool. */
+	int			  kva_size;
+
+	/**
+	 * \brief Cached value of the front-end's domain id.
+	 * 
+	 * This value is used once for each mapped page in
+	 * a transaction.  We cache it to avoid incurring the
+	 * cost of an ivar access every time this is needed.
+	 */
+	domid_t			  otherend_id;
+
+	/**
+	 * \brief The blkif protocol abi in effect.
+	 *
+	 * There are situations where the back and front ends can
+	 * have a different, native abi (e.g. intel x86_64 and
+	 * 32bit x86 domains on the same machine).  The back-end
+	 * always accommodates the front-end's native abi.  That
+	 * value is pulled from the XenStore and recorded here.
+	 */
+	int			  abi;
+
+	/**
+	 * \brief The maximum number of requests allowed to be in
+	 *        flight at a time.
+	 *
+	 * This value is negotiated via the XenStore.
+	 */
+	uint32_t		  max_requests;
+
+	/**
+	 * \brief The maximum number of segments (1 page per segment)
+	 *	  that can be mapped by a request.
+	 *
+	 * This value is negotiated via the XenStore.
+	 */
+	uint32_t		  max_request_segments;
+
+	/**
+	 * The maximum size of any request to this back-end
+	 * device.
+	 *
+	 * This value is negotiated via the XenStore.
+	 */
+	uint32_t		  max_request_size;
+
+	/** Various configuration and state bit flags. */
+	xbb_flag_t		  flags;
+
+	/** Ring mapping and interrupt configuration data. */
+	struct xbb_ring_config	  ring_config;
+
+	/** Runtime, cross-abi safe, structures for ring access. */
+	blkif_back_rings_t	  rings;
+
+	/** IRQ mapping for the communication ring event channel. */
+	int			  irq;
+
+	/**
+	 * \brief Backend access mode flags (e.g. write, or read-only).
+	 *
+	 * This value is passed to us by the front-end via the XenStore.
+	 */
+	char			 *dev_mode;
+
+	/**
+	 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
+	 *
+	 * This value is passed to us by the front-end via the XenStore.
+	 * Currently unused.
+	 */
+	char			 *dev_type;
+
+	/**
+	 * \brief Backend device/file identifier.
+	 *
+	 * This value is passed to us by the front-end via the XenStore.
+	 * We expect this to be a POSIX path indicating the file or
+	 * device to open.
+	 */
+	char			 *dev_name;
+
+	/**
+	 * Vnode corresponding to the backend device node or file
+	 * we are accessing.
+	 */
+	struct vnode		 *vn;
+
+	union xbb_backend_data	  backend;
+	/** The native sector size of the backend. */
+	u_int			  sector_size;
+
+	/** log2 of sector_size.  */
+	u_int			  sector_size_shift;
+
+	/** Size in bytes of the backend device or file.  */
+	off_t			  media_size;
+
+	/**
+	 * \brief media_size expressed in terms of the backend native
+	 *	  sector size.
+	 *
+	 * (e.g. xbb->media_size >> xbb->sector_size_shift).
+	 */
+	uint64_t		  media_num_sectors;
+
+	/**
+	 * \brief Array of memoized scatter gather data computed during the
+	 *	  conversion of blkif ring requests to internal xbb_xen_req
+	 *	  structures.
+	 *
+	 * Ring processing is serialized so we only need one of these.
+	 */
+	struct xbb_sg		  xbb_sgs[XBB_MAX_SEGMENTS_PER_REQUEST];
+
+	/** Mutex protecting per-instance data. */
+	struct mtx		  lock;
+
+#ifdef XENHVM
+	/**
+	 * Resource representing allocated physical address space
+	 * associated with our per-instance kva region.
+	 */
+	struct resource		 *pseudo_phys_res;
+
+	/** Resource id for allocated physical address space. */
+	int			  pseudo_phys_res_id;
+#endif
+
+	/** I/O statistics. */
+	struct devstat		 *xbb_stats;
+};
+
+/*---------------------------- Request Processing ----------------------------*/
+/**
+ * Allocate an internal transaction tracking structure from the free pool.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ *
+ * \return  On success, a pointer to the allocated xbb_xen_req structure.
+ *          Otherwise NULL.
+ */
+static inline struct xbb_xen_req *
+xbb_get_req(struct xbb_softc *xbb)
 {
-	return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
-}
+	struct xbb_xen_req *req;
 
-static inline unsigned long vaddr(pending_req_t *req, int seg)
-{
-	return pending_vaddrs[vaddr_pagenr(req, seg)];
-}
+	req = NULL;
+	mtx_lock(&xbb->lock);
 
-#define pending_handle(_req, _seg) \
-	(pending_grant_handles[vaddr_pagenr(_req, _seg)])
-
-static unsigned long
-alloc_empty_page_range(unsigned long nr_pages)
-{
-	void *pages;
-	int i = 0, j = 0;
-	multicall_entry_t mcl[17];
-	unsigned long mfn_list[16];
-	struct xen_memory_reservation reservation = {
-		.extent_start = mfn_list,
-		.nr_extents   = 0,
-		.address_bits = 0,
-		.extent_order = 0,
-		.domid        = DOMID_SELF
-	};
-
-	pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT);
-	if (pages == NULL)
-		return 0;
-
-	memset(mcl, 0, sizeof(mcl));
-
-	while (i < nr_pages) {
-		unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE);
-
-		mcl[j].op = __HYPERVISOR_update_va_mapping;
-		mcl[j].args[0] = va;
-
-		mfn_list[j++] = vtomach(va) >> PAGE_SHIFT;
-
-		xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY;
-
-		if (j == 16 || i == nr_pages) {
-			mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL;
-
-			reservation.nr_extents = j;
-
-			mcl[j].op = __HYPERVISOR_memory_op;
-			mcl[j].args[0] = XENMEM_decrease_reservation;
-			mcl[j].args[1] =  (unsigned long)&reservation;
-			
-			(void)HYPERVISOR_multicall(mcl, j+1);
-
-			mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0;
-			j = 0;
+	/*
+	 * Do not allow new requests to be allocated while we
+	 * are shutting down.
+	 */
+	if ((xbb->flags & XBBF_SHUTDOWN) == 0) {
+		if ((req = SLIST_FIRST(&xbb->request_free_slist)) != NULL) {
+			SLIST_REMOVE_HEAD(&xbb->request_free_slist, links);
+			xbb->active_request_count++;
+		} else {
+			xbb->flags |= XBBF_RESOURCE_SHORTAGE;
 		}
 	}
-
-	return (unsigned long)pages;
+	mtx_unlock(&xbb->lock);
+	return (req);
 }
 
-static pending_req_t *
-alloc_req(void)
+/**
+ * Return an allocated transaction tracking structure to the free pool.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ * \param req  The request structure to free.
+ */
+static inline void
+xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
 {
-	pending_req_t *req;
-	mtx_lock(&pending_free_lock);
-	if ((req = STAILQ_FIRST(&pending_free))) {
-		STAILQ_REMOVE(&pending_free, req, pending_req, free_list);
-		STAILQ_NEXT(req, free_list) = NULL;
+	int wake_thread;
+
+	mtx_lock(&xbb->lock);
+	wake_thread = xbb->flags & XBBF_RESOURCE_SHORTAGE;
+	xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
+	SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links);
+	xbb->active_request_count--;
+
+	if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
+		/*
+		 * Shutdown is in progress.  See if we can
+		 * progress further now that one more request
+		 * has completed and been returned to the
+		 * free pool.
+		 */
+		xbb_shutdown(xbb);
 	}
-	mtx_unlock(&pending_free_lock);
-	return req;
+	mtx_unlock(&xbb->lock);
+
+	if (wake_thread != 0)
+		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
 }
 
-static void
-free_req(pending_req_t *req)
+/**
+ * Given a page index and 512b sector offset within that page,
+ * calculate an offset into a request's kva region.
+ *
+ * \param req     The request structure whose kva region will be accessed.
+ * \param pagenr  The page index used to compute the kva offset.
+ * \param sector  The 512b sector index used to compute the page relative
+ *                kva offset.
+ *
+ * \return  The computed global KVA offset.
+ */
+static inline uint8_t *
+xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector)
 {
-	int was_empty;
-
-	mtx_lock(&pending_free_lock);
-	was_empty = STAILQ_EMPTY(&pending_free);
-	STAILQ_INSERT_TAIL(&pending_free, req, free_list);
-	mtx_unlock(&pending_free_lock);
-	if (was_empty)
-		taskqueue_enqueue(taskqueue_swi, &blk_req_task); 
+	return (req->kva + (PAGE_SIZE * pagenr) + (sector << 9));
 }
 
-static void
-fast_flush_area(pending_req_t *req)
+#ifdef XBB_USE_BOUNCE_BUFFERS
+/**
+ * Given a page index and 512b sector offset within that page,
+ * calculate an offset into a request's local bounce memory region.
+ *
+ * \param req     The request structure whose bounce region will be accessed.
+ * \param pagenr  The page index used to compute the bounce offset.
+ * \param sector  The 512b sector index used to compute the page relative
+ *                bounce offset.
+ *
+ * \return  The computed global bounce buffer address.
+ */
+static inline uint8_t *
+xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector)
 {
-	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	unsigned int i, invcount = 0;
-	grant_handle_t handle;
-	int ret;
+	return (req->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
+}
+#endif
 
+/**
+ * Given a page number and 512b sector offset within that page,
+ * calculate an offset into the request's memory region that the
+ * underlying backend device/file should use for I/O.
+ *
+ * \param req     The request structure whose I/O region will be accessed.
+ * \param pagenr  The page index used to compute the I/O offset.
+ * \param sector  The 512b sector index used to compute the page relative
+ *                I/O offset.
+ *
+ * \return  The computed global I/O address.
+ *
+ * Depending on configuration, this will either be a local bounce buffer
+ * or a pointer to the memory mapped in from the front-end domain for
+ * this request.
+ */
+static inline uint8_t *
+xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector)
+{
+#ifdef XBB_USE_BOUNCE_BUFFERS
+	return (xbb_req_bounce_addr(req, pagenr, sector));
+#else
+	return (xbb_req_vaddr(req, pagenr, sector));
+#endif
+}
+
+/**
+ * Given a page index and 512b sector offset within that page, calculate
+ * an offset into the local pseudo-physical address space used to map a
+ * front-end's request data into a request.
+ *
+ * \param req     The request structure whose pseudo-physical region
+ *                will be accessed.
+ * \param pagenr  The page index used to compute the pseudo-physical offset.
+ * \param sector  The 512b sector index used to compute the page relative
+ *                pseudo-physical offset.
+ *
+ * \return  The computed global pseudo-physical address.
+ *
+ * The result is always derived from the request's gnt_base; unlike
+ * xbb_req_ioaddr(), bounce buffer configuration has no effect on
+ * this calculation.
+ */
+static inline uintptr_t
+xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector)
+{
+	return ((uintptr_t)(req->gnt_base
+			  + (PAGE_SIZE * pagenr) + (sector << 9)));
+}
+
+/**
+ * Unmap the front-end pages associated with this I/O request.
+ *
+ * \param req  The request structure to unmap.
+ */
+static void
+xbb_unmap_req(struct xbb_xen_req *req)
+{
+	struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQUEST];
+	u_int			      i;
+	u_int			      invcount;
+	int			      error;
+
+	invcount = 0;
 	for (i = 0; i < req->nr_pages; i++) {
-		handle = pending_handle(req, i);
-		if (handle == BLKBACK_INVALID_HANDLE)
+
+		if (req->gnt_handles[i] == GRANT_REF_INVALID)
 			continue;
-		unmap[invcount].host_addr    = vaddr(req, i);
+
+		unmap[invcount].host_addr    = xbb_req_gntaddr(req, i, 0);
 		unmap[invcount].dev_bus_addr = 0;
-		unmap[invcount].handle       = handle;
-		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+		unmap[invcount].handle       = req->gnt_handles[i];
+		req->gnt_handles[i]	     = GRANT_REF_INVALID;
 		invcount++;
 	}
 
-	ret = HYPERVISOR_grant_table_op(
-		GNTTABOP_unmap_grant_ref, unmap, invcount);
-	PANIC_IF(ret);
+	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+					  unmap, invcount);
+	KASSERT(error == 0, ("Grant table operation failed"));
 }
 
+/**
+ * Create and transmit a response to a blkif request.
+ * 
+ * \param xbb     Per-instance xbb configuration structure.
+ * \param req     The request structure to which to respond.
+ * \param status  The status code to report.  See BLKIF_RSP_*
+ *                in sys/xen/interface/io/blkif.h.
+ */
 static void
-blkif_get(blkif_t *blkif)
-{
-	atomic_add_int(&blkif->ref_cnt, 1);
-}
-
-static void
-blkif_put(blkif_t *blkif)
-{
-	if (atomic_fetchadd_int(&blkif->ref_cnt, -1) == 1) {
-		DPRINTF("Removing %x\n", (unsigned int)blkif);
-		disconnect_ring(blkif);
-		if (blkif->mode)
-			free(blkif->mode, M_DEVBUF);			
-		if (blkif->type)
-			free(blkif->type, M_DEVBUF);			
-		if (blkif->dev_name)
-			free(blkif->dev_name, M_DEVBUF);			
-		free(blkif, M_DEVBUF);
-	}
-}
-
-static int
-blkif_create(struct xenbus_device *xdev, long handle, char *mode, char *type, char *params)
-{
-	blkif_t *blkif;
-
-	blkif = (blkif_t *)malloc(sizeof(*blkif), M_DEVBUF, M_NOWAIT | M_ZERO);
-	if (!blkif)
-		return ENOMEM;
-	
-	DPRINTF("Created %x\n", (unsigned int)blkif);
-
-	blkif->ref_cnt = 1;
-	blkif->domid = xdev->otherend_id;
-	blkif->handle = handle;
-	blkif->mode = mode;
-	blkif->type = type;
-	blkif->dev_name = params;
-	blkif->xdev = xdev;
-	xdev->data = blkif;
-
-	mtx_init(&blkif->blk_ring_lock, "blk_ring_ock", "blkback ring lock", MTX_DEF);
-
-	if (strcmp(mode, "w"))
-		blkif->read_only = 1;
-
-	return 0;
-}
-
-static void
-add_to_req_schedule_list_tail(blkif_t *blkif)
-{
-	if (!blkif->on_req_sched_list) {
-		mtx_lock(&req_sched_list_lock);
-		if (!blkif->on_req_sched_list && (blkif->state == XenbusStateConnected)) {
-			blkif_get(blkif);
-			STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req);
-			blkif->on_req_sched_list = 1;
-			taskqueue_enqueue(taskqueue_swi, &blk_req_task); 
-		}
-		mtx_unlock(&req_sched_list_lock);
-	}
-}
-
-/* This routine does not call blkif_get(), does not schedule the blk_req_task to run,
-   and assumes that the state is connected */
-static void
-add_to_req_schedule_list_tail2(blkif_t *blkif)
-{
-	mtx_lock(&req_sched_list_lock);
-	if (!blkif->on_req_sched_list) {
-		STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req);
-		blkif->on_req_sched_list = 1;
-	}
-	mtx_unlock(&req_sched_list_lock);
-}
-
-/* Removes blkif from front of list and does not call blkif_put() (caller must) */
-static blkif_t *
-remove_from_req_schedule_list(void)
-{
-	blkif_t *blkif;
-
-	mtx_lock(&req_sched_list_lock);
-
-	if ((blkif = STAILQ_FIRST(&req_sched_list))) {
-		STAILQ_REMOVE(&req_sched_list, blkif, blkback_info, next_req);
-		STAILQ_NEXT(blkif, next_req) = NULL;
-		blkif->on_req_sched_list = 0;
-	}
-
-	mtx_unlock(&req_sched_list_lock);
-
-	return blkif;
-}
-
-static void
-make_response(blkif_t *blkif, uint64_t id, 
-			  unsigned short op, int st)
+xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
 {
 	blkif_response_t *resp;
-	blkif_back_ring_t *blk_ring = &blkif->ring;
-	int more_to_do = 0;
-	int notify;
+	int		  more_to_do;
+	int		  notify;
 
-	mtx_lock(&blkif->blk_ring_lock);
+	more_to_do = 0;
 
+	/*
+	 * Place on the response ring for the relevant domain.
+	 * For now, only the spacing between entries is different
+	 * in the different ABIs, not the response entry layout.
+	 */
+	mtx_lock(&xbb->lock);
+	switch (xbb->abi) {
+	case BLKIF_PROTOCOL_NATIVE:
+		resp = RING_GET_RESPONSE(&xbb->rings.native,
+					 xbb->rings.native.rsp_prod_pvt);
+		break;
+	case BLKIF_PROTOCOL_X86_32:
+		resp = (blkif_response_t *)
+		    RING_GET_RESPONSE(&xbb->rings.x86_32,
+				      xbb->rings.x86_32.rsp_prod_pvt);
+		break;
+	case BLKIF_PROTOCOL_X86_64:
+		resp = (blkif_response_t *)
+		    RING_GET_RESPONSE(&xbb->rings.x86_64,
+				      xbb->rings.x86_64.rsp_prod_pvt);
+		break;
+	default:
+		panic("Unexpected blkif protocol ABI.");
+	}
 
-	/* Place on the response ring for the relevant domain. */ 
-	resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
-	resp->id        = id;
-	resp->operation = op;
-	resp->status    = st;
-	blk_ring->rsp_prod_pvt++;
-	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
+	resp->id        = req->id;
+	resp->operation = req->operation;
+	resp->status    = status;
+
+	xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
+
+	if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
 
-	if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
 		/*
 		 * Tail check for pending requests. Allows frontend to avoid
 		 * notifications if requests are already in flight (lower
 		 * overheads and promotes batching).
 		 */
-		RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
+		RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
+	} else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
 
-	} else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring))
 		more_to_do = 1;
-
-	mtx_unlock(&blkif->blk_ring_lock);
-
-	if (more_to_do)
-		add_to_req_schedule_list_tail(blkif);
-
-	if (notify)
-		notify_remote_via_irq(blkif->irq);
-}
-
-static void
-end_block_io_op(struct bio *bio)
-{
-	pending_req_t *pending_req = bio->bio_caller2;
-
-	if (bio->bio_error) {
-		DPRINTF("BIO returned error %d for operation on device %s\n",
-				bio->bio_error, pending_req->blkif->dev_name);
-		pending_req->status = BLKIF_RSP_ERROR;
-		pending_req->blkif->st_err_req++;
 	}
 
-#if 0
-	printf("done: bio=%x error=%x completed=%llu resid=%lu flags=%x\n",
-		   (unsigned int)bio, bio->bio_error, bio->bio_completed, bio->bio_resid, bio->bio_flags);
-#endif
+	mtx_unlock(&xbb->lock);
 
-	if (atomic_fetchadd_int(&pending_req->pendcnt, -1) == 1) {
-		fast_flush_area(pending_req);
-		make_response(pending_req->blkif, pending_req->id,
-			      pending_req->operation, pending_req->status);
-		blkif_put(pending_req->blkif);
-		free_req(pending_req);
+	if (more_to_do)
+		taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
+
+	if (notify)
+		notify_remote_via_irq(xbb->irq);
+}
+
+/**
+ * Completion handler for buffer I/O requests issued by the device
+ * backend driver.
+ *
+ * \param bio  The buffer I/O request on which to perform completion
+ *             processing.
+ */
+static void
+xbb_bio_done(struct bio *bio)
+{
+	struct xbb_softc   *xbb;
+	struct xbb_xen_req *req;
+
+	req = bio->bio_caller1;
+	xbb = req->xbb;
+
+	/* Only include transferred I/O in stats. */
+	req->nr_512b_sectors -= bio->bio_resid >> 9;
+	if (bio->bio_error) {
+		DPRINTF("BIO returned error %d for operation on device %s\n",
+			bio->bio_error, xbb->dev_name);
+		req->status = BLKIF_RSP_ERROR;
+
+		if (bio->bio_error == ENXIO
+		 && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
+
+			/*
+			 * Backend device has disappeared.  Signal the
+			 * front-end that we (the device proxy) want to
+			 * go away.
+			 */
+			xenbus_set_state(xbb->dev, XenbusStateClosing);
+		}
+	}
+
+#ifdef XBB_USE_BOUNCE_BUFFERS
+	if (bio->bio_cmd == BIO_READ) {
+		vm_offset_t kva_offset;
+
+		kva_offset = (vm_offset_t)bio->bio_data
+			   - (vm_offset_t)req->bounce;
+		memcpy((uint8_t *)req->kva + kva_offset,
+		       bio->bio_data, bio->bio_bcount);
+	}
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+
+	if (atomic_fetchadd_int(&req->pendcnt, -1) == 1) {
+		xbb_unmap_req(req);
+		xbb_send_response(xbb, req, req->status);
+		devstat_end_transaction(xbb->xbb_stats,
+					/*bytes*/req->nr_512b_sectors << 9,
+					req->ds_tag_type,
+					req->ds_trans_type,
+					/*now*/NULL,
+					/*then*/&req->ds_t0);
+		xbb_release_req(xbb, req);
 	}
 
 	g_destroy_bio(bio);
 }
 
+/**
+ * Parse a blkif request into an internal request structure and send
+ * it to the backend for processing.
+ *
+ * \param xbb           Per-instance xbb configuration structure.
+ * \param ring_req      Front-end's I/O request as pulled from the shared
+ *                      communication ring.
+ * \param req           Allocated internal request structure.
+ * \param req_ring_idx  The location of ring_req within the shared
+ *                      communication ring.
+ *
+ * This routine performs the backend common aspects of request parsing
+ * including compiling an internal request structure, parsing the S/G
+ * list and any secondary ring requests in which they may reside, and
+ * the mapping of front-end I/O pages into our domain.
+ */
 static void
-dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req)
+xbb_dispatch_io(struct xbb_softc *xbb, blkif_request_t *ring_req,
+		struct xbb_xen_req *req, RING_IDX req_ring_idx)
 {
-	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	struct { 
-		unsigned long buf; unsigned int nsec;
-	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	unsigned int nseg = req->nr_segments, nr_sects = 0;
-	struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	int operation, ret, i, nbio = 0;
+	struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQUEST];
+	struct xbb_sg                *xbb_sg;
+	struct gnttab_map_grant_ref  *map;
+	struct blkif_request_segment *sg;
+	struct blkif_request_segment *last_block_sg;
+	u_int			      nseg;
+	u_int			      seg_idx;
+	u_int			      block_segs;
+	int			      nr_sects;
+	int			      operation;
+	uint8_t			      bio_flags;
+	int			      error;
 
-	/* Check that number of segments is sane. */
-	if (unlikely(nseg == 0) || 
-	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
-		DPRINTF("Bad number of segments in request (%d)\n", nseg);
-		goto fail_response;
+	nseg                 = ring_req->nr_segments;
+	nr_sects             = 0;
+	req->xbb             = xbb;
+	req->id              = ring_req->id;
+	req->operation       = ring_req->operation;
+	req->status          = BLKIF_RSP_OKAY;
+	req->ds_tag_type     = DEVSTAT_TAG_SIMPLE;
+	req->nr_pages        = nseg;
+	req->nr_512b_sectors = 0;
+	bio_flags            = 0;
+	sg	             = NULL;
+
+	binuptime(&req->ds_t0);
+	devstat_start_transaction(xbb->xbb_stats, &req->ds_t0);
+
+	switch (req->operation) {
+	case BLKIF_OP_WRITE_BARRIER:
+		bio_flags       |= BIO_ORDERED;
+		req->ds_tag_type = DEVSTAT_TAG_ORDERED;
+		/* FALLTHROUGH */
+	case BLKIF_OP_WRITE:
+		operation = BIO_WRITE;
+		req->ds_trans_type = DEVSTAT_WRITE;
+		if ((xbb->flags & XBBF_READ_ONLY) != 0) {
+			DPRINTF("Attempt to write to read only device %s\n",
+				xbb->dev_name);
+			goto fail_send_response;
+		}
+		break;
+	case BLKIF_OP_READ:
+		operation = BIO_READ;
+		req->ds_trans_type = DEVSTAT_READ;
+		break;
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		operation = BIO_FLUSH;
+		req->ds_tag_type = DEVSTAT_TAG_ORDERED;
+		req->ds_trans_type = DEVSTAT_NO_DATA;
+		goto do_dispatch;
+		/*NOTREACHED*/
+	default:
+		DPRINTF("error: unknown block io operation [%d]\n",
+			req->operation);
+		goto fail_send_response;
 	}
 
-	if (req->operation == BLKIF_OP_WRITE) {
-		if (blkif->read_only) {
-			DPRINTF("Attempt to write to read only device %s\n", blkif->dev_name);
-			goto fail_response;
+	/* Check that number of segments is sane. */
+	if (unlikely(nseg == 0)
+	 || unlikely(nseg > xbb->max_request_segments)) {
+		DPRINTF("Bad number of segments in request (%d)\n", nseg);
+		goto fail_send_response;
+	}
+
+	map	      = maps;
+	xbb_sg        = xbb->xbb_sgs;
+	block_segs    = MIN(req->nr_pages, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
+	sg            = ring_req->seg;
+	last_block_sg = sg + block_segs;
+	seg_idx	      = 0;
+	while (1) {
+
+		while (sg < last_block_sg) {
+			
+			xbb_sg->first_sect = sg->first_sect;
+			xbb_sg->last_sect  = sg->last_sect;
+			xbb_sg->nsect =
+			    (int8_t)(sg->last_sect - sg->first_sect + 1);
+
+			if ((sg->last_sect >= (PAGE_SIZE >> 9))
+			 || (xbb_sg->nsect <= 0))
+				goto fail_send_response;
+
+			nr_sects += xbb_sg->nsect;
+			map->host_addr = xbb_req_gntaddr(req, seg_idx,
+							 /*sector*/0);
+			map->flags     = GNTMAP_host_map;
+			map->ref       = sg->gref;
+			map->dom       = xbb->otherend_id;
+			if (operation == BIO_WRITE)
+				map->flags |= GNTMAP_readonly;
+			sg++;
+			map++;
+			xbb_sg++;
+			seg_idx++;
 		}
-		operation = BIO_WRITE;
-	} else
-		operation = BIO_READ;
 
-	pending_req->blkif     = blkif;
-	pending_req->id        = req->id;
-	pending_req->operation = req->operation;
-	pending_req->status    = BLKIF_RSP_OKAY;
-	pending_req->nr_pages  = nseg;
+		block_segs = MIN(nseg - seg_idx,
+				 BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
+		if (block_segs == 0)
+			break;
 
-	for (i = 0; i < nseg; i++) {
-		seg[i].nsec = req->seg[i].last_sect - 
-			req->seg[i].first_sect + 1;
-
-		if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
-		    (seg[i].nsec <= 0))
-			goto fail_response;
-		nr_sects += seg[i].nsec;
-
-		map[i].host_addr = vaddr(pending_req, i);
-		map[i].dom = blkif->domid;
-		map[i].ref = req->seg[i].gref;
-		map[i].flags = GNTMAP_host_map;
-		if (operation == BIO_WRITE)
-			map[i].flags |= GNTMAP_readonly;
+		/*
+		 * Fetch the next request block full of SG elements.
+		 * For now, only the spacing between entries is different
+		 * in the different ABIs, not the sg entry layout.
+		 */
+		req_ring_idx++;
+		switch (xbb->abi) {
+		case BLKIF_PROTOCOL_NATIVE:
+			sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native,
+						    req_ring_idx);
+			break;
+		case BLKIF_PROTOCOL_X86_32:
+		{
+			sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32,
+						    req_ring_idx);
+			break;
+		}
+		case BLKIF_PROTOCOL_X86_64:
+		{
+			sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64,
+						    req_ring_idx);
+			break;
+		}
+		default:
+			panic("Unexpected blkif protocol ABI.");
+			/* NOTREACHED */
+		} 
+		last_block_sg = sg + block_segs;
 	}
 
 	/* Convert to the disk's sector size */
-	nr_sects = (nr_sects << 9) >> blkif->sector_size_shift;
+	req->nr_512b_sectors = nr_sects;
+	nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
 
-	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
-	PANIC_IF(ret);
-
-	for (i = 0; i < nseg; i++) {
-		if (unlikely(map[i].status != 0)) {
-			DPRINTF("invalid buffer -- could not remap it\n");
-			goto fail_flush;
-		}
-
-		pending_handle(pending_req, i) = map[i].handle;
-#if 0
-		/* Can't do this in FreeBSD since vtophys() returns the pfn */
-		/* of the remote domain who loaned us the machine page - DPT */
-		xen_phys_machine[(vtophys(vaddr(pending_req, i)) >> PAGE_SHIFT)] =
-			map[i]dev_bus_addr >> PAGE_SHIFT;
-#endif
-		seg[i].buf  = map[i].dev_bus_addr | 
-			(req->seg[i].first_sect << 9);
+	if ((req->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) {
+		device_printf(xbb->dev, "%s: I/O size (%d) is not a multiple "
+			      "of the backing store sector size (%d)\n",
+			      __func__, req->nr_512b_sectors << 9,
+			      xbb->sector_size);
+		goto fail_send_response;
 	}
 
-	if (req->sector_number + nr_sects > blkif->media_num_sectors) {
-		DPRINTF("%s of [%llu,%llu] extends past end of device %s\n",
+	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+					  maps, req->nr_pages);
+	if (error != 0)
+		panic("Grant table operation failed (%d)", error);
+
+	for (seg_idx = 0, map = maps; seg_idx < nseg; seg_idx++, map++) {
+
+		if (unlikely(map->status != 0)) {
+			DPRINTF("invalid buffer -- could not remap it (%d)\n",
+				map->status);
+			DPRINTF("Mapping(%d): Host Addr 0x%lx, flags 0x%x "
+				"ref 0x%x, dom %d\n", seg_idx,
+				map->host_addr, map->flags, map->ref,
+				map->dom);
+			goto fail_unmap_req;
+		}
+
+		req->gnt_handles[seg_idx] = map->handle;
+	}
+	if (ring_req->sector_number + nr_sects > xbb->media_num_sectors) {
+
+		DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
+			"extends past end of device %s\n",
 			operation == BIO_READ ? "read" : "write",
-			req->sector_number,
-			req->sector_number + nr_sects, blkif->dev_name); 
-		goto fail_flush;
+			ring_req->sector_number,
+			ring_req->sector_number + nr_sects, xbb->dev_name); 
+		goto fail_unmap_req;
 	}
 
-	for (i = 0; i < nseg; i++) {
-		struct bio *bio;
+do_dispatch:
 
-		if ((int)seg[i].nsec & ((blkif->sector_size >> 9) - 1)) {
-			DPRINTF("Misaligned I/O request from domain %d", blkif->domid);
-			goto fail_put_bio;
-		}
+	error = xbb->dispatch_io(xbb,
+				 ring_req,
+				 req,
+				 nseg,
+				 operation,
+				 bio_flags);
 
-		bio = biolist[nbio++] = g_new_bio();
-		if (unlikely(bio == NULL))
-			goto fail_put_bio;
-
-		bio->bio_cmd = operation;
-		bio->bio_offset = req->sector_number << blkif->sector_size_shift;
-		bio->bio_length = seg[i].nsec << 9;
-		bio->bio_bcount = bio->bio_length;
-		bio->bio_data = (caddr_t)(vaddr(pending_req, i) | (seg[i].buf & PAGE_MASK));
-		bio->bio_done = end_block_io_op;
-		bio->bio_caller2 = pending_req;
-		bio->bio_dev = blkif->cdev;
-
-		req->sector_number += (seg[i].nsec << 9) >> blkif->sector_size_shift;
-#if 0
-		printf("new: bio=%x cmd=%d sect=%llu nsect=%u iosize_max=%u @ %08lx\n",
-			(unsigned int)bio, req->operation, req->sector_number, seg[i].nsec,
-			blkif->cdev->si_iosize_max, seg[i].buf);
-#endif
+	if (error != 0) {
+		if (operation == BIO_FLUSH)
+			goto fail_send_response;
+		else
+			goto fail_unmap_req;
 	}
 
-	pending_req->pendcnt = nbio;
-	blkif_get(blkif);
-
-	for (i = 0; i < nbio; i++)
-		(*blkif->csw->d_strategy)(biolist[i]);
-
 	return;
 
- fail_put_bio:
-	for (i = 0; i < (nbio-1); i++)
-		g_destroy_bio(biolist[i]);
- fail_flush:
-	fast_flush_area(pending_req);
- fail_response:
-	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
-	free_req(pending_req);
+
+fail_unmap_req:
+	xbb_unmap_req(req);
+	/* FALLTHROUGH */
+
+fail_send_response:
+	xbb_send_response(xbb, req, BLKIF_RSP_ERROR);
+	xbb_release_req(xbb, req);
+	devstat_end_transaction(xbb->xbb_stats,
+				/*bytes*/0,
+				req->ds_tag_type,
+				req->ds_trans_type,
+				/*now*/NULL,
+				/*then*/&req->ds_t0);
 }
 
+/**
+ * Process incoming requests from the shared communication ring in response
+ * to a signal on the ring's event channel.
+ *
+ * \param context  Callback argument registered during task initialization -
+ *                 the xbb_softc for this instance.
+ * \param pending  The number of taskqueue_enqueue events that have
+ *                 occurred since this handler was last run.
+ */
 static void
-blk_req_action(void *context, int pending)
+xbb_run_queue(void *context, int pending)
 {
-	blkif_t *blkif;
+	struct xbb_softc   *xbb;
+	blkif_back_rings_t *rings;
+	RING_IDX	    rp;
 
-	DPRINTF("\n");
 
-	while (!STAILQ_EMPTY(&req_sched_list)) {
-		blkif_back_ring_t *blk_ring;
-		RING_IDX rc, rp;
+	xbb   = (struct xbb_softc *)context;
+	rings = &xbb->rings;
 
-		blkif = remove_from_req_schedule_list();
+	/*
+	 * Cache req_prod to avoid accessing a cache line shared
+	 * with the frontend.
+	 */
+	rp = rings->common.sring->req_prod;
 
-		blk_ring = &blkif->ring;
-		rc = blk_ring->req_cons;
-		rp = blk_ring->sring->req_prod;
-		rmb(); /* Ensure we see queued requests up to 'rp'. */
+	/* Ensure we see queued requests up to 'rp'. */
+	rmb();
 
-		while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
-			blkif_request_t *req;
-			pending_req_t *pending_req;
+	/**
+	 * Run so long as there is work to consume and the generation
+	 * of a response will not overflow the ring.
+	 *
+	 * @note There's a 1 to 1 relationship between requests and responses,
+	 *       so an overflow should never occur.  This test is to protect
+	 *       our domain from digesting bogus data.  Shouldn't we log this?
+	 */
+	while (rings->common.req_cons != rp
+	    && RING_REQUEST_CONS_OVERFLOW(&rings->common,
+					  rings->common.req_cons) == 0) {
+		blkif_request_t     ring_req_storage;
+		blkif_request_t    *ring_req;
+		struct xbb_xen_req *req;
+		RING_IDX	    req_ring_idx;
 
-			pending_req = alloc_req();
-			if (pending_req == NULL)
-				goto out_of_preqs;
+		req = xbb_get_req(xbb);
+		if (req == NULL) {
+			/*
+			 * Resource shortage has been recorded.
+			 * We'll be scheduled to run once a request
+			 * object frees up due to a completion.
+			 */
+			break;
+		}
 
-			req = RING_GET_REQUEST(blk_ring, rc);
-			blk_ring->req_cons = ++rc; /* before make_response() */
+		switch (xbb->abi) {
+		case BLKIF_PROTOCOL_NATIVE:
+			ring_req = RING_GET_REQUEST(&xbb->rings.native,
+						    rings->common.req_cons);
+			break;
+		case BLKIF_PROTOCOL_X86_32:
+		{
+			struct blkif_x86_32_request *ring_req32;
 
-			switch (req->operation) {
-			case BLKIF_OP_READ:
-				blkif->st_rd_req++;
-				dispatch_rw_block_io(blkif, req, pending_req);
-				break;
-			case BLKIF_OP_WRITE:
-				blkif->st_wr_req++;
-				dispatch_rw_block_io(blkif, req, pending_req);
-				break;
-			default:
-				blkif->st_err_req++;
-				DPRINTF("error: unknown block io operation [%d]\n",
-						req->operation);
-				make_response(blkif, req->id, req->operation,
-							  BLKIF_RSP_ERROR);
-				free_req(pending_req);
-				break;
+			ring_req32 = RING_GET_REQUEST(&xbb->rings.x86_32,
+						      rings->common.req_cons);
+			blkif_get_x86_32_req(&ring_req_storage, ring_req32);
+			ring_req = &ring_req_storage;
+			break;
+		}
+		case BLKIF_PROTOCOL_X86_64:
+		{
+			struct blkif_x86_64_request *ring_req64;
+
+			ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
+						      rings->common.req_cons);
+			blkif_get_x86_64_req(&ring_req_storage, ring_req64);
+			ring_req = &ring_req_storage;
+			break;
+		}
+		default:
+			panic("Unexpected blkif protocol ABI.");
+			/* NOTREACHED */
+		} 
+
+		/*
+		 * Signify that we can overwrite this request with a
+		 * response by incrementing our consumer index. The
+		 * response won't be generated until after we've already
+		 * consumed all necessary data out of the version of the
+		 * request in the ring buffer (for native mode).  We
+		 * must update the consumer index before issuing back-end
+		 * I/O so there is no possibility that it will complete
+		 * and a response be generated before we make room in
+		 * the queue for that response.
+		 */
+		req_ring_idx = xbb->rings.common.req_cons;
+		xbb->rings.common.req_cons +=
+		    BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
+
+		xbb_dispatch_io(xbb, ring_req, req, req_ring_idx);
+	}
+}
+
+/**
+ * Event channel interrupt handler for the shared request ring.
+ *
+ * \param arg  The xbb_softc for this instance, as registered when the
+ *             event channel was bound.
+ */
+static void
+xbb_intr(void *arg)
+{
+	struct xbb_softc *sc;
+
+	sc = arg;
+	/* No work is done here; queue the instance's I/O task instead. */
+	taskqueue_enqueue(sc->io_taskqueue, &sc->io_task);
+}
+
+/*----------------------------- Backend Handlers -----------------------------*/
+/**
+ * Backend handler for character device access.
+ *
+ * \param xbb        Per-instance xbb configuration structure.
+ * \param ring_req   Front-end's I/O request as pulled from the shared
+ *                   communication ring.
+ * \param req        Allocated internal request structure.
+ * \param nseg       The number of valid segments for this request in
+ *                   xbb->xbb_sgs.
+ * \param operation  BIO_* I/O operation code.
+ * \param bio_flags  Additional bio_flag data to pass to any generated
+ *                   bios (e.g. BIO_ORDERED).
+ *
+ * \return  0 for success, errno codes for failure (no bios are issued).
+ */
+static int
+xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req,
+		 struct xbb_xen_req *req, int nseg, int operation,
+		 int bio_flags)
+{
+	struct xbb_dev_data *dev_data;
+	struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQUEST];
+	off_t                bio_offset;
+	struct bio          *bio;
+	struct xbb_sg       *xbb_sg;
+	u_int	             nbio;
+	u_int                bio_idx;
+	u_int                seg_idx;
+	int                  error;
+
+	dev_data   = &xbb->backend.dev;
+	bio_offset = (off_t)ring_req->sector_number
+		   << xbb->sector_size_shift;
+	error      = 0;
+	nbio       = 0;
+	bio_idx    = 0;
+
+	if (operation == BIO_FLUSH) {
+		bio = g_new_bio();
+		if (unlikely(bio == NULL)) {
+			DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
+			error = ENOMEM;
+			return (error);
+		}
+
+		bio->bio_cmd	 = BIO_FLUSH;
+		bio->bio_flags	|= BIO_ORDERED;
+		bio->bio_dev	 = dev_data->cdev;
+		bio->bio_offset	 = 0;
+		bio->bio_data	 = 0;
+		bio->bio_done	 = xbb_bio_done;
+		bio->bio_caller1 = req;
+		bio->bio_pblkno	 = 0;
+
+		req->pendcnt	 = 1;
+		/* Issue the flush bio itself; bios[] is never filled here. */
+		(*dev_data->csw->d_strategy)(bio);
+
+		return (0);
+	}
+
+	for (seg_idx = 0, bio = NULL, xbb_sg = xbb->xbb_sgs;
+	     seg_idx < nseg;
+	     seg_idx++, xbb_sg++) {
+
+		/*
+		 * KVA will not be contiguous, so any additional
+		 * I/O will need to be represented in a new bio.
+		 */
+		if ((bio != NULL)
+		 && (xbb_sg->first_sect != 0)) {
+			if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
+				printf("%s: Discontiguous I/O request from "
+				       "domain %d ends on non-sector "
+				       "boundary\n", __func__,
+				       xbb->otherend_id);
+				error = EINVAL;
+				goto fail_free_bios;
 			}
+			bio = NULL;
 		}
 
-		blkif_put(blkif);
+		if (bio == NULL) {
+			/*
+			 * Make sure that the start of this bio is aligned
+			 * to a device sector.
+			 */
+			if ((bio_offset & (xbb->sector_size - 1)) != 0) {
+				printf("%s: Misaligned I/O request from "
+				       "domain %d\n", __func__,
+				       xbb->otherend_id);
+				error = EINVAL;
+				goto fail_free_bios;
+			}
+			bio = g_new_bio();
+			if (unlikely(bio == NULL)) {
+				error = ENOMEM;
+				goto fail_free_bios;
+			}
+			bios[nbio++] = bio;
+			bio->bio_cmd     = operation;
+			bio->bio_flags  |= bio_flags;
+			bio->bio_dev     = dev_data->cdev;
+			bio->bio_offset  = bio_offset;
+			bio->bio_data    = xbb_req_ioaddr(req, seg_idx,
+							  xbb_sg->first_sect);
+			bio->bio_done    = xbb_bio_done;
+			bio->bio_caller1 = req;
+			bio->bio_pblkno  = bio_offset
+				        >> xbb->sector_size_shift;
+		}
+
+		bio->bio_length += xbb_sg->nsect << 9;
+		bio->bio_bcount  = bio->bio_length;
+		bio_offset      += xbb_sg->nsect << 9;
+
+		if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
+
+			if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
+				printf("%s: Discontiguous I/O request from "
+				       "domain %d ends on non-sector "
+				       "boundary\n", __func__,
+				       xbb->otherend_id);
+				error = EINVAL;
+				goto fail_free_bios;
+			}
+			/*
+			 * KVA will not be contiguous, so any additional
+			 * I/O will need to be represented in a new bio.
+			 */
+			bio = NULL;
+		}
 	}
 
-	return;
+	req->pendcnt = nbio;
 
- out_of_preqs:
-	/* We ran out of pending req structs */
-	/* Just requeue interface and wait to be rescheduled to run when one is freed */
-	add_to_req_schedule_list_tail2(blkif);
-	blkif->st_oo_req++;
+	for (bio_idx = 0; bio_idx < nbio; bio_idx++)
+	{
+#ifdef XBB_USE_BOUNCE_BUFFERS
+		vm_offset_t kva_offset;
+
+		kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
+			   - (vm_offset_t)req->bounce;
+		if (operation == BIO_WRITE) {
+			memcpy(bios[bio_idx]->bio_data,
+			       (uint8_t *)req->kva + kva_offset,
+			       bios[bio_idx]->bio_bcount);
+		}
+#endif
+		(*dev_data->csw->d_strategy)(bios[bio_idx]);
+	}
+
+	return (error);
+
+fail_free_bios:
+	/* bios[0..nbio) are all valid and unissued; free every one of them. */
+	for (bio_idx = 0; bio_idx < nbio; bio_idx++)
+		g_destroy_bio(bios[bio_idx]);
+	return (error);
 }
 
-/* Handle interrupt from a frontend */
-static void
-blkback_intr(void *arg)
-{
-	blkif_t *blkif = arg;
-	DPRINTF("%x\n", (unsigned int)blkif);
-	add_to_req_schedule_list_tail(blkif);
-}
-
-/* Map grant ref for ring */
+/**
+ * Backend handler for file access.
+ *
+ * \param xbb        Per-instance xbb configuration structure.
+ * \param ring_req   Front-end's I/O request as pulled from the shared
+ *                   communication ring.
+ * \param req        Allocated internal request structure.
+ * \param nseg       The number of valid segments for this request in
+ *                   xbb->xbb_sgs.
+ * \param operation  BIO_* I/O operation code.
+ * \param flags      BIO_* flag bits for the request; only BIO_ORDERED is
+ *                   examined, to select synchronous VOP I/O flags.
+ *
+ * \return  0 always; I/O errors are reported to the front-end's response.
+ */
 static int
-map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring)
+xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req,
+		  struct xbb_xen_req *req, int nseg, int operation,
+		  int flags)
 {
-	struct gnttab_map_grant_ref op;
+	struct xbb_file_data *file_data;
+	u_int                 seg_idx;
+	struct uio            xuio;
+	struct xbb_sg        *xbb_sg;
+	struct iovec         *xiovec;
+#ifdef XBB_USE_BOUNCE_BUFFERS
+	void                **p_vaddr;
+	int                   saved_uio_iovcnt;
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+	int                   vfs_is_locked;
+	int                   error;
 
-	ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
-	if (ring->va == 0)
-		return ENOMEM;
+	file_data = &xbb->backend.file;
+	error = 0;
+	bzero(&xuio, sizeof(xuio));
 
-	op.host_addr = ring->va;
-	op.flags = GNTMAP_host_map;
-	op.ref = ref;
-	op.dom = dom;
-	HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
-	if (op.status) {
-		WPRINTF("grant table op err=%d\n", op.status);
-		kmem_free(kernel_map, ring->va, PAGE_SIZE);
-		ring->va = 0;
-		return EACCES;
-	}
+	req->pendcnt = 0;
 
-	ring->handle = op.handle;
-	ring->bus_addr = op.dev_bus_addr;
-
-	return 0;
-}
-
-/* Unmap grant ref for ring */
-static void
-unmap_ring(struct ring_ref *ring)
-{
-	struct gnttab_unmap_grant_ref op;
-
-	op.host_addr = ring->va;
-	op.dev_bus_addr = ring->bus_addr;
-	op.handle = ring->handle;
-	HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
-	if (op.status)
-		WPRINTF("grant table op err=%d\n", op.status);
-
-	kmem_free(kernel_map, ring->va, PAGE_SIZE);
-	ring->va = 0;
-}
-
-static int
-connect_ring(blkif_t *blkif)
-{
-	struct xenbus_device *xdev = blkif->xdev;
-	blkif_sring_t *ring;
-	unsigned long ring_ref;
-	evtchn_port_t evtchn;
-	evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain };
-	int err;
-
-	if (blkif->ring_connected)
-		return 0;
-
-	// Grab FE data and map his memory
-	err = xenbus_gather(NULL, xdev->otherend,
-			"ring-ref", "%lu", &ring_ref,
-		    "event-channel", "%u", &evtchn, NULL);
-	if (err) {
-		xenbus_dev_fatal(xdev, err,
-			"reading %s/ring-ref and event-channel",
-			xdev->otherend);
-		return err;
-	}
-
-	err = map_ring(ring_ref, blkif->domid, &blkif->rr);
-	if (err) {
-		xenbus_dev_fatal(xdev, err, "mapping ring");
-		return err;
-	}
-	ring = (blkif_sring_t *)blkif->rr.va;
-	BACK_RING_INIT(&blkif->ring, ring, PAGE_SIZE);
-
-	op.u.bind_interdomain.remote_dom = blkif->domid;
-	op.u.bind_interdomain.remote_port = evtchn;
-	err = HYPERVISOR_event_channel_op(&op);
-	if (err) {
-		unmap_ring(&blkif->rr);
-		xenbus_dev_fatal(xdev, err, "binding event channel");
-		return err;
-	}
-	blkif->evtchn = op.u.bind_interdomain.local_port;
-
-	/* bind evtchn to irq handler */
-	blkif->irq =
-		bind_evtchn_to_irqhandler(blkif->evtchn, "blkback",
-			blkback_intr, blkif, INTR_TYPE_NET|INTR_MPSAFE, &blkif->irq_cookie);
-
-	blkif->ring_connected = 1;
-
-	DPRINTF("%x rings connected! evtchn=%d irq=%d\n",
-			(unsigned int)blkif, blkif->evtchn, blkif->irq);
-
-	return 0;
-}
-
-static void
-disconnect_ring(blkif_t *blkif)
-{
-	DPRINTF("\n");
-
-	if (blkif->ring_connected) {
-		unbind_from_irqhandler(blkif->irq, blkif->irq_cookie);
-		blkif->irq = 0;
-		unmap_ring(&blkif->rr);
-		blkif->ring_connected = 0;
-	}
-}
-
-static void
-connect(blkif_t *blkif)
-{
-	struct xenbus_transaction *xbt;
-	struct xenbus_device *xdev = blkif->xdev;
-	int err;
-
-	if (!blkif->ring_connected ||
-		blkif->vn == NULL ||
-		blkif->state == XenbusStateConnected)
-		return;
-
-	DPRINTF("%s\n", xdev->otherend);
-
-	/* Supply the information about the device the frontend needs */
-again:
-	xbt = xenbus_transaction_start();
-	if (IS_ERR(xbt)) {
-		xenbus_dev_fatal(xdev, PTR_ERR(xbt),
-						 "Error writing configuration for backend "
-						 "(start transaction)");
-		return;
-	}
-
-	err = xenbus_printf(xbt, xdev->nodename, "sectors", "%u",
-				blkif->media_num_sectors);
-	if (err) {
-		xenbus_dev_fatal(xdev, err, "writing %s/sectors",
-				 xdev->nodename);
-		goto abort;
-	}
-
-	err = xenbus_printf(xbt, xdev->nodename, "info", "%u",
-				blkif->read_only ? VDISK_READONLY : 0);
-	if (err) {
-		xenbus_dev_fatal(xdev, err, "writing %s/info",
-				 xdev->nodename);
-		goto abort;
-	}
-	err = xenbus_printf(xbt, xdev->nodename, "sector-size", "%u",
-			    blkif->sector_size);
-	if (err) {
-		xenbus_dev_fatal(xdev, err, "writing %s/sector-size",
-				 xdev->nodename);
-		goto abort;
-	}
-
-	err = xenbus_transaction_end(xbt, 0);
-	if (err == -EAGAIN)
-		goto again;
-	if (err)
-		xenbus_dev_fatal(xdev, err, "ending transaction");
-
-	err = xenbus_switch_state(xdev, NULL, XenbusStateConnected);
-	if (err)
-		xenbus_dev_fatal(xdev, err, "switching to Connected state",
-				 xdev->nodename);
-
-	blkif->state = XenbusStateConnected;
-
-	return;
-
- abort:
-	xenbus_transaction_end(xbt, 1);
-}
-
-static int
-blkback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id)
-{
-	int err;
-	char *p, *mode = NULL, *type = NULL, *params = NULL;
-	long handle;
-
-	DPRINTF("node=%s\n", xdev->nodename);
-
-	p = strrchr(xdev->otherend, '/') + 1;
-	handle = strtoul(p, NULL, 0);
-
-	mode = xenbus_read(NULL, xdev->nodename, "mode", NULL);
-	if (IS_ERR(mode)) {
-		xenbus_dev_fatal(xdev, PTR_ERR(mode), "reading mode");
-		err = PTR_ERR(mode);
-		goto error;
-	}
-	
-	type = xenbus_read(NULL, xdev->nodename, "type", NULL);
-	if (IS_ERR(type)) {
-		xenbus_dev_fatal(xdev, PTR_ERR(type), "reading type");
-		err = PTR_ERR(type);
-		goto error;
-	}
-	
-	params = xenbus_read(NULL, xdev->nodename, "params", NULL);
-	if (IS_ERR(type)) {
-		xenbus_dev_fatal(xdev, PTR_ERR(params), "reading params");
-		err = PTR_ERR(params);
-		goto error;
-	}
-	
-	err = blkif_create(xdev, handle, mode, type, params);
-	if (err) {
-		xenbus_dev_fatal(xdev, err, "creating blkif");
-		goto error;
-	}
-
-	err = vbd_add_dev(xdev);
-	if (err) {
-		blkif_put((blkif_t *)xdev->data);
-		xenbus_dev_fatal(xdev, err, "adding vbd device");
-	}
-
-	return err;
-
- error:
-	if (mode)
-		free(mode, M_DEVBUF);
-	if (type)
-		free(type, M_DEVBUF);
-	if (params)
-		free(params, M_DEVBUF);
-	return err;
-}
-
-static int
-blkback_remove(struct xenbus_device *xdev)
-{
-	blkif_t *blkif = xdev->data;
-	device_t ndev;
-
-	DPRINTF("node=%s\n", xdev->nodename);
-
-	blkif->state = XenbusStateClosing;
-
-	if ((ndev = blkif->ndev)) {
-		blkif->ndev = NULL;
-		mtx_lock(&Giant);
-		device_detach(ndev);
-		mtx_unlock(&Giant);
-	}
-
-	xdev->data = NULL;
-	blkif->xdev = NULL;
-	blkif_put(blkif);
-
-	return 0;
-}
-
-static int
-blkback_resume(struct xenbus_device *xdev)
-{
-	DPRINTF("node=%s\n", xdev->nodename);
-	return 0;
-}
-
-static void
-frontend_changed(struct xenbus_device *xdev,
-				 XenbusState frontend_state)
-{
-	blkif_t *blkif = xdev->data;
-
-	DPRINTF("state=%d\n", frontend_state);
-
-	blkif->frontend_state = frontend_state;
-
-	switch (frontend_state) {
-	case XenbusStateInitialising:
+	switch (operation) {
+	case BIO_READ:
+		xuio.uio_rw = UIO_READ;
 		break;
-	case XenbusStateInitialised:
-	case XenbusStateConnected:
-		connect_ring(blkif);
-		connect(blkif);
+	case BIO_WRITE:
+		xuio.uio_rw = UIO_WRITE;
 		break;
-	case XenbusStateClosing:
-		xenbus_switch_state(xdev, NULL, XenbusStateClosing);
+	case BIO_FLUSH: {
+		struct mount *mountpoint;
+
+		vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
+
+		(void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
+
+		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
+		error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
+		VOP_UNLOCK(xbb->vn, 0);
+
+		vn_finished_write(mountpoint);
+
+		VFS_UNLOCK_GIANT(vfs_is_locked);
+
+		goto bailout_send_response;
+		/* NOTREACHED */
+	}
+	default:
+		panic("invalid operation %d", operation);
+		/* NOTREACHED */
+	}
+	xuio.uio_offset = (off_t)ring_req->sector_number
+			<< xbb->sector_size_shift;
+
+	xuio.uio_segflg = UIO_SYSSPACE;
+	xuio.uio_iov = file_data->xiovecs;
+	xuio.uio_iovcnt = 0;
+
+	for (seg_idx = 0, xiovec = NULL, xbb_sg = xbb->xbb_sgs;
+	     seg_idx < nseg; seg_idx++, xbb_sg++) {
+
+		/*
+		 * If the first sector is not 0, the KVA will not be
+		 * contiguous and we'll need to go on to another segment.
+		 */
+		if (xbb_sg->first_sect != 0)
+			xiovec = NULL;
+
+		if (xiovec == NULL) {
+			xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
+			xiovec->iov_base = xbb_req_ioaddr(req, seg_idx,
+							  xbb_sg->first_sect);
+#ifdef XBB_USE_BOUNCE_BUFFERS
+			/*
+			 * Store the address of the incoming buffer at this
+			 * particular offset as well, so we can do the copy
+			 * later without having to do more work to
+			 * recalculate this address.
+			 */
+			p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
+			*p_vaddr = xbb_req_vaddr(req, seg_idx,
+						 xbb_sg->first_sect);
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+			xiovec->iov_len = 0;
+			xuio.uio_iovcnt++;
+		}
+
+		xiovec->iov_len += xbb_sg->nsect << 9;
+
+		xuio.uio_resid += xbb_sg->nsect << 9;
+
+		/*
+		 * If the last sector is not the full page size count,
+		 * the next segment will not be contiguous in KVA and we
+		 * need a new iovec.
+		 */
+		if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
+			xiovec = NULL;
+	}
+
+	xuio.uio_td = curthread;
+
+#ifdef XBB_USE_BOUNCE_BUFFERS
+	saved_uio_iovcnt = xuio.uio_iovcnt;
+
+	if (operation == BIO_WRITE) {
+		/* Copy the write data to the local buffer. */
+		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
+		     xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
+		     seg_idx++, xiovec++, p_vaddr++) {
+
+			memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
+		}
+	} else {
+		/*
+		 * We only need to save off the iovecs in the case of a
+		 * read, because the copy for the read happens after the
+		 * VOP_READ().  (The uio will get modified in that call
+		 * sequence.)
+		 */
+		memcpy(file_data->saved_xiovecs, xuio.uio_iov,
+		       xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
+	}
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+
+	vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
+	switch (operation) {
+	case BIO_READ:
+
+		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
+
+		/*
+		 * UFS pays attention to IO_DIRECT for reads.  If the
+		 * DIRECTIO option is configured into the kernel, it calls
+		 * ffs_rawread().  But that only works for single-segment
+		 * uios with user space addresses.  In our case, with a
+		 * kernel uio, it still reads into the buffer cache, but it
+		 * will just try to release the buffer from the cache later
+		 * on in ffs_read().
+		 *
+		 * ZFS does not pay attention to IO_DIRECT for reads.
+		 *
+		 * UFS does not pay attention to IO_SYNC for reads.
+		 *
+		 * ZFS pays attention to IO_SYNC (which translates into the
+		 * Solaris define FRSYNC for zfs_read()) for reads.  It
+		 * attempts to sync the file before reading.
+		 *
+		 * So, to attempt to provide some barrier semantics in the
+		 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
+		 */
+		error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
+				 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
+
+		VOP_UNLOCK(xbb->vn, 0);
 		break;
-	case XenbusStateClosed:
-		xenbus_remove_device(xdev);
-		break;
-	case XenbusStateUnknown:
-	case XenbusStateInitWait:
-		xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend",
-						 frontend_state);
+	case BIO_WRITE: {
+		struct mount *mountpoint;
+
+		(void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
+
+		vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
+
+		/*
+		 * UFS pays attention to IO_DIRECT for writes.  The write
+		 * is done asynchronously.  (Normally the write would just
+		 * get put into cache.)
+		 *
+		 * UFS pays attention to IO_SYNC for writes.  It will
+		 * attempt to write the buffer out synchronously if that
+		 * flag is set.
+		 *
+		 * ZFS does not pay attention to IO_DIRECT for writes.
+		 *
+		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
+		 * for writes.  It will flush the transaction from the
+		 * cache before returning.
+		 *
+		 * So if we've got the BIO_ORDERED flag set, we want
+		 * IO_SYNC in either the UFS or ZFS case.
+		 */
+		error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
+				  IO_SYNC : 0, file_data->cred);
+		VOP_UNLOCK(xbb->vn, 0);
+
+		vn_finished_write(mountpoint);
+
 		break;
 	}
+	default:
+		panic("invalid operation %d", operation);
+		/* NOTREACHED */
+	}
+	VFS_UNLOCK_GIANT(vfs_is_locked);
+
+#ifdef XBB_USE_BOUNCE_BUFFERS
+	/* We only need to copy here for read operations */
+	if (operation == BIO_READ) {
+
+		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
+		     xiovec = file_data->saved_xiovecs;
+		     seg_idx < saved_uio_iovcnt; seg_idx++,
+		     xiovec++, p_vaddr++) {
+
+			/*
+			 * Note that we have to use the copy of the
+			 * io vector we made above.  uiomove() modifies
+			 * the uio and its referenced vector as uiomove
+			 * performs the copy, so we can't rely on any
+			 * state from the original uio.
+			 */
+			memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
+		}
+	}
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+
+bailout_send_response:
+
+	/*
+	 * All I/O is already done, send the response.  A lock is not
+	 * necessary here because we're single threaded, and therefore the
+	 * only context accessing this request right now.  If that changes,
+	 * we may need some locking here.
+	 */
+	xbb_unmap_req(req);
+	xbb_send_response(xbb, req, (error == 0) ? BLKIF_RSP_OKAY :
+			  BLKIF_RSP_ERROR);
+	devstat_end_transaction(xbb->xbb_stats,
+				/*bytes*/error == 0 ? req->nr_512b_sectors << 9
+						    : 0,
+				req->ds_tag_type,
+				req->ds_trans_type,
+				/*now*/NULL,
+				/*then*/&req->ds_t0);
+	xbb_release_req(xbb, req);
+
+	return (0);
 }
 
-/* ** Driver registration ** */
-
-static struct xenbus_device_id blkback_ids[] = {
-	{ "vbd" },
-	{ "" }
-};
-
-static struct xenbus_driver blkback = {
-	.name = "blkback",
-	.ids = blkback_ids,
-	.probe = blkback_probe,
-	.remove = blkback_remove,
-	.resume = blkback_resume,
-	.otherend_changed = frontend_changed,
-};
-
+/*--------------------------- Backend Configuration --------------------------*/
+/**
+ * Close the backing vnode, if one is open, and release the per-type state
+ * tied to it: dev_relthread() for a disk, crfree() of the cred for a file.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ */
 static void
-blkback_init(void *unused)
+xbb_close_backend(struct xbb_softc *xbb)
 {
-	int i;
-
-	TASK_INIT(&blk_req_task, 0, blk_req_action, NULL);
-	mtx_init(&req_sched_list_lock, "blk_req_sched_lock", "blkback req sched lock", MTX_DEF);
-
-	mtx_init(&pending_free_lock, "blk_pending_req_ock", "blkback pending request lock", MTX_DEF);
-
-	mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
-	pending_reqs = malloc(sizeof(pending_reqs[0]) *
-		blkif_reqs, M_DEVBUF, M_ZERO|M_NOWAIT);
-	pending_grant_handles = malloc(sizeof(pending_grant_handles[0]) *
-		mmap_pages, M_DEVBUF, M_NOWAIT);
-	pending_vaddrs = malloc(sizeof(pending_vaddrs[0]) *
-		mmap_pages, M_DEVBUF, M_NOWAIT);
-	mmap_vstart = alloc_empty_page_range(mmap_pages);
-	if (!pending_reqs || !pending_grant_handles || !pending_vaddrs || !mmap_vstart) {
-		if (pending_reqs)
-			free(pending_reqs, M_DEVBUF);
-		if (pending_grant_handles)
-			free(pending_grant_handles, M_DEVBUF);
-		if (pending_vaddrs)
-			free(pending_vaddrs, M_DEVBUF);
-		WPRINTF("out of memory\n");
-		return;
-	}
-
-	for (i = 0; i < mmap_pages; i++) {
-		pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
-		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
-	}
-
-	for (i = 0; i < blkif_reqs; i++) {
-		STAILQ_INSERT_TAIL(&pending_free, &pending_reqs[i], free_list);
-	}
-
-	DPRINTF("registering %s\n", blkback.name);
-	xenbus_register_backend(&blkback);
-}
-
-SYSINIT(xbbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, blkback_init, NULL)
-
-static void
-close_device(blkif_t *blkif)
-{
-	DPRINTF("closing dev=%s\n", blkif->dev_name);
-	if (blkif->vn) {
+	DROP_GIANT();
+	DPRINTF("closing dev=%s\n", xbb->dev_name);
+	if (xbb->vn) {
 		int flags = FREAD;
+		int vfs_is_locked = 0;
 
-		if (!blkif->read_only)
+		if ((xbb->flags & XBBF_READ_ONLY) == 0)
 			flags |= FWRITE;
 
-		if (blkif->csw) {
-			dev_relthread(blkif->cdev);
-			blkif->csw = NULL;
+		switch (xbb->device_type) {
+		case XBB_TYPE_DISK:
+			if (xbb->backend.dev.csw) {
+				dev_relthread(xbb->backend.dev.cdev,
+					      xbb->backend.dev.dev_ref);
+				xbb->backend.dev.csw  = NULL;
+				xbb->backend.dev.cdev = NULL;
+			}
+			break;
+		case XBB_TYPE_FILE:
+			vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
+			break;
+		case XBB_TYPE_NONE:
+		default:
+			panic("Unexpected backend type.");
+			break;
 		}
 
-		(void)vn_close(blkif->vn, flags, NOCRED, curthread);
-		blkif->vn = NULL;
+		(void)vn_close(xbb->vn, flags, NOCRED, curthread);
+		xbb->vn = NULL;
+		/* Vnode is closed; now undo the per-type state set up above. */
+		switch (xbb->device_type) {
+		case XBB_TYPE_DISK:
+			break;
+		case XBB_TYPE_FILE:
+			VFS_UNLOCK_GIANT(vfs_is_locked);
+			if (xbb->backend.file.cred != NULL) {
+				crfree(xbb->backend.file.cred);
+				xbb->backend.file.cred = NULL;
+			}
+			break;
+		case XBB_TYPE_NONE:
+		default:
+			panic("Unexpected backend type.");
+			break;
+		}
 	}
+	PICKUP_GIANT();
 }
 
+/**
+ * Open a character device to be used for backend I/O.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ *
+ * \return  0 for success, errno codes for failure.
+ */
 static int
-open_device(blkif_t *blkif)
+xbb_open_dev(struct xbb_softc *xbb)
+{
+	struct vattr   vattr;
+	struct cdev   *dev;
+	struct cdevsw *devsw;
+	int	       error;
+
+	xbb->device_type = XBB_TYPE_DISK;
+	xbb->dispatch_io = xbb_dispatch_dev;
+	xbb->backend.dev.cdev = xbb->vn->v_rdev;
+	xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
+					     &xbb->backend.dev.dev_ref);
+	if (xbb->backend.dev.csw == NULL)
+		panic("Unable to retrieve device switch");
+
+	error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
+	if (error) {
+		xenbus_dev_fatal(xbb->dev, error, "error getting "
+				 "vnode attributes for device %s",
+				 xbb->dev_name);
+		return (error);
+	}
+
+
+	dev = xbb->vn->v_rdev;
+	devsw = dev->si_devsw;
+	if (!devsw->d_ioctl) {
+		xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
+				 "device %s!", xbb->dev_name);
+		return (ENODEV);
+	}
+
+	error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
+			       (caddr_t)&xbb->sector_size, FREAD,
+			       curthread);
+	if (error) {
+		xenbus_dev_fatal(xbb->dev, error,
+				 "error calling ioctl DIOCGSECTORSIZE "
+				 "for device %s", xbb->dev_name);
+		return (error);
+	}
+
+	error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
+			       (caddr_t)&xbb->media_size, FREAD,
+			       curthread);
+	if (error) {
+		xenbus_dev_fatal(xbb->dev, error,
+				 "error calling ioctl DIOCGMEDIASIZE "
+				 "for device %s", xbb->dev_name);
+		return (error);
+	}
+
+	return (0);
+}
+
+/**
+ * Open a file to be used for backend I/O.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ *
+ * \return  0 for success, errno codes for failure.
+ */
+static int
+xbb_open_file(struct xbb_softc *xbb)
+{
+	struct xbb_file_data *file_data;
+	struct vattr          vattr;
+	int                   error;
+
+	file_data = &xbb->backend.file;
+	xbb->device_type = XBB_TYPE_FILE;
+	xbb->dispatch_io = xbb_dispatch_file;
+	error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
+	if (error != 0) {
+		xenbus_dev_fatal(xbb->dev, error,
+				 "error calling VOP_GETATTR()"
+				 "for file %s", xbb->dev_name);
+		return (error);
+	}
+
+	/*
+	 * Verify that we have the ability to upgrade to exclusive
+	 * access on this file so we can trap errors at open instead
+	 * of reporting them during first access.
+	 */
+	if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
+		vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
+		if (xbb->vn->v_iflag & VI_DOOMED) {
+			error = EBADF;
+			xenbus_dev_fatal(xbb->dev, error,
+					 "error locking file %s",
+					 xbb->dev_name);
+
+			return (error);
+		}
+	}
+
+	file_data->cred = crhold(curthread->td_ucred);
+	xbb->media_size = vattr.va_size;
+
+	/*
+	 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
+	 * With ZFS, it is 131072 bytes.  Block sizes that large don't work
+	 * with disklabel and UFS on FreeBSD at least.  Large block sizes
+	 * may not work with other OSes as well.  So just export a sector
+	 * size of 512 bytes, which should work with any OS or
+	 * application.  Since our backing is a file, any block size will
+	 * work fine for the backing store.
+	 */
+#if 0
+	xbb->sector_size = vattr.va_blocksize;
+#endif
+	xbb->sector_size = 512;
+
+	/*
+	 * Sanity check.  The media size has to be at least one
+	 * sector long.
+	 */
+	if (xbb->media_size < xbb->sector_size) {
+		error = EINVAL;
+		xenbus_dev_fatal(xbb->dev, error,
+				 "file %s size %ju < block size %u",
+				 xbb->dev_name,
+				 (uintmax_t)xbb->media_size,
+				 xbb->sector_size);
+	}
+	return (error);
+}
+
+/**
+ * Open the backend provider for this connection.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ *
+ * \return  0 for success, errno codes for failure.
+ */
+static int
+xbb_open_backend(struct xbb_softc *xbb)
 {
 	struct nameidata nd;
-	struct vattr vattr;
-	struct cdev *dev;
-	struct cdevsw *devsw;
-	int flags = FREAD, err = 0;
+	int		 flags;
+	int		 error;
+	int		 vfs_is_locked;
 
-	DPRINTF("opening dev=%s\n", blkif->dev_name);
+	flags = FREAD;
+	error = 0;
 
-	if (!blkif->read_only)
+	DPRINTF("opening dev=%s\n", xbb->dev_name);
+
+	if ((xbb->flags & XBBF_READ_ONLY) == 0)
 		flags |= FWRITE;
 
 	if (!curthread->td_proc->p_fd->fd_cdir) {
@@ -1066,284 +1930,1045 @@ open_device(blkif_t *blkif)
 	}
 
  again:
-	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, blkif->dev_name, curthread);
-	err = vn_open(&nd, &flags, 0, -1);
-	if (err) {
-		if (blkif->dev_name[0] != '/') {
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
+	error = vn_open(&nd, &flags, 0, NULL);
+	if (error) {
+		/*
+		 * This is the only reasonable guess we can make as far as
+		 * path if the user doesn't give us a fully qualified path.
+		 * If they want to specify a file, they need to specify the
+		 * full path.
+		 */
+		if (xbb->dev_name[0] != '/') {
 			char *dev_path = "/dev/";
 			char *dev_name;
 
 			/* Try adding device path at beginning of name */
-			dev_name = malloc(strlen(blkif->dev_name) + strlen(dev_path) + 1, M_DEVBUF, M_NOWAIT);
+			dev_name = malloc(strlen(xbb->dev_name)
+					+ strlen(dev_path) + 1,
+					  M_XENBLOCKBACK, M_NOWAIT);
 			if (dev_name) {
-				sprintf(dev_name, "%s%s", dev_path, blkif->dev_name);
-				free(blkif->dev_name, M_DEVBUF);			
-				blkif->dev_name = dev_name;
+				sprintf(dev_name, "%s%s", dev_path,
+					xbb->dev_name);
+				free(xbb->dev_name, M_XENBLOCKBACK);
+				xbb->dev_name = dev_name;
 				goto again;
 			}
 		}
-		xenbus_dev_fatal(blkif->xdev, err, "error opening device %s", blkif->dev_name);
-		return err;
+		xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
+				 xbb->dev_name);
+		return (error);
 	}
+
+	vfs_is_locked = NDHASGIANT(&nd);
+
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 		
-	blkif->vn = nd.ni_vp;
+	xbb->vn = nd.ni_vp;
 
-	/* We only support disks for now */
-	if (!vn_isdisk(blkif->vn, &err)) {
-		xenbus_dev_fatal(blkif->xdev, err, "device %s is not a disk", blkif->dev_name);
-		VOP_UNLOCK(blkif->vn, 0, curthread);
-		goto error;
+	/* We only support disks and files. */
+	if (vn_isdisk(xbb->vn, &error)) {
+		error = xbb_open_dev(xbb);
+	} else if (xbb->vn->v_type == VREG) {
+		error = xbb_open_file(xbb);
+	} else {
+		error = EINVAL;
+		xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
+				 "or file", xbb->dev_name);
+	}
+	VOP_UNLOCK(xbb->vn, 0);
+	VFS_UNLOCK_GIANT(vfs_is_locked);
+
+	if (error != 0) {
+		xbb_close_backend(xbb);
+		return (error);
 	}
 
-	blkif->cdev = blkif->vn->v_rdev;
-	blkif->csw = dev_refthread(blkif->cdev);
-	PANIC_IF(blkif->csw == NULL);
+	xbb->sector_size_shift = fls(xbb->sector_size) - 1;
+	xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
 
-	err = VOP_GETATTR(blkif->vn, &vattr, NOCRED);
-	if (err) {
-		xenbus_dev_fatal(blkif->xdev, err,
-			"error getting vnode attributes for device %s", blkif->dev_name);
-		VOP_UNLOCK(blkif->vn, 0, curthread);
-		goto error;
+	DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
+		(xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
+		xbb->dev_name, xbb->sector_size, xbb->media_size);
+
+	return (0);
+}
+
+/*------------------------ Inter-Domain Communication ------------------------*/
+/**
+ * Cleanup all inter-domain communication mechanisms.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ */
+static void
+xbb_disconnect(struct xbb_softc *xbb)
+{
+	struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
+	struct gnttab_unmap_grant_ref *op;
+	u_int			       ring_idx;
+	int			       error;
+
+	DPRINTF("\n");
+
+	if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
+		return;
+
+	if (xbb->irq != 0) {
+		unbind_from_irqhandler(xbb->irq);
+		xbb->irq = 0;
 	}
 
-	VOP_UNLOCK(blkif->vn, 0, curthread);
+	for (ring_idx = 0, op = ops;
+	     ring_idx < xbb->ring_config.ring_pages;
+	     ring_idx++, op++) {
 
-	dev = blkif->vn->v_rdev;
-	devsw = dev->si_devsw;
-	if (!devsw->d_ioctl) {
-		err = ENODEV;
-		xenbus_dev_fatal(blkif->xdev, err,
-			"no d_ioctl for device %s!", blkif->dev_name);
-		goto error;
+		op->host_addr    = xbb->ring_config.gnt_addr
+			         + (ring_idx * PAGE_SIZE);
+		op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
+		op->handle	 = xbb->ring_config.handle[ring_idx];
 	}
 
-	err = (*devsw->d_ioctl)(dev, DIOCGSECTORSIZE, (caddr_t)&blkif->sector_size, FREAD, curthread);
-	if (err) {
-		xenbus_dev_fatal(blkif->xdev, err,
-			"error calling ioctl DIOCGSECTORSIZE for device %s", blkif->dev_name);
-		goto error;
+	error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
+					  xbb->ring_config.ring_pages);
+	if (error != 0)
+		panic("Grant table op failed (%d)", error);
+
+	xbb->flags &= ~XBBF_RING_CONNECTED;
+}
+
+/**
+ * Map shared memory ring into domain local address space, initialize
+ * ring control structures, and bind an interrupt to the event channel
+ * used to notify us of ring changes.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ */
+static int
+xbb_connect_ring(struct xbb_softc *xbb)
+{
+	struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
+	struct gnttab_map_grant_ref *gnt;
+	u_int			     ring_idx;
+	int			     error;
+
+	if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
+		return (0);
+
+	/*
+	 * Kva for our ring is at the tail of the region of kva allocated
+	 * by xbb_alloc_communication_mem().
+	 */
+	xbb->ring_config.va = xbb->kva
+			    + (xbb->kva_size
+			     - (xbb->ring_config.ring_pages * PAGE_SIZE));
+	xbb->ring_config.gnt_addr = xbb->gnt_base_addr
+				  + (xbb->kva_size
+				   - (xbb->ring_config.ring_pages * PAGE_SIZE));
+
+	for (ring_idx = 0, gnt = gnts;
+	     ring_idx < xbb->ring_config.ring_pages;
+	     ring_idx++, gnt++) {
+
+		gnt->host_addr = xbb->ring_config.gnt_addr
+			       + (ring_idx * PAGE_SIZE);
+		gnt->flags     = GNTMAP_host_map;
+		gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
+		gnt->dom       = xbb->otherend_id;
 	}
-	blkif->sector_size_shift = fls(blkif->sector_size) - 1;
 
-	err = (*devsw->d_ioctl)(dev, DIOCGMEDIASIZE, (caddr_t)&blkif->media_size, FREAD, curthread);
-	if (err) {
-		xenbus_dev_fatal(blkif->xdev, err,
-			"error calling ioctl DIOCGMEDIASIZE for device %s", blkif->dev_name);
-		goto error;
+	error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
+					  xbb->ring_config.ring_pages);
+	if (error)
+		panic("blkback: Ring page grant table op failed (%d)", error);
+
+	for (ring_idx = 0, gnt = gnts;
+	     ring_idx < xbb->ring_config.ring_pages;
+	     ring_idx++, gnt++) {
+		if (gnt->status != 0) {
+			xbb->ring_config.va = 0;
+			xenbus_dev_fatal(xbb->dev, EACCES,
+					 "Ring shared page mapping failed. "
+					 "Status %d.", gnt->status);
+			return (EACCES);
+		}
+		xbb->ring_config.handle[ring_idx]   = gnt->handle;
+		xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
 	}
-	blkif->media_num_sectors = blkif->media_size >> blkif->sector_size_shift;
 
-	blkif->major = major(vattr.va_rdev);
-	blkif->minor = minor(vattr.va_rdev);
+	/* Initialize the ring based on ABI. */
+	switch (xbb->abi) {
+	case BLKIF_PROTOCOL_NATIVE:
+	{
+		blkif_sring_t *sring;
+		sring = (blkif_sring_t *)xbb->ring_config.va;
+		BACK_RING_INIT(&xbb->rings.native, sring,
+			       xbb->ring_config.ring_pages * PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_32:
+	{
+		blkif_x86_32_sring_t *sring_x86_32;
+		sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
+		BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
+			       xbb->ring_config.ring_pages * PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_64:
+	{
+		blkif_x86_64_sring_t *sring_x86_64;
+		sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
+		BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
+			       xbb->ring_config.ring_pages * PAGE_SIZE);
+		break;
+	}
+	default:
+		panic("Unexpected blkif protocol ABI.");
+	}
 
-	DPRINTF("opened dev=%s major=%d minor=%d sector_size=%u media_size=%lld\n",
-			blkif->dev_name, blkif->major, blkif->minor, blkif->sector_size, blkif->media_size);
+	xbb->flags |= XBBF_RING_CONNECTED;
+
+	error =
+	    bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id,
+						  xbb->ring_config.evtchn,
+						  device_get_nameunit(xbb->dev),
+						  xbb_intr, /*arg*/xbb,
+						  INTR_TYPE_BIO | INTR_MPSAFE,
+						  &xbb->irq);
+	if (error) {
+		xbb_disconnect(xbb);
+		xenbus_dev_fatal(xbb->dev, error, "binding event channel");
+		return (error);
+	}
+
+	DPRINTF("rings connected!\n");
 
 	return 0;
-
- error:
-	close_device(blkif);
-	return err;
 }
 
+/**
+ * Size KVA and pseudo-physical address allocations based on negotiated
+ * values for the size and number of I/O requests, and the size of our
+ * communication ring.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ *
+ * These address spaces are used to dynamically map pages in the
+ * front-end's domain into our own.
+ */
 static int
-vbd_add_dev(struct xenbus_device *xdev)
+xbb_alloc_communication_mem(struct xbb_softc *xbb)
 {
-	blkif_t *blkif = xdev->data;
-	device_t nexus, ndev;
-	devclass_t dc;
-	int err = 0;
-
-	mtx_lock(&Giant);
-
-	/* We will add a vbd device as a child of nexus0 (for now) */
-	if (!(dc = devclass_find("nexus")) ||
-		!(nexus = devclass_get_device(dc, 0))) {
-		WPRINTF("could not find nexus0!\n");
-		err = ENOENT;
-		goto done;
+	xbb->kva_size = (xbb->ring_config.ring_pages
+		      +  (xbb->max_requests * xbb->max_request_segments))
+		      * PAGE_SIZE;
+#ifndef XENHVM
+	xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size);
+	if (xbb->kva == 0)
+		return (ENOMEM);
+	xbb->gnt_base_addr = xbb->kva;
+#else /* XENHVM */
+	/*
+	 * Reserve a range of pseudo physical memory that we can map
+	 * into kva.  These pages will only be backed by machine
+	 * pages ("real memory") during the lifetime of front-end requests
+	 * via grant table operations.
+	 */
+	xbb->pseudo_phys_res_id = 0;
+	xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
+						  &xbb->pseudo_phys_res_id,
+						  0, ~0, xbb->kva_size,
+						  RF_ACTIVE);
+	if (xbb->pseudo_phys_res == NULL) {
+		xbb->kva = 0;
+		return (ENOMEM);
 	}
-
-
-	/* Create a newbus device representing the vbd */
-	ndev = BUS_ADD_CHILD(nexus, 0, "vbd", blkif->handle);
-	if (!ndev) {
-		WPRINTF("could not create newbus device vbd%d!\n", blkif->handle);
-		err = EFAULT;
-		goto done;
-	}
-	
-	blkif_get(blkif);
-	device_set_ivars(ndev, blkif);
-	blkif->ndev = ndev;
-
-	device_probe_and_attach(ndev);
-
- done:
-
-	mtx_unlock(&Giant);
-
-	return err;
+	xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
+	xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
+#endif /* XENHVM */
+	return (0);
 }
 
-enum {
-	VBD_SYSCTL_DOMID,
-	VBD_SYSCTL_ST_RD_REQ,
-	VBD_SYSCTL_ST_WR_REQ,
-	VBD_SYSCTL_ST_OO_REQ,
-	VBD_SYSCTL_ST_ERR_REQ,
-	VBD_SYSCTL_RING,
-};
-
-static char *
-vbd_sysctl_ring_info(blkif_t *blkif, int cmd)
+/**
+ * Free dynamically allocated KVA or pseudo-physical address allocations.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ */
+static void
+xbb_free_communication_mem(struct xbb_softc *xbb)
 {
-	char *buf = malloc(256, M_DEVBUF, M_WAITOK);
-	if (buf) {
-		if (!blkif->ring_connected)
-			sprintf(buf, "ring not connected\n");
-		else {
-			blkif_back_ring_t *ring = &blkif->ring;
-			sprintf(buf, "nr_ents=%x req_cons=%x"
-					" req_prod=%x req_event=%x"
-					" rsp_prod=%x rsp_event=%x",
-					ring->nr_ents, ring->req_cons,
-					ring->sring->req_prod, ring->sring->req_event,
-					ring->sring->rsp_prod, ring->sring->rsp_event);
+	if (xbb->kva != 0) {
+#ifndef XENHVM
+		kmem_free(kernel_map, xbb->kva, xbb->kva_size);
+#else
+		if (xbb->pseudo_phys_res != NULL) {
+			bus_release_resource(xbb->dev, SYS_RES_MEMORY,
+					     xbb->pseudo_phys_res_id,
+					     xbb->pseudo_phys_res);
+			xbb->pseudo_phys_res = NULL;
 		}
+#endif
 	}
-	return buf;
+	xbb->kva = 0;
+	xbb->gnt_base_addr = 0;
 }
 
+/**
+ * Collect front-end information from the XenStore.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ */
 static int
-vbd_sysctl_handler(SYSCTL_HANDLER_ARGS)
+xbb_collect_frontend_info(struct xbb_softc *xbb)
 {
-	device_t dev = (device_t)arg1;
-	blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
-	const char *value;
-	char *buf = NULL;
-	int err;
+	char	    protocol_abi[64];
+	const char *otherend_path;
+	int	    error;
+	u_int	    ring_idx;
 
-	switch (arg2) {
-	case VBD_SYSCTL_DOMID:
-		return sysctl_handle_int(oidp, NULL, blkif->domid, req);
-	case VBD_SYSCTL_ST_RD_REQ:
-		return sysctl_handle_int(oidp, NULL, blkif->st_rd_req, req);
-	case VBD_SYSCTL_ST_WR_REQ:
-		return sysctl_handle_int(oidp, NULL, blkif->st_wr_req, req);
-	case VBD_SYSCTL_ST_OO_REQ:
-		return sysctl_handle_int(oidp, NULL, blkif->st_oo_req, req);
-	case VBD_SYSCTL_ST_ERR_REQ:
-		return sysctl_handle_int(oidp, NULL, blkif->st_err_req, req);
-	case VBD_SYSCTL_RING:
-		value = buf = vbd_sysctl_ring_info(blkif, arg2);
-		break;
-	default:
+	otherend_path = xenbus_get_otherend_path(xbb->dev);
+
+	/*
+	 * Mandatory data (used in all versions of the protocol) first.
+	 */
+	error = xs_gather(XST_NIL, otherend_path,
+			  "ring-ref", "%" PRIu32,
+			  &xbb->ring_config.ring_ref[0],
+			  "event-channel", "%" PRIu32,
+			  &xbb->ring_config.evtchn,
+			  NULL);
+	if (error != 0) {
+		xenbus_dev_fatal(xbb->dev, error,
+				 "Unable to retrieve ring information from "
+				 "frontend %s.  Unable to connect.",
+				 xenbus_get_otherend_path(xbb->dev));
+		return (error);
+	}
+
+	/*
+	 * These fields are initialized to legacy protocol defaults
+	 * so we only need to fail if reading the updated value succeeds
+	 * and the new value is outside of its allowed range.
+	 *
+	 * \note xs_gather() returns on the first encountered error, so
+	 *       we must use independent calls in order to guarantee
+	 *       we don't miss information in a sparsely populated front-end
+	 *       tree.
+	 */
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "ring-pages", NULL, "%" PRIu32,
+		       &xbb->ring_config.ring_pages);
+
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-requests", NULL, "%" PRIu32,
+		       &xbb->max_requests);
+
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-request-segments", NULL, "%" PRIu32,
+		       &xbb->max_request_segments);
+
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-request-size", NULL, "%" PRIu32,
+		       &xbb->max_request_size);
+
+	if (xbb->ring_config.ring_pages	> XBB_MAX_RING_PAGES) {
+		xenbus_dev_fatal(xbb->dev, EINVAL,
+				 "Front-end specificed ring-pages of %u "
+				 "exceeds backend limit of %zu.  "
+				 "Unable to connect.",
+				 xbb->ring_config.ring_pages,
+				 XBB_MAX_RING_PAGES);
+		return (EINVAL);
+	} else if (xbb->max_requests > XBB_MAX_REQUESTS) {
+		xenbus_dev_fatal(xbb->dev, EINVAL,
+				 "Front-end specificed max_requests of %u "
+				 "exceeds backend limit of %u.  "
+				 "Unable to connect.",
+				 xbb->max_requests,
+				 XBB_MAX_REQUESTS);
+		return (EINVAL);
+	} else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
+		xenbus_dev_fatal(xbb->dev, EINVAL,
+				 "Front-end specificed max_requests_segments "
+				 "of %u exceeds backend limit of %u.  "
+				 "Unable to connect.",
+				 xbb->max_request_segments,
+				 XBB_MAX_SEGMENTS_PER_REQUEST);
+		return (EINVAL);
+	} else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
+		xenbus_dev_fatal(xbb->dev, EINVAL,
+				 "Front-end specificed max_request_size "
+				 "of %u exceeds backend limit of %u.  "
+				 "Unable to connect.",
+				 xbb->max_request_size,
+				 XBB_MAX_REQUEST_SIZE);
 		return (EINVAL);
 	}
 
-	err = SYSCTL_OUT(req, value, strlen(value));
-	if (buf != NULL)
-		free(buf, M_DEVBUF);
+	/* If using a multi-page ring, pull in the remaining references. */
+	for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) {
+		char ring_ref_name[]= "ring_refXX";
 
-	return err;
+		snprintf(ring_ref_name, sizeof(ring_ref_name),
+			 "ring-ref%u", ring_idx);
+		error = xs_scanf(XST_NIL, otherend_path,
+				 ring_ref_name, NULL, "%" PRIu32,
+			         &xbb->ring_config.ring_ref[ring_idx]);
+		if (error != 0) {
+			xenbus_dev_fatal(xbb->dev, error,
+					 "Failed to retriev grant reference "
+					 "for page %u of shared ring.  Unable "
+					 "to connect.", ring_idx);
+			return (error);
+		}
+	}
+
+	error = xs_gather(XST_NIL, otherend_path,
+			  "protocol", "%63s", protocol_abi,
+			  NULL); 
+	if (error != 0
+	 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
+		/*
+		 * Assume native if the frontend has not
+		 * published ABI data or it has published and
+		 * matches our own ABI.
+		 */
+		xbb->abi = BLKIF_PROTOCOL_NATIVE;
+	} else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
+
+		xbb->abi = BLKIF_PROTOCOL_X86_32;
+	} else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
+
+		xbb->abi = BLKIF_PROTOCOL_X86_64;
+	} else {
+
+		xenbus_dev_fatal(xbb->dev, EINVAL,
+				 "Unknown protocol ABI (%s) published by "
+				 "frontend.  Unable to connect.", protocol_abi);
+		return (EINVAL);
+	}
+	return (0);
 }
 
-/* Newbus vbd device driver probe */
+/**
+ * Allocate per-request data structures given request size and number
+ * information negotiated with the front-end.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ */
 static int
-vbd_probe(device_t dev)
+xbb_alloc_requests(struct xbb_softc *xbb)
 {
-	DPRINTF("vbd%d\n", device_get_unit(dev));
-	return 0;
+	struct xbb_xen_req *req;
+	struct xbb_xen_req *last_req;
+	uint8_t		   *req_kva;
+	u_long		    gnt_base;
+
+	/*
+	 * Allocate request bookkeeping data structures.
+	 */
+	xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
+			       M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
+	if (xbb->requests == NULL) {
+		xenbus_dev_fatal(xbb->dev, ENOMEM, 
+				  "Unable to allocate request structures");
+		return (ENOMEM);
+	}
+
+	req_kva  = (uint8_t *)xbb->kva;
+	gnt_base = xbb->gnt_base_addr;
+	req      = xbb->requests;
+	last_req = &xbb->requests[xbb->max_requests - 1];
+	while (req <= last_req) {
+		int seg;
+
+		req->xbb         = xbb;
+		req->kva         = req_kva;
+		req->gnt_handles = malloc(xbb->max_request_segments
+					* sizeof(*req->gnt_handles),
+					  M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
+		if (req->gnt_handles == NULL) {
+			xenbus_dev_fatal(xbb->dev, ENOMEM,
+					  "Unable to allocate request "
+					  "grant references");
+			return (ENOMEM);
+		}
+#ifdef XBB_USE_BOUNCE_BUFFERS
+		req->bounce = malloc(xbb->max_request_size,
+				     M_XENBLOCKBACK, M_NOWAIT);
+		if (req->bounce == NULL) {
+			xenbus_dev_fatal(xbb->dev, ENOMEM, 
+					 "Unable to allocate request "
+					 "bounce buffers");
+			return (ENOMEM);
+		}
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+		req->gnt_base = gnt_base;
+		req_kva      += xbb->max_request_segments * PAGE_SIZE;
+		gnt_base     += xbb->max_request_segments * PAGE_SIZE;
+		SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links);
+
+		for (seg = 0; seg < xbb->max_request_segments; seg++)
+			req->gnt_handles[seg] = GRANT_REF_INVALID;
+
+		req++;
+	}
+	return (0);
 }
 
-/* Newbus vbd device driver attach */
+/**
+ * Supply information about the physical device to the frontend
+ * via XenBus.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ */
 static int
-vbd_attach(device_t dev) 
+xbb_publish_backend_info(struct xbb_softc *xbb)
 {
-	blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
+	struct xs_transaction xst;
+	const char	     *our_path;
+	const char	     *leaf;
+	int		      error;
 
-	DPRINTF("%s\n", blkif->dev_name);
+	our_path = xenbus_get_node(xbb->dev);
+	while (1) {
+		error = xs_transaction_start(&xst);
+		if (error != 0) {
+			xenbus_dev_fatal(xbb->dev, error,
+					 "Error publishing backend info "
+					 "(start transaction)");
+			return (error);
+		}
 
-	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
-	    OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD,
-	    dev, VBD_SYSCTL_DOMID, vbd_sysctl_handler, "I",
-	    "domid of frontend");
-	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
-	    OID_AUTO, "rd_reqs", CTLTYPE_INT|CTLFLAG_RD,
-	    dev, VBD_SYSCTL_ST_RD_REQ, vbd_sysctl_handler, "I",
-	    "number of read reqs");
-	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
-	    OID_AUTO, "wr_reqs", CTLTYPE_INT|CTLFLAG_RD,
-	    dev, VBD_SYSCTL_ST_WR_REQ, vbd_sysctl_handler, "I",
-	    "number of write reqs");
-	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
-	    OID_AUTO, "oo_reqs", CTLTYPE_INT|CTLFLAG_RD,
-	    dev, VBD_SYSCTL_ST_OO_REQ, vbd_sysctl_handler, "I",
-	    "number of deferred reqs");
-	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
-	    OID_AUTO, "err_reqs", CTLTYPE_INT|CTLFLAG_RD,
-	    dev, VBD_SYSCTL_ST_ERR_REQ, vbd_sysctl_handler, "I",
-	    "number of reqs that returned error");
-#if XEN_BLKBACK_DEBUG
-	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
-	    OID_AUTO, "ring", CTLFLAG_RD,
-	    dev, VBD_SYSCTL_RING, vbd_sysctl_handler, "A",
-	    "req ring info");
+		leaf = "sectors";
+		error = xs_printf(xst, our_path, leaf,
+				  "%"PRIu64, xbb->media_num_sectors);
+		if (error != 0)
+			break;
+
+		/* XXX Support all VBD attributes here. */
+		leaf = "info";
+		error = xs_printf(xst, our_path, leaf, "%u",
+				  xbb->flags & XBBF_READ_ONLY
+				? VDISK_READONLY : 0);
+		if (error != 0)
+			break;
+
+		leaf = "sector-size";
+		error = xs_printf(xst, our_path, leaf, "%u",
+				  xbb->sector_size);
+		if (error != 0)
+			break;
+
+		error = xs_transaction_end(xst, 0);
+		if (error == 0) {
+			return (0);
+		} else if (error != EAGAIN) {
+			xenbus_dev_fatal(xbb->dev, error, "ending transaction");
+			return (error);
+		}
+	}
+
+	xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
+			our_path, leaf);
+	xs_transaction_end(xst, 1);
+	return (error);
+}
+
+/**
+ * Connect to our blkfront peer now that it has completed publishing
+ * its configuration into the XenStore.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ */
+static void
+xbb_connect(struct xbb_softc *xbb)
+{
+	int		      error;
+
+	if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
+		return;
+
+	if (xbb_collect_frontend_info(xbb) != 0)
+		return;
+
+	/* Allocate resources whose size depends on front-end configuration. */
+	error = xbb_alloc_communication_mem(xbb);
+	if (error != 0) {
+		xenbus_dev_fatal(xbb->dev, error,
+				 "Unable to allocate communication memory");
+		return;
+	}
+
+	error = xbb_alloc_requests(xbb);
+	if (error != 0) {
+		/* Specific errors are reported by xbb_alloc_requests(). */
+		return;
+	}
+
+	/*
+	 * Connect communication channel.
+	 */
+	error = xbb_connect_ring(xbb);
+	if (error != 0) {
+		/* Specific errors are reported by xbb_connect_ring(). */
+		return;
+	}
+	
+	if (xbb_publish_backend_info(xbb) != 0) {
+		/*
+		 * If we can't publish our data, we cannot participate
+		 * in this connection, and waiting for a front-end state
+		 * change will not help the situation.
+		 */
+		xbb_disconnect(xbb);
+		return;
+	}
+
+	/* Ready for I/O. */
+	xenbus_set_state(xbb->dev, XenbusStateConnected);
+}
+
+/*-------------------------- Device Teardown Support -------------------------*/
+/**
+ * Perform device shutdown functions.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ *
+ * Mark this instance as shutting down, wait for any active I/O on the
+ * backend device/file to drain, disconnect from the front-end, and notify
+ * any waiters (e.g. a thread invoking our detach method) that detach can
+ * now proceed.
+ */
+static int
+xbb_shutdown(struct xbb_softc *xbb)
+{
+	static int in_shutdown;
+
+	DPRINTF("\n");
+
+	/*
+	 * Due to the need to drop our mutex during some
+	 * xenbus operations, it is possible for two threads
+	 * to attempt to close out shutdown processing at
+	 * the same time.  Tell the caller that hits this
+	 * race to try again later.
+	 */
+	if (in_shutdown != 0)
+		return (EAGAIN);
+
+	DPRINTF("\n");
+
+	/* Indicate shutdown is in progress. */
+	xbb->flags |= XBBF_SHUTDOWN;
+
+	/* Wait for requests to complete. */
+	if (xbb->active_request_count != 0)
+		return (EAGAIN);
+	
+	DPRINTF("\n");
+
+	/* Disconnect from the front-end. */
+	xbb_disconnect(xbb);
+
+	in_shutdown = 1;
+	mtx_unlock(&xbb->lock);
+	xenbus_set_state(xbb->dev, XenbusStateClosed);
+	mtx_lock(&xbb->lock);
+	in_shutdown = 0;
+
+	/* Indicate to xbb_detach() that it is safe to proceed. */
+	wakeup(xbb);
+
+	return (0);
+}
+
+/**
+ * Report an attach time error to the console and Xen, and cleanup
+ * this instance by forcing immediate detach processing.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ * \param err  Errno describing the error.
+ * \param fmt  Printf style format and arguments
+ */
+static void
+xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
+{
+	va_list ap;
+	va_list ap_hotplug;
+
+	va_start(ap, fmt);
+	va_copy(ap_hotplug, ap);
+	xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
+		  "hotplug-error", fmt, ap_hotplug);
+	va_end(ap_hotplug);
+	xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+		  "hotplug-status", "error");
+
+	xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
+	va_end(ap);
+
+	xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+		  "online", "0");
+	xbb_detach(xbb->dev);
+}
+
+/*---------------------------- NewBus Entrypoints ----------------------------*/
+/**
+ * Inspect a XenBus device and claim it if it is of the appropriate type.
+ * 
+ * \param dev  NewBus device object representing a candidate XenBus device.
+ *
+ * \return  0 for success, errno codes for failure.
+ */
+static int
+xbb_probe(device_t dev)
+{
+ 
+        if (!strcmp(xenbus_get_type(dev), "vbd")) {
+                device_set_desc(dev, "Backend Virtual Block Device");
+                device_quiet(dev);
+                return (0);
+        }
+
+        return (ENXIO);
+}
+
+/**
+ * Attach to a XenBus device that has been claimed by our probe routine.
+ *
+ * \param dev  NewBus device object representing this Xen Block Back instance.
+ *
+ * \return  0 for success, errno codes for failure.
+ */
+static int
+xbb_attach(device_t dev)
+{
+	struct xbb_softc   *xbb;
+	int		    error;
+
+	DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
+
+	/*
+	 * Basic initialization.
+	 * After this block it is safe to call xbb_detach()
+	 * to clean up any allocated data for this instance.
+	 */
+	xbb = device_get_softc(dev);
+	xbb->dev = dev;
+	xbb->otherend_id = xenbus_get_otherend_id(dev);
+	TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
+	mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
+	SLIST_INIT(&xbb->request_free_slist);
+
+	/*
+	 * Protocol defaults valid even if all negotiation fails.
+	 */
+	xbb->ring_config.ring_pages = 1;
+	xbb->max_requests	    = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
+	xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
+	xbb->max_request_size	    = xbb->max_request_segments * PAGE_SIZE;
+
+	/*
+	 * Publish protocol capabilities for consumption by the
+	 * front-end.
+	 */
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "feature-barrier", "1");
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "feature-flush-cache", "1");
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "max-requests", "%u", XBB_MAX_REQUESTS);
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/max-requests",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "max-request-segments", "%u",
+			  XBB_MAX_SEGMENTS_PER_REQUEST);
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "max-request-size", "%u",
+			  XBB_MAX_REQUEST_SIZE);
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/max-request-size",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
+	/* Collect physical device information; "device-type" is optional. */
+	error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
+			  "device-type", NULL, &xbb->dev_type,
+			  NULL);
+	if (error != 0)
+		xbb->dev_type = NULL;
+
+	error = xs_gather(XST_NIL, xenbus_get_node(dev),
+			  "mode", NULL, &xbb->dev_mode,
+			  "params", NULL, &xbb->dev_name,
+			  NULL);
+	if (error != 0) {
+		xbb_attach_failed(xbb, error, "reading backend fields at %s",
+				  xenbus_get_node(dev));
+		return (ENXIO);
+	}
+
+	/* Parse fopen style mode flags. */
+	if (strchr(xbb->dev_mode, 'w') == NULL)
+		xbb->flags |= XBBF_READ_ONLY;
+
+	/*
+	 * Verify the physical device is present and can support
+	 * the desired I/O mode.
+	 */
+	DROP_GIANT();
+	error = xbb_open_backend(xbb);
+	PICKUP_GIANT();
+	if (error != 0) {
+		xbb_attach_failed(xbb, error, "Unable to open %s",
+				  xbb->dev_name);
+		return (ENXIO);
+	}
+
+	/* Use devstat(9) for recording statistics. */
+	xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
+					   xbb->sector_size,
+					   DEVSTAT_ALL_SUPPORTED,
+					   DEVSTAT_TYPE_DIRECT
+					 | DEVSTAT_TYPE_IF_OTHER,
+					   DEVSTAT_PRIORITY_OTHER);
+	/*
+	 * Create a taskqueue for doing work that must occur from a
+	 * thread context.
+	 */
+	xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT,
+					     taskqueue_thread_enqueue,
+					     /*context*/&xbb->io_taskqueue);
+	if (xbb->io_taskqueue == NULL) {
+		xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
+		return (ENOMEM);	/* 'error' is still 0 at this point. */
+	}
+
+	taskqueue_start_threads(&xbb->io_taskqueue,
+				/*num threads*/1,
+				/*priority*/PWAIT,
+				/*thread name*/
+				"%s taskq", device_get_nameunit(dev));
+
+	/* Update hot-plug status to satisfy xend. */
+	error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+			  "hotplug-status", "connected");
+	if (error) {
+		xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
+				  xenbus_get_node(xbb->dev));
+		return (error);
+	}
+
+	/* Tell the front end that we are ready to connect. */
+	xenbus_set_state(dev, XenbusStateInitWait);
+
+	return (0);
+}
+
+/**
+ * Detach from a block back device instance.
+ *
+ * \param dev  NewBus device object representing this Xen Block Back instance.
+ *
+ * \return  0 for success, errno codes for failure.
+ * 
+ * \note A block back device may be detached at any time in its life-cycle,
+ *       including part way through the attach process.  For this reason,
+ *       initialization order and the initialization state checks in this
+ *       routine must be carefully coupled so that attach time failures
+ *       are gracefully handled.
+ */
+static int
+xbb_detach(device_t dev)
+{
+	struct xbb_softc *xbb;
+
+	DPRINTF("\n");
+
+	xbb = device_get_softc(dev);
+	mtx_lock(&xbb->lock);
+	while (xbb_shutdown(xbb) == EAGAIN) {
+		msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
+		       "xbb_shutdown", 0);
+	}
+	mtx_unlock(&xbb->lock);
+	mtx_destroy(&xbb->lock);
+
+	DPRINTF("\n");
+
+	/* An early attach failure reaches here before these are created. */
+	if (xbb->io_taskqueue != NULL)
+		taskqueue_free(xbb->io_taskqueue);
+	if (xbb->xbb_stats != NULL)
+		devstat_remove_entry(xbb->xbb_stats);
+
+	xbb_close_backend(xbb);
+	xbb_free_communication_mem(xbb);
+
+	if (xbb->dev_mode != NULL) {
+		free(xbb->dev_mode, M_XENBUS);
+		xbb->dev_mode = NULL;
+	}
+
+	if (xbb->dev_type != NULL) {
+		free(xbb->dev_type, M_XENBUS);
+		xbb->dev_type = NULL;
+	}
+
+	if (xbb->dev_name != NULL) {
+		free(xbb->dev_name, M_XENBUS);
+		xbb->dev_name = NULL;
+	}
+
+	if (xbb->requests != NULL) {
+		struct xbb_xen_req *req, *last_req;
+
+		last_req = &xbb->requests[xbb->max_requests - 1];
+		for (req = xbb->requests; req <= last_req; req++) {
+#ifdef XBB_USE_BOUNCE_BUFFERS
+			if (req->bounce != NULL) {
+				free(req->bounce, M_XENBLOCKBACK);
+				req->bounce = NULL;
+			}
+#endif
+			if (req->gnt_handles != NULL) {
+				free(req->gnt_handles, M_XENBLOCKBACK);
+				req->gnt_handles = NULL;
+			}
+		}
+		free(xbb->requests, M_XENBLOCKBACK);
+		xbb->requests = NULL;
+	}
+
+	return (0);
+}
+
+/**
+ * Prepare this block back device for suspension of this VM.
+ * 
+ * \param dev  NewBus device object representing this Xen Block Back instance.
+ *
+ * \return  0 for success, errno codes for failure.
+ */
+static int
+xbb_suspend(device_t dev)
+{
+#ifdef NOT_YET
+        struct xbb_softc *sc = device_get_softc(dev);
+
+        /* Prevent new requests being issued until we fix things up. */
+        mtx_lock(&sc->xb_io_lock);
+        sc->connected = BLKIF_STATE_SUSPENDED;
+        mtx_unlock(&sc->xb_io_lock);
 #endif
 
-	if (!open_device(blkif))
-		connect(blkif);
-
-	return bus_generic_attach(dev);
+        return (0);
 }
 
-/* Newbus vbd device driver detach */
-static int
-vbd_detach(device_t dev)
-{
-	blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
-
-	DPRINTF("%s\n", blkif->dev_name);
-
-	close_device(blkif);
-
-	bus_generic_detach(dev);
-
-	blkif_put(blkif);
-
-	return 0;
-}
-
-static device_method_t vbd_methods[] = {
-	/* Device interface */
-	DEVMETHOD(device_probe,		vbd_probe),
-	DEVMETHOD(device_attach, 	vbd_attach),
-	DEVMETHOD(device_detach,	vbd_detach),
-	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
-	DEVMETHOD(device_suspend,	bus_generic_suspend),
-	DEVMETHOD(device_resume,	bus_generic_resume),
-	{0, 0}
-};
-
-static devclass_t vbd_devclass;
-
-static driver_t vbd_driver = {
-	"vbd",
-	vbd_methods,
-	0,
-};
-
-DRIVER_MODULE(vbd, nexus, vbd_driver, vbd_devclass, 0, 0);
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: t
- * End:
+/**
+ * Perform any processing required to recover from a suspended state.
+ * 
+ * \param dev  NewBus device object representing this Xen Block Back instance.
+ *
+ * \return  0 for success, errno codes for failure.
  */
+static int
+xbb_resume(device_t dev)
+{
+	return (0);	/* No resume-time work is done; always reports success. */
+}
+
+/**
+ * Handle state changes expressed via the XenStore by our front-end peer.
+ *
+ * \param dev             NewBus device object representing this Xen
+ *                        Block Back instance.
+ * \param frontend_state  The new state of the front-end.
+ *
+ * \return  Always 0; an unexpected state is reported via xenbus_dev_fatal().
+ */
+static int
+xbb_frontend_changed(device_t dev, XenbusState frontend_state)
+{
+	struct xbb_softc *xbb = device_get_softc(dev);
+
+	DPRINTF("state=%s\n", xenbus_strstate(frontend_state));
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+	case XenbusStateClosing:
+		break;		/* No action is taken for these states. */
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+		xbb_connect(xbb);
+		break;
+	case XenbusStateClosed:
+	case XenbusStateInitWait:
+		/* Lock held across xbb_shutdown(); its return value is ignored. */
+		mtx_lock(&xbb->lock);
+		xbb_shutdown(xbb);
+		mtx_unlock(&xbb->lock);
+		break;
+	default:
+		xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
+				 frontend_state);
+		break;
+	}
+	return (0);
+}
+
+/*---------------------------- NewBus Registration ---------------------------*/
+static device_method_t xbb_methods[] = {
+	/* Device interface: NewBus entry points for this driver. */
+	DEVMETHOD(device_probe,		xbb_probe),
+	DEVMETHOD(device_attach,	xbb_attach),
+	DEVMETHOD(device_detach,	xbb_detach),
+	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
+	DEVMETHOD(device_suspend,	xbb_suspend),
+	DEVMETHOD(device_resume,	xbb_resume),
+
+	/* Xenbus interface: front-end state change notifications. */
+	DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
+
+	{ 0, 0 }	/* End-of-table sentinel. */
+};
+
+static driver_t xbb_driver = {
+        "xbbd",				/* Driver name. */
+        xbb_methods,			/* Method table defined above. */
+        sizeof(struct xbb_softc),	/* Size of the per-instance softc. */
+};
+devclass_t xbb_devclass;
+
+DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);
diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c
index 6c222ea6ecf..8ff87574bd5 100644
--- a/sys/dev/xen/blkfront/blkfront.c
+++ b/sys/dev/xen/blkfront/blkfront.c
@@ -49,8 +49,10 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 #include <sys/bus_dma.h>
 
+#include <machine/_inttypes.h>
 #include <machine/xen/xen-os.h>
 #include <machine/xen/xenfunc.h>
+
 #include <xen/hypervisor.h>
 #include <xen/xen_intr.h>
 #include <xen/evtchn.h>
@@ -68,17 +70,21 @@ __FBSDID("$FreeBSD$");
 /* prototypes */
 static void xb_free_command(struct xb_command *cm);
 static void xb_startio(struct xb_softc *sc);
-static void connect(struct xb_softc *);
+static void blkfront_connect(struct xb_softc *);
 static void blkfront_closing(device_t);
 static int blkfront_detach(device_t);
-static int talk_to_backend(struct xb_softc *);
 static int setup_blkring(struct xb_softc *);
 static void blkif_int(void *);
+static void blkfront_initialize(struct xb_softc *);
+#if 0
 static void blkif_recover(struct xb_softc *);
-static void blkif_completion(struct xb_command *);
+#endif
+static int blkif_completion(struct xb_command *);
 static void blkif_free(struct xb_softc *, int);
 static void blkif_queue_cb(void *, bus_dma_segment_t *, int, int);
 
+MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data");
+
 #define GRANT_INVALID_REF 0
 
 /* Control whether runtime update of vbds is enabled. */
@@ -113,11 +119,6 @@ static char * blkif_status_name[] = {
 #define DPRINTK(fmt, args...) 
 #endif
 
-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
-    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
-
-#define BLKIF_MAXIO	(32 * 1024)
-
 static int blkif_open(struct disk *dp);
 static int blkif_close(struct disk *dp);
 static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td);
@@ -202,8 +203,8 @@ blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
 }
 
 int
-xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity,
-    int vdevice, uint16_t vdisk_info, uint16_t sector_size)
+xlvbd_add(struct xb_softc *sc, blkif_sector_t sectors,
+    int vdevice, uint16_t vdisk_info, unsigned long sector_size)
 {
 	int	unit, error = 0;
 	const char *name;
@@ -215,7 +216,6 @@ xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity,
 	if (strcmp(name, "xbd"))
 		device_printf(sc->xb_dev, "attaching as %s%d\n", name, unit);
 
-	memset(&sc->xb_disk, 0, sizeof(sc->xb_disk)); 
 	sc->xb_disk = disk_alloc();
 	sc->xb_disk->d_unit = sc->xb_unit;
 	sc->xb_disk->d_open = blkif_open;
@@ -227,20 +227,14 @@ xlvbd_add(struct xb_softc *sc, blkif_sector_t capacity,
 	sc->xb_disk->d_drv1 = sc;
 	sc->xb_disk->d_sectorsize = sector_size;
 
-	sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT;
-	sc->xb_disk->d_maxsize = BLKIF_MAXIO;
+	sc->xb_disk->d_mediasize = sectors * sector_size;
+	sc->xb_disk->d_maxsize = sc->max_request_size;
 	sc->xb_disk->d_flags = 0;
 	disk_create(sc->xb_disk, DISK_VERSION_00);
 
 	return error;
 }
 
-void
-xlvbd_del(struct xb_softc *sc)
-{
-
-	disk_destroy(sc->xb_disk);
-}
 /************************ end VBD support *****************/
 
 /*
@@ -357,15 +351,16 @@ xb_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
 			return (EBUSY);
 		}
 
-		if (gnttab_alloc_grant_references(
-		    BLKIF_MAX_SEGMENTS_PER_REQUEST, &cm->gref_head) < 0) {
+		if (gnttab_alloc_grant_references(sc->max_request_segments,
+						  &cm->gref_head) != 0) {
 			xb_free_command(cm);
 			mtx_unlock(&sc->xb_io_lock);
 			device_printf(sc->xb_dev, "no more grant allocs?\n");
 			return (EBUSY);
 		}
 
-		chunk = length > BLKIF_MAXIO ? BLKIF_MAXIO : length;
+		chunk = length > sc->max_request_size
+		      ? sc->max_request_size : length;
 		cm->data = virtual;
 		cm->datalen = chunk;
 		cm->operation = BLKIF_OP_WRITE;
@@ -423,16 +418,18 @@ static int
 blkfront_attach(device_t dev)
 {
 	struct xb_softc *sc;
-	struct xb_command *cm;
 	const char *name;
-	int error, vdevice, i, unit;
+	int error;
+	int vdevice;
+	int i;
+	int unit;
 
 	/* FIXME: Use dynamic device id if this is not set. */
-	error = xenbus_scanf(XBT_NIL, xenbus_get_node(dev),
+	error = xs_scanf(XST_NIL, xenbus_get_node(dev),
 	    "virtual-device", NULL, "%i", &vdevice);
 	if (error) {
 		xenbus_dev_fatal(dev, error, "reading virtual-device");
-		printf("couldn't find virtual device");
+		device_printf(dev, "Couldn't determine virtual device.\n");
 		return (error);
 	}
 
@@ -447,51 +444,18 @@ blkfront_attach(device_t dev)
 	xb_initq_ready(sc);
 	xb_initq_complete(sc);
 	xb_initq_bio(sc);
-
-	/* Allocate parent DMA tag */
-	if (bus_dma_tag_create(	NULL,			/* parent */
-				512, 4096,		/* algnmnt, boundary */
-				BUS_SPACE_MAXADDR,	/* lowaddr */
-				BUS_SPACE_MAXADDR,	/* highaddr */
-				NULL, NULL,		/* filter, filterarg */
-				BLKIF_MAXIO,		/* maxsize */
-				BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* nsegments */
-				PAGE_SIZE,		/* maxsegsize */
-				BUS_DMA_ALLOCNOW,	/* flags */
-				busdma_lock_mutex,	/* lockfunc */
-				&sc->xb_io_lock,	/* lockarg */
-				&sc->xb_io_dmat)) {
-		device_printf(dev, "Cannot allocate parent DMA tag\n");
-		return (ENOMEM);
-	}
-#ifdef notyet
-	if (bus_dma_tag_set(sc->xb_io_dmat, BUS_DMA_SET_MINSEGSZ,
-		XBD_SECTOR_SIZE)) {
-		device_printf(dev, "Cannot set sector size\n");
-		return (EINVAL);
-	}
-#endif		
+	for (i = 0; i < XBF_MAX_RING_PAGES; i++)
+		sc->ring_ref[i] = GRANT_INVALID_REF;
 
 	sc->xb_dev = dev;
 	sc->vdevice = vdevice;
 	sc->connected = BLKIF_STATE_DISCONNECTED;
 
-	/* work queue needed ? */
-	for (i = 0; i < BLK_RING_SIZE; i++) {
-		cm = &sc->shadow[i];
-		cm->req.id = i;
-		cm->cm_sc = sc;
-		if (bus_dmamap_create(sc->xb_io_dmat, 0, &cm->map) != 0)
-			break;
-		xb_free_command(cm);
-	}
-
 	/* Front end dir is a number, which is used as the id. */
 	sc->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0);
 
-	error = talk_to_backend(sc);
-	if (error)
-		return (error);
+	/* Wait for backend device to publish its protocol capabilities. */
+	xenbus_set_state(dev, XenbusStateInitialising);
 
 	return (0);
 }
@@ -512,121 +476,265 @@ blkfront_suspend(device_t dev)
 static int
 blkfront_resume(device_t dev)
 {
+#if 0
 	struct xb_softc *sc = device_get_softc(dev);
-	int err;
 
 	DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));
 
+/* XXX This can't work!!! */
 	blkif_free(sc, 1);
-	err = talk_to_backend(sc);
-	if (sc->connected == BLKIF_STATE_SUSPENDED && !err)
+	blkfront_initialize(sc);
+	if (sc->connected == BLKIF_STATE_SUSPENDED)
 		blkif_recover(sc);
-
-	return (err);
+#endif
+	return (0);
 }
 
-/* Common code used when first setting up, and when resuming. */
-static int
-talk_to_backend(struct xb_softc *sc)
+static void
+blkfront_initialize(struct xb_softc *sc)
 {
-	device_t dev;
-	struct xenbus_transaction xbt;
-	const char *message = NULL;
-	int err;
+	const char *otherend_path;
+	const char *node_path;
+	int error;
+	int i;
 
-	/* Create shared ring, alloc event channel. */
-	dev = sc->xb_dev;
-	err = setup_blkring(sc);
-	if (err)
-		goto out;
+	if (xenbus_get_state(sc->xb_dev) != XenbusStateInitialising)
+                return;
 
- again:
-	err = xenbus_transaction_start(&xbt);
-	if (err) {
-		xenbus_dev_fatal(dev, err, "starting transaction");
-		goto destroy_blkring;
+	/*
+	 * Protocol defaults valid even if negotiation for a
+	 * setting fails.
+	 */
+	sc->ring_pages = 1;
+	sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
+	sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
+	sc->max_request_size = sc->max_request_segments * PAGE_SIZE;
+	sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
+
+	/*
+	 * Protocol negotiation.
+	 *
+	 * \note xs_gather() returns on the first encountered error, so
+	 *       we must use independent calls in order to guarantee
+	 *       we don't miss information in a sparsely populated back-end
+	 *       tree.
+	 */
+	otherend_path = xenbus_get_otherend_path(sc->xb_dev);
+	node_path = xenbus_get_node(sc->xb_dev);
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-ring-pages", NULL, "%" PRIu32,
+		       &sc->ring_pages);
+
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-requests", NULL, "%" PRIu32,
+		       &sc->max_requests);
+
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-request-segments", NULL, "%" PRIu32,
+		       &sc->max_request_segments);
+
+	(void)xs_scanf(XST_NIL, otherend_path,
+		       "max-request-size", NULL, "%" PRIu32,
+		       &sc->max_request_size);
+
+	if (sc->ring_pages > XBF_MAX_RING_PAGES) {
+		device_printf(sc->xb_dev, "Back-end specified ring-pages of "
+			      "%u limited to front-end limit of %zu.\n",
+			      sc->ring_pages, XBF_MAX_RING_PAGES);
+		sc->ring_pages = XBF_MAX_RING_PAGES;
 	}
 
-	err = xenbus_printf(xbt, xenbus_get_node(dev),
-			    "ring-ref","%u", sc->ring_ref);
-	if (err) {
-		message = "writing ring-ref";
-		goto abort_transaction;
-	}
-	err = xenbus_printf(xbt, xenbus_get_node(dev),
-		"event-channel", "%u", irq_to_evtchn_port(sc->irq));
-	if (err) {
-		message = "writing event-channel";
-		goto abort_transaction;
-	}
-	err = xenbus_printf(xbt, xenbus_get_node(dev),
-		"protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
-	if (err) {
-		message = "writing protocol";
-		goto abort_transaction;
+	if (sc->max_requests > XBF_MAX_REQUESTS) {
+		device_printf(sc->xb_dev, "Back-end specified max_requests of "
+			      "%u limited to front-end limit of %u.\n",
+			      sc->max_requests, XBF_MAX_REQUESTS);
+		sc->max_requests = XBF_MAX_REQUESTS;
 	}
 
-	err = xenbus_transaction_end(xbt, 0);
-	if (err) {
-		if (err == EAGAIN)
-			goto again;
-		xenbus_dev_fatal(dev, err, "completing transaction");
-		goto destroy_blkring;
+	if (sc->max_request_segments > XBF_MAX_SEGMENTS_PER_REQUEST) {
+		device_printf(sc->xb_dev, "Back-end specificed "
+			      "max_requests_segments of %u limited to "
+			      "front-end limit of %u.\n",
+			      sc->max_request_segments,
+			      XBF_MAX_SEGMENTS_PER_REQUEST);
+		sc->max_request_segments = XBF_MAX_SEGMENTS_PER_REQUEST;
 	}
-	xenbus_set_state(dev, XenbusStateInitialised);
-	
-	return 0;
 
- abort_transaction:
-	xenbus_transaction_end(xbt, 1);
-	if (message)
-		xenbus_dev_fatal(dev, err, "%s", message);
- destroy_blkring:
-	blkif_free(sc, 0);
- out:
-	return err;
+	if (sc->max_request_size > XBF_MAX_REQUEST_SIZE) {
+		device_printf(sc->xb_dev, "Back-end specificed "
+			      "max_request_size of %u limited to front-end "
+			      "limit of %u.\n", sc->max_request_size,
+			      XBF_MAX_REQUEST_SIZE);
+		sc->max_request_size = XBF_MAX_REQUEST_SIZE;
+	}
+	sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
+
+	/* Allocate datastructures based on negotiated values. */
+	error = bus_dma_tag_create(NULL,		/* parent */
+				   512, PAGE_SIZE,	/* algnmnt, boundary */
+				   BUS_SPACE_MAXADDR,	/* lowaddr */
+				   BUS_SPACE_MAXADDR,	/* highaddr */
+				   NULL, NULL,		/* filter, filterarg */
+				   sc->max_request_size,
+				   sc->max_request_segments,
+				   PAGE_SIZE,		/* maxsegsize */
+				   BUS_DMA_ALLOCNOW,	/* flags */
+				   busdma_lock_mutex,	/* lockfunc */
+				   &sc->xb_io_lock,	/* lockarg */
+				   &sc->xb_io_dmat);
+	if (error != 0) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "Cannot allocate parent DMA tag\n");
+		return;
+	}
+
+	/* Per-transaction data allocation. */
+	sc->shadow = malloc(sizeof(*sc->shadow) * sc->max_requests,
+			    M_XENBLOCKFRONT, M_NOWAIT|M_ZERO);
+	if (sc->shadow == NULL) {
+		xenbus_dev_fatal(sc->xb_dev, ENOMEM,
+				 "Cannot allocate request structures\n");
+		return;		/* The loop below would dereference NULL. */
+	}
+	for (i = 0; i < sc->max_requests; i++) {
+		struct xb_command *cm;
+
+		cm = &sc->shadow[i];
+		cm->sg_refs = malloc(sizeof(grant_ref_t)
+				   * sc->max_request_segments,
+				     M_XENBLOCKFRONT, M_NOWAIT);
+		if (cm->sg_refs == NULL)
+			break;
+		cm->id = i;
+		cm->cm_sc = sc;
+		if (bus_dmamap_create(sc->xb_io_dmat, 0, &cm->map) != 0)
+			break;
+		xb_free_command(cm);
+	}
+
+	if (setup_blkring(sc) != 0)
+		return;
+
+	error = xs_printf(XST_NIL, node_path,
+			 "ring-pages","%u", sc->ring_pages);
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "writing %s/ring-pages",
+				 node_path);
+		return;
+	}
+
+	error = xs_printf(XST_NIL, node_path,
+			 "max-requests","%u", sc->max_requests);
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "writing %s/max-requests",
+				 node_path);
+		return;
+	}
+
+	error = xs_printf(XST_NIL, node_path,
+			 "max-request-segments","%u", sc->max_request_segments);
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "writing %s/max-request-segments",
+				 node_path);
+		return;
+	}
+
+	error = xs_printf(XST_NIL, node_path,
+			 "max-request-size","%u", sc->max_request_size);
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "writing %s/max-request-size",
+				 node_path);
+		return;
+	}
+
+	error = xs_printf(XST_NIL, node_path, "event-channel",
+			  "%u", irq_to_evtchn_port(sc->irq));
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "writing %s/event-channel",
+				 node_path);
+		return;
+	}
+
+	error = xs_printf(XST_NIL, node_path,
+			  "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error,
+				 "writing %s/protocol",
+				 node_path);
+		return;
+	}
+
+	xenbus_set_state(sc->xb_dev, XenbusStateInitialised);
 }
 
 static int 
 setup_blkring(struct xb_softc *sc)
 {
 	blkif_sring_t *sring;
+	uintptr_t sring_page_addr;
 	int error;
+	int i;
 
-	sc->ring_ref = GRANT_INVALID_REF;
-
-	sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO);
+	sring = malloc(sc->ring_pages * PAGE_SIZE, M_XENBLOCKFRONT,
+		       M_NOWAIT|M_ZERO);
 	if (sring == NULL) {
 		xenbus_dev_fatal(sc->xb_dev, ENOMEM, "allocating shared ring");
-		return ENOMEM;
+		return (ENOMEM);
 	}
 	SHARED_RING_INIT(sring);
-	FRONT_RING_INIT(&sc->ring, sring, PAGE_SIZE);
+	FRONT_RING_INIT(&sc->ring, sring, sc->ring_pages * PAGE_SIZE);
 
-	error = xenbus_grant_ring(sc->xb_dev,
-	    (vtomach(sc->ring.sring) >> PAGE_SHIFT), &sc->ring_ref);
-	if (error) {
-		free(sring, M_DEVBUF);
-		sc->ring.sring = NULL;
-		goto fail;
+	for (i = 0, sring_page_addr = (uintptr_t)sring;
+	     i < sc->ring_pages;
+	     i++, sring_page_addr += PAGE_SIZE) {
+
+		error = xenbus_grant_ring(sc->xb_dev,
+		    (vtomach(sring_page_addr) >> PAGE_SHIFT), &sc->ring_ref[i]);
+		if (error) {
+			xenbus_dev_fatal(sc->xb_dev, error,
+					 "granting ring_ref(%d)", i);
+			return (error);
+		}
 	}
-	
-	error = bind_listening_port_to_irqhandler(xenbus_get_otherend_id(sc->xb_dev),
+	error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
+			  "ring-ref","%u", sc->ring_ref[0]);
+	if (error) {
+		xenbus_dev_fatal(sc->xb_dev, error, "writing %s/ring-ref",
+				 xenbus_get_node(sc->xb_dev));
+		return (error);
+	}
+	for (i = 1; i < sc->ring_pages; i++) {
+		char ring_ref_name[]= "ring_refXX";
+
+		snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
+		error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
+				 ring_ref_name, "%u", sc->ring_ref[i]);
+		if (error) {
+			xenbus_dev_fatal(sc->xb_dev, error, "writing %s/%s",
+					 xenbus_get_node(sc->xb_dev),
+					 ring_ref_name);
+			return (error);
+		}
+	}
+
+	error = bind_listening_port_to_irqhandler(
+	    xenbus_get_otherend_id(sc->xb_dev),
 	    "xbd", (driver_intr_t *)blkif_int, sc,
 	    INTR_TYPE_BIO | INTR_MPSAFE, &sc->irq);
 	if (error) {
 		xenbus_dev_fatal(sc->xb_dev, error,
 		    "bind_evtchn_to_irqhandler failed");
-		goto fail;
+		return (error);
 	}
 
 	return (0);
- fail:
-	blkif_free(sc, 0);
-	return (error);
 }
 
-
 /**
  * Callback received when the backend's state changes.
  */
@@ -640,15 +748,19 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state)
 	switch (backend_state) {
 	case XenbusStateUnknown:
 	case XenbusStateInitialising:
-	case XenbusStateInitWait:
-	case XenbusStateInitialised:
-	case XenbusStateClosed:
 	case XenbusStateReconfigured:
 	case XenbusStateReconfiguring:
+	case XenbusStateClosed:
 		break;
 
+	case XenbusStateInitWait:
+		blkfront_initialize(sc);
+		break;
+
+	case XenbusStateInitialised:
 	case XenbusStateConnected:
-		connect(sc);
+		blkfront_initialize(sc);
+		blkfront_connect(sc);
 		break;
 
 	case XenbusStateClosing:
@@ -657,20 +769,7 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state)
 					 "Device in use; refusing to close");
 		else
 			blkfront_closing(dev);
-#ifdef notyet
-		bd = bdget(sc->dev);
-		if (bd == NULL)
-			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
-
-		down(&bd->bd_sem);
-		if (sc->users > 0)
-			xenbus_dev_error(dev, -EBUSY,
-					 "Device in use; refusing to close");
-		else
-			blkfront_closing(dev);
-		up(&bd->bd_sem);
-		bdput(bd);
-#endif
+		break;	
 	}
 
 	return (0);
@@ -681,7 +780,7 @@ blkfront_backend_changed(device_t dev, XenbusState backend_state)
 ** the details about the physical device - #sectors, size, etc). 
 */
 static void 
-connect(struct xb_softc *sc)
+blkfront_connect(struct xb_softc *sc)
 {
 	device_t dev = sc->xb_dev;
 	unsigned long sectors, sector_size;
@@ -694,20 +793,20 @@ connect(struct xb_softc *sc)
 
 	DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));
 
-	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
-			    "sectors", "%lu", &sectors,
-			    "info", "%u", &binfo,
-			    "sector-size", "%lu", &sector_size,
-			    NULL);
+	err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
+			"sectors", "%lu", &sectors,
+			"info", "%u", &binfo,
+			"sector-size", "%lu", &sector_size,
+			NULL);
 	if (err) {
 		xenbus_dev_fatal(dev, err,
 		    "reading backend fields at %s",
 		    xenbus_get_otherend_path(dev));
 		return;
 	}
-	err = xenbus_gather(XBT_NIL, xenbus_get_otherend_path(dev),
-			    "feature-barrier", "%lu", &feature_barrier,
-			    NULL);
+	err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
+			"feature-barrier", "%lu", &feature_barrier,
+			NULL);
 	if (!err || feature_barrier)
 		sc->xb_flags |= XB_BARRIER;
 
@@ -741,15 +840,16 @@ blkfront_closing(device_t dev)
 {
 	struct xb_softc *sc = device_get_softc(dev);
 
+	xenbus_set_state(dev, XenbusStateClosing);
+
 	DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));
 
-	if (sc->mi) {
-		DPRINTK("Calling xlvbd_del\n");
-		xlvbd_del(sc);
-		sc->mi = NULL;
+	if (sc->xb_disk != NULL) {
+		disk_destroy(sc->xb_disk);
+		sc->xb_disk = NULL;
 	}
 
-	xenbus_set_state(dev, XenbusStateClosed);
+	xenbus_set_state(dev, XenbusStateClosed); 
 }
 
 
@@ -778,11 +878,16 @@ flush_requests(struct xb_softc *sc)
 		notify_remote_via_irq(sc->irq);
 }
 
-static void blkif_restart_queue_callback(void *arg)
+static void
+blkif_restart_queue_callback(void *arg)
 {
 	struct xb_softc *sc = arg;
 
+	mtx_lock(&sc->xb_io_lock);
+
 	xb_startio(sc);
+
+	mtx_unlock(&sc->xb_io_lock);
 }
 
 static int
@@ -874,20 +979,17 @@ xb_bio_command(struct xb_softc *sc)
 		return (NULL);
 	}
 
-	if (gnttab_alloc_grant_references(BLKIF_MAX_SEGMENTS_PER_REQUEST,
-	    &cm->gref_head) < 0) {
+	if (gnttab_alloc_grant_references(sc->max_request_segments,
+	    &cm->gref_head) != 0) {
 		gnttab_request_free_callback(&sc->callback,
 			blkif_restart_queue_callback, sc,
-			BLKIF_MAX_SEGMENTS_PER_REQUEST);
+			sc->max_request_segments);
 		xb_requeue_bio(sc, bp);
 		xb_enqueue_free(cm);
 		sc->xb_flags |= XB_FROZEN;
 		return (NULL);
 	}
 
-	/* XXX Can we grab refs before doing the load so that the ref can
-	 * be filled out here?
-	 */
 	cm->bp = bp;
 	cm->data = bp->bio_data;
 	cm->datalen = bp->bio_bcount;
@@ -921,13 +1023,19 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 	struct xb_softc *sc;
 	struct xb_command *cm;
 	blkif_request_t	*ring_req;
+	struct blkif_request_segment *sg;
+        struct blkif_request_segment *last_block_sg;
+	grant_ref_t *sg_ref;
 	vm_paddr_t buffer_ma;
 	uint64_t fsect, lsect;
-	int ref, i, op;
+	int ref;
+	int op;
+	int block_segs;
 
 	cm = arg;
 	sc = cm->cm_sc;
 
+//printf("%s: Start\n", __func__);
 	if (error) {
 		printf("error %d in blkif_queue_cb\n", error);
 		cm->bp->bio_error = EIO;
@@ -938,43 +1046,62 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 
 	/* Fill out a communications ring structure. */
 	ring_req = RING_GET_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
-	if (ring_req == NULL) {
-		/* XXX Is this possible? */
-		printf("ring_req NULL, requeuing\n");
-		xb_enqueue_ready(cm);
-		return;
-	}
-	ring_req->id = cm->req.id;
+	sc->ring.req_prod_pvt++;
+	ring_req->id = cm->id;
 	ring_req->operation = cm->operation;
 	ring_req->sector_number = cm->sector_number;
 	ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xb_disk;
 	ring_req->nr_segments = nsegs;
+	cm->nseg = nsegs;
 
-	for (i = 0; i < nsegs; i++) {
-		buffer_ma = segs[i].ds_addr;
-		fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
-		lsect = fsect + (segs[i].ds_len  >> XBD_SECTOR_SHFT) - 1;
+	block_segs    = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
+	sg            = ring_req->seg;
+	last_block_sg = sg + block_segs;
+	sg_ref        = cm->sg_refs;
 
-		KASSERT(lsect <= 7, 
-		    ("XEN disk driver data cannot cross a page boundary"));
+	while (1) {
 
-		/* install a grant reference. */
-		ref = gnttab_claim_grant_reference(&cm->gref_head);
-		KASSERT( ref >= 0, ("grant_reference failed") );
+		while (sg < last_block_sg) {
+			buffer_ma = segs->ds_addr;
+			fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
+			lsect = fsect + (segs->ds_len  >> XBD_SECTOR_SHFT) - 1;
 
-		gnttab_grant_foreign_access_ref(
-			ref,
-			xenbus_get_otherend_id(sc->xb_dev),
-			buffer_ma >> PAGE_SHIFT,
-			ring_req->operation & 1 ); /* ??? */
+			KASSERT(lsect <= 7, ("XEN disk driver data cannot "
+				"cross a page boundary"));
 
-		ring_req->seg[i] =
-			(struct blkif_request_segment) {
+			/* install a grant reference. */
+			ref = gnttab_claim_grant_reference(&cm->gref_head);
+
+			/*
+			 * GNTTAB_LIST_END == 0xffffffff, but it is private
+			 * to gnttab.c.
+			 */
+			KASSERT(ref != ~0, ("grant_reference failed"));
+
+			gnttab_grant_foreign_access_ref(
+				ref,
+				xenbus_get_otherend_id(sc->xb_dev),
+				buffer_ma >> PAGE_SHIFT,
+				ring_req->operation == BLKIF_OP_WRITE);
+
+			*sg_ref = ref;
+			*sg = (struct blkif_request_segment) {
 				.gref       = ref,
 				.first_sect = fsect, 
 				.last_sect  = lsect };
-	}
+			sg++;
+			sg_ref++;
+			segs++;
+			nsegs--;
+		}
+		block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
+                if (block_segs == 0)
+                        break;
 
+                sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
+		sc->ring.req_prod_pvt++;
+                last_block_sg = sg + block_segs;
+	}
 
 	if (cm->operation == BLKIF_OP_READ)
 		op = BUS_DMASYNC_PREREAD;
@@ -984,15 +1111,10 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 		op = 0;
 	bus_dmamap_sync(sc->xb_io_dmat, cm->map, op);
 
-	sc->ring.req_prod_pvt++;
-
-	/* Keep a private copy so we can reissue requests when recovering. */
-	cm->req = *ring_req;
+	gnttab_free_grant_references(cm->gref_head);
 
 	xb_enqueue_busy(cm);
 
-	gnttab_free_grant_references(cm->gref_head);
-
 	/*
 	 * This flag means that we're probably executing in the busdma swi
 	 * instead of in the startio context, so an explicit flush is needed.
@@ -1000,6 +1122,7 @@ blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 	if (cm->cm_flags & XB_CMD_FROZEN)
 		flush_requests(sc);
 
+//printf("%s: Done\n", __func__);
 	return;
 }
 
@@ -1018,7 +1141,7 @@ xb_startio(struct xb_softc *sc)
 
 	mtx_assert(&sc->xb_io_lock, MA_OWNED);
 
-	while (!RING_FULL(&sc->ring)) {
+	while (RING_FREE_REQUESTS(&sc->ring) >= sc->max_request_blocks) {
 		if (sc->xb_flags & XB_FROZEN)
 			break;
 
@@ -1061,12 +1184,12 @@ blkif_int(void *xsc)
 	rp = sc->ring.sring->rsp_prod;
 	rmb(); /* Ensure we see queued responses up to 'rp'. */
 
-	for (i = sc->ring.rsp_cons; i != rp; i++) {
+	for (i = sc->ring.rsp_cons; i != rp;) {
 		bret = RING_GET_RESPONSE(&sc->ring, i);
 		cm   = &sc->shadow[bret->id];
 
 		xb_remove_busy(cm);
-		blkif_completion(cm);
+		i += blkif_completion(cm);
 
 		if (cm->operation == BLKIF_OP_READ)
 			op = BUS_DMASYNC_POSTREAD;
@@ -1116,35 +1239,61 @@ blkif_int(void *xsc)
 static void 
 blkif_free(struct xb_softc *sc, int suspend)
 {
+	uint8_t *sring_page_ptr;
+	int i;
 	
-/* Prevent new requests being issued until we fix things up. */
+	/* Prevent new requests being issued until we fix things up. */
 	mtx_lock(&sc->xb_io_lock);
 	sc->connected = suspend ? 
 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; 
 	mtx_unlock(&sc->xb_io_lock);
 
 	/* Free resources associated with old device channel. */
-	if (sc->ring_ref != GRANT_INVALID_REF) {
-		gnttab_end_foreign_access(sc->ring_ref, 
-					  sc->ring.sring);
-		sc->ring_ref = GRANT_INVALID_REF;
+	if (sc->ring.sring != NULL) {
+		sring_page_ptr = (uint8_t *)sc->ring.sring;
+		for (i = 0; i < sc->ring_pages; i++) {
+			if (sc->ring_ref[i] != GRANT_INVALID_REF) {
+				gnttab_end_foreign_access_ref(sc->ring_ref[i]);
+				sc->ring_ref[i] = GRANT_INVALID_REF;
+			}
+			sring_page_ptr += PAGE_SIZE;
+		}
+		free(sc->ring.sring, M_XENBLOCKFRONT);
 		sc->ring.sring = NULL;
 	}
-	if (sc->irq)
-		unbind_from_irqhandler(sc->irq);
-	sc->irq = 0;
 
+	if (sc->shadow) {
+
+		for (i = 0; i < sc->max_requests; i++) {
+			struct xb_command *cm;
+
+			cm = &sc->shadow[i];
+			if (cm->sg_refs != NULL) {
+				free(cm->sg_refs, M_XENBLOCKFRONT);
+				cm->sg_refs = NULL;
+			}
+
+			bus_dmamap_destroy(sc->xb_io_dmat, cm->map);
+		}
+		free(sc->shadow, M_XENBLOCKFRONT);
+		sc->shadow = NULL;
+	}
+		
+	if (sc->irq) {
+		unbind_from_irqhandler(sc->irq);
+		sc->irq = 0;
+	}
 }
 
-static void 
+static int
 blkif_completion(struct xb_command *s)
 {
-	int i;
-
-	for (i = 0; i < s->req.nr_segments; i++)
-		gnttab_end_foreign_access(s->req.seg[i].gref, 0UL);
+//printf("%s: Req %p(%d)\n", __func__, s, s->nseg);
+	gnttab_end_foreign_access_references(s->nseg, s->sg_refs);
+	return (BLKIF_SEGS_TO_BLOCKS(s->nseg));
 }
 
+#if 0
 static void 
 blkif_recover(struct xb_softc *sc)
 {
@@ -1157,6 +1306,7 @@ blkif_recover(struct xb_softc *sc)
 	 * has been removed until further notice.
 	 */
 }
+#endif
 
 /* ** Driver registration ** */
 static device_method_t blkfront_methods[] = { 
@@ -1169,7 +1319,7 @@ static device_method_t blkfront_methods[] = {
 	DEVMETHOD(device_resume,        blkfront_resume), 
  
 	/* Xenbus interface */
-	DEVMETHOD(xenbus_backend_changed, blkfront_backend_changed),
+	DEVMETHOD(xenbus_otherend_changed, blkfront_backend_changed),
 
 	{ 0, 0 } 
 }; 
@@ -1181,4 +1331,4 @@ static driver_t blkfront_driver = {
 }; 
 devclass_t blkfront_devclass; 
  
-DRIVER_MODULE(xbd, xenbus, blkfront_driver, blkfront_devclass, 0, 0); 
+DRIVER_MODULE(xbd, xenbusb_front, blkfront_driver, blkfront_devclass, 0, 0); 
diff --git a/sys/dev/xen/blkfront/block.h b/sys/dev/xen/blkfront/block.h
index 32bfc96a095..6235e515afe 100644
--- a/sys/dev/xen/blkfront/block.h
+++ b/sys/dev/xen/blkfront/block.h
@@ -32,7 +32,43 @@
 
 #ifndef __XEN_DRIVERS_BLOCK_H__
 #define __XEN_DRIVERS_BLOCK_H__
-#include <xen/interface/io/blkif.h>
+#include <xen/blkif.h>
+
+/**
+ * The maximum number of outstanding request blocks (request headers plus
+ * additional segment blocks) we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define XBF_MAX_REQUESTS		256
+
+/**
+ * The maximum mapped region size per request we will allow in a negotiated
+ * block-front/back communication channel.
+ *
+ * \note We reserve a segment from the maximum supported by the transport to
+ *       guarantee we can handle an unaligned transfer without the need to
+ *       use a bounce buffer.
+ */
+#define	XBF_MAX_REQUEST_SIZE		\
+	MIN(MAXPHYS, (BLKIF_MAX_SEGMENTS_PER_REQUEST - 1) * PAGE_SIZE)
+
+/**
+ * The maximum number of segments (within a request header and accompanying
+ * segment blocks) per request we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define	XBF_MAX_SEGMENTS_PER_REQUEST		\
+	(MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,	\
+	     (XBF_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))
+
+/**
+ * The maximum number of shared memory ring pages we will allow in a
+ * negotiated block-front/back communication channel.  Allow enough
+ * ring space for all requests to be  XBF_MAX_REQUEST_SIZE'd.
+ */
+#define XBF_MAX_RING_PAGES						    \
+	BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBF_MAX_SEGMENTS_PER_REQUEST) \
+		       * XBF_MAX_REQUESTS)
 
 struct xlbd_type_info
 {
@@ -62,19 +98,19 @@ struct xb_command {
 #define XB_ON_XBQ_COMPLETE	(1<<5)
 #define XB_ON_XBQ_MASK		((1<<2)|(1<<3)|(1<<4)|(1<<5))
 	bus_dmamap_t		map;
-	blkif_request_t		req;
+	uint64_t		id;
+	grant_ref_t		*sg_refs;
 	struct bio		*bp;
 	grant_ref_t		gref_head;
 	void			*data;
 	size_t			datalen;
+	u_int			nseg;
 	int			operation;
 	blkif_sector_t		sector_number;
 	int			status;
 	void			(* cm_complete)(struct xb_command *);
 };
 
-#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
-
 #define XBQ_FREE	0
 #define XBQ_BIO		1
 #define XBQ_READY	2
@@ -108,10 +144,14 @@ struct xb_softc {
 	int			vdevice;
 	blkif_vdev_t		handle;
 	int			connected;
-	int			ring_ref;
+	u_int			ring_pages;
+	uint32_t		max_requests;
+	uint32_t		max_request_segments;
+	uint32_t		max_request_blocks;
+	uint32_t		max_request_size;
+	grant_ref_t		ring_ref[XBF_MAX_RING_PAGES];
 	blkif_front_ring_t	ring;
 	unsigned int		irq;
-	struct xlbd_major_info	*mi;
 	struct gnttab_free_callback	callback;
 	TAILQ_HEAD(,xb_command)	cm_free;
 	TAILQ_HEAD(,xb_command)	cm_ready;
@@ -126,11 +166,12 @@ struct xb_softc {
 	 */
 	int			users;
 	struct mtx		xb_io_lock;
-	struct xb_command	shadow[BLK_RING_SIZE];
+
+	struct xb_command      *shadow;
 };
 
-int xlvbd_add(struct xb_softc *, blkif_sector_t capacity, int device,
-	      uint16_t vdisk_info, uint16_t sector_size);
+int xlvbd_add(struct xb_softc *, blkif_sector_t sectors, int device,
+	      uint16_t vdisk_info, unsigned long sector_size);
 void xlvbd_del(struct xb_softc *);
 
 #define XBQ_ADD(sc, qname)					\
@@ -188,7 +229,8 @@ void xlvbd_del(struct xb_softc *);
 		struct xb_command *cm;					\
 									\
 		if ((cm = TAILQ_FIRST(&sc->cm_ ## name)) != NULL) {	\
-			if ((cm->cm_flags & XB_ON_ ## index) == 0) {	\
+			if ((cm->cm_flags & XB_ON_XBQ_MASK) !=		\
+			     XB_ON_ ## index) {				\
 				printf("command %p not in queue, "	\
 				    "flags = %#x, bit = %#x\n", cm,	\
 				    cm->cm_flags, XB_ON_ ## index);	\
@@ -203,7 +245,7 @@ void xlvbd_del(struct xb_softc *);
 	static __inline void						\
 	xb_remove_ ## name (struct xb_command *cm)			\
 	{								\
-		if ((cm->cm_flags & XB_ON_ ## index) == 0) {		\
+		if ((cm->cm_flags & XB_ON_XBQ_MASK) != XB_ON_ ## index){\
 			printf("command %p not in queue, flags = %#x, " \
 			    "bit = %#x\n", cm, cm->cm_flags,		\
 			    XB_ON_ ## index);				\
diff --git a/sys/dev/xen/control/control.c b/sys/dev/xen/control/control.c
new file mode 100644
index 00000000000..c03d5365530
--- /dev/null
+++ b/sys/dev/xen/control/control.c
@@ -0,0 +1,493 @@
+/*-
+ * Copyright (c) 2010 Justin T. Gibbs, Spectra Logic Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ */
+
+/*-
+ * PV suspend/resume support:
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * Copyright (c) 2004-2006,2008 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * HVM suspend/resume support:
+ *
+ * Copyright (c) 2008 Citrix Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/**
+ * \file control.c
+ *
+ * \brief Device driver to respond to control domain events that impact
+ *        this VM.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/filedesc.h>
+#include <sys/kdb.h>
+#include <sys/module.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/reboot.h>
+#include <sys/rman.h>
+#include <sys/taskqueue.h>
+#include <sys/types.h>
+#include <sys/vnode.h>
+
+#ifndef XENHVM
+#include <sys/sched.h>
+#include <sys/smp.h>
+#endif
+
+
+#include <geom/geom.h>
+
+#include <machine/_inttypes.h>
+#include <machine/xen/xen-os.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+
+#include <xen/blkif.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/xen_intr.h>
+
+#include <xen/interface/event_channel.h>
+#include <xen/interface/grant_table.h>
+
+#include <xen/xenbus/xenbusvar.h>
+
+#define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*(x)))
+
+/*--------------------------- Forward Declarations --------------------------*/
+/** Function signature for shutdown event handlers. */
+typedef	void (xctrl_shutdown_handler_t)(void);
+
+static xctrl_shutdown_handler_t xctrl_poweroff;
+static xctrl_shutdown_handler_t xctrl_reboot;
+static xctrl_shutdown_handler_t xctrl_suspend;
+static xctrl_shutdown_handler_t xctrl_crash;
+static xctrl_shutdown_handler_t xctrl_halt;
+
+/*-------------------------- Private Data Structures -------------------------*/
+/** Element type for lookup table of event name to handler. */
+struct xctrl_shutdown_reason {
+	const char		 *name;
+	xctrl_shutdown_handler_t *handler;
+};
+
+/** Lookup table for shutdown event name to handler. */
+static struct xctrl_shutdown_reason xctrl_shutdown_reasons[] = {
+	{ "poweroff", xctrl_poweroff },
+	{ "reboot",   xctrl_reboot   },
+	{ "suspend",  xctrl_suspend  },
+	{ "crash",    xctrl_crash    },
+	{ "halt",     xctrl_halt     },
+};
+
+struct xctrl_softc {
+
+	/** Must be first */
+	struct xs_watch    xctrl_watch;	
+};
+
+/*------------------------------ Event Handlers ------------------------------*/
+static void
+xctrl_poweroff()
+{
+	shutdown_nice(RB_POWEROFF|RB_HALT);
+}
+
+static void
+xctrl_reboot()
+{
+	shutdown_nice(0);
+}
+
+#ifndef XENHVM
+extern void xencons_suspend(void);
+extern void xencons_resume(void);
+
+/* Full PV mode suspension. */
+static void
+xctrl_suspend()
+{
+	int i, j, k, fpp;
+	unsigned long max_pfn, start_info_mfn;
+
+#ifdef SMP
+	cpumask_t map;
+	/*
+	 * Bind us to CPU 0 and stop any other VCPUs.
+	 */
+	thread_lock(curthread);
+	sched_bind(curthread, 0);
+	thread_unlock(curthread);
+	KASSERT(PCPU_GET(cpuid) == 0, ("xen_suspend: not running on cpu 0"));
+
+	map = PCPU_GET(other_cpus) & ~stopped_cpus;
+	if (map)
+		stop_cpus(map);
+#endif
+
+	if (DEVICE_SUSPEND(root_bus) != 0) {
+		printf("xen_suspend: device_suspend failed\n");
+#ifdef SMP
+		if (map)
+			restart_cpus(map);
+#endif
+		return;
+	}
+
+	local_irq_disable();
+
+	xencons_suspend();
+	gnttab_suspend();
+
+	max_pfn = HYPERVISOR_shared_info->arch.max_pfn;
+
+	void *shared_info = HYPERVISOR_shared_info;
+	HYPERVISOR_shared_info = NULL;
+	pmap_kremove((vm_offset_t) shared_info);
+	PT_UPDATES_FLUSH();
+
+	xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn);
+	xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn);
+
+	/*
+	 * We'll stop somewhere inside this hypercall. When it returns,
+	 * we'll start resuming after the restore.
+	 */
+	start_info_mfn = VTOMFN(xen_start_info);
+	pmap_suspend();
+	HYPERVISOR_suspend(start_info_mfn);
+	pmap_resume();
+
+	pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info);
+	HYPERVISOR_shared_info = shared_info;
+
+	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+		VTOMFN(xen_pfn_to_mfn_frame_list_list);
+  
+	fpp = PAGE_SIZE/sizeof(unsigned long);
+	for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
+		if ((j % fpp) == 0) {
+			k++;
+			xen_pfn_to_mfn_frame_list_list[k] = 
+				VTOMFN(xen_pfn_to_mfn_frame_list[k]);
+			j = 0;
+		}
+		xen_pfn_to_mfn_frame_list[k][j] = 
+			VTOMFN(&xen_phys_machine[i]);
+	}
+	HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
+
+	gnttab_resume();
+	irq_resume();
+	local_irq_enable();
+	xencons_resume();
+
+#ifdef CONFIG_SMP
+	for_each_cpu(i)
+		vcpu_prepare(i);
+
+#endif
+	/* 
+	 * Only resume xenbus /after/ we've prepared our VCPUs; otherwise
+	 * the VCPU hotplug callback can race with our vcpu_prepare
+	 */
+	DEVICE_RESUME(root_bus);
+
+#ifdef SMP
+	thread_lock(curthread);
+	sched_unbind(curthread);
+	thread_unlock(curthread);
+	if (map)
+		restart_cpus(map);
+#endif
+}
+
+static void
+xen_pv_shutdown_final(void *arg, int howto)
+{
+	/*
+	 * Inform the hypervisor that shutdown is complete.
+	 * This is not necessary in HVM domains since Xen
+	 * emulates ACPI in that mode and FreeBSD's ACPI
+	 * support will request this transition.
+	 */
+	if (howto & (RB_HALT | RB_POWEROFF))
+		HYPERVISOR_shutdown(SHUTDOWN_poweroff);
+	else
+		HYPERVISOR_shutdown(SHUTDOWN_reboot);
+}
+
+#else
+extern void xenpci_resume(void);
+
+/* HVM mode suspension. */
+static void
+xctrl_suspend()
+{
+	int suspend_cancelled;
+
+	if (DEVICE_SUSPEND(root_bus)) {
+		printf("xen_suspend: device_suspend failed\n");
+		return;
+	}
+
+	/*
+	 * Make sure we don't change cpus or switch to some other
+	 * thread for the duration.
+	 */
+	critical_enter();
+
+	/*
+	 * Prevent any races with evtchn_interrupt() handler.
+	 */
+	irq_suspend();
+	disable_intr();
+
+	suspend_cancelled = HYPERVISOR_suspend(0);
+	if (!suspend_cancelled)
+		xenpci_resume();
+
+	/*
+	 * Re-enable interrupts and put the scheduler back to normal.
+	 */
+	enable_intr();
+	critical_exit();
+
+	/*
+	 * FreeBSD really needs to add DEVICE_SUSPEND_CANCEL or
+	 * similar.
+	 */
+	if (!suspend_cancelled)
+		DEVICE_RESUME(root_bus);
+}
+#endif
+
+static void
+xctrl_crash()
+{
+	panic("Xen directed crash");
+}
+
+static void
+xctrl_halt()
+{
+	shutdown_nice(RB_HALT);
+}
+
+/*------------------------------ Event Reception -----------------------------*/
+static void
+xctrl_on_watch_event(struct xs_watch *watch, const char **vec, unsigned int len)
+{
+	struct xctrl_shutdown_reason *reason;
+	struct xctrl_shutdown_reason *last_reason;
+	char *result;
+	int   error;
+	int   result_len;
+	
+	error = xs_read(XST_NIL, "control", "shutdown",
+			&result_len, (void **)&result);
+	if (error != 0)
+		return;
+
+	reason = xctrl_shutdown_reasons;
+	last_reason = reason + NUM_ELEMENTS(xctrl_shutdown_reasons);
+	while (reason < last_reason) {
+
+		if (!strcmp(result, reason->name)) {
+			reason->handler();
+			break;
+		}
+		reason++;
+	}
+
+	free(result, M_XENSTORE);
+}
+
+/*------------------ Private Device Attachment Functions  --------------------*/
+/**
+ * \brief Identify instances of this device type in the system.
+ *
+ * \param driver  The driver performing this identify action.
+ * \param parent  The NewBus parent device for any devices this method adds.
+ */
+static void
+xctrl_identify(driver_t *driver __unused, device_t parent)
+{
+	/*
+	 * A single device instance for our driver is always present
+	 * in a system operating under Xen.
+	 */
+	BUS_ADD_CHILD(parent, 0, driver->name, 0);
+}
+
+/**
+ * \brief Probe for the existence of the Xen Control device
+ *
+ * \param dev  NewBus device_t for this Xen control instance.
+ *
+ * \return  Always returns 0 indicating success.
+ */
+static int 
+xctrl_probe(device_t dev)
+{
+	device_set_desc(dev, "Xen Control Device");
+
+	return (0);
+}
+
+/**
+ * \brief Attach the Xen control device.
+ *
+ * \param dev  NewBus device_t for this Xen control instance.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+static int
+xctrl_attach(device_t dev)
+{
+	struct xctrl_softc *xctrl;
+
+	xctrl = device_get_softc(dev);
+
+	/* Activate watch */
+	xctrl->xctrl_watch.node = "control/shutdown";
+	xctrl->xctrl_watch.callback = xctrl_on_watch_event;
+	xs_register_watch(&xctrl->xctrl_watch);
+
+#ifndef XENHVM
+	EVENTHANDLER_REGISTER(shutdown_final, xen_pv_shutdown_final, NULL,
+			      SHUTDOWN_PRI_LAST);
+#endif
+
+	return (0);
+}
+
+/**
+ * \brief Detach the Xen control device.
+ *
+ * \param dev  NewBus device_t for this Xen control device instance.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+static int
+xctrl_detach(device_t dev)
+{
+	struct xctrl_softc *xctrl;
+
+	xctrl = device_get_softc(dev);
+
+	/* Release watch */
+	xs_unregister_watch(&xctrl->xctrl_watch);
+
+	return (0);
+}
+
+/*-------------------- Private Device Attachment Data  -----------------------*/
+static device_method_t xctrl_methods[] = { 
+	/* Device interface */ 
+	DEVMETHOD(device_identify,	xctrl_identify),
+	DEVMETHOD(device_probe,         xctrl_probe), 
+	DEVMETHOD(device_attach,        xctrl_attach), 
+	DEVMETHOD(device_detach,        xctrl_detach), 
+ 
+	{ 0, 0 } 
+}; 
+
+DEFINE_CLASS_0(xctrl, xctrl_driver, xctrl_methods, sizeof(struct xctrl_softc));
+devclass_t xctrl_devclass; 
+ 
+DRIVER_MODULE(xctrl, xenstore, xctrl_driver, xctrl_devclass, 0, 0);
diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c
index a6fd9ea2886..423df976154 100644
--- a/sys/dev/xen/netfront/netfront.c
+++ b/sys/dev/xen/netfront/netfront.c
@@ -91,8 +91,6 @@ __FBSDID("$FreeBSD$");
 
 #define XN_CSUM_FEATURES	(CSUM_TCP | CSUM_UDP | CSUM_TSO)
 
-#define GRANT_INVALID_REF	0
-
 #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
 #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
 
@@ -373,7 +371,8 @@ xennet_get_rx_ref(struct netfront_info *np, RING_IDX ri)
 {
 	int i = xennet_rxidx(ri);
 	grant_ref_t ref = np->grant_rx_ref[i];
-	np->grant_rx_ref[i] = GRANT_INVALID_REF;
+	KASSERT(ref != GRANT_REF_INVALID, ("Invalid grant reference!\n"));
+	np->grant_rx_ref[i] = GRANT_REF_INVALID;
 	return ref;
 }
 
@@ -404,7 +403,7 @@ xen_net_read_mac(device_t dev, uint8_t mac[])
 	int error, i;
 	char *s, *e, *macstr;
 
-	error = xenbus_read(XBT_NIL, xenbus_get_node(dev), "mac", NULL,
+	error = xs_read(XST_NIL, xenbus_get_node(dev), "mac", NULL,
 	    (void **) &macstr);
 	if (error)
 		return (error);
@@ -413,12 +412,12 @@ xen_net_read_mac(device_t dev, uint8_t mac[])
 	for (i = 0; i < ETHER_ADDR_LEN; i++) {
 		mac[i] = strtoul(s, &e, 16);
 		if (s == e || (e[0] != ':' && e[0] != 0)) {
-			free(macstr, M_DEVBUF);
+			free(macstr, M_XENBUS);
 			return (ENOENT);
 		}
 		s = &e[1];
 	}
-	free(macstr, M_DEVBUF);
+	free(macstr, M_XENBUS);
 	return (0);
 }
 
@@ -483,7 +482,7 @@ static int
 talk_to_backend(device_t dev, struct netfront_info *info)
 {
 	const char *message;
-	struct xenbus_transaction xbt;
+	struct xs_transaction xst;
 	const char *node = xenbus_get_node(dev);
 	int err;
 
@@ -499,54 +498,54 @@ talk_to_backend(device_t dev, struct netfront_info *info)
 		goto out;
 	
  again:
-	err = xenbus_transaction_start(&xbt);
+	err = xs_transaction_start(&xst);
 	if (err) {
 		xenbus_dev_fatal(dev, err, "starting transaction");
 		goto destroy_ring;
 	}
-	err = xenbus_printf(xbt, node, "tx-ring-ref","%u",
+	err = xs_printf(xst, node, "tx-ring-ref","%u",
 			info->tx_ring_ref);
 	if (err) {
 		message = "writing tx ring-ref";
 		goto abort_transaction;
 	}
-	err = xenbus_printf(xbt, node, "rx-ring-ref","%u",
+	err = xs_printf(xst, node, "rx-ring-ref","%u",
 			info->rx_ring_ref);
 	if (err) {
 		message = "writing rx ring-ref";
 		goto abort_transaction;
 	}
-	err = xenbus_printf(xbt, node,
+	err = xs_printf(xst, node,
 			"event-channel", "%u", irq_to_evtchn_port(info->irq));
 	if (err) {
 		message = "writing event-channel";
 		goto abort_transaction;
 	}
-	err = xenbus_printf(xbt, node, "request-rx-copy", "%u",
+	err = xs_printf(xst, node, "request-rx-copy", "%u",
 			info->copying_receiver);
 	if (err) {
 		message = "writing request-rx-copy";
 		goto abort_transaction;
 	}
-	err = xenbus_printf(xbt, node, "feature-rx-notify", "%d", 1);
+	err = xs_printf(xst, node, "feature-rx-notify", "%d", 1);
 	if (err) {
 		message = "writing feature-rx-notify";
 		goto abort_transaction;
 	}
-	err = xenbus_printf(xbt, node, "feature-sg", "%d", 1);
+	err = xs_printf(xst, node, "feature-sg", "%d", 1);
 	if (err) {
 		message = "writing feature-sg";
 		goto abort_transaction;
 	}
 #if __FreeBSD_version >= 700000
-	err = xenbus_printf(xbt, node, "feature-gso-tcpv4", "%d", 1);
+	err = xs_printf(xst, node, "feature-gso-tcpv4", "%d", 1);
 	if (err) {
 		message = "writing feature-gso-tcpv4";
 		goto abort_transaction;
 	}
 #endif
 
-	err = xenbus_transaction_end(xbt, 0);
+	err = xs_transaction_end(xst, 0);
 	if (err) {
 		if (err == EAGAIN)
 			goto again;
@@ -557,7 +556,7 @@ talk_to_backend(device_t dev, struct netfront_info *info)
 	return 0;
 	
  abort_transaction:
-	xenbus_transaction_end(xbt, 1);
+	xs_transaction_end(xst, 1);
 	xenbus_dev_fatal(dev, err, "%s", message);
  destroy_ring:
 	netif_free(info);
@@ -576,8 +575,8 @@ setup_device(device_t dev, struct netfront_info *info)
 	
 	ifp = info->xn_ifp;
 
-	info->tx_ring_ref = GRANT_INVALID_REF;
-	info->rx_ring_ref = GRANT_INVALID_REF;
+	info->tx_ring_ref = GRANT_REF_INVALID;
+	info->rx_ring_ref = GRANT_REF_INVALID;
 	info->rx.sring = NULL;
 	info->tx.sring = NULL;
 	info->irq = 0;
@@ -750,7 +749,7 @@ netif_release_tx_bufs(struct netfront_info *np)
 		    GNTMAP_readonly);
 		gnttab_release_grant_reference(&np->gref_tx_head,
 		    np->grant_tx_ref[i]);
-		np->grant_tx_ref[i] = GRANT_INVALID_REF;
+		np->grant_tx_ref[i] = GRANT_REF_INVALID;
 		add_id_to_freelist(np->tx_mbufs, i);
 		np->xn_cdata.xn_tx_chain_cnt--;
 		if (np->xn_cdata.xn_tx_chain_cnt < 0) {
@@ -854,7 +853,8 @@ refill:
 		sc->rx_mbufs[id] = m_new;
 
 		ref = gnttab_claim_grant_reference(&sc->gref_rx_head);
-		KASSERT((short)ref >= 0, ("negative ref"));
+		KASSERT(ref != GNTTAB_LIST_END,
+			("reserved grant references exhuasted"));
 		sc->grant_rx_ref[id] = ref;
 
 		vaddr = mtod(m_new, vm_offset_t);
@@ -1135,7 +1135,7 @@ xn_txeof(struct netfront_info *np)
 				np->grant_tx_ref[id]);
 			gnttab_release_grant_reference(
 				&np->gref_tx_head, np->grant_tx_ref[id]);
-			np->grant_tx_ref[id] = GRANT_INVALID_REF;
+			np->grant_tx_ref[id] = GRANT_REF_INVALID;
 			
 			np->tx_mbufs[id] = NULL;
 			add_id_to_freelist(np->tx_mbufs, id);
@@ -1318,12 +1318,13 @@ xennet_get_responses(struct netfront_info *np,
 		 * the backend driver. In future this should flag the bad
 		 * situation to the system controller to reboot the backed.
 		 */
-		if (ref == GRANT_INVALID_REF) {
+		if (ref == GRANT_REF_INVALID) {
 
 #if 0 				
 			if (net_ratelimit())
 				WPRINTK("Bad rx response id %d.\n", rx->id);
 #endif			
+			printf("%s: Bad rx response id %d.\n", __func__,rx->id);
 			err = EINVAL;
 			goto next;
 		}
@@ -1384,7 +1385,7 @@ next_skip_queue:
 			err = ENOENT;
 			printf("%s: cons %u frags %u rp %u, not enough frags\n",
 			       __func__, *cons, frags, rp);
-				break;
+			break;
 		}
 		/*
 		 * Note that m can be NULL, if rx->status < 0 or if
@@ -1526,6 +1527,11 @@ xn_assemble_tx_request(struct netfront_info *sc, struct mbuf *m_head)
 	 * tell the TCP stack to generate a shorter chain of packets.
 	 */
 	if (nfrags > MAX_TX_REQ_FRAGS) {
+#ifdef DEBUG
+		printf("%s: nfrags %d > MAX_TX_REQ_FRAGS %d, netback "
+		       "won't be able to handle it, dropping\n",
+		       __func__, nfrags, MAX_TX_REQ_FRAGS);
+#endif
 		m_freem(m_head);
 		return (EMSGSIZE);
 	}
@@ -1881,11 +1887,11 @@ network_connect(struct netfront_info *np)
 	netif_rx_request_t *req;
 	u_int feature_rx_copy, feature_rx_flip;
 
-	error = xenbus_scanf(XBT_NIL, xenbus_get_otherend_path(np->xbdev),
+	error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
 	    "feature-rx-copy", NULL, "%u", &feature_rx_copy);
 	if (error)
 		feature_rx_copy = 0;
-	error = xenbus_scanf(XBT_NIL, xenbus_get_otherend_path(np->xbdev),
+	error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
 	    "feature-rx-flip", NULL, "%u", &feature_rx_flip);
 	if (error)
 		feature_rx_flip = 1;
@@ -1999,14 +2005,14 @@ create_netdev(device_t dev)
 	/* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
 	for (i = 0; i <= NET_TX_RING_SIZE; i++) {
 		np->tx_mbufs[i] = (void *) ((u_long) i+1);
-		np->grant_tx_ref[i] = GRANT_INVALID_REF;	
+		np->grant_tx_ref[i] = GRANT_REF_INVALID;	
 	}
 	np->tx_mbufs[NET_TX_RING_SIZE] = (void *)0;
 
 	for (i = 0; i <= NET_RX_RING_SIZE; i++) {
 
 		np->rx_mbufs[i] = NULL;
-		np->grant_rx_ref[i] = GRANT_INVALID_REF;
+		np->grant_rx_ref[i] = GRANT_REF_INVALID;
 	}
 	/* A grant for every tx ring slot */
 	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
@@ -2128,8 +2134,8 @@ netif_disconnect_backend(struct netfront_info *info)
 
 	end_access(info->tx_ring_ref, info->tx.sring);
 	end_access(info->rx_ring_ref, info->rx.sring);
-	info->tx_ring_ref = GRANT_INVALID_REF;
-	info->rx_ring_ref = GRANT_INVALID_REF;
+	info->tx_ring_ref = GRANT_REF_INVALID;
+	info->rx_ring_ref = GRANT_REF_INVALID;
 	info->tx.sring = NULL;
 	info->rx.sring = NULL;
 
@@ -2143,7 +2149,7 @@ netif_disconnect_backend(struct netfront_info *info)
 static void
 end_access(int ref, void *page)
 {
-	if (ref != GRANT_INVALID_REF)
+	if (ref != GRANT_REF_INVALID)
 		gnttab_end_foreign_access(ref, page);
 }
 
@@ -2171,7 +2177,7 @@ static device_method_t netfront_methods[] = {
 	DEVMETHOD(device_resume,        netfront_resume), 
  
 	/* Xenbus interface */
-	DEVMETHOD(xenbus_backend_changed, netfront_backend_changed),
+	DEVMETHOD(xenbus_otherend_changed, netfront_backend_changed),
 
 	{ 0, 0 } 
 }; 
@@ -2183,4 +2189,4 @@ static driver_t netfront_driver = {
 }; 
 devclass_t netfront_devclass; 
  
-DRIVER_MODULE(xe, xenbus, netfront_driver, netfront_devclass, 0, 0); 
+DRIVER_MODULE(xe, xenbusb_front, netfront_driver, netfront_devclass, 0, 0); 
diff --git a/sys/dev/xen/xenpci/evtchn.c b/sys/dev/xen/xenpci/evtchn.c
index bdf3ad15572..ea53a7e371c 100644
--- a/sys/dev/xen/xenpci/evtchn.c
+++ b/sys/dev/xen/xenpci/evtchn.c
@@ -181,6 +181,49 @@ bind_listening_port_to_irqhandler(unsigned int remote_domain,
 	return (0);
 }
 
+/* Bind a remote domain's event channel port to a new IRQ and its handler. */
+int
+bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+    unsigned int remote_port, const char *devname, driver_intr_t handler,
+    void *arg, unsigned long irqflags, unsigned int *irqp)
+{
+	struct evtchn_bind_interdomain bind_interdomain;
+	int irq;	/* signed: a negative alloc_xen_irq() result must reach the check below */
+	int error;
+
+	irq = alloc_xen_irq();
+	if (irq < 0)
+		return irq;
+
+	mtx_lock(&irq_evtchn[irq].lock);
+
+	bind_interdomain.remote_dom  = remote_domain;
+	bind_interdomain.remote_port = remote_port;
+	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+					    &bind_interdomain);
+	if (error) {
+		mtx_unlock(&irq_evtchn[irq].lock);
+		free_xen_irq(irq);
+		return (-error);	/* hypercall status, negated */
+	}
+
+	irq_evtchn[irq].handler = handler;
+	irq_evtchn[irq].arg     = arg;
+	irq_evtchn[irq].evtchn  = bind_interdomain.local_port;
+	irq_evtchn[irq].close   = 1;
+	irq_evtchn[irq].mpsafe  = (irqflags & INTR_MPSAFE) != 0;
+
+	evtchn_to_irq[bind_interdomain.local_port] = irq;
+
+	unmask_evtchn(bind_interdomain.local_port);
+
+	mtx_unlock(&irq_evtchn[irq].lock);
+
+	if (irqp)
+		*irqp = irq;	/* report the allocated IRQ when the caller asks */
+	return (0);
+}
+
 int
 bind_caller_port_to_irqhandler(unsigned int caller_port,
     const char *devname, driver_intr_t handler, void *arg,
diff --git a/sys/dev/xen/xenpci/xenpci.c b/sys/dev/xen/xenpci/xenpci.c
index 2f2a79fff21..f4c9f73d686 100644
--- a/sys/dev/xen/xenpci/xenpci.c
+++ b/sys/dev/xen/xenpci/xenpci.c
@@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$");
 char *hypercall_stubs;
 shared_info_t *HYPERVISOR_shared_info;
 static vm_paddr_t shared_info_pa;
+static device_t nexus;
 
 /*
  * This is used to find our platform device instance.
@@ -80,7 +81,7 @@ xenpci_cpuid_base(void)
 {
 	uint32_t base, regs[4];
 
-	for (base = 0x40000000; base < 0x40001000; base += 0x100) {
+	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
 		do_cpuid(base, regs);
 		if (!memcmp("XenVMMXenVMM", &regs[1], 12)
 		    && (regs[0] - base) >= 2)
@@ -204,14 +205,21 @@ xenpci_allocate_resources(device_t dev)
 
 	scp->res_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ,
 			&scp->rid_irq, RF_SHAREABLE|RF_ACTIVE);
-	if (scp->res_irq == NULL)
+	if (scp->res_irq == NULL) {
+		printf("xenpci Could not allocate irq.\n");
 		goto errexit;
+	}
 
 	scp->rid_memory = PCIR_BAR(1);
 	scp->res_memory = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
 			&scp->rid_memory, RF_ACTIVE);
-	if (scp->res_memory == NULL)
+	if (scp->res_memory == NULL) {
+		printf("xenpci Could not allocate memory bar.\n");
 		goto errexit;
+	}
+
+	scp->phys_next = rman_get_start(scp->res_memory);
+
 	return (0);
 
 errexit:
@@ -254,6 +262,36 @@ xenpci_alloc_space(size_t sz, vm_paddr_t *pa)
 	}
 }
 
+/* Bus resource methods: each one forwards the request to the nexus device. */
+static struct resource *
+xenpci_alloc_resource(device_t dev, device_t child, int type, int *rid,
+    u_long start, u_long end, u_long count, u_int flags)
+{
+	return (BUS_ALLOC_RESOURCE(nexus, child, type, rid, start,
+	    end, count, flags));
+}
+
+static int
+xenpci_release_resource(device_t dev, device_t child, int type, int rid,
+    struct resource *r)
+{
+	return (BUS_RELEASE_RESOURCE(nexus, child, type, rid, r));
+}
+
+static int
+xenpci_activate_resource(device_t dev, device_t child, int type, int rid,
+    struct resource *r)
+{
+	return (BUS_ACTIVATE_RESOURCE(nexus, child, type, rid, r));
+}
+
+static int
+xenpci_deactivate_resource(device_t dev, device_t child, int type,
+    int rid, struct resource *r)
+{
+	return (BUS_DEACTIVATE_RESOURCE(nexus, child, type, rid, r));
+}
+
 /*
  * Called very early in the resume sequence - reinitialise the various
  * bits of Xen machinery including the hypercall page and the shared
@@ -303,20 +341,36 @@ xenpci_probe(device_t dev)
 static int
 xenpci_attach(device_t dev)
 {
-        int error;
+	int error;
 	struct xenpci_softc *scp = device_get_softc(dev);
 	struct xen_add_to_physmap xatp;
 	vm_offset_t shared_va;
+	devclass_t dc;
+
+	/*
+	 * Find and record nexus0.  Since we are not really on the
+	 * PCI bus, all resource operations are directed to nexus
+	 * instead of through our parent.
+	 */
+	if ((dc = devclass_find("nexus"))  == 0
+	 || (nexus = devclass_get_device(dc, 0)) == 0) {
+		device_printf(dev, "unable to find nexus.\n");
+		return (ENOENT);
+	}
 
 	error = xenpci_allocate_resources(dev);
-	if (error)
+	if (error) {
+		device_printf(dev, "xenpci_allocate_resources failed(%d).\n",
+		    error);
 		goto errexit;
-
-	scp->phys_next = rman_get_start(scp->res_memory);
+	}
 
 	error = xenpci_init_hypercall_stubs(dev, scp);
-	if (error)
+	if (error) {
+		device_printf(dev, "xenpci_init_hypercall_stubs failed(%d).\n",
+		    error);
 		goto errexit;
+	}
 
 	setup_xen_features();
 
@@ -346,7 +400,7 @@ errexit:
 	 * Undo anything we may have done.
 	 */
 	xenpci_deallocate_resources(dev);
-        return (error);
+	return (error);
 }
 
 /*
@@ -364,8 +418,9 @@ xenpci_detach(device_t dev)
 	 */
 	if (scp->intr_cookie != NULL) {
 		if (BUS_TEARDOWN_INTR(parent, dev,
-			scp->res_irq, scp->intr_cookie) != 0)
-				printf("intr teardown failed.. continuing\n");
+		    scp->res_irq, scp->intr_cookie) != 0)
+			device_printf(dev,
+			    "intr teardown failed.. continuing\n");
 		scp->intr_cookie = NULL;
 	}
 
@@ -386,6 +441,10 @@ static device_method_t xenpci_methods[] = {
 
 	/* Bus interface */
 	DEVMETHOD(bus_add_child,	bus_generic_add_child),
+	DEVMETHOD(bus_alloc_resource,   xenpci_alloc_resource),
+	DEVMETHOD(bus_release_resource, xenpci_release_resource),
+	DEVMETHOD(bus_activate_resource, xenpci_activate_resource),
+	DEVMETHOD(bus_deactivate_resource, xenpci_deactivate_resource),
 
 	{ 0, 0 }
 };
diff --git a/sys/i386/xen/xen_machdep.c b/sys/i386/xen/xen_machdep.c
index 060fad5e72a..542f4df0931 100644
--- a/sys/i386/xen/xen_machdep.c
+++ b/sys/i386/xen/xen_machdep.c
@@ -722,7 +722,9 @@ char *bootmem_start, *bootmem_current, *bootmem_end;
 pteinfo_t *pteinfo_list;
 void initvalues(start_info_t *startinfo);
 
-struct ringbuf_head *xen_store; /* XXX move me */
+struct xenstore_domain_interface;
+extern struct xenstore_domain_interface *xen_store;
+
 char *console_page;
 
 void *
@@ -1082,7 +1084,7 @@ initvalues(start_info_t *startinfo)
 	HYPERVISOR_shared_info = (shared_info_t *)cur_space;
 	cur_space += PAGE_SIZE;
 
-	xen_store = (struct ringbuf_head *)cur_space;
+	xen_store = (struct xenstore_domain_interface *)cur_space;
 	cur_space += PAGE_SIZE;
 
 	console_page = (char *)cur_space;
diff --git a/sys/xen/blkif.h b/sys/xen/blkif.h
new file mode 100644
index 00000000000..48b71ea7f54
--- /dev/null
+++ b/sys/xen/blkif.h
@@ -0,0 +1,145 @@
+/* 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __XEN_BLKIF_H__
+#define __XEN_BLKIF_H__
+
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+/* Not a real protocol.  Used to generate ring structs which contain
+ * the elements common to all protocols only.  This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places.  */
+struct blkif_common_request {
+	char dummy;
+};
+struct blkif_common_response {
+	char dummy;
+};
+
+/* i386 protocol version */
+#pragma pack(push, 4)
+struct blkif_x86_32_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
+};
+struct blkif_x86_32_response {
+	uint64_t        id;              /* copied from request */
+	uint8_t         operation;       /* copied from request */
+	int16_t         status;          /* BLKIF_RSP_???       */
+};
+typedef struct blkif_x86_32_request blkif_x86_32_request_t;
+typedef struct blkif_x86_32_response blkif_x86_32_response_t;
+#pragma pack(pop)
+
+/* x86_64 protocol version */
+struct blkif_x86_64_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint64_t       __attribute__((__aligned__(8))) id;
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
+};
+struct blkif_x86_64_response {
+	uint64_t       __attribute__((__aligned__(8))) id;
+	uint8_t         operation;       /* copied from request */
+	int16_t         status;          /* BLKIF_RSP_???       */
+};
+typedef struct blkif_x86_64_request blkif_x86_64_request_t;
+typedef struct blkif_x86_64_response blkif_x86_64_response_t;
+
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
+
+/*
+ * Maximum number of requests that can be active for a given instance
+ * regardless of the protocol in use, based on the ring size.  This constant
+ * facilitates resource pre-allocation in backend drivers since the size is
+ * known well in advance of attaching to a front end.
+ */
+#define BLKIF_MAX_RING_REQUESTS(_sz) \
+	MAX(__RING_SIZE((blkif_x86_64_sring_t *)NULL, _sz),	\
+	    MAX(__RING_SIZE((blkif_x86_32_sring_t *)NULL, _sz),	\
+		__RING_SIZE((blkif_sring_t *)NULL, _sz)))
+
+/*
+ * The number of ring pages required to support a given number of requests
+ * for a given instance regardless of the protocol in use.
+ */
+#define BLKIF_RING_PAGES(_entries) \
+	MAX(__RING_PAGES((blkif_x86_64_sring_t *)NULL, _entries),	\
+	    MAX(__RING_PAGES((blkif_x86_32_sring_t *)NULL, _entries),	\
+		__RING_PAGES((blkif_sring_t *)NULL, _entries)))
+
+union blkif_back_rings {
+	blkif_back_ring_t        native;
+	blkif_common_back_ring_t common;
+	blkif_x86_32_back_ring_t x86_32;
+	blkif_x86_64_back_ring_t x86_64;
+};
+typedef union blkif_back_rings blkif_back_rings_t;
+
+enum blkif_protocol {
+	BLKIF_PROTOCOL_NATIVE = 1,
+	BLKIF_PROTOCOL_X86_32 = 2,
+	BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
+{	/* Copy the x86_32-layout request *src into *dst, field by field. */
+	int i, n = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;	/* n: most segments to copy */
+	dst->operation = src->operation;
+	dst->nr_segments = src->nr_segments;
+	dst->handle = src->handle;
+	dst->id = src->id;
+	dst->sector_number = src->sector_number;
+	barrier();	/* NOTE(review): presumably keeps src from being re-read below; confirm */
+	if (n > dst->nr_segments)	/* clamp using dst's copy of the count */
+		n = dst->nr_segments;
+	for (i = 0; i < n; i++)
+		dst->seg[i] = src->seg[i];
+}
+
+static void inline blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
+{	/* Copy the x86_64-layout request *src into *dst, field by field. */
+	int i, n = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;	/* n: most segments to copy */
+	dst->operation = src->operation;
+	dst->nr_segments = src->nr_segments;
+	dst->handle = src->handle;
+	dst->id = src->id;
+	dst->sector_number = src->sector_number;
+	barrier();	/* NOTE(review): presumably keeps src from being re-read below; confirm */
+	if (n > dst->nr_segments)	/* clamp using dst's copy of the count */
+		n = dst->nr_segments;
+	for (i = 0; i < n; i++)
+		dst->seg[i] = src->seg[i];
+}
+
+#endif /* __XEN_BLKIF_H__ */
diff --git a/sys/xen/evtchn/evtchn.c b/sys/xen/evtchn/evtchn.c
index f280d127031..3832277f0b9 100644
--- a/sys/xen/evtchn/evtchn.c
+++ b/sys/xen/evtchn/evtchn.c
@@ -492,15 +492,15 @@ bind_listening_port_to_irqhandler(unsigned int remote_domain,
 int 
 bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
     unsigned int remote_port, const char *devname,
-    driver_filter_t filter, driver_intr_t handler,
-    unsigned long irqflags, unsigned int *irqp)
+    driver_intr_t handler, void *arg, unsigned long irqflags,
+    unsigned int *irqp)
 {
 	unsigned int irq;
 	int error;
 
 	irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
 	intr_register_source(&xp->xp_pins[irq].xp_intsrc);
-	error = intr_add_handler(devname, irq, filter, handler, NULL,
+	error = intr_add_handler(devname, irq, NULL, handler, arg,
 	    irqflags, &xp->xp_pins[irq].xp_cookie);
 	if (error) {
 		unbind_from_irq(irq);
diff --git a/sys/xen/gnttab.c b/sys/xen/gnttab.c
index ae44e8f6a8f..4ece182c31a 100644
--- a/sys/xen/gnttab.c
+++ b/sys/xen/gnttab.c
@@ -42,7 +42,6 @@ __FBSDID("$FreeBSD$");
 
 /* External tools reserve first few grant table entries. */
 #define NR_RESERVED_ENTRIES 8
-#define GNTTAB_LIST_END 0xffffffff
 #define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(grant_entry_t))
 
 static grant_ref_t **gnttab_list;
@@ -66,7 +65,7 @@ get_free_entries(int count, int *entries)
 {
 	int ref, error;
 	grant_ref_t head;
-	
+
 	mtx_lock(&gnttab_list_lock);
 	if ((gnttab_free_count < count) &&
 	    ((error = gnttab_expand(count - gnttab_free_count)) != 0)) {
@@ -79,7 +78,7 @@ get_free_entries(int count, int *entries)
 		head = gnttab_entry(head);
 	gnttab_free_head = gnttab_entry(head);
 	gnttab_entry(head) = GNTTAB_LIST_END;
-	mtx_unlock(&gnttab_list_lock);	
+	mtx_unlock(&gnttab_list_lock);
 
 	*entries = ref;
 	return (0);
@@ -122,7 +121,7 @@ put_free_entry(grant_ref_t ref)
 	gnttab_free_head = ref;
 	gnttab_free_count++;
 	check_free_callbacks();
-	mtx_unlock(&gnttab_list_lock);	
+	mtx_unlock(&gnttab_list_lock);
 }
 
 /*
@@ -136,7 +135,7 @@ gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly,
 	int error, ref;
 
 	error = get_free_entries(1, &ref);
-	
+
 	if (unlikely(error))
 		return (error);
 
@@ -166,9 +165,9 @@ int
 gnttab_query_foreign_access(grant_ref_t ref)
 {
 	uint16_t nflags;
-	
+
 	nflags = shared[ref].flags;
-	
+
 	return (nflags & (GTF_reading|GTF_writing));
 }
 
@@ -180,7 +179,7 @@ gnttab_end_foreign_access_ref(grant_ref_t ref)
 	nflags = shared[ref].flags;
 	do {
 		if ( (flags = nflags) & (GTF_reading|GTF_writing) ) {
-			printf("WARNING: g.e. still in use!\n");
+			printf("%s: WARNING: g.e. still in use!\n", __func__);
 			return (0);
 		}
 	} while ((nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) !=
@@ -201,7 +200,44 @@ gnttab_end_foreign_access(grant_ref_t ref, void *page)
 	else {
 		/* XXX This needs to be fixed so that the ref and page are
 		   placed on a list to be freed up later. */
-		printf("WARNING: leaking g.e. and page still in use!\n");
+		printf("%s: WARNING: leaking g.e. and page still in use!\n",
+		       __func__);
+	}
+}
+
+/* End access for each of count refs; released refs rejoin the free list. */
+void
+gnttab_end_foreign_access_references(u_int count, grant_ref_t *refs)
+{
+	grant_ref_t *last_ref;
+	grant_ref_t  head, tail;	/* ends of the chain of released refs */
+
+	head = tail = GNTTAB_LIST_END;
+	last_ref = refs + count;
+	while (refs != last_ref) {
+		if (gnttab_end_foreign_access_ref(*refs)) {
+			if (head == GNTTAB_LIST_END)
+				tail = *refs;	/* first ref chained ends the chain */
+			gnttab_entry(*refs) = head;
+			head = *refs;
+		} else {
+			/*
+			 * XXX This needs to be fixed so that the ref
+			 * is placed on a list to be freed up later.
+			 */
+			printf("%s: WARNING: leaking g.e. still in use!\n",
+			       __func__);
+			count--;	/* count now tracks chained refs only */
+		}
+		refs++;
+	}
+
+	if (count != 0) {	/* splice the chain onto the global free list */
+		mtx_lock(&gnttab_list_lock);
+		gnttab_free_count += count;
+		gnttab_entry(tail) = gnttab_free_head;
+		gnttab_free_head = head;
+		mtx_unlock(&gnttab_list_lock);
 	}
 }
 
@@ -216,7 +252,7 @@ gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn,
 		return (error);
 
 	gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
-	
+
 	*result = ref;
 	return (0);
 }
@@ -282,16 +318,16 @@ gnttab_free_grant_references(grant_ref_t head)
 {
 	grant_ref_t ref;
 	int count = 1;
-	
+
 	if (head == GNTTAB_LIST_END)
 		return;
-	
-	mtx_lock(&gnttab_list_lock);
+
 	ref = head;
 	while (gnttab_entry(ref) != GNTTAB_LIST_END) {
 		ref = gnttab_entry(ref);
 		count++;
 	}
+	mtx_lock(&gnttab_list_lock);
 	gnttab_entry(ref) = gnttab_free_head;
 	gnttab_free_head = head;
 	gnttab_free_count += count;
@@ -403,7 +439,7 @@ grow_gnttab_list(unsigned int more_frames)
 	check_free_callbacks();
 
 	return (0);
-	
+
 grow_nomem:
 	for ( ; i >= nr_grant_frames; i--)
 		free(gnttab_list[i], M_DEVBUF);
@@ -490,7 +526,7 @@ gnttab_map(unsigned int start_idx, unsigned int end_idx)
 
 	if (shared == NULL) {
 		vm_offset_t area;
-		
+
 		area = kmem_alloc_nofault(kernel_map,
 		    PAGE_SIZE * max_nr_grant_frames());
 		KASSERT(area, ("can't allocate VM space for grant table"));
@@ -502,7 +538,7 @@ gnttab_map(unsigned int start_idx, unsigned int end_idx)
 		    ((vm_paddr_t)frames[i]) << PAGE_SHIFT | PG_RW | PG_V);
 
 	free(frames, M_DEVBUF);
-	
+
 	return (0);
 }
 
@@ -517,7 +553,7 @@ gnttab_resume(void)
 
 int
 gnttab_suspend(void)
-{	
+{
 	int i;
 
 	for (i = 0; i < nr_grant_frames; i++)
@@ -532,7 +568,8 @@ gnttab_suspend(void)
 
 static vm_paddr_t resume_frames;
 
-static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+static int
+gnttab_map(unsigned int start_idx, unsigned int end_idx)
 {
 	struct xen_add_to_physmap xatp;
 	unsigned int i = end_idx;
@@ -552,7 +589,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 
 	if (shared == NULL) {
 		vm_offset_t area;
-		
+
 		area = kmem_alloc_nofault(kernel_map,
 		    PAGE_SIZE * max_nr_grant_frames());
 		KASSERT(area, ("can't allocate VM space for grant table"));
@@ -643,10 +680,10 @@ gnttab_init()
 		if (gnttab_list[i] == NULL)
 			goto ini_nomem;
 	}
-	
+
 	if (gnttab_resume())
 		return (ENODEV);
-	
+
 	nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME;
 
 	for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
@@ -670,4 +707,3 @@ ini_nomem:
 }
 
 MTX_SYSINIT(gnttab, &gnttab_list_lock, "GNTTAB LOCK", MTX_DEF); 
-//SYSINIT(gnttab, SI_SUB_PSEUDO, SI_ORDER_FIRST, gnttab_init, NULL);
diff --git a/sys/xen/gnttab.h b/sys/xen/gnttab.h
index 8348af5351f..1741ec33872 100644
--- a/sys/xen/gnttab.h
+++ b/sys/xen/gnttab.h
@@ -43,6 +43,8 @@
 #include <machine/xen/xen-os.h>
 #include <xen/features.h>
 
+#define GNTTAB_LIST_END GRANT_REF_INVALID
+
 struct gnttab_free_callback {
 	struct gnttab_free_callback *next;
 	void (*fn)(void *);
@@ -74,6 +76,13 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref);
  */
 void gnttab_end_foreign_access(grant_ref_t ref, void *page);
 
+/*
+ * Eventually end access through the given array of grant references.
+ * Access will be ended immediately iff the grant entry is not in use,
+ * otherwise it will happen some time later
+ */
+void gnttab_end_foreign_access_references(u_int count, grant_ref_t *refs);
+
 int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn, grant_ref_t *result);
 
 unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
diff --git a/sys/xen/interface/grant_table.h b/sys/xen/interface/grant_table.h
index 26f2c35b181..e76ca67ec5f 100644
--- a/sys/xen/interface/grant_table.h
+++ b/sys/xen/interface/grant_table.h
@@ -159,6 +159,8 @@ typedef struct grant_entry grant_entry_t;
  */
 typedef uint32_t grant_ref_t;
 
+#define	GRANT_REF_INVALID	0xffffffff
+
 /*
  * Handle to track a mapping created via a grant reference.
  */
diff --git a/sys/xen/interface/hvm/params.h b/sys/xen/interface/hvm/params.h
index 6befa78df8a..d846731e111 100644
--- a/sys/xen/interface/hvm/params.h
+++ b/sys/xen/interface/hvm/params.h
@@ -95,4 +95,30 @@
 
 #define HVM_NR_PARAMS          15
 
+#ifdef XENHVM
+/**
+ * Retrieve an HVM setting from the hypervisor.
+ *
+ * \param index  The index of the HVM parameter to retrieve.
+ *
+ * \return  On error, 0.  Otherwise the value of the requested parameter.
+ */
+static inline unsigned long
+hvm_get_parameter(int index)
+{
+	struct xen_hvm_param xhv;
+	int error;
+
+	xhv.domid = DOMID_SELF;
+	xhv.index = index;
+	error = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);	/* expected to fill xhv.value */
+	if (error) {
+		printf("hvm_get_parameter: failed to get %d, error %d\n",
+		    index, error);
+		return (0);	/* NB: same result as a parameter whose value is 0 */
+	}
+	return (xhv.value);
+}
+#endif
+
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
diff --git a/sys/xen/interface/io/blkif.h b/sys/xen/interface/io/blkif.h
index 9e2d3d068b6..020936b1f53 100644
--- a/sys/xen/interface/io/blkif.h
+++ b/sys/xen/interface/io/blkif.h
@@ -78,11 +78,19 @@
 #define BLKIF_OP_FLUSH_DISKCACHE   3
 
 /*
- * Maximum scatter/gather segments per request.
- * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
- * NB. This could be 12 if the ring indexes weren't stored in the same page.
+ * Maximum scatter/gather segments associated with a request header block.
  */
-#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
+#define BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK  11
+
+/*
+ * Maximum scatter/gather segments associated with a segment block.
+ */
+#define BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK 14
+
+/*
+ * Maximum scatter/gather segments per request (header + segment blocks).
+ */
+#define BLKIF_MAX_SEGMENTS_PER_REQUEST 255
 
 struct blkif_request_segment {
     grant_ref_t gref;        /* reference to I/O buffer frame        */
@@ -90,6 +98,7 @@ struct blkif_request_segment {
     /* @last_sect: last sector in frame to transfer (inclusive).     */
     uint8_t     first_sect, last_sect;
 };
+typedef struct blkif_request_segment blkif_request_segment_t;
 
 struct blkif_request {
     uint8_t        operation;    /* BLKIF_OP_???                         */
@@ -97,7 +106,7 @@ struct blkif_request {
     blkif_vdev_t   handle;       /* only for read/write requests         */
     uint64_t       id;           /* private guest value, echoed in resp  */
     blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-    struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK];
 };
 typedef struct blkif_request blkif_request_t;
 
@@ -124,10 +133,22 @@ typedef struct blkif_response blkif_response_t;
 
 DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
 
+#define BLKRING_GET_SG_REQUEST(_r, _idx)				\
+    ((struct blkif_request_segment *)RING_GET_REQUEST(_r, _idx))
+
 #define VDISK_CDROM        0x1
 #define VDISK_REMOVABLE    0x2
 #define VDISK_READONLY     0x4
 
+/*
+ * The number of ring request blocks (one header block plus any segment
+ * blocks) required to handle an I/O request containing _segs segments.
+ */
+#define BLKIF_SEGS_TO_BLOCKS(_segs)					\
+	(((((_segs) - BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK)		\
+	 + (BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK - 1))			\
+        / BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK) + /*header_block*/1)
+
 #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
 
 /*
diff --git a/sys/xen/interface/io/protocols.h b/sys/xen/interface/io/protocols.h
index 77bd1bdd288..fd52934e276 100644
--- a/sys/xen/interface/io/protocols.h
+++ b/sys/xen/interface/io/protocols.h
@@ -26,6 +26,7 @@
 #define XEN_IO_PROTO_ABI_X86_32     "x86_32-abi"
 #define XEN_IO_PROTO_ABI_X86_64     "x86_64-abi"
 #define XEN_IO_PROTO_ABI_IA64       "ia64-abi"
+#define XEN_IO_PROTO_ABI_POWERPC64  "powerpc64-abi"
 
 #if defined(__i386__)
 # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32
@@ -33,6 +34,8 @@
 # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_64
 #elif defined(__ia64__)
 # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_IA64
+#elif defined(__powerpc64__)
+# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_POWERPC64
 #else
 # error arch fixup needed here
 #endif
diff --git a/sys/xen/interface/io/ring.h b/sys/xen/interface/io/ring.h
index 6ce1d0d485f..6b7fd74e8bb 100644
--- a/sys/xen/interface/io/ring.h
+++ b/sys/xen/interface/io/ring.h
@@ -44,6 +44,12 @@ typedef unsigned int RING_IDX;
 #define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    : __RD8(_x))
 #define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
 
+/*
+ * The amount of space reserved in the shared ring for accounting information.
+ */
+#define __RING_HEADER_SIZE(_s) \
+    ((intptr_t)(_s)->ring - (intptr_t)(_s))
+
 /*
  * Calculate size of a shared ring, given the total available space for the
  * ring and indexes (_sz), and the name tag of the request/response structure.
@@ -51,7 +57,17 @@ typedef unsigned int RING_IDX;
  * power of two (so we can mask with (size-1) to loop around).
  */
 #define __RING_SIZE(_s, _sz) \
-    (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
+    (__RD32(((_sz) - __RING_HEADER_SIZE(_s)) / sizeof((_s)->ring[0])))
+
+/*
+ * The number of pages needed to support a given number of request/response
+ * entries.  The entry count is rounded down to the nearest power of two
+ * as required by the ring macros.
+ */
+#define __RING_PAGES(_s, _entries)              \
+    ((__RING_HEADER_SIZE(_s)                    \
+   + (__RD32(_entries) * sizeof((_s)->ring[0])) \
+   + PAGE_SIZE - 1) / PAGE_SIZE)
 
 /*
  * Macros to make the correct C datatypes for a new kind of ring.
diff --git a/sys/xen/interface/io/xenbus.h b/sys/xen/interface/io/xenbus.h
index 4a053df2a83..5e24f31ccf8 100644
--- a/sys/xen/interface/io/xenbus.h
+++ b/sys/xen/interface/io/xenbus.h
@@ -36,6 +36,9 @@
 enum xenbus_state {
     XenbusStateUnknown       = 0,
 
+    /*
+     * Initializing: Back-end is initializing.
+     */
     XenbusStateInitialising  = 1,
 
     /*
@@ -49,6 +52,9 @@ enum xenbus_state {
      */
     XenbusStateInitialised   = 3,
 
+    /*
+     * Connected: The normal state for a front to backend connection.
+     */
     XenbusStateConnected     = 4,
 
     /*
@@ -56,6 +62,9 @@ enum xenbus_state {
      */
     XenbusStateClosing       = 5,
 
+    /*
+     * Closed: No connection exists between front and back end.
+     */
     XenbusStateClosed        = 6,
 
     /*
diff --git a/sys/xen/reboot.c b/sys/xen/reboot.c
deleted file mode 100644
index 04ba1326c8b..00000000000
--- a/sys/xen/reboot.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- *
- * Copyright (c) 2004 Christian Limpach.
- * Copyright (c) 2004-2006,2008 Kip Macy
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *      This product includes software developed by Christian Limpach.
- * 4. The name of the author may not be used to endorse or promote products
- *    derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/bus.h>
-#include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
-#include <sys/reboot.h>
-#include <sys/sched.h>
-#include <sys/smp.h>
-#include <sys/systm.h>
-
-#include <machine/xen/xen-os.h>
-#include <xen/hypervisor.h>
-#include <xen/gnttab.h>
-#include <xen/xen_intr.h>
-#include <xen/xenbus/xenbusvar.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-
-#ifdef XENHVM
-
-#include <dev/xen/xenpci/xenpcivar.h>
-
-#else
-
-static void xen_suspend(void);
-
-#endif
-
-static void 
-shutdown_handler(struct xenbus_watch *watch,
-		 const char **vec, unsigned int len)
-{
-	char *str;
-	struct xenbus_transaction xbt;
-	int error, howto;
-	
-	howto = 0;
-
- again:
-	error = xenbus_transaction_start(&xbt);
-	if (error)
-		return;
-
-	error = xenbus_read(xbt, "control", "shutdown", NULL, (void **) &str);
-
-	/* Ignore read errors and empty reads. */
-	if (error || strlen(str) == 0) {
-		xenbus_transaction_end(xbt, 1);
-		return;
-	}
-
-	xenbus_write(xbt, "control", "shutdown", "");
-
-	error = xenbus_transaction_end(xbt, 0);
-	if (error == EAGAIN) {
-		free(str, M_DEVBUF);
-		goto again;
-	}
-
-	if (strcmp(str, "reboot") == 0)
-		howto = 0;
-	else if (strcmp(str, "poweroff") == 0)
-		howto |= (RB_POWEROFF | RB_HALT);
-	else if (strcmp(str, "halt") == 0)
-#ifdef XENHVM
-		/*
-		 * We rely on acpi powerdown to halt the VM.
-		 */
-		howto |= (RB_POWEROFF | RB_HALT);
-#else
-		howto |= RB_HALT;
-#endif
-	else if (strcmp(str, "suspend") == 0)
-		howto = -1;
-	else {
-		printf("Ignoring shutdown request: %s\n", str);
-		goto done;
-	}
-
-	if (howto == -1) {
-		xen_suspend();
-		goto done;
-	}
-
-	shutdown_nice(howto);
- done:
-	free(str, M_DEVBUF);
-}
-
-#ifndef XENHVM
-
-/*
- * In HV mode, we let acpi take care of halts and reboots.
- */
-
-static void
-xen_shutdown_final(void *arg, int howto)
-{
-
-	if (howto & (RB_HALT | RB_POWEROFF))
-		HYPERVISOR_shutdown(SHUTDOWN_poweroff);
-	else
-		HYPERVISOR_shutdown(SHUTDOWN_reboot);
-}
-
-#endif
-
-static struct xenbus_watch shutdown_watch = {
-	.node = "control/shutdown",
-	.callback = shutdown_handler
-};
-
-static void
-setup_shutdown_watcher(void *unused)
-{
-
-	if (register_xenbus_watch(&shutdown_watch))
-		printf("Failed to set shutdown watcher\n");
-#ifndef XENHVM
-	EVENTHANDLER_REGISTER(shutdown_final, xen_shutdown_final, NULL,
-	    SHUTDOWN_PRI_LAST);
-#endif
-}
-
-SYSINIT(shutdown, SI_SUB_PSEUDO, SI_ORDER_ANY, setup_shutdown_watcher, NULL);
-
-#ifndef XENHVM
-
-extern void xencons_suspend(void);
-extern void xencons_resume(void);
-
-static void 
-xen_suspend()
-{
-	int i, j, k, fpp;
-	unsigned long max_pfn, start_info_mfn;
-
-#ifdef SMP
-	cpumask_t map;
-	/*
-	 * Bind us to CPU 0 and stop any other VCPUs.
-	 */
-	thread_lock(curthread);
-	sched_bind(curthread, 0);
-	thread_unlock(curthread);
-	KASSERT(PCPU_GET(cpuid) == 0, ("xen_suspend: not running on cpu 0"));
-
-	map = PCPU_GET(other_cpus) & ~stopped_cpus;
-	if (map)
-		stop_cpus(map);
-#endif
-
-	if (DEVICE_SUSPEND(root_bus) != 0) {
-		printf("xen_suspend: device_suspend failed\n");
-#ifdef SMP
-		if (map)
-			restart_cpus(map);
-#endif
-		return;
-	}
-
-	local_irq_disable();
-
-	xencons_suspend();
-	gnttab_suspend();
-
-	max_pfn = HYPERVISOR_shared_info->arch.max_pfn;
-
-	void *shared_info = HYPERVISOR_shared_info;
-	HYPERVISOR_shared_info = NULL;
-	pmap_kremove((vm_offset_t) shared_info);
-	PT_UPDATES_FLUSH();
-
-	xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn);
-	xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn);
-
-	/*
-	 * We'll stop somewhere inside this hypercall. When it returns,
-	 * we'll start resuming after the restore.
-	 */
-	start_info_mfn = VTOMFN(xen_start_info);
-	pmap_suspend();
-	HYPERVISOR_suspend(start_info_mfn);
-	pmap_resume();
-
-	pmap_kenter_ma((vm_offset_t) shared_info, xen_start_info->shared_info);
-	HYPERVISOR_shared_info = shared_info;
-
-	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-		VTOMFN(xen_pfn_to_mfn_frame_list_list);
-  
-	fpp = PAGE_SIZE/sizeof(unsigned long);
-	for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
-		if ((j % fpp) == 0) {
-			k++;
-			xen_pfn_to_mfn_frame_list_list[k] = 
-				VTOMFN(xen_pfn_to_mfn_frame_list[k]);
-			j = 0;
-		}
-		xen_pfn_to_mfn_frame_list[k][j] = 
-			VTOMFN(&xen_phys_machine[i]);
-	}
-	HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
-
-	gnttab_resume();
-	irq_resume();
-	local_irq_enable();
-	xencons_resume();
-
-#ifdef CONFIG_SMP
-	for_each_cpu(i)
-		vcpu_prepare(i);
-
-#endif
-	/* 
-	 * Only resume xenbus /after/ we've prepared our VCPUs; otherwise
-	 * the VCPU hotplug callback can race with our vcpu_prepare
-	 */
-	DEVICE_RESUME(root_bus);
-
-#ifdef SMP
-	thread_lock(curthread);
-	sched_unbind(curthread);
-	thread_unlock(curthread);
-	if (map)
-		restart_cpus(map);
-#endif
-}
-
-#endif
diff --git a/sys/xen/xen_intr.h b/sys/xen/xen_intr.h
index 68f594333fd..2e753e65ecb 100644
--- a/sys/xen/xen_intr.h
+++ b/sys/xen/xen_intr.h
@@ -76,7 +76,7 @@ extern int bind_ipi_to_irqhandler(unsigned int ipi, unsigned int cpu,
  */
 extern int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
 	unsigned int remote_port, const char *devname,
-	driver_filter_t filter, driver_intr_t handler,
+	driver_intr_t handler, void *arg,
 	unsigned long irqflags, unsigned int *irqp);
 
 /*
diff --git a/sys/xen/xenbus/init.txt b/sys/xen/xenbus/init.txt
deleted file mode 100644
index 42495494bd0..00000000000
--- a/sys/xen/xenbus/init.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-
-
-- frontend driver initializes static xenbus_driver with _ids, _probe, _remove, 
-_resume, _otherend_changed
-
-    - initialization calls xenbus_register_frontend(xenbus_driver)
-
-        - xenbus_register_frontend sets read_otherend details to read_backend_details
-	then calls xenbus_register_driver_common(xenbus_driver, xenbus_frontend)
-
-	     - xenbus_register_driver_common sets underlying driver name to xenbus_driver name
-	     underlying driver bus to xenbus_frontend's bus, driver's probe to xenbus_dev_probe
-	     driver's remove to xenbus_dev_remove then calls driver_register
-
diff --git a/sys/xen/xenbus/xenbus_client.c b/sys/xen/xenbus/xenbus.c
similarity index 65%
rename from sys/xen/xenbus/xenbus_client.c
rename to sys/xen/xenbus/xenbus.c
index 740d66409e1..c3e5fee32bc 100644
--- a/sys/xen/xenbus/xenbus_client.c
+++ b/sys/xen/xenbus/xenbus.c
@@ -1,8 +1,4 @@
 /******************************************************************************
- * Client-facing interface for the Xenbus driver.  In other words, the
- * interface between the Xenbus and the device-specific code, be it the
- * frontend or the backend of that driver.
- *
  * Copyright (C) 2005 XenSource Ltd
  * 
  * This file may be distributed separately from the Linux kernel, or
@@ -27,6 +23,14 @@
  * IN THE SOFTWARE.
  */
 
+/**
+ * \file xenbus.c
+ *
+ * \brief Client-facing interface for the Xenbus driver.
+ *
+ * In other words, the interface between the Xenbus and the device-specific
+ * code, be it the frontend or the backend of that driver.
+ */
 
 #if 0
 #define DPRINTK(fmt, args...) \
@@ -39,9 +43,12 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/libkern.h>
+#include <sys/sbuf.h>
 
 #include <machine/xen/xen-os.h>
 #include <xen/hypervisor.h>
@@ -50,6 +57,34 @@ __FBSDID("$FreeBSD$");
 #include <xen/xenbus/xenbusvar.h>
 #include <machine/stdarg.h>
 
+MALLOC_DEFINE(M_XENBUS, "xenbus", "XenBus Support");
+
+/*------------------------- Private Functions --------------------------------*/
+/**
+ * \brief Construct the error path corresponding to the given XenBus
+ *        device.
+ *
+ * \param dev  The XenBus device for which we are constructing an error path.
+ *
+ * \return  On success, the constructed error path.  Otherwise NULL.
+ *
+ * It is the caller's responsibility to free any returned error path
+ * node using the M_XENBUS malloc type.
+ */
+static char *
+error_path(device_t dev)
+{
+	char *path_buffer = malloc(strlen("error/")
+	    + strlen(xenbus_get_node(dev)) + 1,M_XENBUS, M_WAITOK);
+
+	strcpy(path_buffer, "error/");
+	strcpy(path_buffer + strlen("error/"), xenbus_get_node(dev));
+
+	return (path_buffer);
+}
+
+/*--------------------------- Public Functions -------------------------------*/
+/*-------- API comments for these methods can be found in xenbusvar.h --------*/
 const char *
 xenbus_strstate(XenbusState state)
 {
@@ -67,15 +102,15 @@ xenbus_strstate(XenbusState state)
 }
 
 int 
-xenbus_watch_path(device_t dev, char *path, struct xenbus_watch *watch, 
-    void (*callback)(struct xenbus_watch *, const char **, unsigned int))
+xenbus_watch_path(device_t dev, char *path, struct xs_watch *watch, 
+    xs_watch_cb_t *callback)
 {
 	int error;
 
 	watch->node = path;
 	watch->callback = callback;
 
-	error = register_xenbus_watch(watch);
+	error = xs_register_watch(watch);
 
 	if (error) {
 		watch->node = NULL;
@@ -88,12 +123,12 @@ xenbus_watch_path(device_t dev, char *path, struct xenbus_watch *watch,
 
 int
 xenbus_watch_path2(device_t dev, const char *path,
-    const char *path2, struct xenbus_watch *watch, 
-    void (*callback)(struct xenbus_watch *, const char **, unsigned int))
+    const char *path2, struct xs_watch *watch, 
+    xs_watch_cb_t *callback)
 {
 	int error;
 	char *state = malloc(strlen(path) + 1 + strlen(path2) + 1,
-	    M_DEVBUF, M_WAITOK);
+	   M_XENBUS, M_WAITOK);
 
 	strcpy(state, path);
 	strcat(state, "/");
@@ -101,46 +136,27 @@ xenbus_watch_path2(device_t dev, const char *path,
 
 	error = xenbus_watch_path(dev, state, watch, callback);
 	if (error) {
-		free(state, M_DEVBUF);
+		free(state,M_XENBUS);
 	}
 
 	return (error);
 }
 
-/**
- * Return the path to the error node for the given device, or NULL on failure.
- * If the value returned is non-NULL, then it is the caller's to kfree.
- */
-static char *
-error_path(device_t dev)
-{
-	char *path_buffer = malloc(strlen("error/")
-	    + strlen(xenbus_get_node(dev)) + 1, M_DEVBUF, M_WAITOK);
-
-	strcpy(path_buffer, "error/");
-	strcpy(path_buffer + strlen("error/"), xenbus_get_node(dev));
-
-	return (path_buffer);
-}
-
-
-static void
-_dev_error(device_t dev, int err, const char *fmt, va_list ap)
+void
+xenbus_dev_verror(device_t dev, int err, const char *fmt, va_list ap)
 {
 	int ret;
 	unsigned int len;
 	char *printf_buffer = NULL, *path_buffer = NULL;
 
 #define PRINTF_BUFFER_SIZE 4096
-	printf_buffer = malloc(PRINTF_BUFFER_SIZE, M_DEVBUF, M_WAITOK);
+	printf_buffer = malloc(PRINTF_BUFFER_SIZE,M_XENBUS, M_WAITOK);
 
 	len = sprintf(printf_buffer, "%i ", err);
 	ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
 
 	KASSERT(len + ret <= PRINTF_BUFFER_SIZE-1, ("xenbus error message too big"));
-#if 0	
-	dev_err(&dev->dev, "%s\n", printf_buffer);
-#endif		
+	device_printf(dev, "Error %s\n", printf_buffer);
 	path_buffer = error_path(dev);
 
 	if (path_buffer == NULL) {
@@ -149,7 +165,7 @@ _dev_error(device_t dev, int err, const char *fmt, va_list ap)
 		goto fail;
 	}
 
-	if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
+	if (xs_write(XST_NIL, path_buffer, "error", printf_buffer) != 0) {
 		printf("xenbus: failed to write error node for %s (%s)\n",
 		       xenbus_get_node(dev), printf_buffer);
 		goto fail;
@@ -157,9 +173,9 @@ _dev_error(device_t dev, int err, const char *fmt, va_list ap)
 
  fail:
 	if (printf_buffer)
-		free(printf_buffer, M_DEVBUF);
+		free(printf_buffer,M_XENBUS);
 	if (path_buffer)
-		free(path_buffer, M_DEVBUF);
+		free(path_buffer,M_XENBUS);
 }
 
 void
@@ -168,41 +184,45 @@ xenbus_dev_error(device_t dev, int err, const char *fmt, ...)
 	va_list ap;
 
 	va_start(ap, fmt);
-	_dev_error(dev, err, fmt, ap);
+	xenbus_dev_verror(dev, err, fmt, ap);
 	va_end(ap);
 }
 
+void
+xenbus_dev_vfatal(device_t dev, int err, const char *fmt, va_list ap)
+{
+	xenbus_dev_verror(dev, err, fmt, ap);
+	device_printf(dev, "Fatal error. Transitioning to Closing State\n");
+	xenbus_set_state(dev, XenbusStateClosing);
+}
+
 void
 xenbus_dev_fatal(device_t dev, int err, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
-	_dev_error(dev, err, fmt, ap);
+	xenbus_dev_vfatal(dev, err, fmt, ap);
 	va_end(ap);
-	
-	xenbus_set_state(dev, XenbusStateClosing);
 }
 
 int
-xenbus_grant_ring(device_t dev, unsigned long ring_mfn, int *refp)
+xenbus_grant_ring(device_t dev, unsigned long ring_mfn, grant_ref_t *refp)
 {
 	int error;
-	grant_ref_t ref;
 
 	error = gnttab_grant_foreign_access(
-		xenbus_get_otherend_id(dev), ring_mfn, 0, &ref);
+		xenbus_get_otherend_id(dev), ring_mfn, 0, refp);
 	if (error) {
 		xenbus_dev_fatal(dev, error, "granting access to ring page");
 		return (error);
 	}
 
-	*refp = ref;
 	return (0);
 }
 
 int
-xenbus_alloc_evtchn(device_t dev, int *port)
+xenbus_alloc_evtchn(device_t dev, evtchn_port_t *port)
 {
 	struct evtchn_alloc_unbound alloc_unbound;
 	int err;
@@ -222,7 +242,7 @@ xenbus_alloc_evtchn(device_t dev, int *port)
 }
 
 int
-xenbus_free_evtchn(device_t dev, int port)
+xenbus_free_evtchn(device_t dev, evtchn_port_t port)
 {
 	struct evtchn_close close;
 	int err;
@@ -240,12 +260,29 @@ xenbus_free_evtchn(device_t dev, int port)
 XenbusState
 xenbus_read_driver_state(const char *path)
 {
-	XenbusState result;
-	int error;
+        XenbusState result;
+        int error;
 
-	error = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
-	if (error)
-		result = XenbusStateClosed;
+        error = xs_gather(XST_NIL, path, "state", "%d", &result, NULL);
+        if (error)
+                result = XenbusStateClosed;
 
-	return (result);
+        return (result);
+}
+
+int
+xenbus_dev_is_online(device_t dev)
+{
+	const char *path;
+	int error;
+	int value;
+
+	path = xenbus_get_node(dev);
+	error = xs_gather(XST_NIL, path, "online", "%d", &value, NULL);
+	if (error != 0) {
+		/* Default to not online. */
+		value = 0;
+	}
+
+	return (value);
 }
diff --git a/sys/xen/xenbus/xenbus_comms.c b/sys/xen/xenbus/xenbus_comms.c
deleted file mode 100644
index 2f039551792..00000000000
--- a/sys/xen/xenbus/xenbus_comms.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/******************************************************************************
- * xenbus_comms.c
- *
- * Low level code to talks to Xen Store: ringbuffer and event channel.
- *
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- * 
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/bus.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/sx.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/syslog.h>
-
-#include <machine/xen/xen-os.h>
-#include <xen/hypervisor.h>
-
-#include <xen/xen_intr.h>
-#include <xen/evtchn.h>
-#include <xen/interface/io/xs_wire.h>
-#include <xen/xenbus/xenbus_comms.h>
-
-static unsigned int xenstore_irq;
-
-static inline struct xenstore_domain_interface *
-xenstore_domain_interface(void)
-{
-
-	return (struct xenstore_domain_interface *)xen_store;
-}
-
-static void
-xb_intr(void * arg __attribute__((unused)))
-{
-
-	wakeup(xen_store);
-}
-
-static int
-xb_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
-{
-
-	return ((prod - cons) <= XENSTORE_RING_SIZE);
-}
-
-static void *
-xb_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
-    char *buf, uint32_t *len)
-{
-
-	*len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
-	if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
-		*len = XENSTORE_RING_SIZE - (prod - cons);
-	return (buf + MASK_XENSTORE_IDX(prod));
-}
-
-static const void *
-xb_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
-    const char *buf, uint32_t *len)
-{
-
-	*len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
-	if ((prod - cons) < *len)
-		*len = prod - cons;
-	return (buf + MASK_XENSTORE_IDX(cons));
-}
-
-int
-xb_write(const void *tdata, unsigned len, struct lock_object *lock)
-{
-	struct xenstore_domain_interface *intf = xenstore_domain_interface();
-	XENSTORE_RING_IDX cons, prod;
-	const char *data = (const char *)tdata;
-	int error;
-
-	while (len != 0) {
-		void *dst;
-		unsigned int avail;
-
-		while ((intf->req_prod - intf->req_cons)
-		    == XENSTORE_RING_SIZE) {
-			error = _sleep(intf,
-			    lock,
-			    PCATCH, "xbwrite", hz/10);
-			if (error && error != EWOULDBLOCK)
-				return (error);
-		}
-
-		/* Read indexes, then verify. */
-		cons = intf->req_cons;
-		prod = intf->req_prod;
-		mb();
-		if (!xb_check_indexes(cons, prod)) {
-			intf->req_cons = intf->req_prod = 0;
-			return (EIO);
-		}
-
-		dst = xb_get_output_chunk(cons, prod, intf->req, &avail);
-		if (avail == 0)
-			continue;
-		if (avail > len)
-			avail = len;
-		mb();
-				
-		memcpy(dst, data, avail);
-		data += avail;
-		len -= avail;
-
-		/* Other side must not see new header until data is there. */
-		wmb();
-		intf->req_prod += avail;
-
-		/* This implies mb() before other side sees interrupt. */
-		notify_remote_via_evtchn(xen_store_evtchn);
-	}
-
-	return (0);
-}
-
-int
-xb_read(void *tdata, unsigned len, struct lock_object *lock)
-{
-	struct xenstore_domain_interface *intf = xenstore_domain_interface();
-	XENSTORE_RING_IDX cons, prod;
-	char *data = (char *)tdata;
-	int error;
-
-	while (len != 0) {
-		unsigned int avail;
-		const char *src;
-
-		while (intf->rsp_cons == intf->rsp_prod) {
-			error = _sleep(intf, lock,
-			    PCATCH, "xbread", hz/10);
-			if (error && error != EWOULDBLOCK)
-				return (error);
-		}
-			
-		/* Read indexes, then verify. */
-		cons = intf->rsp_cons;
-		prod = intf->rsp_prod;
-		if (!xb_check_indexes(cons, prod)) {
-			intf->rsp_cons = intf->rsp_prod = 0;
-			return (EIO);
-		}
-				
-		src = xb_get_input_chunk(cons, prod, intf->rsp, &avail);
-		if (avail == 0)
-			continue;
-		if (avail > len)
-			avail = len;
-
-		/* We must read header before we read data. */
-		rmb();
-
-		memcpy(data, src, avail);
-		data += avail;
-		len -= avail;
-
-		/* Other side must not see free space until we've copied out */
-		mb();
-		intf->rsp_cons += avail;
-
-		/* Implies mb(): they will see new header. */
-		notify_remote_via_evtchn(xen_store_evtchn);
-	}
-
-	return (0);
-}
-
-/* Set up interrupt handler off store event channel. */
-int
-xb_init_comms(void)
-{
-	struct xenstore_domain_interface *intf = xenstore_domain_interface();
-	int error;
-
-	if (intf->rsp_prod != intf->rsp_cons) {
-		log(LOG_WARNING, "XENBUS response ring is not quiescent "
-		    "(%08x:%08x): fixing up\n",
-		    intf->rsp_cons, intf->rsp_prod);
-		intf->rsp_cons = intf->rsp_prod;
-	}
-
-	if (xenstore_irq)
-		unbind_from_irqhandler(xenstore_irq);
-
-	error = bind_caller_port_to_irqhandler(
-		xen_store_evtchn, "xenbus",
-		    xb_intr, NULL, INTR_TYPE_NET, &xenstore_irq);
-	if (error) {
-		log(LOG_WARNING, "XENBUS request irq failed %i\n", error);
-		return (error);
-	}
-
-	return (0);
-}
diff --git a/sys/xen/xenbus/xenbus_comms.h b/sys/xen/xenbus/xenbus_comms.h
deleted file mode 100644
index fa4733109d9..00000000000
--- a/sys/xen/xenbus/xenbus_comms.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Private include for xenbus communications.
- * 
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * $FreeBSD$
- */
-
-#ifndef _XENBUS_COMMS_H
-#define _XENBUS_COMMS_H
-
-struct sx;
-extern int xen_store_evtchn;
-extern char *xen_store;
-
-int xs_init(void);
-int xb_init_comms(void);
-
-/* Low level routines. */
-int xb_write(const void *data, unsigned len, struct lock_object *);
-int xb_read(void *data, unsigned len, struct lock_object *);
-extern int xenbus_running;
-
-char *kasprintf(const char *fmt, ...);
-
-
-#endif /* _XENBUS_COMMS_H */
diff --git a/sys/xen/xenbus/xenbus_if.m b/sys/xen/xenbus/xenbus_if.m
index 018a2bbf43b..d6714183178 100644
--- a/sys/xen/xenbus/xenbus_if.m
+++ b/sys/xen/xenbus/xenbus_if.m
@@ -31,7 +31,15 @@
 
 INTERFACE xenbus;
 
-METHOD int backend_changed {
-	device_t dev;
-	enum xenbus_state newstate;
+/**
+ * \brief Callback triggered when the state of the otherend
+ *        of a split device changes.
+ *
+ * \param _dev       NewBus device_t for this XenBus device whose otherend's
+ *                   state has changed.
+ * \param _newstate  The new state of the otherend device.
+ */
+METHOD int otherend_changed {
+	device_t _dev;
+	enum xenbus_state _newstate;
 };
diff --git a/sys/xen/xenbus/xenbus_probe.c b/sys/xen/xenbus/xenbus_probe.c
deleted file mode 100644
index b1e9a2108aa..00000000000
--- a/sys/xen/xenbus/xenbus_probe.c
+++ /dev/null
@@ -1,602 +0,0 @@
-/******************************************************************************
- * Talks to Xen Store to figure out what devices we have.
- *
- * Copyright (C) 2008 Doug Rabson
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- * Copyright (C) 2005 Mike Wray, Hewlett-Packard
- * Copyright (C) 2005 XenSource Ltd
- * 
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#if 0
-#define DPRINTK(fmt, args...) \
-    printf("xenbus_probe (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
-#else
-#define DPRINTK(fmt, args...) ((void)0)
-#endif
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/bus.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/module.h>
-#include <sys/sysctl.h>
-#include <sys/syslog.h>
-#include <sys/systm.h>
-#include <sys/sx.h>
-#include <sys/taskqueue.h>
-
-#include <machine/xen/xen-os.h>
-#include <machine/stdarg.h>
-
-#include <xen/gnttab.h>
-#include <xen/xenbus/xenbusvar.h>
-#include <xen/xenbus/xenbus_comms.h>
-
-struct xenbus_softc {
-	struct xenbus_watch xs_devicewatch;
-	struct task	xs_probechildren;
-	struct intr_config_hook xs_attachcb;
-	device_t	xs_dev;
-};
-
-struct xenbus_device_ivars {
-	struct xenbus_watch xd_otherend_watch; /* must be first */
-	struct sx	xd_lock;
-	device_t	xd_dev;
-	char		*xd_node;	/* node name in xenstore */
-	char		*xd_type;	/* xen device type */
-	enum xenbus_state xd_state;
-	int		xd_otherend_id;
-	char		*xd_otherend_path;
-};
-
-/* Simplified asprintf. */
-char *
-kasprintf(const char *fmt, ...)
-{
-	va_list ap;
-	unsigned int len;
-	char *p, dummy[1];
-
-	va_start(ap, fmt);
-	/* FIXME: vsnprintf has a bug, NULL should work */
-	len = vsnprintf(dummy, 0, fmt, ap);
-	va_end(ap);
-
-	p = malloc(len + 1, M_DEVBUF, M_WAITOK);
-	va_start(ap, fmt);
-	vsprintf(p, fmt, ap);
-	va_end(ap);
-	return p;
-}
-
-static void
-xenbus_identify(driver_t *driver, device_t parent)
-{
-
-	BUS_ADD_CHILD(parent, 0, "xenbus", 0);
-}
-
-static int 
-xenbus_probe(device_t dev)
-{
-	int err = 0;
-
-	DPRINTK("");
-
-	/* Initialize the interface to xenstore. */
-	err = xs_init(); 
-	if (err) {
-		log(LOG_WARNING,
-		    "XENBUS: Error initializing xenstore comms: %i\n", err);
-		return (ENXIO);
-	}
-	err = gnttab_init();
-	if (err) {
-		log(LOG_WARNING,
-		    "XENBUS: Error initializing grant table: %i\n", err);
-		return (ENXIO);
-	}
-	device_set_desc(dev, "Xen Devices");
-
-	return (0);
-}
-
-static enum xenbus_state
-xenbus_otherend_state(struct xenbus_device_ivars *ivars)
-{
-
-	return (xenbus_read_driver_state(ivars->xd_otherend_path));
-}
-
-static void
-xenbus_backend_changed(struct xenbus_watch *watch, const char **vec,
-    unsigned int len)
-{
-	struct xenbus_device_ivars *ivars;
-	device_t dev;
-	enum xenbus_state newstate;
-
-	ivars = (struct xenbus_device_ivars *) watch;
-	dev = ivars->xd_dev;
-
-	if (!ivars->xd_otherend_path
-	    || strncmp(ivars->xd_otherend_path, vec[XS_WATCH_PATH],
-		strlen(ivars->xd_otherend_path)))
-		return;
-
-	newstate = xenbus_otherend_state(ivars);
-	XENBUS_BACKEND_CHANGED(dev, newstate);
-}
-
-static int
-xenbus_device_exists(device_t dev, const char *node)
-{
-	device_t *kids;
-	struct xenbus_device_ivars *ivars;
-	int i, count, result;
-
-	if (device_get_children(dev, &kids, &count))
-		return (FALSE);
-
-	result = FALSE;
-	for (i = 0; i < count; i++) {
-		ivars = device_get_ivars(kids[i]);
-		if (!strcmp(ivars->xd_node, node)) {
-			result = TRUE;
-			break;
-		}
-	}
-	free(kids, M_TEMP);
-
-	return (result);
-}
-
-static int
-xenbus_add_device(device_t dev, const char *bus,
-    const char *type, const char *id)
-{
-	device_t child;
-	struct xenbus_device_ivars *ivars;
-	enum xenbus_state state;
-	char *statepath;
-	int error;
-
-	ivars = malloc(sizeof(struct xenbus_device_ivars),
-	    M_DEVBUF, M_ZERO|M_WAITOK);
-	ivars->xd_node = kasprintf("%s/%s/%s", bus, type, id);
-
-	if (xenbus_device_exists(dev, ivars->xd_node)) {
-		/*
-		 * We are already tracking this node
-		 */
-		free(ivars->xd_node, M_DEVBUF);
-		free(ivars, M_DEVBUF);
-		return (0);
-	}
-
-	state = xenbus_read_driver_state(ivars->xd_node);
-
-	if (state != XenbusStateInitialising) {
-		/*
-		 * Device is not new, so ignore it. This can
-		 * happen if a device is going away after
-		 * switching to Closed.
-		 */
-		free(ivars->xd_node, M_DEVBUF);
-		free(ivars, M_DEVBUF);
-		return (0);
-	}
-
-	/*
-	 * Find the backend details
-	 */
-	error = xenbus_gather(XBT_NIL, ivars->xd_node,
-	    "backend-id", "%i", &ivars->xd_otherend_id,
-	    "backend", NULL, &ivars->xd_otherend_path,
-	    NULL);
-	if (error)
-		return (error);
-
-	sx_init(&ivars->xd_lock, "xdlock");
-	ivars->xd_type = strdup(type, M_DEVBUF);
-	ivars->xd_state = XenbusStateInitialising;
-
-	statepath = malloc(strlen(ivars->xd_otherend_path)
-	    + strlen("/state") + 1, M_DEVBUF, M_WAITOK);
-	sprintf(statepath, "%s/state", ivars->xd_otherend_path);
-
-	ivars->xd_otherend_watch.node = statepath;
-	ivars->xd_otherend_watch.callback = xenbus_backend_changed;
-
-	child = device_add_child(dev, NULL, -1);
-	ivars->xd_dev = child;
-	device_set_ivars(child, ivars);
-
-	return (0);
-}
-
-static int
-xenbus_enumerate_type(device_t dev, const char *bus, const char *type)
-{
-	char **dir;
-	unsigned int i, count;
-	int error;
-
-	error = xenbus_directory(XBT_NIL, bus, type, &count, &dir);
-	if (error)
-		return (error);
-	for (i = 0; i < count; i++)
-		xenbus_add_device(dev, bus, type, dir[i]);
-
-	free(dir, M_DEVBUF);
-
-	return (0);
-}
-
-static int
-xenbus_enumerate_bus(device_t dev, const char *bus)
-{
-	char **dir;
-	unsigned int i, count;
-	int error;
-
-	error = xenbus_directory(XBT_NIL, bus, "", &count, &dir);
-	if (error)
-		return (error);
-	for (i = 0; i < count; i++) {
-		xenbus_enumerate_type(dev, bus, dir[i]);
-	}
-	free(dir, M_DEVBUF);
-
-	return (0);
-}
-
-static int
-xenbus_probe_children(device_t dev)
-{
-	device_t *kids;
-	struct xenbus_device_ivars *ivars;
-	int i, count;
-
-	/*
-	 * Probe any new devices and register watches for any that
-	 * attach successfully. Since part of the protocol which
-	 * establishes a connection with the other end is interrupt
-	 * driven, we sleep until the device reaches a stable state
-	 * (closed or connected).
-	 */
-	if (device_get_children(dev, &kids, &count) == 0) {
-		for (i = 0; i < count; i++) {
-			if (device_get_state(kids[i]) != DS_NOTPRESENT)
-				continue;
-
-			if (device_probe_and_attach(kids[i]))
-				continue;
-			ivars = device_get_ivars(kids[i]);
-			register_xenbus_watch(
-				&ivars->xd_otherend_watch);
-			sx_xlock(&ivars->xd_lock);
-			while (ivars->xd_state != XenbusStateClosed
-			    && ivars->xd_state != XenbusStateConnected)
-				sx_sleep(&ivars->xd_state, &ivars->xd_lock,
-				    0, "xdattach", 0);
-			sx_xunlock(&ivars->xd_lock);
-		}
-		free(kids, M_TEMP);
-	}
-
-	return (0);
-}
-
-static void
-xenbus_probe_children_cb(void *arg, int pending)
-{
-	device_t dev = (device_t) arg;
-
-	xenbus_probe_children(dev);
-}
-
-static void
-xenbus_devices_changed(struct xenbus_watch *watch,
-    const char **vec, unsigned int len)
-{
-	struct xenbus_softc *sc = (struct xenbus_softc *) watch;
-	device_t dev = sc->xs_dev;
-	char *node, *bus, *type, *id, *p;
-
-	node = strdup(vec[XS_WATCH_PATH], M_DEVBUF);
-	p = strchr(node, '/');
-	if (!p)
-		goto out;
-	bus = node;
-	*p = 0;
-	type = p + 1;
-
-	p = strchr(type, '/');
-	if (!p)
-		goto out;
-	*p = 0;
-	id = p + 1;
-
-	p = strchr(id, '/');
-	if (p)
-		*p = 0;
-
-	xenbus_add_device(dev, bus, type, id);
-	taskqueue_enqueue(taskqueue_thread, &sc->xs_probechildren);
-out:
-	free(node, M_DEVBUF);
-}
-
-static void
-xenbus_attach_deferred(void *arg)
-{
-	device_t dev = (device_t) arg;
-	struct xenbus_softc *sc = device_get_softc(dev);
-	int error;
-	
-	error = xenbus_enumerate_bus(dev, "device");
-	if (error)
-		return;
-	xenbus_probe_children(dev);
-
-	sc->xs_dev = dev;
-	sc->xs_devicewatch.node = "device";
-	sc->xs_devicewatch.callback = xenbus_devices_changed;
-
-	TASK_INIT(&sc->xs_probechildren, 0, xenbus_probe_children_cb, dev);
-
-	register_xenbus_watch(&sc->xs_devicewatch);
-
-	config_intrhook_disestablish(&sc->xs_attachcb);
-}
-
-static int
-xenbus_attach(device_t dev)
-{
-	struct xenbus_softc *sc = device_get_softc(dev);
-
-	sc->xs_attachcb.ich_func = xenbus_attach_deferred;
-	sc->xs_attachcb.ich_arg = dev;
-	config_intrhook_establish(&sc->xs_attachcb);
-
-	return (0);
-}
-
-static int
-xenbus_suspend(device_t dev)
-{
-	int error;
-
-	DPRINTK("");
-
-	error = bus_generic_suspend(dev);
-	if (error)
-		return (error);
-
-	xs_suspend();
-
-	return (0);
-}
-
-static int
-xenbus_resume(device_t dev)
-{
-	device_t *kids;
-	struct xenbus_device_ivars *ivars;
-	int i, count, error;
-	char *statepath;
-
-	xb_init_comms();
-	xs_resume();
-
-	/*
-	 * We must re-examine each device and find the new path for
-	 * its backend.
-	 */
-	if (device_get_children(dev, &kids, &count) == 0) {
-		for (i = 0; i < count; i++) {
-			if (device_get_state(kids[i]) == DS_NOTPRESENT)
-				continue;
-
-			ivars = device_get_ivars(kids[i]);
-
-			unregister_xenbus_watch(
-				&ivars->xd_otherend_watch);
-			ivars->xd_state = XenbusStateInitialising;
-
-			/*
-			 * Find the new backend details and
-			 * re-register our watch.
-			 */
-			free(ivars->xd_otherend_path, M_DEVBUF);
-			error = xenbus_gather(XBT_NIL, ivars->xd_node,
-			    "backend-id", "%i", &ivars->xd_otherend_id,
-			    "backend", NULL, &ivars->xd_otherend_path,
-			    NULL);
-			if (error)
-				return (error);
-
-			DEVICE_RESUME(kids[i]);
-
-			statepath = malloc(strlen(ivars->xd_otherend_path)
-			    + strlen("/state") + 1, M_DEVBUF, M_WAITOK);
-			sprintf(statepath, "%s/state", ivars->xd_otherend_path);
-
-			free(ivars->xd_otherend_watch.node, M_DEVBUF);
-			ivars->xd_otherend_watch.node = statepath;
-			register_xenbus_watch(
-				&ivars->xd_otherend_watch);
-
-#if 0
-			/*
-			 * Can't do this yet since we are running in
-			 * the xenwatch thread and if we sleep here,
-			 * we will stop delivering watch notifications
-			 * and the device will never come back online.
-			 */
-			sx_xlock(&ivars->xd_lock);
-			while (ivars->xd_state != XenbusStateClosed
-			    && ivars->xd_state != XenbusStateConnected)
-				sx_sleep(&ivars->xd_state, &ivars->xd_lock,
-				    0, "xdresume", 0);
-			sx_xunlock(&ivars->xd_lock);
-#endif
-		}
-		free(kids, M_TEMP);
-	}
-
-	return (0);
-}
-
-static int
-xenbus_print_child(device_t dev, device_t child)
-{
-	struct xenbus_device_ivars *ivars = device_get_ivars(child);
-	int	retval = 0;
-
-	retval += bus_print_child_header(dev, child);
-	retval += printf(" at %s", ivars->xd_node);
-	retval += bus_print_child_footer(dev, child);
-
-	return (retval);
-}
-
-static int
-xenbus_read_ivar(device_t dev, device_t child, int index,
-    uintptr_t * result)
-{
-	struct xenbus_device_ivars *ivars = device_get_ivars(child);
-
-	switch (index) {
-	case XENBUS_IVAR_NODE:
-		*result = (uintptr_t) ivars->xd_node;
-		return (0);
-
-	case XENBUS_IVAR_TYPE:
-		*result = (uintptr_t) ivars->xd_type;
-		return (0);
-
-	case XENBUS_IVAR_STATE:
-		*result = (uintptr_t) ivars->xd_state;
-		return (0);
-
-	case XENBUS_IVAR_OTHEREND_ID:
-		*result = (uintptr_t) ivars->xd_otherend_id;
-		return (0);
-
-	case XENBUS_IVAR_OTHEREND_PATH:
-		*result = (uintptr_t) ivars->xd_otherend_path;
-		return (0);
-	}
-
-	return (ENOENT);
-}
-
-static int
-xenbus_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
-{
-	struct xenbus_device_ivars *ivars = device_get_ivars(child);
-	enum xenbus_state newstate;
-	int currstate;
-	int error;
-
-	switch (index) {
-	case XENBUS_IVAR_STATE:
-		newstate = (enum xenbus_state) value;
-		sx_xlock(&ivars->xd_lock);
-		if (ivars->xd_state == newstate)
-			goto out;
-
-		error = xenbus_scanf(XBT_NIL, ivars->xd_node, "state",
-		    NULL, "%d", &currstate);
-		if (error)
-			goto out;
-
-		error = xenbus_printf(XBT_NIL, ivars->xd_node, "state",
-		    "%d", newstate);
-		if (error) {
-			if (newstate != XenbusStateClosing) /* Avoid looping */
-				xenbus_dev_fatal(dev, error, "writing new state");
-			goto out;
-		}
-		ivars->xd_state = newstate;
-		wakeup(&ivars->xd_state);
-	out:
-		sx_xunlock(&ivars->xd_lock);
-		return (0);
-
-	case XENBUS_IVAR_NODE:
-	case XENBUS_IVAR_TYPE:
-	case XENBUS_IVAR_OTHEREND_ID:
-	case XENBUS_IVAR_OTHEREND_PATH:
-		/*
-		 * These variables are read-only.
-		 */
-		return (EINVAL);
-	}
-
-	return (ENOENT);
-}
-
-SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen");
-SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xen_store_evtchn, 0, "");
-SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, "");
-
-static device_method_t xenbus_methods[] = { 
-	/* Device interface */ 
-	DEVMETHOD(device_identify,	xenbus_identify),
-	DEVMETHOD(device_probe,         xenbus_probe), 
-	DEVMETHOD(device_attach,        xenbus_attach), 
-	DEVMETHOD(device_detach,        bus_generic_detach), 
-	DEVMETHOD(device_shutdown,      bus_generic_shutdown), 
-	DEVMETHOD(device_suspend,       xenbus_suspend), 
-	DEVMETHOD(device_resume,        xenbus_resume), 
- 
-	/* Bus interface */ 
-	DEVMETHOD(bus_print_child,      xenbus_print_child),
-	DEVMETHOD(bus_read_ivar,        xenbus_read_ivar), 
-	DEVMETHOD(bus_write_ivar,       xenbus_write_ivar), 
- 
-	{ 0, 0 } 
-}; 
-
-static char driver_name[] = "xenbus";
-static driver_t xenbus_driver = { 
-	driver_name, 
-	xenbus_methods, 
-	sizeof(struct xenbus_softc),
-}; 
-devclass_t xenbus_devclass; 
- 
-#ifdef XENHVM
-DRIVER_MODULE(xenbus, xenpci, xenbus_driver, xenbus_devclass, 0, 0);
-#else
-DRIVER_MODULE(xenbus, nexus, xenbus_driver, xenbus_devclass, 0, 0);
-#endif
diff --git a/sys/xen/xenbus/xenbus_probe_backend.c b/sys/xen/xenbus/xenbus_probe_backend.c
deleted file mode 100644
index 20cc49f8229..00000000000
--- a/sys/xen/xenbus/xenbus_probe_backend.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/******************************************************************************
- * Talks to Xen Store to figure out what devices we have (backend half).
- *
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- * Copyright (C) 2005 Mike Wray, Hewlett-Packard
- * Copyright (C) 2005, 2006 XenSource Ltd
- * 
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#if 0
-#define DPRINTK(fmt, args...) \
-    printf("xenbus_probe (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
-#else
-#define DPRINTK(fmt, args...) ((void)0)
-#endif
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/cdefs.h>
-#include <sys/time.h>
-#include <sys/sema.h>
-#include <sys/eventhandler.h>
-#include <sys/errno.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/module.h>
-#include <sys/conf.h>
-#include <sys/systm.h>
-#include <sys/syslog.h>
-#include <sys/proc.h>
-#include <sys/bus.h>
-#include <sys/sx.h>
-
-#include <machine/xen/xen-os.h>
-#include <xen/hypervisor.h>
-#include <machine/xen/xenbus.h>
-#include <machine/stdarg.h>
-
-#include <xen/evtchn.h>
-#include <xen/xenbus/xenbus_comms.h>
-
-#define BUG_ON        PANIC_IF
-#define semaphore     sema
-#define rw_semaphore  sema
-#define DEFINE_SPINLOCK(lock) struct mtx lock
-#define DECLARE_MUTEX(lock) struct sema lock
-#define u32           uint32_t
-#define list_del(head, ent)      TAILQ_REMOVE(head, ent, list) 
-#define simple_strtoul strtoul
-#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
-#define list_empty    TAILQ_EMPTY
-
-extern struct xendev_list_head xenbus_device_backend_list;
-#if 0
-static int xenbus_uevent_backend(struct device *dev, char **envp,
-				 int num_envp, char *buffer, int buffer_size);
-#endif
-static int xenbus_probe_backend(const char *type, const char *domid);
-
-static int read_frontend_details(struct xenbus_device *xendev)
-{
-	return read_otherend_details(xendev, "frontend-id", "frontend");
-}
-
-/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
-static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
-{
-	int domid, err;
-	const char *devid, *type, *frontend;
-	unsigned int typelen;
-
-	type = strchr(nodename, '/');
-	if (!type)
-		return -EINVAL;
-	type++;
-	typelen = strcspn(type, "/");
-	if (!typelen || type[typelen] != '/')
-		return -EINVAL;
-
-	devid = strrchr(nodename, '/') + 1;
-
-	err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
-			    "frontend", NULL, &frontend,
-			    NULL);
-	if (err)
-		return err;
-	if (strlen(frontend) == 0)
-		err = -ERANGE;
-	if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
-		err = -ENOENT;
-	kfree(frontend);
-
-	if (err)
-		return err;
-
-	if (snprintf(bus_id, BUS_ID_SIZE,
-		     "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
-		return -ENOSPC;
-	return 0;
-}
-
-static struct xen_bus_type xenbus_backend = {
-	.root = "backend",
-	.levels = 3, 		/* backend/type/<frontend>/<id> */
-	.get_bus_id = backend_bus_id,
-	.probe = xenbus_probe_backend,
-	.bus = &xenbus_device_backend_list,
-	
-#if 0
-	.error = -ENODEV,
-	.bus = {
-		.name     = "xen-backend",
-		.match    = xenbus_match,
-		.probe    = xenbus_dev_probe,
-		.remove   = xenbus_dev_remove,
-//		.shutdown = xenbus_dev_shutdown,
-		.uevent   = xenbus_uevent_backend,
-	},
-	.dev = {
-		.bus_id = "xen-backend",
-	},
-#endif	
-};
-
-#if 0
-static int xenbus_uevent_backend(struct device *dev, char **envp,
-				 int num_envp, char *buffer, int buffer_size)
-{
-	struct xenbus_device *xdev;
-	struct xenbus_driver *drv;
-	int i = 0;
-	int length = 0;
-
-	DPRINTK("");
-
-	if (dev == NULL)
-		return -ENODEV;
-
-	xdev = to_xenbus_device(dev);
-	if (xdev == NULL)
-		return -ENODEV;
-2
-	/* stuff we want to pass to /sbin/hotplug */
-	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
-		       "XENBUS_TYPE=%s", xdev->devicetype);
-
-	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
-		       "XENBUS_PATH=%s", xdev->nodename);
-
-	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
-		       "XENBUS_BASE_PATH=%s", xenbus_backend.root);
-
-	/* terminate, set to next free slot, shrink available space */
-	envp[i] = NULL;
-	envp = &envp[i];
-	num_envp -= i;
-	buffer = &buffer[length];
-	buffer_size -= length;
-
-	if (dev->driver) {
-		drv = to_xenbus_driver(dev->driver);
-		if (drv && drv->uevent)
-			return drv->uevent(xdev, envp, num_envp, buffer,
-					   buffer_size);
-	}
-
-	return 0;
-}
-#endif
-
-int xenbus_register_backend(struct xenbus_driver *drv)
-{
-	drv->read_otherend_details = read_frontend_details;
-
-	return xenbus_register_driver_common(drv, &xenbus_backend);
-}
-
-/* backend/<typename>/<frontend-uuid>/<name> */
-static int xenbus_probe_backend_unit(const char *dir,
-				     const char *type,
-				     const char *name)
-{
-	char *nodename;
-	int err;
-
-	nodename = kasprintf("%s/%s", dir, name);
-	if (!nodename)
-		return -ENOMEM;
-
-	DPRINTK("%s\n", nodename);
-
-	err = xenbus_probe_node(&xenbus_backend, type, nodename);
-	kfree(nodename);
-	return err;
-}
-
-/* backend/<typename>/<frontend-domid> */
-static int xenbus_probe_backend(const char *type, const char *domid)
-{
-	char *nodename;
-	int err = 0;
-	char **dir;
-	unsigned int i, dir_n = 0;
-
-	DPRINTK("");
-
-	nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid);
-	if (!nodename)
-		return -ENOMEM;
-
-	dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
-	if (IS_ERR(dir)) {
-		kfree(nodename);
-		return PTR_ERR(dir);
-	}
-
-	for (i = 0; i < dir_n; i++) {
-		err = xenbus_probe_backend_unit(nodename, type, dir[i]);
-		if (err)
-			break;
-	}
-	kfree(dir);
-	kfree(nodename);
-	return err;
-}
-
-static void backend_changed(struct xenbus_watch *watch,
-			    const char **vec, unsigned int len)
-{
-	DPRINTK("");
-
-	dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
-}
-
-static struct xenbus_watch be_watch = {
-	.node = "backend",
-	.callback = backend_changed,
-};
-#if 0
-void xenbus_backend_suspend(int (*fn)(struct device *, void *))
-{
-	DPRINTK("");
-	if (!xenbus_backend.error)
-		bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
-}
-
-void xenbus_backend_resume(int (*fn)(struct device *, void *))
-{
-	DPRINTK("");
-	if (!xenbus_backend.error)
-		bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
-}
-#endif
-void xenbus_backend_probe_and_watch(void)
-{
-	xenbus_probe_devices(&xenbus_backend);
-	register_xenbus_watch(&be_watch);
-}
-
-#if 0
-void xenbus_backend_bus_register(void)
-{
-	xenbus_backend.error = bus_register(&xenbus_backend.bus);
-	if (xenbus_backend.error)
-		log(LOG_WARNING,
-		       "XENBUS: Error registering backend bus: %i\n",
-		       xenbus_backend.error);
-}
-
-void xenbus_backend_device_register(void)
-{
-	if (xenbus_backend.error)
-		return;
-
-	xenbus_backend.error = device_register(&xenbus_backend.dev);
-	if (xenbus_backend.error) {
-		bus_unregister(&xenbus_backend.bus);
-		log(LOG_WARNING,
-		       "XENBUS: Error registering backend device: %i\n",
-		       xenbus_backend.error);
-	}
-}
-#endif
diff --git a/sys/xen/xenbus/xenbus_xs.c b/sys/xen/xenbus/xenbus_xs.c
deleted file mode 100644
index 93122553e0c..00000000000
--- a/sys/xen/xenbus/xenbus_xs.c
+++ /dev/null
@@ -1,935 +0,0 @@
-/******************************************************************************
- * xenbus_xs.c
- *
- * This is the kernel equivalent of the "xs" library.  We don't need everything
- * and we use xenbus_comms for communication.
- *
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- * 
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/uio.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/sx.h>
-#include <sys/syslog.h>
-#include <sys/malloc.h>
-#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/kthread.h>
-#include <sys/unistd.h>
-
-#include <machine/xen/xen-os.h>
-#include <xen/hypervisor.h>
-#include <machine/stdarg.h>
-
-#include <xen/xenbus/xenbusvar.h>
-#include <xen/xenbus/xenbus_comms.h>
-#include <xen/interface/hvm/params.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-
-static int xs_process_msg(enum xsd_sockmsg_type *type);
-
-int xenwatch_running = 0;
-int xenbus_running = 0;
-int xen_store_evtchn;
-
-struct xs_stored_msg {
-	TAILQ_ENTRY(xs_stored_msg) list;
-
-	struct xsd_sockmsg hdr;
-
-	union {
-		/* Queued replies. */
-		struct {
-			char *body;
-		} reply;
-
-		/* Queued watch events. */
-		struct {
-			struct xenbus_watch *handle;
-			char **vec;
-			unsigned int vec_size;
-		} watch;
-	} u;
-};
-
-struct xs_handle {
-	/* A list of replies. Currently only one will ever be outstanding. */
-	TAILQ_HEAD(xs_handle_list, xs_stored_msg) reply_list;
-	struct mtx reply_lock;
-	int reply_waitq;
-
-	/* One request at a time. */
-	struct sx request_mutex;
-
-	/* Protect transactions against save/restore. */
-	struct sx suspend_mutex;
-};
-
-static struct xs_handle xs_state;
-
-/* List of registered watches, and a lock to protect it. */
-static LIST_HEAD(watch_list_head, xenbus_watch) watches;
-static struct mtx watches_lock;
-/* List of pending watch callback events, and a lock to protect it. */
-static TAILQ_HEAD(event_list_head, xs_stored_msg) watch_events;
-static struct mtx watch_events_lock;
-
-/*
- * Details of the xenwatch callback kernel thread. The thread waits on the
- * watch_events_waitq for work to do (queued on watch_events list). When it
- * wakes up it acquires the xenwatch_mutex before reading the list and
- * carrying out work.
- */
-static pid_t xenwatch_pid;
-struct sx xenwatch_mutex;
-static int watch_events_waitq;
-
-#define xsd_error_count	(sizeof(xsd_errors) / sizeof(xsd_errors[0]))
-
-static int
-xs_get_error(const char *errorstring)
-{
-	unsigned int i;
-
-	for (i = 0; i < xsd_error_count; i++) {
-		if (!strcmp(errorstring, xsd_errors[i].errstring))
-			return (xsd_errors[i].errnum);
-	}
-	log(LOG_WARNING, "XENBUS xen store gave: unknown error %s",
-	    errorstring);
-	return (EINVAL);
-}
-
-extern void kdb_backtrace(void);
-
-static int
-xs_read_reply(enum xsd_sockmsg_type *type, unsigned int *len, void **result)
-{
-	struct xs_stored_msg *msg;
-	char *body;
-	int error;
-
-	mtx_lock(&xs_state.reply_lock);
-
-	while (TAILQ_EMPTY(&xs_state.reply_list)) {
-		while (TAILQ_EMPTY(&xs_state.reply_list)) {
-			error = mtx_sleep(&xs_state.reply_waitq,
-			    &xs_state.reply_lock,
-			    PCATCH, "xswait", hz/10);
-			if (error && error != EWOULDBLOCK) {
-				mtx_unlock(&xs_state.reply_lock);
-				return (error);
-			}
-		}
-	}
-
-	msg = TAILQ_FIRST(&xs_state.reply_list);
-	TAILQ_REMOVE(&xs_state.reply_list, msg, list);
-
-	mtx_unlock(&xs_state.reply_lock);
-
-	*type = msg->hdr.type;
-	if (len)
-		*len = msg->hdr.len;
-	body = msg->u.reply.body;
-
-	free(msg, M_DEVBUF);
-	*result = body;
-	return (0);
-}
-
-#if 0
-/* Emergency write. UNUSED*/
-void xenbus_debug_write(const char *str, unsigned int count)
-{
-	struct xsd_sockmsg msg = { 0 };
-
-	msg.type = XS_DEBUG;
-	msg.len = sizeof("print") + count + 1;
-
-	sx_xlock(&xs_state.request_mutex);
-	xb_write(&msg, sizeof(msg));
-	xb_write("print", sizeof("print"));
-	xb_write(str, count);
-	xb_write("", 1);
-	sx_xunlock(&xs_state.request_mutex);
-}
-
-#endif
-
-int
-xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void **result)
-{
-	struct xsd_sockmsg req_msg = *msg;
-	int error;
-
-	if (req_msg.type == XS_TRANSACTION_START)
-		sx_slock(&xs_state.suspend_mutex);
-
-	sx_xlock(&xs_state.request_mutex);
-
-	error = xb_write(msg, sizeof(*msg) + msg->len,
-	    &xs_state.request_mutex.lock_object);
-	if (error) {
-		msg->type = XS_ERROR;
-	} else {
-		error = xs_read_reply(&msg->type, &msg->len, result);
-	}
-
-	sx_xunlock(&xs_state.request_mutex);
-
-	if ((msg->type == XS_TRANSACTION_END) ||
-	    ((req_msg.type == XS_TRANSACTION_START) &&
-		(msg->type == XS_ERROR)))
-		sx_sunlock(&xs_state.suspend_mutex);
-
-	return (error);
-}
-
-/*
- * Send message to xs. The reply is returned in *result and should be
- * fred with free(*result, M_DEVBUF). Return zero on success or an
- * error code on failure.
- */
-static int
-xs_talkv(struct xenbus_transaction t, enum xsd_sockmsg_type type,
-    const struct iovec *iovec, unsigned int num_vecs,
-    unsigned int *len, void **result)
-{
-	struct xsd_sockmsg msg;
-	void *ret = NULL;
-	unsigned int i;
-	int error;
-
-	msg.tx_id = t.id;
-	msg.req_id = 0;
-	msg.type = type;
-	msg.len = 0;
-	for (i = 0; i < num_vecs; i++)
-		msg.len += iovec[i].iov_len;
-
-	sx_xlock(&xs_state.request_mutex);
-
-	error = xb_write(&msg, sizeof(msg),
-	    &xs_state.request_mutex.lock_object);
-	if (error) {
-		sx_xunlock(&xs_state.request_mutex);
-		printf("xs_talkv failed %d\n", error);
-		return (error);
-	}
-
-	for (i = 0; i < num_vecs; i++) {
-		error = xb_write(iovec[i].iov_base, iovec[i].iov_len,
-		    &xs_state.request_mutex.lock_object);
-		if (error) {		
-			sx_xunlock(&xs_state.request_mutex);
-			printf("xs_talkv failed %d\n", error);
-			return (error);
-		}
-	}
-
-	error = xs_read_reply(&msg.type, len, &ret);
-
-	sx_xunlock(&xs_state.request_mutex);
-
-	if (error)
-		return (error);
-
-	if (msg.type == XS_ERROR) {
-		error = xs_get_error(ret);
-		free(ret, M_DEVBUF);
-		return (error);
-	}
-
-#if 0
-	if ((xenwatch_running == 0) && (xenwatch_inline == 0)) {
-		xenwatch_inline = 1;
-		while (!TAILQ_EMPTY(&watch_events) 
-		    && xenwatch_running == 0) {
-						
-			struct xs_stored_msg *wmsg = TAILQ_FIRST(&watch_events);
-			TAILQ_REMOVE(&watch_events, wmsg, list);
-						
-			wmsg->u.watch.handle->callback(
-				wmsg->u.watch.handle,
-				(const char **)wmsg->u.watch.vec,
-				wmsg->u.watch.vec_size);
-			free(wmsg->u.watch.vec, M_DEVBUF);
-			free(wmsg, M_DEVBUF);
-		}
-		xenwatch_inline = 0;
-	}
-#endif
-	KASSERT(msg.type == type, ("bad xenstore message type"));
-
-	if (result)
-		*result = ret;
-	else
-		free(ret, M_DEVBUF);
-
-	return (0);
-}
-
-/* Simplified version of xs_talkv: single message. */
-static int
-xs_single(struct xenbus_transaction t, enum xsd_sockmsg_type type,
-    const char *string, unsigned int *len, void **result)
-{
-	struct iovec iovec;
-
-	iovec.iov_base = (void *)(uintptr_t) string;
-	iovec.iov_len = strlen(string) + 1;
-
-	return (xs_talkv(t, type, &iovec, 1, len, result));
-}
-
-static unsigned int
-count_strings(const char *strings, unsigned int len)
-{
-	unsigned int num;
-	const char *p;
-
-	for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
-		num++;
-
-	return num;
-}
-
-/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */ 
-static char *
-join(const char *dir, const char *name)
-{
-	char *buffer;
-
-	buffer = malloc(strlen(dir) + strlen("/") + strlen(name) + 1,
-	    M_DEVBUF, M_WAITOK);
-
-	strcpy(buffer, dir);
-	if (strcmp(name, "")) {
-		strcat(buffer, "/");
-		strcat(buffer, name);
-	}
-
-	return (buffer);
-}
-
-static char **
-split(char *strings, unsigned int len, unsigned int *num)
-{
-	char *p, **ret;
-
-	/* Count the strings. */
-	*num = count_strings(strings, len) + 1;
-
-	/* Transfer to one big alloc for easy freeing. */
-	ret = malloc(*num * sizeof(char *) + len, M_DEVBUF, M_WAITOK);
-	memcpy(&ret[*num], strings, len);
-	free(strings, M_DEVBUF);
-
-	strings = (char *)&ret[*num];
-	for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
-		ret[(*num)++] = p;
-
-	ret[*num] = strings + len;
-		
-	return ret;
-}
-
-/*
- * Return the contents of a directory in *result which should be freed
- * with free(*result, M_DEVBUF).
- */
-int
-xenbus_directory(struct xenbus_transaction t, const char *dir,
-    const char *node, unsigned int *num, char ***result)
-{
-	char *strings, *path;
-	unsigned int len = 0;
-	int error;
-
-	path = join(dir, node);
-	error = xs_single(t, XS_DIRECTORY, path, &len, (void **) &strings);
-	free(path, M_DEVBUF);
-	if (error)
-		return (error);
-
-	*result = split(strings, len, num);
-	return (0);
-}
-
-/*
- * Check if a path exists. Return 1 if it does.
- */
-int
-xenbus_exists(struct xenbus_transaction t, const char *dir, const char *node)
-{
-	char **d;
-	int error, dir_n;
-
-	error = xenbus_directory(t, dir, node, &dir_n, &d);
-	if (error)
-		return (0);
-	free(d, M_DEVBUF);
-	return (1);
-}
-
-/*
- * Get the value of a single file.  Returns the contents in *result
- * which should be freed with free(*result, M_DEVBUF) after use.
- * The length of the value in bytes is returned in *len.
- */
-int
-xenbus_read(struct xenbus_transaction t, const char *dir, const char *node,
-    unsigned int *len, void **result)
-{
-	char *path;
-	void *ret;
-	int error;
-
-	path = join(dir, node);
-	error = xs_single(t, XS_READ, path, len, &ret);
-	free(path, M_DEVBUF);
-	if (error)
-		return (error);
-	*result = ret;
-	return (0);
-}
-
-/*
- * Write the value of a single file.  Returns error on failure.
- */
-int
-xenbus_write(struct xenbus_transaction t, const char *dir, const char *node,
-    const char *string)
-{
-	char *path;
-	struct iovec iovec[2];
-	int error;
-
-	path = join(dir, node);
-
-	iovec[0].iov_base = (void *)(uintptr_t) path;
-	iovec[0].iov_len = strlen(path) + 1;
-	iovec[1].iov_base = (void *)(uintptr_t) string;
-	iovec[1].iov_len = strlen(string);
-
-	error = xs_talkv(t, XS_WRITE, iovec, 2, NULL, NULL);
-	free(path, M_DEVBUF);
-
-	return (error);
-}
-
-/*
- * Create a new directory.
- */
-int
-xenbus_mkdir(struct xenbus_transaction t, const char *dir, const char *node)
-{
-	char *path;
-	int ret;
-
-	path = join(dir, node);
-	ret = xs_single(t, XS_MKDIR, path, NULL, NULL);
-	free(path, M_DEVBUF);
-
-	return (ret);
-}
-
-/*
- * Destroy a file or directory (directories must be empty).
- */
-int
-xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
-{
-	char *path;
-	int ret;
-
-	path = join(dir, node);
-	ret = xs_single(t, XS_RM, path, NULL, NULL);
-	free(path, M_DEVBUF);
-
-	return (ret);
-}
-
-/*
- * Start a transaction: changes by others will not be seen during this
- * transaction, and changes will not be visible to others until end.
- */
-int
-xenbus_transaction_start(struct xenbus_transaction *t)
-{
-	char *id_str;
-	int error;
-
-	sx_slock(&xs_state.suspend_mutex);
-	error = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL,
-	    (void **) &id_str);
-	if (error) {
-		sx_sunlock(&xs_state.suspend_mutex);
-		return (error);
-	}
-
-	t->id = strtoul(id_str, NULL, 0);
-	free(id_str, M_DEVBUF);
-
-	return (0);
-}
-
-/*
- * End a transaction.  If abandon is true, transaction is discarded
- * instead of committed.
- */
-int xenbus_transaction_end(struct xenbus_transaction t, int abort)
-{
-	char abortstr[2];
-	int error;
-
-	if (abort)
-		strcpy(abortstr, "F");
-	else
-		strcpy(abortstr, "T");
-
-	error = xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL);
-		
-	sx_sunlock(&xs_state.suspend_mutex);
-
-	return (error);
-}
-
-/* Single read and scanf: returns zero or errno. */
-int
-xenbus_scanf(struct xenbus_transaction t,
-    const char *dir, const char *node, int *scancountp, const char *fmt, ...)
-{
-	va_list ap;
-	int error, ns;
-	char *val;
-
-	error = xenbus_read(t, dir, node, NULL, (void **) &val);
-	if (error)
-		return (error);
-
-	va_start(ap, fmt);
-	ns = vsscanf(val, fmt, ap);
-	va_end(ap);
-	free(val, M_DEVBUF);
-	/* Distinctive errno. */
-	if (ns == 0)
-		return (ERANGE);
-	if (scancountp)
-		*scancountp = ns;
-	return (0);
-}
-
-/* Single printf and write: returns zero or errno. */
-int
-xenbus_printf(struct xenbus_transaction t,
-    const char *dir, const char *node, const char *fmt, ...)
-{
-	va_list ap;
-	int error, ret;
-#define PRINTF_BUFFER_SIZE 4096
-	char *printf_buffer;
-
-	printf_buffer = malloc(PRINTF_BUFFER_SIZE, M_DEVBUF, M_WAITOK);
-
-	va_start(ap, fmt);
-	ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
-	va_end(ap);
-
-	KASSERT(ret <= PRINTF_BUFFER_SIZE-1, ("xenbus_printf: message too large"));
-	error = xenbus_write(t, dir, node, printf_buffer);
-
-	free(printf_buffer, M_DEVBUF);
-
-	return (error);
-}
-
-/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
-int
-xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
-{
-	va_list ap;
-	const char *name;
-	int error, i;
-
-	for (i = 0; i < 10000; i++)
-		HYPERVISOR_yield();
-		
-	va_start(ap, dir);
-	error = 0;
-	while (error == 0 && (name = va_arg(ap, char *)) != NULL) {
-		const char *fmt = va_arg(ap, char *);
-		void *result = va_arg(ap, void *);
-		char *p;
-
-		error = xenbus_read(t, dir, name, NULL, (void **) &p);
-		if (error)
-			break;
-
-		if (fmt) {
-			if (sscanf(p, fmt, result) == 0)
-				error = EINVAL;
-			free(p, M_DEVBUF);
-		} else
-			*(char **)result = p;
-	}
-	va_end(ap);
-
-	return (error);
-}
-
-static int
-xs_watch(const char *path, const char *token)
-{
-	struct iovec iov[2];
-
-	iov[0].iov_base = (void *)(uintptr_t) path;
-	iov[0].iov_len = strlen(path) + 1;
-	iov[1].iov_base = (void *)(uintptr_t) token;
-	iov[1].iov_len = strlen(token) + 1;
-
-	return (xs_talkv(XBT_NIL, XS_WATCH, iov, 2, NULL, NULL));
-}
-
-static int
-xs_unwatch(const char *path, const char *token)
-{
-	struct iovec iov[2];
-
-	iov[0].iov_base = (void *)(uintptr_t) path;
-	iov[0].iov_len = strlen(path) + 1;
-	iov[1].iov_base = (void *)(uintptr_t) token;
-	iov[1].iov_len = strlen(token) + 1;
-
-	return (xs_talkv(XBT_NIL, XS_UNWATCH, iov, 2, NULL, NULL));
-}
-
-static struct xenbus_watch *
-find_watch(const char *token)
-{
-	struct xenbus_watch *i, *cmp;
-
-	cmp = (void *)strtoul(token, NULL, 16);
-
-	LIST_FOREACH(i, &watches, list)
-		if (i == cmp)
-			return (i);
-
-	return (NULL);
-}
-
-/* Register callback to watch this node. */
-int
-register_xenbus_watch(struct xenbus_watch *watch)
-{
-	/* Pointer in ascii is the token. */
-	char token[sizeof(watch) * 2 + 1];
-	int error;
-
-	sprintf(token, "%lX", (long)watch);
-
-	sx_slock(&xs_state.suspend_mutex);
-
-	mtx_lock(&watches_lock);
-	KASSERT(find_watch(token) == NULL, ("watch already registered"));
-	LIST_INSERT_HEAD(&watches, watch, list);
-	mtx_unlock(&watches_lock);
-
-	error = xs_watch(watch->node, token);
-		
-	/* Ignore errors due to multiple registration. */
-	if (error == EEXIST) {
-		mtx_lock(&watches_lock);
-		LIST_REMOVE(watch, list);
-		mtx_unlock(&watches_lock);
-	}
-
-	sx_sunlock(&xs_state.suspend_mutex);
-
-	return (error);
-}
-
-void
-unregister_xenbus_watch(struct xenbus_watch *watch)
-{
-	struct xs_stored_msg *msg, *tmp;
-	char token[sizeof(watch) * 2 + 1];
-	int error;
-
-	sprintf(token, "%lX", (long)watch);
-		
-	sx_slock(&xs_state.suspend_mutex);
-
-	mtx_lock(&watches_lock);
-	KASSERT(find_watch(token), ("watch not registered"));
-	LIST_REMOVE(watch, list);
-	mtx_unlock(&watches_lock);
-
-	error = xs_unwatch(watch->node, token);
-	if (error)
-		log(LOG_WARNING, "XENBUS Failed to release watch %s: %i\n",
-		    watch->node, error);
-
-	sx_sunlock(&xs_state.suspend_mutex);
-
-	/* Cancel pending watch events. */
-	mtx_lock(&watch_events_lock);
-	TAILQ_FOREACH_SAFE(msg, &watch_events, list, tmp) {
-		if (msg->u.watch.handle != watch)
-			continue;
-		TAILQ_REMOVE(&watch_events, msg, list);
-		free(msg->u.watch.vec, M_DEVBUF);
-		free(msg, M_DEVBUF);
-	}
-	mtx_unlock(&watch_events_lock);
-
-	/* Flush any currently-executing callback, unless we are it. :-) */
-	if (curproc->p_pid != xenwatch_pid) {
-		sx_xlock(&xenwatch_mutex);
-		sx_xunlock(&xenwatch_mutex);
-	}
-}
-
-void
-xs_suspend(void)
-{	
-
-	sx_xlock(&xs_state.suspend_mutex);
-	sx_xlock(&xs_state.request_mutex);
-}
-
-void
-xs_resume(void)
-{
-	struct xenbus_watch *watch;
-	char token[sizeof(watch) * 2 + 1];
-
-	sx_xunlock(&xs_state.request_mutex);
-
-	/* No need for watches_lock: the suspend_mutex is sufficient. */
-	LIST_FOREACH(watch, &watches, list) {
-		sprintf(token, "%lX", (long)watch);
-		xs_watch(watch->node, token);
-	}
-
-	sx_xunlock(&xs_state.suspend_mutex);
-}
-
-static void
-xenwatch_thread(void *unused)
-{
-	struct xs_stored_msg *msg;
-
-	for (;;) {
-
-		mtx_lock(&watch_events_lock);
-		while (TAILQ_EMPTY(&watch_events))
-			mtx_sleep(&watch_events_waitq,
-			    &watch_events_lock,
-			    PWAIT | PCATCH, "waitev", hz/10);
-
-		mtx_unlock(&watch_events_lock);
-		sx_xlock(&xenwatch_mutex);
-
-		mtx_lock(&watch_events_lock);
-		msg = TAILQ_FIRST(&watch_events);
-		if (msg)
-			TAILQ_REMOVE(&watch_events, msg, list);
-		mtx_unlock(&watch_events_lock);
-
-		if (msg != NULL) {
-			/*
-			 * XXX There are messages coming in with a NULL callback.
-			 * XXX This deserves further investigation; the workaround
-			 * XXX here simply prevents the kernel from panic'ing
-			 * XXX on startup.
-			 */
-			if (msg->u.watch.handle->callback != NULL)
-				msg->u.watch.handle->callback(
-					msg->u.watch.handle,
-					(const char **)msg->u.watch.vec,
-					msg->u.watch.vec_size);
-			free(msg->u.watch.vec, M_DEVBUF);
-			free(msg, M_DEVBUF);
-		}
-
-		sx_xunlock(&xenwatch_mutex);
-	}
-}
-
-static int
-xs_process_msg(enum xsd_sockmsg_type *type)
-{
-	struct xs_stored_msg *msg;
-	char *body;
-	int error;
-		
-	msg = malloc(sizeof(*msg), M_DEVBUF, M_WAITOK);
-	mtx_lock(&xs_state.reply_lock);
-	error = xb_read(&msg->hdr, sizeof(msg->hdr),
-	    &xs_state.reply_lock.lock_object);
-	mtx_unlock(&xs_state.reply_lock);
-	if (error) {
-		free(msg, M_DEVBUF);
-		return (error);
-	}
-
-	body = malloc(msg->hdr.len + 1, M_DEVBUF, M_WAITOK);
-	mtx_lock(&xs_state.reply_lock);
-	error = xb_read(body, msg->hdr.len,
-	    &xs_state.reply_lock.lock_object); 
-	mtx_unlock(&xs_state.reply_lock);
-	if (error) {
-		free(body, M_DEVBUF);
-		free(msg, M_DEVBUF);
-		return (error);
-	}
-	body[msg->hdr.len] = '\0';
-
-	*type = msg->hdr.type;
-	if (msg->hdr.type == XS_WATCH_EVENT) {
-		msg->u.watch.vec = split(body, msg->hdr.len,
-		    &msg->u.watch.vec_size);
-				
-		mtx_lock(&watches_lock);
-		msg->u.watch.handle = find_watch(
-			msg->u.watch.vec[XS_WATCH_TOKEN]);
-		if (msg->u.watch.handle != NULL) {
-			mtx_lock(&watch_events_lock);
-			TAILQ_INSERT_TAIL(&watch_events, msg, list);
-			wakeup(&watch_events_waitq);
-			mtx_unlock(&watch_events_lock);
-		} else {
-			free(msg->u.watch.vec, M_DEVBUF);
-			free(msg, M_DEVBUF);
-		}
-		mtx_unlock(&watches_lock);
-	} else {
-		msg->u.reply.body = body;
-		mtx_lock(&xs_state.reply_lock);
-		TAILQ_INSERT_TAIL(&xs_state.reply_list, msg, list);
-		wakeup(&xs_state.reply_waitq);
-		mtx_unlock(&xs_state.reply_lock);
-	}
-		
-	return 0;
-}
-
-static void
-xenbus_thread(void *unused)
-{
-	int error;
-	enum xsd_sockmsg_type type;
-	xenbus_running = 1;
-
-	for (;;) {
-		error = xs_process_msg(&type);
-		if (error) 
-			printf("XENBUS error %d while reading message\n",
-			    error);
-	}
-}
-
-#ifdef XENHVM
-static unsigned long xen_store_mfn;
-char *xen_store;
-
-static inline unsigned long
-hvm_get_parameter(int index)
-{
-	struct xen_hvm_param xhv;
-	int error;
-	
-	xhv.domid = DOMID_SELF;
-	xhv.index = index;
-	error = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
-	if (error) {
-		printf("hvm_get_parameter: failed to get %d, error %d\n",
-		    index, error);
-		return (0);
-	}
-	return (xhv.value);
-}
-
-#endif
-
-int
-xs_init(void)
-{
-	int error;
-	struct proc *p;
-
-#ifdef XENHVM
-	xen_store_evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
-	xen_store_mfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
-	xen_store = pmap_mapdev(xen_store_mfn * PAGE_SIZE, PAGE_SIZE);
-#else
-	xen_store_evtchn = xen_start_info->store_evtchn;
-#endif
-
-	TAILQ_INIT(&xs_state.reply_list);
-	TAILQ_INIT(&watch_events);
-	sx_init(&xenwatch_mutex, "xenwatch");
-
-		
-	mtx_init(&xs_state.reply_lock, "state reply", NULL, MTX_DEF);
-	sx_init(&xs_state.request_mutex, "xenstore request");
-	sx_init(&xs_state.suspend_mutex, "xenstore suspend");
-
-		
-#if 0
-	mtx_init(&xs_state.suspend_mutex, "xenstore suspend", NULL, MTX_DEF);
-	sema_init(&xs_state.request_mutex, 1, "xenstore request");
-	sema_init(&xenwatch_mutex, 1, "xenwatch");
-#endif
-	mtx_init(&watches_lock, "watches", NULL, MTX_DEF);
-	mtx_init(&watch_events_lock, "watch events", NULL, MTX_DEF);
-   
-	/* Initialize the shared memory rings to talk to xenstored */
-	error = xb_init_comms();
-	if (error)
-		return (error);
-
-	xenwatch_running = 1;
-	error = kproc_create(xenwatch_thread, NULL, &p,
-	    RFHIGHPID, 0, "xenwatch");
-	if (error)
-		return (error);
-	xenwatch_pid = p->p_pid;
-
-	error = kproc_create(xenbus_thread, NULL, NULL, 
-	    RFHIGHPID, 0, "xenbus");
-	
-	return (error);
-}
diff --git a/sys/xen/xenbus/xenbusb.c b/sys/xen/xenbus/xenbusb.c
new file mode 100644
index 00000000000..49facb6ddcc
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb.c
@@ -0,0 +1,878 @@
+/******************************************************************************
+ * Copyright (C) 2010 Spectra Logic Corporation
+ * Copyright (C) 2008 Doug Rabson
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005 XenSource Ltd
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * \file xenbusb.c
+ *
+ * \brief Shared support functions for managing the NewBus busses that contain
+ *        Xen front and back end device instances.
+ *
+ * The NewBus implementation of XenBus attaches a xenbusb_front and xenbusb_back
+ * child bus to the xenstore device.  This strategy allows the small differences
+ * in the handling of XenBus operations for front and back devices to be handled
+ * as overrides in xenbusb_front/back.c.  Front and back specific device
+ * classes are also provided so device drivers can register for the devices they
+ * can handle without the need to filter within their probe routines.  The
+ * net result is a device hierarchy that might look like this:
+ *
+ * xenstore0/
+ *           xenbusb_front0/
+ *                         xn0
+ *                         xbd0
+ *                         xbd1
+ *           xenbusb_back0/
+ *                        xbbd0
+ *                        xnb0
+ *                        xnb1
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/gnttab.h>
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xenbus/xenbusb.h>
+#include <xen/xenbus/xenbusvar.h>
+
+/*------------------------- Private Functions --------------------------------*/
+/**
+ * \brief Deallocate XenBus device instance variables.
+ *
+ * \param ivars  The instance variable block to free.
+ */
+static void
+xenbusb_free_child_ivars(struct xenbus_device_ivars *ivars)
+{
+	if (ivars->xd_otherend_watch.node != NULL) {
+		xs_unregister_watch(&ivars->xd_otherend_watch);
+		free(ivars->xd_otherend_watch.node, M_XENBUS);
+		ivars->xd_otherend_watch.node = NULL;
+	}
+
+	if (ivars->xd_node != NULL) {
+		free(ivars->xd_node, M_XENBUS);
+		ivars->xd_node = NULL;
+	}
+
+	if (ivars->xd_type != NULL) {
+		free(ivars->xd_type, M_XENBUS);
+		ivars->xd_type = NULL;
+	}
+
+	if (ivars->xd_otherend_path != NULL) {
+		free(ivars->xd_otherend_path, M_XENBUS);
+		ivars->xd_otherend_path = NULL;
+	}
+
+	free(ivars, M_XENBUS);
+}
+
+/**
+ * XenBus watch callback registered against the "state" XenStore
+ * node of the other-end of a split device connection.
+ *
+ * This callback is invoked whenever the state of a device instance's
+ * peer changes.
+ *
+ * \param watch      The xs_watch object used to register this callback
+ *                   function.
+ * \param vec        An array of pointers to NUL terminated strings containing
+ *                   watch event data.  The vector should be indexed via the
+ *                   xs_watch_type enum in xs_wire.h.
+ * \param vec_size   The number of elements in vec.
+ *
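+ * \note The watch is the first member of the device's instance
+ *       variables, so they are recovered by casting \a watch.  Events
+ *       whose path does not begin with the cached otherend path are
+ *       ignored; otherwise the peer's state is read and dispatched.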
+ */
+static void
+xenbusb_otherend_changed(struct xs_watch *watch, const char **vec,
+    unsigned int vec_size __unused)
+{
+	struct xenbus_device_ivars *ivars;
+	device_t dev;
+	enum xenbus_state newstate;
+
+	ivars = (struct xenbus_device_ivars *) watch;
+	dev = ivars->xd_dev;
+
+	if (!ivars->xd_otherend_path
+	 || strncmp(ivars->xd_otherend_path, vec[XS_WATCH_PATH],
+		    strlen(ivars->xd_otherend_path)))
+		return;
+
+	newstate = xenbus_read_driver_state(ivars->xd_otherend_path);
+	XENBUS_OTHEREND_CHANGED(dev, newstate);
+}
+
+/**
+ * Search our internal record of configured devices (not the XenStore)
+ * to determine if the XenBus device indicated by \a node is known to
+ * the system.
+ *
+ * \param dev   The XenBus bus instance to search for device children.
+ * \param node  The XenStore node path for the device to find.
+ *
+ * \return  The device_t of the found device if any, or NULL.
+ *
+ * \note device_t is a pointer type, so it can be compared against
+ *       NULL for validity. 
+ */
+static device_t
+xenbusb_device_exists(device_t dev, const char *node)
+{
+	device_t *kids;
+	device_t result;
+	struct xenbus_device_ivars *ivars;
+	int i, count;
+
+	if (device_get_children(dev, &kids, &count))
+		return (NULL);	/* Child list unavailable: report not found. */
+
+	result = NULL;
+	for (i = 0; i < count; i++) {
+		ivars = device_get_ivars(kids[i]);
+		if (!strcmp(ivars->xd_node, node)) {
+			result = kids[i];
+			break;
+		}
+	}
+	free(kids, M_TEMP);	/* List obtained from device_get_children(). */
+
+	return (result);
+}
+
+static void
+xenbusb_delete_child(device_t dev, device_t child)
+{
+	struct xenbus_device_ivars *ivars;
+
+	ivars = device_get_ivars(child);
+
+	/*
+	 * We no longer care about the otherend of the
+	 * connection.  Cancel the watch now so that we
+	 * don't try to handle an event for a partially
+	 * detached child.
+	 */
+	if (ivars->xd_otherend_watch.node != NULL)
+		xs_unregister_watch(&ivars->xd_otherend_watch);
+	
+	device_delete_child(dev, child);
+	xenbusb_free_child_ivars(ivars);
+}
+
+/**
+ * \param dev    The NewBus device representing this XenBus bus.
+ * \param child	 The NewBus device representing a child of dev%'s XenBus bus.
+ */
+static void
+xenbusb_verify_device(device_t dev, device_t child)
+{
+	if (xs_exists(XST_NIL, xenbus_get_node(child), "") == 0) {
+
+		/*
+		 * Device tree has been removed from Xenbus.
+		 * Tear down the device.
+		 */
+		xenbusb_delete_child(dev, child);
+	}
+}
+
+/**
+ * \brief Enumerate the devices on a XenBus bus and register them with
+ *        the NewBus device tree.
+ *
+ * xenbusb_enumerate_bus() will create entries (in state DS_NOTPRESENT)
+ * for nodes that appear in the XenStore, but will not invoke probe/attach
+ * operations on drivers.  Probe/Attach processing must be separately
+ * performed via an invocation of xenbusb_probe_children().  This is usually
+ * done via the xbs_probe_children task.
+ *
+ * \param xbs  XenBus Bus device softc of the owner of the bus to enumerate.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+static int
+xenbusb_enumerate_bus(struct xenbusb_softc *xbs)
+{
+	const char **types;
+	u_int type_idx;
+	u_int type_count;
+	int error;
+
+	error = xs_directory(XST_NIL, xbs->xbs_node, "", &type_count, &types);
+	if (error)
+		return (error);
+
+	for (type_idx = 0; type_idx < type_count; type_idx++)
+		XENBUSB_ENUMERATE_TYPE(xbs->xbs_dev, types[type_idx]);
+
+	free(types, M_XENSTORE);
+
+	return (0);
+}
+
+/**
+ * Handler for all generic XenBus device sysctl nodes.
+ */
+static int
+xenbusb_device_sysctl_handler(SYSCTL_HANDLER_ARGS)  
+{
+	device_t dev;
+        const char *value;
+
+	dev = (device_t)arg1;
+        switch (arg2) {
+	case XENBUS_IVAR_NODE:
+		value = xenbus_get_node(dev);
+		break;
+	case XENBUS_IVAR_TYPE:
+		value = xenbus_get_type(dev);
+		break;
+	case XENBUS_IVAR_STATE:
+		value = xenbus_strstate(xenbus_get_state(dev));
+		break;
+	case XENBUS_IVAR_OTHEREND_ID:
+		return (sysctl_handle_int(oidp, NULL,
+					  xenbus_get_otherend_id(dev),
+					  req));
+		/* NOTREACHED */
+	case XENBUS_IVAR_OTHEREND_PATH:
+		value = xenbus_get_otherend_path(dev);
+                break;
+	default:
+		return (EINVAL);
+	}
+	return (SYSCTL_OUT(req, value, strlen(value)));
+}
+
+/**
+ * Create read-only sysctl nodes for xenbusb device ivar data.
+ *
+ * \param dev  The XenBus device instance to register with sysctl.
+ */
+static void
+xenbusb_device_sysctl_init(device_t dev)
+{
+	struct sysctl_ctx_list *ctx;
+	struct sysctl_oid      *tree;
+
+	ctx  = device_get_sysctl_ctx(dev);
+	tree = device_get_sysctl_tree(dev);
+
+        SYSCTL_ADD_PROC(ctx,
+			SYSCTL_CHILDREN(tree),
+			OID_AUTO,
+			"xenstore_path",
+			CTLFLAG_RD,
+			dev,
+			XENBUS_IVAR_NODE,
+			xenbusb_device_sysctl_handler,
+			"A",
+			"XenStore path to device");
+
+        SYSCTL_ADD_PROC(ctx,
+			SYSCTL_CHILDREN(tree),
+			OID_AUTO,
+			"xenbus_dev_type",
+			CTLFLAG_RD,
+			dev,
+			XENBUS_IVAR_TYPE,
+			xenbusb_device_sysctl_handler,
+			"A",
+			"XenBus device type");
+
+        SYSCTL_ADD_PROC(ctx,
+			SYSCTL_CHILDREN(tree),
+			OID_AUTO,
+			"xenbus_connection_state",
+			CTLFLAG_RD,
+			dev,
+			XENBUS_IVAR_STATE,
+			xenbusb_device_sysctl_handler,
+			"A",
+			"XenBus state of peer connection");
+
+        SYSCTL_ADD_PROC(ctx,
+			SYSCTL_CHILDREN(tree),
+			OID_AUTO,
+			"xenbus_peer_domid",
+			CTLFLAG_RD,
+			dev,
+			XENBUS_IVAR_OTHEREND_ID,
+			xenbusb_device_sysctl_handler,
+			"I",
+			"Xen domain ID of peer");
+
+        SYSCTL_ADD_PROC(ctx,
+			SYSCTL_CHILDREN(tree),
+			OID_AUTO,
+			"xenstore_peer_path",
+			CTLFLAG_RD,
+			dev,
+			XENBUS_IVAR_OTHEREND_PATH,
+			xenbusb_device_sysctl_handler,
+			"A",
+			"XenStore path to peer device");
+}
+
+/**
+ * \brief Verify the existence of attached device instances and perform
+ *        probe/attach processing for newly arrived devices.
+ *
+ * \param dev  The NewBus device representing this XenBus bus.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+static int
+xenbusb_probe_children(device_t dev)
+{
+	device_t *kids;
+	struct xenbus_device_ivars *ivars;
+	int i, count;
+
+	if (device_get_children(dev, &kids, &count) == 0) {
+		for (i = 0; i < count; i++) {
+			if (device_get_state(kids[i]) != DS_NOTPRESENT) {
+				/*
+				 * We already know about this one.
+				 * Make sure it's still here.
+				 */
+				xenbusb_verify_device(dev, kids[i]);
+				continue;
+			}
+
+			if (device_probe_and_attach(kids[i])) {
+				/*
+				 * Transition device to the closed state
+				 * so the world knows that attachment will
+				 * not occur.
+				 */
+				xenbus_set_state(kids[i], XenbusStateClosed);
+
+				/*
+				 * Remove our record of this device.
+				 * So long as it remains in the closed
+				 * state in the XenStore, we will not find
+				 * it again.  The state will only change
+				 * if the control domain actively reconfigures
+				 * this device.
+				 */
+				xenbusb_delete_child(dev, kids[i]);
+
+				continue;
+			}
+			/*
+			 * Augment default newbus provided dynamic sysctl
+			 * variables with the standard ivar contents of
+			 * XenBus devices.
+			 */
+			xenbusb_device_sysctl_init(kids[i]);
+
+			/*
+			 * Now that we have a driver managing this device
+			 * that can receive otherend state change events,
+			 * hook up a watch for them.
+			 */
+			ivars = device_get_ivars(kids[i]);
+			xs_register_watch(&ivars->xd_otherend_watch);
+		}
+		free(kids, M_TEMP);
+	}
+
+	return (0);
+}
+
+/**
+ * \brief Task callback function to perform XenBus probe operations
+ *        from a known safe context.
+ *
+ * \param arg      The NewBus device_t representing the bus instance
+ *                 on which to perform probe processing.
+ * \param pending  The number of times this task was queued before it could
+ *                 be run.
+ */
+static void
+xenbusb_probe_children_cb(void *arg, int pending __unused)
+{
+	device_t dev = (device_t)arg;
+
+	/*
+	 * Hold Giant until the Giant free newbus changes are committed.
+	 */
+	mtx_lock(&Giant);
+	xenbusb_probe_children(dev);
+	mtx_unlock(&Giant);
+}
+
+/**
+ * \brief XenStore watch callback for the root node of the XenStore
+ *        subtree representing a XenBus.
+ *
+ * This callback performs, or delegates to the xbs_probe_children task,
+ * all processing necessary to handle dynamic device arrival and departure
+ * events from a XenBus.
+ *
+ * \param watch  The XenStore watch object associated with this callback.
+ * \param vec    The XenStore watch event data.
+ * \param len	 The number of fields in the event data stream.
+ */
+static void
+xenbusb_devices_changed(struct xs_watch *watch, const char **vec,
+			unsigned int len)
+{
+	struct xenbusb_softc *xbs;
+	device_t dev;
+	char *node;
+	char *bus;
+	char *type;
+	char *id;
+	char *p;
+	u_int component;
+
+	xbs = (struct xenbusb_softc *)watch;
+	dev = xbs->xbs_dev;
+
+	if (len <= XS_WATCH_PATH) {
+		device_printf(dev, "xenbusb_devices_changed: "
+			      "Short Event Data.\n");
+		return;
+	}
+
+	node = strdup(vec[XS_WATCH_PATH], M_XENBUS);
+	p = strchr(node, '/');
+	if (p == NULL)
+		goto out;
+	bus = node;
+	*p = 0;
+	type = p + 1;
+
+	p = strchr(type, '/');
+	if (p == NULL)
+		goto out;
+	*p++ = 0;
+
+	/*
+	 * Extract the device ID.  A device ID has one or more path
+	 * components separated by the '/' character.
+	 *
+	 * e.g. "<frontend vm id>/<frontend dev id>" for backend devices.
+	 */
+	id = p;
+	for (component = 0; component < xbs->xbs_id_components; component++) {
+		p = strchr(p, '/');
+		if (p == NULL)
+			break;
+		p++;
+	}
+	if (p != NULL)
+		*p = 0;
+
+	if (*id != 0 && component >= xbs->xbs_id_components - 1) {
+		xenbusb_add_device(xbs->xbs_dev, type, id);
+		taskqueue_enqueue(taskqueue_thread, &xbs->xbs_probe_children);
+	}
+out:
+	free(node, M_XENBUS);
+}
+
+/**
+ * \brief Interrupt configuration hook callback associated with xbs_attach_ch.
+ *
+ * Since interrupts are always functional at the time of XenBus configuration,
+ * there is nothing to be done when the callback occurs.  This hook is only
+ * registered to hold up boot processing while XenBus devices come online.
+ * 
+ * \param arg  Unused configuration hook callback argument.
+ */
+static void
+xenbusb_nop_confighook_cb(void *arg __unused)
+{
+}
+
+/**
+ * \brief Decrement the number of XenBus child devices in the
+ *        connecting state by one and release the xbs_attach_ch
+ *        interrupt configuration hook if the connecting count
+ *        drops to zero.
+ *
+ * \param xbs  XenBus Bus device softc of the bus whose count is decremented.
+ */
+static void
+xenbusb_release_confighook(struct xenbusb_softc *xbs)
+{
+	mtx_lock(&xbs->xbs_lock);
+	KASSERT(xbs->xbs_connecting_children > 0,
+		("Connecting device count error\n"));
+	xbs->xbs_connecting_children--;
+	if (xbs->xbs_connecting_children == 0
+	 && (xbs->xbs_flags & XBS_ATTACH_CH_ACTIVE) != 0) {
+		xbs->xbs_flags &= ~XBS_ATTACH_CH_ACTIVE;
+		mtx_unlock(&xbs->xbs_lock);
+		config_intrhook_disestablish(&xbs->xbs_attach_ch);
+	} else {
+		mtx_unlock(&xbs->xbs_lock);
+	}
+}
+
+/*--------------------------- Public Functions -------------------------------*/
+/*--------- API comments for these methods can be found in xenbusb.h ---------*/
+void
+xenbusb_identify(driver_t *driver, device_t parent)
+{
+	/*
+	 * A single instance of each bus type for which we have a driver
+	 * is always present under Xen, so add a child named after \a driver.
+	 */
+	BUS_ADD_CHILD(parent, 0, driver->name, 0);
+}
+
+int
+xenbusb_add_device(device_t dev, const char *type, const char *id)
+{
+	struct xenbusb_softc *xbs;
+	struct sbuf *devpath_sbuf;
+	char *devpath;
+	struct xenbus_device_ivars *ivars;
+	int error;
+
+	xbs = device_get_softc(dev);
+	devpath_sbuf = sbuf_new_auto();
+	sbuf_printf(devpath_sbuf, "%s/%s/%s", xbs->xbs_node, type, id);
+	sbuf_finish(devpath_sbuf);
+	devpath = sbuf_data(devpath_sbuf);
+
+	ivars = malloc(sizeof(*ivars), M_XENBUS, M_ZERO|M_WAITOK);
+	error = ENXIO;
+
+	if (xs_exists(XST_NIL, devpath, "") != 0) {
+		device_t child;
+		enum xenbus_state state;
+		char *statepath;
+
+		child = xenbusb_device_exists(dev, devpath);
+		if (child != NULL) {
+			/* Already tracked; unused ivars are freed below. */
+			error = 0;
+			goto out;
+		}
+
+		state = xenbus_read_driver_state(devpath);
+		if (state != XenbusStateInitialising) {
+			/*
+			 * Device is not new, so ignore it. This can
+			 * happen if a device is going away after
+			 * switching to Closed.
+			 */
+			printf("xenbusb_add_device: Device %s ignored. "
+			       "State %d\n", devpath, state);
+			error = 0;
+			goto out;
+		}
+
+		sx_init(&ivars->xd_lock, "xdlock");
+		ivars->xd_flags = XDF_CONNECTING;
+		ivars->xd_node = strdup(devpath, M_XENBUS);
+		ivars->xd_type = strdup(type, M_XENBUS);
+		ivars->xd_state = XenbusStateInitialising;
+
+		error = XENBUSB_GET_OTHEREND_NODE(dev, ivars);
+		if (error) {
+			printf("xenbusb_add_device: %s no otherend id\n",
+			    devpath);
+			goto out;
+		}
+
+		statepath = malloc(strlen(ivars->xd_otherend_path)
+		    + strlen("/state") + 1, M_XENBUS, M_WAITOK);
+		sprintf(statepath, "%s/state", ivars->xd_otherend_path);
+
+		ivars->xd_otherend_watch.node = statepath;
+		ivars->xd_otherend_watch.callback = xenbusb_otherend_changed;
+
+		mtx_lock(&xbs->xbs_lock);
+		xbs->xbs_connecting_children++;
+		mtx_unlock(&xbs->xbs_lock);
+
+		child = device_add_child(dev, NULL, -1);
+		ivars->xd_dev = child;
+		device_set_ivars(child, ivars);
+		ivars = NULL;	/* Ownership has passed to the child. */
+	}
+
+out:
+	sbuf_delete(devpath_sbuf);
+	/* Release ivars that were never handed to a child device. */
+	if (ivars != NULL)
+		xenbusb_free_child_ivars(ivars);
+
+	return (error);
+}
+
+int
+xenbusb_attach(device_t dev, char *bus_node, u_int id_components)
+{
+	struct xenbusb_softc *xbs;
+
+	xbs = device_get_softc(dev);
+	mtx_init(&xbs->xbs_lock, "xenbusb softc lock", NULL, MTX_DEF);
+	xbs->xbs_node = bus_node;
+	xbs->xbs_id_components = id_components;
+	xbs->xbs_dev = dev;
+
+	/*
+	 * Since XenBus busses are attached to the XenStore, and
+	 * the XenStore does not probe children until after interrupt
+	 * services are available, this config hook is used solely
+	 * to ensure that the remainder of the boot process (e.g.
+	 * mount root) is deferred until child devices are adequately
+	 * probed.  We unblock the boot process as soon as the
+	 * connecting child count in our softc goes to 0.
+	 */
+	xbs->xbs_attach_ch.ich_func = xenbusb_nop_confighook_cb;
+	xbs->xbs_attach_ch.ich_arg = dev;
+	config_intrhook_establish(&xbs->xbs_attach_ch);
+	xbs->xbs_flags |= XBS_ATTACH_CH_ACTIVE;
+	xbs->xbs_connecting_children = 1;
+
+	/*
+	 * The subtree for this bus type may not yet exist
+	 * causing initial enumeration to fail.  We still
+	 * want to return success from our attach though
+	 * so that we are ready to handle devices for this
+	 * bus when they are dynamically attached to us
+	 * by a Xen management action.
+	 */
+	(void)xenbusb_enumerate_bus(xbs);
+	xenbusb_probe_children(dev);
+
+	xbs->xbs_device_watch.node = bus_node;
+	xbs->xbs_device_watch.callback = xenbusb_devices_changed;
+
+	TASK_INIT(&xbs->xbs_probe_children, 0, xenbusb_probe_children_cb, dev);
+
+	xs_register_watch(&xbs->xbs_device_watch);
+
+	xenbusb_release_confighook(xbs);
+
+	return (0);
+}
+
+int
+xenbusb_resume(device_t dev)
+{
+	device_t *kids;
+	struct xenbus_device_ivars *ivars;
+	int i, count, error;
+	char *statepath;
+
+	/*
+	 * Re-examine each device and find the new path for its backend.
+	 */
+	error = 0;
+	if (device_get_children(dev, &kids, &count) == 0) {
+		for (i = 0; i < count; i++) {
+			if (device_get_state(kids[i]) == DS_NOTPRESENT)
+				continue;
+
+			ivars = device_get_ivars(kids[i]);
+
+			xs_unregister_watch(&ivars->xd_otherend_watch);
+			ivars->xd_state = XenbusStateInitialising;
+
+			/*
+			 * Find the new backend details and
+			 * re-register our watch.
+			 */
+			error = XENBUSB_GET_OTHEREND_NODE(dev, ivars);
+			if (error)
+				break;	/* Stop, but still free kids below. */
+
+			DEVICE_RESUME(kids[i]);
+
+			statepath = malloc(strlen(ivars->xd_otherend_path)
+			    + strlen("/state") + 1, M_XENBUS, M_WAITOK);
+			sprintf(statepath, "%s/state", ivars->xd_otherend_path);
+
+			free(ivars->xd_otherend_watch.node, M_XENBUS);
+			ivars->xd_otherend_watch.node = statepath;
+			xs_register_watch(&ivars->xd_otherend_watch);
+
+#if 0
+			/*
+			 * Can't do this yet since we are running in
+			 * the xenwatch thread and if we sleep here,
+			 * we will stop delivering watch notifications
+			 * and the device will never come back online.
+			 */
+			sx_xlock(&ivars->xd_lock);
+			while (ivars->xd_state != XenbusStateClosed
+			    && ivars->xd_state != XenbusStateConnected)
+				sx_sleep(&ivars->xd_state, &ivars->xd_lock,
+				    0, "xdresume", 0);
+			sx_xunlock(&ivars->xd_lock);
+#endif
+		}
+		free(kids, M_TEMP);
+	}
+
+	return (error);
+}
+
+int
+xenbusb_print_child(device_t dev, device_t child)
+{
+	struct xenbus_device_ivars *ivars = device_get_ivars(child);
+	int	retval = 0;
+
+	retval += bus_print_child_header(dev, child);
+	retval += printf(" at %s", ivars->xd_node);
+	retval += bus_print_child_footer(dev, child);
+
+	return (retval);
+}
+
+int
+xenbusb_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
+{
+	struct xenbus_device_ivars *ivars = device_get_ivars(child);
+
+	switch (index) {
+	case XENBUS_IVAR_NODE:
+		*result = (uintptr_t) ivars->xd_node;
+		return (0);
+
+	case XENBUS_IVAR_TYPE:
+		*result = (uintptr_t) ivars->xd_type;
+		return (0);
+
+	case XENBUS_IVAR_STATE:
+		*result = (uintptr_t) ivars->xd_state;
+		return (0);
+
+	case XENBUS_IVAR_OTHEREND_ID:
+		*result = (uintptr_t) ivars->xd_otherend_id;
+		return (0);
+
+	case XENBUS_IVAR_OTHEREND_PATH:
+		*result = (uintptr_t) ivars->xd_otherend_path;
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+int
+xenbusb_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
+{
+	struct xenbus_device_ivars *ivars = device_get_ivars(child);
+	enum xenbus_state newstate;
+	int currstate;
+
+	switch (index) {
+	case XENBUS_IVAR_STATE:
+	{
+		int error;
+
+		newstate = (enum xenbus_state) value;
+		sx_xlock(&ivars->xd_lock);
+		if (ivars->xd_state == newstate) {
+			error = 0;
+			goto out;
+		}
+
+		error = xs_scanf(XST_NIL, ivars->xd_node, "state",
+		    NULL, "%d", &currstate);
+		if (error)
+			goto out;
+
+		do {
+			error = xs_printf(XST_NIL, ivars->xd_node, "state",
+			    "%d", newstate);
+		} while (error == EAGAIN);
+		if (error) {
+			/*
+			 * Avoid looping through xenbus_dev_fatal()
+			 * which calls xenbus_write_ivar to set the
+			 * state to closing.
+			 */
+			if (newstate != XenbusStateClosing)
+				xenbus_dev_fatal(dev, error,
+						 "writing new state");
+			goto out;
+		}
+		ivars->xd_state = newstate;
+
+		if ((ivars->xd_flags & XDF_CONNECTING) != 0
+		 && (newstate == XenbusStateClosed
+		  || newstate == XenbusStateConnected)) {
+			struct xenbusb_softc *xbs;
+
+			ivars->xd_flags &= ~XDF_CONNECTING;
+			xbs = device_get_softc(dev);
+			xenbusb_release_confighook(xbs);
+		}
+
+		wakeup(&ivars->xd_state);
+	out:
+		sx_xunlock(&ivars->xd_lock);
+		return (error);
+	}
+
+	case XENBUS_IVAR_NODE:
+	case XENBUS_IVAR_TYPE:
+	case XENBUS_IVAR_OTHEREND_ID:
+	case XENBUS_IVAR_OTHEREND_PATH:
+		/*
+		 * These variables are read-only.
+		 */
+		return (EINVAL);
+	}
+
+	return (ENOENT);
+}
diff --git a/sys/xen/xenbus/xenbusb.h b/sys/xen/xenbus/xenbusb.h
new file mode 100644
index 00000000000..75abb983c3a
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb.h
@@ -0,0 +1,272 @@
+/*-
+ * Core definitions and data structures shareable across OS platforms.
+ *
+ * Copyright (c) 2010 Spectra Logic Corporation
+ * Copyright (C) 2008 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD$
+ */
+#ifndef _XEN_XENBUS_XENBUSB_H
+#define _XEN_XENBUS_XENBUSB_H
+
+/**
+ * \file xenbusb.h
+ *
+ * Data structures and function declarations for use in implementing
+ * bus attachments (e.g. frontend and backend device busses) for XenBus.
+ */
+#include "xenbusb_if.h"
+
+/**
+ * Enumeration of state flag values for the xbs_flags field of
+ * the xenbusb_softc structure.
+ */
+typedef enum {
+	/** Set while the xbs_attach_ch config hook is established. */
+	XBS_ATTACH_CH_ACTIVE = 0x01
+} xenbusb_softc_flag;
+
+/**
+ * \brief Container for all state needed to manage a Xenbus Bus
+ *	  attachment.
+ */
+struct xenbusb_softc {
+	/**
+	 * XenStore watch used to monitor the subtree of the
+	 * XenStore where devices for this bus attachment arrive	
+	 * and depart.
+	 *
+	 * \note This field must be the first in the softc structure
+	 *       so that a simple cast can be used to retrieve the
+	 *	 softc from within a XenStore watch event callback.
+	 */
+	struct xs_watch	        xbs_device_watch;
+
+	/** Mutex used to protect fields of the xenbusb_softc. */
+	struct mtx		xbs_lock;
+
+	/** State flags. */
+	xenbusb_softc_flag	xbs_flags;
+
+	/**
+	 * A dedicated task for processing child arrival and
+	 * departure events.
+	 */
+	struct task		xbs_probe_children;
+
+	/**
+	 * Config Hook used to block boot processing until
+	 * XenBus devices complete their connection processing
+	 * with other VMs.
+	 */
+	struct intr_config_hook xbs_attach_ch;
+
+	/**
+	 * The number of children for this bus that are still
+	 * in the connecting (to other VMs) state.  This variable
+	 * is used to determine when to release xbs_attach_ch.
+	 */
+	u_int			xbs_connecting_children;
+
+	/** The NewBus device_t for this bus attachment. */
+	device_t		xbs_dev;
+
+	/**
+	 * The VM relative path to the XenStore subtree this
+	 * bus attachment manages.
+	 */
+	const char	       *xbs_node;
+
+	/**
+	 * The number of path components (strings separated by the '/'
+	 * character) that make up the device ID on this bus.
+	 */
+	u_int			xbs_id_components;	
+};
+
+/**
+ * Enumeration of state flag values for the xd_flags field of
+ * the xenbus_device_ivars structure.
+ */
+typedef enum {
+
+	/**
+	 * This device is contributing to the xbs_connecting_children
+	 * count of its parent bus.
+	 */
+	XDF_CONNECTING = 0x01
+} xenbus_dev_flag;
+
+/** Instance variables for devices on a XenBus bus. */
+struct xenbus_device_ivars {
+	/**
+	 * XenStore watch used to monitor the "state" node in the
+	 * XenStore subtree holding information about the otherend of
+	 * the split Xen device this device instance represents.
+	 *
+	 * \note This field must be the first in the instance
+	 *	 variable structure so that a simple cast can be
+	 *	 used to retrieve ivar data from within a XenStore
+	 *	 watch event callback.
+	 */
+	struct xs_watch		xd_otherend_watch;
+
+	/** Sleepable lock used to protect instance data. */
+	struct sx		xd_lock;
+
+	/** State flags. */
+	xenbus_dev_flag		xd_flags;
+
+	/** The NewBus device_t for this XenBus device instance. */
+	device_t		xd_dev;
+
+	/**
+	 * The VM relative path to the XenStore subtree representing
+	 * this VMs half of this device.
+	 */
+	char		       *xd_node;
+
+	/** XenBus device type ("vbd", "vif", etc.). */
+	char		       *xd_type;
+
+	/**
+	 * Cached version of <xd_node>/state node in the XenStore.
+	 */
+	enum xenbus_state	xd_state;
+
+	/** The VM identifier of the other end of this split device. */
+	int			xd_otherend_id;
+
+	/**
+	 * The path to the subtree of the XenStore where information
+	 * about the otherend of this split device instance is kept.
+	 */
+	char		       *xd_otherend_path;
+};
+
+/**
+ * \brief Identify instances of this device type in the system.
+ *
+ * \param driver  The driver performing this identify action.
+ * \param parent  The NewBus parent device for any devices this method adds.
+ */
+void xenbusb_identify(driver_t *driver __unused, device_t parent);
+
+/**
+ * \brief Perform common XenBus bus attach processing.
+ *
+ * \param dev            The NewBus device representing this XenBus bus.
+ * \param bus_node       The XenStore path to the XenStore subtree for
+ *                       this XenBus bus.
+ * \param id_components  The number of '/' separated path components that
+ *                       make up a unique device ID on this XenBus bus.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * Initializes the softc for this bus, installs an interrupt driven
+ * configuration hook to block boot processing until XenBus devices fully
+ * configure, performs an initial probe/attach of the bus, and registers
+ * a XenStore watch so we are notified when the bus topology changes.
+ */
+int xenbusb_attach(device_t dev, char *bus_node, u_int id_components);
+
+/**
+ * \brief Perform common XenBus bus resume handling.
+ *
+ * \param dev  The NewBus device representing this XenBus bus.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xenbusb_resume(device_t dev);
+
+/**
+ * \brief Pretty-prints information about a child of a XenBus bus.
+ *
+ * \param dev    The NewBus device representing this XenBus bus.
+ * \param child	 The NewBus device representing a child of dev%'s XenBus bus.
+ *
+ * \return  The number of characters output, summed across the child's
+ *          header, XenStore node location, and footer.
+ */
+int xenbusb_print_child(device_t dev, device_t child);
+
+/**
+ * \brief Common XenBus child instance variable read access method.
+ *
+ * \param dev     The NewBus device representing this XenBus bus.
+ * \param child	  The NewBus device representing a child of dev%'s XenBus bus.
+ * \param index	  The index of the instance variable to access.
+ * \param result  The value of the instance variable accessed.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xenbusb_read_ivar(device_t dev, device_t child, int index,
+		      uintptr_t *result);
+
+/**
+ * \brief Common XenBus child instance variable write access method.
+ *
+ * \param dev    The NewBus device representing this XenBus bus.
+ * \param child	 The NewBus device representing a child of dev%'s XenBus bus.
+ * \param index	 The index of the instance variable to access.
+ * \param value  The new value to set in the instance variable accessed.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xenbusb_write_ivar(device_t dev, device_t child, int index,
+		       uintptr_t value);
+
+/**
+ * \brief Attempt to add a XenBus device instance to this XenBus bus.
+ *
+ * \param dev   The NewBus device representing this XenBus bus.
+ * \param type  The device type being added (e.g. "vbd", "vif").
+ * \param id	The device ID for this device.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.  Failure indicates that either the
+ *          path to this device no longer exists or insufficient
+ *          information exists in the XenStore to create a new
+ *          device.
+ *
+ * If successful, this routine will add a device_t with instance
+ * variable storage to the NewBus device topology.  Probe/Attach
+ * processing is not performed by this routine, but must be scheduled
+ * via the xbs_probe_children task.  This separation of responsibilities
+ * is required to avoid hanging up the XenStore event delivery thread
+ * with our probe/attach work in the event a device is added via
+ * a callback from the XenStore.
+ */
+int xenbusb_add_device(device_t dev, const char *type, const char *id);
+
+#endif /* _XEN_XENBUS_XENBUSB_H */
diff --git a/sys/xen/xenbus/xenbusb_back.c b/sys/xen/xenbus/xenbusb_back.c
new file mode 100644
index 00000000000..32bbc04dbdc
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb_back.c
@@ -0,0 +1,295 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2009, 2010 Spectra Logic Corporation
+ * Copyright (C) 2008 Doug Rabson
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005 XenSource Ltd
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * \file xenbusb_back.c
+ *
+ * XenBus management of the NewBus bus containing the backend instances of
+ * Xen split devices.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/gnttab.h>
+#include <xen/xenbus/xenbusvar.h>
+#include <xen/xenbus/xenbusb.h>
+
+
+/*------------------ Private Device Attachment Functions  --------------------*/
+/**
+ * \brief Probe for the existence of the XenBus back bus.
+ *
+ * \param dev  NewBus device_t for this XenBus back bus instance.
+ *
+ * \return  Always returns 0 indicating success.
+ */
+static int 
+xenbusb_back_probe(device_t dev)
+{
+	device_set_desc(dev, "Xen Backend Devices");
+
+	return (0);
+}
+
+/**
+ * \brief Attach the XenBus back bus.
+ *
+ * \param dev  NewBus device_t for this XenBus back bus instance.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+static int
+xenbusb_back_attach(device_t dev)
+{
+	struct xenbusb_softc *softc;
+	int hook_active;
+	int status;
+
+	softc = device_get_softc(dev);
+	status = xenbusb_attach(dev, "backend", /*id_components*/2);
+
+	/*
+	 * A backend exists to serve other domains, so boot has no
+	 * reason to wait for connections to foreign domains.  If the
+	 * attach config hook is still active, mark it inactive under
+	 * the softc lock and tear it down once the lock is dropped.
+	 */
+	mtx_lock(&softc->xbs_lock);
+	hook_active = (softc->xbs_flags & XBS_ATTACH_CH_ACTIVE) != 0;
+	softc->xbs_flags &= ~XBS_ATTACH_CH_ACTIVE;
+	mtx_unlock(&softc->xbs_lock);
+	if (hook_active)
+		config_intrhook_disestablish(&softc->xbs_attach_ch);
+
+	return (status);
+}
+
+/**
+ * \brief Enumerate all devices of the given type on this bus.
+ *
+ * \param dev   NewBus device_t for this XenBus backend bus instance.
+ * \param type  String indicating the device sub-tree (e.g. "vfb", "vif")
+ *              to enumerate. 
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * Devices that are found are entered into the NewBus hierarchy via
+ * xenbusb_add_device().  xenbusb_add_device() ignores repeated detections
+ * of a device that already exists, so it can be called unconditionally
+ * for any device found in the XenStore.
+ *
+ * The backend XenStore hierarchy has the following format:
+ *
+ *     backend/<device type>/<frontend vm id>/<device id>
+ *
+ */
+static int
+xenbusb_back_enumerate_type(device_t dev, const char *type)
+{
+	struct xenbusb_softc *xbs;
+	const char **vms;
+	u_int vm_idx;
+	u_int vm_count;
+	int error;
+
+	xbs = device_get_softc(dev);
+	error = xs_directory(XST_NIL, xbs->xbs_node, type, &vm_count, &vms);
+	if (error)
+		return (error);
+	for (vm_idx = 0; vm_idx < vm_count; vm_idx++) {
+		struct sbuf *vm_path;
+		const char *vm;
+		const char **devs;
+		u_int dev_idx;
+		u_int dev_count;
+
+		vm = vms[vm_idx];
+		/* List the device IDs found beneath <type>/<vm>. */
+		vm_path = xs_join(type, vm);
+		error = xs_directory(XST_NIL, xbs->xbs_node, sbuf_data(vm_path),
+		    &dev_count, &devs);
+		sbuf_delete(vm_path);
+		if (error)
+			break;
+
+		for (dev_idx = 0; dev_idx < dev_count; dev_idx++) {
+			const char *dev_num;
+			struct sbuf *id;
+
+			dev_num = devs[dev_idx];
+			id = xs_join(vm, dev_num);
+			xenbusb_add_device(dev, type, sbuf_data(id));
+			sbuf_delete(id);
+		}
+		free(devs, M_XENSTORE);
+	}
+
+	free(vms, M_XENSTORE);
+	/* Surface a failed per-VM listing; error is 0 on full success. */
+	return (error);
+}
+
+/**
+ * \brief Determine and store the XenStore path for the other end of
+ *        a split device whose local end is represented by ivars.
+ *
+ * \param dev    NewBus device_t for this XenBus backend bus instance.
+ * \param ivars  Instance variables from the XenBus child device for
+ *               which to perform this function.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * If successful, the xd_otherend_path field of the child's instance
+ * variables will be updated.
+ *
+ */
+static int
+xenbusb_back_get_otherend_node(device_t dev, struct xenbus_device_ivars *ivars)
+{
+	char *frontend_path;
+	int status;
+
+	/* Drop the path cached by any earlier lookup. */
+	if (ivars->xd_otherend_path != NULL) {
+		free(ivars->xd_otherend_path, M_XENBUS);
+		ivars->xd_otherend_path = NULL;
+	}
+	status = xs_gather(XST_NIL, ivars->xd_node,
+	    "frontend-id", "%i", &ivars->xd_otherend_id,
+	    "frontend", NULL, &frontend_path,
+	    NULL);
+	if (status != 0)
+		return (status);
+	/* Keep an M_XENBUS copy; release the M_XENSTORE original. */
+	ivars->xd_otherend_path = strdup(frontend_path, M_XENBUS);
+	free(frontend_path, M_XENSTORE);
+	return (0);
+}
+
+/**
+ * \brief Backend XenBus child instance variable write access method.
+ *
+ * \param dev    The NewBus device representing this XenBus bus.
+ * \param child	 The NewBus device representing a child of dev%'s XenBus bus.
+ * \param index	 The index of the instance variable to access.
+ * \param value  The new value to set in the instance variable accessed.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * Xenbus_back overrides this method so that it can trap state transitions
+ * of local backend devices and clean up their XenStore entries as necessary
+ * during device instance teardown.
+ */
+static int
+xenbusb_back_write_ivar(device_t dev, device_t child, int index,
+			uintptr_t value)
+{
+	int status;
+
+	/* Let the common XenBus code perform the actual update. */
+	status = xenbusb_write_ivar(dev, child, index, value);
+
+	/* Only a transition to Closed by an offline device needs cleanup. */
+	if (index != XENBUS_IVAR_STATE
+	 || (enum xenbus_state)value != XenbusStateClosed)
+		return (status);
+	if (xenbus_dev_is_online(child) != 0)
+		return (status);
+
+	/*
+	 * Remove the device's hotplug entry from the XenStore, if one
+	 * exists.  The control domain treats destruction of this node,
+	 * normally performed by a userland component associated with
+	 * the device, as the signal that teardown is safe.  Not every
+	 * backend has a userland component, and those that do should
+	 * either communicate through a channel other than the XenStore
+	 * or have cleaned up the hotplug data already.
+	 *
+	 * Removing the node here guarantees that the control domain
+	 * sees the back-end as closed regardless of the path taken
+	 * to mark it so.
+	 */
+	xs_rm(XST_NIL, xenbus_get_node(child), "hotplug-status");
+	return (status);
+}
+
+/*-------------------- Private Device Attachment Data  -----------------------*/
+static device_method_t xenbusb_back_methods[] = { 
+	/* Device interface: only probe and attach are backend specific. */ 
+	DEVMETHOD(device_identify,	xenbusb_identify),
+	DEVMETHOD(device_probe,         xenbusb_back_probe), 
+	DEVMETHOD(device_attach,        xenbusb_back_attach), 
+	DEVMETHOD(device_detach,        bus_generic_detach), 
+	DEVMETHOD(device_shutdown,      bus_generic_shutdown), 
+	DEVMETHOD(device_suspend,       bus_generic_suspend), 
+	DEVMETHOD(device_resume,        bus_generic_resume), 
+ 
+	/* Bus interface: write_ivar is overridden to trap state changes. */ 
+	DEVMETHOD(bus_print_child,      xenbusb_print_child),
+	DEVMETHOD(bus_read_ivar,        xenbusb_read_ivar), 
+	DEVMETHOD(bus_write_ivar,       xenbusb_back_write_ivar), 
+	DEVMETHOD(bus_alloc_resource,   bus_generic_alloc_resource),
+	DEVMETHOD(bus_release_resource, bus_generic_release_resource),
+	DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+ 
+	/* XenBus bus interface (xenbusb_if.m), backend implementations. */
+	DEVMETHOD(xenbusb_enumerate_type, xenbusb_back_enumerate_type),
+	DEVMETHOD(xenbusb_get_otherend_node, xenbusb_back_get_otherend_node),
+	{ 0, 0 } /* Terminates the method table. */
+}; 
+
+DEFINE_CLASS_0(xenbusb_back, xenbusb_back_driver, xenbusb_back_methods,
+	       sizeof(struct xenbusb_softc));
+devclass_t xenbusb_back_devclass; 
+/* Register the driver with xenstore named as its parent bus. */
+DRIVER_MODULE(xenbusb_back, xenstore, xenbusb_back_driver,
+	      xenbusb_back_devclass, 0, 0);
diff --git a/sys/xen/xenbus/xenbusb_front.c b/sys/xen/xenbus/xenbusb_front.c
new file mode 100644
index 00000000000..0bc06a4538c
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb_front.c
@@ -0,0 +1,195 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2009, 2010 Spectra Logic Corporation
+ * Copyright (C) 2008 Doug Rabson
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005 XenSource Ltd
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * \file xenbusb_front.c
+ *
+ * XenBus management of the NewBus bus containing the frontend instances of
+ * Xen split devices.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/gnttab.h>
+#include <xen/xenbus/xenbusvar.h>
+#include <xen/xenbus/xenbusb.h>
+
+
+/*------------------ Private Device Attachment Functions  --------------------*/
+/**
+ * \brief Probe for the existence of the XenBus front bus.
+ *
+ * \param dev  NewBus device_t for this XenBus front bus instance.
+ *
+ * \return  Always returns 0 indicating success.
+ */
+static int 
+xenbusb_front_probe(device_t dev)
+{
+	device_set_desc(dev, "Xen Frontend Devices");
+
+	return (0);
+}
+
+/**
+ * \brief Attach the XenBus front bus.
+ *
+ * \param dev  NewBus device_t for this XenBus front bus instance.
+ *
+ * \return  The result of xenbusb_attach(): on success, 0. Otherwise an
+ *          errno value indicating the type of failure.
+ */
+static int
+xenbusb_front_attach(device_t dev)
+{
+	return (xenbusb_attach(dev, "device", /*id_components*/1));
+}
+
+/**
+ * \brief Enumerate all devices of the given type on this bus.
+ *
+ * \param dev   NewBus device_t for this XenBus front bus instance.
+ * \param type  String indicating the device sub-tree (e.g. "vfb", "vif")
+ *              to enumerate. 
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * Devices that are found are entered into the NewBus hierarchy via
+ * xenbusb_add_device().  xenbusb_add_device() ignores repeated detections
+ * of a device that already exists, so it can be called unconditionally
+ * for any device found in the XenStore.
+ */
+static int
+xenbusb_front_enumerate_type(device_t dev, const char *type)
+{
+	struct xenbusb_softc *softc;
+	const char **ids;
+	u_int idx, nids;
+	int status;
+
+	/* Every entry listed under this type is handed on as a device ID. */
+	softc = device_get_softc(dev);
+	status = xs_directory(XST_NIL, softc->xbs_node, type, &nids, &ids);
+	if (status != 0)
+		return (status);
+
+	for (idx = 0; idx < nids; idx++)
+		xenbusb_add_device(dev, type, ids[idx]);
+	free(ids, M_XENSTORE);
+	return (0);
+}
+
+/**
+ * \brief Determine and store the XenStore path for the other end of
+ *        a split device whose local end is represented by ivars.
+ *
+ * If successful, the xd_otherend_path field of the child's instance
+ * variables will be updated.
+ *
+ * \param dev    NewBus device_t for this XenBus front bus instance.
+ * \param ivars  Instance variables from the XenBus child device for
+ *               which to perform this function.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+static int
+xenbusb_front_get_otherend_node(device_t dev, struct xenbus_device_ivars *ivars)
+{
+	char *backend_path;
+	int status;
+
+	/* Drop the path cached by any earlier lookup. */
+	if (ivars->xd_otherend_path != NULL) {
+		free(ivars->xd_otherend_path, M_XENBUS);
+		ivars->xd_otherend_path = NULL;
+	}
+	status = xs_gather(XST_NIL, ivars->xd_node,
+	    "backend-id", "%i", &ivars->xd_otherend_id,
+	    "backend", NULL, &backend_path,
+	    NULL);
+	if (status != 0)
+		return (status);
+	/* Keep an M_XENBUS copy; release the M_XENSTORE original. */
+	ivars->xd_otherend_path = strdup(backend_path, M_XENBUS);
+	free(backend_path, M_XENSTORE);
+	return (0);
+}
+
+/*-------------------- Private Device Attachment Data  -----------------------*/
+static device_method_t xenbusb_front_methods[] = { 
+	/* Device interface: only probe and attach are frontend specific. */ 
+	DEVMETHOD(device_identify,	xenbusb_identify),
+	DEVMETHOD(device_probe,         xenbusb_front_probe), 
+	DEVMETHOD(device_attach,        xenbusb_front_attach), 
+	DEVMETHOD(device_detach,        bus_generic_detach), 
+	DEVMETHOD(device_shutdown,      bus_generic_shutdown), 
+	DEVMETHOD(device_suspend,       bus_generic_suspend), 
+	DEVMETHOD(device_resume,        bus_generic_resume), 
+ 
+	/* Bus interface: ivar access uses the common xenbusb methods. */ 
+	DEVMETHOD(bus_print_child,      xenbusb_print_child),
+	DEVMETHOD(bus_read_ivar,        xenbusb_read_ivar), 
+	DEVMETHOD(bus_write_ivar,       xenbusb_write_ivar), 
+	DEVMETHOD(bus_alloc_resource,   bus_generic_alloc_resource),
+	DEVMETHOD(bus_release_resource, bus_generic_release_resource),
+	DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+ 
+	/* XenBus bus interface (xenbusb_if.m), frontend implementations. */
+	DEVMETHOD(xenbusb_enumerate_type, xenbusb_front_enumerate_type),
+	DEVMETHOD(xenbusb_get_otherend_node, xenbusb_front_get_otherend_node),
+	{ 0, 0 } /* Terminates the method table. */
+}; 
+
+DEFINE_CLASS_0(xenbusb_front, xenbusb_front_driver, xenbusb_front_methods,
+	       sizeof(struct xenbusb_softc));
+devclass_t xenbusb_front_devclass; 
+/* Register the driver with xenstore named as its parent bus. */
+DRIVER_MODULE(xenbusb_front, xenstore, xenbusb_front_driver,
+	      xenbusb_front_devclass, 0, 0);
diff --git a/sys/xen/xenbus/xenbusb_if.m b/sys/xen/xenbus/xenbusb_if.m
new file mode 100644
index 00000000000..a32e3f6fe51
--- /dev/null
+++ b/sys/xen/xenbus/xenbusb_if.m
@@ -0,0 +1,78 @@
+#-
+# Copyright (c) 2010 Spectra Logic Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions, and the following disclaimer,
+#    without modification.
+# 2. Redistributions in binary form must reproduce at minimum a disclaimer
+#    substantially similar to the "NO WARRANTY" disclaimer below
+#    ("Disclaimer") and any redistribution must be conditioned upon
+#    including a substantially similar Disclaimer requirement for further
+#    binary redistribution.
+#
+# NO WARRANTY
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGES.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+
+HEADER {
+struct xenbus_device_ivars;
+}
+
+INTERFACE xenbusb;
+
+/**
+ * \brief Enumerate all devices of the given type on this bus.
+ *
+ * \param _dev  NewBus device_t for this XenBus (front/back) bus instance.
+ * \param _type String indicating the device sub-tree (e.g. "vfb", "vif")
+ *              to enumerate. 
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * Devices that are found should be entered into the NewBus hierarchy via
+ * xenbusb_add_device().  xenbusb_add_device() ignores repeated detections
+ * of a device that already exists, so it can be called unconditionally
+ * for any device found in the XenStore.
+ */
+METHOD int enumerate_type {
+	device_t _dev;
+	const char *_type;
+};
+
+/**
+ * \brief Determine and store the XenStore path for the other end of
+ *        a split device whose local end is represented by ivars.
+ *
+ * If successful, the xd_otherend_path field of the child's instance
+ * variables must be updated.
+ *
+ * \param _dev    NewBus device_t for this XenBus (front/back) bus instance.
+ * \param _ivars  Instance variables from the XenBus child device for
+ *                which to perform this function.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+METHOD int get_otherend_node {
+	device_t _dev;
+	struct xenbus_device_ivars *_ivars;
+}
diff --git a/sys/xen/xenbus/xenbusvar.h b/sys/xen/xenbus/xenbusvar.h
index 651166421ea..55d7f29d4d6 100644
--- a/sys/xen/xenbus/xenbusvar.h
+++ b/sys/xen/xenbus/xenbusvar.h
@@ -1,8 +1,4 @@
 /******************************************************************************
- * xenbus.h
- *
- * Talks to Xen Store to figure out what devices we have.
- *
  * Copyright (C) 2005 Rusty Russell, IBM Corporation
  * Copyright (C) 2005 XenSource Ltd.
  * 
@@ -30,46 +26,64 @@
  * $FreeBSD$
  */
 
+/**
+ * \file xenbusvar.h
+ *
+ * \brief Data structures and function declarations for use by device
+ *        drivers operating on the XenBus.
+ */
+
 #ifndef _XEN_XENBUS_XENBUSVAR_H
 #define _XEN_XENBUS_XENBUSVAR_H
 
 #include <sys/queue.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/sbuf.h>
+
+#include <machine/stdarg.h>
 #include <machine/xen/xen-os.h>
+
+#include <xen/interface/grant_table.h>
 #include <xen/interface/io/xenbus.h>
 #include <xen/interface/io/xs_wire.h>
 
+#include <xen/xenstore/xenstorevar.h>
+
 #include "xenbus_if.h"
 
+/* XenBus allocations including XenStore data returned to clients. */
+MALLOC_DECLARE(M_XENBUS);
+
 enum {
-	/*
+	/**
 	 * Path of this device node.
 	 */
 	XENBUS_IVAR_NODE,
 
-	/*
+	/**
 	 * The device type (e.g. vif, vbd).
 	 */
 	XENBUS_IVAR_TYPE,
 
-	/*
+	/**
 	 * The state of this device (not the otherend's state).
 	 */
 	XENBUS_IVAR_STATE,
 
-	/*
+	/**
 	 * Domain ID of the other end device.
 	 */
 	XENBUS_IVAR_OTHEREND_ID,
 
-	/*
+	/**
 	 * Path of the other end device.
 	 */
 	XENBUS_IVAR_OTHEREND_PATH
 };
 
-/*
+/**
  * Simplified accessors for xenbus devices
  */
 #define	XENBUS_ACCESSOR(var, ivar, type) \
@@ -81,179 +95,184 @@ XENBUS_ACCESSOR(state,		STATE,			enum xenbus_state)
 XENBUS_ACCESSOR(otherend_id,	OTHEREND_ID,		int)
 XENBUS_ACCESSOR(otherend_path,	OTHEREND_PATH,		const char *)
 
-/* Register callback to watch this node. */
-struct xenbus_watch
-{
-	LIST_ENTRY(xenbus_watch) list;
-
-	/* Path being watched. */
-	char *node;
-
-	/* Callback (executed in a process context with no locks held). */
-	void (*callback)(struct xenbus_watch *,
-			 const char **vec, unsigned int len);
-};
-
-typedef int (*xenstore_event_handler_t)(void *);
-
-struct xenbus_transaction
-{
-		uint32_t id;
-};
-
-#define XBT_NIL ((struct xenbus_transaction) { 0 })
-
-int xenbus_directory(struct xenbus_transaction t, const char *dir,
-    const char *node, unsigned int *num, char ***result);
-int xenbus_read(struct xenbus_transaction t, const char *dir,
-    const char *node, unsigned int *len, void **result);
-int xenbus_write(struct xenbus_transaction t, const char *dir,
-    const char *node, const char *string);
-int xenbus_mkdir(struct xenbus_transaction t, const char *dir,
-    const char *node);
-int xenbus_exists(struct xenbus_transaction t, const char *dir,
-    const char *node);
-int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
-int xenbus_transaction_start(struct xenbus_transaction *t);
-int xenbus_transaction_end(struct xenbus_transaction t, int abort);
-
-/*
- * Single read and scanf: returns errno or zero. If scancountp is
- * non-null, then number of items scanned is returned in *scanncountp.
- */
-int xenbus_scanf(struct xenbus_transaction t,
-    const char *dir, const char *node, int *scancountp, const char *fmt, ...)
-	__attribute__((format(scanf, 5, 6)));
-
-/* Single printf and write: returns errno or 0. */
-int xenbus_printf(struct xenbus_transaction t,
-		  const char *dir, const char *node, const char *fmt, ...)
-	__attribute__((format(printf, 4, 5)));
-
-/*
- * Generic read function: NULL-terminated triples of name,
- * sprintf-style type string, and pointer. Returns 0 or errno.
- */
-int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
-
-/* notifer routines for when the xenstore comes up */
-int register_xenstore_notifier(xenstore_event_handler_t func, void *arg, int priority);
-#if 0
-void unregister_xenstore_notifier();
-#endif
-int register_xenbus_watch(struct xenbus_watch *watch);
-void unregister_xenbus_watch(struct xenbus_watch *watch);
-void xs_suspend(void);
-void xs_resume(void);
-
-/* Used by xenbus_dev to borrow kernel's store connection. */
-int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void **result);
-
-#if 0
-
-#define XENBUS_IS_ERR_READ(str) ({			\
-	if (!IS_ERR(str) && strlen(str) == 0) {		\
-		free(str, M_DEVBUF);				\
-		str = ERR_PTR(-ERANGE);			\
-	}						\
-	IS_ERR(str);					\
-})
-
-#endif
-
-#define XENBUS_EXIST_ERR(err) ((err) == ENOENT || (err) == ERANGE)
-
-
 /**
- * Register a watch on the given path, using the given xenbus_watch structure
- * for storage, and the given callback function as the callback.  Return 0 on
- * success, or errno on error.  On success, the given path will be saved as
- * watch->node, and remains the caller's to free.  On error, watch->node will
- * be NULL, the device will switch to XenbusStateClosing, and the error will
- * be saved in the store.
- */
-int xenbus_watch_path(device_t dev, char *path,
-		      struct xenbus_watch *watch, 
-		      void (*callback)(struct xenbus_watch *,
-				       const char **, unsigned int));
-
-
-/**
- * Register a watch on the given path/path2, using the given xenbus_watch
- * structure for storage, and the given callback function as the callback.
- * Return 0 on success, or errno on error.  On success, the watched path
- * (path/path2) will be saved as watch->node, and becomes the caller's to
- * kfree().  On error, watch->node will be NULL, so the caller has nothing to
- * free, the device will switch to XenbusStateClosing, and the error will be
- * saved in the store.
- */
-int xenbus_watch_path2(device_t dev, const char *path,
-		       const char *path2, struct xenbus_watch *watch, 
-		       void (*callback)(struct xenbus_watch *,
-					const char **, unsigned int));
-
-
-/**
- * Advertise in the store a change of the given driver to the given new_state.
- * which case this is performed inside its own transaction.  Return 0 on
- * success, or errno on error.  On error, the device will switch to
- * XenbusStateClosing, and the error will be saved in the store.
- */
-int xenbus_switch_state(device_t dev,
-			XenbusState new_state);
-
-
-/**
- * Grant access to the given ring_mfn to the peer of the given device.
- * Return 0 on success, or errno on error.  On error, the device will
- * switch to XenbusStateClosing, and the error will be saved in the
- * store. The grant ring reference is returned in *refp.
- */
-int xenbus_grant_ring(device_t dev, unsigned long ring_mfn, int *refp);
-
-
-/**
- * Allocate an event channel for the given xenbus_device, assigning the newly
- * created local port to *port.  Return 0 on success, or errno on error.  On
- * error, the device will switch to XenbusStateClosing, and the error will be
- * saved in the store.
- */
-int xenbus_alloc_evtchn(device_t dev, int *port);
-
-
-/**
- * Free an existing event channel. Returns 0 on success or errno on error.
- */
-int xenbus_free_evtchn(device_t dev, int port);
-
-
-/**
- * Return the state of the driver rooted at the given store path, or
- * XenbusStateClosed if no state can be read.
+ * Return the state of a XenBus device.
+ *
+ * \param path  The root XenStore path for the device.
+ *
+ * \return  The current state of the device or XenbusStateClosed if no
+ *	    state can be read.
  */
 XenbusState xenbus_read_driver_state(const char *path);
 
-
-/***
- * Report the given negative errno into the store, along with the given
- * formatted message.
+/**
+ * Initialize and register a watch on the given path (client supplied storage).
+ *
+ * \param dev       The XenBus device requesting the watch service.
+ * \param path      The XenStore path of the object to be watched.  The
+ *                  storage for this string must be stable for the lifetime
+ *                  of the watch.
+ * \param watch     The watch object to use for this request.  This object
+ *                  must be stable for the lifetime of the watch.
+ * \param callback  The function to call when XenStore objects at or below
+ *                  path are modified.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * \note  On error, the device 'dev' will be switched to the XenbusStateClosing
+ *        state and the returned error is saved in the per-device error node
+ *        for dev in the XenStore.
  */
-void xenbus_dev_error(device_t dev, int err, const char *fmt,
-		      ...);
+int xenbus_watch_path(device_t dev, char *path,
+		      struct xs_watch *watch, 
+		      xs_watch_cb_t *callback);
 
-
-/***
- * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
- * xenbus_switch_state(dev, NULL, XenbusStateClosing) to schedule an orderly
- * closedown of this driver and its peer.
+/**
+ * Initialize and register a watch at path/path2 in the XenStore.
+ *
+ * \param dev       The XenBus device requesting the watch service.
+ * \param path      The base XenStore path of the object to be watched.
+ * \param path2     The tail XenStore path of the object to be watched.
+ * \param watch     The watch object to use for this request.  This object
+ *                  must be stable for the lifetime of the watch.
+ * \param callback  The function to call when XenStore objects at or below
+ *                  path are modified.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * \note  On error, \a dev will be switched to the XenbusStateClosing
+ *        state and the returned error is saved in the per-device error node
+ *        for \a dev in the XenStore.
+ *
+ * Similar to xenbus_watch_path, however the storage for the path to the
+ * watched object is allocated from the heap and filled with "path '/' path2".
+ * Should a call to this function succeed, it is the caller's responsibility
+ * to free watch->node using the M_XENBUS malloc type.
  */
-void xenbus_dev_fatal(device_t dev, int err, const char *fmt,
-		      ...);
+int xenbus_watch_path2(device_t dev, const char *path,
+		       const char *path2, struct xs_watch *watch, 
+		       xs_watch_cb_t *callback);
 
-int xenbus_dev_init(void);
+/**
+ * Grant access to the given ring_mfn to the peer of the given device.
+ *
+ * \param dev        The device granting access to the ring page.
+ * \param ring_mfn   The guest machine page number of the page to grant
+ *                   peer access rights.
+ * \param refp[out]  The grant reference for the page.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * A successful call to xenbus_grant_ring should be paired with a call
+ * to gnttab_end_foreign_access() when foreign access to this page is no
+ * longer required.
+ * 
+ * \note  On error, \a dev will be switched to the XenbusStateClosing
+ *        state and the returned error is saved in the per-device error node
+ *        for \a dev in the XenStore.
+ */
+int xenbus_grant_ring(device_t dev, unsigned long ring_mfn, grant_ref_t *refp);
 
+/**
+ * Allocate an event channel for the given XenBus device.
+ *
+ * \param dev        The device for which to allocate the event channel.
+ * \param port[out]  The port identifier for the allocated event channel.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * A successfully allocated event channel should be free'd using
+ * xenbus_free_evtchn().
+ *
+ * \note  On error, \a dev will be switched to the XenbusStateClosing
+ *        state and the returned error is saved in the per-device error node
+ *        for \a dev in the XenStore.
+ */
+int xenbus_alloc_evtchn(device_t dev, evtchn_port_t *port);
+
+/**
+ * Free an existing event channel.
+ *
+ * \param dev   The device which allocated this event channel.
+ * \param port  The port identifier for the event channel to free.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * \note  On error, \a dev will be switched to the XenbusStateClosing
+ *        state and the returned error is saved in the per-device error node
+ *        for \a dev in the XenStore.
+ */
+int xenbus_free_evtchn(device_t dev, evtchn_port_t port);
+
+/**
+ * Record the given errno, along with the given, printf-style, formatted
+ * message in dev's device specific error node in the XenStore.
+ *
+ * \param dev  The device which encountered the error.
+ * \param err  The errno value corresponding to the error.
+ * \param fmt  Printf format string followed by a variable number of
+ *             printf arguments.
+ */
+void xenbus_dev_error(device_t dev, int err, const char *fmt, ...)
+	__attribute__((format(printf, 3, 4)));
+
+/**
+ * va_list version of xenbus_dev_error().
+ *
+ * \param dev  The device which encountered the error.
+ * \param err  The errno value corresponding to the error.
+ * \param fmt  Printf format string.
+ * \param ap   Va_list of printf arguments.
+ */
+void xenbus_dev_verror(device_t dev, int err, const char *fmt, va_list ap)
+	__attribute__((format(printf, 3, 0)));
+
+/**
+ * Equivalent to xenbus_dev_error(), followed by
+ * xenbus_set_state(dev, XenbusStateClosing).
+ *
+ * \param dev  The device which encountered the error.
+ * \param err  The errno value corresponding to the error.
+ * \param fmt  Printf format string followed by a variable number of
+ *             printf arguments.
+ */
+void xenbus_dev_fatal(device_t dev, int err, const char *fmt, ...)
+	__attribute__((format(printf, 3, 4)));
+
+/**
+ * va_list version of xenbus_dev_fatal().
+ *
+ * \param dev  The device which encountered the error.
+ * \param err  The errno value corresponding to the error.
+ * \param fmt  Printf format string.
+ * \param ap   Va_list of printf arguments.
+ */
+void xenbus_dev_vfatal(device_t dev, int err, const char *fmt, va_list ap)
+	__attribute__((format(printf, 3, 0)));
+
+/**
+ * Convert a member of the xenbus_state enum into an ASCII string.
+ *
+ * \param state  The XenBus state to look up.
+ *
+ * \return  A string representing state or, for unrecognized states,
+ *	    the string "Unknown".
+ */
 const char *xenbus_strstate(enum xenbus_state state);
+
+/**
+ * Return the value of a XenBus device's "online" node within the XenStore.
+ *
+ * \param dev  The XenBus device to query.
+ *
+ * \return  The value of the "online" node for the device.  If the node
+ *          does not exist, 0 (offline) is returned.
+ */
 int xenbus_dev_is_online(device_t dev);
-int xenbus_frontend_closed(device_t dev);
 
 #endif /* _XEN_XENBUS_XENBUSVAR_H */
diff --git a/sys/xen/xenstore/xenstore.c b/sys/xen/xenstore/xenstore.c
new file mode 100644
index 00000000000..76dfb5a576b
--- /dev/null
+++ b/sys/xen/xenstore/xenstore.c
@@ -0,0 +1,1654 @@
+/******************************************************************************
+ * xenstore.c
+ *
+ * Low-level kernel interface to the XenStore.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2009,2010 Spectra Logic Corporation
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/syslog.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/unistd.h>
+
+#include <machine/xen/xen-os.h>
+#include <machine/stdarg.h>
+
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/hypervisor.h>
+#include <xen/xen_intr.h>
+
+#include <xen/interface/hvm/params.h>
+
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xenstore/xenstore_internal.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+/**
+ * \file xenstore.c
+ * \brief XenStore interface
+ *
+ * The XenStore interface is a simple storage system that is a means of
+ * communicating state and configuration data between the Xen Domain 0
+ * and the various guest domains.  All configuration data other than
+ * a small amount of essential information required during the early
+ * boot process of launching a Xen aware guest, is managed using the
+ * XenStore.
+ *
+ * The XenStore is ASCII string based, and has a structure and semantics
+ * similar to a filesystem.  There are files and directories, the directories
+ * able to contain files or other directories.  The depth of the hierarchy
+ * is only limited by the XenStore's maximum path length.
+ *
+ * The communication channel between the XenStore service and other
+ * domains is via two, guest specific, ring buffers in a shared memory
+ * area.  One ring buffer is used for communicating in each direction.
+ * The grant table references for this shared memory are given to the
+ * guest either via the xen_start_info structure for a fully para-
+ * virtualized guest, or via HVM hypercalls for a hardware virtualized
+ * guest.
+ *
+ * The XenStore communication relies on an event channel and thus
+ * interrupts.  For this reason, the attachment of the XenStore
+ * relies on an interrupt driven configuration hook to hold off
+ * boot processing until communication with the XenStore service
+ * can be established.
+ *
+ * Several Xen services depend on the XenStore, most notably the
+ * XenBus used to discover and manage Xen devices.  These services
+ * are implemented as NewBus child attachments to a bus exported
+ * by this XenStore driver.
+ */
+
+static struct xs_watch *find_watch(const char *token);
+
+MALLOC_DEFINE(M_XENSTORE, "xenstore", "XenStore data and results");
+
+/**
+ * Pointer to shared memory communication structures allowing us
+ * to communicate with the XenStore service.
+ *
+ * When operating in full PV mode, this pointer is set early in kernel
+ * startup from within xen_machdep.c.  In HVM mode, we use hypercalls
+ * to get the guest frame number for the shared page and then map it
+ * into kva.  See xs_init() for details.
+ */
+struct xenstore_domain_interface *xen_store;
+
+/*-------------------------- Private Data Structures ------------------------*/
+
+/**
+ * Structure capturing messages received from the XenStore service.
+ */
+struct xs_stored_msg {
+	TAILQ_ENTRY(xs_stored_msg) list;
+
+	struct xsd_sockmsg hdr;
+
+	union {
+		/* Queued replies: body holds the NUL terminated payload. */
+		struct {
+			char *body;
+		} reply;
+
+		/* Queued watch events: matched watch plus split() vector. */
+		struct {
+			struct xs_watch *handle;
+			const char **vec;
+			u_int vec_size;
+		} watch;
+	} u;
+};
+TAILQ_HEAD(xs_stored_msg_list, xs_stored_msg);
+
+/**
+ * Container for all XenStore related state.
+ */
+struct xs_softc {
+	/** Newbus device for the XenStore. */
+	device_t xs_dev;
+
+	/**
+	 * Lock serializing access to ring producer/consumer
+	 * indexes.  Use of this lock guarantees that wakeups
+	 * of blocking readers/writers are not missed due to
+	 * races with the XenStore service.
+	 */
+	struct mtx ring_lock;
+
+	/**
+	 * Mutex used to ensure exclusive access to the outgoing
+	 * communication ring.  We use a lock type that can be
+	 * held while sleeping so that xs_write_store() can block waiting
+	 * for space in the ring to free up, without allowing another
+	 * writer to come in and corrupt a partial message write.
+	 */
+	struct sx request_mutex;
+
+	/**
+	 * A list of replies to our requests.
+	 *
+	 * The reply list is filled by xs_rcv_thread().  It
+	 * is consumed by the context that issued the request
+	 * to which a reply is made.  The requester blocks in
+	 * xs_read_reply().
+	 *
+	 * \note Only one requesting context can be active at a time.
+	 *       This is guaranteed by the request_mutex and ensures
+	 *	 that the requester sees replies matching the order
+	 *	 of its requests.
+	 */
+	struct xs_stored_msg_list reply_list;
+
+	/** Lock protecting the reply list. */
+	struct mtx reply_lock;
+
+	/**
+	 * List of registered watches.
+	 */
+	struct xs_watch_list  registered_watches;
+
+	/** Lock protecting the registered watches list. */
+	struct mtx registered_watches_lock;
+
+	/**
+	 * List of pending watch callback events.
+	 */
+	struct xs_stored_msg_list watch_events;
+
+	/** Lock protecting the watch callback list. */
+	struct mtx watch_events_lock;
+
+	/**
+	 * Sleepable lock used to prevent VM suspension while a
+	 * xenstore transaction is outstanding.
+	 *
+	 * Each active transaction holds a shared lock on the
+	 * suspend mutex.  Our suspend method blocks waiting
+	 * to acquire an exclusive lock.  This guarantees that
+	 * suspend processing will only proceed once all active
+	 * transactions have been retired.
+	 */
+	struct sx suspend_mutex;
+
+	/**
+	 * The process ID of the xenwatch thread.
+	 */
+	pid_t xenwatch_pid;
+
+	/**
+	 * Sleepable mutex used to gate the execution of XenStore
+	 * watch event callbacks.
+	 *
+	 * xenwatch_thread holds an exclusive lock on this mutex
+	 * while delivering event callbacks, and xenstore_unregister_watch()
+	 * uses an exclusive lock of this mutex to guarantee that no
+	 * callbacks of the just unregistered watch are pending
+	 * before returning to its caller.
+	 */
+	struct sx xenwatch_mutex;
+
+#ifdef XENHVM
+	/**
+	 * The HVM guest pseudo-physical frame number.  This is Xen's mapping
+	 * of the true machine frame number into our "physical address space".
+	 */
+	unsigned long gpfn;
+#endif
+
+	/**
+	 * The event channel for communicating with the
+	 * XenStore service.
+	 */
+	int evtchn;
+
+	/** Interrupt number for our event channel. */
+	u_int irq;
+
+	/**
+	 * Interrupt driven config hook allowing us to defer
+	 * attaching children until interrupts (and thus communication
+	 * with the XenStore service) are available.
+	 */
+	struct intr_config_hook xs_attachcb;
+};
+
+/*-------------------------------- Global Data ------------------------------*/
+static struct xs_softc xs;
+
+/*------------------------- Private Utility Functions -----------------------*/
+
+/**
+ * Count, and if dest is non-NULL record pointers to, the NUL terminated
+ * strings in a buffer.  The last string must be terminated within len.
+ *
+ * \param strings  A pointer to a contiguous buffer of NUL terminated strings.
+ * \param dest	   Receives a pointer to each string found; NULL to only count.
+ * \param len	   The length of the buffer pointed to by strings.
+ *
+ * \return  A count of the number of strings found.
+ */
+static u_int
+extract_strings(const char *strings, const char **dest, u_int len)
+{
+	u_int num;
+	const char *p;
+
+	for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) {
+		if (dest != NULL)
+			*dest++ = p;
+		num++;
+	}
+
+	return (num);
+}
+
+/**
+ * Convert a contiguous buffer containing a series of NUL terminated
+ * strings into an array of pointers to strings.
+ *
+ * The returned pointer references the array of string pointers which
+ * is followed by the storage for the string data.  It is the client's
+ * responsibility to free this storage.
+ *
+ * The storage addressed by strings is free'd prior to split returning.
+ *
+ * \param strings  A pointer to a contiguous buffer of NUL terminated strings.
+ * \param len	   The length of the buffer pointed to by strings.
+ * \param num	   [out] The number of strings found (0 for an empty buffer)
+ *                 and returned in the array.
+ *
+ * \return  An array of pointers to the strings found in the input buffer.
+ */
+static const char **
+split(char *strings, u_int len, u_int *num)
+{
+	const char **ret;
+
+	/* NUL terminate unterminated buffers; empty ones have no last byte. */
+	if (len != 0)
+		strings[len - 1] = '\0';
+
+	*num = extract_strings(strings, /*dest*/NULL, len);
+
+	/* Transfer to one big alloc for easy freeing by the caller. */
+	ret = malloc(*num * sizeof(char *) + len, M_XENSTORE, M_WAITOK);
+	memcpy(&ret[*num], strings, len);
+	free(strings, M_XENSTORE);
+
+	/* Extract pointers to newly allocated array. */
+	strings = (char *)&ret[*num];
+	(void)extract_strings(strings, /*dest*/ret, len);
+
+	return (ret);
+}
+
+/*------------------------- Public Utility Functions -------------------------*/
+/*------- API comments for these methods can be found in xenstorevar.h -------*/
+struct sbuf *
+xs_join(const char *dir, const char *name)
+{
+	struct sbuf *sb;
+
+	sb = sbuf_new_auto();
+	sbuf_cat(sb, dir);
+	if (name[0] != '\0') {		/* Empty name: result is just dir. */
+		sbuf_putc(sb, '/');
+		sbuf_cat(sb, name);
+	}
+	sbuf_finish(sb);
+
+	return (sb);			/* Finished sbuf holding the path. */
+}
+
+/*-------------------- Low Level Communication Management --------------------*/
+/**
+ * Interrupt handler for the XenStore event channel.
+ *
+ * XenStore reads and writes block on "xen_store" for buffer
+ * space.  Wakeup any blocking operations when the XenStore
+ * service has modified the queues.
+ */
+static void
+xs_intr(void * arg __unused)
+{
+
+	/*
+	 * Hold ring lock across wakeup so that clients
+	 * cannot miss a wakeup.
+	 */
+	mtx_lock(&xs.ring_lock);
+	wakeup(xen_store);
+	mtx_unlock(&xs.ring_lock);
+}
+
+/**
+ * Verify that the indexes for a ring are valid.
+ *
+ * The producer may lead the consumer by at most the size of the ring;
+ * the indexes themselves are only masked when the ring is accessed.
+ *
+ * \param cons  The consumer index for the ring to test.
+ * \param prod  The producer index for the ring to test.
+ *
+ * \retval 1  If indexes are in range.
+ * \retval 0  If the indexes are out of range.
+ */
+static int
+xs_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
+{
+
+	return ((prod - cons) <= XENSTORE_RING_SIZE);
+}
+
+/**
+ * Return a pointer to, and the length of, the contiguous
+ * free region available for output in a ring buffer.
+ *
+ * \param cons  The consumer index for the ring.
+ * \param prod  The producer index for the ring.
+ * \param buf   The base address of the ring's storage.
+ * \param len   [out] The lesser of the space up to the end of the ring's
+ *              storage and the total free space in the ring.
+ * \return  A pointer to the start location of the free region.
+ */
+static void *
+xs_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
+    char *buf, uint32_t *len)
+{
+
+	*len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
+	if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
+		*len = XENSTORE_RING_SIZE - (prod - cons);
+	return (buf + MASK_XENSTORE_IDX(prod));
+}
+
+/**
+ * Return a pointer to, and the length of, the contiguous
+ * data available to read from a ring buffer.
+ *
+ * \param cons  The consumer index for the ring.
+ * \param prod  The producer index for the ring.
+ * \param buf   The base address of the ring's storage.
+ * \param len   [out] The lesser of the bytes up to the end of the ring's
+ *              storage and the total unread data in the ring.
+ * \return  A pointer to the start location of the available data.
+ */
+static const void *
+xs_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod,
+    const char *buf, uint32_t *len)
+{
+
+	*len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
+	if ((prod - cons) < *len)
+		*len = prod - cons;
+	return (buf + MASK_XENSTORE_IDX(cons));
+}
+
+/**
+ * Transmit data to the XenStore service.
+ *
+ * \param tdata  A pointer to the contiguous data to send.
+ * \param len    The amount of data to send.
+ *
+ * \return  On success 0, otherwise an errno value indicating the
+ *          cause of failure (EIO if the ring indexes are corrupt).
+ *
+ * \invariant  Called from thread context.
+ * \invariant  The buffer pointed to by tdata is at least len bytes
+ *             in length.
+ * \invariant  xs.request_mutex exclusively locked.
+ */
+static int
+xs_write_store(const void *tdata, unsigned len)
+{
+	XENSTORE_RING_IDX cons, prod;
+	const char *data = (const char *)tdata;
+	int error;
+
+	sx_assert(&xs.request_mutex, SX_XLOCKED);
+	while (len != 0) {
+		void *dst;
+		u_int avail;
+
+		/* Hold lock so we can't miss wakeups should we block. */
+		mtx_lock(&xs.ring_lock);
+		cons = xen_store->req_cons;
+		prod = xen_store->req_prod;
+		if ((prod - cons) == XENSTORE_RING_SIZE) {
+			/*
+			 * Output ring is full. Wait for a ring event.
+			 *
+			 * Note that the events from both queues
+			 * are combined, so being woken does not
+			 * guarantee that space exists in the
+			 * request ring.
+			 *
+			 * To simplify error recovery and the retry,
+			 * we specify PDROP so our lock is *not* held
+			 * when msleep returns.
+			 */
+			error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP,
+			     "xbwrite", /*timeout*/0);
+			if (error && error != EWOULDBLOCK)
+				return (error);
+
+			/* Try again. */
+			continue;
+		}
+		mtx_unlock(&xs.ring_lock);
+
+		/* Verify queue sanity; reset the request ring if corrupt. */
+		if (!xs_check_indexes(cons, prod)) {
+			xen_store->req_cons = xen_store->req_prod = 0;
+			return (EIO);
+		}
+
+		dst = xs_get_output_chunk(cons, prod, xen_store->req, &avail);
+		if (avail > len)
+			avail = len;
+
+		memcpy(dst, data, avail);
+		data += avail;
+		len -= avail;
+
+		/*
+		 * The store to the producer index, which indicates
+		 * to the other side that new data has arrived, must
+		 * be visible only after our copy of the data into the
+		 * ring has completed.
+		 */
+		wmb();
+		xen_store->req_prod += avail;
+
+		/*
+		 * notify_remote_via_evtchn implies mb(). The other side
+		 * will see the change to req_prod at the time of the
+		 * interrupt.
+		 */
+		notify_remote_via_evtchn(xs.evtchn);
+	}
+
+	return (0);
+}
+
+/**
+ * Receive data from the XenStore service.
+ *
+ * \param tdata  A pointer to the contiguous buffer to receive the data.
+ * \param len    The amount of data to receive.
+ *
+ * \return  On success 0, otherwise an errno value indicating the
+ *          cause of failure (EIO if the ring indexes are corrupt).
+ *
+ * \invariant  Called from thread context.
+ * \invariant  The buffer pointed to by tdata is at least len bytes
+ *             in length.
+ *
+ * \note xs_read_store does not perform any internal locking to guarantee
+ *       serial access to the incoming ring buffer.  However, there
+ *	 is only one context processing reads: xs_rcv_thread().
+ */
+static int
+xs_read_store(void *tdata, unsigned len)
+{
+	XENSTORE_RING_IDX cons, prod;
+	char *data = (char *)tdata;
+	int error;
+
+	while (len != 0) {
+		u_int avail;
+		const char *src;
+
+		/* Hold lock so we can't miss wakeups should we block. */
+		mtx_lock(&xs.ring_lock);
+		cons = xen_store->rsp_cons;
+		prod = xen_store->rsp_prod;
+		if (cons == prod) {
+			/*
+			 * Nothing to read. Wait for a ring event.
+			 *
+			 * Note that the events from both queues
+			 * are combined, so being woken does not
+			 * guarantee that data exist in the read
+			 * ring.
+			 *
+			 * To simplify error recovery and the retry,
+			 * we specify PDROP so our lock is *not* held
+			 * when msleep returns.
+			 */
+			error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP,
+			    "xbread", /*timeout*/0);
+			if (error && error != EWOULDBLOCK)
+				return (error);
+			continue;
+		}
+		mtx_unlock(&xs.ring_lock);
+
+		/* Verify queue sanity; reset the response ring if corrupt. */
+		if (!xs_check_indexes(cons, prod)) {
+			xen_store->rsp_cons = xen_store->rsp_prod = 0;
+			return (EIO);
+		}
+
+		src = xs_get_input_chunk(cons, prod, xen_store->rsp, &avail);
+		if (avail > len)
+			avail = len;
+
+		/*
+		 * Ensure the data we read is related to the indexes
+		 * we read above.
+		 */
+		rmb();
+
+		memcpy(data, src, avail);
+		data += avail;
+		len -= avail;
+
+		/*
+		 * Ensure that the producer of this ring does not see
+		 * the ring space as free until after we have copied it
+		 * out.
+		 */
+		mb();
+		xen_store->rsp_cons += avail;
+
+		/*
+		 * notify_remote_via_evtchn implies mb(). The producer
+		 * will see the updated consumer index when the event
+		 * is delivered.
+		 */
+		notify_remote_via_evtchn(xs.evtchn);
+	}
+
+	return (0);
+}
+
+/*----------------------- Received Message Processing ------------------------*/
+/**
+ * Block reading the next message from the XenStore service and queue it.
+ * Watch events without a token, or for an unknown watch, are discarded.
+ *
+ * \param type  The returned type of the XenStore message received.
+ *
+ * \return  0 on success.  Otherwise an errno value indicating the
+ *          type of failure encountered.
+ */
+static int
+xs_process_msg(enum xsd_sockmsg_type *type)
+{
+	struct xs_stored_msg *msg;
+	char *body;
+	int error;
+
+	msg = malloc(sizeof(*msg), M_XENSTORE, M_WAITOK);
+	error = xs_read_store(&msg->hdr, sizeof(msg->hdr));
+	if (error) {
+		free(msg, M_XENSTORE);
+		return (error);
+	}
+
+	body = malloc(msg->hdr.len + 1, M_XENSTORE, M_WAITOK);
+	error = xs_read_store(body, msg->hdr.len);
+	if (error) {
+		free(body, M_XENSTORE);
+		free(msg, M_XENSTORE);
+		return (error);
+	}
+	body[msg->hdr.len] = '\0';
+
+	*type = msg->hdr.type;
+	if (msg->hdr.type == XS_WATCH_EVENT) {
+		msg->u.watch.vec = split(body, msg->hdr.len,
+		    &msg->u.watch.vec_size);
+
+		mtx_lock(&xs.registered_watches_lock);
+		msg->u.watch.handle = msg->u.watch.vec_size > XS_WATCH_TOKEN ?
+		    find_watch(msg->u.watch.vec[XS_WATCH_TOKEN]) : NULL;
+		if (msg->u.watch.handle != NULL) {
+			mtx_lock(&xs.watch_events_lock);
+			TAILQ_INSERT_TAIL(&xs.watch_events, msg, list);
+			wakeup(&xs.watch_events);
+			mtx_unlock(&xs.watch_events_lock);
+		} else {
+			free(msg->u.watch.vec, M_XENSTORE);
+			free(msg, M_XENSTORE);
+		}
+		mtx_unlock(&xs.registered_watches_lock);
+	} else {
+		msg->u.reply.body = body;
+		mtx_lock(&xs.reply_lock);
+		TAILQ_INSERT_TAIL(&xs.reply_list, msg, list);
+		wakeup(&xs.reply_list);
+		mtx_unlock(&xs.reply_lock);
+	}
+
+	return (0);
+}
+
+/**
+ * Thread body of the XenStore receive thread.
+ *
+ * This thread blocks waiting for data from the XenStore service
+ * and processes any received messages.  Read errors are only reported.
+ */
+static void
+xs_rcv_thread(void *arg __unused)
+{
+	int error;
+	enum xsd_sockmsg_type type;
+
+	for (;;) {
+		error = xs_process_msg(&type);
+		if (error)
+			printf("XENSTORE error %d while reading message\n",
+			    error);
+	}
+}
+
+/*---------------- XenStore Message Request/Reply Processing -----------------*/
+/**
+ * Filter invoked before transmitting any message to the XenStore service.
+ *
+ * The role of the filter may expand; currently a transaction start takes a
+ * shared hold on xs.suspend_mutex, later dropped by xs_reply_filter().
+ *
+ * \param request_msg_type  The message type for the request.
+ */
+static inline void
+xs_request_filter(uint32_t request_msg_type)
+{
+	if (request_msg_type == XS_TRANSACTION_START)
+		sx_slock(&xs.suspend_mutex);
+}
+
+/**
+ * Filter invoked after transmitting any message to the XenStore service.
+ *
+ * The role of the filter may expand, but currently serves to manage
+ * the interactions of messages with transaction state.
+ *
+ * \param request_msg_type     The message type for the original request.
+ * \param reply_msg_type       The message type for any received reply.
+ * \param request_reply_error  The error status from the attempt to send
+ *                             the request or retrieve the reply.
+ */
+static inline void
+xs_reply_filter(uint32_t request_msg_type,
+    uint32_t reply_msg_type, int request_reply_error)
+{
+	/*
+	 * The count of transactions drops if we attempted
+	 * to end a transaction (even if that attempt fails
+	 * in error), we receive a transaction end acknowledgement
+	 * or if our attempt to begin a transaction fails.
+	 */
+	if (request_msg_type == XS_TRANSACTION_END
+	 || (request_reply_error == 0 && reply_msg_type == XS_TRANSACTION_END)
+	 || (request_msg_type == XS_TRANSACTION_START
+	  && (request_reply_error != 0 || reply_msg_type == XS_ERROR)))
+		sx_sunlock(&xs.suspend_mutex);
+
+}
+
+#define xsd_error_count	(sizeof(xsd_errors) / sizeof(xsd_errors[0]))
+
+/**
+ * Convert a XenStore error string into an errno number.
+ *
+ * \param errorstring  The error string to convert.
+ *
+ * \return  The errno best matching the input string.
+ *
+ * \note Unknown error strings are logged and converted to EINVAL.
+ */
+static int
+xs_get_error(const char *errorstring)
+{
+	u_int i;
+
+	for (i = 0; i < xsd_error_count; i++) {
+		if (!strcmp(errorstring, xsd_errors[i].errstring))
+			return (xsd_errors[i].errnum);
+	}
+	log(LOG_WARNING, "XENSTORE xen store gave: unknown error %s\n",
+	    errorstring);
+	return (EINVAL);
+}
+
+/**
+ * Block waiting for a reply to a message request.
+ *
+ * \param type	  The returned type of the reply.
+ * \param len	  The returned body length of the reply (may be NULL).
+ * \param result  The returned body of the reply.
+ *
+ * \return  0 on success, with *result set to the malloced reply body.
+ *          Otherwise an errno indicating the cause of failure.
+ */
+static int
+xs_read_reply(enum xsd_sockmsg_type *type, u_int *len, void **result)
+{
+	struct xs_stored_msg *msg;
+	char *body;
+	int error;
+
+	mtx_lock(&xs.reply_lock);
+	while (TAILQ_EMPTY(&xs.reply_list)) {
+		error = mtx_sleep(&xs.reply_list, &xs.reply_lock,
+		    PCATCH, "xswait", hz/10);
+		if (error && error != EWOULDBLOCK) {
+			mtx_unlock(&xs.reply_lock);
+			return (error);
+		}
+	}
+	msg = TAILQ_FIRST(&xs.reply_list);
+	TAILQ_REMOVE(&xs.reply_list, msg, list);
+	mtx_unlock(&xs.reply_lock);
+
+	*type = msg->hdr.type;
+	if (len)
+		*len = msg->hdr.len;
+	body = msg->u.reply.body;
+
+	free(msg, M_XENSTORE);
+	*result = body;
+	return (0);
+}
+
+/**
+ * Pass-thru interface for XenStore access by userland processes
+ * via the XenStore device.
+ *
+ * Reply type and length data are returned by overwriting these
+ * fields in the passed in request message.
+ *
+ * \param msg	  A properly formatted message to transmit to
+ *		  the XenStore service; msg->len body bytes follow it.
+ * \param result  The returned body of the reply.
+ *
+ * \return  0 on success.  Otherwise an errno indicating the cause
+ *          of failure.
+ *
+ * \note The returned result is provided in malloced storage and thus
+ *       must be free'd by the caller with 'free(*result, M_XENSTORE);'
+ */
+int
+xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result)
+{
+	uint32_t request_type;
+	int error;
+
+	request_type = msg->type;
+	xs_request_filter(request_type);
+
+	sx_xlock(&xs.request_mutex);
+	if ((error = xs_write_store(msg, sizeof(*msg) + msg->len)) == 0)
+		error = xs_read_reply(&msg->type, &msg->len, result);
+	sx_xunlock(&xs.request_mutex);
+
+	xs_reply_filter(request_type, msg->type, error);
+
+	return (error);
+}
+
+/**
+ * Send a message with an optionally multi-part body to the XenStore service.
+ *
+ * \param t              The transaction to use for this request.
+ * \param request_type   The type of message to send.
+ * \param iovec          Pointers to the body sections of the request.
+ * \param num_vecs       The number of body sections in the request.
+ * \param len            The returned length of the reply (may be NULL).
+ * \param result         The returned body of the reply (may be NULL).
+ *
+ * \return  0 on success.  Otherwise an errno indicating
+ *          the cause of failure.
+ *
+ * \note The returned result is provided in malloced storage and thus
+ *       must be free'd by the caller with 'free(*result, M_XENSTORE);'
+ */
+static int
+xs_talkv(struct xs_transaction t, enum xsd_sockmsg_type request_type,
+    const struct iovec *iovec, u_int num_vecs, u_int *len, void **result)
+{
+	struct xsd_sockmsg msg;
+	void *ret = NULL;
+	u_int i;
+	int error;
+
+	msg.tx_id = t.id;
+	msg.req_id = 0;
+	msg.type = request_type;
+	msg.len = 0;
+	for (i = 0; i < num_vecs; i++)
+		msg.len += iovec[i].iov_len;
+
+	xs_request_filter(request_type);
+
+	sx_xlock(&xs.request_mutex);
+	error = xs_write_store(&msg, sizeof(msg));
+	if (error) {
+		printf("xs_talkv failed %d\n", error);
+		goto error_lock_held;
+	}
+
+	for (i = 0; i < num_vecs; i++) {
+		error = xs_write_store(iovec[i].iov_base, iovec[i].iov_len);
+		if (error) {
+			printf("xs_talkv failed %d\n", error);
+			goto error_lock_held;
+		}
+	}
+
+	error = xs_read_reply(&msg.type, len, &ret);
+
+error_lock_held:
+	sx_xunlock(&xs.request_mutex);
+	xs_reply_filter(request_type, msg.type, error);
+	if (error)
+		return (error);
+
+	if (msg.type == XS_ERROR) {
+		error = xs_get_error(ret);
+		free(ret, M_XENSTORE);
+		return (error);
+	}
+
+	/* Reply is either error or an echo of our request message type. */
+	KASSERT(msg.type == request_type, ("bad xenstore message type"));
+
+	if (result)
+		*result = ret;
+	else
+		free(ret, M_XENSTORE);
+
+	return (0);
+}
+
+/**
+ * Wrapper for xs_talkv allowing easy transmission of a message with
+ * a single, contiguous, message body.
+ *
+ * \param t              The transaction to use for this request.
+ * \param request_type   The type of message to send.
+ * \param body           The NUL terminated body; its NUL is sent too.
+ * \param len            The returned length of the reply.
+ * \param result         The returned body of the reply.
+ *
+ * \return  0 on success.  Otherwise an errno indicating
+ *          the cause of failure.
+ *
+ * \note The returned result is provided in malloced storage and thus
+ *       must be free'd by the caller with 'free(*result, M_XENSTORE);'
+ */
+static int
+xs_single(struct xs_transaction t, enum xsd_sockmsg_type request_type,
+    const char *body, u_int *len, void **result)
+{
+	struct iovec iovec;
+
+	iovec.iov_base = (void *)(uintptr_t)body;
+	iovec.iov_len = strlen(body) + 1;
+
+	return (xs_talkv(t, request_type, &iovec, 1, len, result));
+}
+
+/*------------------------- XenStore Watch Support ---------------------------*/
+/**
+ * Transmit a watch request to the XenStore service.
+ *
+ * \param path   The path in the XenStore to watch.
+ * \param token  A unique identifier for this watch.
+ *
+ * \return  0 on success.  Otherwise an errno indicating the
+ *          cause of failure.
+ */
+static int
+xs_watch(const char *path, const char *token)
+{
+	struct iovec iov[2];
+
+	iov[0].iov_base = (void *)(uintptr_t) path;
+	iov[0].iov_len = strlen(path) + 1;
+	iov[1].iov_base = (void *)(uintptr_t) token;
+	iov[1].iov_len = strlen(token) + 1;
+
+	return (xs_talkv(XST_NIL, XS_WATCH, iov, 2, NULL, NULL));
+}
+
+/**
+ * Transmit an unwatch request to the XenStore service.
+ *
+ * \param path   The path in the XenStore of the watch to remove.
+ * \param token  The unique identifier for this watch.
+ *
+ * \return  0 on success.  Otherwise an errno indicating the
+ *          cause of failure.
+ */
+static int
+xs_unwatch(const char *path, const char *token)
+{
+	struct iovec iov[2];
+
+	iov[0].iov_base = (void *)(uintptr_t) path;
+	iov[0].iov_len = strlen(path) + 1;
+	iov[1].iov_base = (void *)(uintptr_t) token;
+	iov[1].iov_len = strlen(token) + 1;
+
+	return (xs_talkv(XST_NIL, XS_UNWATCH, iov, 2, NULL, NULL));
+}
+
+/**
+ * Convert from watch token (the hex encoded address of the watch) to the
+ * registered watch structure, validating it against the registered list.
+ *
+ * \param token  The unique identifier for the watch to find.
+ *
+ * \return  A pointer to the found watch structure or NULL.
+ */
+static struct xs_watch *
+find_watch(const char *token)
+{
+	struct xs_watch *i, *cmp;
+
+	cmp = (void *)strtoul(token, NULL, 16);
+
+	LIST_FOREACH(i, &xs.registered_watches, list)
+		if (i == cmp)
+			return (i);
+
+	return (NULL);
+}
+
+/**
+ * Thread body of the XenStore watch event dispatch thread.
+ */
+static void
+xenwatch_thread(void *unused)
+{
+	struct xs_stored_msg *msg;
+
+	for (;;) {
+		/* Wait for a watch event, rechecking every hz/10 ticks. */
+		mtx_lock(&xs.watch_events_lock);
+		while (TAILQ_EMPTY(&xs.watch_events))
+			mtx_sleep(&xs.watch_events,
+			    &xs.watch_events_lock,
+			    PWAIT | PCATCH, "waitev", hz/10);
+
+		mtx_unlock(&xs.watch_events_lock);
+		sx_xlock(&xs.xenwatch_mutex);	/* Gate callback delivery. */
+
+		mtx_lock(&xs.watch_events_lock);
+		msg = TAILQ_FIRST(&xs.watch_events);
+		if (msg)
+			TAILQ_REMOVE(&xs.watch_events, msg, list);
+		mtx_unlock(&xs.watch_events_lock);
+
+		if (msg != NULL) {
+			/*
+			 * XXX There are messages coming in with a NULL
+			 * XXX callback.  This deserves further investigation;
+			 * XXX the workaround here simply prevents the kernel
+			 * XXX from panic'ing on startup.
+			 */
+			if (msg->u.watch.handle->callback != NULL)
+				msg->u.watch.handle->callback(
+					msg->u.watch.handle,
+					(const char **)msg->u.watch.vec,
+					msg->u.watch.vec_size);
+			free(msg->u.watch.vec, M_XENSTORE);
+			free(msg, M_XENSTORE);
+		}
+
+		sx_xunlock(&xs.xenwatch_mutex);
+	}
+}
+
+/*----------- XenStore Configuration, Initialization, and Control ------------*/
+/**
+ * Setup communication channels with the XenStore service.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+static int
+xs_init_comms(void)
+{
+	int error;
+
+	if (xen_store->rsp_prod != xen_store->rsp_cons) {
+		log(LOG_WARNING, "XENSTORE response ring is not quiescent "
+		    "(%08x:%08x): fixing up\n",
+		    xen_store->rsp_cons, xen_store->rsp_prod);
+		xen_store->rsp_cons = xen_store->rsp_prod;
+	}
+
+	if (xs.irq)			/* Drop any previous binding. */
+		unbind_from_irqhandler(xs.irq);
+
+	error = bind_caller_port_to_irqhandler(xs.evtchn, "xenstore",
+	    xs_intr, NULL, INTR_TYPE_NET, &xs.irq);
+	if (error) {
+		log(LOG_WARNING, "XENSTORE request irq failed %i\n", error);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*------------------ Private Device Attachment Functions  --------------------*/
+static void
+xs_identify(driver_t *driver, device_t parent)
+{
+	/* Add a "xenstore" child device to the parent bus. */
+	BUS_ADD_CHILD(parent, 0, "xenstore", 0);
+}
+
+/**
+ * Probe for the existence of the XenStore.
+ *
+ * \param dev  The XenStore device being probed.
+ */
+static int
+xs_probe(device_t dev)
+{
+
+	/*
+	 * We are either operating within a PV kernel or being probed
+	 * as the child of the successfully attached xenpci device.
+	 * Thus we are in a Xen environment and there will be a XenStore.
+	 * Unconditionally return success.
+	 */
+	device_set_desc(dev, "XenStore");
+	return (0);
+}
+
+static void
+xs_attach_deferred(void *arg)
+{
+	xs_dev_init();
+	/* Probe and attach children now that interrupts are available. */
+	bus_generic_probe(xs.xs_dev);
+	bus_generic_attach(xs.xs_dev);
+
+	config_intrhook_disestablish(&xs.xs_attachcb);
+}
+
+/**
+ * Attach to the XenStore.
+ *
+ * This routine also prepares for the probe/attach of drivers that rely
+ * on the XenStore; the attach hook is only registered if attach succeeds.
+ */
+static int
+xs_attach(device_t dev)
+{
+	struct proc *p;
+	int error;
+
+	/* Allow us to get device_t from softc and vice-versa. */
+	xs.xs_dev = dev;
+	device_set_softc(dev, &xs);
+
+	/*
+	 * This seems to be a layering violation.  The XenStore is just
+	 * one of many clients of the Grant Table facility.  It happens
+	 * to be the first and a gating consumer to all other devices,
+	 * so this does work.  A better place would be in the PV support
+	 * code for fully PV kernels and the xenpci driver for HVM kernels.
+	 */
+	error = gnttab_init();
+	if (error != 0) {
+		log(LOG_WARNING,
+		    "XENSTORE: Error initializing grant tables: %d\n", error);
+		return (ENXIO);
+	}
+
+	/* Initialize the interface to xenstore. */
+#ifdef XENHVM
+	xs.evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN);
+	xs.gpfn = hvm_get_parameter(HVM_PARAM_STORE_PFN);
+	xen_store = pmap_mapdev(xs.gpfn * PAGE_SIZE, PAGE_SIZE);
+#else
+	xs.evtchn = xen_start_info->store_evtchn;
+#endif
+
+	TAILQ_INIT(&xs.reply_list);
+	TAILQ_INIT(&xs.watch_events);
+
+	mtx_init(&xs.ring_lock, "ring lock", NULL, MTX_DEF);
+	mtx_init(&xs.reply_lock, "reply lock", NULL, MTX_DEF);
+	sx_init(&xs.xenwatch_mutex, "xenwatch");
+	sx_init(&xs.request_mutex, "xenstore request");
+	sx_init(&xs.suspend_mutex, "xenstore suspend");
+	mtx_init(&xs.registered_watches_lock, "watches", NULL, MTX_DEF);
+	mtx_init(&xs.watch_events_lock, "watch events", NULL, MTX_DEF);
+	xs.irq = 0;
+
+	/* Initialize the shared memory rings to talk to xenstored */
+	error = xs_init_comms();
+	if (error)
+		return (error);
+
+	error = kproc_create(xenwatch_thread, NULL, &p, RFHIGHPID,
+	    0, "xenwatch");
+	if (error)
+		return (error);
+	xs.xenwatch_pid = p->p_pid;
+
+	error = kproc_create(xs_rcv_thread, NULL, NULL,
+	    RFHIGHPID, 0, "xenstore_rcv");
+	if (error)
+		return (error);
+
+	xs.xs_attachcb.ich_func = xs_attach_deferred;
+	xs.xs_attachcb.ich_arg = NULL;
+	config_intrhook_establish(&xs.xs_attachcb);
+	return (0);
+}
+
+/**
+ * Prepare for suspension of this VM by halting XenStore access after
+ * all transactions and individual requests have completed.
+ */
+static int
+xs_suspend(device_t dev __unused)
+{
+	/* Both locks are left held; xs_resume() is what releases them. */
+	sx_xlock(&xs.suspend_mutex);
+	sx_xlock(&xs.request_mutex);
+
+	return (0);
+}
+
+/**
+ * Resume XenStore operations after this VM is resumed.
+ */
+static int
+xs_resume(device_t dev __unused)
+{
+	struct xs_watch *watch;
+	char token[sizeof(watch) * 2 + 1];	/* Hex pointer plus NUL. */
+
+	xs_init_comms();	/* NOTE(review): return value is ignored. */
+	/* Release the request lock; it was taken in xs_suspend(). */
+	sx_xunlock(&xs.request_mutex);
+
+	/*
+	 * Re-issue every registered watch.  No need for
+	 * registered_watches_lock: the suspend_mutex is sufficient.
+	 */
+	LIST_FOREACH(watch, &xs.registered_watches, list) {
+		sprintf(token, "%lX", (long)watch);
+		xs_watch(watch->node, token);
+	}
+
+	sx_xunlock(&xs.suspend_mutex);
+
+	return (0);
+}
+
+/*-------------------- Private Device Attachment Data  -----------------------*/
+static device_method_t xenstore_methods[] = { 
+	/* Device interface */ 
+	DEVMETHOD(device_identify,	xs_identify),
+	DEVMETHOD(device_probe,         xs_probe), 
+	DEVMETHOD(device_attach,        xs_attach), 
+	DEVMETHOD(device_detach,        bus_generic_detach), 
+	DEVMETHOD(device_shutdown,      bus_generic_shutdown), 
+	DEVMETHOD(device_suspend,       xs_suspend), 
+	DEVMETHOD(device_resume,        xs_resume), 
+ 
+	/* Bus interface */ 
+	DEVMETHOD(bus_add_child,        bus_generic_add_child),
+	DEVMETHOD(bus_print_child,      bus_generic_print_child),
+	DEVMETHOD(bus_alloc_resource,   bus_generic_alloc_resource),
+	DEVMETHOD(bus_release_resource, bus_generic_release_resource),
+	DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
+	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
+ 
+	{ 0, 0 } 
+}; 
+
+DEFINE_CLASS_0(xenstore, xenstore_driver, xenstore_methods, 0);
+static devclass_t xenstore_devclass; 
+ 
+#ifdef XENHVM
+DRIVER_MODULE(xenstore, xenpci, xenstore_driver, xenstore_devclass, 0, 0);
+#else
+DRIVER_MODULE(xenstore, nexus, xenstore_driver, xenstore_devclass, 0, 0);
+#endif
+
+/*------------------------------- Sysctl Data --------------------------------*/
+/* XXX Shouldn't the node be somewhere else? */
+SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen");
+SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xs.evtchn, 0, "");
+SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, "");
+
+/*-------------------------------- Public API --------------------------------*/
+/*------- API comments for these methods can be found in xenstorevar.h -------*/
+int
+xs_directory(struct xs_transaction t, const char *dir, const char *node,
+    u_int *num, const char ***result)
+{
+	struct sbuf *full_path;
+	char *entries;
+	u_int entries_len;
+	int error;
+
+	/* Fetch the raw directory listing for dir/node. */
+	entries_len = 0;
+	full_path = xs_join(dir, node);
+	error = xs_single(t, XS_DIRECTORY, sbuf_data(full_path),
+	    &entries_len, (void **)&entries);
+	sbuf_delete(full_path);
+	/* Only on success is the reply split into the caller's vector. */
+	if (error == 0)
+		*result = split(entries, entries_len, num);
+	return (error);
+}
+
+int
+xs_exists(struct xs_transaction t, const char *dir, const char *node)
+{
+	const char **d;
+	u_int dir_n;
+
+	/* Any lookup failure is reported as "does not exist". */
+	if (xs_directory(t, dir, node, &dir_n, &d) != 0)
+		return (0);
+	free(d, M_XENSTORE);
+	return (1);
+}
+
+int
+xs_read(struct xs_transaction t, const char *dir, const char *node,
+    u_int *len, void **result)
+{
+	struct sbuf *full_path;
+	void *value;
+	int error;
+
+	full_path = xs_join(dir, node);
+	error = xs_single(t, XS_READ, sbuf_data(full_path), len, &value);
+	sbuf_delete(full_path);
+	/* *result is only written once the read has succeeded. */
+	if (error == 0)
+		*result = value;
+	return (error);
+}
+
+int
+xs_write(struct xs_transaction t, const char *dir, const char *node,
+    const char *string)
+{
+	struct iovec iov[2];
+	struct sbuf *full_path;
+	int error;
+
+	full_path = xs_join(dir, node);
+
+	/* The path is sent with its NUL terminator; the value without. */
+	iov[0].iov_base = (void *)(uintptr_t) sbuf_data(full_path);
+	iov[0].iov_len = sbuf_len(full_path) + 1;
+	iov[1].iov_base = (void *)(uintptr_t) string;
+	iov[1].iov_len = strlen(string);
+
+	error = xs_talkv(t, XS_WRITE, iov, 2, NULL, NULL);
+	sbuf_delete(full_path);
+	return (error);
+}
+
+int
+xs_mkdir(struct xs_transaction t, const char *dir, const char *node)
+{
+	struct sbuf *full_path;
+	int error;
+
+	/* Issue XS_MKDIR against the joined dir/node path. */
+	full_path = xs_join(dir, node);
+	error = xs_single(t, XS_MKDIR, sbuf_data(full_path), NULL, NULL);
+	sbuf_delete(full_path);
+	return (error);
+}
+
+int
+xs_rm(struct xs_transaction t, const char *dir, const char *node)
+{
+	struct sbuf *full_path;
+	int error;
+
+	/* Issue XS_RM against the joined dir/node path. */
+	full_path = xs_join(dir, node);
+	error = xs_single(t, XS_RM, sbuf_data(full_path), NULL, NULL);
+	sbuf_delete(full_path);
+	return (error);
+}
+
+int
+xs_rm_tree(struct xs_transaction xbt, const char *base, const char *node)
+{
+	struct xs_transaction local_xbt;
+	struct sbuf *root_path_sbuf;
+	struct sbuf *cur_path_sbuf;
+	char *root_path;
+	char *cur_path;
+	const char **dir;
+	int error;
+
+retry:
+	root_path_sbuf = xs_join(base, node);
+	cur_path_sbuf  = xs_join(base, node);
+	root_path      = sbuf_data(root_path_sbuf);
+	cur_path       = sbuf_data(cur_path_sbuf);
+	dir            = NULL;
+	local_xbt.id   = 0;
+
+	/* Run under a private transaction if the caller did not supply one. */
+	if (xbt.id == 0) {
+		error = xs_transaction_start(&local_xbt);
+		if (error != 0)
+			goto out;
+		xbt = local_xbt;
+	}
+
+	while (1) {
+		u_int count, i;
+
+		error = xs_directory(xbt, cur_path, "", &count, &dir);
+		if (error)
+			goto out;
+
+		for (i = 0; i < count; i++) {
+			error = xs_rm(xbt, cur_path, dir[i]);
+			if (error == ENOTEMPTY) {
+				struct sbuf *push_dir;
+
+				/*
+				 * Descend to clear out this subdirectory; we
+				 * return to its parent once it is empty.
+				 */
+				push_dir = xs_join(cur_path, dir[i]);
+				sbuf_delete(cur_path_sbuf);
+				cur_path_sbuf = push_dir;
+				cur_path = sbuf_data(cur_path_sbuf);
+				break;
+			} else if (error != 0) {
+				goto out;
+			}
+		}
+
+		free(dir, M_XENSTORE);
+		dir = NULL;
+
+		if (i == count) {
+			char *last_slash;
+
+			/* Directory is empty.  It is now safe to remove. */
+			error = xs_rm(xbt, cur_path, "");
+			if (error != 0)
+				goto out;
+
+			if (!strcmp(cur_path, root_path))
+				break;
+
+			/* Return to processing the parent directory. */
+			last_slash = strrchr(cur_path, '/');
+			KASSERT(last_slash != NULL,
+				("xs_rm_tree: mangled path %s", cur_path));
+			*last_slash = '\0';
+		}
+	}
+
+out:
+	sbuf_delete(cur_path_sbuf);
+	sbuf_delete(root_path_sbuf);
+	if (dir != NULL)
+		free(dir, M_XENSTORE);
+
+	if (local_xbt.id != 0) {
+		int terror;
+
+		terror = xs_transaction_end(local_xbt, /*abort*/error != 0);
+		xbt.id = 0;
+		if (terror == EAGAIN && error == 0)
+			goto retry;
+		/* A failed commit must not be reported as success. */
+		if (error == 0)
+			error = terror;
+	}
+	return (error);
+}
+
+int
+xs_transaction_start(struct xs_transaction *t)
+{
+	char *reply;
+	int error;
+
+	error = xs_single(XST_NIL, XS_TRANSACTION_START, "", NULL,
+	    (void **)&reply);
+	if (error != 0)
+		return (error);
+	t->id = strtoul(reply, NULL, 0);	/* Reply is the id as text. */
+	free(reply, M_XENSTORE);
+	return (0);
+}
+
+int
+xs_transaction_end(struct xs_transaction t, int abort)
+{
+	char abortstr[2];
+
+	/*
+	 * The request body is "F" to discard the transaction, "T" to commit.
+	 */
+	abortstr[0] = abort ? 'F' : 'T';
+	abortstr[1] = '\0';
+	return (xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL));
+}
+
+int
+xs_scanf(struct xs_transaction t, const char *dir, const char *node,
+     int *scancountp, const char *fmt, ...)
+{
+	va_list ap;
+	int error, ns;
+	char *val;
+
+	error = xs_read(t, dir, node, NULL, (void **) &val);
+	if (error)
+		return (error);
+
+	va_start(ap, fmt);
+	ns = vsscanf(val, fmt, ap);
+	va_end(ap);
+	free(val, M_XENSTORE);
+	/* Distinctive errno; a negative count is an input failure. */
+	if (ns <= 0)
+		return (ERANGE);
+	if (scancountp)
+		*scancountp = ns;
+	return (0);
+}
+
+int
+xs_vprintf(struct xs_transaction t,
+    const char *dir, const char *node, const char *fmt, va_list ap)
+{
+	struct sbuf *value;
+	int error;
+
+	/* Format the value into an sbuf, then write it out in one piece. */
+	value = sbuf_new_auto();
+	sbuf_vprintf(value, fmt, ap);
+	sbuf_finish(value);
+	error = xs_write(t, dir, node, sbuf_data(value));
+	sbuf_delete(value);
+	return (error);
+}
+
+int
+xs_printf(struct xs_transaction t, const char *dir, const char *node,
+     const char *fmt, ...)
+{
+	va_list args;
+	int ret;
+
+	/* Thin variadic front end for xs_vprintf(). */
+	va_start(args, fmt);
+	ret = xs_vprintf(t, dir, node, fmt, args);
+	va_end(args);
+	return (ret);
+}
+
+int
+xs_gather(struct xs_transaction t, const char *dir, ...)
+{
+	va_list ap;
+	const char *name;
+	int error;
+
+	va_start(ap, dir);
+	error = 0;
+	while (error == 0 && (name = va_arg(ap, char *)) != NULL) {
+		const char *fmt = va_arg(ap, char *);
+		void *result = va_arg(ap, void *);
+		char *p;
+
+		error = xs_read(t, dir, name, NULL, (void **) &p);
+		if (error)
+			break;
+		if (fmt) {
+			/* <= 0 also catches sscanf()'s input failure (-1). */
+			if (sscanf(p, fmt, result) <= 0)
+				error = EINVAL;
+			free(p, M_XENSTORE);
+		} else
+			*(char **)result = p;	/* Caller frees. */
+	}
+	va_end(ap);
+
+	return (error);
+}
+
+int
+xs_register_watch(struct xs_watch *watch)
+{
+	/* The token is the watch pointer formatted as hex ASCII. */
+	char token[sizeof(watch) * 2 + 1];
+	int error;
+
+	sprintf(token, "%lX", (long)watch);
+	/* Shared hold on suspend_mutex; xs_suspend() takes it exclusively. */
+	sx_slock(&xs.suspend_mutex);
+	/* Add the watch to the registered list before calling xs_watch(). */
+	mtx_lock(&xs.registered_watches_lock);
+	KASSERT(find_watch(token) == NULL, ("watch already registered"));
+	LIST_INSERT_HEAD(&xs.registered_watches, watch, list);
+	mtx_unlock(&xs.registered_watches_lock);
+
+	error = xs_watch(watch->node, token);
+
+	/* Ignore errors due to multiple registration. */
+	if (error == EEXIST)
+		error = 0;
+	/* Registration failed: undo the list insertion made above. */
+	if (error != 0) {
+		mtx_lock(&xs.registered_watches_lock);
+		LIST_REMOVE(watch, list);
+		mtx_unlock(&xs.registered_watches_lock);
+	}
+
+	sx_sunlock(&xs.suspend_mutex);
+
+	return (error);
+}
+
+void
+xs_unregister_watch(struct xs_watch *watch)
+{
+	struct xs_stored_msg *msg, *tmp;
+	char token[sizeof(watch) * 2 + 1];
+	int error;
+
+	sprintf(token, "%lX", (long)watch);
+
+	sx_slock(&xs.suspend_mutex);
+	/* Nothing to do unless find_watch() knows this token. */
+	mtx_lock(&xs.registered_watches_lock);
+	if (find_watch(token) == NULL) {
+		mtx_unlock(&xs.registered_watches_lock);
+		sx_sunlock(&xs.suspend_mutex);
+		return;
+	}
+	LIST_REMOVE(watch, list);
+	mtx_unlock(&xs.registered_watches_lock);
+
+	error = xs_unwatch(watch->node, token);
+	if (error)
+		log(LOG_WARNING, "XENSTORE Failed to release watch %s: %i\n",
+		    watch->node, error);
+
+	sx_sunlock(&xs.suspend_mutex);
+
+	/* Discard queued events that still reference this watch. */
+	mtx_lock(&xs.watch_events_lock);
+	TAILQ_FOREACH_SAFE(msg, &xs.watch_events, list, tmp) {
+		if (msg->u.watch.handle != watch)
+			continue;
+		TAILQ_REMOVE(&xs.watch_events, msg, list);
+		free(msg->u.watch.vec, M_XENSTORE);
+		free(msg, M_XENSTORE);
+	}
+	mtx_unlock(&xs.watch_events_lock);
+
+	/* Flush any in-progress callback, unless we are the xenwatch proc. */
+	if (curproc->p_pid != xs.xenwatch_pid) {
+		sx_xlock(&xs.xenwatch_mutex);
+		sx_xunlock(&xs.xenwatch_mutex);
+	}
+}
diff --git a/sys/xen/xenbus/xenbus_dev.c b/sys/xen/xenstore/xenstore_dev.c
similarity index 68%
rename from sys/xen/xenbus/xenbus_dev.c
rename to sys/xen/xenstore/xenstore_dev.c
index ac3f103e1fa..1fa419795ed 100644
--- a/sys/xen/xenbus/xenbus_dev.c
+++ b/sys/xen/xenstore/xenstore_dev.c
@@ -1,8 +1,8 @@
 /*
- * xenbus_dev.c
+ * xenstore_dev.c
  * 
- * Driver giving user-space access to the kernel's xenbus connection
- * to xenstore.
+ * Driver giving user-space access to the kernel's connection to the
+ * XenStore service.
  * 
  * Copyright (c) 2005, Christian Limpach
  * Copyright (c) 2005, Rusty Russell, IBM Corporation
@@ -45,18 +45,19 @@ __FBSDID("$FreeBSD$");
 #include <sys/conf.h>
 
 #include <machine/xen/xen-os.h>
-#include <xen/hypervisor.h>
-#include <xen/xenbus/xenbusvar.h>
-#include <xen/xenbus/xenbus_comms.h>
 
-struct xenbus_dev_transaction {
-	LIST_ENTRY(xenbus_dev_transaction) list;
-	struct xenbus_transaction handle;
+#include <xen/hypervisor.h>
+#include <xen/xenstore/xenstorevar.h>
+#include <xen/xenstore/xenstore_internal.h>
+
+struct xs_dev_transaction {
+	LIST_ENTRY(xs_dev_transaction) list;
+	struct xs_transaction handle;
 };
 
-struct xenbus_dev_data {
+struct xs_dev_data {
 	/* In-progress transaction. */
-	LIST_HEAD(xdd_list_head, xenbus_dev_transaction) transactions;
+	LIST_HEAD(xdd_list_head, xs_dev_transaction) transactions;
 
 	/* Partial request. */
 	unsigned int len;
@@ -72,13 +73,13 @@ struct xenbus_dev_data {
 };
 
 static int 
-xenbus_dev_read(struct cdev *dev, struct uio *uio, int ioflag)
+xs_dev_read(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	int error;
-	struct xenbus_dev_data *u = dev->si_drv1;
+	struct xs_dev_data *u = dev->si_drv1;
 
 	while (u->read_prod == u->read_cons) {
-		error = tsleep(u, PCATCH, "xbdread", hz/10);
+		error = tsleep(u, PCATCH, "xsdread", hz/10);
 		if (error && error != EWOULDBLOCK)
 			return (error);
 	}
@@ -96,7 +97,7 @@ xenbus_dev_read(struct cdev *dev, struct uio *uio, int ioflag)
 }
 
 static void
-queue_reply(struct xenbus_dev_data *u, char *data, unsigned int len)
+xs_queue_reply(struct xs_dev_data *u, char *data, unsigned int len)
 {
 	int i;
 
@@ -110,11 +111,11 @@ queue_reply(struct xenbus_dev_data *u, char *data, unsigned int len)
 }
 
 static int 
-xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
+xs_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	int error;
-	struct xenbus_dev_data *u = dev->si_drv1;
-	struct xenbus_dev_transaction *trans;
+	struct xs_dev_data *u = dev->si_drv1;
+	struct xs_dev_transaction *trans;
 	void *reply;
 	int len = uio->uio_resid;
 
@@ -141,10 +142,10 @@ xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
 	case XS_MKDIR:
 	case XS_RM:
 	case XS_SET_PERMS:
-		error = xenbus_dev_request_and_reply(&u->u.msg, &reply);
+		error = xs_dev_request_and_reply(&u->u.msg, &reply);
 		if (!error) {
 			if (u->u.msg.type == XS_TRANSACTION_START) {
-				trans = malloc(sizeof(*trans), M_DEVBUF,
+				trans = malloc(sizeof(*trans), M_XENSTORE,
 				    M_WAITOK);
 				trans->handle.id = strtoul(reply, NULL, 0);
 				LIST_INSERT_HEAD(&u->transactions, trans, list);
@@ -156,11 +157,11 @@ xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
 				BUG_ON(&trans->list == &u->transactions);
 #endif
 				LIST_REMOVE(trans, list);
-				free(trans, M_DEVBUF);
+				free(trans, M_XENSTORE);
 			}
-			queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
-			queue_reply(u, (char *)reply, u->u.msg.len);
-			free(reply, M_DEVBUF);
+			xs_queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
+			xs_queue_reply(u, (char *)reply, u->u.msg.len);
+			free(reply, M_XENSTORE);
 		}
 		break;
 
@@ -176,16 +177,14 @@ xenbus_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
 }
 
 static int
-xenbus_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+xs_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
-	struct xenbus_dev_data *u;
+	struct xs_dev_data *u;
 
-	if (xen_store_evtchn == 0)
-		return (ENOENT);
 #if 0 /* XXX figure out if equiv needed */
 	nonseekable_open(inode, filp);
 #endif
-	u = malloc(sizeof(*u), M_DEVBUF, M_WAITOK|M_ZERO);
+	u = malloc(sizeof(*u), M_XENSTORE, M_WAITOK|M_ZERO);
 	LIST_INIT(&u->transactions);
         dev->si_drv1 = u;
 
@@ -193,37 +192,33 @@ xenbus_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 }
 
 static int
-xenbus_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+xs_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
 {
-	struct xenbus_dev_data *u = dev->si_drv1;
-	struct xenbus_dev_transaction *trans, *tmp;
+	struct xs_dev_data *u = dev->si_drv1;
+	struct xs_dev_transaction *trans, *tmp;
 
 	LIST_FOREACH_SAFE(trans, &u->transactions, list, tmp) {
-		xenbus_transaction_end(trans->handle, 1);
+		xs_transaction_end(trans->handle, 1);
 		LIST_REMOVE(trans, list);
-		free(trans, M_DEVBUF);
+		free(trans, M_XENSTORE);
 	}
 
-	free(u, M_DEVBUF);
+	free(u, M_XENSTORE);
 	return (0);
 }
 
-static struct cdevsw xenbus_dev_cdevsw = {
+static struct cdevsw xs_dev_cdevsw = {
 	.d_version = D_VERSION,	
-	.d_read = xenbus_dev_read,
-	.d_write = xenbus_dev_write,
-	.d_open = xenbus_dev_open,
-	.d_close = xenbus_dev_close,
-	.d_name = "xenbus_dev",
+	.d_read = xs_dev_read,
+	.d_write = xs_dev_write,
+	.d_open = xs_dev_open,
+	.d_close = xs_dev_close,
+	.d_name = "xs_dev",
 };
 
-static int
-xenbus_dev_sysinit(void)
+void
+xs_dev_init(void)
 {
-	make_dev(&xenbus_dev_cdevsw, 0, UID_ROOT, GID_WHEEL, 0400,
-	    "xen/xenbus");
-
-	return (0);
+	make_dev(&xs_dev_cdevsw, 0, UID_ROOT, GID_WHEEL, 0400,
+	    "xen/xenstore");
 }
-SYSINIT(xenbus_dev_sysinit, SI_SUB_DRIVERS, SI_ORDER_MIDDLE,
-    xenbus_dev_sysinit, NULL);
diff --git a/sys/xen/xenstore/xenstore_internal.h b/sys/xen/xenstore/xenstore_internal.h
new file mode 100644
index 00000000000..0398aef708a
--- /dev/null
+++ b/sys/xen/xenstore/xenstore_internal.h
@@ -0,0 +1,39 @@
+/*-
+ * Core definitions and data structures shareable across OS platforms.
+ *
+ * Copyright (c) 2010 Spectra Logic Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD$
+ */
+
+/* Initialize support for userspace access to the XenStore. */
+void xs_dev_init(void);
+
+/* Used by the XenStore character device to borrow kernel's store connection. */
+int xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result);
diff --git a/sys/xen/xenstore/xenstorevar.h b/sys/xen/xenstore/xenstorevar.h
new file mode 100644
index 00000000000..df41e3127cc
--- /dev/null
+++ b/sys/xen/xenstore/xenstorevar.h
@@ -0,0 +1,338 @@
+/******************************************************************************
+ * xenstorevar.h
+ *
+ * Method declarations and structures for accessing the XenStore.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 XenSource Ltd.
+ * Copyright (C) 2009,2010 Spectra Logic Corporation
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XEN_XENSTORE_XENSTOREVAR_H
+#define _XEN_XENSTORE_XENSTOREVAR_H
+
+#include <sys/queue.h>
+#include <sys/bus.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/sbuf.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen/xen-os.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/xenbus.h>
+#include <xen/interface/io/xs_wire.h>
+
+#include "xenbus_if.h"
+
+/* XenStore allocations including XenStore data returned to clients. */
+MALLOC_DECLARE(M_XENSTORE);
+
+struct xenstore_domain_interface;
+struct xs_watch;
+extern struct xenstore_domain_interface *xen_store;
+
+typedef	void (xs_watch_cb_t)(struct xs_watch *,
+				   const char **vec, unsigned int len);
+
+/* Register callback to watch subtree (node) in the XenStore. */
+struct xs_watch
+{
+	LIST_ENTRY(xs_watch) list;
+
+	/* Path being watched. */
+	char *node;
+
+	/* Callback (executed in a process context with no locks held). */
+	xs_watch_cb_t *callback;
+};
+LIST_HEAD(xs_watch_list, xs_watch);
+
+typedef int (*xs_event_handler_t)(void *);
+
+struct xs_transaction
+{
+	uint32_t id;
+};
+
+#define XST_NIL ((struct xs_transaction) { 0 })
+
+/**
+ * Fetch the contents of a directory in the XenStore.
+ *
+ * \param t       The XenStore transaction covering this request.
+ * \param dir     The dirname of the path to read.
+ * \param node    The basename of the path to read.
+ * \param num     The returned number of directory entries.
+ * \param result  An array of directory entry strings.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * \note The results buffer is malloced and should be free'd by the
+ *       caller with 'free(*result, M_XENSTORE)'.
+ */
+int xs_directory(struct xs_transaction t, const char *dir,
+    const char *node, unsigned int *num, const char ***result);
+
+/**
+ * Determine if a path exists in the XenStore.
+ *
+ * \param t       The XenStore transaction covering this request.
+ * \param dir     The dirname of the path to read.
+ * \param node    The basename of the path to read.
+ *
+ * \retval 1  The path exists.
+ * \retval 0  The path does not exist or an error occurred attempting
+ *            to make that determination.
+ */
+int xs_exists(struct xs_transaction t, const char *dir, const char *node);
+
+/**
+ * Get the contents of a single "file".  Returns the contents in
+ * *result which should be freed with free(*result, M_XENSTORE) after
+ * use.  The length of the value in bytes is returned in *len.
+ *
+ * \param t       The XenStore transaction covering this request.
+ * \param dir     The dirname of the file to read.
+ * \param node    The basename of the file to read.
+ * \param len     The amount of data read.
+ * \param result  The returned contents from this file.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ *
+ * \note The results buffer is malloced and should be free'd by the
+ *       caller with 'free(*result, M_XENSTORE)'.
+ */
+int xs_read(struct xs_transaction t, const char *dir,
+    const char *node, unsigned int *len, void **result);
+
+/**
+ * Write to a single file.
+ *
+ * \param t       The XenStore transaction covering this request.
+ * \param dir     The dirname of the file to write.
+ * \param node    The basename of the file to write.
+ * \param string  The NUL terminated string of data to write.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xs_write(struct xs_transaction t, const char *dir,
+    const char *node, const char *string);
+
+/**
+ * Create a new directory.
+ *
+ * \param t       The XenStore transaction covering this request.
+ * \param dir     The dirname of the directory to create.
+ * \param node    The basename of the directory to create.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xs_mkdir(struct xs_transaction t, const char *dir,
+    const char *node);
+
+/**
+ * Remove a file or directory (directories must be empty).
+ *
+ * \param t       The XenStore transaction covering this request.
+ * \param dir     The dirname of the directory to remove.
+ * \param node    The basename of the directory to remove.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xs_rm(struct xs_transaction t, const char *dir, const char *node);
+
+/**
+ * Destroy a tree of files rooted at dir/node.
+ *
+ * \param t       The XenStore transaction covering this request.
+ * \param dir     The dirname of the directory to remove.
+ * \param node    The basename of the directory to remove.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xs_rm_tree(struct xs_transaction t, const char *dir,
+    const char *node);
+
+/**
+ * Start a transaction.
+ *
+ * Changes by others will not be seen during the lifetime of this
+ * transaction, and changes will not be visible to others until it
+ * is committed (xs_transaction_end).
+ *
+ * \param t  The returned transaction.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xs_transaction_start(struct xs_transaction *t);
+
+/**
+ * End a transaction.
+ *
+ * \param t      The transaction to end/commit.
+ * \param abort  If non-zero, the transaction is discarded
+ * 		 instead of committed.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xs_transaction_end(struct xs_transaction t, int abort);
+
+/**
+ * Single file read and scanf parsing of the result.
+ *
+ * \param t           The XenStore transaction covering this request.
+ * \param dir         The dirname of the path to read.
+ * \param node        The basename of the path to read.
+ * \param scancountp  The number of input values assigned (i.e. the result
+ *      	      of scanf).
+ * \param fmt         Scanf format string followed by a variable number of
+ *                    scanf input arguments.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of failure.
+ */
+int xs_scanf(struct xs_transaction t,
+    const char *dir, const char *node, int *scancountp, const char *fmt, ...)
+    __attribute__((format(scanf, 5, 6)));
+
+/**
+ * Printf formatted write to a XenStore file.
+ *
+ * \param t     The XenStore transaction covering this request.
+ * \param dir   The dirname of the path to write.
+ * \param node  The basename of the path to write.
+ * \param fmt   Printf format string followed by a variable number of
+ *              printf arguments.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of write failure.
+ */
+int xs_printf(struct xs_transaction t, const char *dir,
+    const char *node, const char *fmt, ...)
+    __attribute__((format(printf, 4, 5)));
+
+/**
+ * va_list version of xs_printf().
+ *
+ * \param t     The XenStore transaction covering this request.
+ * \param dir   The dirname of the path to write.
+ * \param node  The basename of the path to write.
+ * \param fmt   Printf format string.
+ * \param ap    Va_list of printf arguments.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of write failure.
+ */
+int xs_vprintf(struct xs_transaction t, const char *dir,
+    const char *node, const char *fmt, va_list ap);
+
+/**
+ * Multi-file read within a single directory and scanf parsing of
+ * the results.
+ *
+ * \param t    The XenStore transaction covering this request.
+ * \param dir  The dirname of the paths to read.
+ * \param ...  A variable number of argument triples specifying
+ *             the file name, scanf-style format string, and
+ *             output variable (pointer to storage of the results).
+ *             The last triple in the call must be terminated
+ *             with a final NULL argument.  A NULL format string
+ *             will cause the entire contents of the given file
+ *             to be assigned as a NUL terminated, M_XENSTORE heap
+ *             backed, string to the output parameter of that tuple.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of read failure.
+ *
+ * Example:
+ *         char protocol_abi[64];
+ *         uint32_t ring_ref;
+ *         char *dev_type;
+ *         int error;
+ *
+ *         error = xs_gather(XST_NIL, xenbus_get_node(dev),
+ *             "ring-ref", "%" PRIu32, &ring_ref,
+ *             "protocol", "%63s", protocol_abi,
+ *             "device-type", NULL, &dev_type,
+ *             NULL);
+ *
+ *         ...
+ *
+ *         free(dev_type, M_XENSTORE);
+ */
+int xs_gather(struct xs_transaction t, const char *dir, ...);
+
+/**
+ * Register a XenStore watch.
+ *
+ * XenStore watches allow a client to be notified via a callback (embedded
+ * within the watch object) of changes to an object in the XenStore.
+ *
+ * \param watch  An xs_watch struct with its node and callback fields
+ *               properly initialized.
+ *
+ * \return  On success, 0. Otherwise an errno value indicating the
+ *          type of write failure.  EEXIST errors from the XenStore
+ *          are suppressed, allowing multiple, physically different,
+ *          xs_watch objects to watch the same path in the XenStore.
+ */
+int xs_register_watch(struct xs_watch *watch);
+ 
+/**
+ * Unregister a XenStore watch.
+ *
+ * \param watch  An xs_watch object previously used in a successful call
+ *		 to xs_register_watch().
+ *
+ * The xs_watch object's node field is not altered by this call.
+ * It is the caller's responsibility to properly dispose of both the
+ * watch object and the data pointed to by watch->node.
+ */
+void xs_unregister_watch(struct xs_watch *watch);
+
+/**
+ * Allocate and return an sbuf containing the XenStore path string
+ * <dir>/<name>.  If name is the NUL string, the returned sbuf contains
+ * the path string <dir>.
+ *
+ * \param dir	The NUL terminated directory prefix for new path.
+ * \param name  The NUL terminated basename for the new path.
+ *
+ * \return  A buffer containing the joined path.
+ */
+struct sbuf *xs_join(const char *, const char *);
+
+#endif /* _XEN_XENSTORE_XENSTOREVAR_H */

From d598b626c04a8b0f83ec16d19f33a759918ff870 Mon Sep 17 00:00:00 2001
From: Pyun YongHyeon <yongari@FreeBSD.org>
Date: Tue, 19 Oct 2010 23:04:23 +0000
Subject: [PATCH 03/68] Add workaround for BCM5906 controller silicon bug. If
 device receive two back-to-back send BDs with less than or equal to 8 total
 bytes then the device may hang. The two back-to-back send BDs must be in the
 same frame for this failure to occur. Thanks to davidch for detailed errata
 information.

Reviewed by:	davidch
---
 sys/dev/bge/if_bge.c    | 43 +++++++++++++++++++++++++++++++++++++++++
 sys/dev/bge/if_bgereg.h |  1 +
 2 files changed, 44 insertions(+)

diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c
index 662713fe5b2..5380931ad74 100644
--- a/sys/dev/bge/if_bge.c
+++ b/sys/dev/bge/if_bge.c
@@ -374,6 +374,7 @@ static void bge_tick(void *);
 static void bge_stats_clear_regs(struct bge_softc *);
 static void bge_stats_update(struct bge_softc *);
 static void bge_stats_update_regs(struct bge_softc *);
+static struct mbuf *bge_check_short_dma(struct mbuf *);
 static struct mbuf *bge_setup_tso(struct bge_softc *, struct mbuf *,
     uint16_t *);
 static int bge_encap(struct bge_softc *, struct mbuf **, uint32_t *);
@@ -2633,6 +2634,8 @@ bge_attach(device_t dev)
 	case BGE_ASICREV_BCM5752:
 	case BGE_ASICREV_BCM5906:
 		sc->bge_flags |= BGE_FLAG_575X_PLUS;
+		if (sc->bge_asicrev == BGE_ASICREV_BCM5906)
+			sc->bge_flags |= BGE_FLAG_SHORT_DMA_BUG;
 		/* FALLTHROUGH */
 	case BGE_ASICREV_BCM5705:
 		sc->bge_flags |= BGE_FLAG_5705_PLUS;
@@ -4059,6 +4062,39 @@ bge_cksum_pad(struct mbuf *m)
 	return (0);
 }
 
+static struct mbuf *
+bge_check_short_dma(struct mbuf *m)
+{
+	struct mbuf *n;
+	int found;
+
+	/*
+	 * If the device receives two back-to-back send BDs with less
+	 * than or equal to 8 total bytes then the device may hang.  The
+	 * two back-to-back send BDs must be in the same frame for this
+	 * failure to occur.  Scan the mbuf chain for two consecutive
+	 * mbufs shorter than 8 bytes each.  If found, allocate a new
+	 * mbuf and copy the frame to work around the silicon bug.
+	 */
+	for (n = m, found = 0; n != NULL; n = n->m_next) {
+		if (n->m_len < 8) {
+			found++;
+			if (found > 1)
+				break;
+			continue;
+		}
+		found = 0;	/* run of short mbufs ended */
+	}
+
+	if (found > 1) {
+		n = m_defrag(m, M_DONTWAIT);
+		if (n == NULL)
+			m_freem(m);	/* defrag failed; return NULL */
+	} else
+		n = m;
+	return (n);
+}
+
 static struct mbuf *
 bge_setup_tso(struct bge_softc *sc, struct mbuf *m, uint16_t *mss)
 {
@@ -4132,6 +4168,13 @@ bge_encap(struct bge_softc *sc, struct mbuf **m_head, uint32_t *txidx)
 	csum_flags = 0;
 	mss = 0;
 	vlan_tag = 0;
+	if ((sc->bge_flags & BGE_FLAG_SHORT_DMA_BUG) != 0 &&
+	    m->m_next != NULL) {
+		*m_head = bge_check_short_dma(m);
+		if (*m_head == NULL)
+			return (ENOBUFS);
+		m = *m_head;
+	}
 	if ((m->m_pkthdr.csum_flags & CSUM_TSO) != 0) {
 		*m_head = m = bge_setup_tso(sc, m, &mss);
 		if (*m_head == NULL)
diff --git a/sys/dev/bge/if_bgereg.h b/sys/dev/bge/if_bgereg.h
index 60a39f88fad..8e337c693fb 100644
--- a/sys/dev/bge/if_bgereg.h
+++ b/sys/dev/bge/if_bgereg.h
@@ -2727,6 +2727,7 @@ struct bge_softc {
 #define	BGE_FLAG_40BIT_BUG	0x01000000
 #define	BGE_FLAG_4G_BNDRY_BUG	0x02000000
 #define	BGE_FLAG_RX_ALIGNBUG	0x04000000
+#define	BGE_FLAG_SHORT_DMA_BUG	0x08000000
 	uint32_t		bge_phy_flags;
 #define	BGE_PHY_WIRESPEED	0x00000001
 #define	BGE_PHY_ADC_BUG		0x00000002

From 69b5727f163ea2639f4e207e515c0f406179eec7 Mon Sep 17 00:00:00 2001
From: Pyun YongHyeon <yongari@FreeBSD.org>
Date: Wed, 20 Oct 2010 00:19:25 +0000
Subject: [PATCH 04/68] Correct handling of shared interrupt in sis_intr().
 r212116 incorrectly released the driver lock in the shared interrupt case
 without returning, which caused a panic. While I'm here check whether the
 driver is still running before serving TX/RX handlers.

Reported by:	Jerahmy Pocott < QUAKENET1 <> optusnet dot com dot au >
Tested by:	Jerahmy Pocott < QUAKENET1 <> optusnet dot com dot au >
MFC after:	3 days
---
 sys/dev/sis/if_sis.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/sys/dev/sis/if_sis.c b/sys/dev/sis/if_sis.c
index 93246d33c44..0957419a56f 100644
--- a/sys/dev/sis/if_sis.c
+++ b/sys/dev/sis/if_sis.c
@@ -1795,12 +1795,15 @@ sis_intr(void *arg)
 	if ((status & SIS_INTRS) == 0) {
 		/* Not ours. */
 		SIS_UNLOCK(sc);
+		return;
 	}
 
 	/* Disable interrupts. */
 	CSR_WRITE_4(sc, SIS_IER, 0);
 
 	for (;(status & SIS_INTRS) != 0;) {
+		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+			break;
 		if (status &
 		    (SIS_ISR_TX_DESC_OK | SIS_ISR_TX_ERR |
 		    SIS_ISR_TX_OK | SIS_ISR_TX_IDLE) )
@@ -1825,11 +1828,13 @@ sis_intr(void *arg)
 		status = CSR_READ_4(sc, SIS_ISR);
 	}
 
-	/* Re-enable interrupts. */
-	CSR_WRITE_4(sc, SIS_IER, 1);
+	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+		/* Re-enable interrupts. */
+		CSR_WRITE_4(sc, SIS_IER, 1);
 
-	if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
-		sis_startl(ifp);
+		if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+			sis_startl(ifp);
+	}
 
 	SIS_UNLOCK(sc);
 }

From cfca8a186287d7920036f3b83fb41cf157c42dd1 Mon Sep 17 00:00:00 2001
From: David Xu <davidxu@FreeBSD.org>
Date: Wed, 20 Oct 2010 00:41:38 +0000
Subject: [PATCH 05/68] - Don't include sx.h, it is not needed. - Check NULL
 pointer, move timeout calculation code outside of   process lock.

---
 sys/kern/kern_thr.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
index 76270271fd8..3a9c721dddc 100644
--- a/sys/kern/kern_thr.c
+++ b/sys/kern/kern_thr.c
@@ -45,7 +45,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
-#include <sys/sx.h>
 #include <sys/ucontext.h>
 #include <sys/thr.h>
 #include <sys/rtprio.h>
@@ -431,40 +430,40 @@ thr_suspend(struct thread *td, struct thr_suspend_args *uap)
 int
 kern_thr_suspend(struct thread *td, struct timespec *tsp)
 {
+	struct proc *p = td->td_proc;
 	struct timeval tv;
 	int error = 0;
 	int timo = 0;
 
-	if (tsp != NULL) {
-		if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000)
-			return (EINVAL);
-	}
-
 	if (td->td_pflags & TDP_WAKEUP) {
 		td->td_pflags &= ~TDP_WAKEUP;
 		return (0);
 	}
 
-	PROC_LOCK(td->td_proc);
-	if ((td->td_flags & TDF_THRWAKEUP) == 0) {
+	if (tsp != NULL) {
+		if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000)
+			return (EINVAL);
 		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
 			error = EWOULDBLOCK;
 		else {
 			TIMESPEC_TO_TIMEVAL(&tv, tsp);
 			timo = tvtohz(&tv);
-			error = msleep((void *)td, &td->td_proc->p_mtx,
-				 PCATCH, "lthr", timo);
 		}
 	}
 
+	PROC_LOCK(p);
+	if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0)
+		error = msleep((void *)td, &p->p_mtx,
+			 PCATCH, "lthr", timo);
+
 	if (td->td_flags & TDF_THRWAKEUP) {
 		thread_lock(td);
 		td->td_flags &= ~TDF_THRWAKEUP;
 		thread_unlock(td);
-		PROC_UNLOCK(td->td_proc);
+		PROC_UNLOCK(p);
 		return (0);
 	}
-	PROC_UNLOCK(td->td_proc);
+	PROC_UNLOCK(p);
 	if (error == EWOULDBLOCK)
 		error = ETIMEDOUT;
 	else if (error == ERESTART) {

From de1e74c6a5c57a5f9d4c8547fd47c18328f0e4c3 Mon Sep 17 00:00:00 2001
From: David Xu <davidxu@FreeBSD.org>
Date: Wed, 20 Oct 2010 02:34:02 +0000
Subject: [PATCH 06/68] Revert revision 214007, I realized that MySQL wants to
 resolve a silly rwlock deadlock problem, the deadlock is caused by writer
 waiters, if a thread has already locked a reader lock, and wants to acquire
 another reader lock, it will be blocked by writer waiters, but we had already
 fixed it years ago.

---
 lib/libc/include/namespace.h       |  2 --
 lib/libc/include/un-namespace.h    |  2 --
 lib/libthr/pthread.map             |  4 ----
 lib/libthr/thread/thr_private.h    |  3 ---
 lib/libthr/thread/thr_rwlock.c     | 25 ++-----------------------
 lib/libthr/thread/thr_rwlockattr.c | 22 ----------------------
 6 files changed, 2 insertions(+), 56 deletions(-)

diff --git a/lib/libc/include/namespace.h b/lib/libc/include/namespace.h
index 590eba7ea27..6ba8bab6155 100644
--- a/lib/libc/include/namespace.h
+++ b/lib/libc/include/namespace.h
@@ -177,10 +177,8 @@
 #define		pthread_rwlock_unlock		_pthread_rwlock_unlock
 #define		pthread_rwlock_wrlock		_pthread_rwlock_wrlock
 #define		pthread_rwlockattr_destroy	_pthread_rwlockattr_destroy
-#define		pthread_rwlockattr_getkind_np	_pthread_rwlockattr_getkind_np
 #define		pthread_rwlockattr_getpshared	_pthread_rwlockattr_getpshared
 #define		pthread_rwlockattr_init		_pthread_rwlockattr_init
-#define		pthread_rwlockattr_setkind_np	_pthread_rwlockattr_setkind_np
 #define		pthread_rwlockattr_setpshared	_pthread_rwlockattr_setpshared
 #define		pthread_self			_pthread_self
 #define		pthread_set_name_np		_pthread_set_name_np
diff --git a/lib/libc/include/un-namespace.h b/lib/libc/include/un-namespace.h
index 7a8f9d9b29f..00f0df27480 100644
--- a/lib/libc/include/un-namespace.h
+++ b/lib/libc/include/un-namespace.h
@@ -158,10 +158,8 @@
 #undef		pthread_rwlock_unlock
 #undef		pthread_rwlock_wrlock
 #undef		pthread_rwlockattr_destroy
-#undef		pthread_rwlockattr_getkind_np
 #undef		pthread_rwlockattr_getpshared
 #undef		pthread_rwlockattr_init
-#undef		pthread_rwlockattr_setkind_np
 #undef		pthread_rwlockattr_setpshared
 #undef		pthread_self
 #undef		pthread_set_name_np
diff --git a/lib/libthr/pthread.map b/lib/libthr/pthread.map
index 40940162901..5e36239e8e2 100644
--- a/lib/libthr/pthread.map
+++ b/lib/libthr/pthread.map
@@ -318,9 +318,7 @@ FBSDprivate_1.0 {
 	_pthread_rwlock_wrlock;
 	_pthread_rwlockattr_destroy;
 	_pthread_rwlockattr_getpshared;
-	_pthread_rwlockattr_getkind_np;
 	_pthread_rwlockattr_init;
-	_pthread_rwlockattr_setkind_np;
 	_pthread_rwlockattr_setpshared;
 	_pthread_self;
 	_pthread_set_name_np;
@@ -403,8 +401,6 @@ FBSD_1.1 {
 
 FBSD_1.2 {
 	openat;
-	pthread_rwlockattr_getkind_np;
-	pthread_rwlockattr_setkind_np;
 	setcontext;
 	swapcontext;
 };
diff --git a/lib/libthr/thread/thr_private.h b/lib/libthr/thread/thr_private.h
index 5d0c301cd04..aa9feefcf7d 100644
--- a/lib/libthr/thread/thr_private.h
+++ b/lib/libthr/thread/thr_private.h
@@ -285,14 +285,11 @@ struct pthread_prio {
 
 struct pthread_rwlockattr {
 	int		pshared;
-	int		kind;
 };
 
 struct pthread_rwlock {
 	struct urwlock 	lock;
 	struct pthread	*owner;
-	int	recurse;
-	int	kind;
 };
 
 /*
diff --git a/lib/libthr/thread/thr_rwlock.c b/lib/libthr/thread/thr_rwlock.c
index 20b9b79e31c..ebdeae7e611 100644
--- a/lib/libthr/thread/thr_rwlock.c
+++ b/lib/libthr/thread/thr_rwlock.c
@@ -63,19 +63,13 @@ __weak_reference(_pthread_rwlock_timedwrlock, pthread_rwlock_timedwrlock);
  */
 
 static int
-rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr)
+rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr __unused)
 {
 	pthread_rwlock_t prwlock;
 
 	prwlock = (pthread_rwlock_t)calloc(1, sizeof(struct pthread_rwlock));
 	if (prwlock == NULL)
 		return (ENOMEM);
-	if (attr != NULL)
-		prwlock->kind = (*attr)->kind;
-	else
-		prwlock->kind = PTHREAD_RWLOCK_DEFAULT_NP;
-	if (prwlock->kind == PTHREAD_RWLOCK_PREFER_READER_NP)
-		prwlock->lock.rw_flags |= URWLOCK_PREFER_READER;
 	*rwlock = prwlock;
 	return (0);
 }
@@ -118,7 +112,7 @@ init_static(struct pthread *thread, pthread_rwlock_t *rwlock)
 }
 
 int
-_pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr)
+_pthread_rwlock_init (pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr)
 {
 	*rwlock = NULL;
 	return (rwlock_init(rwlock, attr));
@@ -266,14 +260,6 @@ rwlock_wrlock_common (pthread_rwlock_t *rwlock, const struct timespec *abstime)
 
 	CHECK_AND_INIT_RWLOCK
 
-	if (__predict_false(prwlock->owner == curthread)) {
-		if (__predict_false(
-			prwlock->kind == PTHREAD_RWLOCK_PREFER_WRITER_NP)) {
-			prwlock->recurse++;
-			return (0);
-		}
-	}
-
 	/*
 	 * POSIX said the validity of the abstimeout parameter need
 	 * not be checked if the lock can be immediately acquired.
@@ -349,13 +335,6 @@ _pthread_rwlock_unlock (pthread_rwlock_t *rwlock)
 	if (state & URWLOCK_WRITE_OWNER) {
 		if (__predict_false(prwlock->owner != curthread))
 			return (EPERM);
-		if (__predict_false(
-			prwlock->kind == PTHREAD_RWLOCK_PREFER_WRITER_NP)) {
-			if (prwlock->recurse > 0) {
-				prwlock->recurse--;
-				return (0);
-			}
-		}
 		prwlock->owner = NULL;
 	}
 
diff --git a/lib/libthr/thread/thr_rwlockattr.c b/lib/libthr/thread/thr_rwlockattr.c
index 23ea73e9b92..73ccdc90597 100644
--- a/lib/libthr/thread/thr_rwlockattr.c
+++ b/lib/libthr/thread/thr_rwlockattr.c
@@ -36,10 +36,8 @@
 
 __weak_reference(_pthread_rwlockattr_destroy, pthread_rwlockattr_destroy);
 __weak_reference(_pthread_rwlockattr_getpshared, pthread_rwlockattr_getpshared);
-__weak_reference(_pthread_rwlockattr_getkind_np, pthread_rwlockattr_getkind_np);
 __weak_reference(_pthread_rwlockattr_init, pthread_rwlockattr_init);
 __weak_reference(_pthread_rwlockattr_setpshared, pthread_rwlockattr_setpshared);
-__weak_reference(_pthread_rwlockattr_setkind_np, pthread_rwlockattr_setkind_np);
 
 int
 _pthread_rwlockattr_destroy(pthread_rwlockattr_t *rwlockattr)
@@ -83,7 +81,6 @@ _pthread_rwlockattr_init(pthread_rwlockattr_t *rwlockattr)
 		return(ENOMEM);
 
 	prwlockattr->pshared 	= PTHREAD_PROCESS_PRIVATE;
-	prwlockattr->kind	= PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP;
 	*rwlockattr		= prwlockattr;
 
 	return(0);
@@ -100,22 +97,3 @@ _pthread_rwlockattr_setpshared(pthread_rwlockattr_t *rwlockattr, int pshared)
 
 	return(0);
 }
-
-int
-_pthread_rwlockattr_setkind_np(pthread_rwlockattr_t *attr, int kind)
-{
-	if (kind != PTHREAD_RWLOCK_PREFER_READER_NP &&
-	    kind != PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP &&
-	    kind != PTHREAD_RWLOCK_PREFER_WRITER_NP) {
-		return (EINVAL);
-	}
-	(*attr)->kind = kind;
-	return (0);
-}
-
-int
-_pthread_rwlockattr_getkind_np(const pthread_rwlockattr_t *attr, int *kind)
-{
-	*kind = (*attr)->kind;
-	return (0);
-}

From 7e54c6db1f28f7daca4a4e988a7fa6626253f8e6 Mon Sep 17 00:00:00 2001
From: Maxim Konovalov <maxim@FreeBSD.org>
Date: Wed, 20 Oct 2010 04:53:03 +0000
Subject: [PATCH 07/68] o Put missed w/space back.

Submitted by:	Garrett Cooper
MFC after:	3 days
---
 share/man/man5/device.hints.5 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/share/man/man5/device.hints.5 b/share/man/man5/device.hints.5
index f7a5e97a0f2..9dcaccd71be 100644
--- a/share/man/man5/device.hints.5
+++ b/share/man/man5/device.hints.5
@@ -161,7 +161,7 @@ hint.acpi.0.disabled="1"
 .Sh SEE ALSO
 .Xr kenv 1 ,
 .Xr loader.conf 5 ,
-.Xr loader 8,
+.Xr loader 8 ,
 .Xr resource_int_value 9 .
 .Sh HISTORY
 The

From 55144670c2d7dbb2bc047f275e4d2a75fbb9752a Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Wed, 20 Oct 2010 05:17:23 +0000
Subject: [PATCH 08/68] PG_BUSY -> VPO_BUSY, PG_WANTED -> VPO_WANTED in manual
 pages and comments

Reviewed by:	alc
MFC after:	4 days
---
 share/man/man9/vm_page_io.9         | 4 ++--
 share/man/man9/vm_page_sleep_busy.9 | 2 +-
 share/man/man9/vm_page_wakeup.9     | 8 ++++----
 sys/vm/swap_pager.c                 | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/share/man/man9/vm_page_io.9 b/share/man/man9/vm_page_io.9
index 396cb1903bb..bddb82b8ea7 100644
--- a/share/man/man9/vm_page_io.9
+++ b/share/man/man9/vm_page_io.9
@@ -52,9 +52,9 @@ function lowers the busy count on the page by one, if the resulting busy
 count is zero, a
 .Xr wakeup 9
 will be issued if the page has been marked
-.Dv PG_WANTED .
+.Dv VPO_WANTED .
 A page is typically marked
-.Dv PG_WANTED
+.Dv VPO_WANTED
 by a thread to register its interest in
 the page to either complete I/O or becoming available for general use.
 .Sh AUTHORS
diff --git a/share/man/man9/vm_page_sleep_busy.9 b/share/man/man9/vm_page_sleep_busy.9
index 45aa97761c9..6ccdf684990 100644
--- a/share/man/man9/vm_page_sleep_busy.9
+++ b/share/man/man9/vm_page_sleep_busy.9
@@ -42,7 +42,7 @@
 The
 .Fn vm_page_sleep_busy
 function waits until the
-.Dv PG_BUSY
+.Dv VPO_BUSY
 flag is cleared.
 If
 .Fa also_m_busy
diff --git a/share/man/man9/vm_page_wakeup.9 b/share/man/man9/vm_page_wakeup.9
index adb03a13c05..75f0ca8abf3 100644
--- a/share/man/man9/vm_page_wakeup.9
+++ b/share/man/man9/vm_page_wakeup.9
@@ -50,20 +50,20 @@ of a page.
 .Pp
 .Fn vm_page_busy
 sets the
-.Dv PG_BUSY
+.Dv VPO_BUSY
 flag in the page.
 .Pp
 .Fn vm_page_flash
 checks to see if there is anybody waiting on the page
-.Dv ( PG_WANTED
+.Dv ( VPO_WANTED
 will be set), and if so, clears the
-.Dv PG_WANTED
+.Dv VPO_WANTED
 flag and notifies whoever is waiting via
 .Fn wakeup .
 .Pp
 .Fn vm_page_wakeup
 clears the
-.Dv PG_BUSY
+.Dv VPO_BUSY
 flag on the page, and calls
 .Fn vm_page_flash
 in case somebody has been waiting for it.
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index b359bd47829..bea235af8b6 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -1460,8 +1460,8 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
  *	Completion routine for asynchronous reads and writes from/to swap.
  *	Also called manually by synchronous code to finish up a bp.
  *
- *	For READ operations, the pages are PG_BUSY'd.  For WRITE operations, 
- *	the pages are vm_page_t->busy'd.  For READ operations, we PG_BUSY 
+ *	For READ operations, the pages are VPO_BUSY'd.  For WRITE operations, 
+ *	the pages are vm_page_t->busy'd.  For READ operations, we VPO_BUSY 
  *	unbusy all pages except the 'main' request page.  For WRITE 
  *	operations, we vm_page_t->busy'd unbusy all pages ( we can do this 
  *	because we marked them all VM_PAGER_PEND on return from putpages ).

From c3fdd2de24090ab546a4c822e429e9d1dd2cde77 Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Wed, 20 Oct 2010 05:57:54 +0000
Subject: [PATCH 09/68] VOP_GETPAGES.9: clarify and correct description of
 parameters and requirements

In cooperation with alc and kib, who provided valuable insights and
suggestions.

Reviewed by:	alc, kib (earlier version)
MFC after:	4 days
---
 share/man/man9/VOP_GETPAGES.9 | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/share/man/man9/VOP_GETPAGES.9 b/share/man/man9/VOP_GETPAGES.9
index ab0afe8be09..e490862045f 100644
--- a/share/man/man9/VOP_GETPAGES.9
+++ b/share/man/man9/VOP_GETPAGES.9
@@ -41,9 +41,9 @@
 .In sys/vnode.h
 .In vm/vm.h
 .Ft int
-.Fn VOP_GETPAGES "struct vnode *vp" "vm_page_t *m" "int count" "int reqpage" "vm_ooffset_t offset"
+.Fn VOP_GETPAGES "struct vnode *vp" "vm_page_t *ma" "int count" "int reqpage" "vm_ooffset_t offset"
 .Ft int
-.Fn VOP_PUTPAGES "struct vnode *vp" "vm_page_t *m" "int count" "int sync" "int *rtvals" "vm_ooffset_t offset"
+.Fn VOP_PUTPAGES "struct vnode *vp" "vm_page_t *ma" "int count" "int sync" "int *rtvals" "vm_ooffset_t offset"
 .Sh DESCRIPTION
 The
 .Fn VOP_GETPAGES
@@ -66,11 +66,11 @@ The arguments are:
 .Bl -tag -width reqpage
 .It Fa vp
 The file to access.
-.It Fa m
-Pointer to the first element of an array of contiguous pages representing a
+.It Fa ma
+Pointer to the first element of an array of pages representing a
 contiguous region of the file to be read or written.
 .It Fa count
-The number of pages in the array.
+The number of bytes that should be read into the pages of the array.
 .It Fa sync
 .Dv VM_PAGER_PUT_SYNC
 if the write should be synchronous.
@@ -123,22 +123,27 @@ The page was not handled by this request.
 The
 .Fn VOP_GETPAGES
 method is expected to release any pages in
-.Fa m
+.Fa ma
 that it does not successfully handle, by calling
 .Xr vm_page_free 9 .
 When it succeeds,
 .Fn VOP_GETPAGES
-must set the valid bits appropriately, clear the dirty bit
-(using
-.Xr vm_page_undirty 9 ) ,
-either activate the page (if its wanted bit is set)
+must set the valid bits appropriately.
+.Fn VOP_GETPAGES
+must keep
+.Fa reqpage
+busy.
+It must unbusy all other successfully handled pages and put them
+on appropriate page queue(s).
+For example,
+.Fn VOP_GETPAGES
+may either activate a page (if its wanted bit is set)
 or deactivate it (otherwise), and finally call
 .Xr vm_page_wakeup 9
-to arouse any threads currently waiting for the page to be faulted in,
-for each page read.
+to arouse any threads currently waiting for the page to be faulted in.
 .Sh RETURN VALUES
 If it successfully reads
-.Fa m[reqpage] ,
+.Fa ma[reqpage] ,
 .Fn VOP_GETPAGES
 returns
 .Dv VM_PAGER_OK ;

From 7f0ab7f026fb3a55ed0cb24f9ff9558eb9b694bc Mon Sep 17 00:00:00 2001
From: Andriy Gapon <avg@FreeBSD.org>
Date: Wed, 20 Oct 2010 06:29:11 +0000
Subject: [PATCH 10/68] catch up manual pages with rename of vm_page_sleep_busy
 to vm_page_sleep_if_busy

Suggested by:	alc
MFC after:	4 days
---
 ObsoleteFiles.inc                                    |  2 ++
 share/man/man9/Makefile                              |  2 +-
 ...{vm_page_sleep_busy.9 => vm_page_sleep_if_busy.9} | 12 ++++++------
 share/man/man9/vm_page_wakeup.9                      |  2 +-
 4 files changed, 10 insertions(+), 8 deletions(-)
 rename share/man/man9/{vm_page_sleep_busy.9 => vm_page_sleep_if_busy.9} (90%)

diff --git a/ObsoleteFiles.inc b/ObsoleteFiles.inc
index 9d60d54e420..ba0f33f7d94 100644
--- a/ObsoleteFiles.inc
+++ b/ObsoleteFiles.inc
@@ -14,6 +14,8 @@
 # The file is partitioned: OLD_FILES first, then OLD_LIBS and OLD_DIRS last.
 #
 
+# 20101020: catch up with vm_page_sleep_if_busy rename
+OLD_FILES+=usr/share/man/man9/vm_page_sleep_busy.9.gz
 # 20101011: removed subblock.h from liblzma
 OLD_FILES+=usr/include/lzma/subblock.h
 # 20101002: removed manpath.config
diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
index 261be4f2aac..79ba70b83b3 100644
--- a/share/man/man9/Makefile
+++ b/share/man/man9/Makefile
@@ -324,7 +324,7 @@ MAN=	accept_filter.9 \
 	vm_page_lookup.9 \
 	vm_page_protect.9 \
 	vm_page_rename.9 \
-	vm_page_sleep_busy.9 \
+	vm_page_sleep_if_busy.9 \
 	vm_page_wakeup.9 \
 	vm_page_wire.9 \
 	vm_page_zero_fill.9 \
diff --git a/share/man/man9/vm_page_sleep_busy.9 b/share/man/man9/vm_page_sleep_if_busy.9
similarity index 90%
rename from share/man/man9/vm_page_sleep_busy.9
rename to share/man/man9/vm_page_sleep_if_busy.9
index 6ccdf684990..850979f2db8 100644
--- a/share/man/man9/vm_page_sleep_busy.9
+++ b/share/man/man9/vm_page_sleep_if_busy.9
@@ -27,20 +27,20 @@
 .\" $FreeBSD$
 .\"
 .Dd July 13, 2001
-.Dt VM_PAGE_SLEEP_BUSY 9
+.Dt VM_PAGE_SLEEP_IF_BUSY 9
 .Os
 .Sh NAME
-.Nm vm_page_sleep_busy
+.Nm vm_page_sleep_if_busy
 .Nd "wait for a busy page to become unbusy"
 .Sh SYNOPSIS
 .In sys/param.h
 .In vm/vm.h
 .In vm/vm_page.h
 .Ft int
-.Fn vm_page_sleep_busy "vm_page_t m" "int also_m_busy" "const char *wmesg"
+.Fn vm_page_sleep_if_busy "vm_page_t m" "int also_m_busy" "const char *wmesg"
 .Sh DESCRIPTION
 The
-.Fn vm_page_sleep_busy
+.Fn vm_page_sleep_if_busy
 function waits until the
 .Dv VPO_BUSY
 flag is cleared.
@@ -51,7 +51,7 @@ is non-zero, it also waits for
 to become zero.
 .Sh RETURN VALUES
 If
-.Fn vm_page_sleep_busy
+.Fn vm_page_sleep_if_busy
 finds the page busy it returns
 .Dv TRUE .
 If not, it returns
@@ -59,7 +59,7 @@ If not, it returns
 Returning
 .Dv TRUE
 does not necessary mean that
-.Fn vm_page_sleep_busy
+.Fn vm_page_sleep_if_busy
 slept, but only that
 .Fn splvm
 was called.
diff --git a/share/man/man9/vm_page_wakeup.9 b/share/man/man9/vm_page_wakeup.9
index 75f0ca8abf3..5908b62001b 100644
--- a/share/man/man9/vm_page_wakeup.9
+++ b/share/man/man9/vm_page_wakeup.9
@@ -68,7 +68,7 @@ flag on the page, and calls
 .Fn vm_page_flash
 in case somebody has been waiting for it.
 .Sh SEE ALSO
-.Xr vm_page_sleep_busy 9 ,
+.Xr vm_page_sleep_if_busy 9 ,
 .Xr wakeup 9
 .Sh AUTHORS
 This manual page was written by

From c0609c547a08526dcff7729adfae2d7002846c34 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 20 Oct 2010 07:22:34 +0000
Subject: [PATCH 11/68] Some style cleanup: - remove commented debugging code;
 - wrap long lines.

---
 sys/dev/mvs/mvs.c     | 79 ++++++++++++++++++++-----------------------
 sys/dev/mvs/mvs_pci.c |  1 -
 sys/dev/mvs/mvs_soc.c |  1 -
 3 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/sys/dev/mvs/mvs.c b/sys/dev/mvs/mvs.c
index 11a8853d95f..ee04f98f0f7 100644
--- a/sys/dev/mvs/mvs.c
+++ b/sys/dev/mvs/mvs.c
@@ -57,7 +57,8 @@ static int mvs_ch_deinit(device_t dev);
 static int mvs_ch_suspend(device_t dev);
 static int mvs_ch_resume(device_t dev);
 static void mvs_dmainit(device_t dev);
-static void mvs_dmasetupc_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error);
+static void mvs_dmasetupc_cb(void *xsc,
+	bus_dma_segment_t *segs, int nsegs, int error);
 static void mvs_dmafini(device_t dev);
 static void mvs_slotsalloc(device_t dev);
 static void mvs_slotsfree(device_t dev);
@@ -79,7 +80,8 @@ static void mvs_crbq_intr(device_t dev);
 static void mvs_begin_transaction(device_t dev, union ccb *ccb);
 static void mvs_legacy_execute_transaction(struct mvs_slot *slot);
 static void mvs_timeout(struct mvs_slot *slot);
-static void mvs_dmasetprd(void *arg, bus_dma_segment_t *segs, int nsegs, int error);
+static void mvs_dmasetprd(void *arg,
+	bus_dma_segment_t *segs, int nsegs, int error);
 static void mvs_requeue_frozen(device_t dev);
 static void mvs_execute_transaction(struct mvs_slot *slot);
 static void mvs_end_transaction(struct mvs_slot *slot, enum mvs_err_type et);
@@ -314,9 +316,11 @@ mvs_dmainit(device_t dev)
 	if (bus_dmamem_alloc(ch->dma.workrq_tag, (void **)&ch->dma.workrq, 0,
 	    &ch->dma.workrq_map))
 		goto error;
-	if (bus_dmamap_load(ch->dma.workrq_tag, ch->dma.workrq_map, ch->dma.workrq,
-	    MVS_WORKRQ_SIZE, mvs_dmasetupc_cb, &dcba, 0) || dcba.error) {
-		bus_dmamem_free(ch->dma.workrq_tag, ch->dma.workrq, ch->dma.workrq_map);
+	if (bus_dmamap_load(ch->dma.workrq_tag, ch->dma.workrq_map,
+	    ch->dma.workrq, MVS_WORKRQ_SIZE, mvs_dmasetupc_cb, &dcba, 0) ||
+	    dcba.error) {
+		bus_dmamem_free(ch->dma.workrq_tag,
+		    ch->dma.workrq, ch->dma.workrq_map);
 		goto error;
 	}
 	ch->dma.workrq_bus = dcba.maddr;
@@ -329,9 +333,11 @@ mvs_dmainit(device_t dev)
 	if (bus_dmamem_alloc(ch->dma.workrp_tag, (void **)&ch->dma.workrp, 0,
 	    &ch->dma.workrp_map))
 		goto error;
-	if (bus_dmamap_load(ch->dma.workrp_tag, ch->dma.workrp_map, ch->dma.workrp,
-	    MVS_WORKRP_SIZE, mvs_dmasetupc_cb, &dcba, 0) || dcba.error) {
-		bus_dmamem_free(ch->dma.workrp_tag, ch->dma.workrp, ch->dma.workrp_map);
+	if (bus_dmamap_load(ch->dma.workrp_tag, ch->dma.workrp_map,
+	    ch->dma.workrp, MVS_WORKRP_SIZE, mvs_dmasetupc_cb, &dcba, 0) ||
+	    dcba.error) {
+		bus_dmamem_free(ch->dma.workrp_tag,
+		    ch->dma.workrp, ch->dma.workrp_map);
 		goto error;
 	}
 	ch->dma.workrp_bus = dcba.maddr;
@@ -371,7 +377,8 @@ mvs_dmafini(device_t dev)
 	}
 	if (ch->dma.workrp_bus) {
 		bus_dmamap_unload(ch->dma.workrp_tag, ch->dma.workrp_map);
-		bus_dmamem_free(ch->dma.workrp_tag, ch->dma.workrp, ch->dma.workrp_map);
+		bus_dmamem_free(ch->dma.workrp_tag,
+		    ch->dma.workrp, ch->dma.workrp_map);
 		ch->dma.workrp_bus = 0;
 		ch->dma.workrp_map = NULL;
 		ch->dma.workrp = NULL;
@@ -382,7 +389,8 @@ mvs_dmafini(device_t dev)
 	}
 	if (ch->dma.workrq_bus) {
 		bus_dmamap_unload(ch->dma.workrq_tag, ch->dma.workrq_map);
-		bus_dmamem_free(ch->dma.workrq_tag, ch->dma.workrq, ch->dma.workrq_map);
+		bus_dmamem_free(ch->dma.workrq_tag,
+		    ch->dma.workrq, ch->dma.workrq_map);
 		ch->dma.workrq_bus = 0;
 		ch->dma.workrq_map = NULL;
 		ch->dma.workrq = NULL;
@@ -444,14 +452,16 @@ mvs_setup_edma_queues(device_t dev)
 	ATA_OUTL(ch->r_mem, EDMA_REQQBAH, work >> 32);
 	ATA_OUTL(ch->r_mem, EDMA_REQQIP, work & 0xffffffff);
 	ATA_OUTL(ch->r_mem, EDMA_REQQOP, work & 0xffffffff);
-	bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map, BUS_DMASYNC_PREWRITE);
+	bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map,
+	    BUS_DMASYNC_PREWRITE);
 	/* Reponses queue. */
 	bzero(ch->dma.workrp, 256);
 	work = ch->dma.workrp_bus;
 	ATA_OUTL(ch->r_mem, EDMA_RESQBAH, work >> 32);
 	ATA_OUTL(ch->r_mem, EDMA_RESQIP, work & 0xffffffff);
 	ATA_OUTL(ch->r_mem, EDMA_RESQOP, work & 0xffffffff);
-	bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map, BUS_DMASYNC_PREREAD);
+	bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,
+	    BUS_DMASYNC_PREREAD);
 	ch->out_idx = 0;
 	ch->in_idx = 0;
 }
@@ -678,20 +688,15 @@ mvs_ch_intr(void *data)
 	int i, ccs, port = -1, selfdis = 0;
 	int edma = (ch->numtslots != 0 || ch->numdslots != 0);
 
-//device_printf(dev, "irq cause %02x EDMA %d IEC %08x\n",
-//    arg->cause, edma, ATA_INL(ch->r_mem, EDMA_IEC));
 	/* New item in response queue. */
 	if ((arg->cause & 2) && edma)
 		mvs_crbq_intr(dev);
 	/* Some error or special event. */
 	if (arg->cause & 1) {
 		iec = ATA_INL(ch->r_mem, EDMA_IEC);
-//device_printf(dev, "irq cause %02x EDMA %d IEC %08x\n",
-//    arg->cause, edma, iec);
 		if (iec & EDMA_IE_SERRINT) {
 			serr = ATA_INL(ch->r_mem, SATA_SE);
 			ATA_OUTL(ch->r_mem, SATA_SE, serr);
-//device_printf(dev, "SERR %08x\n", serr);
 		}
 		/* EDMA self-disabled due to error. */
 		if (iec & EDMA_IE_ESELFDIS)
@@ -706,7 +711,6 @@ mvs_ch_intr(void *data)
 				fisic = SATA_FISC_FISWAIT4HOSTRDYEN_B1;
 			else	/* For Gen-IIe - read FIS interrupt cause. */
 				fisic = ATA_INL(ch->r_mem, SATA_FISIC);
-//device_printf(dev, "FISIC %08x\n", fisic);
 		}
 		if (selfdis)
 			ch->curr_mode = MVS_EDMA_UNKNOWN;
@@ -745,7 +749,6 @@ mvs_ch_intr(void *data)
 					}
 				}
 			}
-//device_printf(dev, "err slot %d port %d\n", ccs, port);
 			mvs_requeue_frozen(dev);
 			for (i = 0; i < MVS_MAX_SLOTS; i++) {
 				/* XXX: reqests in loading state. */
@@ -771,7 +774,8 @@ mvs_ch_intr(void *data)
 					ch->fatalerr = 1;
 				    }
 				} else if (iec & 0xfc1e9000) {
-					if (ch->numtslots == 0 && i != ccs && port != -2)
+					if (ch->numtslots == 0 &&
+					    i != ccs && port != -2)
 						et = MVS_ERR_INNOCENT;
 					else
 						et = MVS_ERR_SATA;
@@ -823,8 +827,6 @@ mvs_legacy_intr(device_t dev)
 
 	/* Clear interrupt and get status. */
 	status = mvs_getstatus(dev, 1);
-//	device_printf(dev, "Legacy intr status %02x\n",
-//	    status);
 	if (slot->state < MVS_SLOT_RUNNING)
 	    return;
 	port = ccb->ccb_h.target_id & 0x0f;
@@ -867,7 +869,8 @@ mvs_legacy_intr(device_t dev)
 			/* If data write command - put them */
 			if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
 				if (mvs_wait(dev, ATA_S_DRQ, ATA_S_BUSY, 1000) < 0) {
-				    device_printf(dev, "timeout waiting for write DRQ\n");
+				    device_printf(dev,
+					"timeout waiting for write DRQ\n");
 				    et = MVS_ERR_TIMEOUT;
 				    goto end_finished;
 				}
@@ -890,19 +893,18 @@ mvs_legacy_intr(device_t dev)
 		ATA_OUTL(ch->r_mem, DMA_C, 0);
 		goto end_finished;
 	} else {			/* ATAPI PIO */
-		length = ATA_INB(ch->r_mem,ATA_CYL_LSB) | (ATA_INB(ch->r_mem,ATA_CYL_MSB) << 8);
+		length = ATA_INB(ch->r_mem,ATA_CYL_LSB) |
+		    (ATA_INB(ch->r_mem,ATA_CYL_MSB) << 8);
 		ireason = ATA_INB(ch->r_mem,ATA_IREASON);
-//device_printf(dev, "status %02x, ireason %02x, length %d\n", status, ireason, length);
 		switch ((ireason & (ATA_I_CMD | ATA_I_IN)) |
 			(status & ATA_S_DRQ)) {
 
 		case ATAPI_P_CMDOUT:
-device_printf(dev, "ATAPI CMDOUT\n");
+		    device_printf(dev, "ATAPI CMDOUT\n");
 		    /* Return wait for interrupt */
 		    return;
 
 		case ATAPI_P_WRITE:
-//device_printf(dev, "ATAPI WRITE\n");
 		    if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
 			device_printf(dev, "trying to write on read buffer\n");
 			et = MVS_ERR_TFE;
@@ -920,7 +922,6 @@ device_printf(dev, "ATAPI CMDOUT\n");
 		    return;
 
 		case ATAPI_P_READ:
-//device_printf(dev, "ATAPI READ\n");
 		    if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
 			device_printf(dev, "trying to read on write buffer\n");
 			et = MVS_ERR_TFE;
@@ -937,7 +938,6 @@ device_printf(dev, "ATAPI CMDOUT\n");
 		    return;
 
 		case ATAPI_P_DONEDRQ:
-device_printf(dev, "ATAPI DONEDRQ\n");
 		    device_printf(dev,
 			  "WARNING - DONEDRQ non conformant device\n");
 		    if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) {
@@ -958,13 +958,13 @@ device_printf(dev, "ATAPI DONEDRQ\n");
 
 		case ATAPI_P_ABORT:
 		case ATAPI_P_DONE:
-//device_printf(dev, "ATAPI ABORT/DONE\n");
 		    if (status & (ATA_S_ERROR | ATA_S_DWF))
 			et = MVS_ERR_TFE;
 		    goto end_finished;
 
 		default:
-		    device_printf(dev, "unknown transfer phase (status %02x, ireason %02x)\n",
+		    device_printf(dev, "unknown transfer phase"
+			" (status %02x, ireason %02x)\n",
 			status, ireason);
 		    et = MVS_ERR_TFE;
 		}
@@ -1266,8 +1266,6 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot)
 	ch->rslots |= (1 << slot->slot);
 	ATA_OUTB(ch->r_mem, SATA_SATAICTL, port << SATA_SATAICTL_PMPTX_SHIFT);
 	if (ccb->ccb_h.func_code == XPT_ATA_IO) {
-//		device_printf(dev, "%d Legacy command %02x size %d\n",
-//		    port, ccb->ataio.cmd.command, ccb->ataio.dxfer_len);
 		mvs_tfd_write(dev, ccb);
 		/* Device reset doesn't interrupt. */
 		if (ccb->ataio.cmd.command == ATA_DEVICE_RESET) {
@@ -1287,7 +1285,8 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot)
 		/* If data write command - output the data */
 		if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) {
 			if (mvs_wait(dev, ATA_S_DRQ, ATA_S_BUSY, 1000) < 0) {
-				device_printf(dev, "timeout waiting for write DRQ\n");
+				device_printf(dev,
+				    "timeout waiting for write DRQ\n");
 				mvs_end_transaction(slot, MVS_ERR_TIMEOUT);
 				return;
 			}
@@ -1296,9 +1295,6 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot)
 			   ch->transfersize / 2);
 		}
 	} else {
-//		device_printf(dev, "%d ATAPI command %02x size %d dma %d\n",
-//		    port, ccb->csio.cdb_io.cdb_bytes[0], ccb->csio.dxfer_len,
-//		    ch->basic_dma);
 		ch->donecount = 0;
 		ch->transfersize = min(ccb->csio.dxfer_len,
 		    ch->curr[port].bytecount);
@@ -1331,7 +1327,8 @@ mvs_legacy_execute_transaction(struct mvs_slot *slot)
 		    DELAY(20);
 		}
 		if (timeout <= 0) {
-			device_printf(dev, "timeout waiting for ATAPI command ready\n");
+			device_printf(dev,
+			    "timeout waiting for ATAPI command ready\n");
 			mvs_end_transaction(slot, MVS_ERR_TIMEOUT);
 			return;
 		}
@@ -1371,8 +1368,6 @@ mvs_execute_transaction(struct mvs_slot *slot)
 	int port = ccb->ccb_h.target_id & 0x0f;
 	int i;
 
-//	device_printf(dev, "%d EDMA command %02x size %d slot %d tag %d\n",
-//	    port, ccb->ataio.cmd.command, ccb->ataio.dxfer_len, slot->slot, slot->tag);
 	/* Get address of the prepared EPRD */
 	eprd = ch->dma.workrq_bus + MVS_EPRD_OFFSET + (MVS_EPRD_SIZE * slot->slot);
 	/* Prepare CRQB. Gen IIe uses different CRQB format. */
@@ -1554,7 +1549,6 @@ mvs_end_transaction(struct mvs_slot *slot, enum mvs_err_type et)
 	union ccb *ccb = slot->ccb;
 	int lastto;
 
-//device_printf(dev, "cmd done status %d\n", et);
 	bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map,
 	    BUS_DMASYNC_POSTWRITE);
 	/* Read result registers to the result struct
@@ -1792,7 +1786,8 @@ mvs_process_read_log(device_t dev, union ccb *ccb)
 		if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
 			device_printf(dev, "Error while READ LOG EXT\n");
 		else if ((data[0] & 0x80) == 0) {
-			device_printf(dev, "Non-queued command error in READ LOG EXT\n");
+			device_printf(dev,
+			    "Non-queued command error in READ LOG EXT\n");
 		}
 		for (i = 0; i < MVS_MAX_SLOTS; i++) {
 			if (!ch->hold[i])
diff --git a/sys/dev/mvs/mvs_pci.c b/sys/dev/mvs/mvs_pci.c
index 6b18e2c8ef6..e2e37daf595 100644
--- a/sys/dev/mvs/mvs_pci.c
+++ b/sys/dev/mvs/mvs_pci.c
@@ -339,7 +339,6 @@ mvs_intr(void *data)
 	u_int32_t ic, aic;
 
 	ic = ATA_INL(ctlr->r_mem, CHIP_MIC);
-//device_printf(ctlr->dev, "irq MIC:%08x\n", ic);
 	if (ctlr->msi) {
 		/* We have to to mask MSI during processing. */
 		mtx_lock(&ctlr->mtx);
diff --git a/sys/dev/mvs/mvs_soc.c b/sys/dev/mvs/mvs_soc.c
index 03029c25624..ed861f29db8 100644
--- a/sys/dev/mvs/mvs_soc.c
+++ b/sys/dev/mvs/mvs_soc.c
@@ -295,7 +295,6 @@ mvs_intr(void *data)
 	u_int32_t ic, aic;
 
 	ic = ATA_INL(ctlr->r_mem, CHIP_SOC_MIC);
-//device_printf(ctlr->dev, "irq MIC:%08x\n", ic);
 	if ((ic & IC_HC0) == 0)
 		return;
 	/* Acknowledge interrupts of this HC. */

From 6c872350980217fef8ae96365bd0c4193952ea0c Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 20 Oct 2010 07:47:31 +0000
Subject: [PATCH 12/68] Workaround strange situation when EDMA_RESQIP register
 returns zero instead of proper value. It caused bunch of "EMPTY CRPB"
 messages and potentially may cause premature requests completion, which could
 cause data corruption. For most cases it seems enough to just reread register
 to get proper value. To protect against worse cases - erase processed queue
 entries with impossible values and ignore them if the problem still happens.

---
 sys/dev/mvs/mvs.c | 46 +++++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/sys/dev/mvs/mvs.c b/sys/dev/mvs/mvs.c
index ee04f98f0f7..99689380d86 100644
--- a/sys/dev/mvs/mvs.c
+++ b/sys/dev/mvs/mvs.c
@@ -455,7 +455,7 @@ mvs_setup_edma_queues(device_t dev)
 	bus_dmamap_sync(ch->dma.workrq_tag, ch->dma.workrq_map,
 	    BUS_DMASYNC_PREWRITE);
 	/* Reponses queue. */
-	bzero(ch->dma.workrp, 256);
+	memset(ch->dma.workrp, 0xff, MVS_WORKRP_SIZE);
 	work = ch->dma.workrp_bus;
 	ATA_OUTL(ch->r_mem, EDMA_RESQBAH, work >> 32);
 	ATA_OUTL(ch->r_mem, EDMA_RESQIP, work & 0xffffffff);
@@ -980,38 +980,54 @@ mvs_crbq_intr(device_t dev)
 	struct mvs_channel *ch = device_get_softc(dev);
 	struct mvs_crpb *crpb;
 	union ccb *ccb;
-	int in_idx, cin_idx, slot;
+	int in_idx, fin_idx, cin_idx, slot;
+	uint32_t val;
 	uint16_t flags;
 
-	in_idx = (ATA_INL(ch->r_mem, EDMA_RESQIP) & EDMA_RESQP_ERPQP_MASK) >>
+	val = ATA_INL(ch->r_mem, EDMA_RESQIP);
+	if (val == 0)
+		val = ATA_INL(ch->r_mem, EDMA_RESQIP);
+	in_idx = (val & EDMA_RESQP_ERPQP_MASK) >>
 	    EDMA_RESQP_ERPQP_SHIFT;
 	bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,
 	    BUS_DMASYNC_POSTREAD);
-	cin_idx = ch->in_idx;
+	fin_idx = cin_idx = ch->in_idx;
 	ch->in_idx = in_idx;
 	while (in_idx != cin_idx) {
 		crpb = (struct mvs_crpb *)
-		    (ch->dma.workrp + MVS_CRPB_OFFSET + (MVS_CRPB_SIZE * cin_idx));
+		    (ch->dma.workrp + MVS_CRPB_OFFSET +
+		    (MVS_CRPB_SIZE * cin_idx));
 		slot = le16toh(crpb->id) & MVS_CRPB_TAG_MASK;
 		flags = le16toh(crpb->rspflg);
-//device_printf(dev, "CRPB %d %d %04x\n", cin_idx, slot, flags);
 		/*
 		 * Handle only successfull completions here.
 		 * Errors will be handled by main intr handler.
 		 */
-		if (ch->numtslots != 0 || (flags & EDMA_IE_EDEVERR) == 0) {
-if ((flags >> 8) & ATA_S_ERROR)
-device_printf(dev, "ERROR STATUS CRPB %d %d %04x\n", cin_idx, slot, flags);
+		if (crpb->id == 0xffff && crpb->rspflg == 0xffff) {
+			device_printf(dev, "Unfilled CRPB "
+			    "%d (%d->%d) tag %d flags %04x rs %08x\n",
+			    cin_idx, fin_idx, in_idx, slot, flags, ch->rslots);
+		} else if (ch->numtslots != 0 ||
+		    (flags & EDMA_IE_EDEVERR) == 0) {
+			crpb->id = 0xffff;
+			crpb->rspflg = 0xffff;
 			if (ch->slot[slot].state >= MVS_SLOT_RUNNING) {
 				ccb = ch->slot[slot].ccb;
-				ccb->ataio.res.status = (flags & MVS_CRPB_ATASTS_MASK) >>
+				ccb->ataio.res.status =
+				    (flags & MVS_CRPB_ATASTS_MASK) >>
 				    MVS_CRPB_ATASTS_SHIFT;
 				mvs_end_transaction(&ch->slot[slot], MVS_ERR_NONE);
-			} else 
-device_printf(dev, "EMPTY CRPB %d (->%d) %d %04x\n", cin_idx, in_idx, slot, flags);
-		} else
-device_printf(dev, "ERROR FLAGS CRPB %d %d %04x\n", cin_idx, slot, flags);
-
+			} else {
+				device_printf(dev, "Unused tag in CRPB "
+				    "%d (%d->%d) tag %d flags %04x rs %08x\n",
+				    cin_idx, fin_idx, in_idx, slot, flags,
+				    ch->rslots);
+			}
+		} else {
+			device_printf(dev,
+			    "CRPB with error %d tag %d flags %04x\n",
+			    cin_idx, slot, flags);
+		}
 		cin_idx = (cin_idx + 1) & (MVS_MAX_SLOTS - 1);
 	}
 	bus_dmamap_sync(ch->dma.workrp_tag, ch->dma.workrp_map,

From a2995e0d9b69c32731a0851d1a5cbf76c4d79d9b Mon Sep 17 00:00:00 2001
From: Ed Schouten <ed@FreeBSD.org>
Date: Wed, 20 Oct 2010 09:35:20 +0000
Subject: [PATCH 13/68] Remove setpgid() call before executing child process.

Using a separate process group here is bad, since (for example) job
control in the TTY layer prevents interaction with the TTY, causing the
child process to hang.

Mentioned on:	current@
MFC after:	2 weeks
---
 usr.bin/truss/setup.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/usr.bin/truss/setup.c b/usr.bin/truss/setup.c
index ce18f98e7ad..bf23ec76001 100644
--- a/usr.bin/truss/setup.c
+++ b/usr.bin/truss/setup.c
@@ -78,7 +78,6 @@ setup_and_wait(char *command[])
 	}
 	if (pid == 0) {	/* Child */
 		ptrace(PT_TRACE_ME, 0, 0, 0);
-		setpgid (0, 0); 
 		execvp(command[0], command);
 		err(1, "execvp %s", command[0]);
 	}

From 18ad6a4db2019118f1b6be206c50c2de5cd51d7d Mon Sep 17 00:00:00 2001
From: "Jayachandran C." <jchandra@FreeBSD.org>
Date: Wed, 20 Oct 2010 09:41:36 +0000
Subject: [PATCH 14/68] On uniprocessor, warn and fixup hardware cpu mask if
 more than one CPU is enabled by the bootloader.

---
 sys/mips/rmi/xlr_machdep.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/sys/mips/rmi/xlr_machdep.c b/sys/mips/rmi/xlr_machdep.c
index b34955d13f9..8f9663375e8 100644
--- a/sys/mips/rmi/xlr_machdep.c
+++ b/sys/mips/rmi/xlr_machdep.c
@@ -167,6 +167,14 @@ xlr_parse_mmu_options(void)
 	 */
 	xlr_ncores = 1;
 	cpu_map = xlr_boot1_info.cpu_online_map;
+
+#ifndef SMP /* Uniprocessor! */
+	if (cpu_map != 0x1) {
+		printf("WARNING: Starting uniprocessor kernel on cpumask [0x%lx]!\n"
+		   "WARNING: Other CPUs will be unused.\n", (u_long)cpu_map);
+		cpu_map = 0x1;
+	}
+#endif
 	core0_thr_mask = cpu_map & 0xf;
 	switch (core0_thr_mask) {
 	case 1:
@@ -188,9 +196,9 @@ xlr_parse_mmu_options(void)
 			xlr_ncores++;
 		}
 	}
+	xlr_hw_thread_mask = cpu_map;
 
 	/* setup hardware processor id to cpu id mapping */
-	xlr_hw_thread_mask = xlr_boot1_info.cpu_online_map;
 	for (i = 0; i< MAXCPU; i++)
 		xlr_cpuid_to_hwtid[i] = 
 		    xlr_hwtid_to_cpuid [i] = -1;

From 7850efa68d7cf0af09c5d54fe308a8649f96f974 Mon Sep 17 00:00:00 2001
From: "Jayachandran C." <jchandra@FreeBSD.org>
Date: Wed, 20 Oct 2010 09:50:11 +0000
Subject: [PATCH 15/68] Network driver updates - Fix network driver issue on a
 XLS eval board (major# 8). - Fix issue uncovered by r213475 in check for
 XGMII

Submitted by:	Sriram Gorti (srgorti at netlogicmicro dot com)
---
 sys/mips/rmi/board.c            | 12 ++++++------
 sys/mips/rmi/dev/nlge/if_nlge.c |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/sys/mips/rmi/board.c b/sys/mips/rmi/board.c
index 3bee863226c..4c49dacc5ce 100644
--- a/sys/mips/rmi/board.c
+++ b/sys/mips/rmi/board.c
@@ -283,14 +283,14 @@ xls_board_specific_overrides(struct xlr_board_info* board)
 		break;
 
 	case RMI_XLR_BOARD_ARIZONA_VIII:
-
-		if (blk1->enabled) { 
+		if (blk1->enabled) {
 			/* There is just one Octal PHY on the board and it is 
 			 * connected to the MII interface for NA Quad 0. */
-			blk1->gmac_port[0].mii_addr = XLR_IO_GMAC_0_OFFSET;
-			blk1->gmac_port[1].mii_addr = XLR_IO_GMAC_0_OFFSET;
-			blk1->gmac_port[2].mii_addr = XLR_IO_GMAC_0_OFFSET;
-			blk1->gmac_port[3].mii_addr = XLR_IO_GMAC_0_OFFSET;
+			for (i = 0; i < 4; i++) { 
+				blk1->gmac_port[i].mii_addr =
+				    XLR_IO_GMAC_0_OFFSET; 
+				blk1->gmac_port[i].mdint_id = 0; 
+			} 
 		}
 		break;
 
diff --git a/sys/mips/rmi/dev/nlge/if_nlge.c b/sys/mips/rmi/dev/nlge/if_nlge.c
index 6495e4bf1f6..37e1c54c3c1 100644
--- a/sys/mips/rmi/dev/nlge/if_nlge.c
+++ b/sys/mips/rmi/dev/nlge/if_nlge.c
@@ -861,7 +861,7 @@ nlge_mii_read(struct device *dev, int phyaddr, int regidx)
 	int val;
 
 	sc = device_get_softc(dev);
-	val = (sc->port_type != XLR_XGMII) ? (0xffff) :
+	val = (sc->port_type == XLR_XGMII) ? (0xffff) :
 	    nlge_mii_read_internal(sc->mii_base, phyaddr, regidx);
 
 	return (val);

From 347263c935b6e667b95706baf634a938b4e9c59b Mon Sep 17 00:00:00 2001
From: Jung-uk Kim <jkim@FreeBSD.org>
Date: Wed, 20 Oct 2010 16:40:14 +0000
Subject: [PATCH 16/68] Do not apply do_power_resume for suspending P2P bridge
 as we did in r214064.

---
 sys/dev/pci/pci_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sys/dev/pci/pci_pci.c b/sys/dev/pci/pci_pci.c
index 9992b8119df..5dd8dbe72ee 100644
--- a/sys/dev/pci/pci_pci.c
+++ b/sys/dev/pci/pci_pci.c
@@ -447,7 +447,7 @@ pcib_suspend(device_t dev)
 
 	pcib_cfg_save(device_get_softc(dev));
 	error = bus_generic_suspend(dev);
-	if (error == 0 && pci_do_power_resume) {
+	if (error == 0) {
 		dstate = PCI_POWERSTATE_D3;
 		pcib = device_get_parent(device_get_parent(dev));
 		if (PCIB_POWER_FOR_SLEEP(pcib, dev, &dstate) == 0)

From f3e0b109731024857fa2119c53946257305ede73 Mon Sep 17 00:00:00 2001
From: Jung-uk Kim <jkim@FreeBSD.org>
Date: Wed, 20 Oct 2010 16:47:09 +0000
Subject: [PATCH 17/68] Introduce a new tunable 'hw.pci.do_power_suspend'. 
 This tunable lets you avoid PCI power state transition from D0 to D3 for
 suspending case.  Default is 1 or enabled.

---
 sys/dev/acpica/acpi_pci.c |  2 +-
 sys/dev/pci/pci.c         | 10 +++++++++-
 sys/dev/pci/pci_pci.c     |  2 +-
 sys/dev/pci/pci_private.h |  1 +
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/sys/dev/acpica/acpi_pci.c b/sys/dev/acpica/acpi_pci.c
index bf7cf2eafca..a14e0ba4198 100644
--- a/sys/dev/acpica/acpi_pci.c
+++ b/sys/dev/acpica/acpi_pci.c
@@ -179,7 +179,7 @@ acpi_pci_set_powerstate_method(device_t dev, device_t child, int state)
 	 */
 	ACPI_SERIAL_BEGIN(pci_powerstate);
 	old_state = pci_get_powerstate(child);
-	if (old_state < state) {
+	if (old_state < state && pci_do_power_suspend) {
 		error = pci_set_powerstate_method(dev, child, state);
 		if (error)
 			goto out;
diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index d1b211a9897..21263b726a4 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -257,6 +257,12 @@ SYSCTL_INT(_hw_pci, OID_AUTO, do_power_resume, CTLFLAG_RW,
     &pci_do_power_resume, 1,
   "Transition from D3 -> D0 on resume.");
 
+int pci_do_power_suspend = 1;
+TUNABLE_INT("hw.pci.do_power_suspend", &pci_do_power_suspend);
+SYSCTL_INT(_hw_pci, OID_AUTO, do_power_suspend, CTLFLAG_RW,
+    &pci_do_power_suspend, 1,
+  "Transition from D0 -> D3 on suspend.");
+
 static int pci_do_msi = 1;
 TUNABLE_INT("hw.pci.enable_msi", &pci_do_msi);
 SYSCTL_INT(_hw_pci, OID_AUTO, enable_msi, CTLFLAG_RW, &pci_do_msi, 1,
@@ -2954,7 +2960,9 @@ pci_suspend(device_t dev)
 		free(devlist, M_TEMP);
 		return (error);
 	}
-	pci_set_power_children(dev, devlist, numdevs, PCI_POWERSTATE_D3);
+	if (pci_do_power_suspend)
+		pci_set_power_children(dev, devlist, numdevs,
+		    PCI_POWERSTATE_D3);
 	free(devlist, M_TEMP);
 	return (0);
 }
diff --git a/sys/dev/pci/pci_pci.c b/sys/dev/pci/pci_pci.c
index 5dd8dbe72ee..79158187cab 100644
--- a/sys/dev/pci/pci_pci.c
+++ b/sys/dev/pci/pci_pci.c
@@ -447,7 +447,7 @@ pcib_suspend(device_t dev)
 
 	pcib_cfg_save(device_get_softc(dev));
 	error = bus_generic_suspend(dev);
-	if (error == 0) {
+	if (error == 0 && pci_do_power_suspend) {
 		dstate = PCI_POWERSTATE_D3;
 		pcib = device_get_parent(device_get_parent(dev));
 		if (PCIB_POWER_FOR_SLEEP(pcib, dev, &dstate) == 0)
diff --git a/sys/dev/pci/pci_private.h b/sys/dev/pci/pci_private.h
index 70d887b3232..90866eff618 100644
--- a/sys/dev/pci/pci_private.h
+++ b/sys/dev/pci/pci_private.h
@@ -39,6 +39,7 @@
 DECLARE_CLASS(pci_driver);
 
 extern int 	pci_do_power_resume;
+extern int 	pci_do_power_suspend;
 
 void		pci_add_children(device_t dev, int domain, int busno,
 		    size_t dinfo_size);

From ab05568beb985e7949c87e0a1a1dff7afca056c9 Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Date: Wed, 20 Oct 2010 19:52:27 +0000
Subject: [PATCH 18/68] Correct typos.

---
 sys/boot/common/gpt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sys/boot/common/gpt.c b/sys/boot/common/gpt.c
index 62e86ddded8..9c0098008e3 100644
--- a/sys/boot/common/gpt.c
+++ b/sys/boot/common/gpt.c
@@ -49,7 +49,7 @@ static int curent, bootonce;
 
 /*
  * Buffer below 64kB passed on gptread(), which can hold at least
- * one sector od data (512 bytes).
+ * one sector of data (512 bytes).
  */
 static char *secbuf;
 
@@ -62,7 +62,7 @@ gptupdate(const char *which, struct dsk *dskp, struct gpt_hdr *hdr,
 
 	/*
 	 * We need to update the following for both primary and backup GPT:
-	 * 1. Sector on disk that contains curent partition.
+	 * 1. Sector on disk that contains current partition.
 	 * 2. Partition table checksum.
 	 * 3. Header checksum.
 	 * 4. Header on disk.

From 587250b286a2cd9b02c296e5d78ecd21c7bdf255 Mon Sep 17 00:00:00 2001
From: Ed Schouten <ed@FreeBSD.org>
Date: Wed, 20 Oct 2010 19:53:29 +0000
Subject: [PATCH 19/68] Get rid of hand-rolled closefrom(3).

---
 usr.bin/login/login.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/usr.bin/login/login.c b/usr.bin/login/login.c
index 3e826d57bda..e671c8dfac7 100644
--- a/usr.bin/login/login.c
+++ b/usr.bin/login/login.c
@@ -233,8 +233,7 @@ main(int argc, char *argv[])
 
 	setproctitle("-%s", getprogname());
 
-	for (cnt = getdtablesize(); cnt > 2; cnt--)
-		(void)close(cnt);
+	closefrom(3);
 
 	/*
 	 * Get current TTY

From 056638c469617019df6f89121f9fd0a0e884a332 Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Date: Wed, 20 Oct 2010 20:01:45 +0000
Subject: [PATCH 20/68] - Add missing comments. - Make a comment consistent
 with others.

---
 sys/geom/eli/g_eli.c           | 6 ++++--
 sys/geom/eli/g_eli_integrity.c | 5 +++++
 sys/geom/eli/g_eli_privacy.c   | 5 +++++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/sys/geom/eli/g_eli.c b/sys/geom/eli/g_eli.c
index 05402ce3f41..4e15877da09 100644
--- a/sys/geom/eli/g_eli.c
+++ b/sys/geom/eli/g_eli.c
@@ -245,8 +245,10 @@ g_eli_orphan(struct g_consumer *cp)
 }
 
 /*
- * BIO_READ : G_ELI_START -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
- * BIO_WRITE: G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
+ * BIO_READ:
+ *	G_ELI_START -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ * BIO_WRITE:
+ *	G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
 static void
 g_eli_start(struct bio *bp)
diff --git a/sys/geom/eli/g_eli_integrity.c b/sys/geom/eli/g_eli_integrity.c
index bafce9683e8..0103a32c90f 100644
--- a/sys/geom/eli/g_eli_integrity.c
+++ b/sys/geom/eli/g_eli_integrity.c
@@ -392,6 +392,11 @@ g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp)
 /*
  * This is the main function responsible for cryptography (ie. communication
  * with crypto(9) subsystem).
+ *
+ * BIO_READ:
+ *	g_eli_start -> g_eli_auth_read -> g_io_request -> g_eli_read_done -> G_ELI_AUTH_RUN -> g_eli_auth_read_done -> g_io_deliver
+ * BIO_WRITE:
+ *	g_eli_start -> G_ELI_AUTH_RUN -> g_eli_auth_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
 void
 g_eli_auth_run(struct g_eli_worker *wr, struct bio *bp)
diff --git a/sys/geom/eli/g_eli_privacy.c b/sys/geom/eli/g_eli_privacy.c
index a6f572b4770..c2814bc6d07 100644
--- a/sys/geom/eli/g_eli_privacy.c
+++ b/sys/geom/eli/g_eli_privacy.c
@@ -166,6 +166,11 @@ g_eli_crypto_write_done(struct cryptop *crp)
 /*
  * This is the main function responsible for cryptography (ie. communication
  * with crypto(9) subsystem).
+ *
+ * BIO_READ:
+ *	g_eli_start -> g_io_request -> g_eli_read_done -> G_ELI_CRYPTO_RUN -> g_eli_crypto_read_done -> g_io_deliver
+ * BIO_WRITE:
+ *	g_eli_start -> G_ELI_CRYPTO_RUN -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
 void
 g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp)

From 5ad4a7c74aecfa48612e6bb1845294b0cf8d1de8 Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Date: Wed, 20 Oct 2010 20:50:55 +0000
Subject: [PATCH 21/68] Bring in geli suspend/resume functionality (finally).

Before this change if you wanted to suspend your laptop and be sure that your
encryption keys are safe, you had to stop all processes that use file system
stored on encrypted device, unmount the file system and detach geli provider.

This isn't very handy. If you are a lucky user of a laptop where suspend/resume
actually works with FreeBSD (I'm not!) you most likely want to suspend your
laptop, because you don't want to start everything over again when you turn
your laptop back on.

And this is where geli suspend/resume steps in. When you execute:

	# geli suspend -a

geli will wait for all in-flight I/O requests, suspend new I/O requests, remove
all geli sensitive data from the kernel memory (like encryption keys) and will
wait for either 'geli resume' or 'geli detach'.

Now with no keys in memory you can suspend your laptop without stopping any
processes or unmounting any file systems.

When you resume your laptop you have to resume geli devices using 'geli resume'
command. You need to provide your passphrase, etc. again so the keys can be
restored and suspended I/O requests released.

Of course you need to remember that 'geli suspend' won't clear the file system
cache and other places where data from your geli-encrypted file system might be
present. But to get rid of those, stopping processes and unmounting the file
system won't help either - you have to turn your laptop off. Be warned.

Also note that suspending a geli device which contains the file system with the
geli utility (or anything used by 'geli resume') is not a very good idea, as you
won't be able to resume it - when you execute geli(8), the kernel will try to
read it and this read I/O request will be suspended.
---
 sbin/geom/class/eli/geli.8     |  94 +++++++++++++-
 sbin/geom/class/eli/geom_eli.c |  59 +++++++++
 sys/geom/eli/g_eli.c           | 126 +++++++++++++++----
 sys/geom/eli/g_eli.h           |   7 ++
 sys/geom/eli/g_eli_ctl.c       | 215 ++++++++++++++++++++++++++++++++-
 sys/geom/eli/g_eli_integrity.c |   8 +-
 sys/geom/eli/g_eli_privacy.c   |  58 ++++++++-
 7 files changed, 531 insertions(+), 36 deletions(-)

diff --git a/sbin/geom/class/eli/geli.8 b/sbin/geom/class/eli/geli.8
index 268e8e64533..67f5ea97629 100644
--- a/sbin/geom/class/eli/geli.8
+++ b/sbin/geom/class/eli/geli.8
@@ -24,7 +24,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd September 25, 2010
+.Dd October 20, 2010
 .Dt GELI 8
 .Os
 .Sh NAME
@@ -119,6 +119,16 @@ utility:
 .Ar file
 .Ar prov
 .Nm
+.Cm suspend
+.Op Fl v
+.Fl a | Ar prov ...
+.Nm
+.Cm resume
+.Op Fl pv
+.Op Fl j Ar passfile
+.Op Fl k Ar keyfile
+.Ar prov
+.Nm
 .Cm resize
 .Op Fl v
 .Fl s Ar oldsize
@@ -207,6 +217,8 @@ Allows to attach a provider with a random, one-time key - useful for swap
 partitions and temporary file systems.
 .It
 Allows to verify data integrity (data authentication).
+.It
+Allows to suspend and resume encrypted devices.
 .El
 .Pp
 The first argument to
@@ -458,6 +470,8 @@ will not be detached even if all keys will be destroyed.
 It can be even rescued with the
 .Cm setkey
 subcommand.
+.Pp
+Additional options include:
 .Bl -tag -width ".Fl a Ar keyno"
 .It Fl a
 Destroy all keys (does not need
@@ -482,6 +496,8 @@ backup, your data is gone for good.
 In case the provider was attached with the
 .Fl r
 flag, the keys will not be destroyed, only the provider will be detached.
+.Pp
+Additional options include:
 .Bl -tag -width ".Fl a"
 .It Fl a
 If specified, all currently attached providers will be killed.
@@ -490,6 +506,8 @@ If specified, all currently attached providers will be killed.
 Backup metadata from the given provider to the given file.
 .It Cm restore
 Restore metadata from the given file to the given provider.
+.Pp
+Additional options include:
 .Bl -tag -width ".Fl f"
 .It Fl f
 Metadata contains the size of the provider to ensure that the correct
@@ -508,12 +526,73 @@ through
 and
 .Cm restore .
 .El
+.It Cm suspend
+Suspend device by waiting for all inflight requests to finish, clearing all
+sensitive information (like keys) from the kernel memory and blocking all
+further I/O requests until the
+.Cm resume
+subcommand is executed.
+This functionality is useful for eg. laptops - when one wants to suspend a
+laptop, one does not want to leave encrypted device attached.
+Instead of closing all files and directories opened from a file system placed
+on an encrypted device, unmounting the file system and detaching the device,
+the
+.Cm suspend
+subcommand can be used.
+Any access to the encrypted device will be blocked until the keys are
+recovered through
+.Cm resume
+subcommand, thus there is no need to close nor unmount anything.
+The
+.Cm suspend
+subcommand does not work with devices created with the
+.Cm onetime
+subcommand.
+Please note that sensitive data might still be present in memory after
+suspending encrypted device, because of file system cache, etc.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl a"
+.It Fl a
+Suspend all
+.Nm
+devices.
+.El
+.It Cm resume
+Resume previously suspended device.
+The caller must ensure that executing this subcommand won't try to access
+suspended device, which will lead to a deadlock.
+For example, suspending a device which contains the file system where the
+.Nm
+utility is stored is a bad idea.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl j Ar passfile"
+.It Fl j Ar passfile
+Specifies a file which contains the passphrase or its part.
+For more information see the description of the
+.Fl J
+option for the
+.Cm init
+subcommand.
+.It Fl k Ar keyfile
+Specifies a file which contains part of the key.
+For more information see the description of the
+.Fl K
+option for the
+.Cm init
+subcommand.
+.It Fl p
+Do not use passphrase as the key component.
+.El
 .It Cm resize
 Inform
 .Nm
 that the provider has been resized.
 The old metadata block is relocated to the correct position at the end of the
 provider and the provider size is updated.
+.Pp
+Additional options include:
 .Bl -tag -width ".Fl s Ar oldsize"
 .It Fl s Ar oldsize
 The size of the provider before it was resized.
@@ -746,6 +825,19 @@ prompt:
 # geli attach da0
 Enter passphrase: foobar
 .Ed
+.Pp
+Suspend all
+.Nm
+devices, suspend a laptop, then resume devices one by one after resuming a
+laptop:
+.Bd -literal -offset indent
+# geli suspend -a
+# zzz
+<resume your laptop>
+# geli resume -p -k keyfile gpt/secret
+# geli resume gpt/private
+Enter passphrase:
+.Ed
 .Sh ENCRYPTION MODES
 .Nm
 supports two encryption modes:
diff --git a/sbin/geom/class/eli/geom_eli.c b/sbin/geom/class/eli/geom_eli.c
index f7953d7b8fb..984d4898191 100644
--- a/sbin/geom/class/eli/geom_eli.c
+++ b/sbin/geom/class/eli/geom_eli.c
@@ -67,6 +67,7 @@ static void eli_attach(struct gctl_req *req);
 static void eli_configure(struct gctl_req *req);
 static void eli_setkey(struct gctl_req *req);
 static void eli_delkey(struct gctl_req *req);
+static void eli_resume(struct gctl_req *req);
 static void eli_kill(struct gctl_req *req);
 static void eli_backup(struct gctl_req *req);
 static void eli_restore(struct gctl_req *req);
@@ -89,6 +90,8 @@ static int eli_backup_create(struct gctl_req *req, const char *prov,
  * configure [-bB] prov ...
  * setkey [-pPv] [-n keyno] [-j passfile] [-J newpassfile] [-k keyfile] [-K newkeyfile] prov
  * delkey [-afv] [-n keyno] prov
+ * suspend [-v] -a | prov ...
+ * resume [-pv] [-j passfile] [-k keyfile] prov
  * kill [-av] [prov ...]
  * backup [-v] prov file
  * restore [-fv] file prov
@@ -198,6 +201,22 @@ struct g_command class_commands[] = {
 	    },
 	    "[-afv] [-n keyno] prov"
 	},
+	{ "suspend", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'a', "all", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-v] -a | prov ..."
+	},
+	{ "resume", G_FLAG_VERBOSE, eli_main,
+	    {
+		{ 'j', "passfile", G_VAL_OPTIONAL, G_TYPE_STRING | G_TYPE_MULTI },
+		{ 'k', "keyfile", G_VAL_OPTIONAL, G_TYPE_STRING | G_TYPE_MULTI },
+		{ 'p', "nopassphrase", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-pv] [-j passfile] [-k keyfile] prov"
+	},
 	{ "kill", G_FLAG_VERBOSE, eli_main,
 	    {
 		{ 'a', "all", NULL, G_TYPE_BOOL },
@@ -280,6 +299,8 @@ eli_main(struct gctl_req *req, unsigned int flags)
 		eli_setkey(req);
 	else if (strcmp(name, "delkey") == 0)
 		eli_delkey(req);
+	else if (strcmp(name, "resume") == 0)
+		eli_resume(req);
 	else if (strcmp(name, "kill") == 0)
 		eli_kill(req);
 	else if (strcmp(name, "backup") == 0)
@@ -1118,6 +1139,44 @@ eli_delkey(struct gctl_req *req)
 		eli_delkey_detached(req, prov);
 }
 
+static void
+eli_resume(struct gctl_req *req)
+{
+	struct g_eli_metadata md;
+	unsigned char key[G_ELI_USERKEYLEN];
+	const char *prov;
+	off_t mediasize;
+	int nargs;
+
+	nargs = gctl_get_int(req, "nargs");
+	if (nargs != 1) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+	prov = gctl_get_ascii(req, "arg0");
+
+	if (eli_metadata_read(req, prov, &md) == -1)
+		return;
+
+	mediasize = g_get_mediasize(prov);
+	if (md.md_provsize != (uint64_t)mediasize) {
+		gctl_error(req, "Provider size mismatch.");
+		return;
+	}
+
+	if (eli_genkey(req, &md, key, false) == NULL) {
+		bzero(key, sizeof(key));
+		return;
+	}
+
+	gctl_ro_param(req, "key", sizeof(key), key);
+	if (gctl_issue(req) == NULL) {
+		if (verbose)
+			printf("Resumed %s.\n", prov);
+	}
+	bzero(key, sizeof(key));
+}
+
 static int
 eli_trash_metadata(struct gctl_req *req, const char *prov, int fd, off_t offset)
 {
diff --git a/sys/geom/eli/g_eli.c b/sys/geom/eli/g_eli.c
index 4e15877da09..66f641b6520 100644
--- a/sys/geom/eli/g_eli.c
+++ b/sys/geom/eli/g_eli.c
@@ -106,7 +106,7 @@ struct g_class g_eli_class = {
 /*
  * Code paths:
  * BIO_READ:
- *	g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ *	g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
  * BIO_WRITE:
  *	g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
@@ -148,7 +148,7 @@ g_eli_crypto_rerun(struct cryptop *crp)
 /*
  * The function is called afer reading encrypted data from the provider.
  *
- * g_eli_start -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ * g_eli_start -> g_eli_crypto_read -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
  */
 void
 g_eli_read_done(struct bio *bp)
@@ -167,6 +167,7 @@ g_eli_read_done(struct bio *bp)
 	if (pbp->bio_inbed < pbp->bio_children)
 		return;
 	g_destroy_bio(bp);
+	sc = pbp->bio_to->geom->softc;
 	if (pbp->bio_error != 0) {
 		G_ELI_LOGREQ(0, pbp, "%s() failed", __func__);
 		pbp->bio_completed = 0;
@@ -175,9 +176,9 @@ g_eli_read_done(struct bio *bp)
 			pbp->bio_driver2 = NULL;
 		}
 		g_io_deliver(pbp, pbp->bio_error);
+		atomic_subtract_int(&sc->sc_inflight, 1);
 		return;
 	}
-	sc = pbp->bio_to->geom->softc;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, pbp);
 	mtx_unlock(&sc->sc_queue_mtx);
@@ -192,6 +193,7 @@ g_eli_read_done(struct bio *bp)
 void
 g_eli_write_done(struct bio *bp)
 {
+	struct g_eli_softc *sc;
 	struct bio *pbp;
 
 	G_ELI_LOGREQ(2, bp, "Request done.");
@@ -218,7 +220,9 @@ g_eli_write_done(struct bio *bp)
 	 * Write is finished, send it up.
 	 */
 	pbp->bio_completed = pbp->bio_length;
+	sc = pbp->bio_to->geom->softc;
 	g_io_deliver(pbp, pbp->bio_error);
+	atomic_subtract_int(&sc->sc_inflight, 1);
 }
 
 /*
@@ -241,12 +245,12 @@ g_eli_orphan(struct g_consumer *cp)
 	sc = cp->geom->softc;
 	if (sc == NULL)
 		return;
-	g_eli_destroy(sc, 1);
+	g_eli_destroy(sc, TRUE);
 }
 
 /*
  * BIO_READ:
- *	G_ELI_START -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ *	G_ELI_START -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
  * BIO_WRITE:
  *	G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
@@ -284,24 +288,16 @@ g_eli_start(struct bio *bp)
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
+	bp->bio_driver1 = cbp;
+	bp->bio_pflags = G_ELI_NEW_BIO;
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) {
-			bp->bio_driver2 = NULL;
-			cbp->bio_done = g_eli_read_done;
-			cp = LIST_FIRST(&sc->sc_geom->consumer);
-			cbp->bio_to = cp->provider;
-			G_ELI_LOGREQ(2, cbp, "Sending request.");
-			/*
-			 * Read encrypted data from provider.
-			 */
-			g_io_request(cbp, cp);
+			g_eli_crypto_read(sc, bp, 0);
 			break;
 		}
-		bp->bio_pflags = 255;
 		/* FALLTHROUGH */
 	case BIO_WRITE:
-		bp->bio_driver1 = cbp;
 		mtx_lock(&sc->sc_queue_mtx);
 		bioq_insert_tail(&sc->sc_queue, bp);
 		mtx_unlock(&sc->sc_queue_mtx);
@@ -318,6 +314,41 @@ g_eli_start(struct bio *bp)
 	}
 }
 
+static void
+g_eli_cancel(struct g_eli_softc *sc)
+{
+	struct bio *bp;
+
+	mtx_assert(&sc->sc_queue_mtx, MA_OWNED);
+
+	while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) {
+		KASSERT(bp->bio_pflags == G_ELI_NEW_BIO,
+		    ("Not new bio when canceling (bp=%p).", bp));
+		g_io_deliver(bp, ENXIO);
+	}
+}
+
+static struct bio *
+g_eli_takefirst(struct g_eli_softc *sc)
+{
+	struct bio *bp;
+
+	mtx_assert(&sc->sc_queue_mtx, MA_OWNED);
+
+	if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND))
+		return (bioq_takefirst(&sc->sc_queue));
+	/*
+	 * Device suspended, so we skip new I/O requests.
+	 */
+	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
+		if (bp->bio_pflags != G_ELI_NEW_BIO)
+			break;
+	}
+	if (bp != NULL)
+		bioq_remove(&sc->sc_queue, bp);
+	return (bp);
+}
+
 /*
  * This is the main function for kernel worker thread when we don't have
  * hardware acceleration and we have to do cryptography in software.
@@ -351,9 +382,11 @@ g_eli_worker(void *arg)
 
 	for (;;) {
 		mtx_lock(&sc->sc_queue_mtx);
-		bp = bioq_takefirst(&sc->sc_queue);
+again:
+		bp = g_eli_takefirst(sc);
 		if (bp == NULL) {
 			if (sc->sc_flags & G_ELI_FLAG_DESTROY) {
+				g_eli_cancel(sc);
 				LIST_REMOVE(wr, w_next);
 				crypto_freesession(wr->w_sid);
 				free(wr, M_ELI);
@@ -363,16 +396,54 @@ g_eli_worker(void *arg)
 				mtx_unlock(&sc->sc_queue_mtx);
 				kproc_exit(0);
 			}
+			while (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
+				if (sc->sc_inflight > 0) {
+					G_ELI_DEBUG(0, "inflight=%d", sc->sc_inflight);
+					/*
+					 * We still have inflight BIOs, so
+					 * sleep and retry.
+					 */
+					msleep(sc, &sc->sc_queue_mtx, PRIBIO,
+					    "geli:inf", hz / 5);
+					goto again;
+				}
+				/*
+				 * Suspend requested, mark the worker as
+				 * suspended and go to sleep.
+				 */
+				wr->w_active = 0;
+				wakeup(&sc->sc_workers);
+				msleep(sc, &sc->sc_queue_mtx, PRIBIO,
+				    "geli:suspend", 0);
+				if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND))
+					wr->w_active = 1;
+				goto again;
+			}
 			msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0);
 			continue;
 		}
+		if (bp->bio_pflags == G_ELI_NEW_BIO)
+			atomic_add_int(&sc->sc_inflight, 1);
 		mtx_unlock(&sc->sc_queue_mtx);
-		if (bp->bio_cmd == BIO_READ && bp->bio_pflags == 255)
-			g_eli_auth_read(sc, bp);
-		else if (sc->sc_flags & G_ELI_FLAG_AUTH)
-			g_eli_auth_run(wr, bp);
-		else
-			g_eli_crypto_run(wr, bp);
+		if (bp->bio_pflags == G_ELI_NEW_BIO) {
+			bp->bio_pflags = 0;
+			if (sc->sc_flags & G_ELI_FLAG_AUTH) {
+				if (bp->bio_cmd == BIO_READ)
+					g_eli_auth_read(sc, bp);
+				else
+					g_eli_auth_run(wr, bp);
+			} else {
+				if (bp->bio_cmd == BIO_READ)
+					g_eli_crypto_read(sc, bp, 1);
+				else
+					g_eli_crypto_run(wr, bp);
+			}
+		} else {
+			if (sc->sc_flags & G_ELI_FLAG_AUTH)
+				g_eli_auth_run(wr, bp);
+			else
+				g_eli_crypto_run(wr, bp);
+		}
 	}
 }
 
@@ -502,7 +573,7 @@ g_eli_last_close(struct g_eli_softc *sc)
 	gp = sc->sc_geom;
 	pp = LIST_FIRST(&gp->provider);
 	strlcpy(ppname, pp->name, sizeof(ppname));
-	error = g_eli_destroy(sc, 1);
+	error = g_eli_destroy(sc, TRUE);
 	KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).",
 	    ppname, error));
 	G_ELI_DEBUG(0, "Detached %s on last close.", ppname);
@@ -586,6 +657,7 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
 	else
 		gp->access = g_std_access;
 
+	sc->sc_inflight = 0;
 	sc->sc_crypto = G_ELI_CRYPTO_SW;
 	sc->sc_flags = md->md_flags;
 	/* Backward compatibility. */
@@ -730,6 +802,7 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
 		wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO);
 		wr->w_softc = sc;
 		wr->w_number = i;
+		wr->w_active = TRUE;
 
 		/*
 		 * If this is the first pass, try to get hardware support.
@@ -877,7 +950,7 @@ g_eli_destroy_geom(struct gctl_req *req __unused,
 	struct g_eli_softc *sc;
 
 	sc = gp->softc;
-	return (g_eli_destroy(sc, 0));
+	return (g_eli_destroy(sc, FALSE));
 }
 
 static int
@@ -1108,6 +1181,7 @@ g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
+		ADD_FLAG(G_ELI_FLAG_SUSPEND, "SUSPEND");
 		ADD_FLAG(G_ELI_FLAG_SINGLE_KEY, "SINGLE-KEY");
 		ADD_FLAG(G_ELI_FLAG_NATIVE_BYTE_ORDER, "NATIVE-BYTE-ORDER");
 		ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME");
@@ -1169,7 +1243,7 @@ g_eli_shutdown_pre_sync(void *arg, int howto)
 		pp = LIST_FIRST(&gp->provider);
 		KASSERT(pp != NULL, ("No provider? gp=%p (%s)", gp, gp->name));
 		if (pp->acr + pp->acw + pp->ace == 0)
-			error = g_eli_destroy(sc, 1);
+			error = g_eli_destroy(sc, TRUE);
 		else {
 			sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
 			gp->access = g_eli_access;
diff --git a/sys/geom/eli/g_eli.h b/sys/geom/eli/g_eli.h
index e6b311d62f6..fd53d5fea83 100644
--- a/sys/geom/eli/g_eli.h
+++ b/sys/geom/eli/g_eli.h
@@ -86,6 +86,10 @@
 #define	G_ELI_FLAG_NATIVE_BYTE_ORDER	0x00040000
 /* Provider uses single encryption key. */
 #define	G_ELI_FLAG_SINGLE_KEY		0x00080000
+/* Device suspended. */
+#define	G_ELI_FLAG_SUSPEND		0x00100000
+
+#define	G_ELI_NEW_BIO	255
 
 #define	SHA512_MDLEN		64
 #define	G_ELI_AUTH_SECKEYLEN	SHA256_DIGEST_LENGTH
@@ -140,6 +144,7 @@ struct g_eli_worker {
 	struct proc		*w_proc;
 	u_int			 w_number;
 	uint64_t		 w_sid;
+	boolean_t		 w_active;
 	LIST_ENTRY(g_eli_worker) w_next;
 };
 
@@ -160,6 +165,7 @@ struct g_eli_softc {
 	SHA256_CTX	  sc_ivctx;
 	int		  sc_nkey;
 	uint32_t	  sc_flags;
+	int		  sc_inflight;
 	off_t		  sc_mediasize;
 	size_t		  sc_sectorsize;
 	u_int		  sc_bytes_per_sector;
@@ -499,6 +505,7 @@ uint8_t *g_eli_crypto_key(struct g_eli_softc *sc, off_t offset,
 void g_eli_crypto_ivgen(struct g_eli_softc *sc, off_t offset, u_char *iv,
     size_t size);
 
+void g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker);
 void g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp);
 
 void g_eli_auth_read(struct g_eli_softc *sc, struct bio *bp);
diff --git a/sys/geom/eli/g_eli_ctl.c b/sys/geom/eli/g_eli_ctl.c
index 02ede13ecc4..7147b270df8 100644
--- a/sys/geom/eli/g_eli_ctl.c
+++ b/sys/geom/eli/g_eli_ctl.c
@@ -217,7 +217,7 @@ g_eli_ctl_detach(struct gctl_req *req, struct g_class *mp)
 			sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
 			sc->sc_geom->access = g_eli_access;
 		} else {
-			error = g_eli_destroy(sc, *force);
+			error = g_eli_destroy(sc, *force ? TRUE : FALSE);
 			if (error != 0) {
 				gctl_error(req,
 				    "Cannot destroy device %s (error=%d).",
@@ -699,6 +699,213 @@ g_eli_ctl_delkey(struct gctl_req *req, struct g_class *mp)
 		G_ELI_DEBUG(1, "Key %d removed from %s.", nkey, pp->name);
 }
 
+static int
+g_eli_suspend_one(struct g_eli_softc *sc)
+{
+	struct g_eli_worker *wr;
+
+	g_topology_assert();
+
+	if (sc == NULL)
+		return (ENOENT);
+	if (sc->sc_flags & G_ELI_FLAG_ONETIME)
+		return (EOPNOTSUPP);
+
+	mtx_lock(&sc->sc_queue_mtx);
+	if (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
+		mtx_unlock(&sc->sc_queue_mtx);
+		return (EALREADY);
+	}
+	sc->sc_flags |= G_ELI_FLAG_SUSPEND;
+	wakeup(sc);
+	for (;;) {
+		LIST_FOREACH(wr, &sc->sc_workers, w_next) {
+			if (wr->w_active)
+				break;
+		}
+		if (wr == NULL)
+			break;
+		/* Not all threads suspended. */
+		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
+		    "geli:suspend", 0);
+	}
+	/*
+	 * Clear sensitive data on suspend, they will be recovered on resume.
+	 */
+	bzero(sc->sc_mkey, sizeof(sc->sc_mkey));
+	bzero(sc->sc_ekeys,
+	    sc->sc_nekeys * (sizeof(uint8_t *) + G_ELI_DATAKEYLEN));
+	free(sc->sc_ekeys, M_ELI);
+	sc->sc_ekeys = NULL;
+	bzero(sc->sc_akey, sizeof(sc->sc_akey));
+	bzero(&sc->sc_akeyctx, sizeof(sc->sc_akeyctx));
+	bzero(sc->sc_ivkey, sizeof(sc->sc_ivkey));
+	bzero(&sc->sc_ivctx, sizeof(sc->sc_ivctx));
+	mtx_unlock(&sc->sc_queue_mtx);
+	G_ELI_DEBUG(0, "%s has been suspended.", sc->sc_name);
+	return (0);
+}
+
+static void
+g_eli_ctl_suspend(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_eli_softc *sc;
+	int *all, *nargs;
+	int error;
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	all = gctl_get_paraml(req, "all", sizeof(*all));
+	if (all == NULL) {
+		gctl_error(req, "No '%s' argument.", "all");
+		return;
+	}
+	if (!*all && *nargs == 0) {
+		gctl_error(req, "Too few arguments.");
+		return;
+	}
+
+	if (*all) {
+		struct g_geom *gp, *gp2;
+
+		LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
+			sc = gp->softc;
+			if (sc->sc_flags & G_ELI_FLAG_ONETIME)
+				continue;
+			error = g_eli_suspend_one(sc);
+			if (error != 0)
+				gctl_error(req, "Not fully done.");
+		}
+	} else {
+		const char *prov;
+		char param[16];
+		int i;
+
+		for (i = 0; i < *nargs; i++) {
+			snprintf(param, sizeof(param), "arg%d", i);
+			prov = gctl_get_asciiparam(req, param);
+			if (prov == NULL) {
+				G_ELI_DEBUG(0, "No 'arg%d' argument.", i);
+				continue;
+			}
+
+			sc = g_eli_find_device(mp, prov);
+			if (sc == NULL) {
+				G_ELI_DEBUG(0, "No such provider: %s.", prov);
+				continue;
+			}
+			error = g_eli_suspend_one(sc);
+			if (error != 0)
+				gctl_error(req, "Not fully done.");
+		}
+	}
+}
+
+static void
+g_eli_ctl_resume(struct gctl_req *req, struct g_class *mp)
+{
+	struct g_eli_metadata md;
+	struct g_eli_softc *sc;
+	struct g_provider *pp;
+	struct g_consumer *cp;
+	const char *name;
+	u_char *key, mkey[G_ELI_DATAIVKEYLEN];
+	int *nargs, keysize, error;
+	u_int nkey;
+
+	g_topology_assert();
+
+	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
+	if (nargs == NULL) {
+		gctl_error(req, "No '%s' argument.", "nargs");
+		return;
+	}
+	if (*nargs != 1) {
+		gctl_error(req, "Invalid number of arguments.");
+		return;
+	}
+
+	name = gctl_get_asciiparam(req, "arg0");
+	if (name == NULL) {
+		gctl_error(req, "No 'arg%u' argument.", 0);
+		return;
+	}
+	sc = g_eli_find_device(mp, name);
+	if (sc == NULL) {
+		gctl_error(req, "Provider %s is invalid.", name);
+		return;
+	}
+	if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND)) {
+		gctl_error(req, "Provider %s not suspended.", name);
+		return;
+	}
+	cp = LIST_FIRST(&sc->sc_geom->consumer);
+	pp = cp->provider;
+	error = g_eli_read_metadata(mp, pp, &md);
+	if (error != 0) {
+		gctl_error(req, "Cannot read metadata from %s (error=%d).",
+		    name, error);
+		return;
+	}
+	if (md.md_keys == 0x00) {
+		bzero(&md, sizeof(md));
+		gctl_error(req, "No valid keys on %s.", pp->name);
+		return;
+	}
+
+	key = gctl_get_param(req, "key", &keysize);
+	if (key == NULL || keysize != G_ELI_USERKEYLEN) {
+		bzero(&md, sizeof(md));
+		gctl_error(req, "No '%s' argument.", "key");
+		return;
+	}
+
+	error = g_eli_mkey_decrypt(&md, key, mkey, &nkey);
+	bzero(key, keysize);
+	if (error == -1) {
+		bzero(&md, sizeof(md));
+		gctl_error(req, "Wrong key for %s.", pp->name);
+		return;
+	} else if (error > 0) {
+		bzero(&md, sizeof(md));
+		gctl_error(req, "Cannot decrypt Master Key for %s (error=%d).",
+		    pp->name, error);
+		return;
+	}
+	G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name);
+
+	mtx_lock(&sc->sc_queue_mtx);
+	/* Restore sc_mkey, sc_ekeys, sc_akey and sc_ivkey. */
+	g_eli_mkey_propagate(sc, mkey);
+	bzero(mkey, sizeof(mkey));
+	bzero(&md, sizeof(md));
+	/* Restore sc_akeyctx. */
+	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
+		SHA256_Init(&sc->sc_akeyctx);
+		SHA256_Update(&sc->sc_akeyctx, sc->sc_akey,
+		    sizeof(sc->sc_akey));
+	}
+	/* Restore sc_ivctx. */
+	switch (sc->sc_ealgo) {
+	case CRYPTO_AES_XTS:
+		break;
+	default:
+		SHA256_Init(&sc->sc_ivctx);
+		SHA256_Update(&sc->sc_ivctx, sc->sc_ivkey,
+		    sizeof(sc->sc_ivkey));
+		break;
+	}
+	sc->sc_flags &= ~G_ELI_FLAG_SUSPEND;
+	mtx_unlock(&sc->sc_queue_mtx);
+	G_ELI_DEBUG(1, "Resumed %s.", pp->name);
+	wakeup(sc);
+}
+
 static int
 g_eli_kill_one(struct g_eli_softc *sc)
 {
@@ -749,7 +956,7 @@ g_eli_kill_one(struct g_eli_softc *sc)
 	}
 	if (error == 0)
 		G_ELI_DEBUG(0, "%s has been killed.", pp->name);
-	g_eli_destroy(sc, 1);
+	g_eli_destroy(sc, TRUE);
 	return (error);
 }
 
@@ -839,6 +1046,10 @@ g_eli_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 		g_eli_ctl_setkey(req, mp);
 	else if (strcmp(verb, "delkey") == 0)
 		g_eli_ctl_delkey(req, mp);
+	else if (strcmp(verb, "suspend") == 0)
+		g_eli_ctl_suspend(req, mp);
+	else if (strcmp(verb, "resume") == 0)
+		g_eli_ctl_resume(req, mp);
 	else if (strcmp(verb, "kill") == 0)
 		g_eli_ctl_kill(req, mp);
 	else
diff --git a/sys/geom/eli/g_eli_integrity.c b/sys/geom/eli/g_eli_integrity.c
index 0103a32c90f..24586bd786c 100644
--- a/sys/geom/eli/g_eli_integrity.c
+++ b/sys/geom/eli/g_eli_integrity.c
@@ -129,6 +129,7 @@ g_eli_auth_keygen(struct g_eli_softc *sc, off_t offset, u_char *key)
 static int
 g_eli_auth_read_done(struct cryptop *crp)
 {
+	struct g_eli_softc *sc;
 	struct bio *bp;
 
 	if (crp->crp_etype == EAGAIN) {
@@ -152,8 +153,8 @@ g_eli_auth_read_done(struct cryptop *crp)
 	 */
 	if (bp->bio_inbed < bp->bio_children)
 		return (0);
+	sc = bp->bio_to->geom->softc;
 	if (bp->bio_error == 0) {
-		struct g_eli_softc *sc;
 		u_int i, lsec, nsec, data_secsize, decr_secsize, encr_secsize;
 		u_char *srcdata, *dstdata, *auth;
 		off_t coroff, corsize;
@@ -161,7 +162,6 @@ g_eli_auth_read_done(struct cryptop *crp)
 		/*
 		 * Verify data integrity based on calculated and read HMACs.
 		 */
-		sc = bp->bio_to->geom->softc;
 		/* Sectorsize of decrypted provider eg. 4096. */
 		decr_secsize = bp->bio_to->sectorsize;
 		/* The real sectorsize of encrypted provider, eg. 512. */
@@ -240,6 +240,7 @@ g_eli_auth_read_done(struct cryptop *crp)
 	 * Read is finished, send it up.
 	 */
 	g_io_deliver(bp, bp->bio_error);
+	atomic_subtract_int(&sc->sc_inflight, 1);
 	return (0);
 }
 
@@ -276,6 +277,7 @@ g_eli_auth_write_done(struct cryptop *crp)
 	 */
 	if (bp->bio_inbed < bp->bio_children)
 		return (0);
+	sc = bp->bio_to->geom->softc;
 	if (bp->bio_error != 0) {
 		G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
 		    bp->bio_error);
@@ -285,9 +287,9 @@ g_eli_auth_write_done(struct cryptop *crp)
 		bp->bio_driver1 = NULL;
 		g_destroy_bio(cbp);
 		g_io_deliver(bp, bp->bio_error);
+		atomic_subtract_int(&sc->sc_inflight, 1);
 		return (0);
 	}
-	sc = bp->bio_to->geom->softc;
 	cp = LIST_FIRST(&sc->sc_geom->consumer);
 	cbp = bp->bio_driver1;
 	bp->bio_driver1 = NULL;
diff --git a/sys/geom/eli/g_eli_privacy.c b/sys/geom/eli/g_eli_privacy.c
index c2814bc6d07..ee133c6a200 100644
--- a/sys/geom/eli/g_eli_privacy.c
+++ b/sys/geom/eli/g_eli_privacy.c
@@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$");
 /*
  * Code paths:
  * BIO_READ:
- *	g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ *	g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
  * BIO_WRITE:
  *	g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */
@@ -63,11 +63,12 @@ MALLOC_DECLARE(M_ELI);
 /*
  * The function is called after we read and decrypt data.
  *
- * g_eli_start -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> G_ELI_CRYPTO_READ_DONE -> g_io_deliver
+ * g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> G_ELI_CRYPTO_READ_DONE -> g_io_deliver
  */
 static int
 g_eli_crypto_read_done(struct cryptop *crp)
 {
+	struct g_eli_softc *sc;
 	struct bio *bp;
 
 	if (crp->crp_etype == EAGAIN) {
@@ -101,7 +102,9 @@ g_eli_crypto_read_done(struct cryptop *crp)
 	/*
 	 * Read is finished, send it up.
 	 */
+	sc = bp->bio_to->geom->softc;
 	g_io_deliver(bp, bp->bio_error);
+	atomic_subtract_int(&sc->sc_inflight, 1);
 	return (0);
 }
 
@@ -113,6 +116,7 @@ g_eli_crypto_read_done(struct cryptop *crp)
 static int
 g_eli_crypto_write_done(struct cryptop *crp)
 {
+	struct g_eli_softc *sc;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct bio *bp, *cbp;
@@ -141,18 +145,20 @@ g_eli_crypto_write_done(struct cryptop *crp)
 	bp->bio_children = 1;
 	cbp = bp->bio_driver1;
 	bp->bio_driver1 = NULL;
+	gp = bp->bio_to->geom;
 	if (bp->bio_error != 0) {
 		G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
 		    bp->bio_error);
 		free(bp->bio_driver2, M_ELI);
 		bp->bio_driver2 = NULL;
 		g_destroy_bio(cbp);
+		sc = gp->softc;
 		g_io_deliver(bp, bp->bio_error);
+		atomic_subtract_int(&sc->sc_inflight, 1);
 		return (0);
 	}
 	cbp->bio_data = bp->bio_driver2;
 	cbp->bio_done = g_eli_write_done;
-	gp = bp->bio_to->geom;
 	cp = LIST_FIRST(&gp->consumer);
 	cbp->bio_to = cp->provider;
 	G_ELI_LOGREQ(2, cbp, "Sending request.");
@@ -163,12 +169,56 @@ g_eli_crypto_write_done(struct cryptop *crp)
 	return (0);
 }
 
+/*
+ * The function is called to read encrypted data.
+ *
+ * g_eli_start -> G_ELI_CRYPTO_READ -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
+ */
+void
+g_eli_crypto_read(struct g_eli_softc *sc, struct bio *bp, boolean_t fromworker)
+{
+	struct g_consumer *cp;
+	struct bio *cbp;
+
+	if (!fromworker) {
+		/*
+		 * We are not called from the worker thread, so check if
+		 * device is suspended.
+		 */
+		mtx_lock(&sc->sc_queue_mtx);
+		if (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
+			/*
+			 * If device is suspended, we place the request onto
+			 * the queue, so it can be handled after resume.
+			 */
+			G_ELI_DEBUG(0, "device suspended, move onto queue");
+			bioq_insert_tail(&sc->sc_queue, bp);
+			mtx_unlock(&sc->sc_queue_mtx);
+			wakeup(sc);
+			return;
+		}
+		atomic_add_int(&sc->sc_inflight, 1);
+		mtx_unlock(&sc->sc_queue_mtx);
+	}
+	bp->bio_pflags = 0;
+	bp->bio_driver2 = NULL;
+	cbp = bp->bio_driver1;
+	cbp->bio_done = g_eli_read_done;
+	cp = LIST_FIRST(&sc->sc_geom->consumer);
+	cbp->bio_to = cp->provider;
+	G_ELI_LOGREQ(2, cbp, "Sending request.");
+	/*
+	 * Read encrypted data from provider.
+	 */
+	g_io_request(cbp, cp);
+}
+
 /*
  * This is the main function responsible for cryptography (ie. communication
  * with crypto(9) subsystem).
  *
  * BIO_READ:
- *	g_eli_start -> g_io_request -> g_eli_read_done -> G_ELI_CRYPTO_RUN -> g_eli_crypto_read_done -> g_io_deliver
+ *	g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> G_ELI_CRYPTO_RUN -> g_eli_crypto_read_done -> g_io_deliver
  * BIO_WRITE:
  *	g_eli_start -> G_ELI_CRYPTO_RUN -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
  */

From 6c71649c5f033d5c1e6d1899ad39b31cb576efe8 Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Date: Wed, 20 Oct 2010 21:10:01 +0000
Subject: [PATCH 22/68] Use closefrom(2) instead of close(2) in a loop.

MFC after:	1 week
---
 sbin/hastd/hooks.c | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/sbin/hastd/hooks.c b/sbin/hastd/hooks.c
index 2e43357fdea..bd132427e89 100644
--- a/sbin/hastd/hooks.c
+++ b/sbin/hastd/hooks.c
@@ -88,32 +88,19 @@ static void hook_free(struct hookproc *hp);
 static void
 descriptors(void)
 {
-	long maxfd;
 	int fd;
 
 	/*
-	 * Close all descriptors.
+	 * Close all (or almost all) descriptors.
 	 */
-	maxfd = sysconf(_SC_OPEN_MAX);
-	if (maxfd < 0) {
-		pjdlog_errno(LOG_WARNING, "sysconf(_SC_OPEN_MAX) failed");
-		maxfd = 1024;
-	}
-	for (fd = 0; fd <= maxfd; fd++) {
-		switch (fd) {
-		case STDIN_FILENO:
-		case STDOUT_FILENO:
-		case STDERR_FILENO:
-			if (pjdlog_mode_get() == PJDLOG_MODE_STD)
-				break;
-			/* FALLTHROUGH */
-		default:
-			close(fd);
-			break;
-		}
-	}
-	if (pjdlog_mode_get() == PJDLOG_MODE_STD)
+	if (pjdlog_mode_get() == PJDLOG_MODE_STD) {
+		closefrom(MAX(MAX(STDIN_FILENO, STDOUT_FILENO),
+		    STDERR_FILENO) + 1);
 		return;
+	}
+
+	closefrom(0);
+
 	/*
 	 * Redirect stdin, stdout and stderr to /dev/null.
 	 */

From d815d0abb75d3660992017661fe4546c05b02a74 Mon Sep 17 00:00:00 2001
From: Jung-uk Kim <jkim@FreeBSD.org>
Date: Wed, 20 Oct 2010 23:41:16 +0000
Subject: [PATCH 23/68] Update PCI power management registers per PCI Bus Power
 Management Interface Specification Rev. 1.2.  Rename pp_pmcsr field of PM
 capabilities to pp_bse to avoid further confusion and adjust some comments
 accordingly.  The real PMCSR (Power Management Control/Status Register) is
 PCIR_POWER_STATUS; the renamed field is actually the BSE (PCI-to-PCI Bridge
 Support Extensions) register.

---
 sys/dev/pci/pci.c    |  2 +-
 sys/dev/pci/pcireg.h | 32 ++++++++++++++++++--------------
 sys/dev/pci/pcivar.h |  6 +++---
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index 21263b726a4..6534ba2d3ea 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -600,7 +600,7 @@ pci_read_extcap(device_t pcib, pcicfgregs *cfg)
 			if (cfg->pp.pp_cap == 0) {
 				cfg->pp.pp_cap = REG(ptr + PCIR_POWER_CAP, 2);
 				cfg->pp.pp_status = ptr + PCIR_POWER_STATUS;
-				cfg->pp.pp_pmcsr = ptr + PCIR_POWER_PMCSR;
+				cfg->pp.pp_bse = ptr + PCIR_POWER_BSE;
 				if ((nextptr - ptr) > PCIR_POWER_DATA)
 					cfg->pp.pp_data = ptr + PCIR_POWER_DATA;
 			}
diff --git a/sys/dev/pci/pcireg.h b/sys/dev/pci/pcireg.h
index a0d12dbb8d2..02fa7ea2dce 100644
--- a/sys/dev/pci/pcireg.h
+++ b/sys/dev/pci/pcireg.h
@@ -427,12 +427,16 @@
 #define	PCIR_POWER_CAP		0x2
 #define	PCIM_PCAP_SPEC			0x0007
 #define	PCIM_PCAP_PMEREQCLK		0x0008
-#define	PCIM_PCAP_PMEREQPWR		0x0010
 #define	PCIM_PCAP_DEVSPECINIT		0x0020
-#define	PCIM_PCAP_DYNCLOCK		0x0040
-#define	PCIM_PCAP_SECCLOCK		0x00c0
-#define	PCIM_PCAP_CLOCKMASK		0x00c0
-#define	PCIM_PCAP_REQFULLCLOCK		0x0100
+#define	PCIM_PCAP_AUXPWR_0		0x0000
+#define	PCIM_PCAP_AUXPWR_55		0x0040
+#define	PCIM_PCAP_AUXPWR_100		0x0080
+#define	PCIM_PCAP_AUXPWR_160		0x00c0
+#define	PCIM_PCAP_AUXPWR_220		0x0100
+#define	PCIM_PCAP_AUXPWR_270		0x0140
+#define	PCIM_PCAP_AUXPWR_320		0x0180
+#define	PCIM_PCAP_AUXPWR_375		0x01c0
+#define	PCIM_PCAP_AUXPWRMASK		0x01c0
 #define	PCIM_PCAP_D1SUPP		0x0200
 #define	PCIM_PCAP_D2SUPP		0x0400
 #define	PCIM_PCAP_D0PME			0x0800
@@ -447,16 +451,17 @@
 #define	PCIM_PSTAT_D2			0x0002
 #define	PCIM_PSTAT_D3			0x0003
 #define	PCIM_PSTAT_DMASK		0x0003
-#define	PCIM_PSTAT_REPENABLE		0x0010
+#define	PCIM_PSTAT_NOSOFTRESET		0x0008
 #define	PCIM_PSTAT_PMEENABLE		0x0100
 #define	PCIM_PSTAT_D0POWER		0x0000
 #define	PCIM_PSTAT_D1POWER		0x0200
 #define	PCIM_PSTAT_D2POWER		0x0400
 #define	PCIM_PSTAT_D3POWER		0x0600
 #define	PCIM_PSTAT_D0HEAT		0x0800
-#define	PCIM_PSTAT_D1HEAT		0x1000
-#define	PCIM_PSTAT_D2HEAT		0x1200
-#define	PCIM_PSTAT_D3HEAT		0x1400
+#define	PCIM_PSTAT_D1HEAT		0x0a00
+#define	PCIM_PSTAT_D2HEAT		0x0c00
+#define	PCIM_PSTAT_D3HEAT		0x0e00
+#define	PCIM_PSTAT_DATASELMASK		0x1e00
 #define	PCIM_PSTAT_DATAUNKN		0x0000
 #define	PCIM_PSTAT_DATADIV10		0x2000
 #define	PCIM_PSTAT_DATADIV100		0x4000
@@ -464,11 +469,10 @@
 #define	PCIM_PSTAT_DATADIVMASK		0x6000
 #define	PCIM_PSTAT_PME			0x8000
 
-#define	PCIR_POWER_PMCSR	0x6
-#define	PCIM_PMCSR_DCLOCK		0x10
-#define	PCIM_PMCSR_B2SUPP		0x20
-#define	PCIM_BMCSR_B3SUPP		0x40
-#define	PCIM_BMCSR_BPCE			0x80
+#define	PCIR_POWER_BSE		0x6
+#define	PCIM_PMCSR_BSE_D3B3		0x00
+#define	PCIM_PMCSR_BSE_D3B2		0x40
+#define	PCIM_PMCSR_BSE_BPCCE		0x80
 
 #define	PCIR_POWER_DATA		0x7
 
diff --git a/sys/dev/pci/pcivar.h b/sys/dev/pci/pcivar.h
index d6a2a0e9ba9..aee967a7957 100644
--- a/sys/dev/pci/pcivar.h
+++ b/sys/dev/pci/pcivar.h
@@ -42,9 +42,9 @@ typedef uint64_t pci_addr_t;
 /* Interesting values for PCI power management */
 struct pcicfg_pp {
     uint16_t	pp_cap;		/* PCI power management capabilities */
-    uint8_t	pp_status;	/* config space address of PCI power status reg */
-    uint8_t	pp_pmcsr;	/* config space address of PMCSR reg */
-    uint8_t	pp_data;	/* config space address of PCI power data reg */
+    uint8_t	pp_status;	/* conf. space addr. of PM control/status reg */
+    uint8_t	pp_bse;		/* conf. space addr. of PM BSE reg */
+    uint8_t	pp_data;	/* conf. space addr. of PM data reg */
 };
  
 struct vpd_readonly {

From e782099404e7c96bc1fdbf04af156c864d69c5a6 Mon Sep 17 00:00:00 2001
From: Edwin Groothuis <edwin@FreeBSD.org>
Date: Thu, 21 Oct 2010 06:52:14 +0000
Subject: [PATCH 24/68] Fix printing of files located on ZFS filesystem with an
 st_dev or st_ino larger than 2**31.

From the PR:

   Printing from a ZFS filesystem using 'lp' fails and returns an
   email reporting "Your printer job was not printed because it was
   not linked to the original file".

   In order to protect against files being switched when files
   are printed using 'lp' or 'lpr -s', the st_dev and st_ino
   values for the original file are saved by lpr and verified
   by lpd before the file is printed. Unfortunately, lpr prints
   both values using '%d' (although both fields are unsigned)
   and lpd(8) assumes a string of decimal digits.

   ZFS (at least) generates st_dev values greater than 2^31-1,
   resulting in negative values being printed - which lpd cannot
   parse, leading it to report that the file has been switched.

   A similar problem would occur with large inode numbers.

   How-To-Repeat:

   Find a file with either st_dev or st_ino greater than 2^31-1
   (stat(1) will report both numbers) and print it with 'lpr -s'.
   This should generate an email reporting that the file could
   not be printed because it was not linked to the original file.

PR:		bin/151567
Submitted by:	Peter Jeremy <Peter.Jeremy@alcatel-lucent.com>
MFC after:	1 week
---
 usr.sbin/lpr/lpr/lpr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/usr.sbin/lpr/lpr/lpr.c b/usr.sbin/lpr/lpr/lpr.c
index 98e9cea4327..c2f88a07af6 100644
--- a/usr.sbin/lpr/lpr/lpr.c
+++ b/usr.sbin/lpr/lpr/lpr.c
@@ -386,7 +386,7 @@ main(int argc, char *argv[])
 			continue;	/* file unreasonable */
 
 		if (sflag && (cp = linked(arg)) != NULL) {
-			(void) snprintf(buf, sizeof(buf), "%d %d", statb.st_dev,
+			(void) snprintf(buf, sizeof(buf), "%u %u", statb.st_dev,
 				statb.st_ino);
 			card('S', buf);
 			if (format == 'p')

From 00e3c12e031607360d67e4ca23f1ccfc01596183 Mon Sep 17 00:00:00 2001
From: Xin LI <delphij@FreeBSD.org>
Date: Thu, 21 Oct 2010 08:57:25 +0000
Subject: [PATCH 25/68] In syscall_module_handler(): all switch branches
 return, remove unreached code as pointed out in a Chinese forum [1].

[1] http://www.freebsdchina.org/forum/viewtopic.php?t=50619

Pointed out by:		btw616 <btw s qq com>
MFC after:		1 month
---
 sys/kern/kern_syscalls.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c
index d1a5c0dba5f..aa3bb34acef 100644
--- a/sys/kern/kern_syscalls.c
+++ b/sys/kern/kern_syscalls.c
@@ -184,10 +184,7 @@ syscall_module_handler(struct module *mod, int what, void *arg)
 		return EOPNOTSUPP;
 	}
 
-	if (data->chainevh)
-		return (data->chainevh(mod, what, data->chainarg));
-	else
-		return (0);
+	/* NOTREACHED */
 }
 
 int

From 91cca30d9c3c2a9926e4b9c6bb02c331e69000dc Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Date: Thu, 21 Oct 2010 10:36:36 +0000
Subject: [PATCH 26/68] - Simplify gctl_get_handle() a bit. - Prefer 'unsigned
 int' over 'u_int' in userland code.

---
 lib/libgeom/geom_ctl.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/lib/libgeom/geom_ctl.c b/lib/libgeom/geom_ctl.c
index 59691cc2e9f..3ede5f4fbc1 100644
--- a/lib/libgeom/geom_ctl.c
+++ b/lib/libgeom/geom_ctl.c
@@ -54,7 +54,7 @@ static char nomemmsg[] = "Could not allocate memory";
 void
 gctl_dump(struct gctl_req *req, FILE *f)
 {
-	u_int i;
+	unsigned int i;
 	int j;
 	struct gctl_req_arg *ap;
 
@@ -126,10 +126,8 @@ gctl_check_alloc(struct gctl_req *req, void *ptr)
 struct gctl_req *
 gctl_get_handle(void)
 {
-	struct gctl_req *rp;
 
-	rp = calloc(1, sizeof *rp);
-	return (rp);
+	return (calloc(1, sizeof(struct gctl_req)));
 }
 
 /*
@@ -233,7 +231,7 @@ gctl_issue(struct gctl_req *req)
 void
 gctl_free(struct gctl_req *req)
 {
-	u_int i;
+	unsigned int i;
 
 	if (req == NULL)
 		return;

From 42e2e8990aa02c57d516bc80a64de7894778f375 Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Date: Thu, 21 Oct 2010 10:38:14 +0000
Subject: [PATCH 27/68] Remove code duplication by introducing static
 gctl_param_add() function which is now used by both gctl_ro_param() and
 gctl_rw_param().

---
 lib/libgeom/geom_ctl.c | 50 ++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 29 deletions(-)

diff --git a/lib/libgeom/geom_ctl.c b/lib/libgeom/geom_ctl.c
index 3ede5f4fbc1..956169f9d85 100644
--- a/lib/libgeom/geom_ctl.c
+++ b/lib/libgeom/geom_ctl.c
@@ -150,33 +150,9 @@ gctl_new_arg(struct gctl_req *req)
 	return (ap);
 }
 
-void
-gctl_ro_param(struct gctl_req *req, const char *name, int len, const void* value)
-{
-	struct gctl_req_arg *ap;
-
-	if (req == NULL || req->error != NULL)
-		return;
-	ap = gctl_new_arg(req);
-	if (ap == NULL)
-		return;
-	ap->name = strdup(name);
-	gctl_check_alloc(req, ap->name);
-	if (ap->name == NULL)
-		return;
-	ap->nlen = strlen(ap->name) + 1;
-	ap->value = __DECONST(void *, value);
-	ap->flag = GCTL_PARAM_RD;
-	if (len >= 0)
-		ap->len = len;
-	else if (len < 0) {
-		ap->flag |= GCTL_PARAM_ASCII;
-		ap->len = strlen(value) + 1;	
-	}
-}
-
-void
-gctl_rw_param(struct gctl_req *req, const char *name, int len, void* value)
+static void
+gctl_param_add(struct gctl_req *req, const char *name, int len, void *value,
+    int flag)
 {
 	struct gctl_req_arg *ap;
 
@@ -191,11 +167,27 @@ gctl_rw_param(struct gctl_req *req, const char *name, int len, void* value)
 		return;
 	ap->nlen = strlen(ap->name) + 1;
 	ap->value = value;
-	ap->flag = GCTL_PARAM_RW;
+	ap->flag = flag;
 	if (len >= 0)
 		ap->len = len;
-	else if (len < 0)
+	else if (len < 0) {
+		ap->flag |= GCTL_PARAM_ASCII;
 		ap->len = strlen(value) + 1;	
+	}
+}
+
+void
+gctl_ro_param(struct gctl_req *req, const char *name, int len, const void* value)
+{
+
+	gctl_param_add(req, name, len, __DECONST(void *, value), GCTL_PARAM_RD);
+}
+
+void
+gctl_rw_param(struct gctl_req *req, const char *name, int len, void *value)
+{
+
+	gctl_param_add(req, name, len, value, GCTL_PARAM_RW);
 }
 
 const char *

From d63e9da300d41547bcdc9b0e9591ae9c84067318 Mon Sep 17 00:00:00 2001
From: Sergey Kandaurov <pluknet@FreeBSD.org>
Date: Thu, 21 Oct 2010 10:38:52 +0000
Subject: [PATCH 28/68] Update PD state firmware definitions: add copyback,
 system.

Reviewed by:	jhb
Approved by:	avg (mentor)
MFC after:	1 week
---
 sys/dev/mfi/mfireg.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sys/dev/mfi/mfireg.h b/sys/dev/mfi/mfireg.h
index 17ab4b3b5cf..e08a16dd471 100644
--- a/sys/dev/mfi/mfireg.h
+++ b/sys/dev/mfi/mfireg.h
@@ -975,7 +975,9 @@ enum mfi_pd_state {
 	MFI_PD_STATE_OFFLINE = 0x10,
 	MFI_PD_STATE_FAILED = 0x11,
 	MFI_PD_STATE_REBUILD = 0x14,
-	MFI_PD_STATE_ONLINE = 0x18
+	MFI_PD_STATE_ONLINE = 0x18,
+	MFI_PD_STATE_COPYBACK = 0x20,
+	MFI_PD_STATE_SYSTEM = 0x40
 };
 
 union mfi_ld_ref {

From 08f2463092b7a5f64fe082921911c7dd170e1625 Mon Sep 17 00:00:00 2001
From: Sergey Kandaurov <pluknet@FreeBSD.org>
Date: Thu, 21 Oct 2010 10:46:18 +0000
Subject: [PATCH 29/68] Enable copyback and system PD states.

Reviewed by:	jhb
Approved by:	avg (mentor)
MFC after:	1 week
X-MFC-After:	r214130
---
 usr.sbin/mfiutil/mfi_drive.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/usr.sbin/mfiutil/mfi_drive.c b/usr.sbin/mfiutil/mfi_drive.c
index 104422814ea..5a57c09f132 100644
--- a/usr.sbin/mfiutil/mfi_drive.c
+++ b/usr.sbin/mfiutil/mfi_drive.c
@@ -65,6 +65,10 @@ mfi_pdstate(enum mfi_pd_state state)
 		return ("REBUILD");
 	case MFI_PD_STATE_ONLINE:
 		return ("ONLINE");
+	case MFI_PD_STATE_COPYBACK:
+		return ("COPYBACK");
+	case MFI_PD_STATE_SYSTEM:
+		return ("SYSTEM");
 	default:
 		sprintf(buf, "PSTATE 0x%04x", state);
 		return (buf);

From 2914feeb7e7bc6d1ef98809230b82c6a52364a1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ulrich=20Sp=C3=B6rlein?= <uqs@FreeBSD.org>
Date: Thu, 21 Oct 2010 12:27:13 +0000
Subject: [PATCH 30/68] mdoc: make pages render with mandoc

It's a bit more pedantic regarding .Bl list elements. This has an added
benefit of unbreaking the ipfw(8) manpage, where groff was silently
skipping one list element.
---
 lib/libc/sys/getpriority.2       | 2 +-
 lib/libc/sys/pathconf.2          | 3 +--
 lib/libc/sys/stat.2              | 2 +-
 sbin/fsirand/fsirand.8           | 2 +-
 sbin/ipfw/ipfw.8                 | 3 +--
 sbin/restore/restore.8           | 1 -
 share/man/man4/iscsi_initiator.4 | 2 +-
 share/man/man5/elf.5             | 2 +-
 share/man/man9/sysctl_add_oid.9  | 2 +-
 usr.bin/mesg/mesg.1              | 2 +-
 usr.sbin/mtest/mtest.8           | 2 +-
 11 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/lib/libc/sys/getpriority.2 b/lib/libc/sys/getpriority.2
index 28d1f1490d3..ae70f5f441d 100644
--- a/lib/libc/sys/getpriority.2
+++ b/lib/libc/sys/getpriority.2
@@ -129,10 +129,10 @@ or
 .Dv PRIO_USER .
 .El
 .Pp
-.Bl -tag -width Er
 In addition to the errors indicated above,
 .Fn setpriority
 will fail if:
+.Bl -tag -width Er
 .It Bq Er EPERM
 A process was located, but neither its effective nor real user
 ID matched the effective user ID of the caller.
diff --git a/lib/libc/sys/pathconf.2 b/lib/libc/sys/pathconf.2
index 495bc650f05..3cfcdbc99b7 100644
--- a/lib/libc/sys/pathconf.2
+++ b/lib/libc/sys/pathconf.2
@@ -89,7 +89,6 @@ returns information about the file the link references.
 The available values are as follows:
 .Pp
 .Bl -tag -width 6n
-.Pp
 .It Li _PC_LINK_MAX
 The maximum file link count.
 .It Li _PC_MAX_CANON
@@ -234,11 +233,11 @@ Too many symbolic links were encountered in translating the pathname.
 An I/O error occurred while reading from or writing to the file system.
 .El
 .Pp
-.Bl -tag -width Er
 The
 .Fn fpathconf
 system call
 will fail if:
+.Bl -tag -width Er
 .It Bq Er EBADF
 The
 .Fa fd
diff --git a/lib/libc/sys/stat.2 b/lib/libc/sys/stat.2
index 85f0cf0b0c8..ab37c0d5408 100644
--- a/lib/libc/sys/stat.2
+++ b/lib/libc/sys/stat.2
@@ -345,10 +345,10 @@ represented correctly in the structure pointed to by
 .Fa sb .
 .El
 .Pp
-.Bl -tag -width Er
 The
 .Fn fstat
 system call will fail if:
+.Bl -tag -width Er
 .It Bq Er EBADF
 The
 .Fa fd
diff --git a/sbin/fsirand/fsirand.8 b/sbin/fsirand/fsirand.8
index 66a1724303c..3156787a8bd 100644
--- a/sbin/fsirand/fsirand.8
+++ b/sbin/fsirand/fsirand.8
@@ -73,8 +73,8 @@ The
 utility may be used on the root file system in single-user mode
 but the system should be rebooted via ``reboot -n'' afterwards.
 .Sh OPTIONS
-.Bl -tag -width indent
 The available options are as follows:
+.Bl -tag -width indent
 .It Fl b
 Use the default block size (usually 512 bytes) instead
 of the value gleaned from the disklabel.
diff --git a/sbin/ipfw/ipfw.8 b/sbin/ipfw/ipfw.8
index f870ee4b435..a954c1d5415 100644
--- a/sbin/ipfw/ipfw.8
+++ b/sbin/ipfw/ipfw.8
@@ -1009,11 +1009,11 @@ The second format
 with multiple addresses) is provided for convenience only and
 its use is discouraged.
 .It Ar addr : Oo Cm not Oc Bro
-.Bl -tag -width indent
 .Cm any | me | me6 |
 .Cm table Ns Pq Ar number Ns Op , Ns Ar value
 .Ar | addr-list | addr-set
 .Brc
+.Bl -tag -width indent
 .It Cm any
 matches any IP address.
 .It Cm me
@@ -2176,7 +2176,6 @@ Finally, the following parameters can be configured for both
 pipes and queues:
 .Pp
 .Bl -tag -width XXXX -compact
-.Pp
 .It Cm buckets Ar hash-table-size
 Specifies the size of the hash table used for storing the
 various queues.
diff --git a/sbin/restore/restore.8 b/sbin/restore/restore.8
index cf6a364693b..94ca8784f43 100644
--- a/sbin/restore/restore.8
+++ b/sbin/restore/restore.8
@@ -412,7 +412,6 @@ Most checks are self-explanatory or can ``never happen''.
 Common errors are given below.
 .Pp
 .Bl -tag -width Ds -compact
-.Pp
 .It <filename>: not found on tape
 The specified file name was listed in the tape directory,
 but was not found on the tape.
diff --git a/share/man/man4/iscsi_initiator.4 b/share/man/man4/iscsi_initiator.4
index 7c857bd22b5..90df509ee65 100644
--- a/share/man/man4/iscsi_initiator.4
+++ b/share/man/man4/iscsi_initiator.4
@@ -90,8 +90,8 @@ see
 The 
 .Nm
 driver creates the following:
-.Bl -tag -width ".Pa /dev/iscsi%dxx" -compact
 .Pp
+.Bl -tag -width ".Pa /dev/iscsi%dxx" -compact
 .It Pa /dev/iscsi
 used to create new sessions.
 .It Pa /dev/iscsi%d
diff --git a/share/man/man5/elf.5 b/share/man/man5/elf.5
index 1d34dfe2491..b68363ef12c 100644
--- a/share/man/man5/elf.5
+++ b/share/man/man5/elf.5
@@ -462,8 +462,8 @@ member in the total struct.
 .It Dv p_type
 This member of the Phdr struct tells what kind of segment this array
 element describes or how to interpret the array element's information.
-.Bl -tag -width "PT_DYNAMIC" -compact
 .Pp
+.Bl -tag -width "PT_DYNAMIC" -compact
 .It Dv PT_NULL
 The array element is unused and the other members' values are undefined.
 This lets the program header have ignored entries.
diff --git a/share/man/man9/sysctl_add_oid.9 b/share/man/man9/sysctl_add_oid.9
index 43f3112a4d0..74e76d7f5be 100644
--- a/share/man/man9/sysctl_add_oid.9
+++ b/share/man/man9/sysctl_add_oid.9
@@ -411,8 +411,8 @@ and to delete them later in orderly fashion.
 .Pp
 There is a set of macros defined
 that helps to create oids of given type.
-.Bl -tag -width SYSCTL_ADD_STRINGXX
 They are as follows:
+.Bl -tag -width SYSCTL_ADD_STRINGXX
 .It Fn SYSCTL_ADD_OID
 creates a raw oid.
 This macro is functionally equivalent to the
diff --git a/usr.bin/mesg/mesg.1 b/usr.bin/mesg/mesg.1
index b596f75cf7d..e2e2587d4fe 100644
--- a/usr.bin/mesg/mesg.1
+++ b/usr.bin/mesg/mesg.1
@@ -70,8 +70,8 @@ displays the present message status to the standard output.
 The
 .Nm
 utility exits with one of the following values:
-.Bl -tag -width flag -compact -offset indent
 .Pp
+.Bl -tag -width flag -compact -offset indent
 .It Li "\ 0"
 Messages are allowed.
 .It Li "\ 1"
diff --git a/usr.sbin/mtest/mtest.8 b/usr.sbin/mtest/mtest.8
index 9a961e01846..801501d040d 100644
--- a/usr.sbin/mtest/mtest.8
+++ b/usr.sbin/mtest/mtest.8
@@ -42,8 +42,8 @@ is a small program for testing multicast socket operations.
 .Pp
 It accepts the following commands, interactively, or as part of a scripted
 input file (useful for automated testing):
-.Bl -tag -width "a ifname e.e.e.e e.e.e.e" -compact -offset indent
 .Pp
+.Bl -tag -width "a ifname e.e.e.e e.e.e.e" -compact -offset indent
 .\"
 .It Ic a Ar ifname Ar mac-addr
 Join the link-layer group address

From 738ffa97801eeaad1a1dfe15e917a2af1f4dbe83 Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Date: Thu, 21 Oct 2010 12:58:26 +0000
Subject: [PATCH 31/68] Fix a bug introduced in r213067 where we use
 the authentication key before initializing it.

---
 sys/geom/eli/g_eli.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/sys/geom/eli/g_eli.c b/sys/geom/eli/g_eli.c
index 66f641b6520..0d9b9c7c04f 100644
--- a/sys/geom/eli/g_eli.c
+++ b/sys/geom/eli/g_eli.c
@@ -686,14 +686,6 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
 		sc->sc_bytes_per_sector =
 		    (md->md_sectorsize - 1) / sc->sc_data_per_sector + 1;
 		sc->sc_bytes_per_sector *= bpp->sectorsize;
-		/*
-		 * Precalculate SHA256 for HMAC key generation.
-		 * This is expensive operation and we can do it only once now or
-		 * for every access to sector, so now will be much better.
-		 */
-		SHA256_Init(&sc->sc_akeyctx);
-		SHA256_Update(&sc->sc_akeyctx, sc->sc_akey,
-		    sizeof(sc->sc_akey));
 	}
 
 	gp->softc = sc;
@@ -753,7 +745,16 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
 	 */
 	g_eli_mkey_propagate(sc, mkey);
 	sc->sc_ekeylen = md->md_keylen;
-
+	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
+		/*
+		 * Precalculate SHA256 for HMAC key generation.
+		 * This is expensive operation and we can do it only once now or
+		 * for every access to sector, so now will be much better.
+		 */
+		SHA256_Init(&sc->sc_akeyctx);
+		SHA256_Update(&sc->sc_akeyctx, sc->sc_akey,
+		    sizeof(sc->sc_akey));
+	}
 	/*
 	 * Precalculate SHA256 for IV generation.
 	 * This is expensive operation and we can do it only once now or for

From 9301df81791a215d0532952128af467bd218ee51 Mon Sep 17 00:00:00 2001
From: Ed Schouten <ed@FreeBSD.org>
Date: Thu, 21 Oct 2010 15:10:35 +0000
Subject: [PATCH 32/68] Fix error handling logic of pututxline(3).

Instead of only returning NULL when the entry is invalid and can't be
matched against the current database, also return it when it cannot open
the log files properly.
---
 lib/libc/gen/pututxline.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/lib/libc/gen/pututxline.c b/lib/libc/gen/pututxline.c
index ce4dc280c3c..3b2c7a808eb 100644
--- a/lib/libc/gen/pututxline.c
+++ b/lib/libc/gen/pututxline.c
@@ -65,7 +65,7 @@ futx_open(const char *file)
 	return (fp);
 }
 
-static void
+static int
 utx_active_add(const struct futx *fu)
 {
 	FILE *fp;
@@ -78,7 +78,7 @@ utx_active_add(const struct futx *fu)
 	 */
 	fp = futx_open(_PATH_UTX_ACTIVE);
 	if (fp == NULL)
-		return;
+		return (1);
 	while (fread(&fe, sizeof fe, 1, fp) == 1) {
 		switch (fe.fu_type) {
 		case USER_PROCESS:
@@ -110,6 +110,7 @@ utx_active_add(const struct futx *fu)
 exact:
 	fwrite(fu, sizeof *fu, 1, fp);
 	fclose(fp);
+	return (0);
 }
 
 static int
@@ -123,7 +124,7 @@ utx_active_remove(struct futx *fu)
 	 */
 	fp = futx_open(_PATH_UTX_ACTIVE);
 	if (fp == NULL)
-		return (0);
+		return (1);
 	while (fread(&fe, sizeof fe, 1, fp) == 1) {
 		switch (fe.fu_type) {
 		case USER_PROCESS:
@@ -151,7 +152,7 @@ utx_active_purge(void)
 	truncate(_PATH_UTX_ACTIVE, 0);
 }
 
-static void
+static int
 utx_lastlogin_add(const struct futx *fu)
 {
 	FILE *fp;
@@ -164,7 +165,7 @@ utx_lastlogin_add(const struct futx *fu)
 	 */
 	fp = futx_open(_PATH_UTX_LASTLOGIN);
 	if (fp == NULL)
-		return;
+		return (1);
 	while (fread(&fe, sizeof fe, 1, fp) == 1) {
 		if (strncmp(fu->fu_user, fe.fu_user, sizeof fe.fu_user) != 0)
 			continue;
@@ -175,6 +176,7 @@ utx_lastlogin_add(const struct futx *fu)
 	}
 	fwrite(fu, sizeof *fu, 1, fp);
 	fclose(fp);
+	return (0);
 }
 
 static void
@@ -197,7 +199,7 @@ utx_lastlogin_upgrade(void)
 	_close(fd);
 }
 
-static void
+static int
 utx_log_add(const struct futx *fu)
 {
 	int fd;
@@ -219,15 +221,17 @@ utx_log_add(const struct futx *fu)
 
 	fd = _open(_PATH_UTX_LOG, O_CREAT|O_WRONLY|O_APPEND, 0644);
 	if (fd < 0)
-		return;
+		return (1);
 	_writev(fd, vec, 2);
 	_close(fd);
+	return (0);
 }
 
 struct utmpx *
 pututxline(const struct utmpx *utmpx)
 {
 	struct futx fu;
+	int bad = 0;
 
 	utx_to_futx(utmpx, &fu);
 	
@@ -241,16 +245,21 @@ pututxline(const struct utmpx *utmpx)
 	case NEW_TIME:
 		break;
 	case USER_PROCESS:
-		utx_active_add(&fu);
-		utx_lastlogin_add(&fu);
+		bad |= utx_active_add(&fu);
+		bad |= utx_lastlogin_add(&fu);
 		break;
 #if 0 /* XXX: Are these records of any use to us? */
 	case INIT_PROCESS:
 	case LOGIN_PROCESS:
-		utx_active_add(&fu);
+		bad |= utx_active_add(&fu);
 		break;
 #endif
 	case DEAD_PROCESS:
+		/*
+		 * In case writing a logout entry fails, never attempt
+		 * to write it to utx.log.  The logout entry's ut_id
+		 * might be invalid.
+		 */
 		if (utx_active_remove(&fu) != 0)
 			return (NULL);
 		break;
@@ -258,6 +267,6 @@ pututxline(const struct utmpx *utmpx)
 		return (NULL);
 	}
 
-	utx_log_add(&fu);
-	return (futx_to_utx(&fu));
+	bad |= utx_log_add(&fu);
+	return (bad ? NULL : futx_to_utx(&fu));
 }

From 37931439fddf76d537547b5bbdbbc6e71cd3705f Mon Sep 17 00:00:00 2001
From: Robert Watson <rwatson@FreeBSD.org>
Date: Thu, 21 Oct 2010 16:08:31 +0000
Subject: [PATCH 33/68] Improve the structure and implementation of the
 syscall_timing microbenchmark suite:

- Use common benchmark_start/benchmark_stop routines to simplify
  individual benchmarks.
- Add a central table of tests with names, where new tests can be
  hooked in easily.
- Add new benchmarks for dup, shm_open, shm_open + fstat, fork,
  vfork, vfork + exec, chroot, setuid.
- Accept a number of loops, not just a number of iterations.
- Report results more usefully in a table.

Sponsored by:	Google, Inc.
MFC after:	2 weeks
---
 tools/tools/syscall_timing/syscall_timing.c | 480 +++++++++++++++-----
 1 file changed, 374 insertions(+), 106 deletions(-)

diff --git a/tools/tools/syscall_timing/syscall_timing.c b/tools/tools/syscall_timing/syscall_timing.c
index 7263906ea18..068b36cce64 100644
--- a/tools/tools/syscall_timing/syscall_timing.c
+++ b/tools/tools/syscall_timing/syscall_timing.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2003-2004 Robert N. M. Watson
+ * Copyright (c) 2003-2004, 2010 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -27,26 +27,48 @@
  */
 
 #include <sys/types.h>
+#include <sys/mman.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <sys/time.h>
+#include <sys/wait.h>
 
 #include <assert.h>
+#include <err.h>
+#include <fcntl.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
-#define timespecsub(vvp, uvp)                                           \
-        do {                                                            \
-                (vvp)->tv_sec -= (uvp)->tv_sec;                         \
-                (vvp)->tv_nsec -= (uvp)->tv_nsec;                       \
-                if ((vvp)->tv_nsec < 0) {                               \
-                        (vvp)->tv_sec--;                                \
-                        (vvp)->tv_nsec += 1000000000;                   \
-                }                                                       \
-        } while (0)
+static struct timespec ts_start, ts_end;
 
-inline void
+#define timespecsub(vvp, uvp)						\
+	do {								\
+		(vvp)->tv_sec -= (uvp)->tv_sec;				\
+		(vvp)->tv_nsec -= (uvp)->tv_nsec;			\
+		if ((vvp)->tv_nsec < 0) {				\
+			(vvp)->tv_sec--;				\
+			(vvp)->tv_nsec += 1000000000;			\
+		}							\
+	} while (0)
+
+static void
+benchmark_start(void)
+{
+
+	assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
+}
+
+static void
+benchmark_stop(void)
+{
+
+	assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
+}
+  
+void
 test_getuid(int num)
 {
 	int i;
@@ -55,11 +77,13 @@ test_getuid(int num)
 	 * Thread-local data should require no locking if system
 	 * call is MPSAFE.
 	 */
+	benchmark_start();
 	for (i = 0; i < num; i++)
 		getuid();
+	benchmark_stop();
 }
 
-inline void
+void
 test_getppid(int num)
 {
 	int i;
@@ -68,28 +92,28 @@ test_getppid(int num)
 	 * This is process-local, but can change, so will require a
 	 * lock.
 	 */
+	benchmark_start();
 	for (i = 0; i < num; i++)
 		getppid();
+	benchmark_stop();
 }
 
-inline void
+void
 test_clock_gettime(int num)
 {
 	struct timespec ts;
 	int i;
 
-	for (i = 0; i < num; i++) {
-		if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
-			perror("clock_gettime");
-			exit(-1);
-		}
-	}
+	benchmark_start();
+	for (i = 0; i < num; i++)
+		(void)clock_gettime(CLOCK_REALTIME, &ts);
+	benchmark_stop();
 }
 
-inline void
+void
 test_pipe(int num)
 {
-	int i;
+	int fd[2], i;
 
 	/*
 	 * pipe creation is expensive, as it will allocate a new file
@@ -97,153 +121,397 @@ test_pipe(int num)
 	 * Destroying is also expensive, as we now have to free up
 	 * the file descriptors and return the pipe.
 	 */
+	if (pipe(fd) < 0)
+		err(-1, "test_pipe: pipe");
+	close(fd[0]);
+	close(fd[1]);
+	benchmark_start();
 	for (i = 0; i < num; i++) {
-		int fd[2];
-		if (pipe(fd) == -1) {
-			perror("pipe");
-			exit(-1);
-		}
-
+		if (pipe(fd) == -1)
+			err(-1, "test_pipe: pipe");
 		close(fd[0]);
 		close(fd[1]);
 	}
+	benchmark_stop();
 }
 
-inline void
+void
 test_socket_stream(int num)
 {
 	int i, so;
 
+	so = socket(PF_LOCAL, SOCK_STREAM, 0);
+	if (so < 0)
+		err(-1, "test_socket_stream: socket");
+	close(so);
+	benchmark_start();
 	for (i = 0; i < num; i++) {
 		so = socket(PF_LOCAL, SOCK_STREAM, 0);
-		if (so == -1) {
-			perror("socket_stream");
-			exit(-1);
-		}
+		if (so == -1)
+			err(-1, "test_socket_stream: socket");
 		close(so);
 	}
+	benchmark_stop();
 }
 
-inline void
+void
 test_socket_dgram(int num)
 {
 	int i, so;
 
+	so = socket(PF_LOCAL, SOCK_DGRAM, 0);
+	if (so < 0)
+		err(-1, "test_socket_dgram: socket");
+	close(so);
+	benchmark_start();
 	for (i = 0; i < num; i++) {
 		so = socket(PF_LOCAL, SOCK_DGRAM, 0);
-		if (so == -1) {
-			perror("socket_dgram");
-			exit(-1);
-		}
+		if (so == -1)
+			err(-1, "test_socket_dgram: socket");
 		close(so);
 	}
+	benchmark_stop();
 }
 
-inline void
+void
 test_socketpair_stream(int num)
 {
 	int i, so[2];
 
+	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1)
+		err(-1, "test_socketpair_stream: socketpair");
+	close(so[0]);
+	close(so[1]);
+	benchmark_start();
 	for (i = 0; i < num; i++) {
-		if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1) {
-			perror("socketpair_stream");
-			exit(-1);
-		}
+		if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1)
+			err(-1, "test_socketpair_stream: socketpair");
 		close(so[0]);
 		close(so[1]);
 	}
+	benchmark_stop();
 }
 
-inline void
+void
 test_socketpair_dgram(int num)
 {
 	int i, so[2];
 
+	if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1)
+		err(-1, "test_socketpair_dgram: socketpair");
+	close(so[0]);
+	close(so[1]);
+	benchmark_start();
 	for (i = 0; i < num; i++) {
-		if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1) {
-			perror("socketpair_dgram");
-			exit(-1);
-		}
+		if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1)
+			err(-1, "test_socketpair_dgram: socketpair");
 		close(so[0]);
 		close(so[1]);
 	}
+	benchmark_stop();
 }
 
+void
+test_dup(int num)
+{
+	int fd, i, shmfd;
+
+	shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
+	if (shmfd < 0)
+		err(-1, "test_dup: shm_open");
+	fd = dup(shmfd);
+	if (fd >= 0)
+		close(fd);
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		fd = dup(shmfd);
+		if (fd >= 0)
+			close(fd);
+	}
+	benchmark_stop();
+	close(shmfd);
+}
+
+void
+test_shmfd(int num)
+{
+	int i, shmfd;
+
+	shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
+	if (shmfd < 0)
+		err(-1, "test_shmfd: shm_open");
+	close(shmfd);
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
+		if (shmfd < 0)
+			err(-1, "test_shmfd: shm_open");
+		close(shmfd);
+	}
+	benchmark_stop();
+}
+
+void
+test_fstat_shmfd(int num)
+{
+	struct stat sb;
+	int i, shmfd;
+
+	shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
+	if (shmfd < 0)
+		err(-1, "test_fstat_shmfd: shm_open");
+	if (fstat(shmfd, &sb) < 0)
+		err(-1, "test_fstat_shmfd: fstat");
+	benchmark_start();
+	for (i = 0; i < num; i++)
+		(void)fstat(shmfd, &sb);
+	benchmark_stop();
+	close(shmfd);
+}
+
+void
+test_fork(int num)
+{
+	pid_t pid;
+	int i;
+
+	pid = fork();
+	if (pid < 0)
+		err(-1, "test_fork: fork");
+	if (pid == 0)
+		_exit(0);
+	if (waitpid(pid, NULL, 0) < 0)
+		err(-1, "test_fork: waitpid");
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		pid = fork();
+		if (pid < 0)
+			err(-1, "test_fork: fork");
+		if (pid == 0)
+			_exit(0);
+		if (waitpid(pid, NULL, 0) < 0)
+			err(-1, "test_fork: waitpid");
+	}
+	benchmark_stop();
+}
+
+void
+test_vfork(int num)
+{
+	pid_t pid;
+	int i;
+
+	pid = vfork();
+	if (pid < 0)
+		err(-1, "test_vfork: vfork");
+	if (pid == 0)
+		_exit(0);
+	if (waitpid(pid, NULL, 0) < 0)
+		err(-1, "test_vfork: waitpid");
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		pid = vfork();
+		if (pid < 0)
+			err(-1, "test_vfork: vfork");
+		if (pid == 0)
+			_exit(0);
+		if (waitpid(pid, NULL, 0) < 0)
+			err(-1, "test_vfork: waitpid");
+	}
+	benchmark_stop();
+}
+
+#define	USR_BIN_TRUE	"/usr/bin/true"
+static char *execve_args[] = { USR_BIN_TRUE, NULL};
+extern char **environ;
+
+void
+test_fork_exec(int num)
+{
+	pid_t pid;
+	int i;
+
+	pid = fork();
+	if (pid < 0)
+		err(-1, "test_fork_exec: fork");
+	if (pid == 0) {
+		(void)execve(USR_BIN_TRUE, execve_args, environ);
+		err(-1, "execve");
+	}
+	if (waitpid(pid, NULL, 0) < 0)
+		err(-1, "test_fork: waitpid");
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		pid = fork();
+		if (pid < 0)
+			err(-1, "test_fork_exec: fork");
+		if (pid == 0) {
+			(void)execve(USR_BIN_TRUE, execve_args, environ);
+			err(-1, "test_fork_exec: execve");
+		}
+		if (waitpid(pid, NULL, 0) < 0)
+			err(-1, "test_fork_exec: waitpid");
+	}
+	benchmark_stop();
+}
+
+void
+test_vfork_exec(int num)
+{
+	pid_t pid;
+	int i;
+
+	pid = vfork();
+	if (pid < 0)
+		err(-1, "test_vfork_exec: vfork");
+	if (pid == 0) {
+		(void)execve(USR_BIN_TRUE, execve_args, environ);
+		err(-1, "test_vfork_exec: execve");
+	}
+	if (waitpid(pid, NULL, 0) < 0)
+		err(-1, "test_vfork_exec: waitpid");
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		pid = vfork();
+		if (pid < 0)
+			err(-1, "test_vfork_exec: vfork");
+		if (pid == 0) {
+			(void)execve(USR_BIN_TRUE, execve_args, environ);
+			err(-1, "execve");
+		}
+		if (waitpid(pid, NULL, 0) < 0)
+			err(-1, "test_vfork_exec: waitpid");
+	}
+	benchmark_stop();
+}
+
+void
+test_chroot(int num)
+{
+	int i;
+
+	if (chroot("/") < 0)
+		err(-1, "test_chroot: chroot");
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		if (chroot("/") < 0)
+			err(-1, "test_chroot: chroot");
+	}
+	benchmark_stop();
+}
+
+void
+test_setuid(int num)
+{
+	uid_t uid;
+	int i;
+
+	uid = getuid();
+	if (setuid(uid) < 0)
+		err(-1, "test_setuid: setuid");
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		if (setuid(uid) < 0)
+			err(-1, "test_setuid: setuid");
+	}
+	benchmark_stop();
+}
+
+struct test {
+	const char	*t_name;
+	void		(*t_func)(int);
+};
+
+static const struct test tests[] = {
+	{ "getuid", test_getuid },
+	{ "getppid", test_getppid },
+	{ "clock_gettime", test_clock_gettime },
+	{ "pipe", test_pipe },
+	{ "socket_stream", test_socket_stream },
+	{ "socket_dgram", test_socket_dgram },
+	{ "socketpair_stream", test_socketpair_stream },
+	{ "socketpair_dgram", test_socketpair_dgram },
+	{ "dup", test_dup },
+	{ "shmfd", test_shmfd },
+	{ "fstat_shmfd", test_fstat_shmfd },
+	{ "fork", test_fork },
+	{ "vfork", test_vfork },
+	{ "fork_exec", test_fork_exec },
+	{ "vfork_exec", test_vfork_exec },
+	{ "chroot", test_chroot },
+	{ "setuid", test_setuid },
+};
+static const int tests_count = sizeof(tests) / sizeof(tests[0]);
+
 static void
 usage(void)
 {
+	int i;
 
-	fprintf(stderr, "syscall_timing [iterations] [test]\n");
-	fprintf(stderr,
-	    "supported tests: getuid getppid clock_gettime pipe\n"
-	    "socket_stream socket_dgram socketpair_stream\n"
-	    "socketpair_dgram\n");
+	fprintf(stderr, "syscall_timing [iterations] [loops] [test]\n");
+	for (i = 0; i < tests_count; i++)
+		fprintf(stderr, "  %s\n", tests[i].t_name);
 	exit(-1);
 }
 
 int
 main(int argc, char *argv[])
 {
-	struct timespec ts_start, ts_end, ts_res;
-	int count;
+	struct timespec ts_res;
+	const struct test *the_test;
+	long long ll;
+	char *endp;
+	int i, j, k;
+	int iterations, loops;
 
-	if (argc != 3)
+	if (argc < 4)
 		usage();
-	count = atoi(argv[1]);
+
+	ll = strtoll(argv[1], &endp, 10);
+	if (*endp != 0 || ll < 0 || ll > 100000)
+		usage();
+	iterations = ll;
+
+	ll = strtoll(argv[2], &endp, 10);
+	if (*endp != 0 || ll < 0 || ll > 100000)
+		usage();
+	loops = ll;
 
 	assert(clock_getres(CLOCK_REALTIME, &ts_res) == 0);
-	printf("Clock resolution: %d.%09lu\n", ts_res.tv_sec, ts_res.tv_nsec);
+	printf("Clock resolution: %ju.%ju\n", (uintmax_t)ts_res.tv_sec,
+	    (uintmax_t)ts_res.tv_nsec);
+	printf("test\tloop\ttotal\titerations\tperiteration\n");
 
-	if (strcmp(argv[2], "getuid") == 0) {
-		assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
-		test_getuid(count);
-		assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
-	} else if (strcmp(argv[2], "getppid") == 0) {
-		assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
-		test_getppid(count);
-		assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
-	} else if (strcmp(argv[2], "clock_gettime") == 0) {
-		assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
-		test_clock_gettime(count);
-		assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
-	} else if (strcmp(argv[2], "pipe") == 0) {
-		assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
-		test_pipe(count);
-		assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
-	} else if (strcmp(argv[2], "socket_stream") == 0) {
-		assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
-		test_socket_stream(count);
-		assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
-	} else if (strcmp(argv[2], "socket_dgram") == 0) {
-		assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
-		test_socket_dgram(count);
-		assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
-	} else if (strcmp(argv[2], "socketpair_stream") == 0) {
-		assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
-		test_socketpair_stream(count);
-		assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
-	} else if (strcmp(argv[2], "socketpair_dgram") == 0) {
-		assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
-		test_socketpair_dgram(count);
-		assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
-	 } else
-		usage();
+	for (j = 3; j < argc; j++) {
+		the_test = NULL;
+		for (i = 0; i < tests_count; i++) {
+			if (strcmp(argv[j], tests[i].t_name) == 0)
+				the_test = &tests[i];
+		}
+		if (the_test == NULL)
+			usage();
 
-	timespecsub(&ts_end, &ts_start);
+		/*
+		 * Run one warmup, then do the real thing (loops) times.
+		 */
+		the_test->t_func(iterations);
+		for (k = 0; k < loops; k++) {
+			the_test->t_func(iterations);
+			timespecsub(&ts_end, &ts_start);
+			printf("%s\t%d\t", the_test->t_name, k);
+			printf("%ju.%09ju\t%d\t", (uintmax_t)ts_end.tv_sec,
+			    (uintmax_t)ts_end.tv_nsec, iterations);
 
-	printf("test: %s\n", argv[2]);
-
-	printf("%d.%09lu for %d iterations\n", ts_end.tv_sec,
-	    ts_end.tv_nsec, count);
-
-	/*
-	 * Note.  This assumes that each iteration takes less than
-	 * a second, and that our total nanoseconds doesn't exceed
-	 * the room in our arithmetic unit.  Fine for system calls,
-	 * but not for long things.
-	 */
-	ts_end.tv_sec *= 1000000000 / count;
-	printf("0.%09lu per/iteration\n", 
-	    ts_end.tv_sec + ts_end.tv_nsec / count);
+		/*
+		 * Note.  This assumes that each iteration takes less than
+		 * a second, and that our total nanoseconds doesn't exceed
+		 * the room in our arithmetic unit.  Fine for system calls,
+		 * but not for long things.
+		 */
+			ts_end.tv_sec *= 1000000000 / iterations;
+			printf("0.%09ju\n", (uintmax_t)(ts_end.tv_sec +
+			    ts_end.tv_nsec / iterations));
+		}
+	}
 	return (0);
 }

From 9af74f3d68c331e0cfa741fec75a0e8a7f87670d Mon Sep 17 00:00:00 2001
From: Sergey Kandaurov <pluknet@FreeBSD.org>
Date: Thu, 21 Oct 2010 16:20:48 +0000
Subject: [PATCH 34/68] Reshuffle SIOCGIFCONF32 handler from r155224.

- move all the chunks into one file, which also allows hiding the global
  SIOCGIFCONF32 definition.
- replace the surrounding __amd64__ guards with the proper COMPAT_FREEBSD32.
- handle 32bit capacity before going into the handler itself instead of
  doing internal 32bit specific changes within it (e.g. as it's done for
  SIOCGDEFIFACE32_IN6).
- use explicitly sized types for ABI compat.

Approved by:	kib (mentor)
MFC after:	2 weeks
---
 sys/net/if.c     | 48 ++++++++++++++++++++++++++++++------------------
 sys/net/if.h     | 10 ----------
 sys/sys/sockio.h |  3 ---
 3 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/sys/net/if.c b/sys/net/if.c
index bd54acf28c2..3c8486aa6b6 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -92,6 +92,11 @@
 
 #include <security/mac/mac_framework.h>
 
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <compat/freebsd32/freebsd32.h>
+#endif
+
 struct ifindex_entry {
 	struct  ifnet *ife_ifnet;
 };
@@ -2402,6 +2407,17 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 	return (error);
 }
 
+#ifdef COMPAT_FREEBSD32
+struct ifconf32 {
+	int32_t	ifc_len;
+	union {
+		uint32_t	ifcu_buf;
+		uint32_t	ifcu_req;
+	} ifc_ifcu;
+};
+#define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)
+#endif
+
 /*
  * Interface ioctls.
  */
@@ -2416,10 +2432,21 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
 	switch (cmd) {
 	case SIOCGIFCONF:
 	case OSIOCGIFCONF:
-#ifdef __amd64__
-	case SIOCGIFCONF32:
-#endif
 		return (ifconf(cmd, data));
+
+#ifdef COMPAT_FREEBSD32
+	case SIOCGIFCONF32:
+		{
+			struct ifconf32 *ifc32;
+			struct ifconf ifc;
+
+			ifc32 = (struct ifconf32 *)data;
+			ifc.ifc_len = ifc32->ifc_len;
+			ifc.ifc_buf = PTRIN(ifc32->ifc_buf);
+
+			return (ifconf(SIOCGIFCONF, (void *)&ifc));
+		}
+#endif
 	}
 	ifr = (struct ifreq *)data;
 
@@ -2646,23 +2673,12 @@ static int
 ifconf(u_long cmd, caddr_t data)
 {
 	struct ifconf *ifc = (struct ifconf *)data;
-#ifdef __amd64__
-	struct ifconf32 *ifc32 = (struct ifconf32 *)data;
-	struct ifconf ifc_swab;
-#endif
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 	struct sbuf *sb;
 	int error, full = 0, valid_len, max_len;
 
-#ifdef __amd64__
-	if (cmd == SIOCGIFCONF32) {
-		ifc_swab.ifc_len = ifc32->ifc_len;
-		ifc_swab.ifc_buf = (caddr_t)(uintptr_t)ifc32->ifc_buf;
-		ifc = &ifc_swab;
-	}
-#endif
 	/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
 	max_len = MAXPHYS - 1;
 
@@ -2752,10 +2768,6 @@ again:
 	}
 
 	ifc->ifc_len = valid_len;
-#ifdef __amd64__
-	if (cmd == SIOCGIFCONF32)
-		ifc32->ifc_len = valid_len;
-#endif
 	sbuf_finish(sb);
 	error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
 	sbuf_delete(sb);
diff --git a/sys/net/if.h b/sys/net/if.h
index ae0daf5b639..a99b4a7c8ad 100644
--- a/sys/net/if.h
+++ b/sys/net/if.h
@@ -391,16 +391,6 @@ struct	ifconf {
 #define	ifc_req	ifc_ifcu.ifcu_req	/* array of structures returned */
 };
 
-#if defined (__amd64__)
-struct ifconf32 {
-	int	ifc_len;		/* size of associated buffer */
-	union {
-		u_int	ifcu_buf;
-		u_int	ifcu_req;
-	} ifc_ifcu;
-};
-#endif
-
 /*
  * interface groups
  */
diff --git a/sys/sys/sockio.h b/sys/sys/sockio.h
index 2af2467da3b..4c1c4839cc8 100644
--- a/sys/sys/sockio.h
+++ b/sys/sys/sockio.h
@@ -62,9 +62,6 @@
 #define	SIOCSIFBRDADDR	 _IOW('i', 19, struct ifreq)	/* set broadcast addr */
 #define	OSIOCGIFCONF	_IOWR('i', 20, struct ifconf)	/* get ifnet list */
 #define	SIOCGIFCONF	_IOWR('i', 36, struct ifconf)	/* get ifnet list */
-#if  defined (__amd64__)
-#define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)	/* get ifnet list */
-#endif
 #define	OSIOCGIFNETMASK	_IOWR('i', 21, struct ifreq)	/* get net addr mask */
 #define	SIOCGIFNETMASK	_IOWR('i', 37, struct ifreq)	/* get net addr mask */
 #define	SIOCSIFNETMASK	 _IOW('i', 22, struct ifreq)	/* set net addr mask */

From 85080d2d68270d2716690c10b14ce4fa6dc7d773 Mon Sep 17 00:00:00 2001
From: Gleb Smirnoff <glebius@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:05:15 +0000
Subject: [PATCH 35/68] Make it possible to read input from stdin.

Without this change I don't see a way to
unpack a multivolume archive without wasting
disk space for a temporary file.
---
 usr.bin/unzip/unzip.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/usr.bin/unzip/unzip.c b/usr.bin/unzip/unzip.c
index 87927d2bb2e..a3852f709cd 100644
--- a/usr.bin/unzip/unzip.c
+++ b/usr.bin/unzip/unzip.c
@@ -859,7 +859,9 @@ unzip(const char *fn)
 	int fd, ret;
 	uintmax_t total_size, file_count, error_count;
 
-	if ((fd = open(fn, O_RDONLY)) < 0)
+	if (strcmp(fn, "-") == 0)
+		fd = STDIN_FILENO;
+	else if ((fd = open(fn, O_RDONLY)) < 0)
 		error("%s", fn);
 
 	if ((a = archive_read_new()) == NULL)
@@ -913,7 +915,7 @@ unzip(const char *fn)
 	ac(archive_read_close(a));
 	(void)archive_read_finish(a);
 
-	if (close(fd) != 0)
+	if (fd != STDIN_FILENO && close(fd) != 0)
 		error("%s", fn);
 
 	if (t_opt) {

From c4201e3f337a27a88a33b8d366eeb1aaa879f308 Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:14:44 +0000
Subject: [PATCH 36/68] This small patch updates the "geli setkey" flags
 pc-sysinstall uses when saving a user's passphrase, to make it work in HEAD
 with recent geli improvements.

Submitted by: Kris Moore
PR: 151002
---
 usr.sbin/pc-sysinstall/backend/functions-cleanup.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/usr.sbin/pc-sysinstall/backend/functions-cleanup.sh b/usr.sbin/pc-sysinstall/backend/functions-cleanup.sh
index 56e5389e3e3..bf2e4670ec3 100755
--- a/usr.sbin/pc-sysinstall/backend/functions-cleanup.sh
+++ b/usr.sbin/pc-sysinstall/backend/functions-cleanup.sh
@@ -296,7 +296,7 @@ setup_geli_loading()
 
      # If we have a passphrase, set it up now
      if [ -e "${PARTDIR}-enc/${PART}-encpass" ] ; then
-       cat ${PARTDIR}-enc/${PART}-encpass | geli setkey -S -n 0 -p -k ${KEYFILE} -K ${KEYFILE} ${PART}
+       geli setkey -J ${PARTDIR}-enc/${PART}-encpass -n 0 -p -k ${KEYFILE} -K ${KEYFILE} ${PART}
        geli configure -b ${PART}
      fi
 

From a919280d8b5cf4ec9a3e1a6f4ae3b45ec5dbc71f Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:20:37 +0000
Subject: [PATCH 37/68] This is an updated patch to the last patch to do this
 which fixes a local variable issue. This patch decompresses compressed images
 to stdout when writing to a device to avoid running out of space.

Submitted by: John Hixson
PR: 151049
---
 usr.sbin/pc-sysinstall/backend/functions.sh | 104 +++++++++-----------
 1 file changed, 44 insertions(+), 60 deletions(-)

diff --git a/usr.sbin/pc-sysinstall/backend/functions.sh b/usr.sbin/pc-sysinstall/backend/functions.sh
index cb3b6980c3c..46b5a02f4fc 100755
--- a/usr.sbin/pc-sysinstall/backend/functions.sh
+++ b/usr.sbin/pc-sysinstall/backend/functions.sh
@@ -325,64 +325,8 @@ get_compression_type()
   export VAL
 }
 
-decompress_file()
-{
-  local FILE
-  local COMPRESSION
-
-  FILE="$1"
-  COMPRESSION="$2"
-
-  if [ -n "${COMPRESSION}" ]
-  then
-    case "${COMPRESSION}" in
-      lzw)
-        rc_halt "uncompress ${FILE}"
-        VAL="${FILE%.Z}"
-        ;;
-
-      lzo)
-        rc_halt "lzop -d ${FILE}"
-        VAL="${FILE%.lzo}"
-        ;;
-
-      lzma)
-        rc_halt "lzma -d ${FILE}"
-        VAL="${FILE%.lzma}"
-        ;;
-
-      gzip)
-        rc_halt "gunzip ${FILE}"
-        VAL="${FILE%.gz}"
-        ;;
-
-      bzip2)
-        rc_halt "bunzip2 ${FILE}"
-        VAL="${FILE%.bz2}"
-        ;;
-
-      xz)
-        rc_halt "xz -d ${FILE}"
-        VAL="${FILE%.xz}"
-        ;;
-
-      zip)
-        rc_halt "unzip ${FILE}"
-        VAL="${FILE%.zip}"
-        ;;
-
-      *) 
-        exit_err "ERROR: ${COMPRESSION} compression is not supported"
-        ;;
-    esac
-  fi
-
-  export VAL
-}
-
 write_image()
 {
-  local IMAGE_FILE
   local DEVICE_FILE
 
   IMAGE_FILE="$1"
@@ -418,11 +362,51 @@ write_image()
     get_compression_type "${IMAGE_FILE}"
 	COMPRESSION="${VAL}"
 
-	decompress_file "${IMAGE_FILE}" "${COMPRESSION}"
-	IMAGE_FILE="${VAL}"
-  fi
+    case "${COMPRESSION}" in
+      lzw)
+        rc_halt "uncompress ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        IMAGE_FILE="${IMAGE_FILE%.Z}"
+        ;;
 
-  rc_halt "dd if=${IMAGE_FILE} of=${DEVICE_FILE} bs=128k"
+      lzo)
+        rc_halt "lzop -d $IMAGE_{FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        IMAGE_FILE="${IMAGE_FILE%.lzo}"
+        ;;
+
+      lzma)
+        rc_halt "lzma -d ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        IMAGE_FILE="${IMAGE_FILE%.lzma}"
+        ;;
+
+      gzip)
+        rc_halt "gunzip ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        IMAGE_FILE="${IMAGE_FILE%.gz}"
+        ;;
+
+      bzip2)
+        rc_halt "bunzip2 ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        IMAGE_FILE="${IMAGE_FILE%.bz2}"
+        ;;
+
+      xz)
+        rc_halt "xz -d ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        IMAGE_FILE="${IMAGE_FILE%.xz}"
+        ;;
+
+      zip)
+        rc_halt "unzip ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        IMAGE_FILE="${IMAGE_FILE%.zip}"
+        ;;
+
+      *) 
+        exit_err "ERROR: ${COMPRESSION} compression is not supported"
+        ;;
+    esac
+
+  else
+    rc_halt "dd if=${IMAGE_FILE} of=${DEVICE_FILE} bs=128k"
+
+  fi
 };
 
 install_fresh()

From f3d45e140958eb14d4c7e9c051caa4961fb88c8f Mon Sep 17 00:00:00 2001
From: Gleb Smirnoff <glebius@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:23:09 +0000
Subject: [PATCH 38/68] Document possibility to read from stdin.

---
 usr.bin/unzip/unzip.1 | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/usr.bin/unzip/unzip.1 b/usr.bin/unzip/unzip.1
index 3d4de4628de..b399a645c15 100644
--- a/usr.bin/unzip/unzip.1
+++ b/usr.bin/unzip/unzip.1
@@ -25,7 +25,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd February 16, 2010
+.Dd October 21, 2010
 .Dt UNZIP 1
 .Os
 .Sh NAME
@@ -111,6 +111,10 @@ Note that only one of
 and
 .Fl u
 may be specified.
+Is specified filename is
+.Va Qq - ,
+then data is read from
+.Va stdin .
 .Sh ENVIRONMENT
 If the
 .Ev UNZIP_DEBUG

From b459a68591a7726f52afefdbd7b0b89697a0ea92 Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:23:48 +0000
Subject: [PATCH 39/68] This patch will only list components if the directory
 exists. The directory exists on PC-BSD but not FreeBSD, so an extra check is
 made.

Submitted by: John Hixson
PR: 151461
---
 .../backend-query/list-components.sh          | 37 ++++++++++---------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/usr.sbin/pc-sysinstall/backend-query/list-components.sh b/usr.sbin/pc-sysinstall/backend-query/list-components.sh
index 294faad1d40..a7cde89076d 100755
--- a/usr.sbin/pc-sysinstall/backend-query/list-components.sh
+++ b/usr.sbin/pc-sysinstall/backend-query/list-components.sh
@@ -32,23 +32,24 @@
 
 echo "Available Components:"
 
-cd ${COMPDIR}
-for i in `ls -d *`
-do
-  if [ -e "${i}/component.cfg" -a -e "${i}/install.sh" -a -e "${i}/distfiles" ]
-  then
-    NAME="`grep 'name:' ${i}/component.cfg | cut -d ':' -f 2`"
-    DESC="`grep 'description:' ${i}/component.cfg | cut -d ':' -f 2`"
-    TYPE="`grep 'type:' ${i}/component.cfg | cut -d ':' -f 2`"
-    echo " "
-    echo "name: ${i}"
-    echo "desc:${DESC}"
-    echo "type:${TYPE}"
-    if [ -e "${i}/component.png" ]
+if [ -d "${COMPDIR}" ]
+then
+  cd ${COMPDIR}
+  for i in `ls -d *`
+  do
+    if [ -e "${i}/component.cfg" -a -e "${i}/install.sh" -a -e "${i}/distfiles" ]
     then
-      echo "icon: ${COMPDIR}/${i}/component.png"
+      NAME="`grep 'name:' ${i}/component.cfg | cut -d ':' -f 2`"
+      DESC="`grep 'description:' ${i}/component.cfg | cut -d ':' -f 2`"
+      TYPE="`grep 'type:' ${i}/component.cfg | cut -d ':' -f 2`"
+      echo " "
+      echo "name: ${i}"
+      echo "desc:${DESC}"
+      echo "type:${TYPE}"
+      if [ -e "${i}/component.png" ]
+      then
+        echo "icon: ${COMPDIR}/${i}/component.png"
+      fi
     fi
-  fi
-
-done
-
+  done
+fi

From 1dca6a284407b55093004875aa3e07caef4fef07 Mon Sep 17 00:00:00 2001
From: Robert Watson <rwatson@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:27:39 +0000
Subject: [PATCH 40/68] Further enhancements to syscall_timing:

- Use getopt rather than hand-parsed arguments
- Allow the number of iterations to be specified and/or bounded by a new
  time limit given in seconds
- Fix printout of timer resolution
- Add new tests, such as TCP and UDP socket creation, and open/read/close
  of /dev/zero and /dev/null.

Sponsored by:	Google, Inc.
MFC after:	2 weeks
---
 tools/tools/syscall_timing/syscall_timing.c | 337 +++++++++++++++-----
 1 file changed, 258 insertions(+), 79 deletions(-)

diff --git a/tools/tools/syscall_timing/syscall_timing.c b/tools/tools/syscall_timing/syscall_timing.c
index 068b36cce64..94975a8183b 100644
--- a/tools/tools/syscall_timing/syscall_timing.c
+++ b/tools/tools/syscall_timing/syscall_timing.c
@@ -37,12 +37,16 @@
 #include <err.h>
 #include <fcntl.h>
 #include <inttypes.h>
+#include <limits.h>
+#include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 static struct timespec ts_start, ts_end;
+static int alarm_timeout;
+static volatile int alarm_fired;
 
 #define timespecsub(vvp, uvp)						\
 	do {								\
@@ -54,10 +58,22 @@ static struct timespec ts_start, ts_end;
 		}							\
 	} while (0)
 
+static void
+alarm_handler(int signum)
+{
+
+	alarm_fired = 1;
+}
+
 static void
 benchmark_start(void)
 {
 
+	alarm_fired = 0;
+	if (alarm_timeout) {
+		signal(SIGALRM, alarm_handler);
+		alarm(alarm_timeout);
+	}
 	assert(clock_gettime(CLOCK_REALTIME, &ts_start) == 0);
 }
 
@@ -68,50 +84,62 @@ benchmark_stop(void)
 	assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
 }
   
-void
-test_getuid(int num)
+uint64_t
+test_getuid(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
-	int i;
+	uint64_t i;
 
 	/*
 	 * Thread-local data should require no locking if system
 	 * call is MPSAFE.
 	 */
 	benchmark_start();
-	for (i = 0; i < num; i++)
+	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		getuid();
+	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_getppid(int num)
+uint64_t
+test_getppid(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
-	int i;
+	uint64_t i;
 
 	/*
 	 * This is process-local, but can change, so will require a
 	 * lock.
 	 */
 	benchmark_start();
-	for (i = 0; i < num; i++)
+	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		getppid();
+	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_clock_gettime(int num)
+uint64_t
+test_clock_gettime(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
 	struct timespec ts;
-	int i;
+	uint64_t i;
 
 	benchmark_start();
-	for (i = 0; i < num; i++)
+	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		(void)clock_gettime(CLOCK_REALTIME, &ts);
+	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_pipe(int num)
+uint64_t
+test_pipe(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
 	int fd[2], i;
 
@@ -127,56 +155,66 @@ test_pipe(int num)
 	close(fd[1]);
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		if (pipe(fd) == -1)
 			err(-1, "test_pipe: pipe");
 		close(fd[0]);
 		close(fd[1]);
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_socket_stream(int num)
+uint64_t
+test_socket_stream(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
-	int i, so;
+	uint64_t i, so;
 
-	so = socket(PF_LOCAL, SOCK_STREAM, 0);
+	so = socket(int_arg, SOCK_STREAM, 0);
 	if (so < 0)
 		err(-1, "test_socket_stream: socket");
 	close(so);
 	benchmark_start();
 	for (i = 0; i < num; i++) {
-		so = socket(PF_LOCAL, SOCK_STREAM, 0);
+		if (alarm_fired)
+			break;
+		so = socket(int_arg, SOCK_STREAM, 0);
 		if (so == -1)
 			err(-1, "test_socket_stream: socket");
 		close(so);
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_socket_dgram(int num)
+uint64_t
+test_socket_dgram(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
-	int i, so;
+	uint64_t i, so;
 
-	so = socket(PF_LOCAL, SOCK_DGRAM, 0);
+	so = socket(int_arg, SOCK_DGRAM, 0);
 	if (so < 0)
 		err(-1, "test_socket_dgram: socket");
 	close(so);
 	benchmark_start();
 	for (i = 0; i < num; i++) {
-		so = socket(PF_LOCAL, SOCK_DGRAM, 0);
+		if (alarm_fired)
+			break;
+		so = socket(int_arg, SOCK_DGRAM, 0);
 		if (so == -1)
 			err(-1, "test_socket_dgram: socket");
 		close(so);
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_socketpair_stream(int num)
+uint64_t
+test_socketpair_stream(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
-	int i, so[2];
+	uint64_t i;
+	int so[2];
 
 	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1)
 		err(-1, "test_socketpair_stream: socketpair");
@@ -184,18 +222,22 @@ test_socketpair_stream(int num)
 	close(so[1]);
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1)
 			err(-1, "test_socketpair_stream: socketpair");
 		close(so[0]);
 		close(so[1]);
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_socketpair_dgram(int num)
+uint64_t
+test_socketpair_dgram(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
-	int i, so[2];
+	uint64_t i;
+	int so[2];
 
 	if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1)
 		err(-1, "test_socketpair_dgram: socketpair");
@@ -203,16 +245,71 @@ test_socketpair_dgram(int num)
 	close(so[1]);
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1)
 			err(-1, "test_socketpair_dgram: socketpair");
 		close(so[0]);
 		close(so[1]);
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_dup(int num)
+uint64_t
+test_open_close(uint64_t num, uint64_t int_arg, const char *string_arg)
+{
+	const char *path = string_arg;
+	uint64_t i;
+	int fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		err(-1, "test_open_close: %s", path);
+	close(fd);
+
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
+		fd = open(path, O_RDONLY);
+		if (fd < 0)
+			err(-1, "test_open_close: %s", path);
+		close(fd);
+	}
+	benchmark_stop();
+	return (i);
+}
+
+uint64_t
+test_open_read_close(uint64_t num, uint64_t int_arg, const char *string_arg)
+{
+	char buf[int_arg];
+	uint64_t i;
+	int fd;
+
+	fd = open(string_arg, O_RDONLY);
+	if (fd < 0)
+		err(-1, "test_open_close: %s", string_arg);
+	(void)read(fd, buf, int_arg);
+	close(fd);
+
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
+		fd = open(string_arg, O_RDONLY);
+		if (fd < 0)
+			err(-1, "test_open_close: %s", string_arg);
+		(void)read(fd, buf, int_arg);
+		close(fd);
+	}
+	benchmark_stop();
+	return (i);
+}
+
+uint64_t
+test_dup(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
 	int fd, i, shmfd;
 
@@ -224,18 +321,21 @@ test_dup(int num)
 		close(fd);
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		fd = dup(shmfd);
 		if (fd >= 0)
 			close(fd);
 	}
 	benchmark_stop();
 	close(shmfd);
+	return (i);
 }
 
-void
-test_shmfd(int num)
+uint64_t
+test_shmfd(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
-	int i, shmfd;
+	uint64_t i, shmfd;
 
 	shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
 	if (shmfd < 0)
@@ -243,19 +343,22 @@ test_shmfd(int num)
 	close(shmfd);
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
 		if (shmfd < 0)
 			err(-1, "test_shmfd: shm_open");
 		close(shmfd);
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_fstat_shmfd(int num)
+uint64_t
+test_fstat_shmfd(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
 	struct stat sb;
-	int i, shmfd;
+	uint64_t i, shmfd;
 
 	shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
 	if (shmfd < 0)
@@ -263,17 +366,21 @@ test_fstat_shmfd(int num)
 	if (fstat(shmfd, &sb) < 0)
 		err(-1, "test_fstat_shmfd: fstat");
 	benchmark_start();
-	for (i = 0; i < num; i++)
+	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		(void)fstat(shmfd, &sb);
+	}
 	benchmark_stop();
 	close(shmfd);
+	return (i);
 }
 
-void
-test_fork(int num)
+uint64_t
+test_fork(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
 	pid_t pid;
-	int i;
+	uint64_t i;
 
 	pid = fork();
 	if (pid < 0)
@@ -284,6 +391,8 @@ test_fork(int num)
 		err(-1, "test_fork: waitpid");
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		pid = fork();
 		if (pid < 0)
 			err(-1, "test_fork: fork");
@@ -293,13 +402,14 @@ test_fork(int num)
 			err(-1, "test_fork: waitpid");
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_vfork(int num)
+uint64_t
+test_vfork(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
 	pid_t pid;
-	int i;
+	uint64_t i;
 
 	pid = vfork();
 	if (pid < 0)
@@ -310,6 +420,8 @@ test_vfork(int num)
 		err(-1, "test_vfork: waitpid");
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		pid = vfork();
 		if (pid < 0)
 			err(-1, "test_vfork: vfork");
@@ -319,17 +431,18 @@ test_vfork(int num)
 			err(-1, "test_vfork: waitpid");
 	}
 	benchmark_stop();
+	return (i);
 }
 
 #define	USR_BIN_TRUE	"/usr/bin/true"
 static char *execve_args[] = { USR_BIN_TRUE, NULL};
 extern char **environ;
 
-void
-test_fork_exec(int num)
+uint64_t
+test_fork_exec(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
 	pid_t pid;
-	int i;
+	uint64_t i;
 
 	pid = fork();
 	if (pid < 0)
@@ -342,6 +455,8 @@ test_fork_exec(int num)
 		err(-1, "test_fork: waitpid");
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		pid = fork();
 		if (pid < 0)
 			err(-1, "test_fork_exec: fork");
@@ -353,13 +468,14 @@ test_fork_exec(int num)
 			err(-1, "test_fork_exec: waitpid");
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_vfork_exec(int num)
+uint64_t
+test_vfork_exec(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
 	pid_t pid;
-	int i;
+	uint64_t i;
 
 	pid = vfork();
 	if (pid < 0)
@@ -372,6 +488,8 @@ test_vfork_exec(int num)
 		err(-1, "test_vfork_exec: waitpid");
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		pid = vfork();
 		if (pid < 0)
 			err(-1, "test_vfork_exec: vfork");
@@ -383,43 +501,52 @@ test_vfork_exec(int num)
 			err(-1, "test_vfork_exec: waitpid");
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_chroot(int num)
+uint64_t
+test_chroot(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
-	int i;
+	uint64_t i;
 
 	if (chroot("/") < 0)
 		err(-1, "test_chroot: chroot");
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		if (chroot("/") < 0)
 			err(-1, "test_chroot: chroot");
 	}
 	benchmark_stop();
+	return (i);
 }
 
-void
-test_setuid(int num)
+uint64_t
+test_setuid(uint64_t num, uint64_t int_arg, const char *string_arg)
 {
 	uid_t uid;
-	int i;
+	uint64_t i;
 
 	uid = getuid();
 	if (setuid(uid) < 0)
 		err(-1, "test_setuid: setuid");
 	benchmark_start();
 	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
 		if (setuid(uid) < 0)
 			err(-1, "test_setuid: setuid");
 	}
 	benchmark_stop();
+	return (i);
 }
 
 struct test {
 	const char	*t_name;
-	void		(*t_func)(int);
+	uint64_t	(*t_func)(uint64_t, uint64_t, const char *);
+	uint64_t	 t_int;
+	const char	*t_string;
 };
 
 static const struct test tests[] = {
@@ -427,10 +554,26 @@ static const struct test tests[] = {
 	{ "getppid", test_getppid },
 	{ "clock_gettime", test_clock_gettime },
 	{ "pipe", test_pipe },
-	{ "socket_stream", test_socket_stream },
-	{ "socket_dgram", test_socket_dgram },
+	{ "socket_local_stream", test_socket_stream, .t_int = PF_LOCAL },
+	{ "socket_local_dgram", test_socket_dgram, .t_int = PF_LOCAL },
 	{ "socketpair_stream", test_socketpair_stream },
 	{ "socketpair_dgram", test_socketpair_dgram },
+	{ "socket_tcp", test_socket_stream, .t_int = PF_INET },
+	{ "socket_udp", test_socket_dgram, .t_int = PF_INET },
+	{ "open_close_devnull", test_open_close, .t_string = "/dev/null" },
+	{ "open_close_devzero", test_open_close, .t_string = "/dev/zero" },
+	{ "open_read_close_devzero_1", test_open_read_close,
+	    .t_string = "/dev/zero", .t_int = 1 },
+	{ "open_read_close_devzero_10", test_open_read_close,
+	    .t_string = "/dev/zero", .t_int = 10 },
+	{ "open_read_close_devzero_100", test_open_read_close,
+	    .t_string = "/dev/zero", .t_int = 100 },
+	{ "open_read_close_devzero_1000", test_open_read_close,
+	    .t_string = "/dev/zero", .t_int = 1000 },
+	{ "open_read_close_devzero_10000", test_open_read_close,
+	    .t_string = "/dev/zero", .t_int = 10000 },
+	{ "open_read_close_devzero_100000", test_open_read_close,
+	    .t_string = "/dev/zero", .t_int = 100000 },
 	{ "dup", test_dup },
 	{ "shmfd", test_shmfd },
 	{ "fstat_shmfd", test_fstat_shmfd },
@@ -448,7 +591,8 @@ usage(void)
 {
 	int i;
 
-	fprintf(stderr, "syscall_timing [iterations] [loops] [test]\n");
+	fprintf(stderr, "syscall_timing [-i iterations] [-l loops] "
+	    "[-s seconds] test\n");
 	for (i = 0; i < tests_count; i++)
 		fprintf(stderr, "  %s\n", tests[i].t_name);
 	exit(-1);
@@ -461,28 +605,61 @@ main(int argc, char *argv[])
 	const struct test *the_test;
 	long long ll;
 	char *endp;
-	int i, j, k;
-	int iterations, loops;
+	int ch, i, j, k;
+	uint64_t iterations, loops;
 
-	if (argc < 4)
-		usage();
+	alarm_timeout = 0;
+	iterations = 0;
+	loops = 0;
+	while ((ch = getopt(argc, argv, "i:l:s:")) != -1) {
+		switch (ch) {
+		case 'i':
+			ll = strtol(optarg, &endp, 10);
+			if (*endp != 0 || ll < 1 || ll > 100000)
+				usage();
+			iterations = ll;
+			break;
 
-	ll = strtoll(argv[1], &endp, 10);
-	if (*endp != 0 || ll < 0 || ll > 100000)
-		usage();
-	iterations = ll;
+		case 'l':
+			ll = strtol(optarg, &endp, 10);
+			if (*endp != 0 || ll < 1 || ll > 100000)
+				usage();
+			loops = ll;
+			break;
 
-	ll = strtoll(argv[2], &endp, 10);
-	if (*endp != 0 || ll < 0 || ll > 100000)
+		case 's':
+			ll = strtol(optarg, &endp, 10);
+			if (*endp != 0 || ll < 1 || ll > 60*60)
+				usage();
+			alarm_timeout = ll;
+			break;
+
+		case '?':
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (iterations < 1 && alarm_timeout < 1)
+		usage();
+	if (iterations < 1)
+		iterations = UINT64_MAX;
+	if (loops < 1)
+		loops = 1;
+
+	if (argc != 1)
 		usage();
-	loops = ll;
 
 	assert(clock_getres(CLOCK_REALTIME, &ts_res) == 0);
-	printf("Clock resolution: %ju.%ju\n", (uintmax_t)ts_res.tv_sec,
+	printf("Clock resolution: %ju.%08ju\n", (uintmax_t)ts_res.tv_sec,
 	    (uintmax_t)ts_res.tv_nsec);
 	printf("test\tloop\ttotal\titerations\tperiteration\n");
 
-	for (j = 3; j < argc; j++) {
+	for (j = 0; j < argc; j++) {
+		uint64_t calls;
+
 		the_test = NULL;
 		for (i = 0; i < tests_count; i++) {
 			if (strcmp(argv[j], tests[i].t_name) == 0)
@@ -494,13 +671,15 @@ main(int argc, char *argv[])
 		/*
 		 * Run one warmup, then do the real thing (loops) times.
 		 */
-		the_test->t_func(iterations);
+		calls = the_test->t_func(iterations, the_test->t_int,
+		    the_test->t_string);
 		for (k = 0; k < loops; k++) {
-			the_test->t_func(iterations);
+			the_test->t_func(iterations, the_test->t_int,
+		    the_test->t_string);
 			timespecsub(&ts_end, &ts_start);
 			printf("%s\t%d\t", the_test->t_name, k);
 			printf("%ju.%09ju\t%d\t", (uintmax_t)ts_end.tv_sec,
-			    (uintmax_t)ts_end.tv_nsec, iterations);
+			    (uintmax_t)ts_end.tv_nsec, calls);
 
 		/*
 		 * Note.  This assumes that each iteration takes less than
@@ -508,9 +687,9 @@ main(int argc, char *argv[])
 		 * the room in our arithmetic unit.  Fine for system calls,
 		 * but not for long things.
 		 */
-			ts_end.tv_sec *= 1000000000 / iterations;
+			ts_end.tv_sec *= 1000000000 / calls;
 			printf("0.%09ju\n", (uintmax_t)(ts_end.tv_sec +
-			    ts_end.tv_nsec / iterations));
+			    ts_end.tv_nsec / calls));
 		}
 	}
 	return (0);

From e96466166614f21dabdd4e5cfb14c4b7ac44f082 Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:29:18 +0000
Subject: [PATCH 41/68] Left over from prior patch removed.

Submitted by: John Hixson
PR: 151442 (but the patch was backwards there)
---
 usr.sbin/pc-sysinstall/backend/functions-disk.sh | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/usr.sbin/pc-sysinstall/backend/functions-disk.sh b/usr.sbin/pc-sysinstall/backend/functions-disk.sh
index 1ff33faa43e..c6e849cb01c 100755
--- a/usr.sbin/pc-sysinstall/backend/functions-disk.sh
+++ b/usr.sbin/pc-sysinstall/backend/functions-disk.sh
@@ -507,17 +507,6 @@ setup_disk_slice()
 			DEST="${DISK}"
           fi 
 
-          if iscompressed "${IMAGE}"
-          then
-            local COMPRESSION
-  
-            get_compression_type "${IMAGE}"
-            COMPRESSION="${VAL}"
-  
-            decompress_file "${IMAGE}" "${COMPRESSION}"
-            IMAGE="${VAL}"
-          fi
-
           write_image "${IMAGE}" "${DEST}"
           check_disk_layout "${DEST}"
 		fi

From 1a587ef2a548f69d40ee88ca9b9a4fe6c32c5345 Mon Sep 17 00:00:00 2001
From: John Baldwin <jhb@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:29:32 +0000
Subject: [PATCH 42/68] - Make 'vm_refcnt' volatile so that compilers won't be
 tempted to treat   its value as a loop invariant.  Currently this is a no-op
 because   'atomic_cmpset_int()' clobbers all memory on current architectures.
 - Use atomic_fetchadd_int() instead of an atomic_cmpset_int() loop to drop  
 a reference in vmspace_free().

Reviewed by:	alc
MFC after:	1 month
---
 sys/vm/vm_map.c | 6 +-----
 sys/vm/vm_map.h | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 2e0f0012a85..40c317d93a6 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -339,15 +339,11 @@ vmspace_dofree(struct vmspace *vm)
 void
 vmspace_free(struct vmspace *vm)
 {
-	int refcnt;
 
 	if (vm->vm_refcnt == 0)
 		panic("vmspace_free: attempt to free already freed vmspace");
 
-	do
-		refcnt = vm->vm_refcnt;
-	while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
-	if (refcnt == 1)
+	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
 		vmspace_dofree(vm);
 }
 
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index 18a1edf809a..8715b41dc7f 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -236,7 +236,7 @@ struct vmspace {
 	caddr_t vm_taddr;	/* (c) user virtual address of text */
 	caddr_t vm_daddr;	/* (c) user virtual address of data */
 	caddr_t vm_maxsaddr;	/* user VA at max stack growth */
-	int	vm_refcnt;	/* number of references */
+	volatile int vm_refcnt;	/* number of references */
 	/*
 	 * Keep the PMAP last, so that CPU-specific variations of that
 	 * structure on a single architecture don't result in offset

From 3e36aa656e7a26b55510e0a963773b5f3b91a097 Mon Sep 17 00:00:00 2001
From: Robert Watson <rwatson@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:35:08 +0000
Subject: [PATCH 43/68] Fix bug in recent syscall_timing change: measure the
 number of iterations each loop, rather than once up front.  The distinction
 is unimportant when doing a fixed iteration count, but when using a timer, it
 should vary.

Sponsored by:	Google, Inc.
MFC after:	2 weeks
---
 tools/tools/syscall_timing/syscall_timing.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/tools/syscall_timing/syscall_timing.c b/tools/tools/syscall_timing/syscall_timing.c
index 94975a8183b..030e7390545 100644
--- a/tools/tools/syscall_timing/syscall_timing.c
+++ b/tools/tools/syscall_timing/syscall_timing.c
@@ -671,11 +671,12 @@ main(int argc, char *argv[])
 		/*
 		 * Run one warmup, then do the real thing (loops) times.
 		 */
-		calls = the_test->t_func(iterations, the_test->t_int,
+		the_test->t_func(iterations, the_test->t_int,
 		    the_test->t_string);
+		calls = 0;
 		for (k = 0; k < loops; k++) {
-			the_test->t_func(iterations, the_test->t_int,
-		    the_test->t_string);
+			calls = the_test->t_func(iterations,
+			    the_test->t_int, the_test->t_string);
 			timespecsub(&ts_end, &ts_start);
 			printf("%s\t%d\t", the_test->t_name, k);
 			printf("%ju.%09ju\t%d\t", (uintmax_t)ts_end.tv_sec,

From fb2439a6f606de0fb6038bd5b3809f020a33ca77 Mon Sep 17 00:00:00 2001
From: John Baldwin <jhb@FreeBSD.org>
Date: Thu, 21 Oct 2010 17:46:23 +0000
Subject: [PATCH 44/68] Clarify a misleading comment.  The test in
 pci_reserve_map() was meant to ignore BARs that are invalid due to having a
 size of zero, not to ignore BARs with an existing base of zero.  While here,
 reorganize the code slightly to make the intent clearer.

Reported by:	avg
MFC after:	1 week
---
 sys/dev/pci/pci.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index 6534ba2d3ea..53313427492 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -3664,9 +3664,15 @@ pci_reserve_map(device_t dev, device_t child, int type, int *rid,
 	res = NULL;
 	pci_read_bar(child, *rid, &map, &testval);
 
-	/* Ignore a BAR with a base of 0. */
-	if ((*rid == PCIR_BIOS && pci_rombase(testval) == 0) ||
-	    pci_mapbase(testval) == 0)
+	/*
+	 * Determine the size of the BAR and ignore BARs with a size
+	 * of 0.  Device ROM BARs use a different mask value.
+	 */
+	if (*rid == PCIR_BIOS)
+		mapsize = pci_romsize(testval);
+	else
+		mapsize = pci_mapsize(testval);
+	if (mapsize == 0)
 		goto out;
 
 	if (PCI_BAR_MEM(testval) || *rid == PCIR_BIOS) {
@@ -3695,13 +3701,7 @@ pci_reserve_map(device_t dev, device_t child, int type, int *rid,
 	 * actually uses and we would otherwise have a
 	 * situation where we might allocate the excess to
 	 * another driver, which won't work.
-	 *
-	 * Device ROM BARs use a different mask value.
 	 */
-	if (*rid == PCIR_BIOS)
-		mapsize = pci_romsize(testval);
-	else
-		mapsize = pci_mapsize(testval);
 	count = 1UL << mapsize;
 	if (RF_ALIGNMENT(flags) < mapsize)
 		flags = (flags & ~RF_ALIGNMENT_MASK) | RF_ALIGNMENT_LOG2(mapsize);

From e9e38bc2e7039b4acedaff32e64b67944dca532a Mon Sep 17 00:00:00 2001
From: Benedict Reuschling <bcr@FreeBSD.org>
Date: Thu, 21 Oct 2010 18:21:19 +0000
Subject: [PATCH 45/68] Sync with OpenBSD rev. 1.13: strtonum does not require
 limits.h

Obtained from:  OpenBSD
Discussed with: ru@
MFC after:      5 days
---
 lib/libc/stdlib/strtonum.3 | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/libc/stdlib/strtonum.3 b/lib/libc/stdlib/strtonum.3
index 90f0b578ff3..b83aadda42e 100644
--- a/lib/libc/stdlib/strtonum.3
+++ b/lib/libc/stdlib/strtonum.3
@@ -12,7 +12,7 @@
 .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 .\"
-.\" $OpenBSD: strtonum.3,v 1.12 2005/10/26 11:37:58 jmc Exp $
+.\" $OpenBSD: strtonum.3,v 1.13 2006/04/25 05:15:42 tedu Exp $
 .\" $FreeBSD$
 .\"
 .Dd April 29, 2004
@@ -23,7 +23,6 @@
 .Nd "reliably convert string value to an integer"
 .Sh SYNOPSIS
 .In stdlib.h
-.In limits.h
 .Ft long long
 .Fo strtonum
 .Fa "const char *nptr"

From a8af8b783ae9407630ff678cdc9185cdc17ba263 Mon Sep 17 00:00:00 2001
From: Benedict Reuschling <bcr@FreeBSD.org>
Date: Thu, 21 Oct 2010 18:30:48 +0000
Subject: [PATCH 46/68] Document strtonum()'s behavior of setting errno to 0
 when no error is found.

PR:             docs/143330
Submitted by:   Efstratios Karatzas (gpf dot kira at gmail dot com)
Discussed with: ru@
MFC after:      7 days
---
 lib/libc/stdlib/strtonum.3 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/libc/stdlib/strtonum.3 b/lib/libc/stdlib/strtonum.3
index b83aadda42e..7ca88c53316 100644
--- a/lib/libc/stdlib/strtonum.3
+++ b/lib/libc/stdlib/strtonum.3
@@ -83,6 +83,8 @@ is set, and
 .Fa errstr
 will point to an error message.
 On success,
+.Va errno
+is set to 0 and
 .Fa *errstr
 will be set to
 .Dv NULL ;

From 8a1b5ade5f2d944321983a8eec920f128768c511 Mon Sep 17 00:00:00 2001
From: Rick Macklem <rmacklem@FreeBSD.org>
Date: Thu, 21 Oct 2010 18:49:12 +0000
Subject: [PATCH 47/68] Modify the experimental NFS server in a manner
 analogous to r214049 for the regular NFS server, so that it will not do a
 VOP_LOOKUP() of ".." when at the root of a file system when performing a
 ReaddirPlus RPC.

MFC after:	10 days
---
 sys/fs/nfsserver/nfs_nfsdport.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c
index 5e50d3e389c..0d35d1d04b0 100644
--- a/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/sys/fs/nfsserver/nfs_nfsdport.c
@@ -1933,7 +1933,15 @@ again:
 							vn_lock(vp,
 							    LK_EXCLUSIVE |
 							    LK_RETRY);
-						r = VOP_LOOKUP(vp, &nvp, &cn);
+						if ((vp->v_vflag & VV_ROOT) != 0
+						    && (cn.cn_flags & ISDOTDOT)
+						    != 0) {
+							vref(vp);
+							nvp = vp;
+							r = 0;
+						} else
+							r = VOP_LOOKUP(vp, &nvp,
+							    &cn);
 					}
 				}
 				if (!r) {

From c4cc2fbf5c09643d46c1bc54571585b0bc902bbc Mon Sep 17 00:00:00 2001
From: Robert Watson <rwatson@FreeBSD.org>
Date: Thu, 21 Oct 2010 19:01:59 +0000
Subject: [PATCH 48/68] Further syscall_timing improvements: allow an arbitrary
 "path" string argument to be passed on the command line, allowing
 file-related tests to be pointed at wherever desired.

Sponsored by:	Google, Inc.
MFC after:	2 weeks
---
 tools/tools/syscall_timing/syscall_timing.c | 86 ++++++++++-----------
 1 file changed, 41 insertions(+), 45 deletions(-)

diff --git a/tools/tools/syscall_timing/syscall_timing.c b/tools/tools/syscall_timing/syscall_timing.c
index 030e7390545..16688c0b93d 100644
--- a/tools/tools/syscall_timing/syscall_timing.c
+++ b/tools/tools/syscall_timing/syscall_timing.c
@@ -85,7 +85,7 @@ benchmark_stop(void)
 }
   
 uint64_t
-test_getuid(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_getuid(uint64_t num, uint64_t int_arg, const char *path)
 {
 	uint64_t i;
 
@@ -104,7 +104,7 @@ test_getuid(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_getppid(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_getppid(uint64_t num, uint64_t int_arg, const char *path)
 {
 	uint64_t i;
 
@@ -123,7 +123,7 @@ test_getppid(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_clock_gettime(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_clock_gettime(uint64_t num, uint64_t int_arg, const char *path)
 {
 	struct timespec ts;
 	uint64_t i;
@@ -139,7 +139,7 @@ test_clock_gettime(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_pipe(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_pipe(uint64_t num, uint64_t int_arg, const char *path)
 {
 	int fd[2], i;
 
@@ -167,7 +167,7 @@ test_pipe(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_socket_stream(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_socket_stream(uint64_t num, uint64_t int_arg, const char *path)
 {
 	uint64_t i, so;
 
@@ -189,7 +189,7 @@ test_socket_stream(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_socket_dgram(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_socket_dgram(uint64_t num, uint64_t int_arg, const char *path)
 {
 	uint64_t i, so;
 
@@ -211,7 +211,7 @@ test_socket_dgram(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_socketpair_stream(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_socketpair_stream(uint64_t num, uint64_t int_arg, const char *path)
 {
 	uint64_t i;
 	int so[2];
@@ -234,7 +234,7 @@ test_socketpair_stream(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_socketpair_dgram(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_socketpair_dgram(uint64_t num, uint64_t int_arg, const char *path)
 {
 	uint64_t i;
 	int so[2];
@@ -257,9 +257,8 @@ test_socketpair_dgram(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_open_close(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_open_close(uint64_t num, uint64_t int_arg, const char *path)
 {
-	const char *path = string_arg;
 	uint64_t i;
 	int fd;
 
@@ -282,15 +281,15 @@ test_open_close(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_open_read_close(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_open_read_close(uint64_t num, uint64_t int_arg, const char *path)
 {
 	char buf[int_arg];
 	uint64_t i;
 	int fd;
 
-	fd = open(string_arg, O_RDONLY);
+	fd = open(path, O_RDONLY);
 	if (fd < 0)
-		err(-1, "test_open_close: %s", string_arg);
+		err(-1, "test_open_close: %s", path);
 	(void)read(fd, buf, int_arg);
 	close(fd);
 
@@ -298,9 +297,9 @@ test_open_read_close(uint64_t num, uint64_t int_arg, const char *string_arg)
 	for (i = 0; i < num; i++) {
 		if (alarm_fired)
 			break;
-		fd = open(string_arg, O_RDONLY);
+		fd = open(path, O_RDONLY);
 		if (fd < 0)
-			err(-1, "test_open_close: %s", string_arg);
+			err(-1, "test_open_close: %s", path);
 		(void)read(fd, buf, int_arg);
 		close(fd);
 	}
@@ -309,7 +308,7 @@ test_open_read_close(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_dup(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_dup(uint64_t num, uint64_t int_arg, const char *path)
 {
 	int fd, i, shmfd;
 
@@ -333,7 +332,7 @@ test_dup(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_shmfd(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_shmfd(uint64_t num, uint64_t int_arg, const char *path)
 {
 	uint64_t i, shmfd;
 
@@ -355,7 +354,7 @@ test_shmfd(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_fstat_shmfd(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_fstat_shmfd(uint64_t num, uint64_t int_arg, const char *path)
 {
 	struct stat sb;
 	uint64_t i, shmfd;
@@ -377,7 +376,7 @@ test_fstat_shmfd(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_fork(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_fork(uint64_t num, uint64_t int_arg, const char *path)
 {
 	pid_t pid;
 	uint64_t i;
@@ -406,7 +405,7 @@ test_fork(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_vfork(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_vfork(uint64_t num, uint64_t int_arg, const char *path)
 {
 	pid_t pid;
 	uint64_t i;
@@ -439,7 +438,7 @@ static char *execve_args[] = { USR_BIN_TRUE, NULL};
 extern char **environ;
 
 uint64_t
-test_fork_exec(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_fork_exec(uint64_t num, uint64_t int_arg, const char *path)
 {
 	pid_t pid;
 	uint64_t i;
@@ -472,7 +471,7 @@ test_fork_exec(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_vfork_exec(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_vfork_exec(uint64_t num, uint64_t int_arg, const char *path)
 {
 	pid_t pid;
 	uint64_t i;
@@ -505,7 +504,7 @@ test_vfork_exec(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_chroot(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_chroot(uint64_t num, uint64_t int_arg, const char *path)
 {
 	uint64_t i;
 
@@ -523,7 +522,7 @@ test_chroot(uint64_t num, uint64_t int_arg, const char *string_arg)
 }
 
 uint64_t
-test_setuid(uint64_t num, uint64_t int_arg, const char *string_arg)
+test_setuid(uint64_t num, uint64_t int_arg, const char *path)
 {
 	uid_t uid;
 	uint64_t i;
@@ -546,7 +545,6 @@ struct test {
 	const char	*t_name;
 	uint64_t	(*t_func)(uint64_t, uint64_t, const char *);
 	uint64_t	 t_int;
-	const char	*t_string;
 };
 
 static const struct test tests[] = {
@@ -560,20 +558,13 @@ static const struct test tests[] = {
 	{ "socketpair_dgram", test_socketpair_dgram },
 	{ "socket_tcp", test_socket_stream, .t_int = PF_INET },
 	{ "socket_udp", test_socket_dgram, .t_int = PF_INET },
-	{ "open_close_devnull", test_open_close, .t_string = "/dev/null" },
-	{ "open_close_devzero", test_open_close, .t_string = "/dev/zero" },
-	{ "open_read_close_devzero_1", test_open_read_close,
-	    .t_string = "/dev/zero", .t_int = 1 },
-	{ "open_read_close_devzero_10", test_open_read_close,
-	    .t_string = "/dev/zero", .t_int = 10 },
-	{ "open_read_close_devzero_100", test_open_read_close,
-	    .t_string = "/dev/zero", .t_int = 100 },
-	{ "open_read_close_devzero_1000", test_open_read_close,
-	    .t_string = "/dev/zero", .t_int = 1000 },
-	{ "open_read_close_devzero_10000", test_open_read_close,
-	    .t_string = "/dev/zero", .t_int = 10000 },
-	{ "open_read_close_devzero_100000", test_open_read_close,
-	    .t_string = "/dev/zero", .t_int = 100000 },
+	{ "open_close", test_open_close },
+	{ "open_read_close_1", test_open_read_close, .t_int = 1 },
+	{ "open_read_close_10", test_open_read_close, .t_int = 10 },
+	{ "open_read_close_100", test_open_read_close, .t_int = 100 },
+	{ "open_read_close_1000", test_open_read_close, .t_int = 1000 },
+	{ "open_read_close_10000", test_open_read_close, .t_int = 10000 },
+	{ "open_read_close_100000", test_open_read_close, .t_int = 100000 },
 	{ "dup", test_dup },
 	{ "shmfd", test_shmfd },
 	{ "fstat_shmfd", test_fstat_shmfd },
@@ -592,7 +583,7 @@ usage(void)
 	int i;
 
 	fprintf(stderr, "syscall_timing [-i iterations] [-l loops] "
-	    "[-s seconds] test\n");
+	    "[-p path] [-s seconds] test\n");
 	for (i = 0; i < tests_count; i++)
 		fprintf(stderr, "  %s\n", tests[i].t_name);
 	exit(-1);
@@ -603,6 +594,7 @@ main(int argc, char *argv[])
 {
 	struct timespec ts_res;
 	const struct test *the_test;
+	const char *path;
 	long long ll;
 	char *endp;
 	int ch, i, j, k;
@@ -611,7 +603,8 @@ main(int argc, char *argv[])
 	alarm_timeout = 0;
 	iterations = 0;
 	loops = 0;
-	while ((ch = getopt(argc, argv, "i:l:s:")) != -1) {
+	path = NULL;
+	while ((ch = getopt(argc, argv, "i:l:p:s:")) != -1) {
 		switch (ch) {
 		case 'i':
 			ll = strtol(optarg, &endp, 10);
@@ -627,6 +620,10 @@ main(int argc, char *argv[])
 			loops = ll;
 			break;
 
+		case 'p':
+			path = optarg;
+			break;
+
 		case 's':
 			ll = strtol(optarg, &endp, 10);
 			if (*endp != 0 || ll < 1 || ll > 60*60)
@@ -671,12 +668,11 @@ main(int argc, char *argv[])
 		/*
 		 * Run one warmup, then do the real thing (loops) times.
 		 */
-		the_test->t_func(iterations, the_test->t_int,
-		    the_test->t_string);
+		the_test->t_func(iterations, the_test->t_int, path);
 		calls = 0;
 		for (k = 0; k < loops; k++) {
-			calls = the_test->t_func(iterations,
-			    the_test->t_int, the_test->t_string);
+			calls = the_test->t_func(iterations, the_test->t_int,
+			    path);
 			timespecsub(&ts_end, &ts_start);
 			printf("%s\t%d\t", the_test->t_name, k);
 			printf("%ju.%09ju\t%d\t", (uintmax_t)ts_end.tv_sec,

From 4e20912ad8b67c2d59480d0f17c8df5309575bd1 Mon Sep 17 00:00:00 2001
From: Robert Watson <rwatson@FreeBSD.org>
Date: Thu, 21 Oct 2010 19:03:24 +0000
Subject: [PATCH 49/68] Add Cambridge/Google tag since the copyright has been
 updated.

MFC after:	2 weeks
---
 tools/tools/syscall_timing/syscall_timing.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/tools/syscall_timing/syscall_timing.c b/tools/tools/syscall_timing/syscall_timing.c
index 16688c0b93d..8b19e22e196 100644
--- a/tools/tools/syscall_timing/syscall_timing.c
+++ b/tools/tools/syscall_timing/syscall_timing.c
@@ -2,6 +2,9 @@
  * Copyright (c) 2003-2004, 2010 Robert N. M. Watson
  * All rights reserved.
  *
+ * Portions of this software were developed at the University of Cambridge
+ * Computer Laboratory with support from a grant from Google, Inc.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:

From d680caab7339fa02425d881004d92d01937dde61 Mon Sep 17 00:00:00 2001
From: John Baldwin <jhb@FreeBSD.org>
Date: Thu, 21 Oct 2010 19:17:40 +0000
Subject: [PATCH 50/68] - When disabling ktracing on a process, free any
 pending requests that   may be left.  This fixes a memory leak that can occur
 when tracing is   disabled on a process via disabling tracing of a specific
 file (or if   an I/O error occurs with the tracefile) if the process's next
 system   call is exit().  The trace disabling code clears p_traceflag, so
 exit1()   doesn't do any KTRACE-related cleanup leading to the leak.  I chose
 to   make the free'ing of pending records synchronous rather than patching  
 exit1(). - Move KTRACE-specific logic out of kern_(exec|exit|fork).c and into
   kern_ktrace.c instead.  Make ktrace_mtx private to kern_ktrace.c as a  
 result.

MFC after:	1 month
---
 sys/kern/kern_exec.c   |  12 +---
 sys/kern/kern_exit.c   |  32 +----------
 sys/kern/kern_fork.c   |  16 +-----
 sys/kern/kern_ktrace.c | 126 +++++++++++++++++++++++++++++++++--------
 sys/sys/ktrace.h       |   4 +-
 5 files changed, 108 insertions(+), 82 deletions(-)

diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 3c05530160a..1e4d690b729 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -655,16 +655,8 @@ interpret:
 		setsugid(p);
 
 #ifdef KTRACE
-		if (p->p_tracevp != NULL &&
-		    priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) {
-			mtx_lock(&ktrace_mtx);
-			p->p_traceflag = 0;
-			tracevp = p->p_tracevp;
-			p->p_tracevp = NULL;
-			tracecred = p->p_tracecred;
-			p->p_tracecred = NULL;
-			mtx_unlock(&ktrace_mtx);
-		}
+		if (priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0))
+			ktrprocexec(p, &tracecred, &tracevp);
 #endif
 		/*
 		 * Close any file descriptors 0..2 that reference procfs,
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 029f1c3e177..31389e13126 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -121,10 +121,6 @@ exit1(struct thread *td, int rv)
 	struct proc *p, *nq, *q;
 	struct vnode *vtmp;
 	struct vnode *ttyvp = NULL;
-#ifdef KTRACE
-	struct vnode *tracevp;
-	struct ucred *tracecred;
-#endif
 	struct plimit *plim;
 	int locked;
 
@@ -356,33 +352,7 @@ exit1(struct thread *td, int rv)
 	if (ttyvp != NULL)
 		vrele(ttyvp);
 #ifdef KTRACE
-	/*
-	 * Disable tracing, then drain any pending records and release
-	 * the trace file.
-	 */
-	if (p->p_traceflag != 0) {
-		PROC_LOCK(p);
-		mtx_lock(&ktrace_mtx);
-		p->p_traceflag = 0;
-		mtx_unlock(&ktrace_mtx);
-		PROC_UNLOCK(p);
-		ktrprocexit(td);
-		PROC_LOCK(p);
-		mtx_lock(&ktrace_mtx);
-		tracevp = p->p_tracevp;
-		p->p_tracevp = NULL;
-		tracecred = p->p_tracecred;
-		p->p_tracecred = NULL;
-		mtx_unlock(&ktrace_mtx);
-		PROC_UNLOCK(p);
-		if (tracevp != NULL) {
-			locked = VFS_LOCK_GIANT(tracevp->v_mount);
-			vrele(tracevp);
-			VFS_UNLOCK_GIANT(locked);
-		}
-		if (tracecred != NULL)
-			crfree(tracecred);
-	}
+	ktrprocexit(td);
 #endif
 	/*
 	 * Release reference to text vnode
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index da2c415caa5..126c668f12c 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -645,21 +645,7 @@ again:
 	callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);
 
 #ifdef KTRACE
-	/*
-	 * Copy traceflag and tracefile if enabled.
-	 */
-	mtx_lock(&ktrace_mtx);
-	KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
-	if (p1->p_traceflag & KTRFAC_INHERIT) {
-		p2->p_traceflag = p1->p_traceflag;
-		if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
-			VREF(p2->p_tracevp);
-			KASSERT(p1->p_tracecred != NULL,
-			    ("ktrace vnode with no cred"));
-			p2->p_tracecred = crhold(p1->p_tracecred);
-		}
-	}
-	mtx_unlock(&ktrace_mtx);
+	ktrprocfork(p1, p2);
 #endif
 
 	/*
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
index bf530e1d9ec..6e2285b391c 100644
--- a/sys/kern/kern_ktrace.c
+++ b/sys/kern/kern_ktrace.c
@@ -126,7 +126,7 @@ SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
     0, "Maximum size of genio event payload");
 
 static int print_message = 1;
-struct mtx ktrace_mtx;
+static struct mtx ktrace_mtx;
 static struct sx ktrace_sx;
 
 static void ktrace_init(void *dummy);
@@ -134,7 +134,10 @@ static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
 static u_int ktrace_resize_pool(u_int newsize);
 static struct ktr_request *ktr_getrequest(int type);
 static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
+static void ktr_freeproc(struct proc *p, struct ucred **uc,
+    struct vnode **vp);
 static void ktr_freerequest(struct ktr_request *req);
+static void ktr_freerequest_locked(struct ktr_request *req);
 static void ktr_writerequest(struct thread *td, struct ktr_request *req);
 static int ktrcanset(struct thread *,struct proc *);
 static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
@@ -375,11 +378,43 @@ static void
 ktr_freerequest(struct ktr_request *req)
 {
 
+	mtx_lock(&ktrace_mtx);
+	ktr_freerequest_locked(req);
+	mtx_unlock(&ktrace_mtx);
+}
+
+static void
+ktr_freerequest_locked(struct ktr_request *req)
+{
+
+	mtx_assert(&ktrace_mtx, MA_OWNED);
 	if (req->ktr_buffer != NULL)
 		free(req->ktr_buffer, M_KTRACE);
-	mtx_lock(&ktrace_mtx);
 	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
-	mtx_unlock(&ktrace_mtx);
+}
+
+/*
+ * Disable tracing for a process and release all associated resources.
+ * The caller is responsible for releasing a reference on the returned
+ * vnode and credentials.
+ */
+static void
+ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp)
+{
+	struct ktr_request *req;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	mtx_assert(&ktrace_mtx, MA_OWNED);
+	*uc = p->p_tracecred;
+	p->p_tracecred = NULL;
+	if (vp != NULL)
+		*vp = p->p_tracevp;
+	p->p_tracevp = NULL;
+	p->p_traceflag = 0;
+	while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) {
+		STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
+		ktr_freerequest_locked(req);
+	}
 }
 
 void
@@ -432,19 +467,78 @@ ktrsysret(code, error, retval)
 }
 
 /*
- * When a process exits, drain per-process asynchronous trace records.
+ * When a setuid process execs, disable tracing.
+ *
+ * XXX: We toss any pending asynchronous records.
+ */
+void
+ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
+{
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	mtx_lock(&ktrace_mtx);
+	ktr_freeproc(p, uc, vp);
+	mtx_unlock(&ktrace_mtx);
+}
+
+/*
+ * When a process exits, drain per-process asynchronous trace records
+ * and disable tracing.
  */
 void
 ktrprocexit(struct thread *td)
 {
+	struct proc *p;
+	struct ucred *cred;
+	struct vnode *vp;
+	int vfslocked;
+
+	p = td->td_proc;
+	if (p->p_traceflag == 0)
+		return;
 
 	ktrace_enter(td);
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	sx_xunlock(&ktrace_sx);
+	PROC_LOCK(p);
+	mtx_lock(&ktrace_mtx);
+	ktr_freeproc(p, &cred, &vp);
+	mtx_unlock(&ktrace_mtx);
+	PROC_UNLOCK(p);
+	if (vp != NULL) {
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		vrele(vp);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
+	if (cred != NULL)
+		crfree(cred);
 	ktrace_exit(td);
 }
 
+/*
+ * When a process forks, enable tracing in the new process if needed.
+ */
+void
+ktrprocfork(struct proc *p1, struct proc *p2)
+{
+
+	PROC_LOCK_ASSERT(p1, MA_OWNED);
+	PROC_LOCK_ASSERT(p2, MA_OWNED);
+	mtx_lock(&ktrace_mtx);
+	KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
+	if (p1->p_traceflag & KTRFAC_INHERIT) {
+		p2->p_traceflag = p1->p_traceflag;
+		if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
+			VREF(p2->p_tracevp);
+			KASSERT(p1->p_tracecred != NULL,
+			    ("ktrace vnode with no cred"));
+			p2->p_tracecred = crhold(p1->p_tracecred);
+		}
+	}
+	mtx_unlock(&ktrace_mtx);
+}
+
 /*
  * When a thread returns, drain any asynchronous records generated by the
  * system call.
@@ -694,10 +788,7 @@ ktrace(td, uap)
 			if (p->p_tracevp == vp) {
 				if (ktrcanset(td, p)) {
 					mtx_lock(&ktrace_mtx);
-					cred = p->p_tracecred;
-					p->p_tracecred = NULL;
-					p->p_tracevp = NULL;
-					p->p_traceflag = 0;
+					ktr_freeproc(p, &cred, NULL);
 					mtx_unlock(&ktrace_mtx);
 					vrele_count++;
 					crfree(cred);
@@ -864,14 +955,9 @@ ktrops(td, p, ops, facs, vp)
 			p->p_traceflag |= KTRFAC_ROOT;
 	} else {
 		/* KTROP_CLEAR */
-		if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) {
+		if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
 			/* no more tracing */
-			p->p_traceflag = 0;
-			tracevp = p->p_tracevp;
-			p->p_tracevp = NULL;
-			tracecred = p->p_tracecred;
-			p->p_tracecred = NULL;
-		}
+			ktr_freeproc(p, &tracecred, &tracevp);
 	}
 	mtx_unlock(&ktrace_mtx);
 	PROC_UNLOCK(p);
@@ -1036,10 +1122,7 @@ ktr_writerequest(struct thread *td, struct ktr_request *req)
 		PROC_LOCK(p);
 		if (p->p_tracevp == vp) {
 			mtx_lock(&ktrace_mtx);
-			p->p_tracevp = NULL;
-			p->p_traceflag = 0;
-			cred = p->p_tracecred;
-			p->p_tracecred = NULL;
+			ktr_freeproc(p, &cred, NULL);
 			mtx_unlock(&ktrace_mtx);
 			vrele_count++;
 		}
@@ -1051,11 +1134,6 @@ ktr_writerequest(struct thread *td, struct ktr_request *req)
 	}
 	sx_sunlock(&allproc_lock);
 
-	/*
-	 * We can't clear any pending requests in threads that have cached
-	 * them but not yet committed them, as those are per-thread.  The
-	 * thread will have to clear it itself on system call return.
-	 */
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	while (vrele_count-- > 0)
 		vrele(vp);
diff --git a/sys/sys/ktrace.h b/sys/sys/ktrace.h
index a3e546957e4..9f8810c9f30 100644
--- a/sys/sys/ktrace.h
+++ b/sys/sys/ktrace.h
@@ -191,8 +191,6 @@ struct stat;
 #define	KTRFAC_DROP	0x20000000	/* last event was dropped */
 
 #ifdef	_KERNEL
-extern struct mtx ktrace_mtx;
-
 void	ktrnamei(char *);
 void	ktrcsw(int, int);
 void	ktrpsig(int, sig_t, sigset_t *, int);
@@ -200,7 +198,9 @@ void	ktrgenio(int, enum uio_rw, struct uio *, int);
 void	ktrsyscall(int, int narg, register_t args[]);
 void	ktrsysctl(int *name, u_int namelen);
 void	ktrsysret(int, int, register_t);
+void	ktrprocexec(struct proc *, struct ucred **, struct vnode **);
 void	ktrprocexit(struct thread *);
+void	ktrprocfork(struct proc *, struct proc *);
 void	ktruserret(struct thread *);
 void	ktrstruct(const char *, void *, size_t);
 #define ktrsockaddr(s) \

From f3c95fe74885d8cb188fff293ee648afa42637f0 Mon Sep 17 00:00:00 2001
From: Bernhard Schmidt <bschmidt@FreeBSD.org>
Date: Thu, 21 Oct 2010 19:28:52 +0000
Subject: [PATCH 51/68] Instead of returning when reaching the end of the assoc
 notification, break out of the loop. We want to run the code after the while
 loop to set an associd and capinfo. If we don't do this net80211 will drop
 frames because it assumes the node has not yet been associated.

MFC after:	1 week
---
 sys/dev/iwi/if_iwi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sys/dev/iwi/if_iwi.c b/sys/dev/iwi/if_iwi.c
index f5ba34f1f6b..68379f4d49e 100644
--- a/sys/dev/iwi/if_iwi.c
+++ b/sys/dev/iwi/if_iwi.c
@@ -1356,7 +1356,7 @@ iwi_checkforqos(struct ieee80211vap *vap,
 
 	wme = NULL;
 	while (frm < efrm) {
-		IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1], return);
+		IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1], break);
 		switch (*frm) {
 		case IEEE80211_ELEMID_VENDOR:
 			if (iswmeoui(frm))

From 9f953145385c0424c33ebb6a77528e21237bba1b Mon Sep 17 00:00:00 2001
From: Bernhard Schmidt <bschmidt@FreeBSD.org>
Date: Thu, 21 Oct 2010 19:30:55 +0000
Subject: [PATCH 52/68] The firmware always sets bits 14 and 15; to get the real
 associd we need to clear those bits.

MFC after:	1 week
---
 sys/dev/iwi/if_iwi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sys/dev/iwi/if_iwi.c b/sys/dev/iwi/if_iwi.c
index 68379f4d49e..62b53be5bcd 100644
--- a/sys/dev/iwi/if_iwi.c
+++ b/sys/dev/iwi/if_iwi.c
@@ -1368,7 +1368,7 @@ iwi_checkforqos(struct ieee80211vap *vap,
 
 	ni = vap->iv_bss;
 	ni->ni_capinfo = capinfo;
-	ni->ni_associd = associd;
+	ni->ni_associd = associd & 0x3fff;
 	if (wme != NULL)
 		ni->ni_flags |= IEEE80211_NODE_QOS;
 	else

From 3ac01bc2ae614d88ee9b1a99fab6bae742b2d44c Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Date: Thu, 21 Oct 2010 19:44:28 +0000
Subject: [PATCH 53/68] Free opencrypto sessions on suspend, as they also might
 keep encryption keys.

---
 sys/geom/eli/g_eli.c | 115 ++++++++++++++++++++++++++++++-------------
 sys/geom/eli/g_eli.h |   1 +
 2 files changed, 81 insertions(+), 35 deletions(-)

diff --git a/sys/geom/eli/g_eli.c b/sys/geom/eli/g_eli.c
index 0d9b9c7c04f..3b9ffdd81b7 100644
--- a/sys/geom/eli/g_eli.c
+++ b/sys/geom/eli/g_eli.c
@@ -314,6 +314,69 @@ g_eli_start(struct bio *bp)
 	}
 }
 
+static int
+g_eli_newsession(struct g_eli_worker *wr)
+{
+	struct g_eli_softc *sc;
+	struct cryptoini crie, cria;
+	int error;
+
+	sc = wr->w_softc;
+
+	bzero(&crie, sizeof(crie));
+	crie.cri_alg = sc->sc_ealgo;
+	crie.cri_klen = sc->sc_ekeylen;
+	if (sc->sc_ealgo == CRYPTO_AES_XTS)
+		crie.cri_klen <<= 1;
+	crie.cri_key = sc->sc_ekeys[0];
+	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
+		bzero(&cria, sizeof(cria));
+		cria.cri_alg = sc->sc_aalgo;
+		cria.cri_klen = sc->sc_akeylen;
+		cria.cri_key = sc->sc_akey;
+		crie.cri_next = &cria;
+	}
+
+	switch (sc->sc_crypto) {
+	case G_ELI_CRYPTO_SW:
+		error = crypto_newsession(&wr->w_sid, &crie,
+		    CRYPTOCAP_F_SOFTWARE);
+		break;
+	case G_ELI_CRYPTO_HW:
+		error = crypto_newsession(&wr->w_sid, &crie,
+		    CRYPTOCAP_F_HARDWARE);
+		break;
+	case G_ELI_CRYPTO_UNKNOWN:
+		error = crypto_newsession(&wr->w_sid, &crie,
+		    CRYPTOCAP_F_HARDWARE);
+		if (error == 0) {
+			mtx_lock(&sc->sc_queue_mtx);
+			if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN)
+				sc->sc_crypto = G_ELI_CRYPTO_HW;
+			mtx_unlock(&sc->sc_queue_mtx);
+		} else {
+			error = crypto_newsession(&wr->w_sid, &crie,
+			    CRYPTOCAP_F_SOFTWARE);
+			mtx_lock(&sc->sc_queue_mtx);
+			if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN)
+				sc->sc_crypto = G_ELI_CRYPTO_SW;
+			mtx_unlock(&sc->sc_queue_mtx);
+		}
+		break;
+	default:
+		panic("%s: invalid condition", __func__);
+	}
+
+	return (error);
+}
+
+static void
+g_eli_freesession(struct g_eli_worker *wr)
+{
+
+	crypto_freesession(wr->w_sid);
+}
+
 static void
 g_eli_cancel(struct g_eli_softc *sc)
 {
@@ -361,6 +424,7 @@ g_eli_worker(void *arg)
 	struct g_eli_softc *sc;
 	struct g_eli_worker *wr;
 	struct bio *bp;
+	int error;
 
 	wr = arg;
 	sc = wr->w_softc;
@@ -388,7 +452,7 @@ again:
 			if (sc->sc_flags & G_ELI_FLAG_DESTROY) {
 				g_eli_cancel(sc);
 				LIST_REMOVE(wr, w_next);
-				crypto_freesession(wr->w_sid);
+				g_eli_freesession(wr);
 				free(wr, M_ELI);
 				G_ELI_DEBUG(1, "Thread %s exiting.",
 				    curthread->td_proc->p_comm);
@@ -411,12 +475,21 @@ again:
 				 * Suspend requested, mark the worker as
 				 * suspended and go to sleep.
 				 */
-				wr->w_active = 0;
+				if (wr->w_active) {
+					g_eli_freesession(wr);
+					wr->w_active = FALSE;
+				}
 				wakeup(&sc->sc_workers);
 				msleep(sc, &sc->sc_queue_mtx, PRIBIO,
 				    "geli:suspend", 0);
-				if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND))
-					wr->w_active = 1;
+				if (!wr->w_active &&
+				    !(sc->sc_flags & G_ELI_FLAG_SUSPEND)) {
+					error = g_eli_newsession(wr);
+					KASSERT(error == 0,
+					    ("g_eli_newsession() failed on resume (error=%d)",
+					    error));
+					wr->w_active = TRUE;
+				}
 				goto again;
 			}
 			msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0);
@@ -630,7 +703,6 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *cp;
-	struct cryptoini crie, cria;
 	u_int i, threads;
 	int error;
 
@@ -658,7 +730,7 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
 		gp->access = g_std_access;
 
 	sc->sc_inflight = 0;
-	sc->sc_crypto = G_ELI_CRYPTO_SW;
+	sc->sc_crypto = G_ELI_CRYPTO_UNKNOWN;
 	sc->sc_flags = md->md_flags;
 	/* Backward compatibility. */
 	if (md->md_version < 4)
@@ -772,20 +844,6 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
 
 	LIST_INIT(&sc->sc_workers);
 
-	bzero(&crie, sizeof(crie));
-	crie.cri_alg = sc->sc_ealgo;
-	crie.cri_klen = sc->sc_ekeylen;
-	if (sc->sc_ealgo == CRYPTO_AES_XTS)
-		crie.cri_klen <<= 1;
-	crie.cri_key = sc->sc_ekeys[0];
-	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
-		bzero(&cria, sizeof(cria));
-		cria.cri_alg = sc->sc_aalgo;
-		cria.cri_klen = sc->sc_akeylen;
-		cria.cri_key = sc->sc_akey;
-		crie.cri_next = &cria;
-	}
-
 	threads = g_eli_threads;
 	if (threads == 0)
 		threads = mp_ncpus;
@@ -805,20 +863,7 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
 		wr->w_number = i;
 		wr->w_active = TRUE;
 
-		/*
-		 * If this is the first pass, try to get hardware support.
-		 * Use software cryptography, if we cannot get it.
-		 */
-		if (LIST_EMPTY(&sc->sc_workers)) {
-			error = crypto_newsession(&wr->w_sid, &crie,
-			    CRYPTOCAP_F_HARDWARE);
-			if (error == 0)
-				sc->sc_crypto = G_ELI_CRYPTO_HW;
-		}
-		if (sc->sc_crypto == G_ELI_CRYPTO_SW) {
-			error = crypto_newsession(&wr->w_sid, &crie,
-			    CRYPTOCAP_F_SOFTWARE);
-		}
+		error = g_eli_newsession(wr);
 		if (error != 0) {
 			free(wr, M_ELI);
 			if (req != NULL) {
@@ -834,7 +879,7 @@ g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
 		error = kproc_create(g_eli_worker, wr, &wr->w_proc, 0, 0,
 		    "g_eli[%u] %s", i, bpp->name);
 		if (error != 0) {
-			crypto_freesession(wr->w_sid);
+			g_eli_freesession(wr);
 			free(wr, M_ELI);
 			if (req != NULL) {
 				gctl_error(req, "Cannot create kernel thread "
diff --git a/sys/geom/eli/g_eli.h b/sys/geom/eli/g_eli.h
index fd53d5fea83..d67e43653c4 100644
--- a/sys/geom/eli/g_eli.h
+++ b/sys/geom/eli/g_eli.h
@@ -113,6 +113,7 @@ extern int g_eli_debug;
 extern u_int g_eli_overwrites;
 extern u_int g_eli_batch;
 
+#define	G_ELI_CRYPTO_UNKNOWN	0
 #define	G_ELI_CRYPTO_HW		1
 #define	G_ELI_CRYPTO_SW		2
 

From 283e21dd3239e07e5c5e4e6871ae3579124a796a Mon Sep 17 00:00:00 2001
From: Gleb Smirnoff <glebius@FreeBSD.org>
Date: Thu, 21 Oct 2010 20:22:00 +0000
Subject: [PATCH 54/68] Fix typo in last commit.

Submitted by:	bcr
---
 usr.bin/unzip/unzip.1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/usr.bin/unzip/unzip.1 b/usr.bin/unzip/unzip.1
index b399a645c15..7fee8522afe 100644
--- a/usr.bin/unzip/unzip.1
+++ b/usr.bin/unzip/unzip.1
@@ -111,7 +111,7 @@ Note that only one of
 and
 .Fl u
 may be specified.
-Is specified filename is
+If specified filename is
 .Va Qq - ,
 then data is read from
 .Va stdin .

From 5e5fd037d6d57c5ecd1b0e48ab58c732f604865f Mon Sep 17 00:00:00 2001
From: Xin LI <delphij@FreeBSD.org>
Date: Thu, 21 Oct 2010 20:31:50 +0000
Subject: [PATCH 55/68] Call chainevh callback when we are invoked with neither
 MOD_LOAD nor MOD_UNLOAD.  This makes it possible to add custom hooks for
 other module events.

Return EOPNOTSUPP when there is no callback available.

Pointed out by:	jhb
Reviewed by:	jhb
MFC after:	1 month
---
 sys/kern/kern_syscalls.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c
index aa3bb34acef..03f6088403d 100644
--- a/sys/kern/kern_syscalls.c
+++ b/sys/kern/kern_syscalls.c
@@ -181,7 +181,9 @@ syscall_module_handler(struct module *mod, int what, void *arg)
 		error = syscall_deregister(data->offset, &data->old_sysent);
 		return (error);
 	default:
-		return EOPNOTSUPP;
+		if (data->chainevh)
+			return (data->chainevh(mod, what, data->chainarg));
+		return (EOPNOTSUPP);
 	}
 
 	/* NOTREACHED */

From 73e5f7634976602a2606b6538f1cfa9f1968151c Mon Sep 17 00:00:00 2001
From: Robert Watson <rwatson@FreeBSD.org>
Date: Thu, 21 Oct 2010 21:08:12 +0000
Subject: [PATCH 56/68] Universally use uintmax_t in syscall_timing; rearrange
 arithmetic to suffer fewer rounding errors with smaller numbers; fix argc
 validation so multiple tests run on a single command line.

Sponsored by:	Google, Inc.
MFC after:	2 weeks
---
 tools/tools/syscall_timing/syscall_timing.c | 129 ++++++++++----------
 1 file changed, 65 insertions(+), 64 deletions(-)

diff --git a/tools/tools/syscall_timing/syscall_timing.c b/tools/tools/syscall_timing/syscall_timing.c
index 8b19e22e196..3a21974adb0 100644
--- a/tools/tools/syscall_timing/syscall_timing.c
+++ b/tools/tools/syscall_timing/syscall_timing.c
@@ -87,10 +87,10 @@ benchmark_stop(void)
 	assert(clock_gettime(CLOCK_REALTIME, &ts_end) == 0);
 }
   
-uint64_t
-test_getuid(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_getuid(uintmax_t num, uintmax_t int_arg, const char *path)
 {
-	uint64_t i;
+	uintmax_t i;
 
 	/*
 	 * Thread-local data should require no locking if system
@@ -106,10 +106,10 @@ test_getuid(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_getppid(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_getppid(uintmax_t num, uintmax_t int_arg, const char *path)
 {
-	uint64_t i;
+	uintmax_t i;
 
 	/*
 	 * This is process-local, but can change, so will require a
@@ -125,11 +125,11 @@ test_getppid(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_clock_gettime(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_clock_gettime(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	struct timespec ts;
-	uint64_t i;
+	uintmax_t i;
 
 	benchmark_start();
 	for (i = 0; i < num; i++) {
@@ -141,8 +141,8 @@ test_clock_gettime(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_pipe(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_pipe(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	int fd[2], i;
 
@@ -169,10 +169,10 @@ test_pipe(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_socket_stream(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_socket_stream(uintmax_t num, uintmax_t int_arg, const char *path)
 {
-	uint64_t i, so;
+	uintmax_t i, so;
 
 	so = socket(int_arg, SOCK_STREAM, 0);
 	if (so < 0)
@@ -191,10 +191,10 @@ test_socket_stream(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_socket_dgram(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_socket_dgram(uintmax_t num, uintmax_t int_arg, const char *path)
 {
-	uint64_t i, so;
+	uintmax_t i, so;
 
 	so = socket(int_arg, SOCK_DGRAM, 0);
 	if (so < 0)
@@ -213,10 +213,10 @@ test_socket_dgram(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_socketpair_stream(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_socketpair_stream(uintmax_t num, uintmax_t int_arg, const char *path)
 {
-	uint64_t i;
+	uintmax_t i;
 	int so[2];
 
 	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1)
@@ -236,10 +236,10 @@ test_socketpair_stream(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_socketpair_dgram(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_socketpair_dgram(uintmax_t num, uintmax_t int_arg, const char *path)
 {
-	uint64_t i;
+	uintmax_t i;
 	int so[2];
 
 	if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1)
@@ -259,10 +259,10 @@ test_socketpair_dgram(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_open_close(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_open_close(uintmax_t num, uintmax_t int_arg, const char *path)
 {
-	uint64_t i;
+	uintmax_t i;
 	int fd;
 
 	fd = open(path, O_RDONLY);
@@ -283,11 +283,11 @@ test_open_close(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_open_read_close(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_open_read_close(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	char buf[int_arg];
-	uint64_t i;
+	uintmax_t i;
 	int fd;
 
 	fd = open(path, O_RDONLY);
@@ -310,8 +310,8 @@ test_open_read_close(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_dup(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_dup(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	int fd, i, shmfd;
 
@@ -334,10 +334,10 @@ test_dup(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_shmfd(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_shmfd(uintmax_t num, uintmax_t int_arg, const char *path)
 {
-	uint64_t i, shmfd;
+	uintmax_t i, shmfd;
 
 	shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
 	if (shmfd < 0)
@@ -356,11 +356,11 @@ test_shmfd(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_fstat_shmfd(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_fstat_shmfd(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	struct stat sb;
-	uint64_t i, shmfd;
+	uintmax_t i, shmfd;
 
 	shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600);
 	if (shmfd < 0)
@@ -378,11 +378,11 @@ test_fstat_shmfd(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_fork(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_fork(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	pid_t pid;
-	uint64_t i;
+	uintmax_t i;
 
 	pid = fork();
 	if (pid < 0)
@@ -407,11 +407,11 @@ test_fork(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_vfork(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_vfork(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	pid_t pid;
-	uint64_t i;
+	uintmax_t i;
 
 	pid = vfork();
 	if (pid < 0)
@@ -440,11 +440,11 @@ test_vfork(uint64_t num, uint64_t int_arg, const char *path)
 static char *execve_args[] = { USR_BIN_TRUE, NULL};
 extern char **environ;
 
-uint64_t
-test_fork_exec(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_fork_exec(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	pid_t pid;
-	uint64_t i;
+	uintmax_t i;
 
 	pid = fork();
 	if (pid < 0)
@@ -473,11 +473,11 @@ test_fork_exec(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_vfork_exec(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_vfork_exec(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	pid_t pid;
-	uint64_t i;
+	uintmax_t i;
 
 	pid = vfork();
 	if (pid < 0)
@@ -506,10 +506,10 @@ test_vfork_exec(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_chroot(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_chroot(uintmax_t num, uintmax_t int_arg, const char *path)
 {
-	uint64_t i;
+	uintmax_t i;
 
 	if (chroot("/") < 0)
 		err(-1, "test_chroot: chroot");
@@ -524,11 +524,11 @@ test_chroot(uint64_t num, uint64_t int_arg, const char *path)
 	return (i);
 }
 
-uint64_t
-test_setuid(uint64_t num, uint64_t int_arg, const char *path)
+uintmax_t
+test_setuid(uintmax_t num, uintmax_t int_arg, const char *path)
 {
 	uid_t uid;
-	uint64_t i;
+	uintmax_t i;
 
 	uid = getuid();
 	if (setuid(uid) < 0)
@@ -546,8 +546,8 @@ test_setuid(uint64_t num, uint64_t int_arg, const char *path)
 
 struct test {
 	const char	*t_name;
-	uint64_t	(*t_func)(uint64_t, uint64_t, const char *);
-	uint64_t	 t_int;
+	uintmax_t	(*t_func)(uintmax_t, uintmax_t, const char *);
+	uintmax_t	 t_int;
 };
 
 static const struct test tests[] = {
@@ -601,7 +601,7 @@ main(int argc, char *argv[])
 	long long ll;
 	char *endp;
 	int ch, i, j, k;
-	uint64_t iterations, loops;
+	uintmax_t iterations, loops;
 
 	alarm_timeout = 0;
 	iterations = 0;
@@ -649,16 +649,16 @@ main(int argc, char *argv[])
 	if (loops < 1)
 		loops = 1;
 
-	if (argc != 1)
+	if (argc < 1)
 		usage();
 
 	assert(clock_getres(CLOCK_REALTIME, &ts_res) == 0);
-	printf("Clock resolution: %ju.%08ju\n", (uintmax_t)ts_res.tv_sec,
+	printf("Clock resolution: %ju.%09ju\n", (uintmax_t)ts_res.tv_sec,
 	    (uintmax_t)ts_res.tv_nsec);
 	printf("test\tloop\ttotal\titerations\tperiteration\n");
 
 	for (j = 0; j < argc; j++) {
-		uint64_t calls;
+		uintmax_t calls, nsecsperit;
 
 		the_test = NULL;
 		for (i = 0; i < tests_count; i++) {
@@ -687,9 +687,10 @@ main(int argc, char *argv[])
 		 * the room in our arithmetic unit.  Fine for system calls,
 		 * but not for long things.
 		 */
-			ts_end.tv_sec *= 1000000000 / calls;
-			printf("0.%09ju\n", (uintmax_t)(ts_end.tv_sec +
-			    ts_end.tv_nsec / calls));
+			nsecsperit = ts_end.tv_sec * 1000000000;
+			nsecsperit += ts_end.tv_nsec;
+			nsecsperit /= calls;
+			printf("0.%09ju\n", (uintmax_t)nsecsperit);
 		}
 	}
 	return (0);

From c9489e0d75e65b3dc1fcdf195a7695c3563b1f5e Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@FreeBSD.org>
Date: Thu, 21 Oct 2010 22:33:50 +0000
Subject: [PATCH 57/68] Minor tweaks in compression support: o We need an eval
 here to get the right expansion of the command o bs=128k doesn't work in some
 cases, so eliminate it and cope with the   minor performance hit.

Submitted by:	john hixson
---
 usr.sbin/pc-sysinstall/backend/functions.sh | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/usr.sbin/pc-sysinstall/backend/functions.sh b/usr.sbin/pc-sysinstall/backend/functions.sh
index 46b5a02f4fc..2b78bbc57c4 100755
--- a/usr.sbin/pc-sysinstall/backend/functions.sh
+++ b/usr.sbin/pc-sysinstall/backend/functions.sh
@@ -138,7 +138,7 @@ rc_halt()
   fi
 
   echo "Running: ${CMD}" >>${LOGOUT}
-  ${CMD} >>${LOGOUT} 2>>${LOGOUT}
+  eval ${CMD} >>${LOGOUT} 2>>${LOGOUT}
   STATUS="$?"
   if [ "${STATUS}" != "0" ]
   then
@@ -364,37 +364,37 @@ write_image()
 
     case "${COMPRESSION}" in
       lzw)
-        rc_halt "uncompress ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        rc_halt "uncompress ${IMAGE_FILE} -c | dd of=${DEVICE_FILE}"
         IMAGE_FILE="${IMAGE_FILE%.Z}"
         ;;
 
       lzo)
-        rc_halt "lzop -d $IMAGE_{FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        rc_halt "lzop -d ${IMAGE_FILE} -c | dd of=${DEVICE_FILE}"
         IMAGE_FILE="${IMAGE_FILE%.lzo}"
         ;;
 
       lzma)
-        rc_halt "lzma -d ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        rc_halt "lzma -d ${IMAGE_FILE} -c | dd of=${DEVICE_FILE}"
         IMAGE_FILE="${IMAGE_FILE%.lzma}"
         ;;
 
       gzip)
-        rc_halt "gunzip ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        rc_halt "gunzip ${IMAGE_FILE} -c | dd of=${DEVICE_FILE}"
         IMAGE_FILE="${IMAGE_FILE%.gz}"
         ;;
 
       bzip2)
-        rc_halt "bunzip2 ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        rc_halt "bunzip2 ${IMAGE_FILE} -c | dd of=${DEVICE_FILE}"
         IMAGE_FILE="${IMAGE_FILE%.bz2}"
         ;;
 
       xz)
-        rc_halt "xz -d ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        rc_halt "xz -d ${IMAGE_FILE} -c | dd of=${DEVICE_FILE}"
         IMAGE_FILE="${IMAGE_FILE%.xz}"
         ;;
 
       zip)
-        rc_halt "unzip ${IMAGE_FILE} -c | dd of=${DEVICE_FILE} bs=128k"
+        rc_halt "unzip ${IMAGE_FILE} -c | dd of=${DEVICE_FILE}"
         IMAGE_FILE="${IMAGE_FILE%.zip}"
         ;;
 
@@ -404,7 +404,7 @@ write_image()
     esac
 
   else
-    rc_halt "dd if=${IMAGE_FILE} of=${DEVICE_FILE} bs=128k"
+    rc_halt "dd if=${IMAGE_FILE} of=${DEVICE_FILE}"
 
   fi
 };

From ce2502c9f8b02999e14436e138b249bef3860308 Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@FreeBSD.org>
Date: Thu, 21 Oct 2010 22:46:10 +0000
Subject: [PATCH 58/68] More support for IMAGE installations

---
 usr.sbin/pc-sysinstall/backend/functions-bsdlabel.sh | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/usr.sbin/pc-sysinstall/backend/functions-bsdlabel.sh b/usr.sbin/pc-sysinstall/backend/functions-bsdlabel.sh
index f5cf86ae0eb..e8017140ade 100755
--- a/usr.sbin/pc-sysinstall/backend/functions-bsdlabel.sh
+++ b/usr.sbin/pc-sysinstall/backend/functions-bsdlabel.sh
@@ -219,9 +219,17 @@ setup_mbr_partitions()
         USINGENCROOT="0" ; export USINGENCROOT
       fi
           
+      if [ -n "${IMAGE}" ]
+      then
+        FS="IMAGE"
+        SIZE=`ls -l "${IMAGE}" | awk '{ print $5 }'`
+        MNT=`echo $STRING | tr -s '\t' ' ' | cut -d ' ' -f 2`
+		SIZE=`convert_byte_to_megabyte $SIZE`
+      fi
+
       # Now check that these values are sane
       case $FS in
-        UFS|UFS+S|UFS+J|UFS+SUJ|ZFS|SWAP) ;;
+        UFS|UFS+S|UFS+J|UFS+SUJ|ZFS|SWAP|IMAGE) ;;
        *) exit_err "ERROR: Invalid file system specified on $line" ;;
       esac
 

From c2176d895d15deb4d2eb31e0a11782d41383ff83 Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@FreeBSD.org>
Date: Thu, 21 Oct 2010 23:08:42 +0000
Subject: [PATCH 59/68] Fix two typos

Submitted by:	Benedict Reuschling
---
 usr.sbin/pc-sysinstall/backend/functions-extractimage.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/usr.sbin/pc-sysinstall/backend/functions-extractimage.sh b/usr.sbin/pc-sysinstall/backend/functions-extractimage.sh
index 3077f2a5e7a..f9be5eaad2a 100755
--- a/usr.sbin/pc-sysinstall/backend/functions-extractimage.sh
+++ b/usr.sbin/pc-sysinstall/backend/functions-extractimage.sh
@@ -70,7 +70,7 @@ start_extract_uzip_tar()
       if [ "$?" != "0" ]
       then
         cd /
-        echo "TAR failure occured:" >>${LOGOUT}
+        echo "TAR failure occurred:" >>${LOGOUT}
         cat ${FSMNT}/.tar-extract.log | grep "tar:" >>${LOGOUT}
         umount ${FSMNT}.uzip
         mdconfig -d -u ${MDDEVICE}
@@ -275,7 +275,7 @@ EOF
   INSFILE="${OUTFILE}" ; export INSFILE
 }
 
-# Function which does the rsync download from the server specifed in cfg
+# Function which does the rsync download from the server specified in cfg
 start_rsync_copy()
 {
   # Load our rsync config values

From f1bed2243782cb677f8948fd942f1a3e111a7736 Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@FreeBSD.org>
Date: Fri, 22 Oct 2010 00:10:48 +0000
Subject: [PATCH 60/68] Remove unnecessary variable.

Submitted by:	 Alex Kozlov
---
 usr.sbin/pc-sysinstall/backend-query/query-langs.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/usr.sbin/pc-sysinstall/backend-query/query-langs.sh b/usr.sbin/pc-sysinstall/backend-query/query-langs.sh
index 419d6263117..2ce60662403 100755
--- a/usr.sbin/pc-sysinstall/backend-query/query-langs.sh
+++ b/usr.sbin/pc-sysinstall/backend-query/query-langs.sh
@@ -25,8 +25,6 @@
 #
 # $FreeBSD$
 
-FOUND="0"
-
 cat ${PROGDIR}/conf/avail-langs
 
 exit 0

From 56f8b30888f0bf74e64a19edc222fe6a31881a70 Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@FreeBSD.org>
Date: Fri, 22 Oct 2010 00:11:55 +0000
Subject: [PATCH 61/68] More simplifications

Submitted by:	 Alex Kozlov
---
 .../pc-sysinstall/backend-query/disk-info.sh  | 19 +++++--------------
 .../pc-sysinstall/backend-query/test-live.sh  | 11 ++---------
 2 files changed, 7 insertions(+), 23 deletions(-)

diff --git a/usr.sbin/pc-sysinstall/backend-query/disk-info.sh b/usr.sbin/pc-sysinstall/backend-query/disk-info.sh
index 222e8c5423c..8c55fce7142 100755
--- a/usr.sbin/pc-sysinstall/backend-query/disk-info.sh
+++ b/usr.sbin/pc-sysinstall/backend-query/disk-info.sh
@@ -26,25 +26,16 @@
 # $FreeBSD$
 
 # Query a disk for partitions and display them
-#############################
+#############################################################################
 
-. ${PROGDIR}/backend/functions.sh
 . ${PROGDIR}/backend/functions-disk.sh
 
-if [ -z "${1}" ]
-then
-  echo "Error: No disk specified!"
-  exit 1
-fi
-
-if [ ! -e "/dev/${1}" ]
-then
-  echo "Error: Disk /dev/${1} does not exist!"
-  exit 1
-fi
-
 DISK="${1}"
 
+[ -z "${DISK}" ] && { echo 'Error: No disk specified!'; exit 1; }
+[ ! -e "/dev/${DISK}" ] && \
+	{ echo "Error: Disk /dev/${DISK} does not exist!"; exit 1; }
+
 get_disk_cyl "${DISK}"
 CYLS="${VAL}"
 
diff --git a/usr.sbin/pc-sysinstall/backend-query/test-live.sh b/usr.sbin/pc-sysinstall/backend-query/test-live.sh
index b48c2eecf3a..86acc30705c 100755
--- a/usr.sbin/pc-sysinstall/backend-query/test-live.sh
+++ b/usr.sbin/pc-sysinstall/backend-query/test-live.sh
@@ -28,13 +28,6 @@
 # Script which checks if we are running from install media, or real system
 #############################################################################
 
-dmesg | grep "md0: Preloaded image" >/dev/null 2>/dev/null
-if [ "$?" = "0" ]
-then
-  echo "INSTALL-MEDIA"
-  exit 0
-else
-  echo "REAL-DISK"
-  exit 1
-fi
+dmesg | grep -q 'md0: Preloaded image' || { echo 'REAL-DISK'; exit 1; }
 
+echo 'INSTALL-MEDIA'

From 30ec71ad048696e11f4ff028061f2668490288c1 Mon Sep 17 00:00:00 2001
From: Marcel Moolenaar <marcel@FreeBSD.org>
Date: Fri, 22 Oct 2010 04:43:04 +0000
Subject: [PATCH 62/68] Unbreak ia64.

With r169630 I disabled symbol versioning because it broke rtld.  With
r211706 rtld got broken for ia64 & powerpc64.  It was fixed for powerpc64
with r212497.  In between, r211749 removed the exports table because the
version script handled the exports.  But wait, symbol versioning was
disabled on ia64.

With exports controlled by the version script and symbol versioning
disabled, all symbols are exported and too many symbols bind to the
definition in rtld. Let's just say that weird things happen.

So, enable symbol versioning on ia64 and apply a work-around for the
SIGSEGV that triggered r169630 to begin with: when rtld relocates
itself, it comes across r_debug_state and for some reason can't find the
definition. This causes a failure, relocation aborts and null pointers
galore. The work-around is to ignore the missing definition when rtld
is relocating itself and keep going.

Maybe with the next binutils this will all go away. Maybe not, in
which case I still need to figure out why r_debug_state cannot be found.

BTW: r_debug_state is in the symbol map -- I don't think any other rtld
symbols that rtld references are in the symbol map...
---
 libexec/rtld-elf/Makefile     |  2 --
 libexec/rtld-elf/ia64/reloc.c | 19 ++++++++++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/libexec/rtld-elf/Makefile b/libexec/rtld-elf/Makefile
index a798f3aeaca..62aaaab96a9 100644
--- a/libexec/rtld-elf/Makefile
+++ b/libexec/rtld-elf/Makefile
@@ -34,7 +34,6 @@ LDFLAGS+=	-shared -Wl,-Bsymbolic
 DPADD=		${LIBC_PIC}
 LDADD=		-lc_pic -lssp_nonshared
 
-.if ${MACHINE_CPUARCH} != "ia64"
 .if ${MK_SYMVER} == "yes"
 LIBCDIR=	${.CURDIR}/../../lib/libc
 VERSION_DEF=	${LIBCDIR}/Versions.def
@@ -48,7 +47,6 @@ ${PROG}:	${VERSION_MAP}
 SYMBOL_MAPS+=	${.CURDIR}/${RTLD_ARCH}/Symbol.map
 .endif
 .endif
-.endif
 
 .sinclude "${.CURDIR}/${RTLD_ARCH}/Makefile.inc"
 
diff --git a/libexec/rtld-elf/ia64/reloc.c b/libexec/rtld-elf/ia64/reloc.c
index 9a8d0670f63..728fe3064f7 100644
--- a/libexec/rtld-elf/ia64/reloc.c
+++ b/libexec/rtld-elf/ia64/reloc.c
@@ -195,9 +195,22 @@ reloc_non_plt_obj(Obj_Entry *obj_rtld, Obj_Entry *obj, const Elf_Rela *rela,
 		int sym_index;
 
 		def = find_symdef(ELF_R_SYM(rela->r_info), obj, &defobj,
-				  false, cache);
-		if (def == NULL)
-			return -1;
+				  true, cache);
+		if (def == NULL) {
+			/*
+			 * XXX r_debug_state is problematic and find_symdef()
+			 * returns NULL for it. This probably has something to
+			 * do with symbol versioning (r_debug_state is in the
+			 * symbol map). If we return -1 in that case we abort
+			 * relocating rtld, which typically is fatal. So, for
+			 * now just skip the symbol when we're relocating
+			 * rtld. We don't care about r_debug_state unless we
+			 * are being debugged.
+			 */
+			if (obj != obj_rtld)
+				return -1;
+			break;
+		}
 
 		if (def->st_shndx != SHN_UNDEF) {
 			target = (Elf_Addr)(defobj->relocbase + def->st_value);

From a10502a58bcca6b4b6168e26d58563f3c4ef5b6f Mon Sep 17 00:00:00 2001
From: Benedict Reuschling <bcr@FreeBSD.org>
Date: Fri, 22 Oct 2010 08:51:49 +0000
Subject: [PATCH 63/68] Revert to r214147, errno is not clobbered as originally
 thought.

---
 lib/libc/stdlib/strtonum.3 | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/libc/stdlib/strtonum.3 b/lib/libc/stdlib/strtonum.3
index 7ca88c53316..b83aadda42e 100644
--- a/lib/libc/stdlib/strtonum.3
+++ b/lib/libc/stdlib/strtonum.3
@@ -83,8 +83,6 @@ is set, and
 .Fa errstr
 will point to an error message.
 On success,
-.Va errno
-is set to 0 and
 .Fa *errstr
 will be set to
 .Dv NULL ;

From 1c2b97e084b8cc025347d8a68397aa918c4154ee Mon Sep 17 00:00:00 2001
From: Robert Watson <rwatson@FreeBSD.org>
Date: Fri, 22 Oct 2010 11:22:19 +0000
Subject: [PATCH 64/68] Validate syscall_timing test names before starting to
 provide earlier feedback regarding user error.

Provide default loop and timing settings.

Add a new test that just times pread() without the open()/close().

Mark tests requiring a path argument so we can provide better feedback
to the user than EFAULT on (null).

Sponsored by:	Google, Inc.
MFC after:	2 weeks
---
 tools/tools/syscall_timing/syscall_timing.c | 80 +++++++++++++++++----
 1 file changed, 68 insertions(+), 12 deletions(-)

diff --git a/tools/tools/syscall_timing/syscall_timing.c b/tools/tools/syscall_timing/syscall_timing.c
index 3a21974adb0..0bab9830f70 100644
--- a/tools/tools/syscall_timing/syscall_timing.c
+++ b/tools/tools/syscall_timing/syscall_timing.c
@@ -283,6 +283,29 @@ test_open_close(uintmax_t num, uintmax_t int_arg, const char *path)
 	return (i);
 }
 
+uintmax_t
+test_read(uintmax_t num, uintmax_t int_arg, const char *path)
+{
+	char buf[int_arg];
+	uintmax_t i;
+	int fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		err(-1, "test_read: %s", path);
+	(void)pread(fd, buf, int_arg, 0);	/* Untimed warm-up read. */
+
+	benchmark_start();
+	for (i = 0; i < num; i++) {
+		if (alarm_fired)
+			break;
+		(void)pread(fd, buf, int_arg, 0);
+	}
+	benchmark_stop();
+	close(fd);
+	return (i);	/* Timed pread() calls actually issued. */
+}
+
 uintmax_t
 test_open_read_close(uintmax_t num, uintmax_t int_arg, const char *path)
 {
@@ -547,9 +570,12 @@ test_setuid(uintmax_t num, uintmax_t int_arg, const char *path)
 struct test {
 	const char	*t_name;
 	uintmax_t	(*t_func)(uintmax_t, uintmax_t, const char *);
+	int		 t_flags;
 	uintmax_t	 t_int;
 };
 
+#define	FLAG_PATH	0x00000001
+
 static const struct test tests[] = {
 	{ "getuid", test_getuid },
 	{ "getppid", test_getppid },
@@ -561,13 +587,28 @@ static const struct test tests[] = {
 	{ "socketpair_dgram", test_socketpair_dgram },
 	{ "socket_tcp", test_socket_stream, .t_int = PF_INET },
 	{ "socket_udp", test_socket_dgram, .t_int = PF_INET },
-	{ "open_close", test_open_close },
-	{ "open_read_close_1", test_open_read_close, .t_int = 1 },
-	{ "open_read_close_10", test_open_read_close, .t_int = 10 },
-	{ "open_read_close_100", test_open_read_close, .t_int = 100 },
-	{ "open_read_close_1000", test_open_read_close, .t_int = 1000 },
-	{ "open_read_close_10000", test_open_read_close, .t_int = 10000 },
-	{ "open_read_close_100000", test_open_read_close, .t_int = 100000 },
+	{ "open_close", test_open_close, .t_flags = FLAG_PATH },
+	{ "open_read_close_1", test_open_read_close, .t_flags = FLAG_PATH,
+	    .t_int = 1 },
+	{ "open_read_close_10", test_open_read_close, .t_flags = FLAG_PATH,
+	    .t_int = 10 },
+	{ "open_read_close_100", test_open_read_close, .t_flags = FLAG_PATH,
+	    .t_int = 100 },
+	{ "open_read_close_1000", test_open_read_close, .t_flags = FLAG_PATH,
+	    .t_int = 1000 },
+	{ "open_read_close_10000", test_open_read_close,
+	    .t_flags = FLAG_PATH, .t_int = 10000 },
+	{ "open_read_close_100000", test_open_read_close,
+	    .t_flags = FLAG_PATH, .t_int = 100000 },
+	{ "open_read_close_1000000", test_open_read_close,
+	    .t_flags = FLAG_PATH, .t_int = 1000000 },
+	{ "read_1", test_read, .t_flags = FLAG_PATH, .t_int = 1 },
+	{ "read_10", test_read, .t_flags = FLAG_PATH, .t_int = 10 },
+	{ "read_100", test_read, .t_flags = FLAG_PATH, .t_int = 100 },
+	{ "read_1000", test_read, .t_flags = FLAG_PATH, .t_int = 1000 },
+	{ "read_10000", test_read, .t_flags = FLAG_PATH, .t_int = 10000 },
+	{ "read_100000", test_read, .t_flags = FLAG_PATH, .t_int = 100000 },
+	{ "read_1000000", test_read, .t_flags = FLAG_PATH, .t_int = 1000000 },
 	{ "dup", test_dup },
 	{ "shmfd", test_shmfd },
 	{ "fstat_shmfd", test_fstat_shmfd },
@@ -603,9 +644,9 @@ main(int argc, char *argv[])
 	int ch, i, j, k;
 	uintmax_t iterations, loops;
 
-	alarm_timeout = 0;
+	alarm_timeout = 1;
 	iterations = 0;
-	loops = 0;
+	loops = 10;
 	path = NULL;
 	while ((ch = getopt(argc, argv, "i:l:p:s:")) != -1) {
 		switch (ch) {
@@ -652,10 +693,27 @@ main(int argc, char *argv[])
 	if (argc < 1)
 		usage();
 
+	/*
+	 * Validate test list and that, if a path is required, it is
+	 * defined.
+	 */
+	for (j = 0; j < argc; j++) {
+		the_test = NULL;
+		for (i = 0; i < tests_count; i++) {
+			if (strcmp(argv[j], tests[i].t_name) == 0)
+				the_test = &tests[i];
+		}
+		if (the_test == NULL)
+			usage();
+		if ((the_test->t_flags & FLAG_PATH) && (path == NULL)) {
+			errx(-1, "%s requires -p", the_test->t_name);
+		}
+	}
+
 	assert(clock_getres(CLOCK_REALTIME, &ts_res) == 0);
 	printf("Clock resolution: %ju.%09ju\n", (uintmax_t)ts_res.tv_sec,
 	    (uintmax_t)ts_res.tv_nsec);
-	printf("test\tloop\ttotal\titerations\tperiteration\n");
+	printf("test\tloop\ttime\titerations\tperiteration\n");
 
 	for (j = 0; j < argc; j++) {
 		uintmax_t calls, nsecsperit;
@@ -665,8 +723,6 @@ main(int argc, char *argv[])
 			if (strcmp(argv[j], tests[i].t_name) == 0)
 				the_test = &tests[i];
 		}
-		if (the_test == NULL)
-			usage();
 
 		/*
 		 * Run one warmup, then do the real thing (loops) times.

From ba577448a2c9ea9bde8f575ddcc12fcbb16619df Mon Sep 17 00:00:00 2001
From: John Baldwin <jhb@FreeBSD.org>
Date: Fri, 22 Oct 2010 11:42:02 +0000
Subject: [PATCH 65/68] - Add a new PCI quirk to whitelist an old chipset that
 doesn't support   PCI-express or PCI-X capabilities if we are running in a
 virtual machine. - Whitelist the Intel 82440 chipset used by QEMU.

Tested by:	jfv
MFC after:	1 week
---
 sys/dev/pci/pci.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index 53313427492..ef80f815d94 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -182,6 +182,7 @@ struct pci_quirk {
 	int	type;
 #define	PCI_QUIRK_MAP_REG	1 /* PCI map register in weird place */
 #define	PCI_QUIRK_DISABLE_MSI	2 /* MSI/MSI-X doesn't work */
+#define	PCI_QUIRK_ENABLE_MSI_VM	3 /* Older chipset in VM where MSI works */
 	int	arg1;
 	int	arg2;
 };
@@ -218,6 +219,12 @@ struct pci_quirk pci_quirks[] = {
 	 */
 	{ 0x74501022, PCI_QUIRK_DISABLE_MSI,	0,	0 },
 
+	/*
+	 * Some virtualization environments emulate an older chipset
+	 * but support MSI just fine.  QEMU uses the Intel 82440.
+	 */
+	{ 0x12378086, PCI_QUIRK_ENABLE_MSI_VM,	0,	0 },
+
 	{ 0 }
 };
 
@@ -1833,6 +1840,23 @@ pci_msi_device_blacklisted(device_t dev)
 	return (0);
 }
 
+/*
+ * Returns 1 if pci_quirks tags the device's ID as PCI_QUIRK_ENABLE_MSI_VM
+ * (a chipset that supports MSI when emulated in a virtual machine), else 0.
+ */
+static int
+pci_msi_vm_chipset(device_t dev)
+{
+	struct pci_quirk *q;
+
+	for (q = &pci_quirks[0]; q->devid; q++) {
+		if (q->devid == pci_get_devid(dev) &&
+		    q->type == PCI_QUIRK_ENABLE_MSI_VM)
+			return (1);
+	}
+	return (0);
+}
+
 /*
  * Determine if MSI is blacklisted globally on this sytem.  Currently,
  * we just check for blacklisted chipsets as represented by the
@@ -1849,8 +1873,14 @@ pci_msi_blacklisted(void)
 		return (0);
 
 	/* Blacklist all non-PCI-express and non-PCI-X chipsets. */
-	if (!(pcie_chipset || pcix_chipset))
+	if (!(pcie_chipset || pcix_chipset)) {
+		if (vm_guest != VM_GUEST_NO) {
+			dev = pci_find_bsf(0, 0, 0);
+			if (dev != NULL)
+				return (pci_msi_vm_chipset(dev) == 0);
+		}
 		return (1);
+	}
 
 	dev = pci_find_bsf(0, 0, 0);
 	if (dev != NULL)

From 104d506dddcaaa08161cb4c5b43606d82403ba1c Mon Sep 17 00:00:00 2001
From: Roman Divacky <rdivacky@FreeBSD.org>
Date: Fri, 22 Oct 2010 18:07:21 +0000
Subject: [PATCH 66/68] Avoid using memcpy() for copying 32bit chunks. This
 shrinks the resulting code a little.

Approved by:    rpaulo (mentor)
Reviewed by:    jhb
---
 sys/boot/i386/boot2/boot2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sys/boot/i386/boot2/boot2.c b/sys/boot/i386/boot2/boot2.c
index f521fd7e071..307d4c5df08 100644
--- a/sys/boot/i386/boot2/boot2.c
+++ b/sys/boot/i386/boot2/boot2.c
@@ -348,7 +348,7 @@ load(void)
 	    return;
 	p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE);
 	bootinfo.bi_symtab = VTOP(p);
-	memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms));
+	*(uint32_t*)p = hdr.ex.a_syms;
 	p += sizeof(hdr.ex.a_syms);
 	if (hdr.ex.a_syms) {
 	    if (xfsread(ino, p, hdr.ex.a_syms))
@@ -385,7 +385,7 @@ load(void)
 	    if (xfsread(ino, &es, sizeof(es)))
 		return;
 	    for (i = 0; i < 2; i++) {
-		memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size));
+		*(Elf32_Word *)p = es[i].sh_size;
 		p += sizeof(es[i].sh_size);
 		fs_off = es[i].sh_offset;
 		if (xfsread(ino, p, es[i].sh_size))

From f6a65488854e413eaaea842fc5191e394cc8c24e Mon Sep 17 00:00:00 2001
From: Pyun YongHyeon <yongari@FreeBSD.org>
Date: Fri, 22 Oct 2010 18:31:44 +0000
Subject: [PATCH 67/68] Enable TX MAC state machine lockup fix for both BCM5755
 or higher and BCM5906. Publicly available data sheet just says it may happen
 due to corrupted TxMbuf.

---
 sys/dev/bge/if_bge.c    | 7 ++++++-
 sys/dev/bge/if_bgereg.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c
index 5380931ad74..139df3d0855 100644
--- a/sys/dev/bge/if_bge.c
+++ b/sys/dev/bge/if_bge.c
@@ -4409,6 +4409,7 @@ bge_init_locked(struct bge_softc *sc)
 {
 	struct ifnet *ifp;
 	uint16_t *m;
+	uint32_t mode;
 
 	BGE_LOCK_ASSERT(sc);
 
@@ -4514,8 +4515,12 @@ bge_init_locked(struct bge_softc *sc)
 	/* Init TX ring. */
 	bge_init_tx_ring(sc);
 
+	/* Enable TX MAC state machine lockup fix. */
+	mode = CSR_READ_4(sc, BGE_TX_MODE);
+	if (BGE_IS_5755_PLUS(sc) || sc->bge_asicrev == BGE_ASICREV_BCM5906)
+		mode |= BGE_TXMODE_MBUF_LOCKUP_FIX;
 	/* Turn on transmitter. */
-	BGE_SETBIT(sc, BGE_TX_MODE, BGE_TXMODE_ENABLE);
+	CSR_WRITE_4(sc, BGE_TX_MODE, mode | BGE_TXMODE_ENABLE);
 
 	/* Turn on receiver. */
 	BGE_SETBIT(sc, BGE_RX_MODE, BGE_RXMODE_ENABLE);
diff --git a/sys/dev/bge/if_bgereg.h b/sys/dev/bge/if_bgereg.h
index 8e337c693fb..8211df183c8 100644
--- a/sys/dev/bge/if_bgereg.h
+++ b/sys/dev/bge/if_bgereg.h
@@ -765,6 +765,7 @@
 #define	BGE_TXMODE_FLOWCTL_ENABLE	0x00000010
 #define	BGE_TXMODE_BIGBACKOFF_ENABLE	0x00000020
 #define	BGE_TXMODE_LONGPAUSE_ENABLE	0x00000040
+#define	BGE_TXMODE_MBUF_LOCKUP_FIX	0x00000100
 
 /* Transmit MAC status register */
 #define	BGE_TXSTAT_RX_XOFFED		0x00000001

From 8d5f71818fa6c067477d6aa9d0c96d5acbf990c3 Mon Sep 17 00:00:00 2001
From: Pyun YongHyeon <yongari@FreeBSD.org>
Date: Fri, 22 Oct 2010 19:30:56 +0000
Subject: [PATCH 68/68] Add workaround for BCM5906 A1 controller silicon bug.
 When auto-negotiation results in half-duplex operation, excess collision on
 the ethernet link may cause internal chip delays that may result in
 subsequent valid frames being dropped due to insufficient receive buffer
 resources. The workaround is to choose de-pipeline method as a flow control
 decision for SDI. De-pipeline method allows only 1 data in TxMbuf at a time
 such that a request to RDMA from SDI is made only when TxMbuf is empty.
 Thanks to david for providing detailed errata information.

---
 sys/dev/bge/if_bge.c    | 5 +++++
 sys/dev/bge/if_bgereg.h | 1 +
 2 files changed, 6 insertions(+)

diff --git a/sys/dev/bge/if_bge.c b/sys/dev/bge/if_bge.c
index 139df3d0855..95de34cb0bb 100644
--- a/sys/dev/bge/if_bge.c
+++ b/sys/dev/bge/if_bge.c
@@ -1693,6 +1693,11 @@ bge_blockinit(struct bge_softc *sc)
 		bge_writembx(sc, BGE_MBX_RX_MINI_PROD_LO, 0);
 	}
 
+	/* Choose de-pipeline mode for BCM5906 A1. */
+	if (sc->bge_asicrev == BGE_ASICREV_BCM5906 &&
+	    sc->bge_chiprev == BGE_CHIPID_BCM5906_A1)
+		CSR_WRITE_4(sc, BGE_ISO_PKT_TX,
+		    (CSR_READ_4(sc, BGE_ISO_PKT_TX) & ~3) | 2);
 	/*
 	 * The BD ring replenish thresholds control how often the
 	 * hardware fetches new BD's from the producer rings in host
diff --git a/sys/dev/bge/if_bgereg.h b/sys/dev/bge/if_bgereg.h
index 8211df183c8..a4f3f834b61 100644
--- a/sys/dev/bge/if_bgereg.h
+++ b/sys/dev/bge/if_bgereg.h
@@ -880,6 +880,7 @@
 #define	BGE_SDI_STATS_CTL		0x0C08
 #define	BGE_SDI_STATS_ENABLE_MASK	0x0C0C
 #define	BGE_SDI_STATS_INCREMENT_MASK	0x0C10
+#define	BGE_ISO_PKT_TX			0x0C20
 #define	BGE_LOCSTATS_COS0		0x0C80
 #define	BGE_LOCSTATS_COS1		0x0C84
 #define	BGE_LOCSTATS_COS2		0x0C88