- Merge soft-updates journaling from projects/suj/head into head. This

brings in support for an optional intent log which eliminates the need for background fsck on unclean shutdown. Sponsored by: iXsystems, Yahoo!, and Juniper. With help from: McKusick and Peter Holm
2026-05-28 04:12:45 -04:00 · 2010-04-24 07:05:35 +00:00 · 2010-04-24 07:05:35 +00:00 · 113db2dddb
commit 113db2dddb
parent 07b9cc2f46
40 changed files with 13165 additions and 2139 deletions
--- a/lib/libufs/Makefile
+++ b/lib/libufs/Makefile
@ -3,7 +3,7 @@
 LIB=	ufs
 SHLIBDIR?= /lib

-SRCS=	block.c cgroup.c inode.c sblock.c type.c
+SRCS=	block.c cgroup.c inode.c sblock.c type.c ffs_subr.c ffs_tables.c
 INCS=	libufs.h

 MAN=	bread.3 cgread.3 libufs.3 sbread.3 ufs_disk_close.3
@ -16,8 +16,11 @@ MLINKS+= ufs_disk_close.3 ufs_disk_fillout.3
 MLINKS+= ufs_disk_close.3 ufs_disk_fillout_blank.3
 MLINKS+= ufs_disk_close.3 ufs_disk_write.3

-WARNS?=	3
+.PATH:  ${.CURDIR}/../../sys/ufs/ffs

+WARNS?=	2
+
+DEBUG_FLAGS = -g
 CFLAGS+= -D_LIBUFS
 .if defined(LIBUFS_DEBUG)
 CFLAGS+= -D_LIBUFS_DEBUGGING
--- a/lib/libufs/cgroup.c
+++ b/lib/libufs/cgroup.c
@ -40,11 +40,143 @@ __FBSDID("$FreeBSD$");
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <unistd.h>

 #include <libufs.h>

+/*
+ * Allocate one full block from the cylinder group held in disk->d_cg.
+ *
+ * The block free map is scanned from block 0 upward and the first block
+ * that ffs_isblock() reports as entirely free is taken.  The free map,
+ * the cluster accounting and the three free-block counters (per-cg
+ * summary in the superblock, the cg's own summary, and the filesystem
+ * total) are updated and fs_fmod is set.  Nothing is written to disk
+ * here.
+ *
+ * Returns cgbase() of this group plus the fragment offset of the block,
+ * or 0 when no free block was found.
+ */
+ufs2_daddr_t
+cgballoc(struct uufsd *disk)
+{
+	struct fs *fs = &disk->d_fs;
+	struct cg *cgp = &disk->d_cg;
+	u_int8_t *freemap = cg_blksfree(cgp);
+	long blk, nblks;
+
+	/* Number of whole blocks covered by the search. */
+	nblks = fs->fs_fpg / fs->fs_frag;
+	blk = 0;
+	while (blk < nblks && !ffs_isblock(fs, freemap, blk))
+		blk++;
+	if (blk >= nblks)
+		return (0);
+
+	/* Claim the block and bring every summary counter in line. */
+	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
+	ffs_clrblock(fs, freemap, blk);
+	ffs_clusteracct(fs, cgp, blk, -1);
+	cgp->cg_cs.cs_nbfree--;
+	fs->fs_cstotal.cs_nbfree--;
+	fs->fs_fmod = 1;
+
+	return (cgbase(fs, cgp->cg_cgx) + blkstofrags(fs, blk));
+}
+
+/*
+ * Return a block, or a run of fragments, to the free map.
+ *
+ * The cylinder group containing bno is loaded with cgread1(); the free
+ * map and the free block/fragment counters (cg summary, filesystem
+ * total, and per-cg summary in the superblock) are updated, and the
+ * cylinder group is written back with cgwrite().
+ *
+ * disk: libufs disk handle.
+ * bno:  filesystem address of the first fragment being freed.
+ * size: amount being freed; it is compared against fs_bsize and passed
+ *       to numfrags(), so presumably a byte count -- confirm.
+ *
+ * Returns -1 if cgread1() does not return 1, otherwise the result of
+ * cgwrite().  No check is made that the space was actually allocated.
+ */
+int
+cgbfree(struct uufsd *disk, ufs2_daddr_t bno, long size)
+{
+	u_int8_t *blksfree;
+	struct fs *fs;
+	struct cg *cgp;
+	ufs1_daddr_t fragno, cgbno;
+	int i, cg, blk, frags, bbase;
+
+	fs = &disk->d_fs;
+	cg = dtog(fs, bno);
+	if (cgread1(disk, cg) != 1)
+		return (-1);
+	cgp = &disk->d_cg;
+	/* Offset of bno within its cylinder group. */
+	cgbno = dtogd(fs, bno);
+	blksfree = cg_blksfree(cgp);
+	if (size == fs->fs_bsize) {
+		/* Whole block: mark it free and count one more free block. */
+		fragno = fragstoblks(fs, cgbno);
+		ffs_setblock(fs, blksfree, fragno);
+		ffs_clusteracct(fs, cgp, fragno, 1);
+		cgp->cg_cs.cs_nbfree++;
+		fs->fs_cstotal.cs_nbfree++;
+		fs->fs_cs(fs, cg).cs_nbfree++;
+	} else {
+		/* First fragment of the block that contains cgbno. */
+		bbase = cgbno - fragnum(fs, cgbno);
+		/*
+		 * decrement the counts associated with the old frags
+		 */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
+		/*
+		 * deallocate the fragment
+		 */
+		frags = numfrags(fs, size);
+		for (i = 0; i < frags; i++)
+			setbit(blksfree, cgbno + i);
+		/* i == frags here: the number of fragments just freed. */
+		cgp->cg_cs.cs_nffree += i;
+		fs->fs_cstotal.cs_nffree += i;
+		fs->fs_cs(fs, cg).cs_nffree += i;
+		/*
+		 * add back in counts associated with the new frags
+		 */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
+		/*
+		 * if a complete block has been reassembled, account for it
+		 */
+		fragno = fragstoblks(fs, bbase);
+		if (ffs_isblock(fs, blksfree, fragno)) {
+			/* Trade fs_frag free fragments for one free block. */
+			cgp->cg_cs.cs_nffree -= fs->fs_frag;
+			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
+			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
+			ffs_clusteracct(fs, cgp, fragno, 1);
+			cgp->cg_cs.cs_nbfree++;
+			fs->fs_cstotal.cs_nbfree++;
+			fs->fs_cs(fs, cg).cs_nbfree++;
+		}
+	}
+	return cgwrite(disk);
+}
+
+/*
+ * Allocate the lowest-numbered free inode in the cylinder group held in
+ * disk->d_cg.
+ *
+ * On a UFS2 filesystem, when the chosen inode lies within INOPB(fs) of
+ * cg_initediblk and cg_initediblk is still below cg_niblk, one further
+ * block of inodes is written out first: zero-filled apart from a
+ * non-zero di_gen in each inode.  The inode is then marked used,
+ * cg_irotor is set to it, the free-inode counters (cg summary,
+ * filesystem total, per-cg summary in the superblock) are decremented
+ * and fs_fmod is set.  The cylinder group itself is not written here.
+ *
+ * Returns the filesystem-wide inode number (cg_cgx * fs_ipg + index in
+ * the group), or 0 if the group has no free inode or the inode block
+ * could not be written.
+ */
+ino_t
+cgialloc(struct uufsd *disk)
+{
+	struct ufs2_dinode *dp2;
+	u_int8_t *inosused;
+	struct cg *cgp;
+	struct fs *fs;
+	ino_t ino;
+	int i;
+
+	fs = &disk->d_fs;
+	cgp = &disk->d_cg;
+	inosused = cg_inosused(cgp);
+	/*
+	 * isclr() takes a bit index, not a byte index, so every inode in
+	 * the group must be examined; bounding the loop by fs_ipg / NBBY
+	 * would only ever look at the first eighth of the map.
+	 */
+	for (ino = 0; ino < fs->fs_ipg; ino++)
+		if (isclr(inosused, ino))
+			goto gotit;
+	return (0);
+gotit:
+	if (fs->fs_magic == FS_UFS2_MAGIC &&
+	    ino + INOPB(fs) > cgp->cg_initediblk &&
+	    cgp->cg_initediblk < cgp->cg_niblk) {
+		char block[MAXBSIZE];
+		bzero(block, (int)fs->fs_bsize);
+		dp2 = (struct ufs2_dinode *)&block;
+		for (i = 0; i < INOPB(fs); i++) {
+			/* "/ 2 + 1" keeps the generation number non-zero. */
+			dp2->di_gen = arc4random() / 2 + 1;
+			dp2++;
+		}
+		/*
+		 * bwrite() is handed an fsbtodb()-converted address and
+		 * its result is tested with "<= 0", matching the other
+		 * bwrite() callers in this change (putino(),
+		 * journal_balloc()).
+		 */
+		if (bwrite(disk, fsbtodb(fs, ino_to_fsba(fs,
+		    cgp->cg_cgx * fs->fs_ipg + cgp->cg_initediblk)),
+		    block, fs->fs_bsize) <= 0)
+			return (0);
+		cgp->cg_initediblk += INOPB(fs);
+	}
+
+	setbit(inosused, ino);
+	cgp->cg_irotor = ino;
+	cgp->cg_cs.cs_nifree--;
+	fs->fs_cstotal.cs_nifree--;
+	fs->fs_cs(fs, cgp->cg_cgx).cs_nifree--;
+	fs->fs_fmod = 1;
+
+	return (ino + (cgp->cg_cgx * fs->fs_ipg));
+}
+
 int
 cgread(struct uufsd *disk)
 {
@ -55,14 +187,12 @@ int
 cgread1(struct uufsd *disk, int c)
 {
 	struct fs *fs;
-	off_t ccg;

 	fs = &disk->d_fs;

 	if ((unsigned)c >= fs->fs_ncg) {
 		return (0);
 	}
-	ccg = fsbtodb(fs, cgtod(fs, c)) * disk->d_bsize;
 	if (bread(disk, fsbtodb(fs, cgtod(fs, c)), disk->d_cgunion.d_buf,
 	    fs->fs_bsize) == -1) {
 		ERROR(disk, "unable to read cylinder group");
@ -72,6 +202,12 @@ cgread1(struct uufsd *disk, int c)
 	return (1);
 }

+/*
+ * Convenience wrapper around cgwrite1(): passes disk->d_lcg as the
+ * cylinder group number (presumably the group most recently loaded by
+ * cgread1() -- confirm) and returns whatever cgwrite1() returns.
+ */
+int
+cgwrite(struct uufsd *disk)
+{
+	return (cgwrite1(disk, disk->d_lcg));
+}
+
 int
 cgwrite1(struct uufsd *disk, int c)
 {
--- a/lib/libufs/inode.c
+++ b/lib/libufs/inode.c
@ -93,3 +93,19 @@ gotit:	switch (disk->d_ufs) {
 	ERROR(disk, "unknown UFS filesystem type");
 	return (-1);
 }
+
+/*
+ * Write the inode block cached in disk->d_inoblock back to the
+ * filesystem, at the block that holds inode disk->d_inomin.
+ *
+ * Returns 0 on success.  Returns -1 if no inode block is cached (the
+ * error string is recorded through ERROR()) or if bwrite() reports a
+ * result <= 0.
+ */
+int
+putino(struct uufsd *disk)
+{
+	struct fs *fs;
+
+	if (disk->d_inoblock == NULL) {
+		ERROR(disk, "No inode block allocated");
+		return (-1);
+	}
+	fs = &disk->d_fs;
+	return (bwrite(disk, fsbtodb(fs, ino_to_fsba(fs, disk->d_inomin)),
+	    disk->d_inoblock, fs->fs_bsize) > 0 ? 0 : -1);
+}
--- a/lib/libufs/libufs.h
+++ b/lib/libufs/libufs.h
@ -71,6 +71,7 @@ struct uufsd {
 	int d_fd;		/* raw device file descriptor */
 	long d_bsize;		/* device bsize */
 	ufs2_daddr_t d_sblock;	/* superblock location */
+	struct csum *d_sbcsum;	/* Superblock summary info */
 	caddr_t d_inoblock;	/* inode block */
 	ino_t d_inomin;		/* low inode */
 	ino_t d_inomax;		/* high inode */
@ -109,14 +110,19 @@ int berase(struct uufsd *, ufs2_daddr_t, ufs2_daddr_t);
 /*
 * cgroup.c
 */
+ufs2_daddr_t cgballoc(struct uufsd *);
+int cgbfree(struct uufsd *, ufs2_daddr_t, long);
+ino_t cgialloc(struct uufsd *);
 int cgread(struct uufsd *);
 int cgread1(struct uufsd *, int);
+int cgwrite(struct uufsd *);
 int cgwrite1(struct uufsd *, int);

 /*
 * inode.c
 */
 int getino(struct uufsd *, void **, ino_t, int *);
+int putino(struct uufsd *);

 /*
 * sblock.c
@ -132,6 +138,16 @@ int ufs_disk_fillout(struct uufsd *, const char *);
 int ufs_disk_fillout_blank(struct uufsd *, const char *);
 int ufs_disk_write(struct uufsd *);

+/*
+ * ffs_subr.c
+ */
+void	ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
+void	ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
+void	ffs_fragacct(struct fs *, int, int32_t [], int);
+int	ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
+int	ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t);
+void	ffs_setblock(struct fs *, u_char *, ufs1_daddr_t);
+
 __END_DECLS

 #endif	/* __LIBUFS_H__ */
--- a/lib/libufs/sblock.c
+++ b/lib/libufs/sblock.c
@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$");
 #include <errno.h>
 #include <stdio.h>
 #include <string.h>
+#include <stdlib.h>
 #include <unistd.h>

 #include <libufs.h>
@ -49,8 +50,11 @@ static int superblocks[] = SBLOCKSEARCH;
 int
 sbread(struct uufsd *disk)
 {
+	uint8_t block[MAXBSIZE];
 	struct fs *fs;
 	int sb, superblock;
+	int i, size, blks;
+	uint8_t *space;

 	ERROR(disk, NULL);

@ -86,6 +90,34 @@ sbread(struct uufsd *disk)
 	}
 	disk->d_bsize = fs->fs_fsize / fsbtodb(fs, 1);
 	disk->d_sblock = superblock / disk->d_bsize;
+	/*
+	 * Read in the superblock summary information.
+	 */
+	size = fs->fs_cssize;
+	blks = howmany(size, fs->fs_fsize);
+	size += fs->fs_ncg * sizeof(int32_t);
+	space = malloc(size);
+	if (space == NULL) {
+		ERROR(disk, "failed to allocate space for summary information");
+		return (-1);
+	}
+	fs->fs_csp = (struct csum *)space;
+	for (i = 0; i < blks; i += fs->fs_frag) {
+		size = fs->fs_bsize;
+		if (i + fs->fs_frag > blks)
+			size = (blks - i) * fs->fs_fsize;
+		if (bread(disk, fsbtodb(fs, fs->fs_csaddr + i), block, size)
+		    == -1) {
+			ERROR(disk, "Failed to read sb summary information");
+			free(fs->fs_csp);
+			return (-1);
+		}
+		bcopy(block, space, size);
+		space += size;
+	}
+	fs->fs_maxcluster = (uint32_t *)space;
+	disk->d_sbcsum = fs->fs_csp;
+
 	return (0);
 }

@ -93,6 +125,8 @@ int
 sbwrite(struct uufsd *disk, int all)
 {
 	struct fs *fs;
+	int blks, size;
+	uint8_t *space;
 	unsigned i;

 	ERROR(disk, NULL);
@ -107,6 +141,22 @@ sbwrite(struct uufsd *disk, int all)
 		ERROR(disk, "failed to write superblock");
 		return (-1);
 	}
+	/*
+	 * Write superblock summary information.
+	 */
+	blks = howmany(fs->fs_cssize, fs->fs_fsize);
+	space = (uint8_t *)disk->d_sbcsum;
+	for (i = 0; i < blks; i += fs->fs_frag) {
+		size = fs->fs_bsize;
+		if (i + fs->fs_frag > blks)
+			size = (blks - i) * fs->fs_fsize;
+		if (bwrite(disk, fsbtodb(fs, fs->fs_csaddr + i), space, size)
+		    == -1) {
+			ERROR(disk, "Failed to write sb summary information");
+			return (-1);
+		}
+		space += size;
+	}
 	if (all) {
 		for (i = 0; i < fs->fs_ncg; i++)
 			if (bwrite(disk, fsbtodb(fs, cgsblock(fs, i)),
--- a/lib/libufs/type.c
+++ b/lib/libufs/type.c
@ -66,6 +66,10 @@ ufs_disk_close(struct uufsd *disk)
 		free((char *)(uintptr_t)disk->d_name);
 		disk->d_name = NULL;
 	}
+	if (disk->d_sbcsum != NULL) {
+		free(disk->d_sbcsum);
+		disk->d_sbcsum = NULL;
+	}
 	return (0);
 }

@ -156,6 +160,7 @@ again:	if ((ret = stat(name, &st)) < 0) {
 	disk->d_mine = 0;
 	disk->d_ufs = 0;
 	disk->d_error = NULL;
+	disk->d_sbcsum = NULL;

 	if (oname != name) {
 		name = strdup(name);
--- a/sbin/dumpfs/dumpfs.c
+++ b/sbin/dumpfs/dumpfs.c
@ -238,7 +238,7 @@ dumpfs(const char *name)
 	if (fsflags & FS_UNCLEAN)
 		printf("unclean ");
 	if (fsflags & FS_DOSOFTDEP)
-		printf("soft-updates ");
+		printf("soft-updates%s ", (fsflags & FS_SUJ) ? "+journal" : "");
 	if (fsflags & FS_NEEDSFSCK)
 		printf("needs fsck run ");
 	if (fsflags & FS_INDEXDIRS)
@ -255,7 +255,7 @@ dumpfs(const char *name)
 		printf("nfsv4acls ");
 	fsflags &= ~(FS_UNCLEAN | FS_DOSOFTDEP | FS_NEEDSFSCK | FS_INDEXDIRS |
 		     FS_ACLS | FS_MULTILABEL | FS_GJOURNAL | FS_FLAGS_UPDATED |
-		     FS_NFS4ACLS);
+		     FS_NFS4ACLS | FS_SUJ);
 	if (fsflags != 0)
 		printf("unknown flags (%#x)", fsflags);
 	putchar('\n');
--- a/sbin/fsck_ffs/Makefile
+++ b/sbin/fsck_ffs/Makefile
@ -7,8 +7,7 @@ LINKS+=	${BINDIR}/fsck_ffs ${BINDIR}/fsck_4.2bsd
 MAN=	fsck_ffs.8
 MLINKS=	fsck_ffs.8 fsck_ufs.8 fsck_ffs.8 fsck_4.2bsd.8
 SRCS=	dir.c ea.c fsutil.c inode.c main.c pass1.c pass1b.c pass2.c pass3.c \
-	pass4.c pass5.c setup.c utilities.c ffs_subr.c ffs_tables.c gjournal.c \
-	getmntopts.c
+	pass4.c pass5.c setup.c suj.c utilities.c gjournal.c getmntopts.c
 DPADD=	${LIBUFS}
 LDADD=	-lufs
 WARNS?=	2
--- a/sbin/fsck_ffs/fsck.h
+++ b/sbin/fsck_ffs/fsck.h
@ -347,10 +347,6 @@ void		direrror(ino_t ino, const char *errmesg);
 int		dirscan(struct inodesc *);
 int		dofix(struct inodesc *, const char *msg);
 int		eascan(struct inodesc *, struct ufs2_dinode *dp);
-void		ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
-void		ffs_fragacct(struct fs *, int, int32_t [], int);
-int		ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
-void		ffs_setblock(struct fs *, u_char *, ufs1_daddr_t);
 void		fileerror(ino_t cwd, ino_t ino, const char *errmesg);
 int		findino(struct inodesc *);
 int		findname(struct inodesc *);
@ -392,3 +388,4 @@ void		sblock_init(void);
 void		setinodebuf(ino_t);
 int		setup(char *dev);
 void		gjournal_check(const char *filesys);
+int		suj_check(const char *filesys);
--- a/sbin/fsck_ffs/gjournal.c
+++ b/sbin/fsck_ffs/gjournal.c
@ -95,27 +95,6 @@ struct ufs2_dinode ufs2_zino;

 static void putcgs(void);

-/*
- * Write current block of inodes.
- */
-static int
-putino(struct uufsd *disk, ino_t inode)
-{
-	caddr_t inoblock;
-	struct fs *fs;
-	ssize_t ret;
-
-	fs = &disk->d_fs;
-	inoblock = disk->d_inoblock;
-
-	assert(inoblock != NULL);
-	assert(inode >= disk->d_inomin && inode <= disk->d_inomax);
-	ret = bwrite(disk, fsbtodb(fs, ino_to_fsba(fs, inode)), inoblock,
-	    fs->fs_bsize);
-
-	return (ret == -1 ? -1 : 0);
-}
-
 /*
 * Return cylinder group from the cache or load it if it is not in the
 * cache yet.
@ -242,13 +221,11 @@ cancelcgs(void)
 #endif

 /*
- * Open the given provider, load statistics.
+ * Open the given provider, load superblock.
 */
 static void
-getdisk(void)
+opendisk(void)
 {
-	int i;
-
 	if (disk != NULL)
 		return;
 	disk = malloc(sizeof(*disk));
@ -259,24 +236,6 @@ getdisk(void)
 		    disk->d_error);
 	}
 	fs = &disk->d_fs;
-	fs->fs_csp = malloc((size_t)fs->fs_cssize);
-	if (fs->fs_csp == NULL)
-		err(1, "malloc(%zu)", (size_t)fs->fs_cssize);
-	bzero(fs->fs_csp, (size_t)fs->fs_cssize);
-	for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) {
-		if (bread(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)),
-		    (void *)(((char *)fs->fs_csp) + i),
-		    (size_t)(fs->fs_cssize - i < fs->fs_bsize ? fs->fs_cssize - i : fs->fs_bsize)) == -1) {
-			err(1, "bread: %s", disk->d_error);
-		}
-	}
-	if (fs->fs_contigsumsize > 0) {
-		fs->fs_maxcluster = malloc(fs->fs_ncg * sizeof(int32_t));
-		if (fs->fs_maxcluster == NULL)
-			err(1, "malloc(%zu)", fs->fs_ncg * sizeof(int32_t));
-		for (i = 0; i < fs->fs_ncg; i++)
-			fs->fs_maxcluster[i] = fs->fs_contigsumsize;
-	}
 }

 /*
@ -286,11 +245,6 @@ static void
 closedisk(void)
 {

-	free(fs->fs_csp);
-	if (fs->fs_contigsumsize > 0) {
-		free(fs->fs_maxcluster);
-		fs->fs_maxcluster = NULL;
-	}
 	fs->fs_clean = 1;
 	if (sbwrite(disk, 0) == -1)
 		err(1, "sbwrite(%s)", devnam);
@ -301,227 +255,6 @@ closedisk(void)
 	fs = NULL;
 }

-/*
- * Write the statistics back, call closedisk().
- */
-static void
-putdisk(void)
-{
-	int i;
-
-	assert(disk != NULL && fs != NULL);
-	for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) {
-		if (bwrite(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)),
-		    (void *)(((char *)fs->fs_csp) + i),
-		    (size_t)(fs->fs_cssize - i < fs->fs_bsize ? fs->fs_cssize - i : fs->fs_bsize)) == -1) {
-			err(1, "bwrite: %s", disk->d_error);
-		}
-	}
-	closedisk();
-}
-
-#if 0
-/*
- * Free memory, close the disk, but don't write anything back.
- */
-static void
-canceldisk(void)
-{
-	int i;
-
-	assert(disk != NULL && fs != NULL);
-	free(fs->fs_csp);
-	if (fs->fs_contigsumsize > 0)
-		free(fs->fs_maxcluster);
-	if (ufs_disk_close(disk) == -1)
-		err(1, "ufs_disk_close(%s)", devnam);
-	free(disk);
-	disk = NULL;
-	fs = NULL;
-}
-#endif
-
-static int
-isblock(unsigned char *cp, ufs1_daddr_t h)
-{
-	unsigned char mask;
-
-	switch ((int)fs->fs_frag) {
-	case 8:
-		return (cp[h] == 0xff);
-	case 4:
-		mask = 0x0f << ((h & 0x1) << 2);
-		return ((cp[h >> 1] & mask) == mask);
-	case 2:
-		mask = 0x03 << ((h & 0x3) << 1);
-		return ((cp[h >> 2] & mask) == mask);
-	case 1:
-		mask = 0x01 << (h & 0x7);
-		return ((cp[h >> 3] & mask) == mask);
-	default:
-		assert(!"isblock: invalid number of fragments");
-	}
-	return (0);
-}
-
-/*
- * put a block into the map
- */
-static void
-setblock(unsigned char *cp, ufs1_daddr_t h)
-{
-
-	switch ((int)fs->fs_frag) {
-	case 8:
-		cp[h] = 0xff;
-		return;
-	case 4:
-		cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
-		return;
-	case 2:
-		cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
-		return;
-	case 1:
-		cp[h >> 3] |= (0x01 << (h & 0x7));
-		return;
-	default:
-		assert(!"setblock: invalid number of fragments");
-	}
-}
-
-/*
- * check if a block is free
- */
-static int
-isfreeblock(u_char *cp, ufs1_daddr_t h)
-{
-
-	switch ((int)fs->fs_frag) {
-	case 8:
-		return (cp[h] == 0);
-	case 4:
-		return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
-	case 2:
-		return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
-	case 1:
-		return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
-	default:
-		assert(!"isfreeblock: invalid number of fragments");
-	}
-	return (0);
-}
-
-/*
- * Update the frsum fields to reflect addition or deletion
- * of some frags.
- */
-void
-fragacct(int fragmap, int32_t fraglist[], int cnt)
-{
-	int inblk;
-	int field, subfield;
-	int siz, pos;
-
-	inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
-	fragmap <<= 1;
-	for (siz = 1; siz < fs->fs_frag; siz++) {
-		if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
-			continue;
-		field = around[siz];
-		subfield = inside[siz];
-		for (pos = siz; pos <= fs->fs_frag; pos++) {
-			if ((fragmap & field) == subfield) {
-				fraglist[siz] += cnt;
-				pos += siz;
-				field <<= siz;
-				subfield <<= siz;
-			}
-			field <<= 1;
-			subfield <<= 1;
-		}
-	}
-}
-
-static void
-clusteracct(struct cg *cgp, ufs1_daddr_t blkno)
-{
-	int32_t *sump;
-	int32_t *lp;
-	u_char *freemapp, *mapp;
-	int i, start, end, forw, back, map, bit;
-
-	if (fs->fs_contigsumsize <= 0)
-		return;
-	freemapp = cg_clustersfree(cgp);
-	sump = cg_clustersum(cgp);
-	/*
-	 * Clear the actual block.
-	 */
-	setbit(freemapp, blkno);
-	/*
-	 * Find the size of the cluster going forward.
-	 */
-	start = blkno + 1;
-	end = start + fs->fs_contigsumsize;
-	if (end >= cgp->cg_nclusterblks)
-		end = cgp->cg_nclusterblks;
-	mapp = &freemapp[start / NBBY];
-	map = *mapp++;
-	bit = 1 << (start % NBBY);
-	for (i = start; i < end; i++) {
-		if ((map & bit) == 0)
-			break;
-		if ((i & (NBBY - 1)) != (NBBY - 1)) {
-			bit <<= 1;
-		} else {
-			map = *mapp++;
-			bit = 1;
-		}
-	}
-	forw = i - start;
-	/*
-	 * Find the size of the cluster going backward.
-	 */
-	start = blkno - 1;
-	end = start - fs->fs_contigsumsize;
-	if (end < 0)
-		end = -1;
-	mapp = &freemapp[start / NBBY];
-	map = *mapp--;
-	bit = 1 << (start % NBBY);
-	for (i = start; i > end; i--) {
-		if ((map & bit) == 0)
-			break;
-		if ((i & (NBBY - 1)) != 0) {
-			bit >>= 1;
-		} else {
-			map = *mapp--;
-			bit = 1 << (NBBY - 1);
-		}
-	}
-	back = start - i;
-	/*
-	 * Account for old cluster and the possibly new forward and
-	 * back clusters.
-	 */
-	i = back + forw + 1;
-	if (i > fs->fs_contigsumsize)
-		i = fs->fs_contigsumsize;
-	sump[i]++;
-	if (back > 0)
-		sump[back]--;
-	if (forw > 0)
-		sump[forw]--;
-	/*
-	 * Update cluster summary information.
-	 */
-	lp = &sump[fs->fs_contigsumsize];
-	for (i = fs->fs_contigsumsize; i > 0; i--)
-		if (*lp-- > 0)
-			break;
-	fs->fs_maxcluster[cgp->cg_cgx] = i;
-}
-
 static void
 blkfree(ufs2_daddr_t bno, long size)
 {
@ -539,10 +272,10 @@ blkfree(ufs2_daddr_t bno, long size)
 	blksfree = cg_blksfree(cgp);
 	if (size == fs->fs_bsize) {
 		fragno = fragstoblks(fs, cgbno);
-		if (!isfreeblock(blksfree, fragno))
+		if (!ffs_isfreeblock(fs, blksfree, fragno))
 			assert(!"blkfree: freeing free block");
-		setblock(blksfree, fragno);
-		clusteracct(cgp, fragno);
+		ffs_setblock(fs, blksfree, fragno);
+		ffs_clusteracct(fs, cgp, fragno, 1);
 		cgp->cg_cs.cs_nbfree++;
 		fs->fs_cstotal.cs_nbfree++;
 		fs->fs_cs(fs, cg).cs_nbfree++;
@ -552,7 +285,7 @@ blkfree(ufs2_daddr_t bno, long size)
 		 * decrement the counts associated with the old frags
 		 */
 		blk = blkmap(fs, blksfree, bbase);
-		fragacct(blk, cgp->cg_frsum, -1);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
 		/*
 		 * deallocate the fragment
 		 */
@ -569,16 +302,16 @@ blkfree(ufs2_daddr_t bno, long size)
 		 * add back in counts associated with the new frags
 		 */
 		blk = blkmap(fs, blksfree, bbase);
-		fragacct(blk, cgp->cg_frsum, 1);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
 		/*
 		 * if a complete block has been reassembled, account for it
 		 */
 		fragno = fragstoblks(fs, bbase);
-		if (isblock(blksfree, fragno)) {
+		if (ffs_isblock(fs, blksfree, fragno)) {
 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
-			clusteracct(cgp, fragno);
+			ffs_clusteracct(fs, cgp, fragno, 1);
 			cgp->cg_cs.cs_nbfree++;
 			fs->fs_cstotal.cs_nbfree++;
 			fs->fs_cs(fs, cg).cs_nbfree++;
@ -599,7 +332,7 @@ freeindir(ufs2_daddr_t blk, int level)
 	if (bread(disk, fsbtodb(fs, blk), (void *)&sblks, (size_t)fs->fs_bsize) == -1)
 		err(1, "bread: %s", disk->d_error);
 	blks = (ufs2_daddr_t *)&sblks;
-	for (i = 0; i < howmany(fs->fs_bsize, sizeof(ufs2_daddr_t)); i++) {
+	for (i = 0; i < NINDIR(fs); i++) {
 		if (blks[i] == 0)
 			break;
 		if (level == 0)
@ -671,7 +404,7 @@ gjournal_check(const char *filesys)
 	int cg, mode;

 	devnam = filesys;
-	getdisk();
+	opendisk();
 	/* Are there any unreferenced inodes in this file system? */
 	if (fs->fs_unrefs == 0) {
 		//printf("No unreferenced inodes.\n");
@ -747,7 +480,7 @@ gjournal_check(const char *filesys)
 			/* Zero-fill the inode. */
 			*dino = ufs2_zino;
 			/* Write the inode back. */
-			if (putino(disk, ino) == -1)
+			if (putino(disk) == -1)
 				err(1, "putino(cg=%d ino=%d)", cg, ino);
 			if (cgp->cg_unrefs == 0) {
 				//printf("No more unreferenced inodes in cg=%d.\n", cg);
@ -772,5 +505,5 @@ gjournal_check(const char *filesys)
 	/* Write back modified cylinder groups. */
 	putcgs();
 	/* Write back updated statistics and super-block. */
-	putdisk();
+	closedisk();
 }
--- a/sbin/fsck_ffs/main.c
+++ b/sbin/fsck_ffs/main.c
@ -242,8 +242,9 @@ checkfilesys(char *filesys)
 		if ((fsreadfd = open(filesys, O_RDONLY)) < 0 || readsb(0) == 0)
 			exit(3);	/* Cannot read superblock */
 		close(fsreadfd);
-		if (sblock.fs_flags & FS_NEEDSFSCK)
-			exit(4);	/* Earlier background failed */
+		/* Earlier background failed or journaled */
+		if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ))
+			exit(4);
 		if ((sblock.fs_flags & FS_DOSOFTDEP) == 0)
 			exit(5);	/* Not running soft updates */
 		size = MIBSIZE;
@ -299,7 +300,7 @@ checkfilesys(char *filesys)
 			pfatal("MOUNTED READ-ONLY, CANNOT RUN IN BACKGROUND\n");
 		} else if ((fsreadfd = open(filesys, O_RDONLY)) >= 0) {
 			if (readsb(0) != 0) {
-				if (sblock.fs_flags & FS_NEEDSFSCK) {
+				if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ)) {
 					bkgrdflag = 0;
 					pfatal("UNEXPECTED INCONSISTENCY, %s\n",
 					    "CANNOT RUN IN BACKGROUND\n");
@ -384,6 +385,26 @@ checkfilesys(char *filesys)
 		    sblock.fs_cstotal.cs_nffree * 100.0 / sblock.fs_dsize);
 		return (0);
 	}
+	/*
+	 * Determine if we can and should do journal recovery.
+	 */
+	if ((sblock.fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == FS_SUJ) {
+		if (preen || reply("USE JOURNAL?")) {
+			if (suj_check(filesys) == 0) {
+				if (chkdoreload(mntp) == 0)
+					exit(0);
+				exit(4);
+			}
+			/* suj_check failed, fall through. */
+		}
+		printf("** Skipping journal, falling through to full fsck\n");
+		/*
+		 * Write the superblock so we don't try to recover the
+		 * journal on another pass.
+		 */
+		sblock.fs_mtime = time(NULL);
+		sbdirty();
+	}
 	
 	/*
 	 * Cleared if any questions answered no. Used to decide if
--- a/sbin/fsck_ffs/pass5.c
+++ b/sbin/fsck_ffs/pass5.c
@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
 #include <inttypes.h>
 #include <limits.h>
 #include <string.h>
+#include <libufs.h>

 #include "fsck.h"

--- a/sbin/fsck_ffs/suj.c
+++ b/sbin/fsck_ffs/suj.c
--- a/sbin/fsdb/fsdb.c
+++ b/sbin/fsdb/fsdb.c
@ -396,7 +396,8 @@ const char *typename[] = {
    "unregistered #13",
    "whiteout",
 };
-    
+
+int diroff; 
 int slot;

 int
@ -404,9 +405,10 @@ scannames(struct inodesc *idesc)
 {
 	struct direct *dirp = idesc->id_dirp;

-	printf("slot %d ino %d reclen %d: %s, `%.*s'\n",
-	       slot++, dirp->d_ino, dirp->d_reclen, typename[dirp->d_type],
-	       dirp->d_namlen, dirp->d_name);
+	printf("slot %d off %d ino %d reclen %d: %s, `%.*s'\n",
+	       slot++, diroff, dirp->d_ino, dirp->d_reclen,
+	       typename[dirp->d_type], dirp->d_namlen, dirp->d_name);
+	diroff += dirp->d_reclen;
 	return (KEEPON);
 }

@ -416,6 +418,7 @@ CMDFUNCSTART(ls)
    checkactivedir();			/* let it go on anyway */

    slot = 0;
+    diroff = 0;
    idesc.id_number = curinum;
    idesc.id_func = scannames;
    idesc.id_type = DATA;
--- a/sbin/fsdb/fsdbutil.c
+++ b/sbin/fsdb/fsdbutil.c
@ -52,7 +52,7 @@ static const char rcsid[] =
 #include "fsck.h"

 static int charsperline(void);
-static int printindir(ufs2_daddr_t blk, int level, char *bufp);
+static void printindir(ufs2_daddr_t blk, int level, char *bufp);
 static void printblocks(ino_t inum, union dinode *dp);

 char **
@ -226,7 +226,7 @@ charsperline(void)
 /*
 * Recursively print a list of indirect blocks.
 */
-static int
+static void
 printindir(ufs2_daddr_t blk, int level, char *bufp)
 {
    struct bufarea buf, *bp;
@ -234,6 +234,9 @@ printindir(ufs2_daddr_t blk, int level, char *bufp)
    int i, j, cpl, charssofar;
    ufs2_daddr_t blkno;

+    if (blk == 0)
+	return;
+    printf("%jd (%d) =>\n", (intmax_t)blk, level);
    if (level == 0) {
 	/* for the final indirect level, don't use the cache */
 	bp = &buf;
@ -251,11 +254,8 @@ printindir(ufs2_daddr_t blk, int level, char *bufp)
 		blkno = bp->b_un.b_indir1[i];
 	else
 		blkno = bp->b_un.b_indir2[i];
-	if (blkno == 0) {
-	    if (level == 0)
-		putchar('\n');
-	    return 0;
-	}
+	if (blkno == 0)
+	    continue;
 	j = sprintf(tempbuf, "%jd", (intmax_t)blkno);
 	if (level == 0) {
 	    charssofar += j;
@ -270,13 +270,14 @@ printindir(ufs2_daddr_t blk, int level, char *bufp)
 	    charssofar += 2;
 	} else {
 	    printf(" =>\n");
-	    if (printindir(blkno, level - 1, bufp) == 0)
-		return 0;
+	    printindir(blkno, level - 1, bufp);
+	    printf("\n");
+	    charssofar = 0;
 	}
    }
    if (level == 0)
 	putchar('\n');
-    return 1;
+    return;
 }


@ -309,7 +310,7 @@ printblocks(ino_t inum, union dinode *dp)
 	}
    }
    putchar('\n');
-    if (DIP(dp, di_ib[0]) == 0)
+    if (ndb == 0)
 	return;

    bufp = malloc((unsigned int)sblock.fs_bsize);
@ -317,8 +318,7 @@ printblocks(ino_t inum, union dinode *dp)
 	errx(EEXIT, "cannot allocate indirect block buffer");
    printf("Indirect blocks:\n");
    for (i = 0; i < NIADDR; i++)
-	if (printindir(DIP(dp, di_ib[i]), i, bufp) == 0)
-	    break;
+	printindir(DIP(dp, di_ib[i]), i, bufp);
    free(bufp);
 }

--- a/sbin/tunefs/tunefs.8
+++ b/sbin/tunefs/tunefs.8
@ -28,7 +28,7 @@
 .\"     @(#)tunefs.8	8.2 (Berkeley) 12/11/93
 .\" $FreeBSD$
 .\"
-.Dd October 21, 2009
+.Dd March 6, 2010
 .Dt TUNEFS 8
 .Os
 .Sh NAME
@ -40,6 +40,7 @@
 .Op Fl a Cm enable | disable
 .Op Fl e Ar maxbpg
 .Op Fl f Ar avgfilesize
+.Op Fl j Cm enable | disable
 .Op Fl J Cm enable | disable
 .Op Fl L Ar volname
 .Op Fl l Cm enable | disable
@ -49,6 +50,7 @@
 .Op Fl o Cm space | time
 .Op Fl p
 .Op Fl s Ar avgfpdir
+.Op Fl S Ar size
 .Ar special | filesystem
 .Sh DESCRIPTION
 The
@ -89,6 +91,8 @@ For file systems with exclusively large files,
 this parameter should be set higher.
 .It Fl f Ar avgfilesize
 Specify the expected average file size.
+.It Fl j Cm enable | disable
+Turn on/off soft updates journaling.
 .It Fl J Cm enable | disable
 Turn on/off gjournal flag.
 .It Fl L Ar volname
@ -136,6 +140,9 @@ obtained from the
 utility.
 .It Fl s Ar avgfpdir
 Specify the expected number of files per directory.
+.It Fl S Ar size
+Specify the softdep journal size in bytes.
+The minimum is 4M.
 .El
 .Pp
 At least one of the above flags is required.
--- a/sbin/tunefs/tunefs.c
+++ b/sbin/tunefs/tunefs.c
@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/dinode.h>
 #include <ufs/ffs/fs.h>
+#include <ufs/ufs/dir.h>

 #include <ctype.h>
 #include <err.h>
@ -61,6 +62,7 @@ __FBSDID("$FreeBSD$");
 #include <paths.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdint.h>
 #include <string.h>
 #include <unistd.h>

@ -72,16 +74,20 @@ struct uufsd disk;

 void usage(void);
 void printfs(void);
+int journal_alloc(int64_t size);
+void journal_clear(void);
+void sbdirty(void);

 int
 main(int argc, char *argv[])
 {
-	char *avalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue;
+	char *avalue, *jvalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue;
 	const char *special, *on;
 	const char *name;
 	int active;
-	int Aflag, aflag, eflag, evalue, fflag, fvalue, Jflag, Lflag, lflag;
-	int mflag, mvalue, Nflag, nflag, oflag, ovalue, pflag, sflag, svalue;
+	int Aflag, aflag, eflag, evalue, fflag, fvalue, jflag, Jflag, Lflag;
+	int lflag, mflag, mvalue, Nflag, nflag, oflag, ovalue, pflag, sflag;
+	int svalue, Sflag, Svalue;
 	int ch, found_arg, i;
 	const char *chg[2];
 	struct ufs_args args;
@ -89,13 +95,13 @@ main(int argc, char *argv[])

 	if (argc < 3)
 		usage();
-	Aflag = aflag = eflag = fflag = Jflag = Lflag = lflag = mflag = 0;
-	Nflag = nflag = oflag = pflag = sflag = 0;
-	avalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL;
-	evalue = fvalue = mvalue = ovalue = svalue = 0;
+	Aflag = aflag = eflag = fflag = jflag = Jflag = Lflag = lflag = 0;
+	mflag = Nflag = nflag = oflag = pflag = sflag = 0;
+	avalue = jvalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL;
+	evalue = fvalue = mvalue = ovalue = svalue = Svalue = 0;
 	active = 0;
 	found_arg = 0;		/* At least one arg is required. */
-	while ((ch = getopt(argc, argv, "Aa:e:f:J:L:l:m:N:n:o:ps:")) != -1)
+	while ((ch = getopt(argc, argv, "Aa:e:f:j:J:L:l:m:N:n:o:ps:S:")) != -1)
 		switch (ch) {

 		case 'A':
@ -135,6 +141,18 @@ main(int argc, char *argv[])
 			fflag = 1;
 			break;

+		case 'j':
+			found_arg = 1;
+			name = "softdep journaled file system";
+			jvalue = optarg;
+			if (strcmp(jvalue, "enable") &&
+			    strcmp(jvalue, "disable")) {
+				errx(10, "bad %s (options are %s)",
+				    name, "`enable' or `disable'");
+			}
+			jflag = 1;
+			break;
+
 		case 'J':
 			found_arg = 1;
 			name = "gjournaled file system";
@ -240,6 +258,16 @@ main(int argc, char *argv[])
 			sflag = 1;
 			break;

+		case 'S':
+			found_arg = 1;
+			name = "Softdep Journal Size";
+			Svalue = atoi(optarg);
+			if (Svalue < SUJ_MIN)
+				errx(10, "%s must be >= %d (was %s)",
+				    name, SUJ_MIN, optarg);
+			Sflag = 1;
+			break;
+
 		default:
 			usage();
 		}
@ -310,6 +338,33 @@ main(int argc, char *argv[])
 			sblock.fs_avgfilesize = fvalue;
 		}
 	}
+	if (jflag) {
+ 		name = "soft updates journaling";
+ 		if (strcmp(jvalue, "enable") == 0) {
+			if ((sblock.fs_flags & (FS_DOSOFTDEP | FS_SUJ)) ==
+			    (FS_DOSOFTDEP | FS_SUJ)) {
+				warnx("%s remains unchanged as enabled", name);
+			} else if (sblock.fs_clean == 0) {
+				warnx("%s cannot be enabled until fsck is run",
+				    name);
+			} else if (journal_alloc(Svalue) != 0) {
+				warnx("%s can not be enabled", name);
+			} else {
+ 				sblock.fs_flags |= FS_DOSOFTDEP | FS_SUJ;
+ 				warnx("%s set", name);
+			}
+ 		} else if (strcmp(jvalue, "disable") == 0) {
+			if ((~sblock.fs_flags & FS_SUJ) == FS_SUJ) {
+				warnx("%s remains unchanged as disabled", name);
+			} else {
+				journal_clear();
+ 				sblock.fs_flags &= ~(FS_DOSOFTDEP | FS_SUJ);
+				sblock.fs_sujfree = 0;
+ 				warnx("%s cleared, "
+				    "remove .sujournal to reclaim space", name);
+			}
+ 		}
+	}
 	if (Jflag) {
 		name = "gjournal";
 		if (strcmp(Jvalue, "enable") == 0) {
@ -455,6 +510,500 @@ err:
 		err(12, "%s", special);
 }

+/*
+ * Mark the superblock held in disk.d_fs as not clean: clear fs_clean and
+ * set both FS_UNCLEAN and FS_NEEDSFSCK in fs_flags.
+ */
+void
+sbdirty(void)
+{
+	struct fs *fs;
+
+	fs = &disk.d_fs;
+	fs->fs_clean = 0;
+	fs->fs_flags |= FS_UNCLEAN | FS_NEEDSFSCK;
+}
+
+int blocks;
+static char clrbuf[MAXBSIZE];
+
+/*
+ * Allocate one full block for the journal and overwrite it on disk with
+ * the contents of clrbuf.
+ *
+ * The block is taken from the cylinder group currently loaded in
+ * disk.d_cg.  When that group has nothing left, the group is written back
+ * and following groups are read until a suitable one is found.
+ *
+ * Returns the allocated block number, or -1 on failure.
+ */
+static ufs2_daddr_t
+journal_balloc(void)
+{
+	ufs2_daddr_t blk;
+	struct cg *cgp;
+	int valid;
+	static int contig = 1;	/* static: the relaxed mode persists across calls */
+
+	cgp = &disk.d_cg;
+	for (;;) {
+		blk = cgballoc(&disk);
+		if (blk > 0)
+			break;
+		/*
+		 * If we failed to allocate a block from this cg, move to
+		 * the next.
+		 */
+		if (cgwrite(&disk) < 0) {
+			warn("Failed to write updated cg");
+			return (-1);
+		}
+		while ((valid = cgread(&disk)) == 1) {
+			/*
+			 * Try to minimize fragmentation by requiring a minimum
+			 * number of blocks present.  Once contig has been
+			 * cleared any cg with a free block is accepted.
+			 */
+			if (cgp->cg_cs.cs_nbfree > blocks / 8)
+				break;
+			if (contig == 0 && cgp->cg_cs.cs_nbfree)
+				break;
+		}
+		if (valid)
+			continue;
+		/*
+		 * Try once through looking only for large contiguous regions
+		 * and again taking any space we can find.  The second pass
+		 * restarts from the first cg by resetting disk.d_ccg.
+		 */
+		if (contig) {
+			contig = 0;
+			disk.d_ccg = 0;
+			warnx("Journal file fragmented.");
+			continue;
+		}
+		warnx("Failed to find sufficient free blocks for the journal");
+		return -1;
+	}
+	if (bwrite(&disk, fsbtodb(&sblock, blk), clrbuf,
+	    sblock.fs_bsize) <= 0) {
+		warn("Failed to initialize new block");
+		return -1;
+	}
+	return (blk);
+}
+
+/*
+ * Scan 'bytes' bytes of the directory block at 'blk' for an entry named
+ * SUJ_FILE.
+ *
+ * Returns the inode number of the matching entry, 0 when no entry matches
+ * and (ino_t)-1 when the block can not be read.
+ */
+static ino_t
+dir_search(ufs2_daddr_t blk, int bytes)
+{
+	char dirbuf[MAXBSIZE];
+	struct direct *entry;
+	size_t sujlen;
+	int pos;
+
+	if (bread(&disk, fsbtodb(&sblock, blk), dirbuf, bytes) <= 0) {
+		warn("Failed to read dir block");
+		return (-1);
+	}
+	sujlen = strlen(SUJ_FILE);
+	pos = 0;
+	while (pos < bytes) {
+		entry = (struct direct *)&dirbuf[pos];
+		/* A zero record length would never advance; stop here. */
+		if (entry->d_reclen == 0)
+			break;
+		/* Entries with d_ino == 0 are skipped. */
+		if (entry->d_ino != 0 && entry->d_namlen == sujlen &&
+		    bcmp(entry->d_name, SUJ_FILE, entry->d_namlen) == 0)
+			return (entry->d_ino);
+		pos += entry->d_reclen;
+	}
+
+	return (0);
+}
+
+/*
+ * Search in the ROOTINO for the SUJ_FILE.  If it exists we can not enable
+ * journaling.
+ *
+ * Only the direct blocks of the root inode are examined; a root directory
+ * whose size reaches lblktosize(&sblock, NDADDR) is rejected.
+ *
+ * Returns the inode number of SUJ_FILE when an entry is found, 0 when no
+ * entry is found, and (ino_t)-1 when the root inode or one of its
+ * directory blocks can not be read or the directory is too large.
+ */
+static ino_t
+journal_findfile(void)
+{
+	struct ufs1_dinode *dp1;
+	struct ufs2_dinode *dp2;
+	ino_t ino;
+	int mode;
+	void *ip;
+	int i;
+
+	if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
+		warn("Failed to get root inode");
+		return (-1);
+	}
+	/* The same buffer is viewed as either dinode format. */
+	dp2 = ip;
+	dp1 = ip;
+	if (sblock.fs_magic == FS_UFS1_MAGIC) {
+		if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) {
+			warnx("ROOTINO extends beyond direct blocks.");
+			return (-1);
+		}
+		for (i = 0; i < NDADDR; i++) {
+			if (dp1->di_db[i] == 0)
+				break;
+			/* A dir_search() error ((ino_t)-1) is passed on. */
+			if ((ino = dir_search(dp1->di_db[i],
+			    sblksize(&sblock, (off_t)dp1->di_size, i))) != 0)
+				return (ino);
+		}
+	} else {
+		/*
+		 * The size has to be read through the ufs2 dinode here;
+		 * the ufs1 view of the buffer does not describe it.
+		 */
+		if ((off_t)dp2->di_size >= lblktosize(&sblock, NDADDR)) {
+			warnx("ROOTINO extends beyond direct blocks.");
+			return (-1);
+		}
+		for (i = 0; i < NDADDR; i++) {
+			if (dp2->di_db[i] == 0)
+				break;
+			if ((ino = dir_search(dp2->di_db[i],
+			    sblksize(&sblock, (off_t)dp2->di_size, i))) != 0)
+				return (ino);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Write a directory entry for SUJ_FILE, pointing at inode 'ino', into the
+ * full-size directory block 'blk' at byte offset 'off'.  Everything from
+ * 'off' to the end of the block is cleared first; the new entry occupies
+ * one DIRBLKSIZ record and each following DIRBLKSIZ record is set up as an
+ * empty entry.
+ *
+ * Returns 0 on success and -1 when the block can not be read or written.
+ */
+static int
+dir_insert(ufs2_daddr_t blk, off_t off, ino_t ino)
+{
+	char dirbuf[MAXBSIZE];
+	struct direct *entry;
+	size_t sujlen;
+	off_t pos;
+
+	if (bread(&disk, fsbtodb(&sblock, blk), dirbuf,
+	    sblock.fs_bsize) <= 0) {
+		warn("Failed to read dir block");
+		return (-1);
+	}
+	sujlen = strlen(SUJ_FILE);
+	bzero(&dirbuf[off], sblock.fs_bsize - off);
+
+	/* The journal entry itself. */
+	entry = (struct direct *)&dirbuf[off];
+	entry->d_ino = ino;
+	entry->d_reclen = DIRBLKSIZ;
+	entry->d_type = DT_REG;
+	entry->d_namlen = sujlen;
+	bcopy(SUJ_FILE, &entry->d_name, sujlen);
+
+	/* Empty records for the remainder of the block. */
+	pos = off + DIRBLKSIZ;
+	while (pos < sblock.fs_bsize) {
+		entry = (struct direct *)&dirbuf[pos];
+		entry->d_ino = 0;
+		entry->d_reclen = DIRBLKSIZ;
+		entry->d_type = DT_UNKNOWN;
+		pos += DIRBLKSIZ;
+	}
+	if (bwrite(&disk, fsbtodb(&sblock, blk), dirbuf,
+	    sblock.fs_bsize) <= 0) {
+		warn("Failed to write dir block");
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Copy the first 'size' bytes of directory block 'blk' into the new block
+ * 'nblk', then add the journal entry for inode 'ino' to 'nblk' at offset
+ * 'size' via dir_insert().
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+dir_extend(ufs2_daddr_t blk, ufs2_daddr_t nblk, off_t size, ino_t ino)
+{
+	char copybuf[MAXBSIZE];
+	int error;
+
+	if (bread(&disk, fsbtodb(&sblock, blk), copybuf, size) <= 0) {
+		warn("Failed to read dir block");
+		return (-1);
+	}
+	if (bwrite(&disk, fsbtodb(&sblock, nblk), copybuf, size) <= 0) {
+		warn("Failed to write dir block");
+		return (-1);
+	}
+
+	error = dir_insert(nblk, size, ino);
+	return (error);
+}
+
+/*
+ * Insert the journal file into the ROOTINO directory.  The entry always
+ * goes into a newly allocated full block.  When the root directory size
+ * is not block aligned, the contents of its last block are first copied
+ * into the new block (dir_extend) and the old block is released with
+ * cgbfree() after the root inode and cg have been written.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+journal_insertfile(ino_t ino)
+{
+	struct ufs1_dinode *dp1;
+	struct ufs2_dinode *dp2;
+	void *ip;
+	ufs2_daddr_t nblk;
+	ufs2_daddr_t blk;
+	ufs_lbn_t lbn;
+	int size;
+	int mode;
+	int off;
+
+	if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
+		warn("Failed to get root inode");
+		sbdirty();
+		return (-1);
+	}
+	dp2 = ip;
+	dp1 = ip;
+	blk = 0;
+	size = 0;
+	nblk = journal_balloc();
+	if (nblk <= 0)
+		return (-1);
+	/*
+	 * For simplicity's sake we always extend the ROOTINO into a new
+	 * directory block rather than searching for space and inserting
+	 * into an existing block.  However, if the rootino has frags we
+	 * have to free them and extend the block.
+	 */
+	if (sblock.fs_magic == FS_UFS1_MAGIC) {
+		lbn = lblkno(&sblock, dp1->di_size);
+		off = blkoff(&sblock, dp1->di_size);
+		blk = dp1->di_db[lbn];
+		size = sblksize(&sblock, (off_t)dp1->di_size, lbn);
+	} else {
+		lbn = lblkno(&sblock, dp2->di_size);
+		off = blkoff(&sblock, dp2->di_size);
+		blk = dp2->di_db[lbn];
+		size = sblksize(&sblock, (off_t)dp2->di_size, lbn);
+	}
+	if (off != 0) {
+		if (dir_extend(blk, nblk, off, ino) == -1)
+			return (-1);
+	} else {
+		blk = 0;
+		if (dir_insert(nblk, 0, ino) == -1)
+			return (-1);
+	}
+	if (sblock.fs_magic == FS_UFS1_MAGIC) {
+		dp1->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
+		dp1->di_db[lbn] = nblk;
+		dp1->di_size = lblktosize(&sblock, lbn+1);
+	} else {
+		dp2->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
+		dp2->di_db[lbn] = nblk;
+		dp2->di_size = lblktosize(&sblock, lbn+1);
+	}
+	if (putino(&disk) < 0) {
+		warn("Failed to write root inode");
+		return (-1);
+	}
+	if (cgwrite(&disk) < 0) {
+		warn("Failed to write updated cg");
+		sbdirty();
+		return (-1);
+	}
+	if (blk) {
+		if (cgbfree(&disk, blk, size) < 0) {
+			warn("Failed to write cg");
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Fill the indirect block 'blk' with newly allocated blocks and write it
+ * out.  'level' is the number of further indirection levels below this
+ * block: at level 0 each allocated block consumes one unit of '*resid',
+ * otherwise each allocated block is itself filled recursively.  At most
+ * NINDIR(&sblock) entries are filled, stopping early once '*resid'
+ * reaches zero.
+ *
+ * Returns the number of blocks allocated beneath 'blk' (not counting
+ * 'blk' itself), or -1 on failure.
+ */
+static int
+indir_fill(ufs2_daddr_t blk, int level, int *resid)
+{
+	char indirbuf[MAXBSIZE];
+	ufs1_daddr_t *bap1;
+	ufs2_daddr_t *bap2;
+	ufs2_daddr_t nblk;
+	int ncnt;
+	int cnt;
+	int i;
+
+	bzero(indirbuf, sizeof(indirbuf));
+	bap1 = (ufs1_daddr_t *)indirbuf;
+	bap2 = (void *)bap1;
+	cnt = 0;
+	for (i = 0; i < NINDIR(&sblock) && *resid != 0; i++) {
+		nblk = journal_balloc();
+		if (nblk <= 0)
+			return (-1);
+		cnt++;
+		if (sblock.fs_magic == FS_UFS1_MAGIC)
+			*bap1++ = nblk;
+		else
+			*bap2++ = nblk;
+		if (level != 0) {
+			ncnt = indir_fill(nblk, level - 1, resid);
+			if (ncnt <= 0)
+				return (-1);
+			cnt += ncnt;
+		} else 
+			(*resid)--;
+	}
+	if (bwrite(&disk, fsbtodb(&sblock, blk), indirbuf,
+	    sblock.fs_bsize) <= 0) {
+		warn("Failed to write indirect");
+		return (-1);
+	}
+	return (cnt);
+}
+
+/*
+ * Clear the flag bits so the journal can be removed.
+ *
+ * Looks up SUJ_FILE in the root directory and zeroes di_flags in its
+ * inode.  Failures are reported with a warning only; nothing is returned
+ * to the caller.
+ */
+void
+journal_clear(void)
+{
+	struct ufs1_dinode *dp1;
+	struct ufs2_dinode *dp2;
+	ino_t ino;
+	int mode;
+	void *ip;
+
+	ino = journal_findfile();
+	if (ino == (ino_t)-1 || ino == 0) {
+		warnx("Journal file does not exist");
+		return;
+	}
+	/* ino_t is not an int; convert explicitly to match the format. */
+	printf("Clearing journal flags from inode %lu\n",
+	    (unsigned long)ino);
+	if (getino(&disk, &ip, ino, &mode) != 0) {
+		warn("Failed to get journal inode");
+		return;
+	}
+	dp2 = ip;
+	dp1 = ip;
+	if (sblock.fs_magic == FS_UFS1_MAGIC)
+		dp1->di_flags = 0;
+	else
+		dp2->di_flags = 0;
+	if (putino(&disk) < 0) {
+		warn("Failed to write journal inode");
+		return;
+	}
+}
+
+/*
+ * Create the journal file SUJ_FILE.
+ *
+ * An inode is allocated, 'size' bytes worth of blocks are attached to it
+ * (direct blocks first, then indirect blocks through indir_fill()), the
+ * inode and cg are written, and the file is linked into the root
+ * directory with journal_insertfile().  A 'size' of 0 selects a default
+ * derived from the filesystem size.
+ *
+ * Returns 0 on success and -1 on failure.  Several failure paths call
+ * sbdirty() because on-disk state may already have been modified.
+ */
+int
+journal_alloc(int64_t size)
+{
+	struct ufs1_dinode *dp1;
+	struct ufs2_dinode *dp2;
+	ufs2_daddr_t blk;
+	void *ip;
+	struct cg *cgp;
+	int resid;
+	ino_t ino;
+	int blks;
+	int mode;
+	int i;
+
+	cgp = &disk.d_cg;
+
+	/*
+	 * If the journal file exists we can't allocate it.
+	 */
+	ino = journal_findfile();
+	if (ino == (ino_t)-1)
+		return (-1);
+	if (ino > 0) {
+		warnx("Journal file %s already exists, please remove.",
+		    SUJ_FILE);
+		return (-1);
+	}
+	/*
+	 * If the user didn't supply a size pick one based on the filesystem
+	 * size constrained with hardcoded MIN and MAX values.  We opt for
+	 * 1/1024th of the filesystem up to MAX but not exceeding one CG and
+	 * not less than the MIN.
+	 */
+	if (size == 0) {
+		size = (sblock.fs_size * sblock.fs_bsize) / 1024;
+		size = MIN(SUJ_MAX, size);
+		if (size / sblock.fs_fsize > sblock.fs_fpg)
+			size = sblock.fs_fpg * sblock.fs_fsize;
+		size = MAX(SUJ_MIN, size);
+	}
+	resid = blocks = size / sblock.fs_bsize;
+	if (sblock.fs_cstotal.cs_nbfree < blocks) {
+		/* Not an errno failure, so warnx() rather than warn(). */
+		warnx("Insufficient free space for %jd byte journal", size);
+		return (-1);
+	}
+	/*
+	 * Walk the cylinder groups until one has a free inode for the
+	 * journal file.  The data blocks come from journal_balloc().
+	 */
+	while (cgread(&disk) == 1) {
+		if (cgp->cg_cs.cs_nifree == 0)
+			continue;
+		ino = cgialloc(&disk);
+		if (ino <= 0)
+			break;
+		/* ino_t is not an int; convert explicitly for printf. */
+		printf("Using inode %lu in cg %d for %jd byte journal\n",
+		    (unsigned long)ino, cgp->cg_cgx, size);
+		if (getino(&disk, &ip, ino, &mode) != 0) {
+			warn("Failed to get allocated inode");
+			sbdirty();
+			goto out;
+		}
+		/*
+		 * We leave fields unrelated to the number of allocated
+		 * blocks and size uninitialized.  This causes legacy
+		 * fsck implementations to clear the inode.
+		 */
+		dp2 = ip;
+		dp1 = ip;
+		if (sblock.fs_magic == FS_UFS1_MAGIC) {
+			bzero(dp1, sizeof(*dp1));
+			dp1->di_size = size;
+			dp1->di_mode = IFREG | IREAD;
+			dp1->di_nlink = 1;
+			dp1->di_flags = SF_IMMUTABLE | SF_NOUNLINK | UF_NODUMP;
+		} else {
+			bzero(dp2, sizeof(*dp2));
+			dp2->di_size = size;
+			dp2->di_mode = IFREG | IREAD;
+			dp2->di_nlink = 1;
+			dp2->di_flags = SF_IMMUTABLE | SF_NOUNLINK | UF_NODUMP;
+		}
+		/* Direct blocks: one journal block per di_db slot. */
+		for (i = 0; i < NDADDR && resid; i++, resid--) {
+			blk = journal_balloc();
+			if (blk <= 0)
+				goto out;
+			if (sblock.fs_magic == FS_UFS1_MAGIC) {
+				dp1->di_db[i] = blk;
+				dp1->di_blocks++;
+			} else {
+				dp2->di_db[i] = blk;
+				dp2->di_blocks++;
+			}
+		}
+		/*
+		 * Indirect blocks: slot i is filled with i further levels
+		 * of indirection.  The "+ 1" counts the indirect block
+		 * itself and turns indir_fill()'s -1 error into 0.
+		 */
+		for (i = 0; i < NIADDR && resid; i++) {
+			blk = journal_balloc();
+			if (blk <= 0)
+				goto out;
+			blks = indir_fill(blk, i, &resid) + 1;
+			if (blks <= 0) {
+				sbdirty();
+				goto out;
+			}
+			if (sblock.fs_magic == FS_UFS1_MAGIC) {
+				dp1->di_ib[i] = blk;
+				dp1->di_blocks += blks;
+			} else {
+				dp2->di_ib[i] = blk;
+				dp2->di_blocks += blks;
+			}
+		}
+		/* di_blocks was counted in fs blocks; scale by d_bsize. */
+		if (sblock.fs_magic == FS_UFS1_MAGIC)
+			dp1->di_blocks *= sblock.fs_bsize / disk.d_bsize;
+		else
+			dp2->di_blocks *= sblock.fs_bsize / disk.d_bsize;
+		if (putino(&disk) < 0) {
+			warn("Failed to write inode");
+			sbdirty();
+			return (-1);
+		}
+		if (cgwrite(&disk) < 0) {
+			warn("Failed to write updated cg");
+			sbdirty();
+			return (-1);
+		}
+		if (journal_insertfile(ino) < 0) {
+			sbdirty();
+			return (-1);
+		}
+		sblock.fs_sujfree = 0;
+		return (0);
+	}
+	warnx("Insufficient free space for the journal.");
+out:
+	return (-1);
+}
+
 void
 usage(void)
 {
@ -477,6 +1026,8 @@ printfs(void)
 		(sblock.fs_flags & FS_MULTILABEL)? "enabled" : "disabled");
 	warnx("soft updates: (-n)                                 %s", 
 		(sblock.fs_flags & FS_DOSOFTDEP)? "enabled" : "disabled");
+	warnx("soft update journaling: (-j)                       %s", 
+		(sblock.fs_flags & FS_SUJ)? "enabled" : "disabled");
 	warnx("gjournal: (-J)                                     %s",
 		(sblock.fs_flags & FS_GJOURNAL)? "enabled" : "disabled");
 	warnx("maximum blocks per file in a cylinder group: (-e)  %d",
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@ -215,6 +215,14 @@ SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
 */
 static int bd_request;

+/*
+ * Request for the buf daemon to write more buffers than is indicated by
+ * lodirtybuf.  This may be necessary to push out excess dependencies or
+ * defragment the address space where a simple count of the number of dirty
+ * buffers is insufficient to characterize the demand for flushing them.
+ */
+static int bd_speedupreq;
+
 /*
 * This lock synchronizes access to bd_request.
 */
@ -467,12 +475,20 @@ bd_wakeup(int dirtybuflevel)
 * bd_speedup - speedup the buffer cache flushing code
 */

-static __inline
 void
 bd_speedup(void)
 {
+	int needwake;

-	bd_wakeup(1);
+	mtx_lock(&bdlock);
+	needwake = 0;
+	if (bd_speedupreq == 0 || bd_request == 0)
+		needwake = 1;
+	bd_speedupreq = 1;
+	bd_request = 1;
+	if (needwake)
+		wakeup(&bd_request);
+	mtx_unlock(&bdlock);
 }

 /*
@ -2120,6 +2136,7 @@ buf_do_flush(struct vnode *vp)
 static void
 buf_daemon()
 {
+	int lodirtysave;

 	/*
 	 * This process needs to be suspended prior to shutdown sync.
@ -2137,7 +2154,11 @@ buf_daemon()
 		mtx_unlock(&bdlock);

 		kproc_suspend_check(bufdaemonproc);
-
+		lodirtysave = lodirtybuffers;
+		if (bd_speedupreq) {
+			lodirtybuffers = numdirtybuffers / 2;
+			bd_speedupreq = 0;
+		}
 		/*
 		 * Do the flush.  Limit the amount of in-transit I/O we
 		 * allow to build up, otherwise we would completely saturate
@ -2149,6 +2170,7 @@ buf_daemon()
 				break;
 			uio_yield();
 		}
+		lodirtybuffers = lodirtysave;

 		/*
 		 * Only clear bd_request if we have reached our low water
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@ -2815,6 +2815,7 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	MNT_FLAG(MNT_FORCE);
 	MNT_FLAG(MNT_SNAPSHOT);
 	MNT_FLAG(MNT_BYFSID);
+	MNT_FLAG(MNT_SOFTDEP);
 #undef MNT_FLAG
 	if (flags != 0) {
 		if (buf[0] != '\0')
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@ -215,7 +215,7 @@ struct buf {
 #define	B_DIRTY		0x00200000	/* Needs writing later (in EXT2FS). */
 #define	B_RELBUF	0x00400000	/* Release VMIO buffer. */
 #define	B_00800000	0x00800000	/* Available flag. */
-#define	B_01000000	0x01000000	/* Available flag. */
+#define	B_NOCOPY	0x01000000	/* Don't copy-on-write this buf. */
 #define	B_NEEDSGIANT	0x02000000	/* Buffer's vnode needs giant. */
 #define	B_PAGING	0x04000000	/* volatile paging I/O -- bypass VMIO */
 #define B_MANAGED	0x08000000	/* Managed by FS. */
@ -493,6 +493,7 @@ int	bufwait(struct buf *);
 int	bufwrite(struct buf *);
 void	bufdone(struct buf *);
 void	bufdone_finish(struct buf *);
+void	bd_speedup(void);

 int	cluster_read(struct vnode *, u_quad_t, daddr_t, long,
 	    struct ucred *, long, int, struct buf **);
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@ -275,7 +275,8 @@ void          __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);
 			MNT_ROOTFS	| MNT_NOATIME	| MNT_NOCLUSTERR| \
 			MNT_NOCLUSTERW	| MNT_SUIDDIR	| MNT_SOFTDEP	| \
 			MNT_IGNORE	| MNT_EXPUBLIC	| MNT_NOSYMFOLLOW | \
-			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS | MNT_NFS4ACLS)
+			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS	| \
+			MNT_NFS4ACLS)

 /* Mask of flags that can be updated. */
 #define	MNT_UPDATEMASK (MNT_NOSUID	| MNT_NOEXEC	| \
@ -324,6 +325,7 @@ void          __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);
 #define	MNTK_REFEXPIRE	0x00000020	/* refcount expiring is happening */
 #define MNTK_EXTENDED_SHARED	0x00000040 /* Allow shared locking for more ops */
 #define	MNTK_SHARED_WRITES	0x00000080 /* Allow shared locking for writes */
+#define	MNTK_SUJ	0x00000100	/* Softdep journaling enabled */
 #define MNTK_UNMOUNT	0x01000000	/* unmount in progress */
 #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
 #define	MNTK_SUSPEND	0x08000000	/* request write suspension */
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@ -94,24 +94,24 @@ __FBSDID("$FreeBSD$");
 #include <ufs/ffs/ffs_extern.h>

 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
-				  int size);
+				  int size, int rsize);

-static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int);
+static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
 static ufs2_daddr_t
-	      ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t);
+	      ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
 #ifdef INVARIANTS
 static int	ffs_checkblk(struct inode *, ufs2_daddr_t, long);
 #endif
-static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
-static void	ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *,
-		    ufs1_daddr_t, int);
+static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int,
+		    int);
 static ino_t	ffs_dirpref(struct inode *);
 static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
 		    int, int);
 static void	ffs_fserr(struct fs *, ino_t, char *);
 static ufs2_daddr_t	ffs_hashalloc
-		(struct inode *, u_int, ufs2_daddr_t, int, allocfcn_t *);
-static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int);
+		(struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
+static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
+		    int);
 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
 static int	ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
 static int	ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
@ -188,7 +188,7 @@ retry:
 		cg = ino_to_cg(fs, ip->i_number);
 	else
 		cg = dtog(fs, bpref);
-	bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
+	bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
 	if (bno > 0) {
 		delta = btodb(size);
 		if (ip->i_flag & IN_SPACECOUNTED) {
@ -387,16 +387,12 @@ retry:
 		panic("ffs_realloccg: bad optim");
 		/* NOTREACHED */
 	}
-	bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg);
+	bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
 	if (bno > 0) {
 		bp->b_blkno = fsbtodb(fs, bno);
 		if (!DOINGSOFTDEP(vp))
 			ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize,
-			    ip->i_number);
-		if (nsize < request)
-			ffs_blkfree(ump, fs, ip->i_devvp,
-			    bno + numfrags(fs, nsize),
-			    (long)(request - nsize), ip->i_number);
+			    ip->i_number, NULL);
 		delta = btodb(nsize - osize);
 		if (ip->i_flag & IN_SPACECOUNTED) {
 			UFS_LOCK(ump);
@ -487,6 +483,14 @@ ffs_reallocblks(ap)

 	if (doreallocblks == 0)
 		return (ENOSPC);
+	/*
+	 * We can't wait in softdep prealloc as it may fsync and recurse
+	 * here.  Instead we simply fail to reallocate blocks if this
+	 * rare condition arises.
+	 */
+	if (DOINGSOFTDEP(ap->a_vp))
+		if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
+			return (ENOSPC);
 	if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1)
 		return (ffs_reallocblks_ufs1(ap));
 	return (ffs_reallocblks_ufs2(ap));
@ -587,7 +591,7 @@ ffs_reallocblks_ufs1(ap)
 	 * Search the block map looking for an allocation of the desired size.
 	 */
 	if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
-	    len, ffs_clusteralloc)) == 0) {
+	    len, len, ffs_clusteralloc)) == 0) {
 		UFS_UNLOCK(ump);
 		goto fail;
 	}
@ -673,7 +677,7 @@ ffs_reallocblks_ufs1(ap)
 		if (!DOINGSOFTDEP(vp))
 			ffs_blkfree(ump, fs, ip->i_devvp,
 			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-			    fs->fs_bsize, ip->i_number);
+			    fs->fs_bsize, ip->i_number, NULL);
 		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip,
@ -795,7 +799,7 @@ ffs_reallocblks_ufs2(ap)
 	 * Search the block map looking for an allocation of the desired size.
 	 */
 	if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
-	    len, ffs_clusteralloc)) == 0) {
+	    len, len, ffs_clusteralloc)) == 0) {
 		UFS_UNLOCK(ump);
 		goto fail;
 	}
@ -881,7 +885,7 @@ ffs_reallocblks_ufs2(ap)
 		if (!DOINGSOFTDEP(vp))
 			ffs_blkfree(ump, fs, ip->i_devvp,
 			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-			    fs->fs_bsize, ip->i_number);
+			    fs->fs_bsize, ip->i_number, NULL);
 		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip,
@ -969,7 +973,7 @@ ffs_valloc(pvp, mode, cred, vpp)
 		if (fs->fs_contigdirs[cg] > 0)
 			fs->fs_contigdirs[cg]--;
 	}
-	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode,
+	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
 					(allocfcn_t *)ffs_nodealloccg);
 	if (ino == 0)
 		goto noinodes;
@ -1278,11 +1282,12 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap)
 */
 /*VARARGS5*/
 static ufs2_daddr_t
-ffs_hashalloc(ip, cg, pref, size, allocator)
+ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t pref;
-	int size;	/* size for data blocks, mode for inodes */
+	int size;	/* Search size for data blocks, mode for inodes */
+	int rsize;	/* Real allocated size. */
 	allocfcn_t *allocator;
 {
 	struct fs *fs;
@ -1298,7 +1303,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
 	/*
 	 * 1: preferred cylinder group
 	 */
-	result = (*allocator)(ip, cg, pref, size);
+	result = (*allocator)(ip, cg, pref, size, rsize);
 	if (result)
 		return (result);
 	/*
@ -1308,7 +1313,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
 		cg += i;
 		if (cg >= fs->fs_ncg)
 			cg -= fs->fs_ncg;
-		result = (*allocator)(ip, cg, 0, size);
+		result = (*allocator)(ip, cg, 0, size, rsize);
 		if (result)
 			return (result);
 	}
@ -1319,7 +1324,7 @@ ffs_hashalloc(ip, cg, pref, size, allocator)
 	 */
 	cg = (icg + 2) % fs->fs_ncg;
 	for (i = 2; i < fs->fs_ncg; i++) {
-		result = (*allocator)(ip, cg, 0, size);
+		result = (*allocator)(ip, cg, 0, size, rsize);
 		if (result)
 			return (result);
 		cg++;
@ -1401,7 +1406,8 @@ ffs_fragextend(ip, cg, bprev, osize, nsize)
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev);
+		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
+		    frags, numfrags(fs, osize));
 	bdwrite(bp);
 	return (bprev);

@ -1419,11 +1425,12 @@ fail:
 * and if it is, allocate it.
 */
 static ufs2_daddr_t
-ffs_alloccg(ip, cg, bpref, size)
+ffs_alloccg(ip, cg, bpref, size, rsize)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t bpref;
 	int size;
+	int rsize;
 {
 	struct fs *fs;
 	struct cg *cgp;
@ -1451,7 +1458,7 @@ ffs_alloccg(ip, cg, bpref, size)
 	cgp->cg_old_time = cgp->cg_time = time_second;
 	if (size == fs->fs_bsize) {
 		UFS_LOCK(ump);
-		blkno = ffs_alloccgblk(ip, bp, bpref);
+		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
 		ACTIVECLEAR(fs, cg);
 		UFS_UNLOCK(ump);
 		bdwrite(bp);
@ -1475,21 +1482,14 @@ ffs_alloccg(ip, cg, bpref, size)
 		if (cgp->cg_cs.cs_nbfree == 0)
 			goto fail;
 		UFS_LOCK(ump);
-		blkno = ffs_alloccgblk(ip, bp, bpref);
-		bno = dtogd(fs, blkno);
-		for (i = frags; i < fs->fs_frag; i++)
-			setbit(blksfree, bno + i);
-		i = fs->fs_frag - frags;
-		cgp->cg_cs.cs_nffree += i;
-		fs->fs_cstotal.cs_nffree += i;
-		fs->fs_cs(fs, cg).cs_nffree += i;
-		fs->fs_fmod = 1;
-		cgp->cg_frsum[i]++;
+		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
 		ACTIVECLEAR(fs, cg);
 		UFS_UNLOCK(ump);
 		bdwrite(bp);
 		return (blkno);
 	}
+	KASSERT(size == rsize,
+	    ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
 	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
 	if (bno < 0)
 		goto fail;
@ -1507,7 +1507,7 @@ ffs_alloccg(ip, cg, bpref, size)
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
+		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
 	bdwrite(bp);
 	return (blkno);

@ -1529,10 +1529,11 @@ fail:
 * blocks may be fragmented by the routine that allocates them.
 */
 static ufs2_daddr_t
-ffs_alloccgblk(ip, bp, bpref)
+ffs_alloccgblk(ip, bp, bpref, size)
 	struct inode *ip;
 	struct buf *bp;
 	ufs2_daddr_t bpref;
+	int size;
 {
 	struct fs *fs;
 	struct cg *cgp;
@ -1540,6 +1541,7 @@ ffs_alloccgblk(ip, bp, bpref)
 	ufs1_daddr_t bno;
 	ufs2_daddr_t blkno;
 	u_int8_t *blksfree;
+	int i;

 	fs = ip->i_fs;
 	ump = ip->i_ump;
@ -1567,16 +1569,32 @@ ffs_alloccgblk(ip, bp, bpref)
 gotit:
 	blkno = fragstoblks(fs, bno);
 	ffs_clrblock(fs, blksfree, (long)blkno);
-	ffs_clusteracct(ump, fs, cgp, blkno, -1);
+	ffs_clusteracct(fs, cgp, blkno, -1);
 	cgp->cg_cs.cs_nbfree--;
 	fs->fs_cstotal.cs_nbfree--;
 	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
 	fs->fs_fmod = 1;
 	blkno = cgbase(fs, cgp->cg_cgx) + bno;
+	/*
+	 * If the caller didn't want the whole block free the frags here.
+	 */
+	size = numfrags(fs, size);
+	if (size != fs->fs_frag) {
+		bno = dtogd(fs, blkno);
+		for (i = size; i < fs->fs_frag; i++)
+			setbit(blksfree, bno + i);
+		i = fs->fs_frag - size;
+		cgp->cg_cs.cs_nffree += i;
+		fs->fs_cstotal.cs_nffree += i;
+		fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
+		fs->fs_fmod = 1;
+		cgp->cg_frsum[i]++;
+	}
 	/* XXX Fixme. */
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
+		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
+		    size, 0);
 	UFS_LOCK(ump);
 	return (blkno);
 }
@ -1589,11 +1607,12 @@ gotit:
 * take the first one that we find following bpref.
 */
 static ufs2_daddr_t
-ffs_clusteralloc(ip, cg, bpref, len)
+ffs_clusteralloc(ip, cg, bpref, len, unused)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t bpref;
 	int len;
+	int unused;
 {
 	struct fs *fs;
 	struct cg *cgp;
@ -1689,7 +1708,7 @@ ffs_clusteralloc(ip, cg, bpref, len)
 	len = blkstofrags(fs, len);
 	UFS_LOCK(ump);
 	for (i = 0; i < len; i += fs->fs_frag)
-		if (ffs_alloccgblk(ip, bp, bno + i) != bno + i)
+		if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
 			panic("ffs_clusteralloc: lost block");
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
@ -1713,11 +1732,12 @@ fail:
 *      inode in the specified cylinder group.
 */
 static ufs2_daddr_t
-ffs_nodealloccg(ip, cg, ipref, mode)
+ffs_nodealloccg(ip, cg, ipref, mode, unused)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t ipref;
 	int mode;
+	int unused;
 {
 	struct fs *fs;
 	struct cg *cgp;
@ -1819,28 +1839,6 @@ gotit:
 	return ((ino_t)(cg * fs->fs_ipg + ipref));
 }

-/*
- * check if a block is free
- */
-static int
-ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
-{
-
-	switch ((int)fs->fs_frag) {
-	case 8:
-		return (cp[h] == 0);
-	case 4:
-		return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
-	case 2:
-		return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
-	case 1:
-		return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
-	default:
-		panic("ffs_isfreeblock");
-	}
-	return (0);
-}
-
 /*
 * Free a block or fragment.
 *
@ -1849,14 +1847,16 @@ ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
 * block reassembly is checked.
 */
 void
-ffs_blkfree(ump, fs, devvp, bno, size, inum)
+ffs_blkfree(ump, fs, devvp, bno, size, inum, dephd)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
+	struct workhead *dephd;
 {
+	struct mount *mp;
 	struct cg *cgp;
 	struct buf *bp;
 	ufs1_daddr_t fragno, cgbno;
@ -1923,7 +1923,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
 			panic("ffs_blkfree: freeing free block");
 		}
 		ffs_setblock(fs, blksfree, fragno);
-		ffs_clusteracct(ump, fs, cgp, fragno, 1);
+		ffs_clusteracct(fs, cgp, fragno, 1);
 		cgp->cg_cs.cs_nbfree++;
 		fs->fs_cstotal.cs_nbfree++;
 		fs->fs_cs(fs, cg).cs_nbfree++;
@ -1963,7 +1963,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
-			ffs_clusteracct(ump, fs, cgp, fragno, 1);
+			ffs_clusteracct(fs, cgp, fragno, 1);
 			cgp->cg_cs.cs_nbfree++;
 			fs->fs_cstotal.cs_nbfree++;
 			fs->fs_cs(fs, cg).cs_nbfree++;
@ -1972,6 +1972,10 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum)
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
+	mp = UFSTOVFS(ump);
+	if (mp->mnt_flag & MNT_SOFTDEP && devvp->v_type != VREG)
+		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
+		    numfrags(fs, size), dephd);
 	bdwrite(bp);
 }

@ -2042,7 +2046,8 @@ ffs_vfree(pvp, ino, mode)
 		return (0);
 	}
 	ip = VTOI(pvp);
-	return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode));
+	return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode,
+	    NULL));
 }

 /*
@ -2050,12 +2055,13 @@ ffs_vfree(pvp, ino, mode)
 * The specified inode is placed back in the free map.
 */
 int
-ffs_freefile(ump, fs, devvp, ino, mode)
+ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ino_t ino;
 	int mode;
+	struct workhead *wkhd;
 {
 	struct cg *cgp;
 	struct buf *bp;
@ -2112,6 +2118,9 @@ ffs_freefile(ump, fs, devvp, ino, mode)
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
+	if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP && devvp->v_type != VREG)
+		softdep_setup_inofree(UFSTOVFS(ump), bp,
+		    ino + cg * fs->fs_ipg, wkhd);
 	bdwrite(bp);
 	return (0);
 }
@ -2225,101 +2234,6 @@ ffs_mapsearch(fs, cgp, bpref, allocsiz)
 	return (-1);
 }

-/*
- * Update the cluster map because of an allocation or free.
- *
- * Cnt == 1 means free; cnt == -1 means allocating.
- */
-void
-ffs_clusteracct(ump, fs, cgp, blkno, cnt)
-	struct ufsmount *ump;
-	struct fs *fs;
-	struct cg *cgp;
-	ufs1_daddr_t blkno;
-	int cnt;
-{
-	int32_t *sump;
-	int32_t *lp;
-	u_char *freemapp, *mapp;
-	int i, start, end, forw, back, map, bit;
-
-	mtx_assert(UFS_MTX(ump), MA_OWNED);
-
-	if (fs->fs_contigsumsize <= 0)
-		return;
-	freemapp = cg_clustersfree(cgp);
-	sump = cg_clustersum(cgp);
-	/*
-	 * Allocate or clear the actual block.
-	 */
-	if (cnt > 0)
-		setbit(freemapp, blkno);
-	else
-		clrbit(freemapp, blkno);
-	/*
-	 * Find the size of the cluster going forward.
-	 */
-	start = blkno + 1;
-	end = start + fs->fs_contigsumsize;
-	if (end >= cgp->cg_nclusterblks)
-		end = cgp->cg_nclusterblks;
-	mapp = &freemapp[start / NBBY];
-	map = *mapp++;
-	bit = 1 << (start % NBBY);
-	for (i = start; i < end; i++) {
-		if ((map & bit) == 0)
-			break;
-		if ((i & (NBBY - 1)) != (NBBY - 1)) {
-			bit <<= 1;
-		} else {
-			map = *mapp++;
-			bit = 1;
-		}
-	}
-	forw = i - start;
-	/*
-	 * Find the size of the cluster going backward.
-	 */
-	start = blkno - 1;
-	end = start - fs->fs_contigsumsize;
-	if (end < 0)
-		end = -1;
-	mapp = &freemapp[start / NBBY];
-	map = *mapp--;
-	bit = 1 << (start % NBBY);
-	for (i = start; i > end; i--) {
-		if ((map & bit) == 0)
-			break;
-		if ((i & (NBBY - 1)) != 0) {
-			bit >>= 1;
-		} else {
-			map = *mapp--;
-			bit = 1 << (NBBY - 1);
-		}
-	}
-	back = start - i;
-	/*
-	 * Account for old cluster and the possibly new forward and
-	 * back clusters.
-	 */
-	i = back + forw + 1;
-	if (i > fs->fs_contigsumsize)
-		i = fs->fs_contigsumsize;
-	sump[i] += cnt;
-	if (back > 0)
-		sump[back] -= cnt;
-	if (forw > 0)
-		sump[forw] -= cnt;
-	/*
-	 * Update cluster summary information.
-	 */
-	lp = &sump[fs->fs_contigsumsize];
-	for (i = fs->fs_contigsumsize; i > 0; i--)
-		if (*lp-- > 0)
-			break;
-	fs->fs_maxcluster[cgp->cg_cgx] = i;
-}
-
 /*
 * Fserr prints the name of a filesystem with an error diagnostic.
 *
@ -2540,7 +2454,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 #endif /* DEBUG */
 		while (cmd.size > 0) {
 			if ((error = ffs_freefile(ump, fs, ump->um_devvp,
-			    cmd.value, filetype)))
+			    cmd.value, filetype, NULL)))
 				break;
 			cmd.size -= 1;
 			cmd.value += 1;
@ -2568,7 +2482,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 			if (blksize > blkcnt)
 				blksize = blkcnt;
 			ffs_blkfree(ump, fs, ump->um_devvp, blkno,
-			    blksize * fs->fs_fsize, ROOTINO);
+			    blksize * fs->fs_fsize, ROOTINO, NULL);
 			blkno += blksize;
 			blkcnt -= blksize;
 			blksize = fs->fs_frag;
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@ -120,6 +120,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
 	if (lbn < 0)
 		return (EFBIG);

+	if (DOINGSOFTDEP(vp))
+		softdep_prealloc(vp, MNT_WAIT);
 	/*
 	 * If the next write will extend the file into a new block,
 	 * and the file is currently composed of a fragment
@ -418,6 +420,8 @@ fail:
 	 * slow, running out of disk space is not expected to be a common
 	 * occurence. The error return from fsync is ignored as we already
 	 * have an error to return to the user.
+	 *
+	 * XXX Still have to journal the free below
 	 */
 	(void) ffs_syncvnode(vp, MNT_WAIT);
 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@ -473,7 +477,7 @@ fail:
 	 */
 	for (blkp = allociblk; blkp < allocblk; blkp++) {
 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number);
+		    ip->i_number, NULL);
 	}
 	return (error);
 }
@ -515,6 +519,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
 	if (lbn < 0)
 		return (EFBIG);

+	if (DOINGSOFTDEP(vp))
+		softdep_prealloc(vp, MNT_WAIT);
+	
 	/*
 	 * Check for allocating external data.
 	 */
@ -930,6 +937,8 @@ fail:
 	 * slow, running out of disk space is not expected to be a common
 	 * occurence. The error return from fsync is ignored as we already
 	 * have an error to return to the user.
+	 *
+	 * XXX Still have to journal the free below
 	 */
 	(void) ffs_syncvnode(vp, MNT_WAIT);
 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
@ -985,7 +994,7 @@ fail:
 	 */
 	for (blkp = allociblk; blkp < allocblk; blkp++) {
 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number);
+		    ip->i_number, NULL);
 	}
 	return (error);
 }
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@ -47,6 +47,7 @@ struct ucred;
 struct vnode;
 struct vop_fsync_args;
 struct vop_reallocblks_args;
+struct workhead;

 int	ffs_alloc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, int, int,
 	    struct ucred *, ufs2_daddr_t *);
@ -56,20 +57,23 @@ int	ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size,
            struct ucred *a_cred, int a_flags, struct buf **a_bpp);
 int	ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
 void	ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
-	    ufs2_daddr_t, long, ino_t);
+	    ufs2_daddr_t, long, ino_t, struct workhead *);
 ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
 ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
 int	ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
 void	ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
+void	ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
 void	ffs_bdflush(struct bufobj *, struct buf *);
 int	ffs_copyonwrite(struct vnode *, struct buf *);
 int	ffs_flushfiles(struct mount *, int, struct thread *);
 void	ffs_fragacct(struct fs *, int, int32_t [], int);
 int	ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t,
-	    int);
+	    int, struct workhead *);
 int	ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
+int	ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t);
 void	ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
 int	ffs_mountroot(void);
+void	ffs_oldfscompat_write(struct fs *, struct ufsmount *);
 int	ffs_reallocblks(struct vop_reallocblks_args *);
 int	ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
 	    ufs2_daddr_t, int, int, int, struct ucred *, struct buf **);
@ -103,12 +107,14 @@ extern struct vop_vector ffs_fifoops2;

 int	softdep_check_suspend(struct mount *, struct vnode *,
 	  int, int, int, int);
+int	softdep_complete_trunc(struct vnode *, void *);
 void	softdep_get_depcounts(struct mount *, int *, int *);
 void	softdep_initialize(void);
 void	softdep_uninitialize(void);
 int	softdep_mount(struct vnode *, struct mount *, struct fs *,
 	    struct ucred *);
-void	softdep_move_dependencies(struct buf *, struct buf *);
+void	softdep_unmount(struct mount *);
+int	softdep_move_dependencies(struct buf *, struct buf *);
 int	softdep_flushworklist(struct mount *, int *, struct thread *);
 int	softdep_flushfiles(struct mount *, int, struct thread *);
 void	softdep_update_inodeblock(struct inode *, struct buf *, int);
@ -117,7 +123,8 @@ void	softdep_freefile(struct vnode *, ino_t, int);
 int	softdep_request_cleanup(struct fs *, struct vnode *);
 void	softdep_setup_freeblocks(struct inode *, off_t, int);
 void	softdep_setup_inomapdep(struct buf *, struct inode *, ino_t);
-void	softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t);
+void	softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t,
+	    int, int);
 void	softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t,
 	    ufs2_daddr_t, long, long, struct buf *);
 void	softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t,
@ -126,11 +133,20 @@ void	softdep_setup_allocindir_meta(struct buf *, struct inode *,
 	    struct buf *, int, ufs2_daddr_t);
 void	softdep_setup_allocindir_page(struct inode *, ufs_lbn_t,
 	    struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *);
+void	softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int,
+	    struct workhead *);
+void	softdep_setup_inofree(struct mount *, struct buf *, ino_t,
+	    struct workhead *);
+void	softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *);
+void 	*softdep_setup_trunc(struct vnode *vp, off_t length, int flags);
 void	softdep_fsync_mountdev(struct vnode *);
 int	softdep_sync_metadata(struct vnode *);
 int     softdep_process_worklist(struct mount *, int);
 int     softdep_fsync(struct vnode *);
 int	softdep_waitidle(struct mount *);
+int	softdep_prealloc(struct vnode *, int);
+int	softdep_journal_lookup(struct mount *, struct vnode **);
+

 int	ffs_rdonly(struct inode *);

--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@ -92,15 +92,6 @@ ffs_update(vp, waitfor)
 	fs = ip->i_fs;
 	if (fs->fs_ronly)
 		return (0);
-	/*
-	 * Ensure that uid and gid are correct. This is a temporary
-	 * fix until fsck has been changed to do the update.
-	 */
-	if (fs->fs_magic == FS_UFS1_MAGIC &&		/* XXX */
-	    fs->fs_old_inodefmt < FS_44INODEFMT) {	/* XXX */
-		ip->i_din1->di_ouid = ip->i_uid;	/* XXX */
-		ip->i_din1->di_ogid = ip->i_gid;	/* XXX */
-	}						/* XXX */
 	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		(int)fs->fs_bsize, NOCRED, &bp);
 	if (error) {
@ -160,6 +151,7 @@ ffs_truncate(vp, length, flags, cred, td)
 	ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR];
 	ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
 	ufs2_daddr_t count, blocksreleased = 0, datablocks;
+	void *cookie;
 	struct bufobj *bo;
 	struct fs *fs;
 	struct buf *bp;
@ -173,11 +165,14 @@ ffs_truncate(vp, length, flags, cred, td)
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	bo = &vp->v_bufobj;
+	cookie = NULL;

 	ASSERT_VOP_LOCKED(vp, "ffs_truncate");

 	if (length < 0)
 		return (EINVAL);
+	if (length > fs->fs_maxfilesize)
+		return (EFBIG);
 	/*
 	 * Historically clients did not have to specify which data
 	 * they were truncating. So, if not specified, we assume
@ -192,6 +187,7 @@ ffs_truncate(vp, length, flags, cred, td)
 	 * (e.g., the file is being unlinked), then pick it off with
 	 * soft updates below.
 	 */
+	allerror = 0;
 	needextclean = 0;
 	softdepslowdown = DOINGSOFTDEP(vp) && softdep_slowdown(vp);
 	extblocks = 0;
@ -212,6 +208,8 @@ ffs_truncate(vp, length, flags, cred, td)
 				panic("ffs_truncate: partial trunc of extdata");
 			if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
 				return (error);
+			if (DOINGSUJ(vp))
+				cookie = softdep_setup_trunc(vp, length, flags);
 			osize = ip->i_din2->di_extsize;
 			ip->i_din2->di_blocks -= extblocks;
 #ifdef QUOTA
@ -227,19 +225,19 @@ ffs_truncate(vp, length, flags, cred, td)
 			}
 			ip->i_flag |= IN_CHANGE;
 			if ((error = ffs_update(vp, 1)))
-				return (error);
+				goto out;
 			for (i = 0; i < NXADDR; i++) {
 				if (oldblks[i] == 0)
 					continue;
 				ffs_blkfree(ump, fs, ip->i_devvp, oldblks[i],
-				    sblksize(fs, osize, i), ip->i_number);
+				    sblksize(fs, osize, i), ip->i_number, NULL);
 			}
 		}
 	}
-	if ((flags & IO_NORMAL) == 0)
-		return (0);
-	if (length > fs->fs_maxfilesize)
-		return (EFBIG);
+	if ((flags & IO_NORMAL) == 0) {
+		error = 0;
+		goto out;
+	}
 	if (vp->v_type == VLNK &&
 	    (ip->i_size < vp->v_mount->mnt_maxsymlinklen ||
 	     datablocks == 0)) {
@ -253,24 +251,52 @@ ffs_truncate(vp, length, flags, cred, td)
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		if (needextclean)
 			softdep_setup_freeblocks(ip, length, IO_EXT);
-		return (ffs_update(vp, 1));
+		error = ffs_update(vp, 1);
+		goto out;
 	}
 	if (ip->i_size == length) {
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		if (needextclean)
 			softdep_setup_freeblocks(ip, length, IO_EXT);
-		return (ffs_update(vp, 0));
+		error = ffs_update(vp, 0);
+		goto out;
 	}
 	if (fs->fs_ronly)
 		panic("ffs_truncate: read-only filesystem");
 #ifdef QUOTA
 	error = getinoquota(ip);
 	if (error)
-		return (error);
+		goto out;
 #endif
 	if ((ip->i_flags & SF_SNAPSHOT) != 0)
 		ffs_snapremove(vp);
 	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+	osize = ip->i_size;
+	/*
+	 * Lengthen the size of the file. We must ensure that the
+	 * last byte of the file is allocated. Since the smallest
+	 * value of osize is 0, length will be at least 1.
+	 */
+	if (osize < length) {
+		vnode_pager_setsize(vp, length);
+		flags |= BA_CLRBUF;
+		error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
+		if (error) {
+			vnode_pager_setsize(vp, osize);
+			goto out;
+		}
+		ip->i_size = length;
+		DIP_SET(ip, i_size, length);
+		if (bp->b_bufsize == fs->fs_bsize)
+			bp->b_flags |= B_CLUSTEROK;
+		if (flags & IO_SYNC)
+			bwrite(bp);
+		else
+			bawrite(bp);
+		ip->i_flag |= IN_CHANGE | IN_UPDATE;
+		error = ffs_update(vp, 1);
+		goto out;
+	}
 	if (DOINGSOFTDEP(vp)) {
 		if (length > 0 || softdepslowdown) {
 			/*
@ -283,11 +309,18 @@ ffs_truncate(vp, length, flags, cred, td)
 			 * so that it will have no data structures left.
 			 */
 			if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
-				return (error);
+				goto out;
 			UFS_LOCK(ump);
 			if (ip->i_flag & IN_SPACECOUNTED)
 				fs->fs_pendingblocks -= datablocks;
 			UFS_UNLOCK(ump);
+			/*
+			 * We have to journal the truncation before we change
+			 * any blocks so we don't leave the file partially
+			 * truncated.
+			 */
+			if (DOINGSUJ(vp) && cookie == NULL)
+				cookie = softdep_setup_trunc(vp, length, flags);
 		} else {
 #ifdef QUOTA
 			(void) chkdq(ip, -datablocks, NOCRED, 0);
@ -301,34 +334,10 @@ ffs_truncate(vp, length, flags, cred, td)
 				    OFF_TO_IDX(lblktosize(fs, -extblocks)));
 			vnode_pager_setsize(vp, 0);
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
-			return (ffs_update(vp, 0));
+			error = ffs_update(vp, 0);
+			goto out;
 		}
 	}
-	osize = ip->i_size;
-	/*
-	 * Lengthen the size of the file. We must ensure that the
-	 * last byte of the file is allocated. Since the smallest
-	 * value of osize is 0, length will be at least 1.
-	 */
-	if (osize < length) {
-		vnode_pager_setsize(vp, length);
-		flags |= BA_CLRBUF;
-		error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
-		if (error) {
-			vnode_pager_setsize(vp, osize);
-			return (error);
-		}
-		ip->i_size = length;
-		DIP_SET(ip, i_size, length);
-		if (bp->b_bufsize == fs->fs_bsize)
-			bp->b_flags |= B_CLUSTEROK;
-		if (flags & IO_SYNC)
-			bwrite(bp);
-		else
-			bawrite(bp);
-		ip->i_flag |= IN_CHANGE | IN_UPDATE;
-		return (ffs_update(vp, 1));
-	}
 	/*
 	 * Shorten the size of the file. If the file is not being
 	 * truncated to a block boundary, the contents of the
@ -345,9 +354,8 @@ ffs_truncate(vp, length, flags, cred, td)
 		lbn = lblkno(fs, length);
 		flags |= BA_CLRBUF;
 		error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp);
-		if (error) {
-			return (error);
-		}
+		if (error)
+			goto out;
 		/*
 		 * When we are doing soft updates and the UFS_BALLOC
 		 * above fills in a direct block hole with a full sized
@ -359,7 +367,7 @@ ffs_truncate(vp, length, flags, cred, td)
 		if (DOINGSOFTDEP(vp) && lbn < NDADDR &&
 		    fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
 		    (error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
-			return (error);
+			goto out;
 		ip->i_size = length;
 		DIP_SET(ip, i_size, length);
 		size = blksize(fs, ip, lbn);
@ -405,7 +413,13 @@ ffs_truncate(vp, length, flags, cred, td)
 			DIP_SET(ip, i_db[i], 0);
 	}
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
-	allerror = ffs_update(vp, 1);
+	/*
+	 * When doing softupdate journaling we must preserve the size along
+	 * with the old pointers until they are freed or we might not
+	 * know how many fragments remain.
+	 */
+	if (!DOINGSUJ(vp))
+		allerror = ffs_update(vp, 1);
 	
 	/*
 	 * Having written the new inode to disk, save its new configuration
@ -445,7 +459,7 @@ ffs_truncate(vp, length, flags, cred, td)
 			if (lastiblock[level] < 0) {
 				DIP_SET(ip, i_ib[level], 0);
 				ffs_blkfree(ump, fs, ip->i_devvp, bn,
-				    fs->fs_bsize, ip->i_number);
+				    fs->fs_bsize, ip->i_number, NULL);
 				blocksreleased += nblocks;
 			}
 		}
@ -464,7 +478,8 @@ ffs_truncate(vp, length, flags, cred, td)
 			continue;
 		DIP_SET(ip, i_db[i], 0);
 		bsize = blksize(fs, ip, i);
-		ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number);
+		ffs_blkfree(ump, fs, ip->i_devvp, bn, bsize, ip->i_number,
+		    NULL);
 		blocksreleased += btodb(bsize);
 	}
 	if (lastblock < 0)
@ -496,7 +511,7 @@ ffs_truncate(vp, length, flags, cred, td)
 			 */
 			bn += numfrags(fs, newspace);
 			ffs_blkfree(ump, fs, ip->i_devvp, bn,
-			    oldspace - newspace, ip->i_number);
+			    oldspace - newspace, ip->i_number, NULL);
 			blocksreleased += btodb(oldspace - newspace);
 		}
 	}
@ -528,7 +543,14 @@ done:
 #ifdef QUOTA
 	(void) chkdq(ip, -blocksreleased, NOCRED, 0);
 #endif
-	return (allerror);
+	error = allerror;
+out:
+	if (cookie) {
+		allerror = softdep_complete_trunc(vp, cookie);
+		if (allerror != 0 && error == 0)
+			error = allerror;
+	}
+	return (error);
 }

 /*
@ -638,7 +660,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 			blocksreleased += blkcount;
 		}
 		ffs_blkfree(ip->i_ump, fs, ip->i_devvp, nb, fs->fs_bsize,
-		    ip->i_number);
+		    ip->i_number, NULL);
 		blocksreleased += nblocks;
 	}

--- a/sys/ufs/ffs/ffs_snapshot.c
+++ b/sys/ufs/ffs/ffs_snapshot.c
@ -142,7 +142,7 @@ MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);
 static int cgaccount(int, struct vnode *, struct buf *, int);
 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
-    ufs_lbn_t, int), int);
+    ufs_lbn_t, int), int, int);
 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
@ -155,7 +155,7 @@ static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
-    ufs_lbn_t, int), int);
+    ufs_lbn_t, int), int, int);
 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
@ -582,7 +582,8 @@ loop:
 			len = fragroundup(fs, blkoff(fs, xp->i_size));
 			if (len != 0 && len < fs->fs_bsize) {
 				ffs_blkfree(ump, copy_fs, vp,
-				    DIP(xp, i_db[loc]), len, xp->i_number);
+				    DIP(xp, i_db[loc]), len, xp->i_number,
+				    NULL);
 				blkno = DIP(xp, i_db[loc]);
 				DIP_SET(xp, i_db[loc], 0);
 			}
@ -590,15 +591,15 @@ loop:
 		snaplistsize += 1;
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
-			    BLK_NOCOPY);
+			    BLK_NOCOPY, 1);
 		else
 			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
-			    BLK_NOCOPY);
+			    BLK_NOCOPY, 1);
 		if (blkno)
 			DIP_SET(xp, i_db[loc], blkno);
 		if (!error)
 			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
-			    xp->i_mode);
+			    xp->i_mode, NULL);
 		VOP_UNLOCK(xvp, 0);
 		vdrop(xvp);
 		if (error) {
@ -611,6 +612,26 @@ loop:
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
+	/*
+	 * Erase the journal file from the snapshot.
+	 */
+	if (fs->fs_flags & FS_SUJ) {
+		error = softdep_journal_lookup(mp, &xvp);
+		if (error) {
+			free(copy_fs->fs_csp, M_UFSMNT);
+			bawrite(sbp);
+			sbp = NULL;
+			goto out1;
+		}
+		xp = VTOI(xvp);
+		if (xp->i_ump->um_fstype == UFS1)
+			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
+			    BLK_NOCOPY, 0);
+		else
+			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
+			    BLK_NOCOPY, 0);
+		vput(xvp);
+	}
 	/*
 	 * Acquire a lock on the snapdata structure, creating it if necessary.
 	 */
@ -691,16 +712,16 @@ out1:
 			break;
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
-			    BLK_SNAP);
+			    BLK_SNAP, 0);
 		else
 			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
-			    BLK_SNAP);
+			    BLK_SNAP, 0);
 		if (error == 0 && xp->i_effnlink == 0) {
 			error = ffs_freefile(ump,
 					     copy_fs,
 					     vp,
 					     xp->i_number,
-					     xp->i_mode);
+					     xp->i_mode, NULL);
 		}
 		if (error) {
 			fs->fs_snapinum[snaploc] = 0;
@ -719,9 +740,11 @@ out1:
 	 * the list of allocated blocks in i_snapblklist.
 	 */
 	if (ip->i_ump->um_fstype == UFS1)
-		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
+		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
+		    BLK_SNAP, 0);
 	else
-		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
+		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
+		    BLK_SNAP, 0);
 	if (error) {
 		fs->fs_snapinum[snaploc] = 0;
 		free(snapblklist, M_UFSMNT);
@ -954,13 +977,14 @@ cgaccount(cg, vp, nbp, passno)
 * is reproduced once each for UFS1 and UFS2.
 */
 static int
-expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
+expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
+	int clearmode;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
@ -1005,7 +1029,7 @@ expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
 	 */
 	dip = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
-	if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0)
+	if (clearmode || cancelip->i_effnlink == 0)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
@ -1220,7 +1244,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 			*ip->i_snapblklist++ = lblkno;
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
-		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
 	}
 	return (0);
 }
@ -1234,13 +1258,14 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 * is reproduced once each for UFS1 and UFS2.
 */
 static int
-expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
+expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
+	int clearmode;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
@ -1285,7 +1310,7 @@ expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
 	 */
 	dip = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
-	if (expungetype == BLK_NOCOPY)
+	if (clearmode || cancelip->i_effnlink == 0)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
@ -1500,7 +1525,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 			*ip->i_snapblklist++ = lblkno;
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
-		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
+		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
 	}
 	return (0);
 }
@ -1657,6 +1682,13 @@ ffs_snapremove(vp)
 	ip->i_flags &= ~SF_SNAPSHOT;
 	DIP_SET(ip, i_flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	/*
+	 * The dirtied indirects must be written out before
+	 * softdep_setup_freeblocks() is called.  Otherwise indir_trunc()
+	 * may find indirect pointers using the magic BLK_* values.
+	 */
+	if (DOINGSOFTDEP(vp))
+		ffs_syncvnode(vp, MNT_WAIT);
 #ifdef QUOTA
 	/*
 	 * Reenable disk quotas for ex-snapshot file.
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
--- a/sys/ufs/ffs/ffs_subr.c
+++ b/sys/ufs/ffs/ffs_subr.c
@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
 #ifndef _KERNEL
 #include <ufs/ufs/dinode.h>
 #include <ufs/ffs/fs.h>
-#include "fsck.h"
 #else
 #include <sys/systm.h>
 #include <sys/lock.h>
@ -223,7 +222,38 @@ ffs_isblock(fs, cp, h)
 		mask = 0x01 << (h & 0x7);
 		return ((cp[h >> 3] & mask) == mask);
 	default:
+#ifdef _KERNEL
 		panic("ffs_isblock");
+#endif
+		break;
+	}
+	return (0);
+}
+
+/*
+ * Test whether every fragment bit of block h is clear in the map cp.
+ *
+ * The map packs fs->fs_frag bits per block, so the byte index and the
+ * mask depend on the fragment count: 8 bits fill a whole byte, 4 bits a
+ * nibble, 2 bits a quarter of a byte and 1 bit a single bit.
+ *
+ * Returns non-zero when all of the block's bits are 0 and zero when any
+ * of them is 1.  An unsupported fs_frag value panics in the kernel; when
+ * _KERNEL is not defined it falls out of the switch and returns 0.
+ *
+ * NOTE(review): ffs_isblock() tests the opposite extreme (all bits set).
+ * If cp is a free-fragment map in which set bits mean "free", an
+ * all-zero group means the block is completely allocated, which the
+ * function name does not suggest -- confirm the intended sense with the
+ * callers.
+ */
+int
+ffs_isfreeblock(fs, cp, h)
+	struct fs *fs;
+	u_char *cp;
+	ufs1_daddr_t h;
+{
+ 
+	switch ((int)fs->fs_frag) {
+	case 8:			/* one byte per block */
+		return (cp[h] == 0);
+	case 4:			/* two blocks per byte, 4-bit mask */
+		return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
+	case 2:			/* four blocks per byte, 2-bit mask */
+		return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
+	case 1:			/* eight blocks per byte, 1-bit mask */
+		return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
+	default:
+#ifdef _KERNEL
+		panic("ffs_isfreeblock");
+#endif
+		break;
 	}
 	return (0);
 }
@ -252,7 +282,10 @@ ffs_clrblock(fs, cp, h)
 		cp[h >> 3] &= ~(0x01 << (h & 0x7));
 		return;
 	default:
+#ifdef _KERNEL
 		panic("ffs_clrblock");
+#endif
+		break;
 	}
 }

@ -281,6 +314,101 @@ ffs_setblock(fs, cp, h)
 		cp[h >> 3] |= (0x01 << (h & 0x7));
 		return;
 	default:
+#ifdef _KERNEL
 		panic("ffs_setblock");
+#endif
+		break;
 	}
 }
+
+/*
+ * Update the cluster map because of an allocation or free.
+ *
+ * Cnt == 1 means free; cnt == -1 means allocating.
+ *
+ * fs	 - superblock; supplies fs_contigsumsize and receives the new
+ *	   fs_maxcluster[] entry for this cylinder group.
+ * cgp	 - cylinder group whose cluster bitmap (cg_clustersfree) and
+ *	   cluster summary (cg_clustersum) are updated in place.
+ * blkno - index of the block in the cluster bitmap that changed state.
+ * cnt	 - positive to mark the block free, otherwise it is marked
+ *	   allocated; the same value is applied to the summary counts.
+ *
+ * Does nothing when fs_contigsumsize is not positive.
+ */
+void
+ffs_clusteracct(fs, cgp, blkno, cnt)
+	struct fs *fs;
+	struct cg *cgp;
+	ufs1_daddr_t blkno;
+	int cnt;
+{
+	int32_t *sump;
+	int32_t *lp;
+	u_char *freemapp, *mapp;
+	int i, start, end, forw, back, map, bit;
+
+	if (fs->fs_contigsumsize <= 0)
+		return;
+	freemapp = cg_clustersfree(cgp);
+	sump = cg_clustersum(cgp);
+	/*
+	 * Allocate or clear the actual block.
+	 */
+	if (cnt > 0)
+		setbit(freemapp, blkno);
+	else
+		clrbit(freemapp, blkno);
+	/*
+	 * Find the size of the cluster going forward.
+	 *
+	 * Count consecutive set bits starting at blkno + 1, looking at most
+	 * fs_contigsumsize blocks ahead and never past cg_nclusterblks.
+	 * The map is read a byte at a time: "bit" walks up the current byte
+	 * and the next byte is fetched once its last bit has been tested.
+	 */
+	start = blkno + 1;
+	end = start + fs->fs_contigsumsize;
+	if (end >= cgp->cg_nclusterblks)
+		end = cgp->cg_nclusterblks;
+	mapp = &freemapp[start / NBBY];
+	map = *mapp++;
+	bit = 1 << (start % NBBY);
+	for (i = start; i < end; i++) {
+		if ((map & bit) == 0)
+			break;
+		if ((i & (NBBY - 1)) != (NBBY - 1)) {
+			bit <<= 1;
+		} else {
+			map = *mapp++;
+			bit = 1;
+		}
+	}
+	forw = i - start;
+	/*
+	 * Find the size of the cluster going backward.
+	 *
+	 * Mirror image of the forward scan: count consecutive set bits from
+	 * blkno - 1 downward, at most fs_contigsumsize blocks and stopping
+	 * before index -1.
+	 *
+	 * NOTE(review): for blkno == 0, start is -1, so the shift count
+	 * (start % NBBY) below is negative.  The loop then runs zero times
+	 * and "bit" is never used, but the shift itself is undefined in C --
+	 * consider guarding it.
+	 */
+	start = blkno - 1;
+	end = start - fs->fs_contigsumsize;
+	if (end < 0)
+		end = -1;
+	mapp = &freemapp[start / NBBY];
+	map = *mapp--;
+	bit = 1 << (start % NBBY);
+	for (i = start; i > end; i--) {
+		if ((map & bit) == 0)
+			break;
+		if ((i & (NBBY - 1)) != 0) {
+			bit >>= 1;
+		} else {
+			map = *mapp--;
+			bit = 1 << (NBBY - 1);
+		}
+	}
+	back = start - i;
+	/*
+	 * Account for old cluster and the possibly new forward and
+	 * back clusters.
+	 *
+	 * sump[] is indexed by cluster length.  The run containing blkno
+	 * (back + forw + 1 blocks, capped at fs_contigsumsize) is adjusted
+	 * by cnt, and the separate runs on either side of blkno are
+	 * adjusted by -cnt.
+	 */
+	i = back + forw + 1;
+	if (i > fs->fs_contigsumsize)
+		i = fs->fs_contigsumsize;
+	sump[i] += cnt;
+	if (back > 0)
+		sump[back] -= cnt;
+	if (forw > 0)
+		sump[forw] -= cnt;
+	/*
+	 * Update cluster summary information.
+	 *
+	 * Record in fs_maxcluster[] the largest cluster length that still
+	 * has a positive count, or 0 if none does.
+	 */
+	lp = &sump[fs->fs_contigsumsize];
+	for (i = fs->fs_contigsumsize; i > 0; i--)
+		if (*lp-- > 0)
+			break;
+	fs->fs_maxcluster[cgp->cg_cgx] = i;
+}
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@ -79,7 +79,6 @@ static int	ffs_reload(struct mount *, struct thread *);
 static int	ffs_mountfs(struct vnode *, struct mount *, struct thread *);
 static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
 		    ufs2_daddr_t);
-static void	ffs_oldfscompat_write(struct fs *, struct ufsmount *);
 static void	ffs_ifree(struct ufsmount *ump, struct inode *ip);
 static vfs_init_t ffs_init;
 static vfs_uninit_t ffs_uninit;
@ -299,7 +298,8 @@ ffs_mount(struct mount *mp)
 			if (fs->fs_clean == 0) {
 				fs->fs_flags |= FS_UNCLEAN;
 				if ((mp->mnt_flag & MNT_FORCE) ||
-				    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
+				    ((fs->fs_flags &
+				     (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
 				     (fs->fs_flags & FS_DOSOFTDEP))) {
 					printf("WARNING: %s was not %s\n",
 					   fs->fs_fsmnt, "properly dismounted");
@ -307,6 +307,9 @@ ffs_mount(struct mount *mp)
 					printf(
 "WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
 					    fs->fs_fsmnt);
+					if (fs->fs_flags & FS_SUJ)
+						printf(
+"WARNING: Forced mount will invalidated journal contents\n");
 					return (EPERM);
 				}
 			}
@ -330,17 +333,18 @@ ffs_mount(struct mount *mp)
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
-			fs->fs_clean = 0;
-			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
-				vn_finished_write(mp);
-				return (error);
-			}
+			fs->fs_mtime = time_second;
 			/* check to see if we need to start softdep */
 			if ((fs->fs_flags & FS_DOSOFTDEP) &&
 			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
 				vn_finished_write(mp);
 				return (error);
 			}
+			fs->fs_clean = 0;
+			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
+				vn_finished_write(mp);
+				return (error);
+			}
 			if (fs->fs_snapinum[0] != 0)
 				ffs_snapshot_mount(mp);
 			vn_finished_write(mp);
@ -705,7 +709,7 @@ ffs_mountfs(devvp, mp, td)
 	if (fs->fs_clean == 0) {
 		fs->fs_flags |= FS_UNCLEAN;
 		if (ronly || (mp->mnt_flag & MNT_FORCE) ||
-		    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
+		    ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
 		     (fs->fs_flags & FS_DOSOFTDEP))) {
 			printf(
 "WARNING: %s was not properly dismounted\n",
@ -714,6 +718,9 @@ ffs_mountfs(devvp, mp, td)
 			printf(
 "WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
 			    fs->fs_fsmnt);
+			if (fs->fs_flags & FS_SUJ)
+				printf(
+"WARNING: Forced mount will invalidated journal contents\n");
 			error = EPERM;
 			goto out;
 		}
@ -896,6 +903,7 @@ ffs_mountfs(devvp, mp, td)
 	 */
 	bzero(fs->fs_fsmnt, MAXMNTLEN);
 	strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
+	mp->mnt_stat.f_iosize = fs->fs_bsize;

 	if( mp->mnt_flag & MNT_ROOTFS) {
 		/*
@ -907,6 +915,7 @@ ffs_mountfs(devvp, mp, td)
 	}

 	if (ronly == 0) {
+		fs->fs_mtime = time_second;
 		if ((fs->fs_flags & FS_DOSOFTDEP) &&
 		    (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
 			free(fs->fs_csp, M_UFSMNT);
@ -937,7 +946,6 @@ ffs_mountfs(devvp, mp, td)
 	 * This would all happen while the filesystem was busy/not
 	 * available, so would effectively be "atomic".
 	 */
-	mp->mnt_stat.f_iosize = fs->fs_bsize;
 	(void) ufs_extattr_autostart(mp, td);
 #endif /* !UFS_EXTATTR_AUTOSTART */
 #endif /* !UFS_EXTATTR */
@ -1037,7 +1045,7 @@ ffs_oldfscompat_read(fs, ump, sblockloc)
 * XXX - Parts get retired eventually.
 * Unfortunately new bits get added.
 */
-static void
+void
 ffs_oldfscompat_write(fs, ump)
 	struct fs *fs;
 	struct ufsmount *ump;
@ -1132,6 +1140,7 @@ ffs_unmount(mp, mntflags)
 		fs->fs_pendinginodes = 0;
 	}
 	UFS_UNLOCK(ump);
+	softdep_unmount(mp);
 	if (fs->fs_ronly == 0) {
 		fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
 		error = ffs_sbupdate(ump, MNT_WAIT, 0);
@ -1573,16 +1582,6 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags)
 			DIP_SET(ip, i_gen, ip->i_gen);
 		}
 	}
-	/*
-	 * Ensure that uid and gid are correct. This is a temporary
-	 * fix until fsck has been changed to do the update.
-	 */
-	if (fs->fs_magic == FS_UFS1_MAGIC &&		/* XXX */
-	    fs->fs_old_inodefmt < FS_44INODEFMT) {	/* XXX */
-		ip->i_uid = ip->i_din1->di_ouid;	/* XXX */
-		ip->i_gid = ip->i_din1->di_ogid;	/* XXX */
-	}						/* XXX */
-
 #ifdef MAC
 	if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
 		/*
@ -1726,6 +1725,8 @@ ffs_sbupdate(mp, waitfor, suspended)
 	}
 	fs->fs_fmod = 0;
 	fs->fs_time = time_second;
+	if (fs->fs_flags & FS_DOSOFTDEP)
+		softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp);
 	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
 	ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
 	if (suspended)
@ -1867,9 +1868,6 @@ ffs_bufwrite(struct buf *bp)
 	}
 	BO_UNLOCK(bp->b_bufobj);

-	/* Mark the buffer clean */
-	bundirty(bp);
-
 	/*
 	 * If this buffer is marked for background writing and we
 	 * do not have to wait for it, make a copy and write the
@ -1910,9 +1908,16 @@ ffs_bufwrite(struct buf *bp)
 		newbp->b_flags &= ~B_INVAL;

 #ifdef SOFTUPDATES
-		/* move over the dependencies */
-		if (!LIST_EMPTY(&bp->b_dep))
-			softdep_move_dependencies(bp, newbp);
+		/*
+		 * Move over the dependencies.  If there are rollbacks,
+		 * leave the parent buffer dirtied as it will need to
+		 * be written again.
+		 */
+		if (LIST_EMPTY(&bp->b_dep) ||
+		    softdep_move_dependencies(bp, newbp) == 0)
+			bundirty(bp);
+#else
+		bundirty(bp);
 #endif 

 		/*
@ -1925,7 +1930,10 @@ ffs_bufwrite(struct buf *bp)
 		 */
 		bqrelse(bp);
 		bp = newbp;
-	}
+	} else
+		/* Mark the buffer clean */
+		bundirty(bp);
+

 	/* Let the normal bufwrite do the rest for us */
 normal_write:
@ -1939,6 +1947,7 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
 	struct vnode *vp;
 	int error;
 	struct buf *tbp;
+	int nocopy;

 	vp = bo->__bo_vnode;
 	if (bp->b_iocmd == BIO_WRITE) {
@ -1946,8 +1955,9 @@ ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
 		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
 		    (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 			panic("ffs_geom_strategy: bad I/O");
-		bp->b_flags &= ~B_VALIDSUSPWRT;
-		if ((vp->v_vflag & VV_COPYONWRITE) &&
+		nocopy = bp->b_flags & B_NOCOPY;
+		bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY);
+		if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 &&
 		    vp->v_rdev->si_snapdata != NULL) {
 			if ((bp->b_flags & B_CLUSTER) != 0) {
 				runningbufwakeup(bp);
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@ -225,6 +225,7 @@ ffs_syncvnode(struct vnode *vp, int waitfor)
 	wait = (waitfor == MNT_WAIT);
 	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
 	bo = &vp->v_bufobj;
+	ip->i_flag &= ~IN_NEEDSYNC;

 	/*
 	 * Flush all dirty buffers associated with a vnode.
--- a/sys/ufs/ffs/fs.h
+++ b/sys/ufs/ffs/fs.h
@ -340,7 +340,9 @@ struct fs {
 	u_int32_t fs_avgfilesize;	/* expected average file size */
 	u_int32_t fs_avgfpdir;		/* expected # of files per directory */
 	int32_t	 fs_save_cgsize;	/* save real cg size to use fs_bsize */
-	int32_t	 fs_sparecon32[26];	/* reserved for future constants */
+	ufs_time_t fs_mtime;		/* Last mount or fsck time. */
+	int32_t  fs_sujfree;		/* SUJ free list */
+	int32_t	 fs_sparecon32[23];	/* reserved for future constants */
 	int32_t  fs_flags;		/* see FS_ flags below */
 	int32_t	 fs_contigsumsize;	/* size of cluster summary array */ 
 	int32_t	 fs_maxsymlinklen;	/* max length of an internal symlink */
@ -408,12 +410,13 @@ CTASSERT(sizeof(struct fs) == 1376);
 #define FS_UNCLEAN	0x0001	/* filesystem not clean at mount */
 #define FS_DOSOFTDEP	0x0002	/* filesystem using soft dependencies */
 #define FS_NEEDSFSCK	0x0004	/* filesystem needs sync fsck before mount */
-#define FS_INDEXDIRS	0x0008	/* kernel supports indexed directories */
+#define	FS_SUJ       	0x0008	/* Filesystem using softupdate journal */
 #define FS_ACLS		0x0010	/* file system has POSIX.1e ACLs enabled */
 #define FS_MULTILABEL	0x0020	/* file system is MAC multi-label */
 #define FS_GJOURNAL	0x0040	/* gjournaled file system */
 #define FS_FLAGS_UPDATED 0x0080	/* flags have been moved to new location */
 #define FS_NFS4ACLS	0x0100	/* file system has NFSv4 ACLs enabled */
+#define FS_INDEXDIRS	0x0200	/* kernel supports indexed directories */

 /*
 * Macros to access bits in the fs_active array.
@ -603,7 +606,31 @@ struct cg {
 	  ? (fs)->fs_bsize \
 	  : (fragroundup(fs, blkoff(fs, (size)))))

-
+/*
+ * Indirect lbns are aligned on NDADDR addresses where single indirects
+ * are the negated address of the lowest lbn reachable, double indirects
+ * are this lbn - 1 and triple indirects are this lbn - 2.  This yields
+ * an unusual bit order to determine level.
+ */
+static inline int
+lbn_level(ufs_lbn_t lbn)
+{
+	if (lbn >= 0)
+		return 0;	/* Non-negative lbns are reported as level 0. */
+	switch (lbn & 0x3) {	/* The level is encoded in the low two bits. */
+	case 0:
+		return (0);
+	case 1:
+		break;		/* No level for this pattern; yields -1 below. */
+	case 2:
+		return (2);
+	case 3:
+		return (1);
+	default:
+		break;		/* Not reachable: (lbn & 0x3) is always 0..3. */
+	}
+	return (-1);
+}
 /*
 * Number of inodes in a secondary storage block/fragment.
 */
@ -615,6 +642,108 @@ struct cg {
 */
 #define	NINDIR(fs)	((fs)->fs_nindir)

+/*
+ * Softdep journal record format.
+ */
+
+#define	JOP_ADDREF	1	/* Add a reference to an inode. */
+#define	JOP_REMREF	2	/* Remove a reference from an inode. */
+#define	JOP_NEWBLK	3	/* Allocate a block. */
+#define	JOP_FREEBLK	4	/* Free a block or a tree of blocks. */
+#define	JOP_MVREF	5	/* Move a reference from one off to another. */
+#define	JOP_TRUNC	6	/* Partial truncation record. */
+
+#define	JREC_SIZE	32	/* Record and segment header size. */
+
+#define	SUJ_MIN		(4 * 1024 * 1024)	/* Minimum journal size */
+#define	SUJ_MAX		(32 * 1024 * 1024)	/* Maximum journal size */
+#define	SUJ_FILE	".sujournal"		/* Journal file name */
+
+/*
+ * Size of the segment record header.  There is at most one for each disk
+ * block in the journal.  The segment header is followed by an array of
+ * records.  fsck depends on the first element in each record being 'op'
+ * and the second being 'ino'.  Segments may span multiple disk blocks but
+ * the header is present on each.
+ */
+struct jsegrec {
+	uint64_t	jsr_seq;	/* Our sequence number */
+	uint64_t	jsr_oldest;	/* Oldest valid sequence number */
+	uint16_t	jsr_cnt;	/* Count of valid records */
+	uint16_t	jsr_blocks;	/* Count of DEV_BSIZE blocks. */
+	uint32_t	jsr_crc;	/* 32bit crc of the valid space */
+	ufs_time_t	jsr_time;	/* timestamp for mount instance */
+};
+
+/*
+ * Reference record.  Records a single link count modification.
+ */
+struct jrefrec {
+	uint32_t	jr_op;		/* JOP_* code; fsck expects it first. */
+	uint32_t	jr_ino;		/* Fixed width so size == JREC_SIZE. */
+	uint32_t	jr_parent;	/* Fixed width, same reason as jr_ino. */
+	uint16_t	jr_nlink;
+	uint16_t	jr_mode;
+	off_t		jr_diroff;
+	uint64_t	jr_unused;	/* Spare; rounds record to JREC_SIZE. */
+};
+
+/*
+ * Move record.  Records a reference moving within a directory block.  The
+ * nlink is unchanged but we must search both locations.
+ */
+struct jmvrec {
+	uint32_t	jm_op;		/* JOP_* code; fsck expects it first. */
+	uint32_t	jm_ino;		/* Fixed width so size == JREC_SIZE. */
+	uint32_t	jm_parent;	/* Fixed width, same reason as jm_ino. */
+	uint16_t	jm_unused;	/* Spare. */
+	off_t		jm_oldoff;	/* Offset the reference moved from. */
+	off_t		jm_newoff;	/* Offset the reference moved to. */
+};
+
+/*
+ * Block record.  A set of frags or tree of blocks starting at an indirect are
+ * freed or a set of frags are allocated.
+ */
+struct jblkrec {
+	uint32_t	jb_op;
+	uint32_t	jb_ino;
+	ufs2_daddr_t	jb_blkno;
+	ufs_lbn_t	jb_lbn;
+	uint16_t	jb_frags;
+	uint16_t	jb_oldfrags;
+	uint32_t	jb_unused;
+};
+
+/*
+ * Truncation record.  Records a partial truncation so that it may be
+ * completed later.
+ */
+struct jtrncrec {
+	uint32_t	jt_op;
+	uint32_t	jt_ino;
+	off_t		jt_size;
+	uint32_t	jt_extsize;
+	uint32_t	jt_pad[3];
+};
+
+union jrec {
+	struct jsegrec	rec_jsegrec;
+	struct jrefrec	rec_jrefrec;
+	struct jmvrec	rec_jmvrec;
+	struct jblkrec	rec_jblkrec;
+	struct jtrncrec	rec_jtrncrec;
+};
+
+#ifdef CTASSERT
+CTASSERT(sizeof(struct jsegrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jrefrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jmvrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jblkrec) == JREC_SIZE);
+CTASSERT(sizeof(struct jtrncrec) == JREC_SIZE);
+CTASSERT(sizeof(union jrec) == JREC_SIZE);
+#endif
+
 extern int inside[], around[];
 extern u_char *fragtbl[];

--- a/sys/ufs/ffs/softdep.h
+++ b/sys/ufs/ffs/softdep.h
@ -94,22 +94,29 @@
 * The ONWORKLIST flag shows whether the structure is currently linked
 * onto a worklist.
 */
-#define	ATTACHED	0x0001
-#define	UNDONE		0x0002
-#define	COMPLETE	0x0004
-#define	DEPCOMPLETE	0x0008
-#define	MKDIR_PARENT	0x0010	/* diradd & mkdir only */
-#define	MKDIR_BODY	0x0020	/* diradd & mkdir only */
-#define	RMDIR		0x0040	/* dirrem only */
-#define	DIRCHG		0x0080	/* diradd & dirrem only */
-#define	GOINGAWAY	0x0100	/* indirdep only */
-#define	IOSTARTED	0x0200	/* inodedep & pagedep only */
-#define	SPACECOUNTED	0x0400	/* inodedep only */
-#define	NEWBLOCK	0x0800	/* pagedep only */
-#define	INPROGRESS	0x1000	/* dirrem, freeblks, freefrag, freefile only */
-#define	UFS1FMT		0x2000	/* indirdep only */
-#define	EXTDATA		0x4000	/* allocdirect only */
-#define ONWORKLIST	0x8000
+#define	ATTACHED	0x000001
+#define	UNDONE		0x000002
+#define	COMPLETE	0x000004
+#define	DEPCOMPLETE	0x000008
+#define	MKDIR_PARENT	0x000010 /* diradd, mkdir, jaddref, jsegdep only */
+#define	MKDIR_BODY	0x000020 /* diradd, mkdir, jaddref only */
+#define	RMDIR		0x000040 /* dirrem only */
+#define	DIRCHG		0x000080 /* diradd, dirrem only */
+#define	GOINGAWAY	0x000100 /* indirdep, jremref only */
+#define	IOSTARTED	0x000200 /* inodedep, pagedep, bmsafemap only */
+#define	SPACECOUNTED	0x000400 /* inodedep only */
+#define	NEWBLOCK	0x000800 /* pagedep, jaddref only */
+#define	INPROGRESS	0x001000 /* dirrem, freeblks, freefrag, freefile only */
+#define	UFS1FMT		0x002000 /* indirdep only */
+#define	EXTDATA		0x004000 /* allocdirect only */
+#define ONWORKLIST	0x008000
+#define	IOWAITING	0x010000 /* Thread is waiting for IO to complete. */
+#define	ONDEPLIST	0x020000 /* Structure is on a dependency list. */
+#define	UNLINKED	0x040000 /* inodedep has been unlinked. */
+#define	UNLINKNEXT	0x080000 /* inodedep has valid di_freelink */
+#define	UNLINKPREV	0x100000 /* inodedep is pointed at in the unlink list */
+#define	UNLINKONLIST	0x200000 /* inodedep is in the unlinked list on disk */
+#define	UNLINKLINKS	(UNLINKNEXT | UNLINKPREV)

 #define	ALLCOMPLETE	(ATTACHED | COMPLETE | DEPCOMPLETE)

@ -135,25 +142,38 @@
 * and the macros below changed to use it.
 */
 struct worklist {
-	struct mount		*wk_mp;		/* Mount we live in */
 	LIST_ENTRY(worklist)	wk_list;	/* list of work requests */
-	unsigned short		wk_type;	/* type of request */
-	unsigned short		wk_state;	/* state flags */
+	struct mount		*wk_mp;		/* Mount we live in */
+	unsigned int		wk_type:8,	/* type of request */
+				wk_state:24;	/* state flags */
 };
 #define WK_DATA(wk) ((void *)(wk))
 #define WK_PAGEDEP(wk) ((struct pagedep *)(wk))
 #define WK_INODEDEP(wk) ((struct inodedep *)(wk))
 #define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk))
+#define	WK_NEWBLK(wk)  ((struct newblk *)(wk))
 #define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk))
 #define WK_INDIRDEP(wk) ((struct indirdep *)(wk))
 #define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk))
 #define WK_FREEFRAG(wk) ((struct freefrag *)(wk))
 #define WK_FREEBLKS(wk) ((struct freeblks *)(wk))
+#define WK_FREEWORK(wk) ((struct freework *)(wk))
 #define WK_FREEFILE(wk) ((struct freefile *)(wk))
 #define WK_DIRADD(wk) ((struct diradd *)(wk))
 #define WK_MKDIR(wk) ((struct mkdir *)(wk))
 #define WK_DIRREM(wk) ((struct dirrem *)(wk))
 #define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk))
+#define	WK_JADDREF(wk) ((struct jaddref *)(wk))
+#define	WK_JREMREF(wk) ((struct jremref *)(wk))
+#define	WK_JMVREF(wk) ((struct jmvref *)(wk))
+#define	WK_JSEGDEP(wk) ((struct jsegdep *)(wk))
+#define	WK_JSEG(wk) ((struct jseg *)(wk))
+#define	WK_JNEWBLK(wk) ((struct jnewblk *)(wk))
+#define	WK_JFREEBLK(wk) ((struct jfreeblk *)(wk))
+#define	WK_FREEDEP(wk) ((struct freedep *)(wk))
+#define	WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk))
+#define	WK_SBDEP(wk) ((struct sbdep *)(wk))
+#define	WK_JTRUNC(wk) ((struct jtrunc *)(wk))

 /*
 * Various types of lists
@ -165,6 +185,15 @@ LIST_HEAD(inodedephd, inodedep);
 LIST_HEAD(allocindirhd, allocindir);
 LIST_HEAD(allocdirecthd, allocdirect);
 TAILQ_HEAD(allocdirectlst, allocdirect);
+LIST_HEAD(indirdephd, indirdep);
+LIST_HEAD(jaddrefhd, jaddref);
+LIST_HEAD(jremrefhd, jremref);
+LIST_HEAD(jmvrefhd, jmvref);
+LIST_HEAD(jnewblkhd, jnewblk);
+LIST_HEAD(jfreeblkhd, jfreeblk);
+LIST_HEAD(freeworkhd, freework);
+TAILQ_HEAD(jseglst, jseg);
+TAILQ_HEAD(inoreflst, inoref);

 /*
 * The "pagedep" structure tracks the various dependencies related to
@ -192,9 +221,11 @@ struct pagedep {
 	LIST_ENTRY(pagedep) pd_hash;	/* hashed lookup */
 	ino_t	pd_ino;			/* associated file */
 	ufs_lbn_t pd_lbn;		/* block within file */
+	struct	newdirblk *pd_newdirblk; /* associated newdirblk if NEWBLOCK */
 	struct	dirremhd pd_dirremhd;	/* dirrem's waiting for page */
 	struct	diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */
 	struct	diraddhd pd_pendinghd;	/* directory entries awaiting write */
+	struct	jmvrefhd pd_jmvrefhd;	/* Dependent journal writes. */
 };

 /*
@ -248,13 +279,18 @@ struct inodedep {
 	struct	worklist id_list;	/* buffer holding inode block */
 #	define	id_state id_list.wk_state /* inode dependency state */
 	LIST_ENTRY(inodedep) id_hash;	/* hashed lookup */
+	TAILQ_ENTRY(inodedep) id_unlinked;	/* Unlinked but ref'd inodes */
 	struct	fs *id_fs;		/* associated filesystem */
 	ino_t	id_ino;			/* dependent inode */
 	nlink_t	id_nlinkdelta;		/* saved effective link count */
+	nlink_t	id_savednlink;		/* Link saved during rollback */
 	LIST_ENTRY(inodedep) id_deps;	/* bmsafemap's list of inodedep's */
-	struct	buf *id_buf;		/* related bmsafemap (if pending) */
+	struct	bmsafemap *id_bmsafemap; /* related bmsafemap (if pending) */
+	struct	diradd *id_mkdiradd;	/* diradd for a mkdir. */
+	struct	inoreflst id_inoreflst;	/* Inode reference adjustments. */
 	long	id_savedextsize;	/* ext size saved during rollback */
 	off_t	id_savedsize;		/* file size saved during rollback */
+	struct	dirremhd id_dirremhd;	/* Removals pending. */
 	struct	workhead id_pendinghd;	/* entries awaiting directory write */
 	struct	workhead id_bufwait;	/* operations after inode written */
 	struct	workhead id_inowait;	/* operations waiting inode update */
@ -270,23 +306,6 @@ struct inodedep {
 #define id_savedino1 id_un.idu_savedino1
 #define id_savedino2 id_un.idu_savedino2

-/*
- * A "newblk" structure is attached to a bmsafemap structure when a block
- * or fragment is allocated from a cylinder group. Its state is set to
- * DEPCOMPLETE when its cylinder group map is written. It is consumed by
- * an associated allocdirect or allocindir allocation which will attach
- * themselves to the bmsafemap structure if the newblk's DEPCOMPLETE flag
- * is not set (i.e., its cylinder group map has not been written).
- */ 
-struct newblk {
-	LIST_ENTRY(newblk) nb_hash;	/* hashed lookup */
-	struct	fs *nb_fs;		/* associated filesystem */
-	int	nb_state;		/* state of bitmap dependency */
-	ufs2_daddr_t nb_newblkno;	/* allocated block number */
-	LIST_ENTRY(newblk) nb_deps;	/* bmsafemap's list of newblk's */
-	struct	bmsafemap *nb_bmsafemap; /* associated bmsafemap */
-};
-
 /*
 * A "bmsafemap" structure maintains a list of dependency structures
 * that depend on the update of a particular cylinder group map.
@ -299,11 +318,41 @@ struct newblk {
 */
 struct bmsafemap {
 	struct	worklist sm_list;	/* cylgrp buffer */
+#	define	sm_state sm_list.wk_state
+	int	sm_cg;
+	LIST_ENTRY(bmsafemap) sm_hash;	/* Hash links. */
 	struct	buf *sm_buf;		/* associated buffer */
 	struct	allocdirecthd sm_allocdirecthd; /* allocdirect deps */
+	struct	allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */
 	struct	allocindirhd sm_allocindirhd; /* allocindir deps */
+	struct	allocindirhd sm_allocindirwr; /* writing allocindir deps */
 	struct	inodedephd sm_inodedephd; /* inodedep deps */
+	struct	inodedephd sm_inodedepwr; /* writing inodedep deps */
 	struct	newblkhd sm_newblkhd;	/* newblk deps */
+	struct	newblkhd sm_newblkwr;	/* writing newblk deps */
+	struct	jaddrefhd sm_jaddrefhd;	/* Pending inode allocations. */
+	struct	jnewblkhd sm_jnewblkhd;	/* Pending block allocations. */
+};
+
+/*
+ * A "newblk" structure is attached to a bmsafemap structure when a block
+ * or fragment is allocated from a cylinder group. Its state is set to
+ * DEPCOMPLETE when its cylinder group map is written. It is converted to
+ * an allocdirect or allocindir allocation once the allocator calls the
+ * appropriate setup function.
+ */ 
+struct newblk {
+	struct	worklist nb_list;
+#	define	nb_state nb_list.wk_state
+	LIST_ENTRY(newblk) nb_hash;	/* hashed lookup */
+	LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblks */
+	struct	jnewblk *nb_jnewblk;	/* New block journal entry. */
+	struct	bmsafemap *nb_bmsafemap;/* cylgrp dep (if pending) */
+	struct	freefrag *nb_freefrag;	/* fragment to be freed (if any) */
+	struct	indirdephd nb_indirdeps; /* Children indirect blocks. */
+	struct	workhead nb_newdirblk;	/* dir block to notify when written */
+	struct	workhead nb_jwork;	/* Journal work pending. */
+	ufs2_daddr_t	nb_newblkno;	/* new value of block pointer */
 };

 /*
@ -334,20 +383,18 @@ struct bmsafemap {
 * and inodedep->id_pendinghd lists.
 */
 struct allocdirect {
-	struct	worklist ad_list;	/* buffer holding block */
-#	define	ad_state ad_list.wk_state /* block pointer state */
+	struct	newblk ad_block;	/* Common block logic */
+#	define	ad_state ad_block.nb_list.wk_state /* block pointer state */
 	TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */
-	ufs_lbn_t ad_lbn;		/* block within file */
-	ufs2_daddr_t ad_newblkno;	/* new value of block pointer */
-	ufs2_daddr_t ad_oldblkno;	/* old value of block pointer */
-	long	ad_newsize;		/* size of new block */
-	long	ad_oldsize;		/* size of old block */
-	LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */
-	struct	buf *ad_buf;		/* cylgrp buffer (if pending) */
 	struct	inodedep *ad_inodedep;	/* associated inodedep */
-	struct	freefrag *ad_freefrag;	/* fragment to be freed (if any) */
-	struct	workhead ad_newdirblk;	/* dir block to notify when written */
+	ufs2_daddr_t	ad_oldblkno;	/* old value of block pointer */
+	int		ad_offset;	/* Pointer offset in parent. */
+	long		ad_newsize;	/* size of new block */
+	long		ad_oldsize;	/* size of old block */
 };
+#define	ad_newblkno	ad_block.nb_newblkno
+#define	ad_freefrag	ad_block.nb_freefrag
+#define	ad_newdirblk	ad_block.nb_newdirblk

 /*
 * A single "indirdep" structure manages all allocation dependencies for
@ -369,10 +416,14 @@ struct allocdirect {
 struct indirdep {
 	struct	worklist ir_list;	/* buffer holding indirect block */
 #	define	ir_state ir_list.wk_state /* indirect block pointer state */
-	caddr_t ir_saveddata;		/* buffer cache contents */
+	LIST_ENTRY(indirdep) ir_next;	/* alloc{direct,indir} list */
+	caddr_t	ir_saveddata;		/* buffer cache contents */
 	struct	buf *ir_savebp;		/* buffer holding safe copy */
+	struct	allocindirhd ir_completehd; /* waiting for indirdep complete */
+	struct	allocindirhd ir_writehd; /* Waiting for the pointer write. */
 	struct	allocindirhd ir_donehd;	/* done waiting to update safecopy */
 	struct	allocindirhd ir_deplisthd; /* allocindir deps for this block */
+	struct	workhead ir_jwork;	/* Journal work pending. */
 };

 /*
@ -389,16 +440,25 @@ struct indirdep {
 * can then be freed as it is no longer applicable.
 */
 struct allocindir {
-	struct	worklist ai_list;	/* buffer holding indirect block */
-#	define	ai_state ai_list.wk_state /* indirect block pointer state */
+	struct	newblk ai_block;	/* Common block area */
+#	define	ai_state ai_block.nb_list.wk_state /* indirect pointer state */
 	LIST_ENTRY(allocindir) ai_next;	/* indirdep's list of allocindir's */
-	int	ai_offset;		/* pointer offset in indirect block */
-	ufs2_daddr_t ai_newblkno;	/* new block pointer value */
-	ufs2_daddr_t ai_oldblkno;	/* old block pointer value */
-	struct	freefrag *ai_freefrag;	/* block to be freed when complete */
 	struct	indirdep *ai_indirdep;	/* address of associated indirdep */
-	LIST_ENTRY(allocindir) ai_deps;	/* bmsafemap's list of allocindir's */
-	struct	buf *ai_buf;		/* cylgrp buffer (if pending) */
+	ufs2_daddr_t	ai_oldblkno;	/* old value of block pointer */
+	int		ai_offset;	/* Pointer offset in parent. */
+};
+#define	ai_newblkno	ai_block.nb_newblkno
+#define	ai_freefrag	ai_block.nb_freefrag
+#define	ai_newdirblk	ai_block.nb_newdirblk
+
+/*
+ * The allblk union is used to size the newblk structure on allocation so
+ * that it may be any one of three types.
+ */
+union allblk {
+	struct	allocindir ab_allocindir;
+	struct	allocdirect ab_allocdirect;
+	struct	newblk	ab_newblk;
 };

 /*
@ -406,14 +466,13 @@ struct allocindir {
 * allocated fragment is replaced with a larger fragment, rather than extended.
 * The "freefrag" structure is constructed and attached when the replacement
 * block is first allocated. It is processed after the inode claiming the
- * bigger block that replaces it has been written to disk. Note that the
- * ff_state field is is used to store the uid, so may lose data. However,
- * the uid is used only in printing an error message, so is not critical.
- * Keeping it in a short keeps the data structure down to 32 bytes.
+ * bigger block that replaces it has been written to disk.
 */
 struct freefrag {
 	struct	worklist ff_list;	/* id_inowait or delayed worklist */
-#	define	ff_state ff_list.wk_state /* owning user; should be uid_t */
+#	define	ff_state ff_list.wk_state
+	struct	jfreefrag *ff_jfreefrag; /* Associated journal entry. */
+	struct	workhead ff_jwork;	/* Journal work pending. */
 	ufs2_daddr_t ff_blkno;		/* fragment physical block number */
 	long	ff_fragsize;		/* size of fragment being deleted */
 	ino_t	ff_inum;		/* owning inode number */
@ -423,20 +482,57 @@ struct freefrag {
 * A "freeblks" structure is attached to an "inodedep" when the
 * corresponding file's length is reduced to zero. It records all
 * the information needed to free the blocks of a file after its
- * zero'ed inode has been written to disk.
+ * zero'ed inode has been written to disk.  The actual work is done
+ * by child freework structures which are responsible for individual
+ * inode pointers while freeblks is responsible for retiring the
+ * entire operation when it is complete and holding common members.
 */
 struct freeblks {
 	struct	worklist fb_list;	/* id_inowait or delayed worklist */
 #	define	fb_state fb_list.wk_state /* inode and dirty block state */
+	struct	jfreeblkhd fb_jfreeblkhd; /* Journal entries pending */
+	struct	workhead fb_freeworkhd;	/* Work items pending */
+	struct	workhead fb_jwork;	/* Journal work pending */
 	ino_t	fb_previousinum;	/* inode of previous owner of blocks */
 	uid_t	fb_uid;			/* uid of previous owner of blocks */
 	struct	vnode *fb_devvp;	/* filesystem device vnode */
-	long	fb_oldextsize;		/* previous ext data size */
-	off_t	fb_oldsize;		/* previous file size */
 	ufs2_daddr_t fb_chkcnt;		/* used to check cnt of blks released */
-	ufs2_daddr_t fb_dblks[NDADDR];	/* direct blk ptrs to deallocate */
-	ufs2_daddr_t fb_iblks[NIADDR];	/* indirect blk ptrs to deallocate */
-	ufs2_daddr_t fb_eblks[NXADDR];	/* indirect blk ptrs to deallocate */
+	int	fb_ref;			/* Children outstanding. */
+};
+
+/*
+ * A "freework" structure handles the release of a tree of blocks or a single
+ * block.  Each indirect block in a tree is allocated its own freework
+ * structure so that the indirect block may be freed only when all of its
+ * children are freed.  In this way we enforce the rule that an allocated
+ * block must have a valid path to a root that is journaled.  Each child
+ * block acquires a reference and when the ref hits zero the parent ref
+ * is decremented.  If there is no parent the freeblks ref is decremented.
+ */
+struct freework {
+	struct	worklist fw_list;		/* Worklist link, type, state. */
+#	define	fw_state fw_list.wk_state
+	LIST_ENTRY(freework) fw_next;		/* Queue for freeblks. */
+	struct	freeblks *fw_freeblks;		/* Root of operation. */
+	struct	freework *fw_parent;		/* Parent indirect. */
+	ufs2_daddr_t	 fw_blkno;		/* Our block #. */
+	ufs_lbn_t	 fw_lbn;		/* Original lbn before free. */
+	int		 fw_frags;		/* Number of frags. */
+	int		 fw_ref;		/* Number of children out. */
+	int		 fw_off;		/* Current working position. */
+	struct	workhead fw_jwork;		/* Journal work pending. */
+};
+
+/*
+ * A "freedep" structure is allocated to track the completion of a bitmap
+ * write for a freework.  One freedep may cover many freed blocks so long
+ * as they reside in the same cylinder group.  When the cg is written
+ * the freedep decrements the ref on the freework which may permit it
+ * to be freed as well.
+ */
+struct freedep {
+	struct	worklist fd_list;
+	struct	freework *fd_freework;	/* Parent freework. */
 };

 /*
@ -450,6 +546,7 @@ struct freefile {
 	mode_t	fx_mode;		/* mode of inode */
 	ino_t	fx_oldinum;		/* inum of the unlinked file */
 	struct	vnode *fx_devvp;	/* filesystem device vnode */
+	struct	workhead fx_jwork;	/* journal work pending. */
 };

 /*
@ -482,12 +579,11 @@ struct freefile {
 * than zero.
 *
 * The overlaying of da_pagedep and da_previous is done to keep the
- * structure down to 32 bytes in size on a 32-bit machine. If a
- * da_previous entry is present, the pointer to its pagedep is available
- * in the associated dirrem entry. If the DIRCHG flag is set, the
- * da_previous entry is valid; if not set the da_pagedep entry is valid.
- * The DIRCHG flag never changes; it is set when the structure is created
- * if appropriate and is never cleared.
+ * structure down. If a da_previous entry is present, the pointer to its
+ * pagedep is available in the associated dirrem entry. If the DIRCHG flag
+ * is set, the da_previous entry is valid; if not set the da_pagedep entry
+ * is valid. The DIRCHG flag never changes; it is set when the structure
+ * is created if appropriate and is never cleared.
 */
 struct diradd {
 	struct	worklist da_list;	/* id_inowait or id_pendinghd list */
@ -499,6 +595,7 @@ struct diradd {
 	struct	dirrem *dau_previous;	/* entry being replaced in dir change */
 	struct	pagedep *dau_pagedep;	/* pagedep dependency for addition */
 	} da_un;
+	struct workhead da_jwork;	/* Journal work awaiting completion. */
 };
 #define da_previous da_un.dau_previous
 #define da_pagedep da_un.dau_pagedep
@ -525,12 +622,13 @@ struct diradd {
 * mkdir structures that reference it. The deletion would be faster if the
 * diradd structure were simply augmented to have two pointers that referenced
 * the associated mkdir's. However, this would increase the size of the diradd
- * structure from 32 to 64-bits to speed a very infrequent operation.
+ * structure to speed a very infrequent operation.
 */
 struct mkdir {
 	struct	worklist md_list;	/* id_inowait or buffer holding dir */
 #	define	md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */
 	struct	diradd *md_diradd;	/* associated diradd */
+	struct	jaddref *md_jaddref;	/* dependent jaddref. */
 	struct	buf *md_buf;		/* MKDIR_BODY: buffer holding dir */
 	LIST_ENTRY(mkdir) md_mkdirs;	/* list of all mkdirs */
 };
@ -542,20 +640,19 @@ LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
 * list of the pagedep for the directory page that contains the entry.
 * It is processed after the directory page with the deleted entry has
 * been written to disk.
- *
- * The overlaying of dm_pagedep and dm_dirinum is done to keep the
- * structure down to 32 bytes in size on a 32-bit machine. It works
- * because they are never used concurrently.
 */
 struct dirrem {
 	struct	worklist dm_list;	/* delayed worklist */
 #	define	dm_state dm_list.wk_state /* state of the old directory entry */
 	LIST_ENTRY(dirrem) dm_next;	/* pagedep's list of dirrem's */
+	LIST_ENTRY(dirrem) dm_inonext;	/* inodedep's list of dirrem's */
+	struct	jremrefhd dm_jremrefhd;	/* Pending remove reference deps. */
 	ino_t	dm_oldinum;		/* inum of the removed dir entry */
 	union {
 	struct	pagedep *dmu_pagedep;	/* pagedep dependency for remove */
 	ino_t	dmu_dirinum;		/* parent inode number (for rmdir) */
 	} dm_un;
+	struct workhead dm_jwork;	/* Journal work awaiting completion. */
 };
 #define dm_pagedep dm_un.dmu_pagedep
 #define dm_dirinum dm_un.dmu_dirinum
@ -577,9 +674,200 @@ struct dirrem {
 * blocks using a similar scheme with the allocindir structures. Rather
 * than adding this level of complexity, we simply write those newly 
 * allocated indirect blocks synchronously as such allocations are rare.
+ * In the case of a new directory the . and .. links are tracked with
+ * a mkdir rather than a pagedep.  In this case we track the mkdir
+ * so it can be released when it is written.  A workhead is used
+ * to simplify canceling a mkdir that is removed by a subsequent dirrem.
 */
 struct newdirblk {
 	struct	worklist db_list;	/* id_inowait or pg_newdirblk */
 #	define	db_state db_list.wk_state /* unused */
 	struct	pagedep *db_pagedep;	/* associated pagedep */
+	struct	workhead db_mkdir;
+};
+
+/*
+ * The inoref structure holds the elements common to jaddref and jremref
+ * so they may easily be queued in-order on the inodedep.
+ */
+struct inoref {
+	struct	worklist if_list;
+#	define	if_state if_list.wk_state
+	TAILQ_ENTRY(inoref) if_deps;	/* Links for inodedep. */
+	struct	jsegdep	*if_jsegdep;
+	off_t		if_diroff;	/* Directory offset. */
+	ino_t		if_ino;		/* Inode number. */
+	ino_t		if_parent;	/* Parent inode number. */
+	nlink_t		if_nlink;	/* nlink before addition. */
+	uint16_t	if_mode;	/* File mode, needed for IFMT. */
+};
+
+/*
+ * A "jaddref" structure tracks a new reference (link count) on an inode
+ * and prevents the link count increase and bitmap allocation until a
+ * journal entry can be written.  Once the journal entry is written,
+ * the inode is put on the pendinghd of the bmsafemap and a diradd or
+ * mkdir entry is placed on the bufwait list of the inode.  The DEPCOMPLETE
+ * flag is used to indicate that all of the required information for writing
+ * the journal entry is present.  MKDIR_BODY and MKDIR_PARENT are used to
+ * differentiate . and .. links from regular file names.  NEWBLOCK indicates
+ * a bitmap is still pending.  If a new reference is canceled by a delete
+ * prior to writing the journal the jaddref write is canceled and the
+ * structure persists to prevent any disk-visible changes until it is
+ * ultimately released when the file is freed or the link is dropped again.
+ */
+struct jaddref {
+	struct	inoref	ja_ref;
+#	define	ja_list	ja_ref.if_list	/* Journal pending or jseg entries. */
+#	define	ja_state ja_ref.if_list.wk_state
+	LIST_ENTRY(jaddref) ja_bmdeps;	/* Links for bmsafemap. */
+	union {
+		struct	diradd	*jau_diradd;	/* Pending diradd. */
+		struct	mkdir	*jau_mkdir;	/* MKDIR_{PARENT,BODY} */
+	} ja_un;
+};
+#define	ja_diradd	ja_un.jau_diradd
+#define	ja_mkdir	ja_un.jau_mkdir
+#define	ja_diroff	ja_ref.if_diroff
+#define	ja_ino		ja_ref.if_ino
+#define	ja_parent	ja_ref.if_parent
+#define	ja_mode		ja_ref.if_mode
+
+/*
+ * A "jremref" structure tracks a removed reference (unlink) on an
+ * inode and prevents the directory remove from proceeding until the
+ * journal entry is written.  Once the journal has been written the remove
+ * may proceed as normal. 
+ */
+struct jremref {
+	struct	inoref	jr_ref;		/* Fields shared with jaddref. */
+#	define	jr_list	jr_ref.if_list	/* Journal pending or jseg entries. */
+#	define	jr_state jr_ref.if_list.wk_state
+	LIST_ENTRY(jremref) jr_deps;	/* Links; presumably dm_jremrefhd. */
+	struct	dirrem	*jr_dirrem;	/* Back pointer to dirrem. */
+};
+
+struct jmvref {
+	struct	worklist jm_list;
+	LIST_ENTRY(jmvref) jm_deps;
+	struct pagedep	*jm_pagedep;
+	ino_t		jm_parent;
+	ino_t		jm_ino;
+	off_t		jm_oldoff;
+	off_t		jm_newoff;
+};
+
+/*
+ * A "jnewblk" structure tracks a newly allocated block or fragment and
+ * prevents the direct or indirect block pointer as well as the cg bitmap
+ * from being written until it is logged.  After it is logged the jsegdep
+ * is attached to the allocdirect or allocindir until the operation is
+ * completed or reverted.  If the operation is reverted prior to the journal
+ * write the jnewblk structure is maintained to prevent the bitmaps from
+ * reaching the disk.  Ultimately the jnewblk structure will be passed
+ * to the free routine as the in memory cg is modified back to the free
+ * state at which time it can be released.
+ */
+struct jnewblk {
+	struct	worklist jn_list;
+#	define	jn_state jn_list.wk_state
+	struct	jsegdep	*jn_jsegdep;
+	LIST_ENTRY(jnewblk) jn_deps;		/* All jnewblks on bmsafemap */
+	struct	newblk	*jn_newblk;
+	ino_t		jn_ino;
+	ufs_lbn_t	jn_lbn;
+	ufs2_daddr_t	jn_blkno;
+	int		jn_oldfrags;
+	int		jn_frags;
+};
+
+/*
+ * A "jfreeblk" structure tracks the journal write for freeing a block
+ * or tree of blocks.  The block pointer must not be cleared in the inode
+ * or indirect prior to the jfreeblk being written.
+ */
+struct jfreeblk {
+	struct	worklist jf_list;
+#	define	jf_state jf_list.wk_state
+	struct	jsegdep	*jf_jsegdep;
+	struct freeblks	*jf_freeblks;
+	LIST_ENTRY(jfreeblk) jf_deps;
+	ino_t		jf_ino;
+	ufs_lbn_t	jf_lbn;
+	ufs2_daddr_t	jf_blkno;
+	int		jf_frags;
+};
+
+/*
+ * A "jfreefrag" tracks the freeing of a single block when a fragment is
+ * extended or an indirect page is replaced.  It is not part of a larger
+ * freeblks operation.
+ */
+struct jfreefrag {
+	struct	worklist fr_list;
+#	define	fr_state fr_list.wk_state
+	struct	jsegdep	*fr_jsegdep;
+	struct freefrag	*fr_freefrag;
+	ino_t		fr_ino;
+	ufs_lbn_t	fr_lbn;
+	ufs2_daddr_t	fr_blkno;
+	int		fr_frags;
+};
+
+/*
+ * A "jtrunc" journals the intent to truncate an inode to a non-zero
+ * value.  This is done synchronously prior to the synchronous partial
+ * truncation process.  The jsegdep is not released until the truncation
+ * is complete and the truncated inode is fsync'd.
+ */
+struct jtrunc {
+	struct	worklist jt_list;
+	struct	jsegdep	*jt_jsegdep;
+	ino_t		 jt_ino;
+	off_t		 jt_size;
+	int		 jt_extsize;
+};
+
+/*
+ * A "jsegdep" structure tracks a single reference to a written journal
+ * segment so the journal space can be reclaimed when all dependencies
+ * have been written.
+ */
+struct jsegdep {
+	struct	worklist jd_list;
+#	define	jd_state jd_list.wk_state
+	struct	jseg	*jd_seg;
+};
+
+/*
+ * A "jseg" structure contains all of the journal records written in a
+ * single disk write.  jaddref and jremref structures are linked into
+ * js_entries so they may be completed when the write completes.  The
+ * js_deps array contains as many entries as there are ref counts to
+ * reduce the number of allocations required per journal write to one.
+ */
+struct jseg {
+	struct	worklist js_list;	/* b_deps link for journal */
+#	define	js_state js_list.wk_state
+	struct	workhead js_entries;	/* Entries awaiting write */
+	TAILQ_ENTRY(jseg) js_next;
+	struct	jblocks *js_jblocks;	/* Back pointer to block/seg list */
+	struct	buf *js_buf;		/* Buffer while unwritten */
+	uint64_t js_seq;
+	int	js_size;		/* Allocated size in bytes */
+	int	js_cnt;			/* Total items allocated */
+	int	js_refs;		/* Count of items pending completion */
+};
+
+/*
+ * A 'sbdep' structure tracks the head of the free inode list and
+ * superblock writes.  This makes sure the superblock is always pointing at
+ * the first possible unlinked inode for the suj recovery process.  If a
+ * block write completes and we discover a new head is available the buf
+ * is dirtied and the dep is kept.
+ */
+struct sbdep {
+	struct	worklist sb_list;	/* b_dep linkage */
+	struct	fs	*sb_fs;		/* Filesystem pointer within buf. */
+	struct	ufsmount *sb_ump;
 };
--- a/sys/ufs/ufs/dinode.h
+++ b/sys/ufs/ufs/dinode.h
@ -146,7 +146,8 @@ struct ufs2_dinode {
 	ufs2_daddr_t	di_db[NDADDR];	/* 112: Direct disk blocks. */
 	ufs2_daddr_t	di_ib[NIADDR];	/* 208: Indirect disk blocks. */
 	u_int64_t	di_modrev;	/* 232: i_modrev for NFSv4 */
-	int64_t		di_spare[2];	/* 240: Reserved; currently unused */
+	ino_t		di_freelink;	/* 240: SUJ: Next unlinked inode. */
+	uint32_t	di_spare[3];	/* 244: Reserved; currently unused */
 };

 /*
@ -167,9 +168,7 @@ struct ufs2_dinode {
 struct ufs1_dinode {
 	u_int16_t	di_mode;	/*   0: IFMT, permissions; see below. */
 	int16_t		di_nlink;	/*   2: File link count. */
-	union {
-		u_int16_t oldids[2];	/*   4: Ffs: old user and group ids. */
-	} di_u;
+	ino_t		di_freelink;	/*   4: SUJ: Next unlinked inode. */
 	u_int64_t	di_size;	/*   8: File byte count. */
 	int32_t		di_atime;	/*  16: Last access time. */
 	int32_t		di_atimensec;	/*  20: Last access time. */
@ -186,7 +185,5 @@ struct ufs1_dinode {
 	u_int32_t	di_gid;		/* 116: File group. */
 	u_int64_t	di_modrev;	/* 120: i_modrev for NFSv4 */
 };
-#define	di_ogid		di_u.oldids[1]
-#define	di_ouid		di_u.oldids[0]

 #endif /* _UFS_UFS_DINODE_H_ */
--- a/sys/ufs/ufs/inode.h
+++ b/sys/ufs/ufs/inode.h
@ -120,7 +120,7 @@ struct inode {
 #define	IN_CHANGE	0x0002		/* Inode change time update request. */
 #define	IN_UPDATE	0x0004		/* Modification time update request. */
 #define	IN_MODIFIED	0x0008		/* Inode has been modified. */
-#define	IN_RENAME	0x0010		/* Inode is being renamed. */
+#define	IN_NEEDSYNC	0x0010		/* Inode requires fsync. */
 #define	IN_LAZYMOD	0x0040		/* Modified, but don't write yet. */
 #define	IN_SPACECOUNTED	0x0080		/* Blocks to be freed in free count. */
 #define	IN_LAZYACCESS	0x0100		/* Process IN_ACCESS after the
@ -175,6 +175,7 @@ struct indir {
 /* Determine if soft dependencies are being done */
 #define DOINGSOFTDEP(vp)	((vp)->v_mount->mnt_flag & MNT_SOFTDEP)
 #define DOINGASYNC(vp)		((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC)
+#define DOINGSUJ(vp)		((vp)->v_mount->mnt_kern_flag & MNTK_SUJ)

 /* This overlays the fid structure (see mount.h). */
 struct ufid {
--- a/sys/ufs/ufs/ufs_dirhash.c
+++ b/sys/ufs/ufs/ufs_dirhash.c
@ -68,8 +68,6 @@ __FBSDID("$FreeBSD$");

 static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");

-static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
-
 static int ufs_mindirhashsize = DIRBLKSIZ * 5;
 SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
    &ufs_mindirhashsize,
--- a/sys/ufs/ufs/ufs_extern.h
+++ b/sys/ufs/ufs/ufs_extern.h
@ -57,7 +57,7 @@ int	 ufs_bmap(struct vop_bmap_args *);
 int	 ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *,
 	    struct buf *, int *, int *);
 int	 ufs_fhtovp(struct mount *, struct ufid *, struct vnode **);
-int	 ufs_checkpath(ino_t, struct inode *, struct ucred *);
+int	 ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *);
 void	 ufs_dirbad(struct inode *, doff_t, char *);
 int	 ufs_dirbadentry(struct vnode *, struct direct *, int);
 int	 ufs_dirempty(struct inode *, ino_t, struct ucred *);
@ -66,9 +66,11 @@ int	 ufs_extwrite(struct vop_write_args *);
 void	 ufs_makedirentry(struct inode *, struct componentname *,
 	    struct direct *);
 int	 ufs_direnter(struct vnode *, struct vnode *, struct direct *,
-	    struct componentname *, struct buf *);
+	    struct componentname *, struct buf *, int);
 int	 ufs_dirremove(struct vnode *, struct inode *, int, int);
 int	 ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int);
+int	 ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *,
+	    ino_t *);
 int	 ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *);
 int	 ufs_inactive(struct vop_inactive_args *);
 int	 ufs_init(struct vfsconf *);
@ -81,19 +83,33 @@ vfs_root_t ufs_root;
 int	 ufs_uninit(struct vfsconf *);
 int	 ufs_vinit(struct mount *, struct vop_vector *, struct vnode **);

+#include <sys/sysctl.h>
+SYSCTL_DECL(_vfs_ufs);
+
 /*
 * Soft update function prototypes.
 */
 int	softdep_setup_directory_add(struct buf *, struct inode *, off_t,
 	    ino_t, struct buf *, int);
-void	softdep_change_directoryentry_offset(struct inode *, caddr_t,
-	    caddr_t, caddr_t, int);
+void	softdep_change_directoryentry_offset(struct buf *, struct inode *,
+	    caddr_t, caddr_t, caddr_t, int);
 void	softdep_setup_remove(struct buf *,struct inode *, struct inode *, int);
 void	softdep_setup_directory_change(struct buf *, struct inode *,
 	    struct inode *, ino_t, int);
 void	softdep_change_linkcnt(struct inode *);
 void	softdep_releasefile(struct inode *);
 int	softdep_slowdown(struct vnode *);
+void	softdep_setup_create(struct inode *, struct inode *);
+void	softdep_setup_dotdot_link(struct inode *, struct inode *);
+void	softdep_setup_link(struct inode *, struct inode *);
+void	softdep_setup_mkdir(struct inode *, struct inode *);
+void	softdep_setup_rmdir(struct inode *, struct inode *);
+void	softdep_setup_unlink(struct inode *, struct inode *);
+void	softdep_revert_create(struct inode *, struct inode *);
+void	softdep_revert_dotdot_link(struct inode *, struct inode *);
+void	softdep_revert_link(struct inode *, struct inode *);
+void	softdep_revert_mkdir(struct inode *, struct inode *);
+void	softdep_revert_rmdir(struct inode *, struct inode *);

 /*
 * Flags to low-level allocation routines.  The low 16-bits are reserved
--- a/sys/ufs/ufs/ufs_lookup.c
+++ b/sys/ufs/ufs/ufs_lookup.c
@ -77,9 +77,6 @@ SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, "");
 /* true if old FS format...*/
 #define OFSFMT(vp)	((vp)->v_mount->mnt_maxsymlinklen <= 0)

-static int ufs_lookup_(struct vnode *, struct vnode **, struct componentname *,
-    ino_t *);
-
 static int
 ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred,
    struct thread *td)
@ -189,11 +186,11 @@ ufs_lookup(ap)
 	} */ *ap;
 {

-	return (ufs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
+	return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
 }

-static int
-ufs_lookup_(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
+int
+ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
    ino_t *dd_ino)
 {
 	struct inode *dp;		/* inode for directory being searched */
@ -524,6 +521,8 @@ notfound:
 	return (ENOENT);

 found:
+	if (dd_ino != NULL)
+		*dd_ino = ino;
 	if (numdirpasses == 2)
 		nchstats.ncs_pass2++;
 	/*
@ -546,11 +545,6 @@ found:
 	if ((flags & ISLASTCN) && nameiop == LOOKUP)
 		dp->i_diroff = i_offset &~ (DIRBLKSIZ - 1);

-	if (dd_ino != NULL) {
-		*dd_ino = ino;
-		return (0);
-	}
-
 	/*
 	 * If deleting, and at end of pathname, return
 	 * parameters which can be used to remove file.
@ -558,17 +552,6 @@ found:
 	if (nameiop == DELETE && (flags & ISLASTCN)) {
 		if (flags & LOCKPARENT)
 			ASSERT_VOP_ELOCKED(vdp, __FUNCTION__);
-		if ((error = VFS_VGET(vdp->v_mount, ino,
-		    LK_EXCLUSIVE, &tdp)) != 0)
-			return (error);
-
-		error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread);
-		if (error) {
-			vput(tdp);
-			return (error);
-		}
-
-
 		/*
 		 * Return pointer to current entry in dp->i_offset,
 		 * and distance past previous entry (if there
@ -585,6 +568,16 @@ found:
 			dp->i_count = 0;
 		else
 			dp->i_count = dp->i_offset - prevoff;
+		if (dd_ino != NULL)
+			return (0);
+		if ((error = VFS_VGET(vdp->v_mount, ino,
+		    LK_EXCLUSIVE, &tdp)) != 0)
+			return (error);
+		error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread);
+		if (error) {
+			vput(tdp);
+			return (error);
+		}
 		if (dp->i_number == ino) {
 			VREF(vdp);
 			*vpp = vdp;
@ -616,6 +609,8 @@ found:
 		dp->i_offset = i_offset;
 		if (dp->i_number == ino)
 			return (EISDIR);
+		if (dd_ino != NULL)
+			return (0);
 		if ((error = VFS_VGET(vdp->v_mount, ino,
 		    LK_EXCLUSIVE, &tdp)) != 0)
 			return (error);
@ -650,6 +645,8 @@ found:
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
+	if (dd_ino != NULL)
+		return (0);

 	/*
 	 * Step through the translation in the name.  We do not `vput' the
@ -681,7 +678,7 @@ found:
 		 * to the inode we looked up before vdp lock was
 		 * dropped.
 		 */
-		error = ufs_lookup_(pdp, NULL, cnp, &ino1);
+		error = ufs_lookup_ino(pdp, NULL, cnp, &ino1);
 		if (error) {
 			vput(tdp);
 			return (error);
@ -833,12 +830,13 @@ ufs_makedirentry(ip, cnp, newdirp)
 * soft dependency code).
 */
 int
-ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
+ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename)
 	struct vnode *dvp;
 	struct vnode *tvp;
 	struct direct *dirp;
 	struct componentname *cnp;
 	struct buf *newdirbp;
+	int isrename;
 {
 	struct ucred *cr;
 	struct thread *td;
@ -911,22 +909,28 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
 				blkoff += DIRBLKSIZ;
 			}
 			if (softdep_setup_directory_add(bp, dp, dp->i_offset,
-			    dirp->d_ino, newdirbp, 1) == 0) {
-				bdwrite(bp);
+			    dirp->d_ino, newdirbp, 1))
+				dp->i_flag |= IN_NEEDSYNC;
+			if (newdirbp)
+				bdwrite(newdirbp);
+			bdwrite(bp);
+			if ((dp->i_flag & IN_NEEDSYNC) == 0)
 				return (UFS_UPDATE(dvp, 0));
-			}
-			/* We have just allocated a directory block in an
-			 * indirect block. Rather than tracking when it gets
-			 * claimed by the inode, we simply do a VOP_FSYNC
-			 * now to ensure that it is there (in case the user
-			 * does a future fsync). Note that we have to unlock
-			 * the inode for the entry that we just entered, as
-			 * the VOP_FSYNC may need to lock other inodes which
-			 * can lead to deadlock if we also hold a lock on
-			 * the newly entered node.
+			/*
+			 * We have just allocated a directory block in an
+			 * indirect block.  We must prevent holes in the
+			 * directory created if directory entries are
+			 * written out of order.  To accomplish this we
+			 * fsync when we extend a directory into indirects.
+			 * During rename it's not safe to drop the tvp lock
+			 * so sync must be delayed until it is.
+			 *
+			 * This synchronous step could be removed if fsck and
+			 * the kernel were taught to fill in sparse
+			 * directories rather than panic.
 			 */
-			if ((error = bwrite(bp)))
-				return (error);
+			if (isrename)
+				return (0);
 			if (tvp != NULL)
 				VOP_UNLOCK(tvp, 0);
 			error = VOP_FSYNC(dvp, MNT_WAIT, td);
@ -1015,7 +1019,7 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
 			    dp->i_offset + ((char *)ep - dirbuf));
 #endif
 		if (DOINGSOFTDEP(dvp))
-			softdep_change_directoryentry_offset(dp, dirbuf,
+			softdep_change_directoryentry_offset(bp, dp, dirbuf,
 			    (caddr_t)nep, (caddr_t)ep, dsize); 
 		else
 			bcopy((caddr_t)nep, (caddr_t)ep, dsize);
@ -1067,6 +1071,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
 		(void) softdep_setup_directory_add(bp, dp,
 		    dp->i_offset + (caddr_t)ep - dirbuf,
 		    dirp->d_ino, newdirbp, 0);
+		if (newdirbp != NULL)
+			bdwrite(newdirbp);
 		bdwrite(bp);
 	} else {
 		if (DOINGASYNC(dvp)) {
@ -1084,7 +1090,8 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
 	 * lock other inodes which can lead to deadlock if we also hold a
 	 * lock on the newly entered node.
 	 */
-	if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) {
+	if (isrename == 0 && error == 0 &&
+	    dp->i_endoff && dp->i_endoff < dp->i_size) {
 		if (tvp != NULL)
 			VOP_UNLOCK(tvp, 0);
 #ifdef UFS_DIRHASH
@ -1125,6 +1132,19 @@ ufs_dirremove(dvp, ip, flags, isrmdir)

 	dp = VTOI(dvp);

+	/*
+	 * Adjust the link count early so softdep can block if necessary.
+	 */
+	if (ip) {
+		ip->i_effnlink--;
+		if (DOINGSOFTDEP(dvp)) {
+			softdep_setup_unlink(dp, ip);
+		} else {
+			ip->i_nlink--;
+			DIP_SET(ip, i_nlink, ip->i_nlink);
+			ip->i_flag |= IN_CHANGE;
+		}
+	}
 	if (flags & DOWHITEOUT) {
 		/*
 		 * Whiteout entry: set d_ino to WINO.
@ -1154,6 +1174,9 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
 	if (dp->i_dirhash != NULL)
 		ufsdirhash_remove(dp, rep, dp->i_offset);
 #endif
+	if (ip && rep->d_ino != ip->i_number)
+		panic("ufs_dirremove: ip %d does not match dirent ino %d\n",
+		    ip->i_number, rep->d_ino);
 	if (dp->i_count == 0) {
 		/*
 		 * First entry in block: set d_ino to zero.
@ -1172,31 +1195,20 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
 		    dp->i_offset & ~(DIRBLKSIZ - 1));
 #endif
 out:
+	error = 0;
 	if (DOINGSOFTDEP(dvp)) {
-		if (ip) {
-			ip->i_effnlink--;
-			softdep_change_linkcnt(ip);
+		if (ip)
 			softdep_setup_remove(bp, dp, ip, isrmdir);
-		}
-		if (softdep_slowdown(dvp)) {
+		if (softdep_slowdown(dvp))
 			error = bwrite(bp);
-		} else {
+		else
 			bdwrite(bp);
-			error = 0;
-		}
 	} else {
-		if (ip) {
-			ip->i_effnlink--;
-			ip->i_nlink--;
-			DIP_SET(ip, i_nlink, ip->i_nlink);
-			ip->i_flag |= IN_CHANGE;
-		}
 		if (flags & DOWHITEOUT)
 			error = bwrite(bp);
-		else if (DOINGASYNC(dvp) && dp->i_count != 0) {
+		else if (DOINGASYNC(dvp) && dp->i_count != 0)
 			bdwrite(bp);
-			error = 0;
-		} else
+		else
 			error = bwrite(bp);
 	}
 	dp->i_flag |= IN_CHANGE | IN_UPDATE;
@ -1229,6 +1241,19 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
 	struct vnode *vdp = ITOV(dp);
 	int error;

+	/*
+	 * Drop the link before we lock the buf so softdep can block if
+	 * necessary.
+	 */
+	oip->i_effnlink--;
+	if (DOINGSOFTDEP(vdp)) {
+		softdep_setup_unlink(dp, oip);
+	} else {
+		oip->i_nlink--;
+		DIP_SET(oip, i_nlink, oip->i_nlink);
+		oip->i_flag |= IN_CHANGE;
+	}
+
 	error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp);
 	if (error)
 		return (error);
@ -1240,15 +1265,10 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir)
 	ep->d_ino = newinum;
 	if (!OFSFMT(vdp))
 		ep->d_type = newtype;
-	oip->i_effnlink--;
 	if (DOINGSOFTDEP(vdp)) {
-		softdep_change_linkcnt(oip);
 		softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir);
 		bdwrite(bp);
 	} else {
-		oip->i_nlink--;
-		DIP_SET(oip, i_nlink, oip->i_nlink);
-		oip->i_flag |= IN_CHANGE;
 		if (DOINGASYNC(vdp)) {
 			bdwrite(bp);
 			error = 0;
@ -1363,25 +1383,25 @@ ufs_dir_dd_ino(struct vnode *vp, struct ucred *cred, ino_t *dd_ino)

 /*
 * Check if source directory is in the path of the target directory.
- * Target is supplied locked, source is unlocked.
- * The target is always vput before returning.
 */
 int
-ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
+ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino)
 {
-	struct vnode *vp, *vp1;
+	struct mount *mp;
+	struct vnode *tvp, *vp, *vp1;
 	int error;
 	ino_t dd_ino;

-	vp = ITOV(target);
-	if (target->i_number == source_ino) {
-		error = EEXIST;
-		goto out;
-	}
-	error = 0;
+	vp = tvp = ITOV(target);
+	mp = vp->v_mount;
+	*wait_ino = 0;
+	if (target->i_number == source_ino)
+		return (EEXIST);
+	if (target->i_number == parent_ino)
+		return (0);
 	if (target->i_number == ROOTINO)
-		goto out;
-
+		return (0);
+	error = 0;
 	for (;;) {
 		error = ufs_dir_dd_ino(vp, cred, &dd_ino);
 		if (error != 0)
@ -1392,9 +1412,13 @@ ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
 		}
 		if (dd_ino == ROOTINO)
 			break;
-		error = vn_vget_ino(vp, dd_ino, LK_EXCLUSIVE, &vp1);
-		if (error != 0)
+		if (dd_ino == parent_ino)
 			break;
+		error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1);
+		if (error != 0) {
+			*wait_ino = dd_ino;
+			break;
+		}
 		/* Recheck that ".." still points to vp1 after relock of vp */
 		error = ufs_dir_dd_ino(vp, cred, &dd_ino);
 		if (error != 0) {
@ -1406,14 +1430,14 @@ ufs_checkpath(ino_t source_ino, struct inode *target, struct ucred *cred)
 			vput(vp1);
 			continue;
 		}
-		vput(vp);
+		if (vp != tvp)
+			vput(vp);
 		vp = vp1;
 	}

-out:
 	if (error == ENOTDIR)
-		printf("checkpath: .. not a directory\n");
-	if (vp != NULL)
+		panic("checkpath: .. not a directory\n");
+	if (vp != tvp)
 		vput(vp);
 	return (error);
 }
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@ -114,6 +114,8 @@ static vop_close_t	ufsfifo_close;
 static vop_kqfilter_t	ufsfifo_kqfilter;
 static vop_pathconf_t	ufsfifo_pathconf;

+SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
+
 /*
 * A virgin directory (no blushing please).
 */
@ -974,6 +976,9 @@ ufs_link(ap)
 		error = EXDEV;
 		goto out;
 	}
+	if (VTOI(tdvp)->i_effnlink < 2)
+		panic("ufs_link: Bad link count %d on parent",
+		    VTOI(tdvp)->i_effnlink);
 	ip = VTOI(vp);
 	if ((nlink_t)ip->i_nlink >= LINK_MAX) {
 		error = EMLINK;
@ -988,11 +993,11 @@ ufs_link(ap)
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(vp))
-		softdep_change_linkcnt(ip);
+		softdep_setup_link(VTOI(tdvp), ip);
 	error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp)));
 	if (!error) {
 		ufs_makedirentry(ip, cnp, &newdir);
-		error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL);
+		error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0);
 	}

 	if (error) {
@ -1001,7 +1006,7 @@ ufs_link(ap)
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(vp))
-			softdep_change_linkcnt(ip);
+			softdep_revert_link(VTOI(tdvp), ip);
 	}
 out:
 	return (error);
@ -1043,7 +1048,7 @@ ufs_whiteout(ap)
 		newdir.d_namlen = cnp->cn_namelen;
 		bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1);
 		newdir.d_type = DT_WHT;
-		error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL);
+		error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0);
 		break;

 	case DELETE:
@ -1062,6 +1067,11 @@ ufs_whiteout(ap)
 	return (error);
 }

+static volatile int rename_restarts;	/* atomic_add_int'd before ufs_rename retries at relock */
+SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD,
+    __DEVOLATILE(int *, &rename_restarts), 0,
+    "Times rename had to restart due to lock contention");
+
 /*
 * Rename system call.
 * 	rename("foo", "bar");
@ -1101,111 +1111,183 @@ ufs_rename(ap)
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
+	struct vnode *nvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct thread *td = fcnp->cn_thread;
-	struct inode *ip, *xp, *dp;
+	struct inode *fip, *tip, *tdp, *fdp;
 	struct direct newdir;
-	int doingdirectory = 0, oldparent = 0, newparent = 0;
+	off_t endoff;
+	int doingdirectory, newparent;
 	int error = 0, ioflag;
-	ino_t fvp_ino;
+	struct mount *mp;
+	ino_t ino;

 #ifdef INVARIANTS
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("ufs_rename: no name");
 #endif
+	endoff = 0;
+	mp = tdvp->v_mount;
+	VOP_UNLOCK(tdvp, 0);
+	if (tvp && tvp != tdvp)
+		VOP_UNLOCK(tvp, 0);
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
-abortit:
-		if (tdvp == tvp)
-			vrele(tdvp);
-		else
-			vput(tdvp);
-		if (tvp)
-			vput(tvp);
-		vrele(fdvp);
-		vrele(fvp);
-		return (error);
+		mp = NULL;
+		goto releout;
 	}
-
+	error = vfs_busy(mp, 0);
+	if (error) {
+		mp = NULL;
+		goto releout;
+	}
+relock:
+	/* 
+	 * We need to acquire 2 to 4 locks depending on whether tvp is NULL
+	 * and fdvp and tdvp are the same directory.  Subsequently we need
+	 * to double-check all paths and in the directory rename case we
+	 * need to verify that we are not creating a directory loop.  To
+	 * handle this we acquire all but fdvp using non-blocking
+	 * acquisitions.  If we fail to acquire any lock in the path we will
+	 * drop all held locks, acquire the new lock in a blocking fashion,
+	 * and then release it and restart the rename.  This acquire/release
+	 * step ensures that we do not spin on a lock waiting for release.
+	 */
+	error = vn_lock(fdvp, LK_EXCLUSIVE);
+	if (error)
+		goto releout;
+	if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+		VOP_UNLOCK(fdvp, 0);
+		error = vn_lock(tdvp, LK_EXCLUSIVE);
+		if (error)
+			goto releout;
+		VOP_UNLOCK(tdvp, 0);
+		atomic_add_int(&rename_restarts, 1);
+		goto relock;
+	}
+	/*
+	 * Re-resolve fvp to be certain it still exists and fetch the
+	 * correct vnode.
+	 */
+	error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
+	if (error) {
+		VOP_UNLOCK(fdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		goto releout;
+	}
+	error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
+	if (error) {
+		VOP_UNLOCK(fdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		if (error != EBUSY)
+			goto releout;
+		error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
+		if (error != 0)
+			goto releout;
+		VOP_UNLOCK(nvp, 0);
+		vrele(fvp);
+		fvp = nvp;
+		atomic_add_int(&rename_restarts, 1);
+		goto relock;
+	}
+	vrele(fvp);
+	fvp = nvp;
+	/*
+	 * Re-resolve tvp and acquire the vnode lock if present.
+	 */
+	error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino);
+	if (error != 0 && error != EJUSTRETURN) {
+		VOP_UNLOCK(fdvp, 0);
+		VOP_UNLOCK(tdvp, 0);
+		VOP_UNLOCK(fvp, 0);
+		goto releout;
+	}
+	/*
+	 * If tvp disappeared we just carry on.
+	 */
+	if (error == EJUSTRETURN && tvp != NULL) {
+		vrele(tvp);
+		tvp = NULL;
+	}
+	/*
+	 * Get the tvp ino if the lookup succeeded.  We may have to restart
+	 * if the non-blocking acquire fails.
+	 */
+	if (error == 0) {
+		nvp = NULL;
+		error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
+		if (tvp)
+			vrele(tvp);
+		tvp = nvp;
+		if (error) {
+			VOP_UNLOCK(fdvp, 0);
+			VOP_UNLOCK(tdvp, 0);
+			VOP_UNLOCK(fvp, 0);
+			if (error != EBUSY)
+				goto releout;
+			error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
+			if (error != 0)
+				goto releout;
+			VOP_UNLOCK(nvp, 0);
+			atomic_add_int(&rename_restarts, 1);
+			goto relock;
+		}
+	}
+	fdp = VTOI(fdvp);
+	fip = VTOI(fvp);
+	tdp = VTOI(tdvp);
+	tip = NULL;
+	if (tvp)
+		tip = VTOI(tvp);
 	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(tdvp)->i_flags & APPEND))) {
 		error = EPERM;
-		goto abortit;
+		goto unlockout;
 	}
-
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
-	 * not call us in that case.  Temporarily just warn if they do.
+	 * not call us in that case.  However, things could change after
+	 * we drop the locks above.
 	 */
 	if (fvp == tvp) {
-		printf("ufs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
-		goto abortit;
+		goto unlockout;
 	}
-
-	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
-		goto abortit;
-	dp = VTOI(fdvp);
-	ip = VTOI(fvp);
-	if (ip->i_nlink >= LINK_MAX) {
-		VOP_UNLOCK(fvp, 0);
+	doingdirectory = 0;
+	newparent = 0;
+	ino = fip->i_number;
+	if (fip->i_nlink >= LINK_MAX) {
 		error = EMLINK;
-		goto abortit;
+		goto unlockout;
 	}
-	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
-	    || (dp->i_flags & APPEND)) {
-		VOP_UNLOCK(fvp, 0);
+	if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
+	    || (fdp->i_flags & APPEND)) {
 		error = EPERM;
-		goto abortit;
+		goto unlockout;
 	}
-	if ((ip->i_mode & IFMT) == IFDIR) {
+	if ((fip->i_mode & IFMT) == IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
-		    dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT ||
-		    (ip->i_flag & IN_RENAME)) {
-			VOP_UNLOCK(fvp, 0);
+		    fdp == fip ||
+		    (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
 			error = EINVAL;
-			goto abortit;
+			goto unlockout;
 		}
-		ip->i_flag |= IN_RENAME;
-		oldparent = dp->i_number;
+		if (fdp->i_number != tdp->i_number)
+			newparent = tdp->i_number;
 		doingdirectory = 1;
 	}
-	vrele(fdvp);
-
-	/*
-	 * When the target exists, both the directory
-	 * and target vnodes are returned locked.
-	 */
-	dp = VTOI(tdvp);
-	xp = NULL;
-	if (tvp)
-		xp = VTOI(tvp);
-
-	/*
-	 * 1) Bump link count while we're moving stuff
-	 *    around.  If we crash somewhere before
-	 *    completing our work, the link count
-	 *    may be wrong, but correctable.
-	 */
-	ip->i_effnlink++;
-	ip->i_nlink++;
-	DIP_SET(ip, i_nlink, ip->i_nlink);
-	ip->i_flag |= IN_CHANGE;
-	if (DOINGSOFTDEP(fvp))
-		softdep_change_linkcnt(ip);
-	if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) |
-				       DOINGASYNC(fvp)))) != 0) {
-		VOP_UNLOCK(fvp, 0);
-		goto bad;
+	if (fvp->v_mountedhere != NULL || (tvp && tvp->v_mountedhere != NULL)) {
+		error = EXDEV;
+		goto unlockout;
 	}

 	/*
@ -1214,35 +1296,55 @@ abortit:
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
-	 * as to be able to change "..". We must repeat the call
-	 * to namei, as the parent directory is unlocked by the
-	 * call to checkpath().
+	 * as to be able to change "..".
 	 */
-	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
-	fvp_ino = ip->i_number;
-	VOP_UNLOCK(fvp, 0);
-	if (oldparent != dp->i_number)
-		newparent = dp->i_number;
 	if (doingdirectory && newparent) {
-		if (error)	/* write access check above */
-			goto bad;
-		if (xp != NULL)
-			vput(tvp);
-		error = ufs_checkpath(fvp_ino, dp, tcnp->cn_cred);
+		error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 		if (error)
-			goto out;
+			goto unlockout;
+		error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred,
+		    &ino);
+		/*
+		 * We encountered a lock that we have to wait for.  Unlock
+		 * everything else and VGET before restarting.
+		 */
+		if (ino) {
+			VOP_UNLOCK(fdvp, 0);
+			VOP_UNLOCK(fvp, 0);
+			VOP_UNLOCK(tdvp, 0);
+			if (tvp)
+				VOP_UNLOCK(tvp, 0);
+			error = VFS_VGET(mp, ino, LK_SHARED, &nvp);
+			if (error == 0)
+				vput(nvp);
+			atomic_add_int(&rename_restarts, 1);
+			goto relock;
+		}
+		if (error)
+			goto unlockout;
 		if ((tcnp->cn_flags & SAVESTART) == 0)
 			panic("ufs_rename: lost to startdir");
-		VREF(tdvp);
-		error = relookup(tdvp, &tvp, tcnp);
-		if (error)
-			goto out;
-		vrele(tdvp);
-		dp = VTOI(tdvp);
-		xp = NULL;
-		if (tvp)
-			xp = VTOI(tvp);
 	}
+	if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 ||
+	    tdp->i_effnlink == 0)
+		panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp);
+
+	/*
+	 * 1) Bump link count while we're moving stuff
+	 *    around.  If we crash somewhere before
+	 *    completing our work, the link count
+	 *    may be wrong, but correctable.
+	 */
+	fip->i_effnlink++;
+	fip->i_nlink++;
+	DIP_SET(fip, i_nlink, fip->i_nlink);
+	fip->i_flag |= IN_CHANGE;
+	if (DOINGSOFTDEP(fvp))
+		softdep_setup_link(tdp, fip);
+	error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp)));
+	if (error)
+		goto bad;
+
 	/*
 	 * 2) If target doesn't exist, link the target
 	 *    to the source and unlink the source.
@ -1250,52 +1352,37 @@ abortit:
 	 *    entry to reference the source inode and
 	 *    expunge the original entry's existence.
 	 */
-	if (xp == NULL) {
-		if (dp->i_dev != ip->i_dev)
+	if (tip == NULL) {
+		if (tdp->i_dev != fip->i_dev)
 			panic("ufs_rename: EXDEV");
-		/*
-		 * Account for ".." in new directory.
-		 * When source and destination have the same
-		 * parent we don't fool with the link count.
-		 */
 		if (doingdirectory && newparent) {
-			if ((nlink_t)dp->i_nlink >= LINK_MAX) {
+			/*
+			 * Account for ".." in new directory.
+			 * When source and destination have the same
+			 * parent we don't adjust the link count.  The
+			 * actual link modification is completed when
+			 * .. is rewritten below.
+			 */
+			if ((nlink_t)tdp->i_nlink >= LINK_MAX) {
 				error = EMLINK;
 				goto bad;
 			}
-			dp->i_effnlink++;
-			dp->i_nlink++;
-			DIP_SET(dp, i_nlink, dp->i_nlink);
-			dp->i_flag |= IN_CHANGE;
-			if (DOINGSOFTDEP(tdvp))
-				softdep_change_linkcnt(dp);
-			error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) |
-						   DOINGASYNC(tdvp)));
-			if (error)
-				goto bad;
 		}
-		ufs_makedirentry(ip, tcnp, &newdir);
-		error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL);
-		if (error) {
-			if (doingdirectory && newparent) {
-				dp->i_effnlink--;
-				dp->i_nlink--;
-				DIP_SET(dp, i_nlink, dp->i_nlink);
-				dp->i_flag |= IN_CHANGE;
-				if (DOINGSOFTDEP(tdvp))
-					softdep_change_linkcnt(dp);
-				(void)UFS_UPDATE(tdvp, 1);
-			}
+		ufs_makedirentry(fip, tcnp, &newdir);
+		error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1);
+		if (error)
 			goto bad;
-		}
-		vput(tdvp);
+		/* Setup tdvp for directory compaction if needed. */
+		if (tdp->i_count && tdp->i_endoff &&
+		    tdp->i_endoff < tdp->i_size)
+			endoff = tdp->i_endoff;
 	} else {
-		if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
+		if (tip->i_dev != tdp->i_dev || tip->i_dev != fip->i_dev)
 			panic("ufs_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
 		 */
-		if (xp->i_number == ip->i_number)
+		if (tip->i_number == fip->i_number)
 			panic("ufs_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the caller
@ -1303,7 +1390,7 @@ abortit:
 		 * destination of the rename.  This implements append-only
 		 * directories.
 		 */
-		if ((dp->i_mode & S_ISTXT) &&
+		if ((tdp->i_mode & S_ISTXT) &&
 		    VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) &&
 		    VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) {
 			error = EPERM;
@ -1314,9 +1401,9 @@ abortit:
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
-		if ((xp->i_mode&IFMT) == IFDIR) {
-			if ((xp->i_effnlink > 2) ||
-			    !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
+		if ((tip->i_mode & IFMT) == IFDIR) {
+			if ((tip->i_effnlink > 2) ||
+			    !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
@ -1329,20 +1416,30 @@ abortit:
 			error = EISDIR;
 			goto bad;
 		}
-		error = ufs_dirrewrite(dp, xp, ip->i_number,
-		    IFTODT(ip->i_mode),
-		    (doingdirectory && newparent) ? newparent : doingdirectory);
-		if (error)
-			goto bad;
 		if (doingdirectory) {
 			if (!newparent) {
-				dp->i_effnlink--;
+				tdp->i_effnlink--;
 				if (DOINGSOFTDEP(tdvp))
-					softdep_change_linkcnt(dp);
+					softdep_change_linkcnt(tdp);
 			}
-			xp->i_effnlink--;
+			tip->i_effnlink--;
 			if (DOINGSOFTDEP(tvp))
-				softdep_change_linkcnt(xp);
+				softdep_change_linkcnt(tip);
+		}
+		error = ufs_dirrewrite(tdp, tip, fip->i_number,
+		    IFTODT(fip->i_mode),
+		    (doingdirectory && newparent) ? newparent : doingdirectory);
+		if (error) {
+			if (doingdirectory) {
+				if (!newparent) {
+					tdp->i_effnlink++;
+					if (DOINGSOFTDEP(tdvp))
+						softdep_change_linkcnt(tdp);
+				}
+				tip->i_effnlink++;
+				if (DOINGSOFTDEP(tvp))
+					softdep_change_linkcnt(tip);
+			}
 		}
 		if (doingdirectory && !DOINGSOFTDEP(tvp)) {
 			/*
@ -1357,115 +1454,107 @@ abortit:
 			 * them now.
 			 */
 			if (!newparent) {
-				dp->i_nlink--;
-				DIP_SET(dp, i_nlink, dp->i_nlink);
-				dp->i_flag |= IN_CHANGE;
+				tdp->i_nlink--;
+				DIP_SET(tdp, i_nlink, tdp->i_nlink);
+				tdp->i_flag |= IN_CHANGE;
 			}
-			xp->i_nlink--;
-			DIP_SET(xp, i_nlink, xp->i_nlink);
-			xp->i_flag |= IN_CHANGE;
+			tip->i_nlink--;
+			DIP_SET(tip, i_nlink, tip->i_nlink);
+			tip->i_flag |= IN_CHANGE;
 			ioflag = IO_NORMAL;
 			if (!DOINGASYNC(tvp))
 				ioflag |= IO_SYNC;
+			/* Don't go to bad here as the new link exists. */
 			if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag,
 			    tcnp->cn_cred, tcnp->cn_thread)) != 0)
-				goto bad;
+				goto unlockout;
 		}
-		vput(tdvp);
-		vput(tvp);
-		xp = NULL;
 	}

 	/*
-	 * 3) Unlink the source.
+	 * 3) Unlink the source.  We have to resolve the path again to
+	 * fix up the directory offset and count for ufs_dirremove.
 	 */
-	fcnp->cn_flags &= ~MODMASK;
-	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
-	if ((fcnp->cn_flags & SAVESTART) == 0)
-		panic("ufs_rename: lost from startdir");
-	VREF(fdvp);
-	error = relookup(fdvp, &fvp, fcnp);
-	if (error == 0)
-		vrele(fdvp);
-	if (fvp != NULL) {
-		xp = VTOI(fvp);
-		dp = VTOI(fdvp);
-	} else {
-		/*
-		 * From name has disappeared.  IN_RENAME is not sufficient
-		 * to protect against directory races due to timing windows,
-		 * so we have to remove the panic.  XXX the only real way
-		 * to solve this issue is at a much higher level.  By the
-		 * time we hit ufs_rename() it's too late.
-		 */
-#if 0
-		if (doingdirectory)
-			panic("ufs_rename: lost dir entry");
-#endif
-		vrele(ap->a_fvp);
-		return (0);
+	if (fdvp == tdvp) {
+		error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
+		if (error)
+			panic("ufs_rename: from entry went away!");
+		if (ino != fip->i_number)
+			panic("ufs_rename: ino mismatch %d != %d\n", ino,
+			    fip->i_number);
 	}
 	/*
-	 * Ensure that the directory entry still exists and has not
-	 * changed while the new name has been entered. If the source is
-	 * a file then the entry may have been unlinked or renamed. In
-	 * either case there is no further work to be done. If the source
-	 * is a directory then it cannot have been rmdir'ed; the IN_RENAME
-	 * flag ensures that it cannot be moved by another rename or removed
-	 * by a rmdir.
+	 * If the source is a directory with a
+	 * new parent, the link count of the old
+	 * parent directory must be decremented
+	 * and ".." set to point to the new parent.
 	 */
-	if (xp != ip) {
+	if (doingdirectory && newparent) {
 		/*
-		 * From name resolves to a different inode.  IN_RENAME is
-		 * not sufficient protection against timing window races
-		 * so we can't panic here.  XXX the only real way
-		 * to solve this issue is at a much higher level.  By the
-		 * time we hit ufs_rename() it's too late.
+		 * If tip exists we simply use its link, otherwise we must
+		 * add a new one.
 		 */
-#if 0
-		if (doingdirectory)
-			panic("ufs_rename: lost dir entry");
-#endif
-	} else {
-		/*
-		 * If the source is a directory with a
-		 * new parent, the link count of the old
-		 * parent directory must be decremented
-		 * and ".." set to point to the new parent.
-		 */
-		if (doingdirectory && newparent) {
-			xp->i_offset = mastertemplate.dot_reclen;
-			ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0);
-			cache_purge(fdvp);
+		if (tip == NULL) {
+			tdp->i_effnlink++;
+			tdp->i_nlink++;
+			DIP_SET(tdp, i_nlink, tdp->i_nlink);
+			tdp->i_flag |= IN_CHANGE;
+			if (DOINGSOFTDEP(tdvp))
+				softdep_setup_dotdot_link(tdp, fip);
+			error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) |
+						   DOINGASYNC(tdvp)));
+			/* Don't go to bad here as the new link exists. */
+			if (error)
+				goto unlockout;
 		}
-		error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0);
-		xp->i_flag &= ~IN_RENAME;
+		fip->i_offset = mastertemplate.dot_reclen;
+		ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0);
+		cache_purge(fdvp);
 	}
-	if (dp)
-		vput(fdvp);
-	if (xp)
-		vput(fvp);
-	vrele(ap->a_fvp);
+	error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0);
+
+unlockout:
+	vput(fdvp);
+	vput(fvp);
+	if (tvp)
+		vput(tvp);
+	/*
+	 * If compaction or fsync was requested do it now that other locks
+	 * are no longer needed.
+	 */
+	if (error == 0 && endoff != 0) {
+#ifdef UFS_DIRHASH
+		if (tdp->i_dirhash != NULL)
+			ufsdirhash_dirtrunc(tdp, endoff);
+#endif
+		UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC, tcnp->cn_cred,
+		    td);
+	}
+	if (error == 0 && tdp->i_flag & IN_NEEDSYNC)
+		error = VOP_FSYNC(tdvp, MNT_WAIT, td);
+	vput(tdvp);
+	if (mp)
+		vfs_unbusy(mp);
 	return (error);

 bad:
-	if (xp)
-		vput(ITOV(xp));
-	vput(ITOV(dp));
-out:
-	if (doingdirectory)
-		ip->i_flag &= ~IN_RENAME;
-	if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
-		ip->i_effnlink--;
-		ip->i_nlink--;
-		DIP_SET(ip, i_nlink, ip->i_nlink);
-		ip->i_flag |= IN_CHANGE;
-		ip->i_flag &= ~IN_RENAME;
-		if (DOINGSOFTDEP(fvp))
-			softdep_change_linkcnt(ip);
-		vput(fvp);
-	} else
-		vrele(fvp);
+	fip->i_effnlink--;
+	fip->i_nlink--;
+	DIP_SET(fip, i_nlink, fip->i_nlink);
+	fip->i_flag |= IN_CHANGE;
+	if (DOINGSOFTDEP(fvp))
+		softdep_revert_link(tdp, fip);
+	goto unlockout;
+
+releout:
+	vrele(fdvp);
+	vrele(fvp);
+	vrele(tdvp);
+	if (tvp)
+		vrele(tvp);
+	if (mp)
+		vfs_unbusy(mp);
+
 	return (error);
 }

@ -1767,8 +1856,7 @@ ufs_mkdir(ap)
 	ip->i_effnlink = 2;
 	ip->i_nlink = 2;
 	DIP_SET(ip, i_nlink, 2);
-	if (DOINGSOFTDEP(tvp))
-		softdep_change_linkcnt(ip);
+
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;
 		DIP_SET(ip, i_flags, ip->i_flags);
@ -1784,8 +1872,8 @@ ufs_mkdir(ap)
 	DIP_SET(dp, i_nlink, dp->i_nlink);
 	dp->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(dvp))
-		softdep_change_linkcnt(dp);
-	error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp)));
+		softdep_setup_mkdir(dp, ip);
+	error = UFS_UPDATE(dvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp)));
 	if (error)
 		goto bad;
 #ifdef MAC
@ -1863,7 +1951,7 @@ ufs_mkdir(ap)
 	else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp))))
 		goto bad;
 	ufs_makedirentry(ip, cnp, &newdir);
-	error = ufs_direnter(dvp, tvp, &newdir, cnp, bp);
+	error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0);
 	
 bad:
 	if (error == 0) {
@ -1873,8 +1961,6 @@ bad:
 		dp->i_nlink--;
 		DIP_SET(dp, i_nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
-		if (DOINGSOFTDEP(dvp))
-			softdep_change_linkcnt(dp);
 		/*
 		 * No need to do an explicit VOP_TRUNCATE here, vrele will
 		 * do this for us because we set the link count to 0.
@ -1884,7 +1970,8 @@ bad:
 		DIP_SET(ip, i_nlink, 0);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(tvp))
-			softdep_change_linkcnt(ip);
+			softdep_revert_mkdir(dp, ip);
+
 		vput(tvp);
 	}
 out:
@ -1920,10 +2007,13 @@ ufs_rmdir(ap)
 	 * tries to remove a locally mounted on directory).
 	 */
 	error = 0;
-	if ((ip->i_flag & IN_RENAME) || ip->i_effnlink < 2) {
+	if (ip->i_effnlink < 2) {
 		error = EINVAL;
 		goto out;
 	}
+	if (dp->i_effnlink < 3)
+		panic("ufs_dirrem: Bad link count %d on parent",
+		    dp->i_effnlink);
 	if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
 		error = ENOTEMPTY;
 		goto out;
@ -1947,18 +2037,14 @@ ufs_rmdir(ap)
 	 */
 	dp->i_effnlink--;
 	ip->i_effnlink--;
-	if (DOINGSOFTDEP(vp)) {
-		softdep_change_linkcnt(dp);
-		softdep_change_linkcnt(ip);
-	}
+	if (DOINGSOFTDEP(vp))
+		softdep_setup_rmdir(dp, ip);
 	error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
 	if (error) {
 		dp->i_effnlink++;
 		ip->i_effnlink++;
-		if (DOINGSOFTDEP(vp)) {
-			softdep_change_linkcnt(dp);
-			softdep_change_linkcnt(ip);
-		}
+		if (DOINGSOFTDEP(vp))
+			softdep_revert_rmdir(dp, ip);
 		goto out;
 	}
 	cache_purge(dvp);
@ -2464,6 +2550,9 @@ ufs_makeinode(mode, dvp, vpp, cnp)
 	if ((mode & IFMT) == 0)
 		mode |= IFREG;

+	if (VTOI(dvp)->i_effnlink < 2)
+		panic("ufs_makeinode: Bad link count %d on parent",
+		    VTOI(dvp)->i_effnlink);
 	error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
 	if (error)
 		return (error);
@ -2539,7 +2628,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
 	ip->i_nlink = 1;
 	DIP_SET(ip, i_nlink, 1);
 	if (DOINGSOFTDEP(tvp))
-		softdep_change_linkcnt(ip);
+		softdep_setup_create(VTOI(dvp), ip);
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
 	    priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) {
 		ip->i_mode &= ~ISGID;
@ -2579,7 +2668,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
 	}
 #endif /* !UFS_ACL */
 	ufs_makedirentry(ip, cnp, &newdir);
-	error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL);
+	error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0);
 	if (error)
 		goto bad;
 	*vpp = tvp;
@ -2595,7 +2684,7 @@ bad:
 	DIP_SET(ip, i_nlink, 0);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(tvp))
-		softdep_change_linkcnt(ip);
+		softdep_revert_create(VTOI(dvp), ip);
 	vput(tvp);
 	return (error);
 }
--- a/sys/ufs/ufs/ufsmount.h
+++ b/sys/ufs/ufs/ufsmount.h
@ -57,6 +57,10 @@ struct ucred;
 struct uio;
 struct vnode;
 struct ufs_extattr_per_mount;
+struct jblocks;
+struct inodedep;
+
+TAILQ_HEAD(inodedeplst, inodedep);

 /* This structure describes the UFS specific mount structure data. */
 struct ufsmount {
@ -75,6 +79,11 @@ struct ufsmount {
 	long	um_numindirdeps;		/* outstanding indirdeps */
 	struct	workhead softdep_workitem_pending; /* softdep work queue */
 	struct	worklist *softdep_worklist_tail; /* Tail pointer for above */
+	struct	workhead softdep_journal_pending; /* journal work queue */
+	struct	worklist *softdep_journal_tail;	/* Tail pointer for above */
+	struct	jblocks *softdep_jblocks;	/* Journal block information */
+	struct	inodedeplst softdep_unlinked; /* Unlinked inodes */
+	int	softdep_on_journal;		/* Items on the journal list */
 	int	softdep_on_worklist;		/* Items on the worklist */
 	int	softdep_on_worklist_inprogress;	/* Busy items on worklist */
 	int	softdep_deps;			/* Total dependency count */
--- a/usr.sbin/makefs/ffs/ffs_bswap.c
+++ b/usr.sbin/makefs/ffs/ffs_bswap.c
@ -136,8 +136,6 @@ ffs_dinode1_swap(struct ufs1_dinode *o, struct ufs1_dinode *n)

 	n->di_mode = bswap16(o->di_mode);
 	n->di_nlink = bswap16(o->di_nlink);
-	n->di_u.oldids[0] = bswap16(o->di_u.oldids[0]);
-	n->di_u.oldids[1] = bswap16(o->di_u.oldids[1]);
 	n->di_size = bswap64(o->di_size);
 	n->di_atime = bswap32(o->di_atime);
 	n->di_atimensec = bswap32(o->di_atimensec);