Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.

Let it rip!
This commit is contained in:
Linus Torvalds
2005-04-16 15:20:36 -07:00
commit 1da177e4c3
17291 changed files with 6718755 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
#
# Makefile for the linux kernel.
#
# Pass -traditional to the assembler; uncomment the flags below to
# enable FPU emulator debug output.
EXTRA_AFLAGS := -traditional
#EXTRA_AFLAGS += -DFPU_EMU_DEBUG
#EXTRA_CFLAGS += -DFPU_EMU_DEBUG
# Objects of the m68k math emulator: entry/decode, data movement,
# and the arithmetic/logarithmic/trigonometric kernels.
obj-y := fp_entry.o fp_scan.o fp_util.o fp_move.o fp_movem.o \
fp_cond.o fp_arith.o fp_log.o fp_trig.o

View File

@@ -0,0 +1,701 @@
/*
fp_arith.c: floating-point math routines for the Linux-m68k
floating point emulator.
Copyright (c) 1998-1999 David Huggins-Daines.
Somewhat based on the AlphaLinux floating point emulator, by David
Mosberger-Tang.
You may copy, modify, and redistribute this file under the terms of
the GNU General Public License, version 2, or any later version, at
your convenience.
*/
#include "fp_emu.h"
#include "multi_arith.h"
#include "fp_arith.h"
/* default quiet NaN: maximum exponent, all mantissa bits set */
const struct fp_ext fp_QNaN =
{
.exp = 0x7fff,
.mant = { .m64 = ~0 }
};
/* infinity: maximum exponent, zero mantissa (sign field gives +/-inf) */
const struct fp_ext fp_Inf =
{
.exp = 0x7fff,
};
/* let's start with the easy ones */
/* fp_fabs: absolute value -- clears the sign bit of dest.
   NaN propagation is handled by fp_monadic_check. */
struct fp_ext *
fp_fabs(struct fp_ext *dest, struct fp_ext *src)
{
	dprint(PINSTR, "fabs\n");

	fp_monadic_check(dest, src);

	dest->sign = 0;
	return dest;
}
/* fp_fneg: negation -- toggles the sign bit of dest.
   NaN propagation is handled by fp_monadic_check. */
struct fp_ext *
fp_fneg(struct fp_ext *dest, struct fp_ext *src)
{
	dprint(PINSTR, "fneg\n");

	fp_monadic_check(dest, src);

	dest->sign = !dest->sign;
	return dest;
}
/* Now, the slightly harder ones */
/* fp_fadd: Implements the kernel of the FADD, FSADD, FDADD, FSUB,
FDSUB, and FCMP instructions.
Adds src to dest in extended precision, handling infinities,
signed zeroes and operand alignment. */
struct fp_ext *
fp_fadd(struct fp_ext *dest, struct fp_ext *src)
{
int diff;
dprint(PINSTR, "fadd\n");
fp_dyadic_check(dest, src);
if (IS_INF(dest)) {
/* infinity - infinity == NaN */
if (IS_INF(src) && (src->sign != dest->sign))
fp_set_nan(dest);
return dest;
}
if (IS_INF(src)) {
/* x + inf == inf */
fp_copy_ext(dest, src);
return dest;
}
if (IS_ZERO(dest)) {
if (IS_ZERO(src)) {
/* +0 + -0: the result sign depends on the rounding
   mode: round-to-minus-infinity yields -0, all
   others yield +0 */
if (src->sign != dest->sign) {
if (FPDATA->rnd == FPCR_ROUND_RM)
dest->sign = 1;
else
dest->sign = 0;
}
} else
fp_copy_ext(dest, src);
return dest;
}
dest->lowmant = src->lowmant = 0;
/* align the operand with the smaller exponent to the larger one */
if ((diff = dest->exp - src->exp) > 0)
fp_denormalize(src, diff);
else if ((diff = -diff) > 0)
fp_denormalize(dest, diff);
if (dest->sign == src->sign) {
/* same sign: add magnitudes; on mantissa carry-out,
   fp_addcarry adjusts the exponent (see multi_arith.h) */
if (fp_addmant(dest, src))
if (!fp_addcarry(dest))
return dest;
} else {
/* opposite signs: subtract the smaller magnitude from the
   larger one, flipping the sign if src dominated */
if (dest->mant.m64 < src->mant.m64) {
fp_submant(dest, src, dest);
dest->sign = !dest->sign;
} else
fp_submant(dest, dest, src);
}
return dest;
}
/* fp_fsub: kernel of the FSUB, FSSUB, and FDSUB instructions.
   Arguments are in assembler-syntax order: computes dest - src.
   Implemented by flipping the sign of src and adding. */
struct fp_ext *
fp_fsub(struct fp_ext *dest, struct fp_ext *src)
{
	dprint(PINSTR, "fsub ");

	src->sign = !src->sign;
	return fp_fadd(dest, src);
}
/* fp_fcmp: kernel of the FCMP instruction.
   Compares by subtracting src from a scratch copy of dest, so that
   dest itself is left untouched -- only the condition codes derived
   from the returned difference matter. */
struct fp_ext *
fp_fcmp(struct fp_ext *dest, struct fp_ext *src)
{
	struct fp_ext *scratch = &FPDATA->temp[1];

	dprint(PINSTR, "fcmp ");

	*scratch = *dest;
	src->sign = !src->sign;
	return fp_fadd(scratch, src);
}
/* fp_ftst: kernel of the FTST instruction -- the condition codes are
   set from the returned operand; dest is deliberately unused. */
struct fp_ext *
fp_ftst(struct fp_ext *dest, struct fp_ext *src)
{
	dprint(PINSTR, "ftst\n");

	(void)dest;	/* only src is examined */

	return src;
}
/* fp_fmul: kernel of the FMUL/FSMUL/FDMUL instructions.
   Multiplies dest by src in extended precision. */
struct fp_ext *
fp_fmul(struct fp_ext *dest, struct fp_ext *src)
{
union fp_mant128 temp;
int exp;
dprint(PINSTR, "fmul\n");
fp_dyadic_check(dest, src);
/* calculate the correct sign now, as it's necessary for infinities */
dest->sign = src->sign ^ dest->sign;
/* Handle infinities */
if (IS_INF(dest)) {
/* infinity * zero == NaN */
if (IS_ZERO(src))
fp_set_nan(dest);
return dest;
}
if (IS_INF(src)) {
if (IS_ZERO(dest))
fp_set_nan(dest);
else
fp_copy_ext(dest, src);
return dest;
}
/* Of course, as we all know, zero * anything = zero. You may
not have known that it might be a positive or negative
zero... */
if (IS_ZERO(dest) || IS_ZERO(src)) {
dest->exp = 0;
dest->mant.m64 = 0;
dest->lowmant = 0;
return dest;
}
/* sum of biased exponents; -0x3ffe (not -0x3fff) leaves room for
   the one-bit normalization shift of the product below */
exp = dest->exp + src->exp - 0x3ffe;
/* shift up the mantissa for denormalized numbers,
so that the highest bit is set, this makes the
shift of the result below easier */
if ((long)dest->mant.m32[0] >= 0)
exp -= fp_overnormalize(dest);
if ((long)src->mant.m32[0] >= 0)
exp -= fp_overnormalize(src);
/* now, do a 64-bit multiply with expansion */
fp_multiplymant(&temp, dest, src);
/* normalize it back to 64 bits and stuff it back into the
destination struct */
if ((long)temp.m32[0] > 0) {
/* top bit of the 128-bit product ended up one position low */
exp--;
fp_putmant128(dest, &temp, 1);
} else
fp_putmant128(dest, &temp, 0);
if (exp >= 0x7fff) {
fp_set_ovrflw(dest);
return dest;
}
dest->exp = exp;
if (exp < 0) {
/* result is representable only as a denormal */
fp_set_sr(FPSR_EXC_UNFL);
fp_denormalize(dest, -exp);
}
return dest;
}
/* fp_fdiv: Implements the "kernel" of the FDIV, FSDIV, FDDIV and
FSGLDIV instructions.
Note that the order of the operands is counter-intuitive: instead
of src / dest, the result is actually dest / src. */
struct fp_ext *
fp_fdiv(struct fp_ext *dest, struct fp_ext *src)
{
union fp_mant128 temp;
int exp;
dprint(PINSTR, "fdiv\n");
fp_dyadic_check(dest, src);
/* calculate the correct sign now, as it's necessary for infinities */
dest->sign = src->sign ^ dest->sign;
/* Handle infinities */
if (IS_INF(dest)) {
/* infinity / infinity = NaN (quiet, as always) */
if (IS_INF(src))
fp_set_nan(dest);
/* infinity / anything else = infinity (with appropriate sign) */
return dest;
}
if (IS_INF(src)) {
/* anything / infinity = zero (with appropriate sign) */
dest->exp = 0;
dest->mant.m64 = 0;
dest->lowmant = 0;
return dest;
}
/* zeroes */
if (IS_ZERO(dest)) {
/* zero / zero = NaN */
if (IS_ZERO(src))
fp_set_nan(dest);
/* zero / anything else = zero */
return dest;
}
if (IS_ZERO(src)) {
/* anything / zero = infinity (with appropriate sign) */
fp_set_sr(FPSR_EXC_DZ);
dest->exp = 0x7fff;
dest->mant.m64 = 0;
return dest;
}
/* biased exponent of the quotient (bias 0x3fff) */
exp = dest->exp - src->exp + 0x3fff;
/* shift up the mantissa for denormalized numbers,
so that the highest bit is set, this makes lots
of things below easier */
if ((long)dest->mant.m32[0] >= 0)
exp -= fp_overnormalize(dest);
if ((long)src->mant.m32[0] >= 0)
exp -= fp_overnormalize(src);
/* now, do the 64-bit divide */
fp_dividemant(&temp, dest, src);
/* normalize it back to 64 bits and stuff it back into the
destination struct */
if (!temp.m32[0]) {
/* quotient magnitude < 1: shift one more bit, drop exponent */
exp--;
fp_putmant128(dest, &temp, 32);
} else
fp_putmant128(dest, &temp, 31);
if (exp >= 0x7fff) {
fp_set_ovrflw(dest);
return dest;
}
dest->exp = exp;
if (exp < 0) {
/* result is representable only as a denormal */
fp_set_sr(FPSR_EXC_UNFL);
fp_denormalize(dest, -exp);
}
return dest;
}
/* fp_fsglmul: kernel of the FSGLMUL instruction -- multiply with
   single-precision accuracy: both mantissas are truncated to their
   upper 24 bits before the multiply. */
struct fp_ext *
fp_fsglmul(struct fp_ext *dest, struct fp_ext *src)
{
int exp;
dprint(PINSTR, "fsglmul\n");
fp_dyadic_check(dest, src);
/* calculate the correct sign now, as it's necessary for infinities */
dest->sign = src->sign ^ dest->sign;
/* Handle infinities */
if (IS_INF(dest)) {
/* infinity * zero == NaN */
if (IS_ZERO(src))
fp_set_nan(dest);
return dest;
}
if (IS_INF(src)) {
if (IS_ZERO(dest))
fp_set_nan(dest);
else
fp_copy_ext(dest, src);
return dest;
}
/* Of course, as we all know, zero * anything = zero. You may
not have known that it might be a positive or negative
zero... */
if (IS_ZERO(dest) || IS_ZERO(src)) {
dest->exp = 0;
dest->mant.m64 = 0;
dest->lowmant = 0;
return dest;
}
exp = dest->exp + src->exp - 0x3ffe;
/* do a 32-bit multiply */
/* only the upper 24 bits of each mantissa take part */
fp_mul64(dest->mant.m32[0], dest->mant.m32[1],
dest->mant.m32[0] & 0xffffff00,
src->mant.m32[0] & 0xffffff00);
if (exp >= 0x7fff) {
fp_set_ovrflw(dest);
return dest;
}
dest->exp = exp;
if (exp < 0) {
fp_set_sr(FPSR_EXC_UNFL);
fp_denormalize(dest, -exp);
}
return dest;
}
/* fp_fsgldiv: kernel of the FSGLDIV instruction -- divide with
   single-precision accuracy.  As with fp_fdiv, the result is
   dest / src. */
struct fp_ext *
fp_fsgldiv(struct fp_ext *dest, struct fp_ext *src)
{
int exp;
unsigned long quot, rem;
dprint(PINSTR, "fsgldiv\n");
fp_dyadic_check(dest, src);
/* calculate the correct sign now, as it's necessary for infinities */
dest->sign = src->sign ^ dest->sign;
/* Handle infinities */
if (IS_INF(dest)) {
/* infinity / infinity = NaN (quiet, as always) */
if (IS_INF(src))
fp_set_nan(dest);
/* infinity / anything else = infinity (with appropriate sign) */
return dest;
}
if (IS_INF(src)) {
/* anything / infinity = zero (with appropriate sign) */
dest->exp = 0;
dest->mant.m64 = 0;
dest->lowmant = 0;
return dest;
}
/* zeroes */
if (IS_ZERO(dest)) {
/* zero / zero = NaN */
if (IS_ZERO(src))
fp_set_nan(dest);
/* zero / anything else = zero */
return dest;
}
if (IS_ZERO(src)) {
/* anything / zero = infinity (with appropriate sign) */
fp_set_sr(FPSR_EXC_DZ);
dest->exp = 0x7fff;
dest->mant.m64 = 0;
return dest;
}
exp = dest->exp - src->exp + 0x3fff;
/* truncate both mantissas to single-precision width */
dest->mant.m32[0] &= 0xffffff00;
src->mant.m32[0] &= 0xffffff00;
/* do the 32-bit divide */
if (dest->mant.m32[0] >= src->mant.m32[0]) {
/* quotient >= 1: pre-subtract the divisor so the divide
   cannot overflow, then restore the leading bit */
fp_sub64(dest->mant, src->mant);
fp_div64(quot, rem, dest->mant.m32[0], 0, src->mant.m32[0]);
dest->mant.m32[0] = 0x80000000 | (quot >> 1);
dest->mant.m32[1] = (quot & 1) | rem; /* only for rounding */
} else {
fp_div64(quot, rem, dest->mant.m32[0], 0, src->mant.m32[0]);
dest->mant.m32[0] = quot;
dest->mant.m32[1] = rem; /* only for rounding */
exp--;
}
if (exp >= 0x7fff) {
fp_set_ovrflw(dest);
return dest;
}
dest->exp = exp;
if (exp < 0) {
fp_set_sr(FPSR_EXC_UNFL);
fp_denormalize(dest, -exp);
}
return dest;
}
/* fp_roundint: Internal rounding function for use by several of these
   emulated instructions.
   This one rounds off the fractional part of *dest using the rounding
   mode specified (one of the FPCR_ROUND_* values); the result is an
   integral value in extended format.  Sets FPSR_EXC_INEX2 whenever any
   fraction bits are discarded. */
static void fp_roundint(struct fp_ext *dest, int mode)
{
	union fp_mant64 oldmant;
	unsigned long mask;

	if (!fp_normalize_ext(dest))
		return;

	/* infinities and zeroes are already integral */
	if (IS_INF(dest) || IS_ZERO(dest))
		return;

	/* first truncate the lower bits */
	oldmant = dest->mant;
	switch (dest->exp) {
	case 0 ... 0x3ffe:
		/* |x| < 1: the integer part is zero */
		dest->mant.m64 = 0;
		break;
	case 0x3fff ... 0x401e:
		/* fraction starts within the high mantissa word */
		dest->mant.m32[0] &= 0xffffffffU << (0x401e - dest->exp);
		dest->mant.m32[1] = 0;
		if (oldmant.m64 == dest->mant.m64)
			return;		/* already integral */
		break;
	case 0x401f ... 0x403e:
		/* fraction lies entirely in the low mantissa word */
		dest->mant.m32[1] &= 0xffffffffU << (0x403e - dest->exp);
		if (oldmant.m32[1] == dest->mant.m32[1])
			return;
		break;
	default:
		return;		/* |x| >= 2^64: always integral */
	}
	fp_set_sr(FPSR_EXC_INEX2);

	/* We might want to normalize upwards here... however, since
	   we know that this is only called on the output of fp_fdiv,
	   or with the input to fp_fint or fp_fintrz, and the inputs
	   to all these functions are either normal or denormalized
	   (no subnormals allowed!), there's really no need.
	   In the case of fp_fdiv, observe that 0x80000000 / 0xffff =
	   0xffff8000, and the same holds for 128-bit / 64-bit. (i.e. the
	   smallest possible normal dividend and the largest possible normal
	   divisor will still produce a normal quotient, therefore, (normal
	   << 64) / normal is normal in all cases) */

	/* decide whether to round the magnitude up by one ulp */
	switch (mode) {
	case FPCR_ROUND_RN:		/* round to nearest, ties to even */
		switch (dest->exp) {
		case 0 ... 0x3ffd:
			return;		/* |x| < 0.5: truncate to zero */
		case 0x3ffe:
			/* As noted above, the input is always normal, so the
			   guard bit (bit 63) is always set.  therefore, the
			   only case in which we will NOT round to 1.0 is when
			   the input is exactly 0.5. */
			if (oldmant.m64 == (1ULL << 63))
				return;
			break;
		case 0x3fff ... 0x401d:
			mask = 1 << (0x401d - dest->exp);	/* guard bit */
			if (!(oldmant.m32[0] & mask))
				return;		/* guard clear: round down */
			if (oldmant.m32[0] & (mask << 1))
				break;		/* odd lsb: round up */
			if (!(oldmant.m32[0] << (dest->exp - 0x3ffd)) &&
			    !oldmant.m32[1])
				return;		/* exact tie, even: down */
			break;
		case 0x401e:
			/* guard bit is bit 31 of the low mantissa word.
			   BUG FIX: the original test was
			   "!(oldmant.m32[1] >= 0)", which is always false
			   for an unsigned value, so the round-down path was
			   dead; test the bit explicitly instead. */
			if (!(oldmant.m32[1] & 0x80000000))
				return;
			if (oldmant.m32[0] & 1)
				break;
			if (!(oldmant.m32[1] << 1))
				return;
			break;
		case 0x401f ... 0x403d:
			mask = 1 << (0x403d - dest->exp);	/* guard bit */
			if (!(oldmant.m32[1] & mask))
				return;
			if (oldmant.m32[1] & (mask << 1))
				break;
			if (!(oldmant.m32[1] << (dest->exp - 0x401d)))
				return;
			break;
		default:
			return;
		}
		break;
	case FPCR_ROUND_RZ:
		return;			/* truncation already done above */
	default:
		/* RM rounds negative values away from zero, RP positive
		   ones; for the other sign truncation suffices */
		if (dest->sign ^ (mode - FPCR_ROUND_RM))
			break;
		return;
	}

	/* add one unit in the last (integer) place */
	switch (dest->exp) {
	case 0 ... 0x3ffe:
		dest->exp = 0x3fff;
		dest->mant.m64 = 1ULL << 63;	/* result is exactly 1.0 */
		break;
	case 0x3fff ... 0x401e:
		mask = 1 << (0x401e - dest->exp);
		/* the assignment result being zero signals carry-out */
		if (dest->mant.m32[0] += mask)
			break;
		dest->mant.m32[0] = 0x80000000;	/* renormalize after carry */
		dest->exp++;
		break;
	case 0x401f ... 0x403e:
		mask = 1 << (0x403e - dest->exp);
		if (dest->mant.m32[1] += mask)
			break;
		if (dest->mant.m32[0] += 1)	/* propagate carry upward */
			break;
		dest->mant.m32[0] = 0x80000000;
		dest->exp++;
		break;
	}
}
/* modrem_kernel: Implementation of the FREM and FMOD instructions
(which are exactly the same, except for the rounding used on the
intermediate value) */
static struct fp_ext *
modrem_kernel(struct fp_ext *dest, struct fp_ext *src, int mode)
{
struct fp_ext tmp;
fp_dyadic_check(dest, src);
/* Infinities and zeros */
if (IS_INF(dest) || IS_ZERO(src)) {
/* inf mod x and x mod 0 are invalid operations */
fp_set_nan(dest);
return dest;
}
if (IS_ZERO(dest) || IS_INF(src))
/* 0 mod x == 0, x mod inf == x */
return dest;
/* FIXME: there is almost certainly a smarter way to do this */
fp_copy_ext(&tmp, dest);
fp_fdiv(&tmp, src); /* NOTE: src might be modified */
/* round the quotient to an integer: RZ gives FMOD semantics,
   RN gives FREM semantics */
fp_roundint(&tmp, mode);
fp_fmul(&tmp, src);
fp_fsub(dest, &tmp);
/* set the quotient byte */
fp_set_quotient((dest->mant.m64 & 0x7f) | (dest->sign << 7));
return dest;
}
/* fp_fmod: Implements the kernel of the FMOD instruction.
   Again, the argument order is backwards.  The result, as defined in
   the Motorola manuals, is:
   fmod(src,dest) = (dest - (src * floor(dest / src))) */
struct fp_ext *
fp_fmod(struct fp_ext *dest, struct fp_ext *src)
{
	dprint(PINSTR, "fmod\n");

	/* a truncated intermediate quotient gives FMOD semantics */
	return modrem_kernel(dest, src, FPCR_ROUND_RZ);
}
/* fp_frem: Implements the kernel of the FREM instruction.
   frem(src,dest) = (dest - (src * round(dest / src))) */
struct fp_ext *
fp_frem(struct fp_ext *dest, struct fp_ext *src)
{
	dprint(PINSTR, "frem\n");

	/* a round-to-nearest intermediate quotient gives FREM semantics */
	return modrem_kernel(dest, src, FPCR_ROUND_RN);
}
/* fp_fint: round src to an integer using the current rounding mode
   (FPDATA->rnd) and store the result in dest. */
struct fp_ext *
fp_fint(struct fp_ext *dest, struct fp_ext *src)
{
	dprint(PINSTR, "fint\n");

	fp_copy_ext(dest, src);
	fp_roundint(dest, FPDATA->rnd);

	return dest;
}
/* fp_fintrz: round src toward zero to an integer, regardless of the
   current rounding mode, and store the result in dest. */
struct fp_ext *
fp_fintrz(struct fp_ext *dest, struct fp_ext *src)
{
	dprint(PINSTR, "fintrz\n");

	fp_copy_ext(dest, src);
	fp_roundint(dest, FPCR_ROUND_RZ);

	return dest;
}
/* fp_fscale: kernel of the FSCALE instruction -- scales dest by
   2^(integer part of src), i.e. adds the truncated src to the
   exponent of dest. */
struct fp_ext *
fp_fscale(struct fp_ext *dest, struct fp_ext *src)
{
int scale, oldround;
dprint(PINSTR, "fscale\n");
fp_dyadic_check(dest, src);
/* Infinities */
if (IS_INF(src)) {
/* scaling by infinity is invalid */
fp_set_nan(dest);
return dest;
}
if (IS_INF(dest))
return dest;
/* zeroes */
if (IS_ZERO(src) || IS_ZERO(dest))
return dest;
/* Source exponent out of range */
if (src->exp >= 0x400c) {
/* |src| >= 2^13: larger than any representable exponent shift */
fp_set_ovrflw(dest);
return dest;
}
/* src must be rounded with round to zero. */
oldround = FPDATA->rnd;
FPDATA->rnd = FPCR_ROUND_RZ;
scale = fp_conv_ext2long(src);
FPDATA->rnd = oldround;
/* new exponent */
scale += dest->exp;
if (scale >= 0x7fff) {
fp_set_ovrflw(dest);
} else if (scale <= 0) {
/* result becomes denormalized */
fp_set_sr(FPSR_EXC_UNFL);
fp_denormalize(dest, -scale);
} else
dest->exp = scale;
return dest;
}

View File

@@ -0,0 +1,52 @@
/*
fp_arith.h: floating-point math routines for the Linux-m68k
floating point emulator.
Copyright (c) 1998 David Huggins-Daines.
Somewhat based on the AlphaLinux floating point emulator, by David
Mosberger-Tang.
You may copy, modify, and redistribute this file under the terms of
the GNU General Public License, version 2, or any later version, at
your convenience.
*/
#ifndef FP_ARITH_H
#define FP_ARITH_H
/* All routines operate on extended-precision operands; each returns
   its dest argument, with the result stored through it. */
/* easy ones */
struct fp_ext *
fp_fabs(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_fneg(struct fp_ext *dest, struct fp_ext *src);
/* straightforward arithmetic */
struct fp_ext *
fp_fadd(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_fsub(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_fcmp(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_ftst(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_fmul(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_fdiv(struct fp_ext *dest, struct fp_ext *src);
/* ones that do rounding and integer conversions */
struct fp_ext *
fp_fmod(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_frem(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_fint(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_fintrz(struct fp_ext *dest, struct fp_ext *src);
struct fp_ext *
fp_fscale(struct fp_ext *dest, struct fp_ext *src);
#endif /* FP_ARITH_H */

View File

@@ -0,0 +1,334 @@
/*
* fp_cond.S
*
* Copyright Roman Zippel, 1997. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, and the entire permission notice in its entirety,
* including the disclaimer of warranties.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* ALTERNATIVELY, this product may be distributed under the terms of
* the GNU General Public License, in which case the provisions of the GPL are
* required INSTEAD OF the above restrictions. (This clause is
* necessary due to a potential bad interaction between the GPL and
* the restrictions contained in a BSD-style copyright.)
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "fp_emu.h"
#include "fp_decode.h"
.globl fp_fscc, fp_fbccw, fp_fbccl
#ifdef FPU_EMU_DEBUG
fp_fnop:
printf PDECODE,"fnop\n"
jra fp_end
#else
#define fp_fnop fp_end
#endif
| fp_fbccw: FBcc with 16bit displacement; %d2 holds the instruction
| words.  A zero condition word encodes FNOP.
fp_fbccw:
tst.w %d2
jeq fp_fnop
printf PDECODE,"fbccw "
fp_get_pc %a0
lea (-2,%a0,%d2.w),%a0
jra 1f
| fp_fbccl: FBcc with 32bit displacement (high word fetched from the
| instruction stream)
fp_fbccl:
printf PDECODE,"fbccl "
fp_get_pc %a0
move.l %d2,%d0
swap %d0
fp_get_instr_word %d0,fp_err_ua1
lea (-2,%a0,%d0.l),%a0
| common tail: evaluate the condition and take the branch if true
1: printf PDECODE,"%x",1,%a0
move.l %d2,%d0
swap %d0
jsr fp_compute_cond
tst.l %d0
jeq 1f
fp_put_pc %a0,1
1: printf PDECODE,"\n"
jra fp_end
| fp_fdbcc: FDBcc -- if the condition is false, decrement the data
| register and branch while the counter has not expired
fp_fdbcc:
printf PDECODE,"fdbcc "
fp_get_pc %a1 | calculate new pc
fp_get_instr_word %d0,fp_err_ua1
add.w %d0,%a1
fp_decode_addr_reg
printf PDECODE,"d%d,%x\n",2,%d0,%a1
swap %d1 | test condition in %d1
tst.w %d1
jne 2f | condition true: no decrement, no branch
move.l %d0,%d1
jsr fp_get_data_reg
subq.w #1,%d0
jcs 1f | counter expired: store it back, fall through
fp_put_pc %a1,1
1: jsr fp_put_data_reg
2: jra fp_end
| set flags for decode macros for fs<cc>
do_fscc=1
do_no_pc_mode=1
| fp_fscc: FScc -- store all-ones/all-zeroes to a byte operand
| depending on the FPU condition.  The addr-register-direct slot of
| the jump table below doubles as the FDBcc entry.
fp_fscc:
printf PDECODE,"fscc "
move.l %d2,%d0
jsr fp_compute_cond
move.w %d0,%d1
swap %d1 | keep the result in the upper half of %d1
| decode addressing mode
fp_decode_addr_mode
.long fp_data, fp_fdbcc
.long fp_indirect, fp_postinc
.long fp_predecr, fp_disp16
.long fp_extmode0, fp_extmode1
| addressing mode: data register direct
fp_data:
fp_mode_data_direct
move.w %d0,%d1 | save register nr
jsr fp_get_data_reg
swap %d1
move.b %d1,%d0
swap %d1
jsr fp_put_data_reg
printf PDECODE,"\n"
jra fp_end
| the remaining modes compute an effective address in %a0 and store
| the condition byte through fp_do_scc
fp_indirect:
fp_mode_addr_indirect
jra fp_do_scc
fp_postinc:
fp_mode_addr_indirect_postinc
jra fp_do_scc
fp_predecr:
fp_mode_addr_indirect_predec
jra fp_do_scc
fp_disp16:
fp_mode_addr_indirect_disp16
jra fp_do_scc
fp_extmode0:
fp_mode_addr_indirect_extmode0
jra fp_do_scc
fp_extmode1:
bfextu %d2{#13,#3},%d0
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
.long fp_absolute_short, fp_absolute_long
.long fp_ill, fp_ill | NOTE: jump here to ftrap.x
.long fp_ill, fp_ill
.long fp_ill, fp_ill
fp_absolute_short:
fp_mode_abs_short
jra fp_do_scc
fp_absolute_long:
fp_mode_abs_long
| jra fp_do_scc
| fp_do_scc: write the condition byte (upper half of %d1) to the
| user-space effective address in %a0
fp_do_scc:
swap %d1
putuser.b %d1,(%a0),fp_err_ua1,%a0
printf PDECODE,"\n"
jra fp_end
| FPSR condition-code test helpers; the FPSR is loaded into %d1
| (NAN = bit 24, Z = bit 26, N = bit 27)
#define tst_NAN btst #24,%d1
#define tst_Z btst #26,%d1
#define tst_N btst #27,%d1
| fp_compute_cond: evaluate the condition selected by the low bits of
| %d0 against the FPSR; returns -1 in %d0 if it holds, 0 otherwise.
| For the IEEE nonaware tests (bit 4 of the predicate set) a NAN
| operand additionally sets BSUN (bit 15) and the accrued IOP bit
| (bit 7) in the FPSR.
fp_compute_cond:
move.l (FPD_FPSR,FPDATA),%d1
btst #4,%d0
jeq 1f
tst_NAN
jeq 1f
bset #15,%d1
bset #7,%d1
move.l %d1,(FPD_FPSR,FPDATA)
1: and.w #0xf,%d0
jmp ([0f:w,%pc,%d0.w*4])
.align 4
0:
.long fp_f , fp_eq , fp_ogt, fp_oge
.long fp_olt, fp_ole, fp_ogl, fp_or
.long fp_un , fp_ueq, fp_ugt, fp_uge
.long fp_ult, fp_ule, fp_ne , fp_t
| the 16 basic predicates, each returning -1 (true) / 0 (false)
| fp_f: never true
fp_f:
moveq #0,%d0
rts
| fp_eq: equal (Z)
fp_eq:
moveq #0,%d0
tst_Z
jeq 1f
moveq #-1,%d0
1: rts
| fp_ogt: ordered greater than (!NAN && !Z && !N)
fp_ogt:
moveq #0,%d0
tst_NAN
jne 1f
tst_Z
jne 1f
tst_N
jne 1f
moveq #-1,%d0
1: rts
| fp_oge: ordered greater than or equal (Z || (!NAN && !N))
fp_oge:
moveq #-1,%d0
tst_Z
jne 2f
tst_NAN
jne 1f
tst_N
jeq 2f
1: moveq #0,%d0
2: rts
| fp_olt: ordered less than (!NAN && !Z && N)
fp_olt:
moveq #0,%d0
tst_NAN
jne 1f
tst_Z
jne 1f
tst_N
jeq 1f
moveq #-1,%d0
1: rts
| fp_ole: ordered less than or equal (Z || (!NAN && N))
fp_ole:
moveq #-1,%d0
tst_Z
jne 2f
tst_NAN
jne 1f
tst_N
jne 2f
1: moveq #0,%d0
2: rts
| fp_ogl: ordered greater or less than (!NAN && !Z)
fp_ogl:
moveq #0,%d0
tst_NAN
jne 1f
tst_Z
jne 1f
moveq #-1,%d0
1: rts
| fp_or: ordered (!NAN)
fp_or:
moveq #0,%d0
tst_NAN
jne 1f
moveq #-1,%d0
1: rts
| fp_un: unordered (NAN); the "false" exit borrows the next
| routine's "1: rts"
fp_un:
moveq #0,%d0
tst_NAN
jeq 1f
moveq #-1,%d0
rts
| fp_ueq: unordered or equal (NAN || Z)
fp_ueq:
moveq #-1,%d0
tst_NAN
jne 1f
tst_Z
jne 1f
moveq #0,%d0
1: rts
| fp_ugt: unordered or greater than (NAN || (!N && !Z))
fp_ugt:
moveq #-1,%d0
tst_NAN
jne 2f
tst_N
jne 1f
tst_Z
jeq 2f
1: moveq #0,%d0
2: rts
| fp_uge: unordered, greater or equal (NAN || Z || !N)
fp_uge:
moveq #-1,%d0
tst_NAN
jne 1f
tst_Z
jne 1f
tst_N
jeq 1f
moveq #0,%d0
1: rts
| fp_ult: unordered or less than (NAN || (N && !Z))
fp_ult:
moveq #-1,%d0
tst_NAN
jne 2f
tst_Z
jne 1f
tst_N
jne 2f
1: moveq #0,%d0
2: rts
| fp_ule: unordered, less or equal (NAN || Z || N)
fp_ule:
moveq #-1,%d0
tst_NAN
jne 1f
tst_Z
jne 1f
tst_N
jne 1f
moveq #0,%d0
1: rts
| fp_ne: not equal (!Z)
fp_ne:
moveq #0,%d0
tst_Z
jne 1f
moveq #-1,%d0
1: rts
| fp_t: always true
fp_t:
moveq #-1,%d0
rts

View File

@@ -0,0 +1,417 @@
/*
* fp_decode.h
*
* Copyright Roman Zippel, 1997. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, and the entire permission notice in its entirety,
* including the disclaimer of warranties.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* ALTERNATIVELY, this product may be distributed under the terms of
* the GNU General Public License, in which case the provisions of the GPL are
* required INSTEAD OF the above restrictions. (This clause is
* necessary due to a potential bad interaction between the GPL and
* the restrictions contained in a BSD-style copyright.)
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _FP_DECODE_H
#define _FP_DECODE_H
/* These macros do the dirty work of the instr decoding, several variables
* can be defined in the source file to modify the work of these macros,
* currently the following variables are used:
* ...
* The register usage:
* d0 - will contain source operand for data direct mode,
* otherwise scratch register
* d1 - upper 16bit are reserved for caller
* lower 16bit may contain further arguments,
* is destroyed during decoding
* d2 - contains first two instruction words,
* first word will be used for extension word
* a0 - will point to source/dest operand for any indirect mode
* otherwise scratch register
* a1 - scratch register
* a2 - base addr to the task structure
*
* the current implementation doesn't check for every disallowed
* addressing mode (e.g. pc relative modes as destination), as long
* as it only means a new addressing mode, which should not appear
* in a program and that doesn't crash the emulation, I think it's
* not a problem to allow these modes.
*/
| decode-control flags; they default to 0 and an including source
| file sets the ones it needs to 1 before using the macros below
do_fmovem=0
do_fmovem_cr=0
do_no_pc_mode=0
do_fscc=0
| first decoding of the instr type
| this separates the conditional instr
.macro fp_decode_cond_instr_type
bfextu %d2{#8,#2},%d0
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
| .long "f<op>","fscc/fdbcc"
| .long "fbccw","fbccl"
.endm
| second decoding of the instr type
| this separates most move instr
.macro fp_decode_move_instr_type
bfextu %d2{#16,#3},%d0
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
| .long "f<op> fpx,fpx","invalid instr"
| .long "f<op> <ea>,fpx","fmove fpx,<ea>"
| .long "fmovem <ea>,fpcr","fmovem <ea>,fpx"
| .long "fmovem fpcr,<ea>","fmovem fpx,<ea>"
.endm
| extract the source specifier, specifies
| either source fp register or data format
.macro fp_decode_sourcespec
bfextu %d2{#19,#3},%d0
.endm
| decode destination format for fmove reg,ea
.macro fp_decode_dest_format
bfextu %d2{#19,#3},%d0
.endm
| decode source register for fmove reg,ea
.macro fp_decode_src_reg
bfextu %d2{#22,#3},%d0
.endm
| extract the addressing mode
| it depends on the instr which of the modes is valid
.macro fp_decode_addr_mode
bfextu %d2{#10,#3},%d0
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
| .long "data register direct","addr register direct"
| .long "addr register indirect"
| .long "addr register indirect postincrement"
| .long "addr register indirect predecrement"
| .long "addr register + index16"
| .long "extension mode1","extension mode2"
.endm
| extract the register for the addressing mode
.macro fp_decode_addr_reg
bfextu %d2{#13,#3},%d0
.endm
| decode the 8bit displacement from the brief extension word
.macro fp_decode_disp8
move.b %d2,%d0
ext.w %d0
.endm
| decode the index of the brief/full extension word
| (index value is left sign-extended and scaled in %d0)
.macro fp_decode_index
bfextu %d2{#17,#3},%d0 | get the register nr
btst #15,%d2 | test for data/addr register
jne 1\@f
printf PDECODE,"d%d",1,%d0
jsr fp_get_data_reg
jra 2\@f
1\@: printf PDECODE,"a%d",1,%d0
jsr fp_get_addr_reg
move.l %a0,%d0
2\@:
debug lea "'l'.w,%a0"
btst #11,%d2 | 16/32 bit size?
jne 3\@f
debug lea "'w'.w,%a0"
ext.l %d0
3\@: printf PDECODE,":%c",1,%a0
move.w %d2,%d1 | scale factor
rol.w #7,%d1
and.w #3,%d1
debug move.l "%d1,-(%sp)"
debug ext.l "%d1"
printf PDECODE,":%d",1,%d1
debug move.l "(%sp)+,%d1"
lsl.l %d1,%d0
.endm
| decode the base displacement size
.macro fp_decode_basedisp
bfextu %d2{#26,#2},%d0
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
| .long "reserved","null displacement"
| .long "word displacement","long displacement"
.endm
| decode the outer displacement size (memory indirect modes)
.macro fp_decode_outerdisp
bfextu %d2{#30,#2},%d0
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
| .long "no memory indirect action/reserved","null outer displacement"
| .long "word outer displacement","long outer displacement"
.endm
| get the extension word and test for brief or full extension type
.macro fp_get_test_extword label
fp_get_instr_word %d2,fp_err_ua1
btst #8,%d2
jne \label
.endm
| test if %pc is the base register for the indirect addr mode
.macro fp_test_basereg_d16 label
btst #20,%d2
jeq \label
.endm
| test if %pc is the base register for one of the extended modes
.macro fp_test_basereg_ext label
btst #19,%d2
jeq \label
.endm
| test if the index register is suppressed (full extension word)
.macro fp_test_suppr_index label
btst #6,%d2
jne \label
.endm
| addressing mode: data register direct
.macro fp_mode_data_direct
fp_decode_addr_reg
printf PDECODE,"d%d",1,%d0
.endm
| addressing mode: address register indirect
.macro fp_mode_addr_indirect
fp_decode_addr_reg
printf PDECODE,"(a%d)",1,%d0
jsr fp_get_addr_reg
.endm
| adjust stack for byte moves from/to stack
| NOTE(review): appears to promote a byte-sized access (%d1 == 6)
| through %sp (register 7) to a word-sized adjustment (%d1 = 4) --
| confirm against the fp_datasize table
.macro fp_test_sp_byte_move
.if !do_fmovem
.if do_fscc
move.w #6,%d1
.endif
cmp.w #7,%d0
jne 1\@f
.if !do_fscc
cmp.w #6,%d1
jne 1\@f
.endif
move.w #4,%d1
1\@:
.endif
.endm
| addressing mode: address register indirect with postincrement
| (leaves the operand address in %a0 and writes back the
| incremented register)
.macro fp_mode_addr_indirect_postinc
fp_decode_addr_reg
printf PDECODE,"(a%d)+",1,%d0
fp_test_sp_byte_move
jsr fp_get_addr_reg
move.l %a0,%a1 | save addr
.if do_fmovem
lea (%a0,%d1.w*4),%a0
.if !do_fmovem_cr
lea (%a0,%d1.w*8),%a0
.endif
.else
add.w (fp_datasize,%d1.w*2),%a0
.endif
jsr fp_put_addr_reg
move.l %a1,%a0
.endm
| addressing mode: address register indirect with predecrement
| (writes back the decremented register and leaves the new address
| in %a0)
.macro fp_mode_addr_indirect_predec
fp_decode_addr_reg
printf PDECODE,"-(a%d)",1,%d0
fp_test_sp_byte_move
jsr fp_get_addr_reg
.if do_fmovem
.if !do_fmovem_cr
lea (-12,%a0),%a1 | setup to addr of 1st reg to move
neg.w %d1
lea (%a0,%d1.w*4),%a0
add.w %d1,%d1
lea (%a0,%d1.w*4),%a0
jsr fp_put_addr_reg
move.l %a1,%a0
.else
neg.w %d1
lea (%a0,%d1.w*4),%a0
jsr fp_put_addr_reg
.endif
.else
sub.w (fp_datasize,%d1.w*2),%a0
jsr fp_put_addr_reg
.endif
.endm
| addressing mode: address register/program counter indirect
| with 16bit displacement
.macro fp_mode_addr_indirect_disp16
.if !do_no_pc_mode
fp_test_basereg_d16 1f
printf PDECODE,"pc"
fp_get_pc %a0
jra 2f
.endif
1: fp_decode_addr_reg
printf PDECODE,"a%d",1,%d0
jsr fp_get_addr_reg
2: fp_get_instr_word %a1,fp_err_ua1
printf PDECODE,"@(%x)",1,%a1
add.l %a1,%a0
.endm
| perform preindex (if I/IS == 0xx and xx != 00)
| fetches the intermediate indirect address from user space
.macro fp_do_preindex
moveq #3,%d0
and.w %d2,%d0
jeq 1f
btst #2,%d2
jne 1f
printf PDECODE,")@("
getuser.l (%a1),%a1,fp_err_ua1,%a1
debug jra "2f"
1: printf PDECODE,","
2:
.endm
| perform postindex (if I/IS == 1xx)
| fetches the intermediate indirect address from user space
.macro fp_do_postindex
btst #2,%d2
jeq 1f
printf PDECODE,")@("
getuser.l (%a1),%a1,fp_err_ua1,%a1
debug jra "2f"
1: printf PDECODE,","
2:
.endm
| all other indirect addressing modes will finally end up here
| (brief extension: base + index + disp8; full extension: memory
| indirect with optional base/outer displacements and pre/postindex)
.macro fp_mode_addr_indirect_extmode0
.if !do_no_pc_mode
fp_test_basereg_ext 1f
printf PDECODE,"pc"
fp_get_pc %a0
jra 2f
.endif
1: fp_decode_addr_reg
printf PDECODE,"a%d",1,%d0
jsr fp_get_addr_reg
2: move.l %a0,%a1
swap %d2
fp_get_test_extword 3f
| addressing mode: address register/program counter indirect
| with index and 8bit displacement
fp_decode_disp8
debug ext.l "%d0"
printf PDECODE,"@(%x,",1,%d0
add.w %d0,%a1
fp_decode_index
add.l %d0,%a1
printf PDECODE,")"
jra 9f
3: | addressing mode: address register/program counter memory indirect
| with base and/or outer displacement
btst #7,%d2 | base register suppressed?
jeq 1f
printf PDECODE,"!"
sub.l %a1,%a1
1: printf PDECODE,"@("
fp_decode_basedisp
.long fp_ill,1f
.long 2f,3f
#ifdef FPU_EMU_DEBUG
1: printf PDECODE,"0" | null base displacement
jra 1f
#endif
2: fp_get_instr_word %a0,fp_err_ua1 | 16bit base displacement
printf PDECODE,"%x:w",1,%a0
jra 4f
3: fp_get_instr_long %a0,fp_err_ua1 | 32bit base displacement
printf PDECODE,"%x:l",1,%a0
4: add.l %a0,%a1
1:
fp_do_postindex
fp_test_suppr_index 1f
fp_decode_index
add.l %d0,%a1
1: fp_do_preindex
fp_decode_outerdisp
.long 5f,1f
.long 2f,3f
#ifdef FPU_EMU_DEBUG
1: printf PDECODE,"0" | null outer displacement
jra 1f
#endif
2: fp_get_instr_word %a0,fp_err_ua1 | 16bit outer displacement
printf PDECODE,"%x:w",1,%a0
jra 4f
3: fp_get_instr_long %a0,fp_err_ua1 | 32bit outer displacement
printf PDECODE,"%x:l",1,%a0
4: add.l %a0,%a1
1:
5: printf PDECODE,")"
9: move.l %a1,%a0
swap %d2
.endm
| get the absolute short address from user space
| (effective address is returned in %a0)
.macro fp_mode_abs_short
fp_get_instr_word %a0,fp_err_ua1
printf PDECODE,"%x.w",1,%a0
.endm
| get the absolute long address from user space
| (effective address is returned in %a0)
.macro fp_mode_abs_long
fp_get_instr_long %a0,fp_err_ua1
printf PDECODE,"%x.l",1,%a0
.endm
#endif /* _FP_DECODE_H */

146
arch/m68k/math-emu/fp_emu.h Normal file
View File

@@ -0,0 +1,146 @@
/*
* fp_emu.h
*
* Copyright Roman Zippel, 1997. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, and the entire permission notice in its entirety,
* including the disclaimer of warranties.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* ALTERNATIVELY, this product may be distributed under the terms of
* the GNU General Public License, in which case the provisions of the GPL are
* required INSTEAD OF the above restrictions. (This clause is
* necessary due to a potential bad interaction between the GPL and
* the restrictions contained in a BSD-style copyright.)
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _FP_EMU_H
#define _FP_EMU_H
#ifdef __ASSEMBLY__
#include <asm/offsets.h>
#endif
#include <asm/math-emu.h>
#ifndef __ASSEMBLY__
/* classification helpers: exp 0x7fff marks Inf (mant == 0) or NaN */
#define IS_INF(a) ((a)->exp == 0x7fff)
#define IS_ZERO(a) ((a)->mant.m64 == 0)
/* set a single exception/status bit in the emulated FPSR */
#define fp_set_sr(bit) ({ \
FPDATA->fpsr |= 1 << (bit); \
})
/* store the FMOD/FREM quotient byte into FPSR bits 16..23 */
#define fp_set_quotient(quotient) ({ \
FPDATA->fpsr &= 0xff00ffff; \
FPDATA->fpsr |= ((quotient) & 0xff) << 16; \
})
/* linkage for several useful functions */
/* Normalize the extended struct, return 0 for a NaN */
/* calls the asm routine fp_conv_ext2ext with the m68k register
   calling convention: arg in %a0, result in %d0 */
#define fp_normalize_ext(fpreg) ({ \
register struct fp_ext *reg asm ("a0") = fpreg; \
register int res asm ("d0"); \
\
asm volatile ("jsr fp_conv_ext2ext" \
: "=d" (res) : "a" (reg) \
: "a1", "d1", "d2", "memory"); \
res; \
})
#define fp_copy_ext(dest, src) ({ \
*dest = *src; \
})
/* NOTE: these two expand to a 'return' from the *enclosing*
   function when an operand normalizes to a NaN -- every fp_f*()
   routine relies on this early-out */
#define fp_monadic_check(dest, src) ({ \
fp_copy_ext(dest, src); \
if (!fp_normalize_ext(dest)) \
return dest; \
})
#define fp_dyadic_check(dest, src) ({ \
if (!fp_normalize_ext(dest)) \
return dest; \
if (!fp_normalize_ext(src)) { \
fp_copy_ext(dest, src); \
return dest; \
} \
})
extern const struct fp_ext fp_QNaN;
extern const struct fp_ext fp_Inf;
/* signal an invalid operation: raise OPERR and deliver a quiet NaN */
#define fp_set_nan(dest) ({ \
fp_set_sr(FPSR_EXC_OPERR); \
*dest = fp_QNaN; \
})
/* TODO check rounding mode? */
/* overflow result is forced to +/-Inf (sign of dest is preserved) */
#define fp_set_ovrflw(dest) ({ \
fp_set_sr(FPSR_EXC_OVFL); \
dest->exp = 0x7fff; \
dest->mant.m64 = 0; \
})
/* convert *src to a 32bit integer via the asm helper:
   ext in %a0 -> int result in %d0 */
#define fp_conv_ext2long(src) ({ \
register struct fp_ext *__src asm ("a0") = src; \
register int __res asm ("d0"); \
\
asm volatile ("jsr fp_conv_ext2long" \
: "=d" (__res) : "a" (__src) \
: "a1", "d1", "d2", "memory"); \
__res; \
})
/*
 * Load the 32bit signed integer 'src' into the extended-precision
 * value at 'dest'.  Calling convention of the asm helper: integer
 * in %d0, destination struct fp_ext in %a0 (the inverse of
 * fp_conv_ext2long above).
 *
 * Fix: this macro used to "jsr fp_conv_ext2long", i.e. the opposite
 * conversion.  The operand setup (int in d0, ext in a0) matches the
 * long2ext entry point, which fp_scan.S also calls directly, and the
 * only C caller (fp_fgetexp) clearly wants long->ext.
 */
#define fp_conv_long2ext(dest, src) ({ \
register struct fp_ext *__dest asm ("a0") = dest; \
register int __src asm ("d0") = src; \
\
asm volatile ("jsr fp_conv_long2ext" \
: : "d" (__src), "a" (__dest) \
: "a1", "d1", "d2", "memory"); \
})
#else /* __ASSEMBLY__ */
/*
* set, clear or test a bit in the fp status register
*/
| FPD_FPSR is stored big-endian: bit number N lives in byte
| 3-(N/8) at bit position N&7 within that byte
.macro fp_set_sr bit
bset #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
.endm
| clear a status bit
.macro fp_clr_sr bit
bclr #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
.endm
| test a status bit (sets the CPU Z flag for a following jeq/jne)
.macro fp_tst_sr bit
btst #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
.endm
#endif /* __ASSEMBLY__ */
#endif /* _FP_EMU_H */

View File

@@ -0,0 +1,325 @@
/*
* fp_emu.S
*
* Copyright Roman Zippel, 1997. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, and the entire permission notice in its entirety,
* including the disclaimer of warranties.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* ALTERNATIVELY, this product may be distributed under the terms of
* the GNU General Public License, in which case the provisions of the GPL are
* required INSTEAD OF the above restrictions. (This clause is
* necessary due to a potential bad interaction between the GPL and
* the restrictions contained in a BSD-style copyright.)
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/config.h>
#include <linux/linkage.h>
#include <asm/entry.h>
#include "fp_emu.h"
.globl fpu_emu
.globl fp_debugprint
.globl fp_err_ua1,fp_err_ua2
.text
| Exception entry point for unimplemented FPU instructions.
fpu_emu:
SAVE_ALL_INT
GET_CURRENT(%d0)
#if defined(CPU_M68020_OR_M68030) && defined(CPU_M68040_OR_M68060)
tst.l m68k_is040or060
jeq 1f
#endif
| on 040/060 the exception frame carries the faulting PC in a
| different slot; normalize it so the decoder can use FPS_PC
#if defined(CPU_M68040_OR_M68060)
move.l (FPS_PC2,%sp),(FPS_PC,%sp)
#endif
1:
| emulate the instruction
jsr fp_scan
#if defined(CONFIG_M68060)
#if !defined(CPU_M68060_ONLY)
btst #3,m68k_cputype+3
jeq 1f
#endif
btst #7,(FPS_SR,%sp)
jne fp_sendtrace060
#endif
1:
| emulation successful?
| NOTE(review): both branches currently end at ret_from_exception;
| the failure path (see comment below) never raises a signal
tst.l %d0
jeq ret_from_exception
| send some signal to program here
jra ret_from_exception
| we jump here after an access error while trying to access
| user space, we correct stackpointer and send a SIGSEGV to
| the user process
| (ua2 pops one extra saved word than ua1 -- two nested fault frames)
fp_err_ua2:
addq.l #4,%sp
fp_err_ua1:
addq.l #4,%sp
move.l %a0,-(%sp)
pea SEGV_MAPERR
pea SIGSEGV
jsr fpemu_signal
add.w #12,%sp
jra ret_from_exception
#if defined(CONFIG_M68060)
| send a trace signal if we are debugged
| it does not really belong here, but...
fp_sendtrace060:
move.l (FPS_PC,%sp),-(%sp)
pea TRAP_TRACE
pea SIGTRAP
jsr fpemu_signal
add.w #12,%sp
jra ret_from_exception
#endif
.globl fp_get_data_reg, fp_put_data_reg
.globl fp_get_addr_reg, fp_put_addr_reg
| Entry points to get/put a register. Some of them can be get/put
| directly, others are on the stack, as we read/write the stack
| directly here, these function may only be called from within
| instruction decoding, otherwise the stack pointer is incorrect
| and the stack gets corrupted.
| read data register #%d0 into %d0.
| d0-d2 are clobbered by the emulator, so they are read back from
| the saved pt_regs on the stack; d3-d7 still hold the user values.
fp_get_data_reg:
jmp ([0f:w,%pc,%d0.w*4])
.align 4
0:
.long fp_get_d0, fp_get_d1
.long fp_get_d2, fp_get_d3
.long fp_get_d4, fp_get_d5
.long fp_get_d6, fp_get_d7
fp_get_d0:
move.l (PT_D0+8,%sp),%d0
printf PREGISTER,"{d0->%08x}",1,%d0
rts
fp_get_d1:
move.l (PT_D1+8,%sp),%d0
printf PREGISTER,"{d1->%08x}",1,%d0
rts
fp_get_d2:
move.l (PT_D2+8,%sp),%d0
printf PREGISTER,"{d2->%08x}",1,%d0
rts
fp_get_d3:
move.l %d3,%d0
printf PREGISTER,"{d3->%08x}",1,%d0
rts
fp_get_d4:
move.l %d4,%d0
printf PREGISTER,"{d4->%08x}",1,%d0
rts
fp_get_d5:
move.l %d5,%d0
printf PREGISTER,"{d5->%08x}",1,%d0
rts
fp_get_d6:
move.l %d6,%d0
printf PREGISTER,"{d6->%08x}",1,%d0
rts
fp_get_d7:
move.l %d7,%d0
printf PREGISTER,"{d7->%08x}",1,%d0
rts
| write %d0 into data register #%d1 (note: index in %d1 here,
| unlike fp_get_data_reg which takes it in %d0).
| d0-d5 are written to the saved pt_regs; d6/d7 are live registers.
fp_put_data_reg:
jmp ([0f:w,%pc,%d1.w*4])
.align 4
0:
.long fp_put_d0, fp_put_d1
.long fp_put_d2, fp_put_d3
.long fp_put_d4, fp_put_d5
.long fp_put_d6, fp_put_d7
fp_put_d0:
printf PREGISTER,"{d0<-%08x}",1,%d0
move.l %d0,(PT_D0+8,%sp)
rts
fp_put_d1:
printf PREGISTER,"{d1<-%08x}",1,%d0
move.l %d0,(PT_D1+8,%sp)
rts
fp_put_d2:
printf PREGISTER,"{d2<-%08x}",1,%d0
move.l %d0,(PT_D2+8,%sp)
rts
fp_put_d3:
printf PREGISTER,"{d3<-%08x}",1,%d0
| move.l %d0,%d3
move.l %d0,(PT_D3+8,%sp)
rts
fp_put_d4:
printf PREGISTER,"{d4<-%08x}",1,%d0
| move.l %d0,%d4
move.l %d0,(PT_D4+8,%sp)
rts
fp_put_d5:
printf PREGISTER,"{d5<-%08x}",1,%d0
| move.l %d0,%d5
move.l %d0,(PT_D5+8,%sp)
rts
fp_put_d6:
printf PREGISTER,"{d6<-%08x}",1,%d0
move.l %d0,%d6
rts
fp_put_d7:
printf PREGISTER,"{d7<-%08x}",1,%d0
move.l %d0,%d7
rts
| read address register #%d0 into %a0.
| a0-a2 come from the saved pt_regs, a3-a6 are still live,
| a7 is the user stack pointer (%usp).
fp_get_addr_reg:
jmp ([0f:w,%pc,%d0.w*4])
.align 4
0:
.long fp_get_a0, fp_get_a1
.long fp_get_a2, fp_get_a3
.long fp_get_a4, fp_get_a5
.long fp_get_a6, fp_get_a7
fp_get_a0:
move.l (PT_A0+8,%sp),%a0
printf PREGISTER,"{a0->%08x}",1,%a0
rts
fp_get_a1:
move.l (PT_A1+8,%sp),%a0
printf PREGISTER,"{a1->%08x}",1,%a0
rts
fp_get_a2:
move.l (PT_A2+8,%sp),%a0
printf PREGISTER,"{a2->%08x}",1,%a0
rts
fp_get_a3:
move.l %a3,%a0
printf PREGISTER,"{a3->%08x}",1,%a0
rts
fp_get_a4:
move.l %a4,%a0
printf PREGISTER,"{a4->%08x}",1,%a0
rts
fp_get_a5:
move.l %a5,%a0
printf PREGISTER,"{a5->%08x}",1,%a0
rts
fp_get_a6:
move.l %a6,%a0
printf PREGISTER,"{a6->%08x}",1,%a0
rts
fp_get_a7:
move.l %usp,%a0
printf PREGISTER,"{a7->%08x}",1,%a0
rts
| write %a0 into address register #%d0 (mirror of fp_get_addr_reg:
| a0-a2 via pt_regs, a3-a6 live, a7 via %usp).
fp_put_addr_reg:
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
.long fp_put_a0, fp_put_a1
.long fp_put_a2, fp_put_a3
.long fp_put_a4, fp_put_a5
.long fp_put_a6, fp_put_a7
fp_put_a0:
printf PREGISTER,"{a0<-%08x}",1,%a0
move.l %a0,(PT_A0+8,%sp)
rts
fp_put_a1:
printf PREGISTER,"{a1<-%08x}",1,%a0
move.l %a0,(PT_A1+8,%sp)
rts
fp_put_a2:
printf PREGISTER,"{a2<-%08x}",1,%a0
move.l %a0,(PT_A2+8,%sp)
rts
fp_put_a3:
printf PREGISTER,"{a3<-%08x}",1,%a0
move.l %a0,%a3
rts
fp_put_a4:
printf PREGISTER,"{a4<-%08x}",1,%a0
move.l %a0,%a4
rts
fp_put_a5:
printf PREGISTER,"{a5<-%08x}",1,%a0
move.l %a0,%a5
rts
fp_put_a6:
printf PREGISTER,"{a6<-%08x}",1,%a0
move.l %a0,%a6
rts
fp_put_a7:
printf PREGISTER,"{a7<-%08x}",1,%a0
move.l %a0,%usp
rts
.data
.align 4
| bitmask of enabled debug-print classes for the printf macro;
| alternative masks kept below for quick switching
fp_debugprint:
| .long PMDECODE
.long PMINSTR+PMDECODE+PMCONV+PMNORM
| .long PMCONV+PMNORM+PMINSTR
| .long 0

223
arch/m68k/math-emu/fp_log.c Normal file
View File

@@ -0,0 +1,223 @@
/*
fp_log.c: floating-point math routines for the Linux-m68k
floating point emulator.
Copyright (c) 1998-1999 David Huggins-Daines / Roman Zippel.
I hereby give permission, free of charge, to copy, modify, and
redistribute this software, in source or binary form, provided that
the above copyright notice and the following disclaimer are included
in all such copies.
THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL
OR IMPLIED.
*/
#include "fp_emu.h"
/* constant 1.0: biased exponent 0x3fff, mantissa zero (the code in
   this file treats the integer part as an implicit one) */
static const struct fp_ext fp_one =
{
.exp = 0x3fff,
};
/* NOTE(review): these prototypes declare 'src' const, but the
   definitions in fp_arith.c take a plain struct fp_ext * -- the
   const here only lets us pass &fp_one; confirm the mismatch is
   intentional */
extern struct fp_ext *fp_fadd(struct fp_ext *dest, const struct fp_ext *src);
extern struct fp_ext *fp_fdiv(struct fp_ext *dest, const struct fp_ext *src);
extern struct fp_ext *fp_fmul(struct fp_ext *dest, const struct fp_ext *src);
/*
 * fp_fsqrt: emulate FSQRT, dest := sqrt(src).
 *
 * fp_monadic_check() copies src into dest and returns early (from
 * this function) for NaNs.  Remaining special cases handled here:
 * sqrt(+/-0) = +/-0, sqrt(negative) = QNaN with OPERR, sqrt(+Inf)
 * = +Inf.  The general case uses one Taylor step as a seed and
 * nine Newton iterations.
 */
struct fp_ext *
fp_fsqrt(struct fp_ext *dest, struct fp_ext *src)
{
struct fp_ext tmp, src2;
int i, exp;
dprint(PINSTR, "fsqrt\n");
fp_monadic_check(dest, src);
if (IS_ZERO(dest))
return dest;
if (dest->sign) {
fp_set_nan(dest);
return dest;
}
if (IS_INF(dest))
return dest;
/*
 * sqrt(m) * 2^(p) , if e = 2*p
 * sqrt(m*2^e) =
 * sqrt(2*m) * 2^(p) , if e = 2*p + 1
 *
 * So we use the last bit of the (unbiased) exponent to decide
 * whether to use m or 2*m.
 *
 * Since only the fractional part of the mantissa is stored and
 * the integer part is assumed to be one, we place a 1 or 2 into
 * the fixed point representation.
 */
exp = dest->exp;
dest->exp = 0x3FFF; /* value is now 1.mant */
if (!(exp & 1)) /* unbiased exponent is odd (the bias 0x3FFF is odd) */
dest->exp++; /* value is now 2.mant */
fp_copy_ext(&src2, dest);
/*
 * The Taylor expansion of sqrt(x) around a is:
 * sqrt(x) = sqrt(a) + 1/(2*sqrt(a))*(x-a) + R
 * With a=1 this gives:
 * sqrt(x) = 1 + 1/2*(x-1)
 * = 1/2*(1+x)
 */
fp_fadd(dest, &fp_one);
dest->exp--; /* * 1/2 */
/*
 * We now apply the Newton rule to the function
 * f(x) := x^2 - r
 * which has a null point on x = sqrt(r).
 *
 * It gives:
 * x' := x - f(x)/f'(x)
 * = x - (x^2 -r)/(2*x)
 * = x - (x - r/x)/2
 * = (2*x - x + r/x)/2
 * = (x + r/x)/2
 */
for (i = 0; i < 9; i++) {
fp_copy_ext(&tmp, &src2);
fp_fdiv(&tmp, dest);
fp_fadd(dest, &tmp);
dest->exp--;
}
/*
 * Rescale by 2^p with p = floor(e/2) (this matches both the even
 * and the odd decomposition above).  Fix: the original used
 * (exp - 0x3FFF) / 2, but C division truncates towards zero, so a
 * negative *odd* unbiased exponent yielded p+1 and the result came
 * out a factor of two too large (e.g. sqrt(0.5) -> ~1.414).  The
 * arithmetic right shift (gcc-guaranteed on signed ints) performs
 * the required floor division.
 */
dest->exp += (exp - 0x3FFF) >> 1;
return dest;
}
/*
 * The eight transcendental operations below (e^x-1, e^x, 2^x, 10^x,
 * ln, ln(1+x), log10, log2) are unimplemented stubs: each one runs
 * the operand through fp_monadic_check() (NaN early-out) and then
 * returns the input unchanged.  uprint() presumably flags the
 * unimplemented call -- TODO confirm and implement.
 */
struct fp_ext *
fp_fetoxm1(struct fp_ext *dest, struct fp_ext *src)
{
uprint("fetoxm1\n");
fp_monadic_check(dest, src);
if (IS_ZERO(dest))
return dest;
return dest;
}
struct fp_ext *
fp_fetox(struct fp_ext *dest, struct fp_ext *src)
{
uprint("fetox\n");
fp_monadic_check(dest, src);
return dest;
}
struct fp_ext *
fp_ftwotox(struct fp_ext *dest, struct fp_ext *src)
{
uprint("ftwotox\n");
fp_monadic_check(dest, src);
return dest;
}
struct fp_ext *
fp_ftentox(struct fp_ext *dest, struct fp_ext *src)
{
uprint("ftentox\n");
fp_monadic_check(dest, src);
return dest;
}
struct fp_ext *
fp_flogn(struct fp_ext *dest, struct fp_ext *src)
{
uprint("flogn\n");
fp_monadic_check(dest, src);
return dest;
}
struct fp_ext *
fp_flognp1(struct fp_ext *dest, struct fp_ext *src)
{
uprint("flognp1\n");
fp_monadic_check(dest, src);
return dest;
}
struct fp_ext *
fp_flog10(struct fp_ext *dest, struct fp_ext *src)
{
uprint("flog10\n");
fp_monadic_check(dest, src);
return dest;
}
struct fp_ext *
fp_flog2(struct fp_ext *dest, struct fp_ext *src)
{
uprint("flog2\n");
fp_monadic_check(dest, src);
return dest;
}
/*
 * fp_fgetexp: emulate FGETEXP -- replace the value with its
 * unbiased exponent, converted to extended float.
 * Inf -> QNaN with OPERR; +/-0 is returned unchanged.
 * Relies on the fp_conv_long2ext() helper macro in fp_emu.h.
 */
struct fp_ext *
fp_fgetexp(struct fp_ext *dest, struct fp_ext *src)
{
dprint(PINSTR, "fgetexp\n");
fp_monadic_check(dest, src);
if (IS_INF(dest)) {
fp_set_nan(dest);
return dest;
}
if (IS_ZERO(dest))
return dest;
/* 0x3FFF is the extended-format exponent bias */
fp_conv_long2ext(dest, (int)dest->exp - 0x3FFF);
fp_normalize_ext(dest);
return dest;
}
/*
 * fp_fgetman: emulate FGETMAN -- force the exponent to the bias so
 * the result is the signed mantissa in [1.0, 2.0).
 * NOTE(review): infinity is returned unchanged here; the 68881
 * reportedly delivers a NaN for FGETMAN of Inf -- verify against
 * the MC68881/68882 manual.
 */
struct fp_ext *
fp_fgetman(struct fp_ext *dest, struct fp_ext *src)
{
dprint(PINSTR, "fgetman\n");
fp_monadic_check(dest, src);
if (IS_ZERO(dest))
return dest;
if (IS_INF(dest))
return dest;
dest->exp = 0x3FFF;
return dest;
}

View File

@@ -0,0 +1,244 @@
/*
* fp_move.S
*
* Copyright Roman Zippel, 1997. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, and the entire permission notice in its entirety,
* including the disclaimer of warranties.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* ALTERNATIVELY, this product may be distributed under the terms of
* the GNU General Public License, in which case the provisions of the GPL are
* required INSTEAD OF the above restrictions. (This clause is
* necessary due to a potential bad interaction between the GPL and
* the restrictions contained in a BSD-style copyright.)
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "fp_emu.h"
#include "fp_decode.h"
do_no_pc_mode=1
.globl fp_fmove_fp2mem
| entry for fmove from an FP register to memory/integer register;
| decodes the destination format, then dispatches on the
| destination addressing mode through the table below
fp_fmove_fp2mem:
clr.b (2+FPD_FPSR,FPDATA)
fp_decode_dest_format
move.w %d0,%d1 | store data size twice in %d1
swap %d1 | one can be trashed below
move.w %d0,%d1
#ifdef FPU_EMU_DEBUG
lea 0f,%a0
clr.l %d0
move.b (%a0,%d1.w),%d0
printf PDECODE,"fmove.%c ",1,%d0
fp_decode_src_reg
printf PDECODE,"fp%d,",1,%d0
.data
0: .byte 'l','s','x','p','w','d','b','p'
.previous
#endif
| encode addressing mode for dest
fp_decode_addr_mode
.long fp_data, fp_ill
.long fp_indirect, fp_postinc
.long fp_predecr, fp_disp16
.long fp_extmode0, fp_extmode1
| addressing mode: data register direct
| destination is a data register: copy the 12-byte source FP
| register into FPD_TEMPFP1, then dispatch on the data size
| (only long/single/word/byte fit in a data register)
fp_data:
fp_mode_data_direct
move.w %d0,%d1
fp_decode_src_reg
fp_get_fp_reg
lea (FPD_TEMPFP1,FPDATA),%a1
move.l (%a0)+,(%a1)+
move.l (%a0)+,(%a1)+
move.l (%a0),(%a1)
lea (-8,%a1),%a0
swap %d1
move.l %d1,%d2
printf PDECODE,"\n"
jmp ([0f:w,%pc,%d1.w*4])
.align 4
0:
.long fp_data_long, fp_data_single
.long fp_ill, fp_ill
.long fp_data_word, fp_ill
.long fp_data_byte, fp_ill
| convert the temp FP value and merge it into the destination data
| register.  For byte/word the old register value is fetched first
| so only the low 8/16 bits are replaced; long/single overwrite all
| 32 bits.  Register number is in the upper word of %d2.
fp_data_byte:
jsr fp_normalize_ext
jsr fp_conv_ext2byte
move.l %d0,%d1
swap %d2
move.w %d2,%d0
jsr fp_get_data_reg
move.b %d1,%d0
move.w %d2,%d1
jsr fp_put_data_reg
jra fp_final
fp_data_word:
jsr fp_normalize_ext
jsr fp_conv_ext2short
move.l %d0,%d1
swap %d2
move.w %d2,%d0
jsr fp_get_data_reg
move.w %d1,%d0
move.l %d2,%d1
jsr fp_put_data_reg
jra fp_final
fp_data_long:
jsr fp_normalize_ext
jsr fp_conv_ext2long
swap %d2
move.w %d2,%d1
jsr fp_put_data_reg
jra fp_final
fp_data_single:
jsr fp_normalize_ext
jsr fp_conv_ext2single
swap %d2
move.w %d2,%d1
jsr fp_put_data_reg
jra fp_final
| addressing mode: address register indirect
| memory destinations: each handler computes the effective address
| into %a0 via the fp_mode_* macro, then joins fp_putdest
fp_indirect:
fp_mode_addr_indirect
jra fp_putdest
| addressing mode: address register indirect with postincrement
fp_postinc:
fp_mode_addr_indirect_postinc
jra fp_putdest
| addressing mode: address register indirect with predecrement
fp_predecr:
fp_mode_addr_indirect_predec
jra fp_putdest
| addressing mode: address register indirect with 16bit displacement
fp_disp16:
fp_mode_addr_indirect_disp16
jra fp_putdest
fp_extmode0:
fp_mode_addr_indirect_extmode0
jra fp_putdest
| mode 7: sub-dispatch on the register field (abs.w / abs.l only;
| pc-relative and immediate are illegal as a *destination*)
fp_extmode1:
fp_decode_addr_reg
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
.long fp_abs_short, fp_abs_long
.long fp_ill, fp_ill
.long fp_ill, fp_ill
.long fp_ill, fp_ill
fp_abs_short:
fp_mode_abs_short
jra fp_putdest
fp_abs_long:
fp_mode_abs_long
jra fp_putdest
| common store path: destination address is in %a0 (saved to %a1),
| the 12-byte source FP register is copied onto the stack, 
| normalized, then the store is dispatched on the data format
| (the 12 stack bytes are released in fp_finish_move/fp_format_packed)
fp_putdest:
move.l %a0,%a1
fp_decode_src_reg
move.l %d1,%d2 | save size
fp_get_fp_reg
printf PDECODE,"\n"
addq.l #8,%a0
move.l (%a0),-(%sp)
move.l -(%a0),-(%sp)
move.l -(%a0),-(%sp)
move.l %sp,%a0
jsr fp_normalize_ext
swap %d2
jmp ([0f:w,%pc,%d2.w*4])
.align 4
0:
.long fp_format_long, fp_format_single
.long fp_format_extended, fp_format_packed
.long fp_format_word, fp_format_double
.long fp_format_byte, fp_format_packed
| per-format stores to user space (address in %a1, value on stack
| at %a0); putuser faults unwind through fp_err_ua1
fp_format_long:
jsr fp_conv_ext2long
putuser.l %d0,(%a1),fp_err_ua1,%a1
jra fp_finish_move
fp_format_single:
jsr fp_conv_ext2single
putuser.l %d0,(%a1),fp_err_ua1,%a1
jra fp_finish_move
| extended: repack the internal (sign/exp + 64bit mantissa) layout
| into the 96bit memory format by shifting the first longword
fp_format_extended:
move.l (%a0)+,%d0
lsl.w #1,%d0
lsl.l #7,%d0
lsl.l #8,%d0
putuser.l %d0,(%a1)+,fp_err_ua1,%a1
move.l (%a0)+,%d0
putuser.l %d0,(%a1)+,fp_err_ua1,%a1
move.l (%a0),%d0
putuser.l %d0,(%a1),fp_err_ua1,%a1
jra fp_finish_move
fp_format_packed:
/* not supported yet */
lea (12,%sp),%sp
jra fp_ill
fp_format_word:
jsr fp_conv_ext2short
putuser.w %d0,(%a1),fp_err_ua1,%a1
jra fp_finish_move
fp_format_double:
jsr fp_conv_ext2double
jra fp_finish_move
fp_format_byte:
jsr fp_conv_ext2byte
putuser.b %d0,(%a1),fp_err_ua1,%a1
| jra fp_finish_move
| drop the 12 temp bytes pushed by fp_putdest, then finish up
fp_finish_move:
lea (12,%sp),%sp
jra fp_final

View File

@@ -0,0 +1,368 @@
/*
* fp_movem.S
*
* Copyright Roman Zippel, 1997. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, and the entire permission notice in its entirety,
* including the disclaimer of warranties.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* ALTERNATIVELY, this product may be distributed under the terms of
* the GNU General Public License, in which case the provisions of the GPL are
* required INSTEAD OF the above restrictions. (This clause is
* necessary due to a potential bad interaction between the GPL and
* the restrictions contained in a BSD-style copyright.)
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "fp_emu.h"
#include "fp_decode.h"
| set flags for decode macros for fmovem
do_fmovem=1
.globl fp_fmovem_fp, fp_fmovem_cr
| %d1 contains the mask and count of the register list
| for other register usage see fp_decode.h
fp_fmovem_fp:
printf PDECODE,"fmovem.x "
| get register list and count them
btst #11,%d2
jne 1f
bfextu %d2{#24,#8},%d0 | static register list
jra 2f
1: bfextu %d2{#25,#3},%d0 | dynamic register list
jsr fp_get_data_reg
2: move.l %d0,%d1
swap %d1
jra 2f
1: addq.w #1,%d1 | count the # of registers in
2: lsr.b #1,%d0 | register list and keep it in %d1
jcs 1b
jne 2b
printf PDECODE,"#%08x",1,%d1
#ifdef FPU_EMU_DEBUG
btst #12,%d2
jne 1f
printf PDECODE,"-" | decremental move
jra 2f
1: printf PDECODE,"+" | incremental move
2: btst #13,%d2
jeq 1f
printf PDECODE,"->" | fpu -> cpu
jra 2f
1: printf PDECODE,"<-" | fpu <- cpu
2:
#endif
| decode address mode
fp_decode_addr_mode
.long fp_ill, fp_ill
.long fpr_indirect, fpr_postinc
.long fpr_predecr, fpr_disp16
.long fpr_extmode0, fpr_extmode1
| addressing mode: address register indirect
fpr_indirect:
fp_mode_addr_indirect
jra fpr_do_movem
| addressing mode: address register indirect with postincrement
fpr_postinc:
fp_mode_addr_indirect_postinc
jra fpr_do_movem
fpr_predecr:
fp_mode_addr_indirect_predec
jra fpr_do_movem
| addressing mode: address register/program counter indirect
| with 16bit displacement
fpr_disp16:
fp_mode_addr_indirect_disp16
jra fpr_do_movem
fpr_extmode0:
fp_mode_addr_indirect_extmode0
jra fpr_do_movem
fpr_extmode1:
fp_decode_addr_reg
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
.long fpr_absolute_short, fpr_absolute_long
.long fpr_disp16, fpr_extmode0
.long fp_ill, fp_ill
.long fp_ill, fp_ill
fpr_absolute_short:
fp_mode_abs_short
jra fpr_do_movem
fpr_absolute_long:
fp_mode_abs_long
| jra fpr_do_movem
fpr_do_movem:
swap %d1 | get fpu register list
lea (FPD_FPREG,FPDATA),%a1
moveq #12,%d0
btst #12,%d2
jne 1f
lea (-12,%a1,%d0*8),%a1
neg.l %d0
1: btst #13,%d2
jne 4f
| move register from memory into fpu
jra 3f
1: printf PMOVEM,"(%p>%p)",2,%a0,%a1
getuser.l (%a0)+,%d2,fp_err_ua1,%a0
lsr.l #8,%d2
lsr.l #7,%d2
lsr.w #1,%d2
move.l %d2,(%a1)+
getuser.l (%a0)+,%d2,fp_err_ua1,%a0
move.l %d2,(%a1)+
getuser.l (%a0),%d2,fp_err_ua1,%a0
move.l %d2,(%a1)
subq.l #8,%a0
subq.l #8,%a1
add.l %d0,%a0
2: add.l %d0,%a1
3: lsl.b #1,%d1
jcs 1b
jne 2b
jra 5f
| move register from fpu into memory
1: printf PMOVEM,"(%p>%p)",2,%a1,%a0
move.l (%a1)+,%d2
lsl.w #1,%d2
lsl.l #7,%d2
lsl.l #8,%d2
putuser.l %d2,(%a0)+,fp_err_ua1,%a0
move.l (%a1)+,%d2
putuser.l %d2,(%a0)+,fp_err_ua1,%a0
move.l (%a1),%d2
putuser.l %d2,(%a0),fp_err_ua1,%a0
subq.l #8,%a1
subq.l #8,%a0
add.l %d0,%a0
2: add.l %d0,%a1
4: lsl.b #1,%d1
jcs 1b
jne 2b
5:
printf PDECODE,"\n"
#if 0
lea (FPD_FPREG,FPDATA),%a0
printf PMOVEM,"fp:"
printx PMOVEM,%a0@(0)
printx PMOVEM,%a0@(12)
printf PMOVEM,"\n "
printx PMOVEM,%a0@(24)
printx PMOVEM,%a0@(36)
printf PMOVEM,"\n "
printx PMOVEM,%a0@(48)
printx PMOVEM,%a0@(60)
printf PMOVEM,"\n "
printx PMOVEM,%a0@(72)
printx PMOVEM,%a0@(84)
printf PMOVEM,"\n"
#endif
jra fp_end
| set flags for decode macros for fmovem control register
do_fmovem=1
do_fmovem_cr=1
fp_fmovem_cr:
printf PDECODE,"fmovem.cr "
| get register list and count them
bfextu %d2{#19,#3},%d0
move.l %d0,%d1
swap %d1
jra 2f
1: addq.w #1,%d1
2: lsr.l #1,%d0
jcs 1b
jne 2b
printf PDECODE,"#%08x",1,%d1
#ifdef FPU_EMU_DEBUG
btst #13,%d2
jeq 1f
printf PDECODE,"->" | fpu -> cpu
jra 2f
1: printf PDECODE,"<-" | fpu <- cpu
2:
#endif
| decode address mode
fp_decode_addr_mode
.long fpc_data, fpc_addr
.long fpc_indirect, fpc_postinc
.long fpc_predecr, fpc_disp16
.long fpc_extmode0, fpc_extmode1
fpc_data:
fp_mode_data_direct
move.w %d0,%d1
bfffo %d2{#19,#3},%d0
sub.w #19,%d0
lea (FPD_FPCR,FPDATA,%d0.w*4),%a1
btst #13,%d2
jne 1f
move.w %d1,%d0
jsr fp_get_data_reg
move.l %d0,(%a1)
jra fpc_movem_fin
1: move.l (%a1),%d0
jsr fp_put_data_reg
jra fpc_movem_fin
fpc_addr:
fp_decode_addr_reg
printf PDECODE,"a%d",1,%d0
btst #13,%d2
jne 1f
jsr fp_get_addr_reg
move.l %a0,(FPD_FPIAR,FPDATA)
jra fpc_movem_fin
1: move.l (FPD_FPIAR,FPDATA),%a0
jsr fp_put_addr_reg
jra fpc_movem_fin
fpc_indirect:
fp_mode_addr_indirect
jra fpc_do_movem
fpc_postinc:
fp_mode_addr_indirect_postinc
jra fpc_do_movem
fpc_predecr:
fp_mode_addr_indirect_predec
jra fpc_do_movem
fpc_disp16:
fp_mode_addr_indirect_disp16
jra fpc_do_movem
fpc_extmode0:
fp_mode_addr_indirect_extmode0
jra fpc_do_movem
fpc_extmode1:
fp_decode_addr_reg
jmp ([0f:w,%pc,%d0*4])
.align 4
0:
.long fpc_absolute_short, fpc_absolute_long
.long fpc_disp16, fpc_extmode0
.long fpc_immediate, fp_ill
.long fp_ill, fp_ill
fpc_absolute_short:
fp_mode_abs_short
jra fpc_do_movem
fpc_absolute_long:
fp_mode_abs_long
jra fpc_do_movem
fpc_immediate:
fp_get_pc %a0
lea (%a0,%d1.w*4),%a1
fp_put_pc %a1
printf PDECODE,"#imm"
| jra fpc_do_movem
#if 0
swap %d1
lsl.l #5,%d1
lea (FPD_FPCR,FPDATA),%a0
jra 3f
1: move.l %d0,(%a0)
2: addq.l #4,%a0
3: lsl.b #1,%d1
jcs 1b
jne 2b
jra fpc_movem_fin
#endif
fpc_do_movem:
swap %d1 | get fpu register list
lsl.l #5,%d1
lea (FPD_FPCR,FPDATA),%a1
1: btst #13,%d2
jne 4f
| move register from memory into fpu
jra 3f
1: printf PMOVEM,"(%p>%p)",2,%a0,%a1
getuser.l (%a0)+,%d0,fp_err_ua1,%a0
move.l %d0,(%a1)
2: addq.l #4,%a1
3: lsl.b #1,%d1
jcs 1b
jne 2b
jra fpc_movem_fin
| move register from fpu into memory
1: printf PMOVEM,"(%p>%p)",2,%a1,%a0
move.l (%a1),%d0
putuser.l %d0,(%a0)+,fp_err_ua1,%a0
2: addq.l #4,%a1
4: lsl.b #1,%d1
jcs 1b
jne 2b
fpc_movem_fin:
and.l #0x0000fff0,(FPD_FPCR,FPDATA)
and.l #0x0ffffff8,(FPD_FPSR,FPDATA)
move.l (FPD_FPCR,FPDATA),%d0
lsr.l #4,%d0
moveq #3,%d1
and.l %d0,%d1
move.w %d1,(FPD_RND,FPDATA)
lsr.l #2,%d0
moveq #3,%d1
and.l %d0,%d1
move.w %d1,(FPD_PREC,FPDATA)
printf PDECODE,"\n"
#if 0
printf PMOVEM,"fpcr : %08x\n",1,FPDATA@(FPD_FPCR)
printf PMOVEM,"fpsr : %08x\n",1,FPDATA@(FPD_FPSR)
printf PMOVEM,"fpiar: %08x\n",1,FPDATA@(FPD_FPIAR)
clr.l %d0
move.w (FPD_PREC,FPDATA),%d0
printf PMOVEM,"prec : %04x\n",1,%d0
move.w (FPD_RND,FPDATA),%d0
printf PMOVEM,"rnd : %04x\n",1,%d0
#endif
jra fp_end

View File

@@ -0,0 +1,478 @@
/*
* fp_scan.S
*
* Copyright Roman Zippel, 1997. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, and the entire permission notice in its entirety,
* including the disclaimer of warranties.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* ALTERNATIVELY, this product may be distributed under the terms of
* the GNU General Public License, in which case the provisions of the GPL are
* required INSTEAD OF the above restrictions. (This clause is
* necessary due to a potential bad interaction between the GPL and
* the restrictions contained in a BSD-style copyright.)
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "fp_emu.h"
#include "fp_decode.h"
.globl fp_scan, fp_datasize
.data
| %d2 - first two instr words
| %d1 - operand size
/* operand formats are:
Long = 0, i.e. fmove.l
Single, i.e. fmove.s
Extended, i.e. fmove.x
Packed-BCD, i.e. fmove.p
Word, i.e. fmove.w
Double, i.e. fmove.d
*/
.text
| On entry:
| FPDATA - base of emulated FPU registers
fp_scan:
| normal fpu instruction? (this excludes fsave/frestore)
fp_get_pc %a0
printf PDECODE,"%08x: ",1,%a0
getuser.b (%a0),%d0,fp_err_ua1,%a0
#if 1
cmp.b #0xf2,%d0 | cpid = 1
#else
cmp.b #0xfc,%d0 | cpid = 6
#endif
jne fp_nonstd
| first two instruction words are kept in %d2
getuser.l (%a0)+,%d2,fp_err_ua1,%a0
fp_put_pc %a0
fp_decode_cond: | separate conditional instr
fp_decode_cond_instr_type
.long fp_decode_move, fp_fscc
.long fp_fbccw, fp_fbccl
fp_decode_move: | separate move instr
fp_decode_move_instr_type
.long fp_fgen_fp, fp_ill
.long fp_fgen_ea, fp_fmove_fp2mem
.long fp_fmovem_cr, fp_fmovem_cr
.long fp_fmovem_fp, fp_fmovem_fp
| now all arithmetic instr and a few move instr are left
fp_fgen_fp: | source is a fpu register
clr.b (FPD_FPSR+2,FPDATA) | clear the exception byte
fp_decode_sourcespec
printf PDECODE,"f<op>.x fp%d",1,%d0
fp_get_fp_reg
lea (FPD_TEMPFP1,FPDATA),%a1 | copy src into a temp location
move.l (%a0)+,(%a1)+
move.l (%a0)+,(%a1)+
move.l (%a0),(%a1)
lea (-8,%a1),%a0
jra fp_getdest
| fp_fgen_ea: source operand comes from an effective address.  fmovecr
| (source specifier 7) is split off here; otherwise the operand format
| code is kept twice in %d1 because one copy may be destroyed during
| address decoding.
fp_fgen_ea:				| source is <ea>
	clr.b	(FPD_FPSR+2,FPDATA)	| clear the exception byte
	| sort out fmovecr, keep data size in %d1
	fp_decode_sourcespec
	cmp.w	#7,%d0
	jeq	fp_fmovecr
	move.w	%d0,%d1			| store data size twice in %d1
	swap	%d1			| one can be trashed below
	move.w	%d0,%d1
#ifdef FPU_EMU_DEBUG
	lea	0f,%a0
	clr.l	%d0
	move.b	(%a0,%d1.w),%d0
	printf	PDECODE,"f<op>.%c ",1,%d0
.data
0:	.byte	'l','s','x','p','w','d','b',0
.previous
#endif
/*
	fp_getsource, fp_getdest

	basically, we end up with a pointer to the source operand in
	%a1, and a pointer to the destination operand in %a0. both
	are, of course, 96-bit extended floating point numbers.
*/

fp_getsource:
	| decode addressing mode for source
	fp_decode_addr_mode
	.long	fp_data, fp_ill
	.long	fp_indirect, fp_postinc
	.long	fp_predecr, fp_disp16
	.long	fp_extmode0, fp_extmode1

	| addressing mode: data register direct
fp_data:
	fp_mode_data_direct
	jsr	fp_get_data_reg
	lea	(FPD_TEMPFP1,FPDATA),%a0
	| dispatch on the operand format; only long/single/word/byte
	| fit in an integer data register, everything else is illegal
	jmp	([0f:w,%pc,%d1.w*4])

	.align	4
0:
	.long	fp_data_long, fp_data_single
	.long	fp_ill, fp_ill
	.long	fp_data_word, fp_ill
	.long	fp_data_byte, fp_ill

	| data types that fit in an integer data register
fp_data_byte:
	extb.l	%d0
	jra	fp_data_long

fp_data_word:
	ext.l	%d0

fp_data_long:
	jsr	fp_conv_long2ext
	jra	fp_getdest

fp_data_single:
	jsr	fp_conv_single2ext
	jra	fp_getdest

	| addressing mode: address register indirect
fp_indirect:
	fp_mode_addr_indirect
	jra	fp_fetchsource

	| addressing mode: address register indirect with postincrement
fp_postinc:
	fp_mode_addr_indirect_postinc
	jra	fp_fetchsource

	| addressing mode: address register indirect with predecrement
fp_predecr:
	fp_mode_addr_indirect_predec
	jra	fp_fetchsource

	| addressing mode: address register/program counter indirect
	| with 16bit displacement
fp_disp16:
	fp_mode_addr_indirect_disp16
	jra	fp_fetchsource

	| all other indirect addressing modes will finally end up here
fp_extmode0:
	fp_mode_addr_indirect_extmode0
	jra	fp_fetchsource

	| all pc relative addressing modes and immediate/absolute modes end up here
	| the first ones are sent to fp_extmode0 or fp_disp16
	| and only the latter are handled here
fp_extmode1:
	fp_decode_addr_reg
	jmp	([0f:w,%pc,%d0*4])

	.align	4
0:
	.long	fp_abs_short, fp_abs_long
	.long	fp_disp16, fp_extmode0
	.long	fp_immediate, fp_ill
	.long	fp_ill, fp_ill

	| addressing mode: absolute short
fp_abs_short:
	fp_mode_abs_short
	jra	fp_fetchsource

	| addressing mode: absolute long
fp_abs_long:
	fp_mode_abs_long
	jra	fp_fetchsource

	| addressing mode: immediate data
fp_immediate:
	printf	PDECODE,"#"
	fp_get_pc %a0
	| look up the operand byte size and round it up to a multiple
	| of two, since instructions are word aligned
	move.w	(fp_datasize,%d1.w*2),%d0
	addq.w	#1,%d0
	and.w	#-2,%d0
#ifdef FPU_EMU_DEBUG
	movem.l	%d0/%d1,-(%sp)
	movel	%a0,%a1
	clr.l	%d1
	jra	2f
1:	getuser.b (%a1)+,%d1,fp_err_ua1,%a1
	printf	PDECODE,"%02x",1,%d1
2:	dbra	%d0,1b
	movem.l	(%sp)+,%d0/%d1
#endif
	| skip the immediate data in the instruction stream
	lea	(%a0,%d0.w),%a1
	fp_put_pc %a1
|	jra	fp_fetchsource
| fp_fetchsource: %a0 holds the user-space address of the operand.
| Fetch it according to the operand format (restored from the upper
| word of %d1) and convert it to the internal extended format at
| FPD_TEMPFP1, leaving %a0 pointing there.
fp_fetchsource:
	move.l	%a0,%a1
	swap	%d1
	lea	(FPD_TEMPFP1,FPDATA),%a0
	jmp	([0f:w,%pc,%d1.w*4])

	.align	4
0:	.long	fp_long, fp_single
	.long	fp_ext, fp_pack
	.long	fp_word, fp_double
	.long	fp_byte, fp_ill

fp_long:
	getuser.l (%a1),%d0,fp_err_ua1,%a1
	jsr	fp_conv_long2ext
	jra	fp_getdest

fp_single:
	getuser.l (%a1),%d0,fp_err_ua1,%a1
	jsr	fp_conv_single2ext
	jra	fp_getdest

fp_ext:
	getuser.l (%a1)+,%d0,fp_err_ua1,%a1
	| NOTE(review): these shifts re-pack the sign/exponent word of
	| the 96-bit memory format into the internal struct layout
	lsr.l	#8,%d0
	lsr.l	#7,%d0
	lsr.w	#1,%d0
	move.l	%d0,(%a0)+
	getuser.l (%a1)+,%d0,fp_err_ua1,%a1
	move.l	%d0,(%a0)+
	getuser.l (%a1),%d0,fp_err_ua1,%a1
	move.l	%d0,(%a0)
	subq.l	#8,%a0
	jra	fp_getdest

fp_pack:
	/* not supported yet */
	jra	fp_ill

fp_word:
	getuser.w (%a1),%d0,fp_err_ua1,%a1
	ext.l	%d0
	jsr	fp_conv_long2ext
	jra	fp_getdest

fp_double:
	jsr	fp_conv_double2ext
	jra	fp_getdest

fp_byte:
	getuser.b (%a1),%d0,fp_err_ua1,%a1
	extb.l	%d0
	jsr	fp_conv_long2ext
|	jra	fp_getdest
| fp_getdest: extract the destination register number from bits 22-24
| of the instruction word, push fp_finalrounding as the common return
| path, and dispatch on the 7-bit opmode (bits 25-31) to the
| operation handler.  Handlers get src in %a1, dest in %a0.
fp_getdest:
	move.l	%a0,%a1
	bfextu	%d2{#22,#3},%d0
	printf	PDECODE,",fp%d\n",1,%d0
	fp_get_fp_reg
	movem.l	%a0/%a1,-(%sp)
	pea	fp_finalrounding
	bfextu	%d2{#25,#7},%d0
	jmp	([0f:w,%pc,%d0*4])

	| opmode dispatch table, one entry per 7-bit opmode value
	.align	4
0:
	.long	fp_fmove_mem2fp, fp_fint, fp_fsinh, fp_fintrz
	.long	fp_fsqrt, fp_ill, fp_flognp1, fp_ill
	.long	fp_fetoxm1, fp_ftanh, fp_fatan, fp_ill
	.long	fp_fasin, fp_fatanh, fp_fsin, fp_ftan
	.long	fp_fetox, fp_ftwotox, fp_ftentox, fp_ill
	.long	fp_flogn, fp_flog10, fp_flog2, fp_ill
	.long	fp_fabs, fp_fcosh, fp_fneg, fp_ill
	.long	fp_facos, fp_fcos, fp_fgetexp, fp_fgetman
	.long	fp_fdiv, fp_fmod, fp_fadd, fp_fmul
	.long	fpa_fsgldiv, fp_frem, fp_fscale, fpa_fsglmul
	.long	fp_fsub, fp_ill, fp_ill, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
	.long	fp_fsincos0, fp_fsincos1, fp_fsincos2, fp_fsincos3
	.long	fp_fsincos4, fp_fsincos5, fp_fsincos6, fp_fsincos7
	.long	fp_fcmp, fp_ill, fp_ftst, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
	.long	fp_fsmove, fp_fssqrt, fp_ill, fp_ill
	.long	fp_fdmove, fp_fdsqrt, fp_ill, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
	.long	fp_fsabs, fp_ill, fp_fsneg, fp_ill
	.long	fp_fdabs, fp_ill, fp_fdneg, fp_ill
	.long	fp_fsdiv, fp_ill, fp_fsadd, fp_fsmul
	.long	fp_fddiv, fp_ill, fp_fdadd, fp_fdmul
	.long	fp_fssub, fp_ill, fp_ill, fp_ill
	.long	fp_fdsub, fp_ill, fp_ill, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
	.long	fp_ill, fp_ill, fp_ill, fp_ill
| Instructions follow

| Move an (emulated) ROM constant.  The 5-bit ROM offset selects a
| 12-byte entry from fp_constants (%d0 = offset * 3 longwords); the
| 0x801cc0ff mask has one flag bit per offset - when the bit for the
| selected offset shifts out into the carry, the constant is inexact
| and INEX2 is raised.  The opmode bits are then cleared so execution
| continues as a plain fmove of the constant.
fp_fmovecr:
	bfextu	%d2{#27,#5},%d0
	printf	PINSTR,"fp_fmovecr #%d",1,%d0
	move.l	%d0,%d1
	add.l	%d0,%d0
	add.l	%d1,%d0			| %d0 = offset * 3 (longwords per entry)
	lea	(fp_constants,%d0*4),%a0
	move.l	#0x801cc0ff,%d0
	addq.l	#1,%d1
	lsl.l	%d1,%d0
	jcc	1f
	fp_set_sr FPSR_EXC_INEX2	| INEX2 exception
1:	moveq	#-128,%d0		| continue with fmove
	and.l	%d0,%d2
	jra	fp_getdest

	.data
	.align	4
fp_constants:
	.long	0x00004000,0xc90fdaa2,0x2168c235 | pi
	.extend	0,0,0,0,0,0,0,0,0,0
	.long	0x00003ffd,0x9a209a84,0xfbcff798 | log10(2)
	.long	0x00004000,0xadf85458,0xa2bb4a9a | e
	.long	0x00003fff,0xb8aa3b29,0x5c17f0bc | log2(e)
	.long	0x00003ffd,0xde5bd8a9,0x37287195 | log10(e)
	.long	0x00000000,0x00000000,0x00000000 | 0.0
	.long	0x00003ffe,0xb17217f7,0xd1cf79ac | ln(2)
	.long	0x00004000,0x935d8ddd,0xaaa8ac17 | ln(10)
	| read this as "1.0 * 2^0" - note the high bit in the mantissa
	.long	0x00003fff,0x80000000,0x00000000 | 10^0
	.long	0x00004002,0xa0000000,0x00000000 | 10^1
	.long	0x00004005,0xc8000000,0x00000000 | 10^2
	.long	0x0000400c,0x9c400000,0x00000000 | 10^4
	.long	0x00004019,0xbebc2000,0x00000000 | 10^8
	.long	0x00004034,0x8e1bc9bf,0x04000000 | 10^16
	.long	0x00004069,0x9dc5ada8,0x2b70b59e | 10^32
	.long	0x000040d3,0xc2781f49,0xffcfa6d5 | 10^64
	.long	0x000041a8,0x93ba47c9,0x80e98ce0 | 10^128
	.long	0x00004351,0xaa7eebfb,0x9df9de8e | 10^256
	.long	0x000046a3,0xe319a0ae,0xa60e91c7 | 10^512
	.long	0x00004d48,0xc9767586,0x81750c17 | 10^1024
	.long	0x00005a92,0x9e8b3b5d,0xc53d5de5 | 10^2048
	.long	0x00007525,0xc4605202,0x8a20979b | 10^4096
	.previous
| fp_fmove_mem2fp: plain fmove into an FPU register - copy the 12-byte
| extended value from (%a1) to (%a0) and leave %a0 pointing at the
| destination for the rounding code.
fp_fmove_mem2fp:
	printf	PINSTR,"fmove %p,%p\n",2,%a0,%a1
	move.l	(%a1)+,(%a0)+
	move.l	(%a1)+,(%a0)+
	move.l	(%a1),(%a0)
	subq.l	#8,%a0
	rts
| The fsglmul/fsgldiv variants replace the fp_finalrounding return
| address (pushed in fp_getdest) with the fast single-precision
| rounding routine, then reuse the generic implementations.
fpa_fsglmul:
	move.l	#fp_finalrounding_single_fast,(%sp)
	jra	fp_fsglmul

fpa_fsgldiv:
	move.l	#fp_finalrounding_single_fast,(%sp)
	jra	fp_fsgldiv

| Swap the rounding routine on the stack for the single/double
| precision one, then tail-call the generic operation \instr.
.macro	fp_dosingleprec instr
	printf	PINSTR,"single "
	move.l	#fp_finalrounding_single,(%sp)
	jra	\instr
.endm

.macro	fp_dodoubleprec instr
	printf	PINSTR,"double "
	move.l	#fp_finalrounding_double,(%sp)
	jra	\instr
.endm

| fs<op>/fd<op>: single/double rounded variants of the generic ops
fp_fsmove:
	fp_dosingleprec fp_fmove_mem2fp

fp_fssqrt:
	fp_dosingleprec fp_fsqrt

fp_fdmove:
	fp_dodoubleprec fp_fmove_mem2fp

fp_fdsqrt:
	fp_dodoubleprec fp_fsqrt

fp_fsabs:
	fp_dosingleprec fp_fabs

fp_fsneg:
	fp_dosingleprec fp_fneg

fp_fdabs:
	fp_dodoubleprec fp_fabs

fp_fdneg:
	fp_dodoubleprec fp_fneg

fp_fsdiv:
	fp_dosingleprec fp_fdiv

fp_fsadd:
	fp_dosingleprec fp_fadd

fp_fsmul:
	fp_dosingleprec fp_fmul

fp_fddiv:
	fp_dodoubleprec fp_fdiv

fp_fdadd:
	fp_dodoubleprec fp_fadd

fp_fdmul:
	fp_dodoubleprec fp_fmul

fp_fssub:
	fp_dosingleprec fp_fsub

fp_fdsub:
	fp_dodoubleprec fp_fsub
| fp_nonstd: not an F-line opcode handled by this emulator - log the
| instruction word and return -1 in %d0 to signal failure.
fp_nonstd:
	fp_get_pc %a0
	getuser.l (%a0),%d0,fp_err_ua1,%a0
	printf	,"nonstd ((%08x)=%08x)\n",2,%a0,%d0
	moveq	#-1,%d0
	rts

	.data
	.align	4

	| data sizes corresponding to the operand formats
	| (order: long, single, extended, packed, word, double, byte)
fp_datasize:
	.word	4, 4, 12, 12, 2, 8, 1, 0

View File

@@ -0,0 +1,183 @@
/*
fp_trig.c: floating-point math routines for the Linux-m68k
floating point emulator.
Copyright (c) 1998-1999 David Huggins-Daines / Roman Zippel.
I hereby give permission, free of charge, to copy, modify, and
redistribute this software, in source or binary form, provided that
the above copyright notice and the following disclaimer are included
in all such copies.
THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL
OR IMPLIED.
*/
#include "fp_emu.h"
#include "fp_trig.h"
/* FSIN - unimplemented stub: only logs the call.  fp_monadic_check()
 * presumably copies the checked source into dest (as in fp_fabs), so
 * the source is returned unchanged - TODO implement. */
struct fp_ext *
fp_fsin(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsin\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FCOS - unimplemented stub: logs, runs the monadic operand check,
 * and returns dest without computing a cosine. */
struct fp_ext *
fp_fcos(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fcos\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FTAN - unimplemented stub: logs, runs the monadic operand check,
 * and returns dest without computing a tangent. */
struct fp_ext *
fp_ftan(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("ftan\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FASIN - unimplemented stub: logs, runs the monadic operand check,
 * and returns dest unchanged. */
struct fp_ext *
fp_fasin(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fasin\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FACOS - unimplemented stub: logs, runs the monadic operand check,
 * and returns dest unchanged. */
struct fp_ext *
fp_facos(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("facos\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FATAN - unimplemented stub: logs, runs the monadic operand check,
 * and returns dest unchanged. */
struct fp_ext *
fp_fatan(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fatan\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FSINH - unimplemented stub: logs, runs the monadic operand check,
 * and returns dest unchanged. */
struct fp_ext *
fp_fsinh(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsinh\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FCOSH - unimplemented stub: logs, runs the monadic operand check,
 * and returns dest unchanged. */
struct fp_ext *
fp_fcosh(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fcosh\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FTANH - unimplemented stub: logs, runs the monadic operand check,
 * and returns dest unchanged. */
struct fp_ext *
fp_ftanh(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("ftanh\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FATANH - unimplemented stub: logs, runs the monadic operand check,
 * and returns dest unchanged. */
struct fp_ext *
fp_fatanh(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fatanh\n");

	fp_monadic_check(dest, src);

	return dest;
}
/* FSINCOS opmode variant 0 - unimplemented stub: only logs.  NOTE:
 * unlike the single-result stubs above, no fp_monadic_check() here. */
struct fp_ext *
fp_fsincos0(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsincos0\n");

	return dest;
}
/* FSINCOS opmode variant 1 - unimplemented stub: only logs. */
struct fp_ext *
fp_fsincos1(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsincos1\n");

	return dest;
}
/* FSINCOS opmode variant 2 - unimplemented stub: only logs. */
struct fp_ext *
fp_fsincos2(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsincos2\n");

	return dest;
}
/* FSINCOS opmode variant 3 - unimplemented stub: only logs. */
struct fp_ext *
fp_fsincos3(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsincos3\n");

	return dest;
}
/* FSINCOS opmode variant 4 - unimplemented stub: only logs. */
struct fp_ext *
fp_fsincos4(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsincos4\n");

	return dest;
}
/* FSINCOS opmode variant 5 - unimplemented stub: only logs. */
struct fp_ext *
fp_fsincos5(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsincos5\n");

	return dest;
}
/* FSINCOS opmode variant 6 - unimplemented stub: only logs. */
struct fp_ext *
fp_fsincos6(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsincos6\n");

	return dest;
}
/* FSINCOS opmode variant 7 - unimplemented stub: only logs. */
struct fp_ext *
fp_fsincos7(struct fp_ext *dest, struct fp_ext *src)
{
	uprint("fsincos7\n");

	return dest;
}

View File

@@ -0,0 +1,32 @@
/*
fp_trig.h: floating-point math routines for the Linux-m68k
floating point emulator.
Copyright (c) 1998 David Huggins-Daines.
I hereby give permission, free of charge, to copy, modify, and
redistribute this software, in source or binary form, provided that
the above copyright notice and the following disclaimer are included
in all such copies.
THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL
OR IMPLIED.
*/
#ifndef FP_TRIG_H
#define FP_TRIG_H
#include "fp_emu.h"
/* floating point trigonometric instructions:
the arguments to these are in the "internal" extended format, that
is, an "exploded" version of the 96-bit extended fp format used by
the 68881.
they return a status code, which should end up in %d0, if all goes
well. */
#endif /* FP_TRIG_H */

1455
arch/m68k/math-emu/fp_util.S Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,819 @@
/* multi_arith.h: multi-precision integer arithmetic functions, needed
to do extended-precision floating point.
(c) 1998 David Huggins-Daines.
Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
David Mosberger-Tang.
You may copy, modify, and redistribute this file under the terms of
the GNU General Public License, version 2, or any later version, at
your convenience. */
/* Note:
These are not general multi-precision math routines. Rather, they
implement the subset of integer arithmetic that we need in order to
multiply, divide, and normalize 128-bit unsigned mantissae. */
#ifndef MULTI_ARITH_H
#define MULTI_ARITH_H
#if 0 /* old code... */
/* Unsigned only, because we don't need signs to multiply and divide. */
typedef unsigned int int128[4];
/* Word order */
enum {
MSW128,
NMSW128,
NLSW128,
LSW128
};
/* big-endian */
#define LO_WORD(ll) (((unsigned int *) &ll)[1])
#define HI_WORD(ll) (((unsigned int *) &ll)[0])
/* Convenience functions to stuff various integer values into int128s */
static inline void zero128(int128 a)
{
a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
}
/* Human-readable word order in the arguments */
static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
unsigned int i0, int128 a)
{
a[LSW128] = i0;
a[NLSW128] = i1;
a[NMSW128] = i2;
a[MSW128] = i3;
}
/* Convenience functions (for testing as well) */
static inline void int64_to_128(unsigned long long src, int128 dest)
{
dest[LSW128] = (unsigned int) src;
dest[NLSW128] = src >> 32;
dest[NMSW128] = dest[MSW128] = 0;
}
static inline void int128_to_64(const int128 src, unsigned long long *dest)
{
*dest = src[LSW128] | (long long) src[NLSW128] << 32;
}
static inline void put_i128(const int128 a)
{
printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
a[NLSW128], a[LSW128]);
}
/* Internal shifters:
Note that these are only good for 0 < count < 32.
*/
static inline void _lsl128(unsigned int count, int128 a)
{
a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count));
a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count));
a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count));
a[LSW128] <<= count;
}
static inline void _lsr128(unsigned int count, int128 a)
{
a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count));
a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count));
a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count));
a[MSW128] >>= count;
}
/* Should be faster, one would hope */
static inline void lslone128(int128 a)
{
asm volatile ("lsl.l #1,%0\n"
"roxl.l #1,%1\n"
"roxl.l #1,%2\n"
"roxl.l #1,%3\n"
:
"=d" (a[LSW128]),
"=d"(a[NLSW128]),
"=d"(a[NMSW128]),
"=d"(a[MSW128])
:
"0"(a[LSW128]),
"1"(a[NLSW128]),
"2"(a[NMSW128]),
"3"(a[MSW128]));
}
static inline void lsrone128(int128 a)
{
asm volatile ("lsr.l #1,%0\n"
"roxr.l #1,%1\n"
"roxr.l #1,%2\n"
"roxr.l #1,%3\n"
:
"=d" (a[MSW128]),
"=d"(a[NMSW128]),
"=d"(a[NLSW128]),
"=d"(a[LSW128])
:
"0"(a[MSW128]),
"1"(a[NMSW128]),
"2"(a[NLSW128]),
"3"(a[LSW128]));
}
/* Generalized 128-bit shifters:
These bit-shift to a multiple of 32, then move whole longwords. */
static inline void lsl128(unsigned int count, int128 a)
{
int wordcount, i;
if (count % 32)
_lsl128(count % 32, a);
if (0 == (wordcount = count / 32))
return;
/* argh, gak, endian-sensitive */
for (i = 0; i < 4 - wordcount; i++) {
a[i] = a[i + wordcount];
}
for (i = 3; i >= 4 - wordcount; --i) {
a[i] = 0;
}
}
static inline void lsr128(unsigned int count, int128 a)
{
int wordcount, i;
if (count % 32)
_lsr128(count % 32, a);
if (0 == (wordcount = count / 32))
return;
for (i = 3; i >= wordcount; --i) {
a[i] = a[i - wordcount];
}
for (i = 0; i < wordcount; i++) {
a[i] = 0;
}
}
static inline int orl128(int a, int128 b)
{
b[LSW128] |= a;
}
static inline int btsthi128(const int128 a)
{
return a[MSW128] & 0x80000000;
}
/* test bits (numbered from 0 = LSB) up to and including "top" */
static inline int bftestlo128(int top, const int128 a)
{
int r = 0;
if (top > 31)
r |= a[LSW128];
if (top > 63)
r |= a[NLSW128];
if (top > 95)
r |= a[NMSW128];
r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1);
return (r != 0);
}
/* Aargh. We need these because GCC is broken */
/* FIXME: do them in assembly, for goodness' sake! */
static inline void mask64(int pos, unsigned long long *mask)
{
*mask = 0;
if (pos < 32) {
LO_WORD(*mask) = (1 << pos) - 1;
return;
}
LO_WORD(*mask) = -1;
HI_WORD(*mask) = (1 << (pos - 32)) - 1;
}
static inline void bset64(int pos, unsigned long long *dest)
{
/* This conditional will be optimized away. Thanks, GCC! */
if (pos < 32)
asm volatile ("bset %1,%0":"=m"
(LO_WORD(*dest)):"id"(pos));
else
asm volatile ("bset %1,%0":"=m"
(HI_WORD(*dest)):"id"(pos - 32));
}
static inline int btst64(int pos, unsigned long long dest)
{
if (pos < 32)
return (0 != (LO_WORD(dest) & (1 << pos)));
else
return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
}
static inline void lsl64(int count, unsigned long long *dest)
{
if (count < 32) {
HI_WORD(*dest) = (HI_WORD(*dest) << count)
| (LO_WORD(*dest) >> count);
LO_WORD(*dest) <<= count;
return;
}
count -= 32;
HI_WORD(*dest) = LO_WORD(*dest) << count;
LO_WORD(*dest) = 0;
}
static inline void lsr64(int count, unsigned long long *dest)
{
if (count < 32) {
LO_WORD(*dest) = (LO_WORD(*dest) >> count)
| (HI_WORD(*dest) << (32 - count));
HI_WORD(*dest) >>= count;
return;
}
count -= 32;
LO_WORD(*dest) = HI_WORD(*dest) >> count;
HI_WORD(*dest) = 0;
}
#endif
/*
 * fp_denormalize - shift the 64-bit mantissa of reg right by cnt bits,
 * raising the exponent by the same amount.  The top bits shifted out
 * of the low end become the 8 guard bits in lowmant; any bit lost
 * below those sets the sticky bit (bit 0 of lowmant) so that rounding
 * still sees the result as inexact.
 */
static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
{
	reg->exp += cnt;

	switch (cnt) {
	case 0 ... 8:
		/* all lost bits fit into the guard byte */
		reg->lowmant = reg->mant.m32[1] << (8 - cnt);
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 9 ... 32:
		reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
		/* bits shifted out below the guard byte -> sticky bit */
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
				   (reg->mant.m32[0] << (32 - cnt));
		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
		break;
	case 33 ... 39:
		/* guard byte now straddles/comes from the high word */
		asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
			: "m" (reg->mant.m32[0]), "d" (64 - cnt));
		if (reg->mant.m32[1] << (40 - cnt))
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	case 40 ... 71:
		reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
		if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1])
			reg->lowmant |= 1;
		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
		reg->mant.m32[0] = 0;
		break;
	default:
		/* mantissa shifted out completely - keep only a sticky bit */
		reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1];
		reg->mant.m32[0] = 0;
		reg->mant.m32[1] = 0;
		break;
	}
}
/*
 * fp_overnormalize - left-justify a nonzero mantissa: shift it left
 * until the most significant bit is set (bfffo locates the first set
 * bit) and return the shift count.  The caller is responsible for
 * adjusting the exponent accordingly.
 */
static inline int fp_overnormalize(struct fp_ext *reg)
{
	int shift;

	if (reg->mant.m32[0]) {
		asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[0]));
		reg->mant.m32[0] = (reg->mant.m32[0] << shift) | (reg->mant.m32[1] >> (32 - shift));
		reg->mant.m32[1] = (reg->mant.m32[1] << shift);
	} else {
		/* high word is zero: the low word supplies all the bits */
		asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[1]));
		reg->mant.m32[0] = (reg->mant.m32[1] << shift);
		reg->mant.m32[1] = 0;
		shift += 32;
	}

	return shift;
}
/*
 * fp_addmant - dest += src over the full 72-bit mantissa (the 8-bit
 * lowmant plus the 64-bit mantissa).  The add/addx sequence lets the
 * carry ripple from the guard byte to the top word; returns the final
 * carry out of the most significant word (0 or 1).
 */
static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src)
{
	int carry;

	/* we assume here, gcc only inserts a move and a clr instr
	   between the asm statements, so the X flag survives */
	asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
		: "g,d" (src->lowmant), "0,0" (dest->lowmant));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])
		: "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])
		: "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
	/* addx with both operands zero extracts the carry flag itself */
	asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));

	return carry;
}
/*
 * fp_addcarry - absorb a carry out of the top of the mantissa after an
 * addition: raise the exponent by one and shift the whole 72-bit
 * mantissa one bit right, reinserting the carry as the new msb.
 * If the exponent reaches the maximum (0x7fff) the value overflows:
 * the mantissa is cleared (giving infinity, cf. fp_Inf), OVFL - and
 * INEX2 when nonzero bits are discarded - is raised, and 0 is
 * returned.  Returns 1 in the normal case.
 */
static inline int fp_addcarry(struct fp_ext *reg)
{
	unsigned long hi, lo;

	if (++reg->exp == 0x7fff) {
		/* exponent range exhausted - overflow to infinity */
		if (reg->mant.m64)
			fp_set_sr(FPSR_EXC_INEX2);
		reg->mant.m64 = 0;
		fp_set_sr(FPSR_EXC_OVFL);
		return 0;
	}

	hi = reg->mant.m32[0];
	lo = reg->mant.m32[1];

	/* right-shift mantissa + guard byte; keep lowmant's sticky bit */
	reg->lowmant = (lo << 7) | (reg->lowmant != 0);
	reg->mant.m32[1] = (lo >> 1) | (hi << 31);
	reg->mant.m32[0] = (hi >> 1) | 0x80000000;

	return 1;
}
/*
 * fp_submant - dest = src1 - src2 over the full 72-bit mantissa,
 * using sub/subx so the borrow ripples from the guard byte (lowmant)
 * up through both 32-bit mantissa words.
 */
static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1,
			      struct fp_ext *src2)
{
	/* we assume here, gcc only inserts a move and a clr instr
	   between the asm statements, so the X flag survives */
	asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
		: "g,d" (src2->lowmant), "0,0" (src1->lowmant));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])
		: "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])
		: "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
}
/* 32x32 -> 64 bit multiply: desth:destl = src1 * src2 (mulu.l) */
#define fp_mul64(desth, destl, src1, src2) ({				\
	asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth)		\
		: "g" (src1), "0" (src2));				\
})
/* 64/32 bit divide: quot = srch:srcl / div, rem = remainder (divu.l) */
#define fp_div64(quot, rem, srch, srcl, div)				\
	asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem)		\
		: "dm" (div), "1" (srch), "0" (srcl))
/* 64 bit add: dest1:dest2 += src1:src2 (add + addx carry chain) */
#define fp_add64(dest1, dest2, src1, src2) ({				\
	asm ("add.l %1,%0" : "=d,dm" (dest2)				\
		: "dm,d" (src2), "0,0" (dest2));			\
	asm ("addx.l %1,%0" : "=d" (dest1)				\
		: "d" (src1), "0" (dest1));				\
})
/* add a 64 bit value into the upper three words of the 128-bit *dest.
 * NOTE: the 'src' parameter is unused - the macro hard-codes the
 * caller's local variable 'temp' (see fp_multiplymant). */
#define fp_addx96(dest, src) ({					\
	/* we assume here, gcc only inserts a move and a clr instr */	\
	asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2])		\
		: "g,d" (temp.m32[1]), "0,0" (dest->m32[2]));		\
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1])		\
		: "d" (temp.m32[0]), "0" (dest->m32[1]));		\
	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0])		\
		: "d" (0), "0" (dest->m32[0]));			\
})
/* 64 bit subtract: dest -= src (sub + subx borrow chain) */
#define fp_sub64(dest, src) ({					\
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1])			\
		: "dm,d" (src.m32[1]), "0,0" (dest.m32[1]));		\
	asm ("subx.l %1,%0" : "=d" (dest.m32[0])			\
		: "d" (src.m32[0]), "0" (dest.m32[0]));		\
})
/* 96 bit subtract: dest -= srch:srcm:srcl; the whole expression
 * evaluates to the final borrow (carry) flag via scs */
#define fp_sub96c(dest, srch, srcm, srcl) ({				\
	char carry;							\
	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2])			\
		: "dm,d" (srcl), "0,0" (dest.m32[2]));			\
	asm ("subx.l %1,%0" : "=d" (dest.m32[1])			\
		: "d" (srcm), "0" (dest.m32[1]));			\
	asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0])	\
		: "d" (srch), "1" (dest.m32[0]));			\
	carry;								\
})
/*
 * fp_multiplymant - dest (128 bit) = src1->mant * src2->mant, built
 * schoolbook-style from the four 32x32->64 partial products; the two
 * cross products are accumulated into the upper 96 bits via fp_addx96
 * (which implicitly uses the local 'temp').
 */
static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1,
				   struct fp_ext *src2)
{
	union fp_mant64 temp;

	/* high*high and low*low fill the outer words directly */
	fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);
	fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);

	/* the two cross products are added into the middle */
	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);
	fp_addx96(dest, temp);
	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);
	fp_addx96(dest, temp);
}
/*
 * fp_dividemant - dest (128-bit quotient) = src->mant / div->mant.
 * Both mantissae must be normalized (high bit set); src is consumed
 * as working storage for the running remainder.  Produces one leading
 * quotient bit plus three 32-bit quotient words.
 */
static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src,
				 struct fp_ext *div)
{
	union fp_mant128 tmp;
	union fp_mant64 tmp64;
	unsigned long *mantp = dest->m32;
	unsigned long fix, rem, first, dummy;
	int i;

	/* the algorithm below requires dest to be smaller than div,
	   but both have the high bit set */
	if (src->mant.m64 >= div->mant.m64) {
		fp_sub64(src->mant, div->mant);
		*mantp = 1;
	} else
		*mantp = 0;
	mantp++;

	/* basic idea behind this algorithm: we can't divide two 64bit numbers
	   (AB/CD) directly, but we can calculate AB/C0, but this means this
	   quotient is off by C0/CD, so we have to multiply the first result
	   to fix the result, after that we have nearly the correct result
	   and only a few corrections are needed. */

	/* C0/CD can be precalculated, but it's an 64bit division again, but
	   we can make it a bit easier, by dividing first through C so we get
	   10/1D and now only a single shift and the value fits into 32bit. */
	fix = 0x80000000;
	dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
	dummy = (dummy >> 1) | fix;
	fp_div64(fix, dummy, fix, 0, dummy);
	fix--;

	for (i = 0; i < 3; i++, mantp++) {
		if (src->mant.m32[0] == div->mant.m32[0]) {
			/* divu.l would overflow - quotient is known to be
			   (nearly) all ones, handle it via the fixup alone */
			fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
			*mantp += fix;
		} else {
			fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]);

			fp_mul64(*mantp, dummy, first, fix);
		}

		/* compute the new remainder: old remainder minus
		   quotient-digit * divisor, done in 96-bit pieces */
		fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp);
		fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
		tmp.m32[2] = 0;

		fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]);
		fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);

		src->mant.m32[0] = tmp.m32[1];
		src->mant.m32[1] = tmp.m32[2];

		/* correction loop: while the remainder still holds the
		   divisor, bump the quotient digit */
		while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) {
			src->mant.m32[0] = tmp.m32[1];
			src->mant.m32[1] = tmp.m32[2];
			*mantp += 1;
		}
	}
}
#if 0
static inline unsigned int fp_fls128(union fp_mant128 *src)
{
unsigned long data;
unsigned int res, off;
if ((data = src->m32[0]))
off = 0;
else if ((data = src->m32[1]))
off = 32;
else if ((data = src->m32[2]))
off = 64;
else if ((data = src->m32[3]))
off = 96;
else
return 128;
asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
return res + off;
}
static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
{
unsigned long sticky;
switch (shift) {
case 0:
return;
case 1:
asm volatile ("lsl.l #1,%0"
: "=d" (src->m32[3]) : "0" (src->m32[3]));
asm volatile ("roxl.l #1,%0"
: "=d" (src->m32[2]) : "0" (src->m32[2]));
asm volatile ("roxl.l #1,%0"
: "=d" (src->m32[1]) : "0" (src->m32[1]));
asm volatile ("roxl.l #1,%0"
: "=d" (src->m32[0]) : "0" (src->m32[0]));
return;
case 2 ... 31:
src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift));
src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
src->m32[3] = (src->m32[3] << shift);
return;
case 32 ... 63:
shift -= 32;
src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
src->m32[2] = (src->m32[3] << shift);
src->m32[3] = 0;
return;
case 64 ... 95:
shift -= 64;
src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
src->m32[1] = (src->m32[3] << shift);
src->m32[2] = src->m32[3] = 0;
return;
case 96 ... 127:
shift -= 96;
src->m32[0] = (src->m32[3] << shift);
src->m32[1] = src->m32[2] = src->m32[3] = 0;
return;
case -31 ... -1:
shift = -shift;
sticky = 0;
if (src->m32[3] << (32 - shift))
sticky = 1;
src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky;
src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift));
src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
src->m32[0] = (src->m32[0] >> shift);
return;
case -63 ... -32:
shift = -shift - 32;
sticky = 0;
if ((src->m32[2] << (32 - shift)) || src->m32[3])
sticky = 1;
src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky;
src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
src->m32[1] = (src->m32[0] >> shift);
src->m32[0] = 0;
return;
case -95 ... -64:
shift = -shift - 64;
sticky = 0;
if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3])
sticky = 1;
src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky;
src->m32[2] = (src->m32[0] >> shift);
src->m32[1] = src->m32[0] = 0;
return;
case -127 ... -96:
shift = -shift - 96;
sticky = 0;
if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3])
sticky = 1;
src->m32[3] = (src->m32[0] >> shift) | sticky;
src->m32[2] = src->m32[1] = src->m32[0] = 0;
return;
}
if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3]))
src->m32[3] = 1;
else
src->m32[3] = 0;
src->m32[2] = 0;
src->m32[1] = 0;
src->m32[0] = 0;
}
#endif
/*
 * fp_putmant128 - compress a 128-bit intermediate mantissa back into
 * the 64-bit mantissa + 8 guard bits (lowmant) of dest, after a left
 * shift of 'shift' bits.  Only the shift amounts the callers actually
 * produce (0, 1, 31, 32) are handled; any nonzero bits that fall
 * below lowmant set the sticky bit so rounding stays correct.
 */
static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
				 int shift)
{
	unsigned long tmp;

	switch (shift) {
	case 0:
		dest->mant.m64 = src->m64[0];
		dest->lowmant = src->m32[2] >> 24;
		if (src->m32[3] || (src->m32[2] << 8))
			dest->lowmant |= 1;
		break;
	case 1:
		/* one-bit left shift via the X flag (lsl + roxl chain) */
		asm volatile ("lsl.l #1,%0"
			: "=d" (tmp) : "0" (src->m32[2]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
		asm volatile ("roxl.l #1,%0"
			: "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] || (tmp << 8))
			dest->lowmant |= 1;
		break;
	case 31:
		/* 31-bit left shift == 1-bit right shift of the next
		   longword group (lsr + roxr chain) */
		asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
			: "=d" (dest->mant.m32[0])
			: "d" (src->m32[0]), "0" (src->m32[1]));
		asm volatile ("roxr.l #1,%0"
			: "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
		asm volatile ("roxr.l #1,%0"
			: "=d" (tmp) : "0" (src->m32[3]));
		dest->lowmant = tmp >> 24;
		if (src->m32[3] << 7)
			dest->lowmant |= 1;
		break;
	case 32:
		dest->mant.m32[0] = src->m32[1];
		dest->mant.m32[1] = src->m32[2];
		dest->lowmant = src->m32[3] >> 24;
		if (src->m32[3] << 8)
			dest->lowmant |= 1;
		break;
	}
}
#if 0 /* old code... */
static inline int fls(unsigned int a)
{
int r;
asm volatile ("bfffo %1{#0,#32},%0"
: "=d" (r) : "md" (a));
return r;
}
/* fls = "find last set" (cf. ffs(3)) */
static inline int fls128(const int128 a)
{
if (a[MSW128])
return fls(a[MSW128]);
if (a[NMSW128])
return fls(a[NMSW128]) + 32;
/* XXX: it probably never gets beyond this point in actual
use, but that's indicative of a more general problem in the
algorithm (i.e. as per the actual 68881 implementation, we
really only need at most 67 bits of precision [plus
overflow]) so I'm not going to fix it. */
if (a[NLSW128])
return fls(a[NLSW128]) + 64;
if (a[LSW128])
return fls(a[LSW128]) + 96;
else
return -1;
}
static inline int zerop128(const int128 a)
{
return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}
static inline int nonzerop128(const int128 a)
{
return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
}
/* Addition and subtraction */
/* Do these in "pure" assembly, because "extended" asm is unmanageable
here */
static inline void add128(const int128 a, int128 b)
{
/* rotating carry flags */
unsigned int carry[2];
carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
b[LSW128] += a[LSW128];
carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];
carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];
b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
}
/* Note: assembler semantics: "b -= a" */
static inline void sub128(const int128 a, int128 b)
{
/* rotating borrow flags */
unsigned int borrow[2];
borrow[0] = b[LSW128] < a[LSW128];
b[LSW128] -= a[LSW128];
borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];
borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];
b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
}
/* Poor man's 64-bit expanding multiply */
static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
{
unsigned long long acc;
int128 acc128;
zero128(acc128);
zero128(c);
/* first the low words */
if (LO_WORD(a) && LO_WORD(b)) {
acc = (long long) LO_WORD(a) * LO_WORD(b);
c[NLSW128] = HI_WORD(acc);
c[LSW128] = LO_WORD(acc);
}
/* Next the high words */
if (HI_WORD(a) && HI_WORD(b)) {
acc = (long long) HI_WORD(a) * HI_WORD(b);
c[MSW128] = HI_WORD(acc);
c[NMSW128] = LO_WORD(acc);
}
/* The middle words */
if (LO_WORD(a) && HI_WORD(b)) {
acc = (long long) LO_WORD(a) * HI_WORD(b);
acc128[NMSW128] = HI_WORD(acc);
acc128[NLSW128] = LO_WORD(acc);
add128(acc128, c);
}
/* The first and last words */
if (HI_WORD(a) && LO_WORD(b)) {
acc = (long long) HI_WORD(a) * LO_WORD(b);
acc128[NMSW128] = HI_WORD(acc);
acc128[NLSW128] = LO_WORD(acc);
add128(acc128, c);
}
}
/* Note: unsigned */
static inline int cmp128(int128 a, int128 b)
{
if (a[MSW128] < b[MSW128])
return -1;
if (a[MSW128] > b[MSW128])
return 1;
if (a[NMSW128] < b[NMSW128])
return -1;
if (a[NMSW128] > b[NMSW128])
return 1;
if (a[NLSW128] < b[NLSW128])
return -1;
if (a[NLSW128] > b[NLSW128])
return 1;
return (signed) a[LSW128] - b[LSW128];
}
inline void div128(int128 a, int128 b, int128 c)
{
int128 mask;
/* Algorithm:
Shift the divisor until it's at least as big as the
dividend, keeping track of the position to which we've
shifted it, i.e. the power of 2 which we've multiplied it
by.
Then, for this power of 2 (the mask), and every one smaller
than it, subtract the mask from the dividend and add it to
the quotient until the dividend is smaller than the raised
divisor. At this point, divide the dividend and the mask
by 2 (i.e. shift one place to the right). Lather, rinse,
and repeat, until there are no more powers of 2 left. */
/* FIXME: needless to say, there's room for improvement here too. */
/* Shift up */
/* XXX: since it just has to be "at least as big", we can
probably eliminate this horribly wasteful loop. I will
have to prove this first, though */
set128(0, 0, 0, 1, mask);
while (cmp128(b, a) < 0 && !btsthi128(b)) {
lslone128(b);
lslone128(mask);
}
/* Shift down */
zero128(c);
do {
if (cmp128(a, b) >= 0) {
sub128(b, a);
add128(mask, c);
}
lsrone128(mask);
lsrone128(b);
} while (nonzerop128(mask));
/* The remainder is in a... */
}
#endif
#endif /* MULTI_ARITH_H */