# include "DEFS.h"

/************************************************************************/
/*									*/
/*	Double-precision square root (Sky version)			*/
/*									*/
/*	This routine used to use the Sky board's square root as an	*/
/*	initial approximation, then do two NR iterations.		*/
/*	There was a problem with this approach when the argument was	*/
/*	outside the range (1e-38,1e38).  It could have been fixed with	*/
/*	a little complication, but I decided just to lift my non-Sky	*/
/*	routine (which uses code from U S Software), and replace the	*/
/*	calls to dpdiv and dpadd with Sky board calls.			*/
/*	The result runs about twice as fast as the former Sky version.	*/
/*	It might be possible to speed it up even further by		*/
/*	dumping the U S Software stuff and doing the mods to make the	*/
/*	Sky hardware value work as a first approximation (along with	*/
/*	intelligent assembly language coding, which the original	*/
/*	routine did not have), but I'm fairly content with this.	*/
/*									*/
/*	Wendy Thrash, ISI/NBI, March, 1985				*/
/*									*/
/************************************************************************/

	.comm	_errno,4		/* C library error number (4 bytes, common) */

	.data
	.align	1

/*	Double-precision value returned by sqrt for a negative argument: */
/*	sign bit clear, every exponent and fraction bit set.		  */
NaN:
	.long	0x7fffffff		/* High longword */
	.long	0xffffffff		/* Low longword */

	.text

/*
 *	double sqrt(x)
 *
 *	In:	x on the stack: high longword at fp@(8), low longword at fp@(12).
 *	Out:	result in d0 (high longword) : d1 (low longword).
 *	Saves/restores d2-d7 and a2-a5 (moveml mask 0x3cfc, 10 registers,
 *	which is exactly the 40 bytes reserved by the link).
 *
 *	Special cases:
 *	    +0.0, -0.0		returned unchanged
 *	    +Inf, positive NaN	returned unchanged
 *	    any other value	errno = 33, NaN constant returned
 *	      with sign bit set
 *
 *	Method: scale the argument to 1.0 <= arg' < 4.0 by removing an even
 *	power of two, build a fixed-point first guess with a linear fit,
 *	refine it with three integer Newton-Raphson steps (to about 29 bits),
 *	convert that to double, then do one last Newton-Raphson step
 *	(X3 + arg'/X3)/2 with the Sky board's divide and add.  Finally put
 *	back half of the removed power of two.
 *
 *	Register use in the main path:
 *	    a5	Sky board port at _skyaddr+4 (operands/results);
 *		a5@(-4) takes the command word, a5@(-2) is polled for completion
 *	    d6	high longword of arg'
 *	    d7	exponent adjustment for the result (in exponent-field units)
 */
ENTRY(sqrt)
	link	fp,#-40
	moveml	#0x3cfc,sp@	/* Save d2-d7/a2-a5 */
	movl	_skyaddr,a5
	addql	#4,a5		/* a5 -> Sky data port */
	movl	fp@(8),d0	/* High longword of x; sets N/Z for the tests */
	jgt	1f		/* Positive with nonzero high longword */
	jeq	2f		/* High longword zero: +0.0 or denormal */

/*	Sign bit is set.  -0.0 is not a domain error: return it as is. */
	cmpl	#0x80000000,d0
	jne	4f
	movl	fp@(12),d1
	jeq	RETURN		/* x == -0.0, d0:d1 already holds it */
4:
	movl	#33,_errno	/* Negative argument: domain error */
	movl	NaN,d0
	movl	NaN+4,d1
	jra	RETURN
2:
	movl	fp@(12),d1
	jeq	RETURN		/* x == +0.0, return 0.0 */

/*	Fall through to here on denormalized number */
/*	(I don't promise that they are handled correctly) */

1:
/*	Exponent field all ones means +Inf or NaN; the scaling below */
/*	would turn those into a finite number, so return x unchanged. */
	movl	d0,d1
	andl	#0x7ff00000,d1
	cmpl	#0x7ff00000,d1
	jne	5f
	movl	fp@(12),d1
	jra	RETURN
5:
	swap	d0		/* Sign/exponent word into the low half */
	movl	d0,d1
	subw	#16*1023,d1	/* Extract argument's two's exp */
	andb	#0xE0,d1	/* Make it a factor of two */
	subw	d1,d0		/* Scale arg. range to 4.0 > arg' >= 1.0 */
	swap	d0
	asrw	#1,d1		/* Square root of scaled two power */
	movw	d1,d7		/* Save two's exp of result in d7 */

	movl	d0,d6		/* Save arg' high word */
	movl	d0,d1		/* Create fixed point integer for approx */
	moveq	#11,d2		/* Shift count */
	lsll	d2,d1		/* arg' * 2^30 in d1 */
/*	NOTE: don't need clrl d2 here because of the moveq above */
	movw	fp@(12),d2	/* Top 16 bits of the low longword ... */
	lsrw	#5,d2		/* ... of which 11 fill the bits vacated above */
	orw	d2,d1
	bset	#31,d1		/* Set implicit bit */
				/* (Z reflects the old bit 31, which was the */
				/* low bit of the scaled exponent) */
	beq	3f		/* Jump if arg >= 2.0 */

	lsrl	#1,d1		/* Adjust D1 */
3:
	movw	#42720-65536,d2	/* D2 = 0.325926 * 2^17 */
	swap	d1		/* D1.W = arg' * 2^14 */
	mulu	d1,d2		/* D2 = arg' * 0.325926 * 2^31 */
	swap	d2
	addw	#23616,d2	/* + 0.7207 * 2^15 - to 4+ bits */
	subxw	d3,d3		/* d3.w = all ones if the add carried, else 0 */
	orw	d3,d2		/* Top out approximation at 1.99997 */

	swap	d1
	lsrl	#1,d1		/* Arg' * 2^29 in d1 (prevent overflow) */

	movl	d1,d3		/* Copy into D3 */
	divu	d2,d3		/* Arg'/X0 * 2^14 in d3 */
	lsrw	#1,d2
	addw	d3,d2		/* X1 in D2 - to 8 bits */

	movl	d1,d3		/* Second in-register iteration */
	divu	d2,d3
	lsrw	#1,d2
	addw	d3,d2		/* X2 in D2 - to 16 bits */

	movl	d1,d3		/* Third iteration: 32/16 divide done in */
	divu	d2,d3		/* two steps, high quotient word first ... */
	movw	d3,d4
	clrw	d3		/* ... then remainder:0 for the low word */
	swap	d4
	divu	d2,d3
	movw	d3,d4		/* 32 bit division result */
	swap	d2
	clrw	d2
	lsrl	#1,d2
	addl	d4,d2		/* X3 in D2 - to 29 bits */
	subxl	d4,d4		/* d4 = all ones if the add carried, else 0 */
	orl	d4,d2		/* Top out at 1.9999999995 */

	lsll	#1,d2		/* Create DP of X3 (good to 29 bits) ... */
	clrl	d3		/* ... in D2:D3 */
	movw	d2,d3
	andw	#0xFFF,d3	/* Low 12 bits belong in the low longword */
	eorw	d3,d2		/* Remove them from d2 */
	orw	#1023,d2	/* Scale to floating point */
	moveq	#12,d0		/* Shift count in D0 */
	rorl	d0,d2		/* Position bits */
	rorl	d0,d3

	movw	#0x1014,a5@(-4)	/* Do divide */
	movl	d6,a5@		/* Dividend: arg' (scaled high, original low) */
	movl	fp@(12),a5@
	movl	d2,a5@		/* Divisor: X3 */
	movl	d3,a5@
L10007:
	tstw	a5@(-2)		/* Spin until the polled word goes negative */
	bges	L10007
	movl	a5@,d0		/* Quotient arg'/X3 in d0:d1 */
	movl	a5@,d1
	movw	#0x1002,a5@(-4)	/* Now do add */
	movl	d2,a5@
	movl	d3,a5@
	movl	d0,a5@
	movl	d1,a5@
/*	NOTE(review): the sum is read back without polling a5@(-2) as the */
/*	divide does; presumably the add needs no wait - confirm against */
/*	the Sky board documentation. */
	movl	a5@,d0		/* X3 + arg'/X3 in d0:d1 */
	movl	a5@,d1
	swap	d0
	subw	#0x0010,d0	/* "Divide" by 2 */

	addw	d7,d0		/* Scale result */
	swap	d0
RETURN:
	moveml	sp@,#0x3cfc	/* Restore d2-d7/a2-a5 */
	unlk	fp
	rts
