1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file implements multi-precision floating-point numbers.
6// Like in the GNU MPFR library (https://www.mpfr.org/), operands
7// can be of mixed precision. Unlike MPFR, the rounding mode is
8// not specified with each operation, but with each operand. The
9// rounding mode of the result operand determines the rounding
10// mode of an operation. This is a from-scratch implementation.
11
12package big
13
14import (
15	"fmt"
16	"math"
17	"math/bits"
18)
19
20const debugFloat = false // enable for debugging
21
22// A nonzero finite Float represents a multi-precision floating point number
23//
24//	sign × mantissa × 2**exponent
25//
26// with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
27// A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
28// All Floats are ordered, and the ordering of two Floats x and y
29// is defined by x.Cmp(y).
30//
31// Each Float value also has a precision, rounding mode, and accuracy.
32// The precision is the maximum number of mantissa bits available to
33// represent the value. The rounding mode specifies how a result should
34// be rounded to fit into the mantissa bits, and accuracy describes the
35// rounding error with respect to the exact result.
36//
37// Unless specified otherwise, all operations (including setters) that
38// specify a *Float variable for the result (usually via the receiver
39// with the exception of [Float.MantExp]), round the numeric result according
40// to the precision and rounding mode of the result variable.
41//
42// If the provided result precision is 0 (see below), it is set to the
43// precision of the argument with the largest precision value before any
44// rounding takes place, and the rounding mode remains unchanged. Thus,
45// uninitialized Floats provided as result arguments will have their
46// precision set to a reasonable value determined by the operands, and
47// their mode is the zero value for RoundingMode (ToNearestEven).
48//
49// By setting the desired precision to 24 or 53 and using matching rounding
50// mode (typically [ToNearestEven]), Float operations produce the same results
51// as the corresponding float32 or float64 IEEE 754 arithmetic for operands
52// that correspond to normal (i.e., not denormal) float32 or float64 numbers.
53// Exponent underflow and overflow lead to a 0 or an Infinity for different
54// values than IEEE 754 because Float exponents have a much larger range.
55//
56// The zero (uninitialized) value for a Float is ready to use and represents
57// the number +0.0 exactly, with precision 0 and rounding mode [ToNearestEven].
58//
59// Operations always take pointer arguments (*Float) rather
60// than Float values, and each unique Float value requires
61// its own unique *Float pointer. To "copy" a Float value,
62// an existing (or newly allocated) Float must be set to
63// a new value using the [Float.Set] method; shallow copies
64// of Floats are not supported and may lead to errors.
65type Float struct {
66	prec uint32
67	mode RoundingMode
68	acc  Accuracy
69	form form
70	neg  bool
71	mant nat
72	exp  int32
73}
74
75// An ErrNaN panic is raised by a [Float] operation that would lead to
76// a NaN under IEEE 754 rules. An ErrNaN implements the error interface.
77type ErrNaN struct {
78	msg string
79}
80
81func (err ErrNaN) Error() string {
82	return err.msg
83}
84
85// NewFloat allocates and returns a new [Float] set to x,
86// with precision 53 and rounding mode [ToNearestEven].
87// NewFloat panics with [ErrNaN] if x is a NaN.
88func NewFloat(x float64) *Float {
89	if math.IsNaN(x) {
90		panic(ErrNaN{"NewFloat(NaN)"})
91	}
92	return new(Float).SetFloat64(x)
93}
94
95// Exponent and precision limits.
96const (
97	MaxExp  = math.MaxInt32  // largest supported exponent
98	MinExp  = math.MinInt32  // smallest supported exponent
99	MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
100)
101
102// Internal representation: The mantissa bits x.mant of a nonzero finite
103// Float x are stored in a nat slice long enough to hold up to x.prec bits;
104// the slice may (but doesn't have to) be shorter if the mantissa contains
105// trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
106// the msb is shifted all the way "to the left"). Thus, if the mantissa has
107// trailing 0 bits or x.prec is not a multiple of the Word size _W,
108// x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
109// to the value 0.5; the exponent x.exp shifts the binary point as needed.
110//
111// A zero or non-finite Float x ignores x.mant and x.exp.
112//
113// x                 form      neg      mant         exp
114// ----------------------------------------------------------
115// ±0                zero      sign     -            -
116// 0 < |x| < +Inf    finite    sign     mantissa     exponent
117// ±Inf              inf       sign     -            -
118
119// A form value describes the internal representation.
120type form byte
121
122// The form value order is relevant - do not change!
123const (
124	zero form = iota
125	finite
126	inf
127)
128
129// RoundingMode determines how a [Float] value is rounded to the
130// desired precision. Rounding may change the [Float] value; the
131// rounding error is described by the [Float]'s [Accuracy].
132type RoundingMode byte
133
134// These constants define supported rounding modes.
135const (
136	ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
137	ToNearestAway                     // == IEEE 754-2008 roundTiesToAway
138	ToZero                            // == IEEE 754-2008 roundTowardZero
139	AwayFromZero                      // no IEEE 754-2008 equivalent
140	ToNegativeInf                     // == IEEE 754-2008 roundTowardNegative
141	ToPositiveInf                     // == IEEE 754-2008 roundTowardPositive
142)
143
144//go:generate stringer -type=RoundingMode
145
146// Accuracy describes the rounding error produced by the most recent
147// operation that generated a [Float] value, relative to the exact value.
148type Accuracy int8
149
150// Constants describing the [Accuracy] of a [Float].
151const (
152	Below Accuracy = -1
153	Exact Accuracy = 0
154	Above Accuracy = +1
155)
156
157//go:generate stringer -type=Accuracy
158
159// SetPrec sets z's precision to prec and returns the (possibly) rounded
160// value of z. Rounding occurs according to z's rounding mode if the mantissa
161// cannot be represented in prec bits without loss of precision.
162// SetPrec(0) maps all finite values to ±0; infinite values remain unchanged.
163// If prec > [MaxPrec], it is set to [MaxPrec].
164func (z *Float) SetPrec(prec uint) *Float {
165	z.acc = Exact // optimistically assume no rounding is needed
166
167	// special case
168	if prec == 0 {
169		z.prec = 0
170		if z.form == finite {
171			// truncate z to 0
172			z.acc = makeAcc(z.neg)
173			z.form = zero
174		}
175		return z
176	}
177
178	// general case
179	if prec > MaxPrec {
180		prec = MaxPrec
181	}
182	old := z.prec
183	z.prec = uint32(prec)
184	if z.prec < old {
185		z.round(0)
186	}
187	return z
188}
189
190func makeAcc(above bool) Accuracy {
191	if above {
192		return Above
193	}
194	return Below
195}
196
197// SetMode sets z's rounding mode to mode and returns an exact z.
198// z remains unchanged otherwise.
199// z.SetMode(z.Mode()) is a cheap way to set z's accuracy to [Exact].
200func (z *Float) SetMode(mode RoundingMode) *Float {
201	z.mode = mode
202	z.acc = Exact
203	return z
204}
205
206// Prec returns the mantissa precision of x in bits.
207// The result may be 0 for |x| == 0 and |x| == Inf.
208func (x *Float) Prec() uint {
209	return uint(x.prec)
210}
211
212// MinPrec returns the minimum precision required to represent x exactly
213// (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
214// The result is 0 for |x| == 0 and |x| == Inf.
215func (x *Float) MinPrec() uint {
216	if x.form != finite {
217		return 0
218	}
219	return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
220}
221
222// Mode returns the rounding mode of x.
223func (x *Float) Mode() RoundingMode {
224	return x.mode
225}
226
227// Acc returns the accuracy of x produced by the most recent
228// operation, unless explicitly documented otherwise by that
229// operation.
230func (x *Float) Acc() Accuracy {
231	return x.acc
232}
233
234// Sign returns:
235//   - -1 if x < 0;
236//   - 0 if x is ±0;
237//   - +1 if x > 0.
238func (x *Float) Sign() int {
239	if debugFloat {
240		x.validate()
241	}
242	if x.form == zero {
243		return 0
244	}
245	if x.neg {
246		return -1
247	}
248	return 1
249}
250
251// MantExp breaks x into its mantissa and exponent components
252// and returns the exponent. If a non-nil mant argument is
253// provided its value is set to the mantissa of x, with the
254// same precision and rounding mode as x. The components
255// satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0.
256// Calling MantExp with a nil argument is an efficient way to
257// get the exponent of the receiver.
258//
259// Special cases are:
260//
261//	(  ±0).MantExp(mant) = 0, with mant set to   ±0
262//	(±Inf).MantExp(mant) = 0, with mant set to ±Inf
263//
264// x and mant may be the same in which case x is set to its
265// mantissa value.
266func (x *Float) MantExp(mant *Float) (exp int) {
267	if debugFloat {
268		x.validate()
269	}
270	if x.form == finite {
271		exp = int(x.exp)
272	}
273	if mant != nil {
274		mant.Copy(x)
275		if mant.form == finite {
276			mant.exp = 0
277		}
278	}
279	return
280}
281
282func (z *Float) setExpAndRound(exp int64, sbit uint) {
283	if exp < MinExp {
284		// underflow
285		z.acc = makeAcc(z.neg)
286		z.form = zero
287		return
288	}
289
290	if exp > MaxExp {
291		// overflow
292		z.acc = makeAcc(!z.neg)
293		z.form = inf
294		return
295	}
296
297	z.form = finite
298	z.exp = int32(exp)
299	z.round(sbit)
300}
301
302// SetMantExp sets z to mant × 2**exp and returns z.
303// The result z has the same precision and rounding mode
304// as mant. SetMantExp is an inverse of [Float.MantExp] but does
305// not require 0.5 <= |mant| < 1.0. Specifically, for a
306// given x of type *[Float], SetMantExp relates to [Float.MantExp]
307// as follows:
308//
309//	mant := new(Float)
310//	new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0
311//
312// Special cases are:
313//
314//	z.SetMantExp(  ±0, exp) =   ±0
315//	z.SetMantExp(±Inf, exp) = ±Inf
316//
317// z and mant may be the same in which case z's exponent
318// is set to exp.
319func (z *Float) SetMantExp(mant *Float, exp int) *Float {
320	if debugFloat {
321		z.validate()
322		mant.validate()
323	}
324	z.Copy(mant)
325
326	if z.form == finite {
327		// 0 < |mant| < +Inf
328		z.setExpAndRound(int64(z.exp)+int64(exp), 0)
329	}
330	return z
331}
332
333// Signbit reports whether x is negative or negative zero.
334func (x *Float) Signbit() bool {
335	return x.neg
336}
337
338// IsInf reports whether x is +Inf or -Inf.
339func (x *Float) IsInf() bool {
340	return x.form == inf
341}
342
343// IsInt reports whether x is an integer.
344// ±Inf values are not integers.
345func (x *Float) IsInt() bool {
346	if debugFloat {
347		x.validate()
348	}
349	// special cases
350	if x.form != finite {
351		return x.form == zero
352	}
353	// x.form == finite
354	if x.exp <= 0 {
355		return false
356	}
357	// x.exp > 0
358	return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
359}
360
361// debugging support
362func (x *Float) validate() {
363	if !debugFloat {
364		// avoid performance bugs
365		panic("validate called but debugFloat is not set")
366	}
367	if msg := x.validate0(); msg != "" {
368		panic(msg)
369	}
370}
371
372func (x *Float) validate0() string {
373	if x.form != finite {
374		return ""
375	}
376	m := len(x.mant)
377	if m == 0 {
378		return "nonzero finite number with empty mantissa"
379	}
380	const msb = 1 << (_W - 1)
381	if x.mant[m-1]&msb == 0 {
382		return fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0))
383	}
384	if x.prec == 0 {
385		return "zero precision finite number"
386	}
387	return ""
388}
389
390// round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
391// sbit must be 0 or 1 and summarizes any "sticky bit" information one might
392// have before calling round. z's mantissa must be normalized (with the msb set)
393// or empty.
394//
395// CAUTION: The rounding modes [ToNegativeInf], [ToPositiveInf] are affected by the
396// sign of z. For correct rounding, the sign of z must be set correctly before
397// calling round.
398func (z *Float) round(sbit uint) {
399	if debugFloat {
400		z.validate()
401	}
402
403	z.acc = Exact
404	if z.form != finite {
405		// ±0 or ±Inf => nothing left to do
406		return
407	}
408	// z.form == finite && len(z.mant) > 0
409	// m > 0 implies z.prec > 0 (checked by validate)
410
411	m := uint32(len(z.mant)) // present mantissa length in words
412	bits := m * _W           // present mantissa bits; bits > 0
413	if bits <= z.prec {
414		// mantissa fits => nothing to do
415		return
416	}
417	// bits > z.prec
418
419	// Rounding is based on two bits: the rounding bit (rbit) and the
420	// sticky bit (sbit). The rbit is the bit immediately before the
421	// z.prec leading mantissa bits (the "0.5"). The sbit is set if any
422	// of the bits before the rbit are set (the "0.25", "0.125", etc.):
423	//
424	//   rbit  sbit  => "fractional part"
425	//
426	//   0     0        == 0
427	//   0     1        >  0  , < 0.5
428	//   1     0        == 0.5
429	//   1     1        >  0.5, < 1.0
430
431	// bits > z.prec: mantissa too large => round
432	r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
433	rbit := z.mant.bit(r) & 1    // rounding bit; be safe and ensure it's a single bit
434	// The sticky bit is only needed for rounding ToNearestEven
435	// or when the rounding bit is zero. Avoid computation otherwise.
436	if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) {
437		sbit = z.mant.sticky(r)
438	}
439	sbit &= 1 // be safe and ensure it's a single bit
440
441	// cut off extra words
442	n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision
443	if m > n {
444		copy(z.mant, z.mant[m-n:]) // move n last words to front
445		z.mant = z.mant[:n]
446	}
447
448	// determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word
449	ntz := n*_W - z.prec // 0 <= ntz < _W
450	lsb := Word(1) << ntz
451
452	// round if result is inexact
453	if rbit|sbit != 0 {
454		// Make rounding decision: The result mantissa is truncated ("rounded down")
455		// by default. Decide if we need to increment, or "round up", the (unsigned)
456		// mantissa.
457		inc := false
458		switch z.mode {
459		case ToNegativeInf:
460			inc = z.neg
461		case ToZero:
462			// nothing to do
463		case ToNearestEven:
464			inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0)
465		case ToNearestAway:
466			inc = rbit != 0
467		case AwayFromZero:
468			inc = true
469		case ToPositiveInf:
470			inc = !z.neg
471		default:
472			panic("unreachable")
473		}
474
475		// A positive result (!z.neg) is Above the exact result if we increment,
476		// and it's Below if we truncate (Exact results require no rounding).
477		// For a negative result (z.neg) it is exactly the opposite.
478		z.acc = makeAcc(inc != z.neg)
479
480		if inc {
481			// add 1 to mantissa
482			if addVW(z.mant, z.mant, lsb) != 0 {
483				// mantissa overflow => adjust exponent
484				if z.exp >= MaxExp {
485					// exponent overflow
486					z.form = inf
487					return
488				}
489				z.exp++
490				// adjust mantissa: divide by 2 to compensate for exponent adjustment
491				shrVU(z.mant, z.mant, 1)
492				// set msb == carry == 1 from the mantissa overflow above
493				const msb = 1 << (_W - 1)
494				z.mant[n-1] |= msb
495			}
496		}
497	}
498
499	// zero out trailing bits in least-significant word
500	z.mant[0] &^= lsb - 1
501
502	if debugFloat {
503		z.validate()
504	}
505}
506
507func (z *Float) setBits64(neg bool, x uint64) *Float {
508	if z.prec == 0 {
509		z.prec = 64
510	}
511	z.acc = Exact
512	z.neg = neg
513	if x == 0 {
514		z.form = zero
515		return z
516	}
517	// x != 0
518	z.form = finite
519	s := bits.LeadingZeros64(x)
520	z.mant = z.mant.setUint64(x << uint(s))
521	z.exp = int32(64 - s) // always fits
522	if z.prec < 64 {
523		z.round(0)
524	}
525	return z
526}
527
528// SetUint64 sets z to the (possibly rounded) value of x and returns z.
529// If z's precision is 0, it is changed to 64 (and rounding will have
530// no effect).
531func (z *Float) SetUint64(x uint64) *Float {
532	return z.setBits64(false, x)
533}
534
535// SetInt64 sets z to the (possibly rounded) value of x and returns z.
536// If z's precision is 0, it is changed to 64 (and rounding will have
537// no effect).
538func (z *Float) SetInt64(x int64) *Float {
539	u := x
540	if u < 0 {
541		u = -u
542	}
543	// We cannot simply call z.SetUint64(uint64(u)) and change
544	// the sign afterwards because the sign affects rounding.
545	return z.setBits64(x < 0, uint64(u))
546}
547
548// SetFloat64 sets z to the (possibly rounded) value of x and returns z.
549// If z's precision is 0, it is changed to 53 (and rounding will have
550// no effect). SetFloat64 panics with [ErrNaN] if x is a NaN.
551func (z *Float) SetFloat64(x float64) *Float {
552	if z.prec == 0 {
553		z.prec = 53
554	}
555	if math.IsNaN(x) {
556		panic(ErrNaN{"Float.SetFloat64(NaN)"})
557	}
558	z.acc = Exact
559	z.neg = math.Signbit(x) // handle -0, -Inf correctly
560	if x == 0 {
561		z.form = zero
562		return z
563	}
564	if math.IsInf(x, 0) {
565		z.form = inf
566		return z
567	}
568	// normalized x != 0
569	z.form = finite
570	fmant, exp := math.Frexp(x) // get normalized mantissa
571	z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
572	z.exp = int32(exp) // always fits
573	if z.prec < 53 {
574		z.round(0)
575	}
576	return z
577}
578
579// fnorm normalizes mantissa m by shifting it to the left
580// such that the msb of the most-significant word (msw) is 1.
581// It returns the shift amount. It assumes that len(m) != 0.
582func fnorm(m nat) int64 {
583	if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
584		panic("msw of mantissa is 0")
585	}
586	s := nlz(m[len(m)-1])
587	if s > 0 {
588		c := shlVU(m, m, s)
589		if debugFloat && c != 0 {
590			panic("nlz or shlVU incorrect")
591		}
592	}
593	return int64(s)
594}
595
596// SetInt sets z to the (possibly rounded) value of x and returns z.
597// If z's precision is 0, it is changed to the larger of x.BitLen()
598// or 64 (and rounding will have no effect).
599func (z *Float) SetInt(x *Int) *Float {
600	// TODO(gri) can be more efficient if z.prec > 0
601	// but small compared to the size of x, or if there
602	// are many trailing 0's.
603	bits := uint32(x.BitLen())
604	if z.prec == 0 {
605		z.prec = umax32(bits, 64)
606	}
607	z.acc = Exact
608	z.neg = x.neg
609	if len(x.abs) == 0 {
610		z.form = zero
611		return z
612	}
613	// x != 0
614	z.mant = z.mant.set(x.abs)
615	fnorm(z.mant)
616	z.setExpAndRound(int64(bits), 0)
617	return z
618}
619
620// SetRat sets z to the (possibly rounded) value of x and returns z.
621// If z's precision is 0, it is changed to the largest of a.BitLen(),
622// b.BitLen(), or 64; with x = a/b.
623func (z *Float) SetRat(x *Rat) *Float {
624	if x.IsInt() {
625		return z.SetInt(x.Num())
626	}
627	var a, b Float
628	a.SetInt(x.Num())
629	b.SetInt(x.Denom())
630	if z.prec == 0 {
631		z.prec = umax32(a.prec, b.prec)
632	}
633	return z.Quo(&a, &b)
634}
635
636// SetInf sets z to the infinite Float -Inf if signbit is
637// set, or +Inf if signbit is not set, and returns z. The
638// precision of z is unchanged and the result is always
639// [Exact].
640func (z *Float) SetInf(signbit bool) *Float {
641	z.acc = Exact
642	z.form = inf
643	z.neg = signbit
644	return z
645}
646
647// Set sets z to the (possibly rounded) value of x and returns z.
648// If z's precision is 0, it is changed to the precision of x
649// before setting z (and rounding will have no effect).
650// Rounding is performed according to z's precision and rounding
651// mode; and z's accuracy reports the result error relative to the
652// exact (not rounded) result.
653func (z *Float) Set(x *Float) *Float {
654	if debugFloat {
655		x.validate()
656	}
657	z.acc = Exact
658	if z != x {
659		z.form = x.form
660		z.neg = x.neg
661		if x.form == finite {
662			z.exp = x.exp
663			z.mant = z.mant.set(x.mant)
664		}
665		if z.prec == 0 {
666			z.prec = x.prec
667		} else if z.prec < x.prec {
668			z.round(0)
669		}
670	}
671	return z
672}
673
674// Copy sets z to x, with the same precision, rounding mode, and accuracy as x.
675// Copy returns z. If x and z are identical, Copy is a no-op.
676func (z *Float) Copy(x *Float) *Float {
677	if debugFloat {
678		x.validate()
679	}
680	if z != x {
681		z.prec = x.prec
682		z.mode = x.mode
683		z.acc = x.acc
684		z.form = x.form
685		z.neg = x.neg
686		if z.form == finite {
687			z.mant = z.mant.set(x.mant)
688			z.exp = x.exp
689		}
690	}
691	return z
692}
693
694// msb32 returns the 32 most significant bits of x.
695func msb32(x nat) uint32 {
696	i := len(x) - 1
697	if i < 0 {
698		return 0
699	}
700	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
701		panic("x not normalized")
702	}
703	switch _W {
704	case 32:
705		return uint32(x[i])
706	case 64:
707		return uint32(x[i] >> 32)
708	}
709	panic("unreachable")
710}
711
712// msb64 returns the 64 most significant bits of x.
713func msb64(x nat) uint64 {
714	i := len(x) - 1
715	if i < 0 {
716		return 0
717	}
718	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
719		panic("x not normalized")
720	}
721	switch _W {
722	case 32:
723		v := uint64(x[i]) << 32
724		if i > 0 {
725			v |= uint64(x[i-1])
726		}
727		return v
728	case 64:
729		return uint64(x[i])
730	}
731	panic("unreachable")
732}
733
734// Uint64 returns the unsigned integer resulting from truncating x
735// towards zero. If 0 <= x <= [math.MaxUint64], the result is [Exact]
736// if x is an integer and [Below] otherwise.
737// The result is (0, [Above]) for x < 0, and ([math.MaxUint64], [Below])
738// for x > [math.MaxUint64].
739func (x *Float) Uint64() (uint64, Accuracy) {
740	if debugFloat {
741		x.validate()
742	}
743
744	switch x.form {
745	case finite:
746		if x.neg {
747			return 0, Above
748		}
749		// 0 < x < +Inf
750		if x.exp <= 0 {
751			// 0 < x < 1
752			return 0, Below
753		}
754		// 1 <= x < Inf
755		if x.exp <= 64 {
756			// u = trunc(x) fits into a uint64
757			u := msb64(x.mant) >> (64 - uint32(x.exp))
758			if x.MinPrec() <= 64 {
759				return u, Exact
760			}
761			return u, Below // x truncated
762		}
763		// x too large
764		return math.MaxUint64, Below
765
766	case zero:
767		return 0, Exact
768
769	case inf:
770		if x.neg {
771			return 0, Above
772		}
773		return math.MaxUint64, Below
774	}
775
776	panic("unreachable")
777}
778
779// Int64 returns the integer resulting from truncating x towards zero.
780// If [math.MinInt64] <= x <= [math.MaxInt64], the result is [Exact] if x is
781// an integer, and [Above] (x < 0) or [Below] (x > 0) otherwise.
782// The result is ([math.MinInt64], [Above]) for x < [math.MinInt64],
783// and ([math.MaxInt64], [Below]) for x > [math.MaxInt64].
784func (x *Float) Int64() (int64, Accuracy) {
785	if debugFloat {
786		x.validate()
787	}
788
789	switch x.form {
790	case finite:
791		// 0 < |x| < +Inf
792		acc := makeAcc(x.neg)
793		if x.exp <= 0 {
794			// 0 < |x| < 1
795			return 0, acc
796		}
797		// x.exp > 0
798
799		// 1 <= |x| < +Inf
800		if x.exp <= 63 {
801			// i = trunc(x) fits into an int64 (excluding math.MinInt64)
802			i := int64(msb64(x.mant) >> (64 - uint32(x.exp)))
803			if x.neg {
804				i = -i
805			}
806			if x.MinPrec() <= uint(x.exp) {
807				return i, Exact
808			}
809			return i, acc // x truncated
810		}
811		if x.neg {
812			// check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
813			if x.exp == 64 && x.MinPrec() == 1 {
814				acc = Exact
815			}
816			return math.MinInt64, acc
817		}
818		// x too large
819		return math.MaxInt64, Below
820
821	case zero:
822		return 0, Exact
823
824	case inf:
825		if x.neg {
826			return math.MinInt64, Above
827		}
828		return math.MaxInt64, Below
829	}
830
831	panic("unreachable")
832}
833
834// Float32 returns the float32 value nearest to x. If x is too small to be
835// represented by a float32 (|x| < [math.SmallestNonzeroFloat32]), the result
836// is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x.
837// If x is too large to be represented by a float32 (|x| > [math.MaxFloat32]),
838// the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x.
839func (x *Float) Float32() (float32, Accuracy) {
840	if debugFloat {
841		x.validate()
842	}
843
844	switch x.form {
845	case finite:
846		// 0 < |x| < +Inf
847
848		const (
849			fbits = 32                //        float size
850			mbits = 23                //        mantissa size (excluding implicit msb)
851			ebits = fbits - mbits - 1 //     8  exponent size
852			bias  = 1<<(ebits-1) - 1  //   127  exponent bias
853			dmin  = 1 - bias - mbits  //  -149  smallest unbiased exponent (denormal)
854			emin  = 1 - bias          //  -126  smallest unbiased exponent (normal)
855			emax  = bias              //   127  largest unbiased exponent (normal)
856		)
857
858		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa.
859		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
860
861		// Compute precision p for float32 mantissa.
862		// If the exponent is too small, we have a denormal number before
863		// rounding and fewer than p mantissa bits of precision available
864		// (the exponent remains fixed but the mantissa gets shifted right).
865		p := mbits + 1 // precision of normal float
866		if e < emin {
867			// recompute precision
868			p = mbits + 1 - emin + int(e)
869			// If p == 0, the mantissa of x is shifted so much to the right
870			// that its msb falls immediately to the right of the float32
871			// mantissa space. In other words, if the smallest denormal is
872			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
873			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
874			// If m == 0.5, it is rounded down to even, i.e., 0.0.
875			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
876			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
877				// underflow to ±0
878				if x.neg {
879					var z float32
880					return -z, Above
881				}
882				return 0.0, Below
883			}
884			// otherwise, round up
885			// We handle p == 0 explicitly because it's easy and because
886			// Float.round doesn't support rounding to 0 bits of precision.
887			if p == 0 {
888				if x.neg {
889					return -math.SmallestNonzeroFloat32, Below
890				}
891				return math.SmallestNonzeroFloat32, Above
892			}
893		}
894		// p > 0
895
896		// round
897		var r Float
898		r.prec = uint32(p)
899		r.Set(x)
900		e = r.exp - 1
901
902		// Rounding may have caused r to overflow to ±Inf
903		// (rounding never causes underflows to 0).
904		// If the exponent is too large, also overflow to ±Inf.
905		if r.form == inf || e > emax {
906			// overflow
907			if x.neg {
908				return float32(math.Inf(-1)), Below
909			}
910			return float32(math.Inf(+1)), Above
911		}
912		// e <= emax
913
914		// Determine sign, biased exponent, and mantissa.
915		var sign, bexp, mant uint32
916		if x.neg {
917			sign = 1 << (fbits - 1)
918		}
919
920		// Rounding may have caused a denormal number to
921		// become normal. Check again.
922		if e < emin {
923			// denormal number: recompute precision
924			// Since rounding may have at best increased precision
925			// and we have eliminated p <= 0 early, we know p > 0.
926			// bexp == 0 for denormals
927			p = mbits + 1 - emin + int(e)
928			mant = msb32(r.mant) >> uint(fbits-p)
929		} else {
930			// normal number: emin <= e <= emax
931			bexp = uint32(e+bias) << mbits
932			mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
933		}
934
935		return math.Float32frombits(sign | bexp | mant), r.acc
936
937	case zero:
938		if x.neg {
939			var z float32
940			return -z, Exact
941		}
942		return 0.0, Exact
943
944	case inf:
945		if x.neg {
946			return float32(math.Inf(-1)), Exact
947		}
948		return float32(math.Inf(+1)), Exact
949	}
950
951	panic("unreachable")
952}
953
954// Float64 returns the float64 value nearest to x. If x is too small to be
955// represented by a float64 (|x| < [math.SmallestNonzeroFloat64]), the result
956// is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x.
957// If x is too large to be represented by a float64 (|x| > [math.MaxFloat64]),
958// the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x.
959func (x *Float) Float64() (float64, Accuracy) {
960	if debugFloat {
961		x.validate()
962	}
963
964	switch x.form {
965	case finite:
966		// 0 < |x| < +Inf
967
968		const (
969			fbits = 64                //        float size
970			mbits = 52                //        mantissa size (excluding implicit msb)
971			ebits = fbits - mbits - 1 //    11  exponent size
972			bias  = 1<<(ebits-1) - 1  //  1023  exponent bias
973			dmin  = 1 - bias - mbits  // -1074  smallest unbiased exponent (denormal)
974			emin  = 1 - bias          // -1022  smallest unbiased exponent (normal)
975			emax  = bias              //  1023  largest unbiased exponent (normal)
976		)
977
978		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa.
979		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
980
981		// Compute precision p for float64 mantissa.
982		// If the exponent is too small, we have a denormal number before
983		// rounding and fewer than p mantissa bits of precision available
984		// (the exponent remains fixed but the mantissa gets shifted right).
985		p := mbits + 1 // precision of normal float
986		if e < emin {
987			// recompute precision
988			p = mbits + 1 - emin + int(e)
989			// If p == 0, the mantissa of x is shifted so much to the right
990			// that its msb falls immediately to the right of the float64
991			// mantissa space. In other words, if the smallest denormal is
992			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
993			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
994			// If m == 0.5, it is rounded down to even, i.e., 0.0.
995			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
996			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
997				// underflow to ±0
998				if x.neg {
999					var z float64
1000					return -z, Above
1001				}
1002				return 0.0, Below
1003			}
1004			// otherwise, round up
1005			// We handle p == 0 explicitly because it's easy and because
1006			// Float.round doesn't support rounding to 0 bits of precision.
1007			if p == 0 {
1008				if x.neg {
1009					return -math.SmallestNonzeroFloat64, Below
1010				}
1011				return math.SmallestNonzeroFloat64, Above
1012			}
1013		}
1014		// p > 0
1015
1016		// round
1017		var r Float
1018		r.prec = uint32(p)
1019		r.Set(x)
1020		e = r.exp - 1
1021
1022		// Rounding may have caused r to overflow to ±Inf
1023		// (rounding never causes underflows to 0).
1024		// If the exponent is too large, also overflow to ±Inf.
1025		if r.form == inf || e > emax {
1026			// overflow
1027			if x.neg {
1028				return math.Inf(-1), Below
1029			}
1030			return math.Inf(+1), Above
1031		}
1032		// e <= emax
1033
1034		// Determine sign, biased exponent, and mantissa.
1035		var sign, bexp, mant uint64
1036		if x.neg {
1037			sign = 1 << (fbits - 1)
1038		}
1039
1040		// Rounding may have caused a denormal number to
1041		// become normal. Check again.
1042		if e < emin {
1043			// denormal number: recompute precision
1044			// Since rounding may have at best increased precision
1045			// and we have eliminated p <= 0 early, we know p > 0.
1046			// bexp == 0 for denormals
1047			p = mbits + 1 - emin + int(e)
1048			mant = msb64(r.mant) >> uint(fbits-p)
1049		} else {
1050			// normal number: emin <= e <= emax
1051			bexp = uint64(e+bias) << mbits
1052			mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
1053		}
1054
1055		return math.Float64frombits(sign | bexp | mant), r.acc
1056
1057	case zero:
1058		if x.neg {
1059			var z float64
1060			return -z, Exact
1061		}
1062		return 0.0, Exact
1063
1064	case inf:
1065		if x.neg {
1066			return math.Inf(-1), Exact
1067		}
1068		return math.Inf(+1), Exact
1069	}
1070
1071	panic("unreachable")
1072}
1073
1074// Int returns the result of truncating x towards zero;
1075// or nil if x is an infinity.
1076// The result is [Exact] if x.IsInt(); otherwise it is [Below]
1077// for x > 0, and [Above] for x < 0.
1078// If a non-nil *[Int] argument z is provided, [Int] stores
1079// the result in z instead of allocating a new [Int].
1080func (x *Float) Int(z *Int) (*Int, Accuracy) {
1081	if debugFloat {
1082		x.validate()
1083	}
1084
1085	if z == nil && x.form <= finite {
1086		z = new(Int)
1087	}
1088
1089	switch x.form {
1090	case finite:
1091		// 0 < |x| < +Inf
1092		acc := makeAcc(x.neg)
1093		if x.exp <= 0 {
1094			// 0 < |x| < 1
1095			return z.SetInt64(0), acc
1096		}
1097		// x.exp > 0
1098
1099		// 1 <= |x| < +Inf
1100		// determine minimum required precision for x
1101		allBits := uint(len(x.mant)) * _W
1102		exp := uint(x.exp)
1103		if x.MinPrec() <= exp {
1104			acc = Exact
1105		}
1106		// shift mantissa as needed
1107		if z == nil {
1108			z = new(Int)
1109		}
1110		z.neg = x.neg
1111		switch {
1112		case exp > allBits:
1113			z.abs = z.abs.shl(x.mant, exp-allBits)
1114		default:
1115			z.abs = z.abs.set(x.mant)
1116		case exp < allBits:
1117			z.abs = z.abs.shr(x.mant, allBits-exp)
1118		}
1119		return z, acc
1120
1121	case zero:
1122		return z.SetInt64(0), Exact
1123
1124	case inf:
1125		return nil, makeAcc(x.neg)
1126	}
1127
1128	panic("unreachable")
1129}
1130
1131// Rat returns the rational number corresponding to x;
1132// or nil if x is an infinity.
1133// The result is [Exact] if x is not an Inf.
1134// If a non-nil *[Rat] argument z is provided, [Rat] stores
1135// the result in z instead of allocating a new [Rat].
1136func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
1137	if debugFloat {
1138		x.validate()
1139	}
1140
1141	if z == nil && x.form <= finite {
1142		z = new(Rat)
1143	}
1144
1145	switch x.form {
1146	case finite:
1147		// 0 < |x| < +Inf
1148		allBits := int32(len(x.mant)) * _W
1149		// build up numerator and denominator
1150		z.a.neg = x.neg
1151		switch {
1152		case x.exp > allBits:
1153			z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits))
1154			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1155			// z already in normal form
1156		default:
1157			z.a.abs = z.a.abs.set(x.mant)
1158			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1159			// z already in normal form
1160		case x.exp < allBits:
1161			z.a.abs = z.a.abs.set(x.mant)
1162			t := z.b.abs.setUint64(1)
1163			z.b.abs = t.shl(t, uint(allBits-x.exp))
1164			z.norm()
1165		}
1166		return z, Exact
1167
1168	case zero:
1169		return z.SetInt64(0), Exact
1170
1171	case inf:
1172		return nil, makeAcc(x.neg)
1173	}
1174
1175	panic("unreachable")
1176}
1177
1178// Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
1179// and returns z.
1180func (z *Float) Abs(x *Float) *Float {
1181	z.Set(x)
1182	z.neg = false
1183	return z
1184}
1185
1186// Neg sets z to the (possibly rounded) value of x with its sign negated,
1187// and returns z.
1188func (z *Float) Neg(x *Float) *Float {
1189	z.Set(x)
1190	z.neg = !z.neg
1191	return z
1192}
1193
1194func validateBinaryOperands(x, y *Float) {
1195	if !debugFloat {
1196		// avoid performance bugs
1197		panic("validateBinaryOperands called but debugFloat is not set")
1198	}
1199	if len(x.mant) == 0 {
1200		panic("empty mantissa for x")
1201	}
1202	if len(y.mant) == 0 {
1203		panic("empty mantissa for y")
1204	}
1205}
1206
1207// z = x + y, ignoring signs of x and y for the addition
1208// but using the sign of z for rounding the result.
1209// x and y must have a non-empty mantissa and valid exponent.
1210func (z *Float) uadd(x, y *Float) {
1211	// Note: This implementation requires 2 shifts most of the
1212	// time. It is also inefficient if exponents or precisions
1213	// differ by wide margins. The following article describes
1214	// an efficient (but much more complicated) implementation
1215	// compatible with the internal representation used here:
1216	//
1217	// Vincent Lefèvre: "The Generic Multiple-Precision Floating-
1218	// Point Addition With Exact Rounding (as in the MPFR Library)"
1219	// http://www.vinc17.net/research/papers/rnc6.pdf
1220
1221	if debugFloat {
1222		validateBinaryOperands(x, y)
1223	}
1224
1225	// compute exponents ex, ey for mantissa with "binary point"
1226	// on the right (mantissa.0) - use int64 to avoid overflow
1227	ex := int64(x.exp) - int64(len(x.mant))*_W
1228	ey := int64(y.exp) - int64(len(y.mant))*_W
1229
1230	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1231
1232	// TODO(gri) having a combined add-and-shift primitive
1233	//           could make this code significantly faster
1234	switch {
1235	case ex < ey:
1236		if al {
1237			t := nat(nil).shl(y.mant, uint(ey-ex))
1238			z.mant = z.mant.add(x.mant, t)
1239		} else {
1240			z.mant = z.mant.shl(y.mant, uint(ey-ex))
1241			z.mant = z.mant.add(x.mant, z.mant)
1242		}
1243	default:
1244		// ex == ey, no shift needed
1245		z.mant = z.mant.add(x.mant, y.mant)
1246	case ex > ey:
1247		if al {
1248			t := nat(nil).shl(x.mant, uint(ex-ey))
1249			z.mant = z.mant.add(t, y.mant)
1250		} else {
1251			z.mant = z.mant.shl(x.mant, uint(ex-ey))
1252			z.mant = z.mant.add(z.mant, y.mant)
1253		}
1254		ex = ey
1255	}
1256	// len(z.mant) > 0
1257
1258	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1259}
1260
1261// z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
1262// but using the sign of z for rounding the result.
1263// x and y must have a non-empty mantissa and valid exponent.
1264func (z *Float) usub(x, y *Float) {
1265	// This code is symmetric to uadd.
1266	// We have not factored the common code out because
1267	// eventually uadd (and usub) should be optimized
1268	// by special-casing, and the code will diverge.
1269
1270	if debugFloat {
1271		validateBinaryOperands(x, y)
1272	}
1273
1274	ex := int64(x.exp) - int64(len(x.mant))*_W
1275	ey := int64(y.exp) - int64(len(y.mant))*_W
1276
1277	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1278
1279	switch {
1280	case ex < ey:
1281		if al {
1282			t := nat(nil).shl(y.mant, uint(ey-ex))
1283			z.mant = t.sub(x.mant, t)
1284		} else {
1285			z.mant = z.mant.shl(y.mant, uint(ey-ex))
1286			z.mant = z.mant.sub(x.mant, z.mant)
1287		}
1288	default:
1289		// ex == ey, no shift needed
1290		z.mant = z.mant.sub(x.mant, y.mant)
1291	case ex > ey:
1292		if al {
1293			t := nat(nil).shl(x.mant, uint(ex-ey))
1294			z.mant = t.sub(t, y.mant)
1295		} else {
1296			z.mant = z.mant.shl(x.mant, uint(ex-ey))
1297			z.mant = z.mant.sub(z.mant, y.mant)
1298		}
1299		ex = ey
1300	}
1301
1302	// operands may have canceled each other out
1303	if len(z.mant) == 0 {
1304		z.acc = Exact
1305		z.form = zero
1306		z.neg = false
1307		return
1308	}
1309	// len(z.mant) > 0
1310
1311	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1312}
1313
1314// z = x * y, ignoring signs of x and y for the multiplication
1315// but using the sign of z for rounding the result.
1316// x and y must have a non-empty mantissa and valid exponent.
1317func (z *Float) umul(x, y *Float) {
1318	if debugFloat {
1319		validateBinaryOperands(x, y)
1320	}
1321
1322	// Note: This is doing too much work if the precision
1323	// of z is less than the sum of the precisions of x
1324	// and y which is often the case (e.g., if all floats
1325	// have the same precision).
1326	// TODO(gri) Optimize this for the common case.
1327
1328	e := int64(x.exp) + int64(y.exp)
1329	if x == y {
1330		z.mant = z.mant.sqr(x.mant)
1331	} else {
1332		z.mant = z.mant.mul(x.mant, y.mant)
1333	}
1334	z.setExpAndRound(e-fnorm(z.mant), 0)
1335}
1336
1337// z = x / y, ignoring signs of x and y for the division
1338// but using the sign of z for rounding the result.
1339// x and y must have a non-empty mantissa and valid exponent.
1340func (z *Float) uquo(x, y *Float) {
1341	if debugFloat {
1342		validateBinaryOperands(x, y)
1343	}
1344
1345	// mantissa length in words for desired result precision + 1
1346	// (at least one extra bit so we get the rounding bit after
1347	// the division)
1348	n := int(z.prec/_W) + 1
1349
1350	// compute adjusted x.mant such that we get enough result precision
1351	xadj := x.mant
1352	if d := n - len(x.mant) + len(y.mant); d > 0 {
1353		// d extra words needed => add d "0 digits" to x
1354		xadj = make(nat, len(x.mant)+d)
1355		copy(xadj[d:], x.mant)
1356	}
1357	// TODO(gri): If we have too many digits (d < 0), we should be able
1358	// to shorten x for faster division. But we must be extra careful
1359	// with rounding in that case.
1360
1361	// Compute d before division since there may be aliasing of x.mant
1362	// (via xadj) or y.mant with z.mant.
1363	d := len(xadj) - len(y.mant)
1364
1365	// divide
1366	var r nat
1367	z.mant, r = z.mant.div(nil, xadj, y.mant)
1368	e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
1369
1370	// The result is long enough to include (at least) the rounding bit.
1371	// If there's a non-zero remainder, the corresponding fractional part
1372	// (if it were computed), would have a non-zero sticky bit (if it were
1373	// zero, it couldn't have a non-zero remainder).
1374	var sbit uint
1375	if len(r) > 0 {
1376		sbit = 1
1377	}
1378
1379	z.setExpAndRound(e-fnorm(z.mant), sbit)
1380}
1381
1382// ucmp returns -1, 0, or +1, depending on whether
1383// |x| < |y|, |x| == |y|, or |x| > |y|.
1384// x and y must have a non-empty mantissa and valid exponent.
1385func (x *Float) ucmp(y *Float) int {
1386	if debugFloat {
1387		validateBinaryOperands(x, y)
1388	}
1389
1390	switch {
1391	case x.exp < y.exp:
1392		return -1
1393	case x.exp > y.exp:
1394		return +1
1395	}
1396	// x.exp == y.exp
1397
1398	// compare mantissas
1399	i := len(x.mant)
1400	j := len(y.mant)
1401	for i > 0 || j > 0 {
1402		var xm, ym Word
1403		if i > 0 {
1404			i--
1405			xm = x.mant[i]
1406		}
1407		if j > 0 {
1408			j--
1409			ym = y.mant[j]
1410		}
1411		switch {
1412		case xm < ym:
1413			return -1
1414		case xm > ym:
1415			return +1
1416		}
1417	}
1418
1419	return 0
1420}
1421
1422// Handling of sign bit as defined by IEEE 754-2008, section 6.3:
1423//
1424// When neither the inputs nor result are NaN, the sign of a product or
1425// quotient is the exclusive OR of the operands’ signs; the sign of a sum,
1426// or of a difference x−y regarded as a sum x+(−y), differs from at most
1427// one of the addends’ signs; and the sign of the result of conversions,
1428// the quantize operation, the roundToIntegral operations, and the
1429// roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
1430// These rules shall apply even when operands or results are zero or infinite.
1431//
1432// When the sum of two operands with opposite signs (or the difference of
1433// two operands with like signs) is exactly zero, the sign of that sum (or
1434// difference) shall be +0 in all rounding-direction attributes except
1435// roundTowardNegative; under that attribute, the sign of an exact zero
1436// sum (or difference) shall be −0. However, x+x = x−(−x) retains the same
1437// sign as x even when x is zero.
1438//
1439// See also: https://play.golang.org/p/RtH3UCt5IH
1440
1441// Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
1442// it is changed to the larger of x's or y's precision before the operation.
1443// Rounding is performed according to z's precision and rounding mode; and
1444// z's accuracy reports the result error relative to the exact (not rounded)
1445// result. Add panics with [ErrNaN] if x and y are infinities with opposite
1446// signs. The value of z is undefined in that case.
1447func (z *Float) Add(x, y *Float) *Float {
1448	if debugFloat {
1449		x.validate()
1450		y.validate()
1451	}
1452
1453	if z.prec == 0 {
1454		z.prec = umax32(x.prec, y.prec)
1455	}
1456
1457	if x.form == finite && y.form == finite {
1458		// x + y (common case)
1459
1460		// Below we set z.neg = x.neg, and when z aliases y this will
1461		// change the y operand's sign. This is fine, because if an
1462		// operand aliases the receiver it'll be overwritten, but we still
1463		// want the original x.neg and y.neg values when we evaluate
1464		// x.neg != y.neg, so we need to save y.neg before setting z.neg.
1465		yneg := y.neg
1466
1467		z.neg = x.neg
1468		if x.neg == yneg {
1469			// x + y == x + y
1470			// (-x) + (-y) == -(x + y)
1471			z.uadd(x, y)
1472		} else {
1473			// x + (-y) == x - y == -(y - x)
1474			// (-x) + y == y - x == -(x - y)
1475			if x.ucmp(y) > 0 {
1476				z.usub(x, y)
1477			} else {
1478				z.neg = !z.neg
1479				z.usub(y, x)
1480			}
1481		}
1482		if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
1483			z.neg = true
1484		}
1485		return z
1486	}
1487
1488	if x.form == inf && y.form == inf && x.neg != y.neg {
1489		// +Inf + -Inf
1490		// -Inf + +Inf
1491		// value of z is undefined but make sure it's valid
1492		z.acc = Exact
1493		z.form = zero
1494		z.neg = false
1495		panic(ErrNaN{"addition of infinities with opposite signs"})
1496	}
1497
1498	if x.form == zero && y.form == zero {
1499		// ±0 + ±0
1500		z.acc = Exact
1501		z.form = zero
1502		z.neg = x.neg && y.neg // -0 + -0 == -0
1503		return z
1504	}
1505
1506	if x.form == inf || y.form == zero {
1507		// ±Inf + y
1508		// x + ±0
1509		return z.Set(x)
1510	}
1511
1512	// ±0 + y
1513	// x + ±Inf
1514	return z.Set(y)
1515}
1516
1517// Sub sets z to the rounded difference x-y and returns z.
1518// Precision, rounding, and accuracy reporting are as for [Float.Add].
1519// Sub panics with [ErrNaN] if x and y are infinities with equal
1520// signs. The value of z is undefined in that case.
1521func (z *Float) Sub(x, y *Float) *Float {
1522	if debugFloat {
1523		x.validate()
1524		y.validate()
1525	}
1526
1527	if z.prec == 0 {
1528		z.prec = umax32(x.prec, y.prec)
1529	}
1530
1531	if x.form == finite && y.form == finite {
1532		// x - y (common case)
1533		yneg := y.neg
1534		z.neg = x.neg
1535		if x.neg != yneg {
1536			// x - (-y) == x + y
1537			// (-x) - y == -(x + y)
1538			z.uadd(x, y)
1539		} else {
1540			// x - y == x - y == -(y - x)
1541			// (-x) - (-y) == y - x == -(x - y)
1542			if x.ucmp(y) > 0 {
1543				z.usub(x, y)
1544			} else {
1545				z.neg = !z.neg
1546				z.usub(y, x)
1547			}
1548		}
1549		if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
1550			z.neg = true
1551		}
1552		return z
1553	}
1554
1555	if x.form == inf && y.form == inf && x.neg == y.neg {
1556		// +Inf - +Inf
1557		// -Inf - -Inf
1558		// value of z is undefined but make sure it's valid
1559		z.acc = Exact
1560		z.form = zero
1561		z.neg = false
1562		panic(ErrNaN{"subtraction of infinities with equal signs"})
1563	}
1564
1565	if x.form == zero && y.form == zero {
1566		// ±0 - ±0
1567		z.acc = Exact
1568		z.form = zero
1569		z.neg = x.neg && !y.neg // -0 - +0 == -0
1570		return z
1571	}
1572
1573	if x.form == inf || y.form == zero {
1574		// ±Inf - y
1575		// x - ±0
1576		return z.Set(x)
1577	}
1578
1579	// ±0 - y
1580	// x - ±Inf
1581	return z.Neg(y)
1582}
1583
1584// Mul sets z to the rounded product x*y and returns z.
1585// Precision, rounding, and accuracy reporting are as for [Float.Add].
1586// Mul panics with [ErrNaN] if one operand is zero and the other
1587// operand an infinity. The value of z is undefined in that case.
1588func (z *Float) Mul(x, y *Float) *Float {
1589	if debugFloat {
1590		x.validate()
1591		y.validate()
1592	}
1593
1594	if z.prec == 0 {
1595		z.prec = umax32(x.prec, y.prec)
1596	}
1597
1598	z.neg = x.neg != y.neg
1599
1600	if x.form == finite && y.form == finite {
1601		// x * y (common case)
1602		z.umul(x, y)
1603		return z
1604	}
1605
1606	z.acc = Exact
1607	if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
1608		// ±0 * ±Inf
1609		// ±Inf * ±0
1610		// value of z is undefined but make sure it's valid
1611		z.form = zero
1612		z.neg = false
1613		panic(ErrNaN{"multiplication of zero with infinity"})
1614	}
1615
1616	if x.form == inf || y.form == inf {
1617		// ±Inf * y
1618		// x * ±Inf
1619		z.form = inf
1620		return z
1621	}
1622
1623	// ±0 * y
1624	// x * ±0
1625	z.form = zero
1626	return z
1627}
1628
1629// Quo sets z to the rounded quotient x/y and returns z.
1630// Precision, rounding, and accuracy reporting are as for [Float.Add].
1631// Quo panics with [ErrNaN] if both operands are zero or infinities.
1632// The value of z is undefined in that case.
1633func (z *Float) Quo(x, y *Float) *Float {
1634	if debugFloat {
1635		x.validate()
1636		y.validate()
1637	}
1638
1639	if z.prec == 0 {
1640		z.prec = umax32(x.prec, y.prec)
1641	}
1642
1643	z.neg = x.neg != y.neg
1644
1645	if x.form == finite && y.form == finite {
1646		// x / y (common case)
1647		z.uquo(x, y)
1648		return z
1649	}
1650
1651	z.acc = Exact
1652	if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
1653		// ±0 / ±0
1654		// ±Inf / ±Inf
1655		// value of z is undefined but make sure it's valid
1656		z.form = zero
1657		z.neg = false
1658		panic(ErrNaN{"division of zero by zero or infinity by infinity"})
1659	}
1660
1661	if x.form == zero || y.form == inf {
1662		// ±0 / y
1663		// x / ±Inf
1664		z.form = zero
1665		return z
1666	}
1667
1668	// x / ±0
1669	// ±Inf / y
1670	z.form = inf
1671	return z
1672}
1673
1674// Cmp compares x and y and returns:
1675//   - -1 if x < y;
1676//   - 0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf);
1677//   - +1 if x > y.
1678func (x *Float) Cmp(y *Float) int {
1679	if debugFloat {
1680		x.validate()
1681		y.validate()
1682	}
1683
1684	mx := x.ord()
1685	my := y.ord()
1686	switch {
1687	case mx < my:
1688		return -1
1689	case mx > my:
1690		return +1
1691	}
1692	// mx == my
1693
1694	// only if |mx| == 1 we have to compare the mantissae
1695	switch mx {
1696	case -1:
1697		return y.ucmp(x)
1698	case +1:
1699		return x.ucmp(y)
1700	}
1701
1702	return 0
1703}
1704
1705// ord classifies x and returns:
1706//
1707//	-2 if -Inf == x
1708//	-1 if -Inf < x < 0
1709//	 0 if x == 0 (signed or unsigned)
1710//	+1 if 0 < x < +Inf
1711//	+2 if x == +Inf
1712func (x *Float) ord() int {
1713	var m int
1714	switch x.form {
1715	case finite:
1716		m = 1
1717	case zero:
1718		return 0
1719	case inf:
1720		m = 2
1721	}
1722	if x.neg {
1723		m = -m
1724	}
1725	return m
1726}
1727
1728func umax32(x, y uint32) uint32 {
1729	if x > y {
1730		return x
1731	}
1732	return y
1733}
1734