-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconvert_new.go
More file actions
80 lines (65 loc) · 1.96 KB
/
convert_new.go
File metadata and controls
80 lines (65 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
package float16
import "math"
// fromFloat32New converts an IEEE 754 binary32 value to binary16 using
// round-to-nearest, ties-to-even.
//
// Special values map as follows:
//   - ±Inf stays ±Inf; any NaN becomes a quiet NaN (0x7e00) carrying the
//     input's sign bit (the NaN payload is not preserved).
//   - ±0 stays ±0.
//   - Magnitudes that round beyond the largest finite binary16 value become
//     ±Inf.
//   - Magnitudes below 2^-25 (including every binary32 subnormal) become ±0.
//   - Magnitudes in [2^-25, 2^-14) are rounded to a binary16 subnormal, or to
//     the smallest normal when rounding carries out of the subnormal range.
func fromFloat32New(f32 float32) Float16 {
	bits := math.Float32bits(f32)
	sign := uint16(bits>>31) << 15 // sign already positioned at bit 15
	exp := int32((bits >> 23) & 0xff)
	mant := bits & 0x7fffff

	// Infinity and NaN share the all-ones exponent.
	if exp == 0xff {
		if mant == 0 {
			return Float16(sign | 0x7c00) // infinity
		}
		return Float16(sign | 0x7e00) // qNaN
	}

	// Signed zero.
	if exp == 0 && mant == 0 {
		return Float16(sign)
	}

	// Rebias the exponent from binary32 (127) to binary16 (15). A binary32
	// subnormal (exp field 0) ends up at -112 here and is flushed to zero by
	// the underflow check below.
	exp -= 127 - 15

	// Overflow: exponent does not fit in the 5-bit field.
	if exp >= 0x1f {
		return Float16(sign | 0x7c00) // infinity
	}

	// Underflow and subnormal results.
	if exp <= 0 {
		if exp < -10 {
			// Strictly below half of the smallest subnormal: rounds to zero.
			return Float16(sign)
		}

		// A subnormal binary16 has no implicit leading 1, so the full 24-bit
		// significand is shifted right by (1-exp) to denormalize plus 13 to
		// drop from 23 to 10 fraction bits. Doing this as ONE shift and
		// comparing the complete remainder against half an ULP keeps every
		// discarded bit in the rounding decision; shifting in two steps
		// would lose the low bits and misclassify "just above halfway" as an
		// exact tie.
		full := mant | 1<<23
		shift := uint(14 - exp) // 14..24
		q := full >> shift
		rem := full & (uint32(1)<<shift - 1)
		half := uint32(1) << (shift - 1)
		if rem > half || (rem == half && q&1 != 0) {
			// A carry to 0x400 naturally encodes the smallest normal number.
			q++
		}
		return Float16(sign | uint16(q))
	}

	// Normal numbers: make the implicit leading 1 explicit so a rounding
	// carry out of the fraction can be detected at bit 24.
	mant |= 1 << 23

	// Round the 23-bit fraction to 10 bits, nearest-even:
	// bit 13 is the result LSB, bit 12 the guard bit, bits 11-0 the sticky bits.
	guard := (mant >> 12) & 1
	sticky := mant & 0xfff
	lsb := (mant >> 13) & 1
	if guard != 0 && (sticky != 0 || lsb != 0) {
		mant += 1 << 13
	}

	// Rounding may carry out of the significand (1.111..1 -> 10.000..0):
	// bump the exponent and clear the fraction.
	if mant >= 1<<24 {
		exp++
		mant = 0
	}

	// The carry above may have pushed the exponent into the Inf/NaN range.
	if exp >= 0x1f {
		return Float16(sign | 0x7c00) // infinity
	}

	// Keep fraction bits 22-13; the mask drops the explicit leading 1.
	frac := uint16((mant >> 13) & 0x3ff)
	return Float16(sign | uint16(exp<<10) | frac)
}