@@ -191,10 +191,6 @@ func formatBase10(a []byte, u uint64) int {
191191 // On most systems, the uint32 math is faster, but not all.
192192 // The decision here is based on benchmarking.
193193 itoaPure64 = host64bit && goarch .GOARCH != "amd64" && goarch .GOARCH != "arm64" && goarch .GOARCH != "s390x"
194-
195- // 64-bit systems can all use 64-bit div and mod by a constant,
196- // which the compiler rewrites to use 64x64→128-bit multiplies.
197- itoaDivMod64 = host64bit // can use 64-bit div/mod by constant
198194 )
199195
200196 if itoaPure64 {
@@ -218,47 +214,13 @@ func formatBase10(a []byte, u uint64) int {
218214 return i
219215 }
220216
221- // Convert 9-digit chunks using 32-bit math.
217+ // Split into 9-digit chunks that fit in uint32s and convert each chunk using 32-bit math.
222218 // Most numbers are small, so the comparison u >= 1e9 is usually pure overhead,
223219 // so we approximate it by u>>29 != 0, which is usually faster and good enough.
224220 i := len (a )
225221 for (host64bit && u >> 29 != 0 ) || (! host64bit && (u >> 32 != 0 || uint32 (u )>> 29 != 0 )) {
226222 var lo uint32
227- if itoaDivMod64 {
228- u , lo = u / 1e9 , uint32 (u % 1e9 )
229- } else {
230- // On 64-bit systems the compiler rewrites the div and mod above
231- // into a 64x64→128-bit multiply (https://godbolt.org/z/EPnK8zvMK):
232- // hi, _ := bits.Mul64(u>>1, 0x89705f4136b4a598)
233- // q := hi >> 28
234- // lo = uint32(u - q*1e9)
235- // u = q
236- // On 32-bit systems, the compiler invokes a uint64 software divide,
237- // which is quite slow. We could write the bits.Mul64 code above
238- // but even that is slower than we'd like, since it calls a software mul64
239- // instead of having a hardware instruction to use.
240- // Instead we inline bits.Mul64 here and change y0/y1 to constants.
241- // The compiler does use direct 32x32→64-bit multiplies for this code.
242- //
243- // For lots more about division by multiplication see Warren, _Hacker's Delight_.
244- // For a concise overview, see the first two sections of
245- // https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html.
246- const mask32 = 1 << 32 - 1
247- x0 := ((u >> 1 ) & mask32 )
248- x1 := (u >> 1 ) >> 32
249- const y0 = 0x36b4a598
250- const y1 = 0x89705f41
251- w0 := x0 * y0
252- t := x1 * y0 + w0 >> 32
253- w1 := t & mask32
254- w2 := t >> 32
255- w1 += x0 * y1
256- hi := x1 * y1 + w2 + w1 >> 32
257- q := hi >> 28
258-
259- lo = uint32 (u ) - uint32 (q )* 1e9 // uint32(u - q*1e9) but faster
260- u = q
261- }
223+ u , lo = u / 1e9 , uint32 (u % 1e9 )
262224
263225 // Convert 9 digits.
264226 for range 4 {
0 commit comments