Skip to content

Commit 362b0db

Browse files
authored
Winch: Add implementation for i8x16.popcnt for x64 with AVX (#10241)
1 parent 41b7b26 commit 362b0db

File tree

7 files changed

+141
-1
lines changed

7 files changed

+141
-1
lines changed

crates/wast-util/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,6 @@ impl WastTest {
429429
"spec_testsuite/simd_f64x2_arith.wast",
430430
"spec_testsuite/simd_f64x2_pmin_pmax.wast",
431431
"spec_testsuite/simd_f64x2_rounding.wast",
432-
"spec_testsuite/simd_i8x16_arith2.wast",
433432
"spec_testsuite/simd_load.wast",
434433
"spec_testsuite/simd_load_zero.wast",
435434
"spec_testsuite/simd_splat.wast",
@@ -462,6 +461,7 @@ impl WastTest {
462461
"spec_testsuite/simd_i32x4_cmp.wast",
463462
"spec_testsuite/simd_i64x2_arith2.wast",
464463
"spec_testsuite/simd_i64x2_cmp.wast",
464+
"spec_testsuite/simd_i8x16_arith2.wast",
465465
"spec_testsuite/simd_i8x16_cmp.wast",
466466
"spec_testsuite/simd_int_to_int_extend.wast",
467467
"spec_testsuite/simd_load_extend.wast",
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
;;! target = "x86_64"
2+
;;! test = "winch"
3+
;;! flags = [ "-Ccranelift-has-avx" ]
4+
5+
(module
6+
(func (result v128)
7+
(i8x16.popcnt (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
8+
)
9+
)
10+
;; wasm[0]::function[0]:
11+
;; pushq %rbp
12+
;; movq %rsp, %rbp
13+
;; movq 8(%rdi), %r11
14+
;; movq 0x10(%r11), %r11
15+
;; addq $0x10, %r11
16+
;; cmpq %rsp, %r11
17+
;; ja 0x65
18+
;; 1c: movq %rdi, %r14
19+
;; subq $0x10, %rsp
20+
;; movq %rdi, 8(%rsp)
21+
;; movq %rsi, (%rsp)
22+
;; movdqu 0x3c(%rip), %xmm0
23+
;; vpand 0x44(%rip), %xmm0, %xmm15
24+
;; vpsrlw $4, %xmm0, %xmm0
25+
;; vpand 0x37(%rip), %xmm0, %xmm0
26+
;; movdqu 0x3f(%rip), %xmm1
27+
;; vpshufb %xmm0, %xmm1, %xmm0
28+
;; vpshufb %xmm15, %xmm1, %xmm15
29+
;; vpaddb %xmm0, %xmm15, %xmm0
30+
;; addq $0x10, %rsp
31+
;; popq %rbp
32+
;; retq
33+
;; 65: ud2
34+
;; 67: addb %al, (%rax)
35+
;; 69: addb %al, (%rax)
36+
;; 6b: addb %al, (%rax)
37+
;; 6d: addb %al, (%rax)
38+
;; 6f: addb %al, (%rax)
39+
;; 71: addl %eax, (%rdx)
40+
;; 73: addl 0x9080706(, %rax), %eax
41+
;; 7a: orb (%rbx), %cl
42+
;; 7c: orb $0xd, %al

winch/codegen/src/isa/aarch64/masm.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1242,6 +1242,10 @@ impl Masm for MacroAssembler {
12421242
bail!(CodeGenError::unimplemented_masm_instruction())
12431243
}
12441244

1245+
fn v128_popcnt(&mut self, _context: &mut CodeGenContext<Emission>) -> Result<()> {
1246+
bail!(CodeGenError::unimplemented_masm_instruction())
1247+
}
1248+
12451249
fn v128_avgr(
12461250
&mut self,
12471251
_lhs: Reg,

winch/codegen/src/isa/x64/asm.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1813,6 +1813,7 @@ impl Assembler {
18131813
/// `dst`.
18141814
pub fn xmm_vpadd_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
18151815
let op = match size {
1816+
OperandSize::S8 => AvxOpcode::Vpaddb,
18161817
OperandSize::S32 => AvxOpcode::Vpaddd,
18171818
_ => unimplemented!(),
18181819
};
@@ -2123,6 +2124,7 @@ impl Assembler {
21232124
/// Shift vector data right by `imm`.
21242125
pub fn xmm_vpsrl_rr(&mut self, src: Reg, dst: WritableReg, imm: u32, size: OperandSize) {
21252126
let op = match size {
2127+
OperandSize::S16 => AvxOpcode::Vpsrlw,
21262128
OperandSize::S32 => AvxOpcode::Vpsrld,
21272129
OperandSize::S64 => AvxOpcode::Vpsrlq,
21282130
_ => unimplemented!(),
@@ -2794,6 +2796,25 @@ impl Assembler {
27942796
});
27952797
}
27962798

2799+
/// Performs a bitwise `and` operation on the vector in `src1` and the
2800+
/// vector at the memory address `src2` and stores the result in `dst`.
2801+
pub fn xmm_vpand_rrm(&mut self, src1: Reg, src2: &Address, dst: WritableReg) {
2802+
let address = Self::to_synthetic_amode(
2803+
&src2,
2804+
&mut self.pool,
2805+
&mut self.constants,
2806+
&mut self.buffer,
2807+
MemFlags::trusted(),
2808+
);
2809+
2810+
self.emit(Inst::XmmRmiRVex {
2811+
op: AvxOpcode::Vpand,
2812+
src1: src1.into(),
2813+
src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
2814+
dst: dst.to_reg().into(),
2815+
});
2816+
}
2817+
27972818
/// Perform an average operation for the vector of unsigned integers in
27982819
/// `src1` and `src2` and put the results in `dst`.
27992820
pub fn xmm_vpavg_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {

winch/codegen/src/isa/x64/masm.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2640,6 +2640,71 @@ impl Masm for MacroAssembler {
26402640
Ok(())
26412641
}
26422642

2643+
fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {
2644+
self.ensure_has_avx()?;
2645+
2646+
let reg = writable!(context.pop_to_reg(self, None)?.reg);
2647+
let scratch = writable!(regs::scratch_xmm());
2648+
2649+
// This works by using a lookup table to determine the count of bits
2650+
// set in the upper 4 bits and lower 4 bits separately and then adding
2651+
// the counts.
2652+
2653+
// A mask to zero out the upper 4 bits in each lane.
2654+
let address = self.asm.add_constant(&[
2655+
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
2656+
0x0F, 0x0F,
2657+
]);
2658+
// Zero out the upper 4 bits of each lane.
2659+
self.asm.xmm_vpand_rrm(reg.to_reg(), &address, scratch);
2660+
// Right shift the input by 4 bits, as 16-bit lanes, to move the upper 4
2661+
// bits of each byte into its lower 4 bits.
2662+
self.asm
2663+
.xmm_vpsrl_rr(reg.to_reg(), reg, 0x4, OperandSize::S16);
2664+
// Zero out the upper 4 bits of each byte, which may now hold bits shifted in from the adjacent byte.
2665+
self.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);
2666+
2667+
// Load a lookup table that maps each 4-bit value to its number of set
2668+
// bits into a register so we only perform the memory read once.
2669+
// Index (hex) | Value (binary) | Population Count
2670+
// 0x0 | 0000 | 0
2671+
// 0x1 | 0001 | 1
2672+
// 0x2 | 0010 | 1
2673+
// 0x3 | 0011 | 2
2674+
// 0x4 | 0100 | 1
2675+
// 0x5 | 0101 | 2
2676+
// 0x6 | 0110 | 2
2677+
// 0x7 | 0111 | 3
2678+
// 0x8 | 1000 | 1
2679+
// 0x9 | 1001 | 2
2680+
// 0xA | 1010 | 2
2681+
// 0xB | 1011 | 3
2682+
// 0xC | 1100 | 2
2683+
// 0xD | 1101 | 3
2684+
// 0xE | 1110 | 3
2685+
// 0xF | 1111 | 4
2686+
let address = self.asm.add_constant(&[
2687+
0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
2688+
]);
2689+
let reg2 = writable!(context.any_fpr(self)?);
2690+
self.asm
2691+
.xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
2692+
// Use the upper 4 bits as an index into the lookup table.
2693+
self.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
2694+
// Use the lower 4 bits as an index into the lookup table.
2695+
self.asm
2696+
.xmm_vpshufb_rrr(scratch, reg2.to_reg(), scratch.to_reg());
2697+
context.free_reg(reg2.to_reg());
2698+
2699+
// Add the counts of the upper 4 bits and the lower 4 bits to get the
2700+
// total number of bits set.
2701+
self.asm
2702+
.xmm_vpadd_rrr(reg.to_reg(), scratch.to_reg(), reg, OperandSize::S8);
2703+
2704+
context.stack.push(TypedReg::v128(reg.to_reg()).into());
2705+
Ok(())
2706+
}
2707+
26432708
fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
26442709
self.ensure_has_avx()?;
26452710
self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);

winch/codegen/src/masm.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2104,6 +2104,9 @@ pub(crate) trait MacroAssembler {
21042104
/// adjacent pairs of the 32-bit results.
21052105
fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()>;
21062106

2107+
/// Count the number of bits set in each 8-bit lane.
2108+
fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()>;
2109+
21072110
/// Lane-wise rounding average of vectors of integers in `lhs` and `rhs`
21082111
/// and put the results in `dst`.
21092112
fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>;

winch/codegen/src/visitor.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,7 @@ macro_rules! def_unsupported {
525525
(emit I32x4ExtAddPairwiseI16x8U $($rest:tt)*) => {};
526526
(emit I32x4ExtAddPairwiseI16x8S $($rest:tt)*) => {};
527527
(emit I32x4DotI16x8S $($rest:tt)*) => {};
528+
(emit I8x16Popcnt $($rest:tt)*) => {};
528529
(emit I8x16AvgrU $($rest:tt)*) => {};
529530
(emit I16x8AvgrU $($rest:tt)*) => {};
530531

@@ -4157,6 +4158,10 @@ where
41574158
})
41584159
}
41594160

4161+
fn visit_i8x16_popcnt(&mut self) -> Self::Output {
4162+
self.masm.v128_popcnt(&mut self.context)
4163+
}
4164+
41604165
fn visit_i8x16_avgr_u(&mut self) -> Self::Output {
41614166
self.context
41624167
.binop(self.masm, OperandSize::S8, |masm, dst, src, size| {

0 commit comments

Comments
 (0)