@@ -2640,6 +2640,71 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {
+        self.ensure_has_avx()?;
+
+        let reg = writable!(context.pop_to_reg(self, None)?.reg);
+        let scratch = writable!(regs::scratch_xmm());
+
+        // This works by using a lookup table to determine the count of bits
+        // set in the upper 4 bits and lower 4 bits separately and then adding
+        // the counts.
+
+        // A mask to zero out the upper 4 bits in each lane.
+        let address = self.asm.add_constant(&[
+            0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+            0x0F, 0x0F,
+        ]);
+        // Zero out the upper 4 bits of each lane.
+        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, scratch);
+        // Right shift bytes in input by 4 bits to put the upper 4 bits in the
+        // lower 4 bits.
+        self.asm
+            .xmm_vpsrl_rr(reg.to_reg(), reg, 0x4, OperandSize::S16);
+        // Zero the upper 4 bits again; the 16-bit-wide shift pulled in the neighboring byte's bits.
+        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);
+
+        // Write a lookup table of 4 bit values to number of bits set to a
+        // register so we only perform the memory read once.
+        // Index (hex) | Value (binary) | Population Count
+        // 0x0         | 0000           | 0
+        // 0x1         | 0001           | 1
+        // 0x2         | 0010           | 1
+        // 0x3         | 0011           | 2
+        // 0x4         | 0100           | 1
+        // 0x5         | 0101           | 2
+        // 0x6         | 0110           | 2
+        // 0x7         | 0111           | 3
+        // 0x8         | 1000           | 1
+        // 0x9         | 1001           | 2
+        // 0xA         | 1010           | 2
+        // 0xB         | 1011           | 3
+        // 0xC         | 1100           | 2
+        // 0xD         | 1101           | 3
+        // 0xE         | 1110           | 3
+        // 0xF         | 1111           | 4
+        let address = self.asm.add_constant(&[
+            0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
+        ]);
+        let reg2 = writable!(context.any_fpr(self)?);
+        self.asm
+            .xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
+        // Use the upper 4 bits as an index into the lookup table.
+        self.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
+        // Use the lower 4 bits as an index into the lookup table.
+        self.asm
+            .xmm_vpshufb_rrr(scratch, reg2.to_reg(), scratch.to_reg());
+        context.free_reg(reg2.to_reg());
+
+        // Add the counts of the upper 4 bits and the lower 4 bits to get the
+        // total number of bits set.
+        self.asm
+            .xmm_vpadd_rrr(reg.to_reg(), scratch.to_reg(), reg, OperandSize::S8);
+
+        context.stack.push(TypedReg::v128(reg.to_reg()).into());
+        Ok(())
+    }
+
     fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
         self.ensure_has_avx()?;
         self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);
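For context on the technique: `vpshufb` treats its source register as a 16-entry byte table and uses the low 4 bits of each byte in the index register to select an entry, so a single instruction performs 16 parallel table lookups. Below is a minimal scalar sketch of the same algorithm, one byte at a time; the names `NIBBLE_POPCNT` and `popcnt_per_byte` are illustrative and not part of the Winch codebase.

```rust
/// Bit counts for every 4-bit value, i.e. the table the lowering loads into
/// `reg2` and indexes with `vpshufb`.
const NIBBLE_POPCNT: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];

/// Scalar model of the SIMD sequence: mask, shift, two table lookups, add.
fn popcnt_per_byte(input: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for (i, byte) in input.iter().enumerate() {
        let lo = byte & 0x0F;        // vpand with the 0x0F mask
        let hi = (byte >> 4) & 0x0F; // vpsrlw by 4, then vpand again
        // Two vpshufb lookups followed by vpaddb.
        out[i] = NIBBLE_POPCNT[lo as usize] + NIBBLE_POPCNT[hi as usize];
    }
    out
}

fn main() {
    let v = [0b1011_0010u8; 16]; // every byte has 4 bits set
    assert!(popcnt_per_byte(v).iter().all(|&c| c == 4));
    println!("{:?}", popcnt_per_byte(v));
}
```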