IBM Support

LI79792: PERFORMANCE IMPROVEMENT FOR __POPCNT8()

Subscribe

You can track all active APARs for this component.

 

APAR status

  • Closed as program error.

Error description

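  • The __popcnt8() builtin performs poorly: in the test case below,
    the loop that calls it takes roughly twice as many cycles as the
    equivalent loop that uses a software table lookup. This APAR
    addresses that performance problem.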
    
    The following test case demonstrates the issue:
    ===== COMPILE COMMAND:
    /bgsys/drivers/ppcfloor/comm/xl/bin/mpixlcxx_r -O5 -qsmp=omp
    -qthreaded test.c -I/bgsys/drivers/V1R2M2/ppc64
    
    ===== TESTCASE:
    % cat test.c
    #include <stdio.h>
    #include <stdlib.h>
    
    #include <stdio.h>
    #include <inttypes.h>
    
    
    #include "hwi/include/bqc/A2_inlines.h"
    /* Number of elements in the benchmark array: 128 * 1024 * 1024. */
    #define  size  (128*1024*1024)
    /* Benchmark input; main() stores i in element i before the timed loops.
       NOTE(review): volatile presumably keeps the compiler from optimising
       away the loads in the timed loops -- confirm. */
    volatile int64_t value[size];
    
    /*
     * Count the set bits of a 64-bit value by issuing the PowerPC
     * `popcntd` instruction directly through inline assembly.
     *
     * data: value whose set bits are counted.
     * Returns the contents of the instruction's output register as an int.
     *
     * main() in this test case does not call this helper; it calls the
     * compiler builtin __popcnt8() instead.
     */
    inline int popcnt8(uint64_t data)
    {
          int ldz;   /* receives the popcntd result */
           /* %0 is the output register (ldz), %1 the input register (data). */
           asm volatile ("popcntd %0, %1\n\t"
           : "=r" (ldz)
           : "r" (data)
           );
           return ldz;
    }
    
    /* Lookup table indexed by a byte value; main() fills it so that entry i
       holds the number of set bits in i. Read by popcount_table(). */
    char BitsSetTable256[256];
    
    /*
     * Count the set bits of a 64-bit value without branches or tables.
     * Adjacent 1-, 2- and 4-bit fields are summed in parallel, then the
     * eight per-byte counts are folded into the low byte.
     *
     * b: value to examine; all 64 bits are counted, including the sign bit.
     * Returns the number of set bits, 0..64.
     */
    int popcount(long long b)
    {
         /* Work on an unsigned copy: right-shifting a negative signed value
            is implementation-defined, and the intermediate sums can set the
            top bit. */
         unsigned long long u = (unsigned long long) b;

         u = (u & 0x5555555555555555ULL) + ((u >> 1) & 0x5555555555555555ULL);
         u = (u & 0x3333333333333333ULL) + ((u >> 2) & 0x3333333333333333ULL);
         u = (u + (u >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
         /* Each byte now holds 0..8; fold the eight counts into the low byte.
            No byte can exceed 64, so nothing carries into its neighbour. */
         u += u >> 8;
         u += u >> 16;
         u += u >> 32;
         /* The total is at most 64, so seven bits are enough. */
         return (int) (u & 0x7F);
    }
    
    /*
     * Count the set bits of a 64-bit value by summing, for each of its
     * eight bytes, the precomputed bit count stored in BitsSetTable256.
     *
     * The table must already be filled so that entry i holds the number
     * of set bits in i; main() does this before calling.
     *
     * v: value to examine; all 64 bits are counted.
     * Returns the number of set bits, 0..64.
     */
    int popcount_table(long long v) {
      /* Unsigned copy so the shifts are logical. Every index is masked to
         0..255, so no lookup can fall outside the 256-entry table, even
         for negative v or v >= 2^32. */
      unsigned long long u = (unsigned long long) v;
      int c = 0;
      for (int byte = 0; byte < 8; byte++) {
        c += BitsSetTable256[u & 0xff];
        u >>= 8;
      }
      return c;
    }
    /*
     * Count the trailing zero bits of v, i.e. the number of consecutive
     * 0 bits starting at bit 0.
     *
     * v: value to examine.
     * Returns the trailing-zero count, or the width of v in bits
     * (8 * sizeof(v)) when v is 0.
     */
    int counttz(long long v) {
      /* Unsigned copy: `u - 1` cannot overflow and the shifts are logical,
         so the loop below terminates for every input. */
      unsigned long long u = (unsigned long long) v;
      int c;
      if (u)  {
        u = (u ^ (u - 1)) >> 1;  /* Set u's trailing 0s to 1s and zero the rest */
        for (c = 0; u; c++)
           u >>= 1;
      }
      else
        c = (int) (8 * sizeof(v));
      return c;
    }
    
    
    
    /*
     * Benchmark driver. Fills the byte lookup table and the input array,
     * then times two OpenMP-parallel passes over value[]: one summing
     * popcount_table() and one summing the __popcnt8() builtin. For each
     * pass it prints the elapsed GetTimeBase() ticks and the total bit
     * count, so the two counts can be checked against each other.
     *
     * argc, argv: unused.
     * Returns 0.
     */
    int main(int argc, char * argv[])
    {
      BitsSetTable256[0] = 0;

      /* Bits in i = lowest bit of i + bits in i / 2 (already computed). */
      for (int i = 0; i < 256; i++)
        BitsSetTable256[i] = (i & 1) + BitsSetTable256[i / 2];



      uint64_t t2, t1, software_cycles, hardware_instruction_cycles;
      /* Per-thread partial sums indexed by OpenMP thread id.
         NOTE(review): assumes at most 64 threads; this is not checked. */
      uint64_t thr_count[64];
      int nthreads;
      #pragma omp parallel for
      for(int i=0;i<size;i++)
        value[i] = i;

      /* Pass 1: software table lookup. */
      t1 = GetTimeBase();
      #pragma omp parallel
      {
        int i, tid = omp_get_thread_num();
        /* Record the team size once; it is reused for both reductions below. */
        if(tid ==0) nthreads = omp_get_num_threads();
        uint64_t count = 0;
        thr_count[tid] = 0;
        #pragma omp for
        for(i=0;i<size;i++)
          count += popcount_table(value[i]);
        thr_count[tid] = count;
      }
      t2 = GetTimeBase();
      software_cycles = t2 - t1;
      for(int i = 1;i<nthreads;i++)
        thr_count[0] += thr_count[i];
      /* uint64_t is not guaranteed to be unsigned long long, so cast to
         match the %llu conversions. */
      printf("Software popcnt cycles: %llu, count %llu\n",
             (unsigned long long) software_cycles,
             (unsigned long long) thr_count[0]);

      /* Pass 2: compiler builtin. Every thread resets its own slot first. */
      t1 = GetTimeBase();
      #pragma omp parallel
      {
        int i, tid = omp_get_thread_num();
        uint64_t count = 0;
        thr_count[tid] = 0;
        #pragma omp for
        for(i=0;i<size;i++)
           count += __popcnt8(value[i]);
        thr_count[tid] = count;
      }
      t2 = GetTimeBase();
      hardware_instruction_cycles = t2 - t1;
      for(int i = 1;i<nthreads;i++)
        thr_count[0] += thr_count[i];
      printf("hardware_instruction popcnt cycles: %llu, count %llu\n",
             (unsigned long long) hardware_instruction_cycles,
             (unsigned long long) thr_count[0]);

      return 0;
    }
    %
    
    
    ===== ACTUAL OUTPUT:
    % /bgsys/drivers/ppcfloor/hlcs/bin/runjob --exe
    a.out --block R00-M0-N04 --np 1 -p 1
    Software popcnt cycles:                   115842638, count
    1811939328
    hardware_instruction popcnt cycles: 242737020, count 1811939328
    %
    

Local fix

  • N/A
    

Problem summary

  • USERS AFFECTED:
    Users who use the __popcnt8() builtin in their application and
    call it a number of times within a loop may be affected by this
    issue.
    
    PROBLEM DESCRIPTION:
    The popcnt hardware ops are slow on BG/Q.
    

Problem conclusion

  • The compiler should not use the popcnt hardware ops.
    It has been changed to use a software emulation instead, which
    improves performance.
    

Temporary fix

Comments

APAR Information

  • APAR number

    LI79792

  • Reported component name

    XL C/C++ FOR BG

  • Reported component ID

    5799AG100

  • Reported release

    C10

  • Status

    CLOSED PER

  • PE

    NoPE

  • HIPER

    NoHIPER

  • Special Attention

    NoSpecatt / Xsystem

  • Submitted date

    2017-12-21

  • Closed date

    2017-12-21

  • Last modified date

    2017-12-21

  • APAR is sysrouted FROM one or more of the following:

  • APAR is sysrouted TO one or more of the following:

Fix information

  • Fixed component name

    XL C/C++ FOR BG

  • Fixed component ID

    5799AG100

Applicable component levels

[{"Business Unit":{"code":"BU048","label":"IBM Software"},"Product":{"code":"SS2LWA","label":"XL C\/C++ for Blue Gene\/Q"},"Component":"","ARM Category":[],"Platform":[{"code":"PF025","label":"Platform Independent"}],"Version":"12.1","Edition":"","Line of Business":{"code":"LOB73","label":"Power TPS"}}]

Document Information

Modified date:
05 September 2024