woachk · September 4, 2022 19:41
diff --git a/S22.txt b/S22.txt
 $ LD_LIBRARY_PATH=/vendor/lib64 clpeak

 Platform: AMD Accelerated Parallel Processing
  Device: gfx1040
    Driver version  : Driver: 7cc74d2a Compiler: 1928c005 (Android)
    Compute units   : 3
    Clock frequency : 555 MHz

    Global memory bandwidth (GBPS)
      float   : 36.65
      float2  : 27.29
      float4  : 28.05
      float8  : 28.22
      float16 : 27.99

    Single-precision compute (GFLOPS)
      float   : 458.16
      float2  : 457.59
      float4  : 458.48
      float8  : 456.67
      float16 : 454.16

    Half-precision compute (GFLOPS)
      half   : 1747.01
      half2  : 1739.68
      half4  : 1737.45
      half8  : 1728.43
      half16 : 1707.71

    No double precision support! Skipped

    Integer compute (GIOPS)
      int   : 183.89
      int2  : 182.21
      int4  : 184.14
      int8  : 183.94
      int16 : 183.36

    Integer compute Fast 24bit (GIOPS)
      int   : 878.98
      int2  : 837.84
      int4  : 818.19
      int8  : 771.67
      int16 : 754.47

    Transfer bandwidth (GBPS)
      enqueueWriteBuffer              : 14.56
      enqueueReadBuffer               : 14.16
      enqueueWriteBuffer non-blocking : 14.16
      enqueueReadBuffer non-blocking  : 14.42
      enqueueMapBuffer(for read)      : 4239.01
        memcpy from mapped ptr        : 11.42
      enqueueUnmap(after write)       : 23241.16
        memcpy to mapped ptr          : 14.07

    Kernel launch latency : 654.66 us

 $ LD_LIBRARY_PATH=/vendor/lib64 clinfo
 Number of platforms                               1
  Platform Name                                   AMD Accelerated Parallel Processing
  Platform Vendor                                 Advanced Micro Devices, Inc.
  Platform Version                                OpenCL 2.0 AMD-APP (3213.0)
  Platform Profile                                FULL_PROFILE
  Platform Extensions                             cl_khr_icd cl_amd_event_callback
  Platform Extensions function suffix             AMD

  Platform Name                                   AMD Accelerated Parallel Processing
 Number of devices                                 1
  Device Name                                     gfx1040
  Device Vendor                                   Advanced Micro Devices, Inc.
  Device Vendor ID                                0x1002
  Device Version                                  OpenCL 2.0 AMD-APP (3213.0)
  Device Numeric Version                          0x800192 (2.0.402)
  Driver Version                                  Driver: 7cc74d2a Compiler: 1928c005
  Device OpenCL C Version                         OpenCL C 2.0
  Device Type                                     GPU
  Device Board Name (AMD)                         Samsung Xclipse 920
  Device PCI-e ID (AMD)                           0x73a0
  Device Topology (AMD)                           PCI-E, 0000:00:00.0
  Device Profile                                  FULL_PROFILE
  Device Available                                Yes
  Compiler Available                              Yes
  Linker Available                                Yes
  Max compute units                               3
  SIMD per compute unit (AMD)                     4
  SIMD width (AMD)                                32
  SIMD instruction width (AMD)                    1
  Max clock frequency                             555MHz
  Graphics IP (AMD)                               10.4
  Device Partition                                (core)
    Max number of sub-devices                     3
    Supported partition types                     None
    Supported affinity domains                    (n/a)
  Max work item dimensions                        3
  Max work item sizes                             1024x1024x1024
  Max work group size                             1024
  Preferred work group size (AMD)                 <printDeviceInfo:55: get CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD : error -30>
  Max work group size (AMD)                       <printDeviceInfo:56: get CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD : error -30>
  Preferred work group size multiple (kernel)     64
  Wavefront width (AMD)                           64
  Preferred / native vector sizes
    char                                                 4 / 4
    short                                                2 / 2
    int                                                  1 / 1
    long                                                 1 / 1
    half                                                 0 / 0        (cl_khr_fp16)
    float                                                1 / 1
    double                                               0 / 0        (n/a)
  Half-precision Floating-point support           (cl_khr_fp16)
    Denormals                                     No
    Infinity and NANs                             No
    Round to nearest                              No
    Round to zero                                 No
    Round to infinity                             No
    IEEE754-2008 fused multiply-add               No
    Support is emulated in software               No
  Single-precision Floating-point support         (core)
    Denormals                                     No
    Infinity and NANs                             Yes
    Round to nearest                              Yes
    Round to zero                                 Yes
    Round to infinity                             Yes
    IEEE754-2008 fused multiply-add               Yes
    Support is emulated in software               No
    Correctly-rounded divide and sqrt operations  No
  Double-precision Floating-point support         (n/a)
  Address bits                                    64, Little-Endian
  Global memory size                              4294967296 (4GiB)
  Global free memory (AMD)                        4187656 (3.994GiB) 4187656 (3.994GiB)
  Global memory channels (AMD)                    0
  Global memory banks per channel (AMD)           4
  Global memory bank width (AMD)                  256 bytes
  Error Correction support                        No
  Max memory allocation                           1073741824 (1024MiB)
  Unified memory for Host and Device              Yes
  Shared Virtual Memory (SVM) capabilities        (core)
    Coarse-grained buffer sharing                 Yes
    Fine-grained buffer sharing                   Yes
    Fine-grained system sharing                   No
    Atomics                                       No
  Minimum alignment for any data type             128 bytes
  Alignment of base address                       1024 bits (128 bytes)
  Preferred alignment for atomics
    SVM                                           0 bytes
    Global                                        0 bytes
    Local                                         0 bytes
  Max size for global variable                    1073741824 (1024MiB)
  Preferred total size of global vars             0
  Global Memory cache type                        Read/Write
  Global Memory cache size                        1048576 (1024KiB)
  Global Memory cache line size                   128 bytes
  Image support                                   Yes
    Max number of samplers per kernel             16
    Max size for 1D images from buffer            134217728 pixels
    Max 1D or 2D image array size                 2048 images
    Base address alignment for 2D image buffers   256 bytes
    Pitch alignment for 2D image buffers          256 pixels
    Max 2D image size                             16384x16384 pixels
    Max 3D image size                             2048x2048x2048 pixels
    Max number of read image args                 128
    Max number of write image args                64
    Max number of read/write image args           64
  Max number of pipe args                         16
  Max active pipe reservations                    16
  Max pipe packet size                            1073741824 (1024MiB)
  Local memory type                               Local
  Local memory size                               32768 (32KiB)
  Local memory size per CU (AMD)                  32768 (32KiB)
  Local memory banks (AMD)                        32
  Max number of constant args                     8
  Max constant buffer size                        1073741824 (1024MiB)
  Preferred constant buffer size (AMD)            16384 (16KiB)
  Max size of kernel argument                     1024
  Queue properties (on host)
    Out-of-order execution                        No
    Profiling                                     Yes
  Queue properties (on device)
    Out-of-order execution                        Yes
    Profiling                                     Yes
    Preferred size                                262144 (256KiB)
    Max size                                      8388608 (8MiB)
  Max queues on device                            1
  Max events on device                            1024
  Prefer user sync for interop                    Yes
  Profiling timer resolution                      1ns
  Profiling timer offset since Epoch (AMD)        1662014362028068533ns (Wed Aug 31 23:39:22 2022)
  Execution capabilities
    Run OpenCL kernels                            Yes
    Run native kernels                            No
    Thread trace supported (AMD)                  Yes
    Number of async queues (AMD)                  4
    Max real-time compute queues (AMD)            1
    Max real-time compute units (AMD)             0
    ILs with version                              (n/a)
  printf() buffer size                            4194304 (4MiB)
  Built-in kernels                                (n/a)
  Built-in kernels with version                   (n/a)
  Device Extensions                               cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_amd_device_attribute_query cl_khr_image2d_from_buffer cl_khr_subgroups cl_khr_depth_images cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_arm_import_memory cl_arm_import_memory_dma_buf cl_arm_import_memory_protected cl_khr_extended_versioning
  Device Extensions with Version                  cl_khr_global_int32_base_atomics                                 0x400000 (1.0.0)
                                                  cl_khr_global_int32_extended_atomics                             0x400000 (1.0.0)
                                                  cl_khr_local_int32_base_atomics                                  0x400000 (1.0.0)
                                                  cl_khr_local_int32_extended_atomics                              0x400000 (1.0.0)
                                                  cl_khr_int64_base_atomics                                        0x400000 (1.0.0)
                                                  cl_khr_int64_extended_atomics                                    0x400000 (1.0.0)
                                                  cl_khr_3d_image_writes                                           0x400000 (1.0.0)
                                                  cl_khr_byte_addressable_store                                    0x400000 (1.0.0)
                                                  cl_khr_fp16                                                      0x400000 (1.0.0)
                                                  cl_amd_device_attribute_query                                    0x400000 (1.0.0)
                                                  cl_khr_image2d_from_buffer                                       0x400000 (1.0.0)
                                                  cl_khr_subgroups                                                 0x400000 (1.0.0)
                                                  cl_khr_depth_images                                              0x400000 (1.0.0)
                                                  cl_khr_mipmap_image                                              0x400000 (1.0.0)
                                                  cl_khr_mipmap_image_writes                                       0x400000 (1.0.0)
                                                  cl_arm_import_memory                                             0x40b000 (1.11.0)
                                                  cl_arm_import_memory_dma_buf                                     0x400000 (1.0.0)
                                                  cl_arm_import_memory_protected                                   0x400000 (1.0.0)
                                                  cl_khr_extended_versioning                                       0x400000 (1.0.0)

 NULL platform behavior
  clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...)  No platform
  clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...)   No platform
  clCreateContext(NULL, ...) [default]            No platform
  clCreateContext(NULL, ...) [other]              Success [AMD]
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_DEFAULT)  Success (1)
    Platform Name                                 AMD Accelerated Parallel Processing
    Device Name                                   gfx1040
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU)  No devices found in platform
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU)  Success (1)
    Platform Name                                 AMD Accelerated Parallel Processing
    Device Name                                   gfx1040
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR)  No devices found in platform
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM)  No devices found in platform
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL)  Success (1)
    Platform Name                                 AMD Accelerated Parallel Processing
    Device Name                                   gfx1040
	$ LD_LIBRARY_PATH=/vendor/lib64 clpeak

	Platform: AMD Accelerated Parallel Processing
	Device: gfx1040
	Driver version : Driver: 7cc74d2a Compiler: 1928c005 (Android)
	Compute units : 3
	Clock frequency : 555 MHz

	Global memory bandwidth (GBPS)
	float : 36.65
	float2 : 27.29
	float4 : 28.05
	float8 : 28.22
	float16 : 27.99

	Single-precision compute (GFLOPS)
	float : 458.16
	float2 : 457.59
	float4 : 458.48
	float8 : 456.67
	float16 : 454.16

	Half-precision compute (GFLOPS)
	half : 1747.01
	half2 : 1739.68
	half4 : 1737.45
	half8 : 1728.43
	half16 : 1707.71

	No double precision support! Skipped

	Integer compute (GIOPS)
	int : 183.89
	int2 : 182.21
	int4 : 184.14
	int8 : 183.94
	int16 : 183.36

	Integer compute Fast 24bit (GIOPS)
	int : 878.98
	int2 : 837.84
	int4 : 818.19
	int8 : 771.67
	int16 : 754.47

	Transfer bandwidth (GBPS)
	enqueueWriteBuffer : 14.56
	enqueueReadBuffer : 14.16
	enqueueWriteBuffer non-blocking : 14.16
	enqueueReadBuffer non-blocking : 14.42
	enqueueMapBuffer(for read) : 4239.01
	memcpy from mapped ptr : 11.42
	enqueueUnmap(after write) : 23241.16
	memcpy to mapped ptr : 14.07

	Kernel launch latency : 654.66 us

	$ LD_LIBRARY_PATH=/vendor/lib64 clinfo
	Number of platforms 1
	Platform Name AMD Accelerated Parallel Processing
	Platform Vendor Advanced Micro Devices, Inc.
	Platform Version OpenCL 2.0 AMD-APP (3213.0)
	Platform Profile FULL_PROFILE
	Platform Extensions cl_khr_icd cl_amd_event_callback
	Platform Extensions function suffix AMD

	Platform Name AMD Accelerated Parallel Processing
	Number of devices 1
	Device Name gfx1040
	Device Vendor Advanced Micro Devices, Inc.
	Device Vendor ID 0x1002
	Device Version OpenCL 2.0 AMD-APP (3213.0)
	Device Numeric Version 0x800192 (2.0.402)
	Driver Version Driver: 7cc74d2a Compiler: 1928c005
	Device OpenCL C Version OpenCL C 2.0
	Device Type GPU
	Device Board Name (AMD) Samsung Xclipse 920
	Device PCI-e ID (AMD) 0x73a0
	Device Topology (AMD) PCI-E, 0000:00:00.0
	Device Profile FULL_PROFILE
	Device Available Yes
	Compiler Available Yes
	Linker Available Yes
	Max compute units 3
	SIMD per compute unit (AMD) 4
	SIMD width (AMD) 32
	SIMD instruction width (AMD) 1
	Max clock frequency 555MHz
	Graphics IP (AMD) 10.4
	Device Partition (core)
	Max number of sub-devices 3
	Supported partition types None
	Supported affinity domains (n/a)
	Max work item dimensions 3
	Max work item sizes 1024x1024x1024
	Max work group size 1024
	Preferred work group size (AMD) <printDeviceInfo:55: get CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD : error -30>
	Max work group size (AMD) <printDeviceInfo:56: get CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD : error -30>
	Preferred work group size multiple (kernel) 64
	Wavefront width (AMD) 64
	Preferred / native vector sizes
	char 4 / 4
	short 2 / 2
	int 1 / 1
	long 1 / 1
	half 0 / 0 (cl_khr_fp16)
	float 1 / 1
	double 0 / 0 (n/a)
	Half-precision Floating-point support (cl_khr_fp16)
	Denormals No
	Infinity and NANs No
	Round to nearest No
	Round to zero No
	Round to infinity No
	IEEE754-2008 fused multiply-add No
	Support is emulated in software No
	Single-precision Floating-point support (core)
	Denormals No
	Infinity and NANs Yes
	Round to nearest Yes
	Round to zero Yes
	Round to infinity Yes
	IEEE754-2008 fused multiply-add Yes
	Support is emulated in software No
	Correctly-rounded divide and sqrt operations No
	Double-precision Floating-point support (n/a)
	Address bits 64, Little-Endian
	Global memory size 4294967296 (4GiB)
	Global free memory (AMD) 4187656 (3.994GiB) 4187656 (3.994GiB)
	Global memory channels (AMD) 0
	Global memory banks per channel (AMD) 4
	Global memory bank width (AMD) 256 bytes
	Error Correction support No
	Max memory allocation 1073741824 (1024MiB)
	Unified memory for Host and Device Yes
	Shared Virtual Memory (SVM) capabilities (core)
	Coarse-grained buffer sharing Yes
	Fine-grained buffer sharing Yes
	Fine-grained system sharing No
	Atomics No
	Minimum alignment for any data type 128 bytes
	Alignment of base address 1024 bits (128 bytes)
	Preferred alignment for atomics
	SVM 0 bytes
	Global 0 bytes
	Local 0 bytes
	Max size for global variable 1073741824 (1024MiB)
	Preferred total size of global vars 0
	Global Memory cache type Read/Write
	Global Memory cache size 1048576 (1024KiB)
	Global Memory cache line size 128 bytes
	Image support Yes
	Max number of samplers per kernel 16
	Max size for 1D images from buffer 134217728 pixels
	Max 1D or 2D image array size 2048 images
	Base address alignment for 2D image buffers 256 bytes
	Pitch alignment for 2D image buffers 256 pixels
	Max 2D image size 16384x16384 pixels
	Max 3D image size 2048x2048x2048 pixels
	Max number of read image args 128
	Max number of write image args 64
	Max number of read/write image args 64
	Max number of pipe args 16
	Max active pipe reservations 16
	Max pipe packet size 1073741824 (1024MiB)
	Local memory type Local
	Local memory size 32768 (32KiB)
	Local memory size per CU (AMD) 32768 (32KiB)
	Local memory banks (AMD) 32
	Max number of constant args 8
	Max constant buffer size 1073741824 (1024MiB)
	Preferred constant buffer size (AMD) 16384 (16KiB)
	Max size of kernel argument 1024
	Queue properties (on host)
	Out-of-order execution No
	Profiling Yes
	Queue properties (on device)
	Out-of-order execution Yes
	Profiling Yes
	Preferred size 262144 (256KiB)
	Max size 8388608 (8MiB)
	Max queues on device 1
	Max events on device 1024
	Prefer user sync for interop Yes
	Profiling timer resolution 1ns
	Profiling timer offset since Epoch (AMD) 1662014362028068533ns (Wed Aug 31 23:39:22 2022)
	Execution capabilities
	Run OpenCL kernels Yes
	Run native kernels No
	Thread trace supported (AMD) Yes
	Number of async queues (AMD) 4
	Max real-time compute queues (AMD) 1
	Max real-time compute units (AMD) 0
	ILs with version (n/a)
	printf() buffer size 4194304 (4MiB)
	Built-in kernels (n/a)
	Built-in kernels with version (n/a)
	Device Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_fp16 cl_amd_device_attribute_query cl_khr_image2d_from_buffer cl_khr_subgroups cl_khr_depth_images cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_arm_import_memory cl_arm_import_memory_dma_buf cl_arm_import_memory_protected cl_khr_extended_versioning
	Device Extensions with Version cl_khr_global_int32_base_atomics 0x400000 (1.0.0)
	cl_khr_global_int32_extended_atomics 0x400000 (1.0.0)
	cl_khr_local_int32_base_atomics 0x400000 (1.0.0)
	cl_khr_local_int32_extended_atomics 0x400000 (1.0.0)
	cl_khr_int64_base_atomics 0x400000 (1.0.0)
	cl_khr_int64_extended_atomics 0x400000 (1.0.0)
	cl_khr_3d_image_writes 0x400000 (1.0.0)
	cl_khr_byte_addressable_store 0x400000 (1.0.0)
	cl_khr_fp16 0x400000 (1.0.0)
	cl_amd_device_attribute_query 0x400000 (1.0.0)
	cl_khr_image2d_from_buffer 0x400000 (1.0.0)
	cl_khr_subgroups 0x400000 (1.0.0)
	cl_khr_depth_images 0x400000 (1.0.0)
	cl_khr_mipmap_image 0x400000 (1.0.0)
	cl_khr_mipmap_image_writes 0x400000 (1.0.0)
	cl_arm_import_memory 0x40b000 (1.11.0)
	cl_arm_import_memory_dma_buf 0x400000 (1.0.0)
	cl_arm_import_memory_protected 0x400000 (1.0.0)
	cl_khr_extended_versioning 0x400000 (1.0.0)

	NULL platform behavior
	clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform
	clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform
	clCreateContext(NULL, ...) [default] No platform
	clCreateContext(NULL, ...) [other] Success [AMD]
	clCreateContextFromType(NULL, CL_DEVICE_TYPE_DEFAULT) Success (1)
	Platform Name AMD Accelerated Parallel Processing
	Device Name gfx1040
	clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform
	clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1)
	Platform Name AMD Accelerated Parallel Processing
	Device Name gfx1040
	clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform
	clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform
	clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1)
	Platform Name AMD Accelerated Parallel Processing
	Device Name gfx1040