前言 向量化简单地说就是使用SIMD指令, 用一条指令同时处理多个数据. MIC的每个核心都有一个512位宽的向量处理单元(VPU), 并提供32个长度为512位的向量寄存器, 每个向量寄存器可以容纳16个32位或者8个64位的数据. 这里主要记录一下MIC向量化的使用方式以及一些向量指令的作用.
数据类型 MIC中使用下面的数据类型作为执行向量函数的操作数
下面是它们各自的作用:
- __m512: 处理单精度向量(float32 vector)
- __m512d: 处理双精度向量(float64 vector)
- __m512i: 处理整型向量, 包括32位和64位整型(int32/int64)

上面的数据类型直接映射到向量寄存器(vector registers)上. 除此之外还有一种数据类型__mmask16, 它是一个unsigned short类型, 用来保存掩码寄存器(mask register)的值. 我们可以使用 Load Intrinsics
(为向量赋值) 和 Store Intrinsics
(保存向量的值) 实现向量的存取. 下面是一个示例1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 void test_load_store () { int32_t *arr_int32; int64_t *arr_int64; int i, n = 32 ; arr_int32 = _mm_malloc(sizeof (int32_t ) * n, 64 ); arr_int64 = _mm_malloc(sizeof (int64_t ) * n, 64 ); for (i = 0 ; i < n; i++) { arr_int32[i] = i; arr_int64[i] = i + n; } #pragma offload target(mic) inout(arr_int32:length(n)) inout(arr_int64:length(n)) { __m512i m_32, m_64; m_32 = _mm512_load_epi32(arr_int32); m_32 = _mm512_load_epi32(arr_int32 + 16 ); m_64 = _mm512_load_epi64(arr_int64); m_64 = _mm512_load_epi64(arr_int64 + 8 ); _mm512_store_epi32(arr_int32, m_32); _mm512_store_epi32(arr_int32 + 16 , m_32); _mm512_store_epi64(arr_int64, m_64); _mm512_store_epi64(arr_int64 + 8 , m_64); } _mm_free(arr_int32); _mm_free(arr_int64); }
向量化函数(Intrinsics) 这里主要记录一些编译器提供的向量化函数, 完整的函数集可以在这里 或者这里 查询
算术运算 MIC中提供了加,减, 乘 三种算术运算函数, 这里以32位整型的加法为例:
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <immintrin.h>

/*
 * mic_add - element-wise addition of two 16-element uint32_t arrays.
 *
 * Fills arr_a with 0..15 and arr_b with 16..31, performs one
 * _mm512_add_epi32 inside an offload region, then prints every operand
 * pair together with its sum. Takes no arguments and returns nothing.
 * If any allocation fails, an error is written to stderr and the
 * computation is skipped.
 */
void mic_add(void)
{
    uint32_t *arr_a, *arr_b, *arr_c;
    int i = 0, n = 16;

    /* All three buffers are requested with 64-byte alignment. */
    arr_a = _mm_malloc(sizeof(uint32_t) * n, 64);
    arr_b = _mm_malloc(sizeof(uint32_t) * n, 64);
    arr_c = _mm_malloc(sizeof(uint32_t) * n, 64);
    if (arr_a == NULL || arr_b == NULL || arr_c == NULL) {
        fprintf(stderr, "mic_add: _mm_malloc failed\n");
        goto cleanup;
    }

    for (i = 0; i < n; i++) {
        arr_a[i] = i;
        arr_b[i] = n + i;
    }

    /* arr_a and arr_b are inputs of the offload region, arr_c is its output. */
#pragma offload target(mic) in(arr_a:length(n)) in(arr_b:length(n)) out(arr_c:length(n))
    {
        __m512i m_a, m_b, m_c;

        m_a = _mm512_load_epi32(arr_a);
        m_b = _mm512_load_epi32(arr_b);

        m_c = _mm512_add_epi32(m_a, m_b);

        _mm512_store_epi32(arr_c, m_c);
    }

    /* PRIu32 matches the uint32_t arguments; the index i stays a plain int. */
    for (i = 0; i < n; i++) {
        printf("arr_a[%2d] is: %2" PRIu32 " \t arr_b[%2d] is: %2" PRIu32
               " \t arr_c[%2d] is : %2" PRIu32 "\n",
               i, arr_a[i], i, arr_b[i], i, arr_c[i]);
    }

cleanup:
    /* NULL pointers are skipped explicitly rather than relying on _mm_free. */
    if (arr_a != NULL) {
        _mm_free(arr_a);
    }
    if (arr_b != NULL) {
        _mm_free(arr_b);
    }
    if (arr_c != NULL) {
        _mm_free(arr_c);
    }
}

int main(void)
{
    mic_add();
    return 0;
}
输出结果为:1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 arr_a[ 0] is: 0 arr_b[ 0] is: 16 arr_c[ 0] is : 16 arr_a[ 1] is: 1 arr_b[ 1] is: 17 arr_c[ 1] is : 18 arr_a[ 2] is: 2 arr_b[ 2] is: 18 arr_c[ 2] is : 20 arr_a[ 3] is: 3 arr_b[ 3] is: 19 arr_c[ 3] is : 22 arr_a[ 4] is: 4 arr_b[ 4] is: 20 arr_c[ 4] is : 24 arr_a[ 5] is: 5 arr_b[ 5] is: 21 arr_c[ 5] is : 26 arr_a[ 6] is: 6 arr_b[ 6] is: 22 arr_c[ 6] is : 28 arr_a[ 7] is: 7 arr_b[ 7] is: 23 arr_c[ 7] is : 30 arr_a[ 8] is: 8 arr_b[ 8] is: 24 arr_c[ 8] is : 32 arr_a[ 9] is: 9 arr_b[ 9] is: 25 arr_c[ 9] is : 34 arr_a[10] is: 10 arr_b[10] is: 26 arr_c[10] is : 36 arr_a[11] is: 11 arr_b[11] is: 27 arr_c[11] is : 38 arr_a[12] is: 12 arr_b[12] is: 28 arr_c[12] is : 40 arr_a[13] is: 13 arr_b[13] is: 29 arr_c[13] is : 42 arr_a[14] is: 14 arr_b[14] is: 30 arr_c[14] is : 44 arr_a[15] is: 15 arr_b[15] is: 31 arr_c[15] is : 46
With Mask MIC提供的向量函数一般有两种形式
1 2 3 4 5 extern _m512i __cdecl _mm512_add_epi32(_m512i v2, _m512i v3);extern _m512i __cdecl _mm512_mask_add_epi32(_m512i v1_old, __mmask16 k1, _m512i v2, _m512i v3);
一种是带Mask的, 一种是不带Mask的. 带Mask的多了两个参数: v1_old
和k1
, 其中k1
是__mmask16
类型的数据, 在上面我们知道__mmask
类型就是unsigned short
类型, 长度为16位. 关于带mask函数的解释: 将v1
的16位分别对应到_m512i
的16个整型上, 如果k1
某个位是1, 则将v2和v3中与该位对应的整型相加, 作为结果值, 如果k1
某个位为0, 就使用v1_old
向量中对应位的整型作为结果值. 例如如果k1
的第一位为1, 那么就将v2
的第一个整数和v3
的第一个整数相加, 作为结果向量的第一个整型的值. 如果k1
的第一位是0, 就将v1_old
向量中的第一个整型的值作为结果向量中第一个整型的值. 好吧, 还是看个例子吧.1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 void mic_mask_add () { uint32_t *arr_a, *arr_b, *arr_c, *arr_old; int i = 0 , n = 16 ; arr_a = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_b = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_c = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_old = _mm_malloc(sizeof (uint32_t ) * n, 64 ); for (i = 0 ; i < n; i++) { arr_a[i] = i; arr_b[i] = n + i; arr_old[i] = 10000 ; } #pragma offload target(mic) in(arr_a:length(n)) in(arr_b:length(n)) in(arr_old:length(n)) out(arr_c:length(n)) { __m512i m_a, m_b, m_c, m_old; __mmask16 k1 = 11 ; m_a = _mm512_load_epi32(arr_a); m_b = _mm512_load_epi32(arr_b); m_old = _mm512_load_epi32(arr_old); m_c = _mm512_mask_add_epi32(m_old, k1, m_a, m_b); _mm512_store_epi32(arr_c, m_c); } for (i = 0 ; i < n; i++) { printf ("arr_a[%2d] is: %2d \t arr_b[%2d] is: %2d \t arr_old[%2d] is: %d \t arr_c[%2d] is : %2d\n" , i, arr_a[i], i, arr_b[i], i, arr_old[i], i, arr_c[i]); } _mm_free(arr_a); _mm_free(arr_b); _mm_free(arr_c); _mm_free(arr_old); }
运行结果为:1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 arr_a[ 0] is: 0 arr_b[ 0] is: 16 arr_old[ 0] is: 10000 arr_c[ 0] is : 16 arr_a[ 1] is: 1 arr_b[ 1] is: 17 arr_old[ 1] is: 10000 arr_c[ 1] is : 18 arr_a[ 2] is: 2 arr_b[ 2] is: 18 arr_old[ 2] is: 10000 arr_c[ 2] is : 10000 arr_a[ 3] is: 3 arr_b[ 3] is: 19 arr_old[ 3] is: 10000 arr_c[ 3] is : 22 arr_a[ 4] is: 4 arr_b[ 4] is: 20 arr_old[ 4] is: 10000 arr_c[ 4] is : 10000 arr_a[ 5] is: 5 arr_b[ 5] is: 21 arr_old[ 5] is: 10000 arr_c[ 5] is : 10000 arr_a[ 6] is: 6 arr_b[ 6] is: 22 arr_old[ 6] is: 10000 arr_c[ 6] is : 10000 arr_a[ 7] is: 7 arr_b[ 7] is: 23 arr_old[ 7] is: 10000 arr_c[ 7] is : 10000 arr_a[ 8] is: 8 arr_b[ 8] is: 24 arr_old[ 8] is: 10000 arr_c[ 8] is : 10000 arr_a[ 9] is: 9 arr_b[ 9] is: 25 arr_old[ 9] is: 10000 arr_c[ 9] is : 10000 arr_a[10] is: 10 arr_b[10] is: 26 arr_old[10] is: 10000 arr_c[10] is : 10000 arr_a[11] is: 11 arr_b[11] is: 27 arr_old[11] is: 10000 arr_c[11] is : 10000 arr_a[12] is: 12 arr_b[12] is: 28 arr_old[12] is: 10000 arr_c[12] is : 10000 arr_a[13] is: 13 arr_b[13] is: 29 arr_old[13] is: 10000 arr_c[13] is : 10000 arr_a[14] is: 14 arr_b[14] is: 30 arr_old[14] is: 10000 arr_c[14] is : 10000 arr_a[15] is: 15 arr_b[15] is: 31 arr_old[15] is: 10000 arr_c[15] is : 10000
Bitwise运算 MIC中提供了3种Bitwise运算函数- and
or
xor
, 其中取反操作可以通过与所有位全为1的数异或来实现, 下面是and
操作的一个例子
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 void mic_and () { uint32_t *arr_a, *arr_b, *arr_c; int i = 0 , n = 16 ; arr_a = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_b = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_c = _mm_malloc(sizeof (uint32_t ) * n, 64 ); for (i = 0 ; i < n; i++) { arr_a[i] = i ; arr_b[i] = n + i; } #pragma offload target(mic) in(arr_a:length(n)) in(arr_b:length(n)) out(arr_c:length(n)) { __m512i m_a, m_b, m_c; m_a = _mm512_load_epi32(arr_a); m_b = _mm512_load_epi32(arr_b); m_c = _mm512_and_epi32(m_a, m_b); _mm512_store_epi32(arr_c, m_c); } for (i = 0 ; i < n; i++) { print_binary (arr_a[i], 8 ); printf ( " & " ); print_binary(arr_b[i], 8 ); printf (" = " ); print_binary(arr_c[i], 8 ); printf ("\n" ); } _mm_free(arr_a); _mm_free(arr_b); _mm_free(arr_c); }
其中print_binary
是一个打印二进制的函数, 这里只打印了后8位1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 void print_binary (uint64_t t, int bit_len) { short buffer[bit_len]; int i; for (i = 0 ; i < bit_len; i++) { buffer[i] = 0 ; } for (i = 0 ; i < bit_len; i++) { if (t == 0 ) break ; if (t % 2 == 0 ) { buffer[i] = 0 ; } else { buffer[i] = 1 ; } t = t / 2 ; } for (i = bit_len - 1 ; i >= 0 ; i--) { printf ("%hd" , buffer[i]); } }
下面是一个取反的示例1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 void mic_not () { uint32_t *arr_a, *arr_c; int i = 0 , n = 16 ; arr_a = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_c = _mm_malloc(sizeof (uint32_t ) * n, 64 ); for (i = 0 ; i < n; i++) { arr_a[i] = i ; } #pragma offload target(mic) in(arr_a:length(n)) out(arr_c:length(n)) { __m512i m_a, m_b, m_c; int32_t all_one = 0xffffffff ; m_b = _mm512_set1_epi32(all_one); m_a = _mm512_load_epi32(arr_a); m_c = _mm512_xor_epi32(m_a, m_b); _mm512_store_epi32(arr_c, m_c); } for (i = 0 ; i < n; i++) { printf ("~ " ); print_binary (arr_a[i], 8 ); printf ( " = " ); print_binary(arr_c[i], 8 ); printf ("\n" ); } _mm_free(arr_a); _mm_free(arr_c); }
下面是运行结果1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ~ 00000000 = 11111111 ~ 00000001 = 11111110 ~ 00000010 = 11111101 ~ 00000011 = 11111100 ~ 00000100 = 11111011 ~ 00000101 = 11111010 ~ 00000110 = 11111001 ~ 00000111 = 11111000 ~ 00001000 = 11110111 ~ 00001001 = 11110110 ~ 00001010 = 11110101 ~ 00001011 = 11110100 ~ 00001100 = 11110011 ~ 00001101 = 11110010 ~ 00001110 = 11110001 ~ 00001111 = 11110000
移位操作 移位操作分为算术移位和逻辑移位, 逻辑左移和算术左移的规则是一样的, 所以两者共用同一个左移函数, 而逻辑右移和算术右移不同, 逻辑右移是一直补0, 而算术右移要看符号位, 符号位为0则补0, 符号位为1, 则补1. 同时移位操作有两种形式, 一种给定一个常数, 向量中的每个元素都移该常数位, 一种是给定一个向量, 向量中的每个元素移给定向量中对应数值的位. 好吧下面还是看例子吧.左移: 给定一个常数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 void mic_lshift () { uint32_t *arr_a, *arr_c; int i = 0 , n = 16 ; arr_a = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_c = _mm_malloc(sizeof (uint32_t ) * n, 64 ); for (i = 0 ; i < n; i++) { arr_a[i] = i ; } #pragma offload target(mic) in(arr_a:length(n)) out(arr_c:length(n)) { __m512i m_a, m_c; m_a = _mm512_load_epi32(arr_a); m_c = _mm512_slli_epi32 (m_a, 1 ); _mm512_store_epi32(arr_c, m_c); } for (i = 0 ; i < n; i++) { print_binary (arr_a[i], 8 ); printf ( " \t " ); print_binary(arr_c[i], 8 ); printf ("\n" ); } _mm_free(arr_a); _mm_free(arr_c); }
左移:给定一个向量 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 void mic_lshift_v () { uint32_t *arr_a, *arr_c; int i = 0 , n = 16 ; arr_a = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_c = _mm_malloc(sizeof (uint32_t ) * n, 64 ); for (i = 0 ; i < n; i++) { arr_a[i] = i+1 ; } #pragma offload target(mic) in(arr_a:length(n)) out(arr_c:length(n)) { __m512i m_a, m_b, m_c; m_b = _mm512_set_epi32(1 ,2 ,3 ,4 ,1 ,2 ,3 ,4 ,1 ,2 ,3 ,4 ,1 ,2 ,3 ,4 ); m_a = _mm512_load_epi32(arr_a); m_c = _mm512_sllv_epi32 (m_a, m_b); _mm512_store_epi32(arr_c, m_c); } for (i = 0 ; i < n; i++) { print_binary (arr_a[i], 8 ); printf ( " \t " ); print_binary(arr_c[i], 8 ); printf ("\n" ); } _mm_free(arr_a); _mm_free(arr_c); }
执行结果为:1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 00000001 00010000 00000010 00010000 00000011 00001100 00000100 00001000 00000101 01010000 00000110 00110000 00000111 00011100 00001000 00010000 00001001 10010000 00001010 01010000 00001011 00101100 00001100 00011000 00001101 11010000 00001110 01110000 00001111 00111100 00010000 00100000
算术右移 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 void mic_arshift () { uint32_t *arr_a, *arr_b, *arr_c, *arr_d; int i = 0 , n = 16 ; arr_a = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_b = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_c = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_d = _mm_malloc(sizeof (uint32_t ) * n, 64 ); uint32_t high_one = 1 << 31 ; for (i = 0 ; i < n; i++) { arr_a[i] = i+1 ; arr_b[i] = high_one | (i + 1 ); } #pragma offload target(mic) in(arr_a:length(n)) in(arr_b:length(n)) out(arr_c:length(n)) out(arr_d:length(n)) { __m512i m_a,m_b, m_c, m_d; m_a = _mm512_load_epi32(arr_a); m_c = _mm512_srai_epi32 (m_a, 2 ); _mm512_store_epi32(arr_c, m_c); m_b = _mm512_load_epi32(arr_b); m_d = _mm512_srai_epi32(m_b, 2 ); _mm512_store_epi32(arr_d, m_d); } printf ("符号位为0: \n" ); for (i = 0 ; i < n; i++) { print_binary (arr_a[i],32 ); printf ( " \t " ); print_binary(arr_c[i], 32 ); printf ("\n" ); } printf ("符号位为1: \n" ); for (i = 0 ; i < n; i++) { print_binary (arr_b[i],32 ); printf ( " \t " ); print_binary(arr_d[i], 32 ); printf ("\n" ); } _mm_free(arr_a); _mm_free(arr_b); _mm_free(arr_c); _mm_free(arr_d); }
执行结果为:1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 符号位为0: 00000000000000000000000000000001 00000000000000000000000000000000 00000000000000000000000000000010 00000000000000000000000000000000 00000000000000000000000000000011 00000000000000000000000000000000 00000000000000000000000000000100 00000000000000000000000000000001 00000000000000000000000000000101 00000000000000000000000000000001 00000000000000000000000000000110 00000000000000000000000000000001 00000000000000000000000000000111 00000000000000000000000000000001 00000000000000000000000000001000 00000000000000000000000000000010 00000000000000000000000000001001 00000000000000000000000000000010 00000000000000000000000000001010 00000000000000000000000000000010 00000000000000000000000000001011 00000000000000000000000000000010 00000000000000000000000000001100 00000000000000000000000000000011 00000000000000000000000000001101 00000000000000000000000000000011 00000000000000000000000000001110 00000000000000000000000000000011 00000000000000000000000000001111 00000000000000000000000000000011 00000000000000000000000000010000 00000000000000000000000000000100 符号位为1: 10000000000000000000000000000001 11100000000000000000000000000000 10000000000000000000000000000010 11100000000000000000000000000000 10000000000000000000000000000011 11100000000000000000000000000000 10000000000000000000000000000100 11100000000000000000000000000001 10000000000000000000000000000101 11100000000000000000000000000001 10000000000000000000000000000110 11100000000000000000000000000001 10000000000000000000000000000111 11100000000000000000000000000001 10000000000000000000000000001000 11100000000000000000000000000010 10000000000000000000000000001001 11100000000000000000000000000010 10000000000000000000000000001010 11100000000000000000000000000010 10000000000000000000000000001011 11100000000000000000000000000010 10000000000000000000000000001100 11100000000000000000000000000011 10000000000000000000000000001101 
11100000000000000000000000000011 10000000000000000000000000001110 11100000000000000000000000000011 10000000000000000000000000001111 11100000000000000000000000000011 10000000000000000000000000010000 11100000000000000000000000000100
_mm512_alignr_epi32 函数原型为:
1 extern __m512i __cdecl _mm512_alignr_epi32(__m512i v2, __m512i v3, const int count);
该函数的作用就是将v2和v3拼接起来, v2在前, v3在后, 然后循环左移count个元素, 然后取最右侧的16个元素, 下面看个例子1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 void mic_alignr () { uint32_t *arr_a, *arr_b, *arr_c, *arr_d; int i = 0 , n = 16 ; arr_a = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_b = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_c = _mm_malloc(sizeof (uint32_t ) * n, 64 ); arr_d = _mm_malloc(sizeof (uint32_t ) * n, 64 ); for (i = 0 ; i < n; i++) { arr_a[i] = i+1 ; arr_b[i] = n + i + 1 ; } #pragma offload target(mic) in(arr_a:length(n)) in(arr_b:length(n)) out(arr_c:length(n)) out(arr_d:length(n)) { __m512i m_a,m_b, m_c, m_d; m_a = _mm512_load_epi32(arr_a); m_b = _mm512_load_epi32(arr_b); m_c = _mm512_alignr_epi32 (m_a, m_b, 3 ); _mm512_store_epi32(arr_c, m_c); m_d = _mm512_alignr_epi32(m_a, m_b, 8 ); _mm512_store_epi32(arr_d, m_d); } printf ("arr_a: " ); for (i = 0 ; i < n; i++) { printf ("%2u " , arr_a[i]); } printf (" \narr_b: " ); for (i = 0 ; i < n; i++) { printf ("%2u " , arr_b[i]); } printf ("\n\n" ); printf ("count = 3 arr_c: " ); for (i = 0 ; i < n; i++) { printf ("%2u " , arr_c[i]); } printf ("\n" ); printf ("count = 8 arr_c: " ); for (i = 0 ; i < n; i++) { printf ("%2u " , arr_d[i]); } printf ("\n" ); _mm_free(arr_a); _mm_free(arr_b); _mm_free(arr_c); _mm_free(arr_d); }
执行结果为:1 2 3 4 5 arr_a: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 arr_b: 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 count = 3 arr_c: 20 21 22 23 24 25 26 27 28 29 30 31 32 1 2 3 count = 8 arr_c: 25 26 27 28 29 30 31 32 1 2 3 4 5 6 7 8