Commit c0c02bf0 authored by Charles Bouillaguet's avatar Charles Bouillaguet
Browse files

lanczos update

parent 3153ddae
......@@ -24,8 +24,10 @@
typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
#define BLOCKING_FACTOR 256
#define BLOCKING_FACTOR 128
static const int n = BLOCKING_FACTOR;
char *matrix_filename;
......@@ -276,7 +278,7 @@ void matmul_CpABt_64(u64 * C, u64 const * A, u64 const * Bt, int stride)
u64 x = A[i * stride] & Bt[j * stride];
tmp[j] = (x ^ (x >> 32)) & M1_LO;
}
for (int j = 0; j < 32; j++) {
for (int j = 0; j < 32; j++) { // in each 64-bit word, the top 32-bits are zero
u64 x = tmp[j] ^ (tmp[j + 32] << 32);
tmp[j] = (x ^ (x >> 16)) & M2_LO;
}
......@@ -381,6 +383,11 @@ bool vec_is_zero(vec x)
return x == 0;
}
bool vec_eq(vec x, vec y)
{
return x == y;
}
vec zero()
{
return 0;
......@@ -395,41 +402,152 @@ void randomize(vec * x)
#if BLOCKING_FACTOR == 128
typedef u64 u64x2 __attribute__ ((vector_size(16)));
typedef u32 u32x4 __attribute__ ((vector_size(16)));
typedef u16 u16x8 __attribute__ ((vector_size(16)));
typedef u8 u8x16 __attribute__ ((vector_size(16)));
typedef u64 vec __attribute__ ((vector_size(16)));
void transpose_128(u64 * T, u64 const * M, int stride)
const u64x2 mask5_hi = { 0xffffffff00000000, 0xffffffff00000000 };
const u64x2 mask5_lo = { 0x00000000ffffffff, 0x00000000ffffffff };
const u64x2 mask4_hi = { 0xffff0000ffff0000, 0xffff0000ffff0000 };
const u64x2 mask4_lo = { 0x0000ffff0000ffff, 0x0000ffff0000ffff };
const u64x2 mask3_hi = { 0xff00ff00ff00ff00, 0xff00ff00ff00ff00 };
const u64x2 mask3_lo = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
const u64x2 mask2_hi = { 0xf0f0f0f0f0f0f0f0, 0xf0f0f0f0f0f0f0f0 };
const u64x2 mask2_lo = { 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f };
const u64x2 mask1_hi = { 0xcccccccccccccccc, 0xcccccccccccccccc };
const u64x2 mask1_lo = { 0x3333333333333333, 0x3333333333333333 };
const u64x2 mask0_hi = { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa };
const u64x2 mask0_lo = { 0x5555555555555555, 0x5555555555555555 };
void transpose(vec * T, vec const * M)
{
int h = 1;
int v = 64 * stride;
transpose_64(T, M, stride);
transpose_64(T + v, M + h, stride);
transpose_64(T + h, M + v, stride);
transpose_64(T + v + h, M + v + h, stride);
}
// maybe replace by simpler ops on 64-bit ints
u64x2 mask6a = {0, 2}; /* AB --> Aa */
u64x2 mask6b = {1, 3}; /* ab Bb */
for (int i = 0; i < 64; i++) { // in 128x128, swap 64x64 blocks
u64x2 a = M[i];
u64x2 b = M[i + 64];
u64x2 foo = __builtin_shuffle(a, b, mask6a);
u64x2 bar = __builtin_shuffle(a, b, mask6b);
T[i] = foo;
T[i + 64] = bar;
}
u32x4 mask5a = {0, 4, 2, 6}; /* AB CD --> Aa Cc */
u32x4 mask5b = {1, 5, 3, 7}; /* ab cd Bb Dd */
for (int j = 0; j < 128; j += 64) { // in 2x 64x64, swap 32x32 blocks
for (int i = j; i < j + 32; i++) {
u32x4 a = T[i];
u32x4 b = T[i + 32];
u32x4 foo = __builtin_shuffle(a, b, mask5a);
u32x4 bar = __builtin_shuffle(a, b, mask5b);
T[i] = foo;
T[i + 32] = bar;
}
}
void matmul_CpABt_128(u64 * C, u64 const * A, u64 const * Bt, int stride)
{
int h = 1;
int v = 64 * stride;
matmul_CpABt_64(C , A , Bt , stride);
matmul_CpABt_64(C , A + h , Bt + h , stride);
matmul_CpABt_64(C + v , A + v , Bt , stride);
matmul_CpABt_64(C + v , A + v + h, Bt + h , stride);
matmul_CpABt_64(C + h , A , Bt + v , stride);
matmul_CpABt_64(C + h , A + h , Bt + v + h, stride);
matmul_CpABt_64(C + v + h, A + v , Bt + v , stride);
matmul_CpABt_64(C + v + h, A + v + h, Bt + v + h, stride);
}
u16x8 mask4a = {0, 8, 2, 10, 4, 12, 6, 14}; /* AB CD EF GH --> Aa Cc Ee Gg */
u16x8 mask4b = {1, 9, 3, 11, 5, 13, 7, 15}; /* ab cd ef gh Bb Dd Ff Hh */
for (int j = 0; j < 128; j += 32) { // in 4x 32x32, swap 16x16 blocks
for (int i = j; i < j + 16; i++) {
u16x8 a = T[i];
u16x8 b = T[i + 16];
u16x8 foo = __builtin_shuffle(a, b, mask4a);
u16x8 bar = __builtin_shuffle(a, b, mask4b);
T[i] = foo;
T[i + 16] = bar;
}
}
void transpose(vec * T, vec const * M)
{
transpose_128((u64*) T, (u64*) M, 2);
u8x16 mask3a = {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}; /* ABCDEFGHIJKLMNOP --> Aa Cc Ee Gg ... */
u8x16 mask3b = {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}; /* abcdefghijklmnop --> Bb Dd Ff Hh ... */
for (int j = 0; j < 128; j += 16) { // in 8x 16x16, swap 8x8 blocks
for (int i = j; i < j + 8; i++) {
u8x16 a = T[i];
u8x16 b = T[i + 8];
u8x16 foo = __builtin_shuffle(a, b, mask3a);
u8x16 bar = __builtin_shuffle(a, b, mask3b);
T[i] = foo;
T[i + 8] = bar;
}
}
// now this works intra-byte
for (int j = 0; j < 128; j += 8) { // in 16x 8x8, swap 4x4 blocks
for (int i = j; i < j + 4; i++) {
vec a = T[i];
vec b = T[i + 4];
vec foo = (a & mask2_lo) | ((b & mask2_lo) << 4);
vec bar = ((a & mask2_hi) >> 4) | (b & mask2_hi);
T[i] = foo;
T[i + 4] = bar;
}
}
for (int j = 0; j < 128; j += 4) { // in 32x 4x4, swap 2x2 blocks
for (int i = j; i < j + 2; i++) {
vec a = T[i];
vec b = T[i + 2];
vec foo = ( a & mask1_lo) | ((b & mask1_lo) << 2);
vec bar = ((a & mask1_hi) >> 2) | (b & mask1_hi);
T[i] = foo;
T[i + 2] = bar;
}
}
for (int j = 0; j < 128; j += 2) { // in 64x 2x2, swap 1x1 blocks
for (int i = j; i < j + 1; i++) {
vec a = T[i];
vec b = T[i + 1];
vec foo = ( a & mask0_lo) | ((b & mask0_lo) << 1);
vec bar = ((a & mask0_hi) >> 1) | (b & mask0_hi);
T[i] = foo;
T[i + 1] = bar;
}
}
}
void matmul_CpABt(vec * C, vec const * A, vec const * Bt)
{
matmul_CpABt_128((u64*) C, (u64 *) A, (u64 *) Bt, 2);
vec tmp[64];
for (int i = 0; i < 128; i++) {
for (int j = 0; j < 64; j++) {
vec a = A[i] & Bt[j];
vec b = A[i] & Bt[j + 64];
vec msk1 = {1, 2};
vec msk2 = {0, 3};
vec c = __builtin_shuffle(a, b, msk1);
vec d = __builtin_shuffle(a, b, msk2);
vec x = c ^ d;
tmp[j] = (x ^ (x >> 32)) & mask5_lo;
}
for (int j = 0; j < 32; j++) {
vec x = tmp[j] ^ (tmp[j + 32] << 32);
tmp[j] = (x ^ (x >> 16)) & mask4_lo;
}
for (int j = 0; j < 16; j++) {
vec x = tmp[j] ^ (tmp[j + 16] << 16);
tmp[j] = (x ^ (x >> 8)) & mask3_lo;
}
for (int j = 0; j < 8; j++) {
vec x = tmp[j] ^ (tmp[j + 8] << 8);
tmp[j] = (x ^ (x >> 4)) & mask2_lo;
}
for (int j = 0; j < 4; j++) {
vec x = tmp[j] ^ (tmp[j + 4] << 4);
tmp[j] = (x ^ (x >> 2)) & mask1_lo;
}
for (int j = 0; j < 2; j++) {
vec x = tmp[j] ^ (tmp[j + 2] << 2);
tmp[j] = (x ^ (x >> 1)) & mask0_lo;
}
C[i] ^= tmp[0] ^ (tmp[1] << 1);
}
}
vec zero()
......@@ -450,46 +568,137 @@ void randomize(vec * x)
}
#endif
#if BLOCKING_FACTOR == 256
typedef u64 u64x4 __attribute__ ((vector_size(32)));
typedef u32 u32x8 __attribute__ ((vector_size(32)));
typedef u16 u16x16 __attribute__ ((vector_size(32)));
typedef u8 u8x32 __attribute__ ((vector_size(32)));
typedef u64 vec __attribute__ ((vector_size(32)));
void transpose_128(u64 * T, u64 const * M, int stride)
{
int h = 1;
int v = 64 * stride;
transpose_64(T, M, stride);
transpose_64(T + v, M + h, stride);
transpose_64(T + h, M + v, stride);
transpose_64(T + v + h, M + v + h, stride);
}
const u64x4 mask6_hi = { 0xffffffffffffffff, 0x0000000000000000, 0xffffffffffffffff, 0x0000000000000000 };
const u64x4 mask6_lo = { 0x0000000000000000, 0xffffffffffffffff, 0x0000000000000000, 0xffffffffffffffff };
const u64x4 mask5_hi = { 0xffffffff00000000, 0xffffffff00000000, 0xffffffff00000000, 0xffffffff00000000 };
const u64x4 mask5_lo = { 0x00000000ffffffff, 0x00000000ffffffff, 0x00000000ffffffff, 0x00000000ffffffff };
const u64x4 mask4_hi = { 0xffff0000ffff0000, 0xffff0000ffff0000, 0xffff0000ffff0000, 0xffff0000ffff0000 };
const u64x4 mask4_lo = { 0x0000ffff0000ffff, 0x0000ffff0000ffff, 0x0000ffff0000ffff, 0x0000ffff0000ffff };
const u64x4 mask3_hi = { 0xff00ff00ff00ff00, 0xff00ff00ff00ff00, 0xff00ff00ff00ff00, 0xff00ff00ff00ff00 };
const u64x4 mask3_lo = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
const u64x4 mask2_hi = { 0xf0f0f0f0f0f0f0f0, 0xf0f0f0f0f0f0f0f0, 0xf0f0f0f0f0f0f0f0, 0xf0f0f0f0f0f0f0f0 };
const u64x4 mask2_lo = { 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f };
const u64x4 mask1_hi = { 0xcccccccccccccccc, 0xcccccccccccccccc, 0xcccccccccccccccc, 0xcccccccccccccccc };
const u64x4 mask1_lo = { 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 };
const u64x4 mask0_hi = { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa };
const u64x4 mask0_lo = { 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 };
void matmul_CpABt_128(u64 * C, u64 const * A, u64 const * Bt, int stride)
void transpose(vec * T, vec const * M)
{
int h = 1;
int v = 64 * stride;
matmul_CpABt_64(C , A , Bt , stride);
matmul_CpABt_64(C , A + h , Bt + h , stride);
matmul_CpABt_64(C + v , A + v , Bt , stride);
matmul_CpABt_64(C + v , A + v + h, Bt + h , stride);
matmul_CpABt_64(C + h , A , Bt + v , stride);
matmul_CpABt_64(C + h , A + h , Bt + v + h, stride);
matmul_CpABt_64(C + v + h, A + v , Bt + v , stride);
matmul_CpABt_64(C + v + h, A + v + h, Bt + v + h, stride);
}
// maybe replace by simpler ops on 64-bit ints
u64x4 mask7a = {0, 1, 4, 5}; /* ABCD --> ABab */
u64x4 mask7b = {2, 3, 6, 7}; /* abcd CDcd */
for (int j = 0; j < 256; j += 256) { // in 1x 256x256, swap 128x128 blocks
for (int i = 0; i < 128; i++) {
u64x4 a = M[i];
u64x4 b = M[i + 128];
u64x4 foo = __builtin_shuffle(a, b, mask7a);
u64x4 bar = __builtin_shuffle(a, b, mask7b);
T[i] = foo;
T[i + 128] = bar;
}
}
u64x4 mask6a = {0, 4, 2, 6}; /* AB CD --> Aa Cc */
u64x4 mask6b = {1, 5, 3, 7}; /* ab cd Bb Dd */
for (int j = 0; j < 256; j += 128) { // in 2x 128x128, swap 64x64 blocks
for (int i = j; i < j + 64; i++) {
u64x4 a = T[i];
u64x4 b = T[i + 64];
u64x4 foo = __builtin_shuffle(a, b, mask6a);
u64x4 bar = __builtin_shuffle(a, b, mask6b);
T[i] = foo;
T[i + 64] = bar;
}
}
void transpose_256(u64 * T, u64 const * M, int stride)
{
int h = 2;
int v = 128 * stride;
transpose_128(T, M, stride);
transpose_128(T + v, M + h, stride);
transpose_128(T + h, M + v, stride);
transpose_128(T + v + h, M + v + h, stride);
}
u32x8 mask5a = {0, 8, 2, 10, 4, 12, 6, 14}; /* AB CD EF GH --> Aa Cc Ee Gg */
u32x8 mask5b = {1, 9, 3, 11, 5, 13, 7, 15}; /* ab cd ef gh Bb Dd Ff Hh */
for (int j = 0; j < 256; j += 64) { // in 4x 64x64, swap 32x32 blocks
for (int i = j; i < j + 32; i++) {
u32x8 a = T[i];
u32x8 b = T[i + 32];
u32x8 foo = __builtin_shuffle(a, b, mask5a);
u32x8 bar = __builtin_shuffle(a, b, mask5b);
T[i] = foo;
T[i + 32] = bar;
}
}
u16x16 mask4a = {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}; /* AB CD EF GH --> Aa Cc Ee Gg */
u16x16 mask4b = {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}; /* ab cd ef gh Bb Dd Ff Hh */
for (int j = 0; j < 256; j += 32) { // in 4x 32x32, swap 16x16 blocks
for (int i = j; i < j + 16; i++) {
u16x16 a = T[i];
u16x16 b = T[i + 16];
u16x16 foo = __builtin_shuffle(a, b, mask4a);
u16x16 bar = __builtin_shuffle(a, b, mask4b);
T[i] = foo;
T[i + 16] = bar;
}
}
u8x32 mask3a = {0, 32, 2, 34, 4, 36, 6, 38, 8, 40, 10, 42, 12, 44, 14, 46, 16, 48, 18, 50, 20, 52, 22, 54, 24, 56, 26, 58, 28, 60, 30, 62};
u8x32 mask3b = {1, 33, 3, 35, 5, 37, 7, 39, 9, 41, 11, 43, 13, 45, 15, 47, 17, 49, 19, 51, 21, 53, 23, 55, 25, 57, 27, 59, 29, 61, 31, 63};
for (int j = 0; j < 256; j += 16) { // in 8x 16x16, swap 8x8 blocks
for (int i = j; i < j + 8; i++) {
u8x32 a = T[i];
u8x32 b = T[i + 8];
u8x32 foo = __builtin_shuffle(a, b, mask3a);
u8x32 bar = __builtin_shuffle(a, b, mask3b);
T[i] = foo;
T[i + 8] = bar;
}
}
// now this works intra-byte
for (int j = 0; j < 256; j += 8) { // in 16x 8x8, swap 4x4 blocks
for (int i = j; i < j + 4; i++) {
vec a = T[i];
vec b = T[i + 4];
vec foo = (a & mask2_lo) | ((b & mask2_lo) << 4);
vec bar = ((a & mask2_hi) >> 4) | (b & mask2_hi);
T[i] = foo;
T[i + 4] = bar;
}
}
for (int j = 0; j < 256; j += 4) { // in 32x 4x4, swap 2x2 blocks
for (int i = j; i < j + 2; i++) {
vec a = T[i];
vec b = T[i + 2];
vec foo = ( a & mask1_lo) | ((b & mask1_lo) << 2);
vec bar = ((a & mask1_hi) >> 2) | (b & mask1_hi);
T[i] = foo;
T[i + 2] = bar;
}
}
for (int j = 0; j < 256; j += 2) { // in 64x 2x2, swap 1x1 blocks
for (int i = j; i < j + 1; i++) {
vec a = T[i];
vec b = T[i + 1];
vec foo = ( a & mask0_lo) | ((b & mask0_lo) << 1);
vec bar = ((a & mask0_hi) >> 1) | (b & mask0_hi);
T[i] = foo;
T[i + 1] = bar;
}
}
}
/*
void matmul_CpABt_256(u64 * C, u64 const * A, u64 const * Bt, int stride)
{
int h = 2;
......@@ -502,19 +711,72 @@ void matmul_CpABt_256(u64 * C, u64 const * A, u64 const * Bt, int stride)
matmul_CpABt_128(C + h , A + h , Bt + v + h, stride);
matmul_CpABt_128(C + v + h, A + v , Bt + v , stride);
matmul_CpABt_128(C + v + h, A + v + h, Bt + v + h, stride);
}
}*/
void transpose(vec * T, vec const * M)
{
transpose_256((u64*) T, (u64*) M, 4);
}
void matmul_CpABt(vec * C, vec const * A, vec const * Bt)
{
matmul_CpABt_256((u64*) C, (u64 *) A, (u64 *) Bt, 4);
vec tmp[256];
for (int i = 0; i < 256; i++) {
for (int j = 0; j < 256; j++) {
vec x = A[i] & Bt[j];
// tmp[j] = (x ^ (x >> 128)) & mask6_lo;
vec msk1 = {2, 3, -1, -1};
vec y = __builtin_shuffle(x, msk1);
tmp[j] = x ^ y; // top 128-bits are undef
}
for (int j = 0; j < 128; j++) { // in each 256-bit words, the top 128-bits are not zero but UNDEF
// vec x = tmp[j] ^ (tmp[j + 128] << 128);
vec msk0 = {0, 1, 4, 5};
vec x = __builtin_shuffle(tmp[j], tmp[j+128], msk0);
// tmp[j] = (x ^ (x >> 64)) & mask5_lo;
vec msk1 = {1, -1, 3, -1};
vec y = __builtin_shuffle(x, msk1);
tmp[j] = y & mask6_lo;
}
for (int j = 0; j < 64; j++) { // in each 128-bit words, the top 64-bits are zero
// vec x = tmp[j] ^ (tmp[j + 64] << 64);
vec msk1 = {2, 3, -1, -1};
vec x = __builtin_shuffle(tmp[j], tmp[j+64], msk1);
tmp[j] = (x ^ (x >> 32)) & mask5_lo;
}
for (int j = 0; j < 32; j++) { // in each 64-bit words, the top 32-bits are zero
vec x = tmp[j] ^ (tmp[j + 32] << 32);
tmp[j] = (x ^ (x >> 16)) & mask4_lo;
}
for (int j = 0; j < 16; j++) {
vec x = tmp[j] ^ (tmp[j + 16] << 16);
tmp[j] = (x ^ (x >> 8)) & mask3_lo;
}
for (int j = 0; j < 8; j++) {
vec x = tmp[j] ^ (tmp[j + 8] << 8);
tmp[j] = (x ^ (x >> 4)) & mask2_lo;
}
for (int j = 0; j < 4; j++) {
vec x = tmp[j] ^ (tmp[j + 4] << 4);
tmp[j] = (x ^ (x >> 2)) & mask1_lo;
}
for (int j = 0; j < 2; j++) {
vec x = tmp[j] ^ (tmp[j + 2] << 2);
tmp[j] = (x ^ (x >> 1)) & mask0_lo;
}
C[i] ^= tmp[0] ^ (tmp[1] << 1);
}
}
// void transpose(vec * T, vec const * M)
// {
// transpose_256((u64*) T, (u64*) M, 4);
// }
// void matmul_CpABt(vec * C, vec const * A, vec const * Bt)
// {
// matmul_CpABt_256((u64*) C, (u64 *) A, (u64 *) Bt, 4);
// }
vec zero()
{
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment