Commit 3153ddae authored by Charles Bouillaguet

lanczos 256 bits functional

parent 3f685bfd
@@ -25,7 +25,7 @@
 typedef uint64_t u64;
 typedef uint32_t u32;
-#define BLOCKING_FACTOR 128
+#define BLOCKING_FACTOR 256
 static const int n = BLOCKING_FACTOR;
 char *matrix_filename;
@@ -268,7 +268,7 @@ static const u64 M6_HI = 0xaaaaaaaaaaaaaaaa;
 static const u64 M6_LO = 0x5555555555555555;
 /* C += A*transpose(B) */
-static inline void matmul_CpABt_64(u64 * C, u64 const * A, u64 const * Bt, int stride)
+void matmul_CpABt_64(u64 * C, u64 const * A, u64 const * Bt, int stride)
 {
 	u64 tmp[64];
 	for (int i = 0; i < 64; i++) {
@@ -301,7 +301,7 @@ static inline void matmul_CpABt_64(u64 * C, u64 const * A, u64 const * Bt, int s
 }
 /* T <-- transpose(M) */
-static inline void transpose_64(u64 * At, u64 const * A, int stride)
+void transpose_64(u64 * At, u64 const * A, int stride)
 {
 #define M(i) A[(i)*stride]
 #define T(i) At[(i)*stride]
@@ -356,7 +356,7 @@ static inline void transpose_64(u64 * At, u64 const * A, int stride)
 typedef uint64_t vec;
-static inline void transpose(vec * T, vec const * M)
+void transpose(vec * T, vec const * M)
 {
 	transpose_64(T, M, 1);
 }
@@ -397,39 +397,39 @@ void randomize(vec * x)
 typedef u64 vec __attribute__ ((vector_size(16)));
-static inline void transpose(vec * T, vec const * M)
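+/* T <-- transpose(M): M is a 128x128 bit matrix laid out as 128 rows of
+   u64 words, `stride` words between rows.  Split into four 64x64 quadrants:
+   h steps one word (64 columns) right, v steps 64 rows down, and the two
+   off-diagonal quadrants swap places under transposition. */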
+void transpose_128(u64 * T, u64 const * M, int stride)
 {
-	transpose_64(&T[0][0], &M[0][0], 2);
-	transpose_64(&T[64][0], &M[0][1], 2);
-	transpose_64(&T[0][1], &M[64][0], 2);
-	transpose_64(&T[64][1], &M[64][1], 2);
+	int h = 1;
+	int v = 64 * stride;
+	transpose_64(T, M, stride);
+	transpose_64(T + v, M + h, stride);
+	transpose_64(T + h, M + v, stride);
+	transpose_64(T + v + h, M + v + h, stride);
 }
-static inline void matmul_CpABt(vec * C, vec const * A, vec const * Bt)
-{
-	matmul_CpABt_64(&C[0][0], &A[0][0], &Bt[0][0], 2);
-	matmul_CpABt_64(&C[0][0], &A[0][1], &Bt[0][1], 2);
-	matmul_CpABt_64(&C[64][0], &A[64][0], &Bt[0][0], 2);
-	matmul_CpABt_64(&C[64][0], &A[64][1], &Bt[0][1], 2);
-	matmul_CpABt_64(&C[0][1], &A[0][0], &Bt[64][0], 2);
-	matmul_CpABt_64(&C[0][1], &A[0][1], &Bt[64][1], 2);
-	matmul_CpABt_64(&C[64][1], &A[64][0], &Bt[64][0], 2);
-	matmul_CpABt_64(&C[64][1], &A[64][1], &Bt[64][1], 2);
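+/* C += A*transpose(Bt) on 128x128 bit matrices as a 2x2 block product:
+   C_ij += A_i0*transpose(Bt_j0) + A_i1*transpose(Bt_j1), pairing row-block
+   i of A with row-block j of the already-transposed operand Bt. */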
+void matmul_CpABt_128(u64 * C, u64 const * A, u64 const * Bt, int stride)
+{
+	int h = 1;
+	int v = 64 * stride;
+	matmul_CpABt_64(C        , A        , Bt        , stride);
+	matmul_CpABt_64(C        , A + h    , Bt + h    , stride);
+	matmul_CpABt_64(C + v    , A + v    , Bt        , stride);
+	matmul_CpABt_64(C + v    , A + v + h, Bt + h    , stride);
+	matmul_CpABt_64(C + h    , A        , Bt + v    , stride);
+	matmul_CpABt_64(C + h    , A + h    , Bt + v + h, stride);
+	matmul_CpABt_64(C + v + h, A + v    , Bt + v    , stride);
+	matmul_CpABt_64(C + v + h, A + v + h, Bt + v + h, stride);
 }
-bool get_bit(vec x, int i)
+void transpose(vec * T, vec const * M)
 {
-	return (x[i / 64] >> (i % 64)) & 1;
+	transpose_128((u64*) T, (u64*) M, 2);
 }
-vec ith_bit(int i)
+void matmul_CpABt(vec * C, vec const * A, vec const * Bt)
 {
-	vec x = {0, 0};
-	x[i / 64] = 1ull << (i % 64);
-	return x;
+	matmul_CpABt_128((u64*) C, (u64 *) A, (u64 *) Bt, 2);
 }
 vec zero()
@@ -443,15 +443,116 @@ bool vec_is_zero(vec x)
 	return __builtin_ia32_ptestz128(x, x);
 }
-bool vec_eq(vec x, vec y)
+void randomize(vec * x)
 {
-	return vec_is_zero(x ^ y);
+	(*x)[0] = random64();
+	(*x)[1] = random64();
 }
 #endif
+#if BLOCKING_FACTOR == 256
+typedef u64 vec __attribute__ ((vector_size(32)));
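+/* GCC vector extension: `vec` is four u64 lanes (32 bytes / 256 bits);
+   bitwise operators act lane-wise and x[i] selects lane i. */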
+void transpose_128(u64 * T, u64 const * M, int stride)
+{
+	int h = 1;
+	int v = 64 * stride;
+	transpose_64(T, M, stride);
+	transpose_64(T + v, M + h, stride);
+	transpose_64(T + h, M + v, stride);
+	transpose_64(T + v + h, M + v + h, stride);
+}
+void matmul_CpABt_128(u64 * C, u64 const * A, u64 const * Bt, int stride)
+{
+	int h = 1;
+	int v = 64 * stride;
+	matmul_CpABt_64(C        , A        , Bt        , stride);
+	matmul_CpABt_64(C        , A + h    , Bt + h    , stride);
+	matmul_CpABt_64(C + v    , A + v    , Bt        , stride);
+	matmul_CpABt_64(C + v    , A + v + h, Bt + h    , stride);
+	matmul_CpABt_64(C + h    , A        , Bt + v    , stride);
+	matmul_CpABt_64(C + h    , A + h    , Bt + v + h, stride);
+	matmul_CpABt_64(C + v + h, A + v    , Bt + v    , stride);
+	matmul_CpABt_64(C + v + h, A + v + h, Bt + v + h, stride);
+}
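+/* Same quadrant decomposition one level up: a 256x256 bit matrix splits
+   into four 128x128 quadrants, so h becomes 2 words (128 columns) and v
+   128 rows; the recursion bottoms out in the 64x64 kernel. */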
+void transpose_256(u64 * T, u64 const * M, int stride)
+{
+	int h = 2;
+	int v = 128 * stride;
+	transpose_128(T, M, stride);
+	transpose_128(T + v, M + h, stride);
+	transpose_128(T + h, M + v, stride);
+	transpose_128(T + v + h, M + v + h, stride);
+}
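+/* 2x2 block product at the 256-bit level: identical index pattern to
+   matmul_CpABt_128, with both offsets doubled. */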
+void matmul_CpABt_256(u64 * C, u64 const * A, u64 const * Bt, int stride)
+{
+	int h = 2;
+	int v = 128 * stride;
+	matmul_CpABt_128(C        , A        , Bt        , stride);
+	matmul_CpABt_128(C        , A + h    , Bt + h    , stride);
+	matmul_CpABt_128(C + v    , A + v    , Bt        , stride);
+	matmul_CpABt_128(C + v    , A + v + h, Bt + h    , stride);
+	matmul_CpABt_128(C + h    , A        , Bt + v    , stride);
+	matmul_CpABt_128(C + h    , A + h    , Bt + v + h, stride);
+	matmul_CpABt_128(C + v + h, A + v    , Bt + v    , stride);
+	matmul_CpABt_128(C + v + h, A + v + h, Bt + v + h, stride);
+}
+void transpose(vec * T, vec const * M)
+{
+	transpose_256((u64*) T, (u64*) M, 4);
+}
+void matmul_CpABt(vec * C, vec const * A, vec const * Bt)
+{
+	matmul_CpABt_256((u64*) C, (u64 *) A, (u64 *) Bt, 4);
+}
+vec zero()
+{
+	vec x = {0, 0, 0, 0};
+	return x;
+}
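+/* AVX vptest (_mm256_testz_si256): with both operands equal to x, the
+   builtin returns 1 exactly when all 256 bits of x are zero. */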
+bool vec_is_zero(vec x)
+{
+	return __builtin_ia32_ptestz256(x, x);
+}
+void randomize(vec * x)
+{
+	(*x)[0] = random64();
+	(*x)[1] = random64();
+	(*x)[2] = random64();
+	(*x)[3] = random64();
+}
+#endif
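+/* Helpers shared by all blocking factors above 64 bits. */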
+#if BLOCKING_FACTOR > 64
+bool vec_eq(vec x, vec y)
+{
+	return vec_is_zero(x ^ y);
+}
+bool get_bit(vec x, int i)
+{
+	return (x[i / 64] >> (i % 64)) & 1;
+}
+vec ith_bit(int i)
+{
+	vec x = zero();
+	x[i / 64] = 1ull << (i % 64);
+	return x;
+}
+#endif
@@ -527,7 +628,7 @@ vec semi_inverse(const vec *M_, vec * winv)
 /******** crucial operation : sparse-matrix * dense vector block *********************/
-void sparse_matrix_vector_product(vec *y, struct sparsematrix_t const * M, const vec * x)
+void sparse_matrix_vector_product(vec * y, struct sparsematrix_t const * M, vec const * x)
 {
 	u32 N = M->nrows;
 	u64 const * Mp = M->p;
@@ -688,10 +789,10 @@ vec * block_lanczos(struct sparsematrix_t const *M, struct sparsematrix_t const
 	human_format(4 * sizeof(vec) * block_size_pad, hsize);
 	printf("Computation requires %sB of memory (in addition to the matrix)\n", hsize);
-	vec *v = malloc(block_size_pad * sizeof(*v));
-	vec *tmp = malloc(block_size_pad * sizeof(*v));
-	vec *Av = malloc(block_size_pad * sizeof(*v));
-	vec *p = malloc(block_size_pad * sizeof(*v));
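+	/* 64-byte alignment: one full cache line, comfortably above the
+	   32-byte alignment needed for aligned AVX access to vec. */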
+	vec *v = aligned_alloc(64, block_size_pad * sizeof(*v));
+	vec *tmp = aligned_alloc(64, block_size_pad * sizeof(*tmp));
+	vec *Av = aligned_alloc(64, block_size_pad * sizeof(*Av));
+	vec *p = aligned_alloc(64, block_size_pad * sizeof(*p));
 	if (v == NULL || tmp == NULL || Av == NULL || p == NULL)
 		errx(1, "impossible d'allouer");
@@ -718,10 +819,11 @@ vec * block_lanczos(struct sparsematrix_t const *M, struct sparsematrix_t const
 		sparse_matrix_vector_product(Av, M, tmp);
 		block_dot_products(vtAv, vtAAv, N, Av, v);
 		vec d = semi_inverse(vtAv, winv);
-		correctness(winv, vtAv, vtAAv, d);
+		// correctness(winv, vtAv, vtAAv, d);
 		if (vec_is_zero(d))
 			break;
 		orthogonalize(tmp, p, d, vtAv, vtAAv, winv, N, Av, v);
+		vec *foo = v;
 		v = tmp;
......