-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatrix_2_matrix.h
108 lines (99 loc) · 4.04 KB
/
matrix_2_matrix.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#include <vector>
#include <cmath>
#include <iostream>
#include <omp.h>
#include <thread>
using namespace std;
#include "jthread.hpp"
template<typename T, bool col_major=false>
class MatrixView{
public:
    // Non-owning 2-D view over a flat buffer of nrow * ncol elements.
    // col_major selects the linearization scheme at compile time.
    T* data_pointer;
    const long nrow;
    const long ncol;
    MatrixView(T *data_pointer, const long nrow, const long ncol) : data_pointer(data_pointer), nrow(nrow),
                                                                    ncol(ncol) {}
    // Mutable element access.  Indices are `long` (matching nrow/ncol) so that
    // views over buffers larger than INT_MAX elements index correctly — the
    // previous `int` parameters narrowed/overflowed for large matrices.
    T &operator()(const long row, const long col) {
        if (col_major) {
            return data_pointer[row + col * nrow];
        } else {
            return data_pointer[col + row * ncol];
        }
    }
    // Read-only element access (returns by value).
    T operator()(const long row, const long col) const {
        if (col_major) {
            return data_pointer[row + col * nrow];
        } else {
            return data_pointer[col + row * ncol];
        }
    }
};
// Euclidean distance from every row of a (a_rows x vec_dim) to every row of
// b (b_rows x vec_dim).  res must hold a_rows * b_rows doubles; entry
// (i, j) is written to res[i * b_rows + j] (row-major).
inline void get_matrix_to_matrix_dist(double *a, double *b, double *res, const long a_rows, const long b_rows, const long vec_dim){
    const MatrixView<double, false> a_view(a, a_rows, vec_dim);
    const MatrixView<double, false> b_view(b, b_rows, vec_dim);
    // Each (i, j) pair owns the distinct slot res[i * b_rows + j], so the two
    // outer loops parallelize with no shared state.  The previous version
    // used a running `int idx++` counter, which (a) raced if the outer loops
    // were parallelized — forcing the pragma onto the tiny inner vec_dim
    // loop — and (b) overflowed int for outputs larger than INT_MAX entries.
    #pragma omp parallel for num_threads(4) collapse(2)
    for(long i = 0; i < a_rows; i++){
        for(long j = 0; j < b_rows; j++){
            double cur_sum = 0.0;
            for(long k = 0; k < vec_dim; k++){
                const double diff = a_view(i, k) - b_view(j, k);
                cur_sum += diff * diff;
            }
            res[i * b_rows + j] = sqrt(cur_sum);
        }
    }
}
inline void get_matrix_to_matrix_dist_multi_threading(double *a, double *b, double *res, const long a_rows, const long b_rows, const long vec_dim){
const int threads = omp_get_max_threads()-1;
jthread* thread_pool = new jthread[threads+1];
long start_ptr, size, local_rows;
start_ptr = 0l;
for(int i = 0; i<=threads; i++){
int start_idx_row = static_cast<int> (start_ptr / vec_dim);
if(a_rows * vec_dim - start_ptr < a_rows * vec_dim / threads){
local_rows = a_rows - start_idx_row;
}else{
local_rows = a_rows / threads;
}
size = local_rows * vec_dim;
// thread_pool[i] = thread(get_matrix_to_matrix_dist, &a[start_idx_row * vec_dim],
// &b[0],
// &res[start_idx_row * b_rows],
// local_rows,
// b_rows,
// vec_dim);
thread_pool[i] = jthread(get_matrix_to_matrix_dist, &a[start_idx_row * vec_dim],
&b[0],
&res[start_idx_row * b_rows],
local_rows,
b_rows,
vec_dim);
start_ptr += size;
}
// for(int j=0; j<=threads; j++){
// thread_pool[j].join();
// }
// compile with -pthread
}
// All pairwise Euclidean distances between the rows of a (a_rows x vec_dim).
// res must hold a_rows * a_rows doubles and is filled as a full symmetric
// row-major matrix (the mirror write res[a_rows*j + i] establishes that
// layout); the diagonal is zero.
inline void get_pairwise_dist(double *a, const long a_rows, const long vec_dim, double *res){
    const MatrixView<double, false> a_view(a, a_rows, vec_dim);
    for(long i = 0; i < a_rows; i++){
        // Distance of a row to itself — the old code never wrote the diagonal.
        res[a_rows * i + i] = 0.0;
        for(long j = 0; j < i; j++){
            double cur_sum = 0.0;
            #pragma omp parallel for simd num_threads(2) reduction(+:cur_sum)
            for(long k = 0; k < vec_dim; k++){
                const double diff = a_view(i, k) - a_view(j, k);
                cur_sum += diff * diff;
            }
            // BUG FIX: the original stored the (i, j) entry through a running
            // counter (`res[index++]` = slots 0, 1, 2, ...) while mirroring
            // into res[a_rows*j + i].  For a square row-major matrix the
            // lower-triangle slot is a_rows*i + j; the counter put e.g.
            // dist(1, 0) into res[0], the (0, 0) cell.
            res[a_rows * i + j] = res[a_rows * j + i] = sqrt(cur_sum);
        }
    }
}