I just recently wrote an answer to a similar question SO: Eigen library with C++11 multithreading.
As I’m interested in this topic too and already had working code at hand, I adapted that sample to OP’s task of matrix multiplication:
test-multi-threading-matrix.cc
:
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>
template <typename VALUE>
class MatrixT {
public:
typedef VALUE Value;
private:
size_t _nRows, _nCols;
std::vector<Value> _values;
public:
MatrixT(size_t nRows, size_t nCols, Value value = (Value)0):
_nRows(nRows), _nCols(nCols), _values(_nRows * _nCols, value)
{ }
~MatrixT() = default;
size_t getNumCols() const { return _nCols; }
size_t getNumRows() const { return _nRows; }
Value* operator[](size_t i) { return &_values[0] + i * _nCols; }
const Value* operator[](size_t i) const { return &_values[0] + i * _nCols; }
};
template <typename VALUE>
VALUE dot(const MatrixT<VALUE> &mat1, size_t iRow, const MatrixT<VALUE> &mat2, size_t iCol)
{
const size_t n = mat1.getNumCols();
assert(n == mat2.getNumRows());
VALUE sum = (VALUE)0;
for (size_t i = 0; i < n; ++i) sum += mat1[iRow][i] * mat2[i][iCol];
return sum;
}
typedef MatrixT<double> Matrix;
typedef std::uint16_t Value;
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::microseconds MuSecs;
typedef decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now())) Time;
Time duration(const Clock::time_point &t0)
{
return std::chrono::duration_cast<MuSecs>(Clock::now() - t0);
}
Matrix populate(size_t dim)
{
Matrix mat(dim, dim);
for (size_t i = 0; i < dim; ++i) {
for (size_t j = 0; j < dim; ++j) {
mat[i][j] = ((Matrix::Value)rand() / RAND_MAX) * 100 - 50;
}
}
return mat;
}
std::vector<Time> makeTest(size_t dim)
{
const size_t NThreads = std::thread::hardware_concurrency();
const size_t nThreads = std::min(dim * dim, NThreads);
// make a test sample
const Matrix sampleA = populate(dim);
const Matrix sampleB = populate(dim);
// prepare result vectors
Matrix results4[4] = {
Matrix(dim, dim),
Matrix(dim, dim),
Matrix(dim, dim),
Matrix(dim, dim)
};
// make test
std::vector<Time> times{
[&]() { // single threading
// make a copy of test sample
const Matrix a(sampleA), b(sampleB);
Matrix &results = results4[0];
// remember start time
const Clock::time_point t0 = Clock::now();
// do experiment single-threaded
for (size_t k = 0, n = dim * dim; k < n; ++k) {
const size_t i = k / dim, j = k % dim;
results[i][j] = dot(a, i, b, j);
}
// done
return duration(t0);
}(),
[&]() { // multi-threading - stupid aproach
// make a copy of test sample
const Matrix a(sampleA), b(sampleB);
Matrix &results = results4[1];
// remember start time
const Clock::time_point t0 = Clock::now();
// do experiment multi-threaded
std::vector<std::thread> threads(nThreads);
for (size_t k = 0, n = dim * dim; k < n;) {
size_t nT = 0;
for (; nT < nThreads && k < n; ++nT, ++k) {
const size_t i = k / dim, j = k % dim;
threads[nT] = std::move(std::thread(
[i, j, &results, &a, &b]() {
results[i][j] = dot(a, i, b, j);
}));
}
for (size_t iT = 0; iT < nT; ++iT) threads[iT].join();
}
// done
return duration(t0);
}(),
[&]() { // multi-threading - interleaved
// make a copy of test sample
const Matrix a(sampleA), b(sampleB);
Matrix &results = results4[2];
// remember start time
const Clock::time_point t0 = Clock::now();
// do experiment multi-threaded
std::vector<std::thread> threads(nThreads);
for (Value iT = 0; iT < nThreads; ++iT) {
threads[iT] = std::move(std::thread(
[iT, dim, &results, &a, &b, nThreads]() {
for (size_t k = iT, n = dim * dim; k < n; k += nThreads) {
const size_t i = k / dim, j = k % dim;
results[i][j] = dot(a, i, b, j);
}
}));
}
for (std::thread &threadI : threads) threadI.join();
// done
return duration(t0);
}(),
[&]() { // multi-threading - grouped
// make a copy of test sample
const Matrix a(sampleA), b(sampleB);
Matrix &results = results4[3];
// remember start time
const Clock::time_point t0 = Clock::now();
// do experiment multi-threaded
std::vector<std::thread> threads(nThreads);
for (size_t iT = 0; iT < nThreads; ++iT) {
threads[iT] = std::move(std::thread(
[iT, dim, &results, &a, &b, nThreads]() {
const size_t n = dim * dim;
for (size_t k = iT * n / nThreads, kN = (iT + 1) * n / nThreads;
k < kN; ++k) {
const size_t i = k / dim, j = k % dim;
results[i][j] = dot(a, i, b, j);
}
}));
}
for (std::thread &threadI : threads) threadI.join();
// done
return duration(t0);
}()
};
// check results (must be equal for any kind of computation)
const unsigned nResults = sizeof results4 / sizeof *results4;
for (unsigned iResult = 1; iResult < nResults; ++iResult) {
size_t nErrors = 0;
for (size_t i = 0; i < dim; ++i) {
for (size_t j = 0; j < dim; ++j) {
if (results4[0][i][j] != results4[iResult][i][j]) {
++nErrors;
#if 0 // def _DEBUG
std::cerr
<< "results4[0][" << i << "][" << j << "]: "
<< results4[0][i][j]
<< " != results4[" << iResult << "][" << i << "][" << j << "]: "
<< results4[iResult][i][j]
<< "!\n";
#endif // _DEBUG
}
}
}
if (nErrors) std::cerr << nErrors << " errors in results4[" << iResult << "]!\n";
}
// done
return times;
}
int main()
{
std::cout << "std::thread::hardware_concurrency(): "
<< std::thread::hardware_concurrency() << '\n';
// heat up
std::cout << "Heat up...\n";
for (unsigned i = 0; i < 10; ++i) makeTest(64);
// perform tests:
const unsigned NTrials = 10;
for (size_t dim = 64; dim <= 512; dim *= 2) {
std::cout << "Test for A[" << dim << "][" << dim << "] * B[" << dim << "][" << dim << "]...\n";
// repeat NTrials times
std::cout << "Measuring " << NTrials << " runs...\n"
<< " "
<< " | " << std::setw(10) << "Single"
<< " | " << std::setw(10) << "Multi 1"
<< " | " << std::setw(10) << "Multi 2"
<< " | " << std::setw(10) << "Multi 3"
<< '\n';
std::vector<double> sumTimes;
for (unsigned i = 0; i < NTrials; ++i) {
std::vector<Time> times = makeTest(dim);
std::cout << std::setw(2) << (i + 1) << ".";
for (const Time &time : times) {
std::cout << " | " << std::setw(10) << time.count();
}
std::cout << '\n';
sumTimes.resize(times.size(), 0.0);
for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
}
std::cout << "Average Values:\n ";
for (const double &sumTime : sumTimes) {
std::cout << " | "
<< std::setw(10) << std::fixed << std::setprecision(1)
<< sumTime / NTrials;
}
std::cout << '\n';
std::cout << "Ratio:\n ";
for (const double &sumTime : sumTimes) {
std::cout << " | "
<< std::setw(10) << std::fixed << std::setprecision(3)
<< sumTime / sumTimes.front();
}
std::cout << "\n\n";
}
// done
return 0;
}
In my first tests, I started with 2×2 matrices, and doubled the number of rows and columns for each test series ending with 64×64 matrices.
I soon came to the same conclusion as Mike: these matrices are much too small. The overhead for setting up and joining threads consumes any speed-up which might’ve been gained by concurrency. So, I modified the test series starting with 64×64 matrices and ending with 512×512.
I compiled and run on cygwin64 (on Windows 10):
$ g++ --version
g++ (GCC) 7.3.0
$ g++ -std=c++17 -O2 test-multi-threading-matrix.cc -o test-multi-threading-matrix
$ ./test-multi-threading-matrix
std::thread::hardware_concurrency(): 8
Heat up...
Test for A[64][64] * B[64][64]...
Measuring 10 runs...
| Single | Multi 1 | Multi 2 | Multi 3
1. | 417 | 482837 | 1068 | 1080
2. | 403 | 486775 | 1034 | 1225
3. | 289 | 482578 | 1478 | 1151
4. | 282 | 502703 | 1103 | 1081
5. | 398 | 495351 | 1287 | 1124
6. | 404 | 501426 | 1050 | 1017
7. | 402 | 483517 | 1000 | 980
8. | 271 | 498591 | 1092 | 1047
9. | 284 | 494732 | 984 | 1057
10. | 288 | 494738 | 1050 | 1116
Average Values:
| 343.8 | 492324.8 | 1114.6 | 1087.8
Ratio:
| 1.000 | 1432.009 | 3.242 | 3.164
Test for A[128][128] * B[128][128]...
Measuring 10 runs...
| Single | Multi 1 | Multi 2 | Multi 3
1. | 2282 | 1995527 | 2215 | 1574
2. | 3076 | 1954316 | 1644 | 1679
3. | 2952 | 1981908 | 2572 | 2250
4. | 2119 | 1986365 | 1568 | 1462
5. | 2676 | 2212344 | 1615 | 1657
6. | 2396 | 1981545 | 1776 | 1593
7. | 2513 | 1983718 | 1950 | 1580
8. | 2614 | 1852414 | 1737 | 1670
9. | 2148 | 1955587 | 1805 | 1609
10. | 2161 | 1980772 | 1794 | 1826
Average Values:
| 2493.7 | 1988449.6 | 1867.6 | 1690.0
Ratio:
| 1.000 | 797.389 | 0.749 | 0.678
Test for A[256][256] * B[256][256]...
Measuring 10 runs...
| Single | Multi 1 | Multi 2 | Multi 3
1. | 32418 | 7992363 | 11753 | 11712
2. | 32723 | 7961725 | 12342 | 12490
3. | 32150 | 8041516 | 14646 | 12304
4. | 30207 | 7810907 | 11512 | 11992
5. | 30108 | 8005317 | 12853 | 12850
6. | 32665 | 8064963 | 13197 | 13386
7. | 36286 | 8825215 | 14381 | 15636
8. | 35068 | 8015930 | 16954 | 12287
9. | 30673 | 7973273 | 12061 | 13677
10. | 36323 | 7861856 | 14223 | 13510
Average Values:
| 32862.1 | 8055306.5 | 13392.2 | 12984.4
Ratio:
| 1.000 | 245.125 | 0.408 | 0.395
Test for A[512][512] * B[512][512]...
Measuring 10 runs...
| Single | Multi 1 | Multi 2 | Multi 3
1. | 404459 | 32803878 | 107078 | 103493
2. | 289870 | 32482887 | 98244 | 103338
3. | 333695 | 29398109 | 87735 | 77531
4. | 236028 | 27286537 | 81620 | 76085
5. | 254294 | 27418963 | 89191 | 76760
6. | 230662 | 27278077 | 78454 | 84063
7. | 274278 | 27180899 | 74828 | 83829
8. | 292294 | 29942221 | 106133 | 103450
9. | 292091 | 33011277 | 100545 | 96935
10. | 401007 | 33502134 | 98230 | 95592
Average Values:
| 300867.8 | 30030498.2 | 92205.8 | 90107.6
Ratio:
| 1.000 | 99.813 | 0.306 | 0.299
I did the same with VS2013 (release mode) and got similar results.
A speed-up of 3 sounds not that bad (ignoring the fact that it’s still far away from 8 which you might expect as ideal for a H/W concurrency of 8).
While fiddling with the matrix multiplication, I got an idea for optimization which I wanted to check as well – even beyond multi-threading. It is the attempt to improve cache-locality.
For this, I transpose the 2nd matrix before multiplication. For the multiplication, a modified version of dot()
(dotT()
) is used which considers the transposition of 2nd matrix respectively.
I modified the above sample code respectively and got test-single-threading-matrix-transpose.cc
:
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <vector>
template <typename VALUE>
class MatrixT {
public:
typedef VALUE Value;
private:
size_t _nRows, _nCols;
std::vector<Value> _values;
public:
MatrixT(size_t nRows, size_t nCols, Value value = (Value)0):
_nRows(nRows), _nCols(nCols), _values(_nRows * _nCols, value)
{ }
~MatrixT() = default;
size_t getNumCols() const { return _nCols; }
size_t getNumRows() const { return _nRows; }
Value* operator[](size_t i) { return &_values[0] + i * _nCols; }
const Value* operator[](size_t i) const { return &_values[0] + i * _nCols; }
};
template <typename VALUE>
VALUE dot(const MatrixT<VALUE> &mat1, size_t iRow, const MatrixT<VALUE> &mat2, size_t iCol)
{
const size_t n = mat1.getNumCols();
assert(n == mat2.getNumRows());
VALUE sum = (VALUE)0;
for (size_t i = 0; i < n; ++i) sum += mat1[iRow][i] * mat2[i][iCol];
return sum;
}
template <typename VALUE>
MatrixT<VALUE> transpose(const MatrixT<VALUE> mat)
{
MatrixT<VALUE> matT(mat.getNumCols(), mat.getNumRows());
for (size_t i = 0; i < mat.getNumRows(); ++i) {
for (size_t j = 0; j < mat.getNumCols(); ++j) {
matT[j][i] = mat[i][j];
}
}
return matT;
}
template <typename VALUE>
VALUE dotT(const MatrixT<VALUE> &mat1, size_t iRow1, const MatrixT<VALUE> &matT2, size_t iRow2)
{
const size_t n = mat1.getNumCols();
assert(n == matT2.getNumCols());
VALUE sum = (VALUE)0;
for (size_t i = 0; i < n; ++i) sum += mat1[iRow1][i] * matT2[iRow2][i];
return sum;
}
typedef MatrixT<double> Matrix;
typedef std::uint16_t Value;
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::microseconds MuSecs;
typedef decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now())) Time;
Time duration(const Clock::time_point &t0)
{
return std::chrono::duration_cast<MuSecs>(Clock::now() - t0);
}
Matrix populate(size_t dim)
{
Matrix mat(dim, dim);
for (size_t i = 0; i < dim; ++i) {
for (size_t j = 0; j < dim; ++j) {
mat[i][j] = ((Matrix::Value)rand() / RAND_MAX) * 100 - 50;
}
}
return mat;
}
std::vector<Time> makeTest(size_t dim)
{
// make a test sample
const Matrix sampleA = populate(dim);
const Matrix sampleB = populate(dim);
// prepare result vectors
Matrix results2[2] = {
Matrix(dim, dim),
Matrix(dim, dim)
};
// make test
std::vector<Time> times{
[&]() { // single threading
// make a copy of test sample
const Matrix a(sampleA), b(sampleB);
Matrix &results = results2[0];
// remember start time
const Clock::time_point t0 = Clock::now();
// do experiment single-threaded
for (size_t k = 0, n = dim * dim; k < n; ++k) {
const size_t i = k / dim, j = k % dim;
results[i][j] = dot(a, i, b, j);
}
// done
return duration(t0);
}(),
[&]() { // single threading - with transposed matrix
// make a copy of test sample
const Matrix a(sampleA), b(sampleB);
Matrix &results = results2[1];
// remember start time
const Clock::time_point t0 = Clock::now();
const Matrix bT = transpose(b);
// do experiment single-threaded with transposed B
for (size_t k = 0, n = dim * dim; k < n; ++k) {
const size_t i = k / dim, j = k % dim;
results[i][j] = dotT(a, i, bT, j);
}
// done
return duration(t0);
}()
};
// check results (must be equal for any kind of computation)
const unsigned nResults = sizeof results2 / sizeof *results2;
for (unsigned iResult = 1; iResult < nResults; ++iResult) {
size_t nErrors = 0;
for (size_t i = 0; i < dim; ++i) {
for (size_t j = 0; j < dim; ++j) {
if (results2[0][i][j] != results2[iResult][i][j]) {
++nErrors;
#if 0 // def _DEBUG
std::cerr
<< "results2[0][" << i << "][" << j << "]: "
<< results2[0][i][j]
<< " != results2[" << iResult << "][" << i << "][" << j << "]: "
<< results2[iResult][i][j]
<< "!\n";
#endif // _DEBUG
}
}
}
if (nErrors) std::cerr << nErrors << " errors in results2[" << iResult << "]!\n";
}
// done
return times;
}
int main()
{
// heat up
std::cout << "Heat up...\n";
for (unsigned i = 0; i < 10; ++i) makeTest(64);
// perform tests:
const unsigned NTrials = 10;
for (size_t dim = 64; dim <= 512; dim *= 2) {
std::cout << "Test for A[" << dim << "][" << dim << "] * B[" << dim << "][" << dim << "]...\n";
// repeat NTrials times
std::cout << "Measuring " << NTrials << " runs...\n"
<< " "
<< " | " << std::setw(10) << "A * B"
<< " | " << std::setw(10) << "A *T B^T"
<< '\n';
std::vector<double> sumTimes;
for (unsigned i = 0; i < NTrials; ++i) {
std::vector<Time> times = makeTest(dim);
std::cout << std::setw(2) << (i + 1) << ".";
for (const Time &time : times) {
std::cout << " | " << std::setw(10) << time.count();
}
std::cout << '\n';
sumTimes.resize(times.size(), 0.0);
for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
}
std::cout << "Average Values:\n ";
for (const double &sumTime : sumTimes) {
std::cout << " | "
<< std::setw(10) << std::fixed << std::setprecision(1)
<< sumTime / NTrials;
}
std::cout << '\n';
std::cout << "Ratio:\n ";
for (const double &sumTime : sumTimes) {
std::cout << " | "
<< std::setw(10) << std::fixed << std::setprecision(3)
<< sumTime / sumTimes.front();
}
std::cout << "\n\n";
}
// done
return 0;
}
I compiled and run again on cygwin64 (on Windows 10):
$ g++ -std=c++17 -O2 test-single-threading-matrix-transpose.cc -o test-single-threading-matrix-transpose && ./test-single-threading-matrix-transpose
Heat up...
Test for A[64][64] * B[64][64]...
Measuring 10 runs...
| A * B | A *T B^T
1. | 394 | 366
2. | 394 | 368
3. | 396 | 367
4. | 382 | 368
5. | 392 | 289
6. | 297 | 343
7. | 360 | 341
8. | 399 | 358
9. | 385 | 354
10. | 406 | 374
Average Values:
| 380.5 | 352.8
Ratio:
| 1.000 | 0.927
Test for A[128][128] * B[128][128]...
Measuring 10 runs...
| A * B | A *T B^T
1. | 2972 | 2558
2. | 3317 | 2556
3. | 3279 | 2689
4. | 2952 | 2213
5. | 2745 | 2668
6. | 2981 | 2457
7. | 2164 | 2274
8. | 2634 | 2106
9. | 2126 | 2389
10. | 3015 | 2477
Average Values:
| 2818.5 | 2438.7
Ratio:
| 1.000 | 0.865
Test for A[256][256] * B[256][256]...
Measuring 10 runs...
| A * B | A *T B^T
1. | 31312 | 17656
2. | 29249 | 17127
3. | 32127 | 16865
4. | 29655 | 17287
5. | 32137 | 17687
6. | 29788 | 16732
7. | 32251 | 16549
8. | 32272 | 16257
9. | 28019 | 18042
10. | 30334 | 17936
Average Values:
| 30714.4 | 17213.8
Ratio:
| 1.000 | 0.560
Test for A[512][512] * B[512][512]...
Measuring 10 runs...
| A * B | A *T B^T
1. | 322005 | 135102
2. | 310180 | 134897
3. | 329994 | 134304
4. | 335375 | 137701
5. | 330754 | 134252
6. | 353761 | 136732
7. | 359234 | 135632
8. | 351498 | 134389
9. | 360754 | 135751
10. | 368602 | 137139
Average Values:
| 342215.7 | 135589.9
Ratio:
| 1.000 | 0.396
Impressive, isn’t it?
It achieves similar speed-ups like the above multi-threading attempts (I mean the better ones) but with a single core.
The additional effort for transposing the 2nd matrix (which is considered in measurement) does more than amortize. This isn’t that surprising because there a so many more read-accesses in multiplication (which now access consecutive bytes) compared to the additional effort to construct/write the transposed matrix once.