I am trying to use RcppParallel to compute the distances between the rows of two three-column matrices (each row is a 3D point) and return a new matrix. I saw examples of Parallel Distance Matrix Calculation using parallelFor
, but these calculations come from a single matrix
with a fixed size.
For example, let’s say that I have two matrices amat
and bmat
, the nrow
of these could differ between them, but the ncol
will always be 3
. In R these may look like:
set.seed(10); amat <- matrix(rnorm(9, 2, 0.5), ncol = 3)
set.seed(50); bmat <- matrix(rnorm(9, 2, 0.5), ncol = 3)
Using this example, the expected output is a matrix
with nrow(amat) * nrow(bmat) = 9 rows
and 5 columns (column 1: the row
index of amat
; columns 2:4: the row
values of bmat
; column 5: the Euclidean distance between the two rows). Something like this:
points X Y Z distance
[1,] 0 1.579198 1.136198 1.704544 0.7737024
[2,] 0 2.274835 2.262075 2.180414 1.0006478
[3,] 0 2.016499 1.861068 2.487795 1.1036122
[4,] 1 2.274835 2.262075 2.180414 0.5282677
[5,] 1 2.016499 1.861068 2.487795 0.7362889
[6,] 1 1.579198 1.136198 1.704544 1.0692094
[7,] 2 1.579198 1.136198 1.704544 1.2079720
[8,] 2 2.274835 2.262075 2.180414 1.3836957
[9,] 2 2.016499 1.861068 2.487795 1.5157243
This is the code that I have so far, inspired by the RcppParallel
examples
// [[Rcpp::depends(RcppParallel)]]
#include <Rcpp.h>
#include <RcppParallel.h>
#include <cmath>
#include <cstddef>
#include <limits>
using namespace Rcpp;
using namespace RcppParallel;
struct Mdistance : public Worker { //function object
// input 3D-matrix
const RMatrix<double> amat;
const RMatrix<double> bmat;
// output matrix to write to
RMatrix<double> rmat;
// initialize from Rcpp input and output matrixes
Mdistance(const NumericMatrix amat, const NumericMatrix bmat, NumericMatrix rmat)
: amat(amat), bmat(bmat), rmat(rmat) {}
// function call operator that work for the specified range (begin/end) #Not sure of this part
void operator()(std::size_t begin, std::size_t end) {
for (std::size_t i = 0; i < amat.nrow(); i++) {
for (std::size_t j = 0; j < bmat.nrow(); j++) {
// write to output matrix
rmat((i + (j * amat.nrow())), 0) = i + 1; //Row index of amat
rmat((i + (j * amat.nrow())), 1) = bmat(j, 0); //Value of column 0 of bmat
rmat((i + (j * amat.nrow())), 2) = bmat(j, 1); //Value of column 1 of bmat
rmat((i + (j * amat.nrow())), 3) = bmat(j, 2); //Value of column 2 of bmat
rmat((i + (j * amat.nrow())), 4) = sqrt((pow(bmat(j,0) - amat(i,0), 2.0) + pow(bmat(j, 1) - amat(i, 1), 2.0) + pow(bmat(j, 2) - amat(i, 2), 2.0))); //Euclidean distance between rows
}
}
}
};
// [[Rcpp::export]]
NumericMatrix Mdistance_parallel(NumericMatrix amat, NumericMatrix bmat) {
// allocate the matrix we will return
NumericMatrix rmat((amat.nrow()*bmat.nrow()), 5);
// create the worker
Mdistance Mdistance(amat, bmat, rmat);
// call it with parallelFor
parallelFor(0, (amat.nrow()*bmat.nrow()), MDistance);
return rmat;
}
Any idea how I can get this to work using RcppParallel? Obviously, I am using parallel code because the nrow
of amat
and bmat
tend to be close to 10 million. I was using other routines based on foreach
in R
. However, that takes a long time (> 1 day) and doesn't seem to be stable.
Thanks...
EDIT
Here is my example using just Rcpp
#include <Rcpp.h>
#include <cmath>
#include <algorithm>
using namespace Rcpp;
// Serial reference implementation: for every (amat row, bmat row) pair,
// record the 1-based amat row index, the three bmat coordinates and the
// Euclidean distance between the two rows. The pair (a, b) is stored in
// output row a + b * nrow(amat).
// [[Rcpp::export]]
NumericMatrix rcpp_distance(NumericMatrix amat, NumericMatrix bmat) {
const int n_a = amat.nrow();
const int n_b = bmat.nrow();
// result: one row per pair, five columns
NumericMatrix rmat(n_a * n_b, 5);
for (int a = 0; a < n_a; ++a) {
for (int b = 0; b < n_b; ++b) {
const int out_row = a + (b * n_a);
// squared per-axis differences, summed in x, y, z order
const double sq_x = std::pow(bmat(b, 0) - amat(a, 0), 2.0);
const double sq_y = std::pow(bmat(b, 1) - amat(a, 1), 2.0);
const double sq_z = std::pow(bmat(b, 2) - amat(a, 2), 2.0);
rmat(out_row, 0) = a + 1; // 1-based row index of amat
rmat(out_row, 1) = bmat(b, 0); // bmat column 0
rmat(out_row, 2) = bmat(b, 1); // bmat column 1
rmat(out_row, 3) = bmat(b, 2); // bmat column 2
rmat(out_row, 4) = std::sqrt(sq_x + sq_y + sq_z); // Euclidean distance between rows
}
}
return rmat;
}