vendredi 9 décembre 2016

Closest 2 points using Rabin algorithm not too fast

I was told to use the following Rabin algorithm to find the shortest distance between 2 points in 2-d.

  • Randomly choose \sqrt{n} and brute force to find the closest distance in this set. Let this distance be delta.

  • Make a delta grid on top of all n points, which partitions all n points into different bins/buckets.

  • For each bin k, find the shortest distance between 2 points in this bin. Also, find the shortest distance between points in bin k and points in its 8 neighbor bins (4 should be enough to avoid double counting).

  • The result is the smallest distance we compute.

The expected running time for this algorithm is said to be O(n). However, when I tried to implement it and compared with the divide-and-conquer (DC) algorithm (supposed to be O(n log n)), although the results were the same, it was not as fast as expected.

In particular, testing on my i7-4700mq with MS Visual Studio Community 2015, when n <= 5 000 000, it was only about 0.1 second faster than the DC algorithm. Then when n was larger, the DC algorithm began to overtake it, and when n = 50 000 000, it took the DC algorithm about 70 seconds and this randomizing algorithm about 100 seconds.

Is there anything wrong in my code? Could someone please look at it? Thank you very much.

#include <algorithm>
#include <vector>
#include <climits>
#include <unordered_map>
#include <time.h>
#include <random>

#define PAIR std::pair<double, double> // x y coordinates
#define EPS 1e-8
#define MARGIN 1e-3

// Eucledian distance
double distance(PAIR p, PAIR q) {
    return sqrt((p.first - q.first) * (p.first - q.first) + (p.second - q.second) * (p.second - q.second));
}

// brute force for small data set
double closest_2d_points_slow(std::vector<PAIR> &a, int L, int R, std::vector<PAIR> &answer) {
    if (R - L < 1) return DBL_MAX;

    answer.resize(2);
    answer[0] = a[L];
    answer[1] = a[L + 1];
    double d = distance(answer[0], answer[1]);
    for (int i = L; i < R; ++i)
        for (int j = i + 1; j <= R; ++j) {
            double dd = distance(a[i], a[j]);
            if (dd < d - EPS) {
                d = dd;
                answer[0] = a[i];
                answer[1] = a[j];
            }
        }
    return d;
}

// estimate 'delta' by brute force on sqrt(n) points.
double estimate_delta(std::vector<PAIR> &a) {
    int n = (int)a.size(), k = (int)sqrt(n), k1 = k;
    std::vector<PAIR> sample(k);

    // choose k = sqrt(n) points randomly.
    std::mt19937 rng((unsigned int)time(0));
    for (int i = 0; i < n; ++i) {
        std::uniform_int_distribution<int> gen(0, n - i - 1);
        if (gen(rng) < k1) {
            sample[--k1] = a[i];
        }
    }
    return closest_2d_points_slow(sample, 0, k - 1, std::vector<PAIR>());
}

// find x_min, x_max, y_min, y_max
// answer[0] = x_min, answer[1] = x_max
// answer[2] = y_min, answer[3] = y_max
void minXY_maxXY(std::vector<PAIR> &a, double * answer) {
    answer[0] = answer[2] = DBL_MAX;
    answer[1] = answer[3] = DBL_MIN;
    int left = 0;
    if (a.size() % 2 > 0) { // odd number of points
        answer[0] = answer[1] = a[0].first;
        answer[2] = answer[3] = a[0].second;
        ++left;
    }
    for (int i = left; i < (int)a.size(); i += 2) {
        // x-coordinates
        if (a[i].first < a[i + 1].first - EPS) {
            if (answer[0] > a[i].first + EPS)
                answer[0] = a[i].first;
            if (answer[1] < a[i + 1].first - EPS)
                answer[1] = a[i + 1].first;
        }
        else {
            if (answer[0] > a[i + 1].first + EPS)
                answer[0] = a[i + 1].first;
            if (answer[1] < a[i].first - EPS)
                answer[1] = a[i].first;
        }
        // y-coordinates
        if (a[i].second < a[i + 1].second - EPS) {
            if (answer[2] > a[i].second + EPS)
                answer[2] = a[i].second;
            if (answer[3] < a[i + 1].second - EPS)
                answer[3] = a[i + 1].second;
        }
        else {
            if (answer[2] > a[i + 1].second + EPS)
                answer[2] = a[i + 1].second;
            if (answer[3] < a[i].second - EPS)
                answer[3] = a[i].second;
        }
    }
    answer[0] -= MARGIN;
    answer[1] += MARGIN;
    answer[2] -= MARGIN;
    answer[3] += MARGIN;
}

// convert xy-coordinate to a number representing a cell in a rows x cols grid.
int sub2ind(PAIR p, int rows, int cols, double x_min, double y_min, double delta) {
    return int(floor((p.second - y_min) / delta)) * cols + int(floor((p.first - x_min) / delta));
}

// convert a cell number to (row, col)
void ind2sub(int bin, int rows, int cols, int &row, int &col) {
    row = bin / cols;
    col = bin % cols;
}

// return the neighbors of a cell
void bin_neighbors(int bin, int rows, int cols, std::vector<int> &neighbors) {
    int row, col;
    ind2sub(bin, rows, cols, row, col);
    neighbors.clear();
    if (row == 0) {
        neighbors.push_back(bin + cols);
        if (col > 0)
            neighbors.push_back(bin + cols - 1);
        if (col < cols - 1) {
            neighbors.push_back(bin + 1);
            neighbors.push_back(bin + cols + 1);
        }
        return;
    }
    if (row == rows - 1) {
        if (col < cols - 1)
            neighbors.push_back(bin + 1);
        return;
    }
    neighbors.push_back(bin + cols);
    if (col > 0)
        neighbors.push_back(bin + cols - 1);
    if (col < cols - 1) {
        neighbors.push_back(bin + 1);
        neighbors.push_back(bin + cols + 1);
    }
}

// Main function to compute the cloest 2d distance between 2 points using Rabin randomized algorithm.
double closest_2d_points_rabin_randomized(std::vector<PAIR> &a, std::vector<PAIR> &answer) {
    if (a.size() < 25)
        return closest_2d_points_slow(a, 0, (int)a.size() - 1, answer);

    // compute estimated delta and use it as a grid.
    answer.resize(2);
    double delta = estimate_delta(a);
    double mm[4];
    minXY_maxXY(a, mm);

    // compute the number of rows and cols in a grid.
    int rows = (int)ceil((mm[3] - mm[2]) / delta);
    int cols = (int)ceil((mm[1] - mm[0]) / delta);

    // project each point into cells according to its coordinate and delta.
    std::unordered_map<int, std::vector<PAIR>> point2bin;
    point2bin.reserve(a.size());

    std::vector<int> bins;
    for (PAIR p : a) {
        int bin = sub2ind(p, rows, cols, mm[0], mm[2], delta);
        if (point2bin.find(bin) == point2bin.end()) {
            bins.push_back(bin);
            std::vector<PAIR> list(1, p);
            point2bin.insert(std::pair<int, std::vector<PAIR>>(bin, list));
        }
        else
            point2bin[bin].push_back(p);
    }

    // look at each cell and its 4 adjacent cells.
    double d = DBL_MAX, dd = DBL_MAX;
    answer.resize(2);
    for (int bin : bins) {
        std::vector<PAIR> pts = point2bin[bin];
        int bin_size = (int)pts.size();
        std::vector<int> neighbors;
        bin_neighbors(bin, rows, cols, neighbors);
        for (int i = 0; i < bin_size; ++i) {
            // points in one cell
            for (int j = i + 1; j < bin_size; ++j) {
                dd = distance(pts[i], pts[j]);
                if (dd < d - EPS) {
                    d = dd;
                    answer[0] = pts[i];
                    answer[1] = pts[j];
                }
            }
            // points in its 4 neighbors
            for (int another_bin : neighbors)
                if (point2bin.find(another_bin) != point2bin.end()) {
                    for (PAIR q : point2bin[another_bin]) {
                        dd = distance(pts[i], q);
                        if (dd < d - EPS) {
                            d = dd;
                            answer[0] = pts[i];
                            answer[1] = q;
                        }
                    }
                }
        }
    }
    return d;
}




Aucun commentaire:

Enregistrer un commentaire