1 year ago

#370880

test-img

Lamett

Closest pair algorithm - Optimizing my code for huge sets of points

I'm trying to create an algorithm that finds the closest pair of points (2D) within a set of points. I'm using a divide and conquer method which is explained here Closest Pair of Points Algorithm.

However, my algorithm is really the best when it comes to huge sets of points, it takes a lot more time than it should.

Any idea on how to optimize my code?

#!/usr/bin/env python3

from timeit import timeit
from sys import argv
from geo.point import Point
from geo.tycat import tycat
from geo.segment import Segment

def load_instance(filename):
    """
    loads .mnt file.
    returns list of points.
    """
    with open(filename, "r") as instance_file:
        points = [Point((float(p[0]), float(p[1]))) for p in (l.split(',') for l in instance_file)]

    return points


def brute(points):
    
    d_min = float('inf')
    n = len(points)
    for i in range(n):
        for j in range(i+1, n):
            d = points[i].distance_to(points[j])
            if d < d_min:
                d_min = d
                p1 = points[i]
                p2 = points[j]

    return p1, p2, d_min

def closest_pair(points_x, points_y):
    n = len(points_x)
    if n <= 3:
        if n == 2: return points_x[0], points_x[1], points_x[0].distance_to(points_x[1])
        return brute(points_x)
    mid = n // 2
    points_x_left = points_x[:mid]
    set_x_left = set(points_x_left)
    points_x_right = points_x[mid:]

    midpoint = points_x[mid].coordinates[0]

    points_y_left = []
    points_y_right = []

    for x in points_y:
        if x in set_x_left:
            points_y_left.append(x)
        else:
            points_y_right.append(x)

    (p1, q1, d1) = closest_pair(points_x_left, points_y_left)
    (p2, q2, d2) = closest_pair(points_x_right, points_y_right)


    if d1 <= d2:
        d = d1
        pair_min = (p1, q1)
    else:
        d = d2
        pair_min = (p2, q2)

    (p3, q3, d3) = closest_pair_boundary(points_x, points_y, d, pair_min)

    if d <= d3:
        return pair_min[0], pair_min[1], d
    else:
        return p3, q3, d3


def closest_pair_boundary(points_x, points_y, d, pair_min):
    n = len(points_x)
    midx = points_x[n//2].coordinates[0]

    s_y = [x for x in points_y if midx - d <= x.coordinates[0] <= midx + d]

    d_min = d
    m = len(s_y)
    for i in range (m-1):
        for j in range(i+1, min(i+7, m)):
            p, q = s_y[i], s_y[j]
            dist = p.distance_to(q)
            if dist < d_min:
                pair_min = p, q
                d_min = dist
    return pair_min[0], pair_min[1], d_min

def print_solution(points):
    points_x = sorted(points, key=lambda p: p.coordinates[0])
    points_y = sorted(points, key=lambda p: p.coordinates[1])
    p1, p2, d_min = closest_pair(points_x, points_y)
    print(f'{p1}; {p2}')



def main():
    
    for instance in argv[1:]:
        points = load_instance(instance)
        print_solution(points)


main()

I'm using search in a set instead of a comparison because it proves more effective. Still, my code could do better. Any help? Maybe there are algorithms that are better suited for bigger sets of data? Or maybe some functions I've used don't work well with big sets or big lists. I'm just looking for any insights really.

(I know my algorithm doesn't do well enough because I have an optimal proved time I'm supposed to be as close to as possible and, while for sets of ~10000 points I am, for ~1000000 points I'm completely behind.)

Thanks for reading!

python

optimization

divide-and-conquer

0 Answers

Your Answer

Accepted video resources