1 year ago

#355105

test-img

ZVY545

Finding closest pair of points from one array to another

I have two arrays of points v and v1. Need to find the closest pair from array v for each point in array v1. I came out with the three following methods to calculate the indices and distances for each points of v1. Real dataset can have 100s of millions of rows.

from scipy.spatial import KDTree
import numpy as np
import pandas as pd
from scipy.spatial import distance 
from scipy.spatial.distance import cdist
import time


np.random.seed(0)
n = 6
m=15
v = np.random.rand(n, 2) # sluice gate mid points
v1 = np.random.rand(m, 2) #mesh points

# naive python 
tic = time.time()
def closest_node(node, nodes): # distance of a mesh node from sluice gate nodes
    nodes = np.asarray(nodes)
    d = np.sum((nodes - node)**2, axis=1)
    distance=np.sqrt(d).min()
    return [np.argmin(d)+1,distance]

table=[]
for j in range(len(v1)):
    print(j+1, closest_node(v1[j],v)) # comment out for larger arrary
    table.append({'Node':j+1, 'nearest sluice(1-6)':closest_node(v1[j],v)[0],'distance':closest_node(v1[j],v)[1] })
    df1=pd.DataFrame(table)    

toc = time.time()
print(toc-tic)    

# using kdtree module
tic = time.time()

kdtree = KDTree(v)
table=[]
for j in range(len(v1)):
    
 d, i = kdtree.query(v1[j])
 #node number /j+1, nearest sluice(1-6)/i, distance/d
 print("closest point:  ",j+1, i+1, d,v1[j])
 table.append({'Node':j+1, 'nearest sluice(1-6)':i+1,'distance':d})
 df2=pd.DataFrame(table)

toc = time.time()
print(toc-tic) 

# using cdist module
tic = time.time()  
def closest_node(node, nodes): # distance of a mesh node from sluice gate nodes
    d= cdist([node], nodes, 'euclidean').min()
    closest_index = distance.cdist([node], nodes).argmin()
    return closest_index+1, d
table=[] 
for j in range(len(v1)):
    print(j+1, closest_node(v1[j],v))    
    table.append({'Node':j+1, 'nearest sluice(1-6)':closest_node(v1[j],v)[0],'distance':closest_node(v1[j],v)[1] })
    df3=pd.DataFrame(table)   
    
toc = time.time()
print(toc-tic) 
#check if all dfs results are equal
dfs=[df1,df2,df3]
all(x.equals(dfs[0]) for x in dfs)

Please let me know what are other unique ways to solve this problem.

python

euclidean-distance

0 Answers

Your Answer

Accepted video resources