import numpy as np
from scipy.spatial import cKDTree, distance
def _ensure_spacing(coord, spacing, p_norm, max_out):
"""Returns a subset of coord where a minimum spacing is guaranteed.
Parameters
----------
coord : ndarray
The coordinates of the considered points.
spacing : float
the maximum allowed spacing between the points.
p_norm : float
Which Minkowski p-norm to use. Should be in the range [1, inf].
A finite large p may cause a ValueError if overflow can occur.
``inf`` corresponds to the Chebyshev distance and 2 to the
Euclidean distance.
max_out: int
If not None, at most the first ``max_out`` candidates are
returned.
Returns
-------
output : ndarray
A subset of coord where a minimum spacing is guaranteed.
"""
# Use KDtree to find the peaks that are too close to each other
tree = cKDTree(coord)
indices = tree.query_ball_point(coord, r=spacing, p=p_norm)
rejected_peaks_indices = set()
naccepted = 0
for idx, candidates in enumerate(indices):
if idx not in rejected_peaks_indices:
# keep current point and the points at exactly spacing from it
candidates.remove(idx)
dist = distance.cdist([coord[idx]],
coord[candidates],
distance.minkowski,
p=p_norm).reshape(-1)
candidates = [c for c, d in zip(candidates, dist)
if d < spacing]
# candidates.remove(keep)
rejected_peaks_indices.update(candidates)
naccepted += 1
if max_out is not None and naccepted >= max_out:
break
# Remove the peaks that are too close to each other
output = np.delete(coord, tuple(rejected_peaks_indices), axis=0)
if max_out is not None:
output = output[:max_out]
return output
def ensure_spacing(coords, spacing=1, p_norm=np.inf, min_split_size=50,
max_out=None, *, max_split_size=2000):
"""Returns a subset of coord where a minimum spacing is guaranteed.
Parameters
----------
coords : array_like
The coordinates of the considered points.
spacing : float
the maximum allowed spacing between the points.
p_norm : float
Which Minkowski p-norm to use. Should be in the range [1, inf].
A finite large p may cause a ValueError if overflow can occur.
``inf`` corresponds to the Chebyshev distance and 2 to the
Euclidean distance.
min_split_size : int
Minimum split size used to process ``coords`` by batch to save
memory. If None, the memory saving strategy is not applied.
max_out : int
If not None, only the first ``max_out`` candidates are returned.
max_split_size : int
Maximum split size used to process ``coords`` by batch to save
memory. This number was decided by profiling with a large number
of points. Too small a number results in too much looping in
Python instead of C, slowing down the process, while too large
a number results in large memory allocations, slowdowns, and,
potentially, in the process being killed -- see gh-6010. See
benchmark results `here
<https://github.com/scikit-image/scikit-image/pull/6035#discussion_r751518691>`_.
Returns
-------
output : array_like
A subset of coord where a minimum spacing is guaranteed.
"""
output = coords
if len(coords):
coords = np.atleast_2d(coords)
if min_split_size is None:
batch_list = [coords]
else:
coord_count = len(coords)
split_idx = [min_split_size]
split_size = min_split_size
while coord_count - split_idx[-1] > max_split_size:
split_size *= 2
split_idx.append(split_idx[-1] + min(split_size,
max_split_size))
batch_list = np.array_split(coords, split_idx)
output = np.zeros((0, coords.shape[1]), dtype=coords.dtype)
for batch in batch_list:
output = _ensure_spacing(np.vstack([output, batch]),
spacing, p_norm, max_out)
if max_out is not None and len(output) >= max_out:
break
return output