Source code for pept.tracking.peptml.cutpoints

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


#    pept is a Python library that unifies Positron Emission Particle
#    Tracking (PEPT) research, including tracking, simulation, data analysis
#    and visualisation tools.
#
#    If you used this codebase or any software making use of it in a scientific
#    publication, you must cite the following paper:
#        Nicuşan AL, Windows-Yule CR. Positron emission particle tracking
#        using machine learning. Review of Scientific Instruments.
#        2020 Jan 1;91(1):013329.
#        https://doi.org/10.1063/1.5129251
#
#    Copyright (C) 2019-2021 the pept developers
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.


# File   : cutpoints.py
# License: GNU v3.0
# Author : Andrei Leonard Nicusan <a.l.nicusan@bham.ac.uk>
# Date   : 13.04.2020


import numpy as np
import pept


def find_cutpoints(
    sample_lines,
    max_distance,
    cutoffs = None,
    append_indices = False
):
    '''Find the cutpoints from a sample / array of LoRs.

    A cutpoint is the point in 3D space that minimises the distance between any
    two lines. For any two non-parallel 3D lines, this point corresponds to the
    midpoint of the unique segment that is perpendicular to both lines.

    This function considers every pair of lines in `sample_lines` and returns
    all the cutpoints that satisfy the following conditions:

    1. The distance between the two lines is smaller than `max_distance`.
    2. The cutpoint is within the `cutoffs`.

    Parameters
    ----------
    sample_lines : (N, M >= 7) numpy.ndarray
        A sample of LoRs, where each row is `[time, x1, y1, z1, x2, y2, z2]`,
        such that every line is defined by the points `[x1, y1, z1]` and
        `[x2, y2, z2]`.
    max_distance : float
        The maximum distance between any two lines for their cutpoint to be
        considered. A good starting value would be 0.1 mm for small tracers
        and/or clean data, or 0.2 mm for larger tracers and/or noisy data.
    cutoffs : list, optional
        The cutoffs for each dimension, formatted as `[x_min, x_max,
        y_min, y_max, z_min, z_max]`. If it is `None`, they are computed
        automatically by calling `get_cutoffs`. The default is `None`.
    append_indices : bool, optional
        If set to `True`, the indices of the individual LoRs that were used
        to compute each cutpoint are also appended to the returned array.
        Default is `False`.

    Returns
    -------
    cutpoints : (M, 4) or (M, 6) numpy.ndarray
        A numpy array of the calculated cutpoints. If `append_indices` is
        `False`, then the columns are [time, x, y, z]. If `append_indices` is
        `True`, then the columns are [time, x, y, z, i, j], where `i` and `j`
        are the LoR indices from `sample_lines` that were used to compute the
        weighted cutpoints. The time is the average between the timestamps of
        the two LoRs that were used to compute the cutpoint. The first column
        (for time) is sorted.

    Raises
    ------
    ValueError
        If `sample_lines` is not a numpy array with shape (N, M >= 7).
    ValueError
        If `cutoffs` is not a one-dimensional array with values
        `[min_x, max_x, min_y, max_y, min_z, max_z]`

    See Also
    --------
    pept.tracking.peptml.Cutpoints : Compute cutpoints from `pept.LineData`.
    pept.utilities.read_csv : Fast CSV file reading into numpy arrays.
    '''

    if not isinstance(sample_lines, pept.LineData):
        sample_lines = pept.LineData(sample_lines)

    lines = sample_lines.lines

    lines = np.asarray(lines, order = 'C', dtype = float)
    max_distance = float(max_distance)

    # If cutoffs were not defined, automatically compute them
    if cutoffs is None:
        cutoffs = get_cutoffs(lines)
    else:
        cutoffs = np.asarray(cutoffs, order = 'C', dtype = float)
        if cutoffs.ndim != 1 or len(cutoffs) != 6:
            raise ValueError((
                "\n[ERROR]: cutoffs should be a one-dimensional array with "
                "values [min_x, max_x, min_y, max_y, min_z, max_z]. Received "
                f"{cutoffs}.\n"
            ))

    sample_cutpoints = pept.utilities.find_cutpoints(
        lines,
        max_distance,
        cutoffs,
        append_indices = append_indices
    )

    columns = ["t", "x", "y", "z"]
    if append_indices:
        columns += ["line_index1", "line_index2"]

    points = pept.PointData(sample_cutpoints, columns = columns)

    # Add optional metadata to the points; because they have an underscore,
    # they won't be propagated when new objects are constructed
    points._max_distance = max_distance
    points._cutoffs = cutoffs
    if append_indices:
        points._lines = sample_lines

    return points


def get_cutoffs(sample):
    '''Compute the cutoffs from a sample of LoR data.

    It computes the cutoffs from the minimum and maximum values of the LoRs in
    `sample` in each dimension (e.g. the x-dimension is defined by data in
    columns 1 and 4).

    Parameters
    ----------
    sample : (N, M >= 7) numpy.ndarray
        A sample of LoRs, where each row is `[time, x1, y1, z1, x2, y2, z2]`,
        such that every line is defined by the points `[x1, y1, z1]` and
        `[x2, y2, z2]`.

    Returns
    -------
    cutoffs : (6,) numpy.ndarray
        The computed cutoffs for each dimension, formatted as
        `[x_min, x_max, y_min, y_max, z_min, z_max]`.

    Raises
    ------
    ValueError
        If `sample` is not a numpy array with shape (N, M >= 7).

    See Also
    --------
    pept.tracking.peptml.Cutpoints : Compute cutpoints from `pept.LineData`.
    pept.utilities.read_csv : Fast CSV file reading into numpy arrays.
    '''

    # Check sample has shape (N, M >= 7)
    if sample.ndim != 2 or sample.shape[1] < 7:
        raise ValueError((
            "\n[ERROR]: `sample_lines` should have dimensions (M, N), "
            f" where N >= 7. Received {sample.shape}.\n"
        ))

    # Compute cutoffs for cutpoints as the (min, max) values of the lines
    # Minimum value of the two points that define a line
    min_x = min(sample[:, 1].min(),
                sample[:, 4].min())
    # Maximum value of the two points that define a line
    max_x = max(sample[:, 1].max(),
                sample[:, 4].max())

    # Minimum value of the two points that define a line
    min_y = min(sample[:, 2].min(),
                sample[:, 5].min())
    # Maximum value of the two points that define a line
    max_y = max(sample[:, 2].max(),
                sample[:, 5].max())

    # Minimum value of the two points that define a line
    min_z = min(sample[:, 3].min(),
                sample[:, 6].min())
    # Maximum value of the two points that define a line
    max_z = max(sample[:, 3].max(),
                sample[:, 6].max())

    cutoffs = np.array([min_x, max_x, min_y, max_y, min_z, max_z],
                       dtype = float)
    return cutoffs




[docs]class Cutpoints(pept.base.LineDataFilter):
    '''Transform LoRs (a `pept.LineData` instance) into *cutpoints* (a
    `pept.PointData` instance) for clustering, in parallel.

    Under typical usage, the `Cutpoints` class is initialised with a
    `pept.LineData` instance, automatically calculating the cutpoints from the
    samples of lines. The `Cutpoints` class inherits from `pept.PointData`,
    such that once the cutpoints have been computed, all the methods from the
    parent class `pept.PointData` can be used on them (such as visualisation
    functionality).

    For more control over the operations, `pept.tracking.peptml.find_cutpoints`
    can be used - it receives a generic numpy array of LoRs (one 'sample') and
    returns a numpy array of cutpoints.

    Attributes
    ----------
    max_distance : float
        The maximum distance between any two lines for their cutpoint to be
        considered. A good starting value would be 0.1 mm for small tracers
        and/or clean data, or 0.2 mm for larger tracers and/or noisy data.

    cutoffs : list-like of length 6
        A list (or equivalent) of the cutoff distances for every axis,
        formatted as `[x_min, x_max, y_min, y_max, z_min, z_max]`. Only the
        cutpoints which fall within these cutoff distances are considered. The
        default is None, in which case they are automatically computed using
        `pept.tracking.peptml.get_cutoffs`.

    See Also
    --------
    pept.LineData : Encapsulate LoRs for ease of iteration and plotting.
    pept.tracking.HDBSCAN : Efficient, parallel HDBSCAN-based clustering of
                            (cut)points.
    pept.read_csv : Fast CSV file reading into numpy arrays.

    Examples
    --------
    Compute the cutpoints for a `LineData` instance between lines that are less
    than 0.1 apart:

    >>> line_data = pept.LineData(example_data)
    >>> cutpts = peptml.Cutpoints(0.1).fit(line_data)

    Compute the cutpoints for a single sample:

    >>> sample = line_data[0]
    >>> cutpts_sample = peptml.Cutpoints(0.1).fit_sample(sample)
    '''

[docs]    def __init__(
        self,
        max_distance,
        cutoffs = None,
        append_indices = False,
    ):
        '''Cutpoints class constructor.

        Parameters
        ----------
        max_distance : float
            The maximum distance between any two lines for their cutpoint to be
            considered. A good starting value would be 0.1 mm for small tracers
            and/or clean data, or 0.5 mm for larger tracers and/or noisy data.

        cutoffs : list-like of length 6, optional
            A list (or equivalent) of the cutoff distances for every axis,
            formatted as `[x_min, x_max, y_min, y_max, z_min, z_max]`. Only the
            cutpoints which fall within these cutoff distances are considered.
            The default is None, in which case they are automatically computed
            using `pept.tracking.peptml.get_cutoffs`.

        append_indices : bool, default False
            If set to `True`, the indices of the individual LoRs that were used
            to compute each cutpoint are also appended to the returned array.

        Raises
        ------
        ValueError
            If `cutoffs` is not a one-dimensional array with values formatted
            as `[min_x, max_x, min_y, max_y, min_z, max_z]`.
        '''

        # Setting class attributes. The ones below call setters which do type
        # checking
        self.cutoffs = cutoffs
        self.append_indices = append_indices
        self.max_distance = max_distance


    @property
    def max_distance(self):
        return self._max_distance


    @max_distance.setter
    def max_distance(self, max_distance):
        self._max_distance = float(max_distance)


    @property
    def cutoffs(self):
        return self._cutoffs


    @cutoffs.setter
    def cutoffs(self, cutoffs):
        if cutoffs is not None:
            cutoffs = np.asarray(cutoffs, order = 'C', dtype = float)
            if cutoffs.ndim != 1 or len(cutoffs) != 6:
                raise ValueError((
                    "\n[ERROR]: cutoffs should be a one-dimensional array "
                    "with values [min_x, max_x, min_y, max_y, min_z, max_z]. "
                    f"Received {cutoffs}.\n"
                ))

            self._cutoffs = cutoffs
        else:
            self._cutoffs = None


    @property
    def append_indices(self):
        return self._append_indices


    @append_indices.setter
    def append_indices(self, append_indices):
        self._append_indices = bool(append_indices)


[docs]    def fit_sample(self, sample_lines):
        if not isinstance(sample_lines, pept.LineData):
            sample_lines = pept.LineData(sample_lines)

        # If cutoffs were not defined, automatically compute them
        if self.cutoffs is not None:
            cutoffs = self.cutoffs
        else:
            cutoffs = get_cutoffs(sample_lines.lines)

        # Only compute cutpoints if there are at least 2 LoRs
        if len(sample_lines.lines) >= 2:
            sample_cutpoints = pept.utilities.find_cutpoints(
                sample_lines.lines,
                self.max_distance,
                cutoffs,
                append_indices = self.append_indices,
            )
        else:
            sample_cutpoints = np.empty((0, 6 if self.append_indices else 4))

        # Column names
        columns = ["t", "x", "y", "z"]
        if self.append_indices:
            columns += ["line_index1", "line_index2"]

        # Encapsulate points in a PointData
        points = pept.PointData(sample_cutpoints, columns = columns)

        # Add optional metadata to the points; because they have an underscore,
        # they won't be propagated when new objects are constructed
        points.attrs["_max_distance"] = self.max_distance
        points.attrs["_cutoffs"] = cutoffs

        # If LoR indices were appended, also include the cutpoints' constituent
        # lines
        if self.append_indices:
            points.attrs["_lines"] = sample_lines

        return points