Source code for pept.utilities.misc.read_csv

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File   : read_csv.py
# License: GNU v3.0
# Author : Andrei Leonard Nicusan <a.l.nicusan@bham.ac.uk>
# Date   : 14.04.2020


import csv

import pandas as pd




def number_of_lines(filepath_or_buffer):
    '''Return the number of lines (or rows) in a file.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Path to the file.

    Returns
    -------
    int
        The number of lines in the file pointed at by `filepath_or_buffer`.
    '''

    with open(filepath_or_buffer) as f:
        file_lines = sum(1 for line in f)

    return file_lines

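As a quick sanity check before choosing `skiprows` or `nrows`, the helper can be called directly (a minimal usage sketch; "lors.csv" is only an assumed local file name):

>>> from pept.utilities.misc.read_csv import number_of_lines
>>> nlines = number_of_lines("lors.csv")    # hypothetical space-separated data file
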
def read_csv(
    filepath_or_buffer,             # Essential
    skiprows = None,                # Important
    nrows = None,                   # Important
    dtype = float,                  # Medium Importance
    sep = r"\s+",                   # Extra parameters
    header = None,                  # |
    engine = "c",                   # |
    na_filter = False,              # |
    quoting = csv.QUOTE_NONE,       # |
    memory_map = True,              # -
    **kwargs                        # Extra keyword arguments to pandas.read_csv
):
    '''Read a given number of lines from a file and return a numpy array of
    the values.

    This is a convenience function that is simply a proxy to
    `pandas.read_csv`, configured with default parameters for fast reading
    and parsing of usual PEPT data.

    Most importantly, it reads from a **space-separated values** file at
    `filepath_or_buffer`, optionally skipping `skiprows` lines and reading in
    `nrows` lines. It returns a `numpy.ndarray` with `float` values.

    The parameters below are sent to `pandas.read_csv` with no further
    parsing. The descriptions below are taken from the `pandas`
    documentation.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: file://localhost/path/to/table.csv.
        If you want to pass in a path object, pandas accepts any
        `os.PathLike`. By file-like object, we refer to objects with a
        `read()` method, such as a file handler (e.g. via builtin `open`
        function) or `StringIO`.
    skiprows : list-like, int or callable, optional
        Line numbers to skip (0-indexed) or number of lines to skip (int) at
        the start of the file.
    nrows : int, optional
        Number of rows of file to read. Useful for reading pieces of large
        files.
    dtype : type name, default `float`
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
        'c': 'Int64'}.
    sep : str, default `r"\s+"`
        Delimiter to use. Separators longer than 1 character and different
        from '\s+' will be interpreted as regular expressions and will also
        force the use of the Python parsing engine.
    header : int, list of int, "infer", optional
        Row number(s) to use as the column names, and the start of the data.
        By default assume there is no header present (i.e. `header = None`).
    engine : {'c', 'python'}, default "c"
        Parser engine to use. The C engine is faster while the python engine
        is currently more feature-complete.
    na_filter : bool, default `False`
        Detect missing value markers (empty strings and the value of
        na_values). In data without any NAs, passing na_filter=False can
        improve the performance of reading a large file.
    quoting : int or csv.QUOTE_* instance, default `csv.QUOTE_NONE`
        Control field quoting behavior per csv.QUOTE_* constants. Use one of
        QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or
        QUOTE_NONE (3).
    memory_map : bool, default True
        If a filepath is provided for filepath_or_buffer, map the file object
        directly onto memory and access the data directly from there. Using
        this option can improve performance because there is no longer any
        I/O overhead.
    **kwargs : optional
        Extra keyword arguments that will be passed to `pandas.read_csv`.
    '''

    data = pd.read_csv(
        filepath_or_buffer,
        skiprows = skiprows,
        nrows = nrows,
        dtype = dtype,
        sep = sep,
        header = header,
        engine = engine,
        na_filter = na_filter,
        quoting = quoting,
        memory_map = memory_map,
        **kwargs
    )

    return data.to_numpy()

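For instance, reading the first 1000 rows of a local space-separated file into a float numpy array might look like this (a minimal sketch; "lors.csv" is only an assumed file name):

>>> from pept.utilities import read_csv
>>> lors = read_csv("lors.csv", nrows = 1000)   # hypothetical file
>>> lors.shape                                  # (1000, ncols) numpy.ndarray of floats
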
def read_csv_chunks(
    filepath_or_buffer,
    chunksize,
    skiprows = None,
    nrows = None,
    dtype = float,
    sep = r"\s+",
    header = None,
    engine = "c",
    na_filter = False,
    quoting = csv.QUOTE_NONE,
    memory_map = True,
    **kwargs
):
    '''Read chunks of data from a file lazily, returning numpy arrays of the
    values.

    This function returns a generator - an object that can be iterated over
    once, creating data on-demand. This means that chunks of data will be
    read only when being accessed, making it a more efficient alternative to
    `read_csv` for large files (> 1,000,000 lines). A more convenient and
    feature-complete alternative is `pept.utilities.ChunkReader`, which is
    reusable and can access out-of-order chunks using subscript notation
    (i.e. `data[0]`).

    This is a convenience function that is simply a proxy to
    `pandas.read_csv`, configured with default parameters for fast reading
    and parsing of usual PEPT data.

    Most importantly, it lazily reads chunks of size `chunksize` from a
    **space-separated values** file at `filepath_or_buffer`, optionally
    skipping `skiprows` lines and reading in at most `nrows` lines. It yields
    `numpy.ndarray`s with `float` values.

    The parameters below are sent to `pandas.read_csv` with no further
    parsing. The descriptions below are taken from the `pandas`
    documentation.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: file://localhost/path/to/table.csv.
        If you want to pass in a path object, pandas accepts any
        `os.PathLike`. By file-like object, we refer to objects with a
        `read()` method, such as a file handler (e.g. via builtin `open`
        function) or `StringIO`.
    chunksize : int
        Number of lines read in a chunk of data.
    skiprows : list-like, int or callable, optional
        Line numbers to skip (0-indexed) or number of lines to skip (int) at
        the start of the file.
    nrows : int, optional
        Number of rows of file to read. Useful for reading pieces of large
        files.
    dtype : type name, default `float`
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
        'c': 'Int64'}.
    sep : str, default `r"\s+"`
        Delimiter to use. Separators longer than 1 character and different
        from '\s+' will be interpreted as regular expressions and will also
        force the use of the Python parsing engine.
    header : int, list of int, "infer", optional
        Row number(s) to use as the column names, and the start of the data.
        By default assume there is no header present (i.e. `header = None`).
    engine : {'c', 'python'}, default "c"
        Parser engine to use. The C engine is faster while the python engine
        is currently more feature-complete.
    na_filter : bool, default `False`
        Detect missing value markers (empty strings and the value of
        na_values). In data without any NAs, passing na_filter=False can
        improve the performance of reading a large file.
    quoting : int or csv.QUOTE_* instance, default `csv.QUOTE_NONE`
        Control field quoting behavior per csv.QUOTE_* constants. Use one of
        QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or
        QUOTE_NONE (3).
    memory_map : bool, default True
        If a filepath is provided for filepath_or_buffer, map the file object
        directly onto memory and access the data directly from there. Using
        this option can improve performance because there is no longer any
        I/O overhead.
    **kwargs : optional
        Extra keyword arguments that will be passed to `pandas.read_csv`.
    '''

    reader = pd.read_csv(
        filepath_or_buffer,
        chunksize = chunksize,
        skiprows = skiprows,
        nrows = nrows,
        dtype = dtype,
        sep = sep,
        header = header,
        engine = engine,
        na_filter = na_filter,
        quoting = quoting,
        memory_map = memory_map,
        **kwargs
    )

    for chunk in reader:
        yield chunk.values

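Iterating over the returned generator reads one chunk at a time, so memory use stays bounded by `chunksize` (a minimal sketch; "lors.csv" is only an assumed file name):

>>> from pept.utilities.misc.read_csv import read_csv_chunks
>>> for chunk in read_csv_chunks("lors.csv", 10_000):
...     print(chunk.shape)      # at most 10_000 rows per numpy chunk
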
class ChunkReader:
    '''Class for fast, on-demand reading / parsing and iteration over chunks
    of data from CSV files.

    This is an abstraction above `pandas.read_csv` for easy and fast
    iteration over chunks of data from a CSV file. The chunks can be accessed
    using normal iteration (`for chunk in reader: ...`) and subscripting
    (`reader[0]`). The chunks are read lazily, only upon access. It is
    therefore a more efficient alternative to `read_csv` for large files
    (> 1,000,000 lines).

    For convenience, this class configures some default parameters for
    `pandas.read_csv` for fast reading and parsing of usual PEPT data.

    Most importantly, it reads chunks containing `chunksize` lines from a
    **space-separated values** file at `filepath_or_buffer`, optionally
    skipping `skiprows` lines and reading in at most `nrows` lines. It
    returns `numpy.ndarray`s with `float` values.

    Attributes
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be file://localhost/path/to/table.csv.
        If you want to pass in a path object, pandas accepts any
        `os.PathLike`. By file-like object, we refer to objects with a
        `read()` method, such as a file handler (e.g. via builtin `open`
        function) or `StringIO`.
    number_of_chunks : int
        The number of chunks (also returned when using the `len` method),
        taking into account the lines skipped (`skiprows`), the number of
        lines in the file (`file_lines`) and the maximum number of lines to
        be read (`nrows`).
    file_lines : int
        The number of lines in the file pointed at by `filepath_or_buffer`.
    chunksize : int
        The number of lines in a chunk of data.
    skiprows : int
        The number of lines to be skipped at the beginning of the file.
    nrows : int
        The maximum number of lines to be read. Only has an effect if it is
        less than `file_lines` - `skiprows`. For example, if a file has 10
        lines and `skiprows` = 5 and `chunksize` = 5, even if `nrows` were to
        be 20, the `number_of_chunks` would still be 1.

    Raises
    ------
    IndexError
        Upon access to a non-existent chunk using subscript notation (i.e.
        `data[100]` when there are 50 chunks).

    See Also
    --------
    pept.utilities.read_csv : Fast CSV file reading into numpy arrays.
    pept.LineData : Encapsulate LoRs for ease of iteration and plotting.
    pept.PointData : Encapsulate points for ease of iteration and plotting.

    Examples
    --------
    Say "data.csv" contains 1_000_000 lines of data. Read chunks of 10_000
    lines at a time, skipping the first 100_000:

    >>> from pept.utilities import ChunkReader
    >>> chunks = ChunkReader("data.csv", 10_000, skiprows = 100_000)
    >>> len(chunks)             # 90 chunks
    >>> chunks.file_lines       # 1_000_000

    Normal iteration:

    >>> for chunk in chunks:
    >>>     ...                 # neat operations

    Access a single chunk using subscripting:

    >>> chunks[0]               # First chunk
    >>> chunks[-1]              # Last chunk
    >>> chunks[100]             # IndexError
    '''

    def __init__(
        self,
        filepath_or_buffer,
        chunksize,
        skiprows = None,
        nrows = None,
        dtype = float,
        sep = r"\s+",
        header = None,
        engine = "c",
        na_filter = False,
        quoting = csv.QUOTE_NONE,
        memory_map = True,
        **kwargs
    ):
        '''ChunkReader class constructor.

        Parameters
        ----------
        filepath_or_buffer : str, path object or file-like object
            Any valid string path *to a local file* is acceptable. If you
            want to read in lines from an online location (i.e. using a URL),
            you should use `pept.utilities.read_csv`. If you want to pass in
            a path object, pandas accepts any `os.PathLike`. By file-like
            object, we refer to objects with a `read()` method, such as a
            file handler (e.g. via builtin `open` function) or `StringIO`.
        chunksize : int
            Number of lines read in a chunk of data.
        skiprows : list-like, int or callable, optional
            Line numbers to skip (0-indexed) or number of lines to skip (int)
            at the start of the file.
        nrows : int, optional
            Number of rows of file to read. Useful for reading pieces of
            large files.
        dtype : type name, default `float`
            Data type for data or columns. E.g. {'a': np.float64,
            'b': np.int32, 'c': 'Int64'}.
        sep : str, default `r"\s+"`
            Delimiter to use. Separators longer than 1 character and
            different from '\s+' will be interpreted as regular expressions
            and will also force the use of the Python parsing engine.
        header : int, list of int, "infer", optional
            Row number(s) to use as the column names, and the start of the
            data. By default assume there is no header present (i.e.
            `header = None`).
        engine : {'c', 'python'}, default "c"
            Parser engine to use. The C engine is faster while the python
            engine is currently more feature-complete.
        na_filter : bool, default `False`
            Detect missing value markers (empty strings and the value of
            na_values). In data without any NAs, passing na_filter=False can
            improve the performance of reading a large file.
        quoting : int or csv.QUOTE_* instance, default `csv.QUOTE_NONE`
            Control field quoting behavior per csv.QUOTE_* constants. Use one
            of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or
            QUOTE_NONE (3).
        memory_map : bool, default True
            If a filepath is provided for filepath_or_buffer, map the file
            object directly onto memory and access the data directly from
            there. Using this option can improve performance because there is
            no longer any I/O overhead.
        **kwargs : optional
            Extra keyword arguments that will be passed to `pandas.read_csv`.

        Raises
        ------
        EOFError
            If `skiprows` >= `file_lines`.
        '''

        self.filepath_or_buffer = filepath_or_buffer
        self._chunksize = chunksize
        self._file_lines = number_of_lines(filepath_or_buffer)

        if skiprows is None:
            self._skiprows = 0
        elif skiprows >= self.file_lines:
            raise EOFError((
                f"\n[ERROR]: Tried to skip `skiprows` = {skiprows} lines "
                f"when there are `file_lines` = {self._file_lines} lines in "
                "the data file.\n"
            ))
        else:
            self._skiprows = skiprows

        # If undefined, set `nrows` to the maximum number of lines that can
        # be read from the file; that is `file_lines` - `skiprows`
        if nrows is None:
            self._nrows = self._file_lines - self._skiprows
        else:
            self._nrows = nrows

        self.dtype = dtype
        self.sep = sep
        self.header = header
        self.engine = engine
        self.na_filter = na_filter
        self.quoting = quoting
        self.memory_map = memory_map
        self.kwargs = kwargs

        # The number of chunks is (the number of lines that can be read from
        # the file OR the set number of rows to be read, whichever's smaller)
        # divided by the chunksize.
        self._number_of_chunks = int(
            min(self._nrows, self._file_lines - self._skiprows) /
            self._chunksize
        )
        self._index = 0

    @property
    def number_of_chunks(self):
        return self._number_of_chunks

    @property
    def file_lines(self):
        return self._file_lines

    @property
    def chunksize(self):
        return self._chunksize

    @chunksize.setter
    def chunksize(self, new_chunksize):
        self._chunksize = new_chunksize

        # Recalculate the number of chunks and reset the inner index
        self._number_of_chunks = int(
            min(self._nrows, self._file_lines - self._skiprows) /
            self._chunksize
        )
        self._index = 0

    @property
    def skiprows(self):
        return self._skiprows

    @skiprows.setter
    def skiprows(self, new_skiprows):
        if new_skiprows is None:
            self._skiprows = 0
        elif new_skiprows >= self._file_lines:
            raise EOFError((
                f"\n[ERROR]: Tried to skip `skiprows` = {new_skiprows} lines "
                f"when there are `file_lines` = {self._file_lines} lines in "
                "the data file.\n"
            ))
        else:
            self._skiprows = new_skiprows

        # Recalculate the number of chunks and reset the inner index
        self._number_of_chunks = int(
            min(self._nrows, self._file_lines - self._skiprows) /
            self._chunksize
        )
        self._index = 0

    @property
    def nrows(self):
        return self._nrows

    @nrows.setter
    def nrows(self, new_nrows):
        if new_nrows is None:
            self._nrows = self._file_lines - self._skiprows
        else:
            self._nrows = new_nrows

        # Recalculate the number of chunks and reset the inner index
        self._number_of_chunks = int(
            min(self._nrows, self._file_lines - self._skiprows) /
            self._chunksize
        )
        self._index = 0

    def __len__(self):
        return self._number_of_chunks

    def __iter__(self):
        return self

    def __next__(self):
        if self._index >= self._number_of_chunks:
            self._index = 0
            raise StopIteration

        data = pd.read_csv(
            self.filepath_or_buffer,
            skiprows = self._skiprows + self._index * self._chunksize,
            nrows = self._chunksize,
            dtype = self.dtype,
            sep = self.sep,
            header = self.header,
            engine = self.engine,
            na_filter = self.na_filter,
            quoting = self.quoting,
            memory_map = self.memory_map,
            **self.kwargs
        )

        self._index = self._index + 1
        return data.to_numpy()

    def __getitem__(self, key):
        if key >= self._number_of_chunks:
            raise IndexError((
                f"\n[ERROR]: Tried to read the data chunk at index {key} "
                f"when there are {self._number_of_chunks} chunks (indexed "
                "from 0).\n"
            ))

        # Allow negative indices
        while key < 0:
            key += self._number_of_chunks

        data = pd.read_csv(
            self.filepath_or_buffer,
            skiprows = self._skiprows + key * self._chunksize,
            nrows = self._chunksize,
            dtype = self.dtype,
            sep = self.sep,
            header = self.header,
            engine = self.engine,
            na_filter = self.na_filter,
            quoting = self.quoting,
            memory_map = self.memory_map,
            **self.kwargs
        )

        return data.to_numpy()

    def __str__(self):
        # Shown when calling print(class)
        docstr = (
            f"filepath_or_buffer = {self.filepath_or_buffer}\n"
            f"file_lines = {self.file_lines}\n\n"
            f"skiprows = {self.skiprows}\n"
            f"nrows = {self.nrows}\n\n"
            f"chunksize = {self.chunksize}\n"
            f"number_of_chunks = {self.number_of_chunks}"
        )

        return docstr

    def __repr__(self):
        # Shown when writing the class on a REPL
        docstr = (
            "Class instance that inherits from `pept.utilities.ChunkReader`.\n"
            f"Type:\n{type(self)}\n\n"
            "Attributes\n"
            "----------\n"
            f"{self.__str__()}\n"
        )

        return docstr

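Because the `chunksize`, `skiprows` and `nrows` setters recompute `number_of_chunks` and reset the internal iteration index, the same reader can be retuned and reused without constructing a new object (a minimal sketch; "data.csv" is only an assumed file name):

>>> from pept.utilities import ChunkReader
>>> chunks = ChunkReader("data.csv", 10_000)
>>> len(chunks)                 # e.g. 100 chunks for a 1_000_000-line file
>>> chunks.chunksize = 50_000   # recomputes number_of_chunks, resets iteration
>>> len(chunks)                 # now 20 chunks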