# Source code for litholog.sequence.io

"""
IO classes & functions
"""
import operator
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd

from litholog import Bed, utils


def check_order(df, topcol, basecol, raise_error=True):
    """
    Determine whether every row of `df` uses the same top/base convention.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the top and base positions.
    topcol, basecol : str
        Names of the top and base position columns.
    raise_error : bool, optional
        If True (default), raise when the rows are inconsistent instead of returning None.

    Returns
    -------
    order : str or None
        'elevation' if every top is strictly greater than its base,
        'depth' if every top is strictly less than its base,
        None if neither holds and `raise_error` is False.

    Raises
    ------
    ValueError
        If neither convention holds for all rows and `raise_error` is True.
    """
    assert basecol in df.columns, f'`basecol` {basecol} not present in {df.columns}'

    tops, bases = df[topcol], df[basecol]

    # Elevation convention: tops sit above (numerically greater than) bases.
    if (tops > bases).all():
        return 'elevation'

    # Depth convention: tops are shallower (numerically less than) bases.
    if (tops < bases).all():
        return 'depth'

    if raise_error:
        raise ValueError('Dataframe has inconsistent top/base conventions')

    return None


def check_samples(df, depthcol, valuecol):
    """
    Check that `depthcol` and `valuecol` hold the same number of entries in every row.

    Entry counts are whatever `utils.safelen` reports for each cell.

    Parameters
    ----------
    df : pd.DataFrame
        Table with one row per bed.
    depthcol, valuecol : str
        Names of the columns holding per-bed sample depths and sample values.

    Returns
    -------
    good : bool
        True if sizes match in all rows, False otherwise.
    """
    depth_counts, value_counts = (
        df[col].apply(utils.safelen) for col in (depthcol, valuecol)
    )

    sizes_match = depth_counts == value_counts
    return sizes_match.all()


def check_thicknesses(df, topcol, thickcol, order, basecol='bases', tol=1e-3):
    """
    Check that the gaps between implied bases and the following tops are small.

    Bases are computed from tops and thicknesses (`top - thickness` for
    'elevation' order, `top + thickness` for 'depth' order), then each base is
    compared with the top of the next row.

    Parameters
    ----------
    df : pd.DataFrame
        Table of beds, already sorted in the direction given by `order`.
        It is modified in place: `basecol` is added (or overwritten).
    topcol, thickcol : str
        Names of the top position and thickness columns.
    order : {'elevation', 'depth'}
        Position convention used to derive the bases.
    basecol : str, optional
        Name of the column in which to store the implied bases. Default='bases'.
    tol : float, optional
        Maximum allowed gap per bed.

    Returns
    -------
    (df, good) : (DataFrame, bool)
        `df` has new `basecol` added with implied base positions.
        `good` is True if the summed absolute gap is <= `tol` times the
        number of beds, else False.
    """
    assert order in {'elevation', 'depth'}, f'{order} not a valid `order`'
    assert thickcol in df.columns, f'{thickcol} not in {df.columns}'

    # Compare by value: `is` on strings tests identity and can silently pick
    # the wrong branch for an equal but non-interned string.
    op = operator.sub if order == 'elevation' else operator.add

    bases = op(df[topcol], df[thickcol]).values

    # Total mismatch between each bed's implied base and the next bed's top.
    gap = np.abs(bases[:-1] - df[topcol].values[1:]).sum()

    df.loc[:, basecol] = bases

    within_tolerance = bool(gap <= tol * bases.size)

    return df, within_tolerance


def preprocess_dataframe(df, topcol, basecol=None, thickcol=None, tol=1e-3):
    """
    Check for position order + consistency in `df`, return preprocessed DataFrame.

    This doesn't check for all possible inconsistencies, just the most obvious ones.

    Parameters
    ----------
    df : pd.DataFrame
        Table of beds.
    topcol : str
        Name of the top position column. Must be present in `df`.
    basecol : str, optional
        Name of the base position column. Takes precedence over `thickcol` if both are given.
    thickcol : str, optional
        Name of the thickness column, used to derive a 'bases' column when `basecol` is not given.
    tol : float, optional
        Gap tolerance forwarded to `check_thicknesses`.

    Returns
    -------
    df : pd.DataFrame
        A copy of `df` sorted by `topcol`: descending for elevation order,
        ascending for depth order. When only `thickcol` was given, the result
        also carries a 'bases' column with the implied base positions.

    Raises
    ------
    ValueError
        If `basecol` is given and rows mix top/base conventions (from `check_order`).
    UserWarning
        If only `thickcol` is given and neither ordering yields consistent thicknesses.
    """
    assert topcol in df.columns, f'`topcol` {topcol}  not present in {df.columns}'

    assert basecol or thickcol, 'Must specify either `basecol` or `thickcol`'

    if basecol:
        # Compare by value; `is` on a string tests identity, not equality.
        order = check_order(df, topcol, basecol)
        return df.sort_values(topcol, ascending=(order != 'elevation'))

    # No bases given: try elevation order first, then depth order, and keep
    # the first whose implied bases line up with the following tops.
    for order, ascending in (('elevation', False), ('depth', True)):
        candidate = df.sort_values(topcol, ascending=ascending)
        candidate, good = check_thicknesses(candidate, topcol, thickcol,
                                            order, basecol='bases', tol=tol)
        if good:
            return candidate

    print('Problem with `df`:\n', df)
    raise UserWarning('Check that thicknesses are consistent!')



class SequenceIOMixin(ABC):
    """
    Defines the IO interface for `BedSequence`.
    """

    @classmethod
    def from_dataframe(cls, df,
                       topcol='tops',
                       basecol=None,
                       thickcol=None,
                       component_map=None,
                       datacols=None,
                       metacols=None,
                       metasafe=True,
                       tol=1e-3):
        """
        Create an instance from a pd.DataFrame or subclass.
        Must provide `topcol` and one of `basecol` or `thickcol`.

        Parameters
        ----------
        df : pd.DataFrame or subclass
            Table from which to create `list_of_Beds`.
        topcol : str
            Name of top depth/elevation column. Must be present. Default='tops'.
        basecol, thickcol: str
            Either provide a base depth/elevation column, or a thickness column. Must provide at least one.
            If only `thickcol` is given, bases are read from the 'bases' column added during preprocessing.
        component_map : tuple(str, func), optional
            Column name and a function mapping that column's values to a primary
            `striplog.Component` for individual Beds.
            TODO: if `func` is a str with 'wentworth', maybe just map using grainsize bins?
        datacols : list(str), optional
            Columns to use as `Bed` data. Should reference numeric columns only.
            Defaults to no columns.
        metacols : list(str), optional
            Columns to read into `metadata` dict attribute. Defaults to no columns.
        metasafe : bool, optional
            If True, enforces that df[metacols] have a single unique value per column.
            If False, no uniqueness check is made and the first unique value is stored.
        tol : float, optional
            Gap tolerance passed to `preprocess_dataframe`.

        Returns
        -------
        instance of `cls`
            Built as `cls(list_of_Beds, metadata=metadata)`.

        Raises
        ------
        AssertionError
            If any of `datacols` / `metacols` is missing from `df`, or if `metasafe`
            is True and a metadata column has more than one unique value.
        """
        # Use None sentinels rather than shared mutable default lists.
        datacols = [] if datacols is None else datacols
        metacols = [] if metacols is None else metacols

        # Check for data/meta column presence
        missing_data_cols = [c for c in datacols if c not in df.columns]
        assert not missing_data_cols, f'datacols {missing_data_cols} not present in `df`'

        missing_meta_cols = [c for c in metacols if c not in df.columns]
        assert not missing_meta_cols, f'metacols {missing_meta_cols} not present in `df`'

        # Preprocess the data
        try:
            df = preprocess_dataframe(df, topcol, basecol=basecol, thickcol=thickcol, tol=tol)
        except Exception:
            # Show the offending table, then re-raise with the original traceback.
            print('Problem with DataFrame:\n', df)
            raise

        # Preprocessing stores implied bases under 'bases' when only `thickcol` was given.
        basecol = basecol or 'bases'

        metadata = {}
        for metacol in metacols:
            meta_values = df[metacol].unique()
            if metasafe:
                assert len(meta_values) == 1, f'`metacol` {metacol} has more than one unique value: {meta_values}'
            metadata[metacol] = meta_values[0]

        # The mapping is the same for every row, so unpack it once.
        if component_map:
            field, field_fn = component_map

        list_of_Beds = []
        for _, row in df.iterrows():
            if component_map:
                component = field_fn(row[field])
                bed = Bed(row[topcol], row[basecol], row[datacols], components=[component])
            else:
                bed = Bed(row[topcol], row[basecol], row[datacols])
            list_of_Beds.append(bed)

        return cls(list_of_Beds, metadata=metadata)

    # TODO: implement `to_dataframe(self)`.

    @classmethod
    def from_numpy(cls, arr, other=None, keys=None, split_key=None, component_map=None):
        """
        TODO: Implement a method to convert numpy (e.g., from GAN) to `BedSequence` instance.

        Use keys from `other`, or provide list of `keys`.
        Provide a `component_map` to group samples into `Bed`s?

        Not implemented yet: currently does nothing and returns None.
        """
        pass