Source code for litholog.sequence.io

"""
IO classes & functions
"""
import operator
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd

from litholog import Bed, utils


def check_order(df, topcol, basecol, raise_error=True):
    """
    Determine whether the rows of `df` follow an elevation or a depth convention.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the top and base position columns.
    topcol, basecol : str
        Names of the top and base position columns.
    raise_error : bool, optional
        Whether to raise when the rows do not all share a single convention.

    Returns
    -------
    order : str or None
        'elevation' if every top is strictly greater than its base, 'depth' if
        every top is strictly less than its base. None is only returned when
        neither holds and `raise_error` is False.

    Raises
    ------
    AssertionError
        If `basecol` is not a column of `df`.
    ValueError
        If neither convention holds for all rows and `raise_error` is True.
    """
    assert basecol in df.columns, f'`basecol` {basecol} not present in {df.columns}'

    tops, bases = df[topcol], df[basecol]

    if (tops > bases).all():
        return 'elevation'
    if (tops < bases).all():
        return 'depth'
    if raise_error:
        raise ValueError('Dataframe has inconsistent top/base conventions')
    return None
def check_samples(df, depthcol, valuecol):
    """
    Check that `depthcol` and `valuecol` hold the same number of entries in every row.

    Entry counts are whatever `utils.safelen` reports for each cell.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing both columns.
    depthcol, valuecol : str
        Names of the sample-depth and sample-value columns.

    Returns
    -------
    good : bool
        True if sizes match in all rows, False otherwise.
    """
    n_depths, n_values = (
        df[col].apply(utils.safelen) for col in (depthcol, valuecol)
    )
    return n_depths.eq(n_values).all()
def check_thicknesses(df, topcol, thickcol, order, basecol='bases', tol=1e-3):
    """
    Derive base positions from tops + thicknesses and check they meet the next top.

    The implied base of each row is `top - thickness` for 'elevation' order and
    `top + thickness` for 'depth' order. Rows are used in their current order, so
    `df` should already be sorted consistently with `order`.

    Parameters
    ----------
    df : pd.DataFrame
        Table with `topcol` and `thickcol`. It is modified in place: the implied
        bases are written to `basecol`.
    topcol, thickcol : str
        Names of the top position and thickness columns.
    order : {'elevation', 'depth'}
        Position convention used to compute the bases.
    basecol : str, optional
        Name of the column that receives the implied bases.
    tol : float, optional
        Per-row gap allowance.

    Returns
    -------
    (df, good) : (DataFrame, bool)
        `df` has `basecol` set to the implied base positions.
        `good` is True if the summed absolute gap between each implied base and
        the following top is at most `tol` times the number of rows, else False.

    Raises
    ------
    AssertionError
        If `order` is not a valid value or `thickcol` is not a column of `df`.
    """
    assert order in {'elevation', 'depth'}, f'{order} not a valid `order`'
    assert thickcol in df.columns, f'{thickcol} not in {df.columns}'

    # Compare by value: `is` tests object identity and is False for an equal
    # string that happens to be a different object (e.g. built at runtime).
    op = operator.sub if order == 'elevation' else operator.add

    tops = df[topcol]
    bases = op(tops, df[thickcol]).values

    # Mismatch between each bed's implied base and the next bed's top.
    # Computed before writing `basecol`, in case it aliases `topcol`.
    gap = np.abs(bases[:-1] - tops.values[1:]).sum()

    df.loc[:, basecol] = bases

    within_tolerance = bool(gap <= tol * bases.size)
    return df, within_tolerance
def preprocess_dataframe(df, topcol, basecol=None, thickcol=None, tol=1e-3):
    """
    Check for position order + consistency in `df`, return preprocessed DataFrame.

    This doesn't check for all possible inconsistencies, just the most obvious ones.

    Parameters
    ----------
    df : pd.DataFrame
        Table of beds.
    topcol : str
        Name of the top position column. Must be present.
    basecol : str, optional
        Name of the base position column. Takes precedence over `thickcol`
        when both are given.
    thickcol : str, optional
        Name of the thickness column, used when `basecol` is not given.
    tol : float, optional
        Tolerance forwarded to `check_thicknesses`.

    Returns
    -------
    DataFrame
        A copy of `df` sorted by `topcol`: descending for 'elevation' order,
        ascending for 'depth' order. When thicknesses are used, the result also
        carries a 'bases' column with the implied base positions.

    Raises
    ------
    AssertionError
        If `topcol` is missing, or neither `basecol` nor `thickcol` is given.
    ValueError
        From `check_order`, if tops/bases mix conventions.
    UserWarning
        If the thicknesses are consistent with neither ordering.
    """
    assert topcol in df.columns, f'`topcol` {topcol} not present in {df.columns}'
    assert basecol or thickcol, 'Must specify either `basecol` or `thickcol`'

    if basecol:
        order = check_order(df, topcol, basecol)
        # Value comparison (`==`), not identity (`is`), on the returned string.
        return df.sort_values(topcol, ascending=(order != 'elevation'))

    # No bases given: try elevation ordering first, then depth ordering, and
    # accept the first one whose implied bases line up with the next tops.
    for order, ascending in (('elevation', False), ('depth', True)):
        candidate, good = check_thicknesses(
            df.sort_values(topcol, ascending=ascending),
            topcol, thickcol, order, basecol='bases', tol=tol)
        if good:
            return candidate

    print('Problem with `df`:\n', df)
    raise UserWarning('Check that thicknesses are consistent!')
class SequenceIOMixin(ABC):
    """
    Defines the IO interface for `BedSequence`.
    """

    @classmethod
    def from_dataframe(cls, df,
                       topcol='tops',
                       basecol=None,
                       thickcol=None,
                       component_map=None,
                       datacols=None,
                       metacols=None,
                       metasafe=True,
                       tol=1e-3):
        """
        Create an instance from a pd.DataFrame or subclass (e.g., a GroupBy object).

        Must provide `topcol` and one of `basecol` or `thickcol`.

        Parameters
        ----------
        df : pd.DataFrame or subclass
            Table from which to create `list_of_Beds`.
        topcol : str
            Name of top depth/elevation column. Must be present. Default='tops'.
        basecol, thickcol: str
            Either provide a base depth/elevation column, or a thickness column.
            Must provide at least one. When only `thickcol` is given, bases are
            read from the 'bases' column produced during preprocessing.
        component_map : tuple(str, func), optional
            `(column, func)` pair; `func` is applied to each row's value in
            `column` and the result is passed to `Bed` as its single component.
            TODO: if `func` is a str with 'wentworth', maybe just map using grainsize bins?
        datacols : list(str), optional
            Columns to use as `Bed` data. Should reference numeric columns only.
            Defaults to no columns.
        metacols : list(str), optional
            Columns to read into `metadata` dict attribute. Defaults to no columns.
        metasafe : bool, optional
            If True, enforces that df[metacols] have a single unique value per column.
            NOTE(review): the first unique value is stored in either case; attaching
            all unique values when False is not implemented here -- confirm intent.
        tol : float, optional
            Tolerance forwarded to `preprocess_dataframe`.

        Returns
        -------
        An instance of `cls` built as `cls(list_of_Beds, metadata=metadata)`.

        Raises
        ------
        AssertionError
            If any of `datacols`/`metacols` is missing from `df`, or `metasafe`
            is True and a meta column has more than one unique value.
        Exception
            Anything raised by `preprocess_dataframe` is re-raised after the
            offending table has been printed.
        """
        # `None` defaults avoid sharing one mutable list object across calls.
        datacols = [] if datacols is None else datacols
        metacols = [] if metacols is None else metacols

        # Check for data/meta column presence
        missing_data_cols = [c for c in datacols if c not in df.columns]
        assert not missing_data_cols, f'datacols {missing_data_cols} not present in `df`'

        missing_meta_cols = [c for c in metacols if c not in df.columns]
        assert not missing_meta_cols, f'metacols {missing_meta_cols} not present in `df`'

        # Preprocess the data
        try:
            df = preprocess_dataframe(df, topcol, basecol=basecol, thickcol=thickcol, tol=tol)
        except Exception:
            print('Problem with DataFrame:\n', df)
            raise  # bare raise keeps the original traceback intact

        # Preprocessing writes implied bases to 'bases' when only thicknesses were given.
        basecol = basecol or 'bases'

        metadata = {}
        for metacol in metacols:
            meta_values = df[metacol].unique()
            if metasafe:
                assert len(meta_values) == 1, f'`metacol` {metacol} has more than one unique value: {meta_values}'
            metadata[metacol] = meta_values[0]

        if component_map:
            field, field_fn = component_map

        list_of_Beds = []
        for _, row in df.iterrows():
            bed_kwargs = {}
            if component_map:
                bed_kwargs['components'] = [field_fn(row[field])]
            list_of_Beds.append(
                Bed(row[topcol], row[basecol], row[datacols], **bed_kwargs)
            )

        return cls(list_of_Beds, metadata=metadata)

    # TODO: def to_dataframe(self):

    @classmethod
    def from_numpy(cls, arr, other=None, keys=None, split_key=None, component_map=None):
        """
        TODO: Implement a method to convert numpy (e.g., from GAN) to `BedSequence` instance.

        Use keys from `other`, or provide list of `keys`.

        Provide a `component_map` to group samples into `Bed`s?

        Currently a stub: does nothing and returns None.
        """
        return None