Module pdpipe.df
Handles for dynamic dataframe-method-wrapping pipeline stages.
All pandas.DataFrame methods can be used as pipeline stages using this module.
For example pdp.df.dropna(axis=1)
will return a pdpipe.PdPipelineStage object
that will call the dropna
method of input DataFrames with the axis=1
keyword argument provided and return the resulting dataframe object (practically
dropping any column with a missing value from the input dataframe).
These stages combine naturally into pdpipe
pipelines:
>>> import pdpipe as pdp;
>>> pipeline = pdp.PdPipeline([
... pdp.df.set_axis(labels='datetime'),
... pdp.ColDrop('age'),
... ])
There are a couple of caveats:
- pdpipe pipeline stages never alter input dataframes, so the
inplace
keyword argument is always ignored, even if provided.
- All method parameters are fixed at pipeline stage creation time, and must be explicitly provided as keyword arguments, and not as positional ones.
Expand source code
"""Handles for dynamic dataframe-method-wrapping pipeline stages.
All pandas.DataFrame methods can be used as pipeline stages using this module.
For example `pdp.df.dropna(axis=1)` will return a pdpipe.PdPipelineStage object
that will call the `dropna` method of input DataFrames with the `axis=1`
keyword argument provided and return the resulting dataframe object (practically
dropping any column with a missing value from the input dataframe).
These stages combine naturally into `pdpipe` pipelines:
>>> import pdpipe as pdp;
>>> pipeline = pdp.PdPipeline([
... pdp.df.set_axis(labels='datetime'),
... pdp.ColDrop('age'),
... ])
There are a couple of caveats:
* pdpipe pipeline stages never alter input dataframes, so the `inplace` keyword
argument is always ignored, even if provided.
* All method parameters are fixed on pipeline stage creation time, and must be
explicitly provided as keyword arguments, and not as positional ones.
"""
from typing import Dict
from pandas import DataFrame
from .core import PdPipelineStage
# this_module = __import__(__name__)
class _DataFrameMethodTransformer(PdPipelineStage):
    """A pipeline stage that calls one named method on the input dataframe.

    The method name and its keyword arguments are captured when the stage is
    built; the method is looked up on each dataframe handed to `_transform`.

    Parameters
    ----------
    method_name : str
        Name of the attribute to fetch from the input dataframe and call.
    kwargs : Dict[str, object]
        Keyword arguments forwarded to the method on every call. The given
        mapping is copied, not stored. An 'inplace' entry holding None is
        dropped; any other 'inplace' value is replaced with False.
    """

    def __init__(self, method_name: str, kwargs: Dict[str, object]) -> None:
        call_kwargs = kwargs.copy()
        # A caller-supplied 'inplace' is never honoured: remove it and, when
        # it held a non-None value, put it back as False. Re-inserting after
        # the removal moves the key to the end of the dict, which is the
        # order that shows up in the messages formatted below.
        if call_kwargs.pop('inplace', None) is not None:
            call_kwargs['inplace'] = False
        self._method_name = method_name
        self._kwargs = call_kwargs
        failure_message = (
            "Pipeline stage failed while applying method {} with kwargs {}"
        ).format(method_name, call_kwargs)
        description = "Apply dataframe method {} with kwargs {}".format(
            method_name, call_kwargs)
        super().__init__(
            exmsg=failure_message,
            desc=description,
            name=method_name,
        )

    def _prec(self, df: DataFrame) -> bool:  # pragma: no cover
        """Return True unconditionally, whatever dataframe is given."""
        return True

    def _transform(self, df: DataFrame, verbose: bool) -> DataFrame:
        """Call the stored method on `df` with the stored keyword arguments.

        `verbose` is accepted but not used here.
        """
        return getattr(df, self._method_name)(**self._kwargs)
class _DfMethodTransformerHandle(object):
    """Callable that builds a pipeline stage for one dataframe method.

    Parameters
    ----------
    method_name : str
        Name of the dataframe method the produced stages will call.
    doc : str
        Text installed as this handle's own ``__doc__``.
    """

    def __init__(self, method_name: str, doc: str) -> None:
        self.__doc__ = doc
        self._method_name = method_name

    def __call__(self, **kwargs: Dict[str, object]) -> PdPipelineStage:
        """Return a new stage applying the method with the given kwargs.

        Only keyword arguments are accepted; they are handed, as a single
        dict, to `_DataFrameMethodTransformer`.
        """
        stage = _DataFrameMethodTransformer(
            method_name=self._method_name,
            kwargs=kwargs,
        )
        return stage
# Text searched for in docstrings: the line that opens the "Returns" section,
# and the type name expected two lines after it.
__RETURNS = 'Returns'
__DATAFRAME = 'DataFrame'


def _is_dataframe_transform(attr_name: str, attr: object) -> bool:
    """Tell whether an attribute's docstring declares a DataFrame return.

    The check is purely textual: the first docstring line containing
    'Returns' is located, and the line two positions after it is searched
    for 'DataFrame'.

    Parameters
    ----------
    attr_name : str
        The attribute's name; names starting with an underscore are rejected.
    attr : object
        The attribute itself; non-callables are rejected.

    Returns
    -------
    bool
        True only if the name is public, the object is callable, and the
        docstring passes the textual check. A missing docstring, or a
        docstring that ends before the inspected line, yields False.
    """
    is_public = not attr_name.startswith('_')
    if not (is_public and callable(attr)):
        return False
    try:
        doc_lines = attr.__doc__.split('\n')
        for position, line in enumerate(doc_lines):
            if __RETURNS in line:
                # Only the first match is considered. The declared type is
                # read from two lines further down, skipping the line in
                # between.
                return __DATAFRAME in doc_lines[position + 2]
        return False
    except (AttributeError, IndexError):  # pragma: no cover
        return False
# Publish one handle per qualifying DataFrame attribute as a module-level
# name, so that e.g. ``pdp.df.dropna(axis=1)`` builds a pipeline stage.
for attr_name in dir(DataFrame):
    attr = getattr(DataFrame, attr_name)
    if _is_dataframe_transform(attr_name, attr):
        # The handle is stored directly rather than through a temporary
        # module-level variable: such a temporary stays unbound when no
        # attribute qualifies, and deleting it afterwards would then raise
        # NameError while the module is being imported.
        globals()[attr_name] = _DfMethodTransformerHandle(
            method_name=attr_name,
            doc=attr.__doc__,
        )

# Remove the loop variables and the imported names from the module namespace,
# leaving the generated handles and the private definitions above.
del attr
del attr_name
del Dict
del DataFrame
del PdPipelineStage