Module pdpipe.util
Utility methods for pdpipe.
Expand source code
"""Utility methods for pdpipe."""
import numpy as np
import pandas as pd
def out_of_place_col_insert(df, series, loc, column_name=None):
    """Return a new dataframe with the given column inserted at a location.

    The input dataframe is not modified. If a column with the resolved name
    already exists, its values are replaced by the given series and the
    column is moved to the requested location.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe into which to insert the column.
    series : pandas.Series
        The pandas series to be inserted.
    loc : int
        The location into which to insert the new column.
    column_name : hashable, default None
        The name to assign the new column. If None, the given series name
        attribute is attempted; if the given series is missing the name
        attribute a ValueError exception will be raised. Non-string labels
        (e.g. integers) are supported.

    Returns
    -------
    pandas.DataFrame
        The resulting dataframe.

    Raises
    ------
    ValueError
        If column_name is None and the series has no name.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> df = pd.DataFrame([[1, 'a'], [4, 'b']], columns=['a', 'g'])
    >>> ser = pd.Series([7, 5])
    >>> out_of_place_col_insert(df, ser, 1, 'n')
       a  n  g
    0  1  7  a
    1  4  5  b
    """
    if column_name is None:
        if series.name is None:
            raise ValueError("A column name must be supplied if the given "
                             "series is missing the name attribute.")
        column_name = series.name
    # Copy + item assignment instead of ``df.assign(**{column_name: series})``:
    # keyword expansion only accepts string keys, so a non-string column
    # label (such as an integer) would raise a TypeError.
    inter_df = df.copy()
    inter_df[column_name] = series
    # The new column was appended last; move its label to the wanted slot.
    cols = list(inter_df.columns)
    cols.insert(loc, cols.pop(cols.index(column_name)))
    return inter_df.loc[:, cols]
def get_numeric_column_names(df):
    """Return the names of all columns of numeric type.

    Columns backed by NumPy dtypes are checked with ``np.issubdtype``
    against ``np.number``. Columns backed by pandas extension dtypes
    (e.g. categorical or nullable integer columns) are checked with the
    pandas type-introspection helpers, with boolean dtypes excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to get numeric column names for.

    Returns
    -------
    list of str
        The names of all columns of numeric type.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> data = [[2, 3.2, "acd"], [1, 7.2, "alk"], [8, 12.1, "alk"]]
    >>> df = pd.DataFrame(data, [1,2,3], ["rank", "ph","lbl"])
    >>> sorted(get_numeric_column_names(df))
    ['ph', 'rank']
    """
    num_cols = []
    for colbl, dtype in df.dtypes.to_dict().items():
        if isinstance(dtype, np.dtype):
            is_numeric = np.issubdtype(dtype, np.number)
        else:
            # np.issubdtype cannot interpret pandas extension dtypes and
            # raises a TypeError for them, so use the pandas helpers here.
            is_numeric = (
                pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype)
            )
        if is_numeric:
            num_cols.append(colbl)
    return num_cols
def per_column_values_sklearn_transform(df: pd.DataFrame, transform: callable):
    """Apply a 2d-array sklearn transform to 1d values arrays of a dataframe.

    Each column's values are reshaped into a single-column 2d array, passed
    to the transform on their own, and the first column of each result is
    collected back into a dataframe with the original index and columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to transform.
    transform : callable
        The numpy.array-ingesting per-column transform to apply.

    Returns
    -------
    res_df : pandas.DataFrame
        The transformed dataframe.
    """
    return pd.DataFrame(
        data=np.array([
            # Wrap-and-transpose turns the 1d values into an (n, 1) array.
            transform(np.array([series.values]).T)[:, 0]
            # DataFrame.iteritems() was removed in pandas 2.0; items() is
            # the equivalent (label, Series) iterator.
            for _, series in df.items()
        ]).T,
        index=df.index,
        columns=df.columns,
    )
Functions
def get_numeric_column_names(df)
-
Return the names of all columns of numeric type.
Parameters
df
:pandas.DataFrame
- The dataframe to get numeric column names for.
Returns
list of str
- The names of all columns of numeric type.
Example
>>> import pandas as pd; import pdpipe as pdp;
>>> data = [[2, 3.2, "acd"], [1, 7.2, "alk"], [8, 12.1, "alk"]]
>>> df = pd.DataFrame(data, [1,2,3], ["rank", "ph","lbl"])
>>> sorted(get_numeric_column_names(df))
['ph', 'rank']
Expand source code
def get_numeric_column_names(df):
    """Return the names of all columns of numeric type.

    Columns backed by NumPy dtypes are checked with ``np.issubdtype``
    against ``np.number``. Columns backed by pandas extension dtypes
    (e.g. categorical or nullable integer columns) are checked with the
    pandas type-introspection helpers, with boolean dtypes excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to get numeric column names for.

    Returns
    -------
    list of str
        The names of all columns of numeric type.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> data = [[2, 3.2, "acd"], [1, 7.2, "alk"], [8, 12.1, "alk"]]
    >>> df = pd.DataFrame(data, [1,2,3], ["rank", "ph","lbl"])
    >>> sorted(get_numeric_column_names(df))
    ['ph', 'rank']
    """
    num_cols = []
    for colbl, dtype in df.dtypes.to_dict().items():
        if isinstance(dtype, np.dtype):
            is_numeric = np.issubdtype(dtype, np.number)
        else:
            # np.issubdtype cannot interpret pandas extension dtypes and
            # raises a TypeError for them, so use the pandas helpers here.
            is_numeric = (
                pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype)
            )
        if is_numeric:
            num_cols.append(colbl)
    return num_cols
def out_of_place_col_insert(df, series, loc, column_name=None)
-
Returns a new dataframe with given column inserted at given location.
Parameters
df
:pandas.DataFrame
- The dataframe into which to insert the column.
series
:pandas.Series
- The pandas series to be inserted.
loc
:int
- The location into which to insert the new column.
column_name
:str, default None
- The name to assign the new column. If None, the given series name attribute is attempted; if the given series is missing the name attribute a ValueError exception will be raised.
Returns
pandas.DataFrame
- The resulting dataframe.
Example
>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[1, 'a'], [4, 'b']], columns=['a', 'g'])
>>> ser = pd.Series([7, 5])
>>> out_of_place_col_insert(df, ser, 1, 'n')
   a  n  g
0  1  7  a
1  4  5  b
Expand source code
def out_of_place_col_insert(df, series, loc, column_name=None):
    """Return a new dataframe with the given column inserted at a location.

    The input dataframe is not modified. If a column with the resolved name
    already exists, its values are replaced by the given series and the
    column is moved to the requested location.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe into which to insert the column.
    series : pandas.Series
        The pandas series to be inserted.
    loc : int
        The location into which to insert the new column.
    column_name : hashable, default None
        The name to assign the new column. If None, the given series name
        attribute is attempted; if the given series is missing the name
        attribute a ValueError exception will be raised. Non-string labels
        (e.g. integers) are supported.

    Returns
    -------
    pandas.DataFrame
        The resulting dataframe.

    Raises
    ------
    ValueError
        If column_name is None and the series has no name.

    Example
    -------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> df = pd.DataFrame([[1, 'a'], [4, 'b']], columns=['a', 'g'])
    >>> ser = pd.Series([7, 5])
    >>> out_of_place_col_insert(df, ser, 1, 'n')
       a  n  g
    0  1  7  a
    1  4  5  b
    """
    if column_name is None:
        if series.name is None:
            raise ValueError("A column name must be supplied if the given "
                             "series is missing the name attribute.")
        column_name = series.name
    # Copy + item assignment instead of ``df.assign(**{column_name: series})``:
    # keyword expansion only accepts string keys, so a non-string column
    # label (such as an integer) would raise a TypeError.
    inter_df = df.copy()
    inter_df[column_name] = series
    # The new column was appended last; move its label to the wanted slot.
    cols = list(inter_df.columns)
    cols.insert(loc, cols.pop(cols.index(column_name)))
    return inter_df.loc[:, cols]
def per_column_values_sklearn_transform(df: pandas.core.frame.DataFrame, transform: callable)
-
Applies a 2d-array sklearn transform to 1d values arrays of a dataframe.
Parameters
df
:pandas.DataFrame
- The dataframe to transform.
transform
:callable
- The numpy.array-ingesting per-column transform to apply.
Returns
res_df
:pandas.DataFrame
- The transformed dataframe.
Expand source code
def per_column_values_sklearn_transform(df: pd.DataFrame, transform: callable):
    """Apply a 2d-array sklearn transform to 1d values arrays of a dataframe.

    Each column's values are reshaped into a single-column 2d array, passed
    to the transform on their own, and the first column of each result is
    collected back into a dataframe with the original index and columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to transform.
    transform : callable
        The numpy.array-ingesting per-column transform to apply.

    Returns
    -------
    res_df : pandas.DataFrame
        The transformed dataframe.
    """
    return pd.DataFrame(
        data=np.array([
            # Wrap-and-transpose turns the 1d values into an (n, 1) array.
            transform(np.array([series.values]).T)[:, 0]
            # DataFrame.iteritems() was removed in pandas 2.0; items() is
            # the equivalent (label, Series) iterator.
            for _, series in df.items()
        ]).T,
        index=df.index,
        columns=df.columns,
    )