Source code for learning_machine.engine.category_encoder
import sklearn.preprocessing as skp
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from .engine import DataEngine
from learning_machine.zoo import DATA_ENGINE_ZOO
[docs]
@DATA_ENGINE_ZOO.regist()
class OneHotEncoder(DataEngine):
    """One-hot encode selected columns with scikit-learn's ``OneHotEncoder``.

    The wrapped encoder is fitted on the first call and only applied
    (``transform``) on every later call. Output columns are named
    ``{prefix}_{category}``, one per learned category value.
    """

    def __init__(self, cols: list[str], prefix: str = "onehot", sparse_output=False):
        """Configure the encoder; nothing is fitted yet.

        Args:
            cols (list[str]): names of the columns to encode.
            prefix (str, optional): prefix prepended to every output column
                name. Defaults to "onehot".
            sparse_output (bool, optional): forwarded unchanged to
                ``sklearn.preprocessing.OneHotEncoder``. Defaults to False.
        """
        self.is_fit = False
        self.enc = skp.OneHotEncoder(sparse_output=sparse_output)
        self.prefix = prefix
        self.cols = cols

    def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
        """Encode ``data[self.cols]`` and return the indicator columns.

        Args:
            data (pd.DataFrame): frame containing every column in ``self.cols``.

        Returns:
            pd.DataFrame: encoded values, indexed like ``data``.
        """
        values = data[self.cols].to_numpy()
        if self.is_fit:
            encoded = self.enc.transform(values)  # type: ignore
        else:
            # First call: learn the categories, then remember that we did.
            encoded = self.enc.fit_transform(values)
            self.is_fit = True

        # A CSR result is densified so it can back a regular DataFrame.
        if isinstance(encoded, csr_matrix):
            encoded = encoded.toarray()

        # Names come from the category values of all encoded columns, in
        # order. NOTE(review): a value present in more than one source column
        # yields duplicate output column names.
        names = [
            f"{self.prefix}_{category}"
            for category in np.concatenate(self.enc.categories_)
        ]
        return pd.DataFrame(encoded, columns=names, index=data.index)  # type: ignore
[docs]
@DATA_ENGINE_ZOO.regist()
class LabelEncoder(DataEngine):
    """Label-encode one column with scikit-learn's ``LabelEncoder``.

    The wrapped encoder is fitted on the first call and only applied
    (``transform``) on every later call. The result is a single-column
    frame named ``{prefix}_{col}``.
    """

    def __init__(self, col: str, prefix: str = "label"):
        """Configure the encoder; nothing is fitted yet.

        Args:
            col (str): name of the column to encode.
            prefix (str, optional): prefix of the output column name.
                Defaults to "label".
        """
        self.col = col
        self.prefix = prefix
        self.enc = skp.LabelEncoder()
        self.is_fit = False

    def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
        """Encode ``data[self.col]`` as integer labels.

        Args:
            data (pd.DataFrame): frame containing the column ``self.col``.

        Returns:
            pd.DataFrame: one column ``{prefix}_{col}`` holding the encoded
            labels, indexed like ``data``.
        """
        arr = data[self.col].to_numpy()
        if not self.is_fit:
            # First call: learn the classes, then remember that we did.
            label = self.enc.fit_transform(arr)
            self.is_fit = True
        else:
            label = self.enc.transform(arr)
        # Keep the caller's index so the result lines up row-for-row with
        # ``data`` (and with OneHotEncoder output) when frames are joined or
        # concatenated; without it pandas assigns a fresh RangeIndex.
        return pd.DataFrame({f"{self.prefix}_{self.col}": label}, index=data.index)