Missing Values Handling#

# Define the heatmap to be used and show a visualization of the data

TBD

# Mark every entry of `dfData` that is missing
mInvData = dfData.isna() #<! Logical mask: `True` where the value is invalid

# Render the mask as a heatmap on a dedicated axes
hF, hA = plt.subplots(figsize = FIG_SIZE_DEF)
sns.heatmap(mInvData, ax = hA, square = False) #<! Cells are not forced to be square
hA.set_title('Invalid Data Map')

plt.show()

DataFrame with Missing Values#

import numpy as np
import pandas as pd

# Toy data set: three numeric columns, each with at least one missing entry (np.nan)
data = dict(
    Age=[25, 27, 29, np.nan, 35, 36, np.nan],
    Salary=[50000, 54000, np.nan, 65000, np.nan, 68000, 72000],
    YearsAtCompany=[5, 7, 10, 3, np.nan, 8, 9],
)
# One DataFrame column per key, in the order the keys are listed above
df = pd.DataFrame(data=data)

print(df)
    Age   Salary  YearsAtCompany
0  25.0  50000.0             5.0
1  27.0  54000.0             7.0
2  29.0      NaN            10.0
3   NaN  65000.0             3.0
4  35.0      NaN             NaN
5  36.0  68000.0             8.0
6   NaN  72000.0             9.0

Imputation with SimpleImputer#

from sklearn.impute import SimpleImputer

# Using SimpleImputer to replace missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
# `fit_transform` returns a NumPy array by default, so the labels must be re-attached.
# Passing `index` together with `columns` keeps the imputed rows aligned with `df`
# even when `df` does not use the default 0..n-1 index (e.g. after filtering rows).
df_simple = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

print(df_simple)
    Age   Salary  YearsAtCompany
0  25.0  50000.0             5.0
1  27.0  54000.0             7.0
2  29.0  61800.0            10.0
3  30.4  65000.0             3.0
4  35.0  61800.0             7.0
5  36.0  68000.0             8.0
6  30.4  72000.0             9.0

Imputation with IterativeImputer#

# IterativeImputer is still experimental: this import is needed only for its side effect
# and must run first, since it is what makes `IterativeImputer` importable from `sklearn.impute`.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# Using IterativeImputer which models each feature with missing values as a function of other features
iterative_imputer = IterativeImputer()
# `fit_transform` returns a NumPy array by default, so the labels must be re-attached.
# Passing `index` together with `columns` keeps the imputed rows aligned with `df`
# even when `df` does not use the default 0..n-1 index (e.g. after filtering rows).
df_iterative = pd.DataFrame(iterative_imputer.fit_transform(df), columns=df.columns, index=df.index)

print(df_iterative)
         Age        Salary  YearsAtCompany
0  25.000000  50000.000000        5.000000
1  27.000000  54000.000000        7.000000
2  29.000000  56926.492883       10.000000
3  34.043225  65000.000000        3.000000
4  35.000000  66528.385435        7.340165
5  36.000000  68000.000000        8.000000
6  38.369582  72000.000000        9.000000

Imputation with KNNImputer#

from sklearn.impute import KNNImputer

# Using KNNImputer to replace missing values using the mean value from k-nearest neighbors
knn_imputer = KNNImputer(n_neighbors=2)  # each gap is filled from its 2 nearest rows
# `fit_transform` returns a NumPy array by default, so the labels must be re-attached.
# Passing `index` together with `columns` keeps the imputed rows aligned with `df`
# even when `df` does not use the default 0..n-1 index (e.g. after filtering rows).
df_knn = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns, index=df.index)

print(df_knn)
    Age   Salary  YearsAtCompany
0  25.0  50000.0             5.0
1  27.0  54000.0             7.0
2  29.0  63000.0            10.0
3  32.5  65000.0             3.0
4  35.0  61000.0             9.0
5  36.0  68000.0             8.0
6  32.5  72000.0             9.0

fillna numpy#

TBD

Conversion of Categorical Data ????