Missing Values Handling#
# Define the heatmap to be used and show a visualization of the data
TBD
# Draw a map of the missing entries: one cell per value of `dfData`.
# NOTE(review): `dfData`, `FIG_SIZE_DEF`, `plt` and `sns` are not defined in this section - confirm they come from an earlier cell.
mInvData = dfData.isnull() #<! Boolean mask, `True` where a value is missing (`isnull()` is an alias of `isna()`)
hF, hA = plt.subplots(figsize = FIG_SIZE_DEF)
# `sns.heatmap()` returns the axes it drew on (here `hA`), so the title can be set on its result
sns.heatmap(mInvData, ax = hA, square = False).set_title('Invalid Data Map')
plt.show()
DataFrame with Missing Values#
import numpy as np
import pandas as pd
# Example table with three numeric columns; `np.nan` marks the missing entries
data = dict(
    Age=[25, 27, 29, np.nan, 35, 36, np.nan],
    Salary=[50000, 54000, np.nan, 65000, np.nan, 68000, 72000],
    YearsAtCompany=[5, 7, 10, 3, np.nan, 8, 9],
)
df = pd.DataFrame(data=data)
print(df)
Age Salary YearsAtCompany
0 25.0 50000.0 5.0
1 27.0 54000.0 7.0
2 29.0 NaN 10.0
3 NaN 65000.0 3.0
4 35.0 NaN NaN
5 36.0 68000.0 8.0
6 NaN 72000.0 9.0
Imputation with SimpleImputer#
from sklearn.impute import SimpleImputer
# Mean imputation: SimpleImputer replaces each missing value with the mean of its column
imputer = SimpleImputer(strategy='mean')
simple_values = imputer.fit_transform(df)
# Wrap the imputed values in a DataFrame that carries the original column labels
df_simple = pd.DataFrame(data=simple_values, columns=df.columns)
print(df_simple)
Age Salary YearsAtCompany
0 25.0 50000.0 5.0
1 27.0 54000.0 7.0
2 29.0 61800.0 10.0
3 30.4 65000.0 3.0
4 35.0 61800.0 7.0
5 36.0 68000.0 8.0
6 30.4 72000.0 9.0
Imputation with IterativeImputer#
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Iterative imputation: IterativeImputer models each feature that has missing
# values as a function of the other features (default settings are used here)
iterative_imputer = IterativeImputer()
iterative_values = iterative_imputer.fit_transform(df)
# Wrap the imputed values in a DataFrame that carries the original column labels
df_iterative = pd.DataFrame(data=iterative_values, columns=df.columns)
print(df_iterative)
Age Salary YearsAtCompany
0 25.000000 50000.000000 5.000000
1 27.000000 54000.000000 7.000000
2 29.000000 56926.492883 10.000000
3 34.043225 65000.000000 3.000000
4 35.000000 66528.385435 7.340165
5 36.000000 68000.000000 8.000000
6 38.369582 72000.000000 9.000000
Imputation with KNNImputer#
from sklearn.impute import KNNImputer
# KNN imputation: KNNImputer fills each missing value with the mean value taken
# from the k-nearest neighbors (k = 2 here)
knn_imputer = KNNImputer(n_neighbors=2)
knn_values = knn_imputer.fit_transform(df)
# Wrap the imputed values in a DataFrame that carries the original column labels
df_knn = pd.DataFrame(data=knn_values, columns=df.columns)
print(df_knn)
Age Salary YearsAtCompany
0 25.0 50000.0 5.0
1 27.0 54000.0 7.0
2 29.0 63000.0 10.0
3 32.5 65000.0 3.0
4 35.0 61000.0 9.0
5 36.0 68000.0 8.0
6 32.5 72000.0 9.0
fillna numpy#
TBD
Conversion of Categorical Data (TBD)