data.py
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import os
import warnings
import sys
# Make the project root importable so the config package resolves
current_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.dirname(current_dir)
sys.path.append(project_dir)
from config.config import ARTIFACTS_DIR, logger
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
warnings.filterwarnings("ignore")
def load_data(file_path):
    '''Load the dataset from a file path or a DataFrame.'''
    if isinstance(file_path, str):
        # Input is a string (file path): load the data from CSV
        data = pd.read_csv(file_path)
        logger.info('Loaded the dataset from CSV!')
    elif isinstance(file_path, pd.DataFrame):
        # Input is already a DataFrame: use it directly
        data = file_path
        logger.info('Loaded the dataset from DataFrame!')
    else:
        raise ValueError("Invalid input type. Please provide either a file path (str) or a DataFrame.")
    return data
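
# A minimal usage sketch (hypothetical paths/values) showing both accepted
# input types:
#   >>> load_data('data/raw/customer_churn_raw_data.csv')   # CSV path
#   >>> load_data(pd.DataFrame({'tenure': [1, 2]}))         # existing frame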
def clean_data(data):
    '''Drop duplicates, then impute missing values (median/mean for numeric, mode for categorical).'''
    cleaned_data = data.drop_duplicates()
    num_vars = cleaned_data.select_dtypes(include=np.number).columns
    cat_vars = cleaned_data.select_dtypes(include='object').columns
    for var in num_vars:
        if cleaned_data[var].isnull().sum() > 0:
            # Flag outliers with the 1.5 * IQR rule
            Q1 = cleaned_data[var].quantile(0.25)
            Q3 = cleaned_data[var].quantile(0.75)
            IQR = Q3 - Q1
            upper_bound = Q3 + 1.5 * IQR
            lower_bound = Q1 - 1.5 * IQR
            has_outliers = cleaned_data[var].max() > upper_bound or cleaned_data[var].min() < lower_bound
            if has_outliers:
                # Outliers present: the median is more robust than the mean
                cleaned_data[var] = cleaned_data[var].fillna(cleaned_data[var].median())
            else:
                # No outliers: the mean is a safe fill value
                cleaned_data[var] = cleaned_data[var].fillna(cleaned_data[var].mean())
    for var in cat_vars:
        # Fill missing categorical values with the most frequent value (mode)
        cleaned_data[var] = cleaned_data[var].fillna(cleaned_data[var].mode().iloc[0])
    logger.info('Cleaning completed!')
    return cleaned_data
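
# A toy sketch of the fill rule (illustrative values only). Here 1000.0 lies
# above Q3 + 1.5*IQR, so the NaN is filled with the median (3.0) rather than
# the outlier-skewed mean:
#   >>> toy = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0, np.nan, 1000.0]})
#   >>> clean_data(toy)['x'].tolist()
#   [1.0, 2.0, 3.0, 4.0, 3.0, 1000.0]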
def replace_with_no(cleaned_data):
    '''
    Replace 'No internet service' with 'No' for the specified columns in the dataset.
    '''
    columns = ['OnlineSecurity']
    for column in columns:
        if column in cleaned_data.columns:
            cleaned_data[column] = cleaned_data[column].replace('No internet service', 'No')
    logger.info('Replaced with No')
    return cleaned_data
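
# Illustrative one-liner (hypothetical values):
#   >>> replace_with_no(pd.DataFrame({'OnlineSecurity': ['Yes', 'No internet service']}))['OnlineSecurity'].tolist()
#   ['Yes', 'No']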
def label_encoding(cleaned_data):
    '''Integer-encode the categorical columns listed below.'''
    label_encoder = LabelEncoder()
    encode_vals = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                   'InternetService', 'OnlineSecurity', 'Contract', 'PaymentMethod', 'Churn']
    for column in encode_vals:
        if column in cleaned_data.columns:
            # Note: fit_transform refits the single encoder on each column in turn
            cleaned_data[column] = label_encoder.fit_transform(cleaned_data[column])
    logger.info('Label Encoding completed!')
    return cleaned_data
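
# LabelEncoder assigns integer codes in sorted order of the observed classes;
# a standalone sketch (illustrative labels):
#   >>> LabelEncoder().fit_transform(['Month-to-month', 'One year', 'Two year'])
#   array([0, 1, 2])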
def to_numeric(cleaned_data):
    '''Coerce TotalCharges to numeric; blank or invalid entries become 0.'''
    cleaned_data['TotalCharges'] = pd.to_numeric(cleaned_data['TotalCharges'], errors='coerce').fillna(0).astype(float)
    logger.info('Converted TotalCharges to numeric!')
    return cleaned_data
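
# Why errors='coerce': non-numeric entries (the raw TotalCharges column often
# contains blank strings) become NaN and are then zero-filled; a standalone sketch:
#   >>> pd.to_numeric(pd.Series(['29.85', ' ']), errors='coerce').fillna(0).tolist()
#   [29.85, 0.0]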
def scaling(cleaned_data):
    '''Scale all numeric columns to the [0, 1] range with MinMaxScaler.'''
    scaler = MinMaxScaler()
    numeric_columns = cleaned_data.select_dtypes(include=np.number).columns
    cleaned_data = cleaned_data.copy()
    cleaned_data[numeric_columns] = scaler.fit_transform(cleaned_data[numeric_columns])
    logger.info('Scaling Completed!')
    return cleaned_data
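
# MinMaxScaler maps each column to [0, 1] via (x - min) / (max - min);
# a standalone sketch (illustrative values):
#   >>> MinMaxScaler().fit_transform([[1.0], [3.0], [5.0]]).ravel()
#   array([0. , 0.5, 1. ])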
def preprocess_data(file_path):
    '''
    Preprocess the data by applying all the necessary steps in order.
    '''
    data = load_data(file_path)
    cleaned_data = clean_data(data)
    cleaned_data = replace_with_no(cleaned_data)
    cleaned_data = label_encoding(cleaned_data)
    cleaned_data = to_numeric(cleaned_data)
    cleaned_data = scaling(cleaned_data)
    logger.info('Data preprocessed!')
    return cleaned_data
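
# Typical usage (hypothetical path):
#   >>> processed = preprocess_data('data/raw/customer_churn_raw_data.csv')
#   >>> processed.head()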
if __name__ == "__main__":
    df = pd.read_csv("/Users/tarakram/Documents/Churn-Prediction/data/raw/customer_churn_raw_data.csv")
    processed_data = preprocess_data(df)
    processed_data.to_csv(ARTIFACTS_DIR / 'processed_data.csv', index=False)
    # Caution: these are fresh, unfitted instances; the encoder and scaler
    # actually fitted inside label_encoding()/scaling() are not the objects
    # pickled here.
    label_encoder = LabelEncoder()
    scaler = MinMaxScaler()
    with open(ARTIFACTS_DIR / 'label_encoder.pkl', 'wb') as f:
        pickle.dump(label_encoder, f)
    with open(ARTIFACTS_DIR / 'scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)