import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, accuracy_score, precision_score, \ recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay from sklearn.linear_model import LogisticRegression
# Dataset available here: https://career.skills.google/focuses/133285?parent=catalog
df = pd.read_csv("waze_dataset.csv")

# Quick structural overview: row/column counts, dtypes, and the first rows.
print(df.shape)
df.info()
df.head()
|   | ID | label | sessions | drives | total_sessions | n_days_after_onboarding | total_navigations_fav1 | total_navigations_fav2 | driven_km_drives | duration_minutes_drives | activity_days | driving_days | device |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | retained | 283 | 226 | 296.748273 | 2276 | 208 | 0 | 2628.845068 | 1985.775061 | 28 | 19 | Android |
| 1 | 1 | retained | 133 | 107 | 326.896596 | 1225 | 19 | 64 | 13715.920550 | 3160.472914 | 13 | 11 | iPhone |
| 2 | 2 | retained | 114 | 95 | 135.522926 | 2651 | 0 | 0 | 3059.148818 | 1610.735904 | 14 | 8 | Android |
| 3 | 3 | retained | 49 | 40 | 67.589221 | 15 | 322 | 7 | 913.591123 | 587.196542 | 7 | 3 | iPhone |
| 4 | 4 | retained | 84 | 68 | 168.247020 | 1562 | 166 | 5 | 3950.202008 | 1219.555924 | 27 | 18 | Android |
# Drop the synthetic row identifier; it carries no predictive signal.
df.drop('ID', axis=1, inplace=True)
print(df.isna().any())

# Only the `label` column has missing values. Count them directly with
# .isna().sum() rather than unpacking value_counts(): value_counts() orders
# by frequency, so the (non-null, null) unpacking order is not guaranteed.
na_count = df["label"].isna().sum()
print("the rate of NA values is", round(na_count * 100.0 / len(df), 2))

# ~1% of the 15,000-row dataset is missing a label. Dropping these rows is a
# safe and straightforward approach that shouldn't skew the overall analysis.
df.dropna(inplace=True)

# Feature engineering:
# - km_per_driving_day: distance per active driving day. Rows with
#   driving_days == 0 divide by zero and produce inf; reset those to 0.
# - frequent_user: 1 when the user logged >= 60 drives AND >= 15 driving
#   days in the month, else 0.
df['km_per_driving_day'] = df['driven_km_drives'] / df['driving_days']
df.loc[df['km_per_driving_day'] == np.inf, 'km_per_driving_day'] = 0
df['frequent_user'] = np.where((df['drives'] >= 60) & (df['driving_days'] >= 15), 1, 0)

df['km_per_driving_day'].describe()
count 14999.000000 mean 578.963113 std 1030.094384 min 0.000000 25% 136.238895 50% 272.889272 75% 558.686918 max 15420.234110 Name: km_per_driving_day, dtype: float64
df['frequent_user'].describe()
count 14999.000000 mean 0.172945 std 0.378212 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 1.000000 Name: frequent_user, dtype: float64
# Churn rate by engagement tier: per the output below, frequent users churn
# far less (~7.6%) than non-frequent users (~19.9%).
df.groupby(['frequent_user'])['label'].value_counts(normalize = True)
frequent_user label
0 retained 0.801202
churned 0.198798
1 retained 0.924437
churned 0.075563
# Binary-encode the two categorical columns for modeling:
# - label2:  'churned' -> 1, anything else ('retained') -> 0  (target)
# - device2: 'Android' -> 0, anything else ('iPhone')  -> 1
df['label2'] = np.where(df['label'] == 'churned', 1, 0)
df['device2'] = np.where(df['device'] == 'Android', 0, 1)
# Pearson correlation across the numeric/encoded columns (the raw string
# columns 'label' and 'device' are excluded), rendered as a heatmap for a
# quick collinearity scan before feature selection.
corr_matrix = df.drop(['label', 'device'], axis=1).corr(method="pearson")
sns.heatmap(corr_matrix, vmin=-1, vmax=1, cmap='coolwarm')
plt.show()
# Feature/target split. Dropped columns: the raw and encoded target
# ('label', 'label2'), the raw string 'device', plus 'sessions' and
# 'driving_days' — presumably excluded as redundant with 'drives' /
# 'activity_days' per the correlation heatmap (TODO confirm).
X = df.drop(columns=['label', 'label2', 'device', 'sessions', 'driving_days'])
y = df['label2']

# Stratify on the imbalanced target so train and test keep the same churn
# ratio; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Unregularized logistic regression (penalty=None requires sklearn >= 1.2);
# max_iter raised so lbfgs converges on the unscaled features.
model = LogisticRegression(penalty=None, max_iter=2000)
model.fit(X_train, y_train)
LogisticRegression(max_iter=2000, penalty=None)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
| penalty | None | |
| dual | False | |
| tol | 0.0001 | |
| C | 1.0 | |
| fit_intercept | True | |
| intercept_scaling | 1 | |
| class_weight | None | |
| random_state | None | |
| solver | 'lbfgs' | |
| max_iter | 2000 | |
| multi_class | 'deprecated' | |
| verbose | 0 | |
| warm_start | False | |
| n_jobs | None | |
| l1_ratio | None |
# Linearity check: logistic regression assumes the log-odds are linear in
# each predictor. Plot the fitted log-odds against activity_days to eyeball
# that assumption before trusting the final metrics.
training_probabilities = model.predict_proba(X_train)

logit_data = X_train.copy()
# Vectorized log-odds log(p_churn / p_retained) for every training row —
# equivalent to the per-row Python loop but computed in one NumPy pass.
logit_data['logit'] = np.log(training_probabilities[:, 1] / training_probabilities[:, 0])

sns.regplot(x='activity_days', y='logit', data=logit_data, scatter_kws={'s': 2, 'alpha': 0.5})
plt.show()

# Hold-out predictions for the evaluation cells below.
y_preds = model.predict(X_test)
# Mean accuracy on the held-out test set (0.832 per the output below).
# NOTE(review): with ~82% of users retained, accuracy alone flatters the
# model — see the confusion matrix and classification report that follow.
model.score(X_test, y_test)
0.832
# Confusion matrix on the test set. display_labels maps the encoded target
# back to readable class names (0 = retained, 1 = churned).
cm = confusion_matrix(y_test, y_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['retained', 'churned'])
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x294050738f0>
# Per-class precision / recall / F1. Given the imbalanced target, recall on
# the churned class is the metric to watch rather than overall accuracy.
target_labels = ['retained', 'churned']
print(classification_report(y_test, y_preds, target_names=target_labels))