Predicting AITA Flairs with Trees

Apr 13, 2025

Data Cleaning

Code here: https://github.com/kerixyz/pred-aita/blob/main/pred.md

```python
# imports used throughout this post
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, BaggingClassifier,
                              GradientBoostingClassifier)
from sklearn.metrics import (accuracy_score, classification_report,
                             precision_score, recall_score, f1_score,
                             ConfusionMatrixDisplay)

df_big = pd.read_csv('aita7780.csv')
print(df_big.isnull().sum())
```

```
Unnamed: 0    0
id            0
flair         0
title         0
body          0
dtype: int64
```

```python
# convert flair (outcome var) to numerical vals
le = LabelEncoder()
df_big['flair_num'] = le.fit_transform(df_big['flair'])
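```

The encoded labels are just alphabetical indices over whatever flairs appear in the CSV, so it helps to print the mapping before reading the classification reports below. A quick sketch (the exact flair strings depend on the dataset):

```python
# print the numeric label assigned to each flair string
# (typically the five AITA verdict flairs)
for i, label in enumerate(le.classes_):
    print(i, '->', label)
```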

Train / Test Split

```python
# split into test / train
X = df_big['title']
y = df_big['flair_num']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=17)
```
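
Note the split above is unstratified, and the verdict classes are heavily skewed (the majority class is about two thirds of the test set below). A stratified variant, sketched here as an alternative rather than what this run used, keeps the class proportions identical in both halves:

```python
# check the class balance first; the verdicts are heavily skewed
print(y.value_counts(normalize=True))

# stratify=y keeps the per-class proportions identical in train and test
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=17, stratify=y)
```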

```python
df_big['title'][2]
```
```
'AITA for preferring that my uncle teach me to drive than my dad and refusing to tell my dad why?'
```

```python
tfidf = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

dt_model = DecisionTreeClassifier(random_state=17, max_depth=10)
dt_model.fit(X_train_tfidf, y_train)
y_pred_dt = dt_model.predict(X_test_tfidf)
```
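
A quick way to sanity-check what the tree learned is to look at which TF-IDF features carry the most importance. A sketch, assuming scikit-learn 1.0+ for `get_feature_names_out`:

```python
# top ten title words by impurity-based importance in the fitted tree
importances = dt_model.feature_importances_
feature_names = tfidf.get_feature_names_out()
for idx in np.argsort(importances)[::-1][:10]:
    print(feature_names[idx], round(importances[idx], 4))
```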

```python
# dt_model is already fitted above, so it can be plotted directly
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(dt_model, filled=True)
```
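
At `max_depth=10` the full tree is far too dense to read. Plotting only the top couple of levels, with the TF-IDF vocabulary as feature names, gives a legible picture (a sketch, not part of the original run):

```python
# plot only the first two levels so the split words are readable
fig = plt.figure(figsize=(12, 8))
_ = tree.plot_tree(dt_model, max_depth=2, filled=True,
                   feature_names=list(tfidf.get_feature_names_out()))
```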

```python
print('Decision Tree Accuracy:', accuracy_score(y_test, y_pred_dt))
```
```
Decision Tree Accuracy: 0.7172236503856041
```

```python
print(classification_report(y_test, y_pred_dt))
```

```
              precision    recall  f1-score   support

           0       1.00      0.19      0.32       280
           1       0.76      0.17      0.28        95
           2       0.83      0.12      0.22        81
           3       0.00      0.00      0.00        58
           4       0.71      1.00      0.83      1042

    accuracy                           0.72      1556
   macro avg       0.66      0.30      0.33      1556
weighted avg       0.74      0.72      0.64      1556
```

```python
rf_model = RandomForestClassifier(random_state=17, max_depth=10, n_estimators=100)
rf_model.fit(X_train_tfidf, y_train)

y_pred_rf = rf_model.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
```

```
Random Forest Accuracy: 0.6735218508997429
              precision    recall  f1-score   support

           0       1.00      0.01      0.02       280
           1       0.00      0.00      0.00        95
           2       1.00      0.04      0.07        81
           3       0.00      0.00      0.00        58
           4       0.67      1.00      0.80      1042

    accuracy                           0.67      1556
   macro avg       0.53      0.21      0.18      1556
weighted avg       0.68      0.67      0.55      1556
```
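
The forest has collapsed almost entirely onto the majority class (recall 1.00 on class 4, near zero everywhere else). One standard lever for that, offered here as a variant to try rather than something from the original run, is class weighting:

```python
# class_weight='balanced' reweights classes inversely to their frequency,
# trading some majority-class accuracy for recall on the rare verdicts
rf_balanced = RandomForestClassifier(random_state=17, max_depth=10,
                                     n_estimators=100, class_weight='balanced')
rf_balanced.fit(X_train_tfidf, y_train)
print(classification_report(y_test, rf_balanced.predict(X_test_tfidf)))
```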

```python
base_tree = DecisionTreeClassifier(max_depth=5, random_state=17)
bagging_model = BaggingClassifier(estimator=base_tree, n_estimators=50, random_state=17)
bagging_model.fit(X_train_tfidf, y_train)

y_pred_bagging = bagging_model.predict(X_test_tfidf)
print("Bagging Accuracy:", accuracy_score(y_test, y_pred_bagging))
print(classification_report(y_test, y_pred_bagging))
```

```
Bagging Accuracy: 0.7011568123393316
              precision    recall  f1-score   support

           0       1.00      0.12      0.21       280
           1       1.00      0.07      0.14        95
           2       1.00      0.11      0.20        81
           3       0.00      0.00      0.00        58
           4       0.69      1.00      0.82      1042

    accuracy                           0.70      1556
   macro avg       0.74      0.26      0.27      1556
weighted avg       0.76      0.70      0.60      1556
```
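
Since bagging trains each tree on a bootstrap sample, the held-out (out-of-bag) rows give a free validation estimate without touching the test set. A sketch (the `estimator` keyword assumes scikit-learn 1.2+; older versions call it `base_estimator`):

```python
# oob_score=True scores each tree on the rows its bootstrap sample missed
bagging_oob = BaggingClassifier(estimator=base_tree, n_estimators=50,
                                random_state=17, oob_score=True)
bagging_oob.fit(X_train_tfidf, y_train)
print('OOB score:', bagging_oob.oob_score_)
```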

```python
gb_model = GradientBoostingClassifier(random_state=17)
gb_model.fit(X_train_tfidf, y_train)

y_pred_gb = gb_model.predict(X_test_tfidf)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))
```

```
Gradient Boosting Accuracy: 0.859254498714653
              precision    recall  f1-score   support

           0       1.00      0.31      0.48       280
           1       1.00      0.77      0.87        95
           2       1.00      0.94      0.97        81
           3       1.00      1.00      1.00        58
           4       0.83      1.00      0.90      1042

    accuracy                           0.86      1556
   macro avg       0.97      0.80      0.84      1556
weighted avg       0.88      0.86      0.83      1556
```
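
Fitting GradientBoostingClassifier on 5,000 TF-IDF features is slow, since each of its default 100 stages fits one regression tree per class. If runtime matters, the built-in early stopping is worth a try (a sketch, not part of the original run):

```python
# hold out 10% of the training data and stop adding stages once the
# validation loss fails to improve for 10 consecutive iterations
gb_fast = GradientBoostingClassifier(random_state=17,
                                     validation_fraction=0.1,
                                     n_iter_no_change=10)
gb_fast.fit(X_train_tfidf, y_train)
print('Early-stopped accuracy:',
      accuracy_score(y_test, gb_fast.predict(X_test_tfidf)))
```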

```python
# collect metrics for comparison
def eval_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1-Score': f1_score(y_test, y_pred, average='weighted')
    }

metrics = {
    'Decision Tree': eval_model(dt_model, X_test_tfidf, y_test),
    'Random Forest': eval_model(rf_model, X_test_tfidf, y_test),
    'Gradient Boosting': eval_model(gb_model, X_test_tfidf, y_test),
    'Bagging': eval_model(bagging_model, X_test_tfidf, y_test)
}
metrics_df = pd.DataFrame(metrics)
print(metrics_df)
```

```
           Decision Tree  Random Forest  Gradient Boosting   Bagging
Accuracy        0.717224       0.673522           0.859254  0.701157
Precision       0.742257       0.682193           0.883698  0.756093
Recall          0.717224       0.673522           0.859254  0.701157
F1-Score        0.638219       0.545953           0.832780  0.604238
```

```python
metrics_df = metrics_df.T
print(metrics_df)
```

```
                   Accuracy  Precision    Recall  F1-Score
Decision Tree      0.717224   0.742257  0.717224  0.638219
Random Forest      0.673522   0.682193  0.673522  0.545953
Gradient Boosting  0.859254   0.883698  0.859254  0.832780
Bagging            0.701157   0.756093  0.701157  0.604238
```

```python
# set the style before drawing so it actually applies to this figure
plt.style.use('fivethirtyeight')
metrics_df['Accuracy'].plot(kind='bar', color=['blue', 'green', 'orange', 'purple'], figsize=(8, 6))
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.xlabel('Models')
plt.xticks(rotation=45)
plt.show()
```

[Figure: Model Accuracy Comparison bar chart]

```python
metrics_df['F1-Score'].plot(kind='bar', color=['blue', 'green', 'orange', 'purple'], figsize=(8, 6))
plt.title('Model F1-Score Comparison')
plt.ylabel('F1-Score')
plt.xlabel('Models')
plt.xticks(rotation=45)
plt.show()
```

[Figure: Model F1-Score Comparison bar chart]
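
Since metrics_df already holds all four metrics per model, pandas can also draw them as one grouped chart instead of a separate bar chart per metric (a sketch):

```python
# one cluster of bars per model, one bar per metric
metrics_df.plot(kind='bar', figsize=(10, 6))
plt.title('Model Comparison Across Metrics')
plt.ylabel('Score')
plt.xticks(rotation=45)
plt.legend(loc='lower right')
plt.show()
```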

```python
models = [("Decision Tree", dt_model),
          ("Random Forest", rf_model),
          ("Gradient Boosting", gb_model),
          ("Bagging", bagging_model)]

for name, model in models:
    disp = ConfusionMatrixDisplay.from_estimator(model, X_test_tfidf, y_test, cmap=plt.cm.Blues)
    disp.ax_.set_title(f"Confusion Matrix: {name}")
    plt.show()
```

[Figure: Confusion Matrix: Decision Tree]

[Figure: Confusion Matrix: Random Forest]

[Figure: Confusion Matrix: Gradient Boosting]

[Figure: Confusion Matrix: Bagging]
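
With a majority class this large, the raw counts are dominated by class 4. Passing `normalize='true'` row-normalizes the matrices so the diagonal reads as per-class recall (a variant sketch, not from the original run):

```python
# row-normalized: each cell is the fraction of that true class,
# so the diagonal shows per-class recall directly
for name, model in models:
    disp = ConfusionMatrixDisplay.from_estimator(
        model, X_test_tfidf, y_test, cmap=plt.cm.Blues, normalize='true')
    disp.ax_.set_title(f"Normalized Confusion Matrix: {name}")
    plt.show()
```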

```python
cv_scores = {
    "Decision Tree": np.mean(cross_val_score(dt_model, X_train_tfidf, y_train, cv=5)),
    "Random Forest": np.mean(cross_val_score(rf_model, X_train_tfidf, y_train, cv=5)),
    "Gradient Boosting": np.mean(cross_val_score(gb_model, X_train_tfidf, y_train, cv=5)),
    "Bagging": np.mean(cross_val_score(bagging_model, X_train_tfidf, y_train, cv=5))
}
cv_scores_series = pd.Series(cv_scores)
cv_scores_series.plot(kind='bar', color=['blue', 'green', 'orange', 'purple'], figsize=(8, 6))
plt.title('Cross-Validation Score Comparison')
plt.ylabel('Mean CV Score')
plt.xlabel('Models')
plt.xticks(rotation=45)
plt.show()
```

[Figure: Cross-Validation Score Comparison bar chart]
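
cross_val_score defaults to accuracy, which rewards majority-class guessing on a dataset this skewed. Scoring the folds with macro-F1 instead weights every verdict class equally (a sketch):

```python
# macro-F1 averages F1 across classes without frequency weighting
for name, model in models:
    f1_macro = cross_val_score(model, X_train_tfidf, y_train,
                               cv=5, scoring='f1_macro')
    print(f"{name}: mean macro-F1 = {f1_macro.mean():.3f}")
```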