import joblib

import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import classification_report


def main():
  train_df = pd.read_csv('train_data.csv', na_values='-')
  # `service` is about half-empty and every other column is completely full
  # one of the rows has `no` for `state`, which isn't listed as an option in the description of the fields
  # so I'm just going to delete that row, and drop the `id` column while I'm at it
  train_df = train_df.drop(columns=['id'])
  train_df = train_df.drop(index=train_df[train_df['state']=='no'].index)

  # The model can predict `label` really well, ~0.95 accuracy/f1/whatever other stat you care about
  # It does a lot worse trying to predict `attack_cat` b/c there are 10 classes
  # and some of them are not well-represented
  # so that might be more interesting to visualize
  # pop `attack_cat` (keeping it in the features would let the model cheat on `label`)
  cheating = train_df.pop('attack_cat')
  y_enc = LabelEncoder().fit(train_df['label'])
  train_y = y_enc.transform(train_df.pop('label'))
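  # NB: OrdinalEncoder raises on values it didn't see during fit, so this
  # assumes everything in the test set also shows up in train; passing
  # handle_unknown='use_encoded_value', unknown_value=float('nan') would
  # sidestep that (and the HistGradientBoostingClassifier below is fine with NaNs)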
  x_enc = OrdinalEncoder().fit(train_df)
  train_df = x_enc.transform(train_df)

  # Random forest doesn't handle NaNs
  # so I could either drop the `service` column or use the HistGradientBoostingClassifier
  # a super helpful error message from sklearn pointed me to this list
  # https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
  #rf = RandomForestClassifier()
  #rf.fit(train_df, train_y)

  # max_iter is the number of boosting iterations, i.e. how many times it builds a gradient-boosted tree
  # with a binary target like `label` that's one tree per iteration, so it's the number of estimators
  hgb = HistGradientBoostingClassifier(max_iter=10).fit(train_df, train_y)
  joblib.dump(hgb, 'hgb_classifier.joblib', compress=9)
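  # (reload it later with joblib.load('hgb_classifier.joblib'))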

  test_df = pd.read_csv('test_data.csv', na_values='-')
  test_df = test_df.drop(columns=['id', 'attack_cat'])
  test_y = y_enc.transform(test_df.pop('label'))
  test_df = x_enc.transform(test_df)
  test_preds = hgb.predict(test_df)
  print(classification_report(test_y, test_preds))

  # I guess they took out the RF feature importance plot
  # or maybe that was only ever in XGBoost
  # you can still kind of get to it
  # with RandomForestClassifier.feature_importances_
  # or like this
  # https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
  # but there's no built-in equivalent for the HistGradientBoostingClassifier
  # you can still get to the actual nodes for each predictor/estimator though
  # (see the sketch right after these notes)
  #  hgb._predictors[i][0].nodes
  # and that has an information-gain metric for each node which might be viz-able
  # so that might be an interesting viz
  # like plot the whole forest
  # maybe only do like 10 estimators to keep it smaller
  # or stick with 100 and figure out a good way to viz big models
  # the first two estimators are almost identical
  # so maybe plot the first estimator
  # and then fuzz the nodes by how much the other estimators differ
  # assuming there are some things they all agree on exactly and others where they differ a little bit
  # idk, I don't really know how the algorithm works
  # the 96th estimator looks pretty different (I'm assuming from the boosting)
  # so maybe an evolution animation from the first to the last
  # to see the effect of the boosting
  # like plot the points and show how the decision boundary shifts with each generation
  # alongside an animation of the actual decision tree morphing each step
  # that might look too much like an animation of the model being trained though
  # which I guess is sort of what it is, so idk
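  # a rough sketch of pulling the per-node gains out
  # (hedging: `_predictors` is private API, so the layout may change between sklearn versions)
  #  first_tree = hgb._predictors[0][0].nodes  # structured numpy array, one record per node
  #  print(first_tree.dtype.names)  # includes 'gain', 'feature_idx', 'is_leaf', ...
  #  splits = first_tree[first_tree['is_leaf'] == 0]  # interior (split) nodes only
  #  print(splits['feature_idx'], splits['gain'])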

  # https://scikit-learn.org/stable/modules/ensemble.html#interpretation-with-feature-importance
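  # permutation importance does still work for the HistGradientBoostingClassifier
  # since it's model-agnostic
  # a minimal sketch on the encoded test data from above, assuming a recent
  # sklearn so x_enc.feature_names_in_ has the original column names:
  from sklearn.inspection import permutation_importance
  perm = permutation_importance(hgb, test_df, test_y, n_repeats=5, random_state=0)
  for i in perm.importances_mean.argsort()[::-1][:10]:  # ten biggest mean drops in score
    print(x_enc.feature_names_in_[i], perm.importances_mean[i])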

  # also
  # you can see what path a data point takes through the forest
  # with RandomForestClassifier.decision_path()
  # which might be really cool
  # like seeing 10 trees, the path through each tree, and what each tree predicted
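  # a sketch of that, commented out since the RF was never fit here
  # (the NaNs in `service` would need imputing/dropping first)
  #  indicator, n_nodes_ptr = rf.decision_path(test_df[:1])  # one sample through every tree
  #  cols = indicator.indices[indicator.indptr[0]:indicator.indptr[1]]  # node columns sample 0 visits
  #  for i, tree in enumerate(rf.estimators_):
  #    # columns [n_nodes_ptr[i], n_nodes_ptr[i+1]) belong to tree i
  #    path = cols[(cols >= n_nodes_ptr[i]) & (cols < n_nodes_ptr[i + 1])] - n_nodes_ptr[i]
  #    print(f'tree {i}: nodes {path}, predicts {tree.predict(test_df[:1])}')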


if __name__ == '__main__':
  main()