Spaces:
Runtime error
Runtime error
File size: 5,959 Bytes
045d7d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
import joblib
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
# Load the fitted model from a file in the current working directory.
# The script later reads `hgb._predictors`, so this is presumably a
# scikit-learn histogram gradient-boosting classifier -- TODO confirm.
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code;
# only load model files that come from a trusted source.
hgb = joblib.load('hgb_classifier.joblib')
# Feature names, indexed by the `feature_idx` stored in each tree node.
# The order matters: position i must be the i-th column the model saw
# (presumably the training column order -- verify against the training code).
# 'state' is deliberately missing; it was dropped when the model was trained.
FEATS = [
    'srcip', 'sport', 'dstip', 'dsport', 'proto',
    # ('state' would sit here, but it was dropped before training)
    'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
    'service', 'Sload', 'Dload', 'Spkts', 'Dpkts',
    'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
    'trans_depth', 'res_bdy_len',
    'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt', 'Dintpkt',
    'tcprtt', 'synack', 'ackdat',
    'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
    'is_ftp_login', 'ct_ftp_cmd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
]
# Palette used to colour treemap cells, one entry per feature index.
# Author's notes: plotly only takes CSS named colours (xkcd names probably
# won't work), so this is a list of CSS names copied from the web with the
# whites and the shades that looked too similar on the tree removed.
# It is hand-tuned for this particular tree, not a general colormap;
# a better one is still to be worked out.
COLORS = """
aliceblue aqua aquamarine azure
bisque black blanchedalmond blue blueviolet brown burlywood
cadetblue chartreuse chocolate coral cornflowerblue cornsilk crimson cyan
darkblue darkcyan darkgoldenrod darkgray darkgreen darkkhaki darkmagenta
darkolivegreen darkorange darkorchid darkred darksalmon darkseagreen
darkslateblue darkslategray darkturquoise darkviolet deeppink deepskyblue
dimgray dodgerblue
forestgreen fuchsia
gainsboro gold goldenrod gray green greenyellow
honeydew hotpink
indianred indigo ivory
khaki
lavender lavenderblush lawngreen lemonchiffon lightblue lightcoral lightcyan
lightgoldenrodyellow lightgray lightgreen lightpink lightsalmon
lightseagreen lightskyblue lightslategray lightsteelblue lightyellow lime
limegreen linen
magenta maroon mediumaquamarine mediumblue mediumorchid mediumpurple
mediumseagreen mediumslateblue mediumspringgreen mediumturquoise
mediumvioletred midnightblue mintcream mistyrose moccasin
navy
oldlace olive olivedrab orange orangered orchid
palegoldenrod palegreen paleturquoise palevioletred papayawhip peachpuff
peru pink plum powderblue purple
red rosybrown royalblue
saddlebrown salmon sandybrown seagreen seashell sienna silver skyblue
slateblue slategray slategrey snow springgreen steelblue
tan teal thistle tomato turquoise
violet
wheat
yellow yellowgreen
""".split()
# Collect the node table of the first predictor in every entry of
# `hgb._predictors` (a private attribute; presumably one entry per boosting
# iteration -- TODO confirm against the scikit-learn version in use).
trees = [x[0].nodes for x in hgb._predictors]
# the final tree definitely has a similar structure but is noticeably different
# that's really cool
# I think this will make a cool animation
# if I can figure it out
tree = pd.DataFrame(trees[0])
#tree = pd.DataFrame(trees[9])

# Each node row only stores its children ('left' / 'right'), so the parent of
# a node has to be recovered by inverting those two columns.
# Do it in one pass instead of re-scanning the whole table for every node.
# setdefault keeps the first (lowest-index) row that names a given child,
# which is the row a top-to-bottom scan of the table would have found.
left_parent = {}
right_parent = {}
for idx, left, right in zip(
    tree.index.tolist(),
    tree['left'].tolist(),
    tree['right'].tolist(),
):
    left_parent.setdefault(left, idx)
    right_parent.setdefault(right, idx)

# parents[k] is the parent's index as a string (None for the root);
# directions[k] records whether node k is its parent's left ('l') or
# right ('r') child (None for the root).
parents = [None]
directions = [None]
# The table uses 0 to say "no left/right child", so node 0 is never looked
# up as a child -- which is fine because node 0 is the root.
for i in tree.index[1:].tolist():
    # a match in the 'right' column takes precedence over one in 'left'
    if i in right_parent:
        parents.append(str(right_parent[i]))
        directions.append('r')
    elif i in left_parent:
        parents.append(str(left_parent[i]))
        directions.append('l')
    else:
        raise IndexError(
            f"node {i} is not referenced as a left or right child of any node"
        )
# Build one label and one colour per node, in node order.
# A node's label describes the split that leads to it, so it is derived from
# the PARENT's feature and threshold plus the side ('l' / 'r') the node is on.
# The root has no incoming split; it gets a title and a neutral colour.
labels = ['Histogram Gradient-Boosted Decision Tree']
colors = ['white']
# Entry 0 of parents / directions is the root's None placeholder, so start
# every sequence at position 1.
for i, parent, direction in zip(tree.index[1:], parents[1:], directions[1:]):
    # parents holds indices as strings; convert once
    parent_idx = int(parent)
    feat_idx = int(tree.loc[parent_idx, 'feature_idx'])
    feat = FEATS[feat_idx]
    thresh = tree.loc[parent_idx, 'num_threshold']
    # left child: value <= threshold; right child: value > threshold
    comparison = '<=' if direction == 'l' else '>'
    labels.append(f"[{i}] {feat} {comparison} {thresh}")
    # colour by feature, so every split on the same feature shares a colour
    colors.append(COLORS[feat_idx])
# actual plot
# One treemap cell per tree node, sized by the node's 'count' column and
# nested according to `parents`.
# NOTE(review): Treemap defaults to branchvalues='remainder', which treats a
# parent's value as extra area on top of its children; if 'count' already
# includes the children, branchvalues='total' may be the intended setting --
# confirm visually before changing.
f = go.Figure(
    go.Treemap(
        values=tree['count'].to_numpy(),
        labels=labels,
        # ids are given as strings so they match the string entries in parents
        ids=[str(i) for i in tree.index],
        parents=parents,
        marker_colors=colors,
    )
)
#f.update_layout(
#    treemapcolorway = ['pink']
#)
# Display the figure when run as a script. This replaces a leftover
# breakpoint() that stopped execution before anything was shown.
if __name__ == '__main__':
    f.show()
# converting the ndarray with column names to a pandas df
# 3284 bytes as an ndarray
# 3300 bytes as a dataframe
# so they're the same size
# do I need to convert it to pandas? idk
# just curious
# https://linuxtut.com/en/ffb2e319db5545965933/
# https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
# figuring out how the thing works
# `value` is the predicted class / value / whatever
# so if it's a leaf node, it returns that value as the prediction
# there are negative values in some of the leaves
# maybe the classes are +/-1 instead of 0/1?
# if the data value is <= `num_threshold` then it goes in the left node
# if it's > `num_threshold` then it goes in the right node
# okay and then all the leaves have feature_idx=0, num_threshold=0, left=0, right=0
# that makes sense
# still kind of annoying that they use 0 instead of np.nan but oh well
# also super super hard to figure out what the labels on the tree map should be
# like it has to check the parent's feature_idx and num_threshold
# which I guess isn't too bad once we have the list of parents already built
# except that I don't know whether a node is left or right from its parent
# hmmmm
|