This script converts the TON_IoT datasets into JSONL, the format expected by the tokenizer in the training and testing scripts, so that the data can be used to train the models.

In [2]:
import csv
import json

# Path to desired file to be converted
csv_file = 'train_test_network.csv'

# Name of file to be written to
jsonl_file = 'train_test_network.jsonl'

# Ground-truth columns: they must never appear in the prompt shown to the model.
_LABEL_COLUMNS = ('label', 'type')


def _row_to_conversation(row, binary=True):
    """Turn one CSV row (a dict from csv.DictReader) into a conversation dict."""
    # Normalize the label to ensure comparison is case-insensitive
    label = row['type'].strip().lower()

    # Collapse every non-normal traffic type into a single 'attack' class
    if binary:
        label = 'normal' if label == 'normal' else 'attack'

    # Construct the user message by concatenating all relevant fields
    user_message = ", ".join(
        f"{key}: {value}"
        for key, value in row.items()
        if key not in _LABEL_COLUMNS
    )

    # Create the conversation in the desired format
    return {
        "conversations": [
            {"from": "human", "value": user_message},
            {"from": "gpt", "value": label},
        ]
    }


def convert_csv_to_jsonl(csv_path, jsonl_path, binary=True):
    """Convert a labelled CSV file into a JSONL file of conversations.

    Each CSV row becomes one JSON object on its own line. The "human" turn
    lists every column except 'label' and 'type' as "name: value" pairs, and
    the "gpt" turn holds the lower-cased, stripped value of the 'type' column.

    Args:
        csv_path: Path of the CSV file to read. It must have a 'type' column.
        jsonl_path: Path of the JSONL file to write (overwritten if present).
        binary: When True (default) every type other than 'normal' is written
            as 'attack'. Pass False to keep the original multi-class labels.

    Returns:
        The number of rows written.

    Raises:
        KeyError: If the CSV has no 'type' column.
    """
    # 'utf-8-sig' drops a leading byte-order mark if the file has one, so the
    # first column name is not written out as '\ufeff<name>'. newline='' is
    # what the csv module requires to parse quoted fields containing newlines.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as src, \
            open(jsonl_path, 'w', encoding='utf-8') as dst:
        written = 0
        for row in csv.DictReader(src):
            # Write each conversation as a JSON object in a new line
            dst.write(json.dumps(_row_to_conversation(row, binary)) + "\n")
            written += 1
    return written


# Executing this cell (or running the file as a script) has __name__ set to
# "__main__", so the conversion runs exactly as before; importing the code
# elsewhere only defines the helpers.
if __name__ == "__main__":
    # For multi-class labels call with binary=False
    convert_csv_to_jsonl(csv_file, jsonl_file)

Output will look like:

```
{"conversations": [{"from": "human", "value": "\ufeffsrc_ip: 192.168.1.192, src_port: 5353, dst_ip: 224.0.0.251, dst_port: 5353, proto: udp, service: dns, duration: 0, src_bytes: 0, dst_bytes: 0, conn_state: S0, missed_bytes: 0, src_pkts: 1, src_ip_bytes: 73, dst_pkts: 0, dst_ip_bytes: 0, dns_query: _ipps._tcp.local, dns_qclass: 1, dns_qtype: 12, dns_rcode: 0, dns_AA: F, dns_RD: F, dns_RA: F, dns_rejected: F, ssl_version: -, ssl_cipher: -, ssl_resumed: -, ssl_established: -, ssl_subject: -, ssl_issuer: -, http_trans_depth: -, http_method: -, http_uri: -, http_version: -, http_request_body_len: 0, http_response_body_len: 0, http_status_code: 0, http_user_agent: -, http_orig_mime_types: -, http_resp_mime_types: -, weird_name: -, weird_addl: -, weird_notice: -"}, {"from": "gpt", "value": "normal"}]}
```


If you want to split the dataset and save the splits separately (for example, to upload them to Hugging Face), use this.

In [None]:
# `load_dataset` is provided by the Hugging Face `datasets` package. Importing
# it here lets this cell run in a fresh kernel instead of raising NameError.
from datasets import load_dataset

# Load dataset from the JSONL file
dataset = load_dataset('json', data_files=jsonl_file, split='train')

# Split the dataset. The rows are shuffled before splitting; pass seed=<int>
# to train_test_split if the same split must be reproduced across runs.
split_dataset = dataset.train_test_split(test_size=0.2)  # 20% test data

# Access the training and testing splits
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

# Define paths for saving the datasets
jsonl_train = 'train_network.jsonl'
jsonl_test = 'test_network.jsonl'

# Save each split to its own JSONL file, one JSON object per line
for out_path, split_rows in ((jsonl_train, train_dataset), (jsonl_test, test_dataset)):
    with open(out_path, 'w', encoding='utf-8') as out_f:
        for example in split_rows:
            out_f.write(json.dumps(example) + "\n")