FreshBench / get_loss /get_loss.py
jijivski
okay on local phi-2
3fe3e10
raw
history blame contribute delete
No virus
9.91 kB
# import packages
import os
from tqdm import tqdm
import warnings
import json
import torch.nn.functional as F
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from datetime import datetime
import argparse
import mamba_ssm
import rwkv
# Tokenizer JSON file handed to the RWKV PIPELINE when loading RWKV-4 "pile"
# checkpoints (used by load_rwkv4pile). This is a relative path.
RWKV4_TOKENIZER_FILE = "./support/20B_tokenizer.json"
def load_list_from_json(file_path):
    """
    Read a UTF-8 encoded JSON file and return its decoded content.

    :param file_path: Path of the JSON file to read.
    :return: The decoded JSON value; callers in this file treat it as a list of strings.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def calculate_log_sum(logits, target_token_ids):
    """
    Sum the negative log-probabilities assigned to each next token.

    The logits at position ``i`` are scored against the token at position
    ``i + 1``, so the last row of ``logits`` and the first entry of
    ``target_token_ids`` are dropped before scoring.

    :param logits: 2-D tensor indexed as ``[position, vocabulary entry]``.
    :param target_token_ids: 1-D integer tensor of token ids for the same positions.
    :return: The summed negative log-likelihood as a Python float.
    """
    log_probabilities = F.log_softmax(logits[:-1, :], dim=-1)
    next_tokens = target_token_ids[1:].unsqueeze(1)
    # Pick, for every position, the log-probability of the token that actually followed.
    picked = log_probabilities.gather(1, next_tokens).squeeze()
    return torch.sum(-picked, dim=-1).item()
def print_model_parameters_in_billions(model):
    """
    Print the total number of parameters of ``model``, expressed in billions.

    :param model: Object exposing ``parameters()`` whose items provide ``numel()``.
    """
    parameter_count = 0
    for parameter in model.parameters():
        parameter_count += parameter.numel()
    print(f"Model parameters: {parameter_count / 1e9:.3f} billion")
def make_log(data_dict, folder_path):
    """
    Save ``data_dict`` as an indented JSON file named after the current time.

    The file is written to ``folder_path/<YYYY-MM-DD_HH-MM-SS>.json``. The
    folder is created when it does not exist. Failures are reported on stdout
    instead of being raised, so logging never aborts the caller.

    :param data_dict: JSON-serializable dictionary to store.
    :param folder_path: Directory that receives the log file.
    :return: None.
    """
    if not os.path.isdir(folder_path):
        try:
            # exist_ok avoids a failure when the folder appears between the
            # check above and this call.
            os.makedirs(folder_path, exist_ok=True)
            print(f"Directory created at {folder_path}")
        except OSError as e:
            print(f"Error creating directory: {e}")
            return

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    file_path = os.path.join(folder_path, f"{timestamp}.json")

    try:
        # Serialize before opening the file so an unserializable value cannot
        # leave a truncated log file behind.
        payload = json.dumps(data_dict, indent=4)
    except (TypeError, ValueError, RecursionError) as e:
        print(f"Error saving dictionary: {e}")
        return

    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(payload)
        print(f"Dictionary saved successfully to {file_path}")
    except OSError as e:
        print(f"Error saving dictionary: {e}")
def _load_rwkv_with_tokenizer(path, tokenizer_spec):
    """Load an RWKV checkpoint with strategy 'cuda fp16' and return (model, pipeline tokenizer)."""
    # Set before importing rwkv.model; presumably the package reads these
    # flags at import time - TODO confirm against the rwkv package.
    os.environ['RWKV_JIT_ON'] = '1'
    os.environ["RWKV_CUDA_ON"] = '1'

    from rwkv.model import RWKV
    from rwkv.utils import PIPELINE

    rwkv_model = RWKV(model=path, strategy='cuda fp16')
    rwkv_pipeline = PIPELINE(rwkv_model, tokenizer_spec)
    return rwkv_model, rwkv_pipeline.tokenizer


def load_rwkv(path):
    """
    Load an RWKV model together with the ``rwkv_vocab_v20230424`` tokenizer.

    :param path: Model path forwarded to ``rwkv.model.RWKV``.
    :return: Tuple ``(model, tokenizer)``.
    """
    return _load_rwkv_with_tokenizer(path, r"rwkv_vocab_v20230424")


def load_rwkv4pile(path):
    """
    Load an RWKV-4 "pile" model together with the tokenizer in RWKV4_TOKENIZER_FILE.

    :param path: Model path forwarded to ``rwkv.model.RWKV``.
    :return: Tuple ``(model, tokenizer)``.
    """
    return _load_rwkv_with_tokenizer(path, RWKV4_TOKENIZER_FILE)
def load_hf_model(path, cache_path):
    """
    Load a Hugging Face causal language model and its tokenizer.

    The model is placed on CUDA and switched to eval mode, and its parameter
    count is printed.

    NOTE: ``trust_remote_code=True`` allows code shipped in the model
    repository to run; only use repositories you trust.

    :param path: Model name or local path passed to ``from_pretrained``.
    :param cache_path: Optional download cache directory; ``None`` uses the
        library default.
    :return: Tuple ``(model, tokenizer)``.
    """
    # Shared by tokenizer and model so both resolve files through the same
    # cache directory and both accept repositories that ship custom code.
    shared_kwargs = {"trust_remote_code": True}
    if cache_path is not None:
        shared_kwargs["cache_dir"] = cache_path

    hf_tokenizer = AutoTokenizer.from_pretrained(path, **shared_kwargs)
    hf_model = AutoModelForCausalLM.from_pretrained(
        path,
        device_map="cuda",
        **shared_kwargs,
    ).eval()

    print_model_parameters_in_billions(hf_model)
    return hf_model, hf_tokenizer
def load_mamba(path):
    """
    Load a Mamba language model in float16 on CUDA with the GPT-NeoX-20B tokenizer.

    :param path: Model name or path passed to ``MambaLMHeadModel.from_pretrained``.
    :return: Tuple ``(model, tokenizer)``.
    """
    from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    model = MambaLMHeadModel.from_pretrained(path, device="cuda", dtype=torch.float16)

    # eval_hf_model moves its inputs to ``model.device``, so the attribute is
    # set explicitly here (presumably the Mamba class does not provide it -
    # TODO confirm).
    model.device = torch.device('cuda')

    print_model_parameters_in_billions(model)
    return model, tokenizer
def eval_rwkv(model, tokenizer, texts, chunk_size, v4pile=False):
    """
    Run an RWKV model over each text and return the outputs for the last one.

    Only the first ``chunk_size`` tokens of every text are fed to the model.
    The chunked pass over the whole sequence and the loss aggregation via
    ``calculate_log_sum`` are currently disabled.

    :param model: RWKV model exposing ``forward(tokens, state, full_output=True)``.
    :param tokenizer: Tokenizer exposing ``encode``.
    :param texts: Non-empty sequence of strings to evaluate.
    :param chunk_size: Maximum number of leading tokens scored per text.
    :param v4pile: When True, token ids are read from the ``.ids`` attribute
        of the ``encode`` result instead of using the result directly.
    :return: Tuple ``(logit, logit, input_chunk, tokenizer)`` for the last
        text. The logits appear twice; this shape is kept for existing callers
        and differs from the 3-tuple returned by ``eval_hf_model``.
    :raises ValueError: If ``texts`` is empty.
    """
    if len(texts) == 0:
        # Without this guard the return below fails with an UnboundLocalError.
        raise ValueError("texts must contain at least one sample")

    for sample in tqdm(texts, total=len(texts)):
        with torch.no_grad():
            encoded = tokenizer.encode(sample)
            input_seq = encoded.ids if v4pile else encoded

            input_chunk = input_seq[:chunk_size]
            logit = model.forward(input_chunk, None, full_output=True)[0]

            if len(input_chunk) == 1:
                # Presumably a single token yields logits without the position
                # axis; restore it so the result stays 2-D - TODO confirm.
                logit = logit.unsqueeze(0)

    return logit, logit, input_chunk, tokenizer
def eval_hf_model(model, tokenizer, texts, chunk_size):
    """
    Run a Hugging Face style model over each text and return the outputs for the last one.

    Only the first ``chunk_size`` tokens of every text are fed to the model.
    The chunked pass over the whole sequence and the loss aggregation via
    ``calculate_log_sum`` are currently disabled.

    :param model: Model exposing ``device`` and ``forward(input_ids=...)``
        whose result has a ``logits`` attribute.
    :param tokenizer: Callable tokenizer accepting ``return_tensors='pt'``.
    :param texts: Non-empty sequence of strings to evaluate.
    :param chunk_size: Maximum number of leading tokens scored per text.
    :return: Tuple ``(logit, input_chunk, tokenizer)`` for the last text, where
        ``logit`` is the first batch entry of the model logits and
        ``input_chunk`` is the truncated ``input_ids`` tensor.
    :raises ValueError: If ``texts`` is empty.
    """
    if len(texts) == 0:
        # Without this guard the return below fails with an UnboundLocalError.
        raise ValueError("texts must contain at least one sample")

    for sample in tqdm(texts, total=len(texts)):
        with torch.no_grad():
            inputs = tokenizer(sample, return_tensors='pt')
            inputs = inputs.to(model.device)

            input_chunk = inputs['input_ids'][:, :chunk_size]
            logit = model.forward(input_ids=input_chunk).logits[0, :, :]

    return logit, input_chunk, tokenizer
# if __name__ == '__main__':
# parser = argparse.ArgumentParser()
# parser.add_argument('--model', type=str, required=True, help='model name or path')
# parser.add_argument('--model_type', choices=['hf', 'rwkv', 'mamba', 'rwkv4pile'], required=True, help='model type')
# parser.add_argument('--data', type=str, required=True, help='data path (json file)')
# parser.add_argument('--log_path', type=str, default='./logs/', help='log file path')
# parser.add_argument('--model_cache', type=str, help='hugging face model cache')
# parser.add_argument('--chunk_size', type=int, default=1024, help='chunk size')
def run_get_loss(args):
    """
    Load the data and the requested model, then run the matching evaluation.

    :param args: Namespace-like object providing ``data`` (path of a JSON file
        with the texts), ``model`` (model name or path), ``model_type`` (one of
        ``'hf'``, ``'rwkv'``, ``'mamba'``, ``'rwkv4pile'``), ``chunk_size`` and,
        for ``'hf'`` models, ``model_cache``.
    :return: Whatever the evaluation routine returns: a 3-tuple from
        ``eval_hf_model`` for ``'hf'``/``'mamba'``, a 4-tuple from
        ``eval_rwkv`` for the RWKV variants.
    :raises NotImplementedError: If ``args.model_type`` is not supported.
    """
    # load data
    texts = load_list_from_json(args.data)
    print(f'data size: {len(texts)}')

    # load model
    model_type = args.model_type
    if model_type == 'hf':
        model, tokenizer = load_hf_model(args.model, args.model_cache)  # model path, cache dir
    elif model_type == 'rwkv':
        model, tokenizer = load_rwkv(args.model)
    elif model_type == 'mamba':
        model, tokenizer = load_mamba(args.model)
    elif model_type == 'rwkv4pile':
        model, tokenizer = load_rwkv4pile(args.model)
    else:
        raise NotImplementedError(f"unsupported model_type: {model_type!r}")

    # eval
    if model_type in ('hf', 'mamba'):
        return eval_hf_model(model=model, tokenizer=tokenizer, texts=texts, chunk_size=args.chunk_size)

    # Only the two RWKV variants can reach this point; they differ solely in
    # how the tokenizer output is unpacked.
    return eval_rwkv(
        model=model,
        tokenizer=tokenizer,
        texts=texts,
        chunk_size=args.chunk_size,
        v4pile=(model_type == 'rwkv4pile'),
    )
# results['model_name_or_path'] = args.model
# results['data_path'] = args.data
# results['chunk_size'] = args.chunk_size
# make_log(results, args.log_path)
# print(json.dumps(results, indent=4, ensure_ascii=False))
from types import SimpleNamespace
if __name__ == '__main__':
    # Example arguments for a local run with microsoft/phi-2.
    # NOTE(review): nothing is invoked with `args` here, and run_get_loss reads
    # its texts from the JSON file at `args.data`, not from `args.texts`.
    args=SimpleNamespace(model='microsoft/phi-2',texts=['Hello FreshBench !'],model_type='hf',data='data.json',model_cache=None,chunk_size=1024)
# def run_get_loss(input_string, model_type):
# # load data
# texts = [input_string]
# print(f'data size: {len(texts)}')
# # load model
# if model_type == 'hf':
# model, tokenizer = load_hf_model(args.model, args.model_cache)# tokenzier path, model path
# elif model_type == 'rwkv':
# model, tokenizer = load_rwkv(args.model)
# elif model_type == 'mamba':
# model, tokenizer = load_mamba(args.model)
# elif model_type == 'rwkv4pile':
# model, tokenizer = load_rwkv4pile(args.model)
# else:
# raise NotImplementedError
# # eval
# if model_type in ['hf', 'mamba']:
# results = eval_hf_model(model=model, tokenizer=tokenizer, texts=texts, chunk_size=args.chunk_size)
# elif model_type == 'rwkv':
# results = eval_rwkv(model=model, tokenizer=tokenizer, texts=texts, chunk_size=args.chunk_size)
# elif model_type == 'rwkv4pile':
# results = eval_rwkv(model=model, tokenizer=tokenizer, texts=texts, chunk_size=args.chunk_size, v4pile=True)
# else:
# raise NotImplementedError
# results['model_name_or_path'] = args.model
# results['data_path'] = args.data
# results['chunk_size'] = args.chunk_size
# make_log(results, args.log_path)
# print(json.dumps(results, indent=4, ensure_ascii=False))