import torch
from mmengine.evaluator import BaseMetric

from opencompass.registry import METRICS

# Mapping from SEED-Bench question type id to its evaluation dimension name.
EVAL_DIM_MAPPING = {
    1: 'Scene Understanding',
    2: 'Instance Identity',
    3: 'Instance Attributes',
    4: 'Instance Location',
    5: 'Instance Counting',
    6: 'Spatial Relations',
    7: 'Instance Interaction',
    8: 'Visual Reasoning',
    9: 'Text Recognition',
    10: 'Action Recognition',
    11: 'Action Prediction',
    12: 'Procedure Understanding',
}


@METRICS.register_module()
class SEEDBenchAcc(BaseMetric):
    """Compute results for SEED-Bench."""

    def process(self, data_batch, data_samples) -> None:
        for data_sample in data_samples:
            losses = data_sample['losses']
            # Rank the candidate options by loss; the option with the lowest
            # loss is taken as the prediction.
            class_ranks = torch.argsort(losses, dim=-1).cpu()
            pred_id = ['A', 'B', 'C', 'D'][class_ranks[0]]
            answer_record = {
                'q_id': data_sample['question_id'],
                'prediction': pred_id,
                'gt': data_sample['answer'],
                'q_type_id': data_sample['question_type_id'],
                'losses': [str(num) for num in list(losses.cpu().numpy())],
            }
            self.results.append(answer_record)

    def compute_metrics(self, results: list) -> dict:
        type_counts = {}
        correct_counts = {}

        out = {}
        out['answer_records'] = results
        for item in results:
            pred, gt = item['prediction'], item['gt']
            data_type = item['q_type_id']

            type_counts[data_type] = type_counts.get(data_type, 0) + 1
            if pred == gt:
                correct_counts[data_type] = correct_counts.get(data_type, 0) + 1

        # Per-dimension accuracy, plus an overall accuracy over all samples.
        total_count = 0
        total_correct = 0
        for data_type in type_counts.keys():
            accuracy = correct_counts.get(data_type,
                                          0) / type_counts[data_type] * 100
            category = EVAL_DIM_MAPPING[data_type]
            out[f'Data type {data_type} - {category}'] = accuracy

            total_count += type_counts[data_type]
            total_correct += correct_counts.get(data_type, 0)

        total_accuracy = total_correct / total_count * 100
        out['Total accuracy'] = total_accuracy
        return out
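

# --- Usage sketch (illustrative, not part of the upstream module) ---
# A minimal example of how the metric could be exercised directly, assuming a
# single hand-built data sample whose keys match those read in `process`
# above ('losses', 'question_id', 'answer', 'question_type_id'). The sample
# values below are fabricated for demonstration only.
if __name__ == '__main__':
    metric = SEEDBenchAcc()
    fake_sample = {
        # Lowest loss is at index 1, so the predicted option is 'B'.
        'losses': torch.tensor([0.9, 0.1, 0.5, 0.7]),
        'question_id': 'demo-0',
        'answer': 'B',
        'question_type_id': 1,  # 'Scene Understanding'
    }
    metric.process(data_batch=None, data_samples=[fake_sample])
    print(metric.compute_metrics(metric.results))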