# MobileVLM-1.7B / configuration_mobilevlm.py
from typing import List, Optional
from transformers import PretrainedConfig
# Default values, mirrored from the config.json shipped with
# mtgv/MobileVLM-1.7B (exported with transformers 4.33.1).
config = {
    "_name_or_path": "mtgv/MobileVLM-1.7B",
    "architectures": ["MobileLlamaForCausalLM"],
    "bos_token_id": 1,
    "eos_token_id": 2,
    "freeze_mm_mlp_adapter": False,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "image_aspect_ratio": "pad",
    "image_grid_pinpoints": None,
    "initializer_range": 0.02,
    "intermediate_size": 5632,
    "max_position_embeddings": 2048,
    "max_sequence_length": 2048,
    "mm_hidden_size": 1024,
    "mm_projector_type": "ldpnet",
    "mm_use_im_patch_token": False,
    "mm_use_im_start_end": False,
    "mm_vision_select_feature": "patch",
    "mm_vision_select_layer": -2,
    "mm_vision_tower": "openai/clip-vit-large-patch14-336",
    "model_type": "mobilevlm",
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "num_key_value_heads": 16,
    "pad_token_id": 0,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": None,
    "rope_theta": 10000.0,
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.33.1",
    "tune_mm_mlp_adapter": False,
    "use_cache": True,
    "use_mm_proj": True,
    "vision_tower_type": "clip",
    "vocab_size": 32000,
}
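
# Sanity notes on the numbers above: hidden_size / num_attention_heads gives a
# per-head dimension of 2048 / 16 = 128, and num_key_value_heads equal to
# num_attention_heads means standard multi-head attention rather than
# grouped-query attention. mm_hidden_size (1024) is the feature width of the
# CLIP ViT-L/14 vision tower, which the projector maps into the 2048-wide LM.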
class MobileVLMConfig(PretrainedConfig):
    """Configuration for MobileVLM: a MobileLLaMA causal LM paired with a CLIP
    vision tower and an "ldpnet" multimodal projector. Defaults mirror the
    `config` dict above."""

    model_type = "mobilevlm"

    def __init__(
        self,
        # _name_or_path is managed by PretrainedConfig, so it is not a parameter here.
        architectures: Optional[List[str]] = config["architectures"],
        bos_token_id: Optional[int] = config["bos_token_id"],
        eos_token_id: Optional[int] = config["eos_token_id"],
        freeze_mm_mlp_adapter: Optional[bool] = config["freeze_mm_mlp_adapter"],
        hidden_act: Optional[str] = config["hidden_act"],
        hidden_size: Optional[int] = config["hidden_size"],
        image_aspect_ratio: Optional[str] = config["image_aspect_ratio"],
        image_grid_pinpoints: Optional[List[List[int]]] = config["image_grid_pinpoints"],
        initializer_range: Optional[float] = config["initializer_range"],
        intermediate_size: Optional[int] = config["intermediate_size"],
        max_position_embeddings: Optional[int] = config["max_position_embeddings"],
        max_sequence_length: Optional[int] = config["max_sequence_length"],
        mm_hidden_size: Optional[int] = config["mm_hidden_size"],
        mm_projector_type: Optional[str] = config["mm_projector_type"],
        mm_use_im_patch_token: Optional[bool] = config["mm_use_im_patch_token"],
        mm_use_im_start_end: Optional[bool] = config["mm_use_im_start_end"],
        mm_vision_select_feature: Optional[str] = config["mm_vision_select_feature"],
        mm_vision_select_layer: Optional[int] = config["mm_vision_select_layer"],
        mm_vision_tower: Optional[str] = config["mm_vision_tower"],
        # model_type is declared as a class attribute above, so it is not a parameter here.
        num_attention_heads: Optional[int] = config["num_attention_heads"],
        num_hidden_layers: Optional[int] = config["num_hidden_layers"],
        num_key_value_heads: Optional[int] = config["num_key_value_heads"],
        pad_token_id: Optional[int] = config["pad_token_id"],
        pretraining_tp: Optional[int] = config["pretraining_tp"],
        rms_norm_eps: Optional[float] = config["rms_norm_eps"],
        rope_scaling: Optional[dict] = config["rope_scaling"],
        rope_theta: Optional[float] = config["rope_theta"],
        tie_word_embeddings: Optional[bool] = config["tie_word_embeddings"],
        torch_dtype: Optional[str] = config["torch_dtype"],
        transformers_version: Optional[str] = config["transformers_version"],
        tune_mm_mlp_adapter: Optional[bool] = config["tune_mm_mlp_adapter"],
        use_cache: Optional[bool] = config["use_cache"],
        use_mm_proj: Optional[bool] = config["use_mm_proj"],
        vision_tower_type: Optional[str] = config["vision_tower_type"],
        vocab_size: Optional[int] = config["vocab_size"],
        **kwargs,
    ):
        # architectures, the special-token ids, tie_word_embeddings, torch_dtype,
        # and transformers_version are all consumed by PretrainedConfig.__init__,
        # so they are forwarded to super() below rather than assigned here:
        # assigning them first and then calling super().__init__(**kwargs) would
        # let the base class overwrite them with its own defaults.
        self.freeze_mm_mlp_adapter = freeze_mm_mlp_adapter
        self.hidden_act = hidden_act
        self.hidden_size = hidden_size
        self.image_aspect_ratio = image_aspect_ratio
        self.image_grid_pinpoints = image_grid_pinpoints
        self.initializer_range = initializer_range
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.max_sequence_length = max_sequence_length
        self.mm_hidden_size = mm_hidden_size
        self.mm_projector_type = mm_projector_type
        self.mm_use_im_patch_token = mm_use_im_patch_token
        self.mm_use_im_start_end = mm_use_im_start_end
        self.mm_vision_select_feature = mm_vision_select_feature
        self.mm_vision_select_layer = mm_vision_select_layer
        self.mm_vision_tower = mm_vision_tower
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.num_key_value_heads = num_key_value_heads
        self.pretraining_tp = pretraining_tp
        self.rms_norm_eps = rms_norm_eps
        self.rope_scaling = rope_scaling
        self.rope_theta = rope_theta
        self.tune_mm_mlp_adapter = tune_mm_mlp_adapter
        self.use_cache = use_cache
        self.use_mm_proj = use_mm_proj
        self.vision_tower_type = vision_tower_type
        self.vocab_size = vocab_size
        super().__init__(
            architectures=architectures,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            torch_dtype=torch_dtype,
            transformers_version=transformers_version,
            **kwargs,
        )
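
# Minimal usage sketch (an illustration, not part of the released file): build a
# config from the defaults above and round-trip it through the standard
# PretrainedConfig save/load machinery. The temporary directory is arbitrary.
if __name__ == "__main__":
    import tempfile

    cfg = MobileVLMConfig()
    assert cfg.hidden_size == 2048 and cfg.num_hidden_layers == 24

    with tempfile.TemporaryDirectory() as tmp:
        cfg.save_pretrained(tmp)  # writes config.json into tmp
        reloaded = MobileVLMConfig.from_pretrained(tmp)
        assert reloaded.mm_vision_tower == "openai/clip-vit-large-patch14-336"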