add clip H config (NVIDIA#9082)
* add clip H config

* add comment to 1st line of yaml
JRD971000 authored May 1, 2024
1 parent e267406 commit 3d87ed7
Showing 1 changed file with 204 additions and 0 deletions.
@@ -0,0 +1,204 @@
# An example model that works with this config is "https://huggingface.co/yuvalkirstain/PickScore_v1"
model:
  precision: 32
  # specify micro_batch_size, global_batch_size, and model parallelism
  # gradient accumulation will be done automatically based on data_parallel_size
  micro_batch_size: 32 # limited by GPU memory
  global_batch_size: 32 # will use more micro batches to reach global batch size
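  # e.g. the number of gradient-accumulation micro batches per optimizer step is
  # global_batch_size // (micro_batch_size * data_parallel_size); with the values above on a
  # single data-parallel rank that is 32 // (32 * 1) = 1, while global_batch_size: 256 on the
  # same single rank would give 256 // (32 * 1) = 8 accumulation steps.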
  tensor_model_parallel_size: 1 # intra-layer model parallelism
  pipeline_model_parallel_size: 1 # inter-layer model parallelism
  virtual_pipeline_model_parallel_size: null # interleaved pipeline

  restore_from_pretrained: null # used in fine-tuning
  # multimodal configs
  output_dim: 1024
  # As the number of devices used for training increases, so does the space complexity of
  # the logit matrix. With a naïve all-gather scheme, space complexity is `O(n^2)`.
  # With `gather_with_grad` and `local_loss` enabled, complexity becomes effectively linear,
  # with results numerically identical to the naïve method.
  local_loss: False # calculate loss with local features @ global features (instead of realizing the full global @ global matrix)
  gather_with_grad: True # enable full distributed gradients for the feature gather; setting this to False may cause convergence issues
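  # Illustration: with a global batch of B image-text pairs gathered from N data-parallel ranks,
  # the full similarity matrix is B x B per rank, whereas local_loss only materializes each
  # rank's (B/N) x B slice, which is what keeps memory growth roughly linear in N.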

  vision:
    precision: 32
    # vision configs
    patch_dim: 14
    img_h: 224
    img_w: 224
    image_mean: null
    image_std: null
    num_channels: 3
    drop_patch_rate: 0.0
    drop_path_rate: 0.0
    global_average_pool: False
    output_dim: ${model.output_dim}
    class_token_length: 1
    preprocess_layernorm: True # apply layer norm to embedded tokens

    # model architecture
    encoder_seq_length: 196
    max_position_embeddings: ${.encoder_seq_length}
    position_embedding_type: learned_parameters
    num_layers: 32
    hidden_size: 1280
    ffn_hidden_size: 5120 # Transformer FFN hidden size. Usually 4 * hidden_size.
    num_attention_heads: 16
    init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
    use_scaled_init_method: True # use scaled residuals initialization
    hidden_dropout: 0. # Dropout probability for hidden state transformer.
    attention_dropout: 0.
    kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
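    # e.g. with hidden_size 1280 and 16 heads this resolves to 1280 // 16 = 80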
    apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
    normalization: layernorm # Type of normalization layers
    layernorm_epsilon: 1e-5
    do_layer_norm_weight_decay: False # True means weight decay on all params
    pre_process: True # add embedding
    post_process: True # add pooler
    persist_layer_norm: True # Use of persistent fused layer norm kernel.

    ## Activation Checkpointing
    activations_checkpoint_granularity: null # 'selective' or 'full'
    activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
    activations_checkpoint_num_layers: null # not used with 'selective'
    sequence_parallel: False

    # precision
    native_amp_init_scale: 4294967296 # 2 ** 32
    native_amp_growth_interval: 1000
    hysteresis: 2 # Gradient scale hysteresis
    fp32_residual_connection: False # Move residual connections to fp32
    fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

    # model fusions
    masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
    bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.

    use_cpu_initialization: False # Init weights on the CPU (slow for large models)
    onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
    gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism.
    openai_gelu: False
    bias_activation_fusion: False
    megatron_legacy: True
    activation: gelu



  text:
    precision: 32
    # text configs
    output_dim: ${model.output_dim}

    # model architecture
    encoder_seq_length: 77
    max_position_embeddings: ${.encoder_seq_length}
    position_embedding_type: learned_parameters
    num_layers: 24
    hidden_size: 1024
    ffn_hidden_size: 4096 # Transformer FFN hidden size. Usually 4 * hidden_size.
    num_attention_heads: 16
    init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
    use_scaled_init_method: True # use scaled residuals initialization
    hidden_dropout: 0. # Dropout probability for hidden state transformer.
    attention_dropout: 0.
    kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
    apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
    normalization: layernorm # Type of normalization layers
    layernorm_epsilon: 1e-5
    do_layer_norm_weight_decay: False # True means weight decay on all params
    pre_process: True # add embedding
    post_process: True # add pooler
    persist_layer_norm: True # Use of persistent fused layer norm kernel.

    ## Activation Checkpointing
    activations_checkpoint_granularity: null # 'selective' or 'full'
    activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
    activations_checkpoint_num_layers: null # not used with 'selective'
    num_micro_batches_with_partial_activation_checkpoints: null
    activations_checkpoint_layers_per_pipeline: null
    sequence_parallel: False

    # precision
    native_amp_init_scale: 4294967296 # 2 ** 32
    native_amp_growth_interval: 1000
    hysteresis: 2 # Gradient scale hysteresis
    fp32_residual_connection: False # Move residual connections to fp32
    fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

    # model fusions
    masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
    bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.

    use_cpu_initialization: False # Init weights on the CPU (slow for large models)
    onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
    gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism.
    openai_gelu: False
    bias_activation_fusion: False
    megatron_legacy: True

    transformer_engine: False
    fp8: False # enables fp8 in TransformerLayer forward
    fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
    fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
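    # (in Transformer Engine, HYBRID uses E4M3 for the forward pass and E5M2 for gradients)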
    fp8_margin: 0 # scaling margin
    fp8_interval: 1 # scaling update interval
    fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor
    fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history
    use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
    activation: gelu

  # Megatron O2-style half-precision
  megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce

  # miscellaneous
  seed: 1234
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  tokenizer:
    library: 'huggingface'
    type: 'openai/clip-vit-large-patch14'
    model: null
    vocab_file: null
    merge_file: null
    delimiter: null # only used for tabular tokenizer
    sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
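  # e.g. the 49,408-token CLIP BPE vocabulary is already a multiple of 128 (386 * 128 = 49,408), so no padding would be added here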

  data:
    num_workers: 8
    train:
      dataset_path: # List of paths to pkl files or tar files
        - /datasets/coyo/test.pkl
    validation: # List of paths to pkl files or tar files
      dataset_path:
        - /datasets/coyo/test.pkl
    webdataset:
      infinite_sampler: False
      local_root_path: /datasets/coyo

    imagenet_val: null # Path to imagenet val set for conducting zero shot evaluation.

  # Nsys profiling options
  nsys_profile:
    enabled: False
    start_step: 10 # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [ 0 ] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes
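    # A typical Nsight Systems launch for this capture-range style of profiling (illustrative, with placeholder names) is:
    #   nsys profile -s none -t nvtx,cuda -o <report_name> --capture-range=cudaProfilerApi --capture-range-end=stop python <training_script>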

  optim:
    name: fused_adam
    lr: 1e-3
    weight_decay: 0.2
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 2000
      constant_steps: 0
      min_lr: 1e-5
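
As a quick sanity check of how these values compose, here is a minimal Python sketch (assumptions: the file is saved as megatron_clip_VIT-H-14.yaml, a generic linear-warmup-plus-cosine-annealing formula stands in for NeMo's CosineAnnealing scheduler, and the total step count is made up for illustration). It loads the config with OmegaConf, which resolves the ${model.output_dim} and ${.encoder_seq_length} interpolations used above, and prints the learning rate implied by the optim block at a few steps.

# Minimal sketch: load a CLIP config like the one above and trace the implied LR schedule.
# The file name and total step count are assumptions for illustration only.
import math

from omegaconf import OmegaConf

cfg = OmegaConf.load("megatron_clip_VIT-H-14.yaml")

# OmegaConf resolves the interpolations on access.
print(cfg.model.vision.output_dim)             # 1024, via ${model.output_dim}
print(cfg.model.text.max_position_embeddings)  # 77, via ${.encoder_seq_length}


def lr_at(step: int, total_steps: int) -> float:
    """Linear warmup to optim.lr, then cosine annealing down to sched.min_lr."""
    peak = float(cfg.model.optim.lr)                   # 1e-3
    floor = float(cfg.model.optim.sched.min_lr)        # 1e-5
    warmup = int(cfg.model.optim.sched.warmup_steps)   # 2000
    if step < warmup:
        return peak * step / max(warmup, 1)
    progress = (step - warmup) / max(total_steps - warmup, 1)
    return floor + 0.5 * (peak - floor) * (1.0 + math.cos(math.pi * progress))


total = 100_000  # assumed number of training steps
for step in (0, 1_000, 2_000, 50_000, 100_000):
    print(step, f"{lr_at(step, total):.2e}")

The float() casts guard against the YAML 1.1 quirk where unquoted scientific notation such as 1e-3 can be read back as a string rather than a float.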
