CLI Examples¶

Small Example Configuration for a Single Model¶

# Minimal configuration: benchmarks a single torchvision model on the CPU
# with two precisions and two batch sizes.
defaults:
  - default
  - _self_

nvbenjo:
  models:
    shufflenet:
      type_or_path: torchvision:shufflenet_v2_x0_5
      shape: [B, 3, 224, 224]
      num_warmup_batches: 1
      num_batches: 2
      batch_sizes: [8, 16]
      devices: ["cpu"]
      # One named entry per runtime setting to benchmark.
      runtime_options:
        run_FP32:
          precision: FP32
        run_FP16:
          precision: FP16
      custom_batchmetrics:
        # Frames per second: `1.0 / time_total_batch_normalized`
        fps: 1.0

Example with Profiling¶

Example Configuration¶

# Two models in one run: a torchvision model with several runtime settings
# (one of them profiled) and an ONNX model with profiling enabled.
defaults:
  - default
  - _self_

nvbenjo:
  models:
    shufflenet_torch:
      # A torchvision model is referenced directly here; a local path or a
      # Hugging Face model identifier may be given instead.
      type_or_path: torchvision:shufflenet_v2_x0_5
      kwargs: {}
      shape: [B, 3, 224, 224]
      num_warmup_batches: 1
      num_batches: 2
      # Several batch sizes may be listed for benchmarking.
      batch_sizes: [1, 2]
      # Several devices may be listed as well.
      devices: ['cuda:0']

      # Each named entry is one runtime setting to benchmark (different
      # precisions or options). `amp`, `amp_fp16` and `amp_bfloat16` are also
      # accepted for automatic mixed precision.
      runtime_options:
        run_FP32:
          compile: false
          precision: FP32
          matmul_precision: highest
        run_FP16_compiled:
          compile: torch_compile
          cuda_graphs: false
          precision: fp16
        run_FP32_profiled:
          precision: FP32
          enable_profiling: true
          profiling_prefix: 'shufflenet_profile/shufflenet'

      custom_batchmetrics:
        # Custom metric: frames per second (fps), computed as
        # `1 / time_total_batch_normalized`
        fps: 1

    # An ONNX model, referenced by file path
    resnet_onnx:
      type_or_path: ~/Downloads/resnet50-v2-7.onnx

      # The input may also be described by a mapping that carries the input
      # name, its type and the min-max range of the values
      shape:
        - name: data
          shape: [B, 3, 224, 224]
          type: float
          min_max: [0, 1]

      num_warmup_batches: 3
      num_batches: 2
      batch_sizes: [1, 8, 16, 32]
      devices: ['cuda:0']
      runtime_options:
        DEFAULT:
          # ONNX Runtime session options
          intra_op_num_threads: 2
          graph_optimization_level: ORT_ENABLE_BASIC
          enable_profiling: true
          profiling_prefix: 'resnet_profile'

      custom_batchmetrics:
        # Custom metric: frames per second (fps), computed as
        # `1 / time_total_batch_normalized`
        fps: 1

PyTorch¶

Compilation & CUDA Graphs¶

Example Configuration with torch.compile, AOT compilation and CUDA Graphs¶

# Compares five runtime settings for one model: eager execution,
# torch.compile and ahead-of-time compilation, the latter two each with and
# without CUDA graphs.
defaults:
  - default
  - _self_

nvbenjo:
  models:
    resnet50:
      type_or_path: torchvision:resnet50
      shape: [B, 3, 224, 224]
      num_warmup_batches: 3
      num_batches: 5
      batch_sizes: [8]
      devices: ['cuda:0']
      runtime_options:
        # Baseline without any compilation
        no_compile:
          precision: AMP_BFLOAT16
          compile: false
        # Plain torch.compile
        torch_compile:
          precision: AMP_BFLOAT16
          compile: torch_compile
        # torch.compile combined with CUDA graphs
        torch_compile_graphed:
          precision: AMP_BFLOAT16
          compile: torch_compile
          cuda_graphs: true
        # Ahead-of-time (AOT) compilation, CUDA graphs disabled
        aot_compile:
          precision: BFLOAT16
          compile: aot_compile
          cuda_graphs: false
        # Ahead-of-time (AOT) compilation combined with CUDA graphs
        aot_compile_graphed:
          precision: BFLOAT16
          compile: aot_compile
          cuda_graphs: true

Torchvision¶

Example Configuration of Torchvision Models¶

# Benchmarks three torchvision models on one GPU, each with several
# precision settings.
defaults:
  - default
  - _self_

nvbenjo:
  models:
    resnet50:
      type_or_path: torchvision:resnet50
      shape: [B, 3, 224, 224]
      num_warmup_batches: 5
      num_batches: 10
      batch_sizes: [1, 8, 16]
      devices: ["cuda:0"]
      runtime_options:
        FP32:
          precision: FP32
        AMP_FP16:
          precision: AMP_FP16
        AMP_BFLOAT16:
          precision: AMP_BFLOAT16

    vit-base-patch16-224:
      type_or_path: torchvision:vit_b_16
      shape: [B, 3, 224, 224]
      num_warmup_batches: 5
      num_batches: 10
      batch_sizes: [16]
      devices: ["cuda:0"]
      runtime_options:
        # Two FP32 runs that differ only in `matmul_precision`
        FP32_highest:
          matmul_precision: highest
          precision: FP32
        FP32_medium:
          matmul_precision: medium
          precision: FP32
        AMP_FP16:
          precision: AMP_FP16
        AMP_BFLOAT16:
          precision: AMP_BFLOAT16

    alexnet:
      type_or_path: torchvision:alexnet
      shape: [B, 3, 224, 224]
      num_warmup_batches: 5
      num_batches: 10
      batch_sizes: [1, 8, 16]
      devices: ["cuda:0"]
      runtime_options:
        FP32:
          precision: FP32
        AMP_FP16:
          precision: AMP_FP16
        AMP_BFLOAT16:
          precision: AMP_BFLOAT16

Hugging Face¶

Example Configuration of Hugging Face Models¶

# Benchmarks several Hugging Face models. The shared run settings
# (warm-up batches, batches, batch sizes, devices) are anchored on the first
# model (`bert`) and reused by the other models through aliases.
defaults:
  - default
  - _self_

nvbenjo:
  models:
    bert:
      type_or_path: huggingface:google-bert/bert-base-cased
      kwargs: {}
      shape:
        - name: input_ids
          shape: [B, 200]
          type: long
          min_max: [0, 200]
      # Anchors: the values below are aliased by every following model.
      num_warmup_batches: &num-warmup-batches 1
      num_batches: &num-batches 10
      batch_sizes: &batch-sizes [1, 8, 16]
      devices: &devices ["cuda:0"]
      runtime_options:
        FP16:
          precision: FP16
        FP32:
          precision: FP32
        AMP_FP16:
          precision: AMP_FP16
      custom_batchmetrics:
        # `tokens_per_batch / time_total_batch_normalized` to get tokens per second
        tokens_per_second: 200.0

    gpt2:
      type_or_path: huggingface:openai-community/gpt2
      kwargs: {}
      shape:
        - name: input_ids
          shape: [B, 90]
          type: long
          min_max: [0, 200]
      num_warmup_batches: *num-warmup-batches
      num_batches: *num-batches
      batch_sizes: *batch-sizes
      devices: *devices
      runtime_options:
        BFLOAT16:
          compile: false
          precision: BFLOAT16
        FP32:
          compile: false
          precision: FP32
      custom_batchmetrics:
        # `tokens_per_batch / time_total_batch_normalized` to get tokens per second
        tokens_per_second: 90.0

    ast-audioset:
      type_or_path: huggingface:MIT/ast-finetuned-audioset-10-10-0.4593
      kwargs: {}
      shape:
        - name: input_values
          shape: [B, 1024, 128]
          type: float
          min_max: [0, 1]
      num_warmup_batches: *num-warmup-batches
      num_batches: *num-batches
      batch_sizes: *batch-sizes
      devices: *devices
      runtime_options:
        FP32:
          precision: FP32
        AMP_FP16:
          precision: AMP_FP16
        AMP_BFLOAT16:
          precision: AMP_BFLOAT16
      custom_batchmetrics:
        # `audio_duration / time_total_batch_normalized` to get real time factor
        real_time_factor: 10.24

    openai/whisper-small:
      type_or_path: huggingface:openai/whisper-small
      kwargs: {}
      # Three inputs: one filled from a min-max range, two filled with a
      # constant `value`.
      shape:
        - name: input_features
          shape: [B, 80, 3000]
          min_max: [0, 1]
        - name: decoder_input_ids
          shape: [B, 6]
          value: 50258
          type: long
        - name: attention_mask
          shape: [B, 6]
          value: 1
          type: long
      num_warmup_batches: *num-warmup-batches
      num_batches: *num-batches
      batch_sizes: *batch-sizes
      devices: *devices
      runtime_options:
        FP16:
          precision: FP16
        FP32:
          precision: FP32
      custom_batchmetrics:
        # `audio_duration / time_total_batch_normalized` to get real time factor
        real_time_factor: 30.0

    google/t5:
      type_or_path: huggingface:google/t5-v1_1-small
      kwargs: {}
      shape:
        - name: input_ids
          shape: [B, 512]
          type: long
          min_max: [0, 32128]
        - name: attention_mask
          shape: [B, 512]
          type: long
          value: 1
        - name: decoder_input_ids
          shape: [B, 8]
          type: long
          value: 0
      num_warmup_batches: *num-warmup-batches
      num_batches: *num-batches
      batch_sizes: *batch-sizes
      devices: *devices
      runtime_options:
        FP16:
          precision: FP16
        FP32:
          precision: FP32
      custom_batchmetrics:
        # `tokens_per_batch / time_total_batch_normalized` to get tokens per second
        tokens_per_second: 512.0

    mobilevit:
      type_or_path: huggingface:apple/mobilevit-small
      kwargs: {}
      shape:
        - name: pixel_values
          shape: [B, 3, 224, 224]
          type: float
          min_max: [0, 1]
      num_warmup_batches: *num-warmup-batches
      num_batches: *num-batches
      batch_sizes: *batch-sizes
      devices: *devices
      runtime_options:
        FP32:
          precision: FP32
        AMP_FP16:
          precision: AMP_FP16
        AMP_BFLOAT16:
          precision: AMP_BFLOAT16
      custom_batchmetrics:
        # `images_per_batch / time_total_batch_normalized` to get images per second
        fps: 1

ONNX¶

Example Configuration of an ONNX Model¶

# Benchmarks an ONNX model from a local file on GPU and CPU with profiling
# enabled.
defaults:
  - default
  - _self_

nvbenjo:
  models:
    resnet:
      # Download the model first (`-O` sets the output file; the URL is quoted
      # because it contains `?`):
      # wget "https://github.com/onnx/models/raw/refs/heads/main/validated/vision/classification/resnet/model/resnet50-v2-7.onnx?download=" -O ~/Downloads/resnet50-v2-7.onnx
      type_or_path: ~/Downloads/resnet50-v2-7.onnx
      shape:
        - name: data
          shape: [B, 3, 224, 224]
          type: float
          min_max: [0, 1]
      num_warmup_batches: 3
      num_batches: 10
      batch_sizes: [1, 8, 16, 32]
      devices: ["cuda:0", "cpu"]
      runtime_options:
        DEFAULT:
          intra_op_num_threads: 2
          graph_optimization_level: ORT_ENABLE_BASIC
          enable_profiling: true
          profiling_prefix: "resnet_profile"

Example Configuration of an ONNX Silero Model¶

# Benchmarks the Silero speech-to-text ONNX model with explicitly chosen
# execution providers (CPU only, and CUDA with CPU listed second).
defaults:
  - default
  - _self_

nvbenjo:
  models:
    silero:
      # see https://github.com/snakers4/silero-models/blob/master/models.yml
      # wget https://models.silero.ai/models/en/en_v5.onnx -O ~/Downloads/silero_model_stt_v5.onnx
      type_or_path: ~/Downloads/silero_model_stt_v5.onnx
      shape:
        - name: input
          shape: [B, 16000]
          type: float
          min_max: [-1, 1]
      num_warmup_batches: 5
      num_batches: 20
      batch_sizes: [1, 8, 16, 128]

      # Specify the execution providers to use directly
      runtime_options:
        DEFAULT_CPU:
          execution_providers: ["CPUExecutionProvider"]
          intra_op_num_threads: 4
          graph_optimization_level: ORT_ENABLE_EXTENDED
          enable_profiling: true
          profiling_prefix: "silero_profile/silero"
        DEFAULT_CUDA:
          # A provider may be given as a `[name, options]` pair or as a bare name.
          execution_providers:
            - ["CUDAExecutionProvider", {"device_id": 0}]
            - "CPUExecutionProvider"
          intra_op_num_threads: 4
          graph_optimization_level: ORT_ENABLE_EXTENDED
          enable_profiling: true
          profiling_prefix: "silero_profile/silero_cuda"

Advanced Examples¶

Example Configuration¶

# Combines a Hugging Face model (with one profiled runtime setting) and an
# ONNX model (with a plain and a profiled runtime setting) in a single run.
defaults:
  - default
  - _self_

nvbenjo:
  models:
    openai/whisper-small:
      type_or_path: huggingface:openai/whisper-small
      kwargs: {}
      # Three inputs: one filled from a min-max range, two filled with a
      # constant `value`.
      shape:
        - name: input_features
          shape: [B, 80, 3000]
          min_max: [0, 1]
        - name: decoder_input_ids
          shape: [B, 6]
          value: 50258
          type: long
        - name: attention_mask
          shape: [B, 6]
          value: 1
          type: long
      num_warmup_batches: 1
      num_batches: 10
      batch_sizes: [1, 8, 16]
      devices: ['cuda:0']
      runtime_options:
        FP16:
          precision: FP16
          compile: false
        FP32:
          precision: FP32
          compile: false
        FP32_profiled:
          precision: FP32
          compile: false
          compile_kwargs: {dynamic: true}
          enable_profiling: true
          profiler_kwargs: {}
          profiling_prefix: 'whisper_small_profile/whisper_small'

      custom_batchmetrics:
        # Custom metric: Real Time Factor (RTF).
        # The input is 30s of audio, so RTF = 30 / time_total_batch_normalized
        real_time_factor: 30

    silero:
      type_or_path: ~/Downloads/silero_model_stt_v5.onnx
      shape:
        - name: input
          shape: [B, 16000]
          type: float
          min_max: [-1, 1]
      num_warmup_batches: 5
      num_batches: 20
      batch_sizes: [1, 2]
      devices: ['cuda:0', 'cpu']
      runtime_options:
        default:
          intra_op_num_threads: 4
          inter_op_num_threads: 0
          graph_optimization_level: ORT_ENABLE_EXTENDED
          log_severity_level: 2
        # Same session options as `default`, with profiling switched on
        profiled:
          intra_op_num_threads: 4
          inter_op_num_threads: 0
          graph_optimization_level: ORT_ENABLE_EXTENDED
          log_severity_level: 2
          enable_profiling: true
          profiling_prefix: 'silero_profile/silero'