# Some models have large datasets that don't fit in memory. Lower the batch
# size to test accuracy.
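# To add or change an override, put a "<model_name>: <batch_size>" pair under
# the matching mode below (a hypothetical example: "my_model: 2" under
# training).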
batch_size:
  training:
    demucs: 4
    dlrm: 1024
    densenet121: 4
    hf_Reformer: 4
    hf_T5_base: 4
    timm_efficientdet: 1
    llama_v2_7b_16h: 1
    # reduced from 16 due to cudagraphs OOM in TorchInductor dashboard
    yolov3: 8

  inference:
    timm_efficientdet: 32


dont_change_batch_size:
  - demucs
  - pytorch_struct
  - pyhpc_turbulent_kinetic_energy
  # https://github.com/pytorch/benchmark/pull/1656
  - vision_maskrcnn


tolerance:
  # These models need a looser (higher) tolerance on GPU because their GPU kernels are non-deterministic.
  higher:
    - alexnet
    - attention_is_all_you_need_pytorch
    - densenet121
    - hf_Albert
    - vgg16
    - mobilenet_v3_large
    - nvidia_deeprecommender
    - timm_efficientdet

  # These models need >1e-3 tolerance
  even_higher:
    - soft_actor_critic
    - tacotron2
    - yolov3
    - timm_efficientdet
    - squeezenet1_1

  higher_fp16:
    - doctr_reco_predictor
    - drq
    - hf_Whisper

  higher_bf16:
    - doctr_reco_predictor
    - drq
    - hf_Whisper

  cosine: []

require_larger_multiplier_for_smaller_tensor:
  - yolov3

# These benchmarks took >600s on an i9-11900K CPU
very_slow: &VERY_SLOW_MODELS
  # 3339s
  - hf_BigBird
  # 3062s
  - hf_Longformer
  # 930s
  - hf_T5


# These benchmarks took >60s on an i9-11900K CPU
slow:
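  # Note: *VERY_SLOW_MODELS is a YAML alias for the very_slow list above, so
  # it loads as a nested sequence; the consumer is presumably expected to
  # flatten it when reading this list.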
  - *VERY_SLOW_MODELS
  # 137s
  - BERT_pytorch
  # 116s
  - demucs
  # 242s
  - fastNLP_Bert
  # 221s
  - hf_Albert
  # 400s
  - hf_Bart
  # 334s
  - hf_Bert
  # 187s
  - hf_DistilBert
  # 470s
  - hf_GPT2
  # 141s
  - hf_Reformer
  # 317s
  - speech_transformer
  # 99s
  - vision_maskrcnn


non_deterministic:
  # https://github.com/pytorch/pytorch/issues/98355
  - mobilenet_v3_large
  - sam_fast


dtype:
  force_amp_for_fp16_bf16_models:
    - DALLE2_pytorch
    - doctr_det_predictor
    - doctr_reco_predictor
    - Super_SloMo
    - tts_angular
    - pyhpc_turbulent_kinetic_energy
    - detectron2_fcos_r_50_fpn

  force_fp16_for_bf16_models:
    - vision_maskrcnn


# models in canary_models that we should run anyway
canary_models:
  - torchrec_dlrm


detectron2_models: &DETECTRON2_MODELS
  - detectron2_fasterrcnn_r_101_c4
  - detectron2_fasterrcnn_r_101_dc5
  - detectron2_fasterrcnn_r_101_fpn
  - detectron2_fasterrcnn_r_50_c4
  - detectron2_fasterrcnn_r_50_dc5
  - detectron2_fasterrcnn_r_50_fpn
  - detectron2_maskrcnn_r_101_c4
  - detectron2_maskrcnn_r_101_fpn
  - detectron2_maskrcnn_r_50_fpn


# These models support only train mode, so accuracy checking can't be done in
# eval mode.
only_training:
  - *DETECTRON2_MODELS
  - tts_angular
  - tacotron2
  - demucs
  - hf_Reformer
  - pytorch_struct
  - yolov3


trt_not_yet_working:
  - alexnet
  - resnet18
  - resnet50
  - mobilenet_v2
  - mnasnet1_0
  - squeezenet1_1
  - shufflenetv2_x1_0
  - vgg16
  - resnext50_32x4d


skip:
  all:
    # OOMs (A100 40G)
    - detectron2_maskrcnn
    # TIMEOUT, https://github.com/pytorch/pytorch/issues/98467
    - tacotron2
    # Failing in eager mode
    - hf_clip
    # multi gpu not always available in benchmark runners
    - simple_gpt_tp_manual

  device:
    cpu:
      # OOMs
      - hf_T5_generate
      # model is CUDA only
      - cm3leon_generate
      # timeout
      - nanogpt
      # timeout
      - sam
      # model is CUDA only
      - sam_fast
      # model is CUDA only
      - llama_v2_7b_16h
      # flaky
      - stable_diffusion
      # requires FBGEMM, CUDA only
      - torchrec_dlrm
      - simple_gpt
      # works on cuda, accuracy failure on cpu
      - hf_Whisper
      - stable_diffusion_text_encoder
      - llava
      - moco

    cuda: []

  test:
    training:
      - *DETECTRON2_MODELS
      # not designed for training
      - pyhpc_equation_of_state
      - pyhpc_isoneutral_mixing
      - pyhpc_turbulent_kinetic_energy
      - maml
      - llama
      - llama_v2_7b_16h
      - simple_gpt
      - sam_fast
      # Model's DEFAULT_TRAIN_BSIZE is not implemented
      - cm3leon_generate
      - hf_T5_generate
      - doctr_det_predictor
      - doctr_reco_predictor
      - moondream
      # doesn't fit in memory
      - phi_1_5
      - detectron2_fcos_r_50_fpn

  control_flow:
    - cm3leon_generate
    - detectron2_fcos_r_50_fpn
    - fastNLP_Bert
    - hf_Longformer
    - hf_Reformer
    - hf_T5_generate
    - opacus_cifar10
    - speech_transformer

  # Models that should only run in --multiprocess mode
  multiprocess:
    - simple_gpt

  # For these models, conv-batchnorm fusion causes large numerical churn, so
  # skip them.
  # mnasnet1_0 and shufflenet_v2_x1_0 can pass on cpu; moco is cuda only.
  freezing:
    cuda:
      - mnasnet1_0
      - moco
      - shufflenet_v2_x1_0
    cpu: []




accuracy:
  skip:
    large_models:
      # Models too large to hold eager, dynamo, and fp64 numbers simultaneously,
      # even on a 40 GB machine. We have tested accuracy on smaller versions of
      # these models.
      - hf_GPT2_large
      - hf_T5_large
      - timm_vision_transformer_large
      # accuracy https://github.com/pytorch/pytorch/issues/93847
      - maml
      - llama_v2_7b_16h
      - Background_Matting
      - stable_diffusion_unet
    eager_not_deterministic:
      # Models for which deterministic algorithms cannot be turned on in eager mode.
      - Background_Matting
      - pytorch_unet

  max_batch_size:
    hf_GPT2: 2
    pytorch_unet: 2
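
# A minimal sketch (an illustration, not part of the benchmark harness) of how
# this config might be loaded; the file path and key lookups below are
# assumptions that mirror the structure above:
#
#   import yaml
#
#   with open("torchbench.yaml") as f:
#       cfg = yaml.safe_load(f)
#   assert cfg["batch_size"]["training"]["demucs"] == 4
#   # aliases resolve on load, e.g. cfg["slow"][0] == cfg["very_slow"]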
269