Reference guide for model compilation configurations, including optimization settings and quantization options for different hardware environments.
{ "warmups": { "enabled": true, "iterations": 5, "sample_input_data": [] }, "backend": { "name": "auto", "version": "latest", "extra_params": {} }, "optimisations": { "speculative_decoding": { "enabled": false, "type": "auto", "extra_params": {} }, "attention_caching": { "enabled": false, "type": "auto", "extra_params": {} } }, "tensor_parallel_size": 1, "quantization": "float16" }
{ "type": "llm", "loras": [], "lora_repo": { "type": "", "path": "", "ownership": "", "secret": { "type": "" } }, "quantized_model_path": { "type": "", "path": "", "ownership": "", "secret": { "type": "" } } }