StyledLines/cpp-docs/common-base_8h_source.html

// Various helper functions and utilities


#pragma once


#include "llama.h"


#include "sampling.h"


#include <cmath>

#include <string>

#include <vector>

#include <random>

#include <thread>

#include <unordered_map>

#include <tuple>


struct llama_control_vector_load_info;


//

// CPU utils

//


// Judging by the name, reports the number of physical CPU cores; the definition
// lives outside this header — TODO confirm the exact semantics there.
int32_t cpu_get_num_physical_cores();

// Supplies the default value of gpt_params::n_threads. How the count is chosen
// is decided by the definition, which is not part of this header.
int32_t cpu_get_num_math();


//

// CLI argument parsing

//


// dimensionality reduction methods, used by cvector-generator


// Selects the dimensionality reduction method used by cvector-generator.
// Kept as a plain (unscoped) enum so the enumerators stay visible at namespace scope.
enum dimre_method {
    DIMRE_METHOD_PCA  = 0, // first enumerator; default of gpt_params::cvector_dimre_method
    DIMRE_METHOD_MEAN = 1, // second enumerator
};


// Aggregate of run-time parameters, each with an in-class default initializer.
// Fields are grouped by area (threads/context, model paths, benchmark switches,
// server, retrieval, passkey, imatrix, cvector-generator, ...) as marked by the
// section comments below. It sits under the "CLI argument parsing" section, so it
// is presumably filled in by the argument parser — that code is not in this header.
struct gpt_params {

    uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed


    int32_t n_threads = cpu_get_num_math(); // thread count; default comes from cpu_get_num_math()

    int32_t n_threads_draft = -1; // thread count for the draft model (presumably -1 = fall back to n_threads; TODO confirm)

    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)

    int32_t n_threads_batch_draft = -1; // batch thread count for the draft model (presumably -1 = fall back; TODO confirm)

    int32_t n_predict = -1; // new tokens to predict

    int32_t n_ctx = 0; // context size

    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)

    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)

    int32_t n_keep = 0; // number of tokens to keep from initial prompt

    int32_t n_draft = 5; // number of tokens to draft during speculative decoding

    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)

    int32_t n_parallel = 1; // number of parallel sequences to decode

    int32_t n_sequences = 1; // number of sequences to decode

    float   p_split = 0.1f; // speculative decoding split probability

    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)

    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors

    float   tensor_split[128] = { 0 }; // how split tensors should be distributed across GPUs

    int32_t grp_attn_n = 1; // group-attention factor

    int32_t grp_attn_w = 512; // group-attention width

    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)

    float   rope_freq_base = 0.0f; // RoPE base frequency

    float   rope_freq_scale = 0.0f; // RoPE frequency scaling factor

    float   yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor

    float   yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor

    float   yarn_beta_fast = 32.0f; // YaRN low correction dim

    float   yarn_beta_slow = 1.0f; // YaRN high correction dim

    int32_t yarn_orig_ctx = 0; // YaRN original context length

    float   defrag_thold = -1.0f; // KV cache defragmentation threshold


    ggml_backend_sched_eval_callback cb_eval = nullptr; // optional eval callback (unset by default)

    void* cb_eval_user_data = nullptr; // opaque pointer paired with cb_eval (presumably handed back to the callback; TODO confirm)


    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // NUMA strategy (disabled by default)


    enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; // RoPE scaling type (unspecified by default)

    enum llama_pooling_type      pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings

    enum llama_attention_type    attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings


    // sampling parameters

    struct llama_sampling_params sparams;


    std::string model = ""; // model path

    std::string model_draft = ""; // draft model for speculative decoding

    std::string model_alias = "unknown"; // model alias

    std::string model_url = ""; // model url to download

    std::string hf_token = ""; // HF token

    std::string hf_repo = ""; // HF repo

    std::string hf_file = ""; // HF file

    std::string prompt = ""; // prompt text (empty by default)

    std::string prompt_file = ""; // store the external prompt file name

    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state

    std::string input_prefix = ""; // string to prefix user inputs with

    std::string input_suffix = ""; // string to suffix user inputs with

    std::string logdir = ""; // directory in which to save YAML log files

    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding

    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding

    std::string logits_file = ""; // file for saving *all* logits

    std::string rpc_servers = ""; // comma separated list of RPC servers


    std::vector<std::string> in_files;   // all input files

    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)

    std::vector<llama_model_kv_override> kv_overrides; // model key/value overrides (element type declared elsewhere)


    // TODO: avoid tuple, use struct

    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale


    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale


    int32_t verbosity = 0; // verbosity level (0 by default)

    int32_t control_vector_layer_start = -1; // layer range for control vector

    int32_t control_vector_layer_end = -1; // layer range for control vector


    int32_t ppl_stride = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.

    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line

    //                                       (which is more convenient to use for plotting)

    //

    bool   hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt

    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score


    bool   winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt

    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed


    bool   multiple_choice = false;  // compute TruthfulQA score over random tasks from datafile supplied in prompt

    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed


    bool   kl_divergence = false; // compute KL divergence


    bool usage = false; // print usage

    bool use_color = false; // use color to distinguish generations and inputs

    bool special = false; // enable special token output

    bool interactive = false; // interactive mode

    bool interactive_first = false; // wait for user input immediately

    bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)

    bool prompt_cache_all = false; // save user input and generations to prompt cache

    bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it


    bool escape = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"

    bool multiline_input = false; // reverse the usage of `\`

    bool simple_io = false; // improves compatibility with subprocesses and limited consoles

    bool cont_batching = true;  // insert new sequences for decoding on-the-fly

    bool flash_attn = false; // flash attention


    bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix

    bool ignore_eos = false; // ignore generated EOS tokens

    bool logits_all = false; // return logits for all tokens in the batch

    bool use_mmap = true;  // use mmap for faster loads

    bool use_mlock = false; // use mlock to keep model in memory

    bool verbose_prompt = false; // print prompt tokens before generation

    bool display_prompt = true;  // print prompt before generation

    bool infill = false; // use infill mode

    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes

    bool no_kv_offload = false; // disable KV offloading

    bool warmup = true;  // warmup run

    bool check_tensors = false; // validate tensor data


    std::string cache_type_k = "f16"; // KV cache data type for the K

    std::string cache_type_v = "f16"; // KV cache data type for the V


    // multimodal models (see examples/llava)

    std::string mmproj = "";        // path to multimodal projector

    std::vector<std::string> image; // path to image file(s)


    // embedding

    bool embedding = false; // get only sentence embedding

    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)

    std::string embd_out = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix

    std::string embd_sep = "\n";  // separator of embeddings


    // server params

    int32_t port = 8080;         // server listens on this network port

    int32_t timeout_read = 600;          // http read timeout in seconds

    int32_t timeout_write = timeout_read; // http write timeout in seconds (default-initialized from timeout_read)

    int32_t n_threads_http = -1;           // number of threads to process HTTP requests


    std::string hostname = "127.0.0.1"; // server host (loopback address by default)

    std::string public_path = ""; // NOTE(review): presumably a directory of static files to serve — confirm in the server code

    std::string chat_template = ""; // chat template (empty by default)

    std::string system_prompt = ""; // system prompt (empty by default)

    bool enable_chat_template = true; // chat template switch (on by default)


    std::vector<std::string> api_keys; // API keys (empty by default)


    std::string ssl_file_key = ""; // SSL key file (empty by default)

    std::string ssl_file_cert = ""; // SSL certificate file (empty by default)


    bool endpoint_slots = true; // slots endpoint switch (on by default)

    bool endpoint_metrics = false; // metrics endpoint switch (off by default)


    bool log_json = false; // JSON logging switch (off by default)


    std::string slot_save_path; // NOTE(review): presumably where slot state is saved — confirm in the server code


    float slot_prompt_similarity = 0.5f; // NOTE(review): looks like a prompt-similarity threshold for slot selection — confirm


    // batched-bench params

    bool is_pp_shared = false;


    std::vector<int32_t> n_pp;

    std::vector<int32_t> n_tg;

    std::vector<int32_t> n_pl;


    // retrieval params

    std::vector<std::string> context_files; // context files to embed


    int32_t chunk_size = 64; // chunk size for context embedding


    std::string chunk_separator = "\n"; // chunk separator for context embedding


    // passkey params

    int32_t n_junk = 250; // number of times to repeat the junk text

    int32_t i_pos = -1;  // position of the passkey in the junk text


    // imatrix params

    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file


    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations

    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations

    int32_t i_chunk = 0; // start processing from this chunk


    bool process_output = false; // collect data for the output tensor

    bool compute_ppl = true;  // whether to compute perplexity


    // cvector-generator params

    int n_pca_batch = 100; // PCA batch setting (exact meaning defined by cvector-generator)

    int n_pca_iterations = 1000; // PCA iteration setting (exact meaning defined by cvector-generator)

    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA; // dimensionality reduction method (PCA by default)

    std::string cvector_outfile = "control_vector.gguf"; // default output file name

    std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; // default positive-examples file

    std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; // default negative-examples file


    bool spm_infill = false; // suffix/prefix/middle pattern for infill


    std::string lora_outfile = "ggml-lora-merged-f16.gguf"; // default output file name for the merged lora

};


cpu_get_num_physical_cores
int32_t cpu_get_num_physical_cores()

cpu_get_num_math
int32_t cpu_get_num_math()

dimre_method
dimre_method
Definition common-base.h:33

DIMRE_METHOD_MEAN
@ DIMRE_METHOD_MEAN
Definition common-base.h:35

DIMRE_METHOD_PCA
@ DIMRE_METHOD_PCA
Definition common-base.h:34

gpt_params
Definition common-base.h:38

gpt_params::port
int32_t port
Definition common-base.h:172

gpt_params::kl_divergence
bool kl_divergence
Definition common-base.h:128

gpt_params::chunk_size
int32_t chunk_size
Definition common-base.h:207

gpt_params::verbose_prompt
bool verbose_prompt
Definition common-base.h:150

gpt_params::n_gpu_layers
int32_t n_gpu_layers
Definition common-base.h:55

gpt_params::antiprompt
std::vector< std::string > antiprompt
Definition common-base.h:103

gpt_params::hostname
std::string hostname
Definition common-base.h:177

gpt_params::multiline_input
bool multiline_input
Definition common-base.h:140

gpt_params::rope_freq_scale
float rope_freq_scale
Definition common-base.h:63

gpt_params::slot_prompt_similarity
float slot_prompt_similarity
Definition common-base.h:195

gpt_params::p_split
float p_split
Definition common-base.h:54

gpt_params::n_ubatch
int32_t n_ubatch
Definition common-base.h:48

gpt_params::interactive_first
bool interactive_first
Definition common-base.h:134

gpt_params::main_gpu
int32_t main_gpu
Definition common-base.h:57

gpt_params::api_keys
std::vector< std::string > api_keys
Definition common-base.h:183

gpt_params::lookup_cache_dynamic
std::string lookup_cache_dynamic
Definition common-base.h:98

gpt_params::chunk_separator
std::string chunk_separator
Definition common-base.h:209

gpt_params::cache_type_v
std::string cache_type_v
Definition common-base.h:159

gpt_params::compute_ppl
bool compute_ppl
Definition common-base.h:223

gpt_params::usage
bool usage
Definition common-base.h:130

gpt_params::prompt
std::string prompt
Definition common-base.h:91

gpt_params::grp_attn_n
int32_t grp_attn_n
Definition common-base.h:59

gpt_params::n_print
int32_t n_print
Definition common-base.h:61

gpt_params::infill
bool infill
Definition common-base.h:152

gpt_params::n_pca_batch
int n_pca_batch
Definition common-base.h:226

gpt_params::model_draft
std::string model_draft
Definition common-base.h:85

gpt_params::hf_file
std::string hf_file
Definition common-base.h:90

gpt_params::lora_adapter
std::vector< std::tuple< std::string, float > > lora_adapter
Definition common-base.h:107

gpt_params::timeout_write
int32_t timeout_write
Definition common-base.h:174

gpt_params::multiple_choice
bool multiple_choice
Definition common-base.h:125

gpt_params::logdir
std::string logdir
Definition common-base.h:96

gpt_params::hf_token
std::string hf_token
Definition common-base.h:88

gpt_params::n_threads_batch_draft
int32_t n_threads_batch_draft
Definition common-base.h:44

gpt_params::yarn_attn_factor
float yarn_attn_factor
Definition common-base.h:65

gpt_params::timeout_read
int32_t timeout_read
Definition common-base.h:173

gpt_params::attention_type
enum llama_attention_type attention_type
Definition common-base.h:79

gpt_params::multiple_choice_tasks
size_t multiple_choice_tasks
Definition common-base.h:126

gpt_params::use_color
bool use_color
Definition common-base.h:131

gpt_params::logits_file
std::string logits_file
Definition common-base.h:99

gpt_params::rope_scaling_type
enum llama_rope_scaling_type rope_scaling_type
Definition common-base.h:77

gpt_params::embd_out
std::string embd_out
Definition common-base.h:168

gpt_params::n_sequences
int32_t n_sequences
Definition common-base.h:53

gpt_params::model
std::string model
Definition common-base.h:84

gpt_params::kv_overrides
std::vector< llama_model_kv_override > kv_overrides
Definition common-base.h:104

gpt_params::verbosity
int32_t verbosity
Definition common-base.h:111

gpt_params::hellaswag
bool hellaswag
Definition common-base.h:119

gpt_params::winogrande
bool winogrande
Definition common-base.h:122

gpt_params::seed
uint32_t seed
Definition common-base.h:39

gpt_params::ppl_stride
int32_t ppl_stride
Definition common-base.h:115

gpt_params::control_vector_layer_end
int32_t control_vector_layer_end
Definition common-base.h:113

gpt_params::n_pp
std::vector< int32_t > n_pp
Definition common-base.h:200

gpt_params::cvector_dimre_method
dimre_method cvector_dimre_method
Definition common-base.h:228

gpt_params::enable_chat_template
bool enable_chat_template
Definition common-base.h:181

gpt_params::use_mmap
bool use_mmap
Definition common-base.h:148

gpt_params::n_draft
int32_t n_draft
Definition common-base.h:50

gpt_params::system_prompt
std::string system_prompt
Definition common-base.h:180

gpt_params::tensor_split
float tensor_split[128]
Definition common-base.h:58

gpt_params::lookup_cache_static
std::string lookup_cache_static
Definition common-base.h:97

gpt_params::ssl_file_cert
std::string ssl_file_cert
Definition common-base.h:186

gpt_params::logits_all
bool logits_all
Definition common-base.h:147

gpt_params::out_file
std::string out_file
Definition common-base.h:216

gpt_params::n_batch
int32_t n_batch
Definition common-base.h:47

gpt_params::n_threads_batch
int32_t n_threads_batch
Definition common-base.h:43

gpt_params::n_tg
std::vector< int32_t > n_tg
Definition common-base.h:201

gpt_params::display_prompt
bool display_prompt
Definition common-base.h:151

gpt_params::n_predict
int32_t n_predict
Definition common-base.h:45

gpt_params::rpc_servers
std::string rpc_servers
Definition common-base.h:100

gpt_params::n_save_freq
int32_t n_save_freq
Definition common-base.h:219

gpt_params::n_ctx
int32_t n_ctx
Definition common-base.h:46

gpt_params::yarn_beta_slow
float yarn_beta_slow
Definition common-base.h:67

gpt_params::ppl_output_type
int32_t ppl_output_type
Definition common-base.h:116

gpt_params::n_threads_http
int32_t n_threads_http
Definition common-base.h:175

gpt_params::embd_normalize
int32_t embd_normalize
Definition common-base.h:167

gpt_params::pooling_type
enum llama_pooling_type pooling_type
Definition common-base.h:78

gpt_params::cb_eval
ggml_backend_sched_eval_callback cb_eval
Definition common-base.h:71

gpt_params::chat_template
std::string chat_template
Definition common-base.h:179

gpt_params::prompt_file
std::string prompt_file
Definition common-base.h:92

gpt_params::ssl_file_key
std::string ssl_file_key
Definition common-base.h:185

gpt_params::process_output
bool process_output
Definition common-base.h:222

gpt_params::n_pl
std::vector< int32_t > n_pl
Definition common-base.h:202

gpt_params::input_suffix
std::string input_suffix
Definition common-base.h:95

gpt_params::n_junk
int32_t n_junk
Definition common-base.h:212

gpt_params::endpoint_metrics
bool endpoint_metrics
Definition common-base.h:189

gpt_params::grp_attn_w
int32_t grp_attn_w
Definition common-base.h:60

gpt_params::n_keep
int32_t n_keep
Definition common-base.h:49

gpt_params::simple_io
bool simple_io
Definition common-base.h:141

gpt_params::control_vectors
std::vector< llama_control_vector_load_info > control_vectors
Definition common-base.h:109

gpt_params::special
bool special
Definition common-base.h:132

gpt_params::no_kv_offload
bool no_kv_offload
Definition common-base.h:154

gpt_params::rope_freq_base
float rope_freq_base
Definition common-base.h:62

gpt_params::hf_repo
std::string hf_repo
Definition common-base.h:89

gpt_params::cvector_positive_file
std::string cvector_positive_file
Definition common-base.h:230

gpt_params::public_path
std::string public_path
Definition common-base.h:178

gpt_params::use_mlock
bool use_mlock
Definition common-base.h:149

gpt_params::yarn_orig_ctx
int32_t yarn_orig_ctx
Definition common-base.h:68

gpt_params::image
std::vector< std::string > image
Definition common-base.h:163

gpt_params::embedding
bool embedding
Definition common-base.h:166

gpt_params::interactive
bool interactive
Definition common-base.h:133

gpt_params::hellaswag_tasks
size_t hellaswag_tasks
Definition common-base.h:120

gpt_params::n_parallel
int32_t n_parallel
Definition common-base.h:52

gpt_params::warmup
bool warmup
Definition common-base.h:155

gpt_params::prompt_cache_all
bool prompt_cache_all
Definition common-base.h:136

gpt_params::context_files
std::vector< std::string > context_files
Definition common-base.h:205

gpt_params::winogrande_tasks
size_t winogrande_tasks
Definition common-base.h:123

gpt_params::i_chunk
int32_t i_chunk
Definition common-base.h:220

gpt_params::is_pp_shared
bool is_pp_shared
Definition common-base.h:198

gpt_params::prompt_cache_ro
bool prompt_cache_ro
Definition common-base.h:137

gpt_params::escape
bool escape
Definition common-base.h:139

gpt_params::cvector_outfile
std::string cvector_outfile
Definition common-base.h:229

gpt_params::flash_attn
bool flash_attn
Definition common-base.h:143

gpt_params::n_chunks
int32_t n_chunks
Definition common-base.h:51

gpt_params::conversation
bool conversation
Definition common-base.h:135

gpt_params::cb_eval_user_data
void * cb_eval_user_data
Definition common-base.h:72

gpt_params::n_threads_draft
int32_t n_threads_draft
Definition common-base.h:42

gpt_params::dump_kv_cache
bool dump_kv_cache
Definition common-base.h:153

gpt_params::embd_sep
std::string embd_sep
Definition common-base.h:169

gpt_params::n_pca_iterations
int n_pca_iterations
Definition common-base.h:227

gpt_params::n_threads
int32_t n_threads
Definition common-base.h:41

gpt_params::i_pos
int32_t i_pos
Definition common-base.h:213

gpt_params::path_prompt_cache
std::string path_prompt_cache
Definition common-base.h:93

gpt_params::cache_type_k
std::string cache_type_k
Definition common-base.h:158

gpt_params::cont_batching
bool cont_batching
Definition common-base.h:142

gpt_params::input_prefix_bos
bool input_prefix_bos
Definition common-base.h:145

gpt_params::control_vector_layer_start
int32_t control_vector_layer_start
Definition common-base.h:112

gpt_params::lora_outfile
std::string lora_outfile
Definition common-base.h:235

gpt_params::endpoint_slots
bool endpoint_slots
Definition common-base.h:188

gpt_params::sparams
struct llama_sampling_params sparams
Definition common-base.h:82

gpt_params::input_prefix
std::string input_prefix
Definition common-base.h:94

gpt_params::mmproj
std::string mmproj
Definition common-base.h:162

gpt_params::log_json
bool log_json
Definition common-base.h:191

gpt_params::model_alias
std::string model_alias
Definition common-base.h:86

gpt_params::n_out_freq
int32_t n_out_freq
Definition common-base.h:218

gpt_params::split_mode
enum llama_split_mode split_mode
Definition common-base.h:76

gpt_params::model_url
std::string model_url
Definition common-base.h:87

gpt_params::numa
ggml_numa_strategy numa
Definition common-base.h:74

gpt_params::check_tensors
bool check_tensors
Definition common-base.h:156

gpt_params::ignore_eos
bool ignore_eos
Definition common-base.h:146

gpt_params::cvector_negative_file
std::string cvector_negative_file
Definition common-base.h:231

gpt_params::n_gpu_layers_draft
int32_t n_gpu_layers_draft
Definition common-base.h:56

gpt_params::spm_infill
bool spm_infill
Definition common-base.h:233

gpt_params::slot_save_path
std::string slot_save_path
Definition common-base.h:193

gpt_params::defrag_thold
float defrag_thold
Definition common-base.h:69

gpt_params::yarn_beta_fast
float yarn_beta_fast
Definition common-base.h:66

gpt_params::in_files
std::vector< std::string > in_files
Definition common-base.h:102

gpt_params::yarn_ext_factor
float yarn_ext_factor
Definition common-base.h:64

llama_sampling_params
Definition sampling-base.h:18