Llama.cpp C++-to-Csharp wrapper from testedlines.com: C++ docs 1.0.1
The Llama.cpp C++-to-Csharp wrapper is a minor extension of the Llama.cpp codebase (tag b3490), lightly modified by testedlines so that it can be compiled for, and called from, the Styled Lines Csharp package on the Unity Asset Store.
Loading...
Searching...
No Matches
common-base.h
Go to the documentation of this file.
1// Various helper functions and utilities
2
3#pragma once
4
5#include "llama.h"
6
7#include "sampling.h"
8
9#include <cmath>
10#include <string>
11#include <vector>
12#include <random>
13#include <thread>
14#include <unordered_map>
15#include <tuple>
16
17
18
19struct llama_control_vector_load_info;
20
21//
22// CPU utils
23//
24
27
28//
29// CLI argument parsing
30//
31
32// dimensionality reduction methods, used by cvector-generator
37
38struct gpt_params {
39 uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
40
42 int32_t n_threads_draft = -1;
43 int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
45 int32_t n_predict = -1; // new tokens to predict
46 int32_t n_ctx = 0; // context size
47 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
48 int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
49 int32_t n_keep = 0; // number of tokens to keep from initial prompt
50 int32_t n_draft = 5; // number of tokens to draft during speculative decoding
51 int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
52 int32_t n_parallel = 1; // number of parallel sequences to decode
53 int32_t n_sequences = 1; // number of sequences to decode
54 float p_split = 0.1f; // speculative decoding split probability
55 int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
56 int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
57 int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
58 float tensor_split[128] = { 0 }; // how split tensors should be distributed across GPUs
59 int32_t grp_attn_n = 1; // group-attention factor
60 int32_t grp_attn_w = 512; // group-attention width
61 int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
62 float rope_freq_base = 0.0f; // RoPE base frequency
63 float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
64 float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
65 float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
66 float yarn_beta_fast = 32.0f; // YaRN low correction dim
67 float yarn_beta_slow = 1.0f; // YaRN high correction dim
68 int32_t yarn_orig_ctx = 0; // YaRN original context length
69 float defrag_thold = -1.0f; // KV cache defragmentation threshold
70
71 ggml_backend_sched_eval_callback cb_eval = nullptr;
72 void* cb_eval_user_data = nullptr;
73
74 ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
75
76 enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
77 enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
78 enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
79 enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
80
81 // // sampling parameters
83
84 std::string model = ""; // model path
85 std::string model_draft = ""; // draft model for speculative decoding
86 std::string model_alias = "unknown"; // model alias
87 std::string model_url = ""; // model url to download
88 std::string hf_token = ""; // HF token
89 std::string hf_repo = ""; // HF repo
90 std::string hf_file = ""; // HF file
91 std::string prompt = "";
92 std::string prompt_file = ""; // store the external prompt file name
93 std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
94 std::string input_prefix = ""; // string to prefix user inputs with
95 std::string input_suffix = ""; // string to suffix user inputs with
96 std::string logdir = ""; // directory in which to save YAML log files
97 std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
98 std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
99 std::string logits_file = ""; // file for saving *all* logits
100 std::string rpc_servers = ""; // comma separated list of RPC servers
101
102 std::vector<std::string> in_files; // all input files
103 std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
104 std::vector<llama_model_kv_override> kv_overrides;
105
106 // TODO: avoid tuple, use struct
107 std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
108
109 std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
110
111 int32_t verbosity = 0;
112 int32_t control_vector_layer_start = -1; // layer range for control vector
113 int32_t control_vector_layer_end = -1; // layer range for control vector
114
115 int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
116 int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
117 // (which is more convenient to use for plotting)
118 //
119 bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
120 size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
121
122 bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
123 size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
124
125 bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
126 size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
127
128 bool kl_divergence = false; // compute KL divergence
129
130 bool usage = false; // print usage
131 bool use_color = false; // use color to distinguish generations and inputs
132 bool special = false; // enable special token output
133 bool interactive = false; // interactive mode
134 bool interactive_first = false; // wait for user input immediately
135 bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
136 bool prompt_cache_all = false; // save user input and generations to prompt cache
137 bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
138
139 bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
140 bool multiline_input = false; // reverse the usage of `\`
141 bool simple_io = false; // improves compatibility with subprocesses and limited consoles
142 bool cont_batching = true; // insert new sequences for decoding on-the-fly
143 bool flash_attn = false; // flash attention
144
145 bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
146 bool ignore_eos = false; // ignore generated EOS tokens
147 bool logits_all = false; // return logits for all tokens in the batch
148 bool use_mmap = true; // use mmap for faster loads
149 bool use_mlock = false; // use mlock to keep model in memory
150 bool verbose_prompt = false; // print prompt tokens before generation
151 bool display_prompt = true; // print prompt before generation
152 bool infill = false; // use infill mode
153 bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
154 bool no_kv_offload = false; // disable KV offloading
155 bool warmup = true; // warmup run
156 bool check_tensors = false; // validate tensor data
157
158 std::string cache_type_k = "f16"; // KV cache data type for the K
159 std::string cache_type_v = "f16"; // KV cache data type for the V
160
161 // multimodal models (see examples/llava)
162 std::string mmproj = ""; // path to multimodal projector
163 std::vector<std::string> image; // path to image file(s)
164
165 // embedding
166 bool embedding = false; // get only sentence embedding
167 int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
168 std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
169 std::string embd_sep = "\n"; // separator of embeddings
170
171 // server params
172 int32_t port = 8080; // server listens on this network port
173 int32_t timeout_read = 600; // http read timeout in seconds
174 int32_t timeout_write = timeout_read; // http write timeout in seconds
175 int32_t n_threads_http = -1; // number of threads to process HTTP requests
176
177 std::string hostname = "127.0.0.1";
178 std::string public_path = "";
179 std::string chat_template = "";
180 std::string system_prompt = "";
182
183 std::vector<std::string> api_keys;
184
185 std::string ssl_file_key = "";
186 std::string ssl_file_cert = "";
187
188 bool endpoint_slots = true;
189 bool endpoint_metrics = false;
190
191 bool log_json = false;
192
193 std::string slot_save_path;
194
196
197 // batched-bench params
198 bool is_pp_shared = false;
199
200 std::vector<int32_t> n_pp;
201 std::vector<int32_t> n_tg;
202 std::vector<int32_t> n_pl;
203
204 // retrieval params
205 std::vector<std::string> context_files; // context files to embed
206
207 int32_t chunk_size = 64; // chunk size for context embedding
208
209 std::string chunk_separator = "\n"; // chunk separator for context embedding
210
211 // passkey params
212 int32_t n_junk = 250; // number of times to repeat the junk text
213 int32_t i_pos = -1; // position of the passkey in the junk text
214
215 // imatrix params
216 std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
217
218 int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
219 int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
220 int32_t i_chunk = 0; // start processing from this chunk
221
222 bool process_output = false; // collect data for the output tensor
223 bool compute_ppl = true; // whether to compute perplexity
224
225 // cvector-generator params
226 int n_pca_batch = 100;
229 std::string cvector_outfile = "control_vector.gguf";
230 std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
231 std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
232
233 bool spm_infill = false; // suffix/prefix/middle pattern for infill
234
235 std::string lora_outfile = "ggml-lora-merged-f16.gguf";
236};
int32_t cpu_get_num_physical_cores()
int32_t cpu_get_num_math()
dimre_method
Definition common-base.h:33
@ DIMRE_METHOD_MEAN
Definition common-base.h:35
@ DIMRE_METHOD_PCA
Definition common-base.h:34
Definition common-base.h:38
int32_t port
Definition common-base.h:172
bool kl_divergence
Definition common-base.h:128
int32_t chunk_size
Definition common-base.h:207
bool verbose_prompt
Definition common-base.h:150
int32_t n_gpu_layers
Definition common-base.h:55
std::vector< std::string > antiprompt
Definition common-base.h:103
std::string hostname
Definition common-base.h:177
bool multiline_input
Definition common-base.h:140
float rope_freq_scale
Definition common-base.h:63
float slot_prompt_similarity
Definition common-base.h:195
float p_split
Definition common-base.h:54
int32_t n_ubatch
Definition common-base.h:48
bool interactive_first
Definition common-base.h:134
int32_t main_gpu
Definition common-base.h:57
std::vector< std::string > api_keys
Definition common-base.h:183
std::string lookup_cache_dynamic
Definition common-base.h:98
std::string chunk_separator
Definition common-base.h:209
std::string cache_type_v
Definition common-base.h:159
bool compute_ppl
Definition common-base.h:223
bool usage
Definition common-base.h:130
std::string prompt
Definition common-base.h:91
int32_t grp_attn_n
Definition common-base.h:59
int32_t n_print
Definition common-base.h:61
bool infill
Definition common-base.h:152
int n_pca_batch
Definition common-base.h:226
std::string model_draft
Definition common-base.h:85
std::string hf_file
Definition common-base.h:90
std::vector< std::tuple< std::string, float > > lora_adapter
Definition common-base.h:107
int32_t timeout_write
Definition common-base.h:174
bool multiple_choice
Definition common-base.h:125
std::string logdir
Definition common-base.h:96
std::string hf_token
Definition common-base.h:88
int32_t n_threads_batch_draft
Definition common-base.h:44
float yarn_attn_factor
Definition common-base.h:65
int32_t timeout_read
Definition common-base.h:173
enum llama_attention_type attention_type
Definition common-base.h:79
size_t multiple_choice_tasks
Definition common-base.h:126
bool use_color
Definition common-base.h:131
std::string logits_file
Definition common-base.h:99
enum llama_rope_scaling_type rope_scaling_type
Definition common-base.h:77
std::string embd_out
Definition common-base.h:168
int32_t n_sequences
Definition common-base.h:53
std::string model
Definition common-base.h:84
std::vector< llama_model_kv_override > kv_overrides
Definition common-base.h:104
int32_t verbosity
Definition common-base.h:111
bool hellaswag
Definition common-base.h:119
bool winogrande
Definition common-base.h:122
uint32_t seed
Definition common-base.h:39
int32_t ppl_stride
Definition common-base.h:115
int32_t control_vector_layer_end
Definition common-base.h:113
std::vector< int32_t > n_pp
Definition common-base.h:200
dimre_method cvector_dimre_method
Definition common-base.h:228
bool enable_chat_template
Definition common-base.h:181
bool use_mmap
Definition common-base.h:148
int32_t n_draft
Definition common-base.h:50
std::string system_prompt
Definition common-base.h:180
float tensor_split[128]
Definition common-base.h:58
std::string lookup_cache_static
Definition common-base.h:97
std::string ssl_file_cert
Definition common-base.h:186
bool logits_all
Definition common-base.h:147
std::string out_file
Definition common-base.h:216
int32_t n_batch
Definition common-base.h:47
int32_t n_threads_batch
Definition common-base.h:43
std::vector< int32_t > n_tg
Definition common-base.h:201
bool display_prompt
Definition common-base.h:151
int32_t n_predict
Definition common-base.h:45
std::string rpc_servers
Definition common-base.h:100
int32_t n_save_freq
Definition common-base.h:219
int32_t n_ctx
Definition common-base.h:46
float yarn_beta_slow
Definition common-base.h:67
int32_t ppl_output_type
Definition common-base.h:116
int32_t n_threads_http
Definition common-base.h:175
int32_t embd_normalize
Definition common-base.h:167
enum llama_pooling_type pooling_type
Definition common-base.h:78
ggml_backend_sched_eval_callback cb_eval
Definition common-base.h:71
std::string chat_template
Definition common-base.h:179
std::string prompt_file
Definition common-base.h:92
std::string ssl_file_key
Definition common-base.h:185
bool process_output
Definition common-base.h:222
std::vector< int32_t > n_pl
Definition common-base.h:202
std::string input_suffix
Definition common-base.h:95
int32_t n_junk
Definition common-base.h:212
bool endpoint_metrics
Definition common-base.h:189
int32_t grp_attn_w
Definition common-base.h:60
int32_t n_keep
Definition common-base.h:49
bool simple_io
Definition common-base.h:141
std::vector< llama_control_vector_load_info > control_vectors
Definition common-base.h:109
bool special
Definition common-base.h:132
bool no_kv_offload
Definition common-base.h:154
float rope_freq_base
Definition common-base.h:62
std::string hf_repo
Definition common-base.h:89
std::string cvector_positive_file
Definition common-base.h:230
std::string public_path
Definition common-base.h:178
bool use_mlock
Definition common-base.h:149
int32_t yarn_orig_ctx
Definition common-base.h:68
std::vector< std::string > image
Definition common-base.h:163
bool embedding
Definition common-base.h:166
bool interactive
Definition common-base.h:133
size_t hellaswag_tasks
Definition common-base.h:120
int32_t n_parallel
Definition common-base.h:52
bool warmup
Definition common-base.h:155
bool prompt_cache_all
Definition common-base.h:136
std::vector< std::string > context_files
Definition common-base.h:205
size_t winogrande_tasks
Definition common-base.h:123
int32_t i_chunk
Definition common-base.h:220
bool is_pp_shared
Definition common-base.h:198
bool prompt_cache_ro
Definition common-base.h:137
bool escape
Definition common-base.h:139
std::string cvector_outfile
Definition common-base.h:229
bool flash_attn
Definition common-base.h:143
int32_t n_chunks
Definition common-base.h:51
bool conversation
Definition common-base.h:135
void * cb_eval_user_data
Definition common-base.h:72
int32_t n_threads_draft
Definition common-base.h:42
bool dump_kv_cache
Definition common-base.h:153
std::string embd_sep
Definition common-base.h:169
int n_pca_iterations
Definition common-base.h:227
int32_t n_threads
Definition common-base.h:41
int32_t i_pos
Definition common-base.h:213
std::string path_prompt_cache
Definition common-base.h:93
std::string cache_type_k
Definition common-base.h:158
bool cont_batching
Definition common-base.h:142
bool input_prefix_bos
Definition common-base.h:145
int32_t control_vector_layer_start
Definition common-base.h:112
std::string lora_outfile
Definition common-base.h:235
bool endpoint_slots
Definition common-base.h:188
struct llama_sampling_params sparams
Definition common-base.h:82
std::string input_prefix
Definition common-base.h:94
std::string mmproj
Definition common-base.h:162
bool log_json
Definition common-base.h:191
std::string model_alias
Definition common-base.h:86
int32_t n_out_freq
Definition common-base.h:218
enum llama_split_mode split_mode
Definition common-base.h:76
std::string model_url
Definition common-base.h:87
ggml_numa_strategy numa
Definition common-base.h:74
bool check_tensors
Definition common-base.h:156
bool ignore_eos
Definition common-base.h:146
std::string cvector_negative_file
Definition common-base.h:231
int32_t n_gpu_layers_draft
Definition common-base.h:56
bool spm_infill
Definition common-base.h:233
std::string slot_save_path
Definition common-base.h:193
float defrag_thold
Definition common-base.h:69
float yarn_beta_fast
Definition common-base.h:66
std::vector< std::string > in_files
Definition common-base.h:102
float yarn_ext_factor
Definition common-base.h:64
Definition sampling-base.h:18