class Llama::Context

Hierarchy:
- Llama::Context
- Reference
- Object
Overview
Wrapper for the llama_context structure
Defined in:
llama/context.cr
llama/context/error.cr
Constructors
-
.new(model : Model, n_ctx : UInt32 = 0, n_batch : UInt32 = 512, n_threads : Int32 = 0, n_threads_batch : Int32 = 0, embeddings : Bool = false, offload_kqv : Bool = false)
Creates a new Context instance for a model.
Instance Method Summary
-
#apply_chat_template(messages : Array(ChatMessage), add_assistant : Bool = true, template : String | Nil = nil) : String
Applies the chat template to the given messages and returns the formatted prompt.
-
#chat(messages : Array(ChatMessage), max_tokens : Int32 = 128, temperature : Float32 = 0.8, template : String | Nil = nil) : String
Generates a response in a chat conversation
-
#decode(batch : LibLlama::LlamaBatch | Batch) : Int32
Processes a batch of tokens with the decoder part of the model
-
#encode(batch : LibLlama::LlamaBatch | Batch) : Int32
Processes a batch of tokens with the encoder part of the model
-
#finalize
Frees the resources associated with this context
-
#generate(prompt : String, max_tokens : Int32 = 128, temperature : Float32 = 0.8) : String
Generates text from a prompt
-
#generate_with_sampler(prompt : String, sampler : SamplerChain, max_tokens : Int32 = 128) : String
Generates text using a sampler chain
-
#get_embeddings : Array(Float32) | Nil
Gets all output token embeddings. Only available when embeddings mode is enabled.
-
#get_embeddings_ith(i : Int32) : Array(Float32) | Nil
Gets the embeddings for a specific token
-
#get_embeddings_seq(seq_id : Int32) : Array(Float32) | Nil
Gets the embeddings for a specific sequence
-
#kv_cache : KvCache
Returns the KV cache for this context. Lazily initializes the KV cache if it doesn't exist yet.
-
#logits : Pointer(Float32)
Gets the logits for the last token
-
#n_batch : UInt32
Returns the logical batch size (n_batch)
-
#n_ctx : UInt32
Returns the context window size (n_ctx)
-
#n_seq_max : UInt32
Returns the maximum number of sequence IDs per token (n_seq_max)
-
#n_threads : Int32
Returns the number of threads used for generation
-
#n_threads_batch : Int32
Returns the number of threads used for batch processing
-
#n_ubatch : UInt32
Returns the micro-batch size (n_ubatch)
-
#pooling_type : LibLlama::LlamaPoolingType
Gets the pooling type used for embeddings
-
#print_perf
Prints performance information for this context
-
#process_embeddings(embeddings : Array(Array(Float32)), seq_ids : Array(Int32) | Nil = nil, n_seq_max : Int32 = 8) : Int32
Processes embeddings
-
#process_prompts(prompts : Array(String)) : Array(Int32)
Processes multiple prompts in a batch
-
#process_tokens(tokens : Array(Int32), compute_logits_for_last : Bool = true, seq_ids : Array(Int32) | Nil = nil, n_seq_max : Int32 = 8) : Int32
Processes a sequence of tokens
-
#reset_perf
Resets performance counters for this context
-
#set_embeddings(enabled : Bool)
Sets whether the model is in embeddings mode or not. If true, embeddings will be returned but logits will not.
-
#state : State
Returns the state manager for this context. Lazily initializes the state if it doesn't exist yet.
-
#to_unsafe : Pointer(Llama::LibLlama::LlamaContext)
Returns the raw pointer to the underlying llama_context structure
Constructor Detail
Creates a new Context instance for a model.
Parameters:
- model: The Model to create a context for.
- n_ctx: The maximum context size in tokens (default: 0). If 0, a minimum context size of 512 is used.
- n_batch: Logical maximum batch size that can be submitted to llama_decode (default: 512).
- n_threads: Number of threads to use for generation (default: 0). If 0, uses the number of hardware threads.
- n_threads_batch: Number of threads to use for batch processing (default: 0). If 0, uses the number of hardware threads.
- embeddings: Whether to extract embeddings together with logits (default: false).
- offload_kqv: Whether to offload the KQV ops (including the KV cache) to GPU (default: false). Requires a GPU build of llama.cpp.
Raises:
- Llama::Context::Error if the context cannot be created.
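Example (a minimal sketch; loading a Model from a GGUF file path via Llama::Model.new is an assumption not covered on this page):

    # Sketch: create a context with a 2048-token window and default batching.
    # Model loading from a GGUF path is assumed; adjust to your Model API.
    model = Llama::Model.new("models/llama-3-8b-instruct.Q4_K_M.gguf")
    ctx = Llama::Context.new(model, n_ctx: 2048_u32, n_batch: 512_u32)
    puts ctx.n_ctx # => 2048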
Instance Method Detail
Applies the chat template to the given messages and returns the formatted prompt.
Parameters:
- messages: Array of ChatMessage (user/assistant/system)
- add_assistant: Whether to add assistant role (default: true)
- template: Optional template string (default: model's template)
Returns:
- The formatted prompt string.
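Example (a sketch for #apply_chat_template; a ChatMessage constructor taking a role and content is an assumption):

    # Sketch: format a short conversation with the model's built-in chat template.
    messages = [
      Llama::ChatMessage.new("system", "You are a concise assistant."),
      Llama::ChatMessage.new("user", "What is the capital of France?"),
    ]
    prompt = ctx.apply_chat_template(messages, add_assistant: true)
    puts prompt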
Generates a response in a chat conversation
Parameters:
- messages: Array of chat messages
- max_tokens: Maximum number of tokens to generate
- temperature: Sampling temperature
- template: Optional chat template (nil to use model's default)
Returns:
- The generated response text
Raises:
- ArgumentError if parameters are invalid
- Llama::Context::Error if text generation fails
- Llama::TokenizationError if the prompt cannot be tokenized
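Example (a sketch for #chat, reusing the ctx and ChatMessage assumptions above):

    # Sketch: single-turn chat using the model's default template.
    messages = [Llama::ChatMessage.new("user", "Explain KV caching in one sentence.")]
    reply = ctx.chat(messages, max_tokens: 64, temperature: 0.7_f32)
    puts reply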
Processes a batch of tokens with the decoder part of the model
Parameters:
- batch: The batch to process (can be a LibLlama::LlamaBatch or a Batch instance)
Returns:
- 0 on success
- 1 if no KV slot was found for the batch
- < 0 on error
Raises:
- Llama::Batch::Error on error
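Example (a sketch for #decode; how a Batch is built from token IDs is not covered on this page, so the constructor and token IDs below are hypothetical — see the Batch class docs):

    # Sketch: drive decoding manually and check the documented return codes.
    tokens = [1, 15043, 3186]        # hypothetical token IDs
    batch = Llama::Batch.new(tokens) # hypothetical constructor
    case result = ctx.decode(batch)
    when 0
      puts "decoded"
    when 1
      raise "no KV slot found for the batch"
    else
      raise "decode failed (#{result})"
    end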
Processes a batch of tokens with the encoder part of the model
This function is used for encoder-decoder models to encode the input before generating text with the decoder.
Parameters:
- batch: The batch to process (can be a LibLlama::LlamaBatch or a Batch instance)
Returns:
- 0 on success
- < 0 on error
Raises:
- Llama::Batch::Error on error
Generates text from a prompt
Parameters:
- prompt: The input prompt
- max_tokens: Maximum number of tokens to generate (must be positive)
- temperature: Sampling temperature (0.0 = greedy, 1.0 = more random)
Returns:
- The generated text
Raises:
- ArgumentError if parameters are invalid
- Llama::Context::Error if text generation fails
- Llama::TokenizationError if the prompt cannot be tokenized
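Example (a sketch for #generate, continuing from the constructor example):

    # Sketch: simple prompt completion with mild sampling randomness.
    text = ctx.generate("Crystal is a programming language that", max_tokens: 48, temperature: 0.8_f32)
    puts text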
Generates text using a sampler chain
Parameters:
- prompt: The input prompt
- sampler: The sampler chain to use
- max_tokens: Maximum number of tokens to generate (must be positive)
Returns:
- The generated text
Raises:
- ArgumentError if parameters are invalid
- Llama::Context::Error if text generation fails
- Llama::TokenizationError if the prompt cannot be tokenized
- Llama::Sampler::Error if sampling fails
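Example (a sketch for #generate_with_sampler; the individual sampler classes and the << operator for building the chain are assumptions — check the SamplerChain docs for the real API):

    # Sketch: generation with an explicit sampler chain (top-k -> temperature -> dist).
    # The sampler class names below are hypothetical.
    chain = Llama::SamplerChain.new
    chain << Llama::Sampler::TopK.new(40)
    chain << Llama::Sampler::Temp.new(0.8_f32)
    chain << Llama::Sampler::Dist.new
    text = ctx.generate_with_sampler("Write a haiku about snow:", chain, max_tokens: 48)
    puts text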
Gets all output token embeddings. Only available when embeddings mode is enabled.
Returns:
- An array of embeddings, or nil if embeddings are not available
Raises:
- Llama::Context::Error if embeddings mode is not enabled
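Example (a sketch for #get_embeddings; assumes a context created with embeddings: true and hypothetical token IDs, with model loaded as in the constructor example):

    # Sketch: read all output embeddings after processing tokens.
    emb_ctx = Llama::Context.new(model, embeddings: true)
    emb_ctx.process_tokens([1, 15043, 3186]) # hypothetical token IDs
    if emb = emb_ctx.get_embeddings
      puts "got #{emb.size} embedding values"
    end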
Gets the embeddings for a specific token
Parameters:
- i: The token index (negative indices can be used to access in reverse order)
Returns:
- An array of embedding values, or nil if not available
Raises:
- Llama::Context::Error if embeddings mode is not enabled
Gets the embeddings for a specific sequence
Parameters:
- seq_id: The sequence ID
Returns:
- An array of embedding values, or nil if not available
Raises:
- Llama::Context::Error if embeddings mode is not enabled
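Example (a sketch for #get_embeddings_seq; requires embeddings mode and a pooling type other than NONE):

    # Sketch: pooled embedding for sequence 0 after processing a prompt.
    emb_ctx = Llama::Context.new(model, embeddings: true)
    emb_ctx.process_prompts(["The quick brown fox"])
    if vec = emb_ctx.get_embeddings_seq(0)
      puts "dimension: #{vec.size}"
    end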
Returns the KV cache for this context. Lazily initializes the KV cache if it doesn't exist yet.
Gets the logits for the last token
Returns:
- A pointer to the logits array
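Example (a sketch for #logits; the vocabulary-size accessor on Model and the token IDs are assumptions):

    # Sketch: wrap the raw logits pointer in a Slice and find the highest-scoring token.
    ctx.process_tokens([1, 15043, 3186])    # hypothetical token IDs
    n_vocab = model.n_vocab                 # assumed Model accessor
    scores = Slice.new(ctx.logits, n_vocab) # zero-copy view over the C array
    best = (0...n_vocab).max_by { |i| scores[i] }
    puts "most likely next token id: #{best}"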
Gets the pooling type used for embeddings
Returns:
- The pooling type as a PoolingType enum
Prints performance information for this context
This method prints performance statistics about the context to STDERR. It's useful for debugging and performance analysis.
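Example (a sketch combining #print_perf with #reset_perf, described further below):

    # Sketch: measure a single generation call in isolation.
    ctx.reset_perf
    ctx.generate("Hello", max_tokens: 16)
    ctx.print_perf # timing statistics are written to STDERR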
Processes embeddings
Parameters:
- embeddings: Array of embedding vectors
- seq_ids: Sequence IDs to use for all embeddings
- n_seq_max: Maximum number of sequence IDs per token (default: 8)
Returns:
- The result of the decode operation (0 on success)
Raises:
- Llama::Batch::Error on error
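Example (a sketch for #process_embeddings; the embedding dimension of 4096 is hypothetical and must match the model's embedding size):

    # Sketch: push two externally computed embedding vectors through the model.
    vectors = [Array.new(4096, 0.0_f32), Array.new(4096, 0.1_f32)]
    result = ctx.process_embeddings(vectors)
    raise "decode failed (#{result})" unless result == 0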
Processes multiple prompts in a batch
Parameters:
- prompts: Array of text prompts to process
- compute_logits_for_last: Whether to compute logits only for the last token of each prompt
Returns:
- Array of decode operation results (0 on success)
Raises:
- Llama::Batch::Error on error
- Llama::TokenizationError if a prompt cannot be tokenized
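Example (a sketch for #process_prompts):

    # Sketch: batch-process several prompts and check each decode result.
    results = ctx.process_prompts(["First prompt", "Second prompt"])
    results.each_with_index do |code, i|
      status = code == 0 ? "ok" : "failed (#{code})"
      puts "prompt #{i}: #{status}"
    end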
Processes a sequence of tokens
Parameters:
- tokens: Array of token IDs to process
- compute_logits_for_last: Whether to compute logits only for the last token
- seq_ids: Sequence IDs to use for all tokens
- n_seq_max: Maximum number of sequence IDs per token (default: 8)
Returns:
- The result of the decode operation (0 on success)
Raises:
- Llama::Batch::Error on error
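Example (a sketch for #process_tokens; the token IDs are hypothetical and would normally come from the model's tokenizer):

    # Sketch: feed pre-tokenized input, computing logits only for the final token.
    token_ids = [1, 15043, 3186]
    result = ctx.process_tokens(token_ids, compute_logits_for_last: true, seq_ids: [0])
    raise "decode failed (#{result})" unless result == 0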
Resets performance counters for this context
This method resets all performance counters for the context.
Sets whether the model is in embeddings mode or not. If true, embeddings will be returned but logits will not.
Parameters:
- enabled: Whether to enable embeddings mode
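Example (a sketch for #set_embeddings):

    # Sketch: toggle embeddings mode on an existing context and back again.
    ctx.set_embeddings(true)  # subsequent decodes produce embeddings, not logits
    # ... extract embeddings here ...
    ctx.set_embeddings(false) # restore normal logit output for text generation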
Returns the state manager for this context. Lazily initializes the state if it doesn't exist yet.
Returns the raw pointer to the underlying llama_context structure