1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
//! # Large Language Model (LLM) Module
//!
//! This module contains components necessary for loading and processing large language models.
//! It includes utilities for handling model parameters, loading models, generating tokens,
//! and other functionalities essential for text generation.

/// Parameters for text generation.
///
/// Defines the parameters that control text generation, such as the maximum
/// number of new tokens to generate and temperature settings.
pub mod generate_parameter;

/// Model loading.
///
/// Provides functionality to load model weights and the other components a
/// language model needs.
pub mod loader;

/// Processor for language models.
///
/// Handles running input data through the model, including forward passes
/// and manipulation of the outputs.
pub mod model_processor;

/// Enumerations for supported models.
///
/// Defines the language models supported by this application.
pub mod models;

/// Sampling utilities for language models.
///
/// Implements the sampling methods used during text generation, such as
/// temperature-based sampling.
pub mod sampler;

/// Main text generation logic.
///
/// Central module for generating text with the language models. It orchestrates
/// the interaction between the tokenizer, the model, and the sampling methods.
// NOTE(review): the descriptions of `text_generation` and `text_generator`
// overlap; the exact split of responsibilities is not visible from this file —
// confirm against the submodules.
pub mod text_generation;

/// Text generator.
///
/// Manages generation by iteratively producing tokens and assembling them into
/// the final output text.
pub mod text_generator;

/// Token generator utilities.
///
/// Provides the core functionality for generating individual tokens during the
/// text generation process.
pub mod token_generator;

/// Reason why a text generation run finished.
///
/// Indicates whether generation stopped because the maximum length was reached,
/// because the model produced an end-of-sequence token, or because a specified
/// stop sequence was hit.
///
/// The enum carries no data, so it is `Copy`: values can be passed around and
/// compared freely without moving or borrowing. It also implements `Eq` and
/// `Hash`, so it can be used as a key in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FinishReason {
    /// Generation stopped because the maximum length was reached.
    Length,

    /// Generation stopped because the model produced an end-of-sequence token.
    EosToken,

    /// Generation stopped because a specified stop sequence was encountered.
    StopSequence,
}

/// A loaded model, with one variant per supported model type.
///
/// Each variant wraps a model type taken from one of the `quantized_*`
/// modules of `candle_transformers::models`. The enum derives `Clone`, which
/// clones the wrapped value.
// NOTE(review): the cost of cloning depends on how the wrapped
// `candle_transformers` types implement `Clone` — not visible from this file.
#[derive(Clone)]
pub enum Model {
    /// Wraps `candle_transformers::models::quantized_llama::ModelWeights`.
    Llama(candle_transformers::models::quantized_llama::ModelWeights),
    /// Wraps
    /// `candle_transformers::models::quantized_mixformer::MixFormerSequentialForCausalLM`.
    MixFormer(candle_transformers::models::quantized_mixformer::MixFormerSequentialForCausalLM),
}