Transformer in pure C, written by ChatGPT.
/* This file is written by chatgpt */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_SEQUENCE_LENGTH 512 ///< Maximum sequence length for input data
#define MAX_EMBEDDING_DIM 512   ///< Maximum embedding dimension for the model
/**
 * Represents a matrix with floating point data.
 */
typedef struct {
  float *data; ///< Pointer to the matrix data
  int rows;    ///< Number of rows in the matrix
  int cols;    ///< Number of columns in the matrix
} Matrix;
/**
 * Represents the weights for a single attention layer in the transformer model.
 */
typedef struct {
  Matrix query_weight;  ///< Query weight matrix
  Matrix key_weight;    ///< Key weight matrix
  Matrix value_weight;  ///< Value weight matrix
  Matrix output_weight; ///< Output weight matrix
} AttentionLayer;
/**
 * Represents a single transformer block, which includes an attention layer and
 * a feed-forward network.
 */
typedef struct {
  AttentionLayer attention;    ///< Attention layer
  Matrix feed_forward_weight1; ///< First weight matrix for the feed-forward
                               ///< network
  Matrix feed_forward_weight2; ///< Second weight matrix for the feed-forward
                               ///< network
  Matrix norm_gamma;           ///< Gamma parameter for layer normalization
  Matrix norm_beta;            ///< Beta parameter for layer normalization
} TransformerBlock;
/**
 * Represents the entire transformer model, consisting of multiple transformer
 * blocks.
 */
typedef struct {
  Matrix embedding;         ///< Embedding matrix for input data
  TransformerBlock *blocks; ///< Pointer to an array of transformer blocks
  int num_blocks;           ///< Number of transformer blocks in the model
} TransformerModel;
/**
 * Creates a matrix with the specified number of rows and columns.
 * Allocates zero-initialized memory for the matrix data and aborts on
 * allocation failure.
 *
 * @param rows Number of rows in the matrix
 * @param cols Number of columns in the matrix
 * @return The created matrix
 */
Matrix create_matrix(int rows, int cols) {
  Matrix mat;
  mat.rows = rows;
  mat.cols = cols;
  mat.data = (float *)calloc(
      rows * cols, sizeof(float)); // Use calloc for zero-initialization
  if (mat.data == NULL) {
    fprintf(stderr, "create_matrix: allocation of %dx%d matrix failed\n", rows,
            cols);
    exit(EXIT_FAILURE);
  }
  return mat;
}
/**
 * Frees the memory allocated for a matrix.
 *
 * @param mat Pointer to the matrix to be freed
 */
void free_matrix(Matrix *mat) {
  free(mat->data);
  mat->data = NULL;
  mat->rows = 0;
  mat->cols = 0;
}
/**
 * Multiplies two matrices and stores the result in a third matrix.
 *
 * @param a Pointer to the first matrix
 * @param b Pointer to the second matrix
 * @param result Pointer to the matrix where the result will be stored;
 *               it must be pre-allocated with a->rows rows and b->cols columns
 */
void matmul(Matrix *a, Matrix *b, Matrix *result) {
  if (a->cols != b->rows) {
    // Report the mismatch instead of failing silently.
    fprintf(stderr, "matmul: dimension mismatch (%dx%d * %dx%d)\n", a->rows,
            a->cols, b->rows, b->cols);
    return;
  }
  for (int i = 0; i < a->rows; i++) {
    for (int j = 0; j < b->cols; j++) {
      float sum = 0;
      for (int k = 0; k < a->cols; k++) {
        sum += a->data[i * a->cols + k] * b->data[k * b->cols + j];
      }
      result->data[i * result->cols + j] = sum;
    }
  }
}
/**
 * Adds two matrices and stores the result in a third matrix.
 *
 * @param a Pointer to the first matrix
 * @param b Pointer to the second matrix
 * @param result Pointer to the matrix where the result will be stored
 */
void add_matrix(Matrix *a, Matrix *b, Matrix *result) {
  int size = a->rows * a->cols;
  for (int i = 0; i < size; i++) {
    result->data[i] = a->data[i] + b->data[i];
  }
}
/**
 * Writes sinusoidal positional encodings into an embedding matrix,
 * overwriting its contents. (This function is not called by the demo in
 * main().)
 *
 * @param embedding Pointer to the embedding matrix
 * @param max_len Maximum length of the sequences
 * @param d_model Dimension of the model
 */
void positional_encoding(Matrix *embedding, int max_len, int d_model) {
  for (int pos = 0; pos < max_len; pos++) {
    for (int i = 0; i < d_model; i += 2) {
      // Standard sin/cos pair: both components of a pair share the same
      // frequency, 1 / 10000^(i / d_model).
      float angle = pos / pow(10000, (float)i / d_model);
      embedding->data[pos * d_model + i] = sin(angle);
      if (i + 1 < d_model) {
        embedding->data[pos * d_model + i + 1] = cos(angle);
      }
    }
  }
}
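/* A minimal sketch, not part of the original gist: in a typical transformer
 * the positional encodings are added onto the token embeddings rather than
 * overwriting them. The function name below is an assumption introduced here
 * for illustration; it is not used elsewhere in this file. */
void add_positional_encoding(Matrix *embedding) {
  int d_model = embedding->cols;
  for (int pos = 0; pos < embedding->rows; pos++) {
    for (int i = 0; i < d_model; i += 2) {
      float angle = pos / pow(10000, (float)i / d_model);
      embedding->data[pos * d_model + i] += sin(angle);
      if (i + 1 < d_model) {
        embedding->data[pos * d_model + i + 1] += cos(angle);
      }
    }
  }
}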
/**
 * Performs scaled dot-product self-attention on the input matrix and stores
 * the result in the output matrix.
 *
 * @param input Pointer to the input matrix
 * @param layer Pointer to the attention layer containing the weights
 * @param output Pointer to the matrix where the result will be stored
 */
void self_attention(Matrix *input, AttentionLayer *layer, Matrix *output) {
  Matrix q = create_matrix(input->rows, layer->query_weight.cols);
  Matrix k = create_matrix(input->rows, layer->key_weight.cols);
  Matrix v = create_matrix(input->rows, layer->value_weight.cols);
  Matrix scores = create_matrix(input->rows, input->rows);
  Matrix attn = create_matrix(input->rows, layer->value_weight.cols);
  matmul(input, &layer->query_weight, &q);
  matmul(input, &layer->key_weight, &k);
  matmul(input, &layer->value_weight, &v);
  // Attention scores: Q * K^T / sqrt(d_k). (The original code called
  // matmul(&q, &k, ...), whose dimensions do not match; K must be transposed.)
  for (int i = 0; i < q.rows; i++) {
    for (int j = 0; j < k.rows; j++) {
      float sum = 0;
      for (int d = 0; d < q.cols; d++) {
        sum += q.data[i * q.cols + d] * k.data[j * k.cols + d];
      }
      scores.data[i * scores.cols + j] = sum / sqrt((float)k.cols);
    }
  }
  // Row-wise softmax, shifted by the row maximum for numerical stability.
  for (int i = 0; i < scores.rows; i++) {
    float max_val = scores.data[i * scores.cols];
    for (int j = 1; j < scores.cols; j++) {
      if (scores.data[i * scores.cols + j] > max_val)
        max_val = scores.data[i * scores.cols + j];
    }
    float sum = 0.0;
    for (int j = 0; j < scores.cols; j++) {
      scores.data[i * scores.cols + j] =
          exp(scores.data[i * scores.cols + j] - max_val);
      sum += scores.data[i * scores.cols + j];
    }
    for (int j = 0; j < scores.cols; j++) {
      scores.data[i * scores.cols + j] /= sum;
    }
  }
  // softmax(QK^T / sqrt(d_k)) * V, followed by the output projection
  // (output_weight was allocated but never applied in the original).
  matmul(&scores, &v, &attn);
  matmul(&attn, &layer->output_weight, output);
  free_matrix(&q);
  free_matrix(&k);
  free_matrix(&v);
  free_matrix(&scores);
  free_matrix(&attn);
}
/**
 * Applies a feed-forward neural network to the input matrix and stores the
 * result in the output matrix.
 *
 * @param input Pointer to the input matrix
 * @param weight1 Pointer to the first weight matrix of the feed-forward network
 * @param weight2 Pointer to the second weight matrix of the feed-forward
 * network
 * @param output Pointer to the matrix where the result will be stored
 */
void feed_forward(Matrix *input, Matrix *weight1, Matrix *weight2,
                  Matrix *output) {
  Matrix temp = create_matrix(input->rows, weight1->cols);
  matmul(input, weight1, &temp);
  for (int i = 0; i < temp.rows * temp.cols; i++) {
    temp.data[i] = fmax(0, temp.data[i]); // ReLU activation
  }
  matmul(&temp, weight2, output);
  free_matrix(&temp);
}
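/* A minimal sketch, not part of the original gist: the layer normalization
 * that the norm_gamma / norm_beta parameters of TransformerBlock appear to be
 * intended for. It is not wired into transformer_block below; the function
 * name and the epsilon value are assumptions.
 * gamma and beta are expected to be 1 x cols vectors, and output must be
 * pre-allocated with the same shape as input. */
void layer_norm(Matrix *input, Matrix *gamma, Matrix *beta, Matrix *output) {
  const float eps = 1e-5f;
  for (int i = 0; i < input->rows; i++) {
    // Per-row mean and variance over the feature dimension.
    float mean = 0.0f, var = 0.0f;
    for (int j = 0; j < input->cols; j++) {
      mean += input->data[i * input->cols + j];
    }
    mean /= input->cols;
    for (int j = 0; j < input->cols; j++) {
      float d = input->data[i * input->cols + j] - mean;
      var += d * d;
    }
    var /= input->cols;
    // Normalize, then scale and shift with the learned gamma/beta vectors.
    for (int j = 0; j < input->cols; j++) {
      float norm = (input->data[i * input->cols + j] - mean) / sqrt(var + eps);
      output->data[i * output->cols + j] = gamma->data[j] * norm + beta->data[j];
    }
  }
}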
/**
 * Processes an input matrix through a transformer block and stores the result
 * in the output matrix.
 *
 * @param input Pointer to the input matrix
 * @param block Pointer to the transformer block containing the layer weights
 * @param output Pointer to the matrix where the result will be stored
 */
void transformer_block(Matrix *input, TransformerBlock *block, Matrix *output) {
  Matrix attn_output = create_matrix(input->rows, input->cols);
  Matrix residual = create_matrix(input->rows, input->cols);
  Matrix ff_output =
      create_matrix(input->rows, block->feed_forward_weight2.cols);
  // Attention sub-layer with a residual connection. Note that no layer
  // normalization is applied here, so norm_gamma and norm_beta go unused.
  self_attention(input, &block->attention, &attn_output);
  add_matrix(input, &attn_output, &residual);
  // Feed-forward sub-layer with a residual connection.
  feed_forward(&residual, &block->feed_forward_weight1,
               &block->feed_forward_weight2, &ff_output);
  add_matrix(&residual, &ff_output, output);
  free_matrix(&attn_output);
  free_matrix(&residual);
  free_matrix(&ff_output);
}
/**
 * Processes an input matrix through the entire transformer model and stores the
 * result in the output matrix.
 *
 * @param input Pointer to the input matrix
 * @param model Pointer to the transformer model
 * @param output Pointer to the matrix where the result will be stored
 */
void transformer_forward(Matrix *input, TransformerModel *model,
                         Matrix *output) {
  Matrix temp_input = create_matrix(input->rows, input->cols);
  memcpy(temp_input.data, input->data,
         input->rows * input->cols * sizeof(float));
  for (int i = 0; i < model->num_blocks; i++) {
    Matrix temp_output = create_matrix(temp_input.rows, temp_input.cols);
    transformer_block(&temp_input, &model->blocks[i], &temp_output);
    free_matrix(&temp_input);
    temp_input = temp_output;
  }
  memcpy(output->data, temp_input.data,
         temp_input.rows * temp_input.cols * sizeof(float));
  free_matrix(&temp_input);
}
int main(void) {
  // Example usage
  int num_blocks = 2;
  int embedding_dim = 8;
  int seq_length = 4;
  // Initialize the transformer model
  TransformerModel model;
  model.num_blocks = num_blocks;
  model.blocks =
      (TransformerBlock *)malloc(num_blocks * sizeof(TransformerBlock));
  // Allocate memory for each transformer block
  for (int i = 0; i < num_blocks; i++) {
    model.blocks[i].attention.query_weight =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].attention.key_weight =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].attention.value_weight =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].attention.output_weight =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].feed_forward_weight1 =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].feed_forward_weight2 =
        create_matrix(embedding_dim, embedding_dim);
    model.blocks[i].norm_gamma = create_matrix(1, embedding_dim);
    model.blocks[i].norm_beta = create_matrix(1, embedding_dim);
  }
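  // A minimal sketch, not in the original gist: seed the weight matrices with
  // small deterministic values so the forward pass is not all zeros (calloc
  // zero-initializes them above, which would make every projection vanish).
  // The scaling constant and fill pattern are arbitrary assumptions.
  for (int i = 0; i < num_blocks; i++) {
    Matrix *weights[] = {&model.blocks[i].attention.query_weight,
                         &model.blocks[i].attention.key_weight,
                         &model.blocks[i].attention.value_weight,
                         &model.blocks[i].attention.output_weight,
                         &model.blocks[i].feed_forward_weight1,
                         &model.blocks[i].feed_forward_weight2};
    for (int w = 0; w < 6; w++) {
      for (int n = 0; n < weights[w]->rows * weights[w]->cols; n++) {
        weights[w]->data[n] = 0.01f * (float)((n + i) % 7 - 3);
      }
    }
  }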
  // Create input and output matrices
  Matrix input = create_matrix(seq_length, embedding_dim);
  Matrix output = create_matrix(seq_length, embedding_dim);
  // Initialize input with some values
  for (int i = 0; i < input.rows * input.cols; i++) {
    input.data[i] = (float)i / 10.0f;
  }
  // Apply the transformer model
  transformer_forward(&input, &model, &output);
  // Print the output
  printf("Output Matrix:\n");
  for (int i = 0; i < output.rows; i++) {
    for (int j = 0; j < output.cols; j++) {
      printf("%f ", output.data[i * output.cols + j]);
    }
    printf("\n");
  }
  // Free memory
  free_matrix(&input);
  free_matrix(&output);
  for (int i = 0; i < num_blocks; i++) {
    free_matrix(&model.blocks[i].attention.query_weight);
    free_matrix(&model.blocks[i].attention.key_weight);
    free_matrix(&model.blocks[i].attention.value_weight);
    free_matrix(&model.blocks[i].attention.output_weight);
    free_matrix(&model.blocks[i].feed_forward_weight1);
    free_matrix(&model.blocks[i].feed_forward_weight2);
    free_matrix(&model.blocks[i].norm_gamma);
    free_matrix(&model.blocks[i].norm_beta);
  }
  free(model.blocks);
  return 0;
}
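To build and run the example, something like the following should work (assuming the source is saved as transformer.c; -lm links the math library needed by sin, cos, exp, pow, sqrt, and fmax):

gcc -O2 -o transformer transformer.c -lm
./transformer

The PlantUML class diagram below summarizes the data structures and the functions that operate on them.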
@startuml
class Matrix {
  +float *data
  +int rows
  +int cols
}
class AttentionLayer {
  +Matrix query_weight
  +Matrix key_weight
  +Matrix value_weight
  +Matrix output_weight
}
class TransformerBlock {
  +AttentionLayer attention
  +Matrix feed_forward_weight1
  +Matrix feed_forward_weight2
  +Matrix norm_gamma
  +Matrix norm_beta
}
class TransformerModel {
  +Matrix embedding
  +TransformerBlock *blocks
  +int num_blocks
}
class Main {
  +int main()
}
class "Matrix Operations" {
  +Matrix create_matrix(int rows, int cols)
  +void free_matrix(Matrix *mat)
  +void matmul(Matrix *a, Matrix *b, Matrix *result)
  +void add_matrix(Matrix *a, Matrix *b, Matrix *result)
  +void positional_encoding(Matrix *embedding, int max_len, int d_model)
}
class "Transformer Operations" {
  +void self_attention(Matrix *input, AttentionLayer *layer, Matrix *output)
  +void feed_forward(Matrix *input, Matrix *weight1, Matrix *weight2, Matrix *output)
  +void transformer_block(Matrix *input, TransformerBlock *block, Matrix *output)
  +void transformer_forward(Matrix *input, TransformerModel *model, Matrix *output)
}
Main -right-> TransformerModel
TransformerModel -right-> TransformerBlock
TransformerBlock -right-> AttentionLayer
TransformerModel *-- Matrix
AttentionLayer *-- Matrix
TransformerBlock *-- Matrix
"Matrix Operations" ..> Matrix : uses
"Transformer Operations" ..> Matrix : uses
"Transformer Operations" ..> AttentionLayer : uses
"Transformer Operations" ..> TransformerBlock : uses
"Transformer Operations" ..> TransformerModel : uses
@enduml