From ead8a6e04e217a804dee6664c9ecd9ccd33ff3d5 Mon Sep 17 00:00:00 2001 From: Marco Cetica Date: Mon, 12 Jan 2026 11:58:32 +0100 Subject: [PATCH] Refactored `string_set_at` to be immutable and added `String` documentation --- Makefile | 2 +- README.md | 36 ++++++++++++++++- docs/README.md | 3 +- docs/bigint.md | 4 +- docs/map.md | 6 +-- docs/string.md | 96 +++++++++++++++++++++++++++++++++++++++++++++ docs/vector.md | 6 +-- src/string.c | 76 ++++++++++++++++++++--------------- src/string.h | 4 +- tests/test_string.c | 36 +++++++++++++---- usage.c | 7 ++-- 11 files changed, 221 insertions(+), 55 deletions(-) create mode 100644 docs/string.md diff --git a/Makefile b/Makefile index a40e35f..21eb0ac 100644 --- a/Makefile +++ b/Makefile @@ -40,7 +40,7 @@ $(TEST_M_TARGET): $(OBJ_DIR)/test_map.o $(OBJ_DIR)/map.o $(TEST_B_TARGET): $(OBJ_DIR)/test_bigint.o $(OBJ_DIR)/bigint.o $(OBJ_DIR)/vector.o $(CC) $(CFLAGS) -o $@ $^ -$(TEST_S_TARGET): $(OBJ_DIR)/test_string.o $(OBJ_DIR)/string.o $(OBJ_DIR)/vector.o +$(TEST_S_TARGET): $(OBJ_DIR)/test_string.o $(OBJ_DIR)/string.o $(CC) $(CFLAGS) -o $@ $^ $(OBJ_DIR)/%.o: $(SRC_DIR)/%.c | $(OBJ_DIR) diff --git a/README.md b/README.md index 678ffee..63e7eac 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,8 @@ the standard library. It currently features: - [**Vector**](/docs/vector.md): a growable, contiguous array of homogenous generic data types; - [**Map**](/docs/map.md): an associative array that handles generic heterogenous data types; -- [**BigInt**](/docs/bigint.md): a data type for arbitrary large integers. +- [**BigInt**](/docs/bigint.md): a data type for arbitrary large integers; +- [**String**](/docs/string.md): an immutable string type with partial UTF-8 support. 
## Usage At its simplest, you can use this library as follows: @@ -167,6 +168,39 @@ int main(void) { } ``` + +### `String` usage: +```c +#include <stdio.h> + +#include "src/string.h" + +/* + * Compile with: gcc -O3 main.c src/string.c + * Output: Final string: "Hello,World,😀" Split: ["Hello" "World" "😀" ] + */ +int main(void) { + string_t *x = string_new(" Hello, ").value.string; + string_t *x_trm = string_trim(x).value.string; + + string_t *y = string_new("😀,dlroW").value.string; + string_t *y_rev = string_reverse(y).value.string; + + string_t *str = string_concat(x_trm, y_rev).value.string; + string_t **strings = string_split(str, ",").value.split.strings; + + printf("Final string: \"%s\" Split: [", str->data); + for (int idx = 0; idx < 3; idx++) { printf("\"%s\" ", strings[idx]->data); } + printf("]\n"); + + string_split_destroy(strings, 3); string_destroy(str); + string_destroy(x); string_destroy(y); + string_destroy(x_trm); string_destroy(y_rev); + + return 0; +} +``` + For a more exhaustive example, refer to the `usage.c` file. There, you will find a program with proper error management and a sample usage for every available method. To run it, first issue the following command: diff --git a/docs/README.md b/docs/README.md index 9968106..cfb2c6e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -7,4 +7,5 @@ At the time being, this documentation includes the following pages: - [vector.md](vector.md): vector documentation; - [map.md](map.md): map documentation; -- [bigint.md](bigint.md): bigint documentation. +- [bigint.md](bigint.md): bigint documentation; +- [string.md](string.md): string documentation. 
diff --git a/docs/bigint.md b/docs/bigint.md index e51980a..38819a7 100644 --- a/docs/bigint.md +++ b/docs/bigint.md @@ -46,7 +46,7 @@ The `BigInt` data structure supports the following methods: - `bigint_result_t bigint_destroy(number)`: delete the big number; - `bigint_result_t bigint_printf(format, ...)`: `printf` wrapper that introduces the `%B` placeholder to print big numbers. It supports variadic parameters. -As you can see by the previous function signatures, methods that operate on the +As you can see from the previous function signatures, methods that operate on the `BigInt` data type return a custom type called `bigint_result_t` which is defined as follows: @@ -80,7 +80,7 @@ by setting the `status` field and by providing a descriptive message on the `mes field. If the operation was successful (that is, `status == BIGINT_OK`), you can either move on with the rest of the program or read the returned value from the sum data type. Of course, you can choose to ignore the return value (if you're brave enough :D) as -illustrated in the first part of the README. +illustrated on the first part of the README. The sum data type (i.e., the `value` union) defines four different variables. Each of them has an unique scope as described below: diff --git a/docs/map.md b/docs/map.md index f825bf8..a0c3eeb 100644 --- a/docs/map.md +++ b/docs/map.md @@ -5,7 +5,7 @@ aspects (internal design, memory layout, etc.) of the `Map` data structure. `Map` is an hash table that uses open addressing with linear probing for collision resolution and the [FNV-1a algorithm](https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function) as its hashing function. Resizing is performed automatically by doubling the capacity when the load factor exceeds 75%. 
Internally, -this data structure is represented by the following two structures: +this data structure is represented by the following two layouts: ```c typedef struct { @@ -46,7 +46,7 @@ The `Map` data structure supports the following methods: - `size_t map_size(map)`: returns map size (i.e., the number of elements); - `size_t map_capacity(map)`: returns map capacity (i.e., map total size). -As you can see by the previous function signatures, most methods that operate +As you can see from the previous function signatures, most methods that operate on the `Map` data type return a custom type called `map_result_t` which is defined as follows: @@ -73,4 +73,4 @@ Each method that returns such type indicates whether the operation was successfu the `status` field and by providing a descriptive message on the `message` field. If the operation was successful (that is, `status == MAP_OK`), you can either move on with the rest of the program or read the returned value from the sum data type. Of course, you can choose to ignore the return value (if you're brave enough :D) as illustrated -in the first part of the README. \ No newline at end of file +on the first part of the README. diff --git a/docs/string.md b/docs/string.md new file mode 100644 index 0000000..8b53131 --- /dev/null +++ b/docs/string.md @@ -0,0 +1,96 @@ +# String Technical Details +In this document you can find a quick overview of the technical +aspects (internal design, memory layout, etc.) of the `String` data structure. + +`String` is an immutable string data type with partial UTF-8 support. +This means that methods return a new string instance rather than modifying the string in-place. 
+Internally, this data structure is represented by the following layout: + +```c +typedef struct { + char *data; + size_t byte_size; + size_t byte_capacity; + size_t char_count; +} string_t; +``` + +where the `data` variable represents the actual string (represented as a pointer to `char`), +the `byte_size` variable indicates the actual size (in bytes) of the string, the +`byte_capacity` variable represents the total amount of allocated memory (in bytes) and the +`char_count` variable represents the number of logical characters, that is, the number of +symbols. + +As mentioned earlier, this library provides partial UTF-8 support. It is able to recognize +UTF-8 byte sequences as individual Unicode code points, which allows it to correctly distinguish +between byte length and character count. It fully supports Unicode symbols and emojis, while +remaining backward compatible with ASCII strings. + +However, this data structure does not support localization. In particular, it does not perform +locale-aware conversion; for instance, uppercase/lowercase transformations are limited to ASCII +characters only. As a result, the German scharfes S (`ß`) is not converted to `SS`, the Spanish +`Ñ` is not converted to `ñ` and the Italian `é` (and its variants) is not treated as a single +symbol, but rather as a base letter combined with an accent. 
+ +At the time being, `String` supports the following methods: + +- `string_result_t string_new(c_str)`: create a new string; +- `string_result_t string_clone(str)`: clone an existing string; +- `string_result_t string_concat(x, y)`: concatenate two strings together; +- `string_result_t string_contains(haystack, needle)`: search whether the `haystack` string contains `needle`; +- `string_result_t string_slice(str, start, end)`: return a slice (a new string) from `str` between `start` and `end` indices (inclusive); +- `string_result_t string_eq(x, y, case_sensitive)`: check whether `x` and `y` are equal; +- `string_result_t string_get_at(str, position)`: get the UTF-8 symbol indexed by `position` from `str`; +- `string_result_t string_set_at(str, position, utf8_char)`: return a new string in which the symbol of `str` at index `position` is replaced by `utf8_char` (`str` itself is left unchanged); +- `string_result_t string_to_lower(str)`: convert a string to lowercase; +- `string_result_t string_to_upper(str)`: convert a string to uppercase; +- `string_result_t string_reverse(str)`: reverse a string; +- `string_result_t string_trim(str)`: remove leading and trailing white space from a string; +- `string_result_t string_split(str, delim)`: split a string into an array of `string_t` by specifying a separator; +- `string_result_t string_destroy(str)`: remove a string from memory; +- `string_result_t string_split_destroy(split, count)`: remove an array of strings from memory; +- `size_t string_size(str)`: return string character count. 
+ +As you can see from the previous function signatures, most methods that operate on the `String` +data type return a custom type called `string_result_t` which is defined as follows: + +```c +typedef enum { + STRING_OK = 0x0, + STRING_ERR_ALLOCATE, + STRING_ERR_INVALID, + STRING_ERR_INVALID_UTF8, + STRING_ERR_OVERFLOW +} string_status_t; + +typedef struct { + string_status_t status; + uint8_t message[RESULT_MSG_SIZE]; + union { + string_t *string; // For new, clone, slice, reverse, trim + char *symbol; // For get_at + int64_t idx; // For contains + bool is_equ; // For comparison + struct { // For split + string_t **strings; + size_t count; + } split; + } value; +} string_result_t; +``` + +Each method that returns such type indicates whether the operation was successful or not +by setting the `status` field and by providing a descriptive message on the `message` +field. If the operation was successful (that is, `status == STRING_OK`), you can either +move on with the rest of your program or read the returned value from the sum data type. +Of course, you can choose to ignore the return value (if you're brave enough :D) as illustrated +on the first part of the README. + +The sum data type (i.e., the `value` union) defines five different variables. +Each of them has a unique scope as described below: + +- `string`: result of the functions that produce a string (e.g., `new`, `clone`, `concat`, `slice`, `set_at`, `reverse` and `trim`); +- `symbol`: result of `get_at` function; +- `idx`: result of `contains` function; +- `is_equ`: result of `eq` function. It's true when two strings are equal, false otherwise; +- `split`: result of `split` function. It contains an array of `string_t` and its number of elements. diff --git a/docs/vector.md b/docs/vector.md index cd24c0b..dd67d82 100644 --- a/docs/vector.md +++ b/docs/vector.md @@ -5,7 +5,7 @@ aspects (internal design, memory layout, etc.) of the `Vector` data structure. 
`Vector` is a dynamic array with generic data type support; this means that you can store any kind of homogenous value on this data structure. Resizing is performed automatically by increasing the capacity by 1.5 times when the array becomes full. Internally, this -data structure is represented by the following structure: +data structure is represented by the following layout: ```c typedef struct { @@ -39,7 +39,7 @@ At the time being, `Vector` supports the following methods: - `size_t vector_size(vector)`: return vector size (i.e., the number of elements); - `size_t vector_capacity(vector)`: return vector capacity (i.e., vector total size). -As you can see by the previous function signatures, most methods that operate +As you can see from the previous function signatures, most methods that operate on the `Vector` data type return a custom type called `vector_result_t` which is defined as follows: @@ -66,7 +66,7 @@ Each method that returns such type indicates whether the operation was successfu by setting the `status` field and by providing a descriptive message on the `message` field. If the operation was successful (that is, `status == VECTOR_OK`), you can either move on with the rest of the program or read the returned value from the sum data type. Of course, you can choose to -ignore the return value (if you're brave enough :D) as illustrated in the first part of the README. +ignore the return value (if you're brave enough :D) as illustrated on the first part of the README. 
## Functional methods `Vector` provides three functional methods called `map`, `filter` and `reduce` which allow the caller to apply a computation to the vector, diff --git a/src/string.c b/src/string.c index aac4bc3..9424421 100644 --- a/src/string.c +++ b/src/string.c @@ -522,14 +522,20 @@ string_result_t string_get_at(const string_t *str, size_t position) { * * Returns a string_result_t data type */ -string_result_t string_set_at(string_t *str, size_t position, const char *utf8_char) { +string_result_t string_set_at(const string_t *str, size_t position, const char *utf8_char) { string_result_t result = {0}; - int new_len; - - if (str == NULL || utf8_is_char_valid(utf8_char, &new_len) == 0) { + if (str == NULL) { result.status = STRING_ERR_INVALID; - SET_MSG(result, "Invalid index or character"); + SET_MSG(result, "Invalid string"); + + return result; + } + + int new_char_bytes; + if (utf8_is_char_valid(utf8_char, &new_char_bytes) == 0) { + result.status = STRING_ERR_INVALID_UTF8; + SET_MSG(result, "Invalid UTF-8 character"); return result; } @@ -541,41 +547,49 @@ string_result_t string_set_at(string_t *str, size_t position, const char *utf8_c return result; } - char *pos = str->data; + // Locate the byte offset of the character to replace + const char *pos = str->data; for (size_t idx = 0; idx < position; idx++) { pos += utf8_char_len((unsigned char)*pos); } - int old_len = utf8_char_len((unsigned char)*pos); - if (old_len == new_len) { - memcpy(pos, utf8_char, new_len); - } else { - const size_t prefix_len = pos - str->data; - const size_t suffix_len = str->byte_size - prefix_len - old_len; - const size_t new_total = prefix_len + new_len + suffix_len; + const size_t prefix_len = pos - str->data; + const int old_char_bytes = utf8_char_len((unsigned char)*pos); + const size_t suffix_len = str->byte_size - prefix_len - old_char_bytes; + const size_t new_total_bytes = prefix_len + new_char_bytes + suffix_len; - char *new_data = malloc(new_total + 1); - if (new_data 
== NULL) { - result.status = STRING_ERR_ALLOCATE; - SET_MSG(result, "Cannot allocate memory"); + string_t *new_str = malloc(sizeof(string_t)); + if (new_str == NULL) { + result.status = STRING_ERR_ALLOCATE; + SET_MSG(result, "Cannot allocate memory"); - return result; - } - - memcpy(new_data, str->data, prefix_len); - memcpy(new_data + prefix_len, utf8_char, new_len); - memcpy(new_data + prefix_len + new_len, pos + old_len, suffix_len); - new_data[new_total] = '\0'; - - free(str->data); - - str->data = new_data; - str->byte_size = new_total; - str->byte_capacity = new_total + 1; + return result; } + new_str->data = malloc(new_total_bytes + 1); + if (new_str->data == NULL) { + free(new_str); + result.status = STRING_ERR_ALLOCATE; + SET_MSG(result, "Cannot allocate memory"); + + return result; + } + + // Copy prefix data from original string + memcpy(new_str->data, str->data, prefix_len); + // Copy the new character at requested index + memcpy(new_str->data + prefix_len, utf8_char, new_char_bytes); + // Copy suffix data from the original string by skipping the overwritten character + memcpy(new_str->data + prefix_len + new_char_bytes, pos + old_char_bytes, suffix_len); + new_str->data[new_total_bytes] = '\0'; + + new_str->byte_size = new_total_bytes; + new_str->byte_capacity = new_total_bytes + 1; + new_str->char_count = str->char_count; + result.status = STRING_OK; - SET_MSG(result, "Character successfully set"); + result.value.string = new_str; + SET_MSG(result, "Symbol successfully set"); return result; } diff --git a/src/string.h b/src/string.h index 23f5cc0..e04790e 100644 --- a/src/string.h +++ b/src/string.h @@ -49,7 +49,7 @@ string_result_t string_contains(const string_t *haystack, const string_t *needle string_result_t string_slice(const string_t *str, size_t start, size_t end); string_result_t string_eq(const string_t *x, const string_t *y, bool case_sensitive); string_result_t string_get_at(const string_t *str, size_t position); -string_result_t 
string_set_at(string_t *str, size_t position, const char *utf8_char); +string_result_t string_set_at(const string_t *str, size_t position, const char *utf8_char); string_result_t string_to_lower(const string_t *str); string_result_t string_to_upper(const string_t *str); string_result_t string_reverse(const string_t *str); @@ -59,7 +59,7 @@ string_result_t string_destroy(string_t *str); string_result_t string_split_destroy(string_t **split, size_t count); // Inline methods -static inline size_t string_len(const string_t *str) { +static inline size_t string_size(const string_t *str) { return str ? str->char_count : 0; } diff --git a/tests/test_string.c b/tests/test_string.c index 88d9c0f..38bc7c3 100644 --- a/tests/test_string.c +++ b/tests/test_string.c @@ -22,7 +22,7 @@ void test_string_new(void) { assert(res.status == STRING_OK); assert(res.value.string != NULL); assert(strcmp(res.value.string->data, "hello") == 0); - assert(string_len(res.value.string) == 5); + assert(string_size(res.value.string) == 5); assert(res.value.string->byte_size == 5); string_destroy(res.value.string); @@ -33,7 +33,7 @@ void test_string_new_empty(void) { string_result_t res = string_new(""); assert(res.status == STRING_OK); - assert(string_len(res.value.string) == 0); + assert(string_size(res.value.string) == 0); assert(res.value.string->byte_size == 0); assert(res.value.string->data[0] == '\0'); @@ -62,7 +62,7 @@ void test_string_concat(void) { string_result_t res = string_concat(str1, str2); assert(res.status == STRING_OK); assert(strcmp(res.value.string->data, "Foo Bar") == 0); - assert(string_len(res.value.string) == 7); + assert(string_size(res.value.string) == 7); string_destroy(str1); string_destroy(str2); @@ -155,9 +155,9 @@ void test_string_reverse_utf8(void) { string_result_t res = string_reverse(str); assert(res.status == STRING_OK); - assert(string_len(res.value.string) == 3); + assert(string_size(res.value.string) == 3); assert(strcmp(res.value.string->data, "Z๐ŸŒA") == 
0); - assert(string_len(res.value.string) == 3); + assert(string_size(res.value.string) == 3); string_destroy(str); string_destroy(res.value.string); @@ -198,10 +198,29 @@ void test_string_set_at(void) { // Replace 'B' with emoji string_result_t res = string_set_at(str, 1, "๐Ÿ˜†"); + string_t *altered = res.value.string; + assert(res.status == STRING_OK); - assert(strcmp(str->data, "A๐Ÿ˜†C") == 0); - assert(string_len(str) == 3); - assert(str->byte_size == 6); // that is: A (1B) + emoji (4B) + C (1B) + assert(strcmp(altered->data, "A๐Ÿ˜†C") == 0); + assert(string_size(altered) == 3); + assert(altered->byte_size == 6); // that is: A (1B) + emoji (4B) + C (1B) + + string_destroy(str); + string_destroy(altered); +} + +// Test mutation of invalid UTF-8 symbol +void test_string_set_at_invalid_utf8(void) { + string_t *str = string_new("ABC").value.string; + + const char * const invalid_sym1 = "\xFF"; + const char * const invalid_sym2 = "\x80"; + + string_result_t res1 = string_set_at(str, 1, invalid_sym1); + assert(res1.status == STRING_ERR_INVALID_UTF8); + + string_result_t res2 = string_set_at(str, 1, invalid_sym2); + assert(res2.status == STRING_ERR_INVALID_UTF8); string_destroy(str); } @@ -298,6 +317,7 @@ int main(void) { TEST(string_get_at_overflow); TEST(string_set_at); TEST(string_set_at_overflow); + TEST(string_set_at_invalid_utf8); TEST(string_to_lower); TEST(string_to_upper); TEST(string_trim); diff --git a/usage.c b/usage.c index 37d5784..982c12e 100644 --- a/usage.c +++ b/usage.c @@ -543,7 +543,7 @@ int string_usage(void) { string_t *str1 = res.value.string; printf("Created string: \"%s\"\n", str1->data); - printf("Character count: %zu (%zu actual bytes)\n", string_len(str1), str1->byte_size); + printf("Character count: %zu (%zu actual bytes)\n", string_size(str1), str1->byte_size); string_result_t res_clone = string_clone(str1); if (res_clone.status != STRING_OK) { @@ -566,7 +566,7 @@ int string_usage(void) { string_t *suffix = res_suffix.value.string; 
printf("Created another string: \"%s\"\n", suffix->data); - printf("Character count: %zu (%zu actual bytes)\n\n", string_len(suffix), suffix->byte_size); + printf("Character count: %zu (%zu actual bytes)\n\n", string_size(suffix), suffix->byte_size); string_result_t res_cat = string_concat(str1, suffix); if (res_cat.status != STRING_OK) { @@ -656,7 +656,8 @@ int string_usage(void) { return 1; } - printf("Updated string: \"%s\"\n\n", concat_str->data); + printf("Updated string: \"%s\"\n\n", res_set.value.string->data); + string_destroy(res_set.value.string); // Get character from string (the emoji) string_result_t res_get = string_get_at(concat_str, 14);