Refactored string_set_at and added documentation
This commit is contained in:
2
Makefile
2
Makefile
@@ -40,7 +40,7 @@ $(TEST_M_TARGET): $(OBJ_DIR)/test_map.o $(OBJ_DIR)/map.o
|
||||
$(TEST_B_TARGET): $(OBJ_DIR)/test_bigint.o $(OBJ_DIR)/bigint.o $(OBJ_DIR)/vector.o
|
||||
$(CC) $(CFLAGS) -o $@ $^
|
||||
|
||||
$(TEST_S_TARGET): $(OBJ_DIR)/test_string.o $(OBJ_DIR)/string.o $(OBJ_DIR)/vector.o
|
||||
$(TEST_S_TARGET): $(OBJ_DIR)/test_string.o $(OBJ_DIR)/string.o
|
||||
$(CC) $(CFLAGS) -o $@ $^
|
||||
|
||||
$(OBJ_DIR)/%.o: $(SRC_DIR)/%.c | $(OBJ_DIR)
|
||||
|
||||
36
README.md
36
README.md
@@ -11,7 +11,8 @@ the standard library. It currently features:
|
||||
|
||||
- [**Vector**](/docs/vector.md): a growable, contiguous array of homogenous generic data types;
|
||||
- [**Map**](/docs/map.md): an associative array that handles generic heterogenous data types;
|
||||
- [**BigInt**](/docs/bigint.md): a data type for arbitrary large integers.
|
||||
- [**BigInt**](/docs/bigint.md): a data type for arbitrary large integers;
|
||||
- [**String**](/docs/string.md): an immutable string type with partial UTF-8 support.
|
||||
|
||||
## Usage
|
||||
At its simplest, you can use this library as follows:
|
||||
@@ -167,6 +168,39 @@ int main(void) {
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### `String` usage:
|
||||
```c
|
||||
#include <stdio.h>
|
||||
|
||||
#include "src/string.h"
|
||||
|
||||
/*
|
||||
* Compile with: gcc -O3 main.c src/string.c
|
||||
* Output: Final string: "Hello,World,😀" Splitted: ["Hello" "World" "😀" ]
|
||||
*/
|
||||
int main(void) {
|
||||
string_t *x = string_new(" Hello, ").value.string;
|
||||
string_t *x_trm = string_trim(x).value.string;
|
||||
|
||||
string_t *y = string_new("😀,dlroW").value.string;
|
||||
string_t *y_rev = string_reverse(y).value.string;
|
||||
|
||||
string_t *str = string_concat(x_trm, y_rev).value.string;
|
||||
string_t **strings = string_split(str, ",").value.split.strings;
|
||||
|
||||
printf("Final string: \"%s\" Splitted: [", str->data);
|
||||
for (int idx = 0; idx < 3; idx++) { printf("\"%s\" ", strings[idx]->data); }
|
||||
printf("]\n");
|
||||
|
||||
string_split_destroy(strings, 3); string_destroy(str);
|
||||
string_destroy(x); string_destroy(y);
|
||||
string_destroy(x_trm); string_destroy(y_rev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
For a more exhaustive example, refer to the `usage.c` file. There, you will find a program with proper error management
|
||||
and a sample usage for every available method. To run it, first issue the following command:
|
||||
|
||||
|
||||
@@ -7,4 +7,5 @@ At the time being, this documentation includes the following pages:
|
||||
|
||||
- [vector.md](vector.md): vector documentation;
|
||||
- [map.md](map.md): map documentation;
|
||||
- [bigint.md](bigint.md): bigint documentation.
|
||||
- [bigint.md](bigint.md): bigint documentation;
|
||||
- [string.md](string.md): string documentation.
|
||||
|
||||
@@ -46,7 +46,7 @@ The `BigInt` data structure supports the following methods:
|
||||
- `bigint_result_t bigint_destroy(number)`: delete the big number;
|
||||
- `bigint_result_t bigint_printf(format, ...)`: `printf` wrapper that introduces the `%B` placeholder to print big numbers. It supports variadic parameters.
|
||||
|
||||
As you can see by the previous function signatures, methods that operate on the
|
||||
As you can see from the previous function signatures, methods that operate on the
|
||||
`BigInt` data type return a custom type called `bigint_result_t` which is defined as
|
||||
follows:
|
||||
|
||||
@@ -80,7 +80,7 @@ by setting the `status` field and by providing a descriptive message on the `mes
|
||||
field. If the operation was successful (that is, `status == BIGINT_OK`), you can either
|
||||
move on with the rest of the program or read the returned value from the sum data type.
|
||||
Of course, you can choose to ignore the return value (if you're brave enough :D) as
|
||||
illustrated in the first part of the README.
|
||||
illustrated in the first part of the README.
|
||||
|
||||
The sum data type (i.e., the `value` union) defines four different variables. Each
|
||||
of them has a unique scope as described below:
|
||||
|
||||
@@ -5,7 +5,7 @@ aspects (internal design, memory layout, etc.) of the `Map` data structure.
|
||||
`Map` is a hash table that uses open addressing with linear probing for collision
|
||||
resolution and the [FNV-1a algorithm](https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function) as its hashing function. Resizing is performed
|
||||
automatically by doubling the capacity when the load factor exceeds 75%. Internally,
|
||||
this data structure is represented by the following two structures:
|
||||
this data structure is represented by the following two layouts:
|
||||
|
||||
```c
|
||||
typedef struct {
|
||||
@@ -46,7 +46,7 @@ The `Map` data structure supports the following methods:
|
||||
- `size_t map_size(map)`: returns map size (i.e., the number of elements);
|
||||
- `size_t map_capacity(map)`: returns map capacity (i.e., map total size).
|
||||
|
||||
As you can see by the previous function signatures, most methods that operate
|
||||
As you can see from the previous function signatures, most methods that operate
|
||||
on the `Map` data type return a custom type called `map_result_t` which is
|
||||
defined as follows:
|
||||
|
||||
@@ -73,4 +73,4 @@ Each method that returns such type indicates whether the operation was successfu
|
||||
the `status` field and by providing a descriptive message on the `message` field. If the operation was
|
||||
successful (that is, `status == MAP_OK`), you can either move on with the rest of the program or read
|
||||
the returned value from the sum data type. Of course, you can choose to ignore the return value (if you're brave enough :D) as illustrated
|
||||
in the first part of the README.
|
||||
in the first part of the README.
|
||||
|
||||
96
docs/string.md
Normal file
96
docs/string.md
Normal file
@@ -0,0 +1,96 @@
|
||||
# String Technical Details
|
||||
In this document you can find a quick overview of the technical
|
||||
aspects (internal design, memory layout, etc.) of the `String` data structure.
|
||||
|
||||
`String` is an immutable string data type with partial UTF-8 support.
|
||||
This means that methods return a new string instance rather than modifying the string in-place.
|
||||
Internally, this data structure is represented by the following layout:
|
||||
|
||||
```c
|
||||
typedef struct {
|
||||
char *data;
|
||||
size_t byte_size;
|
||||
size_t byte_capacity;
|
||||
size_t char_count;
|
||||
} string_t;
|
||||
```
|
||||
|
||||
where the `data` variable represents the actual string (represented as a pointer to `char`),
|
||||
the `byte_size` variable indicates the actual size (in bytes) of the string, the
|
||||
`byte_capacity` variable represents the total amount of allocated memory (in bytes) and the
|
||||
`char_count` variable represents the number of logical characters, that is, the number of
|
||||
symbols.
|
||||
|
||||
As mentioned earlier, this library provides partial UTF-8 support. It is able to recognize
|
||||
UTF-8 byte sequences as individual Unicode code points, which allows it to correctly distinguish
|
||||
between byte length and character count. It fully supports Unicode symbols and emojis, while
|
||||
remaining backward compatible with ASCII strings.
|
||||
|
||||
However, this data structure does not support localization. In particular, it does not perform
|
||||
locale-aware conversion; for instance, uppercase/lowercase transformations are limited to ASCII
|
||||
characters only. As a result, the German scharfes S (`ß`) is not converted to `SS`, the Spanish
|
||||
`Ñ` is not converted to `ñ` and the Italian `é` (and its variants) is not treated as a single
|
||||
symbol, but rather as a base letter combined with an accent.
|
||||
|
||||
At the time being, `String` supports the following methods:
|
||||
|
||||
- `string_result_t string_new(c_str)`: create a new string;
|
||||
- `string_result_t string_clone(str)`: clone an existing string;
|
||||
- `string_result_t string_concat(x, y)`: concatenate two strings together;
|
||||
- `string_result_t string_contains(haystack, needle)`: search whether the `haystack` string contains `needle`;
|
||||
- `string_result_t string_slice(str, start, end)`: return a slice (a new string) from `str` between `start` and `end` indices (inclusive);
|
||||
- `string_result_t string_eq(x, y, case_sensitive)`: check whether `x` and `y` are equal;
|
||||
- `string_result_t string_get_at(str, position)`: get the UTF-8 symbol indexed by `position` from `str`;
|
||||
- `string_result_t string_set_at(str, position, utf8_char)`: return a new string equal to `str` with the symbol at index `position` replaced by the UTF-8 symbol `utf8_char` (`str` itself is left untouched);
|
||||
- `string_result_t string_to_lower(str)`: convert a string to lowercase;
|
||||
- `string_result_t string_to_upper(str)`: convert a string to uppercase;
|
||||
- `string_result_t string_reverse(str)`: reverse a string;
|
||||
- `string_result_t string_trim(str)`: remove leading and trailing white space from a string;
|
||||
- `string_result_t string_split(str, delim)`: split a string into an array of `string_t` by specifying a separator;
|
||||
- `string_result_t string_destroy(str)`: remove a string from memory;
|
||||
- `string_result_t string_split_destroy(split, count)`: remove an array of strings from memory;
|
||||
- `size_t string_size(str)`: return string character count.
|
||||
|
||||
As you can see from the previous function signatures, most methods that operate on the `String`
|
||||
data type return a custom type called `string_result_t` which is defined as follows:
|
||||
|
||||
```c
|
||||
typedef enum {
|
||||
STRING_OK = 0x0,
|
||||
STRING_ERR_ALLOCATE,
|
||||
STRING_ERR_INVALID,
|
||||
STRING_ERR_INVALID_UTF8,
|
||||
STRING_ERR_OVERFLOW
|
||||
} string_status_t;
|
||||
|
||||
typedef struct {
|
||||
string_status_t status;
|
||||
uint8_t message[RESULT_MSG_SIZE];
|
||||
union {
|
||||
string_t *string; // For new, clone, slice, reverse, trim
|
||||
char *symbol; // For get_at
|
||||
int64_t idx; // For contains
|
||||
bool is_equ; // For comparison
|
||||
struct { // For split
|
||||
string_t **strings;
|
||||
size_t count;
|
||||
} split;
|
||||
} value;
|
||||
} string_result_t;
|
||||
```
|
||||
|
||||
Each method that returns such type indicates whether the operation was successful or not
|
||||
by setting the `status` field and by providing a descriptive message on the `message`
|
||||
field. If the operation was successful (that is, `status == STRING_OK`) you can either
|
||||
move on with the rest of your program or read the returned value from the sum data type.
|
||||
Of course, you can choose to ignore the return value (if you're brave enough :D) as illustrated
|
||||
in the first part of the README.
|
||||
|
||||
The sum data type (i.e., the `value` union) defines five different variables.
|
||||
Each of them has a unique scope as described below:
|
||||
|
||||
- `string`: result of the functions that produce a new string, that is `new`, `clone`, `concat`, `slice`, `set_at`, `to_lower`, `to_upper`, `reverse` and `trim`;
|
||||
- `symbol`: result of `get_at` function;
|
||||
- `idx`: result of `contains` function;
|
||||
- `is_equ`: result of `eq` function. It's true when two strings are equal, false otherwise;
|
||||
- `split`: result of `split` function. It contains an array of `string_t` and its number of elements.
|
||||
@@ -5,7 +5,7 @@ aspects (internal design, memory layout, etc.) of the `Vector` data structure.
|
||||
`Vector` is a dynamic array with generic data type support; this means that you can store
|
||||
any kind of homogenous value on this data structure. Resizing is performed automatically
|
||||
by increasing the capacity by 1.5 times when the array becomes full. Internally, this
|
||||
data structure is represented by the following structure:
|
||||
data structure is represented by the following layout:
|
||||
|
||||
```c
|
||||
typedef struct {
|
||||
@@ -39,7 +39,7 @@ At the time being, `Vector` supports the following methods:
|
||||
- `size_t vector_size(vector)`: return vector size (i.e., the number of elements);
|
||||
- `size_t vector_capacity(vector)`: return vector capacity (i.e., vector total size).
|
||||
|
||||
As you can see by the previous function signatures, most methods that operate
|
||||
As you can see from the previous function signatures, most methods that operate
|
||||
on the `Vector` data type return a custom type called `vector_result_t` which is
|
||||
defined as follows:
|
||||
|
||||
@@ -66,7 +66,7 @@ Each method that returns such type indicates whether the operation was successfu
|
||||
by setting the `status` field and by providing a descriptive message on the `message`
|
||||
field. If the operation was successful (that is, `status == VECTOR_OK`), you can either
|
||||
move on with the rest of the program or read the returned value from the sum data type. Of course, you can choose to
|
||||
ignore the return value (if you're brave enough :D) as illustrated in the first part of the README.
|
||||
ignore the return value (if you're brave enough :D) as illustrated in the first part of the README.
|
||||
|
||||
## Functional methods
|
||||
`Vector` provides three functional methods called `map`, `filter` and `reduce` which allow the caller to apply a computation to the vector,
|
||||
|
||||
62
src/string.c
62
src/string.c
@@ -522,14 +522,20 @@ string_result_t string_get_at(const string_t *str, size_t position) {
|
||||
*
|
||||
* Returns a string_result_t data type
|
||||
*/
|
||||
string_result_t string_set_at(string_t *str, size_t position, const char *utf8_char) {
|
||||
string_result_t string_set_at(const string_t *str, size_t position, const char *utf8_char) {
|
||||
string_result_t result = {0};
|
||||
|
||||
int new_len;
|
||||
|
||||
if (str == NULL || utf8_is_char_valid(utf8_char, &new_len) == 0) {
|
||||
if (str == NULL) {
|
||||
result.status = STRING_ERR_INVALID;
|
||||
SET_MSG(result, "Invalid index or character");
|
||||
SET_MSG(result, "Invalid string");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
int new_char_bytes;
|
||||
if (utf8_is_char_valid(utf8_char, &new_char_bytes) == 0) {
|
||||
result.status = STRING_ERR_INVALID_UTF8;
|
||||
SET_MSG(result, "Invalid UTF-8 character");
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -541,41 +547,49 @@ string_result_t string_set_at(string_t *str, size_t position, const char *utf8_c
|
||||
return result;
|
||||
}
|
||||
|
||||
char *pos = str->data;
|
||||
// Locate the byte offset of the character to replace
|
||||
const char *pos = str->data;
|
||||
for (size_t idx = 0; idx < position; idx++) {
|
||||
pos += utf8_char_len((unsigned char)*pos);
|
||||
}
|
||||
|
||||
int old_len = utf8_char_len((unsigned char)*pos);
|
||||
if (old_len == new_len) {
|
||||
memcpy(pos, utf8_char, new_len);
|
||||
} else {
|
||||
const size_t prefix_len = pos - str->data;
|
||||
const size_t suffix_len = str->byte_size - prefix_len - old_len;
|
||||
const size_t new_total = prefix_len + new_len + suffix_len;
|
||||
const int old_char_bytes = utf8_char_len((unsigned char)*pos);
|
||||
const size_t suffix_len = str->byte_size - prefix_len - old_char_bytes;
|
||||
const size_t new_total_bytes = prefix_len + new_char_bytes + suffix_len;
|
||||
|
||||
char *new_data = malloc(new_total + 1);
|
||||
if (new_data == NULL) {
|
||||
string_t *new_str = malloc(sizeof(string_t));
|
||||
if (new_str == NULL) {
|
||||
result.status = STRING_ERR_ALLOCATE;
|
||||
SET_MSG(result, "Cannot allocate memory");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
memcpy(new_data, str->data, prefix_len);
|
||||
memcpy(new_data + prefix_len, utf8_char, new_len);
|
||||
memcpy(new_data + prefix_len + new_len, pos + old_len, suffix_len);
|
||||
new_data[new_total] = '\0';
|
||||
new_str->data = malloc(new_total_bytes + 1);
|
||||
if (new_str->data == NULL) {
|
||||
free(new_str);
|
||||
result.status = STRING_ERR_ALLOCATE;
|
||||
SET_MSG(result, "Cannot allocate memory");
|
||||
|
||||
free(str->data);
|
||||
|
||||
str->data = new_data;
|
||||
str->byte_size = new_total;
|
||||
str->byte_capacity = new_total + 1;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Copy prefix data from original string
|
||||
memcpy(new_str->data, str->data, prefix_len);
|
||||
// Copy the new character at requested index
|
||||
memcpy(new_str->data + prefix_len, utf8_char, new_char_bytes);
|
||||
// Copy suffix data from the original string by skipping the overwritten character
|
||||
memcpy(new_str->data + prefix_len + new_char_bytes, pos + old_char_bytes, suffix_len);
|
||||
new_str->data[new_total_bytes] = '\0';
|
||||
|
||||
new_str->byte_size = new_total_bytes;
|
||||
new_str->byte_capacity = new_total_bytes + 1;
|
||||
new_str->char_count = str->char_count;
|
||||
|
||||
result.status = STRING_OK;
|
||||
SET_MSG(result, "Character successfully set");
|
||||
result.value.string = new_str;
|
||||
SET_MSG(result, "Symbol successfully set");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ string_result_t string_contains(const string_t *haystack, const string_t *needle
|
||||
string_result_t string_slice(const string_t *str, size_t start, size_t end);
|
||||
string_result_t string_eq(const string_t *x, const string_t *y, bool case_sensitive);
|
||||
string_result_t string_get_at(const string_t *str, size_t position);
|
||||
string_result_t string_set_at(string_t *str, size_t position, const char *utf8_char);
|
||||
string_result_t string_set_at(const string_t *str, size_t position, const char *utf8_char);
|
||||
string_result_t string_to_lower(const string_t *str);
|
||||
string_result_t string_to_upper(const string_t *str);
|
||||
string_result_t string_reverse(const string_t *str);
|
||||
@@ -59,7 +59,7 @@ string_result_t string_destroy(string_t *str);
|
||||
string_result_t string_split_destroy(string_t **split, size_t count);
|
||||
|
||||
// Inline methods
|
||||
static inline size_t string_len(const string_t *str) {
|
||||
static inline size_t string_size(const string_t *str) {
|
||||
return str ? str->char_count : 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ void test_string_new(void) {
|
||||
assert(res.status == STRING_OK);
|
||||
assert(res.value.string != NULL);
|
||||
assert(strcmp(res.value.string->data, "hello") == 0);
|
||||
assert(string_len(res.value.string) == 5);
|
||||
assert(string_size(res.value.string) == 5);
|
||||
assert(res.value.string->byte_size == 5);
|
||||
|
||||
string_destroy(res.value.string);
|
||||
@@ -33,7 +33,7 @@ void test_string_new_empty(void) {
|
||||
string_result_t res = string_new("");
|
||||
|
||||
assert(res.status == STRING_OK);
|
||||
assert(string_len(res.value.string) == 0);
|
||||
assert(string_size(res.value.string) == 0);
|
||||
assert(res.value.string->byte_size == 0);
|
||||
assert(res.value.string->data[0] == '\0');
|
||||
|
||||
@@ -62,7 +62,7 @@ void test_string_concat(void) {
|
||||
string_result_t res = string_concat(str1, str2);
|
||||
assert(res.status == STRING_OK);
|
||||
assert(strcmp(res.value.string->data, "Foo Bar") == 0);
|
||||
assert(string_len(res.value.string) == 7);
|
||||
assert(string_size(res.value.string) == 7);
|
||||
|
||||
string_destroy(str1);
|
||||
string_destroy(str2);
|
||||
@@ -155,9 +155,9 @@ void test_string_reverse_utf8(void) {
|
||||
string_result_t res = string_reverse(str);
|
||||
|
||||
assert(res.status == STRING_OK);
|
||||
assert(string_len(res.value.string) == 3);
|
||||
assert(string_size(res.value.string) == 3);
|
||||
assert(strcmp(res.value.string->data, "Z🌍A") == 0);
|
||||
assert(string_len(res.value.string) == 3);
|
||||
assert(string_size(res.value.string) == 3);
|
||||
|
||||
string_destroy(str);
|
||||
string_destroy(res.value.string);
|
||||
@@ -198,10 +198,29 @@ void test_string_set_at(void) {
|
||||
|
||||
// Replace 'B' with emoji
|
||||
string_result_t res = string_set_at(str, 1, "😆");
|
||||
string_t *altered = res.value.string;
|
||||
|
||||
assert(res.status == STRING_OK);
|
||||
assert(strcmp(str->data, "A😆C") == 0);
|
||||
assert(string_len(str) == 3);
|
||||
assert(str->byte_size == 6); // that is: A (1B) + emoji (4B) + C (1B)
|
||||
assert(strcmp(altered->data, "A😆C") == 0);
|
||||
assert(string_size(altered) == 3);
|
||||
assert(altered->byte_size == 6); // that is: A (1B) + emoji (4B) + C (1B)
|
||||
|
||||
string_destroy(str);
|
||||
string_destroy(altered);
|
||||
}
|
||||
|
||||
// Test mutation of invalid UTF-8 symbol
|
||||
void test_string_set_at_invalid_utf8(void) {
|
||||
string_t *str = string_new("ABC").value.string;
|
||||
|
||||
const char * const invalid_sym1 = "\xFF";
|
||||
const char * const invalid_sym2 = "\x80";
|
||||
|
||||
string_result_t res1 = string_set_at(str, 1, invalid_sym1);
|
||||
assert(res1.status == STRING_ERR_INVALID_UTF8);
|
||||
|
||||
string_result_t res2 = string_set_at(str, 1, invalid_sym2);
|
||||
assert(res2.status == STRING_ERR_INVALID_UTF8);
|
||||
|
||||
string_destroy(str);
|
||||
}
|
||||
@@ -298,6 +317,7 @@ int main(void) {
|
||||
TEST(string_get_at_overflow);
|
||||
TEST(string_set_at);
|
||||
TEST(string_set_at_overflow);
|
||||
TEST(string_set_at_invalid_utf8);
|
||||
TEST(string_to_lower);
|
||||
TEST(string_to_upper);
|
||||
TEST(string_trim);
|
||||
|
||||
7
usage.c
7
usage.c
@@ -543,7 +543,7 @@ int string_usage(void) {
|
||||
|
||||
string_t *str1 = res.value.string;
|
||||
printf("Created string: \"%s\"\n", str1->data);
|
||||
printf("Character count: %zu (%zu actual bytes)\n", string_len(str1), str1->byte_size);
|
||||
printf("Character count: %zu (%zu actual bytes)\n", string_size(str1), str1->byte_size);
|
||||
|
||||
string_result_t res_clone = string_clone(str1);
|
||||
if (res_clone.status != STRING_OK) {
|
||||
@@ -566,7 +566,7 @@ int string_usage(void) {
|
||||
|
||||
string_t *suffix = res_suffix.value.string;
|
||||
printf("Created another string: \"%s\"\n", suffix->data);
|
||||
printf("Character count: %zu (%zu actual bytes)\n\n", string_len(suffix), suffix->byte_size);
|
||||
printf("Character count: %zu (%zu actual bytes)\n\n", string_size(suffix), suffix->byte_size);
|
||||
|
||||
string_result_t res_cat = string_concat(str1, suffix);
|
||||
if (res_cat.status != STRING_OK) {
|
||||
@@ -656,7 +656,8 @@ int string_usage(void) {
|
||||
|
||||
return 1;
|
||||
}
|
||||
printf("Updated string: \"%s\"\n\n", concat_str->data);
|
||||
printf("Updated string: \"%s\"\n\n", res_set.value.string->data);
|
||||
string_destroy(res_set.value.string);
|
||||
|
||||
// Get character from string (the emoji)
|
||||
string_result_t res_get = string_get_at(concat_str, 14);
|
||||
|
||||
Reference in New Issue
Block a user