// ---------------------------------------------------------------------------
// File: src/string.c  (added as a new file, mode 100644)
// Private UTF-8 helpers used by the string module.
// ---------------------------------------------------------------------------

// Copy a C string into the fixed-size `message` buffer of a result object.
// snprintf always NUL-terminates, so over-long messages are truncated to
// RESULT_MSG_SIZE - 1 bytes instead of overflowing the buffer.
#define SET_MSG(result, msg) \
    do { \
        snprintf((char *)(result).message, RESULT_MSG_SIZE, "%s", (const char *)(msg)); \
    } while (0)

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include "string.h"

// Check if a character is a space.
// Matches exactly: space, \t, \n, \r, \f and \v.
static inline bool is_space(unsigned char c) {
    return (c == ' '  || c == '\t' ||
            c == '\n' || c == '\r' ||
            c == '\f' || c == '\v');
}

// Get the byte length of a UTF-8 sequence from its leading byte.
// Returns 1..4 for a valid leading byte, or -1 when `byte` cannot start a
// sequence (for example a continuation byte 10xxxxxx).
static inline int utf8_char_len(unsigned char byte) {
    if ((byte & 0x80) == 0x00) return 1;
    if ((byte & 0xE0) == 0xC0) return 2;
    if ((byte & 0xF0) == 0xE0) return 3;
    if ((byte & 0xF8) == 0xF0) return 4;

    return -1;
}

// Validate that `utf8_char` holds exactly one UTF-8 symbol followed by the
// NUL terminator.
//   utf8_char: NUL-terminated candidate symbol (NULL and "" are rejected)
//   out_len:   optional; receives the symbol's byte length on success
// Returns true only when the string is a single well-formed sequence.
static bool utf8_is_char_valid(const char *utf8_char, int *out_len) {
    if (utf8_char == NULL) {
        return false;
    }

    const unsigned char *bytes = (const unsigned char *)utf8_char;

    // An empty string holds no symbol; rejecting it here also keeps the
    // terminator check below from reading one byte past the end.
    if (bytes[0] == '\0') {
        return false;
    }

    // Keep the length signed: utf8_char_len reports failure as -1, which an
    // unsigned type would silently turn into a huge positive value.
    const int len = utf8_char_len(bytes[0]);
    if (len <= 0) {
        return false;
    }

    // A NUL byte fails the continuation test, so this loop never walks past
    // the terminator of a truncated sequence.
    for (int idx = 1; idx < len; idx++) {
        if ((bytes[idx] & 0xC0) != 0x80) {
            return false;
        }
    }

    // Anything after the first symbol means this is not a single character
    if (bytes[len] != '\0') {
        return false;
    }

    if (out_len) {
        *out_len = len;
    }

    return true;
}

// Validate a UTF-8 string and measure its byte length and character count.
//   str:            NUL-terminated input (NULL is rejected)
//   out_byte_size:  optional; receives the size in bytes without terminator
//   out_char_count: optional; receives the number of UTF-8 symbols
// Returns false on the first malformed or truncated sequence; the outputs
// are left untouched in that case.
static bool utf8_scan(const char *str, size_t *out_byte_size, size_t *out_char_count) {
    if (str == NULL) {
        return false;
    }

    const unsigned char *p = (const unsigned char *)str;
    size_t b_size = 0;
    size_t c_count = 0;

    while (p[b_size] != '\0') {
        // Signed on purpose, so the -1 error value is actually detected
        const int len = utf8_char_len(p[b_size]);
        if (len <= 0) {
            return false;
        }

        // A NUL byte is not a continuation byte, so a sequence cut short by
        // the terminator is rejected before any out-of-bounds read.
        for (int idx = 1; idx < len; idx++) {
            if ((p[b_size + idx] & 0xC0) != 0x80) {
                return false;
            }
        }

        b_size += (size_t)len;
        c_count++;
    }

    if (out_byte_size) {
        *out_byte_size = b_size;
    }
    if (out_char_count) {
        *out_char_count = c_count;
    }

    return true;
}

// Decode the UTF-8 symbol at the start of `str` to a codepoint.
//   str:      points at the leading byte of a symbol
//   char_len: receives the symbol's byte length, or -1 when the leading byte
//             is invalid or a continuation byte is missing/malformed
// Returns the decoded codepoint, or 0 when *char_len is -1.
static uint32_t utf8_decode(const char *str, int *char_len) {
    // Payload bits carried by the leading byte, indexed by sequence length
    static const unsigned char lead_mask[] = { 0x00, 0x7F, 0x1F, 0x0F, 0x07 };

    const unsigned char *bytes = (const unsigned char *)str;
    const int len = utf8_char_len(bytes[0]);

    *char_len = len;
    if (len < 0) {
        return 0;
    }

    uint32_t result = (uint32_t)(bytes[0] & lead_mask[len]);

    // Each continuation byte contributes six bits. Checking the 10xxxxxx
    // marker first stops at the NUL terminator of a truncated sequence.
    for (int idx = 1; idx < len; idx++) {
        if ((bytes[idx] & 0xC0) != 0x80) {
            *char_len = -1;
            return 0;
        }
        result = (result << 6) | (uint32_t)(bytes[idx] & 0x3F);
    }

    return result;
}

// Encode a codepoint to a UTF-8 symbol.
//   codepoint: value to encode, at most 0x10FFFF
//   out:       destination with room for up to 4 bytes; no terminator is
//              written
// Returns the number of bytes written (1..4), or 0 when the codepoint is
// above 0x10FFFF. Surrogate values (0xD800..0xDFFF) are not rejected here.
static int utf8_encode(uint32_t codepoint, char *out) {
    if (codepoint <= 0x7F) {
        out[0] = (char)codepoint;

        return 1;
    }

    if (codepoint <= 0x7FF) {
        out[0] = (char)(0xC0 | (codepoint >> 6));
        out[1] = (char)(0x80 | (codepoint & 0x3F));

        return 2;
    }

    if (codepoint <= 0xFFFF) {
        out[0] = (char)(0xE0 | (codepoint >> 12));
        out[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        out[2] = (char)(0x80 | (codepoint & 0x3F));

        return 3;
    }

    if (codepoint <= 0x10FFFF) {
        out[0] = (char)(0xF0 | (codepoint >> 18));
        out[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
        out[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        out[3] = (char)(0x80 | (codepoint & 0x3F));

        return 4;
    }

    return 0;
}

// ---------------------------------------------------------------------------
// File: src/string.h  (added as a new file, mode 100644)
// Public types and prototypes of the string module.
// ---------------------------------------------------------------------------

#ifndef STRING_H
#define STRING_H

// Capacity in bytes, terminator included, of string_result_t.message
#define RESULT_MSG_SIZE 64

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Status code carried by every string_result_t
typedef enum {
    STRING_OK = 0x0,
    STRING_ERR_ALLOCATE,
    STRING_ERR_INVALID,
    STRING_ERR_INVALID_UTF8,
    STRING_ERR_OVERFLOW
} string_status_t;

typedef struct {
    char *data;
    size_t byte_size;  // Size in bytes minus the NUL terminator
    size_t byte_cap;   // Total allocated memory
    size_t char_count; // Number of symbols
} string_t;

// Result object returned by the public APIs: a status, a message buffer and
// a payload whose active union member depends on the function called.
typedef struct {
    string_status_t status;
    uint8_t message[RESULT_MSG_SIZE];
    union {
        string_t *string;      // For new, clone, slice, reverse, trim
        char *symbol;          // For get_at
        bool is_eq;            // For comparison
        struct {               // For split
            string_t **strings;
            size_t count;
        } split;
    } value;
} string_result_t;

#ifdef __cplusplus
extern "C" {
#endif

// Public APIs
string_result_t string_new(const char *c_str);
// NOTE(review): this prototype was spelled `string_close`; the result union
// documents a "clone" operation, so the name is corrected here. Confirm
// against the implementation.
string_result_t string_clone(const string_t *str);
string_result_t string_concat(const string_t *x, const string_t *y);
string_result_t string_contains(const string_t *haystack, const string_t *needle);
string_result_t string_slice(const string_t *str, size_t start, size_t end);
string_result_t string_eq(const string_t *x, const string_t *y, bool case_sensitive);
string_result_t string_get_at(const string_t *str, size_t position);
string_result_t string_set_at(const string_t *str, size_t position, const char *utf8_char);
string_result_t string_to_lower(const string_t *str);
string_result_t string_to_upper(const string_t *str);
string_result_t string_reverse(const string_t *str);
string_result_t string_trim(const string_t *str);
string_result_t string_split(const string_t *str, const char *delim);
string_result_t string_destroy(string_t *str);
string_result_t string_split_destroy(string_t **split, size_t count);

// Inline methods

// Number of symbols stored in `str`; 0 when `str` is NULL
static inline size_t string_size(const string_t *str) {
    return str ? str->char_count : 0;
}

#ifdef __cplusplus
}
#endif

#endif // STRING_H