libu8
u8stringfns.h File Reference

These functions provide utilities over UTF-8 strings. More...

Macros

#define u8_bytelen(string)   (strlen(string))
 Returns the number of bytes in a UTF-8 string. More...
 

Functions

U8_EXPORT u8_string u8_upcase (u8_string string)
 Returns an uppercase version of a UTF-8 string. More...
 
U8_EXPORT u8_string u8_downcase (u8_string string)
 Returns a lowercase version of a UTF-8 string. More...
 
U8_EXPORT int u8_has_prefix (u8_string string, u8_string prefix, int casefold)
 Determines if a string has a particular prefix. More...
 
U8_EXPORT int u8_has_suffix (u8_string string, u8_string suffix, int casefold)
 Determines if a string has a particular suffix. More...
 
U8_EXPORT u8_string u8_strchrs (u8_string s, u8_string chars, int order)
 Searches for a character in a string. More...
 
U8_EXPORT u8_string u8_strstrs (u8_string s, u8_string strings[], int order)
 Searches for a set of strings in a string. More...
 
U8_EXPORT char * u8_itoa10 (long long int n, char buf[32])
 Generates a base-ten representation of a long long int. This should be safe to use in, for example, signal handlers, where printf is verboten. More...
 
U8_EXPORT char * u8_uitoa10 (unsigned long long int n, char buf[32])
 Generates a base-ten representation of an unsigned long long int. This should be safe to use in, for example, signal handlers, where printf is verboten. More...
 
U8_EXPORT char * u8_uitoa8 (unsigned long long int n, char buf[32])
 Generates a base-eight representation of an unsigned long long int. This should be safe to use in, for example, signal handlers, where printf is verboten. More...
 
U8_EXPORT char * u8_uitoa16 (unsigned long long int n, char buf[32])
 Generates a base-sixteen (hexadecimal) representation of an unsigned long long int. This should be safe to use in, for example, signal handlers, where printf is verboten. More...
 
U8_EXPORT u8_string u8_decompose (u8_string string)
 Returns a decomposed version of a UTF-8 string. More...
 
U8_EXPORT u8_string u8_string_append (u8_string first_string,...)
 Appends together any number of UTF-8 strings. This takes any number of UTF-8 strings, finishing with a NULL pointer, and returns the result of appending them together.
 
U8_EXPORT u8_string u8_string_subst (u8_string input, u8_string key, u8_string replace)
 Substitutes one string for another within its input. This takes an input string, a key string, and a replacement string. More...
 
U8_EXPORT u8_string u8_slice (const u8_byte *start, const u8_byte *end)
 Extracts and copies a substring of a UTF-8 string. More...
 
U8_EXPORT int u8_strlen (u8_string string)
 Returns the number of characters in a UTF-8 string. More...
 
U8_EXPORT int u8_strlen_x (u8_string string, int len)
 Returns the number of characters in a UTF-8 string with an explicit length. More...
 
U8_EXPORT u8_string u8_substring (u8_string string, int i)
 Returns a pointer into string starting at the ith character. More...
 
U8_EXPORT int u8_string_ref (u8_string strptr)
 Returns the first codepoint in strptr. More...
 
U8_EXPORT int u8_validptr (const u8_byte *s)
 Checks if the start of a string is a valid UTF-8 representation. More...
 
U8_EXPORT int u8_validp (u8_string s)
 Checks if a string is a valid UTF-8 representation. More...
 
U8_EXPORT int u8_validate (u8_string s, int n)
 Checks if the n bytes starting at s are a valid UTF-8 string. More...
 
U8_EXPORT u8_string u8_valid_copy (u8_string s)
 Checks the validity of a UTF-8 string and copies it. More...
 
U8_EXPORT u8_string u8_convert_crlfs (u8_string s)
 Checks the validity of a UTF-8 string and copies it, converting CRLFs. More...
 
U8_EXPORT u8_string u8_indent_text (u8_string input, u8_string indent)
 Adds indentation at the beginning of every line within a string. More...
 
U8_EXPORT char * u8_grab_bytes (u8_string s, int n, char *buf)
 Returns an ascii-converted substring of a possible UTF-8 string. More...
 

Detailed Description

These functions provide utilities over UTF-8 strings.

Many of these work generically over NUL-terminated strings but some are particular to UTF-8.

Macro Definition Documentation

#define u8_bytelen (   string)    (strlen(string))

Returns the number of bytes in a UTF-8 string.

This is just an alias for the C library function strlen().

Parameters
string: a UTF-8 string
Returns
the number of bytes in the string

Referenced by u8_string_subst().

Function Documentation

U8_EXPORT u8_string u8_convert_crlfs ( u8_string  s)

Checks the validity of a UTF-8 string and copies it, converting CRLFs.

Parameters
s: a possibly (probably) valid UTF-8 string.
Returns
a valid UTF-8 string or NULL

References U8_INIT_STATIC_OUTPUT.

U8_EXPORT u8_string u8_decompose ( u8_string  string)

Returns a decomposed version of a UTF-8 string.

Parameters
string: a UTF-8 string
Returns
a UTF-8 string with all composed characters broken down

References U8_INIT_STATIC_OUTPUT.

U8_EXPORT u8_string u8_downcase ( u8_string  string)

Returns a lowercase version of a UTF-8 string.

Parameters
string: a UTF-8 string
Returns
a UTF-8 string with all uppercase characters converted to lowercase.

References U8_INIT_STATIC_OUTPUT, and u8_tolower.

Referenced by u8_do_printf().

U8_EXPORT char* u8_grab_bytes ( u8_string  s,
int  n,
char *  buf 
)

Returns an ascii-converted substring of a possible UTF-8 string.

Parameters
s: a possibly (probably) valid UTF-8 string.
n: the size (in bytes) of the substring to return
buf: an optional buffer to use (otherwise, one is malloc'd)
Returns
an ASCII string, possibly with %-encoded bytes
U8_EXPORT int u8_has_prefix ( u8_string  string,
u8_string  prefix,
int  casefold 
)

Determines if a string has a particular prefix.

Parameters
string: a UTF-8 string
prefix: a UTF-8 string
casefold: an int (1 or 0)
Returns
1 if string starts with prefix, 0 otherwise. If casefold is not zero, case is ignored.

References u8_strlen().

U8_EXPORT int u8_has_suffix ( u8_string  string,
u8_string  suffix,
int  casefold 
)

Determines if a string has a particular suffix.

Parameters
string: a UTF-8 string
suffix: a UTF-8 string
casefold: an int (1 or 0)
Returns
1 if string ends with suffix, 0 otherwise. If casefold is not zero, case is ignored.

References u8_strlen().

U8_EXPORT u8_string u8_indent_text ( u8_string  input,
u8_string  indent 
)

Adds indentation at the beginning of every line within a string.

Parameters
input: an original string, possibly with newlines
indent: the indent string
Returns
a copy of the original string with the indent string inserted before every '\n'

Referenced by u8_set_logindent().

U8_EXPORT char* u8_itoa10 ( long long int  n,
char  buf[32] 
)

Generates a base-ten representation of a long long int. This should be safe to use in, for example, signal handlers, where printf is verboten.

Parameters
n: a long long int (may be automatically cast up, of course)
buf: a static 32-byte buffer, per the prototype's char buf[32] (long enough to contain the largest N)
Returns
pointer to that buffer
U8_EXPORT u8_string u8_slice ( const u8_byte *  start,
const u8_byte *  end 
)

Extracts and copies a substring of a UTF-8 string.

Parameters
start: a pointer into a UTF-8 string
end: a pointer into a later location in the same string
Returns
a UTF-8 string extracted from between the two pointers

Referenced by u8_read_base16().

U8_EXPORT u8_string u8_strchrs ( u8_string  s,
u8_string  chars,
int  order 
)

Searches for a character in a string.

Parameters
s: a UTF-8 string (called 'string' in the description below)
chars: a sequence of characters in a UTF-8 string
order: an integer indicating which match is returned
Returns
a substring of 'string' or NULL

This returns a pointer to the place in 'string' where any of the characters in 'chars' is found, or NULL if none of them are found. If order is 0, return the string based on the first character (in 'chars') to match; otherwise, priority is based on where the match occurs in the searched 'string'.

If order is > 0, this returns the earliest first occurrence; if order is < 0, this returns the furthest first result.

Referenced by u8_readlink().

U8_EXPORT int u8_string_ref ( u8_string  strptr)

Returns the first codepoint in strptr.

Parameters
strptr: a pointer into a UTF-8 string
Returns
the unicode code point at the pointer.
U8_EXPORT u8_string u8_string_subst ( u8_string  input,
u8_string  key,
u8_string  replace 
)

Substitutes one string for another within its input. This takes an input string, a key string, and a replacement string.

It returns a copy of the input string with the replacement string substituted for all occurrences of the key string. Note that this does not do any UTF-8 normalization.

References u8_bytelen, and U8_INIT_STATIC_OUTPUT.

U8_EXPORT int u8_strlen ( u8_string  string)

Returns the number of characters in a UTF-8 string.

This counts the number of unicode codepoints, so combining characters are counted as separate characters. This assumes that the string is NUL terminated; to count characters given a particular end pointer, use u8_strlen_x().

Parameters
string: a UTF-8 string
Returns
the number of characters (codepoints) in the string

Referenced by u8_has_prefix(), and u8_has_suffix().

U8_EXPORT int u8_strlen_x ( u8_string  string,
int  len 
)

Returns the number of characters in a UTF-8 string with an explicit length.

This counts the number of unicode codepoints, so combining characters are counted as separate characters.

Parameters
string: a UTF-8 string
len: the number of bytes in the string to be measured
Returns
the number of characters (codepoints) in the string
U8_EXPORT u8_string u8_strstrs ( u8_string  s,
u8_string  strings[],
int  order 
)

Searches for a set of strings in a string.

Parameters
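s: a UTF-8 string (called 'string' in the description below)
strings: a NUL-terminated array of UTF-8 strings (description truncated in the original; presumably the array ends with a NULL pointer)
order: an int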
Returns
a substring of 'string' or NULL

This returns a place in 'string' where any of the strings in 'strings' occurs; order determines which match is returned (when there are several). If order is 0, priority is based on the order of the strings in 'strings', otherwise, priority is based on where the match occurs in the searched 'string'.

If order is > 0, this returns the earliest first occurrence; if order is < 0, this returns the furthest first result.

U8_EXPORT u8_string u8_substring ( u8_string  string,
int  i 
)

Returns a pointer into string starting at the ith character.

This does not copy its result, so the returned string shares memory with the string.

Parameters
string: a UTF-8 string
i: how many characters in to start the string
Returns
a substring (not copied)
U8_EXPORT char* u8_uitoa10 ( unsigned long long int  n,
char  buf[32] 
)

Generates a base-ten representation of an unsigned long long int. This should be safe to use in, for example, signal handlers, where printf is verboten.

Parameters
n: an unsigned long long int (may be automatically cast up, of course)
buf: a static 32-byte buffer, per the prototype's char buf[32] (long enough to contain the largest N)
Returns
pointer to that buffer
U8_EXPORT char* u8_uitoa16 ( unsigned long long int  n,
char  buf[32] 
)

Generates a base-sixteen (hexadecimal) representation of an unsigned long long int. This should be safe to use in, for example, signal handlers, where printf is verboten.

Parameters
n: an unsigned long long int (may be automatically cast up, of course)
buf: a static 32-byte buffer, per the prototype's char buf[32] (long enough to contain the largest N)
Returns
pointer to that buffer
U8_EXPORT char* u8_uitoa8 ( unsigned long long int  n,
char  buf[32] 
)

Generates a base-eight representation of an unsigned long long int. This should be safe to use in, for example, signal handlers, where printf is verboten.

Parameters
n: an unsigned long long int (may be automatically cast up, of course)
buf: a static 32-byte buffer, per the prototype's char buf[32] (long enough to contain the largest N)
Returns
pointer to that buffer
U8_EXPORT u8_string u8_upcase ( u8_string  string)

Returns an uppercase version of a UTF-8 string.

Parameters
string: a UTF-8 string
Returns
a UTF-8 string with all lowercase characters converted to uppercase.

References U8_INIT_STATIC_OUTPUT, and u8_toupper.

Referenced by u8_do_printf().

U8_EXPORT u8_string u8_valid_copy ( u8_string  s)

Checks the validity of a UTF-8 string and copies it.

Parameters
s: a possibly (probably) valid UTF-8 string.
Returns
a valid UTF-8 string or NULL

References U8_INIT_STATIC_OUTPUT.

Referenced by u8_abspath(), u8_dirname(), u8_fromlibc(), and u8_tolibc().

U8_EXPORT int u8_validate ( u8_string  s,
int  n 
)

Checks if the n bytes starting at s are a valid UTF-8 string.

Parameters
s: a possible UTF-8 string
n: the number of bytes in the string
Returns
1 if the pointer refers to a valid UTF-8 sequence, 0 otherwise

Referenced by u8_peekc().

U8_EXPORT int u8_validp ( u8_string  s)

Checks if a string is a valid UTF-8 representation.

Parameters
s: a possible UTF-8 string
Returns
1 if the string is a valid UTF-8 string, 0 otherwise
U8_EXPORT int u8_validptr ( const u8_byte *  s)

Checks if the start of a string is a valid UTF-8 representation.

This checks only for a single character representation.

Parameters
s: a possible UTF-8 string
Returns
1 if the pointer refers to a valid UTF-8 sequence, 0 otherwise

Referenced by u8_guess_encoding().