|
libu8
|
These functions and macros interrogate and transform unicode points. More...
Data Structures | |
| struct | U8_DECOMPOSITION |
| struct U8_DECOMPOSITION indicates a mapping between a single Unicode codepoint and an equivalent Unicode sequence. More... | |
| struct | U8_CHARINFO_TABLE |
| struct U8_CHARINFO_TABLE is used to store additional character info not provided by the statically defined tables. More... | |
Defines | |
| #define | u8_isalpha(c) ((c>=0) && ((u8_getcharinfo(c)) < 6)) |
| Returns 1 if its argument is an alphabetic unicode point. | |
| #define | u8_islower(c) ((c>=0) && ((u8_getcharinfo(c)) == U8_LOWER_LETTER)) |
| Returns 1 if its argument is a lower-case alphabetic unicode point. | |
| #define | u8_isupper(c) |
| Returns 1 if its argument is an upper-case alphabetic unicode point. | |
| #define | u8_ismodifier(c) ((c>=0) && ((u8_getcharinfo(c)) == U8_MODIFIER_LETTER)) |
| Returns 1 if its argument is a modifier unicode point. | |
| #define | u8_isdigit(c) ((c>=0) && ((u8_getcharinfo(c)) == U8_NUMBER)) |
| Returns 1 if its argument is a numeric digit unicode point. | |
| #define | u8_ispunct(c) |
| Returns 1 if its argument is a punctuation character. | |
| #define | u8_isprint(c) |
| Returns 1 if its argument is a printing character (letter, digit, or punctuation). | |
| #define | u8_isspace(c) ((c>=0) && ((u8_getcharinfo(c)) == U8_SEPARATOR)) |
| Returns 1 if its argument is a whitespace unicode point. | |
| #define | u8_ishspace(c) |
| Returns 1 if its argument is a horizontal whitespace unicode point. | |
| #define | u8_isvspace(c) |
| Returns 1 if its argument is a vertical whitespace unicode point. | |
| #define | u8_isctrl(c) ((c>=0) && ((c<0x20) || ((c>0x7e) && (c<0x9f)))) |
| Returns 1 if its argument is a standard control character. | |
| #define | u8_isalnum(c) ((c>=0) && (((u8_getcharinfo(c)) < 6) || (u8_isdigit(c)))) |
| Returns 1 if its argument is an alphanumeric unicode point. | |
| #define | u8_isxdigit(c) ((c>=0) && ((c<128) && (isxdigit(c)))) |
| Returns 1 if its argument is an ASCII hex digit. | |
| #define | u8_isodigit(c) ((c>=0) && ((c<128) && (isdigit(c)) && (c<'8'))) |
| Returns 1 if its argument is an ASCII octal digit. | |
| #define | u8_toupper(c) |
| Returns a non-lowercase version of a unicode code point. | |
| #define | u8_tolower(c) |
| Returns a non-uppercase version of a unicode code point. | |
| #define | u8_digit_weight(c) ((u8_isdigit(c)) ? ((c<0x10000) ? (u8_getchardata(c)) : (u8_lookup_chardata(c))) : (0)) |
| Returns the numeric weight of a numeric unicode code point. | |
Typedefs | |
| typedef struct U8_DECOMPOSITION | U8_DECOMPOSITION |
| struct U8_DECOMPOSITION indicates a mapping between a single Unicode codepoint and an equivalent Unicode sequence. | |
Functions | |
| U8_EXPORT int | u8_entity2code (u8_string name) |
| Converts an XML entity name into the corresponding code point. | |
| U8_EXPORT u8_string | u8_code2entity (int code) |
| Converts a code point into an XML entity name. | |
| U8_EXPORT int | u8_parse_entity (u8_byte *entity, u8_byte **endp) |
| Parses a unicode entity name from a string, recording the endpoint. | |
| U8_EXPORT void | u8_set_charinfo (int n, unsigned char *info, short *data) |
| Sets the character information for a particular code point. | |
These functions and macros interrogate and transform unicode points.
They include standard character predicates (u8_isspace, u8_ispunct, etc) as well as function/macros for changing case and converting to and from XML character entities.
| #define u8_digit_weight | ( | c | ) | ((u8_isdigit(c)) ? ((c<0x10000) ? (u8_getchardata(c)) : (u8_lookup_chardata(c))) : (0)) |
Returns the numeric weight of a numeric unicode code point.
| #define u8_isalnum | ( | c | ) | ((c>=0) && (((u8_getcharinfo(c)) < 6) || (u8_isdigit(c)))) |
Returns 1 if its argument is an alphanumeric unicode point.
| #define u8_isalpha | ( | c | ) | ((c>=0) && ((u8_getcharinfo(c)) < 6)) |
Returns 1 if its argument is an alphabetic unicode point.
| #define u8_isctrl | ( | c | ) | ((c>=0) && ((c<0x20) || ((c>0x7e) && (c<0x9f)))) |
Returns 1 if its argument is a standard control character.
| #define u8_isdigit | ( | c | ) | ((c>=0) && ((u8_getcharinfo(c)) == U8_NUMBER)) |
Returns 1 if its argument is a numeric digit unicode point.
| #define u8_ishspace | ( | c | ) |
((c>=0) && \
((c==' ')||(c=='\t')||(c==0x1680)||(c==0x180e)|| \
((c>=0x2000)&&(c<0x200b))))
Returns 1 if its argument is a horizontal whitespace unicode point.
| #define u8_islower | ( | c | ) | ((c>=0) && ((u8_getcharinfo(c)) == U8_LOWER_LETTER)) |
Returns 1 if its argument is a lower-case alphabetic unicode point.
| #define u8_ismodifier | ( | c | ) | ((c>=0) && ((u8_getcharinfo(c)) == U8_MODIFIER_LETTER)) |
Returns 1 if its argument is a modifier unicode point.
| #define u8_isodigit | ( | c | ) | ((c>=0) && ((c<128) && (isdigit(c)) && (c<'8'))) |
Returns 1 if its argument is an ASCII octal digit.
| #define u8_ispunct | ( | c | ) |
((c>=0) && \
(((u8_getcharinfo(c)) == U8_GLUE_PUNCTUATION) || \
((u8_getcharinfo(c)) == U8_BREAK_PUNCTUATION) || \
((u8_getcharinfo(c)) == U8_SYMBOL) || \
((u8_getcharinfo(c)) == U8_MARK)))
Returns 1 if its argument is a punctuation character.
| #define u8_isspace | ( | c | ) | ((c>=0) && ((u8_getcharinfo(c)) == U8_SEPARATOR)) |
Returns 1 if its argument is a whitespace unicode point.
| #define u8_isupper | ( | c | ) |
((c>=0) && \
(((u8_getcharinfo(c)) == U8_UPPER_LETTER) || \
((u8_getcharinfo(c)) == U8_TITLE_LETTER)))
Returns 1 if its argument is an upper-case alphabetic unicode point.
| #define u8_isvspace | ( | c | ) |
((c>=0) && \
((c=='\n')||(c=='\r')||(c==0x0c)||(c==0x0b)|| \
(c==0x1C)||(c==0x1D)||(c==0x1E)||(c==0x1F)|| \
(c==0x85)||(c==0x2029)))
Returns 1 if its argument is a vertical whitespace unicode point.
| #define u8_isxdigit | ( | c | ) | ((c>=0) && ((c<128) && (isxdigit(c)))) |
Returns 1 if its argument is an ASCII hex digit.
| #define u8_tolower | ( | c | ) |
((u8_isupper(c)) ? \
((c<0x10000) ? (c+(u8_getchardata(c))) : (u8_lookup_chardata(c))) : \
(c))
Returns a non-uppercase version of a unicode code point.
| #define u8_toupper | ( | c | ) |
((u8_islower(c)) ? \
((c<0x10000) ? (c+(u8_getchardata(c))) : (u8_lookup_chardata(c))) : \
(c))
Returns a non-lowercase version of a unicode code point.
| U8_EXPORT u8_string u8_code2entity | ( | int | code | ) |
Converts a code point into an XML entity name.
This falls back to hex codes if necessary.
| code | a unicode code point |
| U8_EXPORT int u8_entity2code | ( | u8_string | name | ) |
Converts an XML entity name into the corresponding code point.
This returns -1 for unrecognized or invalid entity names.
| name | a utf-8 (ASCII) name. |
| U8_EXPORT int u8_parse_entity | ( | u8_byte * | entity, |
| u8_byte ** | endp | ||
| ) |
Parses a unicode entity name from a string, recording the endpoint.
This is handed a pointer to a UTF-8 string (entity) just after the entity escape character ampersand ('&'). It parses an entity name, returning the corresponding code and storing the end of the entity (after the trailing semicolon (';')) in endp. If endp is NULL, the end result is not stored.
| entity | a pointer into a UTF-8 string |
| endp | a pointer to a location to store the end of the entity |
| U8_EXPORT void u8_set_charinfo | ( | int | n, |
| unsigned char * | info, | ||
| short * | data | ||
| ) |
Sets the character information for a particular code point.
| n | a Unicode code point |
| info | a string describing information about the character |
| data | a pointer to a short vector of data about the character |
1.7.4