| File src/utils/utf8proc.c changed (mode: 100644) (index 4f2d4d01d..eb25204c8) |
| ... |
... |
UTF8PROC_DLLEXPORT const char *utf8proc_version(void) { |
| 101 |
101 |
} |
} |
| 102 |
102 |
|
|
| 103 |
103 |
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) { |
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) { |
| 104 |
|
return "16.0.0"; |
|
|
104 |
|
return "17.0.0"; |
| 105 |
105 |
} |
} |
| 106 |
106 |
|
|
| 107 |
107 |
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { |
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { |
| |
| ... |
... |
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqinde |
| 388 |
388 |
for (; len >= 0; entry++, len--) { |
for (; len >= 0; entry++, len--) { |
| 389 |
389 |
utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry); |
utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry); |
| 390 |
390 |
|
|
| 391 |
|
written += utf8proc_decompose_char( |
|
| 392 |
|
entry_cp, |
|
| 393 |
|
(dst != NULL) ? (dst + written) : NULL, |
|
| 394 |
|
(bufsize > written) ? (bufsize - written) : 0, |
|
| 395 |
|
options, |
|
| 396 |
|
last_boundclass |
|
| 397 |
|
); |
|
|
391 |
|
written += utf8proc_decompose_char(entry_cp, dst ? dst+written : dst, |
|
392 |
|
(bufsize > written) ? (bufsize - written) : 0, options, |
|
393 |
|
last_boundclass); |
| 398 |
394 |
if (written < 0) return UTF8PROC_ERROR_OVERFLOW; |
if (written < 0) return UTF8PROC_ERROR_OVERFLOW; |
| 399 |
395 |
} |
} |
| 400 |
396 |
return written; |
return written; |
| |
| ... |
... |
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { |
| 451 |
447 |
|
|
| 452 |
448 |
#define utf8proc_decompose_lump(replacement_uc) \ |
#define utf8proc_decompose_lump(replacement_uc) \ |
| 453 |
449 |
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ |
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ |
| 454 |
|
options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass) |
|
|
450 |
|
(utf8proc_option_t)(options & ~(unsigned int)UTF8PROC_LUMP), last_boundclass) |
| 455 |
451 |
|
|
| 456 |
452 |
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { |
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { |
| 457 |
453 |
const utf8proc_property_t *property; |
const utf8proc_property_t *property; |
| |
| ... |
... |
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( |
| 582 |
578 |
uc = custom_func(uc, custom_data); /* user-specified custom mapping */ |
uc = custom_func(uc, custom_data); /* user-specified custom mapping */ |
| 583 |
579 |
} |
} |
| 584 |
580 |
decomp_result = utf8proc_decompose_char( |
decomp_result = utf8proc_decompose_char( |
| 585 |
|
uc, |
|
| 586 |
|
(buffer != NULL) ? (buffer + wpos) : NULL, |
|
| 587 |
|
(bufsize > wpos) ? (bufsize - wpos) : 0, |
|
| 588 |
|
options, |
|
|
581 |
|
uc, buffer ? buffer+wpos : buffer, (bufsize > wpos) ? (bufsize - wpos) : 0, options, |
| 589 |
582 |
&boundclass |
&boundclass |
| 590 |
583 |
); |
); |
| 591 |
584 |
if (decomp_result < 0) return decomp_result; |
if (decomp_result < 0) return decomp_result; |
| |
| ... |
... |
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( |
| 602 |
595 |
utf8proc_int32_t uc1, uc2; |
utf8proc_int32_t uc1, uc2; |
| 603 |
596 |
const utf8proc_property_t *property1, *property2; |
const utf8proc_property_t *property1, *property2; |
| 604 |
597 |
uc1 = buffer[pos]; |
uc1 = buffer[pos]; |
|
598 |
|
if (uc1 < 0) { |
|
599 |
|
/* skip grapheme break */ |
|
600 |
|
pos++; |
|
601 |
|
continue; |
|
602 |
|
} |
| 605 |
603 |
uc2 = buffer[pos+1]; |
uc2 = buffer[pos+1]; |
|
604 |
|
if (uc2 < 0) { |
|
605 |
|
/* cannot recombine; skip grapheme break */ |
|
606 |
|
pos+=2; |
|
607 |
|
continue; |
|
608 |
|
} |
| 606 |
609 |
property1 = unsafe_get_property(uc1); |
property1 = unsafe_get_property(uc1); |
| 607 |
610 |
property2 = unsafe_get_property(uc2); |
property2 = unsafe_get_property(uc2); |
| 608 |
611 |
if (property1->combining_class > property2->combining_class && |
if (property1->combining_class > property2->combining_class && |
| |
| ... |
... |
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b |
| 681 |
684 |
(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { |
(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { |
| 682 |
685 |
utf8proc_int32_t hangul_tindex; |
utf8proc_int32_t hangul_tindex; |
| 683 |
686 |
hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; |
hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; |
| 684 |
|
if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { |
|
|
687 |
|
if (hangul_tindex > 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { |
| 685 |
688 |
*starter += hangul_tindex; |
*starter += hangul_tindex; |
| 686 |
689 |
starter_property = NULL; |
starter_property = NULL; |
| 687 |
690 |
continue; |
continue; |
| |
| ... |
... |
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b |
| 695 |
698 |
int len = starter_property->comb_length; |
int len = starter_property->comb_length; |
| 696 |
699 |
utf8proc_int32_t max_second = utf8proc_combinations_second[idx + len - 1]; |
utf8proc_int32_t max_second = utf8proc_combinations_second[idx + len - 1]; |
| 697 |
700 |
if (current_char <= max_second) { |
if (current_char <= max_second) { |
|
701 |
|
int off; |
| 698 |
702 |
// TODO: binary search? arithmetic search? |
// TODO: binary search? arithmetic search? |
| 699 |
|
for (int off = 0; off < len; ++off) { |
|
|
703 |
|
for (off = 0; off < len; ++off) { |
| 700 |
704 |
utf8proc_int32_t second = utf8proc_combinations_second[idx + off]; |
utf8proc_int32_t second = utf8proc_combinations_second[idx + off]; |
| 701 |
705 |
if (current_char < second) { |
if (current_char < second) { |
| 702 |
706 |
/* not found */ |
/* not found */ |
| |
| ... |
... |
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( |
| 796 |
800 |
|
|
| 797 |
801 |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { |
| 798 |
802 |
utf8proc_uint8_t *retval; |
utf8proc_uint8_t *retval; |
| 799 |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
| 800 |
|
UTF8PROC_DECOMPOSE); |
|
|
803 |
|
utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
804 |
|
UTF8PROC_DECOMPOSE)); |
| 801 |
805 |
return retval; |
return retval; |
| 802 |
806 |
} |
} |
| 803 |
807 |
|
|
| 804 |
808 |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { |
| 805 |
809 |
utf8proc_uint8_t *retval; |
utf8proc_uint8_t *retval; |
| 806 |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
| 807 |
|
UTF8PROC_COMPOSE); |
|
|
810 |
|
utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
811 |
|
UTF8PROC_COMPOSE)); |
| 808 |
812 |
return retval; |
return retval; |
| 809 |
813 |
} |
} |
| 810 |
814 |
|
|
| 811 |
815 |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { |
| 812 |
816 |
utf8proc_uint8_t *retval; |
utf8proc_uint8_t *retval; |
| 813 |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
| 814 |
|
UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); |
|
|
817 |
|
utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
818 |
|
UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT)); |
| 815 |
819 |
return retval; |
return retval; |
| 816 |
820 |
} |
} |
| 817 |
821 |
|
|
| 818 |
822 |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { |
| 819 |
823 |
utf8proc_uint8_t *retval; |
utf8proc_uint8_t *retval; |
| 820 |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
| 821 |
|
UTF8PROC_COMPOSE | UTF8PROC_COMPAT); |
|
|
824 |
|
utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
825 |
|
UTF8PROC_COMPOSE | UTF8PROC_COMPAT)); |
| 822 |
826 |
return retval; |
return retval; |
| 823 |
827 |
} |
} |
| 824 |
828 |
|
|
| 825 |
829 |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) { |
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) { |
| 826 |
830 |
utf8proc_uint8_t *retval; |
utf8proc_uint8_t *retval; |
| 827 |
|
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
| 828 |
|
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE); |
|
|
831 |
|
utf8proc_map(str, 0, &retval, (utf8proc_option_t)(UTF8PROC_NULLTERM | UTF8PROC_STABLE | |
|
832 |
|
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE)); |
| 829 |
833 |
return retval; |
return retval; |
| 830 |
834 |
} |
} |
| File src/utils/utf8proc.h changed (mode: 100644) (index 039da7690..3893f6f91) |
| 71 |
71 |
/** The MAJOR version number (increased when backwards API compatibility is broken). */ |
/** The MAJOR version number (increased when backwards API compatibility is broken). */ |
| 72 |
72 |
#define UTF8PROC_VERSION_MAJOR 2 |
#define UTF8PROC_VERSION_MAJOR 2 |
| 73 |
73 |
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ |
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ |
| 74 |
|
#define UTF8PROC_VERSION_MINOR 10 |
|
|
74 |
|
#define UTF8PROC_VERSION_MINOR 11 |
| 75 |
75 |
/** The PATCH version (increased for fixes that do not change the API). */ |
/** The PATCH version (increased for fixes that do not change the API). */ |
| 76 |
|
#define UTF8PROC_VERSION_PATCH 0 |
|
|
76 |
|
#define UTF8PROC_VERSION_PATCH 2 |
| 77 |
77 |
/** @} */ |
/** @} */ |
| 78 |
78 |
|
|
| 79 |
79 |
#include <stdlib.h> |
#include <stdlib.h> |
| |
| ... |
... |
typedef bool utf8proc_bool; |
| 121 |
121 |
#include <limits.h> |
#include <limits.h> |
| 122 |
122 |
|
|
| 123 |
123 |
#ifdef UTF8PROC_STATIC |
#ifdef UTF8PROC_STATIC |
| 124 |
|
# define UTF8PROC_DLLEXPORT |
|
|
124 |
|
# ifndef UTF8PROC_DLLEXPORT |
|
125 |
|
# define UTF8PROC_DLLEXPORT |
|
126 |
|
# endif |
| 125 |
127 |
#else |
#else |
| 126 |
128 |
# ifdef _WIN32 |
# ifdef _WIN32 |
| 127 |
129 |
# ifdef UTF8PROC_EXPORTS |
# ifdef UTF8PROC_EXPORTS |
| |
| ... |
... |
typedef enum { |
| 150 |
152 |
UTF8PROC_STABLE = (1<<1), |
UTF8PROC_STABLE = (1<<1), |
| 151 |
153 |
/** Compatibility decomposition (i.e. formatting information is lost). */ |
/** Compatibility decomposition (i.e. formatting information is lost). */ |
| 152 |
154 |
UTF8PROC_COMPAT = (1<<2), |
UTF8PROC_COMPAT = (1<<2), |
| 153 |
|
/** Return a result with decomposed characters. */ |
|
|
155 |
|
/** Return a result with composed characters. */ |
| 154 |
156 |
UTF8PROC_COMPOSE = (1<<3), |
UTF8PROC_COMPOSE = (1<<3), |
| 155 |
157 |
/** Return a result with decomposed characters. */ |
/** Return a result with decomposed characters. */ |
| 156 |
158 |
UTF8PROC_DECOMPOSE = (1<<4), |
UTF8PROC_DECOMPOSE = (1<<4), |
| |
| ... |
... |
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int |
| 517 |
519 |
* @param dst the destination buffer. |
* @param dst the destination buffer. |
| 518 |
520 |
* @param bufsize the size of the destination buffer. |
* @param bufsize the size of the destination buffer. |
| 519 |
521 |
* @param options one or more of the following flags: |
* @param options one or more of the following flags: |
| 520 |
|
* - @ref UTF8PROC_REJECTNA - return an error `codepoint` is unassigned |
|
|
522 |
|
* - @ref UTF8PROC_REJECTNA - return an error if `codepoint` is unassigned |
| 521 |
523 |
* - @ref UTF8PROC_IGNORE - strip "default ignorable" codepoints |
* - @ref UTF8PROC_IGNORE - strip "default ignorable" codepoints |
| 522 |
524 |
* - @ref UTF8PROC_CASEFOLD - apply Unicode casefolding |
* - @ref UTF8PROC_CASEFOLD - apply Unicode casefolding |
| 523 |
525 |
* - @ref UTF8PROC_COMPAT - replace certain codepoints with their |
* - @ref UTF8PROC_COMPAT - replace certain codepoints with their |
| |
| ... |
... |
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int |
| 532 |
534 |
* option is used. If the string is being processed in order, this can be initialized to 0 for |
* option is used. If the string is being processed in order, this can be initialized to 0 for |
| 533 |
535 |
* the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored. |
* the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored. |
| 534 |
536 |
* |
* |
|
537 |
|
* In the current version of utf8proc, the maximum destination buffer with the @ref UTF8PROC_DECOMPOSE |
|
538 |
|
* option is 4 elements (or double that with @ref UTF8PROC_CHARBOUND), so this is a good default size. |
|
539 |
|
* However, this may increase in future Unicode versions, so you should always check the return value |
|
540 |
|
* as described below. |
|
541 |
|
* |
| 535 |
542 |
* @return |
* @return |
| 536 |
543 |
* In case of success, the number of codepoints written is returned; in case |
* In case of success, the number of codepoints written is returned; in case |
| 537 |
544 |
* of an error, a negative error code is returned (utf8proc_errmsg()). |
* of an error, a negative error code is returned (utf8proc_errmsg()). |
| |
| ... |
... |
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi |
| 743 |
750 |
* |
* |
| 744 |
751 |
* @note The memory of the new UTF-8 string will have been allocated |
* @note The memory of the new UTF-8 string will have been allocated |
| 745 |
752 |
* with `malloc`, and should therefore be deallocated with `free`. |
* with `malloc`, and should therefore be deallocated with `free`. |
|
753 |
|
* |
|
754 |
|
* @note `utf8proc_map` simply calls `utf8proc_decompose` followed by `utf8proc_reencode`, |
|
755 |
|
* and applications requiring greater control over memory allocation should instead call |
|
756 |
|
* those two functions directly. |
| 746 |
757 |
*/ |
*/ |
| 747 |
758 |
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( |
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( |
| 748 |
759 |
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options |
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options |