diff --git a/embed.fnc b/embed.fnc index 66a4f04cd910..fdb889827a4f 100644 --- a/embed.fnc +++ b/embed.fnc @@ -914,6 +914,11 @@ CTp |Signal_t|csighandler1 |int sig CTp |Signal_t|csighandler3 |int sig \ |NULLOK Siginfo_t *info \ |NULLOK void *uap +ATdmp |bool |c9strict_utf8_to_uv \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p EXp |regexp_engine const *|current_re_engine RXp |XOPRETANY|custom_op_get_field \ |NN const OP *o \ @@ -1169,6 +1174,11 @@ AOdp |SV * |eval_pv |NN const char *p \ |I32 croak_on_error AOdp |SSize_t|eval_sv |NN SV *sv \ |I32 flags +ATdmp |bool |extended_utf8_to_uv \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p Adfpv |void |fatal_warner |U32 err \ |NN const char *pat \ |... @@ -3065,6 +3075,11 @@ dopx |PerlIO *|start_glob |NN SV *tmpglob \ |NN IO *io Adp |I32 |start_subparse |I32 is_format \ |U32 flags +ATdmp |bool |strict_utf8_to_uv \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p CRp |NV |str_to_version |NN SV *sv : Used in pp_ctl.c p |void |sub_crush_depth|NN CV *cv @@ -3667,12 +3682,11 @@ ARTdmp |U8 * |utf8_hop_safe |NN const U8 *s \ |NN const U8 * const end ARdp |STRLEN |utf8_length |NN const U8 *s0 \ |NN const U8 *e - -AMTdp |UV |utf8n_to_uvchr |NN const U8 *s \ +ATdmp |UV |utf8n_to_uvchr |NN const U8 *s \ |STRLEN curlen \ |NULLOK STRLEN *retlen \ |const U32 flags -AMTdp |UV |utf8n_to_uvchr_error \ +ATdmp |UV |utf8n_to_uvchr_error \ |NN const U8 *s \ |STRLEN curlen \ |NULLOK STRLEN *retlen \ @@ -3685,13 +3699,6 @@ ATdip |UV |utf8n_to_uvchr_msgs \ |const U32 flags \ |NULLOK U32 *errors \ |NULLOK AV **msgs -CTp |UV |_utf8n_to_uvchr_msgs_helper \ - |NN const U8 *s \ - |STRLEN curlen \ - |NULLOK STRLEN *retlen \ - |const U32 flags \ - |NULLOK U32 *errors \ - |NULLOK AV **msgs CDbdp |UV |utf8n_to_uvuni |NN const U8 *s \ |STRLEN curlen \ |NULLOK STRLEN *retlen \ @@ -3735,16 +3742,44 @@ EMXp |U8 * |utf16_to_utf8_reversed \ |NN U8 *d \ |Size_t bytelen \ |NN Size_t *newlen +ATdmp |bool |utf8_to_uv |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p ADbdp |UV |utf8_to_uvchr |NN const U8 *s \ |NULLOK STRLEN *retlen -AMdp |UV |utf8_to_uvchr_buf \ - |NN const U8 *s \ - |NN const U8 *send \ - |NULLOK STRLEN *retlen -Cip |UV |utf8_to_uvchr_buf_helper \ +AMdip |UV |utf8_to_uvchr_buf \ |NN const U8 *s \ |NN const U8 *send \ |NULLOK STRLEN *retlen +ATdmp |bool |utf8_to_uv_errors \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p \ + |const U32 flags \ + |NULLOK U32 *errors +ATdmp |bool |utf8_to_uv_flags \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p \ + |const U32 flag +ATdip |bool |utf8_to_uv_msgs|NN const U8 * const s0 \ + |NN const U8 *e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p \ + |const U32 flags \ + |NULLOK U32 *errors \ + |NULLOK AV **msgs +CTp |bool |utf8_to_uv_msgs_helper_ \ + |NN const U8 * const s0 \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p \ + |const U32 flags \ + |NULLOK U32 *errors \ + |NULLOK AV **msgs CDbdp |UV |utf8_to_uvuni |NN const U8 *s \ |NULLOK STRLEN *retlen : Used in perly.y diff --git a/embed.h b/embed.h index 365303533b65..560ecde2c484 100644 --- a/embed.h +++ b/embed.h @@ -125,7 +125,6 @@ # define _to_utf8_lower_flags(a,b,c,d,e) Perl__to_utf8_lower_flags(aTHX_ a,b,c,d,e) # define _to_utf8_title_flags(a,b,c,d,e) Perl__to_utf8_title_flags(aTHX_ a,b,c,d,e) # define _to_utf8_upper_flags(a,b,c,d,e) Perl__to_utf8_upper_flags(aTHX_ a,b,c,d,e) -# define _utf8n_to_uvchr_msgs_helper Perl__utf8n_to_uvchr_msgs_helper # define amagic_call(a,b,c,d) Perl_amagic_call(aTHX_ a,b,c,d) # define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b) # define apply_attrs_string(a,b,c,d) Perl_apply_attrs_string(aTHX_ a,b,c,d) @@ -158,6 +157,7 @@ # define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d) # define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c) # define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) +# define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv # define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c) # define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b) # define call_list(a,b) Perl_call_list(aTHX_ a,b) @@ -223,6 +223,7 @@ # define dump_vindent(a,b,c,d) Perl_dump_vindent(aTHX_ a,b,c,d) # define eval_pv(a,b) Perl_eval_pv(aTHX_ a,b) # define eval_sv(a,b) Perl_eval_sv(aTHX_ a,b) +# define extended_utf8_to_uv Perl_extended_utf8_to_uv # define fbm_compile(a,b) Perl_fbm_compile(aTHX_ a,b) # define fbm_instr(a,b,c,d) Perl_fbm_instr(aTHX_ a,b,c,d) # define filter_add(a,b) Perl_filter_add(aTHX_ a,b) @@ -676,6 +677,7 @@ # define stack_grow(a,b,c) Perl_stack_grow(aTHX_ a,b,c) # define start_subparse(a,b) Perl_start_subparse(aTHX_ a,b) # define str_to_version(a) Perl_str_to_version(aTHX_ a) +# define strict_utf8_to_uv Perl_strict_utf8_to_uv # define suspend_compcv(a) Perl_suspend_compcv(aTHX_ a) # define sv_2bool(a) Perl_sv_2bool(aTHX,a) # define sv_2bool_flags(a,b) Perl_sv_2bool_flags(aTHX_ a,b) @@ -863,7 +865,13 @@ # define utf8_to_bytes_new_pv(a,b,c) Perl_utf8_to_bytes_new_pv(aTHX,a,b,c) # define utf8_to_bytes_overwrite(a,b) Perl_utf8_to_bytes_overwrite(aTHX,a,b) # define utf8_to_bytes_temp_pv(a,b) Perl_utf8_to_bytes_temp_pv(aTHX,a,b) -# define utf8_to_uvchr_buf_helper(a,b,c) Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c) +# define utf8_to_uv Perl_utf8_to_uv +# define utf8_to_uv_errors Perl_utf8_to_uv_errors +# define utf8_to_uv_flags Perl_utf8_to_uv_flags +# define utf8_to_uv_msgs Perl_utf8_to_uv_msgs +# define utf8_to_uv_msgs_helper_ Perl_utf8_to_uv_msgs_helper_ +# define utf8n_to_uvchr Perl_utf8n_to_uvchr +# define utf8n_to_uvchr_error Perl_utf8n_to_uvchr_error # define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs # define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX,a,b) # define uvchr_to_utf8_flags(a,b,c) Perl_uvchr_to_utf8_flags(aTHX,a,b,c) diff --git a/inline.h b/inline.h index 11e59be414a4..3a2618431702 100644 --- a/inline.h +++ b/inline.h @@ -2053,7 +2053,7 @@ C> (and kin); and if C is C, they give the same results as C> (and kin). Otherwise C may be any combination of the C> flags understood by -C>, with the same meanings. +C>, with the same meanings. It's better to use one of the non-C<_flags> functions if they give you the desired strictness, as those have a better chance of being inlined by the C @@ -2306,7 +2306,7 @@ as C>; and if C is C, this gives the same results as C>. Otherwise C may be any combination of the C> flags -understood by C>, with the same meanings. +understood by C>, with the same meanings. The three alternative macros are for the most commonly needed validations; they are likely to run somewhat faster than this more general one, as they can be @@ -2931,7 +2931,7 @@ C when the latter is called with a zero C parameter. This parameter is used to restrict the classes of code points that are considered to be valid. When zero, Perl's extended UTF-8 is used. Otherwise C can be any combination of the C> -flags accepted by C>. If there is any sequence of bytes +flags accepted by C>. If there is any sequence of bytes that can complete the input partial character in such a way that a non-prohibited character is formed, the function returns TRUE; otherwise FALSE. Non-character code points cannot be determined based on partial character @@ -3003,7 +3003,7 @@ complete code point, this will return TRUE anyway, provided that C> returns TRUE for them. C can be zero or any combination of the C> flags -accepted by C>, and with the same meanings. +accepted by C>, and with the same meanings. The functions differ from C> only in that the latter returns FALSE if the final few bytes of the string don't form a complete code @@ -3048,21 +3048,22 @@ Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, || is_utf8_valid_partial_char_flags(*ep, s + len, flags); } -PERL_STATIC_INLINE UV -Perl_utf8n_to_uvchr_msgs(const U8 * const s0, - STRLEN curlen, - STRLEN *retlen, - const U32 flags, - U32 * errors, - AV ** msgs) +PERL_STATIC_INLINE bool +Perl_utf8_to_uv_msgs(const U8 * const s0, + const U8 * const e, + UV * cp_p, + Size_t *advance_p, + const U32 flags, + U32 * errors, + AV ** msgs) { - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; + PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS; - /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the - * simple cases, and, if necessary calls a helper function to deal with the - * more complex ones. Almost all well-formed non-problematic code points - * are considered simple, so that it's unlikely that the helper function - * will need to be called. */ + /* This is the inlined portion of utf8_to_uv_msgs. It handles the simple + * cases, and, if necessary calls a helper function to deal with the more + * complex ones. Almost all well-formed non-problematic code points are + * considered simple, so that it's unlikely that the helper function will + * need to be called. */ /* Assume that isn't malformed; the vast majority of calls won't be */ if (errors) { @@ -3075,9 +3076,9 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0, /* No calls from core pass in an empty string; non-core need a check */ #ifdef PERL_CORE - assert(curlen > 0); + assert(e > s0); #else - if (LIKELY(curlen > 0)) + if (LIKELY(e > s0)) #endif { @@ -3085,15 +3086,15 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0, * capable of handling this, but this shortcuts this very common case * */ if (UTF8_IS_INVARIANT(*s0)) { - if (retlen) { - *retlen = 1; + if (advance_p) { + *advance_p = 1; } - return *s0; + *cp_p = *s0; + return true; } const U8 * s = s0; - const U8 * send = s + curlen; /* This dfa is fast. If it accepts the input, it was for a * well-formed, non-problematic code point, which can be returned @@ -3116,7 +3117,7 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0, PERL_UINT_FAST16_T state = PL_strict_utf8_dfa_tab[256 + type]; UV uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s); - while (state > 1 && ++s < send) { + while (state > 1 && ++s < e) { type = PL_strict_utf8_dfa_tab[*s]; state = PL_strict_utf8_dfa_tab[256 + state + type]; @@ -3124,42 +3125,75 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0, } if (LIKELY(state == 0)) { - if (retlen) { - *retlen = s - s0 + 1; + if (advance_p) { + *advance_p = s - s0 + 1; } - return UNI_TO_NATIVE(uv); + *cp_p = UNI_TO_NATIVE(uv); + return true; } } /* Here is potentially problematic. Use the full mechanism */ - return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, - errors, msgs); + return utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, flags, errors, msgs); } PERL_STATIC_INLINE UV -Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) +Perl_utf8n_to_uvchr_msgs(const U8 * const s0, + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors, + AV ** msgs) { - PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER; + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; - assert(s < send); + UV cp; + if (LIKELY(utf8_to_uv_msgs(s0, s0 + curlen, &cp, retlen, flags, errors, + msgs))) + { + return cp; + } - if (! ckWARN_d(WARN_UTF8)) { + if ((flags & UTF8_CHECK_ONLY) && retlen) { + *retlen = ((STRLEN) -1); + } - /* EMPTY is not really allowed, and asserts on debugging builds. But - * on non-debugging we have to deal with it, and this causes it to - * return the REPLACEMENT CHARACTER, as the documentation indicates */ - return utf8n_to_uvchr(s, send - s, retlen, - (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY)); + return 0; +} + + +PERL_STATIC_INLINE UV +Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) +{ + PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; + assert(s < send); + + UV cp; + + /* When everything is legal, just return that; but when not: + * 1) if warnings are enabled return 0 and retlen to -1 + * 2) if warnings are disabled, set 'flags' to accept any malformation, + * but that will just cause the REPLACEMENT CHARACTER to be returned, + * as the documentation indicates. EMPTY is not really allowed, and + * asserts on debugging builds. But on non-debugging we have to deal + * with it. + * This API means 0 can mean a legal NUL, or the input is malformed; and + * the caller has to know if warnings are disabled to know if it can rely on + * 'retlen'. Best to use utf8_to_uv() instead */ + U32 flags = (ckWARN_d(WARN_UTF8)) ? 0 : (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY); + + if ( LIKELY(utf8_to_uv_flags(s, send, &cp, retlen, flags)) + || flags) + { + return cp; } - else { - UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); - if (retlen && ret == 0 && (send <= s || *s != '\0')) { - *retlen = (STRLEN) -1; - } - return ret; + if (retlen) { + *retlen = (STRLEN) -1; } + + return 0; } /* ------------------------------- perl.h ----------------------------- */ diff --git a/mathoms.c b/mathoms.c index 49a63a61087e..402f3a3e32b8 100644 --- a/mathoms.c +++ b/mathoms.c @@ -202,29 +202,6 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags)); } -/* -=for apidoc_section $unicode -=for apidoc utf8_to_uvchr - -Returns the native code point of the first character in the string C -which is assumed to be in UTF-8 encoding; C will be set to the -length, in bytes, of that character. - -Some, but not all, UTF-8 malformations are detected, and in fact, some -malformed input could cause reading beyond the end of the input buffer, which -is why this function is deprecated. Use L instead. - -If C points to one of the detected malformations, and UTF8 warnings are -enabled, zero is returned and C<*retlen> is set (if C isn't -C) to -1. If those warnings are off, the computed value if well-defined (or -the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> -is set (if C isn't NULL) so that (S + C<*retlen>>) is the -next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is returned. - -=cut -*/ - UV Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) { diff --git a/pod/perldelta.pod b/pod/perldelta.pod index cf2c408fd489..12c782611fae 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -406,6 +406,42 @@ well. =item * +New API functions are introduced to convert strings encoded in UTF-8 to +their ordinal code point equivalent. These are safe to use by default, +and generally more convenient to use than the existing ones. + +L> replaces L> (which is +retained for backwards compatibility), but you should convert to use the +new form, as likely you aren't using the old one safely. + +There are also two new functions, L> and +L> which do the same thing except when +the input string represents a code point that Unicode doesn't accept as +legal for interchange, using either the strict original definition +(C), or the looser one given by +L +(C). When the input string represents one of the +restricted code points, these functions return the Unicode +C instead. + +Also L> is a synonym for C, for use +when you want to emphasize that the entire range of Perl extended UTF-8 +is acceptable. + +There are also replacement functions for the three more specialized +conversion functions that you are unlikely to need to use. Again, the +old forms are kept for backwards compatibility, but you should convert +to use the new forms. + +L> replaces L>. + +L> replaces L>. + +L> replaces +L>. + +=item * + Three new API functions are introduced to convert strings encoded in UTF-8 to native bytes format (if possible). These are easier to use than the existing ones, and they avoid unnecessary memory allocations. diff --git a/proto.h b/proto.h index 4c37a25c1350..7db33e9fd7d2 100644 --- a/proto.h +++ b/proto.h @@ -160,11 +160,6 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, const U8 *e, U8 *ustrp, STRLEN *len #define PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS \ assert(p); assert(ustrp) -PERL_CALLCONV UV -Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 *errors, AV **msgs); -#define PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER \ - assert(s) - PERL_CALLCONV_NO_RET void Perl_abort_execution(pTHX_ SV *msg_sv, const char * const name) __attribute__noreturn__ @@ -413,6 +408,9 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp); #define PERL_ARGS_ASSERT_BYTES_TO_UTF8 \ assert(s); assert(lenp) +/* PERL_CALLCONV bool +Perl_c9strict_utf8_to_uv(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p); */ + PERL_CALLCONV SSize_t Perl_call_argv(pTHX_ const char *sub_name, I32 flags, char **argv); #define PERL_ARGS_ASSERT_CALL_ARGV \ @@ -1114,6 +1112,9 @@ Perl_eval_sv(pTHX_ SV *sv, I32 flags); #define PERL_ARGS_ASSERT_EVAL_SV \ assert(sv) +/* PERL_CALLCONV bool +Perl_extended_utf8_to_uv(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p); */ + PERL_CALLCONV void Perl_fatal_warner(pTHX_ U32 err, const char *pat, ...) __attribute__format__(__printf__,pTHX_2,pTHX_3); @@ -4389,6 +4390,9 @@ Perl_str_to_version(pTHX_ SV *sv) #define PERL_ARGS_ASSERT_STR_TO_VERSION \ assert(sv) +/* PERL_CALLCONV bool +Perl_strict_utf8_to_uv(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p); */ + PERL_CALLCONV void Perl_sub_crush_depth(pTHX_ CV *cv) __attribute__visibility__("hidden"); @@ -5364,20 +5368,25 @@ Perl_utf8_to_utf16_base(pTHX_ U8 *s, U8 *d, Size_t bytelen, Size_t *newlen, cons #define PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE \ assert(s); assert(d); assert(newlen) -PERL_CALLCONV UV -Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen); -#define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF \ - assert(s); assert(send) +/* PERL_CALLCONV bool +Perl_utf8_to_uv(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p); */ -PERL_CALLCONV UV -Perl_utf8n_to_uvchr(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags); -#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR \ - assert(s) +/* PERL_CALLCONV bool +Perl_utf8_to_uv_errors(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p, const U32 flags, U32 *errors); */ -PERL_CALLCONV UV -Perl_utf8n_to_uvchr_error(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 *errors); -#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR \ - assert(s) +/* PERL_CALLCONV bool +Perl_utf8_to_uv_flags(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p, const U32 flag); */ + +PERL_CALLCONV bool +Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, const U8 * const e, UV *cp_p, Size_t *advance_p, const U32 flags, U32 *errors, AV **msgs); +#define PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_ \ + assert(s0); assert(e); assert(cp_p) + +/* PERL_CALLCONV UV +Perl_utf8n_to_uvchr(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags); */ + +/* PERL_CALLCONV UV +Perl_utf8n_to_uvchr_error(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 *errors); */ PERL_CALLCONV void Perl_utilize(pTHX_ int aver, I32 floor, OP *version, OP *idop, OP *arg) @@ -10011,9 +10020,14 @@ Perl_utf8_hop_overshoot(const U8 *s, SSize_t off, const U8 * const start, const # define PERL_ARGS_ASSERT_UTF8_HOP_OVERSHOOT \ assert(s); assert(start); assert(end) +PERL_STATIC_INLINE bool +Perl_utf8_to_uv_msgs(const U8 * const s0, const U8 *e, UV *cp_p, Size_t *advance_p, const U32 flags, U32 *errors, AV **msgs); +# define PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS \ + assert(s0); assert(e); assert(cp_p) + PERL_STATIC_INLINE UV -Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen); -# define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER \ +Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen); +# define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF \ assert(s); assert(send) PERL_STATIC_INLINE UV diff --git a/utf8.c b/utf8.c index 02aa3776bbd4..74eea0c0606d 100644 --- a/utf8.c +++ b/utf8.c @@ -50,7 +50,7 @@ Perl__force_out_malformed_utf8_message(pTHX_ const U8 *const p, /* First byte in UTF-8 sequence */ const U8 * const e, /* Final byte in sequence (may include multiple chars */ - const U32 flags, /* Flags to pass to utf8n_to_uvchr(), + const U32 flags, /* Flags to pass to utf8_to_uv(), usually 0, or some DISALLOW flags */ const bool die_here) /* If TRUE, this function does not return */ { @@ -727,7 +727,7 @@ Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags) * nowhere else. The function has to cope as best it can if that * sequence does not form a full character. * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags - * accepted by L. If non-zero, this function returns + * accepted by L. If non-zero, this function returns * 0 if it determines the input will match something disallowed. * On output: * The return is the number of bytes required to represent the code point @@ -1002,173 +1002,324 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, /* -=for apidoc utf8n_to_uvchr +=for apidoc utf8_to_uv +=for apidoc_item extended_utf8_to_uv +=for apidoc_item strict_utf8_to_uv +=for apidoc_item c9strict_utf8_to_uv +=for apidoc_item utf8_to_uvchr_buf +=for apidoc_item utf8_to_uvchr + +These functions each translate from UTF-8 to UTF-32 (or UTF-64 on 64 bit +platforms). In other words, to a code point ordinal value. (On EBCDIC +platforms, the initial encoding is UTF-EBCDIC, and the output is a native code +point). + +For example, the string "A" would be converted to the number 65 on an ASCII +platform, and to 193 on an EBCDIC one. Converting the string "ABC" would yield +the same results, as the functions stop after the first character converted. +Converting the string "\N{LATIN CAPITAL LETTER A WITH MACRON} plus anything +more in the string" would yield the number 0x100 on both types of platforms, +since the first character is U+0100. + +The functions whose names contain C are older than the functions +whose names don't have C in them. The API in the older functions is +harder to use correctly, and so they are kept only for backwards compatibility, +and may eventually become deprecated. If you are writing a module and use +L, your code can use the new functions back to at least Perl +v5.7.1. + +All the functions accept, without complaint, well-formed UTF-8 for any +non-problematic Unicode code point 0 .. 0x10FFFF. There are two types of +Unicode problematic code points: surrogate characters and non-character code +points. (See L.) Some of the functions reject one or both of +these. Private use characters and those code points yet to be assigned to a +particular character are never considered problematic. Additionally, most of +the functions accept non-Unicode code points, those starting at 0x110000. -THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. -Most code should use L() rather than call this -directly. - -Bottom level UTF-8 decode routine. -Returns the native code point value of the first character in the string C, -which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than -C bytes; C<*retlen> (if C isn't NULL) will be set to -the length, in bytes, of that character. - -The value of C determines the behavior when either C does not point -to a well-formed UTF-8 character, or the pointed-to code point is a member of -certain potentially problematic classes (listed below). If C is 0, all -such classes are accepted, and encountering a malformation causes zero to be -returned and C<*retlen> to be set so that (S + C<*retlen>>) is the next -possible position in C that could begin a non-malformed character. For -malformations, if UTF-8 warnings haven't been lexically disabled, a warning is -also raised. Some UTF-8 input sequences may contain multiple malformations. -This function tries to find every possible one in each call, so multiple -warnings can be raised for the same sequence. - -Various ALLOW flags can be set in C to allow (and not warn on) -individual types of malformations, such as the sequence being overlong (that -is, there is a shorter sequence that can express the same code point; overlong -sequences are expressly forbidden in the UTF-8 standard due to potential -security issues). Another malformation example is the first byte of the input -sequence not being a legal first byte. See F for the list of such -flags. Even if allowed, this function generally returns the Unicode -REPLACEMENT CHARACTER when it encounters a malformation. There are flags in -F to override this behavior for the overlong malformations, but don't -do that except for very specialized purposes. - -The C flag overrides the behavior when a non-allowed (by other -flags) malformation is found. If this flag is set, the routine assumes that -the caller will raise a warning, and this function will silently just set -C to C<-1> (cast to C) and return zero. - -Note that this API requires disambiguation between successful decoding a C -character, and an error return (unless the C flag is set), as -in both cases, 0 is returned, and, depending on the malformation, C may -be set to -1. To disambiguate, upon a zero return, see if the first byte of -C is 0 as well. If so, the input was a C; if not, the input had an -error. Or you can use C>. - -Certain classes of code points are considered problematic. These are Unicode -surrogates, Unicode non-characters, and code points above the Unicode maximum -of 0x10FFFF. By default these are considered regular code points, but certain -situations warrant special handling for them, which can be specified using the -C parameter. If C contains C, -all three classes are treated as malformations and handled as such. The flags -C, C, and -C (meaning above the legal Unicode maximum) can be set to -disallow these categories individually. C -restricts the allowed inputs to the strict UTF-8 traditionally defined by -Unicode. Use C to use the strictness -definition given by -L. -The difference between traditional strictness and C9 strictness is that the -latter does not forbid non-character code points. (They are still discouraged, -however.) For more discussion see L. - -The flags C, -C, C, -C, and C will cause warning messages to be -raised for their respective categories, but otherwise the code points are -considered valid (not malformations). To get a category to both be treated as -a malformation and raise a warning, specify both the WARN and DISALLOW flags. -(But note that warnings are not raised if lexically disabled nor if -C is also specified.) +=over 4 -Extremely high code points were never specified in any standard, and require an -extension to UTF-8 to express, which Perl does. It is likely that programs -written in something other than Perl would not be able to read files that -contain these; nor would Perl understand files written by something that uses a -different extension. For these reasons, there is a separate set of flags that -can warn and/or disallow these extremely high code points, even if other -above-Unicode ones are accepted. They are the C and -C flags. For more information see -C>. Of course C will treat all -above-Unicode code points, including these, as malformations. -(Note that the Unicode standard considers anything above 0x10FFFF to be -illegal, but there are standards predating it that allow up to 0x7FFF_FFFF -(2**31 -1)) - -A somewhat misleadingly named synonym for C is -retained for backward compatibility: C. Similarly, -C is usable instead of the more accurately named -C. The names are misleading because these flags -can apply to code points that actually do fit in 31 bits. This happens on -EBCDIC platforms, and sometimes when the L> is also present. The new names accurately -describe the situation in all cases. - -All other code points corresponding to Unicode characters, including private -use and those yet to be assigned, are never considered malformed and never -warn. +=item C forms -=for apidoc Amnh||UTF8_CHECK_ONLY -=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE -=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE -=for apidoc Amnh||UTF8_DISALLOW_SURROGATE -=for apidoc Amnh||UTF8_DISALLOW_NONCHAR -=for apidoc Amnh||UTF8_DISALLOW_SUPER -=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE -=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE -=for apidoc Amnh||UTF8_WARN_SURROGATE -=for apidoc Amnh||UTF8_WARN_NONCHAR -=for apidoc Amnh||UTF8_WARN_SUPER -=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED -=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED +Almost all code should use only C, C, +C, or C. The other functions are +either the problematic old form, or are for highly specialized uses. -=cut +These four functions each return C if the sequence of bytes starting at +C form a complete, legal UTF-8 (or UTF-EBCDIC) sequence for a code point. +If so, C<*cp> will be set to the native code point value it represents, and +C<*advance> will be set to its length, in bytes. -Also implemented as a macro in utf8.h -*/ +Otherwise, each function returns C and sets C<*cp> to the Unicode +REPLACEMENT CHARACTER, and C<*advance> to the next position along C, where +the next possible UTF-8 character could begin. Failing to use this position as +the next starting point during parsing of strings has led to successful +attacks by crafted inputs. -UV -Perl_utf8n_to_uvchr(const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags) -{ - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; +The functions only examine as many bytes along C as are needed to form a +complete UTF-8 representation of a single code point, but they never examine +the byte at C, or beyond. They return false if the code point requires more +than S> bytes to represent. - return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL); -} +The functions differ only in what flavor of UTF-8 they accept. All reject +syntactically invalid UTF-8. -/* +=over 4 -=for apidoc utf8n_to_uvchr_error +=item * C -THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. -Most code should use L() rather than call this -directly. - -This function is for code that needs to know what the precise malformation(s) -are when an error is found. If you also need to know the generated warning -messages, use L() instead. - -It is like C> but it takes an extra parameter placed after -all the others, C. If this parameter is 0, this function behaves -identically to C>. Otherwise, C should be a pointer -to a C variable, which this function sets to indicate any errors found. -Upon return, if C<*errors> is 0, there were no errors found. Otherwise, -C<*errors> is the bit-wise C of the bits described in the list below. Some -of these bits will be set if a malformation is found, even if the input -C parameter indicates that the given malformation is allowed; those -exceptions are noted: +additionally rejects any UTF-8 that translates into a code point that isn't +specified by Unicode to be freely exchangeable, namely the surrogate characters +and non-character code points (besides non-Unicode code points, any above +0x10FFFF). It does not raise a warning when rejecting. + +=item * C + +instead uses the exchangeable definition given by Unicode's Corregendum #9, +which accepts non-character code points while still rejecting surrogates. It +does not raise a warning when rejecting. + +=item * C + +accepts all syntactically valid UTF-8, as extended by Perl to allow 64-bit code +points to be encoded. + +=back + +C is merely a synonym for C, whose name +explicitly indicates that it accepts Perl-extended UTF-8. Perl programs +traditionally handle this by default. + +Whenever syntactically invalid input is rejected, an explanatory warning +message is raised, unless C warnings (or the appropriate subcategory) are +turned off. A given input sequence may contain multiple malformations, giving +rise to multiple warnings, as the functions attempt to find and report on all +malformations in a sequence. All the possible malformations are listed in +C>, with some examples of multiple ones for the same +sequence. You can use that function or C> to exert more +control over the input that is considered acceptable, and the warnings that are +raised. + +Often, C is an arbitrarily long string containing the UTF-8 representations +of many code points in a row, and these functions are called in the course of +parsing C to find all those code points. + +If your code doesn't know how to deal with illegal input, as would be typical +of a low level routine, the loop could look like: + + while (s < e) { + UV cp; + Size_t advance; + (void) utf8_to_uv(s, e, &cp, &advance); + + s += advance; + } + +A REPLACEMENT CHARACTER will be inserted everywhere that malformed input +occurs. Obviously, we aren't expecting such outcomes, but your code will be +protected from attacks and many harmful effects that could otherwise occur. + +If you do have a plan for handling malformed input, you could instead write: + + while (s < e) { + UV cp; + Size_t advance; + + if (UNLIKELY(! utf8_to_uv(s, e, &cp, &advance)) { + + } + + + + s += advance; + } + +You may pass NULL to these functions instead of a pointer to your C +variable. But the only legitimate case to do this is if you are only examining +the first character in C, and have no plans to ever look further. You could +also advance by using C, but this gives the correct result if and +only if the input is well-formed; and this practice has led to successful +attacks against such code; and it is extra work always, as the functions have +already done the equivalent work and return the correct value in C, +regardless of whether the input is well-formed or not. + +You must always pass a non-NULL pointer into which to store the (first) code +point C represents. If you don't care about this value, you should be using +one of the C> functions instead. + +=item C forms + +These are the old form equivalents of C (and its synonym, +C). They are C and C. +There is no old form equivalent of either C nor +C. + +C is DEPRECATED. Do NOT use it; it is a security hole ready to +bring destruction onto you and yours. + +C is discouraged and may eventually become deprecated. It +checks if the sequence of bytes starting at C form a complete, legal UTF-8 +(or UTF-EBCDIC) sequence for a code point. If so, it returns the code point +value the sequence represents, and C<*retlen> will be set to its length, in +bytes. Thus, the next possible character in C begins at S>. + +The function only examines as many bytes along C as are needed to form a +complete UTF-8 representation of a single code point, but it never examines +the byte at C, or beyond. + +If the sequence examined starting at C is not legal Perl extended UTF-8, the +translation fails, and the resultant behavior unfortunately depends on if the +warnings category "utf8" is enabled or not. =over 4 -=item C +=item If C<'utf8'> warnings are disabled -The input sequence is not standard UTF-8, but a Perl extension. This bit is -set only if the input C parameter contains either the -C or the C flags. +The Unicode REPLACEMENT CHARACTER is silently returned, and C<*retlen> is set +(if C isn't C) so that (S + C<*retlen>>) is the next +possible position in C that could begin a non-malformed character. + +But note that it is ambiguous whether a REPLACEMENT CHARACTER was actually in +the input, or if this function synthetically generated one. In the unlikely +event that you care, you'd have to examine the input to disambiguate. + +=item If C<'utf8'> warnings are enabled + +A warning will be displayed, and 0 is returned and C<*retlen> is set (if +C isn't C) to -1. + +But note that 0 may also be returned if S<*s> is a legal NUL character. This +means that you have to disambiguate a 0 return. You can do this by checking +that the first byte of C is indeed a NUL; or by making sure to always pass a +non-NULL C pointer, and by examining it. + +Also note that should you wish to proceed with parsing C, you have no easy +way of knowing where to start looking in it for the next possible character. +It is important to look in the right place to prevent attacks on your code. +It would be better to have instead called an equivalent function that provides +this information; any of the C series, or C>. + +=back + +Because of these quirks, C is very difficult to use +correctly and handle all cases. Generally, you need to bail out at the first +failure it finds. + +The deprecated C behaves the same way as C for +well-formed input, and for the malformations it is capable of finding, but +doesn't find all of them, and it can read beyond the end of the input buffer, +which is why it is deprecated. + +=back + +The C family of functions is preferred because they make it +easier to write code safe from attacks. You should be converting to them; this +will result in simpler, more robust code. + +=for apidoc utf8_to_uv_flags +=for apidoc_item utf8n_to_uvchr + +These functions are extensions of C>, where you need +more control over what UTF-8 sequences are acceptable. These functions are +unlikely to be needed except for specialized purposes. + +C is more like an extension of C, but +with fewer quirks, and a different method of specifying the bytes in C it is +allowed to examine. It has a C parameter instead of an C parameter, +so the furthest byte in C it can look at is S>. Its return +value is, like C, ambiguous with respect to the NUL and +REPLACEMENT characters, but the value of C<*retlen> can be relied on (except +with the C flag described below) to know where the next +possible character along C starts, removing that quirk. Hence, you always +should use C<*retlen> to determine where the next character in C starts. + +These functions have an additional parameter, C, besides the ones in +C and C, which can be used to broaden or +restrict what is acceptable UTF-8. C has the same meaning and behavior +in both functions. When C is 0, these functions accept any +syntactically valid Perl-extended-UTF-8 sequence. + +There are flags that apply to accepting particular sequences, and flags that +apply to raising warnings about encountering sequences. Each type is +independent of the other. You can reject and not warn; warn and still accept; +or both reject and warn. Rejecting means that the sequence gets translated +into the Unicode REPLACEMENT CHARACTER instead of what it was meant to +represent. + +Even if a flag is passed that indicates warnings are desired; no warning will be +raised if C<'utf8'> warnings (or the appropriate subcategory) are disabled at +the point of the call. + +=over 4 + +=item C + +This also suppresses any warnings. And it changes what is stored into +C<*retlen> with the C family of functions (for the worse). It is not +likely to be of use to you. You can use C (described below) to +also turn off warnings, and that flag doesn't adversely affect C<*retlen>. + +=item C + +=item C + +These disallow and/or warn about UTF-8 sequences that represent surrogate +characters. + +=item C + +=item C + +These disallow and/or warn about UTF-8 sequences that represent non-character +code points. + +=item C + +=item C + +These disallow and/or warn about UTF-8 sequences that represent code points +above 0x10FFFF. + +=item C + +=item C + +These are the same as having selected all three of the corresponding SURROGATE, +NONCHAR and SUPER flags listed above. + +All such code points are not considered to be safely freely exchangeable +between processes. + +=item C -Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, -and so some extension must be used to express them. Perl uses a natural -extension to UTF-8 to represent the ones up to 2**36-1, and invented a further -extension to represent even higher ones, so that any code point that fits in a -64-bit word can be represented. Text using these extensions is not likely to -be portable to non-Perl code. We lump both of these extensions together and -refer to them as Perl extended UTF-8. There exist other extensions that people -have invented, incompatible with Perl's. +=item C + +These are the same as having selected both the corresponding SURROGATE and +SUPER flags listed above. + +Unicode issued L to allow non-character +code points to be exchanged by processes aware of the possibility. (They are +still discouraged, however.) For more discussion see +L. + +=item C + +=item C + +These disallow and/or warn on encountering sequences that require Perl's +extension to UTF-8 to represent them. These are all for code points above +0x10FFFF, so these sequences are a subset of the ones controlled by SUPER or +either of the illegal interchange sets of flags. + +Perl predates Unicode, and earlier standards allowed for code points up through +0x7FFF_FFFF (2**31 - 1). Perl, of course, would like you to be able to +represent in UTF-8 any code point available on the platform. To do so, some +extension must be used to express them. Perl uses a natural extension to UTF-8 +to represent the ones up to 2**36-1, and invented a further extension to +represent even higher ones, so that any code point that fits in a 64-bit word +can be represented. We lump both of these extensions together and refer to +them as Perl extended UTF-8. There exist other extensions that people have +invented, incompatible with Perl's. On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower @@ -1176,15 +1327,59 @@ than on ASCII. Prior to that, code points 2**31 and higher were simply unrepresentable, and a different, incompatible method was used to represent code points between 2**30 and 2**31 - 1. -On both platforms, ASCII and EBCDIC, C is set if -Perl extended UTF-8 is used. +It is likely that programs written in something other than Perl would not be +able to read files that contain these; nor would Perl understand files written +by something that uses a different extension. Hence, you can specify that +above-Unicode code points are generally accepted and/or warned about, but still +exclude the ones that require this extension to represent. + +=item C and kin + +Other flags can be passed to allow, in a limited way, syntactic malformations +and/or overflowing the number of bits available in a UV on the platform. +The functions will not treat the relevant malformations as errors, hence will +not raise any warnings for them. C will return C. + +B, +regardless of any of the flags. + +The only such flag that you would ever have any reason to use is +C which applies to any of the syntactic malformations and +overflow, except for empty input. The other flags are shown in the C<_GOT_> +bits list in C>. + +=back -In earlier Perls, this bit was named C, which you still -may use for backward compatibility. That name is misleading, as this flag may -be set when the code point actually does fit in 31 bits. This happens on -EBCDIC platforms, and sometimes when the L> is also present. The new name accurately -describes the situation in all cases. +=for apidoc utf8_to_uv_msgs +=for apidoc_item utf8n_to_uvchr_msgs +=for apidoc_item utf8_to_uv_errors +=for apidoc_item utf8n_to_uvchr_error + +These functions are extensions of C> and +C>. They are used for the highly specialized purpose of +when the caller needs to know the exact malformations that were encountered +and/or the diagnostics that would be raised. + +They each take one or two extra parameters, pointers to where to store this +information. The functions with C<_msgs> in their names return both types, so +take two extra parameters; those with C<_error> return just the malformations, +so take just one extra parameter. When the extra parameters are both 0, the +functions behave identically to the function they extend. + +When the C parameter is not NULL, it should be the address of a U32 +variable, into which the functions store a bitmap, described just below, with a +bit set for each malformation the function found; 0 if none. The C-type +flags are ignored when determining the content of this variable. That is, even +if you "allow" a particular malformation, if it is encountered, the +corresponding bit will be set to notify you that one was encountered. +The bits for malformations that are accepted by default aren't set unless the +flags passed to the function indicate that they should be rejected or warned +about when encountering them. These malformations are explicitly noted in the +list below along with the controlling flags. + +The bits returned in C and their meanings are: + +=over 4 =item C @@ -1193,7 +1388,10 @@ continuation byte. =item C -The input C parameter was 0. +The input parameters indicated the length of C is 0. Technically, this a +coding error, not a malformation; you should check before calling these +functions if there is actually anything to convert. But perl needs to be able +to recover from bad input, and this is how it does it. =item C @@ -1212,8 +1410,8 @@ C or the C flags. =item C -The input sequence was malformed in that a non-continuation type byte was found -in a position where only a continuation type one should be. See also +The input sequence was malformed in that a non-continuation-type byte was found +in a position where only a continuation-type one should be. See also C>. =item C @@ -1221,13 +1419,18 @@ C>. The input sequence was malformed in that it is for a code point that is not representable in the number of bits available in an IV on the current platform. +=item C + +The input sequence is not standard UTF-8, but a Perl extension. This bit is +set only if the input C parameter contains either the +C or the C flags. + =item C The input sequence was malformed in that C is smaller than required for a complete sequence. In other words, the input is for a partial character sequence. - C and C both indicate a too short sequence. The difference is that C indicates always that there is an error, while C means that an incomplete @@ -1235,12 +1438,12 @@ sequence was looked at. If no other flags are present, it means that the sequence was valid as far as it went. Depending on the application, this could mean one of three things: -=over +=over 4 =item * -The C length parameter passed in was too small, and the function was -prevented from examining all the necessary bytes. +The C or C parameters passed in were too small, and the function +was prevented from examining all the necessary bytes. =item * @@ -1271,133 +1474,150 @@ C or the C flags. =back -To do your own error handling, call this function with the C -flag to suppress any warnings, and then examine the C<*errors> return. - -=for apidoc Amnh||UTF8_GOT_PERL_EXTENDED -=for apidoc Amnh||UTF8_GOT_CONTINUATION -=for apidoc Amnh||UTF8_GOT_EMPTY -=for apidoc Amnh||UTF8_GOT_LONG -=for apidoc Amnh||UTF8_GOT_NONCHAR -=for apidoc Amnh||UTF8_GOT_NON_CONTINUATION -=for apidoc Amnh||UTF8_GOT_OVERFLOW -=for apidoc Amnh||UTF8_GOT_SHORT -=for apidoc Amnh||UTF8_GOT_SUPER -=for apidoc Amnh||UTF8_GOT_SURROGATE - -=cut - -Also implemented as a macro in utf8.h -*/ - -UV -Perl_utf8n_to_uvchr_error(const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags, - U32 * errors) -{ - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; - - return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL); -} - -/* - -=for apidoc utf8n_to_uvchr_msgs - -THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. -Most code should use L() rather than call this -directly. - -This function is for code that needs to know what the precise malformation(s) -are when an error is found, and wants the corresponding warning and/or error -messages to be returned to the caller rather than be displayed. All messages -that would have been displayed if all lexical warnings are enabled will be -returned. - -It is just like C> but it takes an extra parameter -placed after all the others, C. If this parameter is 0, this function -behaves identically to C>. Otherwise, C should -be a pointer to an C variable, in which this function creates a new AV to -contain any appropriate messages. The elements of the array are ordered so -that the first message that would have been displayed is in the 0th element, -and so on. Each element is a hash with three key-value pairs, as follows: +Note that more than one bit may have been set by these functions. This is +because it is possible for multiple malformations to be present in the same +sequence. An example would be an overlong sequence evaluating to a surrogate +when surrogates are forbidden. Another example is overflow; standard UTF-8 +never overflows, so something that does must have been expressed using Perl's +extended UTF-8. If also is above all legal Unicode code points. So there will +be a bit set for up to all three of these things. Overflow always, +perl-extended if the calling flags indicate those should be rejected or warned +about, and above-Unicode provided the calling flags indicate those should be +rejected or warned about. + +If you don't care about the system's messages text nor warning categories, you +can customize error handling by calling one of the C<_error> functions, using +either of the flags C or C to suppress any +warnings, and then examine the C<*errors> return. + +But if you do care, use one of the functions with C<_msgs> in their names. +These allow you to completely customize error handling by suppressing any +warnings that would otherwise be raised; instead returning all needed +information in a structure specified by an extra parameter, C, a pointer +to a variable which has been declared to be an C, and into which the +function creates a new AV to store information, described below, about all +the malformations that were encountered. + +If the flag C is passed, this parameter is ignored. + +What is considered a malformation is affected by C, the same as +described in C>. No array element is generated for +malformations that are "allowed" by the input flags, in contrast to the +C<_error> functions. + +Each element of the C AV array is an anonymous hash with the following +three key-value pairs: =over 4 =item C -The text of the message as a C. +A C containing the text of any warning message that would have ordinarily +been generated. The function suppresses raising this warning itself. =item C -The warning category (or categories) packed into a C. +The warning category (or categories) for the message, packed into a C. =item C -A single flag bit associated with this message, in a C. -The bit corresponds to some bit in the C<*errors> return value, -such as C. +A C containing a single flag bit associated with this message. The bit +corresponds to some bit in the C<*errors> return value, such as +C. =back -It's important to note that specifying this parameter as non-null will cause -any warnings this function would otherwise generate to be suppressed, and -instead be placed in C<*msgs>. The caller can check the lexical warnings state -(or not) when choosing what to do with the returned messages. +The array is sorted so that element C<[0]> contains the first message that +would have otherwise been raised; C<[1]>, the second; and so on. -If the flag C is passed, no warnings are generated, and hence -no AV is created. +You thus can completely override the normal error handling; you can check the +lexical warnings state (or not) when choosing what to do with the returned +messages. The caller, of course, is responsible for freeing any returned AV. +=for apidoc Amnh||UTF8_ALLOW_CONTINUATION +=for apidoc Amnh||UTF8_ALLOW_EMPTY +=for apidoc Amnh||UTF8_ALLOW_LONG +=for apidoc Amnh||UTF8_ALLOW_NON_CONTINUATION +=for apidoc Amnh||UTF8_ALLOW_OVERFLOW +=for apidoc Amnh||UTF8_ALLOW_PERL_EXTENDED +=for apidoc Amnh||UTF8_ALLOW_SHORT +=for apidoc Amnh||UTF8_CHECK_ONLY +=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE +=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE +=for apidoc Amnh||UTF8_DISALLOW_NONCHAR +=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED +=for apidoc Amnh||UTF8_DISALLOW_SUPER +=for apidoc Amnh||UTF8_DISALLOW_SURROGATE +=for apidoc Amnh||UTF8_GOT_CONTINUATION +=for apidoc Amnh||UTF8_GOT_EMPTY +=for apidoc Amnh||UTF8_GOT_LONG +=for apidoc Amnh||UTF8_GOT_NONCHAR +=for apidoc Amnh||UTF8_GOT_NON_CONTINUATION +=for apidoc Amnh||UTF8_GOT_OVERFLOW +=for apidoc Amnh||UTF8_GOT_PERL_EXTENDED +=for apidoc Amnh||UTF8_GOT_SHORT +=for apidoc Amnh||UTF8_GOT_SUPER +=for apidoc Amnh||UTF8_GOT_SURROGATE +=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE +=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE +=for apidoc Amnh||UTF8_WARN_NONCHAR +=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED +=for apidoc Amnh||UTF8_WARN_SUPER +=for apidoc Amnh||UTF8_WARN_SURROGATE + =cut */ -UV -Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags, - U32 * errors, - AV ** msgs) +bool +Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, + const U8 * const e, + UV *cp_p, + Size_t *advance_p, + const U32 flags, + U32 * errors, + AV ** msgs) { - const U8 * const s0 = s; - const U8 * send = s0 + curlen; + PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_; + + const U8 * s = s0; + const U8 * send = e; + SSize_t curlen = send - s0; U32 possible_problems; /* A bit is set here for each potential problem found as we go along */ UV uv; - STRLEN expectlen; /* How long should this sequence be? */ - STRLEN avail_len; /* When input is too short, gives what that is */ + SSize_t expectlen; /* How long should this sequence be? */ + SSize_t avail_len; /* When input is too short, gives what that is */ U32 discard_errors; /* Used to save branches when 'errors' is NULL; this gets set and discarded */ dTHX; - PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER; - /* Here, is one of: a) malformed; b) a problematic code point (surrogate, * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul * syllables that the dfa doesn't properly handle. Quickly dispose of the * final case. */ + /* Assume will be successful; override later if necessary */ + if (UNLIKELY(errors)) { + *errors = 0; + } + if (UNLIKELY(msgs)) { + *msgs = NULL; + } + /* Each of the affected Hanguls starts with \xED */ if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */ - if (retlen) { - *retlen = 3; - } - if (errors) { - *errors = 0; - } - if (msgs) { - *msgs = NULL; + if (advance_p) { + *advance_p = 3; } - return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) - | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) - | (s0[2] & UTF_CONTINUATION_MASK); + *cp_p = ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) + | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) + | (s0[2] & UTF_CONTINUATION_MASK); + return true; } /* In conjunction with the exhaustive tests that can be enabled in @@ -1438,7 +1658,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * We also should not consume too few bytes, otherwise someone could inject * things. For example, an input could be deliberately designed to * overflow, and if this code bailed out immediately upon discovering that, - * returning to the caller C<*retlen> pointing to the very next byte (one + * returning to the caller C<*advance_p> pointing to the very next byte (one * which is actually part of the overflowing sequence), that could look * legitimate to the caller, which could discard the initial partial * sequence and process the rest, inappropriately. @@ -1450,7 +1670,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * allowed one, we could allow in something that shouldn't have been. */ - if (UNLIKELY(curlen == 0)) { + if (UNLIKELY(curlen <= 0)) { possible_problems |= UTF8_GOT_EMPTY; curlen = 0; uv = UNICODE_REPLACEMENT; @@ -1468,8 +1688,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * function will be for, has this expected length. For efficiency, set * things up here to return it. It will be overridden only in those rare * cases where a malformation is found */ - if (retlen) { - *retlen = expectlen; + if (advance_p) { + *advance_p = expectlen; } /* A continuation character can't start a valid sequence */ @@ -1641,7 +1861,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, } } - ready_to_handle_errors: + ready_to_handle_errors: ; /* At this point: * curlen contains the number of bytes in the sequence that @@ -1664,16 +1884,12 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * us should be in it, but no further than s0 + * avail_len */ + bool success = true; if (UNLIKELY(possible_problems)) { bool disallowed = FALSE; const U32 orig_problems = possible_problems; - if (msgs) { - *msgs = NULL; - } - - /* Returns 0 if no message needs to be generated for this problem even * if everything else says to. Otherwise returns the warning category * to use for the message. @@ -2087,50 +2303,18 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* Since there was a possible problem, the returned length may need to * be changed from the one stored at the beginning of this function. * Instead of trying to figure out if it has changed, just do it. */ - if (retlen) { - *retlen = curlen; + if (advance_p) { + *advance_p = curlen; } if (disallowed) { - if (flags & UTF8_CHECK_ONLY && retlen) { - *retlen = ((STRLEN) -1); - } - return 0; + success = false; + uv = UNICODE_REPLACEMENT; } } - return UNI_TO_NATIVE(uv); -} - -/* -=for apidoc utf8_to_uvchr_buf - -Returns the native code point of the first character in the string C which -is assumed to be in UTF-8 encoding; C points to 1 beyond the end of C. -C<*retlen> will be set to the length, in bytes, of that character. - -If C does not point to a well-formed UTF-8 character and UTF8 warnings are -enabled, zero is returned and C<*retlen> is set (if C isn't -C) to -1. If those warnings are off, the computed value, if well-defined -(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and -C<*retlen> is set (if C isn't C) so that (S + C<*retlen>>) is -the next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is -returned. - -=cut - -Also implemented as a macro in utf8.h - -*/ - - -UV -Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) -{ - PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; - - return utf8_to_uvchr_buf_helper(s, send, retlen); + *cp_p = UNI_TO_NATIVE(uv); + return success; } /* diff --git a/utf8.h b/utf8.h index 15391a65712d..fe3626cae651 100644 --- a/utf8.h +++ b/utf8.h @@ -148,12 +148,31 @@ typedef enum { Perl_uvchr_to_utf8_flags_msgs(aTHX, d, u, f, 0) #define Perl_uvchr_to_utf8_flags_msgs(mTHX, d, u, f , m) \ Perl_uvoffuni_to_utf8_flags_msgs(aTHX_ d, NATIVE_TO_UNI(u), f, m) -#define utf8_to_uvchr_buf(s, e, lenp) \ - utf8_to_uvchr_buf_helper((const U8 *) (s), (const U8 *) e, lenp) -#define utf8n_to_uvchr(s, len, lenp, flags) \ - utf8n_to_uvchr_error(s, len, lenp, flags, 0) -#define utf8n_to_uvchr_error(s, len, lenp, flags, errors) \ - utf8n_to_uvchr_msgs(s, len, lenp, flags, errors, 0) + +/* This is needed to cast the parameters for all those calls that had them + * improperly as chars */ +#define utf8_to_uvchr_buf(s, e, lenp) \ + Perl_utf8_to_uvchr_buf(aTHX_ (const U8 *) (s), (const U8 *) e, lenp) + +#define Perl_utf8n_to_uvchr(s, len, lenp, flags) \ + Perl_utf8n_to_uvchr_error(s, len, lenp, flags, 0) +#define Perl_utf8n_to_uvchr_error(s, len, lenp, flags, errors) \ + Perl_utf8n_to_uvchr_msgs(s, len, lenp, flags, errors, 0) + +#define Perl_utf8_to_uv( s, e, cp_p, advance_p) \ + Perl_utf8_to_uv_flags( s, e, cp_p, advance_p, 0) +#define Perl_utf8_to_uv_flags( s, e, cp_p, advance_p, flags) \ + Perl_utf8_to_uv_errors( s, e, cp_p, advance_p, flags, 0) +#define Perl_utf8_to_uv_errors( s, e, cp_p, advance_p, flags, errors) \ + Perl_utf8_to_uv_msgs( s, e, cp_p, advance_p, flags, errors, 0) +#define Perl_extended_utf8_to_uv(s, e, cp_p, advance_p) \ + Perl_utf8_to_uv(s, e, cp_p, advance_p) +#define Perl_strict_utf8_to_uv( s, e, cp_p, advance_p) \ + Perl_utf8_to_uv_flags( s, e, cp_p, advance_p, \ + UTF8_DISALLOW_ILLEGAL_INTERCHANGE) +#define Perl_c9strict_utf8_to_uv(s, e, cp_p, advance_p) \ + Perl_utf8_to_uv_flags( s, e, cp_p, advance_p, \ + UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) #define utf16_to_utf8(p, d, bytelen, newlen) \ utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1)