Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new API function utf8_to_uv() #22541

Open
wants to merge 14 commits into
base: blead
Choose a base branch
from
Open
65 changes: 50 additions & 15 deletions embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,11 @@ CTp |Signal_t|csighandler1 |int sig
CTp |Signal_t|csighandler3 |int sig \
|NULLOK Siginfo_t *info \
|NULLOK void *uap
ATdmp |bool |c9strict_utf8_to_uv \
|NN const U8 * const s \
|NN const U8 * const e \
|NN UV *cp_p \
|NULLOK Size_t *advance_p
EXp |regexp_engine const *|current_re_engine
RXp |XOPRETANY|custom_op_get_field \
|NN const OP *o \
Expand Down Expand Up @@ -1169,6 +1174,11 @@ AOdp |SV * |eval_pv |NN const char *p \
|I32 croak_on_error
AOdp |SSize_t|eval_sv |NN SV *sv \
|I32 flags
ATdmp |bool |extended_utf8_to_uv \
|NN const U8 * const s \
|NN const U8 * const e \
|NN UV *cp_p \
|NULLOK Size_t *advance_p
Adfpv |void |fatal_warner |U32 err \
|NN const char *pat \
|...
Expand Down Expand Up @@ -3065,6 +3075,11 @@ dopx |PerlIO *|start_glob |NN SV *tmpglob \
|NN IO *io
Adp |I32 |start_subparse |I32 is_format \
|U32 flags
ATdmp |bool |strict_utf8_to_uv \
|NN const U8 * const s \
|NN const U8 * const e \
|NN UV *cp_p \
|NULLOK Size_t *advance_p
CRp |NV |str_to_version |NN SV *sv
: Used in pp_ctl.c
p |void |sub_crush_depth|NN CV *cv
Expand Down Expand Up @@ -3667,12 +3682,11 @@ ARTdmp |U8 * |utf8_hop_safe |NN const U8 *s \
|NN const U8 * const end
ARdp |STRLEN |utf8_length |NN const U8 *s0 \
|NN const U8 *e

AMTdp |UV |utf8n_to_uvchr |NN const U8 *s \
ATdmp |UV |utf8n_to_uvchr |NN const U8 *s \
|STRLEN curlen \
|NULLOK STRLEN *retlen \
|const U32 flags
AMTdp |UV |utf8n_to_uvchr_error \
ATdmp |UV |utf8n_to_uvchr_error \
|NN const U8 *s \
|STRLEN curlen \
|NULLOK STRLEN *retlen \
Expand All @@ -3685,13 +3699,6 @@ ATdip |UV |utf8n_to_uvchr_msgs \
|const U32 flags \
|NULLOK U32 *errors \
|NULLOK AV **msgs
CTp |UV |_utf8n_to_uvchr_msgs_helper \
|NN const U8 *s \
|STRLEN curlen \
|NULLOK STRLEN *retlen \
|const U32 flags \
|NULLOK U32 *errors \
|NULLOK AV **msgs
CDbdp |UV |utf8n_to_uvuni |NN const U8 *s \
|STRLEN curlen \
|NULLOK STRLEN *retlen \
Expand Down Expand Up @@ -3735,16 +3742,44 @@ EMXp |U8 * |utf16_to_utf8_reversed \
|NN U8 *d \
|Size_t bytelen \
|NN Size_t *newlen
ATdmp |bool |utf8_to_uv |NN const U8 * const s \
|NN const U8 * const e \
|NN UV *cp_p \
|NULLOK Size_t *advance_p
ADbdp |UV |utf8_to_uvchr |NN const U8 *s \
|NULLOK STRLEN *retlen
AMdp |UV |utf8_to_uvchr_buf \
|NN const U8 *s \
|NN const U8 *send \
|NULLOK STRLEN *retlen
Cip |UV |utf8_to_uvchr_buf_helper \
AMdip |UV |utf8_to_uvchr_buf \
|NN const U8 *s \
|NN const U8 *send \
|NULLOK STRLEN *retlen
ATdmp |bool |utf8_to_uv_errors \
|NN const U8 * const s \
|NN const U8 * const e \
|NN UV *cp_p \
|NULLOK Size_t *advance_p \
|const U32 flags \
|NULLOK U32 *errors
ATdmp |bool |utf8_to_uv_flags \
|NN const U8 * const s \
|NN const U8 * const e \
|NN UV *cp_p \
|NULLOK Size_t *advance_p \
|const U32 flag
ATdip |bool |utf8_to_uv_msgs|NN const U8 * const s0 \
|NN const U8 *e \
|NN UV *cp_p \
|NULLOK Size_t *advance_p \
|const U32 flags \
|NULLOK U32 *errors \
|NULLOK AV **msgs
CTp |bool |utf8_to_uv_msgs_helper_ \
|NN const U8 * const s0 \
|NN const U8 * const e \
|NN UV *cp_p \
|NULLOK Size_t *advance_p \
|const U32 flags \
|NULLOK U32 *errors \
|NULLOK AV **msgs
CDbdp |UV |utf8_to_uvuni |NN const U8 *s \
|NULLOK STRLEN *retlen
: Used in perly.y
Expand Down
12 changes: 10 additions & 2 deletions embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@
# define _to_utf8_lower_flags(a,b,c,d,e) Perl__to_utf8_lower_flags(aTHX_ a,b,c,d,e)
# define _to_utf8_title_flags(a,b,c,d,e) Perl__to_utf8_title_flags(aTHX_ a,b,c,d,e)
# define _to_utf8_upper_flags(a,b,c,d,e) Perl__to_utf8_upper_flags(aTHX_ a,b,c,d,e)
# define _utf8n_to_uvchr_msgs_helper Perl__utf8n_to_uvchr_msgs_helper
# define amagic_call(a,b,c,d) Perl_amagic_call(aTHX_ a,b,c,d)
# define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b)
# define apply_attrs_string(a,b,c,d) Perl_apply_attrs_string(aTHX_ a,b,c,d)
Expand Down Expand Up @@ -158,6 +157,7 @@
# define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
# define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c)
# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
# define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv
# define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c)
# define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b)
# define call_list(a,b) Perl_call_list(aTHX_ a,b)
Expand Down Expand Up @@ -223,6 +223,7 @@
# define dump_vindent(a,b,c,d) Perl_dump_vindent(aTHX_ a,b,c,d)
# define eval_pv(a,b) Perl_eval_pv(aTHX_ a,b)
# define eval_sv(a,b) Perl_eval_sv(aTHX_ a,b)
# define extended_utf8_to_uv Perl_extended_utf8_to_uv
# define fbm_compile(a,b) Perl_fbm_compile(aTHX_ a,b)
# define fbm_instr(a,b,c,d) Perl_fbm_instr(aTHX_ a,b,c,d)
# define filter_add(a,b) Perl_filter_add(aTHX_ a,b)
Expand Down Expand Up @@ -676,6 +677,7 @@
# define stack_grow(a,b,c) Perl_stack_grow(aTHX_ a,b,c)
# define start_subparse(a,b) Perl_start_subparse(aTHX_ a,b)
# define str_to_version(a) Perl_str_to_version(aTHX_ a)
# define strict_utf8_to_uv Perl_strict_utf8_to_uv
# define suspend_compcv(a) Perl_suspend_compcv(aTHX_ a)
# define sv_2bool(a) Perl_sv_2bool(aTHX,a)
# define sv_2bool_flags(a,b) Perl_sv_2bool_flags(aTHX_ a,b)
Expand Down Expand Up @@ -863,7 +865,13 @@
# define utf8_to_bytes_new_pv(a,b,c) Perl_utf8_to_bytes_new_pv(aTHX,a,b,c)
# define utf8_to_bytes_overwrite(a,b) Perl_utf8_to_bytes_overwrite(aTHX,a,b)
# define utf8_to_bytes_temp_pv(a,b) Perl_utf8_to_bytes_temp_pv(aTHX,a,b)
# define utf8_to_uvchr_buf_helper(a,b,c) Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c)
# define utf8_to_uv Perl_utf8_to_uv
# define utf8_to_uv_errors Perl_utf8_to_uv_errors
# define utf8_to_uv_flags Perl_utf8_to_uv_flags
# define utf8_to_uv_msgs Perl_utf8_to_uv_msgs
# define utf8_to_uv_msgs_helper_ Perl_utf8_to_uv_msgs_helper_
# define utf8n_to_uvchr Perl_utf8n_to_uvchr
# define utf8n_to_uvchr_error Perl_utf8n_to_uvchr_error
# define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs
# define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX,a,b)
# define uvchr_to_utf8_flags(a,b,c) Perl_uvchr_to_utf8_flags(aTHX,a,b,c)
Expand Down
122 changes: 78 additions & 44 deletions inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -2053,7 +2053,7 @@ C<L</is_strict_utf8_string>> (and kin); and if C<flags> is
C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, they give the same results as
C<L</is_c9strict_utf8_string>> (and kin). Otherwise C<flags> may be any
combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
C<L</utf8n_to_uvchr>>, with the same meanings.
C<L</utf8_to_uv>>, with the same meanings.

It's better to use one of the non-C<_flags> functions if they give you the
desired strictness, as those have a better chance of being inlined by the C
Expand Down Expand Up @@ -2306,7 +2306,7 @@ as C<L</isSTRICT_UTF8_CHAR>>;
and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
understood by C<L</utf8n_to_uvchr>>, with the same meanings.
understood by C<L</utf8_to_uv>>, with the same meanings.

The three alternative macros are for the most commonly needed validations; they
are likely to run somewhat faster than this more general one, as they can be
Expand Down Expand Up @@ -2931,7 +2931,7 @@ C<is_utf8_valid_partial_char_flags> when the latter is called with a zero
C<flags> parameter. This parameter is used to restrict the classes of code
points that are considered to be valid. When zero, Perl's extended UTF-8 is
used. Otherwise C<flags> can be any combination of the C<UTF8_DISALLOW_I<foo>>
flags accepted by C<L</utf8n_to_uvchr>>. If there is any sequence of bytes
flags accepted by C<L</utf8_to_uv>>. If there is any sequence of bytes
that can complete the input partial character in such a way that a
non-prohibited character is formed, the function returns TRUE; otherwise FALSE.
Non-character code points cannot be determined based on partial character
Expand Down Expand Up @@ -3003,7 +3003,7 @@ complete code point, this will return TRUE anyway, provided that
C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.

C<flags> can be zero or any combination of the C<UTF8_DISALLOW_I<foo>> flags
accepted by C<L</utf8n_to_uvchr>>, and with the same meanings.
accepted by C<L</utf8_to_uv>>, and with the same meanings.

The functions differ from C<L</is_utf8_string_flags>> only in that the latter
returns FALSE if the final few bytes of the string don't form a complete code
Expand Down Expand Up @@ -3048,21 +3048,22 @@ Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
|| is_utf8_valid_partial_char_flags(*ep, s + len, flags);
}

PERL_STATIC_INLINE UV
Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
U32 * errors,
AV ** msgs)
PERL_STATIC_INLINE bool
Perl_utf8_to_uv_msgs(const U8 * const s0,
const U8 * const e,
UV * cp_p,
Size_t *advance_p,
const U32 flags,
U32 * errors,
AV ** msgs)
{
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS;

/* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
* simple cases, and, if necessary calls a helper function to deal with the
* more complex ones. Almost all well-formed non-problematic code points
* are considered simple, so that it's unlikely that the helper function
* will need to be called. */
/* This is the inlined portion of utf8_to_uv_msgs. It handles the simple
* cases, and, if necessary calls a helper function to deal with the more
* complex ones. Almost all well-formed non-problematic code points are
* considered simple, so that it's unlikely that the helper function will
* need to be called. */

/* Assume that isn't malformed; the vast majority of calls won't be */
if (errors) {
Expand All @@ -3075,25 +3076,25 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0,

/* No calls from core pass in an empty string; non-core need a check */
#ifdef PERL_CORE
assert(curlen > 0);
assert(e > s0);
#else
if (LIKELY(curlen > 0))
if (LIKELY(e > s0))
#endif

{
/* UTF-8 invariants are returned unchanged. The code below is quite
* capable of handling this, but this shortcuts this very common case
* */
if (UTF8_IS_INVARIANT(*s0)) {
if (retlen) {
*retlen = 1;
if (advance_p) {
*advance_p = 1;
}

return *s0;
*cp_p = *s0;
return true;
}

const U8 * s = s0;
const U8 * send = s + curlen;

/* This dfa is fast. If it accepts the input, it was for a
* well-formed, non-problematic code point, which can be returned
Expand All @@ -3116,50 +3117,83 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
PERL_UINT_FAST16_T state = PL_strict_utf8_dfa_tab[256 + type];
UV uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);

while (state > 1 && ++s < send) {
while (state > 1 && ++s < e) {
type = PL_strict_utf8_dfa_tab[*s];
state = PL_strict_utf8_dfa_tab[256 + state + type];

uv = UTF8_ACCUMULATE(uv, *s);
}

if (LIKELY(state == 0)) {
if (retlen) {
*retlen = s - s0 + 1;
if (advance_p) {
*advance_p = s - s0 + 1;
}

return UNI_TO_NATIVE(uv);
*cp_p = UNI_TO_NATIVE(uv);
return true;
}
}

/* Here is potentially problematic. Use the full mechanism */
return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
errors, msgs);
return utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, flags, errors, msgs);
}

PERL_STATIC_INLINE UV
Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
U32 * errors,
AV ** msgs)
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;

assert(s < send);
UV cp;
if (LIKELY(utf8_to_uv_msgs(s0, s0 + curlen, &cp, retlen, flags, errors,
msgs)))
{
return cp;
}

if (! ckWARN_d(WARN_UTF8)) {
if ((flags & UTF8_CHECK_ONLY) && retlen) {
*retlen = ((STRLEN) -1);
}

/* EMPTY is not really allowed, and asserts on debugging builds. But
* on non-debugging we have to deal with it, and this causes it to
* return the REPLACEMENT CHARACTER, as the documentation indicates */
return utf8n_to_uvchr(s, send - s, retlen,
(UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
return 0;
}


tonycoz marked this conversation as resolved.
Show resolved Hide resolved
PERL_STATIC_INLINE UV
Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
assert(s < send);

UV cp;

/* When everything is legal, just return that; but when not:
* 1) if warnings are enabled return 0 and retlen to -1
* 2) if warnings are disabled, set 'flags' to accept any malformation,
* but that will just cause the REPLACEMENT CHARACTER to be returned,
* as the documentation indicates. EMPTY is not really allowed, and
* asserts on debugging builds. But on non-debugging we have to deal
* with it.
* This API means 0 can mean a legal NUL, or the input is malformed; and
* the caller has to know if warnings are disabled to know if it can rely on
* 'retlen'. Best to use utf8_to_uv() instead */
U32 flags = (ckWARN_d(WARN_UTF8)) ? 0 : (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY);

if ( LIKELY(utf8_to_uv_flags(s, send, &cp, retlen, flags))
|| flags)
{
return cp;
}
else {
UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
if (retlen && ret == 0 && (send <= s || *s != '\0')) {
*retlen = (STRLEN) -1;
}

return ret;
if (retlen) {
*retlen = (STRLEN) -1;
}

return 0;
}

/* ------------------------------- perl.h ----------------------------- */
Expand Down
Loading
Loading