diff --git a/embed.fnc b/embed.fnc index fdb889827a4f..f9cbcbd7cbe4 100644 --- a/embed.fnc +++ b/embed.fnc @@ -3809,6 +3809,17 @@ Cp |U8 * |uvoffuni_to_utf8_flags_msgs \ |UV input_uv \ |const UV flags \ |NULLOK HV **msgs + +Admp |U8 * |uv_to_utf8 |NN U8 *d \ + |UV uv +Admp |U8 * |uv_to_utf8_flags \ + |NN U8 *d \ + |UV uv \ + |UV flags +Admp |U8 * |uv_to_utf8_msgs|NN U8 *d \ + |UV uv \ + |UV flags \ + |NULLOK HV **msgs CDbp |U8 * |uvuni_to_utf8 |NN U8 *d \ |UV uv EXdpx |bool |validate_proto |NN SV *name \ diff --git a/embed.h b/embed.h index 560ecde2c484..503eb22cccda 100644 --- a/embed.h +++ b/embed.h @@ -873,6 +873,9 @@ # define utf8n_to_uvchr Perl_utf8n_to_uvchr # define utf8n_to_uvchr_error Perl_utf8n_to_uvchr_error # define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs +# define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX,a,b) +# define uv_to_utf8_flags(a,b,c) Perl_uv_to_utf8_flags(aTHX,a,b,c) +# define uv_to_utf8_msgs(a,b,c,d) Perl_uv_to_utf8_msgs(aTHX,a,b,c,d) # define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX,a,b) # define uvchr_to_utf8_flags(a,b,c) Perl_uvchr_to_utf8_flags(aTHX,a,b,c) # define uvchr_to_utf8_flags_msgs(a,b,c,d) Perl_uvchr_to_utf8_flags_msgs(aTHX,a,b,c,d) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index bd202557c46e..1dcc264c902e 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -414,6 +414,11 @@ L> replaces L> (which is retained for backwards compatibility), but you should convert to use the new form, as likely you aren't using the old one safely. +To convert in the opposite direction, you can now use +L<perlapi/C<uv_to_utf8>>. This is not a new function, but a new synonym +for L<perlapi/C<uvchr_to_utf8>>. It is added so you don't have to learn +two sets of names. + There are also two new functions, L> and L> which do the same thing except when the input string represents a code point that Unicode doesn't accept as @@ -440,6 +445,12 @@ L> replaces L>. L> replaces L>. 
+Also added are the inverse functions L<perlapi/C<uv_to_utf8_flags>> +and L<perlapi/C<uv_to_utf8_msgs>>, which are synonyms for the existing +functions, L<perlapi/C<uvchr_to_utf8_flags>> and +L<perlapi/C<uvchr_to_utf8_flags_msgs>> respectively. These are provided only +so you don't have to learn two sets of names. + =item * Three new API functions are introduced to convert strings encoded in diff --git a/proto.h b/proto.h index 7db33e9fd7d2..9805894847a2 100644 --- a/proto.h +++ b/proto.h @@ -5394,6 +5394,15 @@ Perl_utilize(pTHX_ int aver, I32 floor, OP *version, OP *idop, OP *arg) #define PERL_ARGS_ASSERT_UTILIZE \ assert(idop) +/* PERL_CALLCONV U8 * +Perl_uv_to_utf8(pTHX_ U8 *d, UV uv); */ + +/* PERL_CALLCONV U8 * +Perl_uv_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); */ + +/* PERL_CALLCONV U8 * +Perl_uv_to_utf8_msgs(pTHX_ U8 *d, UV uv, UV flags, HV **msgs); */ + /* PERL_CALLCONV U8 * Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); */ diff --git a/utf8.c b/utf8.c index 74eea0c0606d..50c5accfa684 100644 --- a/utf8.c +++ b/utf8.c @@ -121,14 +121,14 @@ S_new_msg_hv(pTHX_ const char * const message, /* The message text */ =for apidoc uvoffuni_to_utf8_flags THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. -Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or -L<perlapi/uvchr_to_utf8_flags>>. +Instead, B<Almost all code should use L<perlapi/uv_to_utf8> or +L<perlapi/uv_to_utf8_flags>>. This function is like them, but the input is a strict Unicode (as opposed to native) code point. Only in very rare circumstances should code not be using the native code point. -For details, see the description for L<perlapi/uvchr_to_utf8_flags>. +For details, see the description for L<perlapi/uv_to_utf8_flags>. =cut */ @@ -155,9 +155,11 @@ const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode," #define MASK UTF_CONTINUATION_MASK /* -=for apidoc uvchr_to_utf8_flags_msgs +=for apidoc uv_to_utf8_msgs +=for apidoc_item uvchr_to_utf8_flags_msgs -THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. +These functions are identical. THEY SHOULD BE USED IN ONLY VERY SPECIALIZED +CIRCUMSTANCES. Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly. 
@@ -367,7 +369,9 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs) } /* -=for apidoc uvchr_to_utf8 +=for apidoc uv_to_utf8 +=for apidoc_item uv_to_utf8_flags +=for apidoc_item uvchr_to_utf8 =for apidoc_item uvchr_to_utf8_flags These each add the UTF-8 representation of the native code point C<uv> to the @@ -375,18 +379,22 @@ end of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, - d = uvchr_to_utf8(d, uv); + d = uv_to_utf8(d, uv); This is the Unicode-aware way of saying *(d++) = uv; -C<flags> is used to make some classes of code points problematic in some way. -C<uvchr_to_utf8> is effectively the same as calling C<uvchr_to_utf8_flags> +(C<uvchr_to_utf8> is a synonym for C<uv_to_utf8>.) + +C<flags> is used to make some classes of code points problematic in +some way. C<uv_to_utf8> is effectively the same as calling C<uv_to_utf8_flags> with C<flags> set to 0, meaning no class of code point is considered problematic. That means any input code point from 0..C<IV_MAX> is considered to be fine. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word. +(C<uvchr_to_utf8_flags> is a synonym for C<uv_to_utf8_flags>.) + A code point can be problematic in one of two ways. Its use could just raise a warning, and/or it could be forbidden with the function failing, and returning NULL. 
diff --git a/utf8.h b/utf8.h index fe3626cae651..18a4e6cb4b28 100644 --- a/utf8.h +++ b/utf8.h @@ -142,11 +142,11 @@ typedef enum { #define uvoffuni_to_utf8_flags(d,uv,flags) \ uvoffuni_to_utf8_flags_msgs(d, uv, flags, 0) -#define Perl_uvchr_to_utf8(mTHX, d, u) \ - Perl_uvchr_to_utf8_flags(aTHX, d, u, 0) -#define Perl_uvchr_to_utf8_flags(mTHX, d, u, f) \ - Perl_uvchr_to_utf8_flags_msgs(aTHX, d, u, f, 0) -#define Perl_uvchr_to_utf8_flags_msgs(mTHX, d, u, f , m) \ +#define Perl_uv_to_utf8(mTHX, d, u) \ + Perl_uv_to_utf8_flags(aTHX, d, u, 0) +#define Perl_uv_to_utf8_flags(mTHX, d, u, f) \ + Perl_uv_to_utf8_msgs(aTHX, d, u, f, 0) +#define Perl_uv_to_utf8_msgs(mTHX, d, u, f , m) \ Perl_uvoffuni_to_utf8_flags_msgs(aTHX_ d, NATIVE_TO_UNI(u), f, m) /* This is needed to cast the parameters for all those calls that had them @@ -173,6 +173,9 @@ typedef enum { #define Perl_c9strict_utf8_to_uv(s, e, cp_p, advance_p) \ Perl_utf8_to_uv_flags( s, e, cp_p, advance_p, \ UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) +#define Perl_uvchr_to_utf8 Perl_uv_to_utf8 +#define Perl_uvchr_to_utf8_flags Perl_uv_to_utf8_flags +#define Perl_uvchr_to_utf8_flags_msgs Perl_uv_to_utf8_msgs #define utf16_to_utf8(p, d, bytelen, newlen) \ utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1)