From 1d26a61beddff8f486f640f882f90674dea87429 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 17 Aug 2024 20:30:21 +0100 Subject: [PATCH] Implement Data.Text.toTitle directly, without streaming --- src/Data/Text.hs | 6 +- src/Data/Text/Internal/Transformation.hs | 152 +++++++++++++++++++---- 2 files changed, 130 insertions(+), 28 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 790e9d1a..3a37db91 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -255,7 +255,7 @@ import qualified Prelude as P import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter, reverseIter_, unsafeHead, unsafeTail, iterArray, reverseIterArray) import Data.Text.Internal.Search (indices) -import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, filter_) +import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, toTitleNonEmpty, filter_) #if defined(__HADDOCK__) import Data.ByteString (ByteString) import qualified Data.Text.Lazy as L @@ -900,7 +900,9 @@ toUpper = \t -> -- -- @since 1.0.0.0 toTitle :: Text -> Text -toTitle t = unstream (S.toTitle (stream t)) +toTitle = \t -> + if null t then empty + else toTitleNonEmpty t {-# INLINE toTitle #-} -- | /O(n)/ Left-justify a string to the given length, using the diff --git a/src/Data/Text/Internal/Transformation.hs b/src/Data/Text/Internal/Transformation.hs index c9f14c48..e67abd9e 100644 --- a/src/Data/Text/Internal/Transformation.hs +++ b/src/Data/Text/Internal/Transformation.hs @@ -1,8 +1,11 @@ -{-# LANGUAGE BangPatterns, CPP, MagicHash #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE PartialTypeSignatures #-} +{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE Trustworthy #-} +{-# LANGUAGE UnboxedTuples #-} {-# LANGUAGE UnliftedFFITypes #-} -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE PartialTypeSignatures #-} {-# OPTIONS_GHC -fno-warn-orphans #-} {-# OPTIONS_GHC -Wno-partial-type-signatures #-} @@ -25,24 +28,26 @@ module Data.Text.Internal.Transformation , toCaseFoldNonEmpty , toLowerNonEmpty , toUpperNonEmpty + , toTitleNonEmpty , filter_ ) where import Prelude (Char, Bool(..), Int, Ord(..), Monad(..), pure, - (+), (-), ($), - not, return, otherwise) + (+), (-), ($), (&&), (||), (==), + not, return, otherwise, fromIntegral, (/=), const) import Data.Bits ((.&.), shiftR, shiftL) +import Data.Char (isLetter, isSpace, ord) import Control.Monad.ST (ST, runST) import qualified Data.Text.Array as A import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader, chr2, chr3, chr4) -import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping) +import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping, titleMapping) import Data.Text.Internal (Text(..), safe) import Data.Text.Internal.Unsafe.Char (unsafeWrite, unsafeChr8) import qualified Prelude as P import Data.Text.Unsafe (Iter(..), iterArray) -import Data.Word (Word8) +import Data.Word (Word8, Word) import qualified GHC.Exts as Exts import GHC.Int (Int64(..)) @@ -113,7 +118,7 @@ caseConvert ascii remap (Text src o l) = runST $ do A.unsafeWrite dst dstOff m0 A.unsafeWrite dst (dstOff + 1) m1 pure $ dstOff + 2 - i -> writeMapping i dstOff + i -> writeMapping dst i dstOff inner (srcOff + 2) dstOff' 3 -> do let !(Exts.C# c) = chr3 m0 m1 m2 @@ -123,7 +128,7 @@ caseConvert ascii remap (Text src o l) = runST $ do A.unsafeWrite dst (dstOff + 1) m1 A.unsafeWrite dst (dstOff + 2) m2 pure $ dstOff + 3 - i -> writeMapping i dstOff + i -> writeMapping dst i dstOff inner (srcOff + 3) dstOff' _ -> do let !(Exts.C# c) = chr4 m0 m1 m2 m3 @@ -134,45 +139,140 @@ caseConvert ascii remap (Text src o l) = runST $ do A.unsafeWrite dst (dstOff + 2) m2 A.unsafeWrite dst (dstOff + 3) m3 pure $ dstOff + 4 - i -> writeMapping i dstOff + i -> writeMapping dst i dstOff inner (srcOff + 4) dstOff' - writeMapping :: Int64 -> Int -> ST s Int - writeMapping 0 dstOff = pure dstOff - writeMapping i dstOff = do - let (ch, j) = chopOffChar i - d <- unsafeWrite dst dstOff ch - writeMapping j (dstOff + d) - - chopOffChar :: Int64 -> (Char, Int64) - chopOffChar ab = (chr a, ab `shiftR` 21) - where - chr (Exts.I# n) = Exts.C# (Exts.chr# n) - mask = (1 `shiftL` 21) - 1 - a = P.fromIntegral $ ab .&. mask {-# INLINE caseConvert #-} +writeMapping :: A.MArray s -> Int64 -> Int -> ST s Int +writeMapping !_ 0 !dstOff = pure dstOff +writeMapping dst i dstOff = do + let (ch, j) = chopOffChar i + d <- unsafeWrite dst dstOff ch + writeMapping dst j (dstOff + d) + +chopOffChar :: Int64 -> (Char, Int64) +chopOffChar ab = (chr a, ab `shiftR` 21) + where + chr (Exts.I# n) = Exts.C# (Exts.chr# n) + mask = (1 `shiftL` 21) - 1 + a = P.fromIntegral $ ab .&. mask -- | /O(n)/ Convert a string to folded case. -- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty. toCaseFoldNonEmpty :: Text -> Text -toCaseFoldNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) foldMapping xs +toCaseFoldNonEmpty = \xs -> caseConvert asciiToLower foldMapping xs {-# INLINE toCaseFoldNonEmpty #-} -- | /O(n)/ Convert a string to lower case, using simple case -- conversion. -- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty. toLowerNonEmpty :: Text -> Text -toLowerNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) lowerMapping xs +toLowerNonEmpty = \xs -> caseConvert asciiToLower lowerMapping xs {-# INLINE toLowerNonEmpty #-} -- | /O(n)/ Convert a string to upper case, using simple case -- conversion. -- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty. toUpperNonEmpty :: Text -> Text -toUpperNonEmpty = \xs -> caseConvert (\w -> if w - 97 <= 25 then w - 32 else w) upperMapping xs +toUpperNonEmpty = \xs -> caseConvert asciiToUpper upperMapping xs {-# INLINE toUpperNonEmpty #-} +asciiToLower :: Word8 -> Word8 +asciiToLower w = if w - 65 <= 25 then w + 32 else w + +asciiToUpper :: Word8 -> Word8 +asciiToUpper w = if w - 97 <= 25 then w - 32 else w + +isAsciiLetter :: Word8 -> Bool +isAsciiLetter w = w - 65 <= 25 || w - 97 <= 25 + +isAsciiSpace :: Word8 -> Bool +isAsciiSpace w = w .&. 0x50 == 0 && w < 0x80 && (w == 0x20 || w - 0x09 < 5) + +-- | /O(n)/ Convert a string to title case, see 'Data.Text.toTitle' for discussion. +-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty. +toTitleNonEmpty :: Text -> Text +toTitleNonEmpty (Text src o l) = runST $ do + -- Case conversion a single code point may produce up to 3 code-points, + -- each up to 4 bytes, so 12 in total. + dst <- A.new (l + 12) + outer dst l o 0 False + where + outer :: forall s. A.MArray s -> Int -> Int -> Int -> Bool -> ST s Text + outer !dst !dstLen = inner + where + inner !srcOff !dstOff !mode + | srcOff >= o + l = do + A.shrinkM dst dstOff + arr <- A.unsafeFreeze dst + return (Text arr 0 dstOff) + | dstOff + 12 > dstLen = do + -- Ensure to extend the buffer by at least 12 bytes. + let !dstLen' = dstLen + max 12 (l + o - srcOff) + dst' <- A.resizeM dst dstLen' + outer dst' dstLen' srcOff dstOff mode + -- If a character is to remain unchanged, no need to decode Char back into UTF8, + -- just copy bytes from input. + | otherwise = do + let m0 = A.unsafeIndex src srcOff + m1 = A.unsafeIndex src (srcOff + 1) + m2 = A.unsafeIndex src (srcOff + 2) + m3 = A.unsafeIndex src (srcOff + 3) + !d = utf8LengthByLeader m0 + + case d of + 1 -> do + let (mode', m0') = asciiAdvance mode m0 + A.unsafeWrite dst dstOff m0' + inner (srcOff + 1) (dstOff + 1) mode' + 2 -> do + let !(Exts.C# c) = chr2 m0 m1 + !(# mode', c' #) = advance (\_ -> m0 == 0xC2 && m1 == 0xA0) mode c + dstOff' <- case I64# c' of + 0 -> do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + pure $ dstOff + 2 + i -> writeMapping dst i dstOff + inner (srcOff + 2) dstOff' mode' + 3 -> do + let !(Exts.C# c) = chr3 m0 m1 m2 + isSpace3 ch + = m0 == 0xE1 && m1 == 0x9A && m2 == 0x80 + || m0 == 0xE2 && (m1 == 0x80 && isSpace (Exts.C# ch) || m1 == 0x81 && m2 == 0x9F) + || m0 == 0xE3 && m1 == 0x80 && m2 == 0x80 + !(# mode', c' #) = advance isSpace3 mode c + dstOff' <- case I64# c' of + 0 -> do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + A.unsafeWrite dst (dstOff + 2) m2 + pure $ dstOff + 3 + i -> writeMapping dst i dstOff + inner (srcOff + 3) dstOff' mode' + _ -> do + let !(Exts.C# c) = chr4 m0 m1 m2 m3 + !(# mode', c' #) = advance (\_ -> False) mode c + dstOff' <- case I64# c' of + 0 -> do + A.unsafeWrite dst dstOff m0 + A.unsafeWrite dst (dstOff + 1) m1 + A.unsafeWrite dst (dstOff + 2) m2 + A.unsafeWrite dst (dstOff + 3) m3 + pure $ dstOff + 4 + i -> writeMapping dst i dstOff + inner (srcOff + 4) dstOff' mode' + + asciiAdvance :: Bool -> Word8 -> (Bool, Word8) + asciiAdvance False w = (isAsciiLetter w, asciiToUpper w) + asciiAdvance True w = (not (isAsciiSpace w), asciiToLower w) + + advance :: (Exts.Char# -> Bool) -> Bool -> Exts.Char# -> (# Bool, _ {- unboxed Int64 -} #) + advance _ False c = (# isLetter (Exts.C# c), titleMapping c #) + advance isSpaceChar True c = (# not (isSpaceChar c), lowerMapping c #) + {-# INLINE advance #-} + -- | /O(n)/ 'filter_', applied to a continuation, a predicate and a @Text@, -- calls the continuation with the @Text@ containing only the characters satisfying the predicate. filter_ :: forall a. (A.Array -> Int -> Int -> a) -> (Char -> Bool) -> Text -> a