Skip to content

Commit

Permalink
Implement Data.Text.toTitle directly, without streaming
Browse files Browse the repository at this point in the history
  • Loading branch information
Bodigrim committed Aug 17, 2024
1 parent 295175b commit d289882
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 26 deletions.
6 changes: 4 additions & 2 deletions src/Data/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ import qualified Prelude as P
import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter,
reverseIter_, unsafeHead, unsafeTail, iterArray, reverseIterArray)
import Data.Text.Internal.Search (indices)
import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, filter_)
import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, toTitleNonEmpty, filter_)
#if defined(__HADDOCK__)
import Data.ByteString (ByteString)
import qualified Data.Text.Lazy as L
Expand Down Expand Up @@ -900,7 +900,9 @@ toUpper = \t ->
--
-- @since 1.0.0.0
toTitle :: Text -> Text
toTitle t = unstream (S.toTitle (stream t))
toTitle = \t ->
if null t then empty
else toTitleNonEmpty t
{-# INLINE toTitle #-}

-- | /O(n)/ Left-justify a string to the given length, using the
Expand Down
143 changes: 119 additions & 24 deletions src/Data/Text/Internal/Transformation.hs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
{-# LANGUAGE BangPatterns, CPP, MagicHash #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE PartialTypeSignatures #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE UnboxedTuples #-}
{-# LANGUAGE UnliftedFFITypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE PartialTypeSignatures #-}

{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# OPTIONS_GHC -Wno-partial-type-signatures #-}
Expand All @@ -25,19 +28,21 @@ module Data.Text.Internal.Transformation
, toCaseFoldNonEmpty
, toLowerNonEmpty
, toUpperNonEmpty
, toTitleNonEmpty
, filter_
) where

import Prelude (Char, Bool(..), Int,
Ord(..),
Monad(..), pure,
(+), (-), ($),
(+), (-), ($), (&&), (||), (==),
not, return, otherwise)
import Data.Bits ((.&.), shiftR, shiftL)
import Data.Char (isLetter, isSpace)
import Control.Monad.ST (ST, runST)
import qualified Data.Text.Array as A
import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader, chr2, chr3, chr4)
import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping)
import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping, titleMapping)
import Data.Text.Internal (Text(..), safe)
import Data.Text.Internal.Unsafe.Char (unsafeWrite, unsafeChr8)
import qualified Prelude as P
Expand Down Expand Up @@ -113,7 +118,7 @@ caseConvert ascii remap (Text src o l) = runST $ do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
pure $ dstOff + 2
i -> writeMapping i dstOff
i -> writeMapping dst i dstOff
inner (srcOff + 2) dstOff'
3 -> do
let !(Exts.C# c) = chr3 m0 m1 m2
Expand All @@ -123,7 +128,7 @@ caseConvert ascii remap (Text src o l) = runST $ do
A.unsafeWrite dst (dstOff + 1) m1
A.unsafeWrite dst (dstOff + 2) m2
pure $ dstOff + 3
i -> writeMapping i dstOff
i -> writeMapping dst i dstOff
inner (srcOff + 3) dstOff'
_ -> do
let !(Exts.C# c) = chr4 m0 m1 m2 m3
Expand All @@ -134,45 +139,135 @@ caseConvert ascii remap (Text src o l) = runST $ do
A.unsafeWrite dst (dstOff + 2) m2
A.unsafeWrite dst (dstOff + 3) m3
pure $ dstOff + 4
i -> writeMapping i dstOff
i -> writeMapping dst i dstOff
inner (srcOff + 4) dstOff'

writeMapping :: Int64 -> Int -> ST s Int
writeMapping 0 dstOff = pure dstOff
writeMapping i dstOff = do
let (ch, j) = chopOffChar i
d <- unsafeWrite dst dstOff ch
writeMapping j (dstOff + d)

chopOffChar :: Int64 -> (Char, Int64)
chopOffChar ab = (chr a, ab `shiftR` 21)
where
chr (Exts.I# n) = Exts.C# (Exts.chr# n)
mask = (1 `shiftL` 21) - 1
a = P.fromIntegral $ ab .&. mask
{-# INLINE caseConvert #-}

writeMapping :: A.MArray s -> Int64 -> Int -> ST s Int
writeMapping !_ 0 !dstOff = pure dstOff
writeMapping dst i dstOff = do
let (ch, j) = chopOffChar i
d <- unsafeWrite dst dstOff ch
writeMapping dst j (dstOff + d)

chopOffChar :: Int64 -> (Char, Int64)
chopOffChar ab = (chr a, ab `shiftR` 21)
where
chr (Exts.I# n) = Exts.C# (Exts.chr# n)
mask = (1 `shiftL` 21) - 1
a = P.fromIntegral $ ab .&. mask

-- | /O(n)/ Convert a string to folded case.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toCaseFoldNonEmpty :: Text -> Text
toCaseFoldNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) foldMapping xs
toCaseFoldNonEmpty = \xs -> caseConvert asciiToLower foldMapping xs
{-# INLINE toCaseFoldNonEmpty #-}

-- | /O(n)/ Convert a string to lower case, using simple case
-- conversion.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toLowerNonEmpty :: Text -> Text
toLowerNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) lowerMapping xs
toLowerNonEmpty = \xs -> caseConvert asciiToLower lowerMapping xs
{-# INLINE toLowerNonEmpty #-}

-- | /O(n)/ Convert a string to upper case, using simple case
-- conversion.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toUpperNonEmpty :: Text -> Text
toUpperNonEmpty = \xs -> caseConvert (\w -> if w - 97 <= 25 then w - 32 else w) upperMapping xs
toUpperNonEmpty = \xs -> caseConvert asciiToUpper upperMapping xs
{-# INLINE toUpperNonEmpty #-}

asciiToLower :: Word8 -> Word8
asciiToLower w = if w - 65 <= 25 then w + 32 else w

asciiToUpper :: Word8 -> Word8
asciiToUpper w = if w - 97 <= 25 then w - 32 else w

isAsciiLetter :: Word8 -> Bool
isAsciiLetter w = w - 65 <= 25 || w - 97 <= 25

isAsciiSpace :: Word8 -> Bool
isAsciiSpace w = w .&. 0x50 == 0 && w < 0x80 && (w == 0x20 || w - 0x09 < 5)

-- | /O(n)/ Convert a string to title case, see 'Data.Text.toTitle' for discussion.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toTitleNonEmpty :: Text -> Text
toTitleNonEmpty (Text src o l) = runST $ do
-- Case conversion a single code point may produce up to 3 code-points,
-- each up to 4 bytes, so 12 in total.
dst <- A.new (l + 12)
outer dst l o 0 False
where
outer :: forall s. A.MArray s -> Int -> Int -> Int -> Bool -> ST s Text
outer !dst !dstLen = inner
where
inner !srcOff !dstOff !mode
| srcOff >= o + l = do
A.shrinkM dst dstOff
arr <- A.unsafeFreeze dst
return (Text arr 0 dstOff)
| dstOff + 12 > dstLen = do
-- Ensure to extend the buffer by at least 12 bytes.
let !dstLen' = dstLen + max 12 (l + o - srcOff)
dst' <- A.resizeM dst dstLen'
outer dst' dstLen' srcOff dstOff mode
-- If a character is to remain unchanged, no need to decode Char back into UTF8,
-- just copy bytes from input.
| otherwise = do
let m0 = A.unsafeIndex src srcOff
m1 = A.unsafeIndex src (srcOff + 1)
m2 = A.unsafeIndex src (srcOff + 2)
m3 = A.unsafeIndex src (srcOff + 3)
!d = utf8LengthByLeader m0

case d of
1 -> do
let (mode', m0') = asciiAdvance mode m0
A.unsafeWrite dst dstOff m0'
inner (srcOff + 1) (dstOff + 1) mode'
2 -> do
let !(Exts.C# c) = chr2 m0 m1
!(# mode', c' #) = advance mode c
dstOff' <- case I64# c' of
0 -> do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
pure $ dstOff + 2
i -> writeMapping dst i dstOff
inner (srcOff + 2) dstOff' mode'
3 -> do
let !(Exts.C# c) = chr3 m0 m1 m2
!(# mode', c' #) = advance mode c
dstOff' <- case I64# c' of
0 -> do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
A.unsafeWrite dst (dstOff + 2) m2
pure $ dstOff + 3
i -> writeMapping dst i dstOff
inner (srcOff + 3) dstOff' mode'
_ -> do
let !(Exts.C# c) = chr4 m0 m1 m2 m3
!(# mode', c' #) = advance mode c
dstOff' <- case I64# c' of
0 -> do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
A.unsafeWrite dst (dstOff + 2) m2
A.unsafeWrite dst (dstOff + 3) m3
pure $ dstOff + 4
i -> writeMapping dst i dstOff
inner (srcOff + 4) dstOff' mode'

asciiAdvance :: Bool -> Word8 -> (Bool, Word8)
asciiAdvance False w = (isAsciiLetter w, asciiToUpper w)
asciiAdvance True w = (not (isAsciiSpace w), asciiToLower w)

advance :: Bool -> Exts.Char# -> (# Bool, _ {- unboxed Int64 -} #)
advance False c = (# isLetter (Exts.C# c), titleMapping c #)
advance True c = (# not (isSpace (Exts.C# c)), lowerMapping c #)

-- | /O(n)/ 'filter_', applied to a continuation, a predicate and a @Text@,
-- calls the continuation with the @Text@ containing only the characters satisfying the predicate.
filter_ :: forall a. (A.Array -> Int -> Int -> a) -> (Char -> Bool) -> Text -> a
Expand Down

0 comments on commit d289882

Please sign in to comment.