Skip to content

Commit

Permalink
Share empty Text values
Browse files Browse the repository at this point in the history
Try to use the same heap object to represent all empty Text values.
There are already attempts to enforce something like this through the
`text` smart constructor, and in various functions by special casing the
empty case.

This patch expands on these attempts and adds some tests to ensure that
empty Text values produced by this library are represented by the same
heap object.

Despite these efforts, we cannot guarantee that this will be the case in
all situations and users of the library shouldn't rely on this behaviour
for the correctness of their code.

Resolves #492.
  • Loading branch information
TeofilC authored and Bodigrim committed Jan 7, 2024
1 parent 0a72ea0 commit e84c7a3
Show file tree
Hide file tree
Showing 11 changed files with 494 additions and 218 deletions.
251 changes: 58 additions & 193 deletions src/Data/Text.hs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/Data/Text/Foreign.hs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ newtype I8 = I8 Int
fromPtr :: Ptr Word8 -- ^ source array
-> I8 -- ^ length of source array (in 'Word8' units)
-> IO Text
fromPtr _ (I8 0) = pure empty
fromPtr ptr (I8 len) = unsafeSTToIO $ do
dst <- A.new len
A.copyFromPointer dst 0 ptr len
Expand Down
12 changes: 4 additions & 8 deletions src/Data/Text/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ module Data.Text.Internal
, safe
-- * Code that must be here for accessibility
, empty
, empty_
, append
-- * Utilities
, firstf
Expand Down Expand Up @@ -94,12 +93,7 @@ text_ arr off len =
-- | /O(1)/ The empty 'Text'.
empty :: Text
empty = Text A.empty 0 0
{-# INLINE [1] empty #-}

-- | A non-inlined version of 'empty'.
empty_ :: Text
empty_ = Text A.empty 0 0
{-# NOINLINE empty_ #-}
{-# NOINLINE empty #-}

-- | /O(n)/ Appends one 'Text' to the other by copying both of them
-- into a new 'Text'.
Expand All @@ -121,6 +115,7 @@ append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2)

-- | Construct a 'Text' without invisibly pinning its byte array in
-- memory if its length has dwindled to zero.
-- It ensures that empty 'Text' values are shared.
text ::
#if defined(ASSERTS)
HasCallStack =>
Expand All @@ -131,7 +126,7 @@ text ::
-> Text
text arr off len | len == 0 = empty
| otherwise = text_ arr off len
{-# INLINE text #-}
{-# INLINE [0] text #-}

textP :: A.Array -> Int -> Int -> Text
{-# DEPRECATED textP "Use text instead" #-}
Expand Down Expand Up @@ -251,6 +246,7 @@ int64ToInt32 = fromIntegral
-- >>> Data.Text.unpack (pack "\55555")
-- "\65533"
pack :: String -> Text
pack [] = empty
pack xs = runST $ do
-- It's tempting to allocate a buffer of 4 * length xs bytes,
-- but not only it's wasteful for predominantly ASCII arguments,
Expand Down
14 changes: 11 additions & 3 deletions src/Data/Text/Internal/Lazy.hs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ import Data.Typeable (Typeable)
import Foreign.Storable (sizeOf)
import qualified Data.Text.Array as A
import qualified Data.Text.Internal as T
import qualified Data.Text as T

data Text = Empty
| Chunk {-# UNPACK #-} !T.Text Text
Expand Down Expand Up @@ -86,9 +87,16 @@ showStructure (Chunk t ts) =

-- | Smart constructor for 'Chunk'. Guarantees the data type invariant.
chunk :: T.Text -> Text -> Text
{-# INLINE chunk #-}
chunk t@(T.Text _ _ len) ts | len == 0 = ts
| otherwise = Chunk t ts
{-# INLINE [0] chunk #-}
chunk t ts | T.null t = ts
| otherwise = Chunk t ts

{-# RULES
"TEXT chunk/text" forall arr off len.
chunk (T.text arr off len) = chunk (T.Text arr off len)
"TEXT chunk/empty" forall ts.
chunk T.empty ts = ts
#-}

-- | Smart constructor for 'Empty'.
empty :: Text
Expand Down
18 changes: 13 additions & 5 deletions src/Data/Text/Internal/Reverse.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
{-# OPTIONS_HADDOCK not-home #-}

-- | Implements 'reverse', using efficient C routines by default.
module Data.Text.Internal.Reverse (reverse) where
module Data.Text.Internal.Reverse (reverse, reverseNonEmpty) where

#if !defined(PURE_HASKELL)
import GHC.Exts as Exts
Expand All @@ -23,7 +23,7 @@ import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader)
import GHC.Stack (HasCallStack)
#endif
import Prelude hiding (reverse)
import Data.Text.Internal (Text(..))
import Data.Text.Internal (Text(..), empty)
import Control.Monad.ST (runST)
import qualified Data.Text.Array as A

Expand All @@ -39,8 +39,16 @@ reverse ::
HasCallStack =>
#endif
Text -> Text
reverse (Text _ _ 0) = empty
reverse t = reverseNonEmpty t
{-# INLINE reverse #-}

-- | /O(n)/ Reverse the characters of a string.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
reverseNonEmpty ::
Text -> Text
#if defined(PURE_HASKELL)
reverse (Text src off len) = runST $ do
reverseNonEmtpy (Text src off len) = runST $ do
dest <- A.new len
_ <- reversePoints src off dest len
result <- A.unsafeFreeze dest
Expand Down Expand Up @@ -80,13 +88,13 @@ reversePoints src xx dest yy = go xx yy where
A.copyI pLen dest yNext src x
go (x + pLen) yNext
#else
reverse (Text (A.ByteArray ba) off len) = runST $ do
reverseNonEmpty (Text (A.ByteArray ba) off len) = runST $ do
marr@(A.MutableByteArray mba) <- A.new len
unsafeIOToST $ c_reverse mba ba (fromIntegral off) (fromIntegral len)
brr <- A.unsafeFreeze marr
return $ Text brr 0 len
#endif
{-# INLINE reverse #-}
{-# INLINE reverseNonEmpty #-}

#if !defined(PURE_HASKELL)
-- | The input buffer (src :: ByteArray#, off :: CSize, len :: CSize)
Expand Down
Loading

0 comments on commit e84c7a3

Please sign in to comment.