From 3e02069076cd5c8b32ecaff37539a9526bc5ecee Mon Sep 17 00:00:00 2001 From: Bryan Richter Date: Fri, 10 Nov 2023 20:36:35 +0200 Subject: [PATCH 1/5] Move Data.Text.isAscii --- src/Data/Text.hs | 30 +---------------- src/Data/Text/Internal/IsAscii.hs | 54 +++++++++++++++++++++++++++++++ text.cabal | 1 + 3 files changed, 56 insertions(+), 29 deletions(-) create mode 100644 src/Data/Text/Internal/IsAscii.hs diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 7d2514aa..72c1b96d 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -231,6 +231,7 @@ import Data.Binary (Binary(get, put)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString(..)) +import Data.Text.Internal.IsAscii (isAscii) import Data.Text.Internal.Reverse (reverse) import Data.Text.Internal.Measure (measure_off) import Data.Text.Internal.Encoding.Utf8 (utf8Length, utf8LengthByLeader, chr3, ord2, ord3, ord4) @@ -1075,35 +1076,6 @@ minimum :: HasCallStack => Text -> Char minimum t = S.minimum (stream t) {-# INLINE minimum #-} --- | \O(n)\ Test whether 'Text' contains only ASCII code-points (i.e. only --- U+0000 through U+007F). --- --- This is a more efficient version of @'all' 'Data.Char.isAscii'@. --- --- >>> isAscii "" --- True --- --- >>> isAscii "abc\NUL" --- True --- --- >>> isAscii "abcd€" --- False --- --- prop> isAscii t == all (< '\x80') t --- --- @since 2.0.2 -isAscii :: Text -> Bool -isAscii (Text (A.ByteArray arr) off len) = - cSizeToInt (c_is_ascii_offset arr (intToCSize off) (intToCSize len)) == len -{-# INLINE isAscii #-} - -cSizeToInt :: CSize -> Int -cSizeToInt = P.fromIntegral -{-# INLINE cSizeToInt #-} - -foreign import ccall unsafe "_hs_text_is_ascii_offset" c_is_ascii_offset - :: ByteArray# -> CSize -> CSize -> CSize - -- ----------------------------------------------------------------------------- -- * Building 'Text's -- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of diff --git a/src/Data/Text/Internal/IsAscii.hs b/src/Data/Text/Internal/IsAscii.hs new file mode 100644 index 00000000..db38f476 --- /dev/null +++ b/src/Data/Text/Internal/IsAscii.hs @@ -0,0 +1,54 @@ +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE UnliftedFFITypes #-} + +{-# OPTIONS_HADDOCK not-home #-} + +-- | Implements 'isAscii', using efficient C routines by default. +module Data.Text.Internal.IsAscii where + +import Prelude (Bool(..), Int, (==)) +import qualified Prelude as P +import Data.Text.Internal (Text(..)) +import Foreign.C.Types +import qualified Data.Text.Array as A +import GHC.Base (ByteArray#) + +-- | \O(n)\ Test whether 'Text' contains only ASCII code-points (i.e. only +-- U+0000 through U+007F). +-- +-- This is a more efficient version of @'all' 'Data.Char.isAscii'@. +-- +-- >>> isAscii "" +-- True +-- +-- >>> isAscii "abc\NUL" +-- True +-- +-- >>> isAscii "abcd€" +-- False +-- +-- prop> isAscii t == all (< '\x80') t +-- +-- @since 2.0.2 +isAscii :: Text -> Bool +#if defined(PURE_HASKELL) +isAscii = P.const False +#else +cSizeToInt :: CSize -> Int +cSizeToInt = P.fromIntegral +{-# INLINE cSizeToInt #-} + +intToCSize :: Int -> CSize +intToCSize = P.fromIntegral + +isAscii (Text (A.ByteArray arr) off len) = + cSizeToInt (c_is_ascii_offset arr (intToCSize off) (intToCSize len)) == len +#endif +{-# INLINE isAscii #-} + + +#if !defined(PURE_HASKELL) +foreign import ccall unsafe "_hs_text_is_ascii_offset" c_is_ascii_offset + :: ByteArray# -> CSize -> CSize -> CSize +#endif diff --git a/text.cabal b/text.cabal index 83452d5e..1d55559f 100644 --- a/text.cabal +++ b/text.cabal @@ -209,6 +209,7 @@ library Data.Text.Internal.Measure Data.Text.Internal.Reverse Data.Text.Internal.Transformation + Data.Text.Internal.IsAscii build-depends: array >= 0.3 && < 0.6, From 85a3a02a1410faf6edcc2d3abb7c7a49774e959a Mon Sep 17 00:00:00 2001 From: Bryan Richter Date: Fri, 10 Nov 2023 22:14:09 +0200 Subject: [PATCH 2/5] Split out all use of is_ascii --- src/Data/Text/Encoding.hs | 33 ++++++++++++++++++++----------- src/Data/Text/Internal/IsAscii.hs | 33 +++++++++++++++++++++++++++---- text.cabal | 3 +-- 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 5ad79f52..751618f6 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -85,21 +85,30 @@ module Data.Text.Encoding ) where import Control.Exception (evaluate, try) +import Data.Word (Word8) +import GHC.Exts (byteArrayContents#, unsafeCoerce#) +import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) +import Data.ByteString (ByteString) +#if defined(PURE_HASKELL) +import Control.Monad.ST.Unsafe (unsafeSTToIO) +import Data.ByteString.Char8 (unpack) +import Data.Text.Internal (pack) +import Foreign.Ptr (minusPtr, plusPtr) +import Foreign.Storable (poke) +#else import Control.Monad.ST (runST) import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Data.Bits (shiftR, (.&.)) -import Data.Word (Word8) +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Foreign.C.Types (CSize(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) -import GHC.Exts (byteArrayContents#, unsafeCoerce#) -import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr)) -import Data.ByteString (ByteString) +#endif import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), empty) -import Data.Text.Internal.ByteStringCompat (withBS) import Data.Text.Internal.Encoding -import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import Data.Text.Internal.IsAscii (asciiPrefixLength) import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Text.Show () import qualified Data.ByteString as B @@ -172,12 +181,6 @@ decodeASCIIPrefix bs = if B.null bs (prefix, suffix) {-# INLINE decodeASCIIPrefix #-} --- | Length of the longest ASCII prefix. -asciiPrefixLength :: ByteString -> Int -asciiPrefixLength bs = unsafeDupablePerformIO $ withBS bs $ \ fp len -> - unsafeWithForeignPtr fp $ \src -> do - fromIntegral <$> c_is_ascii src (src `plusPtr` len) - -- | Decode a 'ByteString' containing 7-bit ASCII encoded text. -- -- This is a total function which returns either the 'ByteString' converted to a @@ -219,6 +222,9 @@ decodeLatin1 :: HasCallStack => #endif ByteString -> Text +#if defined(PURE_HASKELL) +decodeLatin1 bs = pack (Data.ByteString.Char8.unpack bs) +#else decodeLatin1 bs = withBS bs $ \fp len -> runST $ do dst <- A.new (2 * len) let inner srcOff dstOff = if srcOff >= len then return dstOff else do @@ -238,9 +244,12 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do dst' <- A.resizeM dst actualLen arr <- A.unsafeFreeze dst' return $ Text arr 0 actualLen +#endif +#if !defined(PURE_HASKELL) foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize +#endif -- $stream -- diff --git a/src/Data/Text/Internal/IsAscii.hs b/src/Data/Text/Internal/IsAscii.hs index db38f476..a53be011 100644 --- a/src/Data/Text/Internal/IsAscii.hs +++ b/src/Data/Text/Internal/IsAscii.hs @@ -5,14 +5,26 @@ {-# OPTIONS_HADDOCK not-home #-} -- | Implements 'isAscii', using efficient C routines by default. +-- +-- Similarly implements asciiPrefixLength, used internally in Data.Text.Encoding. module Data.Text.Internal.IsAscii where -import Prelude (Bool(..), Int, (==)) -import qualified Prelude as P -import Data.Text.Internal (Text(..)) +#if defined(PURE_HASKELL) +import Prelude (Bool(..), Int) +#else +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import Data.Text.Unsafe (unsafeDupablePerformIO) +import Data.Word (Word8) import Foreign.C.Types -import qualified Data.Text.Array as A +import Foreign.Ptr (Ptr, plusPtr) import GHC.Base (ByteArray#) +import Prelude (Bool(..), Int, (==), ($), IO, (<$>)) +import qualified Data.Text.Array as A +#endif +import Data.ByteString (ByteString) +import Data.Text.Internal (Text(..)) +import qualified Prelude as P -- | \O(n)\ Test whether 'Text' contains only ASCII code-points (i.e. only -- U+0000 through U+007F). @@ -47,8 +59,21 @@ isAscii (Text (A.ByteArray arr) off len) = #endif {-# INLINE isAscii #-} +-- | Length of the longest ASCII prefix. +asciiPrefixLength :: ByteString -> Int +#if defined(PURE_HASKELL) +asciiPrefixLength = P.const 0 +#else +asciiPrefixLength bs = unsafeDupablePerformIO $ withBS bs $ \ fp len -> + unsafeWithForeignPtr fp $ \src -> do + P.fromIntegral <$> c_is_ascii src (src `plusPtr` len) +#endif +{-# INLINE asciiPrefixLength #-} #if !defined(PURE_HASKELL) foreign import ccall unsafe "_hs_text_is_ascii_offset" c_is_ascii_offset :: ByteArray# -> CSize -> CSize -> CSize + +foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii + :: Ptr Word8 -> Ptr Word8 -> IO CSize #endif diff --git a/text.cabal b/text.cabal index 1d55559f..47479503 100644 --- a/text.cabal +++ b/text.cabal @@ -96,8 +96,7 @@ flag pure-haskell library if arch(javascript) || flag(pure-haskell) cpp-options: -DPURE_HASKELL - c-sources: cbits/is_ascii.c - cbits/utils.c + c-sources: cbits/utils.c else c-sources: cbits/is_ascii.c cbits/measure_off.c From 97ae28ac28a87ba778e678ce494bd58d833d3052 Mon Sep 17 00:00:00 2001 From: Bryan Richter Date: Fri, 26 Jan 2024 15:08:07 +0200 Subject: [PATCH 3/5] Implement pure isAscii --- src/Data/Text/Internal/IsAscii.hs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/Data/Text/Internal/IsAscii.hs b/src/Data/Text/Internal/IsAscii.hs index a53be011..fa48020b 100644 --- a/src/Data/Text/Internal/IsAscii.hs +++ b/src/Data/Text/Internal/IsAscii.hs @@ -1,6 +1,9 @@ {-# LANGUAGE MagicHash #-} {-# LANGUAGE CPP #-} {-# LANGUAGE UnliftedFFITypes #-} +#if defined(PURE_HASKELL) +{-# LANGUAGE BangPatterns #-} +#endif {-# OPTIONS_HADDOCK not-home #-} @@ -10,7 +13,10 @@ module Data.Text.Internal.IsAscii where #if defined(PURE_HASKELL) -import Prelude (Bool(..), Int) +import Prelude hiding (all) +import qualified Data.Char as Char +import qualified Data.ByteString as BS +import Data.Text.Unsafe (iter, Iter(..)) #else import Data.Text.Internal.ByteStringCompat (withBS) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) @@ -45,7 +51,16 @@ import qualified Prelude as P -- @since 2.0.2 isAscii :: Text -> Bool #if defined(PURE_HASKELL) -isAscii = P.const False +isAscii = all Char.isAscii + +-- | (Re)implemented to avoid circular dependency on Data.Text. +all :: (Char -> Bool) -> Text -> Bool +all p t@(Text _ _ len) = go 0 + where + go i | i >= len = True + | otherwise = + let !(Iter c j) = iter t i + in p c && go (i+j) #else cSizeToInt :: CSize -> Int cSizeToInt = P.fromIntegral From b070bac902ff63ef0349ec046977074368f25d74 Mon Sep 17 00:00:00 2001 From: Bryan Richter Date: Fri, 26 Jan 2024 15:45:40 +0200 Subject: [PATCH 4/5] Implement pure-Haskell asciiPrefixLength --- src/Data/Text/Internal/IsAscii.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Data/Text/Internal/IsAscii.hs b/src/Data/Text/Internal/IsAscii.hs index fa48020b..bc829555 100644 --- a/src/Data/Text/Internal/IsAscii.hs +++ b/src/Data/Text/Internal/IsAscii.hs @@ -77,7 +77,7 @@ isAscii (Text (A.ByteArray arr) off len) = -- | Length of the longest ASCII prefix. asciiPrefixLength :: ByteString -> Int #if defined(PURE_HASKELL) -asciiPrefixLength = P.const 0 +asciiPrefixLength = BS.length P.. BS.takeWhile (P.< 0x80) #else asciiPrefixLength bs = unsafeDupablePerformIO $ withBS bs $ \ fp len -> unsafeWithForeignPtr fp $ \src -> do From c04d31af5f7a23406f7924c2df0f2f9086d4a273 Mon Sep 17 00:00:00 2001 From: Bryan Richter Date: Fri, 26 Jan 2024 21:32:47 +0200 Subject: [PATCH 5/5] Enable parallel tests --- text.cabal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text.cabal b/text.cabal index 47479503..628f779f 100644 --- a/text.cabal +++ b/text.cabal @@ -259,7 +259,7 @@ source-repository head test-suite tests type: exitcode-stdio-1.0 ghc-options: - -Wall -threaded -rtsopts + -Wall -threaded -rtsopts -with-rtsopts=-N hs-source-dirs: tests main-is: Tests.hs