Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pure-Haskell replacement for is_ascii.c #545

Merged
merged 5 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 1 addition & 29 deletions src/Data/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ import Data.Binary (Binary(get, put))
import Data.Monoid (Monoid(..))
import Data.Semigroup (Semigroup(..))
import Data.String (IsString(..))
import Data.Text.Internal.IsAscii (isAscii)
import Data.Text.Internal.Reverse (reverse)
import Data.Text.Internal.Measure (measure_off)
import Data.Text.Internal.Encoding.Utf8 (utf8Length, utf8LengthByLeader, chr3, ord2, ord3, ord4)
Expand Down Expand Up @@ -1075,35 +1076,6 @@ minimum :: HasCallStack => Text -> Char
minimum t = S.minimum (stream t)
{-# INLINE minimum #-}

-- | \O(n)\ Test whether 'Text' contains only ASCII code-points (i.e. only
-- U+0000 through U+007F).
--
-- This is a more efficient version of @'all' 'Data.Char.isAscii'@.
--
-- >>> isAscii ""
-- True
--
-- >>> isAscii "abc\NUL"
-- True
--
-- >>> isAscii "abcd€"
-- False
--
-- prop> isAscii t == all (< '\x80') t
--
-- @since 2.0.2
isAscii :: Text -> Bool
isAscii (Text (A.ByteArray arr) off len) =
cSizeToInt (c_is_ascii_offset arr (intToCSize off) (intToCSize len)) == len
{-# INLINE isAscii #-}

cSizeToInt :: CSize -> Int
cSizeToInt = P.fromIntegral
{-# INLINE cSizeToInt #-}

foreign import ccall unsafe "_hs_text_is_ascii_offset" c_is_ascii_offset
:: ByteArray# -> CSize -> CSize -> CSize

-- -----------------------------------------------------------------------------
-- * Building 'Text's
-- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
Expand Down
33 changes: 21 additions & 12 deletions src/Data/Text/Encoding.hs
Original file line number Diff line number Diff line change
Expand Up @@ -85,21 +85,30 @@ module Data.Text.Encoding
) where

import Control.Exception (evaluate, try)
import Data.Word (Word8)
import GHC.Exts (byteArrayContents#, unsafeCoerce#)
import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
import Data.ByteString (ByteString)
#if defined(PURE_HASKELL)
import Control.Monad.ST.Unsafe (unsafeSTToIO)
import Data.ByteString.Char8 (unpack)
import Data.Text.Internal (pack)
import Foreign.Ptr (minusPtr, plusPtr)
import Foreign.Storable (poke)
#else
import Control.Monad.ST (runST)
import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
import Data.Bits (shiftR, (.&.))
import Data.Word (Word8)
import Data.Text.Internal.ByteStringCompat (withBS)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Foreign.C.Types (CSize(..))
import Foreign.Ptr (Ptr, minusPtr, plusPtr)
import Foreign.Storable (poke, peekByteOff)
import GHC.Exts (byteArrayContents#, unsafeCoerce#)
import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
import Data.ByteString (ByteString)
#endif
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
import Data.Text.Internal (Text(..), empty)
import Data.Text.Internal.ByteStringCompat (withBS)
import Data.Text.Internal.Encoding
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Internal.IsAscii (asciiPrefixLength)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Text.Show ()
import qualified Data.ByteString as B
Expand Down Expand Up @@ -172,12 +181,6 @@ decodeASCIIPrefix bs = if B.null bs
(prefix, suffix)
{-# INLINE decodeASCIIPrefix #-}

-- | Length of the longest ASCII prefix.
asciiPrefixLength :: ByteString -> Int
asciiPrefixLength bs = unsafeDupablePerformIO $ withBS bs $ \ fp len ->
unsafeWithForeignPtr fp $ \src -> do
fromIntegral <$> c_is_ascii src (src `plusPtr` len)

-- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
--
-- This is a total function which returns either the 'ByteString' converted to a
Expand Down Expand Up @@ -219,6 +222,9 @@ decodeLatin1 ::
HasCallStack =>
#endif
ByteString -> Text
#if defined(PURE_HASKELL)
decodeLatin1 bs = pack (Data.ByteString.Char8.unpack bs)
chreekat marked this conversation as resolved.
Show resolved Hide resolved
#else
decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
dst <- A.new (2 * len)
let inner srcOff dstOff = if srcOff >= len then return dstOff else do
Expand All @@ -238,9 +244,12 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
dst' <- A.resizeM dst actualLen
arr <- A.unsafeFreeze dst'
return $ Text arr 0 actualLen
#endif

#if !defined(PURE_HASKELL)
foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
:: Ptr Word8 -> Ptr Word8 -> IO CSize
#endif

-- $stream
--
Expand Down
94 changes: 94 additions & 0 deletions src/Data/Text/Internal/IsAscii.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE UnliftedFFITypes #-}
#if defined(PURE_HASKELL)
{-# LANGUAGE BangPatterns #-}
#endif

{-# OPTIONS_HADDOCK not-home #-}

-- | Implements 'isAscii', using efficient C routines by default.
--
-- Similarly implements asciiPrefixLength, used internally in Data.Text.Encoding.
module Data.Text.Internal.IsAscii where

#if defined(PURE_HASKELL)
import Prelude hiding (all)
import qualified Data.Char as Char
import qualified Data.ByteString as BS
import Data.Text.Unsafe (iter, Iter(..))
#else
import Data.Text.Internal.ByteStringCompat (withBS)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Word (Word8)
import Foreign.C.Types
import Foreign.Ptr (Ptr, plusPtr)
import GHC.Base (ByteArray#)
import Prelude (Bool(..), Int, (==), ($), IO, (<$>))
import qualified Data.Text.Array as A
#endif
import Data.ByteString (ByteString)
import Data.Text.Internal (Text(..))
import qualified Prelude as P

-- | \O(n)\ Test whether 'Text' contains only ASCII code-points (i.e. only
-- U+0000 through U+007F).
--
-- This is a more efficient version of @'all' 'Data.Char.isAscii'@.
--
-- >>> isAscii ""
-- True
--
-- >>> isAscii "abc\NUL"
-- True
--
-- >>> isAscii "abcd€"
-- False
--
-- prop> isAscii t == all (< '\x80') t
--
-- @since 2.0.2
isAscii :: Text -> Bool
#if defined(PURE_HASKELL)
isAscii = all Char.isAscii
Lysxia marked this conversation as resolved.
Show resolved Hide resolved

-- | (Re)implemented to avoid circular dependency on Data.Text.
all :: (Char -> Bool) -> Text -> Bool
all p t@(Text _ _ len) = go 0
where
go i | i >= len = True
| otherwise =
let !(Iter c j) = iter t i
in p c && go (i+j)
#else
cSizeToInt :: CSize -> Int
cSizeToInt = P.fromIntegral
{-# INLINE cSizeToInt #-}

intToCSize :: Int -> CSize
intToCSize = P.fromIntegral

isAscii (Text (A.ByteArray arr) off len) =
cSizeToInt (c_is_ascii_offset arr (intToCSize off) (intToCSize len)) == len
#endif
{-# INLINE isAscii #-}

-- | Length of the longest ASCII prefix.
asciiPrefixLength :: ByteString -> Int
#if defined(PURE_HASKELL)
asciiPrefixLength = BS.length P.. BS.takeWhile (P.< 0x80)
#else
asciiPrefixLength bs = unsafeDupablePerformIO $ withBS bs $ \ fp len ->
unsafeWithForeignPtr fp $ \src -> do
P.fromIntegral <$> c_is_ascii src (src `plusPtr` len)
#endif
{-# INLINE asciiPrefixLength #-}

#if !defined(PURE_HASKELL)
foreign import ccall unsafe "_hs_text_is_ascii_offset" c_is_ascii_offset
:: ByteArray# -> CSize -> CSize -> CSize

foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
:: Ptr Word8 -> Ptr Word8 -> IO CSize
#endif
6 changes: 3 additions & 3 deletions text.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,7 @@ flag pure-haskell
library
if arch(javascript) || flag(pure-haskell)
cpp-options: -DPURE_HASKELL
c-sources: cbits/is_ascii.c
cbits/utils.c
c-sources: cbits/utils.c
else
c-sources: cbits/is_ascii.c
cbits/measure_off.c
Expand Down Expand Up @@ -209,6 +208,7 @@ library
Data.Text.Internal.Measure
Data.Text.Internal.Reverse
Data.Text.Internal.Transformation
Data.Text.Internal.IsAscii

build-depends:
array >= 0.3 && < 0.6,
Expand Down Expand Up @@ -259,7 +259,7 @@ source-repository head
test-suite tests
type: exitcode-stdio-1.0
ghc-options:
-Wall -threaded -rtsopts
-Wall -threaded -rtsopts -with-rtsopts=-N
Lysxia marked this conversation as resolved.
Show resolved Hide resolved

hs-source-dirs: tests
main-is: Tests.hs
Expand Down
Loading