Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvements, up to 20x faster #183

Merged
merged 7 commits into from
Feb 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 81 additions & 38 deletions System/FilePath/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -112,24 +112,24 @@ module System.OsPath.MODULE_NAME.Internal

{- HLINT ignore "Use fewer imports" -}
import Prelude (Char, Bool(..), Maybe(..), (.), (&&), (<=), not, fst, maybe, (||), (==), ($), otherwise, fmap, mempty, (>=), (/=), (++), snd)
import Data.Bifunctor (first)
import Data.Semigroup ((<>))
import qualified Prelude as P
import Data.Maybe(isJust)
import Data.Maybe(fromMaybe, isJust)
import qualified Data.List as L

#ifndef OS_PATH
import Data.String (fromString)
import System.Environment(getEnv)
import Prelude (String, map, FilePath, Eq, IO, id, last, init, reverse, dropWhile, null, break, takeWhile, take, all, elem, any, span)
import Prelude (String, map, FilePath, Eq, IO, id, last, init, reverse, dropWhile, null, break, take, all, elem, any, span)
import Data.Char(toLower, toUpper, isAsciiLower, isAsciiUpper)
import Data.List(stripPrefix, isSuffixOf, uncons)
import Data.List(stripPrefix, isSuffixOf, uncons, dropWhileEnd)
#define CHAR Char
#define STRING String
#define FILEPATH FilePath
#else
import Prelude (fromIntegral)
import Control.Exception ( SomeException, evaluate, try, displayException )
import Data.Bifunctor (first)
import Control.DeepSeq (force)
import GHC.IO (unsafePerformIO)
import qualified Data.Char as C
Expand Down Expand Up @@ -290,13 +290,24 @@ getSearchPath = fmap splitSearchPath (getEnv "PATH")
-- > splitExtension "file.txt/boris.ext" == ("file.txt/boris",".ext")
-- > splitExtension "file/path.txt.bob.fred" == ("file/path.txt.bob",".fred")
-- > splitExtension "file/path.txt/" == ("file/path.txt/","")

-- A naive implementation would be to use @splitFileName_@ first,
-- then break filename into basename and extension, then recombine dir and basename.
-- This is way too expensive, see @splitFileName_@ comment for discussion.
--
-- Instead we speculatively split on the extension separator first, then check
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice trick

-- whether results are well-formed.
splitExtension :: FILEPATH -> (STRING, STRING)
splitExtension x = if null nameDot
then (x, mempty)
else (dir <> init nameDot, singleton extSeparator <> ext)
where
(dir,file) = splitFileName_ x
(nameDot,ext) = breakEnd isExtSeparator file
splitExtension x
-- Imagine x = "no-dots", then nameDot = ""
| null nameDot = (x, mempty)
-- Imagine x = "\\shared.with.dots\no-dots"
| isWindows && null (dropDrive nameDot) = (x, mempty)
hasufell marked this conversation as resolved.
Show resolved Hide resolved
-- Imagine x = "dir.with.dots/no-dots"
| any isPathSeparator ext = (x, mempty)
| otherwise = (init nameDot, extSeparator `cons` ext)
where
(nameDot, ext) = breakEnd isExtSeparator x

-- | Get the extension of a file, returns @\"\"@ for no extension, @.ext@ otherwise.
--
Expand Down Expand Up @@ -358,7 +369,7 @@ addExtension file xs = case uncons xs of
Just (x, _) -> joinDrive a res
where
res = if isExtSeparator x then b <> xs
else b <> singleton extSeparator <> xs
else b <> (extSeparator `cons` xs)

(a,b) = splitDrive file

Expand All @@ -383,7 +394,7 @@ isExtensionOf :: STRING -> FILEPATH -> Bool
isExtensionOf ext = \fp -> case uncons ext of
Just (x, _)
| x == _period -> isSuffixOf ext . takeExtensions $ fp
_ -> isSuffixOf (singleton _period <> ext) . takeExtensions $ fp
_ -> isSuffixOf (_period `cons` ext) . takeExtensions $ fp

-- | Drop the given extension from a FILEPATH, and the @\".\"@ preceding it.
-- Returns 'Nothing' if the FILEPATH does not have the given extension, or
Expand All @@ -403,7 +414,7 @@ isExtensionOf ext = \fp -> case uncons ext of
-- > stripExtension "" x == Just x
stripExtension :: STRING -> FILEPATH -> Maybe FILEPATH
stripExtension ext path = case uncons ext of
Just (x, _) -> let dotExt = if isExtSeparator x then ext else singleton _period <> ext
Just (x, _) -> let dotExt = if isExtSeparator x then ext else _period `cons` ext
in stripSuffix dotExt path
Nothing -> Just path

Expand Down Expand Up @@ -506,19 +517,21 @@ readDriveUNC bs = case unpack bs of

{- c:\ -}
readDriveLetter :: STRING -> Maybe (FILEPATH, FILEPATH)
readDriveLetter bs = case unpack bs of
(x:c:y:xs)
| c == _colon && isLetter x && isPathSeparator y -> Just $ addSlash (pack [x,_colon]) (pack (y:xs))
(x:c:xs)
| c == _colon && isLetter x -> Just (pack [x,_colon], pack xs)
_ -> Nothing
readDriveLetter bs = case uncons2 bs of
Nothing -> Nothing
Just (x, c, ys)
| isLetter x, c == _colon -> Just $ case uncons ys of
Just (y, _)
| isPathSeparator y -> addSlash (pack [x,_colon]) ys
_ -> (pack [x,_colon], ys)
| otherwise -> Nothing

{- \\sharename\ -}
readDriveShare :: STRING -> Maybe (FILEPATH, FILEPATH)
readDriveShare bs = case unpack bs of
(s1:s2:xs) | isPathSeparator s1 && isPathSeparator s2 ->
let (a, b) = readDriveShareName (pack xs)
in Just (singleton s1 <> singleton s2 <> a,b)
in Just (s1 `cons` (s2 `cons` a), b)
_ -> Nothing

{- assume you have already seen \\ -}
Expand Down Expand Up @@ -594,19 +607,53 @@ splitFileName x = if null path
else (path, file)
where
(path, file) = splitFileName_ x
dotSlash = singleton _period <> singleton _slash
dotSlash = _period `cons` singleton _slash

-- version of splitFileName where, if the FILEPATH has no directory
-- component, the returned directory is "" rather than "./". This
-- is used in cases where we are going to combine the returned
-- directory to make a valid FILEPATH, and having a "./" appear would
-- look strange and upset simple equality properties. See
-- e.g. replaceFileName.
--
-- A naive implementation is
--
-- splitFileName_ fp = (drv <> dir, file)
-- where
-- (drv, pth) = splitDrive fp
-- (dir, file) = breakEnd isPathSeparator pth
--
-- but it is undesirable for two reasons:
-- * splitDrive is very slow on Windows,
-- * we unconditionally allocate 5 FilePath objects where only 2 would normally suffice.
--
-- In the implementation below we first speculatively split the input by the last path
-- separator. In the vast majority of cases this is already the answer, except
-- two exceptional cases explained below.
--
splitFileName_ :: FILEPATH -> (STRING, STRING)
splitFileName_ fp = (drv <> dir, file)
splitFileName_ fp
-- If dirSlash is empty, @fp@ is either a genuine filename without any dir,
-- or just a Windows drive name without slash like "c:".
-- Run readDriveLetter to figure out.
| isWindows
, null dirSlash
= fromMaybe (mempty, fp) (readDriveLetter fp)
-- Another Windows quirk is that @fp@ could have been a shared drive "\\share"
-- or UNC location "\\?\UNC\foo", where path separator is a part of the drive name.
-- We can test this by trying dropDrive and falling back to splitDrive.
| isWindows
, Just (s1, _s2, bs') <- uncons2 dirSlash
, isPathSeparator s1
-- If bs' is empty, then s2 as the last character of dirSlash must be a path separator,
-- so we are in the middle of shared drive.
-- Otherwise, since s1 is a path separator, we might be in the middle of UNC path.
, null bs' || maybe False (null . snd) (readDriveUNC dirSlash)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to be honest... this code will be harder to maintain in the face of fixing bugs like this:

ghci> W.splitFileName "\\\\.\\C:"
("\\\\.\\","C:")

These will be eventually fixed when I get around finishing #99

So I can't guarantee there won't be performance regressions in the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough, I'll be happy to take another look at performance after #99.

FWIW this branch deals with Windows file names starting with \\, so it's a relatively uncommon code path, should not be a bottleneck anyway.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with merging. If I ever manage to finish #99, then it will be a major bump and almost none of the existing code will remain.

The idea is to parse into an ADT and then deal with that only. My guess is that will increase allocations, so we'll see performance regression.

= (fp, mempty)
| otherwise
= (dirSlash, file)
where
(drv, pth) = splitDrive fp
(dir, file) = breakEnd isPathSeparator pth
(dirSlash, file) = breakEnd isPathSeparator fp

-- | Set the filename.
--
Expand Down Expand Up @@ -736,7 +783,7 @@ combineAlways a b | null a = b
[a1, a2] | isWindows
, isLetter a1
, a2 == _colon -> a <> b
_ -> a <> singleton pathSeparator <> b
_ -> a <> (pathSeparator `cons` b)


-- | Combine two paths with a path separator.
Expand Down Expand Up @@ -1068,7 +1115,7 @@ makeValid path
| isPosix = map (\x -> if x == _nul then _underscore else x) path
| isJust (readDriveShare drv) && all isPathSeparator drv = take 2 drv <> fromString "drive"
| isJust (readDriveUNC drv) && not (hasTrailingPathSeparator drv) =
makeValid (drv <> singleton pathSeparator <> pth)
makeValid (drv <> (pathSeparator `cons` pth))
| otherwise = joinDrive drv $ validElements $ validCHARs pth

where
Expand Down Expand Up @@ -1129,18 +1176,9 @@ isAbsolute = not . isRelative
#ifndef OS_PATH

-----------------------------------------------------------------------------
-- dropWhileEnd (>2) [1,2,3,4,1,2,3,4] == [1,2,3,4,1,2])
-- Note that Data.List.dropWhileEnd is only available in base >= 4.5.
dropWhileEnd :: (a -> Bool) -> [a] -> [a]
dropWhileEnd p = reverse . dropWhile p . reverse

-- takeWhileEnd (>2) [1,2,3,4,1,2,3,4] == [3,4])
takeWhileEnd :: (a -> Bool) -> [a] -> [a]
takeWhileEnd p = reverse . takeWhile p . reverse

-- spanEnd (>2) [1,2,3,4,1,2,3,4] = ([1,2,3,4,1,2], [3,4])
spanEnd :: (a -> Bool) -> [a] -> ([a], [a])
spanEnd p xs = (dropWhileEnd p xs, takeWhileEnd p xs)
spanEnd p = L.foldr (\x (pref, suff) -> if null pref && p x then (pref, x : suff) else (x : pref, suff)) ([], [])

-- breakEnd (< 2) [1,2,3,4,1,2,3,4] == ([1,2,3,4,1],[2,3,4])
breakEnd :: (a -> Bool) -> [a] -> ([a], [a])
Expand All @@ -1152,11 +1190,16 @@ breakEnd p = spanEnd (not . p)
stripSuffix :: Eq a => [a] -> [a] -> Maybe [a]
stripSuffix xs ys = reverse P.<$> stripPrefix (reverse xs) (reverse ys)

cons :: a -> [a] -> [a]
cons = (:)

unsnoc :: [a] -> Maybe ([a], a)
unsnoc [] = Nothing
unsnoc xs = Just (init xs, last xs)
unsnoc = L.foldr (\x -> Just . maybe ([], x) (first (x :))) Nothing

uncons2 :: [a] -> Maybe (a, a, [a])
uncons2 [] = Nothing
uncons2 [_] = Nothing
uncons2 (x : y : zs) = Just (x, y, zs)

_period, _quotedbl, _backslash, _slash, _question, _U, _N, _C, _colon, _semicolon, _US, _less, _greater, _bar, _asterisk, _nul, _space, _underscore :: Char
_period = '.'
Expand Down
16 changes: 16 additions & 0 deletions System/OsPath/Data/ByteString/Short.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE NoImplicitPrelude #-}
-- |
-- Module : System.OsPath.Data.ByteString.Short
Expand Down Expand Up @@ -81,6 +82,7 @@ module System.OsPath.Data.ByteString.Short (
last,
tail,
uncons,
uncons2,
head,
init,
unsnoc,
Expand Down Expand Up @@ -173,3 +175,17 @@ module System.OsPath.Data.ByteString.Short (
) where

import Data.ByteString.Short.Internal
import System.OsPath.Data.ByteString.Short.Internal

import Prelude (Maybe(..), Ord(..), Num(..), ($), otherwise)
import Data.Word (Word8)

uncons2 :: ShortByteString -> Maybe (Word8, Word8, ShortByteString)
hasufell marked this conversation as resolved.
Show resolved Hide resolved
uncons2 = \sbs ->
let l = length sbs
nl = l - 2
in if | l <= 1 -> Nothing
| otherwise -> let h = indexWord8Array (asBA sbs) 0
h' = indexWord8Array (asBA sbs) 1
t = create nl $ \mba -> copyByteArray (asBA sbs) 1 mba 0 nl
in Just (h, h', t)
13 changes: 10 additions & 3 deletions System/OsPath/Data/ByteString/Short/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ module System.OsPath.Data.ByteString.Short.Internal where

import Control.Monad.ST
import Control.Exception (assert, throwIO)
import Data.Bits (Bits(..))
import Data.ByteString.Short.Internal (ShortByteString(..), length)
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup
Expand Down Expand Up @@ -284,15 +285,21 @@ writeWord16Array (MBA# mba#) (I# i#) (W16# w#) =
ST (\s -> case writeWord8Array# mba# (i# +# 1#) msb# s of
s' -> (# s', () #))

indexWord8Array :: BA
-> Int -- ^ Word8 index
-> Word8
indexWord8Array (BA# ba#) (I# i#) = W8# (indexWord8Array# ba# i#)

-- | This isn't strictly Word16 array read. Instead it's two Word8 array reads
-- to avoid endianness issues due to primops doing automatic alignment based
-- on host platform. We expect the byte array to be LE always.
indexWord16Array :: BA
-> Int -- ^ Word8 index (not Word16)
-> Word16
indexWord16Array (BA# ba#) (I# i#) =
case (# indexWord8Array# ba# i#, indexWord8Array# ba# (i# +# 1#) #) of
(# lsb#, msb# #) -> W16# (decodeWord16LE# (# lsb#, msb# #))
indexWord16Array ba i = fromIntegral lsb .|. (fromIntegral msb `shiftL` 8)
where
lsb = indexWord8Array ba i
msb = indexWord8Array ba (i + 1)

#if !MIN_VERSION_base(4,16,0)

Expand Down
13 changes: 13 additions & 0 deletions System/OsPath/Data/ByteString/Short/Word16.hs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ module System.OsPath.Data.ByteString.Short.Word16 (
last,
tail,
uncons,
uncons2,
head,
init,
unsnoc,
Expand Down Expand Up @@ -260,6 +261,18 @@ uncons = \(assertEven -> sbs) ->
t = create nl $ \mba -> copyByteArray (asBA sbs) 2 mba 0 nl
in Just (h, t)

-- | /O(n)/ Extract first two elements and the rest of a ByteString,
-- returning Nothing if it is shorter than two elements.
uncons2 :: ShortByteString -> Maybe (Word16, Word16, ShortByteString)
uncons2 = \(assertEven -> sbs) ->
let l = BS.length sbs
nl = l - 4
in if | l <= 2 -> Nothing
| otherwise -> let h = indexWord16Array (asBA sbs) 0
h' = indexWord16Array (asBA sbs) 2
t = create nl $ \mba -> copyByteArray (asBA sbs) 4 mba 0 nl
in Just (h, h', t)

-- | /O(1)/ Extract the first element of a ShortByteString, which must be at least one Word16.
-- An exception will be thrown in the case of an empty ShortByteString.
head :: HasCallStack => ShortByteString -> Word16
Expand Down