From b2614d6ab6366a7c174f45c2fd5753a751f2646b Mon Sep 17 00:00:00 2001 From: Juri <1773075+2mol@users.noreply.github.com> Date: Sat, 11 Jun 2022 17:19:54 +0200 Subject: [PATCH] vendorize pdfinfo (#55) Co-authored-by: 2mol <2mol@users.noreply.github.com> --- pboy.cabal | 5 +- src/Lib.hs | 6 +- src/PDFInfo.hs | 180 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 187 insertions(+), 4 deletions(-) create mode 100644 src/PDFInfo.hs diff --git a/pboy.cabal b/pboy.cabal index d7c3553..4c7c315 100644 --- a/pboy.cabal +++ b/pboy.cabal @@ -27,6 +27,7 @@ library other-modules: Config , Lib + , PDFInfo , Paths_pboy hs-source-dirs: src @@ -39,10 +40,12 @@ library , either , microlens , microlens-th - , pdfinfo + , mtl , process + , process-extras , text , time + , time-locale-compat , titlecase , vector , vty diff --git a/src/Lib.hs b/src/Lib.hs index caa4dc7..be32c73 100644 --- a/src/Lib.hs +++ b/src/Lib.hs @@ -29,7 +29,7 @@ import qualified System.Directory as Dir import System.FilePath ((</>)) import qualified System.FilePath as FilePath import qualified System.Process as P -import qualified Text.PDF.Info as PDFI +import qualified PDFInfo data FileInfo = FileInfo @@ -70,7 +70,7 @@ isPdf fileInfo = fileNameSuggestions :: FilePath -> IO (Text, [Text]) fileNameSuggestions file = do - pdfInfo <- PDFI.pdfInfo file + pdfInfo <- PDFInfo.pdfInfo file topLines <- getTopLines file @@ -87,7 +87,7 @@ fileNameSuggestions file = do maybeTitle = Either.rightToMaybe pdfInfo - >>= PDFI.pdfInfoTitle + >>= PDFInfo.pdfInfoTitle & fmap sanitize >>= boolToMaybe lengthCheck diff --git a/src/PDFInfo.hs b/src/PDFInfo.hs new file mode 100644 index 0000000..d5f4070 --- /dev/null +++ b/src/PDFInfo.hs @@ -0,0 +1,180 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE OverloadedStrings #-} +{-# OPTIONS -Wall #-} + +module PDFInfo + (-- * Reading PDF info + pdfInfo + , PDFInfo(..) + , PDFSize(..) + , PDFEncryptionInfo(..) + , PDFInfoError(..) + -- * Internals + , ParsePDFInfo + , runParse + , parse + , parseSize + , parseDate + , parseEncrypted + , readRight) + where + +import Control.Applicative +import Control.Arrow +import Control.Exception as E +import Control.Monad.Except + +import Data.Char (isSpace) +import Data.Text (Text) +import qualified Data.Text as T +import Data.Time (UTCTime, parseTimeM) +import Data.Time.Locale.Compat (defaultTimeLocale) +import Prelude +import System.Exit +import System.Process.Text + +-- | A type representing the output from the pdfinfo command. +data PDFInfo = PDFInfo { + pdfInfoTitle :: !(Maybe Text) -- ^ Title + , pdfInfoSubject :: !(Maybe Text) -- ^ Subject + , pdfInfoAuthor :: !(Maybe Text) -- ^ Author: E.g. Chris Done + , pdfInfoCreator :: !(Maybe Text) -- ^ Creator: E.g. Microsoft Office Word 2007 + , pdfInfoProducer :: !(Maybe Text) -- ^ Producer: E.g. Microsoft Office Word 2007 + , pdfInfoCreationDate :: !(Maybe UTCTime) -- ^ Creation Date + , pdfInfoModDate :: !(Maybe UTCTime) -- ^ Modification Date + , pdfInfoTagged :: !(Maybe Bool) -- ^ Tagged? + , pdfInfoPages :: !(Maybe Integer) -- ^ Pages: E.g. 238 + , pdfInfoEncrypted :: !(Maybe PDFEncryptionInfo) -- ^ Encryption information + , pdfInfoPageSize :: !(Maybe PDFSize) -- ^ Page: E.g. 595.32 x 841.92 pts (A4) + , pdfInfoFileSize :: !(Maybe Integer) -- ^ File: E.g. 4061737 bytes + , pdfInfoOptimized :: !(Maybe Bool) -- ^ Optimized? + , pdfInfoPDFVersion :: !(Maybe Double) -- ^ PDF: E.g. 1.5 + } deriving Show + +-- | Possible things that can go wrong while reading the info. +data PDFInfoError + = ParseError !String -- ^ Couldn't parse a property value. + | ProcessFailure !Text -- ^ Process exited with this stderr. + | ProcessError !IOException -- ^ Error to do with the pdfinfo process. + | NoMessage -- ^ No message given. + | SomeError String -- ^ Some nonspecific error. + deriving Show + +-- | Size of the PDF in pts. +data PDFSize = PDFSize { pdfSizeW :: !Float, pdfSizeH :: !Float } + deriving (Eq,Show) + +-- | Encryption and restricted permissions +data PDFEncryptionInfo + -- | Not encrypted + = PDFNoEncryption + -- | Encrypted with possible permission restrictions + | PDFEncryption { + pdfCanPrint :: !(Maybe Bool) -- ^ Can the file be printed? + , pdfCanCopy :: !(Maybe Bool) -- ^ Can the file be copied? + , pdfCanChange :: !(Maybe Bool) -- ^ Can the file be changed? + , pdfCanAddNotes :: !(Maybe Bool) -- ^ Can notes be added? + , pdfEncryptionAlgorithm :: !(Maybe Text) -- ^ Encryption algorithm: e.g. unknown, RC4, AES, AES-256 + } + deriving (Eq,Show) + +-- instance Error PDFInfoError where noMsg = NoMessage; strMsg = SomeError +newtype ParsePDFInfo a = ParsePDFInfo { runParse :: Either PDFInfoError a } + deriving (Monad,Functor,MonadError PDFInfoError) +instance Applicative ParsePDFInfo where (<*>) = ap; pure = pure + +-- | Run pdfinfo on the given file. Handles IO exceptions to do with +-- running the process. +pdfInfo :: MonadIO m => FilePath -> m (Either PDFInfoError PDFInfo) +pdfInfo path = liftIO $ loadInfo `E.catch` ioErrorHandler where + loadInfo = do (code,out,err) <- readProcessWithExitCode "pdfinfo" ["-enc","UTF-8",path] "" + case code of + ExitSuccess -> return (parse out) + ExitFailure{} -> return (Left (ProcessFailure err)) + ioErrorHandler = return . Left . ProcessError + +-- | Parse PDFInfo's output. +parse :: Text -> Either PDFInfoError PDFInfo +parse out = runParse $ + PDFInfo <$> string props "Title" + <*> string props "Subject" + <*> string props "Author" + <*> string props "Creator" + <*> string props "Producer" + <*> date "CreationDate" + <*> date "ModDate" + <*> bool props "Tagged" + <*> integer "Pages" + <*> encrypted "Encrypted" + <*> size "Page size" + <*> integer "File size" + <*> bool props "Optimized" + <*> floating "PDF version" + where date = get parseDate + size = get parseSize + encrypted = get parseEncrypted + floating = readIt + integer = readIt + readIt :: Read a => Text -> ParsePDFInfo (Maybe a) + readIt = get readRight + props = map split . T.lines $ out + get = withProps props + +type Props = [(Text,Text)] + +-- | Look up a name in a finite map of "properties" and apply a (parsing) +-- function. +withProps :: Props -> (Text -> ParsePDFInfo a) -> Text -> ParsePDFInfo (Maybe a) +withProps properties f name = + case lookup name properties of + Just ok -> catchError (Just <$> (f $ T.strip ok)) + (\_ -> return Nothing) + Nothing -> return Nothing + +split :: Text -> (Text, Text) +split = second (T.drop 1) . T.span (/=':') + +string :: Props -> Text -> ParsePDFInfo (Maybe Text) +string props = withProps props return + +bool :: Props -> Text -> ParsePDFInfo (Maybe Bool) +bool props = withProps props $ \yes -> return $ yes == "yes" + +-- | Parse a page size. This is loosely defined. +parseSize :: Text -> ParsePDFInfo PDFSize +parseSize s = + case T.words s of + ((readRight -> Right x):"x":(readRight -> Right y):_) -> + return $ PDFSize x y + _ -> throwError $ ParseError $ "Unable to read size: " ++ show s + +-- | Parse a date according to pdfinfo's format. +parseDate :: Text -> ParsePDFInfo UTCTime +parseDate s = + case parseTimeM True defaultTimeLocale "%a %b %e %H:%M:%S %Y" (T.unpack s) of + Just ok -> return ok + Nothing -> throwError $ ParseError $ "Unable to parse date: " ++ show s + +-- | Parse encryption information according to pdfinfo's format. +parseEncrypted :: Text -> ParsePDFInfo PDFEncryptionInfo +parseEncrypted s = + case T.break isSpace s of + ("yes",rest) -> + PDFEncryption <$> bool props "print" + <*> bool props "copy" + <*> bool props "change" + <*> bool props "addNotes" + <*> string props "algorithm" + where + props = map split $ T.words $ T.filter (flip notElem ['(',')']) rest + ("no",_) -> return PDFNoEncryption + _ -> throwError $ ParseError $ "Unable to parse encryption: " ++ show s + +-- | Read a value, maybe, allow misc trailing data. +readRight :: (MonadError PDFInfoError m,Read a) => Text -> m a +readRight s = + case reads (T.unpack s) of + [(v,_)] -> return v + _ -> throwError $ ParseError $ "Couldn't read value: " ++ show s