From 6cdc320681456cfc1ce901c8c0b4ef03904db5b5 Mon Sep 17 00:00:00 2001 From: FFengIll Date: Mon, 10 Apr 2023 14:45:26 +0800 Subject: [PATCH] feat/pypdf: bump pypdf. (#16) * dep: upgrade PyPDF2 to pypdf==3.7.1 * refactor: update pypdf api usage. * doc: update. * chore: update sample output with pypdf. --- README.md | 3 ++- cases/output/input.pdf | Bin 8991 -> 9080 bytes cases/output/output.pdf | Bin 8991 -> 9080 bytes pdf_page_split/Readme.md | 2 +- pdf_page_split/splitor.py | 15 ++++++--------- pdf_white_cut/cutter.py | 4 ++-- requirements.txt | 2 +- tests/{test_PyPDF2.py => test_pypdf.py} | 14 +++++++------- tests/test_white_cut.py | 11 +++++------ 9 files changed, 24 insertions(+), 27 deletions(-) rename tests/{test_PyPDF2.py => test_pypdf.py} (81%) diff --git a/README.md b/README.md index a4e52ae..3bb4933 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,8 @@ Known limitation: ## Dependency -- PyPDF2: edit the pdf box +- pypdf: a pure python module for pdf (upgrade of PyPDF2) +- ~~PyPDF2: edit the pdf box~~ - ~~pdfminer & pdfminer3: scan elements~~ - pdfminer.six: scan pdf elements - ~~PySide2: optional for GUI only~~ diff --git a/cases/output/input.pdf b/cases/output/input.pdf index 5714ab905d9b17cf5fe457c488893951f91b360e..cd7257a776ffabe0cc20976bf89e84bfcb81f9b8 100644 GIT binary patch delta 541 zcmZvXze)o^5XQNHa4ceFArj0QTm->&XaCJ^YxFElF98L$Opag?tN04XK7fy4p-+&` zO2jwNcMuB;bBTB-d*5`w`DW&u`|K-QWBsFSA2<^B{_*lwqYFTp`iq3E&Q2aql2gU4ZU5llFIt|=8j|kx|+3Ynn z=ztvb#-sdjG@)ly%*%MgB^rZG_VApvx0)K|p8m!7YIyV5Zas9SVCW(R9a8d}N1%=v zfQp%w5wqGx%qbT!kAA_XqB;yxL`Oiw{MS-7pLWaP8Z%bL%&P{5{{p7vYG)0E*X~ZD Q!e_TbP)Neq@o7eW0PyK{*#H0l delta 454 zcmez2Hs6gyHNeG9*HF)RBA*tc@kC2?;ebk@h><3jogG(dUP^va7MJnF#X6cM3T9@e zdX^Rz3YHdnW+p}^3g#B3dM1`;3b9=J0f|Mac_j*l3I+;6lf@Xfa4A@r80uMASelzp zzQ-tq%;jN{M&?>G$w(j+1Io@0yBVwnFWTJp#i3t zDK;@va}2%aCYW|uSQ>!U)gxJKXlQ7HuG!Go2O7N%xKleLsp7)>VoE6a1485(n`s=E5SaRC67vR`@t diff --git a/cases/output/output.pdf b/cases/output/output.pdf index 5714ab905d9b17cf5fe457c488893951f91b360e..cd7257a776ffabe0cc20976bf89e84bfcb81f9b8 100644 GIT binary patch delta 541 zcmZvXze)o^5XQNHa4ceFArj0QTm->&XaCJ^YxFElF98L$Opag?tN04XK7fy4p-+&` zO2jwNcMuB;bBTB-d*5`w`DW&u`|K-QWBsFSA2<^B{_*lwqYFTp`iq3E&Q2aql2gU4ZU5llFIt|=8j|kx|+3Ynn z=ztvb#-sdjG@)ly%*%MgB^rZG_VApvx0)K|p8m!7YIyV5Zas9SVCW(R9a8d}N1%=v zfQp%w5wqGx%qbT!kAA_XqB;yxL`Oiw{MS-7pLWaP8Z%bL%&P{5{{p7vYG)0E*X~ZD Q!e_TbP)Neq@o7eW0PyK{*#H0l delta 454 zcmez2Hs6gyHNeG9*HF)RBA*tc@kC2?;ebk@h><3jogG(dUP^va7MJnF#X6cM3T9@e zdX^Rz3YHdnW+p}^3g#B3dM1`;3b9=J0f|Mac_j*l3I+;6lf@Xfa4A@r80uMASelzp zzQ-tq%;jN{M&?>G$w(j+1Io@0yBVwnFWTJp#i3t zDK;@va}2%aCYW|uSQ>!U)gxJKXlQ7HuG!Go2O7N%xKleLsp7)>VoE6a1485(n`s=E5SaRC67vR`@t diff --git a/pdf_page_split/Readme.md b/pdf_page_split/Readme.md index 1c38141..eca0fa8 100644 --- a/pdf_page_split/Readme.md +++ b/pdf_page_split/Readme.md @@ -13,7 +13,7 @@ - 输出所有的对应latex语句(template.tex)中可以修改 # 使用 -- python3 -m pip install PyPDF2 +- python3 -m pip install -r requirements.txt - python3 splitor.py -t figure.tex -c figure.csv -o split/figure testcase.pdf - `-t`选择模板,只有figure和table两类(也可以执行修改) - `-c`选择数据文件 diff --git a/pdf_page_split/splitor.py b/pdf_page_split/splitor.py index 83bd6dd..78f4df0 100644 --- a/pdf_page_split/splitor.py +++ b/pdf_page_split/splitor.py @@ -1,15 +1,12 @@ # coding=utf-8 import argparse -import PyPDF2 as pdflib -from PyPDF2 import PdfFileWriter, PdfFileReader - +import configparser +import csv +import logging import os import sys -import logging - -import configparser -import csv +from pypdf import PdfReader, PdfWriter # from config import config @@ -113,7 +110,7 @@ def split(path, template, config, output="split", no_pdf=False): with open(path, "rb") as pdf, open( "log.txt".format(output), "w", encoding="utf-8" ) as log: - reader = PdfFileReader(pdf) + reader = PdfReader(pdf) for i, item in zip(range(reader.getNumPages()), config): if item is None: @@ -130,7 +127,7 @@ def split(path, template, config, output="split", no_pdf=False): page = reader.getPage(i) if not no_pdf: - writer = PdfFileWriter() + writer = PdfWriter() writer.add_page(page) with open("{}".format(path), "wb") as figure: writer.write(figure) diff --git a/pdf_white_cut/cutter.py b/pdf_white_cut/cutter.py index 7c818df..970c9ff 100644 --- a/pdf_white_cut/cutter.py +++ b/pdf_white_cut/cutter.py @@ -1,7 +1,7 @@ import os -from PyPDF2 import PdfWriter, PdfReader from path import Path +from pypdf import PdfReader, PdfWriter from pdf_white_cut import analyzer from pdf_white_cut.logger import logger @@ -55,7 +55,7 @@ def edit_pdf(source: Path, target: Path, ignore=0): try: # MENTION: never move and change the sequence, since file IO. # analyses the visible box of each page, aka the box scale. res=[(x1,y1,x2,y2)] - # analyses whole pdf at one time since it use `pdfminer` (not `PyPDF2`) + # analyses whole pdf at one time since it use `pdfminer` (not `pypdf`) page_box_list = analyzer.extract_pdf_boxs(source, ignore=ignore) # edit pdf by visible box and output it diff --git a/requirements.txt b/requirements.txt index 8cc99df..a8c7f5f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # required -PyPDF2==2.11.1 +pypdf==3.7.1 pdfminer.six loguru==0.6.0 path diff --git a/tests/test_PyPDF2.py b/tests/test_pypdf.py similarity index 81% rename from tests/test_PyPDF2.py rename to tests/test_pypdf.py index 0d27e4b..b890284 100644 --- a/tests/test_PyPDF2.py +++ b/tests/test_pypdf.py @@ -1,7 +1,7 @@ -from PyPDF2 import PdfFileWriter, PdfFileReader +from pypdf import PdfReader, PdfWriter -output = PdfFileWriter() -input1 = PdfFileReader(open("cases/input/input.pdf", "rb")) +output = PdfWriter() +input1 = PdfReader(open("cases/input/input.pdf", "rb")) # print how many pages input1 has: print("document1.pdf has %d pages." % input1.getNumPages()) @@ -18,15 +18,15 @@ # add page 4 from input1, but first add a watermark from another PDF: page4 = input1.getPage(3) -watermark = PdfFileReader(open("input.pdf", "rb")) +watermark = PdfReader(open("input.pdf", "rb")) page4.mergePage(watermark.getPage(0)) output.add_page(page4) # add page 5 from input1, but crop it to half size: page5 = input1.getPage(4) -page5.mediaBox.upperRight = ( - page5.mediaBox.getUpperRight_x() / 2, - page5.mediaBox.getUpperRight_y() / 2, +page5.mediabox.upperRight = ( + page5.mediabox.getUpperRight_x() / 2, + page5.mediabox.getUpperRight_y() / 2, ) output.add_page(page5) diff --git a/tests/test_white_cut.py b/tests/test_white_cut.py index 6a9d20f..d424efc 100644 --- a/tests/test_white_cut.py +++ b/tests/test_white_cut.py @@ -1,7 +1,7 @@ import sys import loguru -from PyPDF2 import PdfFileReader, PdfFileWriter +from pypdf import PdfReader, PdfWriter sys.path.append(".") @@ -28,13 +28,12 @@ def test_analyzer(): def test_rw_pdf(): - - pdf = PdfFileReader(open("cases/input/input.pdf", "rb")) - out = PdfFileWriter() + pdf = PdfReader(open("cases/input/input.pdf", "rb")) + out = PdfWriter() for page in pdf.pages: - page.mediaBox.upper_right = (580, 800) - page.mediaBox.lower_left = (128, 232) + page.mediabox.upper_right = (580, 800) + page.mediabox.lower_left = (128, 232) out.add_page(page) ous = open("cases/output/output.pdf", "wb")