From d6bffd05515c69fc1bd57e85b91c48264eddf69b Mon Sep 17 00:00:00 2001 From: Alessio Date: Tue, 9 Dec 2025 19:17:19 +0100 Subject: [PATCH 1/3] PDF merge command --- src/parxy_cli/cli.py | 2 + src/parxy_cli/commands/pdf.py | 281 ++++++++++++++++++++++ tests/commands/test_pdf.py | 439 ++++++++++++++++++++++++++++++++++ 3 files changed, 722 insertions(+) create mode 100644 src/parxy_cli/commands/pdf.py create mode 100644 tests/commands/test_pdf.py diff --git a/src/parxy_cli/cli.py b/src/parxy_cli/cli.py index 9b7d429..7bb086d 100644 --- a/src/parxy_cli/cli.py +++ b/src/parxy_cli/cli.py @@ -16,6 +16,7 @@ from parxy_cli.commands.env import app as env_command from parxy_cli.commands.version import app as version_command from parxy_cli.commands.markdown import app as markdown_command +from parxy_cli.commands.pdf import app as pdf_command # Create typer app @@ -71,6 +72,7 @@ def main( app.add_typer(env_command) app.add_typer(version_command) app.add_typer(markdown_command) +app.add_typer(pdf_command) def main(): diff --git a/src/parxy_cli/commands/pdf.py b/src/parxy_cli/commands/pdf.py new file mode 100644 index 0000000..49c4a3b --- /dev/null +++ b/src/parxy_cli/commands/pdf.py @@ -0,0 +1,281 @@ +"""PDF manipulation commands.""" + +import re +from pathlib import Path +from typing import List, Annotated, Optional, Tuple + +import typer +import pymupdf + +from parxy_cli.console.console import Console + +app = typer.Typer() + +console = Console() + + +def parse_input_with_pages( + input_str: str, +) -> Tuple[str, Optional[int], Optional[int]]: + """ + Parse input string to extract file path and page range. 
+ + Supports formats: + - file.pdf[1] - single page (1-based) + - file.pdf[:2] - from start to page 2 (1-based, inclusive) + - file.pdf[3:] - from page 3 to end (1-based) + - file.pdf[3:5] - from page 3 to 5 (1-based, inclusive) + - file.pdf - all pages + + Args: + input_str: Input string with optional page range + + Returns: + Tuple of (file_path, from_page, to_page) where pages are 0-based for PyMuPDF. + from_page and to_page are None if no range specified or represent the range to use. + """ + # Match pattern: filename[range] + pattern = r'^(.+?)\[([^\]]+)\]$' + match = re.match(pattern, input_str) + + if not match: + # No page range specified + return input_str, None, None + + file_path = match.group(1) + page_range = match.group(2) + + # Parse the page range + if ':' in page_range: + # Range format [start:end] + parts = page_range.split(':', 1) + start_str = parts[0].strip() + end_str = parts[1].strip() + + # Convert to 0-based indices + # PyMuPDF uses 0-based indexing + from_page = (int(start_str) - 1) if start_str else 0 + to_page = (int(end_str) - 1) if end_str else None # None means last page + + else: + # Single page [n] + page_num = int(page_range) - 1 # Convert to 0-based + from_page = page_num + to_page = page_num + + return file_path, from_page, to_page + + +def collect_pdf_files_with_ranges( + inputs: List[str], +) -> List[Tuple[Path, Optional[int], Optional[int]]]: + """ + Collect PDF files from the input list with optional page ranges. + + For folders, only files in the exact directory are collected (non-recursive). + For files with page ranges (e.g., file.pdf[1:3]), parse and extract the range. + + Args: + inputs: List of file paths (with optional page ranges) and/or folder paths + + Returns: + List of tuples: (Path, from_page, to_page) where pages are 0-based. + from_page and to_page are None if all pages should be included. 
+ """ + files = [] + + for input_str in inputs: + # Parse the input to extract file path and page range + file_path_str, from_page, to_page = parse_input_with_pages(input_str) + path = Path(file_path_str) + + if path.is_file(): + # Check if it's a PDF + if path.suffix.lower() == '.pdf': + files.append((path, from_page, to_page)) + else: + console.warning(f'Skipping non-PDF file: {file_path_str}') + elif path.is_dir(): + # Non-recursive: only files in the given directory + # Directories cannot have page ranges + if from_page is not None or to_page is not None: + console.warning( + f'Page ranges are not supported for directories: {input_str}' + ) + pdf_files = sorted(path.glob('*.pdf')) + if pdf_files: + # Add all PDFs from directory without page ranges + files.extend([(f, None, None) for f in pdf_files]) + else: + console.warning(f'No PDF files found in directory: {file_path_str}') + else: + console.warning(f'Path not found: {file_path_str}') + + return files + + +@app.command(name='pdf:merge', help='Merge multiple PDF files into a single PDF') +def merge( + inputs: Annotated[ + List[str], + typer.Argument( + help='One or more PDF files or folders to merge. Files support page ranges in square brackets (e.g., file.pdf[1:3]). Folders are processed non-recursively.', + ), + ], + output: Annotated[ + str, + typer.Option( + '--output', + '-o', + help='Output file path for the merged PDF. If not specified, you will be prompted.', + ), + ] = None, +): + """ + Merge multiple PDF files into a single PDF. + + Files are merged in the order they are provided. When a folder is specified, + PDF files in that folder are included (non-recursively) and sorted alphabetically. 
+ + Page ranges can be specified using square brackets with 1-based indexing: + - file.pdf[1] - only page 1 + - file.pdf[:2] - from first page to page 2 (inclusive) + - file.pdf[3:] - from page 3 to the end + - file.pdf[3:5] - from page 3 to page 5 (inclusive) + - file.pdf - all pages (no brackets) + + Examples: + + # Merge specific files with output specified + parxy pdf:merge file1.pdf file2.pdf -o merged.pdf + + # Merge files - will prompt for output filename + parxy pdf:merge file1.pdf file2.pdf + + # Merge with page ranges - take page 1 from file1, pages 2-4 from file2 + parxy pdf:merge file1.pdf[1] file2.pdf[2:4] -o merged.pdf + + # Merge specific pages from multiple files + parxy pdf:merge doc1.pdf[:3] doc2.pdf[5:] doc3.pdf[2] -o combined.pdf + + # Mix full files and page ranges + parxy pdf:merge cover.pdf report.pdf[1:10] appendix.pdf -o final.pdf + + # Merge all PDFs in a folder + parxy pdf:merge /path/to/folder -o merged.pdf + + # Merge files and folders + parxy pdf:merge doc1.pdf /path/to/folder doc2.pdf -o merged.pdf + """ + console.action('Merge PDF files', space_after=False) + + # Collect all PDF files with page ranges + files_with_ranges = collect_pdf_files_with_ranges(inputs) + + if not files_with_ranges: + console.error('No PDF files found to merge.', panel=True) + raise typer.Exit(1) + + if len(files_with_ranges) < 2: + console.warning( + 'Only one PDF file found. 
At least two files are needed for merging.', + panel=True, + ) + raise typer.Exit(1) + + console.info( + f'Found {len(files_with_ranges)} PDF file{"s" if len(files_with_ranges) > 1 else ""} to merge' + ) + + # Handle output path + if output is None: + output = typer.prompt('Enter output filename or path') + + output_path = Path(output) + + # If only a filename is provided (not an absolute path), use the first input file's directory + if not output_path.is_absolute() and output_path.parent == Path('.'): + first_file = files_with_ranges[0][0] + output_path = first_file.parent / output_path + + # Ensure the output has .pdf extension + if output_path.suffix.lower() != '.pdf': + output_path = output_path.with_suffix('.pdf') + + # Create output directory if it doesn't exist + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Merge PDFs + try: + with console.shimmer(f'Merging {len(files_with_ranges)} PDF files...'): + merged_pdf = pymupdf.open() + + for file_path, from_page, to_page in files_with_ranges: + try: + pdf = pymupdf.open(file_path) + + # Determine page range to insert + if from_page is None and to_page is None: + # Insert all pages + page_info = 'all pages' + merged_pdf.insert_pdf(pdf) + else: + # Insert specific page range + # PyMuPDF insert_pdf uses from_page and to_page (inclusive, 0-based) + actual_from = from_page if from_page is not None else 0 + actual_to = to_page if to_page is not None else (len(pdf) - 1) + + # Validate page range + if actual_from < 0 or actual_from >= len(pdf): + console.warning( + f'Invalid page range for {file_path.name}: page {actual_from + 1} does not exist' + ) + pdf.close() + continue + + if actual_to < 0 or actual_to >= len(pdf): + console.warning( + f'Invalid page range for {file_path.name}: page {actual_to + 1} does not exist' + ) + pdf.close() + continue + + if actual_from > actual_to: + console.warning( + f'Invalid page range for {file_path.name}: start page {actual_from + 1} > end page {actual_to + 1}' + ) + 
pdf.close() + continue + + # Format page info for display (1-based) + if actual_from == actual_to: + page_info = f'page {actual_from + 1}' + else: + page_info = f'pages {actual_from + 1}-{actual_to + 1}' + + merged_pdf.insert_pdf( + pdf, from_page=actual_from, to_page=actual_to + ) + + console.print( + f'[faint]⎿ [/faint] Adding {file_path.name} ({page_info})' + ) + pdf.close() + + except Exception as e: + console.error(f'Error processing {file_path.name}: {str(e)}') + merged_pdf.close() + raise typer.Exit(1) + + # Save the merged PDF + merged_pdf.save(str(output_path)) + merged_pdf.close() + + console.newline() + console.success( + f'Successfully merged {len(files_with_ranges)} files into {output_path}' + ) + + except Exception as e: + console.error(f'Error during merge: {str(e)}') + raise typer.Exit(1) diff --git a/tests/commands/test_pdf.py b/tests/commands/test_pdf.py new file mode 100644 index 0000000..21bdd2e --- /dev/null +++ b/tests/commands/test_pdf.py @@ -0,0 +1,439 @@ +"""Test suite for PDF commands.""" + +import pytest +from pathlib import Path +from typer.testing import CliRunner +import pymupdf + +from parxy_cli.commands.pdf import ( + app, + parse_input_with_pages, + collect_pdf_files_with_ranges, +) + + +@pytest.fixture +def runner(): + """Fixture providing a CLI runner.""" + return CliRunner() + + +@pytest.fixture +def sample_pdfs(tmp_path): + """Create sample PDF files for testing.""" + # Create first PDF with 3 pages + pdf1_path = tmp_path / 'doc1.pdf' + pdf1 = pymupdf.open() + for i in range(3): + page = pdf1.new_page(width=612, height=792) + page.insert_text((100, 100), f'Page {i + 1} of doc1') + pdf1.save(str(pdf1_path)) + pdf1.close() + + # Create second PDF with 2 pages + pdf2_path = tmp_path / 'doc2.pdf' + pdf2 = pymupdf.open() + for i in range(2): + page = pdf2.new_page(width=612, height=792) + page.insert_text((100, 100), f'Page {i + 1} of doc2') + pdf2.save(str(pdf2_path)) + pdf2.close() + + # Create third PDF with 5 pages + pdf3_path = 
tmp_path / 'doc3.pdf' + pdf3 = pymupdf.open() + for i in range(5): + page = pdf3.new_page(width=612, height=792) + page.insert_text((100, 100), f'Page {i + 1} of doc3') + pdf3.save(str(pdf3_path)) + pdf3.close() + + return { + 'pdf1': pdf1_path, + 'pdf2': pdf2_path, + 'pdf3': pdf3_path, + 'tmp_path': tmp_path, + } + + +@pytest.fixture +def pdf_folder(tmp_path): + """Create a folder with multiple PDFs.""" + folder = tmp_path / 'pdfs' + folder.mkdir() + + # Create three PDFs in the folder + for i in range(1, 4): + pdf_path = folder / f'file{i}.pdf' + pdf = pymupdf.open() + page = pdf.new_page(width=612, height=792) + page.insert_text((100, 100), f'Content of file{i}') + pdf.save(str(pdf_path)) + pdf.close() + + return folder + + +# Tests for parse_input_with_pages helper function +class TestParseInputWithPages: + """Tests for the parse_input_with_pages helper function.""" + + def test_no_page_range(self): + """Test parsing input without page range.""" + file_path, from_page, to_page = parse_input_with_pages('file.pdf') + assert file_path == 'file.pdf' + assert from_page is None + assert to_page is None + + def test_single_page(self): + """Test parsing single page specification.""" + file_path, from_page, to_page = parse_input_with_pages('file.pdf[3]') + assert file_path == 'file.pdf' + assert from_page == 2 # 0-based index + assert to_page == 2 + + def test_range_from_start(self): + """Test parsing range from start to specified page.""" + file_path, from_page, to_page = parse_input_with_pages('file.pdf[:5]') + assert file_path == 'file.pdf' + assert from_page == 0 + assert to_page == 4 # 0-based index + + def test_range_to_end(self): + """Test parsing range from specified page to end.""" + file_path, from_page, to_page = parse_input_with_pages('file.pdf[3:]') + assert file_path == 'file.pdf' + assert from_page == 2 # 0-based index + assert to_page is None + + def test_range_both_bounds(self): + """Test parsing range with both start and end specified.""" + file_path, 
from_page, to_page = parse_input_with_pages('file.pdf[2:5]') + assert file_path == 'file.pdf' + assert from_page == 1 # 0-based index + assert to_page == 4 # 0-based index + + def test_path_with_spaces(self): + """Test parsing path with spaces.""" + file_path, from_page, to_page = parse_input_with_pages( + 'path with spaces/file.pdf[1:3]' + ) + assert file_path == 'path with spaces/file.pdf' + assert from_page == 0 + assert to_page == 2 + + def test_path_with_brackets_in_name(self): + """Test parsing path without page range but with brackets elsewhere.""" + file_path, from_page, to_page = parse_input_with_pages('file.pdf') + assert file_path == 'file.pdf' + assert from_page is None + assert to_page is None + + +# Tests for collect_pdf_files_with_ranges helper function +class TestCollectPdfFilesWithRanges: + """Tests for the collect_pdf_files_with_ranges helper function.""" + + def test_single_file_no_range(self, sample_pdfs): + """Test collecting a single file without page range.""" + files = collect_pdf_files_with_ranges([str(sample_pdfs['pdf1'])]) + assert len(files) == 1 + assert files[0][0] == sample_pdfs['pdf1'] + assert files[0][1] is None # from_page + assert files[0][2] is None # to_page + + def test_single_file_with_range(self, sample_pdfs): + """Test collecting a single file with page range.""" + files = collect_pdf_files_with_ranges([f'{sample_pdfs["pdf1"]}[1:2]']) + assert len(files) == 1 + assert files[0][0] == sample_pdfs['pdf1'] + assert files[0][1] == 0 # from_page (0-based) + assert files[0][2] == 1 # to_page (0-based) + + def test_multiple_files(self, sample_pdfs): + """Test collecting multiple files.""" + files = collect_pdf_files_with_ranges( + [str(sample_pdfs['pdf1']), str(sample_pdfs['pdf2'])] + ) + assert len(files) == 2 + assert files[0][0] == sample_pdfs['pdf1'] + assert files[1][0] == sample_pdfs['pdf2'] + + def test_folder_input(self, pdf_folder): + """Test collecting PDFs from a folder.""" + files = 
collect_pdf_files_with_ranges([str(pdf_folder)]) + assert len(files) == 3 + # Files should be sorted alphabetically + file_names = [f[0].name for f in files] + assert file_names == ['file1.pdf', 'file2.pdf', 'file3.pdf'] + + def test_mixed_files_and_folders(self, sample_pdfs, pdf_folder): + """Test collecting from both files and folders.""" + files = collect_pdf_files_with_ranges( + [str(sample_pdfs['pdf1']), str(pdf_folder)] + ) + assert len(files) == 4 # 1 file + 3 from folder + + def test_nonexistent_file(self, tmp_path): + """Test handling of nonexistent file.""" + files = collect_pdf_files_with_ranges([str(tmp_path / 'nonexistent.pdf')]) + assert len(files) == 0 + + def test_non_pdf_file(self, tmp_path): + """Test handling of non-PDF file.""" + txt_file = tmp_path / 'file.txt' + txt_file.write_text('not a pdf') + files = collect_pdf_files_with_ranges([str(txt_file)]) + assert len(files) == 0 + + def test_empty_folder(self, tmp_path): + """Test handling of empty folder.""" + empty_folder = tmp_path / 'empty' + empty_folder.mkdir() + files = collect_pdf_files_with_ranges([str(empty_folder)]) + assert len(files) == 0 + + def test_folder_with_page_range_warning(self, pdf_folder): + """Test that page ranges on folders produce warning.""" + files = collect_pdf_files_with_ranges([f'{pdf_folder}[1:3]']) + # Should still collect files but ignore the page range + assert len(files) == 3 + # All files should have no page ranges + for file_path, from_page, to_page in files: + assert from_page is None + assert to_page is None + + +# Tests for the merge command +class TestMergeCommand: + """Tests for the pdf:merge command.""" + + def test_merge_two_files_basic(self, runner, sample_pdfs): + """Test basic merge of two PDF files.""" + output = sample_pdfs['tmp_path'] / 'merged.pdf' + result = runner.invoke( + app, + [ + 'pdf:merge', + str(sample_pdfs['pdf1']), + str(sample_pdfs['pdf2']), + '--output', + str(output), + ], + ) + + assert result.exit_code == 0 + assert 
output.exists() + + # Verify the merged PDF has correct number of pages + merged = pymupdf.open(str(output)) + assert len(merged) == 5 # 3 pages from pdf1 + 2 pages from pdf2 + merged.close() + + def test_merge_with_page_ranges(self, runner, sample_pdfs): + """Test merging with specific page ranges.""" + output = sample_pdfs['tmp_path'] / 'merged.pdf' + result = runner.invoke( + app, + [ + 'pdf:merge', + f'{sample_pdfs["pdf1"]}[1:2]', # First 2 pages + f'{sample_pdfs["pdf2"]}[1]', # Only first page + '--output', + str(output), + ], + ) + + assert result.exit_code == 0 + assert output.exists() + + # Verify the merged PDF has correct number of pages + merged = pymupdf.open(str(output)) + assert len(merged) == 3 # 2 pages from pdf1 + 1 page from pdf2 + merged.close() + + def test_merge_folder(self, runner, pdf_folder, tmp_path): + """Test merging all PDFs in a folder.""" + output = tmp_path / 'merged.pdf' + result = runner.invoke( + app, ['pdf:merge', str(pdf_folder), '--output', str(output)] + ) + + assert result.exit_code == 0 + assert output.exists() + + # Verify the merged PDF has correct number of pages (3 PDFs with 1 page each) + merged = pymupdf.open(str(output)) + assert len(merged) == 3 + merged.close() + + def test_merge_without_output_prompts(self, runner, sample_pdfs): + """Test that merge prompts for output when not specified.""" + result = runner.invoke( + app, + ['pdf:merge', str(sample_pdfs['pdf1']), str(sample_pdfs['pdf2'])], + input='output.pdf\n', + ) + + # Command should prompt for output filename + assert 'Enter output filename' in result.stdout + assert result.exit_code == 0 + + def test_merge_adds_pdf_extension(self, runner, sample_pdfs): + """Test that .pdf extension is added if missing.""" + output = sample_pdfs['tmp_path'] / 'merged' # No extension + result = runner.invoke( + app, + [ + 'pdf:merge', + str(sample_pdfs['pdf1']), + str(sample_pdfs['pdf2']), + '--output', + str(output), + ], + ) + + assert result.exit_code == 0 + # Output should 
have .pdf extension added + assert (sample_pdfs['tmp_path'] / 'merged.pdf').exists() + + def test_merge_creates_output_directory(self, runner, sample_pdfs): + """Test that output directory is created if it doesn't exist.""" + output = sample_pdfs['tmp_path'] / 'subdir' / 'nested' / 'merged.pdf' + result = runner.invoke( + app, + [ + 'pdf:merge', + str(sample_pdfs['pdf1']), + str(sample_pdfs['pdf2']), + '--output', + str(output), + ], + ) + + assert result.exit_code == 0 + assert output.exists() + assert output.parent.exists() + + def test_merge_single_file_fails(self, runner, sample_pdfs): + """Test that merging a single file fails with appropriate message.""" + output = sample_pdfs['tmp_path'] / 'merged.pdf' + result = runner.invoke( + app, + ['pdf:merge', str(sample_pdfs['pdf1']), '--output', str(output)], + ) + + assert result.exit_code == 1 + assert 'at least two files' in result.stdout.lower() + + def test_merge_no_files_fails(self, runner, tmp_path): + """Test that merging with no valid files fails.""" + output = tmp_path / 'merged.pdf' + result = runner.invoke( + app, + ['pdf:merge', str(tmp_path / 'nonexistent.pdf'), '--output', str(output)], + ) + + assert result.exit_code == 1 + assert 'no pdf files found' in result.stdout.lower() + + def test_merge_with_invalid_page_range(self, runner, sample_pdfs): + """Test merging with invalid page range.""" + output = sample_pdfs['tmp_path'] / 'merged.pdf' + # pdf1 has only 3 pages, trying to access page 10 + result = runner.invoke( + app, + [ + 'pdf:merge', + f'{sample_pdfs["pdf1"]}[10]', + str(sample_pdfs['pdf2']), + '--output', + str(output), + ], + ) + + # Should show warning but continue with pdf2 + assert 'invalid page range' in result.stdout.lower() or result.exit_code == 0 + + def test_merge_mixed_files_and_ranges(self, runner, sample_pdfs): + """Test merging mix of full files and page ranges.""" + output = sample_pdfs['tmp_path'] / 'merged.pdf' + result = runner.invoke( + app, + [ + 'pdf:merge', + 
str(sample_pdfs['pdf1']), # All pages (3) + f'{sample_pdfs["pdf3"]}[2:4]', # Pages 2-4 (3 pages) + '--output', + str(output), + ], + ) + + assert result.exit_code == 0 + assert output.exists() + + merged = pymupdf.open(str(output)) + assert len(merged) == 6 # 3 from pdf1 + 3 from pdf3[2:4] + merged.close() + + def test_merge_with_open_ended_range(self, runner, sample_pdfs): + """Test merging with open-ended page ranges.""" + output = sample_pdfs['tmp_path'] / 'merged.pdf' + result = runner.invoke( + app, + [ + 'pdf:merge', + f'{sample_pdfs["pdf3"]}[:2]', # First 2 pages + f'{sample_pdfs["pdf3"]}[3:]', # From page 3 to end (3 pages) + '--output', + str(output), + ], + ) + + assert result.exit_code == 0 + assert output.exists() + + merged = pymupdf.open(str(output)) + assert len(merged) == 5 # 2 + 3 pages + merged.close() + + def test_merge_preserves_order(self, runner, sample_pdfs): + """Test that files are merged in the specified order.""" + output = sample_pdfs['tmp_path'] / 'merged.pdf' + result = runner.invoke( + app, + [ + 'pdf:merge', + str(sample_pdfs['pdf2']), + str(sample_pdfs['pdf1']), + '--output', + str(output), + ], + ) + + assert result.exit_code == 0 + assert output.exists() + + merged = pymupdf.open(str(output)) + # Should be pdf2 pages first, then pdf1 pages + assert len(merged) == 5 + # We can't easily verify order without reading content, but count is correct + merged.close() + + def test_merge_relative_output_path(self, runner, sample_pdfs): + """Test that relative output path uses first file's directory.""" + result = runner.invoke( + app, + [ + 'pdf:merge', + str(sample_pdfs['pdf1']), + str(sample_pdfs['pdf2']), + '--output', + 'merged.pdf', # Relative path + ], + ) + + assert result.exit_code == 0 + # Output should be in same directory as first input file + expected_output = sample_pdfs['pdf1'].parent / 'merged.pdf' + assert expected_output.exists() From 69f466add7e2051053c04e9de43ab8a6e2104292 Mon Sep 17 00:00:00 2001 From: Alessio Date: Tue, 
9 Dec 2025 20:55:02 +0100 Subject: [PATCH 2/3] Add pdf:split command --- src/parxy_cli/commands/pdf.py | 116 +++++++++++++++++++++++++++++ tests/commands/test_pdf.py | 134 ++++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+) diff --git a/src/parxy_cli/commands/pdf.py b/src/parxy_cli/commands/pdf.py index 49c4a3b..700bfd6 100644 --- a/src/parxy_cli/commands/pdf.py +++ b/src/parxy_cli/commands/pdf.py @@ -279,3 +279,119 @@ def merge( except Exception as e: console.error(f'Error during merge: {str(e)}') raise typer.Exit(1) + + +@app.command(name='pdf:split', help='Split a PDF file into individual pages') +def split( + input_file: Annotated[ + str, + typer.Argument( + help='PDF file to split', + ), + ], + output_dir: Annotated[ + Optional[str], + typer.Option( + '--output', + '-o', + help='Output directory for split files. If not specified, creates a folder next to the input file.', + ), + ] = None, + prefix: Annotated[ + Optional[str], + typer.Option( + '--prefix', + '-p', + help='Prefix for output filenames. If not specified, uses the input filename.', + ), + ] = None, +): + """ + Split a PDF file into individual pages. + + Each page becomes a separate PDF file in the output directory. 
+ + Output files are named: {prefix}_page_{number}.pdf + + Examples: + + # Split into individual pages (default behavior) + parxy pdf:split document.pdf + + # Split with custom output directory + parxy pdf:split document.pdf -o /path/to/output + + # Split with custom prefix + parxy pdf:split document.pdf --prefix chapter + + # Split with custom output and prefix + parxy pdf:split report.pdf -o ./pages -p page + """ + console.action('Split PDF file', space_after=False) + + # Validate input file + input_path = Path(input_file) + if not input_path.is_file(): + console.error(f'Input file not found: {input_file}', panel=True) + raise typer.Exit(1) + + if input_path.suffix.lower() != '.pdf': + console.error(f'Input file must be a PDF: {input_file}', panel=True) + raise typer.Exit(1) + + # Determine output directory + if output_dir is None: + # Create a folder next to the input file + output_path = input_path.parent / f'{input_path.stem}_split' + else: + output_path = Path(output_dir) + + # Create output directory + output_path.mkdir(parents=True, exist_ok=True) + + # Determine filename prefix + if prefix is None: + prefix = input_path.stem + + # Open and process the PDF + try: + pdf = pymupdf.open(input_path) + total_pages = len(pdf) + + if total_pages == 0: + console.error('PDF file is empty (no pages)', panel=True) + pdf.close() + raise typer.Exit(1) + + console.info( + f'Processing PDF with {total_pages} page{"s" if total_pages > 1 else ""}' + ) + console.info( + f'Splitting into {total_pages} file{"s" if total_pages > 1 else ""}' + ) + + with console.shimmer(f'Splitting PDF...'): + output_files = [] + + # Split into individual pages + for page_num in range(total_pages): + output_file = output_path / f'{prefix}_page_{page_num + 1}.pdf' + output_pdf = pymupdf.open() + output_pdf.insert_pdf(pdf, from_page=page_num, to_page=page_num) + output_pdf.save(str(output_file)) + output_pdf.close() + output_files.append(output_file) + console.print( + f'[faint]⎿ [/faint] Created 
{output_file.name} (page {page_num + 1})' + ) + + pdf.close() + + console.newline() + console.success( + f'Successfully split PDF into {len(output_files)} file{"s" if len(output_files) > 1 else ""} in {output_path}' + ) + + except Exception as e: + console.error(f'Error during split: {str(e)}') + raise typer.Exit(1) diff --git a/tests/commands/test_pdf.py b/tests/commands/test_pdf.py index 21bdd2e..72ee0fc 100644 --- a/tests/commands/test_pdf.py +++ b/tests/commands/test_pdf.py @@ -437,3 +437,137 @@ def test_merge_relative_output_path(self, runner, sample_pdfs): # Output should be in same directory as first input file expected_output = sample_pdfs['pdf1'].parent / 'merged.pdf' assert expected_output.exists() + + +# Tests for the split command +class TestSplitCommand: + """Tests for the pdf:split command.""" + + def test_split_into_individual_pages(self, runner, sample_pdfs): + """Test splitting a PDF into individual pages.""" + output_dir = sample_pdfs['tmp_path'] / 'split_output' + result = runner.invoke( + app, + [ + 'pdf:split', + str(sample_pdfs['pdf1']), + '--output', + str(output_dir), + ], + ) + + assert result.exit_code == 0 + assert output_dir.exists() + + # Should create 3 files (pdf1 has 3 pages) + output_files = sorted(output_dir.glob('*.pdf')) + assert len(output_files) == 3 + + # Check filenames + assert output_files[0].name == 'doc1_page_1.pdf' + assert output_files[1].name == 'doc1_page_2.pdf' + assert output_files[2].name == 'doc1_page_3.pdf' + + # Verify each file has exactly 1 page + for output_file in output_files: + pdf = pymupdf.open(str(output_file)) + assert len(pdf) == 1 + pdf.close() + + def test_split_with_custom_prefix(self, runner, sample_pdfs): + """Test splitting with a custom filename prefix.""" + output_dir = sample_pdfs['tmp_path'] / 'split_prefix' + result = runner.invoke( + app, + [ + 'pdf:split', + str(sample_pdfs['pdf2']), # 2 pages + '--output', + str(output_dir), + '--prefix', + 'chapter', + ], + ) + + assert result.exit_code 
== 0 + assert output_dir.exists() + + output_files = sorted(output_dir.glob('*.pdf')) + assert len(output_files) == 2 + assert output_files[0].name == 'chapter_page_1.pdf' + assert output_files[1].name == 'chapter_page_2.pdf' + + def test_split_default_output_directory(self, runner, sample_pdfs): + """Test that default output directory is created next to input file.""" + result = runner.invoke( + app, + [ + 'pdf:split', + str(sample_pdfs['pdf2']), + ], + ) + + assert result.exit_code == 0 + + # Default output should be {filename}_split next to the input + expected_output_dir = sample_pdfs['pdf2'].parent / 'doc2_split' + assert expected_output_dir.exists() + + output_files = list(expected_output_dir.glob('*.pdf')) + assert len(output_files) == 2 + + def test_split_nonexistent_file(self, runner, tmp_path): + """Test splitting a nonexistent file.""" + result = runner.invoke( + app, + [ + 'pdf:split', + str(tmp_path / 'nonexistent.pdf'), + ], + ) + + assert result.exit_code == 1 + assert 'not found' in result.stdout.lower() + + def test_split_non_pdf_file(self, runner, tmp_path): + """Test splitting a non-PDF file.""" + txt_file = tmp_path / 'file.txt' + txt_file.write_text('not a pdf') + + result = runner.invoke( + app, + [ + 'pdf:split', + str(txt_file), + ], + ) + + assert result.exit_code == 1 + assert 'must be a pdf' in result.stdout.lower() + + def test_split_single_page_pdf(self, runner, tmp_path): + """Test splitting a single-page PDF.""" + # Create a single-page PDF + pdf_path = tmp_path / 'single.pdf' + pdf = pymupdf.open() + page = pdf.new_page(width=612, height=792) + page.insert_text((100, 100), 'Single page') + pdf.save(str(pdf_path)) + pdf.close() + + output_dir = tmp_path / 'split_single' + result = runner.invoke( + app, + [ + 'pdf:split', + str(pdf_path), + '--output', + str(output_dir), + ], + ) + + assert result.exit_code == 0 + + output_files = list(output_dir.glob('*.pdf')) + assert len(output_files) == 1 + assert output_files[0].name == 
'single_page_1.pdf' From 5d30f2dc8852cab0f34dfa86d3ca204195a2b504 Mon Sep 17 00:00:00 2001 From: Alessio Date: Tue, 9 Dec 2025 20:55:33 +0100 Subject: [PATCH 3/3] Document pdf:split and pdf:merge commands --- README.md | 8 ++ docs/howto/pdf_manipulation.md | 255 +++++++++++++++++++++++++++++++++ docs/tutorials/using_cli.md | 59 ++++++++ 3 files changed, 322 insertions(+) create mode 100644 docs/howto/pdf_manipulation.md diff --git a/README.md b/README.md index 11bb3bb..46b0909 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,8 @@ Once installed, you can use the `parxy` command to: - `parxy parse`: Extract text content from documents with customizable granularity levels and output formats. Process individual files or entire folders, use multiple drivers, and control output with progress bars. - `parxy preview`: Interactive document viewer showing metadata, table of contents, and content preview in a scrollable interface - `parxy markdown`: Convert documents into Markdown format, with optional combining of multiple documents +- `parxy pdf:merge`: Merge multiple PDF files into one, with support for selecting specific page ranges +- `parxy pdf:split`: Split a PDF file into individual pages - `parxy drivers`: List available document processing drivers - `parxy env`: Create a configuration file with default settings - `parxy docker`: Generate a Docker Compose configuration for self-hosted services @@ -89,6 +91,12 @@ parxy preview document.pdf # Convert multiple PDFs to markdown and combine them parxy markdown --combine -o output/ doc1.pdf doc2.pdf +# Merge multiple PDFs with page ranges +parxy pdf:merge cover.pdf doc1.pdf[1:10] doc2.pdf -o merged.pdf + +# Split a PDF into individual pages +parxy pdf:split document.pdf -o ./pages + # List available drivers parxy drivers ``` diff --git a/docs/howto/pdf_manipulation.md b/docs/howto/pdf_manipulation.md new file mode 100644 index 0000000..6f7c2f7 --- /dev/null +++ b/docs/howto/pdf_manipulation.md @@ -0,0 +1,255 @@ +# How 
to Manipulate PDFs with Parxy + +Parxy provides powerful **PDF manipulation commands** that allow you to merge multiple PDF files into one or split a single PDF into multiple files — all from the command line. + +These commands are useful for: +- Combining multiple PDF documents into a single file +- Extracting specific page ranges from PDFs +- Splitting large PDFs into smaller, manageable files +- Reorganizing PDF pages + +## Merging PDFs + +The `pdf:merge` command combines multiple PDF files into a single output file, with support for selecting specific page ranges. + +### Basic Merging + +Merge two or more PDF files: + +```bash +parxy pdf:merge file1.pdf file2.pdf -o merged.pdf +``` + +If you don't specify an output file, you'll be prompted to enter one: + +```bash +parxy pdf:merge file1.pdf file2.pdf +# Prompts: Enter output filename or path: merged.pdf +``` + +### Merging Entire Folders + +You can merge all PDFs in a folder (non-recursively): + +```bash +parxy pdf:merge /path/to/folder -o combined.pdf +``` + +Files from folders are included in alphabetical order. 
+ +### Combining Files and Folders + +Mix individual files and folders: + +```bash +parxy pdf:merge cover.pdf /path/to/chapters appendix.pdf -o book.pdf +``` + +### Selecting Specific Pages + +Use square brackets to specify page ranges (1-based indexing): + +**Single page:** +```bash +parxy pdf:merge document.pdf[1] -o first_page.pdf +``` + +**Page range:** +```bash +parxy pdf:merge document.pdf[1:3] -o first_three_pages.pdf +``` + +**From start to page N:** +```bash +parxy pdf:merge document.pdf[:5] -o first_five_pages.pdf +``` + +**From page N to end:** +```bash +parxy pdf:merge document.pdf[10:] -o from_page_10.pdf +``` + +### Advanced Merging Examples + +**Combine specific pages from multiple documents:** +```bash +parxy pdf:merge doc1.pdf[1] doc2.pdf[2:4] doc3.pdf[:2] -o selected_pages.pdf +``` + +**Mix full files with page ranges:** +```bash +parxy pdf:merge cover.pdf report.pdf[1:10] summary.pdf appendix.pdf[5:] -o final_report.pdf +``` + +**Merge chapter files:** +```bash +parxy pdf:merge intro.pdf chapter1.pdf chapter2.pdf chapter3.pdf conclusion.pdf -o complete_book.pdf +``` + +### Output Path Handling + +- If you provide a full path, the file is created there +- If you provide just a filename, it's created in the same directory as the first input file +- The `.pdf` extension is added automatically if not provided + +```bash +# Creates merged.pdf in the same directory as file1.pdf +parxy pdf:merge file1.pdf file2.pdf -o merged + +# Creates in specified directory +parxy pdf:merge file1.pdf file2.pdf -o /output/dir/merged.pdf +``` + +## Splitting PDFs + +The `pdf:split` command divides a single PDF into individual pages, with each page becoming a separate PDF file. + +### Basic Splitting + +Split a PDF into individual pages: + +```bash +parxy pdf:split document.pdf +``` + +This creates a folder named `document_split/` containing: +- `document_page_1.pdf` +- `document_page_2.pdf` +- `document_page_3.pdf` +- etc. 
+ +### Custom Output Directory + +Specify where to save the split files: + +```bash +parxy pdf:split document.pdf --output /path/to/output +``` + +### Custom Filename Prefix + +Change the prefix of output filenames: + +```bash +parxy pdf:split book.pdf --prefix chapter +``` + +Creates files named: +- `chapter_page_1.pdf` +- `chapter_page_2.pdf` +- etc. + +### Complete Examples + +**Split with custom output directory:** +```bash +parxy pdf:split annual_report.pdf -o ./pages +``` + +**Split with custom prefix:** +```bash +parxy pdf:split presentation.pdf --prefix slide +``` + +Creates: +- `slide_page_1.pdf` +- `slide_page_2.pdf` +- etc. + +**Split with both custom output and prefix:** +```bash +parxy pdf:split document.pdf -o ./individual_pages -p page +``` + +## Combining Merge and Split + +You can chain operations together using the CLI: + +**Example: Extract specific pages and split them:** +```bash +# First, extract pages 10-20 +parxy pdf:merge document.pdf[10:20] -o extracted.pdf + +# Then split into individual pages +parxy pdf:split extracted.pdf -o ./individual_pages +``` + +**Example: Merge and organize:** +```bash +# Merge selected pages from multiple documents +parxy pdf:merge doc1.pdf[1:5] doc2.pdf[3:8] -o combined.pdf + +# Split the combined result into individual pages +parxy pdf:split combined.pdf -o ./pages -p combined_page +``` + +## Tips and Best Practices + +### Page Numbering +- All page ranges use **1-based indexing** (first page is page 1, not 0) +- Ranges are **inclusive** (e.g., `[1:3]` includes pages 1, 2, and 3) + +### File Organization +- Use folders to keep merged/split files organized +- Use descriptive prefixes to make file purposes clear +- Split creates a dedicated folder by default to avoid clutter + +### Performance +- Both commands are optimized for speed +- Large PDFs are processed efficiently +- Progress information is displayed during processing + +### Error Handling +- Invalid page ranges are reported with warnings +- Missing 
files are detected before processing starts +- The commands validate input before making changes + +## Command Reference + +### pdf:merge + +```bash +parxy pdf:merge [FILES...] --output OUTPUT +``` + +**Arguments:** +- `FILES`: One or more PDF files or folders. Supports page ranges: `file.pdf[1:3]` + +**Options:** +- `--output, -o`: Output file path (prompted if not provided) + +**Examples:** +```bash +parxy pdf:merge file1.pdf file2.pdf -o merged.pdf +parxy pdf:merge folder1/ file.pdf folder2/ -o combined.pdf +parxy pdf:merge doc.pdf[1:10] doc.pdf[20:30] -o selections.pdf +``` + +### pdf:split + +```bash +parxy pdf:split INPUT_FILE [OPTIONS] +``` + +**Arguments:** +- `INPUT_FILE`: PDF file to split into individual pages + +**Options:** +- `--output, -o`: Output directory (default: `{filename}_split/`) +- `--prefix, -p`: Output filename prefix (default: input filename) + +**Examples:** +```bash +parxy pdf:split document.pdf +parxy pdf:split document.pdf -o ./pages +parxy pdf:split document.pdf -o ./pages -p page +``` + +## Getting Help + +For detailed command usage, use the `--help` flag: + +```bash +parxy pdf:merge --help +parxy pdf:split --help +``` diff --git a/docs/tutorials/using_cli.md b/docs/tutorials/using_cli.md index d5541f8..42adb33 100644 --- a/docs/tutorials/using_cli.md +++ b/docs/tutorials/using_cli.md @@ -13,6 +13,8 @@ The Parxy CLI lets you: | `parxy parse` | Extract text content from documents with customizable detail levels and output formats. Process files or folders with multiple drivers. 
| | `parxy preview` | Interactive document viewer with metadata, table of contents, and scrollable content preview | | `parxy markdown` | Convert parsed documents into Markdown format (optionally combine multiple files) | +| `parxy pdf:merge`| Merge multiple PDF files into one, with support for page ranges | +| `parxy pdf:split`| Split a PDF file into individual pages | | `parxy drivers` | List available document processing drivers | | `parxy env` | Generate a default `.env` configuration file | | `parxy docker` | Create a Docker Compose configuration for running Parxy-related services | @@ -207,6 +209,61 @@ parxy markdown --combine -o output/ doc1.pdf doc2.pdf doc3.pdf This will generate a file named `combined_output.md` in the output directory. +## Manipulating PDFs + +Parxy provides two powerful commands for PDF manipulation: merging multiple PDFs into one and splitting a single PDF into multiple files. + +### Merging PDFs + +The `pdf:merge` command combines multiple PDF files into a single output file. You can merge entire files, specific page ranges, or folders of PDFs. + +**Basic merge:** +```bash +parxy pdf:merge file1.pdf file2.pdf -o merged.pdf +``` + +**Merge with page ranges:** +```bash +parxy pdf:merge doc1.pdf[1:5] doc2.pdf[3:7] -o combined.pdf +``` + +Page range syntax (1-based indexing): +- `file.pdf[1]` - Single page (page 1) +- `file.pdf[1:5]` - Pages 1 through 5 +- `file.pdf[:3]` - First 3 pages +- `file.pdf[5:]` - From page 5 to the end + +**Merge entire folders:** +```bash +parxy pdf:merge /path/to/pdfs -o combined.pdf +``` + +**Mix files, folders, and page ranges:** +```bash +parxy pdf:merge cover.pdf /chapters doc.pdf[10:20] appendix.pdf -o book.pdf +``` + +### Splitting PDFs + +The `pdf:split` command divides a PDF file into individual pages, with each page becoming a separate PDF file. 
+ +**Split into individual pages:** +```bash +parxy pdf:split document.pdf +``` + +This creates a `document_split/` folder containing `document_page_1.pdf`, `document_page_2.pdf`, etc. + +**Specify output directory and prefix:** +```bash +parxy pdf:split report.pdf -o ./pages -p page +``` + +Creates `page_page_1.pdf`, `page_page_2.pdf`, etc. in the `./pages` directory (output files are named `{prefix}_page_{n}.pdf`). + +For more detailed examples and use cases, see the [PDF Manipulation How-to Guide](../howto/pdf_manipulation.md). + + ## Managing Drivers To view the list of supported document parsing drivers: @@ -271,6 +328,8 @@ With the CLI, you can use Parxy as a **standalone document parsing tool** — id | `parxy parse` | Extract text from documents with multiple formats & drivers | | `parxy preview` | Interactive document viewer with metadata and TOC | | `parxy markdown` | Generate Markdown output | +| `parxy pdf:merge`| Merge multiple PDF files with page range support | +| `parxy pdf:split`| Split PDF files into individual pages | | `parxy drivers` | List supported drivers | | `parxy env` | Create default configuration file | | `parxy docker` | Generate Docker Compose setup |