Commit

rename pie to fpie
shawnchan2014 committed Apr 13, 2022
1 parent b4b526a commit f7b961d
Showing 39 changed files with 147 additions and 146 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -29,5 +29,5 @@ jobs:
run: |
cd tests
python3 data.py benchmark
pie --check-backend
pie -s circle6.png -t circle6.png -m circle6.png -o result.png -n 5000
fpie --check-backend
fpie -s circle6.png -t circle6.png -m circle6.png -o result.png -n 5000
12 changes: 6 additions & 6 deletions CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.4)

project(pie_core)
project(fpie_core)

if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
@@ -23,21 +23,21 @@ endif()

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/pybind11)

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/pybind11/include pie/core)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/pybind11/include fpie/core)

add_subdirectory(pie/core/gcc)
add_subdirectory(fpie/core/gcc)

find_package(OpenMP)
if(OpenMP_FOUND)
add_subdirectory(pie/core/openmp)
add_subdirectory(fpie/core/openmp)
endif()

find_package(MPI)
if(MPI_FOUND)
add_subdirectory(pie/core/mpi)
add_subdirectory(fpie/core/mpi)
endif()

find_package(CUDA)
if(CUDA_FOUND)
add_subdirectory(pie/core/cuda)
add_subdirectory(fpie/core/cuda)
endif()
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,3 +1,3 @@
include CMakeLists.txt
include cmake_modules/*.cmake
recursive-include pie CMakeLists.txt *.h *.cc *.cu
recursive-include fpie CMakeLists.txt *.h *.cc *.cu
8 changes: 4 additions & 4 deletions Makefile
@@ -1,8 +1,8 @@
SHELL = /bin/bash
PROJECT_NAME = pie
PYTHON_FILES = $(shell find setup.py pie tests -type f -name "*.py")
CPP_FILES = $(shell find pie -type f -name "*.h" -o -name "*.cc" -o -name "*.cu")
CMAKE_FILES = $(shell find pie -type f -name "CMakeLists.txt") $(shell find cmake_modules -type f) CMakeLists.txt
PROJECT_NAME = fpie
PYTHON_FILES = $(shell find setup.py fpie tests -type f -name "*.py")
CPP_FILES = $(shell find fpie -type f -name "*.h" -o -name "*.cc" -o -name "*.cu")
CMAKE_FILES = $(shell find fpie -type f -name "CMakeLists.txt") $(shell find cmake_modules -type f) CMakeLists.txt
COMMIT_HASH = $(shell git log -1 --format=%h)

# installation
27 changes: 15 additions & 12 deletions README.md
@@ -1,7 +1,7 @@
# Poisson Image Editing - A Parallel Implementation

[![PyPI](https://img.shields.io/pypi/v/pie)](https://pypi.org/project/pie/)
[![Unittest](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/workflows/Unittest/badge.svg?branch=master)](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/actions)
[![PyPI](https://img.shields.io/pypi/v/fpie)](https://pypi.org/project/fpie/)
[![Unittest](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/workflows/Test/badge.svg?branch=main)](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/actions)

> Jiayi Weng (jiayiwen), Zixu Chen (zixuc)
@@ -17,6 +17,9 @@ This project aims to provide a fast poisson image editing algorithm (based on [J
# install cmake >= 3.4
# if you don't have sudo (like GHC), install cmake from source
# on macOS, type `brew install cmake`
$ pip install fpie

# or install from source
$ pip install .
```

@@ -34,7 +37,7 @@ $ pip install .
After installation, you can use the `--check-backend` option to verify:

```bash
$ pie --check-backend
$ fpie --check-backend
['numpy', 'taichi-cpu', 'taichi-gpu', 'taichi-cuda', 'gcc', 'openmp', 'mpi', 'cuda']
```

@@ -51,14 +54,14 @@ $ cd tests && ./data.py
This script will download 8 tests from GitHub and create 10 images for benchmarking (5 circles, 5 squares). To run:

```bash
$ pie -s test1_src.jpg -m test1_mask.jpg -t test1_tgt.jpg -o result1.jpg -h1 -150 -w1 -50 -n 5000 -g max
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result2.jpg -h1 130 -w1 130 -n 5000 -g src
$ pie -s test3_src.jpg -m test3_mask.jpg -t test3_tgt.jpg -o result3.jpg -h1 100 -w1 100 -n 5000 -g max
$ pie -s test4_src.jpg -m test4_mask.jpg -t test4_tgt.jpg -o result4.jpg -h1 100 -w1 100 -n 5000 -g max
$ pie -s test5_src.jpg -m test5_mask.png -t test5_tgt.jpg -o result5.jpg -h0 -70 -w0 0 -h1 50 -w1 0 -n 5000 -g max
$ pie -s test6_src.png -m test6_mask.png -t test6_tgt.png -o result6.jpg -h1 50 -w1 0 -n 5000 -g max
$ pie -s test7_src.jpg -t test7_tgt.jpg -o result7.jpg -h1 50 -w1 30 -n 5000 -g max
$ pie -s test8_src.jpg -t test8_tgt.jpg -o result8.jpg -h1 90 -w1 90 -n 10000 -g max
$ fpie -s test1_src.jpg -m test1_mask.jpg -t test1_tgt.jpg -o result1.jpg -h1 -150 -w1 -50 -n 5000 -g max
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result2.jpg -h1 130 -w1 130 -n 5000 -g src
$ fpie -s test3_src.jpg -m test3_mask.jpg -t test3_tgt.jpg -o result3.jpg -h1 100 -w1 100 -n 5000 -g max
$ fpie -s test4_src.jpg -m test4_mask.jpg -t test4_tgt.jpg -o result4.jpg -h1 100 -w1 100 -n 5000 -g max
$ fpie -s test5_src.jpg -m test5_mask.png -t test5_tgt.jpg -o result5.jpg -h0 -70 -w0 0 -h1 50 -w1 0 -n 5000 -g max
$ fpie -s test6_src.png -m test6_mask.png -t test6_tgt.png -o result6.jpg -h1 50 -w1 0 -n 5000 -g max
$ fpie -s test7_src.jpg -t test7_tgt.jpg -o result7.jpg -h1 50 -w1 30 -n 5000 -g max
$ fpie -s test8_src.jpg -t test8_tgt.jpg -o result8.jpg -h1 90 -w1 90 -n 10000 -g max
```

Here are the results:
@@ -80,7 +83,7 @@ We have provided 6 backends. Each backend has two solvers: EquSolver and GridSol

For the usage of each backend, please check out the related documentation under [docs/backend](/docs/backend).

For other usage, please run `pie -h` to see the help message.
For other usage, please run `fpie -h` to see the help message.

## Benchmark Result

44 changes: 22 additions & 22 deletions docs/backend/README.md
@@ -2,7 +2,7 @@

To specify a backend, simply type `-b cuda` or `--backend openmp`, together with the other parameters described below.

Feel free to play with `pie` using other arguments!
Feel free to play with `fpie` using other arguments!

## GridSolver

@@ -32,7 +32,7 @@ This backend uses NumPy vectorized operation for parallel computation.
There's no extra parameter for NumPy EquSolver:

```bash
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b numpy --method equ
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b numpy --method equ
Successfully initialize PIE equ solver with numpy backend
# of vars: 12559
Iter 5000, abs error [450.09415 445.24747 636.1397 ]
@@ -43,7 +43,7 @@ Successfully write image to result.jpg
There's no extra parameter for NumPy GridSolver:

```bash
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b numpy --method grid
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b numpy --method grid
Successfully initialize PIE grid solver with numpy backend
# of vars: 17227
Iter 5000, abs error [450.07922 445.27014 636.1374 ]
@@ -60,7 +60,7 @@ This backend uses a single thread C++ program to perform computation.
There's no extra parameter for GCC EquSolver:

```bash
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b gcc --method equ
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b gcc --method equ
Successfully initialize PIE equ solver with gcc backend
# of vars: 12559
Iter 5000, abs error [ 5.179281 6.6939087 11.006622 ]
@@ -71,7 +71,7 @@ Successfully write image to result.jpg
For GCC GridSolver, you need to specify `--grid-x` and `--grid-y` as described in the first section:

```bash
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b gcc --method grid --grid-x 8 --grid-y 8
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b gcc --method grid --grid-x 8 --grid-y 8
Successfully initialize PIE grid solver with gcc backend
# of vars: 17227
Iter 5000, abs error [ 5.1776047 6.69458 11.001862 ]
@@ -92,7 +92,7 @@ There's no other parameters for Taichi EquSolver:

```bash
# taichi-cpu
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b taichi-cpu --method equ -c 6
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b taichi-cpu --method equ -c 6
[Taichi] version 0.9.2, llvm 10.0.0, commit 7a4d73cd, linux, python 3.8.10
[Taichi] Starting on arch=x64
Successfully initialize PIE equ solver with taichi-cpu backend
@@ -104,7 +104,7 @@ Successfully write image to result.jpg

```bash
# taichi-gpu
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b taichi-gpu --method equ -z 1024
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b taichi-gpu --method equ -z 1024
[Taichi] version 0.9.2, llvm 10.0.0, commit 7a4d73cd, linux, python 3.8.10
[Taichi] Starting on arch=cuda
Successfully initialize PIE equ solver with taichi-gpu backend
@@ -118,7 +118,7 @@ For Taichi GridSolver, you also need to specify `--grid-x` and `--grid-y` descri

```bash
# taichi-cpu
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b taichi-cpu --method grid --grid-x 16 --grid-y 16 -c 12
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b taichi-cpu --method grid --grid-x 16 --grid-y 16 -c 12
[Taichi] version 0.9.2, llvm 10.0.0, commit 7a4d73cd, linux, python 3.8.10
[Taichi] Starting on arch=x64
Successfully initialize PIE grid solver with taichi-cpu backend
@@ -130,7 +130,7 @@ Successfully write image to result.jpg

```bash
# taichi-gpu
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b taichi-gpu --method grid --grid-x 8 --grid-y 8 -z 64
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b taichi-gpu --method grid --grid-x 8 --grid-y 8 -z 64
[Taichi] version 0.9.2, llvm 10.0.0, commit 7a4d73cd, linux, python 3.8.10
[Taichi] Starting on arch=cuda
Successfully initialize PIE grid solver with taichi-gpu backend
@@ -147,7 +147,7 @@ OpenMP backend needs to specify the number of CPU cores it can use, with `-c` or
There are no other parameters for OpenMP EquSolver:

```bash
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b openmp --method equ -c 6
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b openmp --method equ -c 6
Successfully initialize PIE equ solver with openmp backend
# of vars: 12559
Iter 5000, abs error [ 5.2758713 6.768402 11.11969 ]
@@ -158,7 +158,7 @@ Successfully write image to result.jpg
For OpenMP GridSolver, you also need to specify `--grid-x` and `--grid-y` as described in the first section:

```bash
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b openmp --method grid --grid-x 8 --grid-y 8 -c 6
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b openmp --method grid --grid-x 8 --grid-y 8 -c 6
Successfully initialize PIE grid solver with openmp backend
# of vars: 17227
Iter 5000, abs error [ 5.187172 6.701462 11.020264]
@@ -168,22 +168,22 @@ Successfully write image to result.jpg

### Parallelization Strategy

For [EquSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/pie/core/openmp/equ.cc), it first groups the pixels into two folds by `(i+j)%2`, then parallelizes the per-pixel iteration inside each group at every step. This strategy can exploit thread-local access.
For [EquSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/fpie/core/openmp/equ.cc), it first groups the pixels into two folds by `(i+j)%2`, then parallelizes the per-pixel iteration inside each group at every step. This strategy can exploit thread-local access.

For [GridSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/pie/core/openmp/grid.cc), it parallelizes the per-grid iteration in each step, where the grid size is `(grid_x, grid_y)`. It simply iterates over all pixels in each grid.
For [GridSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/fpie/core/openmp/grid.cc), it parallelizes the per-grid iteration in each step, where the grid size is `(grid_x, grid_y)`. It simply iterates over all pixels in each grid.
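
To make the two-fold trick concrete, here is a minimal single-channel C++ sketch of one iteration step; the names (`A`, `B`, `X`, `group_offset`) are illustrative placeholders for this sketch, not the actual fields used in `equ.cc`.

```cpp
// Minimal sketch of the (i+j)%2 two-fold update (single channel).
// Assumption: pixels are relabelled so that ids in
// [group_offset[g], group_offset[g+1]) belong to fold g, A[i] stores the
// 4 neighbour ids of pixel i, and B[i] is its gradient/boundary term.
#include <omp.h>

void iteration_sketch(const int (*A)[4], const float *B, float *X,
                      const int *group_offset) {
  for (int g = 0; g < 2; ++g) {
    // Pixels inside one fold are never neighbours of each other,
    // so updating them in parallel is race-free.
#pragma omp parallel for schedule(static)
    for (int i = group_offset[g]; i < group_offset[g + 1]; ++i) {
      X[i] = (B[i] + X[A[i][0]] + X[A[i][1]] + X[A[i][2]] + X[A[i][3]]) / 4.0f;
    }
  }
}
```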

## MPI

To run with the MPI backend, you need to install both mpicc and mpi4py (`pip install mpi4py`).

Unlike the other backends, you need to use `mpiexec` or `mpirun` to launch the MPI service instead of calling the `pie` program directly. The `-np` option indicates the number of processes to launch.
Unlike the other backends, you need to use `mpiexec` or `mpirun` to launch the MPI service instead of calling the `fpie` program directly. The `-np` option indicates the number of processes to launch.

Apart from that, you need to specify the synchronization interval for the MPI backend with `--mpi-sync-interval`. If this number is too small, it causes a large synchronization overhead; if it is too large, the quality of the solution drops dramatically.

MPI EquSolver and GridSolver don't have any other arguments because of the parallelization strategy we used; see the next section.

```bash
$ mpiexec -np 6 pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b mpi --method equ --mpi-sync-interval 100
$ mpiexec -np 6 fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b mpi --method equ --mpi-sync-interval 100
Successfully initialize PIE equ solver with mpi backend
# of vars: 12559
Iter 5000, abs error [264.6767 269.55304 368.4869 ]
@@ -192,7 +192,7 @@ Successfully write image to result.jpg
```

```bash
$ mpiexec -np 6 pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b mpi --method grid --mpi-sync-interval 100
$ mpiexec -np 6 fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b mpi --method grid --mpi-sync-interval 100
Successfully initialize PIE grid solver with mpi backend
# of vars: 17227
Iter 5000, abs error [204.41124 215.00548 296.4441 ]
@@ -204,9 +204,9 @@ Successfully write image to result.jpg

MPI cannot use the shared-memory programming model, so we need to reduce the amount of data exchanged between processes. Each process is responsible for only a part of the computation and synchronizes with the other processes every `mpi_sync_interval` steps.

For [EquSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/pie/core/mpi/equ.cc), it's hard to say which part of the data should be exchanged with which process, since it relabels all pixels at the very beginning. We use `MPI_Bcast` to force-sync all data.
For [EquSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/fpie/core/mpi/equ.cc), it's hard to say which part of the data should be exchanged with which process, since it relabels all pixels at the very beginning. We use `MPI_Bcast` to force-sync all data.

For [GridSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/pie/core/mpi/grid.cc), we use line partition: process `i` exchanges its first and last line of data with processes `i-1` and `i+1` respectively. This strategy exchanges a contiguous memory layout and thus has less overhead compared with block partition.
For [GridSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/fpie/core/mpi/grid.cc), we use line partition: process `i` exchanges its first and last line of data with processes `i-1` and `i+1` respectively. This strategy exchanges a contiguous memory layout and thus has less overhead compared with block partition.

However, even if we don't use synchronization in MPI (by setting `mpi_sync_interval` greater than the number of iterations), it is still slower than the OpenMP and CUDA backends.
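
As an illustration of the line partition, a minimal C++/MPI sketch of one halo exchange is shown below; the buffer layout (one halo row above and below the `rows` owned rows, each row holding `len` floats) is an assumption made for this sketch, not the exact layout of `grid.cc`.

```cpp
// Sketch of exchanging boundary rows with the two neighbouring ranks.
#include <mpi.h>

void sync_halo_sketch(float *buf, int rows, int len, int rank, int size) {
  MPI_Status status;
  float *first_row = buf + len;                // first row owned by this rank
  float *last_row = buf + rows * len;          // last row owned by this rank
  float *halo_above = buf;                     // copy of rank-1's last row
  float *halo_below = buf + (rows + 1) * len;  // copy of rank+1's first row
  if (rank > 0) {
    MPI_Sendrecv(first_row, len, MPI_FLOAT, rank - 1, 0,
                 halo_above, len, MPI_FLOAT, rank - 1, 0,
                 MPI_COMM_WORLD, &status);
  }
  if (rank + 1 < size) {
    MPI_Sendrecv(last_row, len, MPI_FLOAT, rank + 1, 0,
                 halo_below, len, MPI_FLOAT, rank + 1, 0,
                 MPI_COMM_WORLD, &status);
  }
}
```

In the real solver this exchange would only happen every `mpi_sync_interval` iterations, which is exactly the trade-off controlled by `--mpi-sync-interval`.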

Expand All @@ -217,7 +217,7 @@ CUDA backend needs to specify the number of threads in one block it will use, wi
There are no other parameters for CUDA EquSolver:

```bash
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b cuda --method equ -z 256
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b cuda --method equ -z 256
---------------------------------------------------------
Found 1 CUDA devices
Device 0: NVIDIA GeForce GTX 1060
@@ -235,7 +235,7 @@ Successfully write image to result.jpg
For CUDA GridSolver, you also need to specify `--grid-x` and `--grid-y` as described in the first section:

```bash
$ pie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b cuda --method grid --grid-x 4 --grid-y 128 -z 1024
$ fpie -s test2_src.png -m test2_mask.png -t test2_tgt.png -o result.jpg -h1 130 -w1 130 -n 5000 -g src -b cuda --method grid --grid-x 4 --grid-y 128 -z 1024
---------------------------------------------------------
Found 1 CUDA devices
Device 0: NVIDIA GeForce GTX 1060
@@ -254,6 +254,6 @@ Successfully write image to result.jpg

The strategy used in the CUDA backend is quite similar to OpenMP's.

For [EquSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/pie/core/cuda/equ.cu), it performs equation-level parallelization.
For [EquSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/fpie/core/cuda/equ.cu), it performs equation-level parallelization.

For [GridSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/pie/core/cuda/grid.cu), each grid of size `(grid_x, grid_y)` is assigned to a single block. Each thread in a block performs the iteration for only a single pixel.
For [GridSolver](https://github.com/Trinkle23897/Fast-Poisson-Image-Editing/blob/main/fpie/core/cuda/grid.cu), each grid of size `(grid_x, grid_y)` is assigned to a single block. Each thread in a block performs the iteration for only a single pixel.
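
To illustrate this mapping in plain C++ (the real kernel lives in `fpie/core/cuda/grid.cu` and may index differently), one possible block/thread-to-pixel assignment looks like this:

```cpp
// Illustrative index arithmetic only: block b covers one (grid_x, grid_y)
// tile of the image, and thread t inside that block handles exactly one pixel.
struct Pixel {
  int y, x;
};

Pixel pixel_for(int block_id, int thread_id, int grid_x, int grid_y, int width) {
  int tiles_per_row = (width + grid_y - 1) / grid_y;  // tiles along the x axis
  int tile_y = block_id / tiles_per_row;              // which tile row
  int tile_x = block_id % tiles_per_row;              // which tile column
  int dy = thread_id / grid_y;                        // offset inside the tile
  int dx = thread_id % grid_y;
  return Pixel{tile_y * grid_x + dy, tile_x * grid_y + dx};
}
```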