diff --git a/avisynth/README.txt b/avisynth/README.txt index ed4739d..0c2db91 100644 --- a/avisynth/README.txt +++ b/avisynth/README.txt @@ -66,7 +66,7 @@ Syntax: 1 - forth SSE4.1 + SSE2 + SSE routine. - others(64bit only) - use AVX2 + FMA3 + AVX routine. + others - use AVX2 + FMA3 + AVX routine. ------------------------------------------------------------------------ @@ -83,7 +83,7 @@ Syntax: chroma - same as TCannyMod. (default = 0) - opt - same as TCannyMod. (default = automated) + opt - same as TCannyMod. (default = auto) ------------------------------------------------------------------------ @@ -104,7 +104,7 @@ Syntax: sobel - same as TCannyMod. (default = false) - opt - same as TCannyMod. (default = automated) + opt - same as TCannyMod. (default = auto) ------------------------------------------------------------------------ @@ -136,14 +136,18 @@ Changelog: 1.0.0 (20160326): - Almost rewrite. - VS2013 to VS2015. - - Add AVX2(64bit only) / SSE4.1(both 32bit and 64bit) support. + - Add AVX2(64bit only)/SSE4.1(both 32bit and 64bit) support. - Change direction values from 1,3,7,15 to 31,63,127,255. - Reduce waste processes. 1.1.0 (20160328): - Add EMask(). - Implement simd non-maximum-suppression. - - a bit optimized gaussian-blur/ hysteresis. + - Slightly optimize gaussian-blur/hysteresis. + + 1.1.1 (20160330): + - Add AVX2 support for 32bit.
+ Source code: diff --git a/avisynth/src/edge_detection.h b/avisynth/src/edge_detection.h index 1edc0ae..c602d43 100644 --- a/avisynth/src/edge_detection.h +++ b/avisynth/src/edge_detection.h @@ -27,50 +27,32 @@ #define EDGE_DETECTION_H #include +#include #include "simd.h" -template -SFINLINE void calc_direction(const Vf& gx, const Vf& gy, int32_t* dirp) -{ - - static const Vf t0225 = set1_ps(std::sqrt(2.0f) - 1.0f); // tan(pi/8) - static const Vf t0675 = set1_ps(std::sqrt(2.0f) + 1.0f); // tan(3*pi/8) - static const Vf t1125 = sub(zero(), t0675); // tan(5*pi/8) = -tan(3*pi/8) - static const Vf t1575 = sub(zero(), t0225); // tan(7*pi/8) = -tan(pi/8) - - Vf z = zero(); - Vf vertical = set1_ps(90.0f); - - // if gy < 0, gx = -gx - Vf mask = cmplt_ps(gy, z); - Vf gx2 = blendv(gx, sub(z, gx), mask); - - // tan = gy / gx - Vf tan = mul(rcp_hq(gx2), abs(gy)); - - // if tan is unorderd(inf or NaN), tan = 90.0f - mask = cmpord_ps(tan, tan); - tan = blendv(vertical, tan, mask); - - // if t1575 <= tan < t0225, direction is 31 (horizontal) - Vi d0 = castps_si(and_ps(cmpge_ps(tan, t1575), cmplt_ps(tan, t0225))); - d0 = srli_i32(d0, 27); - // if t0225 <= tan < t0675, direction is 63 (45' up) - Vi d1 = castps_si(and_ps(cmpge_ps(tan, t0225), cmplt_ps(tan, t0675))); - d1 = srli_i32(d1, 26); - - // if t0675 <= tan or tan < t1125, direction is 127 (vertical) - Vi d2 = castps_si(or_ps(cmpge_ps(tan, t0675), cmplt_ps(tan, t1125))); - d2 = srli_i32(d2, 25); - - // if t1125 <= tan < t1575, direction is 255 (45' down) - Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575))); - d3 = srli_i32(d3, 24); - - d0 = or_si(or_si(d0, d1), or_si(d2, d3)); - - stream(dirp, d0); +const float* get_tangent(int idx) +{ + alignas(32) static const float tangent[32] = { + 0.414213538169860839843750f, 0.414213538169860839843750f, // tan(pi/8) + 0.414213538169860839843750f, 0.414213538169860839843750f, + 0.414213538169860839843750f, 0.414213538169860839843750f, + 0.414213538169860839843750f, 
0.414213538169860839843750f, + 2.414213657379150390625000f, 2.414213657379150390625000f, // tan(3*pi/8) + 2.414213657379150390625000f, 2.414213657379150390625000f, + 2.414213657379150390625000f, 2.414213657379150390625000f, + 2.414213657379150390625000f, 2.414213657379150390625000f, + -2.414213657379150390625000f, -2.414213657379150390625000f, // tan(5*pi/8) + -2.414213657379150390625000f, -2.414213657379150390625000f, + -2.414213657379150390625000f, -2.414213657379150390625000f, + -2.414213657379150390625000f, -2.414213657379150390625000f, + -0.414213538169860839843750f, -0.414213538169860839843750f, // tan(7*pi/8) + -0.414213538169860839843750f, -0.414213538169860839843750f, + -0.414213538169860839843750f, -0.414213538169860839843750f, + -0.414213538169860839843750f, -0.414213538169860839843750f, + }; + + return tangent + 8 * idx; } @@ -80,12 +62,18 @@ standard(float* blurp, const size_t blur_pitch, float* emaskp, const size_t emask_pitch, int32_t* dirp, const size_t dir_pitch, const size_t width, const size_t height) { + constexpr size_t step = sizeof(Vf) / sizeof(float); float* p0 = blurp; float* p1 = blurp; float* p2 = blurp + blur_pitch; + const float* tan0225 = get_tangent(0); + const float* tan0675 = get_tangent(1); + const float* tan1125 = get_tangent(2); + const float* tan1575 = get_tangent(3); + for (size_t y = 0; y < height; y++) { p1[-1] = p1[0]; p1[width] = p1[width - 1]; @@ -95,7 +83,34 @@ standard(float* blurp, const size_t blur_pitch, float* emaskp, Vf gx = sub(loadu(p1 + x + 1), loadu(p1 + x - 1)); // [-1, 0, 1] if (CALC_DIR) { - calc_direction(gx, gy, dirp + x); + const Vf z = zero(); + const Vf vertical = set1_ps(90.0f); + // if gy < 0, gx = -gx + Vf mask = cmplt_ps(gy, z); + Vf gx2 = blendv(gx, sub(z, gx), mask); + // tan = gy / gx + Vf tan = mul(rcp_hq(gx2), abs(gy)); + // if tan is unordered (inf or NaN), tan = 90.0f + mask = cmpord_ps(tan, tan); + tan = blendv(vertical, tan, mask); + const Vf t0225 = load(tan0225); + const Vf t0675 =
load(tan0675); + const Vf t1125 = load(tan1125); + const Vf t1575 = load(tan1575); + // if t1575 <= tan < t0225, direction is 31 (horizontal) + Vi d0 = castps_si(and_ps(cmpge_ps(tan, t1575), cmplt_ps(tan, t0225))); + d0 = srli_i32(d0, 27); + // if t0225 <= tan < t0675, direction is 63 (45' up) + Vi d1 = castps_si(and_ps(cmpge_ps(tan, t0225), cmplt_ps(tan, t0675))); + d1 = srli_i32(d1, 26); + // if t0675 <= tan or tan < t1125, direction is 127 (vertical) + Vi d2 = castps_si(or_ps(cmpge_ps(tan, t0675), cmplt_ps(tan, t1125))); + d2 = srli_i32(d2, 25); + // if t1125 <= tan < t1575, direction is 255 (45' down) + Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575))); + d3 = srli_i32(d3, 24); + d0 = or_si(or_si(d0, d1), or_si(d2, d3)); + stream(dirp + x, d0); } Vf magnitude = mul(gx, gx); @@ -134,6 +149,11 @@ sobel(float* blurp, const size_t blur_pitch, float* emaskp, p1[-1] = p1[0]; p1[width] = p1[width - 1]; + const float* tan0225 = get_tangent(0); + const float* tan0675 = get_tangent(1); + const float* tan1125 = get_tangent(2); + const float* tan1575 = get_tangent(3); + for (size_t y = 0; y < height; y++) { p2[-1] = p2[0]; p2[width] = p2[width - 1]; @@ -157,7 +177,27 @@ sobel(float* blurp, const size_t blur_pitch, float* emaskp, gy = sub(gy, add(t, t)); if (CALC_DIR) { - calc_direction(gx, gy, dirp + x); + const Vf z = zero(); + const Vf vertical = set1_ps(90.0f); + Vf mask = cmplt_ps(gy, z); + Vf gx2 = blendv(gx, sub(z, gx), mask); + Vf tan = mul(rcp_hq(gx2), abs(gy)); + mask = cmpord_ps(tan, tan); + tan = blendv(vertical, tan, mask); + const Vf t0225 = load(tan0225); + const Vf t0675 = load(tan0675); + const Vf t1125 = load(tan1125); + const Vf t1575 = load(tan1575); + Vi d0 = castps_si(and_ps(cmpge_ps(tan, t1575), cmplt_ps(tan, t0225))); + d0 = srli_i32(d0, 27); + Vi d1 = castps_si(and_ps(cmpge_ps(tan, t0225), cmplt_ps(tan, t0675))); + d1 = srli_i32(d1, 26); + Vi d2 = castps_si(or_ps(cmpge_ps(tan, t0675), cmplt_ps(tan, t1125))); + d2 = srli_i32(d2, 
25); + Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575))); + d3 = srli_i32(d3, 24); + d0 = or_si(or_si(d0, d1), or_si(d2, d3)); + stream(dirp + x, d0); } Vf magnitude = mul(gx, gx); diff --git a/avisynth/src/gaussian_blur.h b/avisynth/src/gaussian_blur.h index 1929688..a0739a4 100644 --- a/avisynth/src/gaussian_blur.h +++ b/avisynth/src/gaussian_blur.h @@ -50,7 +50,7 @@ convert_to_float(const size_t width, const size_t height, const uint8_t* srcp, template static void horizontal_blur(const float* hkernel, float* buffp, const int radius, - const size_t width, const float* kernel, float* blurp) + const size_t width, float* blurp) { constexpr size_t step = sizeof(Vf) / sizeof(float); const int length = radius * 2 + 1; @@ -108,8 +108,8 @@ gaussian_blur(const int radius, const float* kernel, const float* hkernel, } store(buffp + x, sum); } - horizontal_blur(hkernel, buffp, radius, width, kernel, - blurp + blur_pitch * y); + horizontal_blur(hkernel, buffp, radius, width, blurp); + blurp += blur_pitch; for (int l = 0; l < length - 1; ++l) { p[l] = p[l + 1]; diff --git a/avisynth/src/tcannymod.cpp b/avisynth/src/tcannymod.cpp index 15d8cb0..5b692c0 100644 --- a/avisynth/src/tcannymod.cpp +++ b/avisynth/src/tcannymod.cpp @@ -126,19 +126,13 @@ set_gb_kernel(float sigma, int& radius, float* kernel) static arch_t get_arch(int opt) { - // on 32bit with /arch:AVX outputs weird result. - // I don't know why... if (opt == 0 || !has_sse41()) { return HAS_SSE2; } -#if !defined(_WIN64) - return HAS_SSE41; -#else if (opt == 1 || !has_avx2()) { return HAS_SSE41; } return HAS_AVX2; -#endif } diff --git a/avisynth/src/tcannymod.h b/avisynth/src/tcannymod.h index 9d1b29c..ca46fcd 100644 --- a/avisynth/src/tcannymod.h +++ b/avisynth/src/tcannymod.h @@ -38,7 +38,7 @@ #include "write_frame.h" -#define TCANNY_M_VERSION "1.1.0" +#define TCANNY_M_VERSION "1.1.1" constexpr size_t GB_MAX_LENGTH = 17;