Skip to content

Commit

Permalink
Add AVX2 support for 32bit.
Browse files Browse the repository at this point in the history
  • Loading branch information
chikuzen committed Mar 30, 2016
1 parent b8c5aee commit f5e36c3
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 58 deletions.
14 changes: 9 additions & 5 deletions avisynth/README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ Syntax:

1 - forth SSE4.1 + SSE2 + SSE routine.

others(64bit only) - use AVX2 + FMA3 + AVX routine.
others - use AVX2 + FMA3 + AVX routine.

------------------------------------------------------------------------

Expand All @@ -83,7 +83,7 @@ Syntax:

chroma - same as TCannyMod. (default = 0)

opt - same as TCannyMod. (default = automated)
opt - same as TCannyMod. (default = auto)

------------------------------------------------------------------------

Expand All @@ -104,7 +104,7 @@ Syntax:

sobel - same as TCannyMod. (default = false)

opt - same as TCannyMod. (default = automated)
opt - same as TCannyMod. (default = auto)

------------------------------------------------------------------------

Expand Down Expand Up @@ -136,14 +136,18 @@ Changelog:
1.0.0 (20160326):
- Almost rewrite.
- VS2013 to VS2015.
- Add AVX2(64bit only) / SSE4.1(both 32bit and 64bit) support.
- Add AVX2(64bit only)/SSE4.1(both 32bit and 64bit) support.
- Change direction values from 1,3,7,15 to 31,63,127,255.
- Reduce waste processes.

1.1.0 (20160328):
- Add EMask().
- Implement simd non-maximum-suppression.
- a bit optimized gaussian-blur/ hysteresis.
- a bit optimized gaussian-blur/hysteresis.

1.1.1 (20160330):
- Add AVX2 support for 32bit.


Source code:

Expand Down
126 changes: 83 additions & 43 deletions avisynth/src/edge_detection.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,50 +27,32 @@
#define EDGE_DETECTION_H

#include <cstdint>
#include <algorithm>
#include "simd.h"

template <typename Vf, typename Vi>
SFINLINE void calc_direction(const Vf& gx, const Vf& gy, int32_t* dirp)
{

static const Vf t0225 = set1_ps<Vf>(std::sqrt(2.0f) - 1.0f); // tan(pi/8)
static const Vf t0675 = set1_ps<Vf>(std::sqrt(2.0f) + 1.0f); // tan(3*pi/8)
static const Vf t1125 = sub(zero<Vf>(), t0675); // tan(5*pi/8) = -tan(3*pi/8)
static const Vf t1575 = sub(zero<Vf>(), t0225); // tan(7*pi/8) = -tan(pi/8)

Vf z = zero<Vf>();
Vf vertical = set1_ps<Vf>(90.0f);

// if gy < 0, gx = -gx
Vf mask = cmplt_ps(gy, z);
Vf gx2 = blendv(gx, sub(z, gx), mask);

// tan = gy / gx
Vf tan = mul(rcp_hq(gx2), abs(gy));

// if tan is unorderd(inf or NaN), tan = 90.0f
mask = cmpord_ps(tan, tan);
tan = blendv(vertical, tan, mask);

// if t1575 <= tan < t0225, direction is 31 (horizontal)
Vi d0 = castps_si(and_ps(cmpge_ps(tan, t1575), cmplt_ps(tan, t0225)));
d0 = srli_i32(d0, 27);

// if t0225 <= tan < t0675, direction is 63 (45' up)
Vi d1 = castps_si(and_ps(cmpge_ps(tan, t0225), cmplt_ps(tan, t0675)));
d1 = srli_i32(d1, 26);

// if t0675 <= tan or tan < t1125, direction is 127 (vertical)
Vi d2 = castps_si(or_ps(cmpge_ps(tan, t0675), cmplt_ps(tan, t1125)));
d2 = srli_i32(d2, 25);

// if t1125 <= tan < t1575, direction is 255 (45' down)
Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
d3 = srli_i32(d3, 24);

d0 = or_si(or_si(d0, d1), or_si(d2, d3));

stream<Vi>(dirp, d0);
const float* get_tangent(int idx)
{
alignas(32) static const float tangent[32] = {
0.414213538169860839843750f, 0.414213538169860839843750f, // tan(pi/8)
0.414213538169860839843750f, 0.414213538169860839843750f,
0.414213538169860839843750f, 0.414213538169860839843750f,
0.414213538169860839843750f, 0.414213538169860839843750f,
2.414213657379150390625000f, 2.414213657379150390625000f, // tan(3*pi/8)
2.414213657379150390625000f, 2.414213657379150390625000f,
2.414213657379150390625000f, 2.414213657379150390625000f,
2.414213657379150390625000f, 2.414213657379150390625000f,
-2.414213657379150390625000f, -2.414213657379150390625000f, // tan(5*pi/8)
-2.414213657379150390625000f, -2.414213657379150390625000f,
-2.414213657379150390625000f, -2.414213657379150390625000f,
-2.414213657379150390625000f, -2.414213657379150390625000f,
-0.414213538169860839843750f, -0.414213538169860839843750f, // tan(7*pi/8)
-0.414213538169860839843750f, -0.414213538169860839843750f,
-0.414213538169860839843750f, -0.414213538169860839843750f,
-0.414213538169860839843750f, -0.414213538169860839843750f,
};

return tangent + 8 * idx;
}


Expand All @@ -80,12 +62,18 @@ standard(float* blurp, const size_t blur_pitch, float* emaskp,
const size_t emask_pitch, int32_t* dirp, const size_t dir_pitch,
const size_t width, const size_t height)
{

constexpr size_t step = sizeof(Vf) / sizeof(float);

float* p0 = blurp;
float* p1 = blurp;
float* p2 = blurp + blur_pitch;

const float* tan0225 = get_tangent(0);
const float* tan0675 = get_tangent(1);
const float* tan1125 = get_tangent(2);
const float* tan1575 = get_tangent(3);

for (size_t y = 0; y < height; y++) {
p1[-1] = p1[0];
p1[width] = p1[width - 1];
Expand All @@ -95,7 +83,34 @@ standard(float* blurp, const size_t blur_pitch, float* emaskp,
Vf gx = sub(loadu<Vf>(p1 + x + 1), loadu<Vf>(p1 + x - 1)); // [-1, 0, 1]

if (CALC_DIR) {
calc_direction<Vf, Vi>(gx, gy, dirp + x);
const Vf z = zero<Vf>();
const Vf vertical = set1_ps<Vf>(90.0f);
// if gy < 0, gx = -gx
Vf mask = cmplt_ps(gy, z);
Vf gx2 = blendv(gx, sub(z, gx), mask);
// tan = gy / gx
Vf tan = mul(rcp_hq(gx2), abs(gy));
// if tan is unorderd(inf or NaN), tan = 90.0f
mask = cmpord_ps(tan, tan);
tan = blendv(vertical, tan, mask);
const Vf t0225 = load<Vf>(tan0225);
const Vf t0675 = load<Vf>(tan0675);
const Vf t1125 = load<Vf>(tan1125);
const Vf t1575 = load<Vf>(tan1575);
// if t1575 <= tan < t0225, direction is 31 (horizontal)
Vi d0 = castps_si(and_ps(cmpge_ps(tan, t1575), cmplt_ps(tan, t0225)));
d0 = srli_i32(d0, 27);
// if t0225 <= tan < t0675, direction is 63 (45' up)
Vi d1 = castps_si(and_ps(cmpge_ps(tan, t0225), cmplt_ps(tan, t0675)));
d1 = srli_i32(d1, 26);
// if t0675 <= tan or tan < t1125, direction is 127 (vertical)
Vi d2 = castps_si(or_ps(cmpge_ps(tan, t0675), cmplt_ps(tan, t1125)));
d2 = srli_i32(d2, 25);
// if t1125 <= tan < t1575, direction is 255 (45' down)
Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
d3 = srli_i32(d3, 24);
d0 = or_si(or_si(d0, d1), or_si(d2, d3));
stream<Vi>(dirp + x, d0);
}

Vf magnitude = mul(gx, gx);
Expand Down Expand Up @@ -134,6 +149,11 @@ sobel(float* blurp, const size_t blur_pitch, float* emaskp,
p1[-1] = p1[0];
p1[width] = p1[width - 1];

const float* tan0225 = get_tangent(0);
const float* tan0675 = get_tangent(1);
const float* tan1125 = get_tangent(2);
const float* tan1575 = get_tangent(3);

for (size_t y = 0; y < height; y++) {
p2[-1] = p2[0];
p2[width] = p2[width - 1];
Expand All @@ -157,7 +177,27 @@ sobel(float* blurp, const size_t blur_pitch, float* emaskp,
gy = sub(gy, add(t, t));

if (CALC_DIR) {
calc_direction<Vf, Vi>(gx, gy, dirp + x);
const Vf z = zero<Vf>();
const Vf vertical = set1_ps<Vf>(90.0f);
Vf mask = cmplt_ps(gy, z);
Vf gx2 = blendv(gx, sub(z, gx), mask);
Vf tan = mul(rcp_hq(gx2), abs(gy));
mask = cmpord_ps(tan, tan);
tan = blendv(vertical, tan, mask);
const Vf t0225 = load<Vf>(tan0225);
const Vf t0675 = load<Vf>(tan0675);
const Vf t1125 = load<Vf>(tan1125);
const Vf t1575 = load<Vf>(tan1575);
Vi d0 = castps_si(and_ps(cmpge_ps(tan, t1575), cmplt_ps(tan, t0225)));
d0 = srli_i32(d0, 27);
Vi d1 = castps_si(and_ps(cmpge_ps(tan, t0225), cmplt_ps(tan, t0675)));
d1 = srli_i32(d1, 26);
Vi d2 = castps_si(or_ps(cmpge_ps(tan, t0675), cmplt_ps(tan, t1125)));
d2 = srli_i32(d2, 25);
Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
d3 = srli_i32(d3, 24);
d0 = or_si(or_si(d0, d1), or_si(d2, d3));
stream<Vi>(dirp + x, d0);
}

Vf magnitude = mul(gx, gx);
Expand Down
6 changes: 3 additions & 3 deletions avisynth/src/gaussian_blur.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ convert_to_float(const size_t width, const size_t height, const uint8_t* srcp,
template <typename Vf>
static void
horizontal_blur(const float* hkernel, float* buffp, const int radius,
const size_t width, const float* kernel, float* blurp)
const size_t width, float* blurp)
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
const int length = radius * 2 + 1;
Expand Down Expand Up @@ -108,8 +108,8 @@ gaussian_blur(const int radius, const float* kernel, const float* hkernel,
}
store<Vf>(buffp + x, sum);
}
horizontal_blur<Vf>(hkernel, buffp, radius, width, kernel,
blurp + blur_pitch * y);
horizontal_blur<Vf>(hkernel, buffp, radius, width, blurp);
blurp += blur_pitch;

for (int l = 0; l < length - 1; ++l) {
p[l] = p[l + 1];
Expand Down
6 changes: 0 additions & 6 deletions avisynth/src/tcannymod.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,19 +126,13 @@ set_gb_kernel(float sigma, int& radius, float* kernel)

static arch_t get_arch(int opt)
{
// on 32bit with /arch:AVX outputs weird result.
// I don't know why...
if (opt == 0 || !has_sse41()) {
return HAS_SSE2;
}
#if !defined(_WIN64)
return HAS_SSE41;
#else
if (opt == 1 || !has_avx2()) {
return HAS_SSE41;
}
return HAS_AVX2;
#endif
}


Expand Down
2 changes: 1 addition & 1 deletion avisynth/src/tcannymod.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
#include "write_frame.h"


#define TCANNY_M_VERSION "1.1.0"
#define TCANNY_M_VERSION "1.1.1"

constexpr size_t GB_MAX_LENGTH = 17;

Expand Down

0 comments on commit f5e36c3

Please sign in to comment.