diff --git a/avisynth/README.txt b/avisynth/README.txt
index ed4739d..0c2db91 100644
--- a/avisynth/README.txt
+++ b/avisynth/README.txt
@@ -66,7 +66,7 @@ Syntax:
 
              1 - forth SSE4.1 + SSE2 + SSE routine.
 
-             others(64bit only) - use AVX2 + FMA3 + AVX routine.
+             others - use AVX2 + FMA3 + AVX routine.
 
  ------------------------------------------------------------------------
 
@@ -83,7 +83,7 @@ Syntax:
 
         chroma - same as TCannyMod. (default = 0)
 
-        opt - same as TCannyMod. (default = automated)
+        opt - same as TCannyMod. (default = auto)
 
  ------------------------------------------------------------------------
 
@@ -104,7 +104,7 @@ Syntax:
 
         sobel - same as TCannyMod. (default = false)
 
-        opt - same as TCannyMod. (default = automated)
+        opt - same as TCannyMod. (default = auto)
 
 ------------------------------------------------------------------------
 
@@ -136,14 +136,18 @@ Changelog:
     1.0.0 (20160326):
         - Almost rewrite.
         - VS2013 to VS2015.
-        - Add AVX2(64bit only) / SSE4.1(both 32bit and 64bit) support.
+        - Add AVX2(64bit only)/SSE4.1(both 32bit and 64bit) support.
         - Change direction values from 1,3,7,15 to 31,63,127,255.
         - Reduce waste processes.
 
     1.1.0 (20160328):
         - Add EMask().
         - Implement simd non-maximum-suppression.
-        - a bit optimized gaussian-blur/ hysteresis.
+        - Slightly optimize gaussian-blur/hysteresis.
+
+    1.1.1 (20160330):
+        - Add AVX2 support for 32bit.
+
 
 Source code:
 
diff --git a/avisynth/src/edge_detection.h b/avisynth/src/edge_detection.h
index 1edc0ae..c602d43 100644
--- a/avisynth/src/edge_detection.h
+++ b/avisynth/src/edge_detection.h
@@ -27,50 +27,32 @@
 #define EDGE_DETECTION_H
 
 #include <cstdint>
+#include <algorithm>
 #include "simd.h"
 
-template <typename Vf, typename Vi>
-SFINLINE void calc_direction(const Vf& gx, const Vf& gy, int32_t* dirp)
-{
-
-    static const Vf t0225 = set1_ps<Vf>(std::sqrt(2.0f) - 1.0f); // tan(pi/8)
-    static const Vf t0675 = set1_ps<Vf>(std::sqrt(2.0f) + 1.0f); // tan(3*pi/8)
-    static const Vf t1125 = sub(zero<Vf>(), t0675); // tan(5*pi/8) = -tan(3*pi/8)
-    static const Vf t1575 = sub(zero<Vf>(), t0225); // tan(7*pi/8) = -tan(pi/8)
-
-    Vf z = zero<Vf>();
-    Vf vertical = set1_ps<Vf>(90.0f);
-
-    // if gy < 0, gx = -gx
-    Vf mask = cmplt_ps(gy, z);
-    Vf gx2 = blendv(gx, sub(z, gx), mask);
-
-    // tan = gy / gx
-    Vf tan = mul(rcp_hq(gx2), abs(gy));
-
-    // if tan is unorderd(inf or NaN), tan = 90.0f
-    mask = cmpord_ps(tan, tan);
-    tan = blendv(vertical, tan, mask);
-
-    // if t1575 <= tan < t0225, direction is 31 (horizontal)
-    Vi d0 = castps_si(and_ps(cmpge_ps(tan, t1575), cmplt_ps(tan, t0225)));
-    d0 = srli_i32(d0, 27);
 
-    // if t0225 <= tan < t0675, direction is 63 (45' up)
-    Vi d1 = castps_si(and_ps(cmpge_ps(tan, t0225), cmplt_ps(tan, t0675)));
-    d1 = srli_i32(d1, 26);
-
-    // if t0675 <= tan or tan < t1125, direction is 127 (vertical)
-    Vi d2 = castps_si(or_ps(cmpge_ps(tan, t0675), cmplt_ps(tan, t1125)));
-    d2 = srli_i32(d2, 25);
-
-    // if t1125 <= tan < t1575, direction is 255 (45' down)
-    Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
-    d3 = srli_i32(d3, 24);
-
-    d0 = or_si(or_si(d0, d1), or_si(d2, d3));
-
-    stream<Vi>(dirp, d0);
+static inline const float* get_tangent(int idx) // internal linkage: this definition lives in a header
+{   // Returns row idx (must be 0..3) of the table: tan(pi/8), tan(3*pi/8), tan(5*pi/8), tan(7*pi/8).
+    alignas(32) static const float tangent[32] = {
+        0.414213538169860839843750f, 0.414213538169860839843750f, // tan(pi/8)
+        0.414213538169860839843750f, 0.414213538169860839843750f,
+        0.414213538169860839843750f, 0.414213538169860839843750f,
+        0.414213538169860839843750f, 0.414213538169860839843750f,
+        2.414213657379150390625000f, 2.414213657379150390625000f, // tan(3*pi/8)
+        2.414213657379150390625000f, 2.414213657379150390625000f,
+        2.414213657379150390625000f, 2.414213657379150390625000f,
+        2.414213657379150390625000f, 2.414213657379150390625000f,
+        -2.414213657379150390625000f, -2.414213657379150390625000f, // tan(5*pi/8)
+        -2.414213657379150390625000f, -2.414213657379150390625000f,
+        -2.414213657379150390625000f, -2.414213657379150390625000f,
+        -2.414213657379150390625000f, -2.414213657379150390625000f,
+        -0.414213538169860839843750f, -0.414213538169860839843750f, // tan(7*pi/8)
+        -0.414213538169860839843750f, -0.414213538169860839843750f,
+        -0.414213538169860839843750f, -0.414213538169860839843750f,
+        -0.414213538169860839843750f, -0.414213538169860839843750f,
+    };
+    // Every value is repeated 8 times, so each row is 32 bytes and row idx starts at 8 * idx floats.
+    return tangent + 8 * idx;
 }
 
 
@@ -80,12 +62,18 @@ standard(float* blurp, const size_t blur_pitch, float* emaskp,
          const size_t emask_pitch, int32_t* dirp, const size_t dir_pitch,
          const size_t width, const size_t height)
 {
+
     constexpr size_t step = sizeof(Vf) / sizeof(float);
 
     float* p0 = blurp;
     float* p1 = blurp;
     float* p2 = blurp + blur_pitch;
 
+    const float* tan0225 = get_tangent(0);
+    const float* tan0675 = get_tangent(1);
+    const float* tan1125 = get_tangent(2);
+    const float* tan1575 = get_tangent(3);
+
     for (size_t y = 0; y < height; y++) {
         p1[-1] = p1[0];
         p1[width] = p1[width - 1];
@@ -95,7 +83,34 @@ standard(float* blurp, const size_t blur_pitch, float* emaskp,
             Vf gx = sub(loadu<Vf>(p1 + x + 1), loadu<Vf>(p1 + x - 1)); // [-1, 0, 1]
 
             if (CALC_DIR) {
-                calc_direction<Vf, Vi>(gx, gy, dirp + x);
+                const Vf z = zero<Vf>();
+                const Vf vertical = set1_ps<Vf>(90.0f);
+                // if gy < 0, gx = -gx
+                Vf mask = cmplt_ps(gy, z);
+                Vf gx2 = blendv(gx, sub(z, gx), mask);
+                // tan = gy / gx
+                Vf tan = mul(rcp_hq(gx2), abs(gy));
+                // if tan is unordered(inf or NaN), tan = 90.0f
+                mask = cmpord_ps(tan, tan);
+                tan = blendv(vertical, tan, mask);
+                const Vf t0225 = load<Vf>(tan0225);
+                const Vf t0675 = load<Vf>(tan0675);
+                const Vf t1125 = load<Vf>(tan1125);
+                const Vf t1575 = load<Vf>(tan1575);
+                // if t1575 <= tan < t0225, direction is 31 (horizontal)
+                Vi d0 = castps_si(and_ps(cmpge_ps(tan, t1575), cmplt_ps(tan, t0225)));
+                d0 = srli_i32(d0, 27);
+                // if t0225 <= tan < t0675, direction is 63 (45' up)
+                Vi d1 = castps_si(and_ps(cmpge_ps(tan, t0225), cmplt_ps(tan, t0675)));
+                d1 = srli_i32(d1, 26);
+                // if t0675 <= tan or tan < t1125, direction is 127 (vertical)
+                Vi d2 = castps_si(or_ps(cmpge_ps(tan, t0675), cmplt_ps(tan, t1125)));
+                d2 = srli_i32(d2, 25);
+                // if t1125 <= tan < t1575, direction is 255 (45' down)
+                Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
+                d3 = srli_i32(d3, 24);
+                d0 = or_si(or_si(d0, d1), or_si(d2, d3));
+                stream<Vi>(dirp + x, d0);
             }
 
             Vf magnitude = mul(gx, gx);
@@ -134,6 +149,11 @@ sobel(float* blurp, const size_t blur_pitch, float* emaskp,
     p1[-1] = p1[0];
     p1[width] = p1[width - 1];
 
+    const float* tan0225 = get_tangent(0);
+    const float* tan0675 = get_tangent(1);
+    const float* tan1125 = get_tangent(2);
+    const float* tan1575 = get_tangent(3);
+
     for (size_t y = 0; y < height; y++) {
         p2[-1] = p2[0];
         p2[width] = p2[width - 1];
@@ -157,7 +177,27 @@ sobel(float* blurp, const size_t blur_pitch, float* emaskp,
             gy = sub(gy, add(t, t));
 
             if (CALC_DIR) {
-                calc_direction<Vf, Vi>(gx, gy, dirp + x);
+                const Vf z = zero<Vf>();
+                const Vf vertical = set1_ps<Vf>(90.0f);
+                Vf mask = cmplt_ps(gy, z);
+                Vf gx2 = blendv(gx, sub(z, gx), mask);
+                Vf tan = mul(rcp_hq(gx2), abs(gy));
+                mask = cmpord_ps(tan, tan);
+                tan = blendv(vertical, tan, mask);
+                const Vf t0225 = load<Vf>(tan0225);
+                const Vf t0675 = load<Vf>(tan0675);
+                const Vf t1125 = load<Vf>(tan1125);
+                const Vf t1575 = load<Vf>(tan1575);
+                Vi d0 = castps_si(and_ps(cmpge_ps(tan, t1575), cmplt_ps(tan, t0225)));
+                d0 = srli_i32(d0, 27);
+                Vi d1 = castps_si(and_ps(cmpge_ps(tan, t0225), cmplt_ps(tan, t0675)));
+                d1 = srli_i32(d1, 26);
+                Vi d2 = castps_si(or_ps(cmpge_ps(tan, t0675), cmplt_ps(tan, t1125)));
+                d2 = srli_i32(d2, 25);
+                Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
+                d3 = srli_i32(d3, 24);
+                d0 = or_si(or_si(d0, d1), or_si(d2, d3));
+                stream<Vi>(dirp + x, d0);
             }
 
             Vf magnitude = mul(gx, gx);
diff --git a/avisynth/src/gaussian_blur.h b/avisynth/src/gaussian_blur.h
index 1929688..a0739a4 100644
--- a/avisynth/src/gaussian_blur.h
+++ b/avisynth/src/gaussian_blur.h
@@ -50,7 +50,7 @@ convert_to_float(const size_t width, const size_t height, const uint8_t* srcp,
 template <typename Vf>
 static void
 horizontal_blur(const float* hkernel, float* buffp, const int radius,
-                const size_t width, const float* kernel, float* blurp)
+                const size_t width, float* blurp)
 {
     constexpr size_t step = sizeof(Vf) / sizeof(float);
     const int length = radius * 2 + 1;
@@ -108,8 +108,8 @@ gaussian_blur(const int radius, const float* kernel, const float* hkernel,
             }
             store<Vf>(buffp + x, sum);
         }
-        horizontal_blur<Vf>(hkernel, buffp, radius, width, kernel,
-                            blurp + blur_pitch * y);
+        horizontal_blur<Vf>(hkernel, buffp, radius, width, blurp);
+        blurp += blur_pitch;
 
         for (int l = 0; l < length - 1; ++l) {
             p[l] = p[l + 1];
diff --git a/avisynth/src/tcannymod.cpp b/avisynth/src/tcannymod.cpp
index 15d8cb0..5b692c0 100644
--- a/avisynth/src/tcannymod.cpp
+++ b/avisynth/src/tcannymod.cpp
@@ -126,19 +126,13 @@ set_gb_kernel(float sigma, int& radius, float* kernel)
 
 static arch_t get_arch(int opt)
 {
-    // on 32bit with /arch:AVX outputs weird result.
-    // I don't know why...
     if (opt == 0 || !has_sse41()) {
         return HAS_SSE2;
     }
-#if !defined(_WIN64)
-    return HAS_SSE41;
-#else
     if (opt == 1 || !has_avx2()) {
         return HAS_SSE41;
     }
     return HAS_AVX2;
-#endif
 }
 
 
diff --git a/avisynth/src/tcannymod.h b/avisynth/src/tcannymod.h
index 9d1b29c..ca46fcd 100644
--- a/avisynth/src/tcannymod.h
+++ b/avisynth/src/tcannymod.h
@@ -38,7 +38,7 @@
 #include "write_frame.h"
 
 
-#define TCANNY_M_VERSION "1.1.0"
+#define TCANNY_M_VERSION "1.1.1"
 
 constexpr size_t GB_MAX_LENGTH = 17;