Two segments intersection #334

sergey-239 · 2022-11-19T11:17:51Z

sergey-239
Nov 19, 2022

This thread was created to continue the discussion, started in #317, of the best (fastest and most accurate) implementation of two-segment intersection.

At some iteration @bahvalo in #317 (comment) proposed the algorithm that is derived from solving the two lines intersection equation (see https://en.wikipedia.org/wiki/Line%E2%80%93line_intersection for details).

So far, thanks to @alexisnaveros's efforts and some ideas of my own, we have ended up with the following (please read the original thread to understand how we got here):

/*
 * Signed 128-bit by 64-bit division using the x86-64 one-operand IDIV.
 * Returns the quotient of num / denom as produced by idivq (truncated).
 *
 * denom must be non-zero and the quotient must fit in int64_t, otherwise the
 * CPU raises a divide error (reported as SIGFPE, as noted later in this thread).
 */
__attribute__ ((always_inline)) static inline int64_t idiv128( __int128 num, int64_t denom )
{
  int64_t res;
  /* idivq leaves the remainder in RDX. It is bound as an output so the compiler
     knows RDX no longer holds the high half of num after the instruction. */
  int64_t rem;
  __asm__ (
  "idivq %2\n\t"
  : "=a"(res), "=d"(rem) : "rm" (denom), "A" (num) : "cc" );
  (void)rem;
  return res;
}

/*
 * Full 64x64 -> 128-bit signed multiply using the x86-64 one-operand IMUL
 * (RDX:RAX = RAX * operand).
 *
 * Spelled __asm__ rather than asm so the helper also builds in ISO modes
 * (-std=c99 / -std=c11), where the plain asm keyword is not available.
 */
__attribute__ ((always_inline)) static inline  __int128 imul128( int64_t a, int64_t b )
{
  __int128 res;
  __asm__ (
  "imulq %2\n\t"
  : "=A"(res) : "a" (a), "rm" (b) : "cc" );
  return res;
}

/* Yields the upper 64 bits of an __int128 as an int64_t. */
/* GCC is smart enough to grab the higher half of the __int128 register pair, without any store/load/shift/whatever */
/* Can all compilers do that? Feel free to tweak as necessary for your compiler to emit good code */
/* The expansion and its argument are fully parenthesised so the macro can be used inside any expression. */
#if 1
 #define MATH_INT128_GET_HIGH64(x) ((int64_t)((x)>>64))
#else
 /* Alternative: read the second 64-bit word in memory. x must be an lvalue. */
 /* NOTE(review): this reads an __int128 through an int64_t pointer and assumes the high half is stored second - confirm before enabling. */
 #define MATH_INT128_GET_HIGH64(x) ((int64_t)(*((int64_t *)(&(x))+1)))
#endif

/*
 * Intersection of the lines through segment 0 (l0p0 -> l0p1) and segment 1
 * (l1p0 -> l1p1), computed with 128-bit integer products.
 *
 * Returns 0 without touching hitpt when denom is zero (parallel or collinear
 * lines). Otherwise stores the rounded intersection in hitpt[0] (x) and
 * hitpt[1] (y) and returns 1. When ABORT_IF_HIT_IS_OUTSIDE_LINE_SEGMENTS is
 * non-zero it also returns 0 if alpha or beta lies outside [0, denom].
 *
 * The hit point is l0p0 + (alpha / denom) * (l0p1 - l0p0); rounding is done by
 * adjusting the numerator before the division.
 * NOTE(review): per the discussion below, the division faults (SIGFPE) when the
 * quotient does not fit in int64_t, i.e. when alpha is much larger than denom.
 */
static inline int mathVertexLineLineIntersection_PB6( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int shift;
  int64_t xormask, highmask, denomlow, alphalow;
  int64_t signbit, halfdenom;
  __int128 denom;
  __int128 alpha;
  __int128 hitx, hity;
  /* denom = cross product of the two segment direction vectors */
  denom = imul128( l0p1x - l0p0x, l1p1y - l1p0y ) - imul128( l0p1y - l0p0y, l1p1x - l1p0x );
  if( denom == 0 )
  {
    // segments lay on parallel lines
    // special case with segments on the same line -- may be considered if necessary
    return 0;
  }
  else
  {
    /* alpha = cross product of (l1p0 - l0p0) with the direction of segment 1 */
    alpha = imul128( l1p0x - l0p0x, l1p1y - l1p0y ) - imul128( l1p0y - l0p0y, l1p1x - l1p0x );
    /* xormask is all ones when denom is negative, zero otherwise; the two lines
       below negate alpha and denom together in that case, so denom ends up positive */
    xormask = MATH_INT128_GET_HIGH64( denom ) >> 63;
    alpha = ( alpha ^ xormask ) - xormask;
    denom = ( denom ^ xormask ) - xormask;
#if ABORT_IF_HIT_IS_OUTSIDE_LINE_SEGMENTS
    __int128 beta;
    /* the unsigned compare rejects both alpha < 0 and alpha > denom in one test */
    if( (unsigned __int128)alpha > denom )
      return 0;
    beta  = (imul128( l1p0x - l0p0x, l0p1y - l0p0y ) - imul128( l1p0y - l0p0y, l0p1x - l0p0x ) 
            ^ xormask) - xormask;
                 
    if( (unsigned __int128)beta > denom )
      return 0;
#endif
    highmask = MATH_INT128_GET_HIGH64( denom );
    if( highmask )
    {
      /* denom needs more than 64 bits: shift so that its top set bit lands on bit 62 */
      shift = 65 - __builtin_clzll( highmask );
      denom >>= shift;
      denomlow = (int64_t)denom;
    }
    else
    {
      /* shift is 1 when bit 63 of denom's low word is set, or when bit 63 of
         alpha's low word differs from bit 0 of its high word; otherwise 0 */
      shift = (uint64_t)( (int64_t)denom | ( MATH_INT128_GET_HIGH64( alpha ) ^ (int64_t)alpha ) ) >> 63;
      denomlow = (int64_t)denom;
      if( !shift )
        goto shiftzero;
      denomlow = (int64_t)( (uint64_t)denomlow >> shift );
    }
    /* scale alpha by the same amount as denom so the ratio is preserved */
    alpha >>= shift;
    shiftzero:
    alphalow = (int64_t)alpha;
    halfdenom = denomlow >> 1;
    hitx = imul128( alphalow, l0p1x - l0p0x );
    /* add +halfdenom to a non-negative product, -halfdenom to a negative one,
       so the division below rounds instead of truncating */
    signbit = MATH_INT128_GET_HIGH64( hitx ) >> 63;
    hitx += ( halfdenom ^ signbit ) - signbit;
    hitpt[0] = l0p0x + idiv128( hitx, denomlow );
    hity = imul128( alphalow, l0p1y - l0p0y );
    signbit = MATH_INT128_GET_HIGH64( hity ) >> 63;
    hity += ( halfdenom ^ signbit ) - signbit;
    hitpt[1] = l0p0y + idiv128( hity, denomlow );
  }
  return 1;
}

Full inline assembly implementation:

/*
 * Hand-written x86-64 version of the line/line intersection: same inputs and
 * return convention as the C variants (0 = denom is zero, 1 = hit stored).
 *
 * hitpt is written only when the function returns 1; on the denom == 0 path the
 * asm never produces the coordinates, so nothing is stored.
 *
 * Scratch registers: rax rbx rcx rdx rdi rsi r8 r9 r10 r11 r12 r13.
 * r8 is used where rbp used to be, because GCC rejects rbp in a clobber list
 * whenever the function keeps a frame pointer (e.g. -O0 or -fno-omit-frame-pointer).
 * NOTE: lzcntq needs a CPU with LZCNT support.
 */
static inline int mathVertexLineLineIntersection_ASM2( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int retval;
  int64_t hitptx, hitpty;

  __asm__ __volatile__(
    /* rax = l0p1x - l0p0x; (preserve to r11) */
    /* rbx = l1p1y - l1p0y; (preserve rbx) */
    "movq %5, %%rax\n"
    "movq %10, %%rbx\n"
    "subq %3, %%rax\n"
    "subq %8, %%rbx\n"
    "movq %%rax, %%r11\n"
    /* r10:rdi(denom) = ( l0p1x - l0p0x ) * ( l1p1y - l1p0y ); */
    "imulq %%rbx\n"
    "movq %%rax, %%rdi\n"
    "movq %%rdx, %%r10\n"
    /* rax = l0p1y - l0p0y; (preserve to r12) */
    /* r13 = l1p1x - l1p0x; (preserve r13) */
    "movq %6, %%rax\n"
    "movq %9, %%r13\n"
    "subq %4, %%rax\n"
    "subq %7, %%r13\n"
    "movq %%rax, %%r12\n"
    /* rdx:rax = ( l0p1y - l0p0y ) * ( l1p1x - l1p0x ); */
    "imulq %%r13\n"
    /* r10:rdi(denom) = imul() - imul(); */
    /* retval=0; if( denom == 0 ), jmp end; */
    /* rax = l1p0x - l0p0x; */
    /* (the movq instructions between subq and sbbq do not touch the carry flag) */
    "subq %%rax, %%rdi\n"
    "movq %%rdi, %%r8\n"
    "movq %7, %%rax\n"
    "sbbq %%rdx, %%r10\n"
    "subq %3, %%rax\n"
    "orq %%r10, %%r8\n"
    "jne 1f\n"
    /* eax(retval) = 0; (output register) */
    "xorl %%eax, %%eax\n"
    "jmp 3f\n"

    ".p2align 4\n"
    "1:\n"
    /* rsi(xormask) = r10(denom.high) >> 63; */
    /* rbx = l1p1y - l1p0y; (preserved) */
    /* r9:rbx(alpha) = ( l1p0x - l0p0x ) * ( l1p1y - l1p0y ); */
    "movq %%r10, %%rsi\n"
    "imulq %%rbx\n"
    "movq %%rax, %%rbx\n"
    "movq %%rdx, %%r9\n"
    "sarq $63, %%rsi\n"
    /* rax = l1p0y - l0p0y; */
    "movq %8, %%rax\n"
    "subq %4, %%rax\n"
    /* r13 = l1p1x - l1p0x; (preserved) */
    /* rdx:rax = ( l1p0y - l0p0y ) * ( l1p1x - l1p0x ); */
    "imulq %%r13\n"
    /* r10:rdi(denom) ^= xormask; */
    "xorq %%rsi, %%rdi\n"
    "xorq %%rsi, %%r10\n"
    /* r9:rbx(alpha) = imul() - imul(); */
    "subq %%rax, %%rbx\n"
    "sbbq %%rdx, %%r9\n"
    /* r10:rdi(denom) += xormask & 1; */
    "subq %%rsi, %%rdi\n"
    "sbbq %%rsi, %%r10\n"
    /* r9:rbx(alpha) ^= xormask; */
    "xorq %%rsi, %%rbx\n"
    "xorq %%rsi, %%r9\n"
    /* rax = r11 = l0p1x - l0p0x; (preserved) */
    "movq %%r11, %%rax\n"
    /* r9:rbx(alpha) += xormask & 1; */
    "subq %%rsi, %%rbx\n"
    "sbbq %%rsi, %%r9\n"

    /* if( denom.high == 0 ), jmp skip; */
    "testq %%r10, %%r10\n"
    "je 2f\n"
    /* branch ( denom.high != 0 ) */
    /* ecx(shift) = 65 - __builtin_clzll( denom.high ); */
    "lzcntq %%r10, %%rsi\n"
    "movl $65, %%ecx\n"
    "subl %%esi, %%ecx\n"
    /* r10:rdi(denom) >>= shift, feeding bits from r10(denom.high) */
    "shrdq %%r10, %%rdi\n"
    "jmp 4f\n"

    ".p2align 4\n"
    "2:\n"
    /* branch ( denom.high == 0 ) */
    /* shift = (uint64_t)( (int64_t)denom | ( MATH_INT128_GET_HIGH64( alpha ) ^ (int64_t)alpha ) ) >> 63; */
    "movq %%r9, %%rcx\n"
    "xorq %%rbx, %%rcx\n"
    "orq %%rdi, %%rcx\n"
    "shrq $63, %%rcx\n"
    /* skip if %%cl==0 */
    "jz 5f\n"
    /* rdi(denom.low) >>= shift */
    "shrq %%cl, %%rdi\n"

    /* branch done ~ if( denom.high == 0 ) */
    "4:\n"
    /* r9:rbx(alpha) >>= shift, feeding bits from r9(alpha.high) */
    "shrdq %%r9, %%rbx\n"
    /* target, if skip %%cl==0 */
    "5:\n"
    /* rcx = denom >> 1; */
    /* rdx:rax(hitx) = rbx(alpha.low) * ( l0p1x - l0p0x ); */
    "movq %%rdi, %%rcx\n"
    "imulq %%rbx\n"
    "sarq %%rcx\n"
    /* rcx = denom >> 1; (preserved) */
    /* signbit = rdx(hitx.high) >> 63; */
    /* rdx:rax(hitx) += ( ( denom >> 1 ) ^ signbit ) - signbit;  (r8:rsi hold the sign-extended addend) */
    "movq %%rdx, %%r8\n"
    "movq %%rcx, %%rsi\n"
    "sarq $63, %%r8\n"
    "xorq %%r8, %%rsi\n"
    "subq %%r8, %%rsi\n"
    "movq %%rsi, %%r8\n"
    "sarq $63, %%rsi\n"
    "addq %%r8, %%rax\n"
    "adcq %%rsi, %%rdx\n"
    /* rax = hitx / rdi(denom.low), rdx = remainder (unused) */
    "idivq %%rdi\n"
    /* rax(hitx) += l0p0x; */
    "addq %3, %%rax\n"
    /* %1 = hitptx; (output) */
    "movq %%rax, %1\n"
    /* r12 = l0p1y - l0p0y; (preserved) */
    "movq %%r12, %%rax\n"
    /* rdx:rax(hity) = rbx(alpha.low) * ( l0p1y - l0p0y ); */
    "imulq %%rbx\n"
    /* rcx = denom >> 1; (preserved, destroyed) */
    /* signbit = rdx(hity.high) >> 63; */
    /* rdx:rax(hity) += ( ( denom >> 1 ) ^ signbit ) - signbit; */
    "movq %%rdx, %%r8\n"
    "sarq $63, %%r8\n"
    "xorq %%r8, %%rcx\n"
    "subq %%r8, %%rcx\n"
    "movq %%rcx, %%r8\n"
    "sarq $63, %%rcx\n"
    "addq %%r8, %%rax\n"
    "adcq %%rcx, %%rdx\n"
    /* rax = hity / rdi(denom.low), rdx = remainder (unused) */
    "idivq %%rdi\n"
    /* hity += l0p0y; */
    "addq %4, %%rax\n"
    /* %2 = hitpty; (output) */
    "movq %%rax, %2\n"
    /* eax(retval) = 1; (output register) */
    "movl $1, %%eax\n"
    /* exit */
    "3:\n"

    /* rax is used as scratch from the first instruction on, long before the last
       input is read, so it is marked early-clobber like hitptx. */
    : "=&a"(retval), "=&g"(hitptx), "=g"(hitpty)
    : "m" (l0p0x), "m" (l0p0y), "m" (l0p1x), "m" (l0p1y), "m" (l1p0x), "m" (l1p0y), "m" (l1p1x), "m" (l1p1y)
    : "%rbx", "%rcx", "%rdx", "%rdi", "%rsi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "cc" );

  /* hitptx / hitpty are only produced on the path that returns 1 */
  if( retval )
  {
    hitpt[0] = hitptx;
    hitpt[1] = hitpty;
  }
  return retval;
}

Readers, please note, that this is not thoroughly tested yet and may contain bugs.

sergey-239 · 2022-11-19T14:50:21Z

sergey-239
Nov 19, 2022
Author

@alexisnaveros
why did you remove rounding from idiv?

0 replies

alexisnaveros · 2022-11-19T17:31:46Z

alexisnaveros
Nov 19, 2022

Yes, it's about time this became a discussion.

The rounding is still there; it just happens before the idiv (adjusting the numerator) rather than after (using the remainder). That used to be a little faster, but now that the code has changed with extra shifting, rounding after seems to be a little faster on my box. Go figure!

/*
 * Signed 128-bit by 64-bit division rounded to the nearest integer, with ties
 * rounded away from zero. The rounding is applied after idivq, using the
 * remainder it leaves in RDX.
 *
 * denom is expected to be positive (callers pass a sign-normalised divisor).
 * It must be non-zero and the truncated quotient must fit in int64_t, otherwise
 * idivq raises a divide error.
 */
__attribute__ ((always_inline)) static inline int64_t idiv128_round( __int128 num, int64_t denom )
{
  int64_t res, rem, half;
  __asm__ (
  "idivq %2\n\t"
  : "=a"(res), "=d"(rem) : "rm" (denom), "A" (num) : "cc" );
  /* half = ceil(denom / 2). Using floor (denom >> 1) would round up remainders
     that are below one half when denom is odd, e.g. 4 / 3 would become 2. */
  half = denom - ( denom >> 1 );
  res += ( rem >= half );
  res -= ( rem <= -half );
  return res;
}

/*
 * Line/line intersection in 128-bit integer arithmetic, with the rounding done
 * by idiv128_round after the division.
 *
 * Returns 0 (hitpt untouched) when the determinant is zero, otherwise writes the
 * hit point to hitpt[0] / hitpt[1] and returns 1. If
 * ABORT_IF_HIT_IS_OUTSIDE_LINE_SEGMENTS is non-zero, 0 is also returned when
 * either line parameter falls outside [0, det].
 */
static inline int mathVertexLineLineIntersection_PB5( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  /* direction vectors of the two segments */
  const int64_t dir0x = l0p1x - l0p0x;
  const int64_t dir0y = l0p1y - l0p0y;
  const int64_t dir1x = l1p1x - l1p0x;
  const int64_t dir1y = l1p1y - l1p0y;
  __int128 det, num;
  int64_t negmask, dethigh, divisor;
  int drop;

  det = imul128( dir0x, dir1y ) - imul128( dir0y, dir1x );
  if( det == 0 )
    return 0; /* parallel lines (or segments on the same line) */

  /* numerator of the parameter along line 0 */
  num = imul128( l1p0x - l0p0x, dir1y ) - imul128( l1p0y - l0p0y, dir1x );

  /* negate both values when det is negative, so that det ends up positive */
  negmask = MATH_INT128_GET_HIGH64( det ) >> 63;
  num = ( num ^ negmask ) - negmask;
  det = ( det ^ negmask ) - negmask;

#if ABORT_IF_HIT_IS_OUTSIDE_LINE_SEGMENTS
  {
    __int128 side;
    if( (unsigned __int128)num > det )
      return 0;
    side = imul128( l1p0x - l0p0x, dir0y ) - imul128( l1p0y - l0p0y, dir0x );
    side = ( side ^ negmask ) - negmask;
    if( (unsigned __int128)side > det )
      return 0;
  }
#endif

  dethigh = MATH_INT128_GET_HIGH64( det );
  if( dethigh != 0 )
  {
    /* det is wider than 64 bits: scale both values down until det fits in 63 bits */
    drop = 65 - __builtin_clzll( dethigh );
    det >>= drop;
    num >>= drop;
    divisor = (int64_t)det;
  }
  else
  {
    /* drop one bit only if det or num would not fit a signed 64-bit value */
    drop = (int)( (uint64_t)( (int64_t)det | ( MATH_INT128_GET_HIGH64( num ) ^ (int64_t)num ) ) >> 63 );
    divisor = (int64_t)det;
    if( drop != 0 )
    {
      divisor = (int64_t)( (uint64_t)divisor >> drop );
      num >>= drop;
    }
  }

  hitpt[0] = l0p0x + idiv128_round( imul128( (int64_t)num, dir0x ), divisor );
  hitpt[1] = l0p0y + idiv128_round( imul128( (int64_t)num, dir0y ), divisor );
  return 1;
}

EDIT: And it doesn't appear to be a compiler thing, as the assembly version rounding after the idiv is also faster. It's internal CPU scheduling stuff I assume.

8 replies

sergey-239 Nov 20, 2022
Author

BTW, why do you round away from zero instead of round up?

ISTM that most compilers including c++, by default round away from zero.

Checked the docs, The FPU & SSE FP math does the same in default rounding mode, so the question is closed.

sergey-239 Nov 20, 2022
Author

@alexisnaveros, I prepared the test to demonstrate you the performance issue with asm implementation I observe, compile with -mfpmath=sse -ffast-math -msse2 -O3. So, I get the following:
mathVertexLineLineIntersection_PB5       count: 100,000,000      time: 268.8 ms
mathVertexLineLineIntersection_PB6       count: 100,000,000      time: 267.7 ms
mathVertexLineLineIntersection_ASM2      count: 100,000,000      time: 7,513.2 ms

I am sorry, it's my fault. The compiler optimises out most of the C implementation as hitpt is not used after a call. Adding the asm (""::"m"(hit)); at the end of each loop fixes this, now I get:

mathVertexLineLineIntersection_PB5       count: 100,000,000      time: 8,030.8 ms
mathVertexLineLineIntersection_PB6       count: 100,000,000      time: 8,276.7 ms
mathVertexLineLineIntersection_ASM2      count: 100,000,000      time: 7,467.5 ms

alexisnaveros Nov 20, 2022

Cool. That makes more sense, I'm glad I'm not that bad at writing assembly. :)

And yes, optimizing compilers can make it hard to benchmark stuff (I usually read from volatile and write the output to volatile variables, to be sure).

alexisnaveros Nov 20, 2022

@sergey-239 Out of curiosity, how does that compare with the inaccurate double-based _A solutions and such? I can see your chip has a 42-95 latency for idiv r64, which is... amazingly bad (divpd is 13 cycles like always). Clearly they didn't bother throwing hardware at integer division (the first Athlon64 had a 74 cycles maximum latency for idiv r64, while Ryzen3 has a 17 cycles latency).

On that kind of hardware, I guess one could detect when the higher bits are irrelevant, and branch to a double-based solution (using a trick to convert int64_t<->double that only works when abs(int64) < (1<<52), 6 times faster than a C/C++ cast )...

sergey-239 Nov 21, 2022
Author

Out of curiosity, how does that compare with the inaccurate double-based _A solutions and such? I

I will include _A into run next time.

On that kind of hardware, I guess one could detect when the higher bits are irrelevant, and branch to a double-based solution (using a trick to convert int64_t<->double that only works when abs(int64) < (1<<52), 6 times faster than a C/C++ cast )...

This is almost what I am trying to implement now in asm for Intel.

AngusJohnson · 2022-11-20T22:24:43Z

AngusJohnson
Nov 20, 2022
Maintainer

Here's a modified _A that's fractionally faster. (It has only one division.)
Of course this modification still has all the limitations of _A.

  /*
   * Double-precision line/line intersection with a single division.
   * Returns 0 (hitpt untouched) when the determinant is zero, otherwise stores
   * the rounded hit point in hitpt[0] / hitpt[1] and returns 1.
   *
   * The offset t * direction is rounded on its own and then added to the start
   * point in integer arithmetic. Adding l0p0x / l0p0y as doubles would first
   * round them to 53 bits and lose the low bits of large coordinates.
   * The determinant is still computed in double, so accuracy still degrades
   * for nearly collinear segments and very long edges.
   * Plain casts are used so the function also compiles as C.
   */
  static inline int mathVertexLineLineIntersection_A2(int64_t* hitpt, 
      int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y)
  {
    const double ln0a = (double)(l0p1y - l0p0y);
    const double ln0b = (double)(l0p1x - l0p0x);
    const double ln1a = (double)(l1p1y - l1p0y);
    const double ln1b = (double)(l1p1x - l1p0x);
    const double det = ln0a * ln1b - ln1a * ln0b;
    if (det == 0.0) return 0;
    const double t = ((double)(l0p0x - l1p0x) * ln1a - (double)(l0p0y - l1p0y) * ln1b) / det;
    hitpt[0] = l0p0x + (int64_t)nearbyint(t * ln0b);
    hitpt[1] = l0p0y + (int64_t)nearbyint(t * ln0a);
    return 1;
  }

EDIT: THIS ALGORITHM CAN DEGRADE BADLY WHEN THE 2 SEGMENTS APPROACH COLLINEAR, SO IT'S NOT RECOMMENDED.

1 reply

sergey-239 Nov 21, 2022
Author

Also, this implementation loses accuracy when the lengths of the segment projections exceed 2^26; the greater the excess, the lower the accuracy.

alexisnaveros · 2022-11-20T23:18:45Z

alexisnaveros
Nov 20, 2022

Yup, that's good. Note that compilers do that on their own when compiling with -ffast-math, but I also like being explicit about that stuff.

There's a new little problem with the int128 solutions (I'm using them somewhere); it can raise a SIGFPE signal when the int128_t multiplication product, after being divided by the int64_t, doesn't fit in a int64_t.
EDIT: Okay, that's just because alpha is much bigger than denom and there isn't an actual intersection.

And I think it's worth detecting when many high bits are all zeroes/ones, to branch and perform a double division, even on chips with fast idiv. When the branch is taken, it's just 10% slower than _A.

I guess I'll post a PB7 soon.

9 replies

alexisnaveros Nov 21, 2022

Like MSVC, I'm pretty sure C# and Delphi must provide some way to do add-with-carry and extended multiplication... It's way too essential to be able to access these hardware functionalities for any serious programming language.

I'll share a PB7 version soon... I have to investigate some cases that popped up when I left the code run for 1.5 days on all cores, comparing the results of random lines with perfect 1024 bits arithmetic.

sergey-239 Nov 21, 2022
Author

As we anyway would have an optimised asm version then there is no need to dance with it: both C# and Delphi allows linking with arbitrary object modules/libraries, while maintainability of the source code should rise

AngusJohnson Nov 21, 2022
Maintainer

Like MSVC, I'm pretty sure C# and Delphi must provide some way to do add-with-carry and extended multiplication... It's way too essential to be able to access these hardware functionalities for any serious programming language.

I still can't see how we can sensibly use 128bit integers in C++.
I've had a look at MSVC's very limited 128bit integer support and I don't think it can be leveraged into use.
For example in using algorithm _A, det would become the 128bit int result of multiplying two 64bit ints. However, det later becomes the divisor and MSVC's _div128 function only takes a 64bit value as the divisor.

sergey-239 Nov 21, 2022
Author

IIRC, since PB5 we use a 64-bit divisor, and @alexisnaveros confirmed that the accuracy is what he expects.
Anyway, we have the MASM that should solve all possible issues with lack of intrinsics

alexisnaveros Nov 21, 2022

Correct. The PB* functions detect when the divisor has too many bits, and right shift both the numerator and the divisor equally (pushing bits from the "high" int64_t to the lower one; it can use the shrd instruction, but it's cheap to emulate just with x = ( high << (64-shift) ) | ( low >> shift )). There's a loss of accuracy, but you still have 63 bits in the denominator, so the effect is minimal.

EDIT: Fixed x = ( high << (64-shift) ) | ( low >> shift ), had it messed up.

AngusJohnson · 2022-11-20T23:39:19Z

AngusJohnson
Nov 20, 2022
Maintainer

Here's my testing code for these algorithms.
I think this demonstrates pretty well that when coords are within +/- 1 << 52 that precision is generally excellent.
Of course what was well demonstrated in the Issue (link at the top), is that precision with the current _A algorithm does deteriorate when edge coords are close to the maximum coord range and when edges are almost collinear.

// Returns a non-negative pseudo-random integer assembled from max_bits
// successive rand() draws, most significant bit first (one bit per draw).
inline int64_t RandInt64(int max_bits = 52) 
{
  int64_t bits = 0;
  for (int remaining = max_bits; remaining > 0; --remaining)
  {
    bits <<= 1;
    bits |= (rand() & 1);  // rand() is non-negative, so this is its parity
  }
  return bits;
}

// Builds a random segment seg1 -> seg2 whose midpoint is ip: seg1 is drawn at
// random and seg2 is its reflection through ip.
inline void MakeRandomSeg(const Point64& ip, Point64& seg1, Point64& seg2)
{
  // first endpoint: x is drawn before y
  seg1.x = RandInt64();
  seg1.y = RandInt64();

  // second endpoint: mirror image of the first, so ip is the midpoint
  const auto mirrored_x = ip.x * 2 - seg1.x;
  seg2.x = mirrored_x;
  const auto mirrored_y = ip.y * 2 - seg1.y;
  seg2.y = mirrored_y;
}

// Runs one intersection routine on segments p1-p2 and p3-p4 and returns the
// larger of the x and y distances between its result and the expected point
// randomPt. Returns MAX_COORD when the routine reports no intersection.
//
// The routine writes two consecutive int64_t values, so it is given a local
// two-element buffer rather than the address of a single Point64 member.
// NOTE(review): the 64-bit difference is narrowed to int on return; large errors
// will not survive that conversion - confirm whether a wider return type is wanted.
static int CheckAlgorithm(const Point64& randomPt, const 
  Point64& p1, const Point64& p2, const Point64& p3, const Point64& p4,
  int (*func)(int64_t*, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t))
{
  int64_t hit[2] = { 0, 0 };
  if (!func(hit, p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)) return MAX_COORD;
  return std::max(std::abs(hit[0] - randomPt.x), std::abs(hit[1] - randomPt.y));
}

static void CheckAlgorithms()
{ 
  Point64 ip, p1, p2, p3, p4;
  int a = 0, a2 = 0, b = 0, c = 0, d = 0, g = 0;
  static const int count = 1000;
  for (int i = 0; i < count; ++i)
  {
    ip = Point64(RandInt64(), RandInt64());
    MakeRandomSeg(ip, p1, p2);
    MakeRandomSeg(ip, p3, p4);
    a = std::max(a, CheckAlgorithm(ip, p1, p2, p3, p4, mathVertexLineLineIntersection_A));
    b = std::max(b, CheckAlgorithm(ip, p1, p2, p3, p4, mathVertexLineLineIntersection_B));
    c = std::max(c, CheckAlgorithm(ip, p1, p2, p3, p4, mathVertexLineLineIntersection_C));
    d = std::max(d, CheckAlgorithm(ip, p1, p2, p3, p4, mathVertexLineLineIntersection_D));
    g = std::max(g, CheckAlgorithm(ip, p1, p2, p3, p4, mathVertexLineLineIntersection_G));
  }
  std::cout << "\nTypical intersection point: " << ip << std::endl << std::endl;

  std::cout << "\nMax coordinate difference: " << std::endl;
  std::cout << "A: " << a / (count * 2) << std::endl;
  std::cout << "B: " << b / (count * 2) << std::endl;
  std::cout << "C: " << c / (count * 2) << std::endl;
  std::cout << "D: " << d / (count * 2) << std::endl;
  std::cout << "G: " << d / (count * 2) << std::endl;
}

Edit:
It seems that my mathVertexLineLineIntersection_A2 algorithm above performs very poorly when edges are almost collinear.
This is demonstrated by modifying CheckAlgorithms() to make the p3,p4 segment very close to p1,p2:

    // Near-collinear variant of the loop body: the expected hit is fixed at the origin.
    ip = Point64(0, 0);
    MakeRandomSeg(ip, p1, p2);
    // The second segment is no longer random ...
    //MakeRandomSeg(ip, p3, p4);
    // ... it is the first one with p3 moved by (+1, +1) and p4 by (-1, -1).
    p3 = Point64(p1.x + 1, p1.y + 1);
    p4 = Point64(p2.x - 1, p2.y - 1);

0 replies

bahvalo · 2022-11-21T06:27:14Z

bahvalo
Nov 21, 2022

I appreciate your effort to make a robust algorithm for input data in the range within +/- 1 << 62. But does anyone really need this range?

For me, the data within +/- 1 << 51 will be fine. And as I understand, this is also good for Angus. In this case, I think that one can use my initial function ('PB'), which is much faster than 'PB3'.

For the input data in the range within +- 1<<30, one can write an even faster function.

7 replies

bahvalo Nov 22, 2022

+- 1<<53! The goal here is not to achieve +-1<<62, but to get reasonable performance when users who have source float point data scaled to ints in the range +-1<<53. The reasoning of doing so is obvious: some tasks require that precision and thus the accuracy is important.

Please give me an example with the input data in the range within +-1<<53 for which my 'PB' version fails. I'll think about it.

alexisnaveros Nov 22, 2022

Please give me an example with the input data in the range within +-1<<53 for which my 'PB' version fails. I'll think about it.

Sure.
Line0: -9007199254740992 9007199254740992 to 4326287356334343 1937292443380737
Line1: 5308442084454874 942807781223714 to 1920182258853459 4336955676312212

The correct intersection point is:
4303756836756732.948675
1949238956582536.688425

_PB returns:
4303756836756737 1949238956582535
an error of -4.051 and 1.688

bahvalo Nov 22, 2022

That's why I initially wrote the range +/- 1 << 51.
I tried to improve my initial code to treat the range +/- 1 << 53 properly. It's a bit slower but still much faster than 'PB3'.
Not sure whether this will be useful.

PS. Please stop naming your subroutines by my initials.

/*
 * Segment/segment intersection: exact 128-bit integer products for the
 * determinant and both parameter numerators, double precision for the final
 * interpolation along segment 0.
 *
 * Returns 0 (hitpt untouched) when the determinant is zero or when either
 * parameter lies outside [0, det]; otherwise writes the rounded hit point to
 * hitpt[0] / hitpt[1] and returns 1.
 *
 * The interpolation is anchored at whichever of l0p0, the midpoint of segment 0
 * or l0p1 is nearest to the hit, so the factor multiplied by the edge vector
 * stays small.
 */
static inline int mathVertexLineLineIntersection_PB1a( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  /* edge vectors of both segments and the offset between their start points */
  const int64_t d0x = l0p1x - l0p0x;
  const int64_t d0y = l0p1y - l0p0y;
  const int64_t d1x = l1p1x - l1p0x;
  const int64_t d1y = l1p1y - l1p0y;
  const int64_t ox = l1p0x - l0p0x;
  const int64_t oy = l1p0y - l0p0y;

  __int128 t_num = (__int128)ox * (__int128)d1y - (__int128)oy * (__int128)d1x;
  __int128 u_num = (__int128)ox * (__int128)d0y - (__int128)oy * (__int128)d0x;
  __int128 det = (__int128)d0x * (__int128)d1y - (__int128)d0y * (__int128)d1x;

  /* parallel lines and segments on the same line are both reported as "no hit" */
  if( det == 0 )
    return 0;

  /* make the determinant positive so the range tests below are one-sided */
  if( det < 0 )
  {
    t_num = -t_num;
    u_num = -u_num;
    det = -det;
  }

  if( t_num < 0 || u_num < 0 || t_num > det || u_num > det )
    return 0;

  const __int128 quarter = det >> 2;
  if( t_num < quarter )
  {
    /* hit is in the first quarter: measure from l0p0 */
    const double t = (double)t_num / (double)det;
    hitpt[0] = l0p0x + (int64_t)nearbyint( t * d0x );
    hitpt[1] = l0p0y + (int64_t)nearbyint( t * d0y );
  }
  else if( t_num < 3 * quarter )
  {
    /* hit is in the middle half: measure from the midpoint of segment 0 */
    const double t = (double)( 2 * t_num - det ) / (double)( 2 * det );
    hitpt[0] = (int64_t)nearbyint( t * d0x + 0.5 * ( l0p0x + l0p1x ) );
    hitpt[1] = (int64_t)nearbyint( t * d0y + 0.5 * ( l0p0y + l0p1y ) );
  }
  else
  {
    /* hit is in the last quarter: measure backwards from l0p1 */
    const double t = (double)( det - t_num ) / (double)det;
    hitpt[0] = l0p1x + (int64_t)nearbyint( t * ( l0p0x - l0p1x ) );
    hitpt[1] = l0p1y + (int64_t)nearbyint( t * ( l0p0y - l0p1y ) );
  }
  return 1;
}

alexisnaveros Nov 22, 2022

Yup, that's more accurate. I thought about "swapping" the order of the points to get more accuracy out of the double path, but I didn't like how unpredictable that branch would be (or a bunch of cmov/blendvpd).

Line0: 9007199254740992 -9007199254740992 to -3444820055265805 2200161969356459
Line1: -5431086266366366 -810053631614162 to 2115738322067645 -333266045157211

The correct intersection point is:
-449942438970873.382979
-495358687345999.929693

_PB1a returns:
-449942438970875
-495358687345999
an error of 1.617 and -0.930

Benchmark, two short edges far from the origin:

Benchmark A      : 1332 ms ( error 726 )
Benchmark B      : 2307 ms ( error 0 )
Benchmark F      : 2299 ms ( error 0 )
Benchmark H      : 3544 ms ( error 0 )
Benchmark PB1a   : 3575 ms ( error 1 )
Benchmark PB3    : 5422 ms ( error 0 )
Benchmark Gizmo4 : 3520 ms ( error 0 )
Benchmark Gizmo5 : 2319 ms ( error 0 )
Benchmark Gizmo6 : 2361 ms ( error 0 )
Benchmark Gizmo7 : 1637 ms ( error 0 )

bahvalo Nov 23, 2022

I agree that branches slow down the code, and it's very difficult to correctly predict the last bit. So I'm not going to suggest more versions with the division of floating point numbers.

alexisnaveros · 2022-11-22T07:39:11Z

alexisnaveros
Nov 22, 2022

Here's an update, mathVertexLineLineIntersection_Gizmo7.

(You can enable MATH_LINELINE_PREVENT_SIGFPE to smoothly handle when two non-intersecting almost-collinear lines put the intersection point in the neighborhood of Alpha Centauri. Otherwise, an idiv overflow causes a SIGFPE, which is nasty to recover from)

/* Build-time switches for mathVertexLineLineIntersection_Gizmo7. */
/* Each one is defined only if the build has not already set it (e.g. with -D), so the values below are defaults. */

/* Have VertexLineLineIntersection_* return zero if the intersection point is outside the range of the two line segments */
#ifndef MATH_LINELINE_RETZERO_IF_OUTSIDE
 #define MATH_LINELINE_RETZERO_IF_OUTSIDE (0)
#endif

/* Prevent idiv overflow (SIGFPE) if the hit point between the two vectors is out of int64_t range */
/* Unnecessary if you know the segments actually do intersect, or almost so */
#ifndef MATH_LINELINE_PREVENT_SIGFPE
 #define MATH_LINELINE_PREVENT_SIGFPE (0)
#endif

/* Operands that all fit below this many bits take the double-precision path, whose maximum error is 0.125 */
#ifndef MATH_LINELINE_FLOATBITS_THRESHOLD
 #define MATH_LINELINE_FLOATBITS_THRESHOLD (50)
#endif

/* Yields the upper 64 bits of an __int128 as an int64_t. */
/* GCC is smart enough to grab the higher half of the __int128 register pair, without any store/load/shift/whatever */
/* Can all compilers do that? Feel free to tweak as necessary for your compiler to emit good code */
/* The expansion and its argument are fully parenthesised so the macro can be used inside any expression. */
#if 1
 #define MATH_INT128_GET_HIGH64(x) ((int64_t)((x)>>64))
#else
 /* Alternative: read the second 64-bit word in memory. x must be an lvalue. */
 /* NOTE(review): this reads an __int128 through an int64_t pointer and assumes the high half is stored second - confirm before enabling. */
 #define MATH_INT128_GET_HIGH64(x) ((int64_t)(*((int64_t *)(&(x))+1)))
#endif

/*
 * Full 64x64 -> 128-bit signed multiply using the x86-64 one-operand IMUL
 * (RDX:RAX = RAX * operand).
 *
 * Spelled __asm__ rather than asm so the helper also builds in ISO modes
 * (-std=c99 / -std=c11), where the plain asm keyword is not available.
 */
__attribute__ ((always_inline)) static inline  __int128 imul128( int64_t a, int64_t b )
{
  __int128 res;
  __asm__ (
  "imulq %2\n\t"
  : "=A"(res) : "a" (a), "rm" (b) : "cc" );
  return res;
}

/*
 * Signed 128-bit by 64-bit division rounded to the nearest integer, with ties
 * rounded away from zero. The rounding is applied after idivq, using the
 * remainder it leaves in RDX.
 *
 * denom is expected to be positive (callers pass a sign-normalised divisor).
 * It must be non-zero and the truncated quotient must fit in int64_t, otherwise
 * idivq raises a divide error.
 */
__attribute__ ((always_inline)) static inline int64_t idiv128_round( __int128 num, int64_t denom )
{
  int64_t res, rem, half;
  __asm__ (
  "idivq %2\n\t"
  : "=a"(res), "=d"(rem) : "rm" (denom), "A" (num) : "cc" );
  /* half = ceil(denom / 2), written without the (denom + 1) intermediate,
     which overflows when denom is INT64_MAX */
  half = denom - ( denom >> 1 );
  res += ( rem >= half );
  res -= ( rem <= -half );
  return res;
}

/*
 * Line/line intersection with three paths:
 *  - when every operand fits below MATH_LINELINE_FLOATBITS_THRESHOLD bits, the
 *    hit point is computed in double precision;
 *  - otherwise denom and alpha are scaled to 64 bits (losing low bits only when
 *    they are wider than that) and the hit point comes from a 128/64 division
 *    rounded by idiv128_round.
 *
 * Returns 0 when denom is zero (hitpt untouched). With
 * MATH_LINELINE_RETZERO_IF_OUTSIDE it also returns 0 when the hit is outside
 * either segment. With MATH_LINELINE_PREVENT_SIGFPE it returns 0 when a
 * quotient would not fit in int64_t; if that is detected for y, hitpt[0] has
 * already been written. Otherwise it stores the hit point in hitpt[0] /
 * hitpt[1] and returns 1.
 */
static inline int mathVertexLineLineIntersection_Gizmo7( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int shift;
  int64_t xormask, highmask, denomlow, alphalow, mulx, muly;
  double fdet;
  __int128 denom;
  __int128 alpha;
  __int128 hitx, hity;
#if MATH_LINELINE_PREVENT_SIGFPE
  __int128 mhigh;
#endif
  /* direction of segment 0; also the vector the final quotient is applied to */
  mulx = l0p1x - l0p0x;
  muly = l0p1y - l0p0y;
  denom = imul128( mulx, l1p1y - l1p0y ) - imul128( muly, l1p1x - l1p0x );
  if( denom == 0 )
  {
    // segments lay on parallel lines
    // special case with segments on the same line -- may be considered if necessary
    return 0;
  }
  else
  {
    alpha = imul128( l1p0x - l0p0x, l1p1y - l1p0y ) - imul128( l1p0y - l0p0y, l1p1x - l1p0x );
    /* negate alpha and denom together when denom is negative, so denom ends up positive */
    xormask = MATH_INT128_GET_HIGH64( denom ) >> 63;
    alpha = ( alpha ^ xormask ) - xormask;
    denom = ( denom ^ xormask ) - xormask;
#if MATH_LINELINE_RETZERO_IF_OUTSIDE
    __int128 beta;
    /* the unsigned compare rejects both a negative value and one above denom */
    if( (unsigned __int128)alpha > denom )
      return 0;
    beta  = imul128( l1p0x - l0p0x, muly ) - imul128( l1p0y - l0p0y, mulx );
    beta = ( beta ^ xormask ) - xormask;
    if( (unsigned __int128)beta > denom )
      return 0;
#endif

#if MATH_LINELINE_RETZERO_IF_OUTSIDE
    /* alpha <= denom is already known here, so denom alone decides the shift */
    highmask = MATH_INT128_GET_HIGH64( denom );
#else
    /* alpha may be larger than denom: include the magnitude of its high word */
    highmask = MATH_INT128_GET_HIGH64( alpha );
    highmask = ( highmask ^ ( highmask >> 63 ) ) | MATH_INT128_GET_HIGH64( denom );
#endif
    if( highmask )
    {
      shift = 65 - __builtin_clzll( highmask );
      denom >>= shift;
      denomlow = (int64_t)denom;
    }
    else
    {
      denomlow = (int64_t)denom;
      highmask = denomlow | ( MATH_INT128_GET_HIGH64( alpha ) ^ (int64_t)alpha );
      if( !( ( highmask | ( ( mulx >> 63 ) ^ mulx ) | ( ( muly >> 63 ) ^ muly ) ) & ~(((int64_t)1<<MATH_LINELINE_FLOATBITS_THRESHOLD)-1) ) )
      {
        /* All operands are low, switch to double-based math ~ max error is 0.125 */
        fdet = (double)((int64_t)alpha) / (double)denomlow;
        hitpt[0] = l0p0x + (int64_t)nearbyint( (double)mulx * fdet );
        hitpt[1] = l0p0y + (int64_t)nearbyint( (double)muly * fdet );
        return 1;
      }
      shift = (uint64_t)highmask >> 63;
      if( !shift )
        goto shiftzero;
      denomlow = (int64_t)( (uint64_t)denomlow >> shift );
    }
    alpha >>= shift;
    shiftzero:
    alphalow = (int64_t)alpha;

    hitx = imul128( alphalow, mulx );
#if MATH_LINELINE_PREVENT_SIGFPE
    /* idivq faults when the divisor is zero or when |quotient| does not fit in
       int64_t. mhigh = floor( (hitx ^ sign) / 2^63 ); the quotient fits exactly
       when mhigh < denomlow. The >= test also catches denomlow == 0, which the
       shift above can produce when alpha is far larger than denom. */
    xormask = MATH_INT128_GET_HIGH64( hitx ) >> 63;
    mhigh = ( hitx ^ xormask ) >> 63;
    if( (unsigned __int128)mhigh >= (unsigned __int128)denomlow )
      return 0;
#endif
    hitpt[0] = l0p0x + idiv128_round( hitx, denomlow );

    hity = imul128( alphalow, muly );
#if MATH_LINELINE_PREVENT_SIGFPE
    /* same guard for the y quotient */
    xormask = MATH_INT128_GET_HIGH64( hity ) >> 63;
    mhigh = ( hity ^ xormask ) >> 63;
    if( (unsigned __int128)mhigh >= (unsigned __int128)denomlow )
      return 0;
#endif
    hitpt[1] = l0p0y + idiv128_round( hity, denomlow );
  }
  return 1;
}

With an input range of [-INT64_MAX/2,INT64_MAX/2], the maximum error is 2.0
With an input range of [-INT64_MAX/4,INT64_MAX/4], the maximum error is 1.0
With an input range of [-INT64_MAX/8,INT64_MAX/8], the maximum error is 0.5
With an input range of [-INT64_MAX/16,INT64_MAX/16], the maximum error is 0.25
With an input range of [-INT64_MAX/32,INT64_MAX/32], the maximum error is 0.125

The threshold to use faster double-precision math is set up in a way that the maximum error is 0.125 (when edge lengths near 2^50)

(to clarify what an error of 0.125 means, it means a true intersection point at 1000000000000007.4 might be rounded to 1000000000000008 instead of 1000000000000007)

Benchmark, two short edges far from the origin:

Benchmark A        : 1328 ms ( error 726 )
Benchmark B        : 2307 ms ( error 0 )
Benchmark F        : 2312 ms ( error 0 )
Benchmark H        : 3571 ms ( error 0 )
Benchmark PB3      : 5409 ms ( error 0 )
Benchmark Gizmo4   : 3434 ms ( error 0 )
Benchmark Gizmo5   : 2326 ms ( error 0 )
Benchmark Gizmo6   : 2410 ms ( error 0 )
Benchmark Gizmo7   : 1635 ms ( error 0 )

Benchmark, two very long edges:

Benchmark A        : 1348 ms ( error 4821303631675428 )
Benchmark B        : 2325 ms ( error 3287252476142327 )
Benchmark F        : 2320 ms ( error 3287252476142327 )
Benchmark H        : 3519 ms ( error 1095750825380774 )
Benchmark PB3      : 5316 ms ( error 0 )
Benchmark Gizmo4   : 4869 ms ( error 0 )
Benchmark Gizmo5   : 4873 ms ( error 0 )
Benchmark Gizmo6   : 4858 ms ( error 0 )
Benchmark Gizmo7   : 4858 ms ( error 0 )

EDIT: I had written ranges based on INT_MAX, I meant INT64_MAX.

1 reply

sergey-239 Nov 22, 2022
Author

could you please include SM1 (#334 (comment)) into your benchmark, please

sergey-239 · 2022-11-22T21:05:46Z

sergey-239
Nov 22, 2022
Author

Edited: Nov 26
Okay, here is what I ended up rewriting PB3 in assembly with my vision (without fallback to some faster method when suitable).
Notes:

safe range is ±INT64_MAX/2
order of segments is irrelevant, the algorithm computes alpha for both variants and picks the one, that produces better accuracy
return value:
- 0 lines do not intersect, hitpt is undefined
- 1 lines do intersect, one or both coordinates of hitpt are ±INT64_MAX that denotes out of safe range coordinate
- 2 lines do intersect, hitpt contains the intersection coordinates
mul/div sequence to calculate hitpoint coordinates is done using unsigned math
denom & alpha scaled to fit unsigned 64-bit integer and rounded
rounding of quotient is done before division

Known issues:

when |alpha| is much greater than |denom| the results are inaccurate (see Two segments intersection #334 (reply in thread))
when |alpha| or |denom|, which is greater, contains all ones, the result will be incorrect

Code

/* Choose the 128-bit SSE2 load mnemonic that is pasted in front of the point
 * operands in the inline asm: the aligned form only when the build opts in
 * through USE_ALIGNED_ACCESS, the unaligned form otherwise (the default). */
#if !defined(USE_ALIGNED_ACCESS)
# define USE_ALIGNED_ACCESS 0
#endif
#if !USE_ALIGNED_ACCESS
# define MOVQD "movdqu "
#else
# define MOVQD "movdqa "
#endif

// Intersects the line through l0p0-l0p1 with the line through l1p0-l1p1 using one
// hand-written x86-64 asm statement (GCC extended asm, AT&T operand order) and
// stores the result in hitpt.
//
// Return value, as produced by the asm below:
//   0 - the 128-bit denominator is zero (jump to L6 with rax == 0); hitpt is not written
//   1 - at least one coordinate went through the saturating L5 path (value is +/-INT64_MAX)
//   2 - both coordinates were computed normally
//
// NOTE(review): the asm writes xmm4 (movdqa at the top) but xmm4 is not in the clobber
// list, and it overwrites rdx/rcx although they are declared as plain inputs - confirm
// the constraints before relying on this when inlined.
// NOTE(review): the internal call/ret pair pushes a return address below rsp; on SysV
// x86-64 that area is presumably the red zone - verify the build disables it or that no
// caller keeps live data there.
static  inline
int mathVertexLineLineIntersection_SM3(const Point64& l0p0,  const Point64& l0p1,  const Point64& l1p0,  const Point64& l1p1, Point64& hitpt)
{
    // The 16-byte loads below read x and y together, so y must directly follow x.
    static_assert(!USE_ALIGNED_ACCESS || ((alignof(Point64)+offsetof(Point64,x)) % 16 == 0),
       "align misconfiguration");
    static_assert(offsetof(Point64,x) + 8 == offsetof(Point64,y));

    // Linux x86_64 calling convention:
    // Arguments 1-6 are passed via registers RDI, RSI, RDX, RCX, R8, R9 respectively;
    // However, when inlining, there is no way to tell gcc to put hitpt into R8, 
    // so we will preserve it anyway
    int res;
    register int64_t l0p1x_l0p0x asm ("r11"), l1p0x_l0p0x asm ("r12"), l1p1x_l1p0x asm ("r13"); 
    register uint64_t denomlo asm("rdi"), denomhi asm("rsi");
    register uint64_t alphalo asm("r9"), alphahi asm("r10");
    register uint64_t mask asm ("r14"), tmp asm ("r15");

    __asm volatile (
        "### denom: %[denomhi]:%[denomlo] \n\t"
        "### alpha: %[alphahi]:%[alphalo] \n\t"
        // Load the four points (x in the low qword, y in the high qword) and park hitpt in r8.
        MOVQD "(%[l0p0]),%%xmm0\n\t" 
        MOVQD "(%[l0p1]),%%xmm1\n\t"
        MOVQD "(%[l1p0]),%%xmm2\n\t"
        MOVQD "(%[l1p1]),%%xmm3\n\t"
        "movq   %[hitpt],%%r8\n\t"
        "movdqa %%xmm2,%%xmm4\n\t"
        // NOTE(review): with AT&T operand order the first psubq yields l1p1 - l1p0 (xmm3)
        // and the second l0p1 - l0p0 (xmm1); the in-string annotations on those two lines
        // look swapped. Likewise the three movq lines below extract the low (x) halves,
        // although their annotations mention y.
        "psubq  %%xmm2,%%xmm3 # l0p1y - l0p0y : l0p1x - l0p0x\n\t" 
        "psubq  %%xmm0,%%xmm1 # l1p1y - l1p0y : l1p1x - l1p0x\n\t" 
        "psubq  %%xmm0,%%xmm2 # l1p0y - l0p0y : l1p0x - l0p0x\n\t" 
        "movq   %%xmm1,%[l0p1x_l0p0x] # l0p1y - l0p0y\n\t" 
        "movq   %%xmm2,%[l1p0x_l0p0x] # l1p0y - l0p0y\n\t" 
        "movq   %%xmm3,%[l1p1x_l1p0x] # l1p1y - l1p0y\n\t" 
        // Shift the y differences down into the low qwords of xmm1..xmm3.
        "psrldq  $8,%%xmm1 # xmm1 l0p1y - l0p0y\n\t" 
        "psrldq  $8,%%xmm2 # xmm2 l1p0y - l0p0y\n\t" 
        "psrldq  $8,%%xmm3 # xmm3 l1p1y - l1p0y\n\t" 

        "###   denom = imul128( l0p1x - l0p0x, l1p1y - l1p0y ) - imul128( l0p1y - l0p0y, l1p1x - l1p0x );\n\t"

        // 128-bit difference of two signed 64x64 products, kept in rdx:rax.
        "movq   %%xmm1,%%rax\n\t"
        "imulq  %[l1p1x_l1p0x]\n\t"
        "movq   %%rax,%%rbx\n\t"
        "movq   %%rdx,%%rcx\n\t"
        "movq   %%xmm3,%%rax\n\t"
        "imulq  %[l0p1x_l0p0x]\n\t"
        "subq   %%rbx,%%rax\n\t"
        "sbbq   %%rcx,%%rdx\n\t"
        "movq   %%rax,%[denomlo]\n\t"
        "movq   %%rdx,%[denomhi]\n\t"
        "### if (!denom) goto NoIntersection;\n\t"
        // The orq leaves rax == 0 when denom is zero, so the jump to L6 returns 0.
        "orq    %%rdx,%%rax\n\t"
        "jz     L6.%=\n\t"
        "### Label: %=\n\t" 
        "### rdx = mask\n\t"
        // Take |denom| and remember its sign mask (all ones if it was negative) in mask.
        "sarq   $63,%%rdx\n\t"
        "### denom = (denom ^ mask) - mask;\n\t"
        "xorq   %%rdx,%[denomlo]\n\t"
        "xorq   %%rdx,%[denomhi]\n\t"
        "subq   %%rdx,%[denomlo]\n\t"
        "sbbq   %%rdx,%[denomhi]\n\t"
        "movq   %%rdx,%[mask]\n\t"

        "###   alpha = imul128( l1p0x - l0p0x, l1p1y - l1p0y ) - imul128( l1p0y - l0p0y, l1p1x - l1p0x );\n\t"

        // Same product-difference pattern; |alpha| goes to alphahi:alphalo, its sign mask to tmp.
        "movq   %%xmm2,%%rax\n\t"
        "imulq  %[l1p1x_l1p0x]\n\t"
        "movq   %%rax,%%rbx\n\t"
        "movq   %%rdx,%%rcx\n\t"
        "movq   %%xmm3,%%rax\n\t"
        "imulq  %[l1p0x_l0p0x]\n\t"
        "subq   %%rbx,%%rax\n\t"
        "sbbq   %%rcx,%%rdx\n\t"
        "movq   %%rdx,%%rbx\n\t"
        "sarq   $63,%%rbx\n\t"
        "xorq   %%rbx,%%rax\n\t"
        "xorq   %%rbx,%%rdx\n\t"
        "subq   %%rbx,%%rax\n\t"
        "sbbq   %%rbx,%%rdx\n\t"
        "movq   %%rax,%[alphalo]\n\t"
        "movq   %%rdx,%[alphahi]\n\t"
        "movq   %%rbx,%[tmp]\n\t"

        "###   alpha2 = imul128( l1p0x_l0p0x, -(l0p1y - l0p0y) ) - imul128( -(l1p0y - l0p0y), l0p1x_l0p0x);\n\t"

        // Second candidate numerator, for the variant with the roles of the two lines swapped.
        "movq   %%xmm2,%%rax\n\t"
        "negq   %%rax\n\t"
        "imulq  %[l0p1x_l0p0x]\n\t"
        "movq   %%rax,%%rbx\n\t"
        "movq   %%rdx,%%rcx\n\t"
        "movq   %%xmm1,%%rax\n\t"
        "negq   %%rax\n\t"
        "imulq  %[l1p0x_l0p0x]\n\t"
        "subq   %%rbx,%%rax\n\t"
        "sbbq   %%rcx,%%rdx\n\t"
        "movq   %%rdx,%%rbx\n\t"
        "sarq   $63,%%rbx\n\t"
        "xorq   %%rbx,%%rax\n\t"
        "xorq   %%rbx,%%rdx\n\t"
        "subq   %%rbx,%%rax\n\t"
        "sbbq   %%rbx,%%rdx\n\t"
        // 128-bit compare of |alpha2| against |alpha|. NOTE(review): this appears to keep
        // the smaller magnitude; when alpha2 is chosen the base point and direction
        // (xmm0, xmm1, l0p1x_l0p0x) are switched to the other line - confirm.
        "cmpq   %[alphalo],%%rax\n\t"
        "movq   %%rdx,%%rcx\n\t"
        "sbbq   %[alphahi],%%rcx\n\t"
        "ja     1f\n\t"
        "not    %%rbx\n\t"
        "movq   %%rax,%[alphalo]\n\t"
        "movq   %%rdx,%[alphahi]\n\t"
        "movq   %%rbx,%[tmp]\n\t"
        "movq   %[l1p1x_l1p0x],%[l0p1x_l0p0x]\n\t"
        "movdqa %%xmm4,%%xmm0\n\t"
        "movdqa %%xmm3,%%xmm1\n\t"
    "1:\n\t"
        // mask now carries the combined sign of numerator and denominator.
        "xorq   %[tmp],%[mask]\n\t"

        // ecx = 1 + index of the highest set bit in either high word (0 if both are zero):
        // the right-shift needed to bring both 128-bit values down to 64 bits.
        "movl   $-1,%%eax\n\t"
        "bsrq   %[denomhi],%%rcx\n\t"
        "cmovzl %%eax,%%ecx\n\t"
        "bsrq   %[alphahi],%%rbx\n\t"
        "cmovzl %%eax,%%ebx\n\t"
        "cmpl   %%ebx,%%ecx\n\t"
        "cmovll %%ebx,%%ecx\n\t"
        "incl   %%ecx\n\t"
        "jmp    L3.%=\n\t"
        // L2: local subroutine (entered via call, left via ret). In: rax = signed direction
        // component, low qword of xmm0 = base coordinate. Computes
        // base + sign * ((|rax| * alphalo + denomhi) / denomlo); denomhi holds denomlo/2
        // at this point, i.e. the rounding bias is added before the division.
    "L2.%=:\n\t"   
        "cqo\n\t"
        "xorq   %%rdx,%%rax\n\t"
        "movq   %%rdx,%%rbx\n\t"
        "subq   %%rdx,%%rax\n\t"
        "xorq   %[mask],%%rbx\n\t"
        "mulq   %[alphalo]\n\t"
    	  "addq   %[denomhi],%%rax\n\t"
    	  "adcq   $0,%%rdx\n\t"
        // Guard divq: a high word >= the divisor would make the quotient overflow.
        "cmpq   %[denomlo],%%rdx\n\t"
        "jae    L5.%=\n\t"
        "divq   %[denomlo]\n\t"
        "testq  %%rax,%%rax\n\t"
        "js     L5.%=\n\t"
        "xorq   %%rbx,%%rax\n\t"
        "subq   %%rbx,%%rax\n\t"
        "movq   %%xmm0,%%rbx\n\t"
        "addq   %%rbx,%%rax\n\t"
        "jo     L5.%=\n\t"
        "ret\n\t"
        // L5: out-of-range exit of the subroutine - return +/-INT64_MAX (sign taken from
        // rbx) and downgrade the return code held in tmp to 1.
    "L5.%=:\n\t"
        "movq   $0x7FFFFFFFFFFFFFFF,%%rax\n\t"
        "movl   $1,%k[tmp]\n\t"
        "xorq   %%rbx,%%rax\n\t"
        "subq   %%rbx,%%rax\n\t"
        "ret\n\t"
        // L3: scale denom and alpha down by CL bits, adding the last shifted-out bit
        // (carry) back in as rounding; a denominator that becomes zero is forced to 1.
    "L3.%=:\n\t"
        "clc\n\t"
        "movq   $1,%%rax\n\t"
        "shrdq  %[denomhi],%[denomlo]\n\t" // if CL == 0 the shift leaves the flags untouched, so the clc above keeps the adcq from adding a stale carry
        "adcq   $0,%[denomlo]\n\t"
        "cmovzq %%rax,%[denomlo]\n\t"
        "clc\n\t"
        "shrdq  %[alphahi],%[alphalo]\n\t"
        "adcq   $0,%[alphalo]\n\t"
    "L4.%=:\n\t"
        // From here on denomhi is reused as the rounding bias denomlo/2.
        "movq   %[denomlo],%[denomhi]\n\t"
        "### hitpt[0] = l0p0x + idiv128_round( imul128( alphalow, l0p1x - l0p0x ), denomlow ); \n\t"
        "movq   %[l0p1x_l0p0x],%%rax\n\t"
        "shrq   $1,%[denomhi]\n\t"
        "movl   $2,%k[tmp]\n\t"   // prepare ret code
        "call   L2.%=\n\t"
        "movq   %%rax,(%%r8)\n\t"
        // Bring the base y coordinate into the low qword of xmm0 for the second call.
        "psrldq  $8,%%xmm0\n\t"
        "### hitpt[1] = l0p0y + idiv128_round( imul128( alphalow, l0p1y - l0p0y ), denomlow ); \n\t"
        "movq   %%xmm1,%%rax\n\t"
        "call   L2.%=\n\t"
        "movq   %%rax,8(%%r8)\n\t"
        "movl   %k[tmp],%%eax\n\t"
    "L6.%=:\n\t"
        : "=a"(res),
          [l0p1x_l0p0x]"=q"(l0p1x_l0p0x), [l1p1x_l1p0x]"=q"(l1p1x_l1p0x), [l1p0x_l0p0x]"=q"(l1p0x_l0p0x),
          [denomlo]"=D"(denomlo),[denomhi]"=S"(denomhi),   
          [alphalo]"=q"(alphalo),[alphahi]"=q"(alphahi),
          [mask]"=q"(mask),[tmp]"=q"(tmp)
        : [l0p0]"D"(&l0p0), [l0p1]"S"(&l0p1), [l1p0]"d"(&l1p0), [l1p1]"c"(&l1p1), [hitpt]"q"(&hitpt)

        : "rbx", "r8", "xmm0","xmm1","xmm2","xmm3", "cc", "memory"
    );

    return res;
}

27 replies

AngusJohnson Nov 26, 2022
Maintainer

If that 12% becomes 20% slower, it doesn't sound too catastrophic; global performance would be reduced by 2%, seems like a fair trade for perfect accuracy.

ISTM, that for the vast majority of users, their precision requirements (ie coordinate ranges) would be below 1.0e⁸ where precision is currently perfectly accurate (and I suspect perfectly accurate to 1.0e¹²).

alexisnaveros Nov 26, 2022

I'm curious, and I apologize, is it rather a resistance to the idea of "contaminating" the main code with a bunch of inline assembly, compiler-specific stuff, SIMD intrinsics, arcane infinite precision math, Shewchuk summation locally disabling some compiler optimization, and so on? If so, I perfectly get it. 😁

Yeah, I really do. It's not pretty, especially if you aren't fully comfortable with that stuff, but you know it's a bunch of extra code that you'll have to maintain, and dig into if something doesn't work right. That's not fun.

I try to always write a plain C/C++ fallback to whatever SIMD or other arcane stuff I do, but this __int128 stuff is tricky to write a decent plain fallback for.

AngusJohnson Nov 27, 2022
Maintainer

I'm curious, and I apologize, is it rather a resistance to the idea of "contaminating" the main code with a bunch of inline assembly, compiler-specific stuff, SIMD intrinsics, arcane infinite precision math, Shewchuk summation locally disabling some compiler optimization, and so on? If so, I perfectly get it. 😁

No need to apologise 😁.
There are multiple reasons:

I don't fully understand the code and it's complicated and it's not pretty as you say 😜.
it can't easily be translated into Delphi & C#
it's not both 32bit and 64bit, nor likely fully compiler agnostic.

rs0xFFFF Nov 27, 2022

A compiled C high precision/speed-optimized code from an OBJ file is also popular with Delphi users. ;-)

sergey-239 Nov 27, 2022
Author

A compiled C high precision/speed-optimized code from an OBJ file is also popular with Delphi users. ;-)

I suspected that ;)

I do remember that even Borland did a lot of stuff in assembly for their different libraries, like GDI or their text-mode windowing library (I can't recall its name; I worked a lot with Turbo Pascal and C at the end of the eighties and in the first half of the nineties), though their compilers were quite good at optimisation.

alexisnaveros · 2022-11-24T06:07:30Z

alexisnaveros
Nov 24, 2022

Okay, Gizmo8 it is.

Sorry for the mess of #if to compile with options and experimental settings. That needs to be cleaned up and split into multiple specialized functions.

Enabling MATH_LINELINE_OUTSIDE_ACCURATE makes non-intersecting lines much more accurate. It can also be a little more accurate than Gizmo7 (intersecting lines or not) with MATH_LINELINE_SHIFT_ROUND. Integer overflow can now be caught.

Enabling the gizmos and goodies is about 25% slower. If they are all disabled, it's only 17% slower than inaccurate _A for typical input.

/*
 * Build-time switches for the line/line intersection routine.
 * Each switch keeps its previous default but may now be predefined by the build
 * (for example -DMATH_LINELINE_SHIFT_ROUND=0) instead of requiring an edit here.
 */

/* Return zero if the intersection point is outside the range of the two line segments */
#ifndef MATH_LINELINE_RETZERO_IF_OUTSIDE
 #define MATH_LINELINE_RETZERO_IF_OUTSIDE (0)
#endif

/* Obtain more accurate results even when the intersection point is outside the range of the two line segments */
#ifndef MATH_LINELINE_OUTSIDE_ACCURATE
 #define MATH_LINELINE_OUTSIDE_ACCURATE (1)
#endif

/* Prevent idiv overflow (SIGFPE) if the hit point between the two vectors is out of int64_t range */
/* Unnecessary if you know the segments actually do intersect, or almost so */
#ifndef MATH_LINELINE_PREVENT_SIGFPE
 #define MATH_LINELINE_PREVENT_SIGFPE (1)
#endif

/* Detect add/mul overflow while building the intersection point of VertexLineLineIntersection_* */
/* Return 0 on overflow, the hit point is out of int64_t's numerical range */
/* Mostly free on GCC/clang/ICC (compiler support for hardware flags) ~ fallback path is okay */
/* Unnecessary if MATH_LINELINE_RETZERO_IF_OUTSIDE is enabled */
#ifndef MATH_LINELINE_DETECT_OVERFLOW
 #define MATH_LINELINE_DETECT_OVERFLOW (1)
#endif

/* When right shifting bits out due to operands exceeding 64 bits, perform rounding based on shifted out bits */
/* This is very slightly more accurate */
#ifndef MATH_LINELINE_SHIFT_ROUND
 #define MATH_LINELINE_SHIFT_ROUND (1)
#endif

/* The routines below are written in terms of __int128; warn early when the compiler lacks it */
#if !defined(__SIZEOF_INT128__)
 #warning No support for int128 math, we should do something about that, fallback and stuff
#endif

/* Compiler portability helpers:
 *   MATH_LIKELY / MATH_UNLIKELY - branch-probability hints (plain pass-through when unsupported)
 *   MATH_ALWAYSINLINE           - force-inline attribute (empty when unsupported) */
#if defined(__clang__) || defined(__INTEL_COMPILER) || defined(__GNUC__)
 #define MATH_LIKELY(x) __builtin_expect(!!(x), 1)
 #define MATH_UNLIKELY(x) __builtin_expect(!!(x), 0)
 #define MATH_ALWAYSINLINE __attribute__((always_inline))
#else
 /* No __builtin_expect: the hints degrade to the bare expression */
 #define MATH_LIKELY(x) (x)
 #define MATH_UNLIKELY(x) (x)
 #if defined(_MSC_VER)
  #define MATH_ALWAYSINLINE __forceinline
 #else
  #define MATH_ALWAYSINLINE
 #endif
#endif

/* Signed 64-bit addition with overflow detection.
 * Always stores the (wrapped) sum in *dst; returns 1 when the sum is exact, 0 when it overflowed. */
MATH_ALWAYSINLINE static inline int mathOverflow_Add64s( int64_t *dst, int64_t src0, int64_t src1 )
{
#if __GNUC__
  long long int total;
  const int overflowed = __builtin_saddll_overflow( src0, src1, &total );
  *dst = (int64_t)total;
  return overflowed ? 0 : 1;
#else
  /* Add as unsigned: signed overflow is undefined in C and the check could be optimized away */
  const uint64_t lhs = (uint64_t)src0;
  const uint64_t rhs = (uint64_t)src1;
  const uint64_t total = lhs + rhs;
  *dst = (int64_t)total;
  /* No overflow iff the operands differ in sign, or the sum keeps the sign of the first operand */
  return (int)( ( ( lhs ^ rhs ) | ~( lhs ^ total ) ) >> 63 );
#endif
}

/*
 * Signed 128-by-64 division returning the 64-bit quotient.
 * The caller must ensure denom != 0 and that the quotient fits in int64_t; on
 * x86-64 the idivq instruction faults (SIGFPE) otherwise.
 */
MATH_ALWAYSINLINE static inline int64_t math128_idiv( __int128 num, int64_t denom )
{
  int64_t res;
#if __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) )
  int64_t rem;
  /* idivq leaves the quotient in rax and the remainder in rdx. rdx has to be
   * declared as an output even though the remainder is unused here: with only
   * the "A" input the compiler is free to assume rdx still holds the upper
   * half of num after the instruction. */
  __asm__(
  "idivq %2\n\t"
  : "=a"(res), "=d"(rem) : "rm" (denom), "A" (num) );
  (void)rem;
#else
  res = (int64_t)( num / denom );
#endif
  return res;
}

/* Signed 128-by-64 division: returns the quotient and stores the remainder in *retrem.
 * The quotient must fit in int64_t and denom must be non-zero. */
MATH_ALWAYSINLINE static inline int64_t math128_idivmod( __int128 num, int64_t denom, int64_t *retrem )
{
#if __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) )
  int64_t quotient, remainder;
  /* One idivq yields both results: quotient in rax, remainder in rdx */
  __asm__(
  "idivq %[divisor]\n\t"
  : "=a"(quotient), "=d"(remainder)
  : [divisor] "rm" (denom), "A" (num) );
  *retrem = remainder;
  return quotient;
#else
  const int64_t quotient = num / denom;
  *retrem = num % denom;
  return quotient;
#endif
}

/*
 * Signed 128-by-64 division rounded to the nearest integer (ties away from zero).
 * denom must be > 0 for proper rounding behavior, and the quotient must fit in int64_t.
 */
MATH_ALWAYSINLINE static inline int64_t math128_idivRound( __int128 num, int64_t denom )
{
  int64_t res, rem, half;
#if __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) )
  __asm__(
  "idivq %2\n\t"
  : "=a"(res), "=d"(rem) : "rm" (denom), "A" (num) );
#else
  res = (int64_t)( num / denom );
  rem = (int64_t)( num % denom );
#endif
  /* ceil(denom/2), formed without denom+1: that sum overflows (undefined behavior, and
   * in practice a negative threshold that always rounds) when denom == INT64_MAX */
  half = ( denom >> 1 ) + ( denom & 1 );
  /* The remainder carries the sign of num, so adjust away from zero on either side */
  res += ( rem >= half );
  res -= ( rem <= -half );
  return res;
}

/* Full signed 64x64 -> 128-bit multiplication */
MATH_ALWAYSINLINE static inline  __int128 math128_imul( int64_t a, int64_t b )
{
#if __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) )
  __int128 product;
  /* One-operand imulq: rax * operand -> rdx:rax */
  __asm__(
  "imulq %[rhs]\n\t"
  : "=A"(product)
  : "a" (a), [rhs] "rm" (b) );
  return product;
#else
  return (__int128)a * (__int128)b;
#endif
}

/* Upper 64 bits of a 128-bit integer, as int64_t */
/* GCC is smart enough to grab the higher half of the __int128 register pair, without any store/load/shift/whatever */
/* Can all compilers do that? Feel free to tweak as necessary for your compiler to emit good code */
/* The expansion is fully parenthesized so it behaves as a single operand (e.g. under sizeof or next to other operators) */
#if 1
 #define MATH_INT128_GET_HIGH64(x) ((int64_t)((x)>>64))
#else
 /* Reads the second 64-bit word in memory: only valid for an lvalue argument on a little-endian target */
 #define MATH_INT128_GET_HIGH64(x) ((int64_t)(*((int64_t *)(&(x))+1)))
#endif

/* Bit-width limit below which the intersection is evaluated in double precision */
#if !MATH_LINELINE_OUTSIDE_ACCURATE
 /* Accuracy for non-intersecting edges is not required: */
 /* 50 bits give us a maximum error of 0.125 for intersecting edges */
 #define MATH_LINELINE_FLOATBITS_THRESHOLD (50)
#else
 /* Accuracy for non-intersecting edges is required: */
 /* 25 bits give us a maximum error of 0.125 for all edges, intersecting or not */
 #define MATH_LINELINE_FLOATBITS_THRESHOLD (25)
#endif

/*
 * Intersection of the line through (l0p0x,l0p0y)-(l0p1x,l0p1y) with the line through
 * (l1p0x,l1p0y)-(l1p1x,l1p1y), computed as l0p0 + (l0p1 - l0p0) * alpha / denom with
 * 128-bit intermediates and a rounded division.
 *
 * On success the point is stored in hitpt[0] (x) and hitpt[1] (y) and 1 is returned.
 * 0 is returned when denom is zero (parallel lines) and when one of the enabled guards
 * trips (MATH_LINELINE_RETZERO_IF_OUTSIDE, MATH_LINELINE_PREVENT_SIGFPE,
 * MATH_LINELINE_DETECT_OVERFLOW). On a 0 return hitpt may already have been partially
 * written, since the overflow-checked additions store before they report.
 *
 * Uses fmin(), fmax() and nearbyint() on the double-precision fast path.
 */
static inline int mathVertexLineLineIntersection_Gizmo8( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int shift;
  int64_t xormask, highmask, denomlow, alphalow, alphahigh, mulx, muly;
  double fdet;
  __int128 denom;
  __int128 alpha;
  __int128 hitx, hity;
#if MATH_LINELINE_PREVENT_SIGFPE
  __int128 mhigh;
#endif
#if MATH_LINELINE_DETECT_OVERFLOW
  int64_t offx, offy;
  __int128 off128x, off128y;
  double fmx, fmy;
#endif
#if MATH_LINELINE_SHIFT_ROUND
 int64_t shiftrem;
#endif
  /* Direction of line 0; the hit point is expressed as l0p0 + (mulx,muly) * alpha/denom */
  mulx = l0p1x - l0p0x;
  muly = l0p1y - l0p0y;
  /* Cross product of the two direction vectors */
  denom = math128_imul( mulx, l1p1y - l1p0y ) - math128_imul( muly, l1p1x - l1p0x );
  if( MATH_UNLIKELY( denom == 0 ) )
  {
    // segments lie on parallel lines
    // special case with segments on the same line -- may be considered if necessary
    return 0;
  }
  else
  {
    alpha = math128_imul( l1p0x - l0p0x, l1p1y - l1p0y ) - math128_imul( l1p0y - l0p0y, l1p1x - l1p0x );
    /* Make denom non-negative and flip alpha with it, so the ratio is unchanged */
    xormask = MATH_INT128_GET_HIGH64( denom ) >> 63;
    alpha = ( alpha ^ xormask ) - xormask;
    denom = ( denom ^ xormask ) - xormask;
#if MATH_LINELINE_RETZERO_IF_OUTSIDE
    __int128 beta;
    /* The unsigned compare rejects negative alpha/beta as well as values above denom */
    if( (unsigned __int128)alpha > denom )
      return 0;
    beta  = math128_imul( l1p0x - l0p0x, muly ) - math128_imul( l1p0y - l0p0y, mulx );
    beta = ( beta ^ xormask ) - xormask;
    if( (unsigned __int128)beta > denom )
      return 0;
#endif

#if MATH_LINELINE_OUTSIDE_ACCURATE && !MATH_LINELINE_RETZERO_IF_OUTSIDE
    /* Reduce denom to significant 63 bits ~ alpha might still have more bits */
    highmask = MATH_INT128_GET_HIGH64( denom );
    if( highmask )
    {
      shift = 65 - __builtin_clzll( highmask );
 #if MATH_LINELINE_SHIFT_ROUND
      /* Round with the last bit shifted out, unless that would overflow 63 bits */
      shiftrem = ( (int64_t)denom >> (shift-1) ) & 0x1;
      denom >>= shift;
      denomlow = (int64_t)denom;
      if( MATH_LIKELY( denomlow != 0x7fffffffffffffff ) )
        denomlow += shiftrem;
 #else
      denom >>= shift;
      denomlow = (int64_t)denom;
 #endif
    }
    else
    {
      denomlow = (int64_t)denom;
      alphalow = (int64_t)alpha;
      /* denom fits in 64 bits; a one-bit shift is still needed when bit 63 is set */
      shift = (uint64_t)denomlow >> 63;
      if( !shift )
        goto shiftzero;
 #if MATH_LINELINE_SHIFT_ROUND
      shiftrem = denomlow & 0x1;
      denomlow = (int64_t)( (uint64_t)denomlow >> shift );
      if( MATH_LIKELY( denomlow != 0x7fffffffffffffff ) )
        denomlow += shiftrem;
 #else
      denomlow = (int64_t)( (uint64_t)denomlow >> shift );
 #endif
    }

    /* Apply the same shift to alpha so the ratio alpha/denom is preserved */
 #if MATH_LINELINE_SHIFT_ROUND
    shiftrem = ( (int64_t)alpha >> (shift-1) ) & 0x1;
    alpha >>= shift;
    alphalow = (int64_t)alpha;
    if( MATH_LIKELY( alphalow != 0x7fffffffffffffff ) )
      alphalow += shiftrem;
    shiftzero:
 #else
    alpha >>= shift;
    alphalow = (int64_t)alpha;
    shiftzero:
 #endif

    /* If alpha has more than 63 significant bits, reduce */
    alphahigh = MATH_INT128_GET_HIGH64( alpha );
    xormask = alphahigh >> 63;
    if( ( alphahigh ^ xormask ) | ( ( alphalow ^ xormask ) >> 63 ) )
    {
      int64_t c, r;
 #if MATH_LINELINE_PREVENT_SIGFPE
      if( MATH_UNLIKELY( (uint64_t)( ( alphahigh ^ xormask ) << 1 ) >= (uint64_t)denomlow ) )
        return 0;
 #endif
      /* Split alpha = c*denomlow + r; the whole multiples c are folded into the base point */
      c = math128_idivmod( alpha, denomlow, &r );
 #if MATH_LINELINE_DETECT_OVERFLOW
      off128x = (__int128)l0p0x + ( (__int128)c * (__int128)mulx );
      off128y = (__int128)l0p0y + ( (__int128)c * (__int128)muly );
      if( MATH_UNLIKELY( off128x < INT64_MIN ) )
        return 0;
      if( MATH_UNLIKELY( off128y < INT64_MIN ) )
        return 0;
      if( MATH_UNLIKELY( off128x > INT64_MAX ) )
        return 0;
      if( MATH_UNLIKELY( off128y > INT64_MAX ) )
        return 0;
      l0p0x = (int64_t)off128x;
      l0p0y = (int64_t)off128y;
 #else
      l0p0x += c * mulx;
      l0p0y += c * muly;
 #endif
      alphalow = (int64_t)r;
    }

    /* Denomlow and alphalow both fit in 63 bits signed */
    highmask = denomlow | ( alphalow ^ ( alphalow >> 63 ) );
    if( !( ( highmask | ( ( mulx >> 63 ) ^ mulx ) | ( ( muly >> 63 ) ^ muly ) ) & ~(((int64_t)1<<MATH_LINELINE_FLOATBITS_THRESHOLD)-1) ) )
    {
      /* All operands are low, switch to double-based math ~ max error is 0.125 */
      fdet = (double)( alphalow ) / (double)denomlow;
 #if MATH_LINELINE_DETECT_OVERFLOW
      fmx = (double)mulx * fdet;
      fmy = (double)muly * fdet;
      if( MATH_UNLIKELY( fmin( fmx, fmy ) < (double)INT64_MIN ) )
        return 0;
      if( MATH_UNLIKELY( fmax( fmx, fmy ) > (double)INT64_MAX ) )
        return 0;
      offx = (int64_t)nearbyint( fmx );
      offy = (int64_t)nearbyint( fmy );
      if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[0], l0p0x, offx ) ) )
        return 0;
      if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[1], l0p0y, offy ) ) )
        return 0;
 #else
      hitpt[0] = l0p0x + (int64_t)nearbyint( (double)mulx * fdet );
      hitpt[1] = l0p0y + (int64_t)nearbyint( (double)muly * fdet );
 #endif
      return 1;
    }
#else
    /* Variant without the extra reduction step: alpha and denom are shifted together */
 #if MATH_LINELINE_RETZERO_IF_OUTSIDE
    highmask = MATH_INT128_GET_HIGH64( denom );
 #else
    highmask = MATH_INT128_GET_HIGH64( alpha );
    highmask = ( highmask ^ ( highmask >> 63 ) ) | MATH_INT128_GET_HIGH64( denom );
 #endif
    if( highmask )
    {
      shift = 65 - __builtin_clzll( highmask );
 #if MATH_LINELINE_SHIFT_ROUND
      shiftrem = ( (int64_t)denom >> (shift-1) ) & 0x1;
      denom >>= shift;
      denomlow = (int64_t)denom;
      if( MATH_LIKELY( denomlow != 0x7fffffffffffffff ) )
        denomlow += shiftrem;
 #else
      denom >>= shift;
      denomlow = (int64_t)denom;
 #endif
 #if !MATH_LINELINE_RETZERO_IF_OUTSIDE
      /* The shift was sized by alpha as well, so denom may have been shifted away entirely */
      if( !denomlow )
        return 0;
 #endif
    }
    else
    {
      denomlow = (int64_t)denom;
      highmask = denomlow | ( MATH_INT128_GET_HIGH64( alpha ) ^ (int64_t)alpha );
      if( !( ( highmask | ( ( mulx >> 63 ) ^ mulx ) | ( ( muly >> 63 ) ^ muly ) ) & ~(((int64_t)1<<MATH_LINELINE_FLOATBITS_THRESHOLD)-1) ) )
      {
        /* All operands are low, switch to double-based math ~ max error is 0.125 */
        fdet = (double)( (int64_t)alpha ) / (double)denomlow;
 #if MATH_LINELINE_DETECT_OVERFLOW
        fmx = (double)mulx * fdet;
        fmy = (double)muly * fdet;
        if( MATH_UNLIKELY( fmin( fmx, fmy ) < (double)INT64_MIN ) )
          return 0;
        if( MATH_UNLIKELY( fmax( fmx, fmy ) > (double)INT64_MAX ) )
          return 0;
        offx = (int64_t)nearbyint( fmx );
        offy = (int64_t)nearbyint( fmy );
        if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[0], l0p0x, offx ) ) )
          return 0;
        if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[1], l0p0y, offy ) ) )
          return 0;
 #else
        hitpt[0] = l0p0x + (int64_t)nearbyint( (double)mulx * fdet );
        hitpt[1] = l0p0y + (int64_t)nearbyint( (double)muly * fdet );
 #endif
        return 1;
      }

      shift = (uint64_t)highmask >> 63;
      alphalow = (int64_t)alpha;
      if( !shift )
        goto shiftzero;
 #if MATH_LINELINE_SHIFT_ROUND
      shiftrem = denomlow & 0x1;
      denomlow = (int64_t)( (uint64_t)denomlow >> shift );
      if( MATH_LIKELY( denomlow != 0x7fffffffffffffff ) )
        denomlow += shiftrem;
 #else
      denomlow = (int64_t)( (uint64_t)denomlow >> shift );
 #endif
    }
 #if MATH_LINELINE_SHIFT_ROUND
    shiftrem = ( (int64_t)alpha >> (shift-1) ) & 0x1;
    alpha >>= shift;
    alphalow = (int64_t)alpha;
    if( MATH_LIKELY( alphalow != 0x7fffffffffffffff ) )
      alphalow += shiftrem;
    shiftzero:
 #else
    alpha >>= shift;
    alphalow = (int64_t)alpha;
    shiftzero:
 #endif
#endif

    /* Exact path: hit = l0p0 + round( alphalow * mul / denomlow ), one axis at a time */
    hitx = math128_imul( alphalow, mulx );
#if MATH_LINELINE_PREVENT_SIGFPE
    /* Bail out before idiv when the quotient would not fit in 64 bits */
    mhigh = MATH_INT128_GET_HIGH64( hitx );
    if( (uint64_t)( ( mhigh ^ ( mhigh >> 63 ) ) << 1 ) >= (uint64_t)denomlow )
      return 0;
#endif
#if MATH_LINELINE_DETECT_OVERFLOW
    if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[0], l0p0x, math128_idivRound( hitx, denomlow ) ) ) )
      return 0;
#else
    hitpt[0] = l0p0x + math128_idivRound( hitx, denomlow );
#endif

    hity = math128_imul( alphalow, muly );
#if MATH_LINELINE_PREVENT_SIGFPE
    mhigh = MATH_INT128_GET_HIGH64( hity );
    if( (uint64_t)( ( mhigh ^ ( mhigh >> 63 ) ) << 1 ) >= (uint64_t)denomlow )
      return 0;
#endif
#if MATH_LINELINE_DETECT_OVERFLOW
    if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[1], l0p0y, math128_idivRound( hity, denomlow ) ) ) )
      return 0;
#else
    hitpt[1] = l0p0y + math128_idivRound( hity, denomlow );
#endif
  }
  return 1;
}

Yeah, that needs a severe clean up.

0 replies

alexisnaveros · 2022-11-24T22:49:19Z

alexisnaveros
Nov 24, 2022

Disregard Gizmo8 above, here's Gizmo9.

Inspired by @sergey-239 's SM1, I switched to unsigned math, and that gives us a whole extra bit of numerical accuracy. For an input range of [-INT64_MAX/2,INT64_MAX/2], the maximum error is now 0.8 for intersecting edges if MATH_LINELINE_SHIFT_ROUND is enabled (otherwise, it's 1.0). The mess of #if remains though.

Performance may be 5% lower than Gizmo8 in some cases.

/*
 * Build-time switches for the line/line intersection routine.
 * Each switch keeps its previous default but may now be predefined by the build
 * (for example -DMATH_LINELINE_SHIFT_ROUND=0) instead of requiring an edit here.
 */

/* Return zero if the intersection point is outside the range of the two line segments */
#ifndef MATH_LINELINE_RETZERO_IF_OUTSIDE
 #define MATH_LINELINE_RETZERO_IF_OUTSIDE (0)
#endif

/* Obtain more accurate results even when the intersection point is outside the range of the two line segments */
#ifndef MATH_LINELINE_OUTSIDE_ACCURATE
 #define MATH_LINELINE_OUTSIDE_ACCURATE (1)
#endif

/* Prevent div/idiv overflow (SIGFPE) if the hit point between the two vectors is out of int64_t range */
/* Unnecessary if you know the segments actually do intersect, or almost so */
#ifndef MATH_LINELINE_PREVENT_SIGFPE
 #define MATH_LINELINE_PREVENT_SIGFPE (1)
#endif

/* Detect add/mul overflow while building the intersection point of VertexLineLineIntersection_* */
/* Return 0 on overflow, the hit point is out of int64_t's numerical range */
/* Mostly free on GCC/clang/ICC (compiler support for hardware flags) ~ fallback path is okay */
/* Unnecessary if MATH_LINELINE_RETZERO_IF_OUTSIDE is enabled */
#ifndef MATH_LINELINE_DETECT_OVERFLOW
 #define MATH_LINELINE_DETECT_OVERFLOW (1)
#endif

/* When right shifting bits out due to operands exceeding 64 bits, perform rounding based on shifted out bits */
/* This is very slightly more accurate */
#ifndef MATH_LINELINE_SHIFT_ROUND
 #define MATH_LINELINE_SHIFT_ROUND (1)
#endif

/* The routines below are written in terms of __int128; warn early when the compiler lacks it */
#if !defined(__SIZEOF_INT128__)
 #warning No support for int128 math, we should do something about that, fallback and stuff
#endif

/* Branch hints and forced inlining, selected per compiler family */
#if !( defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) )
 /* No __builtin_expect available: hints are transparent */
 #define MATH_LIKELY(x) (x)
 #define MATH_UNLIKELY(x) (x)
 #if !defined(_MSC_VER)
  #define MATH_ALWAYSINLINE
 #else
  #define MATH_ALWAYSINLINE __forceinline
 #endif
#else
 #define MATH_LIKELY(x) __builtin_expect(!!(x), 1)
 #define MATH_UNLIKELY(x) __builtin_expect(!!(x), 0)
 #define MATH_ALWAYSINLINE __attribute__((always_inline))
#endif

/* Overflow-checked signed 64-bit add.
 * *dst always receives the wrapped sum; the return value is non-zero on success and zero on overflow. */
MATH_ALWAYSINLINE static inline int mathOverflow_Add64s( int64_t *dst, int64_t src0, int64_t src1 )
{
#if __GNUC__
  long long int wide;
  if( __builtin_saddll_overflow( src0, src1, &wide ) )
  {
    *dst = (int64_t)wide;
    return 0;
  }
  *dst = (int64_t)wide;
  return 1;
#else
  /* Unsigned arithmetic on purpose ~ signed overflow is undefined and may be optimized away */
  const uint64_t ua = (uint64_t)src0;
  const uint64_t ub = (uint64_t)src1;
  const uint64_t usum = ua + ub;
  /* Overflow bit: operands agree in sign while the sum disagrees with them */
  const uint64_t overflowbit = ( ~( ua ^ ub ) & ( ua ^ usum ) ) >> 63;
  *dst = (int64_t)usum;
  return overflowbit == 0;
#endif
}

/* Overflow-checked signed 64-bit multiply.
 * *dst always receives the low 64 bits of the product; returns non-zero on success, zero on overflow. */
MATH_ALWAYSINLINE static inline int mathOverflow_Mul64s( int64_t *dst, int64_t src0, int64_t src1 )
{
#if __GNUC__ && 0
  /* Builtin variant, deliberately disabled: the 128-bit check below was observed to be faster */
  long long int narrow;
  const int ok = !__builtin_smulll_overflow( src0, src1, &narrow );
  *dst = (int64_t)narrow;
  return ok;
#else
  /* Why is it faster to check the top 65 bits of the product, rather than call __builtin_smulll_overflow()? This is weird */
  /* GCC emits the proper imulq/setno sequence, I guess Ryzen chips don't like using flags after an imulq */
  const __int128 wide = (__int128)src0 * (__int128)src1;
  const int64_t low = (int64_t)wide;
  const int64_t high = (int64_t)( wide >> 64 );
  *dst = low;
  /* The product fits iff the upper word is nothing but the sign extension of the lower word */
  return ( high == ( low >> 63 ) );
#endif
}

/* Unsigned 128-by-64 division: returns the quotient and stores the remainder in *retrem.
 * The quotient must fit in 64 bits and denom must be non-zero. */
MATH_ALWAYSINLINE static inline uint64_t math128_divmod( unsigned __int128 num, uint64_t denom, uint64_t *retrem )
{
#if __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) )
  uint64_t quot, leftover;
  /* divq: rdx:rax / operand -> quotient in rax, remainder in rdx */
  __asm__(
  "divq %[divisor]\n\t"
  : "=a"(quot), "=d"(leftover)
  : [divisor] "rm" (denom), "A" (num) );
  *retrem = leftover;
  return quot;
#else
  const uint64_t quot = num / denom;
  *retrem = num % denom;
  return quot;
#endif
}

/*
 * Unsigned 128-by-64 division rounded to the nearest integer (ties round up).
 * The quotient must fit in 64 bits and denom must be non-zero.
 */
MATH_ALWAYSINLINE static inline uint64_t math128_divRound( unsigned __int128 num, uint64_t denom )
{
  uint64_t res;
#if __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) )
  uint64_t rem;
  /* On x86, it's faster to use the remainder than adjust the numerator */
  /* Not touching the numerator allows the CPU to schedule the divq instruction earlier */
  __asm__(
  "divq %2\n\t"
  : "=a"(res), "=d"(rem) : "rm" (denom), "A" (num) );
  /* Round up when rem >= ceil(denom/2). The threshold is formed without denom+1,
   * which wraps to 0 for denom == UINT64_MAX and would then round up unconditionally */
  res += ( rem >= ( ( denom >> 1 ) + ( denom & 1 ) ) );
#else
  /* Friendlier to ARM, AARCH64 */
  num += denom >> 1;
  res = (uint64_t)( num / denom );
#endif
  return res;
}

/* Signed widening multiply: 64 x 64 -> 128 bits */
MATH_ALWAYSINLINE static inline  __int128 math128_imul( int64_t a, int64_t b )
{
#if !( __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) ) )
  return (__int128)a * (__int128)b;
#else
  __int128 wide;
  /* imulq with a single operand multiplies it by rax and writes rdx:rax */
  __asm__(
  "imulq %[factor]\n\t"
  : "=A"(wide)
  : "a" (a), [factor] "rm" (b) );
  return wide;
#endif
}

/* Unsigned widening multiply: 64 x 64 -> 128 bits */
MATH_ALWAYSINLINE static inline unsigned __int128 math128_mul( uint64_t a, uint64_t b )
{
#if __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) )
  unsigned __int128 wide;
  /* mulq multiplies its operand by rax and writes the 128-bit product to rdx:rax */
  __asm__(
  "mulq %[factor]\n\t"
  : "=A"(wide)
  : "a" (a), [factor] "rm" (b) );
  return wide;
#else
  return (unsigned __int128)a * (unsigned __int128)b;
#endif
}

/*
 * Shift a 128-bit unsigned value right by `shift` bits and return the low 64 bits of
 * the result, rounded with the last bit shifted out.
 * shift is expected to be in 1..63: with a zero count the x86 shift leaves the carry
 * flag untouched, and the portable path evaluates shift-1.
 *
 * The rounding increment saturates at UINT64_MAX instead of wrapping to 0: the result
 * is used as a divisor, and a wrapped 0 would turn an all-ones value into a division
 * by zero.
 *
 * NOTE(review): the x86-64 path rounds regardless of MATH_LINELINE_SHIFT_ROUND - confirm
 * that this is intended.
 */
MATH_ALWAYSINLINE static inline uint64_t math128_shrdRoundLow64( unsigned __int128 src, int shift )
{
  uint64_t res;
#if __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) )
  /* shrdq leaves the last shifted-out bit in CF; adcq adds it as the rounding increment.
   * If that increment wraps the register to 0, adcq sets CF again and sbbq takes the
   * 1 back off, which lands on all ones; otherwise CF is clear and sbbq is a no-op. */
  __asm__(
  "shrdq %2, %0\n"
  "adcq $0, %0\n"
  "sbbq $0, %0\n"
  : "=r"(res) : "0" ((uint64_t)src), "r" ((uint64_t)(src>>64)), "c" (shift) : "cc" );
#elif MATH_LINELINE_SHIFT_ROUND
  res = (uint64_t)( src >> shift );
  /* Same saturating rounding as the x86-64 path */
  if( res != ~(uint64_t)0 )
    res += ( (uint64_t)src >> (shift-1) ) & 0x1;
#else
  res = (uint64_t)( src >> shift );
#endif
  return res;
}

/*
 * Shift a 128-bit unsigned value right by `shift` bits and return the full 128-bit
 * result, rounded with the last bit shifted out of the low word.
 *
 * NOTE(review): `high>>shift` and `shift - 1` presumably require 1 <= shift <= 63 -
 * verify against the callers.
 * NOTE(review): the x86-64 path rounds regardless of MATH_LINELINE_SHIFT_ROUND, while
 * the portable paths honour it - confirm that this is intended.
 */
MATH_ALWAYSINLINE static inline unsigned __int128 math128_shrdRound( unsigned __int128 src, int shift )
{
  unsigned __int128 res;
#if __GNUC__ && ( defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) )
  uint64_t low, high;
  low = (uint64_t)src;
  high = (uint64_t)( src >> 64 );
  /* Operand roles: %0 = low word (in/out), %1 = high word, preloaded with high>>shift,
   * %3 = the unshifted high word that shrdq feeds into the top of the low word.
   * The first adcq adds the shifted-out bit to the low word, the second propagates a
   * resulting carry into the high word. */
  __asm__(
  "shrdq %3, %0\n"
  "adcq $0, %0\n"
  "adcq $0, %1\n"
  : "=&r"(low), "=r"(high) : "0" (low), "r" (high), "1" (high>>shift), "c" (shift) );
  res = (unsigned __int128)low | ( (unsigned __int128)high << 64 );
#elif MATH_LINELINE_SHIFT_ROUND
  /* Add half of the final unit before shifting. NOTE(review): the bias is built from an
   * int64_t 1, so shift - 1 must stay below 63 - confirm the callers' range. */
  res = ( src + ( (int64_t)1 << ( shift - 1 ) ) ) >> shift;
#else
  res = src >> shift;
#endif
  return res;
}

/* Upper 64 bits of a 128-bit integer, as int64_t */
/* GCC is smart enough to grab the higher half of the __int128 register pair, without any store/load/shift/whatever */
/* Can all compilers do that? Feel free to tweak as necessary for your compiler to emit good code */
/* The expansion is fully parenthesized so it behaves as a single operand (e.g. under sizeof or next to other operators) */
#if 1
 #define MATH_INT128_GET_HIGH64(x) ((int64_t)((x)>>64))
#else
 /* Reads the second 64-bit word in memory: only valid for an lvalue argument on a little-endian target */
 #define MATH_INT128_GET_HIGH64(x) ((int64_t)(*((int64_t *)(&(x))+1)))
#endif

/* Bit-width limit below which the intersection is evaluated in double precision */
#if !MATH_LINELINE_OUTSIDE_ACCURATE
 /* Accuracy for non-intersecting edges is not required: */
 /* 50 bits give us a maximum error of 0.125 for intersecting edges */
 #define MATH_LINELINE_FLOATBITS_THRESHOLD (50)
#else
 /* Accuracy for non-intersecting edges is required: */
 /* 25 bits give us a maximum error of 0.125 for all edges, intersecting or not */
 #define MATH_LINELINE_FLOATBITS_THRESHOLD (25)
#endif

/*
 * Compute the point where the line through (l0p0x,l0p0y)-(l0p1x,l0p1y) meets the line
 * through (l1p0x,l1p0y)-(l1p1x,l1p1y), using the parametric form
 *   hit = l0p0 + (alpha / denom) * (l0p1 - l0p0)
 * where alpha and denom are 2x2 determinants evaluated in 128-bit integers.
 *
 * hitpt: output, hitpt[0] receives x and hitpt[1] receives y.
 * Returns 1 when hitpt was written, 0 otherwise. Zero is returned when denom == 0
 * (parallel or collinear lines) and, depending on the compile-time switches:
 *   MATH_LINELINE_RETZERO_IF_OUTSIDE : when alpha or beta fall outside [0,denom]
 *   MATH_LINELINE_PREVENT_SIGFPE     : when a 128/64 division would overflow its 64-bit quotient
 *   MATH_LINELINE_DETECT_OVERFLOW    : when the hit point does not fit in int64_t
 *   MATH_LINELINE_OUTSIDE_ACCURATE   : selects the path that keeps accuracy for |alpha| > |denom|
 * hitpt may have been partially written when 0 is returned.
 *
 * NOTE(review): the math128_* and mathOverflow_* helpers are defined elsewhere; the remarks
 * about them below are inferred from their names and call sites and should be confirmed.
 */
static inline int mathVertexLineLineIntersection_Gizmo9( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int shift;
  int64_t mulx, muly;
  uint64_t denomlow, alphalow, alphahigh, umulx, umuly;
  int64_t xormask, alphaxormask, highmask, mulxmask, mulymask;
  double fdet;
  unsigned __int128 denom;
  unsigned __int128 alpha;
  unsigned __int128 hitx, hity;
#if MATH_LINELINE_DETECT_OVERFLOW
  int64_t offx, offy;
  double fmx, fmy;
#endif
  /* Direction of the first line; the hit point is l0p0 + (alpha/denom) * (mulx,muly) */
  mulx = l0p1x - l0p0x;
  muly = l0p1y - l0p0y;
  denom = (unsigned __int128)math128_imul( mulx, l1p1y - l1p0y ) - math128_imul( muly, l1p1x - l1p0x );
  if( MATH_UNLIKELY( denom == 0 ) )
  {
    // segments lay on parallel lines
    // special case with segments on the same line -- may be considered if necessary
    return 0;
  }
  else
  {
    alpha = (unsigned __int128)math128_imul( l1p0x - l0p0x, l1p1y - l1p0y ) - math128_imul( l1p0y - l0p0y, l1p1x - l1p0x );
    /* Sign masks: 0 when the value is non-negative, -1 (all ones) when negative */
    xormask = (int64_t)MATH_INT128_GET_HIGH64( denom ) >> 63;
    alphaxormask = (int64_t)MATH_INT128_GET_HIGH64( alpha ) >> 63;
    /* ( v ^ m ) - m negates v when m == -1 and leaves it alone when m == 0, so denom = |denom| */
    denom = ( denom ^ xormask ) - xormask;
#if MATH_LINELINE_RETZERO_IF_OUTSIDE
    __int128 beta;
    /* Give alpha the sign of alpha/denom; as unsigned, a negative value compares above denom */
    alpha = ( alpha ^ xormask ) - xormask;
    if( (unsigned __int128)alpha > denom )
      return 0;
    beta = math128_imul( l1p0x - l0p0x, muly ) - math128_imul( l1p0y - l0p0y, mulx );
    beta = ( beta ^ xormask ) - xormask;
    if( (unsigned __int128)beta > denom )
      return 0;
    alphaxormask ^= xormask;
    alpha = ( alpha ^ alphaxormask ) - alphaxormask;
#else
    /* alpha = |alpha|, then alphaxormask becomes the sign of the quotient alpha/denom */
    alpha = ( alpha ^ alphaxormask ) - alphaxormask;
    alphaxormask ^= xormask;
#endif

#if MATH_LINELINE_OUTSIDE_ACCURATE && !MATH_LINELINE_RETZERO_IF_OUTSIDE
    /* Reduce denom to significant 64 bits ~ alpha might still have more bits */
    denomlow = (uint64_t)denom;
    highmask = MATH_INT128_GET_HIGH64( denom );
    if( highmask )
    {
      shift = 64 - __builtin_clzll( highmask );
      denomlow = math128_shrdRoundLow64( denom, shift );
      alpha = math128_shrdRound( alpha, shift );
    }

    /* If alpha has more than 64 significant bits, reduce */
    alphalow = (uint64_t)alpha;
    alphahigh = MATH_INT128_GET_HIGH64( alpha );
    if( alphahigh )
    {
      uint64_t c, r;
 #if MATH_LINELINE_PREVENT_SIGFPE
      /* A 128/64 divide overflows its 64-bit quotient as soon as high >= divisor, equality included */
      if( MATH_UNLIKELY( (uint64_t)alphahigh >= (uint64_t)denomlow ) )
        return 0;
 #endif
      /* Split alpha/denom into a whole part c, folded into the base point right away, and a remainder r */
      c = math128_divmod( alpha, denomlow, &r );
 #if MATH_LINELINE_DETECT_OVERFLOW
      if( MATH_UNLIKELY( (int64_t)c < 0 ) )
        return 0;
      if( MATH_UNLIKELY( !mathOverflow_Mul64s( &offx, c, mulx ) ) )
        return 0;
      if( MATH_UNLIKELY( !mathOverflow_Mul64s( &offy, c, muly ) ) )
        return 0;
      if( MATH_UNLIKELY( !mathOverflow_Add64s( &l0p0x, l0p0x, ( offx ^ alphaxormask ) - alphaxormask ) ) )
        return 0;
      if( MATH_UNLIKELY( !mathOverflow_Add64s( &l0p0y, l0p0y, ( offy ^ alphaxormask ) - alphaxormask ) ) )
        return 0;
 #else
      c = ( c ^ alphaxormask ) - alphaxormask;
      l0p0x += (int64_t)c * mulx;
      l0p0y += (int64_t)c * muly;
 #endif
      alphalow = (uint64_t)r;
    }

    /* Denomlow and alphalow both fit in 64 bits unsigned */
    highmask = denomlow | alphalow;
    /* ( m >> 63 ) ^ m folds a negative m onto its one's complement, so only magnitude bits are tested */
    if( !( ( highmask | ( ( mulx >> 63 ) ^ mulx ) | ( ( muly >> 63 ) ^ muly ) ) & ~(((int64_t)1<<MATH_LINELINE_FLOATBITS_THRESHOLD)-1) ) )
    {
      /* All operands are low, switch to double-based math ~ max error is 0.125 */
      fdet = (double)alphalow / (double)denomlow;
 #if MATH_LINELINE_DETECT_OVERFLOW
      fmx = (double)mulx * fdet;
      fmy = (double)muly * fdet;
      if( MATH_UNLIKELY( fmin( fmx, fmy ) < (double)INT64_MIN ) )
        return 0;
      if( MATH_UNLIKELY( fmax( fmx, fmy ) > (double)INT64_MAX ) )
        return 0;
      offx = ( (int64_t)nearbyint( fmx ) ^ alphaxormask ) - alphaxormask;
      offy = ( (int64_t)nearbyint( fmy ) ^ alphaxormask ) - alphaxormask;
      if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[0], l0p0x, offx ) ) )
        return 0;
      if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[1], l0p0y, offy ) ) )
        return 0;
 #else
      hitpt[0] = l0p0x + ( (int64_t)nearbyint( (double)mulx * fdet ) ^ alphaxormask ) - alphaxormask;
      hitpt[1] = l0p0y + ( (int64_t)nearbyint( (double)muly * fdet ) ^ alphaxormask ) - alphaxormask;
 #endif
      return 1;
    }
#else
    /* Scale denom and alpha by one common shift so that both fit in 64 bits */
 #if MATH_LINELINE_RETZERO_IF_OUTSIDE
    highmask = MATH_INT128_GET_HIGH64( denom );
 #else
    highmask = MATH_INT128_GET_HIGH64( denom ) | MATH_INT128_GET_HIGH64( alpha );
 #endif
    if( highmask )
    {
      shift = 64 - __builtin_clzll( highmask );
      denomlow = math128_shrdRoundLow64( denom, shift );
 #if !MATH_LINELINE_RETZERO_IF_OUTSIDE
      /* The shift is driven by alpha as well here, so denom may have been shifted away entirely */
      if( !denomlow )
        return 0;
 #endif
      alphalow = math128_shrdRoundLow64( alpha, shift );
    }
    else
    {
      denomlow = (uint64_t)denom;
      alphalow = (uint64_t)alpha;
      highmask = denomlow | alphalow;
      if( !( ( highmask | ( ( mulx >> 63 ) ^ mulx ) | ( ( muly >> 63 ) ^ muly ) ) & ~(((int64_t)1<<MATH_LINELINE_FLOATBITS_THRESHOLD)-1) ) )
      {
        /* All operands are low, switch to double-based math ~ max error is 0.125 */
        fdet = (double)alphalow / (double)denomlow;
 #if MATH_LINELINE_DETECT_OVERFLOW
        fmx = (double)mulx * fdet;
        fmy = (double)muly * fdet;
        if( MATH_UNLIKELY( fmin( fmx, fmy ) < (double)INT64_MIN ) )
          return 0;
        if( MATH_UNLIKELY( fmax( fmx, fmy ) > (double)INT64_MAX ) )
          return 0;
        offx = ( (int64_t)nearbyint( fmx ) ^ alphaxormask ) - alphaxormask;
        offy = ( (int64_t)nearbyint( fmy ) ^ alphaxormask ) - alphaxormask;
        if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[0], l0p0x, offx ) ) )
          return 0;
        if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[1], l0p0y, offy ) ) )
          return 0;
 #else
        hitpt[0] = l0p0x + ( (int64_t)nearbyint( (double)mulx * fdet ) ^ alphaxormask ) - alphaxormask;
        hitpt[1] = l0p0y + ( (int64_t)nearbyint( (double)muly * fdet ) ^ alphaxormask ) - alphaxormask;
 #endif
        return 1;
      }
    }

#endif

    /* Integer path: offset = round( alphalow * |mul| / denomlow ), sign restored afterwards */
    mulxmask = mulx >> 63;
    mulymask = muly >> 63;
    umulx = ( mulx ^ mulxmask ) - mulxmask;
    umuly = ( muly ^ mulymask ) - mulymask;

    hitx = math128_mul( alphalow, umulx );
#if MATH_LINELINE_PREVENT_SIGFPE
    /* Quotient would need more than 64 bits when high >= divisor, so equality is rejected too */
    if( (uint64_t)MATH_INT128_GET_HIGH64( hitx ) >= (uint64_t)denomlow )
      return 0;
#endif
    /* Sign of the x offset = sign of alpha/denom combined with the sign of mulx */
    xormask = alphaxormask ^ mulxmask;
#if MATH_LINELINE_DETECT_OVERFLOW
    offx = (int64_t)math128_divRound( hitx, denomlow );
    if( offx < 0 )
      return 0;
    offx = (int64_t)( ( (uint64_t)offx ^ xormask ) - xormask );
    if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[0], l0p0x, offx ) ) )
      return 0;
#else
    hitpt[0] = l0p0x + (int64_t)( ( math128_divRound( hitx, denomlow ) ^ xormask ) - xormask );
#endif

    hity = math128_mul( alphalow, umuly );
#if MATH_LINELINE_PREVENT_SIGFPE
    /* Same quotient-overflow guard as for x */
    if( (uint64_t)MATH_INT128_GET_HIGH64( hity ) >= (uint64_t)denomlow )
      return 0;
#endif
    xormask = alphaxormask ^ mulymask;
#if MATH_LINELINE_DETECT_OVERFLOW
    offy = (int64_t)math128_divRound( hity, denomlow );
    if( offy < 0 )
      return 0;
    offy = (int64_t)( ( (uint64_t)offy ^ xormask ) - xormask );
    if( MATH_UNLIKELY( !mathOverflow_Add64s( &hitpt[1], l0p0y, offy ) ) )
      return 0;
#else
    hitpt[1] = l0p0y + (int64_t)( ( math128_divRound( hity, denomlow ) ^ xormask ) - xormask );
#endif
  }
  return 1;
}

EDIT: Added mathOverflow_Mul64s() and optimized the worst case path (non-intersecting, huge alpha, tiny denom, overflow detection), it's now faster by some 10%.

EDIT 2: Added inline assembly for proper after-shift rounding, making the extra rounding practically free.

1 reply

alexisnaveros Nov 25, 2022

Besides a clean-up into specialized functions, I don't really see much potential improvement (it's running in my non-Clipper processing, beautifully chewing on 300gb of data, there are no topological errors to correct afterward... it's wonderful, I think I'm going to cry).

Is anyone interested in translating that into MSVC builtin 128 math pseudo-functions? It should be pretty easy... Then Angus could try it out in Clipper2.

The code uses floating point math when it can safely do so without a loss of accuracy (max error 0.125), otherwise doing 128 bits integer math to get accurate results.

But note that 128 bits stuff is a bit slow on ARM64 CPUs. ARM64 has a separate instruction to get the "high product" of a multiplication, and somehow it's 2-3x slower than the instruction to get the "low product" (x86 gets both from a single instruction). Another issue is that there's no instruction to do a 128bits/64bits divide on ARM64. On top of that, the uint64 divide instruction doesn't give you the remainder (that would be a separate instruction), so rounding the division input would be saner than rounding using the remainder (as Gizmo8/9 currently do). So yeah, performance is not great on ARM, unless you keep the numbers small.

EDIT: Made the code a bit friendlier to ARM; not using division remainders anymore unless on amd64/x86-64.

alexisnaveros · 2022-11-27T05:39:46Z

alexisnaveros
Nov 27, 2022

I have cleaned up these experiments a bit for proper integration with code at work, here:
http://www.rayforce.net/clipper2stuff/mathlineline.c
http://www.rayforce.net/clipper2stuff/mathlineline.h

I also added some quick MSVC support by testing through godbolt.org, and I could see that the code generated by MSVC is very poorly optimized. It is not able to inline (int)nearbyint() or round(), even though both are technically one instruction (cvtsd2si, roundsd), no matter which combination of /O2 /fp:fast /Oi /arch:AVX2 I tried. It's doing a bunch of silly stuff everywhere... Please give msys/mingw64 a try if you are on Windows.

The API is pretty simple:

/* Parameters shared by the three functions below: */
/* (l0p0x,l0p0y)-(l0p1x,l0p1y) are the end points of the first segment, (l1p0x,l1p0y)-(l1p1x,l1p1y) those of the second */
/* NOTE(review): hitpt presumably receives x in hitpt[0] and y in hitpt[1], as in the Gizmo9 listing ~ confirm in mathlineline.h */

/* Check line intersection: yes */
/* Get intersection hit point: no */
/* Return non-zero if line segments intersect, don't compute the intersection point */
int mathLineLine_Check( int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y );

/* Check line intersection: yes */
/* Get intersection hit point: yes */
/* Return non-zero if line segments intersect, intersection point stored in hitpt */
int mathLineLine_CheckHit( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y );

/* Check line intersection: no */
/* Get intersection hit point: yes */
/* Accurate intersection outside lines: yes */
/* SIGFPE protection: yes */
/* Overflow checking: yes */
/* Return non-zero on success, intersection point stored in hitpt */
/* Return zero if parallel or intersection point outside of numerical range */
int mathLineLine_HitAnySafeRobust( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y );

23 replies

alexisnaveros Dec 8, 2022

Oh. Well, I don't personally care if 3.5 ends up being rounded as 3 or 4, both are equally accurate (and inaccurate).

The exact intersection point is -4.0, 0.5, so I would say both -4,0 and -4,1 are the same to me. The maximum error of 0.8, in the most extreme cases, means it might occasionally round the wrong way. Where it rounds when exactly x.5 doesn't seem like a significant concern.

sergey-239 Dec 8, 2022
Author

Thank you. Sorry again for confusing you.

sergey-239 Dec 16, 2022
Author

@alexisnaveros, I was correct: the linked sources have gcc detection disabled by && 0, so it was running FP fast path. The problem exists: {{0,0},{13,14}} ,{{259990364428861297,1009},{259990364428861308,1021}} causes division overflow at line 643

alexisnaveros Dec 17, 2022

@sergey-239 Oops, I can confirm that. SIGFPE protection isn't catching that one...

{ 0x0000000000000002 0x32dcb9ff6dd3753d } / 0x0000000000000002

Yeah, high is non-zero after division, hence overflow. I'll fix that in a couple hours.

Thanks for catching that! It has been running flawlessly over a couple terabytes of data, but any SIGFPE would be Very Bad.

sergey-239 Dec 25, 2022
Author

@alexisnaveros, you have one more bug in denom and alpha calculation: when 65 most significant bits of 128-bit value are all ones, the rounding after scaling to 64-bit value causes an overflow. Test case is simple:
{0,0},{1ULL << 34, 1},
{0,0},{1, 1ULL << 34},

sergey-239 · 2022-12-02T00:31:38Z

sergey-239
Dec 2, 2022
Author

This implementation should work on any platform that has a C++ compiler with 64-bit long long int support (win32 MSVC does). It runs 3.5 times slower than #334 (comment) when both are compiled with 64-bit gcc, although the Boost long-arithmetic implementation "automatically" adapts to the magnitude of the operands.
I made it for testing purposes, but it might be useful for someone else, so I decided to share it as well.

The safe range for input coordinates is ±INT64_MAX/2; however, it could be extended to ±INT64_MAX quite easily.
the intersection coordinates range is ±INT64_MAX. Infinities reported as INT64_MIN and INT64_MAX

#include <boost/multiprecision/cpp_int.hpp>

namespace bn = ::boost::multiprecision;

// mulsubmul(a, b, c, d) returns a*b - c*d, with each product formed in a 128-bit type.
#if defined(BOOST_HAS_INT128) && BOOST_ENDIAN_LITTLE_BYTE
// Variant built on the compiler-provided 128-bit integer exposed by Boost.
static inline ::boost::int128_type mulsubmul(int64_t a, int64_t b, int64_t c, int64_t d) {
    const ::boost::int128_type first = static_cast<::boost::int128_type>(a) * b;
    const ::boost::int128_type second = static_cast<::boost::int128_type>(c) * d;
    return first - second;
}
#else
// Variant built on Boost.Multiprecision's int128_t.
static inline bn::int128_t mulsubmul(int64_t a, int64_t b, int64_t c, int64_t d)
{
    bn::int128_t first(a);
    bn::int128_t second(c);
    first *= b;
    second *= d;
    first -= second;
    return first;
}
#endif

static inline int mathVertexLineLineIntersection_boost(const  Point64& l0p0, const  Point64& l0p1, const  Point64& l1p0, const Point64& l1p1, Point64& hitpt)
{
    bn::int128_t alpha_num(mulsubmul(l1p0.x-l0p0.x, l1p1.y-l1p0.y, l1p0.y-l0p0.y, l1p1.x-l1p0.x));
    bn::int128_t  beta_num(mulsubmul(l1p0.x-l0p0.x, l0p1.y-l0p0.y, l1p0.y-l0p0.y, l0p1.x-l0p0.x));
    bn::int128_t     denom(mulsubmul(l0p1.x-l0p0.x, l1p1.y-l1p0.y, l0p1.y-l0p0.y, l1p1.x-l1p0.x));

    if(denom.is_zero()) {
        // if(alpha_num || beta_num) return 0; // segments lay on parallel lines
        // special case with segments on the same line -- may be considered if necessary
        return 0;
    }
    else {
        // if(denom<0) { alpha_num=-alpha_num; beta_num=-beta_num; denom=-denom; }
        // if(alpha_num<0 || beta_num<0 || alpha_num>denom || beta_num>denom) return 0;

        bool sign = denom.backend().sign() ^ alpha_num.backend().sign();
        denom.backend().sign(false);
        alpha_num.backend().sign(false);

        bn::number<bn::backends::cpp_int_backend<192, 192, bn::signed_magnitude, bn::unchecked, void> > t; 
        bn::int128_t halfdenom(denom);
        halfdenom >>= 1;

        bool tmp_sign; 

        t = alpha_num;
        t *= (l0p1.x-l0p0.x);
        tmp_sign = t.backend().sign() ^ sign;
        t.backend().sign(false);
        t += halfdenom;
        t /= denom;
        t.backend().sign(tmp_sign);
        t += l0p0.x;
        hitpt.x = static_cast<int64_t>(t);

        t = alpha_num; 
        t *= (l0p1.y-l0p0.y);
        tmp_sign = t.backend().sign() ^ sign;
        t.backend().sign(false);
        t += halfdenom;
        t /= denom;
        t.backend().sign(tmp_sign);
        t += l0p0.y;
        hitpt.y = static_cast<int64_t>(t);
        return 1;
    }
}

0 replies

sergey-239 · 2022-12-17T01:48:34Z

sergey-239
Dec 17, 2022
Author

While trying to speed up SM3 and add all the necessary checks for border cases, I realized that I needed to start from scratch.

So, I ended up with the following SM5 (I jumped over a version as I still plan to update the SM3 code for completeness)

My attempt to speed up the division with the 32-bit version of div failed: splitting rax into the edx:eax pair and all the other extra adjustments eat up the entire latency difference between a 64-bit division with rdx equal to zero and a 32-bit division.

Instead, I borrowed an idea from @alexisnaveros, using FP math when the magnitudes of the products fit into 53 bits, but extended it to a larger number of suitable cases.

The other thing I implemented is an estimation of the intersection offsets from the base point (l0p0) using FP math; integer division is then used only in cases where this estimate differs from the true intersection point by more than one. This leaves approximately 30% of the cases for integer division.

Also, I dropped the original intent to write the function completely in assembly language, as gcc allocates registers and schedules independent instruction flows much faster than I can. So I used assembly only where it is impossible to express what I need in C. If there is interest in a complete asm version for the MSVC/Delphi environment, that is still possible.

It performs 30%-to-50% faster than mathLineLine_HitAnySafeRobust on my Intel(R) Core(TM) i5-7200U CPU @ 2.50GHz notebook. However it is slower than mathLineLine_HitAnySafeRobust by 30% on AMD FX-8320E Eight-Core Processor @ 1400 (according to @alexisnaveros this processor has quite poor FP SSE design and the performance results confirms this). Here is the benchmark result from run at my notebook. This test run performed 26^6 iterations (six nested loops) over different segment projections lengths and distances between starting segment points in the range from 2^12 through 2^60 with logarithmic increment of 2^4:

A2                          	 total: 308915776 returns:  1: 308915776	 time: 4636.25 ms
HitAnySafeRobust            	 total: 308915776 returns:  0: 29511988 1: 279403788	 time: 27840.6 ms
LineIntersection_SM5        	 total: 308915776 returns:  1: 29511988 2: 279403788	 time: 21201.3 ms

Thorough testing over 63^6 combinations has not been performed yet, but for the random cases I have already run during debugging, the difference from the boost implementation is no larger than 1.41 where FP is used to calculate the intersection.

safe input range is ±INT64_MAX/2
function may return intersection coordinates in the range ±INT64_MAX
return value:
0 lines do not intersect, hitpt is undefined
1 lines do intersect, one or both coordinates of hitpt are ±INT64_MAX that denotes out of representable range coordinates
2 lines do intersect, hitpt contains the intersection coordinates

Code:

/*
 * Number of significant bits in v, i.e. the 1-based position of its highest set bit:
 * msb(0) == 0, msb(1) == 1, msb(255) == 8, and 64 for any value with bit 63 set.
 * The bits of v are examined as an unsigned pattern, so every negative value yields 64.
 *
 * Uses the compiler builtin rather than hand-written bsr/cmov assembly, so it is not
 * tied to x86-64 and the compiler is free to fold it for constant arguments.
 */
static inline int msb(int64_t v) {
   if (v == 0) {
       /* __builtin_clzll() is undefined for a zero argument */
       return 0;
   }
   return 64 - __builtin_clzll((unsigned long long)v);
}

/*
 * Return the low 64 bits of (v >> shift).
 *
 * Callers pass the result of msb() on the high half of v, which ranges over 0..64.
 * The previous shrd-based version had its count masked to 6 bits by the CPU, so a
 * shift of exactly 64 returned the LOW half unchanged instead of the high half.
 * A plain 128-bit shift handles every count in 0..127.
 * Counts outside that range, which a C shift leaves undefined, yield 0.
 */
static inline uint64_t shrq(unsigned __int128 v, int shift) {
   if (shift < 0 || shift > 127) {
       return 0;
   }
   return (uint64_t)(v >> shift);
}

// Out-of-range handler reached from the MAGIC() assembly in mathVertexLineLineIntersection_SM5
// through `call %p[helper]` (label 2 of that macro). It is not meant to be called from C++ code.
//
// Register contract, as bound by MAGIC()'s operand constraints:
//   rsi : sign mask of the offset (0 or -1), bound to [mask] with "+S"
//   rax : result, bound with "=a"; set here to (INT64_MAX ^ rsi) - rsi,
//         i.e. +INT64_MAX for a zero mask and -INT64_MAX for an all-ones mask
//   ebx : return code of the enclosing function, bound to [ret] with "+b"; set here to 1
//
// NOTE(review): the asm statement declares no outputs or clobbers although it writes rax and
// rbx, so this only works while the function is entered solely via that asm `call` ~ confirm.
static void _sm5_helper() {
   asm volatile (
       "movabsq $0x7fffffffffffffff,%%rax\n\t"
       "movl   $1,%%ebx\n\t"
       "xorq   %%rsi,%%rax\n\t"
       "subq   %%rsi,%%rax\n\t"
       ::
   );
}

// Intersection of the line through l0p0-l0p1 with the line through l1p0-l1p1 (x86-64, GCC
// extended asm, SSE2). The hit point is l0p0 + (alpha / denom) * (l0p1 - l0p0), with alpha
// and denom evaluated as 128-bit determinants.
//
// Strategy visible in the code:
//   1. When |alpha| and |denom| fit in SAFEBITS bits, an axis whose delta is small enough is
//      taken straight from a double-precision quotient (fast path).
//   2. Otherwise a double-precision estimate of the offset is computed with the SSE rounding
//      mode switched to toward-zero, and the MAGIC() assembly refines it with integer math.
//
// Return value:
//   0  denom is zero (or, with ABORT_IF_HIT_IS_OUTSIDE_LINE_SEGMENTS, the test at that #if fails)
//   2  hitpt holds the intersection
//   1  MAGIC() branched to _sm5_helper for at least one axis: that coordinate is set to
//      +/-INT64_MAX to flag a value outside the representable range
//
// NOTE(review): the comments on the assembly are a reading of the instruction sequence and of
// the operand constraints; verify them before relying on any detail.
int mathVertexLineLineIntersection_SM5(const Point64& l0p0, const Point64& l0p1, const Point64& l1p0, const Point64& l1p1, Point64& hitpt) {

   // Entry i has its i most significant bits set (entry 0 is 0, entry 64 is all ones).
   // The shift > 0 variant of MAGIC() tests rdx against entry [shift] before shifting left,
   // and branches to the out-of-range path when any of those bits is set.
   static const uint64_t safe_shift_mask[65] = {
       0, ~((1ULL<<63)-1), ~((1ULL<<62)-1), ~((1ULL<<61)-1), ~((1ULL<<60)-1), ~((1ULL<<59)-1),
       ~((1ULL<<58)-1), ~((1ULL<<57)-1), ~((1ULL<<56)-1), ~((1ULL<<55)-1), ~((1ULL<<54)-1), ~((1ULL<<53)-1),
       ~((1ULL<<52)-1), ~((1ULL<<51)-1), ~((1ULL<<50)-1), ~((1ULL<<49)-1), ~((1ULL<<48)-1), ~((1ULL<<47)-1),
       ~((1ULL<<46)-1), ~((1ULL<<45)-1), ~((1ULL<<44)-1), ~((1ULL<<43)-1), ~((1ULL<<42)-1), ~((1ULL<<41)-1),
       ~((1ULL<<40)-1), ~((1ULL<<39)-1), ~((1ULL<<38)-1), ~((1ULL<<37)-1), ~((1ULL<<36)-1), ~((1ULL<<35)-1),
       ~((1ULL<<34)-1), ~((1ULL<<33)-1), ~((1ULL<<32)-1), ~((1ULL<<31)-1), ~((1ULL<<30)-1), ~((1ULL<<29)-1),
       ~((1ULL<<28)-1), ~((1ULL<<27)-1), ~((1ULL<<26)-1), ~((1ULL<<25)-1), ~((1ULL<<24)-1), ~((1ULL<<23)-1),
       ~((1ULL<<22)-1), ~((1ULL<<21)-1), ~((1ULL<<20)-1), ~((1ULL<<19)-1), ~((1ULL<<18)-1), ~((1ULL<<17)-1),
       ~((1ULL<<16)-1), ~((1ULL<<15)-1), ~((1ULL<<14)-1), ~((1ULL<<13)-1), ~((1ULL<<12)-1), ~((1ULL<<11)-1),
       ~((1ULL<<10)-1), ~((1ULL<<9)-1), ~((1ULL<<8)-1), ~((1ULL<<7)-1), ~((1ULL<<6)-1), ~((1ULL<<5)-1),
       ~((1ULL<<4)-1), ~((1ULL<<3)-1), ~((1ULL<<2)-1), ~((1ULL<<1)-1), ~((1ULL<<0)-1)
   };

   // Indexed by the bit index (bsr) of denomlo's highest set bit. Entries 53..63 hold
   // 2^(index-52) - 1; the value is added to the denominator copy before it is converted
   // to double. NOTE(review): presumably this rounds the conversion upward ~ confirm.
   static const uint64_t roundup[64] = {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
       0, 0, 0, 0, 0, 
       1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047
   };

   // All bits except the sign bit of each double lane: AND-ing with it yields fabs().
   static const __m128d absolute_mask = (__m128d)__m128i{((1ULL<<63)-1),((1ULL<<63)-1)};

   // Save the SSE control/status register; it is restored before each return that follows
   // the rounding-mode change below.
   unsigned mxcsr;
   asm volatile("stmxcsr %0\n\t":"=m"(mxcsr));

   int64_t cl0p0_x = l0p0.x;
   int64_t cl0p0_y = l0p0.y;

   int64_t cl0p1_l0p0_x = l0p1.x;
   int64_t cl0p1_l0p0_y = l0p1.y;

   int64_t cl1p0_l0p0_x = l1p0.x;
   int64_t cl1p0_l0p0_y = l1p0.y;

   int64_t cl1p1_l1p0_x = l1p1.x;
   int64_t cl1p1_l1p0_y = l1p1.y;

   // Turn the copies into the differences their names spell out: cA_B holds A - B.
   cl1p1_l1p0_x -= cl1p0_l0p0_x;
   cl1p1_l1p0_y -= cl1p0_l0p0_y;
   
   cl1p0_l0p0_x -= cl0p0_x;
   cl1p0_l0p0_y -= cl0p0_y;
   
   cl0p1_l0p0_x -= cl0p0_x;
   cl0p1_l0p0_y -= cl0p0_y;

   __int128 denom = (__int128)cl0p1_l0p0_x*cl1p1_l1p0_y-(__int128)cl0p1_l0p0_y*cl1p1_l1p0_x;
   __int128 alpha = (__int128)cl1p0_l0p0_x*cl1p1_l1p0_y-(__int128)cl1p0_l0p0_y*cl1p1_l1p0_x;

   // Floating-point quotient for the fast path, started before the zero test below. The
   // casts keep only the low 64 bits of alpha and denom; xy is consumed only inside the
   // SAFEBITS branch, where both high halves are known to be zero.
   __m128d falpha; falpha[1] = falpha[0] = (double)(int64_t)alpha;
   __m128d fdenom; fdenom[1] = fdenom[0] = (double)(int64_t)denom;
   __m128d fcl0p1_l0p0 = {(double)cl0p1_l0p0_x, (double)cl0p1_l0p0_y};
   __m128d xy = fcl0p1_l0p0*falpha/fdenom;

   if (!denom)
       return 0;

   // Sign masks (0 or -1); (v ^ m) - m then yields |v|. Afterwards denom_mask carries the
   // sign of alpha/denom.
   int64_t denom_mask = (int64_t)(denom >> 64)>>63, alpha_mask = (int64_t)(alpha >> 64)>>63;
   denom = (denom ^ denom_mask) - denom_mask;
   alpha = (alpha ^ alpha_mask) - alpha_mask;
   denom_mask ^= alpha_mask;

#if ABORT_IF_HIT_IS_OUTSIDE_LINE_SEGMENTS
   if (denom_mask || denom < alpha)
       return 0;
#endif 

   // Bit 0: x was produced by the fast path; bit 1: y was produced by the fast path.
   char flags = 0;

#define SAFEBITS 54
   // Fast path: both magnitudes have no bits at or above SAFEBITS.
   if (!((uint64_t)(denom>>64)|(uint64_t)(alpha>>64) | (((uint64_t)denom 
           | (uint64_t)alpha) & ~( (1ULL << SAFEBITS) - 1)))) {
       // An axis qualifies when msb(alpha) + msb(|delta|) <= SAFEBITS.
       // NOTE(review): abs() on an int64_t relies on a 64-bit overload being in scope ~ confirm.
       int bits = SAFEBITS-msb((int64_t)alpha); 
       flags = (bits >= msb(abs(cl0p1_l0p0_x)))|((bits >= msb(abs(cl0p1_l0p0_y))) << 1);
       if (flags & 1) hitpt.x = cl0p0_x+(int64_t)nearbyint(xy[0]);
       if (flags & 2) hitpt.y = cl0p0_y+(int64_t)nearbyint(xy[1]);
       if (flags == 3)
           return 2;
   }

   {
       // Switch SSE rounding to toward-zero for the estimate below.
       // NOTE(review): the "+qm" operands are not referenced by the template; presumably they
       // keep the compiler from moving the int-to-double conversions above this statement.
       unsigned newmxcsr = mxcsr | _MM_ROUND_TOWARD_ZERO;
       asm volatile ("ldmxcsr %2\n\t":"+qm"(cl0p1_l0p0_x), "+qm"(cl0p1_l0p0_y):"m"(newmxcsr));
   }

   // |delta x|, |delta y| as doubles.
   xy = _mm_and_pd(__m128d{(double)cl0p1_l0p0_x, (double)cl0p1_l0p0_y}, absolute_mask);

   // Reduce alpha and denom to 64 bits each, dropping as many low bits as the high half
   // has significant bits.
   int shiftalpha = msb((uint64_t)(alpha>>64));
   int shiftdenom = msb((uint64_t)(denom>>64));
   uint64_t alphalo = shrq(alpha, shiftalpha);
   uint64_t denomlo = shrq(denom, shiftdenom);

   {
       uint64_t mdenomlo = denomlo;
       uint64_t malphalo = alphalo;

       // Prepare copies that can be converted through a signed 64-bit cast: add the roundup
       // entry to the denominator copy, then shift both copies right (by different routes
       // depending on carry and on bit 63) so that bit 63 ends up clear.
       asm (
           "bsrq   %[mdenomlo],%%rax\n\t"
           "addq   %p[roundup](,%%rax,8),%[mdenomlo]\n\t"
           "movq   %[malphalo],%%rax\n\t"
           "jc     2f\n\t"
           "orq    %[mdenomlo],%%rax\n\t"
           "jns    3f\n\t"
       "1:\n\t"
           "shrq   $1,%[malphalo]\n\t"
           "shrq   $1,%[mdenomlo]\n\t"
           "adcq   $0,%[mdenomlo]\n\t"
           "js     1b\n\t"
           "jmp    3f\n\t"
       "2:\n\t"
           "rcrq   $1,%[mdenomlo]\n\t"
           "shrq   $2,%[malphalo]\n\t"
           "shrq   $1,%[mdenomlo]\n\t"
       "3:\n\t"
           :[mdenomlo]"+q"(mdenomlo), [malphalo]"+q"(malphalo) 
           :[roundup]"Z"(roundup)
           :"rax", "cc"
       );
       fdenom[1] = fdenom[0] = (double)(int64_t) mdenomlo;
       falpha[1] = falpha[0] = (double)(int64_t) malphalo;
   }

   // Estimate of |delta| * alpha / denom, still missing the 2^(shiftalpha - shiftdenom) factor.
   xy = xy * falpha / fdenom;
   
   int shift = shiftalpha - shiftdenom;

   // Adding shift << 52 to the raw bit pattern adjusts the exponent field of each double,
   // i.e. scales by 2^shift (assumes the exponent neither overflows nor underflows).
   xy = (__m128d)((__m128i)xy + __m128i{(int64_t)shift << 52,(int64_t)shift << 52});

   // From here on the deltas are magnitudes; their signs live in the masks.
   int64_t mask_cl0p1_l0p0_x = cl0p1_l0p0_x >> 63;
   int64_t mask_cl0p1_l0p0_y = cl0p1_l0p0_y >> 63;
   cl0p1_l0p0_x = (cl0p1_l0p0_x ^ mask_cl0p1_l0p0_x) - mask_cl0p1_l0p0_x;
   cl0p1_l0p0_y = (cl0p1_l0p0_y ^ mask_cl0p1_l0p0_y) - mask_cl0p1_l0p0_y;

   // Rounding bias added to the dividend inside MAGIC().
   uint64_t halfdenomlo = denomlo >> 1;

   // MAGIC(axis, shift_asm) refines the estimate for one axis and stores hitpt.<axis>:
   //   - rdx:rax = |delta| * alphalo, rescaled by shift_asm, plus halfdenomlo
   //   - estimate * denomlo is subtracted; up to two further denomlo subtractions each bump
   //     the estimate by one, and only if more remains is divq executed
   //   - the sign ([mask] ^ [denom_mask]) is applied and the base coordinate [offset] added
   //   - any carry/overflow branches to label 2, which calls _sm5_helper
   // On success execution falls into the bytes 0x48,0xbe, the start of a
   // `movabs $imm64,%rsi` whose 8-byte immediate swallows the call and the 3 bytes after it,
   // so the helper is skipped. NOTE(review): this assumes the call assembles to 5 bytes.
   // The macro uses the local `ret` declared further down and overwrites cl0p1_l0p0_<axis>.
#define _x 0
#define _y 1

#define MAGIC(__XY, __shift) \
   do { \
       uint64_t tmp, estimate = (int64_t)xy[_##__XY]; \
       int64_t result; \
       asm volatile ( \
       "movq   %[estimate],%%rax\n\t" \
       "mulq   %[denomlo]\n\t" \
       "xchgq  %%rax,%[m]\n\t" \
       "movq   %%rdx,%[tmp]\n\t" \
       "mulq   %[alphalo]\n\t" \
       "xorq   %[denom_mask],%[mask]\n\t" \
       __shift \
       "addq   %[halfdenomlo],%%rax\n\t" \
       "adcq   $0,%%rdx\n\t" \
       "jc     2f\n\t" \
       "cmpq   %[denomlo],%%rdx\n\t" \
       "jae    2f\n\t" \
       "subq   %[m],%%rax\n\t" \
       "sbbq   %[tmp],%%rdx\n\t" \
       "jb     2f\n\t" \
       "xor    %[m],%[m]\n\t" \
       "subq   %[denomlo],%%rax\n\t" \
       "lea    1(%[estimate]),%[tmp]\n\t" \
       "sbbq   $0,%%rdx\n\t" \
       "cmovnbq %[tmp],%[estimate]\n\t" \
       "cmovbq %[m],%%rax\n\t" \
       "jb     1f\n\t" \
       "subq   %[denomlo],%%rax\n\t" \
       "lea    1(%[estimate]),%[tmp]\n\t" \
       "sbbq   $0,%%rdx\n\t" \
       "cmovnbq %[tmp],%[estimate]\n\t" \
       "cmovbq %[m],%%rax\n\t" \
       "jb     1f\n\t" \
       "divq   %[denomlo]\n\t" \
   "1:\n\t" \
       "addq   %[estimate],%%rax\n\t" \
       "jo     2f\n\t" \
       "js     2f\n\t" \
       "xorq   %[mask],%%rax\n\t" \
       "subq   %[mask],%%rax\n\t" \
       "addq   %[offset],%%rax\n\t" \
       "jo     2f\n\t" \
       ".byte  0x48,0xbe\n\t" \
   "2:\n\t" \
       "call   %p[helper]\n\t" \
       ".byte  0x0f,0x1f,0x00\n\t" \
       : "=a"(result), [ret]"+b"(ret),[m]"+q"(cl0p1_l0p0_##__XY),[estimate]"+q"(estimate), [tmp]"=q"(tmp), \
           [denomlo]"+q"(denomlo), [halfdenomlo]"+qm"(halfdenomlo), [alphalo]"+qm"(alphalo), [denom_mask]"+qm"(denom_mask), \
           [mask]"+S"(mask_cl0p1_l0p0_##__XY), [offset]"+qm"(cl0p0_##__XY), "+c"(shift) \
       : [safe_shift_mask]"Z"(safe_shift_mask), [helper]"Z"(_sm5_helper) \
       : "rdx","cc"); \
       hitpt.__XY = result; \
   } while (0)

   // Stays 2 unless _sm5_helper sets it to 1 through ebx.
   int ret = 2;

   // Three variants of the same refinement, differing only in how the 128-bit product is
   // rescaled: left shift (with an overflow test), right shift, or no shift.
   if (shift > 0) {
       if (!(flags & 1)) {
           MAGIC(x, "testq %p[safe_shift_mask](,%%ecx,8),%%rdx\n\tjnz 2f\n\tshldq %%cl,%%rax,%%rdx\n\tshlq %%cl,%%rax\n\t");
       }
       if (!(flags & 2)) {
           MAGIC(y, "testq %p[safe_shift_mask](,%%ecx,8),%%rdx\n\tjnz 2f\n\tshldq %%cl,%%rax,%%rdx\n\tshlq %%cl,%%rax\n\t");
       }
       // Restore the caller's SSE rounding mode.
       asm volatile ("ldmxcsr %0\n\t"::"m"(mxcsr));
       return ret;
   } else if (shift < 0) {
       shift = -shift;
       if (!(flags & 1)) {
           MAGIC(x, "shrdq %%cl,%%rdx,%%rax\n\tshrq %%cl,%%rdx\n\t");
       }
       if (!(flags & 2)) {
           MAGIC(y, "shrdq %%cl,%%rdx,%%rax\n\tshrq %%cl,%%rdx\n\t");
       }
       // Restore the caller's SSE rounding mode.
       asm volatile ("ldmxcsr %0\n\t"::"m"(mxcsr));
       return ret;
   } else {
       if (!(flags & 1)) {
           MAGIC(x, "");
       }
       if (!(flags & 2)) {
           MAGIC(y, "");
       }
       // Restore the caller's SSE rounding mode.
       asm volatile ("ldmxcsr %0\n\t"::"m"(mxcsr));
       return ret;
   }
}

6 replies

sergey-239 Dec 25, 2022
Author

@alexisnaveros , any comments on SM7?

alexisnaveros Dec 26, 2022

Hey. I had some trouble compiling SM5. I wasn't aware of extensions that allow a syntax such as (__m128d)__m128i{((1ULL<<63)-1),((1ULL<<63)-1)}, but even after switching to regular intrinsics _mm_castsi128_pd( _mm_set1_epi64x( ((1ULL<<63)-1) ) ) (I'm compiling in C where that peculiar syntax extension doesn't exist), I had a bunch of warnings and errors about impossible constraints in the asm() statements. I probably should compile it unchanged as C++ to see if that goes away, somehow.

I still had SM2 in my benchmark suite, and I realized the state of registers wasn't what the compiler expects after mathVertexLineLineIntersection_SM2(). SM2 is probably missing some clobbered registers somewhere, that messed up the following SM7 tests until I removed SM2.

SM7 compiles, runs and tests fine.

Looking at intersecting pairs of tiny edges far from origin:

Benchmark mathLineLine_HitAnySafeRobust : 2325 ms
Benchmark SM7 : 3203 ms

On a Ryzen2 Epyc, it's slower than mathLineLine_HitAnySafeRobust(), but I'm not surprised they aren't throwing many transistors at x87's long double on modern chips.

Looking at intersecting pairs of very long edges:

Benchmark mathLineLine_HitAnySafeRobust : 4957 ms
Benchmark SM7 : 3990 ms

Yup, SM7 is faster in such cases.

Or looking at the hardest cases with alpha overflow shifting:

Benchmark mathLineLine_HitAnySafeRobust : 5308 ms
Benchmark SM7 : 5564 ms

The error range seems to be identical to mathLineLine_HitAnySafeRobust(). both for intersecting and non-intersecting edges.

I'm inclined to stick to integer math, as I also love the elegance of reliable answers, no matter the underlying hardware/compiler/code... That problem actually came up very recently: I have client programs performing changes to a global mesh of 800 million vertices, and by using an integer-only version of mathLineLine(), I'm sure both the client and server get the exact same answer, without needing the server to send back updated data.

Now, if only I could use Clipper2 to do things like "subtract a triangle from another shape of 800 million vertices"... ;)

sergey-239 Dec 26, 2022
Author

I wasn't aware of extensions that allow a syntax such as (__m128d)__m128i{((1ULL<<63)-1),((1ULL<<63)-1)}

It's one of C++ syntaxes of constructing a struct object, no surprise you had troubles to compile it with C.

The SM7's error range is a bit worse than mathLineLine_HitAnySafeRobust's: when alpha is several orders of magnitude greater than denom, then the division error becomes noteable.

As for underlying hardware and preferences - of course you will do what you think is the most appropriate. ;)

Now, if only I could use Clipper2 to do things like "subtract a triangle from another shape of 800 million vertices"... ;)

What keeps you from using Clipper2 for these purposes?

alexisnaveros Dec 26, 2022

Now, if only I could use Clipper2 to do things like "subtract a triangle from another shape of 800 million vertices"... ;)
What keeps you from using Clipper2 for these purposes?

Scalability is currently quite poor with huge datasets, I have touched the topic before, here for example:
#263

I'm still using my gigantic old code meanwhile, which scales well (also using all cores and SIMD to work on gigantic datasets) but Clipper2 is much more elegant and robust by design. I want to switch over eventually. I guess I'll wait for Angus to finish and clean up the Clipper2 algorithm, then I'll probably propose many changes to improve scalability, parallelism and low-level optimization, which will make it usable on my end.

alexisnaveros Dec 26, 2022

It's one of C++ syntaxes of constructing a struct object, no surprise you had troubles to compile it with C.

C also has compound literals since C99, but __m128 isn't supposed to be a struct, that's the peculiar extension that surprised me.

AngusJohnson · 2023-12-03T09:45:08Z

AngusJohnson
Dec 3, 2023
Maintainer

Relevant to this discussion, I've just added a new GetIntersectPoint benchmark test.
It also demonstrates loss of function accuracy as coordinate ranges approach +/- 10e18.

0 replies

Two segments intersection #334

Replies: 14 comments · 83 replies

sergey-239 Nov 19, 2022 Author

sergey-239 Nov 20, 2022 Author

sergey-239 Nov 20, 2022 Author

sergey-239 Nov 21, 2022 Author

AngusJohnson Nov 20, 2022 Maintainer

sergey-239 Nov 21, 2022 Author

sergey-239 Nov 21, 2022 Author

AngusJohnson Nov 21, 2022 Maintainer

sergey-239 Nov 21, 2022 Author

AngusJohnson Nov 20, 2022 Maintainer

sergey-239 Nov 22, 2022 Author

sergey-239 Nov 22, 2022 Author

AngusJohnson Nov 26, 2022 Maintainer

AngusJohnson Nov 27, 2022 Maintainer

sergey-239 Nov 27, 2022 Author

sergey-239 Dec 8, 2022 Author

sergey-239 Dec 16, 2022 Author

sergey-239 Dec 25, 2022 Author

sergey-239 Dec 2, 2022 Author

sergey-239 Dec 17, 2022 Author

sergey-239 Dec 25, 2022 Author

sergey-239 Dec 26, 2022 Author

AngusJohnson Dec 3, 2023 Maintainer

Replies: 14 comments 83 replies

sergey-239
Nov 19, 2022
Author

sergey-239 Nov 20, 2022
Author

sergey-239 Nov 20, 2022
Author

sergey-239 Nov 21, 2022
Author

AngusJohnson
Nov 20, 2022
Maintainer

sergey-239 Nov 21, 2022
Author

sergey-239 Nov 21, 2022
Author

AngusJohnson Nov 21, 2022
Maintainer

sergey-239 Nov 21, 2022
Author

AngusJohnson
Nov 20, 2022
Maintainer

sergey-239 Nov 22, 2022
Author

sergey-239
Nov 22, 2022
Author

AngusJohnson Nov 26, 2022
Maintainer

AngusJohnson Nov 27, 2022
Maintainer

sergey-239 Nov 27, 2022
Author

sergey-239 Dec 8, 2022
Author

sergey-239 Dec 16, 2022
Author

sergey-239 Dec 25, 2022
Author

sergey-239
Dec 2, 2022
Author

sergey-239
Dec 17, 2022
Author

sergey-239 Dec 25, 2022
Author

sergey-239 Dec 26, 2022
Author

AngusJohnson
Dec 3, 2023
Maintainer