@@ -4545,19 +4545,42 @@ S_intuit_more(pTHX_ char *s, char *e)
4545
4545
/* Here is '[': maybe we have a character class. Examine the guts */
4546
4546
s ++ ;
4547
4547
4548
+ /* khw: If the context of this call is $foo[...], we may be able to avoid
4549
+ * the heuristics below. The possibilities are:
4550
+ * strict @foo $foo
4551
+ * vars? exists exists
4552
+ * y n n This is an error; return false now
4553
+ * y n y must be a a charclass
4554
+ * y y n must be a a subscript
4555
+ * y y y ambiguous; do heuristics below
4556
+ * n n n ambiguous; do heuristics below
4557
+ * n n y ambiguous; do heuristics below, but I
4558
+ * wonder if the initial bias should be a
4559
+ * little towards charclass
4560
+ * n y n ambiguous; do heuristics below, but I
4561
+ * wonder if the initial bias should be a
4562
+ * little towards subscript
4563
+ * n y y ambiguous; do heuristics below
4564
+ */
4565
+
4548
4566
/* '^' implies a character class; An empty '[]' isn't legal, but it does
4549
4567
* mean there isn't more to come */
4550
4568
if (s [0 ] == ']' || s [0 ] == '^' )
4551
4569
return FALSE;
4552
4570
4553
- /* Find matching ']'. khw: This means any s[1] below is guaranteed to
4554
- * exist */
4571
+ /* Find matching ']'. khw: Actually it finds the next ']' and assumes it
4572
+ * matches the '['. In order to account for the possibility of the ']'
4573
+ * being inside the scope of \Q or preceded by an even number of
4574
+ * backslashes, this should be rewritten */
4555
4575
const char * const send = (char * ) memchr (s , ']' , e - s );
4556
4576
if (! send ) /* has to be an expression */
4557
4577
return TRUE;
4558
4578
4559
4579
/* If the construct consists entirely of one or two digits, call it a
4560
- * subscript. */
4580
+ * subscript.
4581
+ *
4582
+ * khw: No one writes 03 to mean 3. Any string of digits beginning with
4583
+ * '0' is likely to be a charclass, including length 2 ones. */
4561
4584
if (isDIGIT (s [0 ]) && send - s <= 2 && (send - s == 1 || (isDIGIT (s [1 ])))) {
4562
4585
return TRUE;
4563
4586
}
@@ -4613,15 +4636,42 @@ S_intuit_more(pTHX_ char *s, char *e)
4613
4636
char tmpbuf [sizeof PL_tokenbuf * 4 ];
4614
4637
scan_ident (s , tmpbuf , sizeof tmpbuf , FALSE);
4615
4638
len = (int )strlen (tmpbuf );
4639
+
4640
+ /* khw: This only looks at global variables; lexicals came
4641
+ * later, and this hasn't been updated. Ouch!! */
4616
4642
if ( len > 1
4617
4643
&& gv_fetchpvn_flags (tmpbuf ,
4618
4644
len ,
4619
4645
UTF ? SVf_UTF8 : 0 ,
4620
4646
SVt_PV ))
4647
+ {
4621
4648
weight -= 100 ;
4622
- else /* Not a multi-char identifier already known in the
4623
- program; is somewhat likely to be a subscript */
4649
+
4650
+ /* khw: Below we keep track of repeated characters; People
4651
+ * rarely say qr/[aba]/, as the second a is pointless.
4652
+ * (Some do it though as a mnemonic that is meaningful to
4653
+ * them.) But generally, repeated characters makes things
4654
+ * more likely to be a charclass. But we have here that
4655
+ * this an identifier so likely a subscript. Its spelling
4656
+ * should be irrelevant to the repeated characters test.
4657
+ * So, we should advance past it. Suppose it is a hash
4658
+ * element, like $subscripts{$which}. We should advance
4659
+ * past the braces and key */
4660
+ }
4661
+ else {
4662
+ /* Not a multi-char identifier already known in the
4663
+ * program; is somewhat likely to be a subscript.
4664
+ *
4665
+ * khw: Our test suite contains several constructs like
4666
+ * [$A-Z]. Excluding length 1 identifiers in the
4667
+ * conditional above means such are much less likely to be
4668
+ * mistaken for subscripts. I would argue that if the next
4669
+ * character is a '-' followed by an alpha, that would make
4670
+ * it much more likely to be a charclass. It would only
4671
+ * make sense to be an expression if that alpha string is a
4672
+ * bareword with meaning; something like [$A-ord] */
4624
4673
weight -= 10 ;
4674
+ }
4625
4675
}
4626
4676
else if ( s [0 ] == '$'
4627
4677
&& s [1 ]
@@ -4638,13 +4688,43 @@ S_intuit_more(pTHX_ char *s, char *e)
4638
4688
}
4639
4689
break ;
4640
4690
4691
+ /* khw: [:blank:] strongly indicates a charclass */
4692
+
4641
4693
case '\\' :
4642
4694
if (s [1 ]) {
4643
- if (memCHRs ("wds]" , s [1 ]))
4695
+ if (memCHRs ("wds]" , s [1 ])) {
4644
4696
weight += 100 ; /* \w \d \s => strongly charclass */
4645
- /* khw: Why not \W \D \S \h \v, etc as well? */
4646
- else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ])
4647
- weight += 1 ; /* \' => mildly charclass */
4697
+ /* khw: \] can't happen, as any ']' is beyond our search.
4698
+ * Why not \W \D \S \h \v, etc as well? Should they have
4699
+ * the same weights as \w \d \s or should all or some be
4700
+ * in the 'abcfnrtvx' below? */
4701
+ } else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ]) {
4702
+ weight += 1 ;
4703
+ /* khw: This is problematic. Enough so, that I misread
4704
+ * it, and added a wrong comment about what it does in
4705
+ * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it
4706
+ * doesn't look at the current character. What it
4707
+ * actually does is: if any quote has been seen in the
4708
+ * parse, don't do the rest of the else's below, but for
4709
+ * every subsequent backslashed character encountered
4710
+ * (except \0 \w \s \d), increment the weight to lean a
4711
+ * bit more towards being a charclass. That means that
4712
+ * every backslash sequence following the first occurrence
4713
+ * of a quote increments the weight regardless of what the
4714
+ * sequence is. Again, \0 \w \d and \s are not controlled
4715
+ * by this else, so they change the weight by a lot more.
4716
+ * But what makes them so special that they aren't subject
4717
+ * to this. Any why does having a quote change the
4718
+ * behavior from then on. And why only backslashed
4719
+ * sequences get this treatment? This code has been
4720
+ * unchanged since this function was added in 1993. I
4721
+ * don't get it. Instead, it does seem to me that it is
4722
+ * especially unlikely to repeat a quote in a charclass,
4723
+ * but that having just a single quote is indicative of a
4724
+ * charclass, and having pairs of quotes is indicative of
4725
+ * a subscript. Similarly for things that could indicate
4726
+ * nesting of braces or parens. */
4727
+ }
4648
4728
else if (memCHRs ("abcfnrtvx" , s [1 ]))
4649
4729
weight += 40 ; /* \n, etc => charclass */
4650
4730
/* khw: Why not \e etc as well? */
@@ -4653,6 +4733,19 @@ S_intuit_more(pTHX_ char *s, char *e)
4653
4733
while (s [1 ] && isDIGIT (s [1 ]))
4654
4734
s ++ ;
4655
4735
}
4736
+
4737
+ /* khw: There are lots more possible escape sequences. Some,
4738
+ * like \A,\z have no special meaning to charclasses, so might
4739
+ * indicate a subscript, but I don't know what they would be
4740
+ * doing there either. Some have been added to the language
4741
+ * after this code was written, but no one thought to, or
4742
+ * could wade through this function, to add them. Things like
4743
+ * \p{} for properties, \N and \N{}, for example.
4744
+ *
4745
+ * It's problematic that \a is treated as plain 'a' for
4746
+ * purposes of the 'seen' array. Whatever is matched by these
4747
+ * backslashed sequences should not be added to 'seen'. That
4748
+ * includes the backslash. */
4656
4749
}
4657
4750
else /* \ followed by NUL strongly indicates character class */
4658
4751
weight += 100 ;
@@ -4698,7 +4791,25 @@ S_intuit_more(pTHX_ char *s, char *e)
4698
4791
&& isALPHA (s [1 ]))
4699
4792
{
4700
4793
/* Here it's \W (that isn't [$@&] ) followed immediately by two
4701
- * alphas in a row. Accumulate all the consecutive alphas */
4794
+ * alphas in a row. Accumulate all the consecutive alphas.
4795
+ *
4796
+ * khw: The code below was changed in 2015 by
4797
+ * 56f81afc0f2d331537f38e6f12b86a850187cb8a to solve a
4798
+ * buffer overrun. Prior to that commit, the code copied all
4799
+ * the consecutive alphas to a temporary. The problem was
4800
+ * that temporary's size could be exceeded, and the temporary
4801
+ * wasn't even needed (at least by 2015). The called
4802
+ * keyword() function doesn't need a copy. It takes a pointer
4803
+ * to the first character and a length, hence it can operate
4804
+ * on the original source text. It is intended to catch cases
4805
+ * like $a[ord]. If it does match a keyword, we don't want
4806
+ * the spelling of that keyword to affect the seen[] array.
4807
+ * But if it isn't a keyword we do want to fall back to the
4808
+ * normal behavior. And the 2015 commit removed that. It
4809
+ * absorbs every bareword regardless, defeating the intent of
4810
+ * the algorithm implementing the heuristics. That not many
4811
+ * bugs have surfaced since indicates this whole thing doesn't
4812
+ * get applied very much */
4702
4813
char * d = s ;
4703
4814
while (isALPHA (s [0 ]))
4704
4815
s ++ ;
@@ -4708,7 +4819,10 @@ S_intuit_more(pTHX_ char *s, char *e)
4708
4819
if (keyword (d , s - d , 0 ))
4709
4820
weight -= 150 ;
4710
4821
4711
- /* khw: Should those alphas be marked as seen? */
4822
+ /* khw: Barewords could also be subroutine calls, and these
4823
+ * would also indicate a subscript. Like [green] where
4824
+ * 'green' has been declared, for example, in 'use constant'
4825
+ * */
4712
4826
}
4713
4827
4714
4828
/* Consecutive chars like [...12...] and [...ab...] are presumed
@@ -4723,23 +4837,36 @@ S_intuit_more(pTHX_ char *s, char *e)
4723
4837
/* But repeating a character inside a character class does nothing,
4724
4838
* like [aba], so less likely that someone makes such a class, more
4725
4839
* likely that it is a subscript; the more repeats, the less
4726
- * likely. */
4840
+ * likely.
4841
+ *
4842
+ * khw: I think this changes the weight too rapidly. Each time
4843
+ * through the loop compounds the previous times. Instead, it
4844
+ * would be better to have a separate loop after all the rest that
4845
+ * changes the weight once based on how many times each character
4846
+ * gets repeated */
4727
4847
weight -= seen [un_char ];
4728
4848
break ;
4729
4849
} /* End of switch */
4730
4850
4731
4851
/* khw: 'seen' is declared as a char. This ++ can cause it to wrap.
4732
4852
* This gives different results with compilers for which a plain 'char'
4733
- * is actually unsigned, versus those where it is signed. I believe it
4734
- * is undefined behavior to wrap a 'signed'. I think it should be
4735
- * instead declared an unsigned int to make the chances of wrapping
4736
- * essentially zero.
4853
+ * is actually unsigned, versus those where it is signed. The C99
4854
+ * standard allows a compiler to raise a signal when a 'signed' char
4855
+ * is incremented outside its permissible range. I think 'seen'
4856
+ * should be instead declared an unsigned, and a conditional added
4857
+ * to prevent wrapping.
4737
4858
*
4738
4859
* And I believe that extra backslashes are different from other
4739
- * repeated characters. */
4860
+ * repeated characters. There may be others, like I have mentioned
4861
+ * quotes and paired delimiters */
4740
4862
seen [un_char ]++ ;
4741
4863
} /* End of loop through each character of the construct */
4742
4864
4865
+ /* khw: People on #irc have suggested things that I think boil down to:
4866
+ * under 'use 5.43.x', output a warning like existing warnings for
4867
+ * similar situations "Ambiguous use of [], resolved as ..." Perhaps
4868
+ * suppress the message if all (or maybe almost all) the evidence points
4869
+ * to the same outcome. This would involve two weight variables */
4743
4870
if (weight >= 0 ) /* probably a character class */
4744
4871
return FALSE;
4745
4872
0 commit comments