@@ -4549,19 +4549,47 @@ S_intuit_more(pTHX_ char *s, char *e)
4549
4549
/* Here is '[': maybe we have a character class. Examine the guts */
4550
4550
s ++ ;
4551
4551
4552
- /* '^' implies a character class; An empty '[]' isn't legal, but it does
4553
- * mean there isn't more to come */
4552
+ /* '^' implies a character class; An empty '[]' isn't legal, and it means
4553
+ * there isn't more to come */
4554
4554
if (s [0 ] == ']' || s [0 ] == '^' )
4555
4555
return FALSE;
4556
4556
4557
- /* Find matching ']'. khw: This means any s[1] below is guaranteed to
4558
- * exist */
4557
+ /* khw: If the context of this call is $foo[...], we may be able to avoid
4558
+ * the heuristics below. The possibilities are:
4559
+ * strict @foo $foo
4560
+ * vars? exists exists
4561
+ * y n n This is an error; return false now
4562
+ * y n y must be a a charclass
4563
+ * y y n must be a a subscript
4564
+ * y y y ambiguous; do heuristics below
4565
+ * n n n ambiguous; do heuristics below
4566
+ * n n y ambiguous; do heuristics below, but I
4567
+ * wonder if the initial bias should be a
4568
+ * little towards charclass
4569
+ * n y n ambiguous; do heuristics below, but I
4570
+ * wonder if the initial bias should be a
4571
+ * little towards subscript
4572
+ * n y y ambiguous; do heuristics below
4573
+ */
4574
+
4575
+ /* Find matching ']'. khw: Actually it finds the next ']' and assumes it
4576
+ * matches the '['. In order to account for the possibility of the ']'
4577
+ * being inside the scope of \Q or preceded by an even number of
4578
+ * backslashes, this should be rewritten */
4559
4579
const char * const send = (char * ) memchr (s , ']' , e - s );
4560
4580
if (! send ) /* has to be an expression */
4561
4581
return TRUE;
4562
4582
4583
+ /* Below here, the heuristics start. One idea from alh is, given 'use
4584
+ * 5.43.x', that for all digits, that if we have to resort to heuristics,
4585
+ * we instead raise an error with an explanation of how to make it
4586
+ * unambiguous: ${foo}[123] */
4587
+
4563
4588
/* If the construct consists entirely of one or two digits, call it a
4564
- * subscript. */
4589
+ * subscript.
4590
+ *
4591
+ * khw: No one writes 03 to mean 3. Any string of digits beginning with
4592
+ * '0' is likely to be a charclass, including length 2 ones. */
4565
4593
if (isDIGIT (s [0 ]) && send - s <= 2 && (send - s == 1 || (isDIGIT (s [1 ])))) {
4566
4594
return TRUE;
4567
4595
}
@@ -4596,6 +4624,13 @@ S_intuit_more(pTHX_ char *s, char *e)
4596
4624
Zero (seen , 256 , char );
4597
4625
4598
4626
/* Examine each character in the construct */
4627
+ /* That this knows nothing of UTF-8 can lead to opposite results if the
4628
+ * text is encoded in UTF-8 or not; another relic of the Unicode Bug.
4629
+ * Suppose a string consistis of various un-repeated code points between
4630
+ * 0x128 and 0x255. When encoded in UTF-8 their start bytes will all be
4631
+ * \xC2 or \xC3. The heuristics below will count those as repeated bytes,
4632
+ * and thus lean more towards this being a character class than when not
4633
+ * in UTF-8. */
4599
4634
bool first_time = true;
4600
4635
for (; s < send ; s ++ , first_time = false) {
4601
4636
unsigned char prev_un_char = un_char ;
@@ -4617,15 +4652,42 @@ S_intuit_more(pTHX_ char *s, char *e)
4617
4652
char tmpbuf [sizeof PL_tokenbuf * 4 ];
4618
4653
scan_ident (s , tmpbuf , sizeof tmpbuf , FALSE);
4619
4654
len = (int )strlen (tmpbuf );
4655
+
4656
+ /* khw: This only looks at global variables; lexicals came
4657
+ * later, and this hasn't been updated. Ouch!! */
4620
4658
if ( len > 1
4621
4659
&& gv_fetchpvn_flags (tmpbuf ,
4622
4660
len ,
4623
4661
UTF ? SVf_UTF8 : 0 ,
4624
4662
SVt_PV ))
4663
+ {
4625
4664
weight -= 100 ;
4626
- else /* Not a multi-char identifier already known in the
4627
- program; is somewhat likely to be a subscript */
4665
+
4666
+ /* khw: Below we keep track of repeated characters; People
4667
+ * rarely say qr/[aba]/, as the second a is pointless.
4668
+ * (Some do it though as a mnemonic that is meaningful to
4669
+ * them.) But generally, repeated characters makes things
4670
+ * more likely to be a charclass. But we have here that
4671
+ * this an identifier so likely a subscript. Its spelling
4672
+ * should be irrelevant to the repeated characters test.
4673
+ * So, we should advance past it. Suppose it is a hash
4674
+ * element, like $subscripts{$which}. We should advance
4675
+ * past the braces and key */
4676
+ }
4677
+ else {
4678
+ /* Not a multi-char identifier already known in the
4679
+ * program; is somewhat likely to be a subscript.
4680
+ *
4681
+ * khw: Our test suite contains several constructs like
4682
+ * [$A-Z]. Excluding length 1 identifiers in the
4683
+ * conditional above means such are much less likely to be
4684
+ * mistaken for subscripts. I would argue that if the next
4685
+ * character is a '-' followed by an alpha, that would make
4686
+ * it much more likely to be a charclass. It would only
4687
+ * make sense to be an expression if that alpha string is a
4688
+ * bareword with meaning; something like [$A-ord] */
4628
4689
weight -= 10 ;
4690
+ }
4629
4691
}
4630
4692
else if ( s [0 ] == '$'
4631
4693
&& s [1 ]
@@ -4642,13 +4704,51 @@ S_intuit_more(pTHX_ char *s, char *e)
4642
4704
}
4643
4705
break ;
4644
4706
4707
+ /* khw: [:blank:] strongly indicates a charclass */
4708
+ /* khw: Z-A definitely subscript
4709
+ * Z-Z likely subscript
4710
+ * "x - z" with blanks very likely subscript
4711
+ * \N without { must be subscript
4712
+ * \R must be subscript
4713
+ * \? must be subscript for things like \d, but not \a.
4714
+ */
4715
+
4716
+
4645
4717
case '\\' :
4646
4718
if (s [1 ]) {
4647
- if (memCHRs ("wds]" , s [1 ]))
4719
+ if (memCHRs ("wds]" , s [1 ])) {
4648
4720
weight += 100 ; /* \w \d \s => strongly charclass */
4649
- /* khw: Why not \W \D \S \h \v, etc as well? */
4650
- else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ])
4651
- weight += 1 ; /* \' => mildly charclass */
4721
+ /* khw: \] can't happen, as any ']' is beyond our search.
4722
+ * Why not \W \D \S \h \v, etc as well? Should they have
4723
+ * the same weights as \w \d \s or should all or some be
4724
+ * in the 'abcfnrtvx' below? */
4725
+ } else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ]) {
4726
+ weight += 1 ;
4727
+ /* khw: This is problematic. Enough so, that I misread
4728
+ * it, and added a wrong comment about what it does in
4729
+ * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it
4730
+ * doesn't look at the current character. What it
4731
+ * actually does is: if any quote has been seen in the
4732
+ * parse, don't do the rest of the else's below, but for
4733
+ * every subsequent backslashed character encountered
4734
+ * (except \0 \w \s \d), increment the weight to lean a
4735
+ * bit more towards being a charclass. That means that
4736
+ * every backslash sequence following the first occurrence
4737
+ * of a quote increments the weight regardless of what the
4738
+ * sequence is. Again, \0 \w \d and \s are not controlled
4739
+ * by this else, so they change the weight by a lot more.
4740
+ * But what makes them so special that they aren't subject
4741
+ * to this. Any why does having a quote change the
4742
+ * behavior from then on. And why only backslashed
4743
+ * sequences get this treatment? This code has been
4744
+ * unchanged since this function was added in 1993. I
4745
+ * don't get it. Instead, it does seem to me that it is
4746
+ * especially unlikely to repeat a quote in a charclass,
4747
+ * but that having just a single quote is indicative of a
4748
+ * charclass, and having pairs of quotes is indicative of
4749
+ * a subscript. Similarly for things that could indicate
4750
+ * nesting of braces or parens. */
4751
+ }
4652
4752
else if (memCHRs ("abcfnrtvx" , s [1 ]))
4653
4753
weight += 40 ; /* \n, etc => charclass */
4654
4754
/* khw: Why not \e etc as well? */
@@ -4657,6 +4757,19 @@ S_intuit_more(pTHX_ char *s, char *e)
4657
4757
while (s [1 ] && isDIGIT (s [1 ]))
4658
4758
s ++ ;
4659
4759
}
4760
+
4761
+ /* khw: There are lots more possible escape sequences. Some,
4762
+ * like \A,\z have no special meaning to charclasses, so might
4763
+ * indicate a subscript, but I don't know what they would be
4764
+ * doing there either. Some have been added to the language
4765
+ * after this code was written, but no one thought to, or
4766
+ * could wade through this function, to add them. Things like
4767
+ * \p{} for properties, \N and \N{}, for example.
4768
+ *
4769
+ * It's problematic that \a is treated as plain 'a' for
4770
+ * purposes of the 'seen' array. Whatever is matched by these
4771
+ * backslashed sequences should not be added to 'seen'. That
4772
+ * includes the backslash. */
4660
4773
}
4661
4774
else /* \ followed by NUL strongly indicates character class */
4662
4775
weight += 100 ;
@@ -4702,7 +4815,25 @@ S_intuit_more(pTHX_ char *s, char *e)
4702
4815
&& isALPHA (s [1 ]))
4703
4816
{
4704
4817
/* Here it's \W (that isn't [$@&] ) followed immediately by two
4705
- * alphas in a row. Accumulate all the consecutive alphas */
4818
+ * alphas in a row. Accumulate all the consecutive alphas.
4819
+ *
4820
+ * khw: The code below was changed in 2015 by
4821
+ * 56f81afc0f2d331537f38e6f12b86a850187cb8a to solve a
4822
+ * buffer overrun. Prior to that commit, the code copied all
4823
+ * the consecutive alphas to a temporary. The problem was
4824
+ * that temporary's size could be exceeded, and the temporary
4825
+ * wasn't even needed (at least by 2015). The called
4826
+ * keyword() function doesn't need a copy. It takes a pointer
4827
+ * to the first character and a length, hence it can operate
4828
+ * on the original source text. It is intended to catch cases
4829
+ * like $a[ord]. If it does match a keyword, we don't want
4830
+ * the spelling of that keyword to affect the seen[] array.
4831
+ * But if it isn't a keyword we do want to fall back to the
4832
+ * normal behavior. And the 2015 commit removed that. It
4833
+ * absorbs every bareword regardless, defeating the intent of
4834
+ * the algorithm implementing the heuristics. That not many
4835
+ * bugs have surfaced since indicates this whole thing doesn't
4836
+ * get applied very much */
4706
4837
char * d = s ;
4707
4838
while (isALPHA (s [0 ]))
4708
4839
s ++ ;
@@ -4712,7 +4843,12 @@ S_intuit_more(pTHX_ char *s, char *e)
4712
4843
if (keyword (d , s - d , 0 ))
4713
4844
weight -= 150 ;
4714
4845
4715
- /* khw: Should those alphas be marked as seen? */
4846
+ /* khw: Barewords could also be subroutine calls, and these
4847
+ * would also indicate a subscript. Like [green] where
4848
+ * 'green' has been declared, for example, in 'use constant'
4849
+ * Or maybe it should just call intuit_method() which checks
4850
+ * for keyword, subs, and methods.
4851
+ * */
4716
4852
}
4717
4853
4718
4854
/* Consecutive chars like [...12...] and [...ab...] are presumed
@@ -4727,23 +4863,36 @@ S_intuit_more(pTHX_ char *s, char *e)
4727
4863
/* But repeating a character inside a character class does nothing,
4728
4864
* like [aba], so less likely that someone makes such a class, more
4729
4865
* likely that it is a subscript; the more repeats, the less
4730
- * likely. */
4866
+ * likely.
4867
+ *
4868
+ * khw: I think this changes the weight too rapidly. Each time
4869
+ * through the loop compounds the previous times. Instead, it
4870
+ * would be better to have a separate loop after all the rest that
4871
+ * changes the weight once based on how many times each character
4872
+ * gets repeated */
4731
4873
weight -= seen [un_char ];
4732
4874
break ;
4733
4875
} /* End of switch */
4734
4876
4735
4877
/* khw: 'seen' is declared as a char. This ++ can cause it to wrap.
4736
4878
* This gives different results with compilers for which a plain 'char'
4737
- * is actually unsigned, versus those where it is signed. I believe it
4738
- * is undefined behavior to wrap a 'signed'. I think it should be
4739
- * instead declared an unsigned int to make the chances of wrapping
4740
- * essentially zero.
4879
+ * is actually unsigned, versus those where it is signed. The C99
4880
+ * standard allows a compiler to raise a signal when a 'signed' char
4881
+ * is incremented outside its permissible range. I think 'seen'
4882
+ * should be instead declared an unsigned, and a conditional added
4883
+ * to prevent wrapping.
4741
4884
*
4742
4885
* And I believe that extra backslashes are different from other
4743
- * repeated characters. */
4886
+ * repeated characters. There may be others, like I have mentioned
4887
+ * quotes and paired delimiters */
4744
4888
seen [un_char ]++ ;
4745
4889
} /* End of loop through each character of the construct */
4746
4890
4891
+ /* khw: People on #irc have suggested things that I think boil down to:
4892
+ * under 'use 5.43.x', output a warning like existing warnings for
4893
+ * similar situations "Ambiguous use of [], resolved as ..." Perhaps
4894
+ * suppress the message if all (or maybe almost all) the evidence points
4895
+ * to the same outcome. This would involve two weight variables */
4747
4896
if (weight >= 0 ) /* probably a character class */
4748
4897
return FALSE;
4749
4898
0 commit comments