Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

match beginning and end of line correctly #3575

Merged
merged 1 commit into from
Feb 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions internal/buffer/loc.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@ func (l Loc) LessEqual(b Loc) bool {
return l == b
}

// Clamp restricts l to the range delimited by start and end.
// It returns end when l is at or beyond end, start when l lies before
// start, and l itself in every other case. The upper bound is checked
// first, so end wins if both conditions hold.
func (l Loc) Clamp(start, end Loc) Loc {
	switch {
	case l.GreaterEqual(end):
		return end
	case l.LessThan(start):
		return start
	default:
		return l
	}
}

// The following functions require a buffer to know where newlines are

// Diff returns the distance between two locations
Expand Down Expand Up @@ -139,10 +149,5 @@ func ByteOffset(pos Loc, buf *Buffer) int {

// clamp restricts pos to the extent of the line array: the result is
// la.End() when pos is at or beyond it, la.Start() when pos lies before it,
// and pos itself otherwise. The work is delegated to Loc.Clamp so that the
// clamping rule is implemented in exactly one place.
func clamp(pos Loc, la *LineArray) Loc {
	return pos.Clamp(la.Start(), la.End())
}
202 changes: 121 additions & 81 deletions internal/buffer/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,56 @@ package buffer

import (
"regexp"
"unicode/utf8"

"github.com/zyedidia/micro/v2/internal/util"
)

// We want "^" and "$" to match only the beginning/end of a line, not the
// beginning/end of the search region if it is in the middle of a line.
// In that case we use padded regexps to require a rune before or after
// the match. (This also affects other empty-string patterns like "\\b".)
// The following two flags indicate the padding used. They are bit flags
// and may be combined with "|" when both kinds of padding apply.
const (
// padStart: the regexp was padded to require a rune before the match
padStart = 1 << iota
// padEnd: the regexp was padded to require a rune after the match
padEnd
)

// findLineParams prepares everything needed to search line i of b when the
// search region runs from start to end.
//
// It returns, in order:
//   - the bytes of the line that should be handed to the regexp,
//   - the character offset to add to positions found in those bytes,
//   - the padding mode (a combination of padStart and padEnd, or 0),
//   - the regexp to use: r itself when no padding applies, otherwise a
//     newly compiled regexp wrapping r with a leading and/or trailing ".".
//
// Padding is applied when the region starts after the beginning of the line
// (padStart) or ends before the end of the line (padEnd). In those cases the
// slice is taken one position wider than the region (end.X+1 and start.X-1),
// presumably so that the extra "." has a rune to consume; callers have to
// strip that rune from the reported match again.
func findLineParams(b *Buffer, start, end Loc, i int, r *regexp.Regexp) ([]byte, int, int, *regexp.Regexp) {
	line := b.LineBytes(i)
	offset := 0
	pad := 0

	// The end of the region is handled first on purpose: the start position
	// below is clamped against the line as already cut at the end.
	if end.Y == i {
		n := util.CharacterCount(line)
		end.X = util.Clamp(end.X, 0, n)
		if end.X < n {
			line = util.SliceStart(line, end.X+1)
			pad |= padEnd
		}
	}

	if start.Y == i {
		n := util.CharacterCount(line)
		start.X = util.Clamp(start.X, 0, n)
		if start.X > 0 {
			offset = start.X - 1
			line = util.SliceEnd(line, offset)
			pad |= padStart
		}
	}

	// Wrap the original expression in a non-capturing group so that the
	// padding applies to the expression as a whole.
	switch pad {
	case padStart:
		r = regexp.MustCompile(".(?:" + r.String() + ")")
	case padEnd:
		r = regexp.MustCompile("(?:" + r.String() + ").")
	case padStart | padEnd:
		r = regexp.MustCompile(".(?:" + r.String() + ").")
	}

	return line, offset, pad, r
}

func (b *Buffer) findDown(r *regexp.Regexp, start, end Loc) ([2]Loc, bool) {
lastcn := util.CharacterCount(b.LineBytes(b.LinesNum() - 1))
if start.Y > b.LinesNum()-1 {
Expand All @@ -22,30 +68,19 @@ func (b *Buffer) findDown(r *regexp.Regexp, start, end Loc) ([2]Loc, bool) {
}

for i := start.Y; i <= end.Y; i++ {
l := b.LineBytes(i)
charpos := 0

if i == start.Y && start.Y == end.Y {
nchars := util.CharacterCount(l)
start.X = util.Clamp(start.X, 0, nchars)
end.X = util.Clamp(end.X, 0, nchars)
l = util.SliceStart(l, end.X)
l = util.SliceEnd(l, start.X)
charpos = start.X
} else if i == start.Y {
nchars := util.CharacterCount(l)
start.X = util.Clamp(start.X, 0, nchars)
l = util.SliceEnd(l, start.X)
charpos = start.X
} else if i == end.Y {
nchars := util.CharacterCount(l)
end.X = util.Clamp(end.X, 0, nchars)
l = util.SliceStart(l, end.X)
}
l, charpos, padMode, rPadded := findLineParams(b, start, end, i, r)

match := r.FindIndex(l)
match := rPadded.FindIndex(l)

if match != nil {
if padMode&padStart != 0 {
_, size := utf8.DecodeRune(l[match[0]:])
match[0] += size
}
if padMode&padEnd != 0 {
_, size := utf8.DecodeLastRune(l[:match[1]])
match[1] -= size
}
matthias314 marked this conversation as resolved.
Show resolved Hide resolved
start := Loc{charpos + util.RunePos(l, match[0]), i}
end := Loc{charpos + util.RunePos(l, match[1]), i}
return [2]Loc{start, end}, true
Expand All @@ -70,39 +105,39 @@ func (b *Buffer) findUp(r *regexp.Regexp, start, end Loc) ([2]Loc, bool) {
}

for i := end.Y; i >= start.Y; i-- {
l := b.LineBytes(i)
charpos := 0

if i == start.Y && start.Y == end.Y {
nchars := util.CharacterCount(l)
start.X = util.Clamp(start.X, 0, nchars)
end.X = util.Clamp(end.X, 0, nchars)
l = util.SliceStart(l, end.X)
l = util.SliceEnd(l, start.X)
charpos = start.X
} else if i == start.Y {
nchars := util.CharacterCount(l)
start.X = util.Clamp(start.X, 0, nchars)
l = util.SliceEnd(l, start.X)
charpos = start.X
} else if i == end.Y {
nchars := util.CharacterCount(l)
end.X = util.Clamp(end.X, 0, nchars)
l = util.SliceStart(l, end.X)
}

allMatches := r.FindAllIndex(l, -1)
charCount := util.CharacterCount(b.LineBytes(i))
from := Loc{0, i}.Clamp(start, end)
to := Loc{charCount, i}.Clamp(start, end)

allMatches := b.findAll(r, from, to)
if allMatches != nil {
match := allMatches[len(allMatches)-1]
start := Loc{charpos + util.RunePos(l, match[0]), i}
end := Loc{charpos + util.RunePos(l, match[1]), i}
return [2]Loc{start, end}, true
return [2]Loc{match[0], match[1]}, true
}
}
return [2]Loc{}, false
}

// findAll collects every match of r between start and end by calling
// findDown repeatedly, each time resuming after the previous match.
// The matches are returned in the order they were found; the result is nil
// when there is no match at all.
func (b *Buffer) findAll(r *regexp.Regexp, start, end Loc) [][2]Loc {
	var found [][2]Loc
	cur := start
	for {
		m, ok := b.findDown(r, cur, end)
		if !ok {
			return found
		}
		found = append(found, m)

		switch {
		case m[0] != m[1]:
			// Non-empty match: continue right behind it.
			cur = m[1]
		case m[1] == end:
			// Empty match at the very end of the region: nothing left to search.
			return found
		default:
			// Empty match: step one position forward so that the next
			// search does not start at the same location again.
			cur = m[1].Move(1, b)
		}
	}
}

// FindNext finds the next occurrence of a given string in the buffer
// It returns the start and end location of the match (if found) and
// a boolean indicating if it was found
Expand Down Expand Up @@ -146,53 +181,58 @@ func (b *Buffer) FindNext(s string, start, end, from Loc, down bool, useRegex bo
}

// ReplaceRegex replaces all occurrences of 'search' with 'replace' in the given area
// and returns the number of replacements made and the number of runes
// and returns the number of replacements made and the number of characters
// added or removed on the last line of the range
func (b *Buffer) ReplaceRegex(start, end Loc, search *regexp.Regexp, replace []byte, captureGroups bool) (int, int) {
if start.GreaterThan(end) {
start, end = end, start
}

netrunes := 0

charsEnd := util.CharacterCount(b.LineBytes(end.Y))
found := 0
var deltas []Delta

for i := start.Y; i <= end.Y; i++ {
l := b.lines[i].data
charpos := 0

if start.Y == end.Y && i == start.Y {
l = util.SliceStart(l, end.X)
l = util.SliceEnd(l, start.X)
charpos = start.X
} else if i == start.Y {
l = util.SliceEnd(l, start.X)
charpos = start.X
} else if i == end.Y {
l = util.SliceStart(l, end.X)
}
newText := search.ReplaceAllFunc(l, func(in []byte) []byte {
var result []byte
if captureGroups {
for _, submatches := range search.FindAllSubmatchIndex(in, -1) {
result = search.Expand(result, replace, in, submatches)
l := b.LineBytes(i)
charCount := util.CharacterCount(l)
if (i == start.Y && start.X > 0) || (i == end.Y && end.X < charCount) {
// This replacement code works in general, but it creates a separate
// modification for each match. We only use it for the first and last
// lines, which may use padded regexps

from := Loc{0, i}.Clamp(start, end)
to := Loc{charCount, i}.Clamp(start, end)
matches := b.findAll(search, from, to)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still, what is the advantage of using findAll() to remember all matches in some temporary storage to handle them afterwards, instead of just using findDown() and handling each match right away?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One could do that. I wrote the code with the idea in mind to have some public FindAll function at a later point. That code would be based on the current findAll. One could change findAll to some findAllFunc that takes a callback function as argument (similar to ReplaceAllFunc). This way the problem of creating temporary lists would disappear. (That would also be a good idea for a public function.) Currently findAll is only called for the first and last lines, so that performance doesn't really matter. That would change of course if we used this function for all lines.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Somehow I missed the email notification about your comments a week ago...

One could do that. I wrote the code with the idea in mind to have some public FindAll function at a later point. That code would be based on the current findAll. One could change findAll to some findAllFunc that takes a callback function as argument (similar to ReplaceAllFunc).

Still, what for? Why not keep things simple?

Currently findAll is only called for the first and last lines, so that performance doesn't really matter.

A single line may be huge. (Although that's an edge case, and currently micro handles huge lines very poorly anyway, for unrelated reasons, due to the dumb data structure with O(n) access, which IMO we should fix some day, because that's a shame.)

But my point is not so much about performance but about simplicity. We have a chance to try to make the logic not just more correct but at the same time simpler, by simply calling findDown() in a loop for the entire range, instead of complicating the code with more special cases, so why not take this chance?

Copy link
Contributor Author

@matthias314 matthias314 Feb 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want to call MultipleReplace only once in ReplaceRegex with all deltas at once (as done in master and this PR), then I believe we cannot deal with each match right away (unless we modify MultipleReplace). The problem is that multiple deltas on a single line only work if the last (= rightmost) change comes first. If we simply want to call findDown repeatedly over the whole buffer, then we have to store all matches in a list (currently done in findAll) and then process this list backwards (done in ReplaceRegex). (My understanding is that there is no efficient way in Go to prepend to a slice.)

Apart from this list reversal, this would indeed lead to fairly simple code. Padded regexps would be compiled for almost every call to findDown. If one doesn't want that, then one could create the padded regexps before calling findDown.

Please let me know how you want to proceed.

EDIT: Modifying the processing of deltas should be easy to do (in ExecuteTextEvent). As far as I can tell, MultipleReplace is the only place where a TextEvent with multiple deltas is created, and ReplaceRegex is the only function calling MultipleReplace.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want to call MultipleReplace only once in ReplaceRegex with all deltas at once (as done in master and this PR), then I believe we cannot deal with each match right away (unless we modify MultipleReplace).

Indeed, good point. Thanks for explanations.

I've analyzed all this in more detail and now it seems to me that the current approach in this PR is already as simple and practical as it can be without further refactoring of EventHandler (which we can do separately, if we want to).

Padded regexps would be compiled for almost every call to findDown.

Yeah, that's also an argument.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you be interested in a separate PR that modifies ExecuteTextEvent? In a separate commit I could modify the search code (precompile padded regexps and use the findDown method only in ReplaceRegex).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you be interested in a separate PR that modifies ExecuteTextEvent?

If it goes well, why not?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's now the first part of #3658.

found += len(matches)

for j := len(matches) - 1; j >= 0; j-- {
// if we counted upwards, the different deltas would interfere
match := matches[j]
var newText []byte
if captureGroups {
newText = search.ReplaceAll(b.Substr(match[0], match[1]), replace)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've noticed that because of this ReplaceAll() usage, matching \b still doesn't work fully correctly:

  1. replace \\b aaa (interactive replace) -> finds matches but doesn't replace them (IOW correctly finds the next beginning or end of a word, but doesn't insert aaa here).
  2. replace \\b aaa -a with selected text -> if the first or last line of the selection is not fully covered by the selection, it doesn't replace matches in this (first or last) line.

This "fixes" both problems for me:

diff --git a/internal/buffer/search.go b/internal/buffer/search.go
index a48e1f87..f2e645e3 100644
--- a/internal/buffer/search.go
+++ b/internal/buffer/search.go
@@ -209,7 +209,7 @@ func (b *Buffer) ReplaceRegex(start, end Loc, search *regexp.Regexp, replace []b
 				// if we counted upwards, the different deltas would interfere
 				match := matches[j]
 				var newText []byte
-				if captureGroups {
+				if captureGroups && match[0] != match[1] {
 					newText = search.ReplaceAll(b.Substr(match[0], match[1]), replace)
 				} else {
 					newText = replace

but this is an incorrect hack (at least because we should not disable expanding ${1} etc even if the match is empty).

I'm not sure how to fix it correctly.

Copy link
Contributor Author

@matthias314 matthias314 Feb 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This bug is in master already. The problem is to apply the given regexp to b.Substr(match[0], match[1]) because this substring may fail to have a word boundary matching \b.

The good news is that this problem does not seem to exist in my draft PR #3658. There, instead of using (*Regexp).ReplaceAll(), I store the match and use it for (*Regexp).Expand(),

https://github.com/matthias314/micro/blob/bab91644e3ae8a90371f145b1f6106ad67619886/internal/buffer/search.go#L326-L333

For replaceall that should fix the problem. It also seems to work for interactive replacements although the new function (*Buffer).ReplaceAll() (which replaced (*Buffer).ReplaceRegex()) is called there. I still need to understand why. Even if there is a problem, one should be able to solve it by using Expand there, too.

EDIT: I think that #3658 works for interactive replaces, too. The reason is that the added padding makes the problem of an empty match disappear, and then (*Regexp).Expand() works as for replaceall.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, for sure this is not a regression.

The good news is that this problem does not seem to exist in my draft PR #3658. There, instead of using (*Regexp).ReplaceAll(), I store the match and use it for (*Regexp).Expand()

Cool. Yeah I see, to fix the problem we just need to Expand() on the original line, but that seems tricky to do without some rework in EventHandler first.

I'll take a look at #3658 once I have some time. In the meantime I think we can merge this PR now anyway.

} else {
newText = replace
}
} else {
result = replace
deltas = append(deltas, Delta{newText, match[0], match[1]})
}
found++
if i == end.Y {
netrunes += util.CharacterCount(result) - util.CharacterCount(in)
}
return result
})

from := Loc{charpos, i}
to := Loc{charpos + util.CharacterCount(l), i}

deltas = append(deltas, Delta{newText, from, to})
} else {
newLine := search.ReplaceAllFunc(l, func(in []byte) []byte {
found++
var result []byte
if captureGroups {
match := search.FindSubmatchIndex(in)
result = search.Expand(result, replace, in, match)
} else {
result = replace
}
return result
})
deltas = append(deltas, Delta{newLine, Loc{0, i}, Loc{charCount, i}})
}
}

b.MultipleReplace(deltas)

return found, netrunes
return found, util.CharacterCount(b.LineBytes(end.Y)) - charsEnd
}