fix: language server windows issues (#4821)

This PR resolves two language server bugs that especially affect Windows users: 1. Editing the header could result in the watchdog not correctly restarting the file worker (#3786, #3787), which would lead to the file seemingly being processed forever. - The cause of this issue was a race condition in the watchdog that was accidentally introduced as far back as #1884: In specific circumstances, the watchdog will attempt forwarding a message to the file worker after the process has exited due to a changed header, but before the file worker exiting has been noticed by the watchdog (which will then restart the file worker). In this case, the watchdog would mark the file worker as having crashed and not look at its exit code to restart the file worker, but instead treat it like a crashed file worker that will only be restarted when editing the file again. Not inspecting the exit code of the file worker when it crashed from forwarding a message from the file worker is necessary since we do not restart the file worker until another notification from the client arrives, and so we would read the same crash exit code over and over again in the main loop of the watchdog if we did not remove it from our list of file workers that we listen to. - This PR resolves this issue by distinguishing between "crashes when forwarding messages to the file worker" and "crashes when forwarding messages from the file worker". In the former case, we still inspect the exit code of the file worker and potentially restart it if the imports changed, whereas in the latter case, we stop inspecting the exit code of the file worker. This is correct because the latter case is exactly the one where we need to stop inspecting the exit code but where a crash cannot occur as a result of a changed header, whereas the former case is exactly the one where we still need to inspect the exit code after a crash to ensure that we restart the file worker in case it exited because the header changed. - At some point in the future, it would be nice to revamp the concurrency model of the watchdog entirely now that we have all those fancy concurrency primitives that were not available four years ago when the watchdog was first written. 2. On an especially slow Windows machine, we found that starting the language server would sometimes not succeed at all because reading from the stdin pipe in the watchdog produced an EINVAL error, which was in turn caused by an NT "pipe empty" error. - After lots of debugging, @Kha found that Lake accidentally passes its stdin to Git because it does not explicitly set the `stdin` field to `null` when spawning the process. - Changing this fixes the issue, which suggests that Git may mutate the pipe we pass to it to be non-blocking, which then causes a "pipe empty" error in the watchdog when we also attempt to read from that same pipe. - I'm still very uncertain why we only saw this issue on one particularly slow machine and not across the whole eco system. This PR also resolves an issue where we would not correctly emit messages that we received while the file worker is being restarted to the corresponding file worker after the restart. Closes #3786, closes #3787. --------- Co-authored-by: Sebastian Ullrich <sebasti@nullri.ch>
leanprover · Aug 7, 2024 · 574066b · 574066b
1 parent 1e6d617
commit 574066b
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 18 deletions.
diff --git a/src/Lean/Server/Watchdog.lean b/src/Lean/Server/Watchdog.lean
@@ -90,6 +90,10 @@ section Utils
     | crashed (e : IO.Error)
     | ioError (e : IO.Error)
 
+  inductive CrashOrigin
+    | fileWorkerToClientForwarding
+    | clientToFileWorkerForwarding
+
   inductive WorkerState where
     /-- The watchdog can detect a crashed file worker in two places: When trying to send a message
       to the file worker and when reading a request reply.
@@ -98,7 +102,7 @@ section Utils
       that are in-flight are errored. Upon receiving the next packet for that file worker, the file
       worker is restarted and the packet is forwarded to it. If the crash was detected while writing
       a packet, we queue that packet until the next packet for the file worker arrives. -/
-    | crashed (queuedMsgs : Array JsonRpc.Message)
+    | crashed (queuedMsgs : Array JsonRpc.Message) (origin : CrashOrigin)
     | running
 
   abbrev PendingRequestMap := RBMap RequestID JsonRpc.Message compare
@@ -136,6 +140,11 @@ section FileWorker
     for ⟨id, _⟩ in pendingRequests do
       hError.writeLspResponseError { id := id, code := code, message := msg }
 
+  def queuedMsgs (fw : FileWorker) : Array JsonRpc.Message :=
+    match fw.state with
+    | .running => #[]
+    | .crashed queuedMsgs _ => queuedMsgs
+
   end FileWorker
 end FileWorker
 
@@ -404,10 +413,23 @@ section ServerM
       return
     eraseFileWorker uri
 
-  def handleCrash (uri : DocumentUri) (queuedMsgs : Array JsonRpc.Message) : ServerM Unit := do
+  def handleCrash (uri : DocumentUri) (queuedMsgs : Array JsonRpc.Message) (origin: CrashOrigin) : ServerM Unit := do
     let some fw ← findFileWorker? uri
       | return
-    updateFileWorkers { fw with state := WorkerState.crashed queuedMsgs }
+    updateFileWorkers { fw with state := WorkerState.crashed queuedMsgs origin }
+
+  def tryDischargeQueuedMessages (uri : DocumentUri) (queuedMsgs : Array JsonRpc.Message) : ServerM Unit := do
+      let some fw ← findFileWorker? uri
+        | throwServerError "Cannot find file worker for '{uri}'."
+      let mut crashedMsgs := #[]
+      -- Try to discharge all queued msgs, tracking the ones that we can't discharge
+      for msg in queuedMsgs do
+        try
+          fw.stdin.writeLspMessage msg
+        catch _ =>
+          crashedMsgs := crashedMsgs.push msg
+      if ¬ crashedMsgs.isEmpty then
+        handleCrash uri crashedMsgs .clientToFileWorkerForwarding
 
   /-- Tries to write a message, sets the state of the FileWorker to `crashed` if it does not succeed
       and restarts the file worker if the `crashed` flag was already set. Just logs an error if
@@ -423,7 +445,7 @@ section ServerM
     let some fw ← findFileWorker? uri
       | return
     match fw.state with
-    | WorkerState.crashed queuedMsgs =>
+    | WorkerState.crashed queuedMsgs _ =>
       let mut queuedMsgs := queuedMsgs
       if queueFailedMessage then
         queuedMsgs := queuedMsgs.push msg
@@ -432,17 +454,7 @@ section ServerM
       -- restart the crashed FileWorker
       eraseFileWorker uri
       startFileWorker fw.doc
-      let some newFw ← findFileWorker? uri
-        | throwServerError "Cannot find file worker for '{uri}'."
-      let mut crashedMsgs := #[]
-      -- try to discharge all queued msgs, tracking the ones that we can't discharge
-      for msg in queuedMsgs do
-        try
-          newFw.stdin.writeLspMessage msg
-        catch _ =>
-          crashedMsgs := crashedMsgs.push msg
-      if ¬ crashedMsgs.isEmpty then
-        handleCrash uri crashedMsgs
+      tryDischargeQueuedMessages uri queuedMsgs
     | WorkerState.running =>
       let initialQueuedMsgs :=
         if queueFailedMessage then
@@ -452,7 +464,7 @@ section ServerM
       try
         fw.stdin.writeLspMessage msg
       catch _ =>
-        handleCrash uri initialQueuedMsgs
+        handleCrash uri initialQueuedMsgs .clientToFileWorkerForwarding
 
   /--
   Sends a notification to the file worker identified by `uri` that its dependency `staleDependency`
@@ -955,7 +967,16 @@ section MainLoop
     let workers ← st.fileWorkersRef.get
     let mut workerTasks := #[]
     for (_, fw) in workers do
-      if let WorkerState.running := fw.state then
+      -- When the forwarding task crashes, its return value will be stuck at
+      -- `WorkerEvent.crashed _`.
+      -- We want to handle this event only once, not over and over again,
+      -- so once the state becomes `WorkerState.crashed _ .fileWorkerToClientForwarding`
+      -- as a result of `WorkerEvent.crashed _`, we stop handling this event until
+      -- eventually the file worker is restarted by a notification from the client.
+      -- We do not want to filter the forwarding task in case of
+      -- `WorkerState.crashed _ .clientToFileWorkerForwarding`, since the forwarding task
+      -- exit code may still contain valuable information in this case (e.g. that the imports changed).
+      if !(fw.state matches WorkerState.crashed _ .fileWorkerToClientForwarding) then
         workerTasks := workerTasks.push <| fw.commTask.map (ServerEvent.workerEvent fw)
 
     let ev ← IO.waitAny (clientTask :: workerTasks.toList)
@@ -984,13 +1005,16 @@ section MainLoop
       | WorkerEvent.ioError e =>
         throwServerError s!"IO error while processing events for {fw.doc.uri}: {e}"
       | WorkerEvent.crashed _ =>
-        handleCrash fw.doc.uri #[]
+        handleCrash fw.doc.uri fw.queuedMsgs .fileWorkerToClientForwarding
         mainLoop clientTask
       | WorkerEvent.terminated =>
         throwServerError <| "Internal server error: got termination event for worker that "
           ++ "should have been removed"
       | .importsChanged =>
+        let uri := fw.doc.uri
+        let queuedMsgs := fw.queuedMsgs
         startFileWorker fw.doc
+        tryDischargeQueuedMessages uri queuedMsgs
         mainLoop clientTask
 end MainLoop
 

diff --git a/src/lake/Lake/Util/Proc.lean b/src/lake/Lake/Util/Proc.lean
@@ -57,6 +57,7 @@ def testProc (args : IO.Process.SpawnArgs) : BaseIO Bool :=
   EIO.catchExceptions (h := fun _ => pure false) do
     let child ← IO.Process.spawn {
       args with
+      stdin := IO.Process.Stdio.null
       stdout := IO.Process.Stdio.null
       stderr := IO.Process.Stdio.null
     }