 import asyncio
 import datetime
 import logging
-import traceback
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from typing import Final
@@ -551,7 +550,7 @@ async def _start_tasks(
         project_id: ProjectID,
         scheduled_tasks: dict[NodeID, CompTaskAtDB],
         pipeline_params: ScheduledPipelineParams,
-    ) -> list:
+    ) -> None:
         ...

     @abstractmethod
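The `list` -> `None` change above pairs with the error-handling rework further down in this diff: `_start_tasks` no longer returns per-task results for the caller to inspect, it simply raises. Below is a minimal sketch of the two contracts; `submit_one` and `nodes` are hypothetical stand-ins, and the assumption that the old code used gather(return_exceptions=True) comes from the removed per-result loop, not from code shown in this diff.

# Minimal sketch of the old vs. new contract of _start_tasks(), not PR code.
import asyncio


async def submit_one(node: str) -> None:
    """Submit a single task to the backend; may raise on failure."""


async def start_tasks_old(nodes: list[str]) -> list:
    # old contract: return one result (or exception) per task and let the
    # scheduler post-process the list
    return await asyncio.gather(
        *(submit_one(n) for n in nodes), return_exceptions=True
    )


async def start_tasks_new(nodes: list[str]) -> None:
    # new contract: raise on failure; the scheduler's except-blocks decide
    # whether tasks end up FAILED or back in WAITING_FOR_CLUSTER
    await asyncio.gather(*(submit_one(n) for n in nodes))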
@@ -663,7 +662,7 @@ async def _schedule_pipeline(
                 user_id, project_id, iteration, RunningState.ABORTED
             )
             self.scheduled_pipelines.pop((user_id, project_id, iteration), None)
-        except DaskClientAcquisisitonError:
+        except (DaskClientAcquisisitonError, ClustersKeeperNotAvailableError):
             _logger.exception(
                 "Unexpected error while connecting with computational backend, aborting pipeline"
             )
@@ -692,12 +691,14 @@ async def _schedule_tasks_to_stop(
     ) -> None:
         # get any running task and stop them
         comp_tasks_repo = CompTasksRepository.instance(self.db_engine)
-        await comp_tasks_repo.mark_project_published_tasks_as_aborted(project_id)
+        await comp_tasks_repo.mark_project_published_waiting_for_cluster_tasks_as_aborted(
+            project_id
+        )
         # stop any remaining running task, these are already submitted
         tasks_to_stop = [t for t in comp_tasks.values() if t.state in PROCESSING_STATES]
         await self._stop_tasks(user_id, tasks_to_stop, pipeline_params)

-    async def _schedule_tasks_to_start(
+    async def _schedule_tasks_to_start(  # noqa: C901
         self,
         user_id: UserID,
         project_id: ProjectID,
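The renamed repository call above reflects that stopping a pipeline must now also abort tasks parked in WAITING_FOR_CLUSTER, not only PUBLISHED ones. A hypothetical sketch of what such a helper could do follows; the real repository implementation is not part of this diff, and the table and column names below are made up.

# Hypothetical illustration only: abort tasks that never made it past
# PUBLISHED or WAITING_FOR_CLUSTER when a pipeline is stopped.
import sqlalchemy as sa


async def mark_project_published_waiting_for_cluster_tasks_as_aborted(
    connection, comp_tasks: sa.Table, project_id: str
) -> None:
    await connection.execute(
        sa.update(comp_tasks)
        .where(
            (comp_tasks.c.project_id == project_id)
            & (comp_tasks.c.state.in_(["PUBLISHED", "WAITING_FOR_CLUSTER"]))
        )
        .values(state="ABORTED")
    )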
@@ -729,77 +730,32 @@ async def _schedule_tasks_to_start(
             return comp_tasks

         try:
-            results = await self._start_tasks(
+            await self._start_tasks(
                 user_id=user_id,
                 project_id=project_id,
                 scheduled_tasks=tasks_ready_to_start,
                 pipeline_params=pipeline_params,
             )
+        except (
+            ComputationalBackendNotConnectedError,
+            ComputationalSchedulerChangedError,
+        ):
+            _logger.exception(
+                "Issue with computational backend. Tasks are set back "
+                "to WAITING_FOR_CLUSTER state until scheduler comes back!",
+            )
+            await CompTasksRepository.instance(
+                self.db_engine
+            ).update_project_tasks_state(
+                project_id,
+                list(tasks_ready_to_start.keys()),
+                RunningState.WAITING_FOR_CLUSTER,
+            )
+            for task in tasks_ready_to_start:
+                comp_tasks[
+                    NodeIDStr(f"{task}")
+                ].state = RunningState.WAITING_FOR_CLUSTER

-            # Handling errors raised when _start_tasks(...)
-            for r, t in zip(results, tasks_ready_to_start, strict=True):
-                if isinstance(r, TaskSchedulingError):
-                    _logger.error(
-                        "Project '%s''s task '%s' could not be scheduled due to the following: %s",
-                        r.project_id,
-                        r.node_id,
-                        f"{r}",
-                    )
-
-                    await CompTasksRepository.instance(
-                        self.db_engine
-                    ).update_project_tasks_state(
-                        project_id,
-                        [r.node_id],
-                        RunningState.FAILED,
-                        r.get_errors(),
-                        optional_progress=1.0,
-                        optional_stopped=arrow.utcnow().datetime,
-                    )
-                    comp_tasks[NodeIDStr(f"{t}")].state = RunningState.FAILED
-                elif isinstance(
-                    r,
-                    ComputationalBackendNotConnectedError
-                    | ComputationalSchedulerChangedError,
-                ):
-                    _logger.error(
-                        "Issue with computational backend: %s. Tasks are set back "
-                        "to WAITING_FOR_CLUSTER state until scheduler comes back!",
-                        r,
-                    )
-                    # we should try re-connecting.
-                    # in the meantime we cannot schedule tasks on the scheduler,
-                    # let's put these tasks back to WAITING_FOR_CLUSTER, so they might be re-submitted later
-                    await CompTasksRepository.instance(
-                        self.db_engine
-                    ).update_project_tasks_state(
-                        project_id,
-                        list(tasks_ready_to_start.keys()),
-                        RunningState.WAITING_FOR_CLUSTER,
-                    )
-                    comp_tasks[
-                        NodeIDStr(f"{t}")
-                    ].state = RunningState.WAITING_FOR_CLUSTER
-                elif isinstance(r, Exception):
-                    _logger.error(
-                        "Unexpected error for %s with %s on %s happened when scheduling %s:\n%s\n%s",
-                        f"{user_id=}",
-                        f"{project_id=}",
-                        f"{pipeline_params.cluster_id=}",
-                        f"{tasks_ready_to_start.keys()=}",
-                        f"{r}",
-                        "".join(traceback.format_tb(r.__traceback__)),
-                    )
-                    await CompTasksRepository.instance(
-                        self.db_engine
-                    ).update_project_tasks_state(
-                        project_id,
-                        [t],
-                        RunningState.FAILED,
-                        optional_progress=1.0,
-                        optional_stopped=arrow.utcnow().datetime,
-                    )
-                    comp_tasks[NodeIDStr(f"{t}")].state = RunningState.FAILED
         except ComputationalBackendOnDemandNotReadyError as exc:
             _logger.info(
                 "The on demand computational backend is not ready yet: %s", exc
@@ -819,8 +775,10 @@ async def _schedule_tasks_to_start(
                 list(tasks_ready_to_start.keys()),
                 RunningState.WAITING_FOR_CLUSTER,
             )
-            for task in comp_tasks.values():
-                task.state = RunningState.WAITING_FOR_CLUSTER
+            for task in tasks_ready_to_start:
+                comp_tasks[
+                    NodeIDStr(f"{task}")
+                ].state = RunningState.WAITING_FOR_CLUSTER
         except ClustersKeeperNotAvailableError:
             _logger.exception("Unexpected error while starting tasks:")
             await publish_project_log(
@@ -840,8 +798,46 @@ async def _schedule_tasks_to_start(
                 optional_progress=1.0,
                 optional_stopped=arrow.utcnow().datetime,
             )
-            for task in comp_tasks.values():
-                task.state = RunningState.FAILED
+            for task in tasks_ready_to_start:
+                comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED
+            raise
+        except TaskSchedulingError as exc:
+            _logger.exception(
+                "Project '%s''s task '%s' could not be scheduled",
+                exc.project_id,
+                exc.node_id,
+            )
+            await CompTasksRepository.instance(
+                self.db_engine
+            ).update_project_tasks_state(
+                project_id,
+                [exc.node_id],
+                RunningState.FAILED,
+                exc.get_errors(),
+                optional_progress=1.0,
+                optional_stopped=arrow.utcnow().datetime,
+            )
+            comp_tasks[NodeIDStr(f"{exc.node_id}")].state = RunningState.FAILED
+        except Exception:
+            _logger.exception(
+                "Unexpected error for %s with %s on %s happened when scheduling %s:",
+                f"{user_id=}",
+                f"{project_id=}",
+                f"{pipeline_params.cluster_id=}",
+                f"{tasks_ready_to_start.keys()=}",
+            )
+            await CompTasksRepository.instance(
+                self.db_engine
+            ).update_project_tasks_state(
+                project_id,
+                list(tasks_ready_to_start.keys()),
+                RunningState.FAILED,
+                optional_progress=1.0,
+                optional_stopped=arrow.utcnow().datetime,
+            )
+            for task in tasks_ready_to_start:
+                comp_tasks[NodeIDStr(f"{task}")].state = RunningState.FAILED
+            raise

         return comp_tasks
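Taken together, the except-blocks added to `_schedule_tasks_to_start` define a single error-to-state policy. Spelled out as plain data for readability (a reading aid summarizing the hunks above, not code from this PR):

# How each exception raised while starting tasks maps onto task state,
# as implemented by the new except-blocks (summary only, not PR code).
ERROR_TO_STATE = {
    "ComputationalBackendNotConnectedError": "WAITING_FOR_CLUSTER",  # backend unreachable, retry later
    "ComputationalSchedulerChangedError": "WAITING_FOR_CLUSTER",  # scheduler restarted, retry later
    "ComputationalBackendOnDemandNotReadyError": "WAITING_FOR_CLUSTER",  # on-demand cluster still starting
    "ClustersKeeperNotAvailableError": "FAILED",  # clusters-keeper down, error re-raised
    "TaskSchedulingError": "FAILED",  # only the offending node is failed
    "Exception": "FAILED",  # anything unexpected, error re-raised
}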