@@ -119,6 +119,7 @@ static int rss_file;
119
119
static int compress ;
120
120
static int checksum ;
121
121
static int service ;
122
+ static unsigned int timeout ;
122
123
123
124
124
125
/*
 * Bit mask with only bit number x set, computed as unsigned long long.
 * The argument is parenthesized so that expressions with operators of lower
 * precedence than << (e.g. BIT(a | b), BIT(c ? 1 : 2)) expand as intended.
 */
#define BIT(x) (1ULL << (x))
@@ -483,6 +484,7 @@ static int seize_pid(pid_t pid)
483
484
int status ;
484
485
siginfo_t si ;
485
486
487
+ printf ("[+] seizing tid %d\n" , pid );
486
488
ret = ptrace (PTRACE_SEIZE , pid , NULL , 0 );
487
489
if (ret ) {
488
490
if (errno == ESRCH ) {
@@ -495,6 +497,7 @@ static int seize_pid(pid_t pid)
495
497
}
496
498
497
499
try_again :
500
+
498
501
ret = ptrace (PTRACE_INTERRUPT , pid , NULL , NULL );
499
502
if (ret ) {
500
503
fprintf (stderr , "ptrace(PTRACE_INTERRUPT) pid %d: %m\n" , pid );
@@ -535,6 +538,7 @@ static int seize_pid(pid_t pid)
535
538
return 1 ;
536
539
}
537
540
541
+ printf ("[i] delivered signal %d to pid %d\n" , si .si_signo , pid );
538
542
goto try_again ;
539
543
}
540
544
@@ -549,6 +553,25 @@ static int seize_pid(pid_t pid)
549
553
return 0 ;
550
554
}
551
555
556
/*
 * Issue PTRACE_DETACH for @pid.
 *
 * The data argument is 0 and the addr argument is NULL.
 *
 * Returns the ptrace() result converted to int.
 */
static int unseize_pid(pid_t pid)
{
	long detach_ret = ptrace(PTRACE_DETACH, pid, NULL, 0);

	return detach_ret;
}
560
+
561
+ static int unseize_target (void )
562
+ {
563
+ int ret = 0 ;
564
+ int i ;
565
+
566
+ printf ("[+] unseizing target\n" );
567
+
568
+ for (i = 0 ; i < nr_threads ; i ++ )
569
+ ret |= unseize_pid (tids [i ]);
570
+ nr_threads = 0 ;
571
+
572
+ return ret ;
573
+ }
574
+
552
575
static int seize_target (pid_t pid )
553
576
{
554
577
int ret ;
@@ -573,25 +596,6 @@ static int seize_target(pid_t pid)
573
596
return 0 ;
574
597
}
575
598
576
- static int unseize_pid (pid_t pid )
577
- {
578
- return ptrace (PTRACE_DETACH , pid , NULL , 0 );
579
- }
580
-
581
- static int unseize_target (void )
582
- {
583
- int ret = 0 ;
584
- int i ;
585
-
586
- printf ("[+] unseizing target\n" );
587
-
588
- for (i = 0 ; i < nr_threads ; i ++ )
589
- ret |= unseize_pid (tids [i ]);
590
- nr_threads = 0 ;
591
-
592
- return ret ;
593
- }
594
-
595
599
static int parasite_socket_create (pid_t pid )
596
600
{
597
601
int pid_netns = -1 ;
@@ -688,6 +692,8 @@ static int __read(int fd, void *buf, size_t count, int (*check_peer_ok)(void), i
688
692
continue ;
689
693
690
694
break ;
695
+ } else if (errno == EINTR ) {
696
+ continue ;
691
697
}
692
698
693
699
if (silent == FALSE)
@@ -923,6 +929,20 @@ static void clear_pid_on_worker_exit_non_blocking(pid_t worker)
923
929
}
924
930
}
925
931
932
+ static int get_pid_worker (pid_t pid )
933
+ {
934
+ int worker = PID_INVALID ;
935
+ pthread_mutex_lock (& checkpoint_service_data_lock );
936
+ for (int i = 0 ; i < CHECKPOINTED_PIDS_LIMIT ; ++ i ) {
937
+ if (checkpoint_service_data [i ].pid == pid ) {
938
+ worker = checkpoint_service_data [i ].worker ;
939
+ break ;
940
+ }
941
+ }
942
+ pthread_mutex_unlock (& checkpoint_service_data_lock );
943
+ return worker ;
944
+ }
945
+
926
946
static int can_checkpoint_pid (pid_t pid )
927
947
{
928
948
pthread_mutex_lock (& checkpoint_service_data_lock );
@@ -2470,24 +2490,37 @@ static void try_to_abort_checkpoint(pid_t pid)
2470
2490
}
2471
2491
}
2472
2492
2473
- static void checkpoint_procedure_service (int checkpointSocket , int cd )
2493
+ static int checkpoint_procedure_service (int checkpointSocket , int cd , int pid , int worker_pid )
2474
2494
{
2475
2495
int ret ;
2476
2496
struct service_response svc_resp ;
2477
2497
2478
- fprintf (stdout , "[+] Service waiting for worker checkpoint...\n" );
2498
+ if (timeout ) {
2499
+ fprintf (stdout , "[+] Service waiting for worker checkpoint with timeout %d[s]...\n" , timeout );
2500
+ struct timeval rcv_timeout = { .tv_sec = timeout , .tv_usec = 0 };
2501
+ ret = setsockopt (checkpointSocket , SOL_SOCKET , SO_RCVTIMEO , & rcv_timeout , sizeof (rcv_timeout ));
2502
+ if (ret < 0 )
2503
+ fprintf (stderr , "[-] Error setting socket timeout: %m, waiting forever!\n" );
2504
+ } else
2505
+ fprintf (stdout , "[+] Service waiting for worker checkpoint...\n" );
2506
+
2479
2507
ret = _read (checkpointSocket , & svc_resp , sizeof (svc_resp )); // receive resp from child
2480
2508
2481
2509
if (ret == sizeof (svc_resp )) {
2482
2510
fprintf (stdout , "[+] Service received checkpoint response, informing client...\n" );
2483
2511
send_response_to_client (cd , svc_resp .resp_code );
2512
+ return svc_resp .resp_code ;
2484
2513
} else {
2485
2514
fprintf (stderr , "[!] Error reading checkpoint response from worker!\n" );
2515
+ // unnable to read response from worker, kill both
2516
+ kill (pid , SIGKILL );
2517
+ kill (worker_pid , SIGKILL );
2486
2518
send_response_to_client (cd , MEMCR_ERROR_GENERAL );
2519
+ return MEMCR_ERROR_GENERAL ;
2487
2520
}
2488
2521
}
2489
2522
2490
- static void restore_procedure_service (int cd , struct service_command svc_cmd )
2523
+ static void restore_procedure_service (int cd , struct service_command svc_cmd , int worker_pid )
2491
2524
{
2492
2525
int rd , ret = 0 ;
2493
2526
struct service_response svc_resp ;
@@ -2504,12 +2537,23 @@ static void restore_procedure_service(int cd, struct service_command svc_cmd)
2504
2537
ret = -1 ;
2505
2538
}
2506
2539
2507
- fprintf (stdout , "[+] Service waiting for worker to restore... \n" );
2540
+ if (timeout ) {
2541
+ fprintf (stdout , "[+] Service waiting for worker to restore with timeout %d[s]...\n" , timeout );
2542
+ struct timeval rcv_timeout = { .tv_sec = timeout , .tv_usec = 0 };
2543
+ ret = setsockopt (rd , SOL_SOCKET , SO_RCVTIMEO , & rcv_timeout , sizeof (rcv_timeout ));
2544
+ if (ret < 0 )
2545
+ fprintf (stderr , "[-] Error setting socket timeout: %m, waiting forever!\n" );
2546
+ } else
2547
+ fprintf (stdout , "[+] Service waiting for worker to restore... \n" );
2548
+
2508
2549
ret = _read (rd , & svc_resp , sizeof (struct service_response )); // read response from service
2509
2550
close (rd );
2510
2551
2511
2552
if (ret != sizeof (struct service_response )) {
2512
2553
fprintf (stderr , "[-] %s() read() svc_resp failed: ret %d\n" , __func__ , ret );
2554
+ // unnable to read response from worker, kill both
2555
+ kill (svc_cmd .pid , SIGKILL );
2556
+ kill (worker_pid , SIGKILL );
2513
2557
ret = -1 ;
2514
2558
}
2515
2559
@@ -2573,18 +2617,30 @@ static void *service_command_thread(void *ptr)
2573
2617
} else if (forkpid > 0 ) {
2574
2618
close (checkpoint_resp_sockets [1 ]);
2575
2619
set_pid_checkpointing (svc_ctx .svc_cmd .pid , checkpoint_resp_sockets [0 ]);
2576
- checkpoint_procedure_service (checkpoint_resp_sockets [0 ], svc_ctx .cd );
2577
- set_pid_checkpointed (svc_ctx .svc_cmd .pid , forkpid );
2620
+ if (checkpoint_procedure_service (checkpoint_resp_sockets [0 ], svc_ctx .cd ,
2621
+ svc_ctx .svc_cmd .pid , forkpid ))
2622
+ clear_pid_checkpoint_data (svc_ctx .svc_cmd .pid );
2623
+ else
2624
+ set_pid_checkpointed (svc_ctx .svc_cmd .pid , forkpid );
2625
+
2578
2626
close (checkpoint_resp_sockets [0 ]);
2579
2627
} else {
2580
2628
fprintf (stderr , "%s(): Fork error!\n" , __func__ );
2629
+ clear_pid_checkpoint_data (svc_ctx .svc_cmd .pid );
2581
2630
}
2582
2631
2583
2632
break ;
2584
2633
}
2585
2634
case MEMCR_RESTORE : {
2586
2635
fprintf (stdout , "[+] handling MEMCR_RESTORE for %d.\n" , svc_ctx .svc_cmd .pid );
2587
- restore_procedure_service (svc_ctx .cd , svc_ctx .svc_cmd );
2636
+ int worker_pid = get_pid_worker (svc_ctx .svc_cmd .pid );
2637
+ if (worker_pid == PID_INVALID ) {
2638
+ fprintf (stderr , "%s(): Error, worker pid not found for %d!\n" , __func__ , svc_ctx .svc_cmd .pid );
2639
+ send_response_to_client (svc_ctx .cd , MEMCR_ERROR_GENERAL );
2640
+ close (svc_ctx .cd );
2641
+ break ;
2642
+ }
2643
+ restore_procedure_service (svc_ctx .cd , svc_ctx .svc_cmd , worker_pid );
2588
2644
clear_pid_checkpoint_data (svc_ctx .svc_cmd .pid );
2589
2645
break ;
2590
2646
}
@@ -2800,7 +2856,8 @@ static void usage(const char *name, int status)
2800
2856
" -f --rss-file include file mapped memory\n" \
2801
2857
" -z --compress compress memory dump\n" \
2802
2858
" -c --checksum enable md5 checksum for memory dump\n" \
2803
- " -e --encrypt enable encryption of memory dump\n" ,
2859
+ " -e --encrypt enable encryption of memory dump\n" \
2860
+ " -t --timeout timeout in seconds for checkpoint/restore execution in service mode\n" ,
2804
2861
name );
2805
2862
2806
2863
exit (status );
@@ -2840,14 +2897,15 @@ int main(int argc, char *argv[])
2840
2897
{ "compress" , 0 , NULL , 'z' },
2841
2898
{ "checksum" , 0 , NULL , 'c' },
2842
2899
{ "encrypt" , 2 , 0 , 'e' },
2900
+ { "timeout" , 1 , 0 , 't' },
2843
2901
{ NULL , 0 , NULL , 0 }
2844
2902
};
2845
2903
2846
2904
dump_dir = "/tmp" ;
2847
2905
parasite_socket_dir = NULL ;
2848
2906
parasite_socket_use_netns = 0 ;
2849
2907
2850
- while ((opt = getopt_long (argc , argv , "hp:d:S:Nl:nmfzce::" , long_options , & option_index )) != -1 ) {
2908
+ while ((opt = getopt_long (argc , argv , "hp:d:S:Nl:nmfzce::t: " , long_options , & option_index )) != -1 ) {
2851
2909
switch (opt ) {
2852
2910
case 'h' :
2853
2911
usage (argv [0 ], 0 );
@@ -2896,6 +2954,9 @@ int main(int argc, char *argv[])
2896
2954
else if (optind < argc && argv [optind ][0 ] != '-' )
2897
2955
encrypt_arg = argv [optind ++ ];
2898
2956
break ;
2957
+ case 't' :
2958
+ timeout = atoi (optarg );
2959
+ break ;
2899
2960
default : /* '?' */
2900
2961
usage (argv [0 ], 1 );
2901
2962
}
0 commit comments