Skip to content

Commit 30f1591

Browse files
yzx1983root
authored and
root
committed
ZOOKEEPER-2802:Zookeeper C client hang @wait_sync_completion
We should not terminate the do_io thread immediately when the client in "unrecoverable" state, because it can not guarantee there is no new request gets in the work queue at the same time. The safer way is make do_io keep completing the new requests when the client is unrecoverable, until a close request gets in. Also, since we don't terminate do_io thread at SESSIONEXPIRED, remove the matching testcase from TestZookeeperClose.cc
1 parent ceebda9 commit 30f1591

File tree

2 files changed

+79
-59
lines changed

2 files changed

+79
-59
lines changed

zookeeper-client/zookeeper-client-c/src/mt_adaptor.c

Lines changed: 79 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
#include <sys/time.h>
4545
#endif
4646

47+
extern void free_completions(zhandle_t *zh,int callCompletion,int reason);
48+
4749
int zoo_lock_auth(zhandle_t *zh)
4850
{
4951
return pthread_mutex_lock(&zh->auth_h.lock);
@@ -376,27 +378,38 @@ void *do_io(void *v)
376378
int timeout;
377379
int maxfd=1;
378380

379-
zh->io_count++;
380-
381-
zookeeper_interest(zh, &fd, &interest, &tv);
382-
if (fd != -1) {
383-
fds[1].fd=fd;
384-
fds[1].events=(interest&ZOOKEEPER_READ)?POLLIN:0;
385-
fds[1].events|=(interest&ZOOKEEPER_WRITE)?POLLOUT:0;
386-
maxfd=2;
387-
}
388-
timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
389-
390-
poll(fds,maxfd,timeout);
391-
if (fd != -1) {
392-
interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
393-
interest|=((fds[1].revents&POLLOUT)||(fds[1].revents&POLLHUP))?ZOOKEEPER_WRITE:0;
394-
}
395-
if(fds[0].revents&POLLIN){
396-
// flush the pipe
397-
char b[128];
398-
while(read(adaptor_threads->self_pipe[0],b,sizeof(b))==sizeof(b)){}
399-
}
381+
// check the current state of the zhandle and terminate
382+
// if it is_unrecoverable()
383+
if(is_unrecoverable(zh)) {
384+
/* Signal any syncronous completions */
385+
enter_critical(zh);
386+
free_completions(zh,1,ZSESSIONEXPIRED);
387+
leave_critical(zh);
388+
// don't exhaust cpu
389+
usleep(1000);
390+
} else {
391+
392+
zh->io_count++;
393+
394+
zookeeper_interest(zh, &fd, &interest, &tv);
395+
if (fd != -1) {
396+
fds[1].fd=fd;
397+
fds[1].events=(interest&ZOOKEEPER_READ)?POLLIN:0;
398+
fds[1].events|=(interest&ZOOKEEPER_WRITE)?POLLOUT:0;
399+
maxfd=2;
400+
}
401+
timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
402+
403+
poll(fds,maxfd,timeout);
404+
if (fd != -1) {
405+
interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
406+
interest|=((fds[1].revents&POLLOUT)||(fds[1].revents&POLLHUP))?ZOOKEEPER_WRITE:0;
407+
}
408+
if(fds[0].revents&POLLIN){
409+
// flush the pipe
410+
char b[128];
411+
while(read(adaptor_threads->self_pipe[0],b,sizeof(b))==sizeof(b)){}
412+
}
400413
#else
401414
fd_set rfds, wfds;
402415
struct adaptor_threads *adaptor_threads = zh->adaptor_priv;
@@ -410,51 +423,59 @@ void *do_io(void *v)
410423
int interest;
411424
int rc;
412425

413-
zookeeper_interest(zh, &fd, &interest, &tv);
414-
415-
// FD_ZERO is cheap on Win32, it just sets count of elements to zero.
416-
// It needs to be done to ensure no stale entries.
417-
FD_ZERO(&rfds);
418-
FD_ZERO(&wfds);
419-
420-
if (fd != -1) {
421-
if (interest&ZOOKEEPER_READ) {
422-
FD_SET(fd, &rfds);
423-
}
424-
425-
if (interest&ZOOKEEPER_WRITE) {
426-
FD_SET(fd, &wfds);
426+
// check the current state of the zhandle and terminate
427+
// if it is_unrecoverable()
428+
if(is_unrecoverable(zh)) {
429+
/* Signal any syncronous completions */
430+
enter_critical(zh);
431+
free_completions(zh,1,ZSESSIONEXPIRED);
432+
leave_critical(zh);
433+
// dont' exhaust cpu
434+
usleep(1000);
435+
} else {
436+
437+
zookeeper_interest(zh, &fd, &interest, &tv);
438+
439+
// FD_ZERO is cheap on Win32, it just sets count of elements to zero.
440+
// It needs to be done to ensure no stale entries.
441+
FD_ZERO(&rfds);
442+
FD_ZERO(&wfds);
443+
444+
if (fd != -1) {
445+
if (interest&ZOOKEEPER_READ) {
446+
FD_SET(fd, &rfds);
447+
}
448+
449+
if (interest&ZOOKEEPER_WRITE) {
450+
FD_SET(fd, &wfds);
451+
}
427452
}
428-
}
429453

430-
// Always interested in self_pipe.
431-
FD_SET(adaptor_threads->self_pipe[0], &rfds);
454+
// Always interested in self_pipe.
455+
FD_SET(adaptor_threads->self_pipe[0], &rfds);
432456

433-
rc = select(/* unused */0, &rfds, &wfds, NULL, &tv);
434-
if (rc > 0) {
435-
interest=(FD_ISSET(fd, &rfds))? ZOOKEEPER_READ: 0;
436-
interest|=(FD_ISSET(fd, &wfds))? ZOOKEEPER_WRITE: 0;
457+
rc = select(/* unused */0, &rfds, &wfds, NULL, &tv);
458+
if (rc > 0) {
459+
interest=(FD_ISSET(fd, &rfds))? ZOOKEEPER_READ: 0;
460+
interest|=(FD_ISSET(fd, &wfds))? ZOOKEEPER_WRITE: 0;
437461

438-
if (FD_ISSET(adaptor_threads->self_pipe[0], &rfds)){
439-
// flush the pipe/socket
440-
char b[128];
441-
while(recv(adaptor_threads->self_pipe[0],b,sizeof(b), 0)==sizeof(b)){}
462+
if (FD_ISSET(adaptor_threads->self_pipe[0], &rfds)){
463+
// flush the pipe/socket
464+
char b[128];
465+
while(recv(adaptor_threads->self_pipe[0],b,sizeof(b), 0)==sizeof(b)){}
466+
}
442467
}
443-
}
444-
else if (rc < 0) {
445-
LOG_ERROR(LOGCALLBACK(zh), ("select() failed %d [%d].", rc, WSAGetLastError()));
468+
else if (rc < 0) {
469+
LOG_ERROR(LOGCALLBACK(zh), ("select() failed %d [%d].", rc, WSAGetLastError()));
446470

447-
// Clear interest events for zookeeper_process if select() fails.
448-
interest = 0;
449-
}
471+
// Clear interest events for zookeeper_process if select() fails.
472+
interest = 0;
473+
}
450474

451475
#endif
452-
// dispatch zookeeper events
453-
zookeeper_process(zh, interest);
454-
// check the current state of the zhandle and terminate
455-
// if it is_unrecoverable()
456-
if(is_unrecoverable(zh))
457-
break;
476+
// dispatch zookeeper events
477+
zookeeper_process(zh, interest);
478+
}
458479
}
459480
api_epilog(zh, 0);
460481
LOG_DEBUG(LOGCALLBACK(zh), "IO thread terminated");

zookeeper-client/zookeeper-client-c/tests/TestZookeeperClose.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,6 @@ class Zookeeper_close : public CPPUNIT_NS::TestFixture
425425

426426
CPPUNIT_ASSERT(zh!=0);
427427
CPPUNIT_ASSERT(ensureCondition(SessionExpired(zh),1000)<1000);
428-
CPPUNIT_ASSERT(ensureCondition(IOThreadStopped(zh),1000)<1000);
429428
// make sure the watcher has been processed
430429
CPPUNIT_ASSERT(ensureCondition(closeAction.isWatcherTriggered(),1000)<1000);
431430
// make sure the threads have not been destroyed yet

0 commit comments

Comments
 (0)