Skip to content

Commit 953da96

Browse files
committed
Race condition in TarantoolClientImpl
- Avoid a possible race between reading, writing and reconnecting threads when a reconnection process is started. It might have happened that the lagged thread (reading or writing) could reset the state to RECONNECT after the reconnecting thread has already started and set the state to 0. As a result, all next attempts to reconnect will never happen. Now the reconnect thread holds on the state as long as it is required. - Avoid another possible race between reading and writing threads when they are started during the reconnection process. It might have happened that one of the threads crashed when it was starting and another slightly lagged thread set up its flag. It could have led that the reconnecting thread saw RECONNECT + R/W state instead of pure RECONNECT. Again, this case broke down all next reconnection attempts. Now reading and writing threads take into account whether RECONNECT state is already set or not. - Replace LockSupport with ReentrantLock.Condition for a thread to be suspended and woken up. Our cluster tests and standalone demo app show that LockSupport is not a safe memory barrier as it could be. The reconnect thread relies on a visibility guarantee between park-unpark invocations which, actually, sometimes doesn't work. Also, according to java-docs LockSupport is more like an internal component to build high-level blocking primitives. It is not recommended using this class directly. It was replaced by ReentrantLock.Condition primitive based on LockSupport but which has proper LockSupport usage inside. Fixes: #142 Affects: #34, #136
1 parent 06755d5 commit 953da96

File tree

2 files changed

+156
-52
lines changed

2 files changed

+156
-52
lines changed

src/main/java/org/tarantool/TarantoolClientImpl.java

Lines changed: 153 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import java.util.concurrent.atomic.AtomicInteger;
2323
import java.util.concurrent.atomic.AtomicReference;
2424
import java.util.concurrent.locks.Condition;
25-
import java.util.concurrent.locks.LockSupport;
2625
import java.util.concurrent.locks.ReentrantLock;
2726

2827

@@ -63,17 +62,19 @@ public class TarantoolClientImpl extends TarantoolBase<Future<?>> implements Tar
6362
*/
6463
protected TarantoolClientStats stats;
6564
protected StateHelper state = new StateHelper(StateHelper.RECONNECT);
66-
protected Thread reader;
67-
protected Thread writer;
65+
protected volatile Thread reader;
66+
protected volatile Thread writer;
6867

6968
protected Thread connector = new Thread(new Runnable() {
7069
@Override
7170
public void run() {
7271
while (!Thread.currentThread().isInterrupted()) {
73-
if (state.compareAndSet(StateHelper.RECONNECT, 0)) {
74-
reconnect(0, thumbstone);
72+
reconnect(0, thumbstone);
73+
try {
74+
state.awaitReconnection();
75+
} catch (InterruptedException e) {
76+
Thread.currentThread().interrupt();
7577
}
76-
LockSupport.park(state);
7778
}
7879
}
7980
});
@@ -139,16 +140,13 @@ protected void reconnect(int retry, Throwable lastError) {
139140
protected void connect(final SocketChannel channel) throws Exception {
140141
try {
141142
TarantoolGreeting greeting = ProtoUtils.connect(channel,
142-
config.username, config.password);
143+
config.username, config.password);
143144
this.serverVersion = greeting.getServerVersion();
144145
} catch (IOException e) {
145-
try {
146-
channel.close();
147-
} catch (IOException ignored) {
148-
}
149-
146+
closeChannel(channel);
150147
throw new CommunicationException("Couldn't connect to tarantool", e);
151148
}
149+
152150
channel.configureBlocking(false);
153151
this.channel = channel;
154152
this.readChannel = new ReadableViaSelectorChannel(channel);
@@ -174,8 +172,21 @@ public void run() {
174172
readThread();
175173
} finally {
176174
state.release(StateHelper.READING);
177-
if (state.compareAndSet(0, StateHelper.RECONNECT))
178-
LockSupport.unpark(connector);
175+
// Skip the old gen. attempt to reconnect
176+
//
177+
// there're two cases when a read thread is here
178+
// 1. it's a new generation thread inside/outside
179+
// a reconnection process (currentThread == reader)
180+
// 2. It's an old generation thread inside
181+
// a reconnection process (currentThread != reader)
182+
// NOTE: reader thread checks the statuses in reverse
183+
// order the connector thread changes them. It
184+
// makes this thread be sure that reference to
185+
// it is not outdated when the state is UNINITIALIZED
186+
if (state.getState() == StateHelper.UNINITIALIZED
187+
&& Thread.currentThread() == reader) {
188+
state.trySignalForReconnection();
189+
}
179190
}
180191
}
181192
}
@@ -189,13 +200,33 @@ public void run() {
189200
writeThread();
190201
} finally {
191202
state.release(StateHelper.WRITING);
192-
if (state.compareAndSet(0, StateHelper.RECONNECT))
193-
LockSupport.unpark(connector);
203+
// Skip the old gen. attempt to reconnect
204+
//
205+
// there're two cases when a write thread is here
206+
// 1. it's a new generation thread inside/outside
207+
// a reconnection process (currentThread == writer)
208+
// 2. It's an old generation thread inside
209+
// a reconnection process (currentThread != writer)
210+
// NOTE: writer thread checks the statuses in reverse
211+
// order the connector thread changes them. It
212+
// makes this thread be sure that reference to
213+
// it is not outdated when the state is UNINITIALIZED
214+
if (state.getState() == StateHelper.UNINITIALIZED
215+
&& Thread.currentThread() == writer) {
216+
state.trySignalForReconnection();
217+
}
194218
}
195219
}
196220
}
197221
});
198222

223+
// reconnection preparation is done
224+
// before reconnection the state will be released
225+
// reader/writer threads have been replaced by new ones
226+
// it's required to be sure that old r/w threads see correct
227+
// client's r/w references
228+
state.release(StateHelper.RECONNECT);
229+
199230
configureThreads(threadName);
200231
reader.start();
201232
writer.start();
@@ -337,25 +368,21 @@ private boolean directWrite(ByteBuffer buffer) throws InterruptedException, IOEx
337368
}
338369

339370
protected void readThread() {
340-
try {
341-
while (!Thread.currentThread().isInterrupted()) {
342-
try {
343-
TarantoolPacket packet = ProtoUtils.readPacket(readChannel);
371+
while (!Thread.currentThread().isInterrupted()) {
372+
try {
373+
TarantoolPacket packet = ProtoUtils.readPacket(readChannel);
344374

345-
Map<Integer, Object> headers = packet.getHeaders();
375+
Map<Integer, Object> headers = packet.getHeaders();
346376

347-
Long syncId = (Long) headers.get(Key.SYNC.getId());
348-
TarantoolOp<?> future = futures.remove(syncId);
349-
stats.received++;
350-
wait.decrementAndGet();
351-
complete(packet, future);
352-
} catch (Exception e) {
353-
die("Cant read answer", e);
354-
return;
355-
}
377+
Long syncId = (Long) headers.get(Key.SYNC.getId());
378+
TarantoolOp<?> future = futures.remove(syncId);
379+
stats.received++;
380+
wait.decrementAndGet();
381+
complete(packet, future);
382+
} catch (Exception e) {
383+
die("Cant read answer", e);
384+
return;
356385
}
357-
} catch (Exception e) {
358-
die("Cant init thread", e);
359386
}
360387
}
361388

@@ -498,7 +525,7 @@ public TarantoolClientOps<Integer, List<?>, Object, List<?>> syncOps() {
498525

499526
@Override
500527
public TarantoolClientOps<Integer, List<?>, Object, Future<List<?>>> asyncOps() {
501-
return (TarantoolClientOps)this;
528+
return (TarantoolClientOps) this;
502529
}
503530

504531
@Override
@@ -514,7 +541,7 @@ public TarantoolClientOps<Integer, List<?>, Object, Long> fireAndForgetOps() {
514541

515542
@Override
516543
public TarantoolSQLOps<Object, Long, List<Map<String, Object>>> sqlSyncOps() {
517-
return new TarantoolSQLOps<Object, Long, List<Map<String,Object>>>() {
544+
return new TarantoolSQLOps<Object, Long, List<Map<String, Object>>>() {
518545

519546
@Override
520547
public Long update(String sql, Object... bind) {
@@ -530,7 +557,7 @@ public List<Map<String, Object>> query(String sql, Object... bind) {
530557

531558
@Override
532559
public TarantoolSQLOps<Object, Future<Long>, Future<List<Map<String, Object>>>> sqlAsyncOps() {
533-
return new TarantoolSQLOps<Object, Future<Long>, Future<List<Map<String,Object>>>>() {
560+
return new TarantoolSQLOps<Object, Future<Long>, Future<List<Map<String, Object>>>>() {
534561
@Override
535562
public Future<Long> update(String sql, Object... bind) {
536563
return (Future<Long>) exec(Code.EXECUTE, Key.SQL_TEXT, sql, Key.SQL_BIND, bind);
@@ -618,6 +645,7 @@ public TarantoolClientStats getStats() {
618645
* Manages state changes.
619646
*/
620647
protected final class StateHelper {
648+
static final int UNINITIALIZED = 0;
621649
static final int READING = 1;
622650
static final int WRITING = 2;
623651
static final int ALIVE = READING | WRITING;
@@ -627,10 +655,22 @@ protected final class StateHelper {
627655
private final AtomicInteger state;
628656

629657
private final AtomicReference<CountDownLatch> nextAliveLatch =
630-
new AtomicReference<CountDownLatch>(new CountDownLatch(1));
658+
new AtomicReference<>(new CountDownLatch(1));
631659

632660
private final CountDownLatch closedLatch = new CountDownLatch(1);
633661

662+
/**
663+
* The condition variable to signal a reconnection is needed from reader /
664+
* writer threads and waiting for that signal from the reconnection thread.
665+
*
666+
* The lock variable to access this condition.
667+
*
668+
* @see #awaitReconnection()
669+
* @see #trySignalForReconnection()
670+
*/
671+
protected final ReentrantLock connectorLock = new ReentrantLock();
672+
protected final Condition reconnectRequired = connectorLock.newCondition();
673+
634674
protected StateHelper(int state) {
635675
this.state = new AtomicInteger(state);
636676
}
@@ -639,35 +679,60 @@ protected int getState() {
639679
return state.get();
640680
}
641681

682+
/**
683+
* Set CLOSED state, drop RECONNECT state.
684+
*/
642685
protected boolean close() {
643-
for (;;) {
686+
for (; ; ) {
644687
int st = getState();
688+
689+
/* CLOSED is the terminal state. */
645690
if ((st & CLOSED) == CLOSED)
646691
return false;
692+
693+
/* Drop RECONNECT, set CLOSED. */
647694
if (compareAndSet(st, (st & ~RECONNECT) | CLOSED))
648695
return true;
649696
}
650697
}
651698

699+
/**
700+
* Move from a current state to a give one.
701+
*
702+
* Some moves are forbidden.
703+
*/
652704
protected boolean acquire(int mask) {
653-
for (;;) {
654-
int st = getState();
655-
if ((st & CLOSED) == CLOSED)
705+
for (; ; ) {
706+
int currentState = getState();
707+
708+
/* CLOSED is the terminal state. */
709+
if ((currentState & CLOSED) == CLOSED) {
710+
return false;
711+
}
712+
713+
/* Don't move to READING, WRITING or ALIVE from RECONNECT. */
714+
if ((currentState & RECONNECT) > mask) {
656715
return false;
716+
}
657717

658-
if ((st & mask) != 0)
718+
/* Cannot move from a state to the same state. */
719+
if ((currentState & mask) != 0) {
659720
throw new IllegalStateException("State is already " + mask);
721+
}
660722

661-
if (compareAndSet(st, st | mask))
723+
/* Set acquired state. */
724+
if (compareAndSet(currentState, currentState | mask)) {
662725
return true;
726+
}
663727
}
664728
}
665729

666730
protected void release(int mask) {
667-
for (;;) {
731+
for (; ; ) {
668732
int st = getState();
669-
if (compareAndSet(st, st & ~mask))
733+
if (compareAndSet(st, st & ~mask)) {
670734
return;
735+
}
671736
}
672737
}
673738

@@ -686,10 +751,18 @@ protected boolean compareAndSet(int expect, int update) {
686751
return true;
687752
}
688753

754+
/**
755+
* Reconnection uses another way to await state via receiving a signal
756+
* instead of latches.
757+
*/
689758
protected void awaitState(int state) throws InterruptedException {
690-
CountDownLatch latch = getStateLatch(state);
691-
if (latch != null) {
692-
latch.await();
759+
if (state == RECONNECT) {
760+
awaitReconnection();
761+
} else {
762+
CountDownLatch latch = getStateLatch(state);
763+
if (latch != null) {
764+
latch.await();
765+
}
693766
}
694767
}
695768

@@ -709,10 +782,42 @@ private CountDownLatch getStateLatch(int state) {
709782
CountDownLatch latch = nextAliveLatch.get();
710783
/* It may happen so that an error is detected but the state is still alive.
711784
Wait for the 'next' alive state in such cases. */
712-
return (getState() == ALIVE && thumbstone == null) ? null : latch;
785+
return (getState() == ALIVE && thumbstone == null) ? null : latch;
713786
}
714787
return null;
715788
}
789+
790+
/**
791+
* Blocks until a reconnection signal will be received.
792+
*
793+
* @see #trySignalForReconnection()
794+
*/
795+
private void awaitReconnection() throws InterruptedException {
796+
connectorLock.lock();
797+
try {
798+
while (getState() != StateHelper.RECONNECT) {
799+
reconnectRequired.await();
800+
}
801+
} finally {
802+
connectorLock.unlock();
803+
}
804+
}
805+
806+
/**
807+
* Signals to the connector that reconnection process can be performed.
808+
*
809+
* @see #awaitReconnection()
810+
*/
811+
private void trySignalForReconnection() {
812+
if (compareAndSet(StateHelper.UNINITIALIZED, StateHelper.RECONNECT)) {
813+
connectorLock.lock();
814+
try {
815+
reconnectRequired.signal();
816+
} finally {
817+
connectorLock.unlock();
818+
}
819+
}
820+
}
716821
}
717822

718823
protected static class TarantoolOp<V> extends CompletableFuture<V> {

src/test/java/org/tarantool/ClientReconnectIT.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
import static org.junit.jupiter.api.Assertions.assertEquals;
2222
import static org.junit.jupiter.api.Assertions.assertFalse;
23-
import static org.junit.jupiter.api.Assertions.assertNull;
2423
import static org.junit.jupiter.api.Assertions.assertNotNull;
2524
import static org.junit.jupiter.api.Assertions.assertThrows;
2625
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -227,13 +226,13 @@ public void run() {
227226
public void testLongParallelCloseReconnects() {
228227
int numThreads = 4;
229228
int numClients = 4;
230-
int timeBudget = 30*1000;
229+
int timeBudget = 30 * 1000;
231230

232231
SocketChannelProvider provider = new TestSocketChannelProvider(host,
233232
port, RESTART_TIMEOUT).setSoLinger(0);
234233

235234
final AtomicReferenceArray<TarantoolClient> clients =
236-
new AtomicReferenceArray<TarantoolClient>(numClients);
235+
new AtomicReferenceArray<>(numClients);
237236

238237
for (int idx = 0; idx < clients.length(); idx++) {
239238
clients.set(idx, makeClient(provider));
@@ -301,7 +300,7 @@ public void run() {
301300

302301
// Wait for all threads to finish.
303302
try {
304-
assertTrue(latch.await(RESTART_TIMEOUT, TimeUnit.MILLISECONDS));
303+
assertTrue(latch.await(RESTART_TIMEOUT * 2, TimeUnit.MILLISECONDS));
305304
} catch (InterruptedException e) {
306305
fail(e);
307306
}

0 commit comments

Comments
 (0)