
Commit 44c29e8

Fix segfault when handling ChangeGroupSizeInUnits during VDisk LocalRecovery (#28613)
1 parent 62e021d commit 44c29e8
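
What the diffs below suggest went wrong: when the BS controller applies a ChangeGroupSizeInUnits command, the VDisk skeleton receives TEvVGenerationChange and forwards the new size to its PDisk with NPDisk::TEvYardResize. If that event arrives while the VDisk is still running local recovery, PDiskCtx is not initialized yet, so the unguarded send dereferenced a null pointer. The fix guards the send on PDiskCtx and re-checks the group size once local recovery completes, so a change that raced with recovery is applied afterwards. Below is a minimal, self-contained sketch of that guard-and-replay pattern; the types and handler names (TVDiskModel, TPDiskCtxModel, OnGenerationChange, OnLocalRecoveryDone) are invented stand-ins for illustration, not the real YDB classes.

#include <iostream>
#include <memory>

// Stand-in for the PDisk context that only exists after local recovery.
struct TPDiskCtxModel {
    int Owner = 1;
};

struct TVDiskModel {
    std::shared_ptr<TPDiskCtxModel> PDiskCtx;   // null until recovery finishes
    unsigned ConfigSizeInUnits = 0;             // plays the role of Config->GroupSizeInUnits
    unsigned GroupSizeInUnits = 0;              // plays the role of GInfo->GroupSizeInUnits

    void SendYardResize(unsigned units) {       // models sending NPDisk::TEvYardResize
        std::cout << "resize owner " << PDiskCtx->Owner << " to " << units << " units\n";
    }

    // Models the TEvVGenerationChange handler: only talk to the PDisk if its
    // context already exists (the added `PDiskCtx &&` guard).
    void OnGenerationChange(unsigned newSizeInUnits) {
        GroupSizeInUnits = newSizeInUnits;
        if (PDiskCtx && ConfigSizeInUnits != GroupSizeInUnits) {
            ConfigSizeInUnits = GroupSizeInUnits;
            SendYardResize(ConfigSizeInUnits);
        }
    }

    // Models local-recovery completion: PDiskCtx becomes valid, so apply any
    // size change that arrived while recovery was still in flight.
    void OnLocalRecoveryDone() {
        PDiskCtx = std::make_shared<TPDiskCtxModel>();
        if (ConfigSizeInUnits != GroupSizeInUnits) {
            ConfigSizeInUnits = GroupSizeInUnits;
            SendYardResize(ConfigSizeInUnits);
        }
    }
};

int main() {
    TVDiskModel vdisk;
    vdisk.OnGenerationChange(2);   // arrives mid-recovery: no crash, nothing sent yet
    vdisk.OnLocalRecoveryDone();   // catches up and issues the resize exactly once
}

The real handlers live in blobstorage_skeleton.cpp (second file below); the new unit test reproduces the race by holding back TEvLocalRecoveryDone with TBlockEvents while the size change is applied.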

File tree

9 files changed: +401 -96 lines


ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp

Lines changed: 76 additions & 0 deletions
@@ -7,10 +7,13 @@
#include <ydb/core/blobstorage/base/blobstorage_events.h>
#include <ydb/core/blobstorage/pdisk/blobstorage_pdisk_tools.h>
#include <ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut_http_request.h>
+#include <ydb/core/blobstorage/vdisk/localrecovery/localrecovery_public.h>
+#include <ydb/core/blobstorage/vdisk/common/vdisk_events.h>
#include <ydb/core/mind/bscontroller/bsc.h>
#include <ydb/core/util/actorsys_test/testactorsys.h>

#include <ydb/library/pdisk_io/sector_map.h>
+#include <ydb/core/testlib/actors/block_events.h>
#include <ydb/core/util/random.h>

#include <google/protobuf/text_format.h>

@@ -1119,6 +1122,79 @@ Y_UNIT_TEST_SUITE(TBlobStorageWardenTest) {
        CheckInferredPDiskSettings(runtime, fakeWhiteboard, fakeNodeWarden,
                pdiskId, 12, 2u);
    }
+
+    void ChangeGroupSizeInUnits(TTestBasicRuntime& runtime, TString poolName, ui32 groupId, ui32 groupSizeInUnits) {
+        TActorId edge = runtime.AllocateEdgeActor();
+
+        auto storagePool = DescribeStoragePool(runtime, poolName);
+        auto request = std::make_unique<TEvBlobStorage::TEvControllerConfigRequest>();
+        auto& cmd = *request->Record.MutableRequest()->AddCommand()->MutableChangeGroupSizeInUnits();
+        cmd.SetBoxId(storagePool.GetBoxId());
+        cmd.SetItemConfigGeneration(storagePool.GetItemConfigGeneration());
+        cmd.SetStoragePoolId(storagePool.GetStoragePoolId());
+        cmd.AddGroupId(groupId);
+        cmd.SetSizeInUnits(groupSizeInUnits);
+
+        NTabletPipe::TClientConfig pipeConfig;
+        pipeConfig.RetryPolicy = NTabletPipe::TClientRetryPolicy::WithRetries();
+        runtime.SendToPipe(MakeBSControllerID(), edge, request.release(), 0, pipeConfig);
+
+        auto reply = runtime.GrabEdgeEventRethrow<TEvBlobStorage::TEvControllerConfigResponse>(edge);
+        VERBOSE_COUT("TEvControllerConfigResponse# " << reply->ToString());
+        UNIT_ASSERT_VALUES_EQUAL(reply->Get()->Record.GetResponse().GetSuccess(), true);
+    }
+
+    void CheckVDiskStateUpdate(TTestBasicRuntime& runtime, TActorId fakeWhiteboard, ui32 groupId,
+            ui32 expectedGroupGeneration, ui32 expectedGroupSizeInUnits,
+            TDuration simTimeout = TDuration::Seconds(10)) {
+        TInstant deadline = runtime.GetCurrentTime() + simTimeout;
+        while (true) {
+            UNIT_ASSERT_LT(runtime.GetCurrentTime(), deadline);
+
+            const auto ev = runtime.GrabEdgeEventRethrow<NNodeWhiteboard::TEvWhiteboard::TEvVDiskStateUpdate>(fakeWhiteboard, deadline - runtime.GetCurrentTime());
+            VERBOSE_COUT(" Got TEvVDiskStateUpdate# " << ev->ToString());
+
+            NKikimrWhiteboard::TVDiskStateInfo vdiskInfo = ev->Get()->Record;
+            if (vdiskInfo.GetVDiskId().GetGroupID() != groupId || !vdiskInfo.HasGroupSizeInUnits()) {
+                continue;
+            }
+
+            UNIT_ASSERT_VALUES_EQUAL(vdiskInfo.GetVDiskId().GetGroupGeneration(), expectedGroupGeneration);
+            UNIT_ASSERT_VALUES_EQUAL(vdiskInfo.GetGroupSizeInUnits(), expectedGroupSizeInUnits);
+            break;
+        }
+    }
+
+    CUSTOM_UNIT_TEST(TestEvVGenerationChangeRace) {
+        TTestBasicRuntime runtime(1, false);
+        Setup(runtime, "", nullptr);
+        runtime.SetLogPriority(NKikimrServices::BS_PROXY, NLog::PRI_ERROR);
+        runtime.SetLogPriority(NKikimrServices::BS_PROXY_PUT, NLog::PRI_ERROR);
+        runtime.SetLogPriority(NKikimrServices::BS_PROXY_BLOCK, NLog::PRI_ERROR);
+        runtime.SetLogPriority(NKikimrServices::BS_SKELETON, NLog::PRI_INFO);
+        runtime.SetLogPriority(NKikimrServices::BS_LOCALRECOVERY, NLog::PRI_INFO);
+        runtime.SetLogPriority(NKikimrServices::BS_NODE, NLog::PRI_INFO);
+        runtime.SetLogPriority(NKikimrServices::BS_CONTROLLER, NLog::PRI_INFO);
+
+        const ui32 nodeId = runtime.GetNodeId(0);
+        TActorId fakeWhiteboard = runtime.AllocateEdgeActor();
+        runtime.RegisterService(NNodeWhiteboard::MakeNodeWhiteboardServiceId(nodeId), fakeWhiteboard);
+
+        VERBOSE_COUT(" Starting test");
+
+        TBlockEvents<TEvBlobStorage::TEvLocalRecoveryDone> block(runtime);
+
+        const TString poolName = "testEvVGenerationChangeRace";
+        CreateStoragePool(runtime, poolName, "pool-kind-1");
+        ui32 groupId = GetGroupFromPool(runtime, poolName);
+
+        CheckVDiskStateUpdate(runtime, fakeWhiteboard, groupId, 1, 0u);
+        ChangeGroupSizeInUnits(runtime, poolName, groupId, 2u);
+        CheckVDiskStateUpdate(runtime, fakeWhiteboard, groupId, 1, 0u);
+        block.Stop().Unblock();
+        CheckVDiskStateUpdate(runtime, fakeWhiteboard, groupId, 2, 2u);
+    }
+
}

} // namespace NBlobStorageNodeWardenTest

ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeleton.cpp

Lines changed: 13 additions & 1 deletion
@@ -1927,6 +1927,17 @@ namespace NKikimr {
        ApplyHugeBlobSize(Config->MinHugeBlobInBytes);
        Y_VERIFY_S(MinHugeBlobInBytes, VCtx->VDiskLogPrefix);

+        if (Config->GroupSizeInUnits != GInfo->GroupSizeInUnits) {
+            Config->GroupSizeInUnits = GInfo->GroupSizeInUnits;
+            Y_VERIFY(PDiskCtx);
+            Y_VERIFY(PDiskCtx->Dsk);
+            ctx.Send(PDiskCtx->PDiskId,
+                new NPDisk::TEvYardResize(
+                    PDiskCtx->Dsk->Owner,
+                    PDiskCtx->Dsk->OwnerRound,
+                    Config->GroupSizeInUnits));
+        }
+
        // handle special case when donor disk starts and finds out that it has been wiped out
        if (ev->Get()->LsnMngr->GetOriginallyRecoveredLsn() == 0 && Config->BaseInfo.DonorMode) {
            // send drop donor cmd to NodeWarden

@@ -2444,10 +2455,11 @@ namespace NKikimr {
        GInfo = msg->NewInfo;
        SelfVDiskId = msg->NewVDiskId;

-        if (Config->GroupSizeInUnits != GInfo->GroupSizeInUnits) {
+        if (PDiskCtx && Config->GroupSizeInUnits != GInfo->GroupSizeInUnits) {
            Config->GroupSizeInUnits = GInfo->GroupSizeInUnits;
            UpdateWhiteboard(ctx);

+            Y_VERIFY(PDiskCtx->Dsk);
            ctx.Send(PDiskCtx->PDiskId,
                new NPDisk::TEvYardResize(
                    PDiskCtx->Dsk->Owner,

ydb/tests/compatibility/test_infer_pdisk_expected_slot_count.py

Lines changed: 10 additions & 89 deletions
@@ -8,8 +8,7 @@
from ydb.tests.library.compatibility.fixtures import inter_stable_binary_path, inter_stable_name
from ydb.tests.library.compatibility.fixtures import current_binary_path, current_name
from ydb.tests.library.common.types import Erasure
-import ydb.core.protos.msgbus_pb2 as msgbus
-import ydb.core.protos.blobstorage_config_pb2 as blobstorage_config_pb2
+from ydb.core.protos import blobstorage_config_pb2

logger = logging.getLogger(__name__)

@@ -43,96 +42,18 @@ def setup(self):
            use_in_memory_pdisks=False)
        next(cluster_generator)

-        host_configs = self.read_host_configs()
+        host_configs = self.cluster.client.read_host_configs()
        for host_config in host_configs:
            drive = host_config.Drive.add()
            drive.Path = CONST_PDISK_PATH
            drive.PDiskConfig.ExpectedSlotCount = CONST_EXPECTED_SLOT_COUNT
-        self.define_host_configs(host_configs)
+        self.cluster.client.define_host_configs(host_configs)

        yield

-    def read_host_configs(self):
-        request = msgbus.TBlobStorageConfigRequest()
-        request.Domain = 1
-        request.Request.Command.add().ReadHostConfig.SetInParent()
-
-        response = self.cluster.client.send(request, 'BlobStorageConfig').BlobStorageConfigResponse
-        logger.info(f"read_host_config response: {response}")
-        if not response.Success:
-            raise RuntimeError('read_host_config request failed: %s' % response.ErrorDescription)
-        status = response.Status[0]
-        if not status.Success:
-            raise RuntimeError('read_host_config has failed status: %s' % status.ErrorDescription)
-
-        return status.HostConfig
-
-    def define_host_configs(self, host_configs):
-        """Define host configuration with specified host config"""
-        request = msgbus.TBlobStorageConfigRequest()
-        request.Domain = 1
-        for host_config in host_configs:
-            request.Request.Command.add().DefineHostConfig.MergeFrom(host_config)
-
-        logger.info(f"define_host_config request: {request}")
-        response = self.cluster.client.send(request, 'BlobStorageConfig').BlobStorageConfigResponse
-        logger.info(f"define_host_config responce: {response}")
-        if not response.Success:
-            raise RuntimeError('define_host_config request failed: %s' % response.ErrorDescription)
-        for i, status in enumerate(response.Status):
-            if not status.Success:
-                raise RuntimeError('define_host_config has failed status[%d]: %s' % (i, status))
-
-    def pdisk_set_all_active(self):
-        """Update all drive statuses to ACTIVE. Equivalent to
-        `dstool pdisk set --status=ACTIVE --pdisk-ids <pdisks>`
-        """
-        base_config = self.query_base_config()
-
-        request = msgbus.TBlobStorageConfigRequest()
-        request.Domain = 1
-
-        for pdisk in base_config.BaseConfig.PDisk:
-            if pdisk.Path != CONST_PDISK_PATH:
-                continue
-            cmd = request.Request.Command.add().UpdateDriveStatus
-            cmd.HostKey.NodeId = pdisk.NodeId
-            cmd.PDiskId = pdisk.PDiskId
-            cmd.Status = blobstorage_config_pb2.EDriveStatus.ACTIVE
-
-        logger.info(f"update_all_drive_status_active request: {request}")
-        response = self.cluster.client.send(request, 'BlobStorageConfig').BlobStorageConfigResponse
-        logger.info(f"update_all_drive_status_active response: {response}")
-
-        if not response.Success:
-            raise RuntimeError('update_all_drive_status_active request failed: %s' % response.ErrorDescription)
-        for i, status in enumerate(response.Status):
-            if not status.Success:
-                raise RuntimeError('update_all_drive_status_active has failed status[%d]: %s' % (i, status))
-
-    def query_base_config(self):
-        request = msgbus.TBlobStorageConfigRequest()
-        request.Domain = 1
-
-        # Add QueryBaseConfig command
-        command = request.Request.Command.add()
-        command.QueryBaseConfig.RetrieveDevices = True
-        command.QueryBaseConfig.VirtualGroupsOnly = False
-
-        # Send the request
-        response = self.cluster.client.send(request, 'BlobStorageConfig').BlobStorageConfigResponse
-        if not response.Success:
-            raise RuntimeError('query_base_config failed: %s' % response.ErrorDescription)
-
-        status = response.Status[0]
-        if not status.Success:
-            raise RuntimeError('query_base_config failed: %s' % status.ErrorDescription)
-
-        return status
-
    def pdisk_list(self):
        """Equivalent to `dstool pdisk list`"""
-        base_config = self.query_base_config()
+        base_config = self.cluster.client.query_base_config()

        # Collect PDisk information
        pdisks_info = []

@@ -152,11 +73,11 @@ def wait_and_check_pdisk_list(self, check_pdisks_fn, deadline, delay=1):
            except AssertionError as e:
                if time.time() > deadline:
                    logger.warning(f"pdisk_list incorrect: {pdisks}")
-                    raise e from e
+                    raise e
                else:
                    time.sleep(delay)

-    def test_infer_pdisk_expected_slot_count(self):
+    def test(self):
        assert self.current_binary_paths_index == 0
        logger.info(f"Test started on {self.versions[0]} {time.time()=}")
        #################################################################

@@ -187,18 +108,18 @@ def check_pdisks(pdisks):
        ######################################################################

        t2 = time.time()
-        host_configs = self.read_host_configs()
+        host_configs = self.cluster.client.read_host_configs()
        for host_config in host_configs:
            drive = host_config.Drive[1]
            assert drive.Path == CONST_PDISK_PATH
            drive.ClearField('PDiskConfig')
            drive.PDiskConfig.SetInParent()
            drive.InferPDiskSlotCountFromUnitSize = CONST_10_GB
            drive.InferPDiskSlotCountMax = 32
-        self.define_host_configs(host_configs)
+        self.cluster.client.define_host_configs(host_configs)
        logger.info(f"Inferred PDisk setting applied {time.time()=}")

-        self.pdisk_set_all_active()
+        self.cluster.client.pdisk_set_all_active(pdisk_path=CONST_PDISK_PATH)
        logger.info(f"Drives activated {time.time()=}")

        deadline = time.time() + timeout

@@ -224,7 +145,7 @@ def check_pdisks(pdisks):
        logger.info(f"Restarted back on version {self.versions[0]} {time.time()=}")
        ###########################################################################

-        self.pdisk_set_all_active()
+        self.cluster.client.pdisk_set_all_active(pdisk_path=CONST_PDISK_PATH)
        logger.info(f"Drives activated {time.time()=}")

        deadline = time.time() + timeout
