@@ -116,14 +116,9 @@ void TfSchedulerConnManager::disconnectTfBuilder(const TfBuilderConfigStatus &pT
116116 pResponse.set_status (0 );
117117 const std::string &lTfBuilderId = pTfBuilderStatus.info ().process_id ();
118118
119- std::scoped_lock lLock (mStfSenderClientsLock );
120-
121- deleteTfBuilderRpcClient (lTfBuilderId);
122-
123- if (!stfSendersReady ()) {
124- IDDLOG (" TfBuilder Connection error: StfSenders not ready." );
125- pResponse.set_status (ERROR_STF_SENDERS_NOT_READY);
126- return ;
119+ {
120+ std::scoped_lock lLock (mStfSenderClientsLock );
121+ deleteTfBuilderRpcClient (lTfBuilderId);
127122 }
128123
129124 TfBuilderEndpoint lParam;
@@ -137,31 +132,70 @@ void TfSchedulerConnManager::disconnectTfBuilder(const TfBuilderConfigStatus &pT
137132 continue ; // not connected
138133 }
139134
140- lParam.set_tf_builder_id (lTfBuilderId);
141- lParam.set_endpoint (lSocketInfo.endpoint ());
135+ { // lock clients
136+ std::scoped_lock lLock (mStfSenderClientsLock );
137+
138+ if (mStfSenderRpcClients .count (lStfSenderId) == 0 ) {
139+ WDDLOG (" disconnectTfBuilder: Unknown StfSender. stfs_id={}" , lStfSenderId);
140+ continue ;
141+ }
142142
143- if (mStfSenderRpcClients .count (lStfSenderId) == 0 ) {
144- WDDLOG (" disconnectTfBuilder: Unknown StfSender. stfs_id={}" , lStfSenderId);
145- continue ;
143+ lParam.set_tf_builder_id (lTfBuilderId);
144+ lParam.set_endpoint (lSocketInfo.endpoint ());
145+ StatusResponse lResponse;
146+
147+ auto &lRpcClient = mStfSenderRpcClients [lSocketInfo.peer_id ()];
148+ if (!lRpcClient->DisconnectTfBuilderRequest (lParam, lResponse).ok ()) {
149+ EDDLOG (" StfSender Connection error: gRPC error. stfs_id={} tfb_id={}" , lStfSenderId, lTfBuilderId);
150+ pResponse.set_status (ERROR_GRPC_STF_SENDER);
151+ continue ;
152+ }
153+ // check StfSender status
154+ if (lResponse.status () != 0 ) {
155+ EDDLOG (" TfBuilder Connection error. stfs_id={} tfb_id={} response={}" , lStfSenderId, lTfBuilderId, lResponse.status ());
156+ pResponse.set_status (ERROR_STF_SENDER_CONNECTING);
157+ continue ;
158+ }
146159 }
160+ }
161+ }
147162
148- auto &lRpcClient = mStfSenderRpcClients [lSocketInfo.peer_id ()];
// Asks every TfBuilder rpc client in mTfBuilderRpcClients to terminate the partition.
// Returns true when mTfBuilderRpcClients is empty after this pass, false otherwise.
// NOTE(review): the lines marked '-' that are interleaved below look like removed-diff
// residue from disconnectTfBuilder and not part of this function -- confirm.
163+ // Partition RPC: keep sending until all TfBuilders are gone
164+ bool TfSchedulerConnManager::requestTfBuildersTerminate () {
// Ids of clients whose TerminatePartition() call returned false; they are removed after the loop.
165+ std::vector<std::string> lFailedRpcsForDeletion;
149166
150- StatusResponse lResponse ;
151- if (!lRpcClient-> DisconnectTfBuilderRequest (lParam, lResponse). ok ()) {
152- EDDLOG ( " TfBuilder Connection error: gRPC error. stfs_id={} tfb_id={} " , lStfSenderId, lTfBuilderId);
153- pResponse. set_status (ERROR_GRPC_STF_SENDER);
154- break ;
// The lock is held for the rest of the function (both the notify loop and the deletions).
167+ std::scoped_lock lLock ( mStfSenderClientsLock ) ;
168+
169+ for ( auto &lTfBuilder : mTfBuilderRpcClients ) {
// A false return only records the id here; nothing is deleted inside this loop.
170+ if (!lTfBuilder. second . mClient -> TerminatePartition ()) {
171+ lFailedRpcsForDeletion. emplace_back (lTfBuilder. first ) ;
155172 }
173+ }
156174
157- // check StfSender status
158- if (lResponse.status () != 0 ) {
159- EDDLOG (" TfBuilder Connection error. stfs_id={} tfb_id={} response={}" ,
160- lStfSenderId, lTfBuilderId, lResponse.status ());
161- pResponse.set_status (ERROR_STF_SENDER_CONNECTING);
162- break ;
// Deletion is done in a second pass over the collected ids
// (presumably to avoid modifying mTfBuilderRpcClients while iterating it -- confirm).
175+ for (const auto &lId : lFailedRpcsForDeletion) {
176+ deleteTfBuilderRpcClient (lId);
177+ }
178+
// Clients whose TerminatePartition() returned true are still in the map at this point.
179+ return mTfBuilderRpcClients .size () == 0 ;
180+ }
181+
// Calls TerminatePartition() on every StfSender rpc client in mStfSenderRpcClients and
// collects the ids of the clients for which the call returned false.
// Returns true when mTfBuilderRpcClients is empty after this pass.
// NOTE(review): the cleanup and the return value operate on the TfBuilder client set,
// not on mStfSenderRpcClients. This looks like a copy-paste from
// requestTfBuildersTerminate -- confirm the intended behavior.
182+ // Partition RPC: notify all StfSenders and remove rpc clients
183+ bool TfSchedulerConnManager::requestStfSendersTerminate () {
// Ids (keys of mStfSenderRpcClients) whose TerminatePartition() call returned false.
184+ std::vector<std::string> lFailedRpcsForDeletion;
185+
// The lock is held for the rest of the function.
186+ std::scoped_lock lLock (mStfSenderClientsLock );
187+
188+ for (auto &lStfSender : mStfSenderRpcClients ) {
189+ if (!lStfSender.second ->TerminatePartition ()) {
190+ lFailedRpcsForDeletion.emplace_back (lStfSender.first );
163191 }
164192 }
193+
// NOTE(review): lId is a StfSender id here, but it is handed to deleteTfBuilderRpcClient;
// no StfSender client is removed in this function despite the header comment -- confirm.
194+ for (const auto &lId : lFailedRpcsForDeletion) {
195+ deleteTfBuilderRpcClient (lId);
196+ }
197+
// NOTE(review): tests the TfBuilder client map; possibly meant mStfSenderRpcClients -- confirm.
198+ return mTfBuilderRpcClients .size () == 0 ;
165199}
166200
167201
@@ -260,10 +294,10 @@ void TfSchedulerConnManager::StfSenderMonitoringThread()
260294
261295 WDDLOG_RL (1000 , " Waiting for StfSenders. ready={} total={}" , lNumStfSenders, mPartitionInfo .mStfSenderIdList .size ());
262296 lSleep = 250ms;
297+ } else {
298+ mStfSenderState = STF_SENDER_STATE_OK;
263299 }
264300
265- mStfSenderState = STF_SENDER_STATE_OK;
266-
267301 // wait for drop futures
268302 {
269303 {
0 commit comments