Skip to content

Commit aa12503

Browse files
committed
Add warning to deadlock cron retrier to log any future failures for investigation
Change key for cron_schedule to cover all queries currently run, ensuring range is on the end of the key Refactor cron locking to remove UPDATE+JOIN which causes shared locks on the join followed by exclusive locks for the update, which can deadlock, to instead use an explicit full exclusive lock on all entries using forUpdate Implement cleanup of jobs that were running but did not complete and did not error, which can occur if PHP crashes or is killed, or database is restored from backup or migrated to staging environments
1 parent 679cd33 commit aa12503

File tree

6 files changed

+107
-19
lines changed

6 files changed

+107
-19
lines changed

app/code/Magento/Cron/Model/DeadlockRetrier.php

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,20 @@
1717
*/
1818
class DeadlockRetrier implements DeadlockRetrierInterface
1919
{
20+
/**
21+
* @var \Psr\Log\LoggerInterface
22+
*/
23+
private $logger;
24+
25+
/**
26+
* @param \Psr\Log\LoggerInterface $logger
27+
*/
28+
public function __construct(
29+
\Psr\Log\LoggerInterface $logger
30+
) {
31+
$this->logger = $logger;
32+
}
33+
2034
/**
2135
* @inheritdoc
2236
*/
@@ -30,6 +44,7 @@ public function execute(callable $callback, AdapterInterface $connection)
3044
try {
3145
return $callback();
3246
} catch (DeadlockException $e) {
47+
$this->logger->warning(sprintf("Deadlock detected in cron cleanup: %s", $e->getMessage()));
3348
continue;
3449
}
3550
}

app/code/Magento/Cron/Model/ResourceModel/Schedule.php

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,31 +65,47 @@ public function trySetJobStatusAtomic($scheduleId, $newStatus, $currentStatus)
6565
public function trySetJobUniqueStatusAtomic($scheduleId, $newStatus, $currentStatus)
6666
{
6767
$connection = $this->getConnection();
68+
$connection->beginTransaction();
6869

6970
// this condition added to avoid cron jobs locking after incorrect termination of running job
7071
$match = $connection->quoteInto(
7172
'existing.job_code = current.job_code ' .
72-
'AND (existing.executed_at > UTC_TIMESTAMP() - INTERVAL 1 DAY OR existing.executed_at IS NULL) ' .
73-
'AND existing.status = ?',
73+
'AND existing.status = ? ' .
74+
'AND (existing.executed_at > UTC_TIMESTAMP() - INTERVAL 1 DAY OR existing.executed_at IS NULL)',
7475
$newStatus
7576
);
7677

78+
// Select and lock all related schedules - this prevents deadlock in case cron overlaps and two jobs of
79+
// the same code attempt to lock at the same time, and force them to serialize
7780
$selectIfUnlocked = $connection->select()
81+
->from(
82+
['current' => $this->getTable('cron_schedule')],
83+
[]
84+
)
7885
->joinLeft(
7986
['existing' => $this->getTable('cron_schedule')],
8087
$match,
81-
['status' => new \Zend_Db_Expr($connection->quote($newStatus))]
88+
['existing.schedule_id']
8289
)
8390
->where('current.schedule_id = ?', $scheduleId)
8491
->where('current.status = ?', $currentStatus)
85-
->where('existing.schedule_id IS NULL');
86-
87-
$update = $connection->updateFromSelect($selectIfUnlocked, ['current' => $this->getTable('cron_schedule')]);
88-
$result = $connection->query($update)->rowCount();
92+
->forUpdate(true);
8993

90-
if ($result == 1) {
91-
return true;
94+
$scheduleId = $connection->fetchOne($selectIfUnlocked);
95+
if (!empty($scheduleId)) {
96+
// Existing running schedule found
97+
$connection->commit();
98+
return false;
9299
}
93-
return false;
100+
101+
// Mark our schedule as running
102+
$connection->update(
103+
$this->getTable('cron_schedule'),
104+
['status' => new \Zend_Db_Expr($connection->quote($newStatus))],
105+
['schedule_id = ?' => $scheduleId]
106+
);
107+
108+
$connection->commit();
109+
return true;
94110
}
95111
}

app/code/Magento/Cron/Observer/ProcessCronQueueObserver.php

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,7 @@ private function cleanupJobs($groupId, $currentTime)
550550
);
551551

552552
$this->cleanupDisabledJobs($groupId);
553+
$this->cleanupRunningJobs($groupId);
553554

554555
$historySuccess = (int)$this->getCronGroupConfigurationValue($groupId, self::XML_PATH_HISTORY_SUCCESS);
555556
$historyFailure = (int)$this->getCronGroupConfigurationValue($groupId, self::XML_PATH_HISTORY_FAILURE);
@@ -696,6 +697,35 @@ private function cleanupDisabledJobs($groupId)
696697
}
697698
}
698699

700+
/**
701+
* Cleanup jobs that were left in a running state due to an unexpected stop
702+
*
703+
* @param string $groupId
704+
* @return void
705+
*/
706+
private function cleanupRunningJobs($groupId)
707+
{
708+
$scheduleResource = $this->_scheduleFactory->create()->getResource();
709+
$connection = $scheduleResource->getConnection();
710+
711+
$jobs = $this->_config->getJobs();
712+
713+
$connection->update(
714+
$scheduleResource->getMainTable(),
715+
[
716+
'status' => \Magento\Cron\Model\Schedule::STATUS_ERROR,
717+
'messages' => 'Time out'
718+
],
719+
$connection->quoteInto(
720+
'status = ? ' .
721+
'AND job_code IN (?) ' .
722+
'AND (scheduled_at < UTC_TIMESTAMP() - INTERVAL 1 DAY)',
723+
\Magento\Cron\Model\Schedule::STATUS_RUNNING,
724+
array_keys($jobs[$groupId])
725+
)
726+
);
727+
}
728+
699729
/**
700730
* Get cron expression of cron job.
701731
*

app/code/Magento/Cron/Test/Unit/Model/DeadlockRetrierTest.php

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
use Magento\Framework\DB\Adapter\AdapterInterface;
1414
use Magento\Framework\DB\Adapter\DeadlockException;
1515
use PHPUnit\Framework\MockObject\MockObject;
16+
use Psr\Log\LoggerInterface;
1617

1718
class DeadlockRetrierTest extends \PHPUnit\Framework\TestCase
1819
{
@@ -27,6 +28,11 @@ class DeadlockRetrierTest extends \PHPUnit\Framework\TestCase
2728
*/
2829
private $adapterMock;
2930

31+
/**
32+
* @var LoggerInterface|MockObject
33+
*/
34+
private $loggerMock;
35+
3036
/**
3137
* @var AbstractModel|MockObject
3238
*/
@@ -38,8 +44,9 @@ class DeadlockRetrierTest extends \PHPUnit\Framework\TestCase
3844
protected function setUp(): void
3945
{
4046
$this->adapterMock = $this->getMockForAbstractClass(AdapterInterface::class);
47+
$this->loggerMock = $this->getMockForAbstractClass(LoggerInterface::class);
4148
$this->modelMock = $this->createMock(AbstractModel::class);
42-
$this->retrier = new DeadlockRetrier();
49+
$this->retrier = new DeadlockRetrier($this->loggerMock);
4350
}
4451

4552
/**
@@ -75,6 +82,8 @@ public function testRetry(): void
7582
$this->modelMock->expects($this->exactly(DeadlockRetrierInterface::MAX_RETRIES))
7683
->method('getId')
7784
->willThrowException(new DeadlockException());
85+
$this->loggerMock->expects($this->exactly(DeadlockRetrierInterface::MAX_RETRIES - 1))
86+
->method('warning');
7887

7988
$this->retrier->execute(
8089
function () {
@@ -95,6 +104,8 @@ public function testRetrySecond(): void
95104
$this->modelMock->expects($this->at(1))
96105
->method('getId')
97106
->willReturn(2);
107+
$this->loggerMock->expects($this->once())
108+
->method('warning');
98109

99110
$this->retrier->execute(
100111
function () {

app/code/Magento/Cron/Test/Unit/Observer/ProcessCronQueueObserverTest.php

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,8 +1047,8 @@ public function testMissedJobsCleanedInTime()
10471047
$this->scheduleCollectionMock->expects($this->any())->method('load')->willReturnSelf();
10481048

10491049
$scheduleMock->expects($this->any())->method('getCollection')->willReturn($this->scheduleCollectionMock);
1050-
$scheduleMock->expects($this->exactly(9))->method('getResource')->willReturn($this->scheduleResourceMock);
1051-
$this->scheduleFactoryMock->expects($this->exactly(10))->method('create')->willReturn($scheduleMock);
1050+
$scheduleMock->expects($this->exactly(10))->method('getResource')->willReturn($this->scheduleResourceMock);
1051+
$this->scheduleFactoryMock->expects($this->exactly(11))->method('create')->willReturn($scheduleMock);
10521052

10531053
$connectionMock = $this->getMockForAbstractClass(AdapterInterface::class);
10541054

@@ -1078,11 +1078,29 @@ public function testMissedJobsCleanedInTime()
10781078
)
10791079
->willReturn(1);
10801080

1081-
$this->scheduleResourceMock->expects($this->exactly(5))
1081+
$connectionMock->expects($this->once())
1082+
->method('quoteInto')
1083+
->with(
1084+
'status = ? AND job_code IN (?) AND (scheduled_at < UTC_TIMESTAMP() - INTERVAL 1 DAY)',
1085+
['test_job1'],
1086+
'running'
1087+
)
1088+
->willReturn('');
1089+
1090+
$connectionMock->expects($this->once())
1091+
->method('update')
1092+
->with(
1093+
$tableName,
1094+
['status' => 'error', 'messages' => 'Time out'],
1095+
''
1096+
)
1097+
->willReturn(0);
1098+
1099+
$this->scheduleResourceMock->expects($this->exactly(6))
10821100
->method('getTable')
10831101
->with($tableName)
10841102
->willReturn($tableName);
1085-
$this->scheduleResourceMock->expects($this->exactly(14))
1103+
$this->scheduleResourceMock->expects($this->exactly(15))
10861104
->method('getConnection')
10871105
->willReturn($connectionMock);
10881106

app/code/Magento/Cron/etc/db_schema.xml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,10 @@
2121
<constraint xsi:type="primary" referenceId="PRIMARY">
2222
<column name="schedule_id"/>
2323
</constraint>
24-
<index referenceId="CRON_SCHEDULE_JOB_CODE" indexType="btree">
24+
<index referenceId="CRON_SCHEDULE_STATUS_JOB_CODE_SCHEDULED_AT" indexType="btree">
25+
<column name="status"/>
2526
<column name="job_code"/>
26-
</index>
27-
<index referenceId="CRON_SCHEDULE_SCHEDULED_AT_STATUS" indexType="btree">
2827
<column name="scheduled_at"/>
29-
<column name="status"/>
3028
</index>
3129
</table>
3230
</schema>

0 commit comments

Comments
 (0)