-
Notifications
You must be signed in to change notification settings - Fork 576
Stripe size-sorted files across parallel jobs #5844
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,8 +5,12 @@ | |
| use PHPUnit\Framework\Attributes\DataProvider; | ||
| use PHPUnit\Framework\TestCase; | ||
| use function array_fill; | ||
| use function array_keys; | ||
| use function array_map; | ||
| use function array_merge; | ||
| use function count; | ||
| use function sort; | ||
| use function sprintf; | ||
|
|
||
| class SchedulerTest extends TestCase | ||
| { | ||
|
|
@@ -21,7 +25,7 @@ public static function dataSchedule(): array | |
| 50, | ||
| 115, | ||
| 1, | ||
| [50, 50, 15], | ||
| [39, 38, 38], | ||
| ], | ||
| [ | ||
| 16, | ||
|
|
@@ -30,7 +34,7 @@ public static function dataSchedule(): array | |
| 30, | ||
| 124, | ||
| 5, | ||
| [30, 30, 30, 30, 4], | ||
| [25, 25, 25, 25, 24], | ||
| ], | ||
| [ | ||
| 16, | ||
|
|
@@ -39,7 +43,7 @@ public static function dataSchedule(): array | |
| 30, | ||
| 124, | ||
| 3, | ||
| [30, 30, 30, 30, 4], | ||
| [25, 25, 25, 25, 24], | ||
| ], | ||
| [ | ||
| 16, | ||
|
|
@@ -48,7 +52,7 @@ public static function dataSchedule(): array | |
| 10, | ||
| 298, | ||
| 16, | ||
| [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8], | ||
| [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9], | ||
| ], | ||
| [ | ||
| 16, | ||
|
|
@@ -57,7 +61,7 @@ public static function dataSchedule(): array | |
| 30, | ||
| 124, | ||
| 2, | ||
| [30, 30, 30, 30, 4], | ||
| [25, 25, 25, 25, 24], | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the tests could be more expressive here. Could we supply differently sized files here, or maybe change the Scheduler signature so that we provide file sizes from the outside and the class is still unit testable?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done — |
||
| ], | ||
| [ | ||
| 16, | ||
|
|
@@ -91,11 +95,57 @@ public function testSchedule( | |
| { | ||
| $files = array_fill(0, $numberOfFiles, 'file.php'); | ||
| $scheduler = new Scheduler($jobSize, $maximumNumberOfProcesses, $minimumNumberOfJobsPerProcess); | ||
| $schedule = $scheduler->scheduleWork($cpuCores, $files); | ||
| $schedule = $scheduler->scheduleWork($cpuCores, $files, static fn (string $file): int => 0); | ||
|
|
||
| $this->assertSame($expectedNumberOfProcesses, $schedule->getNumberOfProcesses()); | ||
| $jobSizes = array_map(static fn (array $job): int => count($job), $schedule->getJobs()); | ||
| $this->assertSame($expectedJobSizes, $jobSizes); | ||
| } | ||
|
|
||
| public function testHeaviestFilesAreSpreadAcrossJobs(): void | ||
| { | ||
| $fileSizes = [ | ||
| 'a.php' => 100, | ||
| 'b.php' => 200, | ||
| 'c.php' => 300, | ||
| 'd.php' => 400, | ||
| 'e.php' => 500, | ||
| 'f.php' => 600, | ||
| ]; | ||
|
|
||
| $scheduler = new Scheduler(2, 16, 1); | ||
| $schedule = $scheduler->scheduleWork(16, array_keys($fileSizes), static fn (string $file): int => $fileSizes[$file] ?? 0); | ||
|
|
||
| // six files, job size 2 -> three jobs; the three heaviest files must not | ||
| // share a job, and every job pairs one heavy file with one light file | ||
| $this->assertSame([ | ||
| ['f.php', 'c.php'], | ||
| ['e.php', 'b.php'], | ||
| ['d.php', 'a.php'], | ||
| ], $schedule->getJobs()); | ||
| } | ||
|
|
||
| public function testEveryFileIsScheduledExactlyOnce(): void | ||
| { | ||
| $files = []; | ||
| $sizes = []; | ||
| for ($i = 0; $i < 47; $i++) { | ||
| $file = sprintf('file-%d.php', $i); | ||
| $files[] = $file; | ||
| $sizes[$file] = ($i * 37) % 1000; | ||
| } | ||
|
|
||
| $scheduler = new Scheduler(10, 16, 1); | ||
| $schedule = $scheduler->scheduleWork(16, $files, static fn (string $file): int => $sizes[$file]); | ||
|
|
||
| $scheduled = array_merge(...$schedule->getJobs()); | ||
| sort($scheduled); | ||
| sort($files); | ||
| $this->assertSame($files, $scheduled); | ||
|
|
||
| foreach ($schedule->getJobs() as $job) { | ||
| $this->assertLessThanOrEqual(10, count($job)); | ||
| } | ||
| } | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
could this last block be simplified to
$jobs = array_chunk($files, $this->jobSize);again?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It can't, unfortunately — that's the one shape I measured as actively harmful. With the list sorted by size,
array_chunk()puts the ~20 heaviest files together in the first job, and whichever worker picks it up becomes a straggler the whole run waits for: sort + chunk measured +46% cold wall time vs. unsorted chunking on the src/Type self-analysis benchmark, while sort + round-robin striping is what produces the −21..23%. The striping is the point, not an artifact; the newtestHeaviestFilesAreSpreadAcrossJobstest pins exactly this property.