Last active
June 26, 2025 19:45
-
-
Save ZachWatkins/ee49177cdd4b5f0825c06816d6e4a5a1 to your computer and use it in GitHub Desktop.
Service class for using PHPSpreadsheet to read workbooks with low memory use
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App\Services; | |
use PhpOffice\PhpSpreadsheet\IOFactory; | |
use PhpOffice\PhpSpreadsheet\Reader\IReader; | |
use PhpOffice\PhpSpreadsheet\Reader\IReadFilter; | |
use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet; | |
class WorkbookReadService | |
{ | |
protected ?string $filePath = null; | |
protected ?string $fileType = null; | |
protected ?array $worksheetInfo = null; | |
public int $chunkSize = 400; | |
protected ?IReader $reader = null; | |
protected ?array $headerRow = null; | |
protected ?string $highestHeaderColumn = null; | |
protected $headerFormatter; | |
/** | |
* Set the file path and determine the file type for processing. | |
* @param string $filePath | |
*/ | |
public function setFilePath(string $filePath): void | |
{ | |
if ($filePath === $this->filePath) { | |
return; | |
} | |
$this->filePath = $filePath; | |
$this->fileType = ucfirst(pathinfo($filePath, PATHINFO_EXTENSION)); | |
$this->headerRow = null; | |
$this->worksheetInfo = null; | |
$this->reader = null; | |
$this->highestHeaderColumn = null; | |
} | |
/** | |
* Get the header row from the workbook. Filters out empty header cells. | |
* @return array | |
* @throws \RuntimeException If the file path is not set. | |
*/ | |
public function getHeaderRow(?Worksheet $sheet = null): array | |
{ | |
if (!$this->filePath) { | |
throw new \RuntimeException('File path is not set.'); | |
} | |
if ($this->headerRow) { | |
return $this->headerRow; | |
} | |
if (!$sheet) { | |
$reader = $this->getReader(); | |
$chunkFilter = new ChunkReadFilter(1, 1); | |
$reader->setReadFilter($chunkFilter); | |
$spreadsheet = $reader->load($this->filePath); | |
$sheet = $spreadsheet->getActiveSheet(); | |
} | |
$maxDataColumn = $sheet->getHighestColumn(); | |
$rowIterator = $sheet->getRowIterator(1, 1); | |
$this->headerRow = []; | |
foreach ($rowIterator as $row) { | |
$columnIterator = $row->getCellIterator('A', $maxDataColumn); | |
foreach ($columnIterator as $cell) { | |
if ($cell->getValue() !== null && $cell->getValue() !== '') { | |
$column = $cell->getColumn(); | |
$this->headerRow[$column] = $cell->getValue(); | |
$this->highestHeaderColumn = $column; | |
} | |
} | |
} | |
return $this->headerRow; | |
} | |
/** | |
* Get the worksheet info. | |
* @return array | |
*/ | |
public function getWorksheetInfo(): array | |
{ | |
if (!$this->filePath) { | |
throw new \RuntimeException('File path is not set.'); | |
} | |
if (!$this->worksheetInfo) { | |
$reader = $this->getReader(); | |
$worksheetInfo = $reader->listWorksheetInfo($this->filePath)[0] ?? []; | |
$this->worksheetInfo = [ | |
'worksheetName' => $worksheetInfo['worksheetName'] ?? '', | |
'totalRows' => $worksheetInfo['totalRows'] ?? 0, | |
'totalColumns' => $worksheetInfo['totalColumns'] ?? 0, | |
'lastColumnLetter' => $worksheetInfo['lastColumnLetter'] ?? '', | |
'lastColumnIndex' => $worksheetInfo['lastColumnIndex'] ?? 0, | |
'sheetState' => $worksheetInfo['sheetState'] ?? '', | |
]; | |
} | |
return $this->worksheetInfo; | |
} | |
/** | |
* Yield every row from the workbook. | |
* @return \Generator | |
*/ | |
public function yieldRow(): \Generator | |
{ | |
if (!$this->filePath) { | |
throw new \RuntimeException('File path is not set.'); | |
} | |
$rawRows = $this->getWorksheetInfo()['totalRows']; | |
for ($startRow = 1; $startRow <= $rawRows; $startRow += $this->chunkSize) { | |
$reader = $this->getReader(); | |
$chunkFilter = new ChunkReadFilter($startRow, $this->chunkSize); | |
$reader->setReadFilter($chunkFilter); | |
$spreadsheet = $reader->load($this->filePath); | |
$activeSheet = $spreadsheet->getActiveSheet(); | |
$endColumn = $this->getHighestHeaderColumn($activeSheet); | |
$endRow = min($startRow + $this->chunkSize - 1, $rawRows); | |
$rowIterator = $activeSheet->getRowIterator($startRow, $endRow); | |
foreach ($rowIterator as $row) { | |
$columnIterator = $row->getCellIterator('A', $endColumn); | |
$values = []; | |
foreach ($columnIterator as $cell) { | |
$values[$cell->getColumn()] = $cell->getValue(); | |
} | |
yield $values; | |
} | |
// Release memory. | |
$spreadsheet->disconnectWorksheets(); | |
$spreadsheet->__destruct(); | |
$spreadsheet = null; | |
unset($spreadsheet); | |
$reader = null; | |
unset($reader); | |
$chunkFilter = null; | |
unset($chunkFilter); | |
} | |
} | |
/** | |
* Yield a chunk of rows from the workbook in a single read operation. | |
* @param int $startRow The first row to read. | |
* @param int|null $endRow The last row to read. If null, it will read the next chunk size rows. | |
* @return \Generator | |
* @throws \RuntimeException If the file path is not set. | |
* @throws \InvalidArgumentException If the start row is less than 1. | |
*/ | |
public function yieldRowRange(int $startRow, int $endRow): \Generator | |
{ | |
if (!$this->filePath) { | |
throw new \RuntimeException('File path is not set.'); | |
} | |
if ($startRow < 1) { | |
throw new \InvalidArgumentException('Start row must be greater than or equal to 1.'); | |
} | |
if ($endRow < $startRow) { | |
throw new \InvalidArgumentException('End row must be greater than or equal to start row. Found startRow: ' . $startRow . ', endRow: ' . $endRow); | |
} | |
$reader = $this->getReader(); | |
$chunkFilter = new ChunkReadFilter($startRow, $this->chunkSize); | |
$reader->setReadFilter($chunkFilter); | |
$spreadsheet = $reader->load($this->filePath); | |
$activeSheet = $spreadsheet->getActiveSheet(); | |
$endColumn = $this->getHighestHeaderColumn($activeSheet); | |
$rowIterator = $activeSheet->getRowIterator($startRow, $endRow); | |
foreach ($rowIterator as $row) { | |
$columnIterator = $row->getCellIterator('A', $endColumn); | |
$values = []; | |
foreach ($columnIterator as $cell) { | |
$values[$cell->getColumn()] = $cell->getValue(); | |
} | |
yield $values; | |
} | |
// Release memory. | |
$spreadsheet->disconnectWorksheets(); | |
$spreadsheet->__destruct(); | |
$spreadsheet = null; | |
unset($spreadsheet); | |
$reader = null; | |
unset($reader); | |
$chunkFilter = null; | |
unset($chunkFilter); | |
} | |
/** | |
* Yield a chunk of rows from the workbook in a single read operation with the header row's cell values as row cell keys. | |
* @param int $startRow The first row to read. | |
* @param int|null $endRow The last row to read. If null, it will read the next chunk size rows. | |
* @return \Generator | |
* @throws \RuntimeException If the file path is not set. | |
* @throws \InvalidArgumentException If the start row is less than 1. | |
*/ | |
public function yieldRowRangeWithHeaders(int $startRow, int $endRow): \Generator | |
{ | |
if (!$this->filePath) { | |
throw new \RuntimeException('File path is not set.'); | |
} | |
if ($startRow < 1) { | |
throw new \InvalidArgumentException('Start row must be greater than or equal to 1.'); | |
} | |
if ($endRow < $startRow) { | |
throw new \InvalidArgumentException('End row must be greater than or equal to start row. Found startRow: ' . $startRow . ', endRow: ' . $endRow); | |
} | |
$reader = $this->getReader(); | |
$chunkFilter = new ChunkReadFilter($startRow, $this->chunkSize); | |
$reader->setReadFilter($chunkFilter); | |
$spreadsheet = $reader->load($this->filePath); | |
$activeSheet = $spreadsheet->getActiveSheet(); | |
$rowIterator = $activeSheet->getRowIterator(1, 1); | |
$header = []; | |
$endColumn = $activeSheet->getHighestDataColumn(); | |
foreach ($rowIterator as $row) { | |
$columnIterator = $row->getCellIterator('A', $activeSheet->getHighestDataColumn()); | |
foreach ($columnIterator as $cell) { | |
if ($cell->getValue() === null || $cell->getValue() === '') { | |
break; // Stop at the first empty cell in the header row. | |
} | |
$columnLetter = $cell->getColumn(); | |
if ($this->headerFormatter) { | |
$header[$columnLetter] = call_user_func($this->headerFormatter, $cell->getValue()); | |
} else { | |
$header[$columnLetter] = $cell->getValue(); | |
} | |
$endColumn = $columnLetter; | |
} | |
} | |
$rowIterator = $activeSheet->getRowIterator($startRow, $endRow); | |
foreach ($rowIterator as $row) { | |
$columnIterator = $row->getCellIterator('A', $endColumn); | |
$values = []; | |
foreach ($columnIterator as $cell) { | |
$values[$header[$cell->getColumn()]] = $cell->getValue(); | |
} | |
yield $values; | |
} | |
// Release memory. | |
$spreadsheet->disconnectWorksheets(); | |
$spreadsheet->__destruct(); | |
$spreadsheet = null; | |
unset($spreadsheet); | |
$reader = null; | |
unset($reader); | |
$chunkFilter = null; | |
unset($chunkFilter); | |
} | |
/** | |
* Set the file reader object, or create a new one based on the file type. | |
* @return IReader | |
*/ | |
protected function getReader(): IReader | |
{ | |
if (!$this->fileType) { | |
throw new \RuntimeException('File type is not set.'); | |
} | |
if (!$this->reader) { | |
$this->reader = IOFactory::createReader($this->fileType); | |
} | |
return $this->reader; | |
} | |
/** | |
* Get the highest header column letter from the worksheet. | |
* This is useful for determining the last column with data in the header row. | |
* @param Worksheet $sheet | |
* @return string | |
*/ | |
protected function getHighestHeaderColumn(Worksheet $sheet): string | |
{ | |
if ($this->highestHeaderColumn) { | |
return $this->highestHeaderColumn; | |
} | |
$maxColumn = $sheet->getHighestColumn(); | |
$rowIterator = $sheet->getRowIterator(1, 1); | |
foreach ($rowIterator as $row) { | |
$columnIterator = $row->getCellIterator('A', $maxColumn); | |
foreach ($columnIterator as $cell) { | |
if ($cell->getValue() !== null && $cell->getValue() !== '') { | |
$maxColumn = $cell->getColumn(); | |
} | |
} | |
} | |
$this->highestHeaderColumn = $maxColumn; | |
return $maxColumn; | |
} | |
public function setHeaderFormatter(callable $formatter): void | |
{ | |
$this->headerFormatter = $formatter; | |
} | |
} | |
/** Define a Read Filter class implementing IReadFilter */ | |
class ChunkReadFilter implements IReadFilter | |
{ | |
private int $startRow; | |
private int $endRow; | |
/** | |
* We expect a list of the rows that we want to read to be passed into the constructor. | |
* @param int $startRow The first row to read. | |
* @param int $chunkSize The number of rows to read in this chunk. | |
*/ | |
public function __construct(int $startRow, int $chunkSize) | |
{ | |
$this->startRow = $startRow; | |
$this->endRow = $startRow + $chunkSize; | |
} | |
public function readCell(string $columnAddress, int $row, string $worksheetName = ''): bool | |
{ | |
// Only read the heading row, and the rows that were configured in the constructor | |
if (($row == 1) || ($row >= $this->startRow && $row < $this->endRow)) { | |
return true; | |
} | |
return false; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment