Skip to content

Instantly share code, notes, and snippets.

@thekid
Last active June 21, 2025 22:19
Show Gist options
  • Save thekid/e2176ddb615150e082a104149bd950ec to your computer and use it in GitHub Desktop.
Save thekid/e2176ddb615150e082a104149bd950ec to your computer and use it in GitHub Desktop.
PDF Parser
<?php
use io\streams\InputStream;
use lang\FormatException;
/**
 * Character map (CMap) translating character codes to UTF-8 text
 *
 * Reads the `codespacerange`, `bfchar`, `bfrange` and `cidrange`
 * enumerations from the given stream; everything else is skipped.
 *
 * @test  CharacterMapTest
 */
class CharacterMap {
  private $tokens;

  /** @var ?int Number of hex digits of the source codes, NULL until parsed */
  public $width= null;

  /** @var array Single mappings (code => UTF-8 string) and `[lo, hi, target]` ranges */
  public $chars= [], $ranges= [];

  /** @var int[] Lowest and highest code covered by this map */
  public $codespace= [0x0000, 0xffff];

  /** Creates a character map reading its definition from the given stream */
  public function __construct(InputStream $in) {
    $this->tokens= new Tokens($in);
  }

  /**
   * Converts a code point to UTF-8
   *
   * @param  int $code
   * @return string
   */
  private function char($code) {
    return iconv('ucs-4', 'utf-8', pack('N', $code));
  }

  /**
   * Reads one `N begin<type> ... end<type>` enumeration
   *
   * @param  string $type The part after `begin`, e.g. `bfchar`
   * @param  int $length Number of entries announced before `begin<type>`
   * @return void
   * @throws lang.FormatException for unknown enumeration types
   */
  private function enumeration($type, $length) {
    $source= null;
    switch ($type) {
      case 'cmap':
        return;

      case 'codespacerange':
        for ($i= 0; $i < $length; $i++) {
          $lo= $this->tokens->token();
          $hi= $this->tokens->token();
          if (null === $lo || null === $hi) return;   // Truncated input

          $this->codespace= [hexdec($lo[1]), hexdec($hi[1])];
        }
        return;

      case 'bfchar':
        for ($i= 0; $i < $length; $i++) {
          $src= $this->tokens->token();
          $map= $this->tokens->token();
          if (null === $src || null === $map) break;  // Truncated input

          $this->chars[hexdec($src[1])]= iconv('utf-16be', 'utf-8', hex2bin($map[1]));
          $source= $src[1];
        }
        break;

      case 'bfrange': case 'cidrange':
        for ($i= 0; $i < $length; $i++) {
          $lo= $this->tokens->token();
          $hi= $this->tokens->token();
          $token= $this->tokens->token();
          if (null === $lo || null === $hi || null === $token) break;

          // The target is either an array with one entry per code, or a
          // single start value the offset inside the range is added to.
          if ('array-start' === $token[0]) {
            $map= [];
            while (null !== ($token= $this->tokens->token())) {
              if ('hex' === $token[0]) {
                $map[]= hexdec($token[1]);
              } else if ('integer' === $token[0]) {
                $map[]= $token[1];
              } else {
                break;
              }
            }
          } else if ('hex' === $token[0]) {
            $map= hexdec($token[1]);
          } else {
            $map= $token[1];
          }

          $this->ranges[]= [hexdec($lo[1]), hexdec($hi[1]), $map];
          $source= $lo[1];
        }
        break;

      default:
        throw new FormatException('Unknown character map enumeration '.$type);
    }

    // Only the first enumeration with entries determines the width. Empty
    // enumerations must not touch it - there is no source code to measure.
    if (null !== $source) {
      $this->width??= strlen((string)$source);
    }
  }

  /**
   * Parses the definition up to `endcmap` or, if that is missing, to the
   * end of the input. The underlying stream is closed afterwards.
   *
   * @return self
   * @throws lang.FormatException for unknown enumeration types
   */
  public function parse(): self {
    try {
      $length= 0;

      // Stop at EOF instead of spinning forever on a truncated definition
      while (null !== ($token= $this->tokens->token())) {
        if ('integer' === $token[0]) {

          // Parse `2 begin[...]` - an enumeration with 2 entries
          $length= $token[1];
        } else if ('word' !== $token[0]) {
          continue;
        } else if ('endcmap' === $token[1]) {
          break;
        } else if (1 === sscanf($token[1], 'begin%s', $type)) {
          $this->enumeration($type, $length);
        }
      }
      return $this;
    } finally {
      $this->tokens->close();
    }
  }

  /**
   * Translates character codes to an UTF-8 string, parsing the definition
   * first if no width is known yet.
   *
   * Codes without a mapping are rendered as `\u{xxxx}`; an incomplete code
   * at the end of the input is dropped.
   *
   * @param  string $bytes Big-endian character codes
   * @param  var $fallback Accepted for compatibility, currently unused
   * @return string
   */
  public function translate($bytes, $fallback= null) {
    $this->width ?? $this->parse();

    // Width counts hex digits, two of them make up one byte
    $size= max(1, (int)ceil(($this->width ?? 2) / 2));
    $string= '';
    for ($i= 0, $l= strlen($bytes); $i + $size <= $l; $i+= $size) {
      $code= hexdec(bin2hex(substr($bytes, $i, $size)));

      if (null !== ($char= $this->chars[$code] ?? null)) {
        $string.= $char;
      } else if ($code < $this->codespace[0] || $code > $this->codespace[1]) {
        $string.= $this->char($code);
      } else {
        $char= null;
        foreach ($this->ranges as $range) {
          if ($code < $range[0] || $code > $range[1]) continue;

          if (!is_array($range[2])) {
            $char= $this->char($code - $range[0] + $range[2]);
          } else if (isset($range[2][$code - $range[0]])) {
            $char= $this->char($range[2][$code - $range[0]]);
          }
          break;
        }
        $string.= $char ?? sprintf('\u{%04x}', $code);
      }
    }
    return $string;
  }
}
<?php
use test\{Assert, Test, Values};
use io\streams\MemoryInputStream;
/** Verifies parsing and translation using an Adobe-Identity-UCS character map */
class CharacterMapTest {

  /** @var string Character map with code space, single mappings and ranges */
  private $definition= <<<'MULTIBYTE'
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo
<< /Registry (Adobe)
/Ordering (UCS)
/Supplement 0
>> def
/CMapName
/Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
4 beginbfchar
<0003> <0054>
<000F> <0065>
<0011> <0073>
<0012> <007400650064>
endbfchar
2 beginbfrange
<0013> <001C> <0030>
<03E0> <03E1> [<002C> <003A>]
endbfrange
endcmap
CMapName currentdict /CMap defineresource pop
end
end
MULTIBYTE;

  /**
   * Provides the definition once as-is and once on a single line
   *
   * @return iterable
   */
  private function formats() {
    $compact= strtr($this->definition, "\n", ' ');

    yield [new MemoryInputStream($this->definition), 'multiline'];
    yield [new MemoryInputStream($compact), 'compact'];
  }

  #[Test, Values(from: 'formats')]
  public function parse($stream, $kind) {
    $map= new CharacterMap($stream);
    $parsed= $map->parse();

    Assert::equals(4, $parsed->width);
    Assert::equals([0x0000, 0xffff], $parsed->codespace);
    Assert::equals(
      [0x0003 => 'T', 0x000F => 'e', 0x0011 => 's', 0x0012 => 'ted'],
      $parsed->chars
    );
    Assert::equals(
      [[0x0013, 0x001C, 0x0030], [0x03E0, 0x03E1, [0x002C, 0x003A]]],
      $parsed->ranges
    );
  }

  #[Test]
  public function translate_hex_string() {
    $map= new CharacterMap(new MemoryInputStream($this->definition));
    $codes= hex2bin('0003000F001100120014001C03E1');

    Assert::equals('Tested19:', $map->translate($codes));
  }
}
<?php
use io\streams\InputStream;
/**
 * Content stream, a sequence of operands followed by an operator
 *
 * @test  ContentTest
 */
class Content {
  private $tokens;

  /** Creates a content instance reading from the given stream */
  public function __construct(InputStream $in) {
    $this->tokens= new Tokens($in);
  }

  /**
   * Yields operators as keys and the operand tokens preceding them as
   * values. Operands not followed by an operator are discarded.
   *
   * The underlying stream is closed once iteration ends - also if the
   * consumer stops early and the generator is destroyed.
   *
   * @return Generator
   */
  public function operations(): Generator {
    try {
      $operands= [];
      while (null !== ($token= $this->tokens->token())) {
        if ('word' !== $token[0]) {
          $operands[]= $token;
          continue;
        }

        // Attach inline image data, which runs up to the `EI` operator
        if ('ID' === $token[1]) {
          $operands[]= ['$inline', trim($this->tokens->scan("\nEI"))];
        }

        yield $token[1] => $operands;
        $operands= [];
      }
    } finally {
      $this->tokens->close();
    }
  }
}
<?php
use io\streams\MemoryInputStream;
use test\{Assert, Test};
/** Verifies content streams are split into operators and their operands */
class ContentTest {

  /**
   * Parses the given source and returns one `[operator => operands]` map
   * per operation, in the order they occur.
   */
  private function parse($content) {
    $operations= [];
    $fixture= new Content(new MemoryInputStream($content));
    foreach ($fixture->operations() as $operator => $operands) {
      $operations[]= [$operator => $operands];
    }
    return $operations;
  }

  #[Test]
  public function text_block() {
    $expected= [
      ['BT' => []],
      ['Tf' => [['name', 'R13'], ['integer', 60]]],
      ['Tj' => [['string', 'Test']]],
      ['ET' => []],
    ];

    Assert::equals($expected, $this->parse(<<<'PS'
BT
/R13 60 Tf
(Test)Tj
ET
PS)
    );
  }

  #[Test]
  public function inline_image() {
    $expected= [
      ['BI' => []],
      ['ID' => [['name', 'CS'], ['name', 'RGB'], ['$inline', "GIF89a\n..."]]],
      ['EI' => []],
    ];

    Assert::equals($expected, $this->parse(
      "BI\n".
      "/CS/RGB\n".
      "ID GIF89a\n".
      "...\n".
      "EI\n"
    ));
  }
}
<?php
use io\streams\FileInputStream;
use util\cmd\Console;
use util\Objects;
use lang\FormatException;
// Open the file given as first argument; any failure is reported and
// ends the script with exit code 2.
try {
  $parser= new PdfFile(new FileInputStream($argv[1]));
} catch (Throwable $e) {
  Console::writeLine($argv[1], ': ', $e);
  return 2;
}

// Collect all objects by their ID and merge all trailers, printing a dot
// for everything the parser yields
$objects= [];
$trailer= [];
Console::$err->write('Parsing ', $argv[1], ' [');
foreach ($parser->objects() as $kind => $value) {
  Console::$err->write('.');
  switch ($kind) {
    case 'object':
      $objects[$value['id']->hashCode()]= $value['dict'];
      break;

    case 'trailer':
      $trailer+= $value;
      break;
  }
}
Console::$err->writeLine(']');

// Handle files without `trailer` entry, e.g. linearized PDFs: Use the
// first cross reference stream instead
if (!$trailer) {
  foreach ($objects as $candidate) {
    if ('XRef' !== ($candidate['Type'] ?? null)) continue;

    $trailer= $candidate;
    break;
  }
}

// Handle encryption
Console::writeLine('Trailer: ', $trailer);
if (isset($trailer['Encrypt'])) {
  $encrypt= $objects[$trailer['Encrypt']->hashCode()];
  Console::writeLine('Cannot handle encrypted PDFs: ', $encrypt);
  return 1;
}

// Now, unpack object streams and add them to the lookup table. An object
// stream starts with N pairs of object number and offset, followed by the
// objects themselves, which are read in that same order.
foreach ($objects as $id => $object) {
  if ('ObjStm' !== ($object['Type'] ?? null)) continue;

  $count= $object['N'];
  $tokens= new Tokens($object['$stream']);
  $keys= [];
  for ($i= 0; $i < $count; $i++) {
    $number= $tokens->token()[1];
    $tokens->token();   // Offset, not needed when reading sequentially
    $keys[$i]= $number.'_0';
  }

  foreach ($keys as $key) {
    $objects[$key]= $tokens->value();
  }
}

$info= $trailer['Info'] ?? null;
if ($info) {
  Console::writeLine('Info: ', $objects[$info->hashCode()]);
}

$root= $objects[$trailer['Root']->hashCode()];
Console::writeLine('Root: ', $root);

$pages= $objects[$root['Pages']->hashCode()];
Console::writeLine('Pages: ', $pages);
// Resolves references via the lookup table, returns anything else as-is
$objectOf= function($arg) use(&$objects) {
  if ($arg instanceof Ref) {
    return $objects[$arg->hashCode()];
  }
  return $arg;
};
// Walks the page tree, yielding hierarchical page numbers ("1", "2.1", ...)
// as keys and the page dictionary extended by a `$contents` list holding
// the page's content streams as values.
$contentsOf= function($kids, $suffix= '') use(&$contentsOf, &$objectOf, &$objects) {
  foreach ($objectOf($kids) as $i => $ref) {
    $page= $objects[$ref->hashCode()];
    $number= $suffix.($i + 1);

    // Intermediate nodes: Descend
    if ('Pages' === ($page['Type'] ?? null)) {
      yield from $contentsOf($page['Kids'], $number.'.');
      continue;
    }

    // A single content object, an array of references, or - for blank
    // pages - no contents entry at all
    $contents= $objectOf($page['Contents'] ?? []) ?? [];
    if ($stream= $contents['$stream'] ?? null) {
      $streams= [$stream];
    } else {
      $streams= [];
      foreach ($contents as $part) {
        $streams[]= $objects[$part->hashCode()]['$stream'];
      }
    }

    yield $number => $page + ['$contents' => $streams];
  }
};
// Converts bytes in the given font encoding - a name, an encoding
// dictionary or NULL - to a string. `Identity-H` requires the font's
// character map to be passed as `$unicode`.
$stringOf= function($bytes, $encoding, $unicode= null) use(&$stringOf) {
  if (null === $encoding) {
    return $bytes;
  } else if ('MacRomanEncoding' === $encoding) {
    return iconv('macintosh', \xp::ENCODING, $bytes);
  } else if ('WinAnsiEncoding' === $encoding) {
    return iconv('cp1252', \xp::ENCODING, $bytes);
  } else if ('Identity-H' === $encoding) {
    if (null === $unicode) {
      throw new FormatException('Cannot handle Identity-H encoding without unicode');
    }
    return $unicode->translate($bytes);
  }

  // An encoding dictionary, use its base encoding. Other encoding names
  // have none and end up returning the bytes unchanged.
  // FIXME: Respect differences
  return $stringOf($bytes, $encoding['BaseEncoding'] ?? null, $unicode);
};
// Returns the text of a `string` or `hex` token, decoded according to the
// font selected in `$format['font']`. Throws a FormatException for fonts
// which could not be resolved or cannot be handled.
$textOf= function($token, array $format) use(&$objects, &$objectOf, &$stringOf) {
  $font= $format['font'];

  // Fonts missing from the page's resources are passed as "@id" strings
  if (!is_array($font)) {
    throw new FormatException('Unknown font '.$font);
  }

  // TODO Handle DescendantFonts, see https://github.com/adobe-type-tools/cmap-resources

  $bytes= 'hex' === $token[0] ? hex2bin($token[1]) : $token[1];
  switch ($font['Subtype'] ?? null) {
    case 'Type0': case 'Type1': case 'TrueType':
      return $stringOf(
        $bytes,
        $objectOf($font['Encoding'] ?? null),
        $font['$unicode'] ?? null
      );

    case 'Type3':
      if ($map= $font['$unicode'] ?? null) {
        return $map->translate($bytes);
      }
      throw new FormatException('Cannot handle type 3 fonts without unicode');

    default:
      throw new FormatException('Unknown font type '.($font['Subtype'] ?? null));
  }
};
// Print the text of all pages, one paragraph per text block, along with
// placeholders for the images drawn
foreach ($contentsOf($pages['Kids']) as $number => $page) {
  Console::writeLine('- Page #', $number, ' contents: ', Objects::stringOf($page['Contents'] ?? null, ' '));

  // Resources may be missing on the page itself, e.g. when inherited
  $declared= $objectOf($page['Resources'] ?? null) ?? [];

  // Resolve fonts, which are given as references or inline dictionaries
  $fonts= [];
  foreach ($objectOf($declared['Font'] ?? null) ?? [] as $id => $ref) {
    $resolved= $objectOf($ref);
    if ($map= $resolved['ToUnicode'] ?? null) {
      $resolved['$unicode']= new CharacterMap($objectOf($map)['$stream']);
    }
    $fonts[$id]= $resolved;
  }

  // Select no font for the beginning
  $font= null;

  // Resolve objects
  $xobjects= [];
  foreach ($objectOf($declared['XObject'] ?? null) ?? [] as $id => $ref) {
    $xobjects[$id]= $objectOf($ref);
  }

  $paragraph= '';
  $content= new Content(new SequenceInputStream($page['$contents']));
  foreach ($content->operations() as $op => $arguments) {
    switch ($op) {
      case 'BT':
        // NOOP
        break;

      case 'ET':
        Console::writeLine(' ', str_replace("\n", "\n ", $paragraph));
        Console::writeLine();
        $paragraph= '';
        break;

      case 'Tf':
        $id= $arguments[0][1];
        $size= $arguments[1][1];
        $font= ['font' => $fonts[$id] ?? '@'.$id, 'size' => $size];
        break;

      case 'Tj':
        $paragraph.= $textOf($arguments[0], $font);
        break;

      case 'TJ':

        // Operands are the raw tokens of an array - only show its strings
        foreach ($arguments as $argument) {
          if ('hex' === $argument[0] || 'string' === $argument[0]) {
            $paragraph.= $textOf($argument, $font);
          }
        }
        break;

      case "'":
        $paragraph.= "\n".$textOf($arguments[0], $font);
        break;

      case '"':
        $paragraph.= "\n".$textOf($arguments[2], $font);
        break;

      case 'TD':
        $paragraph.= "\n";
        break;

      case 'Do':
        $id= $arguments[0][1];
        $xobject= $xobjects[$id] ?? null;
        if ('Image' === ($xobject['Subtype'] ?? null)) {
          Console::writeLine(' ', "![{$id}]({$xobject['Subtype']}:{$xobject['Width']}x{$xobject['Height']})\n");
        }
        break;
    }
  }
}
<?php
use io\streams\InputStream;
use lang\FormatException;
/**
 * Reads the top-level structure of a PDF file
 *
 * @see https://opensource.adobe.com/dc-acrobat-sdk-docs/pdflsdk/#pdf-reference
 */
class PdfFile {
  private $tokens;

  /** @var string Version as given in the file header, e.g. "1.7" */
  public $version;

  /**
   * Creates a new instance, reading the header from the given stream
   *
   * @throws lang.FormatException if the first line is not a PDF header
   */
  public function __construct(InputStream $in) {
    $this->tokens= new Tokens($in);

    $header= $this->tokens->line();
    $matched= sscanf($header, '%%PDF-%[0-9.]', $this->version);
    if (1 !== $matched) {
      throw new FormatException('PDF file header not found');
    }
  }

  /**
   * Reads cross reference subsections, each introduced by a line with two
   * numbers, the second being the number of entry lines following it.
   *
   * @return string[] The entry lines of all subsections
   */
  private function xref() {
    $entries= [];
    for (;;) {
      $line= $this->tokens->line();
      if (2 !== sscanf($line, '%d %d', $number, $length)) break;

      for ($i= 0; $i < $length; $i++) {
        $entries[]= $this->tokens->line();
      }
    }

    // The line ending the loop belongs to whatever follows, give it back
    $this->tokens->push($line."\n");
    return $entries;
  }

  /**
   * Yields the file's contents, with the kind as key: `object` with an
   * ID and the object's value, `xref` with the cross reference entries
   * and `trailer` with the trailer dictionary.
   *
   * @return iterable
   */
  public function objects() {
    $operands= [];
    while (null !== ($token= $this->tokens->token())) {
      if ('comment' === $token[0]) continue;

      // Anything but words is collected until the next word is seen
      if ('word' !== $token[0]) {
        $operands[]= $token;
        continue;
      }

      switch ($token[1]) {
        case 'obj':
          $number= $operands[0][1];
          $generation= $operands[1][1];
          $id= new Ref($number, $generation);
          yield 'object' => ['id' => $id, 'dict' => $this->tokens->value()];
          break;

        case 'xref':
          yield 'xref' => $this->xref();
          break;

        case 'trailer':
          yield 'trailer' => $this->tokens->value();
          break;
      }
      $operands= [];
    }
  }
}
<?php
use lang\Value;
use util\Objects;
/** Indirect reference to an object, written as `<number> <generation> R` */
class Ref implements Value {
  public $number, $generation;

  /**
   * Creates a new reference
   *
   * @param  int $number Object number
   * @param  int $generation Generation number
   */
  public function __construct($number, $generation) {
    $this->number= $number;
    $this->generation= $generation;
  }

  /** @return string The key `<number>_<generation>`, used for object lookups */
  public function hashCode() {
    return $this->number.'_'.$this->generation;
  }

  /** @return string */
  public function toString() {
    return nameof($this).'('.$this->number.'_'.$this->generation.')';
  }

  /**
   * Compares this reference to a given value. References are ordered by
   * number, then by generation. Comparing the hash codes instead would
   * order them as strings, placing `10_0` before `9_0`.
   *
   * @param  var $value
   * @return int
   */
  public function compareTo($value) {
    return $value instanceof self
      ? [$this->number, $this->generation] <=> [$value->number, $value->generation]
      : 1
    ;
  }
}
<?php
use io\streams\InputStream;
/**
 * Reads from the given streams one after another as if they were a single
 * stream. A stream is closed as soon as it has no more data, closing the
 * sequence closes whatever is left.
 *
 * @test SequenceInputStreamTest
 */
class SequenceInputStream implements InputStream {
  private $streams, $current;

  /** @param iterable $streams */
  public function __construct($streams) {
    if ($streams instanceof Iterator) {
      $this->streams= $streams;
    } else {
      $this->streams= new ArrayIterator($streams);
    }

    // NULL for an empty sequence, which then behaves like an empty stream
    $this->current= $this->streams->current();
  }

  /**
   * Closes the current stream and selects the next one. Never rewinds, so
   * this also works with generators.
   *
   * @return void
   */
  private function advance() {
    $this->current->close();
    $this->streams->next();
    $this->current= $this->streams->valid() ? $this->streams->current() : null;
  }

  /** @return int */
  public function available() {
    while (null !== $this->current) {
      if ($r= $this->current->available()) return $r;

      // No more data available on current stream, close and select next
      $this->advance();
    }
    return 0;
  }

  /**
   * Reads up to the specified number of bytes
   *
   * @param int $bytes
   * @return string
   */
  public function read($bytes= 8192) {
    while (null !== $this->current) {
      if ('' !== ($r= $this->current->read($bytes))) return $r;

      // EOF from current stream, close and select next
      $this->advance();
    }
    return '';
  }

  /**
   * Closes the current and all remaining streams. Safe to call repeatedly.
   *
   * @return void
   */
  public function close() {
    while (null !== $this->current) {
      $this->advance();
    }
  }

  /** Ensure streams are closed */
  public function __destruct() {
    $this->close();
  }
}
<?php
use test\{Assert, Test};
use io\streams\MemoryInputStream;
/** Verifies reading across stream boundaries */
class SequenceInputStreamTest {

  /**
   * Drains a stream, returning `[available, read]` pairs for every chunk
   * plus a final pair taken after no more data was reported.
   */
  private function drain($stream) {
    $chunks= [];
    for (;;) {
      $available= $stream->available();
      if (!$available) break;

      $chunks[]= [$available, $stream->read()];
    }

    $chunks[]= [$stream->available(), $stream->read()];
    return $chunks;
  }

  #[Test]
  public function drain_one() {
    $sequence= new SequenceInputStream([new MemoryInputStream('Test')]);

    Assert::equals([[4, 'Test'], [0, '']], $this->drain($sequence));
  }

  #[Test]
  public function drain_multiple() {
    $sequence= new SequenceInputStream([
      new MemoryInputStream('One'),
      new MemoryInputStream('Two')
    ]);

    Assert::equals([[3, 'One'], [3, 'Two'], [0, '']], $this->drain($sequence));
  }

  #[Test]
  public function using_only_read() {
    $sequence= new SequenceInputStream([
      new MemoryInputStream('One'),
      new MemoryInputStream('Two')
    ]);

    Assert::equals('One', $sequence->read());
    Assert::equals('Two', $sequence->read());
    Assert::equals('', $sequence->read());
  }
}
<?php
use io\streams\{InputStream, MemoryInputStream};
use lang\Value;
use util\Objects;
/**
 * Stream object: Raw bytes and the name of the filter they are encoded
 * with. Can be read as an input stream, yielding the decoded data.
 */
class Stream implements Value, InputStream {
  private $bytes, $filter;
  private $input= null;

  /**
   * Creates a new stream
   *
   * @param  string $bytes The bytes as stored in the file
   * @param  ?string $filter Filter name, NULL if the bytes are not encoded
   */
  public function __construct($bytes, $filter) {
    $this->bytes= $bytes;
    $this->filter= $filter;
  }

  /**
   * Returns the data: Decompressed for `FlateDecode`, unchanged if there
   * is no filter or for the image filters.
   *
   * @return string
   * @throws lang.IllegalArgumentException for any other filter
   */
  public function bytes() {
    switch ($this->filter) {
      case null:
        return $this->bytes;

      case 'DCTDecode': case 'JPXDecode': case 'CCITTFaxDecode':
        return $this->bytes;

      case 'FlateDecode':
        return gzuncompress($this->bytes);

      default:
        // Fully qualified, this class is not imported
        throw new \lang\IllegalArgumentException('Unknown filter '.$this->filter);
    }
  }

  /**
   * Creates the stream to read from. Other than `bytes()`, this passes
   * data with unknown filters on unchanged.
   *
   * @return io.streams.InputStream
   */
  private function input() {
    if ('FlateDecode' === $this->filter) {
      return new MemoryInputStream(gzuncompress($this->bytes)); // TODO: xp-forge/compress
    } else {
      return new MemoryInputStream($this->bytes);
    }
  }

  /** @return int */
  public function available() {
    $this->input??= $this->input();
    return $this->input->available();
  }

  /**
   * Reads up to the specified number of bytes
   *
   * @param  int $bytes
   * @return string
   */
  public function read($bytes= 8192) {
    $this->input??= $this->input();
    return $this->input->read($bytes);
  }

  /**
   * Closes the stream. Reading afterwards starts over at the beginning.
   *
   * @return void
   */
  public function close() {
    $this->input && $this->input->close();
    $this->input= null;
  }

  /** @return string */
  public function hashCode() {
    return 'S'.md5($this->bytes);
  }

  /** @return string */
  public function toString() {
    return nameof($this).'('.strlen($this->bytes).' bytes '.($this->filter ?? 'Plain').')';
  }

  /**
   * Compares this stream to a given value by its raw bytes
   *
   * @param  var $value
   * @return int
   */
  public function compareTo($value) {
    return $value instanceof self ? $this->bytes <=> $value->bytes : 1;
  }
}
<?php
use io\streams\InputStream;
use lang\FormatException;
/**
 * Splits PDF syntax into tokens and assembles values from them
 *
 * Tokens are `[kind, value]` pairs. Input is buffered line by line; once
 * the end of the input is reached, reading methods return NULL.
 *
 * @test  TokensTest
 * @test  ValueTest
 */
class Tokens {

  /** Escape sequences inside literal strings standing for control characters */
  private const ESCAPES= ['n' => "\n", 'r' => "\r", 't' => "\t", 'b' => "\x08", 'f' => "\f"];

  /** Characters which may directly follow the `R` of a reference */
  private const DELIMITERS= " \t\r\n\f\0/<>[]()%";

  private $in;
  private $buffer= '';

  /** Creates a new instance reading from the given stream */
  public function __construct(InputStream $in) {
    $this->in= $in;
  }

  /** Pushes back the given bytes, they will be read again first */
  public function push(string $buffer) {
    $this->buffer= $buffer.$this->buffer;
  }

  /**
   * Reads the given number of bytes - less if the input ends before.
   *
   * @return ?string NULL if the end of the input was reached before
   */
  public function bytes(int $length): ?string {
    if (null === $this->buffer) return null;

    while (strlen($this->buffer) < $length && $this->in->available()) {
      $this->buffer.= $this->in->read();
    }

    $return= substr($this->buffer, 0, $length);
    $this->buffer= substr($this->buffer, $length);
    return $return;
  }

  /**
   * Reads a line, without its line ending
   *
   * @return ?string NULL if the end of the input was reached before
   */
  public function line(): ?string {
    if (null === $this->buffer) return null;

    // Read until a line ending is followed by at least one more byte (so
    // `\r\n` cannot be split) or until there is no more input.
    $eof= false;
    do {
      $p= strcspn($this->buffer, "\r\n");
      if ($p < strlen($this->buffer) - 1 || $eof= !$this->in->available()) break;
      $this->buffer.= $this->in->read();
    } while (true);

    // Check for Mac OS, Windows and Un*x line endings
    if ("\r" === ($this->buffer[$p] ?? null)) {
      $e= "\n" === ($this->buffer[$p + 1] ?? null) ? 2 : 1;
    } else {
      $e= 1;
    }

    $return= substr($this->buffer, 0, $p);
    $this->buffer= $eof ? null : substr($this->buffer, $p + $e);
    return $return;
  }

  /**
   * Reads everything up to the given marker. The marker itself is left
   * in the input. If it does not occur, all remaining bytes are returned.
   *
   * @return string Empty if the end of the input was reached before
   */
  public function scan(string $marker): string {
    if (null === $this->buffer) return '';

    while (false === ($p= strpos($this->buffer, $marker))) {
      if (!$this->in->available()) {
        $return= $this->buffer;
        $this->buffer= '';
        return $return;
      }
      $this->buffer.= $this->in->read();
    }

    $return= substr($this->buffer, 0, $p);
    $this->buffer= substr($this->buffer, $p);
    return $return;
  }

  /**
   * Parses the literal string starting with `(` at the beginning of the
   * buffer, reading more input if it extends beyond what is buffered.
   *
   * @return var[] The decoded string and the number of bytes it occupies
   */
  private function literal(): array {
    $string= '';
    $offset= $depth= 1;
    do {
      $span= strcspn($this->buffer, '\\()', $offset);
      $string.= substr($this->buffer, $offset, $span);
      $offset+= $span;

      // Not yet terminated: Continue with more input, if there is any
      if ($offset >= strlen($this->buffer)) {
        if (!$this->in->available()) break;
        $this->buffer.= $this->in->read();
        continue;
      }

      $c= $this->buffer[$offset++];
      if ('(' === $c) {
        $depth++;
        $string.= '(';
      } else if (')' === $c) {
        if (0 === --$depth) break;
        $string.= ')';
      } else {

        // Escape sequence, make sure up to three octal digits are buffered
        while (strlen($this->buffer) - $offset < 3 && $this->in->available()) {
          $this->buffer.= $this->in->read();
        }

        if ($digits= strspn($this->buffer, '01234567', $offset, 3)) {

          // Handle character sequences like `\374`
          $string.= chr(octdec(substr($this->buffer, $offset, $digits)) & 0xff);
          $offset+= $digits;
        } else if ($offset < strlen($this->buffer)) {
          $c= $this->buffer[$offset++];
          if ("\r" === $c) {

            // Line continuation, the line ending is not part of the string
            "\n" === ($this->buffer[$offset] ?? null) && $offset++;
          } else if ("\n" !== $c) {
            $string.= self::ESCAPES[$c] ?? $c;
          }
        }
      }
    } while (true);

    return [$string, $offset];
  }

  /**
   * Returns the next token
   *
   * @param  bool $peek If true, the token is not consumed
   * @return ?array NULL if there are no more tokens
   */
  public function token($peek= false): ?array {
    if (null === $this->buffer) return null;

    // Read a complete line, then skip leading whitespace. Repeat until
    // the buffer starts with a token or the end of the input is reached.
    $eof= false;
    do {
      do {
        $p= strcspn($this->buffer, "\n");
        if ($p < strlen($this->buffer) - 1 || $eof= !$this->in->available()) break;
        $this->buffer.= $this->in->read();
      } while (true);

      if ('' === $this->buffer && $eof) return $this->buffer= null;
      if (0 === ($p= strspn($this->buffer, " \t\r\n"))) break;
      $this->buffer= substr($this->buffer, $p);
    } while (true);

    // In the following, $p is the number of bytes the token occupies
    $c= $this->buffer[0];
    if ('/' === $c) {
      $p= strspn($this->buffer, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789#:.,+-_', 1);
      $r= ['name', substr($this->buffer, 1, $p)];
      $p++;
    } else if ('(' === $c) {
      [$string, $p]= $this->literal();
      $r= ['string', $string];
    } else if ('[' === $c) {
      $p= 1;
      $r= ['array-start', null];
    } else if (']' === $c) {
      $p= 1;
      $r= ['array-end', null];
    } else if ('%' === $c) {

      // $p counts everything after `%` up to the line ending, including
      // the leading spaces which are not part of the comment's text
      $p= strcspn($this->buffer, "\r\n", 1);
      $s= strspn($this->buffer, ' ', 1);
      $r= ['comment', substr($this->buffer, 1 + $s, $p - $s)];
      $p++;
    } else if (0 === strncmp($this->buffer, '<<', 2)) {
      $p= 2;
      $r= ['dict-start', null];
    } else if (0 === strncmp($this->buffer, '>>', 2)) {
      $p= 2;
      $r= ['dict-end', null];
    } else if ('<' === $c) {

      // Hex strings may extend beyond what is buffered
      while (false === ($p= strpos($this->buffer, '>')) && $this->in->available()) {
        $this->buffer.= $this->in->read();
      }
      false === $p && $p= strlen($this->buffer);

      $r= ['hex', substr($this->buffer, 1, $p - 1)];
      $p++;
    } else if ($p= strspn($this->buffer, '-0123456789.')) {

      // Disambiguate references from integer and decimal numbers. The `R`
      // must end the token, otherwise operators like `RG` preceded by two
      // numbers would be mistaken for references.
      if (
        4 === sscanf($this->buffer, '%d %d %*[R]%n', $number, $generation, $l) &&
        ($l >= strlen($this->buffer) || false !== strpos(self::DELIMITERS, $this->buffer[$l]))
      ) {
        $p= $l;
        $r= ['ref', new Ref($number, $generation)];
      } else {
        $number= substr($this->buffer, 0, $p);
        $r= false === strpos($number, '.') ? ['integer', (int)$number] : ['decimal', (float)$number];
      }
    } else if (0 === strncmp($this->buffer, 'true', 4)) {
      $p= 4;
      $r= ['const', true];
    } else if (0 === strncmp($this->buffer, 'null', 4)) {
      $p= 4;
      $r= ['const', null];
    } else if (0 === strncmp($this->buffer, 'false', 5)) {
      $p= 5;
      $r= ['const', false];
    } else if (0 === strncmp($this->buffer, 'stream', 6)) {

      // Stream data starts on a new line. Consume exactly one line ending,
      // any further CR or LF bytes already belong to the data.
      $p= 6;
      if ("\r\n" === substr($this->buffer, 6, 2)) {
        $p+= 2;
      } else if ("\n" === ($this->buffer[6] ?? null) || "\r" === ($this->buffer[6] ?? null)) {
        $p++;
      }
      $r= ['stream-start', null];
    } else if (0 === strncmp($this->buffer, 'endstream', 9)) {
      $p= 9;
      $r= ['stream-end', null];
    } else {

      // Everything until the next whitespace or begin of a token
      $p= strcspn($this->buffer, " \t\r\n/<[(%");
      $r= ['word', substr($this->buffer, 0, $p)];
    }

    $peek || $this->buffer= substr($this->buffer, $p);
    return $r;
  }

  /**
   * Reads a token and verifies it is of the given kind
   *
   * @param  string $kind
   * @return array The token
   * @throws lang.FormatException if there is no token or it is of another kind
   */
  public function expect($kind) {
    $token= $this->token();
    if (null === $token || $kind !== $token[0]) {
      $value= $token[1] ?? '';
      throw new FormatException(sprintf(
        'Expected %s, have %s `%s...`',
        $kind,
        $token[0] ?? 'EOF',
        substr(addcslashes(is_scalar($value) ? (string)$value : '', "\0..\37!\177..\377"), 0, 42)
      ));
    }
    return $token;
  }

  /**
   * Reads a value: Arrays and dictionaries are returned as PHP arrays,
   * dictionaries followed by a stream have it attached as `$stream`. For
   * all other tokens, their value is returned, with strings starting with
   * an UTF-16BE byte order mark being converted.
   *
   * @param  ?array $token The token to start with, read if omitted
   * @return var NULL if there is no token to start with
   * @throws lang.FormatException for unclosed arrays and dictionaries
   */
  public function value($token= null) {
    $token??= $this->token();
    if (null === $token) return null;

    if ('array-start' === $token[0]) {
      $array= [];
      while (true) {
        if (null === ($token= $this->token())) throw new FormatException('unclosed array');
        if ('array-end' === $token[0]) break;

        $array[]= $this->value($token);
      }
      return $array;
    } else if ('dict-start' === $token[0]) {
      $object= [];
      while (true) {
        if (null === ($token= $this->token())) throw new FormatException('unclosed dict');
        if ('dict-end' === $token[0]) break;

        $object[$token[1]]= $this->value();
      }

      // Handle stream objects
      $token= $this->token(true);
      if (['stream-start', null] === $token) {
        $this->expect('stream-start');

        // The length may be a reference, which cannot be resolved here.
        // In this case, use everything up to the end marker.
        $length= $object['Length'] ?? null;
        $bytes= is_int($length) ? $this->bytes($length) : $this->scan('endstream');

        $this->expect('stream-end');
        $object['$stream']= new Stream($bytes, $object['Filter'] ?? null);
      }
      return $object;
    } else if ('string' === $token[0]) {
      if (0 === strncmp("\xfe\xff", $token[1], 2)) {
        return iconv('utf-16be', \xp::ENCODING, substr($token[1], 2));
      } else {
        return $token[1];
      }
    } else {
      return $token[1];
    }
  }

  /** Closes the underlying stream */
  public function close() {
    $this->in->close();
  }
}
<?php
use test\{Assert, Ignore, Test, Values};
use io\streams\InputStream;
/** Verifies tokenization, including input arriving in multiple chunks */
class TokensTest {

  /**
   * Creates tokens reading from a stream which returns one of the given
   * chunks per read.
   *
   * @param string|string[] $chunks
   */
  private function fixture($chunks): Tokens {
    $stream= new class((array)$chunks) implements InputStream {
      private $chunks;

      public function __construct($chunks) {
        $this->chunks= $chunks;
      }

      public function available() {
        return sizeof($this->chunks);
      }

      public function read($bytes= 8192) {
        return array_shift($this->chunks);
      }

      public function close() {
        $this->chunks= [];
      }
    };
    return new Tokens($stream);
  }

  #[Test]
  public function read_twice() {
    $tokens= $this->fixture('Tested');

    Assert::equals(['Test', 'ed'], [$tokens->bytes(4), $tokens->bytes(2)]);
  }

  #[Test]
  public function read_empty() {
    $tokens= $this->fixture('');

    Assert::equals('', $tokens->bytes(1));
  }

  #[Test, Values(['Test', "Test\n"])]
  public function line($input) {
    $tokens= $this->fixture($input);

    Assert::equals('Test', $tokens->line());
    Assert::null($tokens->line());
  }

  #[Test, Values(["Line 1\n\nLine 3", "Line 1\r\rLine 3", "Line 1\r\n\r\nLine 3"])]
  public function empty_line($input) {
    $tokens= $this->fixture($input);

    Assert::equals('Line 1', $tokens->line());
    Assert::equals('', $tokens->line());
    Assert::equals('Line 3', $tokens->line());
    Assert::null($tokens->line());
  }

  #[Test, Values([
    ['/Length', 'Length'],
    ['/ca', 'ca'],
    ['/S', 'S'],
    ['/FlateDecode', 'FlateDecode'],
    ['/C2_0', 'C2_0'],
    ['/Font,Bold', 'Font,Bold']
  ])]
  public function name($input, $expected) {
    $token= $this->fixture($input)->token();

    Assert::equals(['name', $expected], $token);
  }

  #[Test, Values([['()', ''], ['(Test)', 'Test']])]
  public function string($input, $expected) {
    $token= $this->fixture($input)->token();

    Assert::equals(['string', $expected], $token);
  }

  #[Test, Values([['(C:\\\\PHP \\(8.5\\))', 'C:\\PHP (8.5)'], ['(f\\303\\274r)', 'für']])]
  public function escapes_in_string($input, $expected) {
    $token= $this->fixture($input)->token();

    Assert::equals(['string', $expected], $token);
  }

  #[Test, Values([['(())', '()'], ['(Test (OK))', 'Test (OK)']])]
  public function balanced_braces_in_string($input, $expected) {
    $token= $this->fixture($input)->token();

    Assert::equals(['string', $expected], $token);
  }

  #[Test, Values([['1', 1], ['-1', -1], ['6100', 6100]])]
  public function integer($input, $expected) {
    $token= $this->fixture($input)->token();

    Assert::equals(['integer', $expected], $token);
  }

  #[Test, Values([['1.5', 1.5], ['-0.5', -0.5], ['3.141', 3.141]])]
  public function decimal($input, $expected) {
    $token= $this->fixture($input)->token();

    Assert::equals(['decimal', $expected], $token);
  }

  #[Test]
  public function ref() {
    $token= $this->fixture('62 0 R')->token();

    Assert::equals(['ref', new Ref(62, 0)], $token);
  }

  #[Test]
  public function comment() {
    $token= $this->fixture('% Test')->token();

    Assert::equals(['comment', 'Test'], $token);
  }

  #[Test]
  public function hex() {
    $token= $this->fixture('<B5FEF09943>')->token();

    Assert::equals(['hex', 'B5FEF09943'], $token);
  }

  #[Test, Ignore('Not yet implemented')]
  public function multiline_string() {
    $token= $this->fixture("(Test\n passed)")->token();

    Assert::equals(['Test passed'], $token);
  }

  #[Test, Values([['true', true], ['false', false], ['null', null]])]
  public function constants($input, $expected) {
    $token= $this->fixture($input)->token();

    Assert::equals(['const', $expected], $token);
  }

  #[Test, Values(['[]', '[ ]', '[ ]'])]
  public function empty_array($input) {
    $tokens= $this->fixture($input);

    Assert::equals(['array-start', null], $tokens->token());
    Assert::equals(['array-end', null], $tokens->token());
  }

  #[Test, Values(['[[]]', '[ [] ]', '[ [ ] ]'])]
  public function nested_array($input) {
    $tokens= $this->fixture($input);

    Assert::equals(['array-start', null], $tokens->token());
    Assert::equals(['array-start', null], $tokens->token());
    Assert::equals(['array-end', null], $tokens->token());
    Assert::equals(['array-end', null], $tokens->token());
  }

  #[Test, Values(['[1 2]', '[1 2 ]', '[ 1 2]', '[ 1 2 ]', '[ 1 2 ]'])]
  public function array($input) {
    $tokens= $this->fixture($input);

    Assert::equals(['array-start', null], $tokens->token());
    Assert::equals(['integer', 1], $tokens->token());
    Assert::equals(['integer', 2], $tokens->token());
    Assert::equals(['array-end', null], $tokens->token());
  }

  #[Test, Values(['<<>>', '<< >>', '<< >>'])]
  public function empty_dict($input) {
    $tokens= $this->fixture($input);

    Assert::equals(['dict-start', null], $tokens->token());
    Assert::equals(['dict-end', null], $tokens->token());
  }

  #[Test, Values(['<</Length 3>>', '<< /Length 3>>', '<</Length 3 >>'])]
  public function dict($input) {
    $tokens= $this->fixture($input);

    Assert::equals(['dict-start', null], $tokens->token());
    Assert::equals(['name', 'Length'], $tokens->token());
    Assert::equals(['integer', 3], $tokens->token());
    Assert::equals(['dict-end', null], $tokens->token());
  }

  #[Test]
  public function multiline_dict() {
    $tokens= $this->fixture(["<</Type /StructElem\n", "/S /Link>>"]);

    Assert::equals(['dict-start', null], $tokens->token());
    Assert::equals(['name', 'Type'], $tokens->token());
    Assert::equals(['name', 'StructElem'], $tokens->token());
    Assert::equals(['name', 'S'], $tokens->token());
    Assert::equals(['name', 'Link'], $tokens->token());
    Assert::equals(['dict-end', null], $tokens->token());
  }

  #[Test, Values([
    [["<<>> stream\nTest\nendstream"]],
    [["<<>> stream\r\nTest\r\nendstream"]],
    [["<<>>\nstream\nTest\nendstream"]],
    [["<<>> stream\n", "Test\nendstream"]],
    [["<<>>\n", "stream\n", "Test\nendstream"]],
    [["<<>>", "\n", "stream\n", "Test\nendstream"]]
  ])]
  public function stream_object($chunks) {
    $tokens= $this->fixture($chunks);

    Assert::equals(['dict-start', null], $tokens->token());
    Assert::equals(['dict-end', null], $tokens->token());
    Assert::equals(['stream-start', null], $tokens->token());
    Assert::equals('Test', $tokens->bytes(4));
    Assert::equals(['stream-end', null], $tokens->token());
  }
}
<?php
use test\{Assert, Test, Values};
use io\streams\MemoryInputStream;
/** Verifies values are assembled from their tokens */
class ValueTest {

  /** Returns the first value parsed from the given input */
  public function parse($input) {
    $tokens= new Tokens(new MemoryInputStream($input));
    return $tokens->value();
  }

  #[Test, Values([['1', 1], ['-1', -1], ['6100', 6100]])]
  public function integer($input, $expected) {
    $value= $this->parse($input);

    Assert::equals($expected, $value);
  }

  #[Test, Values([['1.5', 1.5], ['-0.5', -0.5], ['3.141', 3.141]])]
  public function decimal($input, $expected) {
    $value= $this->parse($input);

    Assert::equals($expected, $value);
  }

  #[Test, Values([['/Length', 'Length'], ['/ca', 'ca'], ['/FlateDecode', 'FlateDecode']])]
  public function literal($input, $expected) {
    $value= $this->parse($input);

    Assert::equals($expected, $value);
  }

  #[Test, Values([['()', ''], ['(Test)', 'Test'], ['(C:\\\\PHP \\(8.5\\))', 'C:\\PHP (8.5)']])]
  public function string($input, $expected) {
    $value= $this->parse($input);

    Assert::equals($expected, $value);
  }

  #[Test, Values([
    ['[1 2 3]', [1, 2, 3]],
    ['[(One) (Two)]', ['One', 'Two']],
    ['[(One)-0.7 (Two)-6]', ['One', -0.7, 'Two', -6]]
  ])]
  public function array($input, $expected) {
    $value= $this->parse($input);

    Assert::equals($expected, $value);
  }

  #[Test, Values(["[1 2 3\n4 5 6]", "[\n1 2 3\n4 5 6\n]", "[\n1\n2\n3\n4\n5\n6\n]"])]
  public function multiline_array($input) {
    $value= $this->parse($input);

    Assert::equals([1, 2, 3, 4, 5, 6], $value);
  }

  #[Test]
  public function ref() {
    $value= $this->parse('62 0 R');

    Assert::equals(new Ref(62, 0), $value);
  }

  #[Test]
  public function hex() {
    $value= $this->parse('<B5FEF09943A87A6B1F64BB26ED1B7737>');

    Assert::equals('B5FEF09943A87A6B1F64BB26ED1B7737', $value);
  }

  #[Test]
  public function oneliner_dict() {
    $value= $this->parse('<</Test 3>>');

    Assert::equals(['Test' => 3], $value);
  }

  #[Test]
  public function compact_dict() {
    $value= $this->parse('<</Filter/FlateDecode/Length 597/Info 433 0 R>>');

    Assert::equals(
      ['Filter' => 'FlateDecode', 'Length' => 597, 'Info' => new Ref(433, 0)],
      $value
    );
  }

  #[Test]
  public function multiline_indented_dict() {
    $value= $this->parse("<<\n /N 3\n /Filter /FlateDecode\n /ColorSpace [/Indexed /DeviceRGB 255 <00FF>]\n>>");

    Assert::equals(
      ['N' => 3, 'Filter' => 'FlateDecode', 'ColorSpace' => ['Indexed', 'DeviceRGB', 255, '00FF']],
      $value
    );
  }

  #[Test]
  public function metadata_dict() {
    $value= $this->parse("<</Title (Test)\n/Creator (XP)\n/CreationDate (D:20250530173115+00'00')>>");

    Assert::equals(
      ['Title' => 'Test', 'Creator' => 'XP', 'CreationDate' => 'D:20250530173115+00\'00\''],
      $value
    );
  }

  #[Test]
  public function nested_dict() {
    $value= $this->parse("<</Resources <</Font <</F6 6 0 R\n/F7 7 0 R>>>>\n/Tabs /S>>");

    Assert::equals(
      [
        'Resources' => ['Font' => ['F6' => new Ref(6, 0), 'F7' => new Ref(7, 0)]],
        'Tabs' => 'S',
      ],
      $value
    );
  }

  #[Test, Values(['', ' ', "\n", "\r", "\r\n"])]
  public function dict_with_stream($separator) {
    $bytes= "\x78\x9c\x0b\x49\x2d\x2e\x01\x00\x03\xdd\x01\xa1"; // gzcompress("Test")
    $value= $this->parse("<</Filter /FlateDecode\n/Length 12>>{$separator}stream\n{$bytes}\nendstream");

    Assert::equals(
      ['Filter' => 'FlateDecode', 'Length' => 12, '$stream' => new Stream($bytes, 'FlateDecode')],
      $value
    );
  }

  #[Test, Values(["Line 1", "Line 1\nLine 2"])]
  public function dict_with_ref_length($bytes) {
    $value= $this->parse("<</Filter /FlateDecode\n/Length 13 0 R>> stream\n{$bytes}endstream\n");

    Assert::equals(
      ['Filter' => 'FlateDecode', 'Length' => new Ref(13, 0), '$stream' => new Stream($bytes, 'FlateDecode')],
      $value
    );
  }

  #[Test]
  public function array_with_whitespace() {
    $value= $this->parse("[13 0 R 20 0 R 23 0 R ]");

    Assert::equals(
      [new Ref(13, 0), new Ref(20, 0), new Ref(23, 0)],
      $value
    );
  }

  #[Test]
  public function dict_with_whitespace() {
    $value= $this->parse("<< /Length 41 0 R /Filter /FlateDecode >>");

    Assert::equals(
      ['Length' => new Ref(41, 0), 'Filter' => 'FlateDecode'],
      $value
    );
  }
}
@thekid
Copy link
Author

thekid commented Jun 19, 2025

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment