Last active
June 21, 2025 22:19
-
-
Save thekid/e2176ddb615150e082a104149bd950ec to your computer and use it in GitHub Desktop.
PDF Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use io\streams\InputStream; | |
use lang\FormatException; | |
/**
 * Character map (CMap) as found in a font's `ToUnicode` stream. Parses the
 * `codespacerange`, `bfchar` and `bfrange` / `cidrange` sections and uses
 * them to translate encoded bytes to UTF-8.
 */
class CharacterMap {
  private $tokens;
  private $parsed= false;

  /** @var ?int Number of hex digits in a source code, NULL until a mapping section was parsed */
  public $width= null;

  /** @var array Source code => UTF-8 string, filled from `bfchar` sections */
  public $chars= [];

  /** @var array List of [low, high, destination] with destination being an int or a list of ints */
  public $ranges= [];

  /** @var int[] Lower and upper bound of the last codespace range seen */
  public $codespace= [0x0000, 0xffff];

  /** Creates a character map reading its definition from the given stream */
  public function __construct(InputStream $in) {
    $this->tokens= new Tokens($in);
  }

  /** Returns the UTF-8 encoding of the given code point */
  private function char($code) {
    return iconv('ucs-4', 'utf-8', pack('N', $code));
  }

  /**
   * Returns the next token. Running out of input inside a character map
   * means it is truncated, which would otherwise make `parse()` spin forever.
   *
   * @throws lang.FormatException
   */
  private function next(): array {
    if (null === ($token= $this->tokens->token())) {
      throw new FormatException('Unexpected end of character map');
    }
    return $token;
  }

  /**
   * Parses the definition up until `endcmap` and closes the underlying
   * stream. Calling this method more than once is a no-op.
   *
   * @throws lang.FormatException for truncated input or unknown sections
   */
  public function parse(): self {
    if ($this->parsed) return $this;

    // The stream is closed below in any case, so parsing can never be retried
    $this->parsed= true;
    $length= 0;
    try {
      do {
        $token= $this->next();

        // Parse `2 begin[...]` - an enumeration with 2 entries
        if ('integer' === $token[0]) {
          $length= $token[1];
        } else if ('word' === $token[0] && 1 === sscanf($token[1], 'begin%s', $type)) {
          switch ($type) {
            case 'cmap':
              break;

            case 'codespacerange':
              for ($i= 0; $i < $length; $i++) {
                $lo= $this->next()[1];
                $hi= $this->next()[1];
                $this->codespace= [hexdec($lo), hexdec($hi)];
              }
              break;

            case 'bfchar':
              $src= null;
              for ($i= 0; $i < $length; $i++) {
                $src= $this->next()[1];
                $map= $this->next()[1];
                $this->chars[hexdec($src)]= iconv('utf-16be', 'utf-8', hex2bin($map));
              }

              // An empty section tells us nothing about the code width
              if (null !== $src) {
                $this->width??= strlen($src);
              }
              break;

            case 'bfrange': case 'cidrange':
              $lo= null;
              for ($i= 0; $i < $length; $i++) {
                $lo= $this->next()[1];
                $hi= $this->next()[1];
                $dest= $this->next();
                if ('array-start' === $dest[0]) {

                  // One destination per code, terminated by the first non-number token
                  $map= [];
                  while (true) {
                    $dest= $this->next();
                    if ('hex' === $dest[0]) {
                      $map[]= hexdec($dest[1]);
                    } else if ('integer' === $dest[0]) {
                      $map[]= $dest[1];
                    } else {
                      break;
                    }
                  }
                } else if ('hex' === $dest[0]) {
                  $map= hexdec($dest[1]);
                } else {
                  $map= $dest[1];
                }
                $this->ranges[]= [hexdec($lo), hexdec($hi), $map];
              }
              if (null !== $lo) {
                $this->width??= strlen($lo);
              }
              break;

            default:
              throw new FormatException('Unknown character map enumeration '.$type);
          }
        }
      } while ('endcmap' !== $token[1]);
      return $this;
    } finally {
      $this->tokens->close();
    }
  }

  /**
   * Translates encoded bytes to a UTF-8 string, parsing the map on first
   * use. Codes without a mapping are rendered as `\u{xxxx}`.
   *
   * @param  string $bytes
   * @param  var $fallback Currently unused
   * @return string
   */
  public function translate($bytes, $fallback= null) {
    if (null === $this->width) $this->parse();

    // Two hex digits make up one byte; default to 2-byte codes
    $b= max(1, intdiv($this->width ?? 2, 2));
    $string= '';
    for ($i= 0, $l= strlen($bytes); $i + $b <= $l; $i+= $b) {

      // Big-endian integer of any width, trailing incomplete codes are skipped
      $code= hexdec(bin2hex(substr($bytes, $i, $b)));

      if (null !== ($char= $this->chars[$code] ?? null)) {
        $string.= $char;
      } else if ($code < $this->codespace[0] || $code > $this->codespace[1]) {
        $string.= $this->char($code);
      } else {
        $char= null;
        foreach ($this->ranges as $range) {
          if ($code < $range[0] || $code > $range[1]) continue;

          if (is_array($range[2])) {
            $point= $range[2][$code - $range[0]] ?? null;
            $char= null === $point ? null : $this->char($point);
          } else {
            $char= $this->char($code - $range[0] + $range[2]);
          }
          break;
        }
        $string.= $char ?? sprintf('\u{%04x}', $code);
      }
    }
    return $string;
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use test\{Assert, Test, Values}; | |
use io\streams\MemoryInputStream; | |
/** Verifies parsing and translation using an Adobe-Identity-UCS map */
class CharacterMapTest {
  private $definition= <<<'MULTIBYTE'
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo
<< /Registry (Adobe)
/Ordering (UCS)
/Supplement 0
>> def
/CMapName
/Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
4 beginbfchar
<0003> <0054>
<000F> <0065>
<0011> <0073>
<0012> <007400650064>
endbfchar
2 beginbfrange
<0013> <001C> <0030>
<03E0> <03E1> [<002C> <003A>]
endbfrange
endcmap
CMapName currentdict /CMap defineresource pop
end
end
MULTIBYTE;

  /**
   * The definition once as-is and once with all newlines replaced by spaces
   *
   * @return iterable
   */
  private function formats() {
    yield [new MemoryInputStream($this->definition), 'multiline'];

    $singleLine= strtr($this->definition, "\n", ' ');
    yield [new MemoryInputStream($singleLine), 'compact'];
  }

  #[Test, Values(from: 'formats')]
  public function parse($stream, $kind) {
    $map= new CharacterMap($stream);
    $map->parse();

    Assert::equals(4, $map->width);
    Assert::equals([0x0000, 0xffff], $map->codespace);
    Assert::equals(
      [0x0003 => 'T', 0x000F => 'e', 0x0011 => 's', 0x0012 => 'ted'],
      $map->chars
    );
    Assert::equals(
      [[0x0013, 0x001C, 0x0030], [0x03E0, 0x03E1, [0x002C, 0x003A]]],
      $map->ranges
    );
  }

  #[Test]
  public function translate_hex_string() {
    $input= hex2bin('0003000F001100120014001C03E1');
    $map= new CharacterMap(new MemoryInputStream($this->definition));

    Assert::equals('Tested19:', $map->translate($input));
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use io\streams\InputStream; | |
/**
 * Splits a page's content stream into operations, each consisting of an
 * operator and the operand tokens preceding it.
 */
class Content {
  private $tokens;

  /** Creates content reading from the given stream */
  public function __construct(InputStream $in) {
    $this->tokens= new Tokens($in);
  }

  /**
   * Yields operator => operand tokens. The underlying stream is closed once
   * iteration finishes, is abandoned early or fails with an exception.
   */
  public function operations(): Generator {
    $operands= [];
    try {
      while (null !== ($token= $this->tokens->token())) {
        if ('word' !== $token[0]) {
          $operands[]= $token;
          continue;
        }

        // Attach inline image data, which runs up until the `EI` operator
        if ('ID' === $token[1]) {
          $operands[]= ['$inline', trim((string)$this->tokens->scan("\nEI"))];
        }

        yield $token[1] => $operands;
        $operands= [];
      }
    } finally {
      $this->tokens->close();
    }
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use io\streams\MemoryInputStream; | |
use test\{Assert, Test}; | |
/** Verifies content streams are split into operators and their operands */
class ContentTest {

  /** Runs the given source through `Content` and collects all operations */
  private function parse($content) {
    $operations= [];
    $fixture= new Content(new MemoryInputStream($content));
    foreach ($fixture->operations() as $operator => $operands) {
      $operations[]= [$operator => $operands];
    }
    return $operations;
  }

  #[Test]
  public function text_block() {
    $expected= [
      ['BT' => []],
      ['Tf' => [['name', 'R13'], ['integer', 60]]],
      ['Tj' => [['string', 'Test']]],
      ['ET' => []],
    ];
    $actual= $this->parse(<<<'PS'
BT
/R13 60 Tf
(Test)Tj
ET
PS);

    Assert::equals($expected, $actual);
  }

  #[Test]
  public function inline_image() {
    $source= "BI\n"."/CS/RGB\n"."ID GIF89a\n"."...\n"."EI\n";
    $expected= [
      ['BI' => []],
      ['ID' => [['name', 'CS'], ['name', 'RGB'], ['$inline', "GIF89a\n..."]]],
      ['EI' => []],
    ];

    Assert::equals($expected, $this->parse($source));
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use io\streams\FileInputStream; | |
use util\cmd\Console; | |
use util\Objects; | |
use lang\FormatException; | |
// The PDF file to dump is passed as first command line argument
if (!isset($argv[1])) {
  Console::$err->writeLine('Missing argument: PDF file to parse');
  return 2;
}

try {
  $parser= new PdfFile(new FileInputStream($argv[1]));
} catch (Throwable $e) {
  Console::writeLine($argv[1], ': ', $e);
  return 2;
}

// Collect all objects by their "number_generation" ID and merge trailers;
// for keys present in more than one trailer, the first occurrence wins
$objects= [];
$trailer= [];
Console::$err->write('Parsing ', $argv[1], ' [');
foreach ($parser->objects() as $kind => $value) {
  Console::$err->write('.');
  if ('object' === $kind) {
    $objects[$value['id']->hashCode()]= $value['dict'];
  } else if ('trailer' === $kind) {
    $trailer+= $value;
  }
}
Console::$err->writeLine(']');

// Handle files without `trailer` entry, e.g. linearized PDFs
if (empty($trailer)) {
  foreach ($objects as $object) {
    if ('XRef' === ($object['Type'] ?? null)) {
      $trailer= $object;
      break;
    }
  }
}

// Handle encryption; the dictionary may be given directly or by reference
Console::writeLine('Trailer: ', $trailer);
if (isset($trailer['Encrypt'])) {
  $encrypt= $trailer['Encrypt'];
  if ($encrypt instanceof Ref) {
    $encrypt= $objects[$encrypt->hashCode()] ?? null;
  }
  Console::writeLine('Cannot handle encrypted PDFs: ', $encrypt);
  return 1;
}

// Now, unpack object streams and add them to the lookup table
foreach ($objects as $id => $object) {
  if ('ObjStm' !== ($object['Type'] ?? null)) continue;

  // The stream starts with N pairs of object number and byte offset
  $n= $object['N'];
  $tokens= new Tokens($object['$stream']);
  $numbers= [];
  for ($i= 0; $i < $n; $i++) {
    $number= $tokens->token()[1];
    $tokens->token();    // Offset, unused as objects are read sequentially
    $numbers[$i]= $number.'_0';
  }

  // ...followed by the objects themselves, in the same order
  for ($i= 0; $i < $n; $i++) {
    $objects[$numbers[$i]]= $tokens->value();
  }
}

if ($info= ($trailer['Info'] ?? null)) {
  Console::writeLine('Info: ', $objects[$info->hashCode()]);
}

// Without a document catalog, there is no way to locate the pages
if (!isset($trailer['Root'])) {
  Console::writeLine('Cannot locate document catalog, no Root entry in trailer');
  return 1;
}

$root= $objects[$trailer['Root']->hashCode()];
Console::writeLine('Root: ', $root);
$pages= $objects[$root['Pages']->hashCode()];
Console::writeLine('Pages: ', $pages);
// Resolves references via the objects lookup table, passing through anything else
$objectOf= function($arg) use(&$objects) {
  if ($arg instanceof Ref) {
    return $objects[$arg->hashCode()];
  }
  return $arg;
};
// Walks the page tree recursively, yielding page numbers (e.g. "1" or "2.1"
// for nested `Pages` nodes) => page dictionary plus its `$contents` streams
$contentsOf= function($kids, $suffix= '') use(&$contentsOf, &$objectOf, &$objects) {
  foreach ($kids as $i => $ref) {
    $page= $objects[$ref->hashCode()];
    $number= $suffix.($i + 1);

    if ('Pages' === ($page['Type'] ?? null)) {
      yield from $contentsOf($page['Kids'], $number.'.');
      continue;
    }

    // A single content object, an array of references or - for empty
    // pages - no contents at all
    $contents= $objectOf($page['Contents'] ?? null);
    $streams= [];
    if (null === $contents) {
      // Nothing to do
    } else if ($stream= $contents['$stream'] ?? null) {
      $streams[]= $stream;
    } else {
      foreach ($contents as $part) {
        $streams[]= $objects[$part->hashCode()]['$stream'];
      }
    }

    yield $number => $page + ['$contents' => $streams];
  }
};
// Converts bytes to a string according to the font's encoding, which is
// either a name, NULL or an encoding dictionary with a `BaseEncoding`
$stringOf= function($bytes, $encoding, $unicode= null) use(&$stringOf) {
  switch ($encoding) {
    case null: return $bytes;
    case 'MacRomanEncoding': return iconv('macintosh', \xp::ENCODING, $bytes);
    case 'WinAnsiEncoding': return iconv('cp1252', \xp::ENCODING, $bytes);

    case 'Identity-H':
      if (null === $unicode) {
        throw new FormatException('Cannot handle Identity-H encoding without unicode');
      }
      return $unicode->translate($bytes);

    default:
      // FIXME: Respect differences
      return $stringOf($bytes, $encoding['BaseEncoding'] ?? null, $unicode);
  }
};
// Converts a string or hex token to text using the currently selected font,
// passed as ['font' => dictionary, 'size' => number]
$textOf= function($token, array $format) use(&$objectOf, &$stringOf) {
  $font= $format['font'] ?? null;

  // Fonts not found in the page's resources are kept as "@id" placeholders
  if (!is_array($font)) {
    throw new FormatException('Unresolved font '.(is_string($font) ? $font : '(none)'));
  }

  // TODO: Handle DescendantFonts, see https://github.com/adobe-type-tools/cmap-resources

  $bytes= 'hex' === $token[0] ? hex2bin($token[1]) : $token[1];
  $subtype= $font['Subtype'] ?? null;
  switch ($subtype) {
    case 'Type0': case 'Type1': case 'TrueType':
      return $stringOf($bytes, $objectOf($font['Encoding'] ?? null), $font['$unicode'] ?? null);

    case 'Type3':
      if ($map= $font['$unicode'] ?? null) {
        return $map->translate($bytes);
      }
      throw new FormatException('Cannot handle type 3 fonts without unicode');

    default:
      throw new FormatException('Unknown font type '.$subtype);
  }
};
// Dump the text of all pages, one paragraph per text block (BT ... ET)
foreach ($contentsOf($pages['Kids']) as $number => $page) {
  Console::writeLine('- Page #', $number, ' contents: ', Objects::stringOf($page['Contents'] ?? null, ' '));

  // Resources may be a reference, a direct dictionary or missing altogether
  $resources= $objectOf($page['Resources'] ?? null) ?? [];

  // Resolve fonts, attaching a character map if the font has a ToUnicode stream
  $fonts= [];
  foreach ($objectOf($resources['Font'] ?? null) ?? [] as $id => $ref) {
    $definition= $objectOf($ref);
    if ($unicode= $definition['ToUnicode'] ?? null) {
      $definition['$unicode']= new CharacterMap($objectOf($unicode)['$stream']);
    }
    $fonts[$id]= $definition;
  }

  // Select no font for the beginning
  $font= null;

  // Resolve objects
  $xobjects= [];
  foreach ($objectOf($resources['XObject'] ?? null) ?? [] as $id => $ref) {
    $xobjects[$id]= $objectOf($ref);
  }

  $paragraph= '';
  $content= new Content(new SequenceInputStream($page['$contents']));
  foreach ($content->operations() as $op => $arguments) {
    if ('BT' === $op) {
      // NOOP
    } else if ('ET' === $op) {
      Console::writeLine(' ', str_replace("\n", "\n ", $paragraph));
      Console::writeLine();
      $paragraph= '';
    } else if ('Tf' === $op) {
      $id= $arguments[0][1];
      $size= $arguments[1][1];

      // Unknown fonts are remembered by their ID, prefixed with "@"
      $font= ['font' => $fonts[$id] ?? '@'.$id, 'size' => $size];
    } else if ('Tj' === $op) {
      $paragraph.= $textOf($arguments[0], $font);
    } else if ('TJ' === $op) {

      // Operands include array delimiters and kerning numbers, skip these
      foreach ($arguments as $argument) {
        if ('hex' === $argument[0] || 'string' === $argument[0]) {
          $paragraph.= $textOf($argument, $font);
        }
      }
    } else if ("'" === $op) {
      $paragraph.= "\n".$textOf($arguments[0], $font);
    } else if ('"' === $op) {
      $paragraph.= "\n".$textOf($arguments[2], $font);
    } else if ('TD' === $op) {
      $paragraph.= "\n";
    } else if ('Do' === $op) {
      $xobject= $xobjects[$arguments[0][1]] ?? null;
      if ('Image' === ($xobject['Subtype'] ?? null)) {
        Console::writeLine(' ', "\n");
      }
    }
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use io\streams\InputStream; | |
use lang\FormatException; | |
/** @see https://opensource.adobe.com/dc-acrobat-sdk-docs/pdflsdk/#pdf-reference */ | |
/**
 * Reads the top-level structure of a PDF file: Objects, cross-reference
 * tables and trailers.
 */
class PdfFile {
  private $tokens;
  public $version;

  /**
   * Creates a new PDF file, verifying the `%PDF-x.y` header
   *
   * @throws lang.FormatException if the header is missing
   */
  public function __construct(InputStream $in) {
    $this->tokens= new Tokens($in);
    if (1 !== sscanf((string)$this->tokens->line(), '%%PDF-%[0-9.]', $this->version)) {
      throw new FormatException('PDF file header not found');
    }
  }

  /**
   * Reads a cross-reference table. Each subsection starts with a line
   * `first count`, followed by `count` entry lines, which are returned
   * verbatim. The first line which is not a subsection header is pushed
   * back for the tokenizer.
   *
   * @return string[]
   */
  private function xref() {
    $xref= [];

    // The rest of the line the `xref` keyword stands on is empty, skip it
    do {
      $line= $this->tokens->line();
    } while ('' === $line);

    while (null !== $line && 2 === sscanf($line, '%d %d', $number, $length)) {
      for ($i= 0; $i < $length; $i++) {
        if (null === ($entry= $this->tokens->line())) return $xref;
        $xref[]= $entry;
      }
      $line= $this->tokens->line();
    }

    null === $line || $this->tokens->push($line."\n");
    return $xref;
  }

  /**
   * Yields `object` => [id, dict], `xref` => entries and `trailer` =>
   * dictionary in the order they appear in the file.
   *
   * @return iterable
   */
  public function objects() {
    $operands= [];
    while (null !== ($token= $this->tokens->token())) {
      if ('comment' === $token[0]) continue;

      // Anything but a keyword is an operand, e.g. the "1 0" in "1 0 obj"
      if ('word' !== $token[0]) {
        $operands[]= $token;
        continue;
      }

      if ('obj' === $token[1]) {
        $id= new Ref($operands[0][1], $operands[1][1]);
        yield 'object' => ['id' => $id, 'dict' => $this->tokens->value()];
      } else if ('xref' === $token[1]) {
        yield 'xref' => $this->xref();
      } else if ('trailer' === $token[1]) {
        yield 'trailer' => $this->tokens->value();
      }
      $operands= [];
    }
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use lang\Value; | |
use util\Objects; | |
/** An indirect reference to an object, e.g. `62 0 R` */
class Ref implements Value {
  public $number, $generation;

  /**
   * Creates a new reference
   *
   * @param  int $number Object number
   * @param  int $generation Generation number
   */
  public function __construct($number, $generation) {
    $this->number= $number;
    $this->generation= $generation;
  }

  /** @return string The key used in object lookup tables, "number_generation" */
  public function hashCode() {
    return $this->number.'_'.$this->generation;
  }

  /** @return string */
  public function toString() {
    return nameof($this).'('.$this->hashCode().')';
  }

  /**
   * Orders references by object number first, generation second. Comparing
   * the hash codes would sort "10_0" before "9_0".
   *
   * @param  var $value
   * @return int
   */
  public function compareTo($value) {
    return $value instanceof self
      ? [$this->number, $this->generation] <=> [$value->number, $value->generation]
      : 1
    ;
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use io\streams\InputStream; | |
/** @test SequenceInputStreamTest */ | |
/**
 * Reads from a sequence of input streams as if they were a single stream
 *
 * @test SequenceInputStreamTest
 */
class SequenceInputStream implements InputStream {
  private $streams, $current;

  /** @param iterable $streams */
  public function __construct($streams) {
    $this->streams= $streams instanceof Iterator ? $streams : new ArrayIterator($streams);

    // An empty sequence has no current stream and behaves like being at EOF
    $this->current= $this->streams->valid() ? $this->streams->current() : null;
  }

  /**
   * Closes the current stream and selects the next one, if any. Never
   * rewinds the iterator, which would fail for generators.
   *
   * @return void
   */
  private function advance() {
    $this->current->close();
    $this->streams->next();
    $this->current= ($this->streams->valid() ? $this->streams->current() : null) ?: null;
  }

  /** @return int */
  public function available() {
    while (null !== $this->current) {
      if ($r= $this->current->available()) return $r;

      // No more data available on current stream, close and select next
      $this->advance();
    }
    return 0;
  }

  /**
   * Reads up to the specified number of bytes
   *
   * @param  int $bytes
   * @return string
   */
  public function read($bytes= 8192) {
    while (null !== $this->current) {
      if ('' !== ($r= $this->current->read($bytes))) return $r;

      // EOF from current stream, close and select next
      $this->advance();
    }
    return '';
  }

  /**
   * Closes the current and all remaining streams; exhausted streams have
   * already been closed. May be called more than once.
   *
   * @return void
   */
  public function close() {
    while (null !== $this->current) {
      $this->advance();
    }
  }

  /** Ensure streams are closed */
  public function __destruct() {
    $this->close();
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use test\{Assert, Test}; | |
use io\streams\MemoryInputStream; | |
/** Verifies reading across stream boundaries */
class SequenceInputStreamTest {

  /** Reads until nothing is available, recording [available, chunk] pairs including the final one */
  private function drain($stream) {
    $pairs= [];
    for ($n= $stream->available(); $n; $n= $stream->available()) {
      $pairs[]= [$n, $stream->read()];
    }
    $pairs[]= [$stream->available(), $stream->read()];
    return $pairs;
  }

  #[Test]
  public function drain_one() {
    $only= new MemoryInputStream('Test');

    Assert::equals(
      [[4, 'Test'], [0, '']],
      $this->drain(new SequenceInputStream([$only]))
    );
  }

  #[Test]
  public function drain_multiple() {
    $first= new MemoryInputStream('One');
    $second= new MemoryInputStream('Two');

    Assert::equals(
      [[3, 'One'], [3, 'Two'], [0, '']],
      $this->drain(new SequenceInputStream([$first, $second]))
    );
  }

  #[Test]
  public function using_only_read() {
    $first= new MemoryInputStream('One');
    $second= new MemoryInputStream('Two');
    $sequence= new SequenceInputStream([$first, $second]);

    Assert::equals('One', $sequence->read());
    Assert::equals('Two', $sequence->read());
    Assert::equals('', $sequence->read());
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use io\streams\{InputStream, MemoryInputStream}; | |
use lang\Value; | |
use util\Objects; | |
/**
 * A stream object's raw bytes together with the name of the filter they
 * are encoded with. Can be read from like any other input stream.
 */
class Stream implements Value, InputStream {
  private $bytes, $filter;
  private $input= null;

  /**
   * Creates a new stream
   *
   * @param  string $bytes Raw, still encoded bytes
   * @param  ?string $filter Filter name, NULL for none
   */
  public function __construct($bytes, $filter) {
    $this->bytes= $bytes;
    $this->filter= $filter;
  }

  /**
   * Returns the bytes; decompressed for `FlateDecode`, as-is for image
   * formats and unfiltered streams.
   *
   * @return string
   * @throws lang.IllegalArgumentException for any other filter
   */
  public function bytes() {
    switch ($this->filter) {
      case null: return $this->bytes;
      case 'DCTDecode': case 'JPXDecode': case 'CCITTFaxDecode': return $this->bytes;
      case 'FlateDecode': return gzuncompress($this->bytes);
      default: throw new \lang\IllegalArgumentException(
        'Unknown filter '.(is_array($this->filter) ? implode(', ', $this->filter) : $this->filter)
      );
    }
  }

  /**
   * Creates the stream to read from. Other than `bytes()`, this passes
   * data with unknown filters through unchanged.
   *
   * @return io.streams.InputStream
   */
  private function input() {
    if ('FlateDecode' === $this->filter) {
      return new MemoryInputStream(gzuncompress($this->bytes)); // TODO: xp-forge/compress
    } else {
      return new MemoryInputStream($this->bytes);
    }
  }

  /** @return int */
  public function available() {
    $this->input??= $this->input();
    return $this->input->available();
  }

  /**
   * Reads up to the specified number of bytes
   *
   * @param  int $bytes
   * @return string
   */
  public function read($bytes= 8192) {
    $this->input??= $this->input();
    return $this->input->read($bytes);
  }

  /**
   * Closes the stream; reading again afterwards starts from the beginning
   *
   * @return void
   */
  public function close() {
    $this->input && $this->input->close();
    $this->input= null;
  }

  /** @return string */
  public function hashCode() {
    return 'S'.md5($this->bytes);
  }

  /** @return string */
  public function toString() {
    return nameof($this).'('.strlen($this->bytes).' bytes '.($this->filter ?? 'Plain').')';
  }

  /**
   * Compares the raw bytes
   *
   * @param  var $value
   * @return int
   */
  public function compareTo($value) {
    return $value instanceof self ? $this->bytes <=> $value->bytes : 1;
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use io\streams\InputStream; | |
use lang\FormatException; | |
/**
 * Tokenizer for PDF syntax. Tokens are returned as [kind, value] pairs,
 * e.g. `['name', 'Length']` or `['integer', 3]`. The buffer is set to NULL
 * once the end of the input has been reached.
 */
class Tokens {

  /** Characters ending a regular token: Whitespace and delimiters */
  private const DELIMITERS= " \t\r\n\f\0/<>[]()%{}";

  /** Single-character escapes in literal strings; escaped line breaks are line continuations */
  private const ESCAPES= [
    'n'  => "\n",
    'r'  => "\r",
    't'  => "\t",
    'b'  => "\x08",
    'f'  => "\f",
    "\n" => '',
    "\r" => '',
  ];

  private $in;
  private $buffer= '';

  /** Creates a new tokenizer reading from the given stream */
  public function __construct(InputStream $in) {
    $this->in= $in;
  }

  /** Pushes back input, which will be returned before anything else */
  public function push(string $buffer) {
    $this->buffer= $buffer.$this->buffer;
  }

  /** Reads the given number of bytes - or less, if the input ends before */
  public function bytes(int $length): ?string {
    if (null === $this->buffer) return null;

    while (strlen($this->buffer) < $length && $this->in->available()) {
      $this->buffer.= $this->in->read();
    }

    $return= substr($this->buffer, 0, $length);
    $this->buffer= substr($this->buffer, $length);
    return $return;
  }

  /** Reads a line without its line ending, returns NULL after the last line */
  public function line(): ?string {
    if (null === $this->buffer) return null;

    // Read until a line ending is followed by at least one more byte, which
    // makes it possible to tell "\r" from "\r\n" - or until input ends
    $eof= false;
    do {
      $p= strcspn($this->buffer, "\r\n");
      if ($p < strlen($this->buffer) - 1 || $eof= !$this->in->available()) break;
      $this->buffer.= $this->in->read();
    } while (true);

    // Check for Mac OS, Windows and Un*x line endings
    if ("\r" === ($this->buffer[$p] ?? null)) {
      $e= "\n" === ($this->buffer[$p + 1] ?? null) ? 2 : 1;
    } else {
      $e= 1;
    }

    $return= substr($this->buffer, 0, $p);
    $this->buffer= $eof ? null : substr($this->buffer, $p + $e);
    return $return;
  }

  /**
   * Returns everything up until the given marker, leaving the marker in
   * the buffer. If the marker does not occur, the rest of the input is
   * returned. Returns NULL if the input has ended before.
   */
  public function scan(string $marker): ?string {
    if (null === $this->buffer) return null;

    while (false === ($p= strpos($this->buffer, $marker))) {
      if (!$this->in->available()) break;
      $this->buffer.= $this->in->read();
    }

    if (false === $p) {
      $return= $this->buffer;
      $this->buffer= '';
    } else {
      $return= substr($this->buffer, 0, $p);
      $this->buffer= substr($this->buffer, $p);
    }
    return $return;
  }

  /**
   * Returns the next token, or NULL at the end of the input
   *
   * @param  bool $peek Whether to leave the token in the buffer
   * @return ?array
   */
  public function token($peek= false): ?array {
    if (null === $this->buffer) return null;

    // Read a complete line
    $eof= false;
    value: do {
      $p= strcspn($this->buffer, "\n");
      if ($p < strlen($this->buffer) - 1 || $eof= !$this->in->available()) break;
      $this->buffer.= $this->in->read();
    } while (true);

    if ('' === $this->buffer && $eof) return $this->buffer= null;

    if ('/' === $this->buffer[0]) {
      $p= strspn($this->buffer, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789#:.,+-_', 1);
      $r= ['name', substr($this->buffer, 1, $p)];
      $p++;
    } else if ('(' === $this->buffer[0]) {
      $string= '';
      $offset= $braces= 1;

      segment: $p= strcspn($this->buffer, '\\()', $offset);
      $string.= substr($this->buffer, $offset, $p);
      $offset+= $p;

      // An unterminated string ends at the end of the buffer
      $c= $this->buffer[$offset] ?? null;
      if ('\\' === $c) {
        $offset++;

        // Handle octal character sequences like `\374`, `\53` or `\0`
        if ($n= strspn($this->buffer, '01234567', $offset, 3)) {
          $string.= chr(octdec(substr($this->buffer, $offset, $n)) & 0xff);
          $offset+= $n;
        } else if ($offset < strlen($this->buffer)) {
          $e= $this->buffer[$offset++];
          $string.= self::ESCAPES[$e] ?? $e;

          // An escaped "\r\n" is a single line continuation
          if ("\r" === $e && "\n" === ($this->buffer[$offset] ?? null)) $offset++;
        }
        goto segment;
      } else if ('(' === $c) {
        $offset++;
        $braces++;
        $string.= '(';
        goto segment;
      } else if (')' === $c) {
        $offset++;

        // Balanced braces are part of the string, the final one ends it
        if (--$braces) {
          $string.= ')';
          goto segment;
        }
      }

      $p= $offset;
      $r= ['string', $string];
    } else if ('[' === $this->buffer[0]) {
      $p= 1;
      $r= ['array-start', null];
    } else if (']' === $this->buffer[0]) {
      $p= 1;
      $r= ['array-end', null];
    } else if ('%' === $this->buffer[0]) {
      $p= strcspn($this->buffer, "\r\n", 1);

      // Strip leading spaces, taking them into account for the length
      $s= strspn($this->buffer, ' ', 1, $p);
      $r= ['comment', substr($this->buffer, 1 + $s, $p - $s)];
      $p++;
    } else if (0 === strncmp($this->buffer, '<<', 2)) {
      $p= 2;
      $r= ['dict-start', null];
    } else if (0 === strncmp($this->buffer, '>>', 2)) {
      $p= 2;
      $r= ['dict-end', null];
    } else if ('<' === $this->buffer[0]) {
      $e= strpos($this->buffer, '>');
      if (false === $e) $e= strlen($this->buffer);

      // Whitespace inside hex strings is insignificant
      $r= ['hex', str_replace([' ', "\t", "\r", "\n", "\f"], '', substr($this->buffer, 1, $e - 1))];
      $p= min($e + 1, strlen($this->buffer));
    } else if ($p= strspn($this->buffer, '-0123456789.')) {

      // Disambiguate references from integer and decimal numbers. The `R`
      // must stand on its own, `0 0 RG` are two numbers and an operator.
      if (
        4 === sscanf($this->buffer, '%d %d %*[R]%n', $number, $generation, $l) &&
        false !== strpos(self::DELIMITERS, $this->buffer[$l] ?? ' ')
      ) {
        $p= $l;
        $r= ['ref', new Ref($number, $generation)];
      } else {
        $number= substr($this->buffer, 0, $p);
        $r= false === strpos($number, '.') ? ['integer', (int)$number] : ['decimal', (float)$number];
      }
    } else if (0 === strncmp($this->buffer, 'true', 4)) {
      $p= 4;
      $r= ['const', true];
    } else if (0 === strncmp($this->buffer, 'null', 4)) {
      $p= 4;
      $r= ['const', null];
    } else if (0 === strncmp($this->buffer, 'false', 5)) {
      $p= 5;
      $r= ['const', false];
    } else if (0 === strncmp($this->buffer, 'stream', 6)) {

      // Stream starts on a new line. Consume exactly one line ending, any
      // further line breaks are part of the stream's data.
      if ("\r\n" === substr($this->buffer, 6, 2)) {
        $p= 8;
      } else if ("\n" === ($this->buffer[6] ?? null) || "\r" === ($this->buffer[6] ?? null)) {
        $p= 7;
      } else {
        $p= 6;
      }
      $r= ['stream-start', null];
    } else if (0 === strncmp($this->buffer, 'endstream', 9)) {
      $p= 9;
      $r= ['stream-end', null];
    } else if ($p= strspn($this->buffer, " \t\r\n")) {
      $this->buffer= substr($this->buffer, $p);
      goto value;
    } else {

      // Everything until the next whitespace or begin of a token
      $p= strcspn($this->buffer, " \t\r\n/<[(%");
      $r= ['word', substr($this->buffer, 0, $p)];
    }

    $peek || $this->buffer= substr($this->buffer, $p);
    return $r;
  }

  /**
   * Returns the next token, which must be of the given kind
   *
   * @throws lang.FormatException otherwise, including at the end of the input
   */
  public function expect($kind) {
    $token= $this->token();
    if ($kind !== ($token[0] ?? null)) {
      $value= $token[1] ?? null;
      throw new FormatException(sprintf(
        'Expected %s, have %s `%s...`',
        $kind,
        $token[0] ?? 'EOF',
        substr(addcslashes(is_scalar($value) ? (string)$value : '', "\0..\37!\177..\377"), 0, 42)
      ));
    }
    return $token;
  }

  /**
   * Returns a value: Arrays and dictionaries are read recursively, with
   * dictionaries followed by a stream receiving a `$stream` member. For
   * any other token, its value is returned.
   *
   * @param  ?array $token The token to start with, reads the next one if omitted
   * @throws lang.FormatException for unclosed arrays and dictionaries
   */
  public function value($token= null) {
    $token??= $this->token();

    if ('array-start' === $token[0]) {
      $array= [];
      element: if (null === ($token= $this->token())) throw new FormatException('unclosed array');
      if ('array-end' !== $token[0]) {
        $array[]= $this->value($token);
        goto element;
      }
      return $array;
    } else if ('dict-start' === $token[0]) {
      $object= [];
      pair: if (null === ($token= $this->token())) throw new FormatException('unclosed dict');
      if ('dict-end' !== $token[0]) {
        $object[$token[1]]= $this->value();
        goto pair;
      }

      // Handle stream objects
      $token= $this->token(true);
      if (['stream-start', null] === $token) {
        $this->expect('stream-start');

        // The length may be missing or a reference we cannot resolve here,
        // search for the end marker in these cases
        if (is_int($object['Length'] ?? null)) {
          $bytes= $this->bytes($object['Length']);
        } else {
          $bytes= $this->scan('endstream');
        }
        $this->expect('stream-end');
        $object['$stream']= new Stream($bytes, $object['Filter'] ?? null);
      }
      return $object;
    } else if ('string' === $token[0]) {

      // Strings starting with a byte order mark are encoded in UTF-16BE
      if (0 === strncmp("\xfe\xff", $token[1], 2)) {
        return iconv('utf-16be', \xp::ENCODING, substr($token[1], 2));
      } else {
        return $token[1];
      }
    } else {
      return $token[1];
    }
  }

  /** Closes the underlying stream */
  public function close() {
    $this->in->close();
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use test\{Assert, Ignore, Test, Values}; | |
use io\streams\InputStream; | |
class TokensTest { | |
/** @param string|string[] $chunks */ | |
private function fixture($chunks): Tokens { | |
return new Tokens(new class((array)$chunks) implements InputStream { | |
private $chunks; | |
public function __construct($chunks) { $this->chunks= $chunks; } | |
public function available() { return sizeof($this->chunks); } | |
public function read($bytes= 8192) { return array_shift($this->chunks); } | |
public function close() { $this->chunks= []; } | |
}); | |
} | |
#[Test] | |
public function read_twice() { | |
$fixture= $this->fixture('Tested'); | |
Assert::equals(['Test', 'ed'], [$fixture->bytes(4), $fixture->bytes(2)]); | |
} | |
#[Test] | |
public function read_empty() { | |
$fixture= $this->fixture(''); | |
Assert::equals('', $fixture->bytes(1)); | |
} | |
#[Test, Values(['Test', "Test\n"])] | |
public function line($input) { | |
$fixture= $this->fixture($input); | |
Assert::equals('Test', $fixture->line()); | |
Assert::null($fixture->line()); | |
} | |
#[Test, Values(["Line 1\n\nLine 3", "Line 1\r\rLine 3", "Line 1\r\n\r\nLine 3"])] | |
public function empty_line($input) { | |
$fixture= $this->fixture($input); | |
Assert::equals('Line 1', $fixture->line()); | |
Assert::equals('', $fixture->line()); | |
Assert::equals('Line 3', $fixture->line()); | |
Assert::null($fixture->line()); | |
} | |
#[Test, Values([['/Length', 'Length'], ['/ca', 'ca'], ['/S', 'S'], ['/FlateDecode', 'FlateDecode'], ['/C2_0', 'C2_0'], ['/Font,Bold', 'Font,Bold']])] | |
public function name($input, $expected) { | |
Assert::equals(['name', $expected], $this->fixture($input)->token()); | |
} | |
#[Test, Values([['()', ''], ['(Test)', 'Test']])] | |
public function string($input, $expected) { | |
Assert::equals(['string', $expected], $this->fixture($input)->token()); | |
} | |
#[Test, Values([['(C:\\\\PHP \\(8.5\\))', 'C:\\PHP (8.5)'], ['(f\\303\\274r)', 'für']])] | |
public function escapes_in_string($input, $expected) { | |
Assert::equals(['string', $expected], $this->fixture($input)->token()); | |
} | |
#[Test, Values([['(())', '()'], ['(Test (OK))', 'Test (OK)']])] | |
public function balanced_braces_in_string($input, $expected) { | |
Assert::equals(['string', $expected], $this->fixture($input)->token()); | |
} | |
#[Test, Values([['1', 1], ['-1', -1], ['6100', 6100]])] | |
public function integer($input, $expected) { | |
Assert::equals(['integer', $expected], $this->fixture($input)->token()); | |
} | |
#[Test, Values([['1.5', 1.5], ['-0.5', -0.5], ['3.141', 3.141]])] | |
public function decimal($input, $expected) { | |
Assert::equals(['decimal', $expected], $this->fixture($input)->token()); | |
} | |
#[Test] | |
public function ref() { | |
Assert::equals(['ref', new Ref(62, 0)], $this->fixture('62 0 R')->token()); | |
} | |
#[Test] | |
public function comment() { | |
Assert::equals(['comment', 'Test'], $this->fixture('% Test')->token()); | |
} | |
#[Test] | |
public function hex() { | |
Assert::equals(['hex', 'B5FEF09943'], $this->fixture('<B5FEF09943>')->token()); | |
} | |
/**
 * Expects a literal string that spans two lines to come back as a single
 * `string` token. According to the PDF reference, an unescaped end-of-line
 * inside a literal string belongs to the string and is read as one line
 * feed, so the expected value keeps the line break and the space after it.
 *
 * The expectation uses the same `[type, value]` shape as every other token
 * assertion in this class. The test stays ignored until the tokenizer
 * supports this case.
 */
#[Test, Ignore('Not yet implemented')]
public function multiline_string() {
  $token= $this->fixture("(Test\n passed)")->token();
  Assert::equals(['string', "Test\n passed"], $token);
}
/**
 * Expects the keywords `true`, `false` and `null` to produce a `const` token
 * carrying the matching PHP value.
 */
#[Test, Values([['true', true], ['false', false], ['null', null]])]
public function constants($input, $expected) {
  $token= $this->fixture($input)->token();
  Assert::equals(['const', $expected], $token);
}
/**
 * Expects an array without elements to produce exactly an `array-start`
 * followed by an `array-end` token, with or without whitespace between the
 * brackets.
 */
#[Test, Values(['[]', '[ ]', '[ ]'])]
public function empty_array($input) {
  $tokens= $this->fixture($input);
  foreach (['array-start', 'array-end'] as $type) {
    Assert::equals([$type, null], $tokens->token());
  }
}
/**
 * Expects an empty array nested inside another array to produce two
 * `array-start` tokens followed by two `array-end` tokens, regardless of
 * the whitespace around the brackets.
 */
#[Test, Values(['[[]]', '[ [] ]', '[ [ ] ]'])]
public function nested_array($input) {
  $tokens= $this->fixture($input);
  foreach (['array-start', 'array-start', 'array-end', 'array-end'] as $type) {
    Assert::equals([$type, null], $tokens->token());
  }
}
/**
 * Expects an array of two integers to produce `array-start`, both `integer`
 * tokens in order, then `array-end` - independent of whitespace next to the
 * brackets.
 */
#[Test, Values(['[1 2]', '[1 2 ]', '[ 1 2]', '[ 1 2 ]', '[ 1 2 ]'])]
public function array($input) {
  $tokens= $this->fixture($input);
  $sequence= [
    ['array-start', null],
    ['integer', 1],
    ['integer', 2],
    ['array-end', null],
  ];
  foreach ($sequence as $expected) {
    Assert::equals($expected, $tokens->token());
  }
}
/**
 * Expects a dictionary without entries to produce exactly a `dict-start`
 * followed by a `dict-end` token, with or without whitespace in between.
 */
#[Test, Values(['<<>>', '<< >>', '<< >>'])]
public function empty_dict($input) {
  $tokens= $this->fixture($input);
  foreach (['dict-start', 'dict-end'] as $type) {
    Assert::equals([$type, null], $tokens->token());
  }
}
/**
 * Expects a one-entry dictionary to produce `dict-start`, the key as a
 * `name` token, the value as an `integer` token, then `dict-end` -
 * independent of whitespace next to the delimiters.
 */
#[Test, Values(['<</Length 3>>', '<< /Length 3>>', '<</Length 3 >>'])]
public function dict($input) {
  $tokens= $this->fixture($input);
  $sequence= [
    ['dict-start', null],
    ['name', 'Length'],
    ['integer', 3],
    ['dict-end', null],
  ];
  foreach ($sequence as $expected) {
    Assert::equals($expected, $tokens->token());
  }
}
/**
 * Expects a dictionary written across two lines to produce one continuous
 * token sequence. The input is handed to `fixture()` as two chunks, split
 * right after the first line's newline.
 *
 * NOTE(review): how `fixture()` delivers the chunks is defined outside this
 * section - presumably as separate reads; confirm there.
 */
#[Test]
public function multiline_dict() {
  $tokens= $this->fixture(["<</Type /StructElem\n", "/S /Link>>"]);
  $sequence= [
    ['dict-start', null],
    ['name', 'Type'],
    ['name', 'StructElem'],
    ['name', 'S'],
    ['name', 'Link'],
    ['dict-end', null],
  ];
  foreach ($sequence as $expected) {
    Assert::equals($expected, $tokens->token());
  }
}
/**
 * Expects an empty dictionary followed by a stream to produce `dict-start`,
 * `dict-end` and `stream-start`, then lets `bytes(4)` return the payload
 * `Test` before the final `stream-end` token.
 *
 * Each value is a list of chunks; the variations cover LF and CRLF line
 * endings, a space or a newline before the `stream` keyword, and chunk
 * boundaries at different positions.
 */
#[Test, Values([
  [["<<>> stream\nTest\nendstream"]],
  [["<<>> stream\r\nTest\r\nendstream"]],
  [["<<>>\nstream\nTest\nendstream"]],
  [["<<>> stream\n", "Test\nendstream"]],
  [["<<>>\n", "stream\n", "Test\nendstream"]],
  [["<<>>", "\n", "stream\n", "Test\nendstream"]]
])]
public function stream_object($chunks) {
  $tokens= $this->fixture($chunks);
  foreach (['dict-start', 'dict-end', 'stream-start'] as $type) {
    Assert::equals([$type, null], $tokens->token());
  }

  // The payload is read as raw bytes, not tokenized
  Assert::equals('Test', $tokens->bytes(4));
  Assert::equals(['stream-end', null], $tokens->token());
}
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use test\{Assert, Test, Values}; | |
use io\streams\MemoryInputStream; | |
/**
 * Tests for `Tokens::value()`. Every case wraps a PDF source snippet in an
 * in-memory stream, reads one value from it and compares the result with
 * the expected PHP value.
 */
class ValueTest {

  /**
   * Reads one value from the given source via `Tokens::value()`
   *
   * @param  string $input source snippet
   * @return mixed whatever `Tokens::value()` returns for it
   */
  public function parse($input) {
    $stream= new MemoryInputStream($input);
    $tokens= new Tokens($stream);
    return $tokens->value();
  }

  /** Whole numbers, including a negative one, are returned as PHP ints. */
  #[Test, Values([['1', 1], ['-1', -1], ['6100', 6100]])]
  public function integer($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  /** Numbers with a fractional part are returned as PHP floats. */
  #[Test, Values([['1.5', 1.5], ['-0.5', -0.5], ['3.141', 3.141]])]
  public function decimal($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  /** `/`-prefixed names are returned as strings without the leading slash. */
  #[Test, Values([['/Length', 'Length'], ['/ca', 'ca'], ['/FlateDecode', 'FlateDecode']])]
  public function literal($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  /**
   * Parenthesized strings are returned without the parentheses and with
   * backslash escapes resolved.
   */
  #[Test, Values([['()', ''], ['(Test)', 'Test'], ['(C:\\\\PHP \\(8.5\\))', 'C:\\PHP (8.5)']])]
  public function string($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  /**
   * Arrays are returned as PHP lists of their element values. The last
   * case has numbers directly following a closing parenthesis.
   */
  #[Test, Values([['[1 2 3]', [1, 2, 3]], ['[(One) (Two)]', ['One', 'Two']], ['[(One)-0.7 (Two)-6]', ['One', -0.7, 'Two', -6]]])]
  public function array($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  /** Newlines between elements or next to the brackets do not change the array. */
  #[Test, Values(["[1 2 3\n4 5 6]", "[\n1 2 3\n4 5 6\n]", "[\n1\n2\n3\n4\n5\n6\n]"])]
  public function multiline_array($input) {
    $actual= $this->parse($input);
    Assert::equals([1, 2, 3, 4, 5, 6], $actual);
  }

  /** The form `62 0 R` is returned as `Ref(62, 0)`. */
  #[Test]
  public function ref() {
    $expected= new Ref(62, 0);
    Assert::equals($expected, $this->parse('62 0 R'));
  }

  /** Hex strings are returned as the digits written between the angle brackets. */
  #[Test]
  public function hex() {
    $digits= 'B5FEF09943A87A6B1F64BB26ED1B7737';
    Assert::equals($digits, $this->parse('<B5FEF09943A87A6B1F64BB26ED1B7737>'));
  }

  /** A dictionary on a single line is returned as a PHP map. */
  #[Test]
  public function oneliner_dict() {
    $actual= $this->parse('<</Test 3>>');
    Assert::equals(['Test' => 3], $actual);
  }

  /** Keys and name values may follow each other without separating whitespace. */
  #[Test]
  public function compact_dict() {
    $expected= ['Filter' => 'FlateDecode', 'Length' => 597, 'Info' => new Ref(433, 0)];
    $actual= $this->parse('<</Filter/FlateDecode/Length 597/Info 433 0 R>>');
    Assert::equals($expected, $actual);
  }

  /**
   * Entries may sit on separate lines with leading whitespace; an array
   * value holding names, an integer and a hex string is returned as a list.
   */
  #[Test]
  public function multiline_indented_dict() {
    $expected= ['N' => 3, 'Filter' => 'FlateDecode', 'ColorSpace' => ['Indexed', 'DeviceRGB', 255, '00FF']];
    $actual= $this->parse("<<\n /N 3\n /Filter /FlateDecode\n /ColorSpace [/Indexed /DeviceRGB 255 <00FF>]\n>>");
    Assert::equals($expected, $actual);
  }

  /**
   * String values are returned as-is; the apostrophes inside the creation
   * date need no special handling.
   */
  #[Test]
  public function metadata_dict() {
    $expected= ['Title' => 'Test', 'Creator' => 'XP', 'CreationDate' => 'D:20250530173115+00\'00\''];
    $actual= $this->parse("<</Title (Test)\n/Creator (XP)\n/CreationDate (D:20250530173115+00'00')>>");
    Assert::equals($expected, $actual);
  }

  /**
   * Dictionaries nested two levels deep are returned as nested maps, and
   * the entry following the nested dictionaries is still picked up.
   */
  #[Test]
  public function nested_dict() {
    $fonts= ['F6' => new Ref(6, 0), 'F7' => new Ref(7, 0)];
    $expected= [
      'Resources' => ['Font' => $fonts],
      'Tabs'      => 'S',
    ];
    $actual= $this->parse("<</Resources <</Font <</F6 6 0 R\n/F7 7 0 R>>>>\n/Tabs /S>>");
    Assert::equals($expected, $actual);
  }

  /**
   * A dictionary followed by a stream gains a `$stream` entry holding a
   * `Stream` built from the raw bytes and the filter name. The separator
   * between `>>` and the `stream` keyword varies per run.
   */
  #[Test, Values(['', ' ', "\n", "\r", "\r\n"])]
  public function dict_with_stream($separator) {
    $bytes= "\x78\x9c\x0b\x49\x2d\x2e\x01\x00\x03\xdd\x01\xa1"; // gzcompress("Test")
    $expected= ['Filter' => 'FlateDecode', 'Length' => 12, '$stream' => new Stream($bytes, 'FlateDecode')];
    $actual= $this->parse("<</Filter /FlateDecode\n/Length 12>>{$separator}stream\n{$bytes}\nendstream");
    Assert::equals($expected, $actual);
  }

  /**
   * When the length is given as a reference rather than a number, the
   * stream is still expected to hold exactly the bytes placed before
   * `endstream` - including a newline inside the payload.
   */
  #[Test, Values(["Line 1", "Line 1\nLine 2"])]
  public function dict_with_ref_length($bytes) {
    $expected= ['Filter' => 'FlateDecode', 'Length' => new Ref(13, 0), '$stream' => new Stream($bytes, 'FlateDecode')];
    $actual= $this->parse("<</Filter /FlateDecode\n/Length 13 0 R>> stream\n{$bytes}endstream\n");
    Assert::equals($expected, $actual);
  }

  /** Whitespace before the closing bracket does not affect the references read. */
  #[Test]
  public function array_with_whitespace() {
    $expected= [new Ref(13, 0), new Ref(20, 0), new Ref(23, 0)];
    $actual= $this->parse("[13 0 R 20 0 R 23 0 R ]");
    Assert::equals($expected, $actual);
  }

  /** Whitespace inside the dictionary delimiters does not affect the entries read. */
  #[Test]
  public function dict_with_whitespace() {
    $expected= ['Length' => new Ref(41, 0), 'Filter' => 'FlateDecode'];
    $actual= $this->parse("<< /Length 41 0 R /Filter /FlateDecode >>");
    Assert::equals($expected, $actual);
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdflsdk/#pdf-reference