thekid · June 21, 2025 22:19 · thekid · Jun 19, 2025
diff --git a/CharacterMap.class.php b/CharacterMap.class.php
 <?php

 use io\streams\InputStream;
 use lang\FormatException;

 /**
  * Character map translating character codes to UTF-8 strings, based on
  * `codespacerange`, `bfchar`, `bfrange` and `cidrange` sections read from
  * a token stream.
  *
  * The definition is parsed lazily by `translate()` unless `parse()` has
  * been called before.
  */
 class CharacterMap {
  private $tokens;
  private $parsed= false;

  /** @var ?int Number of hex digits in a source code, NULL until known */
  public $width= null;

  /** @var [:string] Single mappings: code => UTF-8 string */
  public $chars= [];

  /** @var array[] Range mappings: [lo, hi, target], target being a start value or a list */
  public $ranges= [];

  /** @var int[] Lowest and highest code inside the code space */
  public $codespace= [0x0000, 0xffff];

  /** Creates a character map reading its definition from a given stream */
  public function __construct(InputStream $in) {
    $this->tokens= new Tokens($in);
  }

  /** Returns the UTF-8 representation of a given code point */
  private function char($code) {
    return iconv('ucs-4', 'utf-8', pack('N', $code));
  }

  /**
   * Returns the next token inside an enumeration
   *
   * @return array
   * @throws lang.FormatException if the input ends inside the enumeration
   */
  private function next() {
    if (null === ($token= $this->tokens->token())) {
      throw new FormatException('Unexpected end of character map');
    }
    return $token;
  }

  /**
   * Parses the definition and closes the underlying input. Parsing stops
   * at `endcmap` or at the end of the input, whichever comes first. Calling
   * this method more than once has no further effect.
   *
   * @return self
   * @throws lang.FormatException for unknown or truncated enumerations
   */
  public function parse(): self {
    if ($this->parsed) return $this;
    $this->parsed= true;

    try {
      $length= 0;
      do {

        // Tolerate definitions without `endcmap` instead of looping forever
        if (null === ($token= $this->tokens->token())) break;

        // Parse `2 begin[...]` - an enumeration with 2 entries
        if ('integer' === $token[0]) {
          $length= $token[1];
        } else if ('word' === $token[0] && 1 === sscanf($token[1], 'begin%s', $type)) {
          switch ($type) {
            case 'cmap':
              break;

            case 'codespacerange':
              for ($i= 0; $i < $length; $i++) {
                $lo= $this->next()[1];
                $hi= $this->next()[1];
                $this->codespace= [hexdec($lo), hexdec($hi)];
              }
              break;

            case 'bfchar':
              for ($i= 0; $i < $length; $i++) {
                $src= $this->next()[1];
                $map= $this->next()[1];
                $this->chars[hexdec($src)]= iconv('utf-16be', 'utf-8', hex2bin($map));
              }

              // Empty enumerations do not tell us anything about the width
              if ($length > 0) $this->width??= strlen($src);
              break;

            case 'bfrange': case 'cidrange':
              for ($i= 0; $i < $length; $i++) {
                $lo= $this->next()[1];
                $hi= $this->next()[1];

                $token= $this->next();
                if ('array-start' === $token[0]) {

                  // Collect all numbers, the first other token ends the list
                  $map= [];
                  while (true) {
                    $token= $this->next();
                    if ('hex' === $token[0]) {
                      $map[]= hexdec($token[1]);
                    } else if ('integer' === $token[0]) {
                      $map[]= $token[1];
                    } else {
                      break;
                    }
                  }
                } else if ('hex' === $token[0]) {
                  $map= hexdec($token[1]);
                } else {
                  $map= $token[1];
                }
                $this->ranges[]= [hexdec($lo), hexdec($hi), $map];
              }
              if ($length > 0) $this->width??= strlen($lo);
              break;

            default:
              throw new FormatException('Unknown character map enumeration '.$type);
          }
        }
      } while ('endcmap' !== $token[1]);

      return $this;
    } finally {
      $this->tokens->close();
    }
  }

  /**
   * Translates a string of big-endian character codes to UTF-8. Codes are
   * looked up in the single mappings first. Codes outside of the code space
   * are converted as if they were code points. All others are searched for
   * in the ranges, and rendered as `\u{xxxx}` if no range yields a target.
   * An incomplete code at the end of the input is ignored.
   *
   * @param  string $bytes
   * @param  var $fallback Currently unused
   * @return string
   */
  public function translate($bytes, $fallback= null) {
    null === $this->width && $this->parse();

    // Two hex digits make up one byte, use at least one byte per code
    $b= max(1, intdiv($this->width ?? 2, 2));

    $string= '';
    for ($i= 0, $l= strlen($bytes); $i + $b <= $l; $i+= $b) {
      $code= hexdec(bin2hex(substr($bytes, $i, $b)));

      if (null !== ($char= $this->chars[$code] ?? null)) {
        $string.= $char;
      } else if ($code < $this->codespace[0] || $code > $this->codespace[1]) {
        $string.= $this->char($code);
      } else {
        $char= null;
        foreach ($this->ranges as $range) {
          if ($code < $range[0] || $code > $range[1]) continue;

          // Lists may be shorter than the range they are declared for
          if (is_array($range[2])) {
            $target= $range[2][$code - $range[0]] ?? null;
          } else {
            $target= $code - $range[0] + $range[2];
          }
          null === $target || $char= $this->char($target);
          break;
        }
        $string.= $char ?? sprintf('\u{%04x}', $code);
      }
    }

    return $string;
  }
 }
diff --git a/CharacterMapTest.class.php b/CharacterMapTest.class.php
 <?php

 use test\{Assert, Test, Values};
 use io\streams\MemoryInputStream;

 /** Verifies parsing character maps and translating character codes with them */
 class CharacterMapTest {
  private $definition= <<<'MULTIBYTE'
    /CIDInit /ProcSet findresource begin
    12 dict begin
    begincmap
    /CIDSystemInfo
    << /Registry (Adobe)
    /Ordering (UCS)
    /Supplement 0
    >> def
    /CMapName
    /Adobe-Identity-UCS def
    /CMapType 2 def
    1 begincodespacerange
    <0000> <FFFF>
    endcodespacerange
    4 beginbfchar
    <0003> <0054>
    <000F> <0065>
    <0011> <0073>
    <0012> <007400650064>
    endbfchar
    2 beginbfrange
    <0013> <001C> <0030>
    <03E0> <03E1> [<002C> <003A>]
    endbfrange
    endcmap
    CMapName currentdict /CMap defineresource pop
    end
    end
  MULTIBYTE;

  /**
   * Supplies the definition as-is as well as with all newlines replaced
   * by spaces, each accompanied by a label.
   *
   * @return iterable
   */
  private function formats() {
    $multiline= $this->definition;
    yield [new MemoryInputStream($multiline), 'multiline'];

    $compact= strtr($this->definition, "\n", ' ');
    yield [new MemoryInputStream($compact), 'compact'];
  }

  /** Parsing populates width, code space, single mappings and ranges */
  #[Test, Values(from: 'formats')]
  public function parse($stream, $kind) {
    $map= new CharacterMap($stream);
    $map->parse();

    Assert::equals(4, $map->width);
    Assert::equals([0x0000, 0xffff], $map->codespace);
    Assert::equals([0x0003 => 'T', 0x000F => 'e', 0x0011 => 's', 0x0012 => 'ted'], $map->chars);
    Assert::equals([[0x0013, 0x001C, 0x0030], [0x03E0, 0x03E1, [0x002C, 0x003A]]], $map->ranges);
  }

  /** Translating works without calling `parse()` beforehand */
  #[Test]
  public function translate_hex_string() {
    $bytes= hex2bin('0003000F001100120014001C03E1');
    $map= new CharacterMap(new MemoryInputStream($this->definition));

    Assert::equals('Tested19:', $map->translate($bytes));
  }
 }
diff --git a/Content.class.php b/Content.class.php
 <?php

 use io\streams\InputStream;

 /** Splits a content stream into operators and their operands */
 class Content {
  private $tokens;

  /** Creates a parser reading content from the given stream */
  public function __construct(InputStream $in) {
    $this->tokens= new Tokens($in);
  }

  /**
   * Yields operators mapped to the operand tokens preceding them. Any word
   * token is regarded as an operator. For `ID`, everything up to the next
   * `\nEI` is appended as an `$inline` operand.
   *
   * The underlying input is closed when the iteration ends, including the
   * cases where the consumer stops early or tokenizing raises an exception.
   *
   * @return Generator
   */
  public function operations(): Generator {
    try {
      $operands= [];
      while (null !== ($token= $this->tokens->token())) {
        if ('word' === $token[0]) {

          // Attach inline image data
          if ('ID' === $token[1]) {
            $operands[]= ['$inline', trim($this->tokens->scan("\nEI"))];
          }

          yield $token[1] => $operands;
          $operands= [];
        } else {
          $operands[]= $token;
        }
      }
    } finally {
      $this->tokens->close();
    }
  }
 }
diff --git a/ContentTest.class.php b/ContentTest.class.php
 <?php

 use io\streams\MemoryInputStream;
 use test\{Assert, Test};

 /** Verifies content streams are split into operations */
 class ContentTest {

  /**
   * Runs the given content through the parser and returns a list with
   * one `[operator => operands]` map per operation.
   */
  private function parse($content) {
    $operations= [];
    $parser= new Content(new MemoryInputStream($content));
    foreach ($parser->operations() as $operator => $operands) {
      $operations[]= [$operator => $operands];
    }
    return $operations;
  }

  /** Operators are yielded along with the operands preceding them */
  #[Test]
  public function text_block() {
    $expected= [
      ['BT' => []],
      ['Tf' => [['name', 'R13'], ['integer', 60]]],
      ['Tj' => [['string', 'Test']]],
      ['ET' => []],
    ];

    $operations= $this->parse(<<<'PS'
        BT
        /R13 60 Tf
        (Test)Tj
        ET
      PS);

    Assert::equals($expected, $operations);
  }

  /** Data between `ID` and `EI` becomes an `$inline` operand of `ID` */
  #[Test]
  public function inline_image() {
    $expected= [
      ['BI' => []],
      ['ID' => [['name', 'CS'], ['name', 'RGB'], ['$inline', "GIF89a\n..."]]],
      ['EI' => []],
    ];

    Assert::equals($expected, $this->parse("BI\n/CS/RGB\nID GIF89a\n...\nEI\n"));
  }
 }
diff --git a/pdf.script.php b/pdf.script.php
 <?php

 use io\streams\FileInputStream;
 use util\cmd\Console;
 use util\Objects;
 use lang\FormatException;

 // Open the file given as first argument, reporting anything that goes
 // wrong while doing so and exiting with code 2
 try {
  $parser= new PdfFile(new FileInputStream($argv[1]));
 } catch (Throwable $e) {
  Console::writeLine($argv[1], ': ', $e);
  return 2;
 }

 // Collect objects by their ID and merge all trailers, keeping the entries
 // seen first. One dot is written to standard error per element.
 $objects= $trailer= [];
 Console::$err->write('Parsing ', $argv[1], ' [');
 foreach ($parser->objects() as $kind => $value) {
  Console::$err->write('.');
  switch ($kind) {
    case 'object':
      $objects[$value['id']->hashCode()]= $value['dict'];
      break;

    case 'trailer':
      $trailer+= $value;
      break;
  }
 }
 Console::$err->writeLine(']');

 // Handle files without `trailer` entry, e.g. linearized PDFs, by using
 // the first object of type `XRef` in its place
 if (empty($trailer)) {
  foreach ($objects as $candidate) {
    if ('XRef' !== ($candidate['Type'] ?? null)) continue;

    $trailer= $candidate;
    break;
  }
 }

 // Handle encryption: Show the encryption object and exit with code 1
 Console::writeLine('Trailer: ', $trailer);
 if (isset($trailer['Encrypt'])) {
  Console::writeLine('Cannot handle encrypted PDFs: ', $objects[$trailer['Encrypt']->hashCode()]);
  return 1;
 }

 // Now, unpack object streams and add them to the lookup table
 foreach ($objects as $id => $object) {
  if ('ObjStm' !== ($object['Type'] ?? null)) continue;

  $n= $object['N'];
  $tokens= new Tokens($object['$stream']);

  // The stream starts with N pairs of object number and offset. Offsets
  // are read but not used, the values are read in the order listed.
  $numbers= [];
  for ($i= 0; $i < $n; $i++) {
    $numbers[]= $tokens->token()[1].'_0';
    $tokens->token();
  }

  foreach ($numbers as $key) {
    $objects[$key]= $tokens->value();
  }
 }

 // Show document information if present, then the root and page tree
 $info= $trailer['Info'] ?? null;
 if ($info) {
  Console::writeLine('Info: ', $objects[$info->hashCode()]);
 }

 $root= $objects[$trailer['Root']->hashCode()];
 Console::writeLine('Root: ', $root);

 $pages= $objects[$root['Pages']->hashCode()];
 Console::writeLine('Pages: ', $pages);

 // Resolves references via the lookup table, returns anything else as-is
 $objectOf= function($arg) use(&$objects) {
  if ($arg instanceof Ref) {
    return $objects[$arg->hashCode()];
  }
  return $arg;
 };

 // Walks the page tree, yielding page numbers mapped to the page dictionary
 // extended by a `$contents` list holding the page's content streams. Pages
 // nested in intermediate `Pages` nodes are numbered `1.1`, `1.2`, ...
 $contentsOf= function($kids, $suffix= '') use(&$contentsOf, &$objectOf, &$objects) {
  foreach ($kids as $i => $ref) {
    $page= $objects[$ref->hashCode()];
    $number= $suffix.($i + 1);

    if ('Pages' === ($page['Type'] ?? null)) {
      yield from $contentsOf($page['Kids'], $number.'.');
      continue;
    }

    // A single content object or an array of references. Pages without
    // any contents are yielded with an empty list of streams.
    $contents= $objectOf($page['Contents'] ?? []);
    if ($stream= $contents['$stream'] ?? null) {
      $streams= [$stream];
    } else {
      $streams= [];
      foreach ($contents as $ref) {
        $streams[]= $objects[$ref->hashCode()]['$stream'];
      }
    }
    yield $number => $page + ['$contents' => $streams];
  }
 };

 // Converts bytes to a string. A unicode character map, if given, takes
 // precedence over the encoding; otherwise the named encoding or the base
 // encoding of an encoding dictionary is used. Bytes are returned as-is if
 // no encoding is known.
 $stringOf= function($bytes, $encoding, $unicode= null) use(&$stringOf) {
  if (null !== $unicode) return $unicode->translate($bytes);

  switch ($encoding) {
    case null: return $bytes;
    case 'MacRomanEncoding': return iconv('macintosh', \xp::ENCODING, $bytes);
    case 'WinAnsiEncoding': return iconv('cp1252', \xp::ENCODING, $bytes);
    case 'Identity-H': throw new FormatException('Cannot handle Identity-H encoding without unicode');
    default:

      // FIXME: Respect differences
      return $stringOf($bytes, $encoding['BaseEncoding'] ?? null);
  }
 };

 // Returns the text for a `hex` or `string` token using the font selected
 // in the given format. Raises a FormatException if the font has not been
 // resolved or is of a type that cannot be handled.
 $textOf= function($token, array $format) use(&$objects, &$objectOf, &$stringOf) {
  $font= $format['font'] ?? null;

  // Fonts missing from the page's resources are passed as `@id` strings
  if (!is_array($font)) {
    throw new FormatException('Unknown font '.(is_string($font) ? $font : '(none)'));
  }

  // TODO https://github.com/adobe-type-tools/cmap-resources for fonts
  // with `DescendantFonts`

  $bytes= 'hex' === $token[0] ? hex2bin($token[1]) : $token[1];
  $subtype= $font['Subtype'] ?? null;
  switch ($subtype) {
    case 'Type0': case 'Type1': case 'TrueType':
      return $stringOf($bytes, $objectOf($font['Encoding'] ?? null), $font['$unicode'] ?? null);

    case 'Type3':
      if ($map= $font['$unicode'] ?? null) {
        return $map->translate($bytes);
      }
      throw new FormatException('Cannot handle type 3 fonts without unicode');

    default:
      throw new FormatException('Unknown font type '.$subtype);
  }
 };

 // Print the text and images of every page. Text is collected between `BT`
 // and `ET` operators and written once the text block ends.
 foreach ($contentsOf($pages['Kids']) as $number => $page) {
  Console::writeLine('- Page #', $number, ' contents: ', Objects::stringOf($page['Contents'] ?? null, '  '));

  // The page itself may lack a resources entry, in which case neither
  // fonts nor objects are resolved here
  $resources= $objectOf($page['Resources'] ?? null) ?? [];

  // Resolve fonts, given either as references or as dictionaries
  $fonts= [];
  foreach ($objectOf($resources['Font'] ?? null) ?? [] as $id => $ref) {
    $font= $objectOf($ref);

    // Only references can be looked up to obtain the character map stream
    $map= $font['ToUnicode'] ?? null;
    if ($map instanceof Ref) {
      $font['$unicode']= new CharacterMap($objects[$map->hashCode()]['$stream']);
    }
    $fonts[$id]= $font;
  }

  // Select no font for the beginning
  $font= null;

  // Resolve objects, given either as references or as dictionaries
  $xobjects= [];
  foreach ($objectOf($resources['XObject'] ?? null) ?? [] as $id => $ref) {
    $xobjects[$id]= $objectOf($ref);
  }

  $paragraph= '';
  $content= new Content(new SequenceInputStream($page['$contents']));
  foreach ($content->operations() as $op => $arguments) {
    if ('BT' === $op) {
      // NOOP
    } else if ('ET' === $op) {
      Console::writeLine('  ', str_replace("\n", "\n  ", $paragraph));
      Console::writeLine();
      $paragraph= '';
    } else if ('Tf' === $op) {
      $id= $arguments[0][1];
      $size= $arguments[1][1];
      $font= ['font' => $fonts[$id] ?? '@'.$id, 'size' => $size];
    } else if ('Tj' === $op) {
      $paragraph.= $textOf($arguments[0], $font);
    } else if ('TJ' === $op) {

      // Only the strings inside the array are text, skip everything else
      foreach ($arguments as $argument) {
        if ('hex' === $argument[0] || 'string' === $argument[0]) {
          $paragraph.= $textOf($argument, $font);
        }
      }
    } else if ("'" === $op) {
      $paragraph.= "\n".$textOf($arguments[0], $font);
    } else if ('"' === $op) {

      // The string is the third operand of this operator
      $paragraph.= "\n".$textOf($arguments[2], $font);
    } else if ('TD' === $op) {
      $paragraph.= "\n";
    } else if ('Do' === $op) {
      $id= $arguments[0][1];

      // Ignore objects missing from the page's resources
      $xobject= $xobjects[$id] ?? null;
      if ('Image' === ($xobject['Subtype'] ?? null)) {
        Console::writeLine('  ', "![{$id}]({$xobject['Subtype']}:{$xobject['Width']}x{$xobject['Height']})\n");
      }
    }
  }
 }
diff --git a/PdfFile.class.php b/PdfFile.class.php
 <?php

 use io\streams\InputStream;
 use lang\FormatException;

 /** @see https://opensource.adobe.com/dc-acrobat-sdk-docs/pdflsdk/#pdf-reference */
 class PdfFile {
  private $tokens;

  /** @var string The version from the file header, e.g. `1.7` */
  public $version;

  /**
   * Creates a new instance, verifying the first line is a PDF header
   *
   * @throws lang.FormatException if the header cannot be found
   */
  public function __construct(InputStream $in) {
    $this->tokens= new Tokens($in);
    if (1 !== sscanf((string)$this->tokens->line(), '%%PDF-%[0-9.]', $this->version)) {
      throw new FormatException('PDF file header not found');
    }
  }

  /**
   * Reads cross reference subsections, each consisting of a `<first> <count>`
   * line followed by `count` entries. The first line that is not such a
   * header is handed back to the tokenizer.
   *
   * @return string[] The entry lines as they appear in the file
   */
  private function xref() {
    $xref= [];
    while (null !== ($line= $this->tokens->line()) && 2 === sscanf($line, '%d %d', $number, $length)) {
      for ($i= 0; $i < $length; $i++) {
        $xref[]= $this->tokens->line();
      }
    }

    // Nothing to hand back if the input ended
    null === $line || $this->tokens->push($line."\n");
    return $xref;
  }

  /**
   * Yields the top-level elements: `object` => [id, dict], `xref` => lines
   * and `trailer` => dictionary. Comments and any other words are skipped.
   * The underlying input is closed once iteration ends.
   *
   * @return iterable
   * @throws lang.FormatException if `obj` lacks number and generation
   */
  public function objects() {
    try {
      $operands= [];
      while (null !== ($token= $this->tokens->token())) {
        if ('word' === $token[0]) {
          if ('obj' === $token[1]) {

            // Use the two operands directly in front of `obj`. There may
            // be more, e.g. the offset following a previous `startxref`,
            // which is not followed by a word resetting the operands.
            if (($n= sizeof($operands)) < 2) {
              throw new FormatException('Object without number and generation');
            }
            $number= $operands[$n - 2][1];
            $generation= $operands[$n - 1][1];
            yield 'object' => ['id' => new Ref($number, $generation), 'dict' => $this->tokens->value()];
          } else if ('xref' === $token[1]) {
            yield 'xref' => $this->xref();
          } else if ('trailer' === $token[1]) {
            yield 'trailer' => $this->tokens->value();
          }
          $operands= [];
        } else if ('comment' !== $token[0]) {
          $operands[]= $token;
        }
      }
    } finally {
      $this->tokens->close();
    }
  }
 }
diff --git a/Ref.class.php b/Ref.class.php
 <?php

 use lang\Value;
 use util\Objects;

 /** Reference to an object, consisting of object number and generation */
 class Ref implements Value {
  public $number, $generation;

  /** Creates a reference from object number and generation */
  public function __construct($number, $generation) {
    $this->number= $number;
    $this->generation= $generation;
  }

  /** @return string The key `<number>_<generation>` */
  public function hashCode() {
    return $this->number.'_'.$this->generation;
  }

  /** @return string */
  public function toString() {
    return nameof($this).'('.$this->number.'_'.$this->generation.')';
  }

  /**
   * Compares this reference to a given value, ordering by number first
   * and by generation second. Comparing the hash codes instead would order
   * as strings, placing `10_0` in front of `9_0`.
   *
   * @param  var $value
   * @return int
   */
  public function compareTo($value) {
    return $value instanceof self
      ? [$this->number, $this->generation] <=> [$value->number, $value->generation]
      : 1
    ;
  }
 }
diff --git a/SequenceInputStream.class.php b/SequenceInputStream.class.php
 <?php

 use io\streams\InputStream;

 /** @test SequenceInputStreamTest */
 class SequenceInputStream implements InputStream {
  private $streams, $current;

  /**
   * Creates a stream reading from the given streams one after another.
   * An empty sequence behaves like an empty stream.
   *
   * @param iterable $streams
   */
  public function __construct($streams) {
    if ($streams instanceof Iterator) {
      $this->streams= $streams;
    } else {
      $this->streams= new ArrayIterator($streams);
    }

    // NULL if there are no streams at all
    $this->current= $this->streams->current();
  }

  /** @return int */
  public function available() {
    if (null === $this->current) return 0;

    do {
      if ($r= $this->current->available()) return $r;

      // No more data available on current stream, select next
      $this->streams->next();
    } while ($this->streams->valid() && ($this->current= $this->streams->current()));

    return 0;
  }

  /**
   * Reads up to the specified number of bytes
   *
   * @param  int $bytes
   * @return string
   */
  public function read($bytes= 8192) {
    if (null === $this->current) return '';

    do {
      if ('' !== ($r= $this->current->read($bytes))) return $r;

      // EOF from current stream, select next
      $this->streams->next();
    } while ($this->streams->valid() && ($this->current= $this->streams->current()));

    return '';
  }

  /**
   * Closes all streams in the sequence
   *
   * @return void
   */
  public function close() {
    foreach ($this->streams as $stream) {
      $stream->close();
    }
  }

  /** Ensure streams are closed */
  public function __destruct() {
    $this->close();
  }
 }
diff --git a/SequenceInputStreamTest.class.php b/SequenceInputStreamTest.class.php
 <?php

 use test\{Assert, Test};
 use io\streams\MemoryInputStream;

 /** Verifies reading from a sequence of streams */
 class SequenceInputStreamTest {

  /**
   * Reads from the stream for as long as it reports data being available,
   * then once more. Returns a list of `[available, chunk]` pairs.
   */
  private function drain($stream) {
    for ($chunks= []; $n= $stream->available(); ) {
      $chunks[]= [$n, $stream->read()];
    }
    $chunks[]= [$stream->available(), $stream->read()];
    return $chunks;
  }

  /** A sequence of one stream yields that stream's data */
  #[Test]
  public function drain_one() {
    $sequence= new SequenceInputStream([new MemoryInputStream('Test')]);
    $expected= [[4, 'Test'], [0, '']];

    Assert::equals($expected, $this->drain($sequence));
  }

  /** Data is not merged across the streams' boundaries */
  #[Test]
  public function drain_multiple() {
    $streams= [new MemoryInputStream('One'), new MemoryInputStream('Two')];
    $expected= [[3, 'One'], [3, 'Two'], [0, '']];

    Assert::equals($expected, $this->drain(new SequenceInputStream($streams)));
  }

  /** Reading advances to the next stream without asking for availability */
  #[Test]
  public function using_only_read() {
    $streams= [new MemoryInputStream('One'), new MemoryInputStream('Two')];
    $sequence= new SequenceInputStream($streams);

    foreach (['One', 'Two', ''] as $expected) {
      Assert::equals($expected, $sequence->read());
    }
  }
 }
diff --git a/Stream.class.php b/Stream.class.php
 <?php

 use io\streams\{InputStream, MemoryInputStream};
 use lang\Value;
 use util\Objects;

 /**
  * The bytes of a stream object along with the filter they are encoded
  * with. Can be read from as an input stream, in which case `FlateDecode`
  * data is uncompressed and data for any other filter is passed as-is.
  */
 class Stream implements Value, InputStream {
  private $bytes, $filter;
  private $input= null;

  /**
   * Creates a new stream
   *
   * @param  string $bytes The bytes as they appear in the file
   * @param  var $filter The filter, NULL if the bytes are not encoded
   */
  public function __construct($bytes, $filter) {
    $this->bytes= $bytes;
    $this->filter= $filter;
  }

  /**
   * Returns the bytes, uncompressing them for `FlateDecode`. Image data
   * (`DCTDecode`, `JPXDecode`, `CCITTFaxDecode`) is returned as-is.
   *
   * @return string
   * @throws lang.IllegalArgumentException for any other filter
   */
  public function bytes() {
    switch ($this->filter) {
      case null: return $this->bytes;
      case 'DCTDecode': case 'JPXDecode': case 'CCITTFaxDecode': return $this->bytes;
      case 'FlateDecode': return gzuncompress($this->bytes);

      // Fully qualified, this file does not import the exception class
      default: throw new \lang\IllegalArgumentException('Unknown filter '.$this->filter);
    }
  }

  /** @return io.streams.InputStream */
  private function input() {
    if ('FlateDecode' === $this->filter) {
      return new MemoryInputStream(gzuncompress($this->bytes)); // TODO: xp-forge/compress
    } else {
      return new MemoryInputStream($this->bytes);
    }
  }

  /** @return int */
  public function available() {
    $this->input??= $this->input();
    return $this->input->available();
  }

  /**
   * Reads up to the specified number of bytes
   *
   * @param  int $bytes
   * @return string
   */
  public function read($bytes= 8192) {
    $this->input??= $this->input();
    return $this->input->read($bytes);
  }

  /**
   * Closes the input. Reading after closing starts over at the beginning.
   *
   * @return void
   */
  public function close() {
    $this->input && $this->input->close();
    $this->input= null;
  }

  /** @return string */
  public function hashCode() {
    return 'S'.md5($this->bytes);
  }

  /** @return string */
  public function toString() {
    return nameof($this).'('.strlen($this->bytes).' bytes '.($this->filter ?? 'Plain').')';
  }

  /**
   * Compares this stream to a given value by comparing the bytes
   *
   * @param  var $value
   * @return int
   */
  public function compareTo($value) {
    return $value instanceof self ? $this->bytes <=> $value->bytes : 1;
  }
 }
diff --git a/Tokens.class.php b/Tokens.class.php
 <?php

 use io\streams\InputStream;
 use lang\FormatException;

 /**
  * Tokenizer reading from an input stream on demand. Tokens are returned
  * as `[kind, value]` pairs, kinds being `name`, `string`, `hex`, `integer`,
  * `decimal`, `ref`, `const`, `comment`, `word`, `array-start`, `array-end`,
  * `dict-start`, `dict-end`, `stream-start` and `stream-end`.
  */
 class Tokens {

  /** Whitespace and delimiter characters, ending names and references */
  const DELIMITERS= " \t\r\n\f\0()<>[]{}/%";

  /** Characters following a backslash inside strings and what they stand for */
  const ESCAPES= ['n' => "\n", 'r' => "\r", 't' => "\t", 'b' => "\x08", 'f' => "\f"];

  private $in;
  private $buffer= '';

  /** Creates a tokenizer on a given stream */
  public function __construct(InputStream $in) {
    $this->in= $in;
  }

  /** Puts the given bytes in front of what is read next */
  public function push(string $buffer) {
    $this->buffer= $buffer.$this->buffer;
  }

  /**
   * Reads the given number of bytes, or less if the input ends before.
   * Returns NULL once the end of the input was detected.
   */
  public function bytes(int $length): ?string {
    if (null === $this->buffer) return null;

    while (strlen($this->buffer) < $length && $this->in->available()) {
      $this->buffer.= $this->in->read();
    }

    $return= substr($this->buffer, 0, $length);
    $this->buffer= substr($this->buffer, $length);
    return $return;
  }

  /**
   * Reads a line ending with `\r`, `\n` or `\r\n` and returns it without
   * the line ending. Returns NULL if there are no more lines.
   */
  public function line(): ?string {
    if (null === $this->buffer) return null;

    // Read until the character following the line ending is available,
    // which is necessary to tell `\r` from `\r\n`
    $eof= false;
    do {
      $p= strcspn($this->buffer, "\r\n");
      if ($p < strlen($this->buffer) - 1 || $eof= !$this->in->available()) break;
      $this->buffer.= $this->in->read();
    } while (true);

    // Check for Mac OS, Windows and Un*x line endings
    $e= "\r\n" === substr($this->buffer, $p, 2) ? 2 : 1;

    $return= substr($this->buffer, 0, $p);
    $rest= substr($this->buffer, $p + $e);

    // A line ending right at the end of the input does not start another line
    $this->buffer= $eof || ('' === $rest && !$this->in->available()) ? null : $rest;
    return $return;
  }

  /**
   * Returns everything up to, but not including, the given marker, which
   * is what will be read next. If the marker cannot be found, the rest of
   * the input is returned. Returns an empty string at the end of the input.
   */
  public function scan(string $marker): string {
    if (null === $this->buffer) return '';

    $offset= 0;
    while (false === ($p= strpos($this->buffer, $marker, $offset))) {
      if (!$this->in->available()) {
        $p= strlen($this->buffer);
        break;
      }

      // The marker may start in what we have and end in what is read next
      $offset= max(0, strlen($this->buffer) - strlen($marker) + 1);
      $this->buffer.= $this->in->read();
    }

    $return= substr($this->buffer, 0, $p);
    $this->buffer= substr($this->buffer, $p);
    return $return;
  }

  /**
   * Returns the next token, or NULL if there are no more tokens.
   *
   * @param  bool $peek If true, the token will be returned again by the next call
   * @return ?array
   * @throws lang.FormatException for unclosed strings and hex strings
   */
  public function token($peek= false): ?array {
    if (null === $this->buffer) return null;

    // Read a complete line
    $eof= false;
    value: do {
      $p= strcspn($this->buffer, "\n");
      if ($p < strlen($this->buffer) - 1 || $eof= !$this->in->available()) break;
      $this->buffer.= $this->in->read();
    } while (true);

    if ('' === $this->buffer && $eof) return $this->buffer= null;

    if ('/' === $this->buffer[0]) {

      // Names end at the next whitespace or delimiter
      $p= strcspn($this->buffer, self::DELIMITERS, 1);
      $r= ['name', substr($this->buffer, 1, $p)];
      $p++;
    } else if ('(' === $this->buffer[0]) {
      $string= '';
      $offset= $braces= 1;

      segment: $p= strcspn($this->buffer, '\\()', $offset);
      $string.= substr($this->buffer, $offset, $p);
      $offset+= $p;

      // Strings may span lines, continue with more input if there is any
      if ($offset >= strlen($this->buffer)) {
        if (!$this->in->available()) throw new FormatException('Unclosed string');
        $this->buffer.= $this->in->read();
        goto segment;
      }

      if ('\\' === $this->buffer[$offset]) {
        $offset++;

        // Make sure we have what follows the backslash, up to 3 characters
        while (strlen($this->buffer) < $offset + 3 && $this->in->available()) {
          $this->buffer.= $this->in->read();
        }
        if ($offset >= strlen($this->buffer)) throw new FormatException('Unclosed string');

        // Handle character sequences like `\374`, one to three octal digits
        if ($o= strspn($this->buffer, '01234567', $offset, 3)) {
          $string.= chr(octdec(substr($this->buffer, $offset, $o)));
          $offset+= $o;
        } else {
          $c= $this->buffer[$offset++];
          if ("\r" === $c) {

            // Backslash at the end of a line: The string continues on the next
            "\n" === ($this->buffer[$offset] ?? null) && $offset++;
          } else if ("\n" !== $c) {
            $string.= self::ESCAPES[$c] ?? $c;
          }
        }
        goto segment;
      } else if ('(' === $this->buffer[$offset]) {
        $offset++;
        $braces++;
        $string.= '(';
        goto segment;
      } else if (')' === $this->buffer[$offset]) {
        $offset++;
        if (--$braces) {
          $string.= ')';
          goto segment;
        }
      }
      $p= $offset;
      $r= ['string', $string];
    } else if ('[' === $this->buffer[0]) {
      $p= 1;
      $r= ['array-start', null];
    } else if (']' === $this->buffer[0]) {
      $p= 1;
      $r= ['array-end', null];
    } else if ('%' === $this->buffer[0]) {

      // The comment's text excludes the spaces after the percent sign
      $p= strcspn($this->buffer, "\r\n", 1);
      $s= strspn($this->buffer, ' ', 1);
      $r= ['comment', substr($this->buffer, 1 + $s, $p - $s)];
      $p++;
    } else if (0 === strncmp($this->buffer, '<<', 2)) {
      $p= 2;
      $r= ['dict-start', null];
    } else if (0 === strncmp($this->buffer, '>>', 2)) {
      $p= 2;
      $r= ['dict-end', null];
    } else if ('<' === $this->buffer[0]) {

      // Hex strings may span lines and contain whitespace, which is removed
      while (false === ($p= strpos($this->buffer, '>'))) {
        if (!$this->in->available()) throw new FormatException('Unclosed hex string');
        $this->buffer.= $this->in->read();
      }
      $r= ['hex', str_replace([' ', "\t", "\r", "\n", "\f"], '', substr($this->buffer, 1, $p - 1))];
      $p++;
    } else if ($p= strspn($this->buffer, '-0123456789.')) {

      // Disambiguate references from integer and decimal numbers. The `R`
      // must not be the beginning of a word, as is the case in `0 0 RG`.
      if (
        4 === sscanf($this->buffer, '%d %d %*[R]%n', $number, $generation, $l) &&
        ($l >= strlen($this->buffer) || false !== strpos(self::DELIMITERS, $this->buffer[$l]))
      ) {
        $p= $l;
        $r= ['ref', new Ref($number, $generation)];
      } else {
        $number= substr($this->buffer, 0, $p);
        $r= false === strpos($number, '.') ? ['integer', (int)$number] : ['decimal', (float)$number];
      }
    } else if (0 === strncmp($this->buffer, 'true', 4)) {
      $p= 4;
      $r= ['const', true];
    } else if (0 === strncmp($this->buffer, 'null', 4)) {
      $p= 4;
      $r= ['const', null];
    } else if (0 === strncmp($this->buffer, 'false', 5)) {
      $p= 5;
      $r= ['const', false];
    } else if (0 === strncmp($this->buffer, 'stream', 6)) {

      // Stream starts on a new line. Skip a single line ending only, any
      // further `\r` or `\n` is part of the stream's data.
      $p= 6;
      if ("\r\n" === substr($this->buffer, $p, 2)) {
        $p+= 2;
      } else if ("\n" === ($c= $this->buffer[$p] ?? null) || "\r" === $c) {
        $p++;
      }
      $r= ['stream-start', null];
    } else if (0 === strncmp($this->buffer, 'endstream', 9)) {
      $p= 9;
      $r= ['stream-end', null];
    } else if ($p= strspn($this->buffer, " \t\r\n")) {
      $this->buffer= substr($this->buffer, $p);
      goto value;
    } else {

      // Everything until the next whitespace or begin of a token
      $p= strcspn($this->buffer, " \t\r\n/<[(%");
      $r= ['word', substr($this->buffer, 0, $p)];
    }

    $peek || $this->buffer= substr($this->buffer, $p);
    return $r;
  }

  /**
   * Returns the next token, which must be of the given kind
   *
   * @param  string $kind
   * @return array
   * @throws lang.FormatException if the input ends or yields another kind
   */
  public function expect($kind) {
    $token= $this->token();
    if (null === $token || $kind !== $token[0]) {

      // Token values are not necessarily strings
      $value= $token[1] ?? '';
      throw new FormatException(sprintf(
        'Expected %s, have %s `%s...`',
        $kind,
        $token[0] ?? 'EOF',
        substr(addcslashes(is_object($value) ? get_class($value) : (string)$value, "\0..\37!\177..\377"), 0, 42)
      ));
    }
    return $token;
  }

  /**
   * Returns a value: A list for arrays, a map for dictionaries - including
   * a `$stream` entry if the dictionary is followed by a stream - and the
   * token's value for everything else. Strings starting with a UTF-16 byte
   * order mark are converted. Returns NULL at the end of the input.
   *
   * @param  ?array $token The token to start with, read if omitted
   * @return var
   * @throws lang.FormatException for unclosed arrays and dictionaries
   */
  public function value($token= null) {
    $token??= $this->token();
    if (null === $token) return null;

    if ('array-start' === $token[0]) {
      $array= [];
      while (true) {
        if (null === ($token= $this->token())) throw new FormatException('unclosed array');
        if ('array-end' === $token[0]) return $array;
        $array[]= $this->value($token);
      }
    } else if ('dict-start' === $token[0]) {
      $object= [];
      while (true) {
        if (null === ($token= $this->token())) throw new FormatException('unclosed dict');
        if ('dict-end' === $token[0]) break;
        $object[$token[1]]= $this->value();
      }

      // Handle stream objects
      if (['stream-start', null] === $this->token(true)) {
        $this->expect('stream-start');

        // The length may be missing or a reference we cannot resolve here,
        // search for the end regardless of the line ending following it
        if (is_int($length= $object['Length'] ?? null)) {
          $bytes= $this->bytes($length);
        } else {
          $bytes= $this->scan('endstream');
        }
        $this->expect('stream-end');

        $object['$stream']= new Stream($bytes, $object['Filter'] ?? null);
      }
      return $object;
    } else if ('string' === $token[0]) {
      if (0 === strncmp("\xfe\xff", $token[1], 2)) {
        return iconv('utf-16be', \xp::ENCODING, substr($token[1], 2));
      } else {
        return $token[1];
      }
    } else {
      return $token[1];
    }
  }

  /** Closes the underlying stream */
  public function close() {
    $this->in->close();
  }
 }
diff --git a/TokensTest.class.php b/TokensTest.class.php
 <?php

 use test\{Assert, Ignore, Test, Values};
 use io\streams\InputStream;

 class TokensTest {

  /** @param string|string[] $chunks */
  private function fixture($chunks): Tokens {
    return new Tokens(new class((array)$chunks) implements InputStream {
      private $chunks;
      public function __construct($chunks) { $this->chunks= $chunks; }
      public function available() { return sizeof($this->chunks); }
      public function read($bytes= 8192) { return array_shift($this->chunks); }
      public function close() { $this->chunks= []; }
    });
  }

  #[Test]
  public function read_twice() {
    $fixture= $this->fixture('Tested');
    Assert::equals(['Test', 'ed'], [$fixture->bytes(4), $fixture->bytes(2)]);
  }

  #[Test]
  public function read_empty() {
    $fixture= $this->fixture('');
    Assert::equals('', $fixture->bytes(1));
  }

  #[Test, Values(['Test', "Test\n"])]
  public function line($input) {
    $fixture= $this->fixture($input);
    Assert::equals('Test', $fixture->line());
    Assert::null($fixture->line());
  }

  #[Test, Values(["Line 1\n\nLine 3", "Line 1\r\rLine 3", "Line 1\r\n\r\nLine 3"])]
  public function empty_line($input) {
    $fixture= $this->fixture($input);
    Assert::equals('Line 1', $fixture->line());
    Assert::equals('', $fixture->line());
    Assert::equals('Line 3', $fixture->line());
    Assert::null($fixture->line());
  }

  #[Test, Values([['/Length', 'Length'], ['/ca', 'ca'], ['/S', 'S'], ['/FlateDecode', 'FlateDecode'], ['/C2_0', 'C2_0'], ['/Font,Bold', 'Font,Bold']])]
  public function name($input, $expected) {
    Assert::equals(['name', $expected], $this->fixture($input)->token());
  }

  #[Test, Values([['()', ''], ['(Test)', 'Test']])]
  public function string($input, $expected) {
    Assert::equals(['string', $expected], $this->fixture($input)->token());
  }

  #[Test, Values([['(C:\\\\PHP \\(8.5\\))', 'C:\\PHP (8.5)'], ['(f\\303\\274r)', 'für']])]
  public function escapes_in_string($input, $expected) {
    Assert::equals(['string', $expected], $this->fixture($input)->token());
  }

  #[Test, Values([['(())', '()'], ['(Test (OK))', 'Test (OK)']])]
  public function balanced_braces_in_string($input, $expected) {
    Assert::equals(['string', $expected], $this->fixture($input)->token());
  }

  #[Test, Values([['1', 1], ['-1', -1], ['6100', 6100]])]
  public function integer($input, $expected) {
    Assert::equals(['integer', $expected], $this->fixture($input)->token());
  }

  #[Test, Values([['1.5', 1.5], ['-0.5', -0.5], ['3.141', 3.141]])]
  public function decimal($input, $expected) {
    Assert::equals(['decimal', $expected], $this->fixture($input)->token());
  }

  #[Test]
  public function ref() {
    Assert::equals(['ref', new Ref(62, 0)], $this->fixture('62 0 R')->token());
  }

  #[Test]
  public function comment() {
    Assert::equals(['comment', 'Test'], $this->fixture('% Test')->token());
  }

  #[Test]
  public function hex() {
    Assert::equals(['hex', 'B5FEF09943'], $this->fixture('<B5FEF09943>')->token());
  }

  #[Test, Ignore('Not yet implemented')]
  public function multiline_string() {
    Assert::equals(['Test passed'], $this->fixture("(Test\n passed)")->token());
  }

  #[Test, Values([['true', true], ['false', false], ['null', null]])]
  public function constants($input, $expected) {
    Assert::equals(['const', $expected], $this->fixture($input)->token());
  }

  #[Test, Values(['[]', '[ ]', '[  ]'])]
  public function empty_array($input) {
    $fixture= $this->fixture($input);
    Assert::equals(['array-start', null], $fixture->token());
    Assert::equals(['array-end', null], $fixture->token());
  }

  #[Test, Values(['[[]]', '[ [] ]', '[ [ ] ]'])]
  public function nested_array($input) {
    $fixture= $this->fixture($input);
    Assert::equals(['array-start', null], $fixture->token());
    Assert::equals(['array-start', null], $fixture->token());
    Assert::equals(['array-end', null], $fixture->token());
    Assert::equals(['array-end', null], $fixture->token());
  }

  #[Test, Values(['[1 2]', '[1 2 ]', '[ 1 2]', '[ 1 2 ]', '[ 1  2 ]'])]
  public function array($input) {
    $fixture= $this->fixture($input);
    Assert::equals(['array-start', null], $fixture->token());
    Assert::equals(['integer', 1], $fixture->token());
    Assert::equals(['integer', 2], $fixture->token());
    Assert::equals(['array-end', null], $fixture->token());
  }

  #[Test, Values(['<<>>', '<< >>', '<<  >>'])]
  public function empty_dict($input) {
    $fixture= $this->fixture($input);
    Assert::equals(['dict-start', null], $fixture->token());
    Assert::equals(['dict-end', null], $fixture->token());
  }

  /** Expects a one-entry dictionary to yield start, name, integer and end tokens */
  #[Test, Values(['<</Length 3>>', '<< /Length 3>>', '<</Length 3 >>'])]
  public function dict($input) {
    $tokens= $this->fixture($input);
    $expected= [['dict-start', null], ['name', 'Length'], ['integer', 3], ['dict-end', null]];
    foreach ($expected as $token) {
      Assert::equals($token, $tokens->token());
    }
  }

  /** Expects a dictionary delivered in two chunks, split at a line break, to tokenize seamlessly */
  #[Test]
  public function multiline_dict() {
    $tokens= $this->fixture(["<</Type /StructElem\n", "/S /Link>>"]);
    $expected= [
      ['dict-start', null],
      ['name', 'Type'],
      ['name', 'StructElem'],
      ['name', 'S'],
      ['name', 'Link'],
      ['dict-end', null],
    ];
    foreach ($expected as $token) {
      Assert::equals($token, $tokens->token());
    }
  }

  /** Expects a dictionary followed by a stream to tokenize for all line ending and chunking variants */
  #[Test, Values([[["<<>> stream\nTest\nendstream"]], [["<<>> stream\r\nTest\r\nendstream"]], [["<<>>\nstream\nTest\nendstream"]], [["<<>> stream\n", "Test\nendstream"]], [["<<>>\n", "stream\n", "Test\nendstream"]], [["<<>>", "\n", "stream\n", "Test\nendstream"]]])]
  public function stream_object($chunks) {
    $tokens= $this->fixture($chunks);

    // Everything up to and including the `stream` keyword
    foreach (['dict-start', 'dict-end', 'stream-start'] as $kind) {
      Assert::equals([$kind, null], $tokens->token());
    }

    // The payload is read as raw bytes, then tokenizing resumes
    Assert::equals('Test', $tokens->bytes(4));
    Assert::equals(['stream-end', null], $tokens->token());
  }
 }
diff --git a/ValueTest.class.php b/ValueTest.class.php
 <?php

 use test\{Assert, Test, Values};
 use io\streams\MemoryInputStream;

 /** Verifies the values `Tokens::value()` produces for the various PDF syntax elements */
 class ValueTest {

  /** Returns the first value parsed from the given input string */
  public function parse($input) {
    $tokens= new Tokens(new MemoryInputStream($input));
    return $tokens->value();
  }

  #[Test, Values([['1', 1], ['-1', -1], ['6100', 6100]])]
  public function integer($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  #[Test, Values([['1.5', 1.5], ['-0.5', -0.5], ['3.141', 3.141]])]
  public function decimal($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  /** Names are expected without their leading slash */
  #[Test, Values([['/Length', 'Length'], ['/ca', 'ca'], ['/FlateDecode', 'FlateDecode']])]
  public function literal($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  /** Strings are expected without parentheses and with escapes resolved */
  #[Test, Values([['()', ''], ['(Test)', 'Test'], ['(C:\\\\PHP \\(8.5\\))', 'C:\\PHP (8.5)']])]
  public function string($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  #[Test, Values([['[1 2 3]', [1, 2, 3]], ['[(One) (Two)]', ['One', 'Two']], ['[(One)-0.7 (Two)-6]', ['One', -0.7, 'Two', -6]]])]
  public function array($input, $expected) {
    $actual= $this->parse($input);
    Assert::equals($expected, $actual);
  }

  #[Test, Values(["[1 2 3\n4 5 6]", "[\n1 2 3\n4 5 6\n]", "[\n1\n2\n3\n4\n5\n6\n]"])]
  public function multiline_array($input) {
    $actual= $this->parse($input);
    Assert::equals([1, 2, 3, 4, 5, 6], $actual);
  }

  #[Test]
  public function ref() {
    $actual= $this->parse('62 0 R');
    Assert::equals(new Ref(62, 0), $actual);
  }

  #[Test]
  public function hex() {
    $digits= 'B5FEF09943A87A6B1F64BB26ED1B7737';
    Assert::equals($digits, $this->parse('<B5FEF09943A87A6B1F64BB26ED1B7737>'));
  }

  #[Test]
  public function oneliner_dict() {
    $actual= $this->parse('<</Test 3>>');
    Assert::equals(['Test' => 3], $actual);
  }

  /** Dictionary without any whitespace between name entries */
  #[Test]
  public function compact_dict() {
    $expected= ['Filter' => 'FlateDecode', 'Length' => 597, 'Info' => new Ref(433, 0)];
    $actual= $this->parse('<</Filter/FlateDecode/Length 597/Info 433 0 R>>');
    Assert::equals($expected, $actual);
  }

  #[Test]
  public function multiline_indented_dict() {
    $expected= ['N' => 3, 'Filter' => 'FlateDecode', 'ColorSpace' => ['Indexed', 'DeviceRGB', 255, '00FF']];
    $actual= $this->parse("<<\n  /N 3\n  /Filter /FlateDecode\n  /ColorSpace [/Indexed /DeviceRGB 255 <00FF>]\n>>");
    Assert::equals($expected, $actual);
  }

  #[Test]
  public function metadata_dict() {
    $expected= ['Title' => 'Test', 'Creator' => 'XP', 'CreationDate' => 'D:20250530173115+00\'00\''];
    $actual= $this->parse("<</Title (Test)\n/Creator (XP)\n/CreationDate (D:20250530173115+00'00')>>");
    Assert::equals($expected, $actual);
  }

  #[Test]
  public function nested_dict() {
    $fonts= ['F6' => new Ref(6, 0), 'F7' => new Ref(7, 0)];
    $expected= [
      'Resources' => ['Font' => $fonts],
      'Tabs' => 'S',
    ];
    $actual= $this->parse("<</Resources <</Font <</F6 6 0 R\n/F7 7 0 R>>>>\n/Tabs /S>>");
    Assert::equals($expected, $actual);
  }

  /** The stream following the dictionary is attached under the `$stream` key */
  #[Test, Values(['', ' ', "\n", "\r", "\r\n"])]
  public function dict_with_stream($separator) {
    $bytes= "\x78\x9c\x0b\x49\x2d\x2e\x01\x00\x03\xdd\x01\xa1"; // gzcompress("Test")
    $expected= ['Filter' => 'FlateDecode', 'Length' => 12, '$stream' => new Stream($bytes, 'FlateDecode')];
    $actual= $this->parse("<</Filter /FlateDecode\n/Length 12>>{$separator}stream\n{$bytes}\nendstream");
    Assert::equals($expected, $actual);
  }

  /** Length given as a reference instead of a literal integer */
  #[Test, Values(["Line 1", "Line 1\nLine 2"])]
  public function dict_with_ref_length($bytes) {
    $expected= ['Filter' => 'FlateDecode', 'Length' => new Ref(13, 0), '$stream' => new Stream($bytes, 'FlateDecode')];
    $actual= $this->parse("<</Filter /FlateDecode\n/Length 13 0 R>> stream\n{$bytes}endstream\n");
    Assert::equals($expected, $actual);
  }

  #[Test]
  public function array_with_whitespace() {
    $expected= [new Ref(13, 0), new Ref(20, 0), new Ref(23, 0)];
    $actual= $this->parse("[13 0 R 20 0 R 23 0 R ]");
    Assert::equals($expected, $actual);
  }

  #[Test]
  public function dict_with_whitespace() {
    $expected= ['Length' => new Ref(41, 0), 'Filter' => 'FlateDecode'];
    $actual= $this->parse("<< /Length 41 0 R /Filter /FlateDecode >>");
    Assert::equals($expected, $actual);
  }
 }
	<?php

	use io\streams\InputStream;
	use lang\FormatException;

	/**
	 * Character map read from a token stream. Collects the `codespacerange`,
	 * `bfchar`, `bfrange` and `cidrange` enumerations and uses them to
	 * translate byte strings into UTF-8.
	 */
	class CharacterMap {
	  private $tokens;
	  private $parsed= false;

	  /** @var ?int Number of hex digits of the source codes, NULL until known */
	  public $width= null;

	  /** @var array Single mappings (code => UTF-8 string) and ranges ([lo, hi, target]) */
	  public $chars= [], $ranges= [];

	  /** @var int[] Lower and upper bound of the code space */
	  public $codespace= [0x0000, 0xffff];

	  /** Creates a character map reading from the given stream */
	  public function __construct(InputStream $in) {
	    $this->tokens= new Tokens($in);
	  }

	  /** Converts a code point to its UTF-8 representation */
	  private function char($code) {
	    return iconv('ucs-4', 'utf-8', pack('N', $code));
	  }

	  /**
	   * Returns the next token
	   *
	   * @throws lang.FormatException if the input ends prematurely
	   */
	  private function next() {
	    if (null === ($token= $this->tokens->token())) {
	      throw new FormatException('Unexpected end of character map');
	    }
	    return $token;
	  }

	  /**
	   * Parses the definition up until `endcmap`, closing the underlying
	   * tokens afterwards, also if parsing fails.
	   *
	   * @throws lang.FormatException for unknown enumerations or truncated input
	   */
	  public function parse(): self {
	    $this->parsed= true;
	    try {
	      $length= 0;
	      do {

	        // Raises instead of looping endlessly if `endcmap` is missing
	        $token= $this->next();

	        // Parse `2 begin[...]` - an enumeration with 2 entries
	        if ('integer' === $token[0]) {
	          $length= $token[1];
	        } else if ('word' === $token[0] && 1 === sscanf($token[1], 'begin%s', $type)) {
	          switch ($type) {
	            case 'cmap':
	              break;

	            case 'codespacerange':
	              for ($i= 0; $i < $length; $i++) {
	                $lo= $this->next()[1];
	                $hi= $this->next()[1];
	                $this->codespace= [hexdec($lo), hexdec($hi)];
	              }
	              break;

	            case 'bfchar':
	              $src= null;
	              for ($i= 0; $i < $length; $i++) {
	                $src= $this->next()[1];
	                $map= $this->next()[1];
	                $this->chars[hexdec($src)]= iconv('utf-16be', 'utf-8', hex2bin($map));
	              }

	              // Empty enumerations do not tell us anything about the width
	              if (null !== $src) $this->width??= strlen($src);
	              break;

	            case 'bfrange': case 'cidrange':
	              $lo= null;
	              for ($i= 0; $i < $length; $i++) {
	                $lo= $this->next()[1];
	                $hi= $this->next()[1];

	                // Target is either an array of codes or a single start code
	                $token= $this->next();
	                if ('array-start' === $token[0]) {
	                  $map= [];
	                  while (true) {
	                    $token= $this->next();
	                    if ('hex' === $token[0]) {
	                      $map[]= hexdec($token[1]);
	                    } else if ('integer' === $token[0]) {
	                      $map[]= $token[1];
	                    } else {
	                      break;
	                    }
	                  }
	                } else if ('hex' === $token[0]) {
	                  $map= hexdec($token[1]);
	                } else {
	                  $map= $token[1];
	                }
	                $this->ranges[]= [hexdec($lo), hexdec($hi), $map];
	              }
	              if (null !== $lo) $this->width??= strlen($lo);
	              break;

	            default:
	              throw new FormatException('Unknown character map enumeration '.$type);
	          }
	        }
	      } while ('endcmap' !== $token[1]);

	      return $this;
	    } finally {
	      $this->tokens->close();
	    }
	  }

	  /**
	   * Translates the given bytes to a UTF-8 string, parsing the definition
	   * first if necessary. Codes without a mapping inside the code space are
	   * rendered as `\u{xxxx}`.
	   *
	   * @param  string $bytes
	   * @param  var $fallback Accepted for compatibility, currently unused
	   * @return string
	   */
	  public function translate($bytes, $fallback= null) {

	    // Parse at most once: width stays NULL for maps without bfchar / ranges
	    if (null === $this->width && !$this->parsed) $this->parse();

	    // Width counts hex digits, two of them make up one byte
	    $b= max(1, intdiv($this->width ?? 2, 2));

	    $string= '';
	    for ($i= 0, $l= strlen($bytes); $i < $l; $i+= $b) {

	      // Big-endian decoding for any code size, tolerating a truncated tail
	      $code= hexdec(bin2hex(substr($bytes, $i, $b)));

	      if (null !== ($char= $this->chars[$code] ?? null)) {
	        $string.= $char;
	      } else if ($code < $this->codespace[0] || $code > $this->codespace[1]) {
	        $string.= $this->char($code);
	      } else {
	        $char= null;
	        foreach ($this->ranges as $range) {
	          if ($code < $range[0] || $code > $range[1]) continue;

	          // Arrays enumerate targets one by one, scalars denote the first target
	          if (is_array($range[2])) {
	            $target= $range[2][$code - $range[0]] ?? null;
	          } else {
	            $target= $code - $range[0] + $range[2];
	          }
	          if (null !== $target) $char= $this->char($target);
	          break;
	        }
	        $string.= $char ?? sprintf('\u{%04x}', $code);
	      }
	    }

	    return $string;
	  }
	}
	<?php

	use test\{Assert, Test, Values};
	use io\streams\MemoryInputStream;

	/** Verifies parsing of character map definitions and translating bytes with them */
	class CharacterMapTest {
	  private $definition= <<<'MULTIBYTE'
	/CIDInit /ProcSet findresource begin
	12 dict begin
	begincmap
	/CIDSystemInfo
	<< /Registry (Adobe)
	/Ordering (UCS)
	/Supplement 0
	>> def
	/CMapName
	/Adobe-Identity-UCS def
	/CMapType 2 def
	1 begincodespacerange
	<0000> <FFFF>
	endcodespacerange
	4 beginbfchar
	<0003> <0054>
	<000F> <0065>
	<0011> <0073>
	<0012> <007400650064>
	endbfchar
	2 beginbfrange
	<0013> <001C> <0030>
	<03E0> <03E1> [<002C> <003A>]
	endbfrange
	endcmap
	CMapName currentdict /CMap defineresource pop
	end
	end
	MULTIBYTE;

	  /**
	   * Provides the definition as-is and with all line breaks replaced by spaces
	   *
	   * @return iterable
	   */
	  private function formats() {
	    $multiline= $this->definition;
	    yield [new MemoryInputStream($multiline), 'multiline'];

	    $compact= strtr($this->definition, "\n", ' ');
	    yield [new MemoryInputStream($compact), 'compact'];
	  }

	  #[Test, Values(from: 'formats')]
	  public function parse($stream, $kind) {
	    $map= new CharacterMap($stream);
	    $map->parse();

	    Assert::equals(4, $map->width);
	    Assert::equals([0x0000, 0xffff], $map->codespace);
	    Assert::equals([0x0003 => 'T', 0x000F => 'e', 0x0011 => 's', 0x0012 => 'ted'], $map->chars);
	    Assert::equals([[0x0013, 0x001C, 0x0030], [0x03E0, 0x03E1, [0x002C, 0x003A]]], $map->ranges);
	  }

	  /** Translating without calling parse() beforehand */
	  #[Test]
	  public function translate_hex_string() {
	    $map= new CharacterMap(new MemoryInputStream($this->definition));
	    $bytes= hex2bin('0003000F001100120014001C03E1');
	    Assert::equals('Tested19:', $map->translate($bytes));
	  }
	}
	<?php

	use io\streams\InputStream;

	/** Splits a content stream into operations, each an operator with its preceding operands */
	class Content {
	  private $tokens;

	  /** Creates a content instance reading from the given stream */
	  public function __construct(InputStream $in) {
	    $this->tokens= new Tokens($in);
	  }

	  /**
	   * Yields operator => operands pairs. Every non-word token is collected
	   * as an operand; a word token ends the operation. The tokens are closed
	   * once iteration completes, is abandoned or fails.
	   */
	  public function operations(): Generator {
	    $operands= [];
	    try {
	      while (null !== ($token= $this->tokens->token())) {
	        if ('word' !== $token[0]) {
	          $operands[]= $token;
	          continue;
	        }

	        // Attach inline image data
	        if ('ID' === $token[1]) {
	          $operands[]= ['$inline', trim($this->tokens->scan("\nEI"))];
	        }

	        yield $token[1] => $operands;
	        $operands= [];
	      }
	    } finally {

	      // Also runs when the consumer stops iterating early
	      $this->tokens->close();
	    }
	  }
	}
	<?php

	use io\streams\MemoryInputStream;
	use test\{Assert, Test};

	/** Verifies content streams are split into operations correctly */
	class ContentTest {

	  /** Returns all operations in the given content as a list of single-entry maps */
	  private function parse($content) {
	    $operations= [];
	    $fixture= new Content(new MemoryInputStream($content));
	    foreach ($fixture->operations() as $operator => $operands) {
	      $operations[]= [$operator => $operands];
	    }
	    return $operations;
	  }

	  #[Test]
	  public function text_block() {
	    $source= <<<'PS'
	BT
	/R13 60 Tf
	(Test)Tj
	ET
	PS;
	    $expected= [
	      ['BT' => []],
	      ['Tf' => [['name', 'R13'], ['integer', 60]]],
	      ['Tj' => [['string', 'Test']]],
	      ['ET' => []],
	    ];
	    Assert::equals($expected, $this->parse($source));
	  }

	  /** Inline image data between `ID` and `EI` is attached to the `ID` operation */
	  #[Test]
	  public function inline_image() {
	    $source= (
	      "BI\n".
	      "/CS/RGB\n".
	      "ID GIF89a\n".
	      "...\n".
	      "EI\n"
	    );
	    $expected= [
	      ['BI' => []],
	      ['ID' => [['name', 'CS'], ['name', 'RGB'], ['$inline', "GIF89a\n..."]]],
	      ['EI' => []],
	    ];
	    Assert::equals($expected, $this->parse($source));
	  }
	}
	<?php

	use io\streams\FileInputStream;
	use util\cmd\Console;
	use util\Objects;
	use lang\FormatException;

	// Open the file given as first argument, exiting with 2 if that fails
	try {
	  $parser= new PdfFile(new FileInputStream($argv[1]));
	} catch (Throwable $e) {
	  Console::writeLine($argv[1], ': ', $e);
	  return 2;
	}

	// Collect objects by the hash of their id and merge all trailers
	$objects= [];
	$trailer= [];
	Console::$err->write('Parsing ', $argv[1], ' [');
	foreach ($parser->objects() as $kind => $value) {
	  Console::$err->write('.');
	  if ('object' === $kind) {
	    $objects[$value['id']->hashCode()]= $value['dict'];
	  } else if ('trailer' === $kind) {
	    $trailer+= $value;
	  }
	}
	Console::$err->writeLine(']');

	// Handle files without `trailer` entry, e.g. linearized PDFs
	if (empty($trailer)) {
	  foreach ($objects as $object) {
	    if ('XRef' === ($object['Type'] ?? null)) {
	      $trailer= $object;
	      break;
	    }
	  }
	}

	// Handle encryption
	Console::writeLine('Trailer: ', $trailer);
	if (isset($trailer['Encrypt'])) {
	  $encrypt= $objects[$trailer['Encrypt']->hashCode()];
	  Console::writeLine('Cannot handle encrypted PDFs: ', $encrypt);
	  return 1;
	}

	// Now, unpack object streams and add them to the lookup table
	foreach ($objects as $id => $object) {
	  if ('ObjStm' === ($object['Type'] ?? null)) {
	    $n= $object['N'];
	    $tokens= new Tokens($object['$stream']);

	    // The stream starts with N pairs of object number and byte offset
	    $numbers= [];
	    for ($i= 0; $i < $n; $i++) {
	      $number= $tokens->token()[1];
	      $tokens->token(); // Offset is not needed, values are read sequentially below
	      $numbers[$i]= $number.'_0';
	    }

	    for ($i= 0; $i < $n; $i++) {
	      $objects[$numbers[$i]]= $tokens->value();
	    }
	  }
	}

	if ($info= ($trailer['Info'] ?? null)) {
	  Console::writeLine('Info: ', $objects[$info->hashCode()]);
	}

	$root= $objects[$trailer['Root']->hashCode()];
	Console::writeLine('Root: ', $root);

	$pages= $objects[$root['Pages']->hashCode()];
	Console::writeLine('Pages: ', $pages);

	// Resolves references via the lookup table, returns anything else as-is
	$objectOf= function($arg) use(&$objects) {
	  return $arg instanceof Ref ? $objects[$arg->hashCode()] : $arg;
	};

	// Yields page number => page with its content streams added as `$contents`,
	// descending into nested `Pages` nodes and numbering these as `1.1`, `1.2`, ...
	$contentsOf= function($kids, $suffix= '') use(&$contentsOf, &$objectOf, &$objects) {
	  foreach ($kids as $i => $ref) {
	    $page= $objects[$ref->hashCode()];
	    $number= $suffix.($i + 1);

	    if ('Pages' === $page['Type']) {
	      yield from $contentsOf($page['Kids'], $number.'.');
	      continue;
	    }

	    // A single content object or an array of references
	    $contents= $objectOf($page['Contents']);
	    if ($stream= $contents['$stream'] ?? null) {
	      $streams= [$stream];
	    } else {
	      $streams= [];
	      foreach ($contents as $ref) {
	        $streams[]= $objects[$ref->hashCode()]['$stream'];
	      }
	    }
	    yield $number => $page + ['$contents' => $streams];
	  }
	};

	// Converts bytes to a string according to the given encoding
	$stringOf= function($bytes, $encoding, $unicode= null) use(&$stringOf) {
	  switch ($encoding) {
	    case null: return $bytes;
	    case 'MacRomanEncoding': return iconv('macintosh', \xp::ENCODING, $bytes);
	    case 'WinAnsiEncoding': return iconv('cp1252', \xp::ENCODING, $bytes);
	    case 'Identity-H': return $unicode->translate($bytes);
	    default:

	      // FIXME: Respect differences
	      return $stringOf($bytes, $encoding['BaseEncoding'] ?? null);
	  }
	};

	// Converts a hex or string token to text using the currently selected font
	$textOf= function($token, array $format) use(&$objects, &$objectOf, &$stringOf) {

	  // TODO Handle $format['font']['DescendantFonts'], see
	  // https://github.com/adobe-type-tools/cmap-resources

	  $bytes= 'hex' === $token[0] ? hex2bin($token[1]) : $token[1];
	  switch ($format['font']['Subtype']) {
	    case 'Type0': case 'Type1': case 'TrueType':
	      return $stringOf(
	        $bytes,
	        $objectOf($format['font']['Encoding'] ?? null),
	        $format['font']['$unicode'] ?? null
	      );

	    case 'Type3':
	      if ($map= $format['font']['$unicode'] ?? null) {
	        return $map->translate($bytes);
	      }
	      throw new FormatException('Cannot handle type 3 fonts without unicode');

	    default:
	      throw new FormatException('Unknown font type '.$format['font']['Subtype']);
	  }
	};

	foreach ($contentsOf($pages['Kids']) as $number => $page) {
	  Console::writeLine('- Page #', $number, ' contents: ', Objects::stringOf($page['Contents'], ' '));

	  // Resolve fonts
	  $fonts= [];
	  if ($resources= $objectOf($page['Resources'])['Font'] ?? null) {
	    foreach (is_array($resources) ? $resources : $objectOf($resources) as $id => $ref) {
	      $font= $objects[$ref->hashCode()];
	      // Console::writeLine(' Font <', $id, '> := ', Objects::stringOf($font, ' '));

	      if ($ref= $font['ToUnicode'] ?? null) {
	        $font['$unicode']= new CharacterMap($objects[$ref->hashCode()]['$stream']);
	      }
	      $fonts[$id]= $font;
	    }
	  }

	  // Select no font for the beginning
	  $font= null;

	  // Resolve objects
	  $xobjects= [];
	  if ($resources= $objectOf($page['Resources'])['XObject'] ?? null) {
	    foreach (is_array($resources) ? $resources : $objectOf($resources) as $id => $ref) {
	      $xobjects[$id]= $objects[$ref->hashCode()];
	    }
	  }

	  // DEBUG
	  // foreach ($page['$contents'] as $i => $stream) {
	  // file_put_contents('text-block.'.$number.'_'.$i, $stream->bytes());
	  // }

	  $paragraph= '';
	  $content= new Content(new SequenceInputStream($page['$contents']));
	  foreach ($content->operations() as $op => $arguments) {

	    // DEBUG
	    // Console::$err->writeLine("\e[2m", '[', $number, ' | ', $op, ']: ', $arguments, "\e[0m");

	    if ('BT' === $op) {
	      // NOOP
	    } else if ('ET' === $op) {
	      Console::writeLine(' ', str_replace("\n", "\n ", $paragraph));
	      Console::writeLine();
	      $paragraph= '';
	    } else if ('Tf' === $op) {
	      $id= $arguments[0][1];
	      $size= $arguments[1][1];
	      $font= ['font' => $fonts[$id] ?? '@'.$id, 'size' => $size];
	    } else if ('Tj' === $op) {
	      $paragraph.= $textOf($arguments[0], $font);
	    } else if ('TJ' === $op) {

	      // Only text operands are appended, all others are skipped
	      foreach ($arguments as $argument) {
	        if ('hex' === $argument[0] || 'string' === $argument[0]) {
	          $paragraph.= $textOf($argument, $font);
	        }
	      }
	    } else if ("'" === $op) {
	      $paragraph.= "\n".$textOf($arguments[0], $font);
	    } else if ('"' === $op) {

	      // The text is the third operand
	      $paragraph.= "\n".$textOf($arguments[2], $font);
	    } else if ('TD' === $op) {
	      $paragraph.= "\n";
	    } else if ('Do' === $op) {
	      $id= $arguments[0][1];
	      $xobject= $xobjects[$id];
	      if ('Image' === $xobject['Subtype']) {
	        Console::writeLine(' ', "![{$id}]({$xobject['Subtype']}:{$xobject['Width']}x{$xobject['Height']})\n");
	      }
	    }
	  }
	}
	<?php

	use lang\Value;
	use util\Objects;

	/** Reference to an object, made up of an object number and a generation */
	class Ref implements Value {
	  public $number, $generation;

	  /** Creates a reference from number and generation */
	  public function __construct($number, $generation) {
	    [$this->number, $this->generation]= [$number, $generation];
	  }

	  /** Returns `number_generation`, which the lookup tables use as key */
	  public function hashCode() {
	    return "{$this->number}_{$this->generation}";
	  }

	  /** Returns the class name followed by `number_generation` in parentheses */
	  public function toString() {
	    return nameof($this)."({$this->number}_{$this->generation})";
	  }

	  /** Compares by hash code; anything but another reference compares as 1 */
	  public function compareTo($value) {
	    if ($value instanceof self) {
	      return $this->hashCode() <=> $value->hashCode();
	    }
	    return 1;
	  }
	}
	<?php

	use io\streams\InputStream;

	/** @test SequenceInputStreamTest */
	/**
	 * Reads from a sequence of streams, one after the other, as if they
	 * were a single stream. Streams are closed once they are exhausted.
	 */
	class SequenceInputStream implements InputStream {
	  private $streams, $current;

	  /** @param iterable $streams */
	  public function __construct($streams) {
	    if ($streams instanceof Iterator) {
	      $this->streams= $streams;
	    } else {
	      $this->streams= new ArrayIterator($streams);
	    }

	    // NULL for an empty sequence, checked by all methods below
	    $this->current= $this->streams->valid() ? $this->streams->current() : null;
	  }

	  /** Closes the current stream and selects the next one, or NULL at the end */
	  private function advance() {
	    $this->current->close();
	    $this->streams->next();
	    $this->current= $this->streams->valid() ? $this->streams->current() : null;
	  }

	  /** @return int */
	  public function available() {
	    while (null !== $this->current) {
	      if ($r= $this->current->available()) return $r;

	      // No more data available on current stream, close and select next
	      $this->advance();
	    }

	    return 0;
	  }

	  /**
	   * Reads up to the specified number of bytes
	   *
	   * @param int $bytes
	   * @return string
	   */
	  public function read($bytes= 8192) {
	    while (null !== $this->current) {
	      if ('' !== ($r= $this->current->read($bytes))) return $r;

	      // EOF from current stream, close and select next
	      $this->advance();
	    }

	    return '';
	  }

	  /**
	   * Closes the current and all remaining streams. Continues from the
	   * current position instead of rewinding, so this also works with
	   * generators and may safely be called more than once.
	   *
	   * @return void
	   */
	  public function close() {
	    while (null !== $this->current) {
	      $this->advance();
	    }
	  }

	  /** Ensure streams are closed */
	  public function __destruct() {
	    $this->close();
	  }
	}
	<?php

	use io\streams\{InputStream, MemoryInputStream};
	use lang\Value;
	use util\Objects;

	/**
	 * Stream data along with the name of the filter it is encoded with.
	 * Can be read either at once via bytes() or via the input stream methods.
	 */
	class Stream implements Value, InputStream {
	  private $bytes, $filter;
	  private $input= null;

	  /**
	   * Creates a new stream
	   *
	   * @param string $bytes The bytes as they appear in the file
	   * @param ?string $filter The filter name, NULL for none
	   */
	  public function __construct($bytes, $filter) {
	    $this->bytes= $bytes;
	    $this->filter= $filter;
	  }

	  /**
	   * Returns the bytes, decompressed for `FlateDecode`. The image filters
	   * listed below are passed through as-is.
	   *
	   * @return string
	   * @throws lang.IllegalArgumentException for any other filter
	   */
	  public function bytes() {
	    switch ($this->filter) {
	      case null: return $this->bytes;
	      case 'DCTDecode': case 'JPXDecode': case 'CCITTFaxDecode': return $this->bytes;
	      case 'FlateDecode': return gzuncompress($this->bytes);

	      // Fully qualified, this file does not import the exception class
	      default: throw new \lang\IllegalArgumentException('Unknown filter '.$this->filter);
	    }
	  }

	  /** @return io.streams.InputStream */
	  private function input() {
	    if ('FlateDecode' === $this->filter) {
	      return new MemoryInputStream(gzuncompress($this->bytes)); // TODO: xp-forge/compress
	    } else {
	      return new MemoryInputStream($this->bytes);
	    }
	  }

	  /** @return int */
	  public function available() {
	    $this->input??= $this->input();
	    return $this->input->available();
	  }

	  /**
	   * Reads up to the specified number of bytes
	   *
	   * @param int $bytes
	   * @return string
	   */
	  public function read($bytes= 8192) {
	    $this->input??= $this->input();

	    // Pass on the requested size instead of always reading 8192 bytes
	    return $this->input->read($bytes);
	  }

	  /** Closes the input; reading again afterwards starts over from the beginning */
	  public function close() {
	    $this->input && $this->input->close();
	    $this->input= null;
	  }

	  /** @return string */
	  public function hashCode() {
	    return 'S'.md5($this->bytes);
	  }

	  /** @return string */
	  public function toString() {
	    return nameof($this).'('.strlen($this->bytes).' bytes '.($this->filter ?? 'Plain').')';
	  }

	  /** Compares the raw bytes only, the filter is not taken into account */
	  public function compareTo($value) {
	    return $value instanceof self ? $this->bytes <=> $value->bytes : 1;
	  }
	}
	<?php

	use test\{Assert, Ignore, Test, Values};
	use io\streams\InputStream;

	/** Verifies tokenizing of the various PDF syntax elements */
	class TokensTest {

	  /**
	   * Creates tokens reading from a stream which returns one of the
	   * given chunks per read() call.
	   *
	   * @param string|string[] $chunks
	   */
	  private function fixture($chunks): Tokens {
	    $stream= new class((array)$chunks) implements InputStream {
	      private $chunks;

	      public function __construct($chunks) {
	        $this->chunks= $chunks;
	      }

	      public function available() {
	        return sizeof($this->chunks);
	      }

	      public function read($bytes= 8192) {
	        return array_shift($this->chunks);
	      }

	      public function close() {
	        $this->chunks= [];
	      }
	    };
	    return new Tokens($stream);
	  }

	  /** Asserts the fixture yields exactly the given tokens next, in order */
	  private function assertTokens(array $expected, Tokens $fixture) {
	    foreach ($expected as $token) {
	      Assert::equals($token, $fixture->token());
	    }
	  }

	  #[Test]
	  public function read_twice() {
	    $tokens= $this->fixture('Tested');
	    $first= $tokens->bytes(4);
	    $second= $tokens->bytes(2);
	    Assert::equals(['Test', 'ed'], [$first, $second]);
	  }

	  #[Test]
	  public function read_empty() {
	    $tokens= $this->fixture('');
	    Assert::equals('', $tokens->bytes(1));
	  }

	  #[Test, Values(['Test', "Test\n"])]
	  public function line($input) {
	    $tokens= $this->fixture($input);
	    Assert::equals('Test', $tokens->line());
	    Assert::null($tokens->line());
	  }

	  #[Test, Values(["Line 1\n\nLine 3", "Line 1\r\rLine 3", "Line 1\r\n\r\nLine 3"])]
	  public function empty_line($input) {
	    $tokens= $this->fixture($input);
	    foreach (['Line 1', '', 'Line 3'] as $line) {
	      Assert::equals($line, $tokens->line());
	    }
	    Assert::null($tokens->line());
	  }

	  #[Test, Values([['/Length', 'Length'], ['/ca', 'ca'], ['/S', 'S'], ['/FlateDecode', 'FlateDecode'], ['/C2_0', 'C2_0'], ['/Font,Bold', 'Font,Bold']])]
	  public function name($input, $expected) {
	    $this->assertTokens([['name', $expected]], $this->fixture($input));
	  }

	  #[Test, Values([['()', ''], ['(Test)', 'Test']])]
	  public function string($input, $expected) {
	    $this->assertTokens([['string', $expected]], $this->fixture($input));
	  }

	  #[Test, Values([['(C:\\\\PHP \\(8.5\\))', 'C:\\PHP (8.5)'], ['(f\\303\\274r)', 'für']])]
	  public function escapes_in_string($input, $expected) {
	    $this->assertTokens([['string', $expected]], $this->fixture($input));
	  }

	  #[Test, Values([['(())', '()'], ['(Test (OK))', 'Test (OK)']])]
	  public function balanced_braces_in_string($input, $expected) {
	    $this->assertTokens([['string', $expected]], $this->fixture($input));
	  }

	  #[Test, Values([['1', 1], ['-1', -1], ['6100', 6100]])]
	  public function integer($input, $expected) {
	    $this->assertTokens([['integer', $expected]], $this->fixture($input));
	  }

	  #[Test, Values([['1.5', 1.5], ['-0.5', -0.5], ['3.141', 3.141]])]
	  public function decimal($input, $expected) {
	    $this->assertTokens([['decimal', $expected]], $this->fixture($input));
	  }

	  #[Test]
	  public function ref() {
	    $this->assertTokens([['ref', new Ref(62, 0)]], $this->fixture('62 0 R'));
	  }

	  #[Test]
	  public function comment() {
	    $this->assertTokens([['comment', 'Test']], $this->fixture('% Test'));
	  }

	  #[Test]
	  public function hex() {
	    $this->assertTokens([['hex', 'B5FEF09943']], $this->fixture('<B5FEF09943>'));
	  }

	  #[Test, Ignore('Not yet implemented')]
	  public function multiline_string() {
	    $this->assertTokens([['Test passed']], $this->fixture("(Test\n passed)"));
	  }

	  #[Test, Values([['true', true], ['false', false], ['null', null]])]
	  public function constants($input, $expected) {
	    $this->assertTokens([['const', $expected]], $this->fixture($input));
	  }

	  #[Test, Values(['[]', '[ ]', '[ ]'])]
	  public function empty_array($input) {
	    $this->assertTokens(
	      [['array-start', null], ['array-end', null]],
	      $this->fixture($input)
	    );
	  }

	  #[Test, Values(['[[]]', '[ [] ]', '[ [ ] ]'])]
	  public function nested_array($input) {
	    $this->assertTokens(
	      [['array-start', null], ['array-start', null], ['array-end', null], ['array-end', null]],
	      $this->fixture($input)
	    );
	  }

	  #[Test, Values(['[1 2]', '[1 2 ]', '[ 1 2]', '[ 1 2 ]', '[ 1 2 ]'])]
	  public function array($input) {
	    $this->assertTokens(
	      [['array-start', null], ['integer', 1], ['integer', 2], ['array-end', null]],
	      $this->fixture($input)
	    );
	  }

	  #[Test, Values(['<<>>', '<< >>', '<< >>'])]
	  public function empty_dict($input) {
	    $this->assertTokens(
	      [['dict-start', null], ['dict-end', null]],
	      $this->fixture($input)
	    );
	  }

	  #[Test, Values(['<</Length 3>>', '<< /Length 3>>', '<</Length 3 >>'])]
	  public function dict($input) {
	    $this->assertTokens(
	      [['dict-start', null], ['name', 'Length'], ['integer', 3], ['dict-end', null]],
	      $this->fixture($input)
	    );
	  }

	  /** Dictionary delivered in two chunks, split at a line break */
	  #[Test]
	  public function multiline_dict() {
	    $tokens= $this->fixture(["<</Type /StructElem\n", "/S /Link>>"]);
	    $this->assertTokens(
	      [
	        ['dict-start', null],
	        ['name', 'Type'],
	        ['name', 'StructElem'],
	        ['name', 'S'],
	        ['name', 'Link'],
	        ['dict-end', null],
	      ],
	      $tokens
	    );
	  }

	  /** Raw stream bytes are read in between tokens */
	  #[Test, Values([[["<<>> stream\nTest\nendstream"]], [["<<>> stream\r\nTest\r\nendstream"]], [["<<>>\nstream\nTest\nendstream"]], [["<<>> stream\n", "Test\nendstream"]], [["<<>>\n", "stream\n", "Test\nendstream"]], [["<<>>", "\n", "stream\n", "Test\nendstream"]]])]
	  public function stream_object($chunks) {
	    $tokens= $this->fixture($chunks);
	    $this->assertTokens(
	      [['dict-start', null], ['dict-end', null], ['stream-start', null]],
	      $tokens
	    );
	    Assert::equals('Test', $tokens->bytes(4));
	    $this->assertTokens([['stream-end', null]], $tokens);
	  }
	}