Skip to content

Instantly share code, notes, and snippets.

@chuwy
Last active December 31, 2024 08:39
Show Gist options
  • Save chuwy/58763de4ff099b392ebf16669040cf77 to your computer and use it in GitHub Desktop.
Save chuwy/58763de4ff099b392ebf16669040cf77 to your computer and use it in GitHub Desktop.
Add semantic indent/dendent tokens into a stream of tokens with significant indentations (a-la Python, Haskell etc)
import scala.annotation.tailrec
// This module provides a post-lexing phase for languages with significant indentations (Python, YAML, Haskell etc)
// It inserts "virtual delimiters" called `Indent` and `Dendent` (like `{` and `}`) into the stream of tokens,
// where no real delimiters exist.
// It recognizes several patters like [`=`, `newline`, `space`] and turns them into [`=`, `indent`]
// (`newline` and `space` can be preserved, in `Behavior` config).
// Then if next line has the same amount of `space` tokens - it considered to be in the same "virtual block"
// If next line has even more `space` tokens - it considered to be start of the next "virtual block" (so new `indent`)
// If next line has *less* `space` tokens - we're inserting a `dendent` token (or several ones, depending how much less)
//
// Example:
// ```
// x =
// y =
// 1 + 1
// z = y + 1
// y + z
// ```
//
// Becomes:
//
// ```
// {
// x = {
// y = {
// 1 + 1
// }
// z = y + 1
// y + z
// }
// }
// ```
/**
* Virtual delimiters.
* `Indent` is identical to open curly brace.
* `Dedent` is identical to closing curly brace
*/
enum Brackets:
case Indent
case Dedent
/** The transformed token `T`, which is now either preserved `Original[T]` or semantic delimiter */
enum Bracketed[T]:
case Special(delimiter: Brackets)
case Original(token: T)
object Bracketed:
def indent[T]: Bracketed[T] = Bracketed.Special(Brackets.Indent)
def dedent[T]: Bracketed[T] = Bracketed.Special(Brackets.Dedent)
/** Defines indentation behavior for tokens `T` */
final case class Behavior[T](isOpening: T => Boolean,
isSpace: T => Option[Int],
isNewline: T => Boolean,
dropIndentNewline: Boolean,
mergeNewlines: Boolean):
def setIndent(past: List[Bracketed[T]]): List[Bracketed[T]] =
past match
case Bracketed.Original(t) :: rest if isNewline(t) && dropIndentNewline =>
Bracketed.indent :: rest
case tokens =>
Bracketed.indent :: tokens
def setNewline(past: List[Bracketed[T]]): List[Bracketed[T]] =
past match
case Bracketed.Original(t) :: rest if isNewline(t) && mergeNewlines =>
rest
case tokens =>
tokens
object Space:
def unapply(t: T): Option[Int] =
isSpace(t)
object Newline:
def unapply(t: T): Boolean =
isNewline(t)
object Opening:
def unapply(t: T): Boolean =
isOpening(t)
def perform[T](behavior: Behavior[T], tokens: List[T]): List[Bracketed[T]] =
import State.*
import Brackets.*
def keep(token: T, past: List[Bracketed[T]]): List[Bracketed[T]] =
Bracketed.Original(token) :: past
@tailrec
def go(indents: List[Int], past: List[Bracketed[T]], state: State, rest: List[T]): List[Bracketed[T]] =
(state, rest) match
case (_, Nil) =>
// End of stream. Base case. Close all open indents
closeAll(indents.length) ++ past
case (Fresh, t :: ts) => t match
// Turn into `Newline` state and keep the `Newline` token meanwhile
case behavior.Opening() => go(indents, keep(t, past), Opening, ts)
// A "typical" `Newline` encountered (e.g. delimiting two lines within same block), just keep going
case behavior.Newline() => go(indents, keep(t, past), Fresh, ts)
// A "typical" `Space` enountered (e.g. between operands), just keep going
case behavior.Space(_) => go(indents, keep(t, past), Fresh, ts)
// A "typical" token enountered (i.e. language-specific), just keep going
case _ => go(indents, keep(t, past), Fresh, ts)
case (Opening, t :: ts) => t match
// A `Newline` (after `Opening`) enountered. We're onto something!
case behavior.Newline() => go(indents, keep(t, past), Newline, ts)
// A `Space` (after `Opening`) enountered. Could be just a trailing space, keep waiting for `Newline`
case behavior.Space(_) => go(indents, keep(t, past), Opening, ts)
// A "typical" token after `Opening` means we're in fact not openning a new block (it's `x = y` kind of string)
case _ => go(List(0), keep(t, past), Fresh, ts)
case (Newline, t :: ts) => t match
// A 1+ `Newline` enountered. `Behavior` decides if they should be merged
case behavior.Newline() => go(indents, behavior.setNewline(past), Newline, ts)
// A `Space` (after `Newline`) enountered! Turn into `Block` state!
case behavior.Space(i) => go(indents, keep(t, past), Block(i), ts)
// A "typical" token after `Newline`. Which is identical to `Newline(0)`. Close all open indents
// Could also be `Opening`
case _ => go(List(0), closeAll(indents.length) ++ past, Fresh, ts)
case (Block(depth), t :: ts) => t match
// A 1+ `Newline` enountered. `Behavior` decides if they should be merged
// Also, now we technically can expect another `Space`. With a different level!
case behavior.Newline() => go(indents, behavior.setNewline(past), Block(depth), ts)
// Another `Space` (with previous `Newline` as subequent `Newline` pair assumed to be impossible)
// This brings us into the same state, but with different depth
// We can get arbitrary amount of `Newline`+`Space` pairs until we get into the next (`_`) case
case behavior.Space(i) => go(indents, past, Block(i), ts)
// Every `Newline`+`Space` pair is handled here eventually
case _ =>
Level.control(indents, depth) match
case Level.Same =>
go(indents, keep(t, past), Fresh, ts)
case Level.Next =>
go(depth :: indents, keep(t, behavior.setIndent(past)), Fresh, ts)
case Level.Close(amount, remaining) =>
go(remaining, keep(t, closeAll(amount) ++ past), Fresh, ts)
go(List(0), List(Bracketed.indent), State.Fresh, tokens).reverse
/** Internal state of the `perform` function */
enum State:
/**
* `Fresh` means we're either in the start or middle of the line, thus not making a decision about indentation
* Perhaps, we just handled an indentation
* Waiting for opening token.
*/
case Fresh
/**
* Encountered a token that *can* start a block (such as `=` or `:`)
* `Space` will be ignored, original tokens reset to fresh (it could be `a = b` statement)
* and `Newline` token change state to `Newline`
*/
case Opening
/** We've encountered `Opening` and `Newline`. If next is `Space` - get into `Block` state */
case Newline
/** `Block` means we've just encountered a `Newline`+`Space(i)` sequence */
case Block(i: Int)
/** Control what to do with current indentation `Block` (preserve current, open new or close several ones) */
enum Level:
case Same
case Next
case Close(closed: Int, remaining: List[Int])
object Level:
def control(idents: List[Int], currentDepth: Int): Level =
idents match
case lastDepth :: _ if currentDepth == lastDepth => Level.Same
case lastDepth :: _ if currentDepth > lastDepth => Level.Next
case _ =>
val (closed, remaining) = idents.span(depth => depth > currentDepth)
Level.Close(closed.length, remaining)
// TODO: `closeAll` should:
// 1. Accept current `indents: List[Int]` and indents `toClose: List[Int]`
// 2. Fail if `toClose` do not match with what we have in current
def closeAll[T](amount: Int): List[Bracketed[T]] =
List.fill(amount)(Bracketed.dedent)
/** Example token type */
enum Token:
case Ident(s: String)
case Eq
case Operator
case Newline
case Whitespaces(i: Int)
object Token:
val isSpace: Token => Option[Int] =
case Whitespaces(i) => Some(i)
case _ => None
val behavior: Behavior[Token] =
Behavior(_.isInstanceOf[Eq.type], isSpace, _.isInstanceOf[Newline.type], true, true)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment