Last active
December 31, 2024 08:39
-
-
Save chuwy/58763de4ff099b392ebf16669040cf77 to your computer and use it in GitHub Desktop.
Add semantic indent/dendent tokens into a stream of tokens with significant indentations (a-la Python, Haskell etc)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.annotation.tailrec | |
// This module provides a post-lexing phase for languages with significant indentations (Python, YAML, Haskell etc) | |
// It inserts "virtual delimiters" called `Indent` and `Dendent` (like `{` and `}`) into the stream of tokens, | |
// where no real delimiters exist. | |
// It recognizes several patters like [`=`, `newline`, `space`] and turns them into [`=`, `indent`] | |
// (`newline` and `space` can be preserved, in `Behavior` config). | |
// Then if next line has the same amount of `space` tokens - it considered to be in the same "virtual block" | |
// If next line has even more `space` tokens - it considered to be start of the next "virtual block" (so new `indent`) | |
// If next line has *less* `space` tokens - we're inserting a `dendent` token (or several ones, depending how much less) | |
// | |
// Example: | |
// ``` | |
// x = | |
// y = | |
// 1 + 1 | |
// z = y + 1 | |
// y + z | |
// ``` | |
// | |
// Becomes: | |
// | |
// ``` | |
// { | |
// x = { | |
// y = { | |
// 1 + 1 | |
// } | |
// z = y + 1 | |
// y + z | |
// } | |
// } | |
// ``` | |
/** | |
* Virtual delimiters. | |
* `Indent` is identical to open curly brace. | |
* `Dedent` is identical to closing curly brace | |
*/ | |
enum Brackets: | |
case Indent | |
case Dedent | |
/** The transformed token `T`, which is now either preserved `Original[T]` or semantic delimiter */ | |
enum Bracketed[T]: | |
case Special(delimiter: Brackets) | |
case Original(token: T) | |
object Bracketed: | |
def indent[T]: Bracketed[T] = Bracketed.Special(Brackets.Indent) | |
def dedent[T]: Bracketed[T] = Bracketed.Special(Brackets.Dedent) | |
/** Defines indentation behavior for tokens `T` */ | |
final case class Behavior[T](isOpening: T => Boolean, | |
isSpace: T => Option[Int], | |
isNewline: T => Boolean, | |
dropIndentNewline: Boolean, | |
mergeNewlines: Boolean): | |
def setIndent(past: List[Bracketed[T]]): List[Bracketed[T]] = | |
past match | |
case Bracketed.Original(t) :: rest if isNewline(t) && dropIndentNewline => | |
Bracketed.indent :: rest | |
case tokens => | |
Bracketed.indent :: tokens | |
def setNewline(past: List[Bracketed[T]]): List[Bracketed[T]] = | |
past match | |
case Bracketed.Original(t) :: rest if isNewline(t) && mergeNewlines => | |
rest | |
case tokens => | |
tokens | |
object Space: | |
def unapply(t: T): Option[Int] = | |
isSpace(t) | |
object Newline: | |
def unapply(t: T): Boolean = | |
isNewline(t) | |
object Opening: | |
def unapply(t: T): Boolean = | |
isOpening(t) | |
def perform[T](behavior: Behavior[T], tokens: List[T]): List[Bracketed[T]] = | |
import State.* | |
import Brackets.* | |
def keep(token: T, past: List[Bracketed[T]]): List[Bracketed[T]] = | |
Bracketed.Original(token) :: past | |
@tailrec | |
def go(indents: List[Int], past: List[Bracketed[T]], state: State, rest: List[T]): List[Bracketed[T]] = | |
(state, rest) match | |
case (_, Nil) => | |
// End of stream. Base case. Close all open indents | |
closeAll(indents.length) ++ past | |
case (Fresh, t :: ts) => t match | |
// Turn into `Newline` state and keep the `Newline` token meanwhile | |
case behavior.Opening() => go(indents, keep(t, past), Opening, ts) | |
// A "typical" `Newline` encountered (e.g. delimiting two lines within same block), just keep going | |
case behavior.Newline() => go(indents, keep(t, past), Fresh, ts) | |
// A "typical" `Space` enountered (e.g. between operands), just keep going | |
case behavior.Space(_) => go(indents, keep(t, past), Fresh, ts) | |
// A "typical" token enountered (i.e. language-specific), just keep going | |
case _ => go(indents, keep(t, past), Fresh, ts) | |
case (Opening, t :: ts) => t match | |
// A `Newline` (after `Opening`) enountered. We're onto something! | |
case behavior.Newline() => go(indents, keep(t, past), Newline, ts) | |
// A `Space` (after `Opening`) enountered. Could be just a trailing space, keep waiting for `Newline` | |
case behavior.Space(_) => go(indents, keep(t, past), Opening, ts) | |
// A "typical" token after `Opening` means we're in fact not openning a new block (it's `x = y` kind of string) | |
case _ => go(List(0), keep(t, past), Fresh, ts) | |
case (Newline, t :: ts) => t match | |
// A 1+ `Newline` enountered. `Behavior` decides if they should be merged | |
case behavior.Newline() => go(indents, behavior.setNewline(past), Newline, ts) | |
// A `Space` (after `Newline`) enountered! Turn into `Block` state! | |
case behavior.Space(i) => go(indents, keep(t, past), Block(i), ts) | |
// A "typical" token after `Newline`. Which is identical to `Newline(0)`. Close all open indents | |
// Could also be `Opening` | |
case _ => go(List(0), closeAll(indents.length) ++ past, Fresh, ts) | |
case (Block(depth), t :: ts) => t match | |
// A 1+ `Newline` enountered. `Behavior` decides if they should be merged | |
// Also, now we technically can expect another `Space`. With a different level! | |
case behavior.Newline() => go(indents, behavior.setNewline(past), Block(depth), ts) | |
// Another `Space` (with previous `Newline` as subequent `Newline` pair assumed to be impossible) | |
// This brings us into the same state, but with different depth | |
// We can get arbitrary amount of `Newline`+`Space` pairs until we get into the next (`_`) case | |
case behavior.Space(i) => go(indents, past, Block(i), ts) | |
// Every `Newline`+`Space` pair is handled here eventually | |
case _ => | |
Level.control(indents, depth) match | |
case Level.Same => | |
go(indents, keep(t, past), Fresh, ts) | |
case Level.Next => | |
go(depth :: indents, keep(t, behavior.setIndent(past)), Fresh, ts) | |
case Level.Close(amount, remaining) => | |
go(remaining, keep(t, closeAll(amount) ++ past), Fresh, ts) | |
go(List(0), List(Bracketed.indent), State.Fresh, tokens).reverse | |
/** Internal state of the `perform` function */ | |
enum State: | |
/** | |
* `Fresh` means we're either in the start or middle of the line, thus not making a decision about indentation | |
* Perhaps, we just handled an indentation | |
* Waiting for opening token. | |
*/ | |
case Fresh | |
/** | |
* Encountered a token that *can* start a block (such as `=` or `:`) | |
* `Space` will be ignored, original tokens reset to fresh (it could be `a = b` statement) | |
* and `Newline` token change state to `Newline` | |
*/ | |
case Opening | |
/** We've encountered `Opening` and `Newline`. If next is `Space` - get into `Block` state */ | |
case Newline | |
/** `Block` means we've just encountered a `Newline`+`Space(i)` sequence */ | |
case Block(i: Int) | |
/** Control what to do with current indentation `Block` (preserve current, open new or close several ones) */ | |
enum Level: | |
case Same | |
case Next | |
case Close(closed: Int, remaining: List[Int]) | |
object Level: | |
def control(idents: List[Int], currentDepth: Int): Level = | |
idents match | |
case lastDepth :: _ if currentDepth == lastDepth => Level.Same | |
case lastDepth :: _ if currentDepth > lastDepth => Level.Next | |
case _ => | |
val (closed, remaining) = idents.span(depth => depth > currentDepth) | |
Level.Close(closed.length, remaining) | |
// TODO: `closeAll` should: | |
// 1. Accept current `indents: List[Int]` and indents `toClose: List[Int]` | |
// 2. Fail if `toClose` do not match with what we have in current | |
def closeAll[T](amount: Int): List[Bracketed[T]] = | |
List.fill(amount)(Bracketed.dedent) | |
/** Example token type */ | |
enum Token: | |
case Ident(s: String) | |
case Eq | |
case Operator | |
case Newline | |
case Whitespaces(i: Int) | |
object Token: | |
val isSpace: Token => Option[Int] = | |
case Whitespaces(i) => Some(i) | |
case _ => None | |
val behavior: Behavior[Token] = | |
Behavior(_.isInstanceOf[Eq.type], isSpace, _.isInstanceOf[Newline.type], true, true) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment