From 969e3b62bf974b48d061986d5fced3c40398fb99 Mon Sep 17 00:00:00 2001 From: Lucas Satabin Date: Fri, 31 Dec 2021 15:26:17 +0100 Subject: [PATCH 1/6] Add copyless streaming tree transducers The copyless nature is ensured by using a controlled environment update language, without losing expressiveness. This execution model emits as soon as possible, and requires transducers to be total, otherwise the stream fails. This also ensures that everything is emitted up to the first error. The STT model can be used to implement query languages on tree structures and some transformations. --- build.sbt | 17 +- .../main/scala/fs2/data/stt/Assignment.scala | 56 ++++ .../src/main/scala/fs2/data/stt/Env.scala | 92 +++++++ .../src/main/scala/fs2/data/stt/STT.scala | 247 ++++++++++++++++++ .../scala/fs2/data/stt/STTException.scala | 19 ++ .../main/scala/fs2/data/stt/expressions.scala | 101 +++++++ .../src/main/scala/fs2/data/stt/tags.scala | 43 +++ .../src/test/scala/fs2/data/stt/STTSpec.scala | 91 +++++++ 8 files changed, 664 insertions(+), 2 deletions(-) create mode 100644 transducers/shared/src/main/scala/fs2/data/stt/Assignment.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/stt/Env.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/stt/STT.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/stt/STTException.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/stt/expressions.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/stt/tags.scala create mode 100644 transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala diff --git a/build.sbt b/build.sbt index 5896a386f..3518f2c6e 100644 --- a/build.sbt +++ b/build.sbt @@ -116,7 +116,8 @@ val root = (project in file(".")) jsonDiffson.js, jsonPlay.js, text.js, - xml.js), + xml.js, + transducers.js), ScalaUnidoc / siteSubdirName := "api", addMappingsToSiteDir(ScalaUnidoc / packageDoc / mappings, ScalaUnidoc / siteSubdirName), Nanoc / 
sourceDirectory := file("site"), @@ -140,7 +141,9 @@ val root = (project in file(".")) xml.jvm, xml.js, cbor.jvm, - cbor.js + cbor.js, + transducers.jvm, + transducers.js ) lazy val text = crossProject(JVMPlatform, JSPlatform) @@ -304,6 +307,16 @@ lazy val cbor = crossProject(JVMPlatform, JSPlatform) .flatten ) +lazy val transducers = crossProject(JVMPlatform, JSPlatform) + .crossType(CrossType.Full) + .in(file("transducers")) + .settings(commonSettings) + .settings(publishSettings) + .settings( + name := "fs2-data-transducers", + description := "Streaming transducers library" + ) + lazy val documentation = project .in(file("documentation")) .enablePlugins(MdocPlugin) diff --git a/transducers/shared/src/main/scala/fs2/data/stt/Assignment.scala b/transducers/shared/src/main/scala/fs2/data/stt/Assignment.scala new file mode 100644 index 000000000..f07e0dbbb --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/Assignment.scala @@ -0,0 +1,56 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.stt + +import cats.Show +import cats.syntax.show._ + +sealed trait Assignment[+V <: Variable, +C] +sealed trait Reset[+C] extends Assignment[Nothing, C] +object Reset { + def unapply[C](r: Reset[C]): Some[Reset[C]] = + Some(r) +} +object Assignment { + case class Empty(x: Variable.Normal) extends Reset[Nothing] + case class Hole(x: Variable.Normal) extends Reset[Nothing] + case class Char[C](x: Variable.Normal, c: C) extends Reset[C] + case class Subtree[C](x: Variable.Normal, open: C, close: C) extends Reset[C] + case class Append[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + case class Prepend[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + case class SubstInX[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + case class SubstInY[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + case class Swap[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + + implicit def show[V <: Variable, C: Show]: Show[Assignment[V, C]] = Show.show { + case Empty(x) => show"$x := ε;" + case Hole(x) => show"$x := ?;" + case Char(x, c) => show"$x := $c;" + case Subtree(x, open, close) => show"$x := ⧼$open ? 
$close⧽;" + case Append(x, y) => show"""$x := $x $y; + |$y := ε;""".stripMargin + case Prepend(x, y) => show"""$x := $y $x; + |$y := ε;""".stripMargin + case SubstInX(x, y) => show"""$x := $x[$y]; + |$y := ε;""".stripMargin + case SubstInY(x, y) => show"""$x := $y[$x]; + |$y := ε;""".stripMargin + case Swap(x, y) => show"""$x := $y; + |$y := $x;""".stripMargin + } + +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/Env.scala b/transducers/shared/src/main/scala/fs2/data/stt/Env.scala new file mode 100644 index 000000000..9ab714229 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/Env.scala @@ -0,0 +1,92 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.stt + +import cats.ApplicativeError +import cats.MonadError +import cats.Show +import cats.syntax.all._ + +import scala.collection.compat._ +import scala.reflect.ClassTag + +sealed trait Variable +object Variable { + case class Normal(name: String) extends Variable + object Normal { + implicit val show: Show[Normal] = _.name + } + case class Stack(name: String) extends Variable + + implicit val show: Show[Variable] = Show.show { + case Normal(name) => name + case Stack(name) => s"${name}ₚ" + } +} + +class Env[V <: Variable, C] private (private val vars: Map[V, Expr[C]]) { + + def widen: Env[Variable, C] = + new Env(Map.empty[Variable, Expr[C]] ++ vars) + + def stackify(implicit ev: V =:= Variable.Normal): Env[Variable.Stack, C] = + new Env(vars.collect { case (Variable.Normal(n), e) => + (Variable.Stack(n), e) + }) + + def destackify: Env[Variable.Normal, C] = + new Env(vars.collect { case (v: Variable.Normal, e) => + (v, e) + }) + + def merge(that: Env[Variable.Stack, C])(implicit ev: V =:= Variable.Normal): Env[Variable, C] = + new Env((this.vars ++ that.vars).toMap) + + def call: Env[V, C] = + new Env[V, C](vars.view.mapValues { + case Expr0(_) => Expr0.Empty[C]() + case Expr1(_) => Expr1.Hole[C]() + }.toMap) { + override def call = this + } + + def lookupExpr[F[_]](name: V)(implicit F: ApplicativeError[F, Throwable]): F[Expr[C]] = + vars.get(name).liftTo[F](STTException(show"unknown varibale $name in environment")) + + def lookup[F[_], E <: Expr[C]](name: V)(implicit F: MonadError[F, Throwable], E: ClassTag[E]): F[E] = + lookupExpr(name).flatMap { + case E(e) => + F.pure(e) + case _ => + F.raiseError(STTException(show"variable $name is of wrong type")) + } + + def update(name: V, e: Expr[C]): Env[V, C] = + new Env(vars.updated(name, e)) + +} + +object Env { + def create[C](names: Map[String, Type]): Env[Variable.Normal, C] = + new Env(names.map { + case (name, Type.Type0) => (Variable.Normal(name), Expr0.Empty[C]()) + case (name, 
Type.Type1) => (Variable.Normal(name), Expr1.Hole[C]()) + }) + + implicit def show[V <: Variable, C: Show]: Show[Env[V, C]] = + _.vars.map { case (v, e) => show"$v -> $e" }.mkString("\n") +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala new file mode 100644 index 000000000..2dc7e410a --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala @@ -0,0 +1,247 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2 +package data +package stt + +import cats.syntax.all._ +import cats.MonadError +import cats.data.OptionT +import cats.data.Chain +import cats.Show + +case class InternalTransition[Out](target: Int, update: List[Assignment[Variable.Normal, Out]]) + +case class CallTransition[Out, StackElem](target: Int, push: StackElem, update: List[Assignment[Variable.Normal, Out]]) + +case class ReturnTransition[Out](target: Int, update: List[Assignment[Variable, Out]]) + +/** A copyless streaming tree transducer implementation. 
*/ +class STT[F[_], In, Out, StackElem](initial: Int, + internalTransitions: Map[(Int, In), InternalTransition[Out]], + callTransitions: Map[(Int, In), CallTransition[Out, StackElem]], + returnTransitions: Map[(Int, StackElem, In), ReturnTransition[Out]], + finalStates: Map[Int, Expr0[Out]], + variables: Map[String, Type])(implicit + F: MonadError[F, Throwable], + In: HasTag[In], + showIn: Show[In], + showOut: Show[Out]) + extends Pipe[F, In, Out] { + + def isFinal(state: Int): Boolean = finalStates.contains(state) + + def update[V <: Variable](env: Env[Variable, Out], + assignments: List[Assignment[V, Out]]): F[Env[Variable.Normal, Out]] = { + def loop(assignments: List[Assignment[V, Out]], env: Env[Variable, Out]): F[Env[Variable.Normal, Out]] = + assignments match { + case assignment :: rest => + assignment match { + case Assignment.Empty(x) => + loop(rest, env.update(x, Expr0.Empty())) + case Assignment.Hole(x) => + loop(rest, env.update(x, Expr1.Hole())) + case Assignment.Char(x, c) => + loop(rest, env.update(x, Expr0.Char(c))) + case Assignment.Subtree(x, open, close) => + loop(rest, env.update(x, Expr1.Subtree(open, Expr1.Hole(), close))) + case Assignment.Append(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { + case (Expr0(xe), Expr0(ye)) => + loop(rest, env.update(x, Expr0.Concat(xe, ye)).update(y, Expr0.Empty())) + case (Expr1(xe), Expr0(ye)) => + loop(rest, env.update(x, Expr1.Concat10(xe, ye)).update(y, Expr0.Empty())) + case (Expr0(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr1.Concat01(xe, ye)).update(y, Expr1.Hole())) + case (Expr1(xe), Expr1(ye)) => + F.raiseError(STTException("cannot append an expression of type 1 to another expression of type 1")) + } + case Assignment.Prepend(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { + case (Expr0(xe), Expr0(ye)) => + loop(rest, env.update(x, Expr0.Concat(ye, xe)).update(y, Expr0.Empty())) + case (Expr1(xe), Expr0(ye)) => + loop(rest, env.update(x, 
Expr1.Concat01(ye, xe)).update(y, Expr0.Empty())) + case (Expr0(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr1.Concat10(ye, xe)).update(y, Expr1.Hole())) + case (Expr1(xe), Expr1(ye)) => + F.raiseError(STTException("cannot prepend an expression of type 1 to another expression of type 1")) + } + case Assignment.SubstInX(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { + case (Expr1(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr1.Subst(xe, ye)).update(y, Expr1.Hole())) + case (Expr1(xe), Expr0(ye)) => + loop(rest, env.update(x, Expr0.Subst(xe, ye)).update(y, Expr0.Empty())) + case (Expr0(xe), ye) => + F.raiseError(STTException(show"cannot substitute in an expression of type 0: $xe")) + } + case Assignment.SubstInY(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { + case (Expr1(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr1.Subst(ye, xe)).update(y, Expr1.Hole())) + case (Expr0(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr0.Subst(ye, xe)).update(y, Expr0.Empty())) + case (xe, Expr0(ye)) => + F.raiseError(STTException(show"cannot substitute in an expression of type 0: $ye")) + } + case Assignment.Swap(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { case (xe, ye) => + loop(rest, env.update(x, ye).update(y, xe)) + } + } + case Nil => F.pure(env.destackify) + } + loop(assignments, env) + } + + private def step(q: Int, stack: List[(StackElem, Env[Variable.Stack, Out])], env: Env[Variable.Normal, Out])( + in: In): OptionT[F, (Int, List[(StackElem, Env[Variable.Stack, Out])], Env[Variable.Normal, Out])] = + in match { + case Tag.Internal() => + OptionT.fromOption(internalTransitions.get(q -> in)).semiflatMap { case InternalTransition(q1, upd) => + update(env.widen, upd).map((q1, stack, _)) + } + case Tag.Call() => + OptionT.fromOption(callTransitions.get(q -> in)).semiflatMap { case CallTransition(q1, p, upd) => + update(env.widen, upd).map(env => (q1, (p, env.stackify) :: stack, 
env.call)) + } + case Tag.Return() => + stack match { + case (p, env1) :: stack => + OptionT.fromOption(returnTransitions.get((q, p, in))).semiflatMap { case ReturnTransition(q1, upd) => + update(env.merge(env1), upd).map((q1, stack, _)) + } + case Nil => + OptionT.liftF( + F.raiseError(STTException("inconsistent stack state. Input is probably not a well-formed tree"))) + } + } + + def eval0(env: Env[Variable.Normal, Out], e: Expr0[Out]): F[Chain[Out]] = { + def loop(e: Expr0[Out], acc: Chain[Out]): F[Chain[Out]] = { + e match { + case Expr0.Empty() => + F.pure(acc) + case Expr0.Var(x) => + env.lookup[F, Expr0[Out]](x).flatMap(loop(_, acc)) + case Expr0.Char(c) => + F.pure(acc.append(c)) + case Expr0.Subtree(open, sub, close) => + loop(sub, acc.append(open)).map(_.append(close)) + case Expr0.Concat(left, right) => + for { + acc <- loop(left, acc) + acc <- loop(right, acc) + } yield acc + case Expr0.Subst(inner, arg) => + loop(inner.subst(arg), acc) + } + } + loop(e, Chain.empty) + } + + def apply(s: Stream[F, In]): Stream[F, Out] = { + def go(s: Stream[F, In], + state: Int, + stack: List[(StackElem, Env[Variable.Stack, Out])], + env: Env[Variable.Normal, Out], + lastKnownFinal: Option[(Int, Env[Variable.Normal, Out])], + accSinceLastFinal: Chain[In]): Pull[F, Out, Unit] = + s.pull.peek1.flatMap { + case Some((in, s)) => + Pull + // try to step with the current input character + .eval(step(state, stack, env)(in).value) + .flatMap { + case Some((state, stack, env)) if isFinal(state) => + // we can step and the target state is final, + // register this as the last encountered final state, + // reinitialize the input buffer to empty, and proceed + // consume the input symbol + go(s.tail, state, stack, env, (state, env).some, Chain.empty) + case Some((state, stack, env)) => + // we can step and the target state is NOT final, + // add the just read input into the buffer of read + // inputs since last final state, and proceed + go(s.tail, state, stack, env, 
lastKnownFinal, accSinceLastFinal.append(in)) + case None => + // we cannot step from this state, is it final? + finalStates.get(state) match { + case Some(finalExpr) if accSinceLastFinal.nonEmpty => + // it is a final state, emit the associated output, reset buffer, + // reset to initial state, and proceed without consuming the input + Pull + .eval(eval0(env, finalExpr)) + .flatMap(outs => + Pull.output(Chunk.chain(outs)) >> go(s, initial, Nil, Env.create(variables), None, Chain.empty)) + case _ => + // it is not a final state + lastKnownFinal match { + case Some((state, env)) => + // we reached a final state before, let's emit what should have been emitted + // there, and push the input buffer back to the stream + Pull + .eval(eval0(env, finalStates(state))) + .flatMap(outs => + Pull.output(Chunk.chain(outs)) >> go(Stream.chunk(Chunk.chain(accSinceLastFinal)) ++ s, + initial, + Nil, + Env.create(variables), + None, + Chain.empty)) + case None => + // there is no known final, we will emit nothing and just fail + Pull.raiseError( + STTException( + show"malformed input, prefix ${(accSinceLastFinal :+ in).mkString_(", ")} is not accepted")) + } + } + } + case None => + // we are at the end of the input + // did we reach a final state? + lastKnownFinal match { + case Some((state, env)) => + // we did reach a final state, emit the outputs from the last one reached + // and push back the input read since into the stream, then proceed + Pull + .eval(eval0(env, finalStates(state))) + .flatMap(outs => + Pull.output(Chunk.chain(outs)) >> go(Stream.chunk(Chunk.chain(accSinceLastFinal)), + initial, + Nil, + Env.create(variables), + None, + Chain.empty)) + case None => + // we did not reach a final state, do we have leftover inputs? 
+ if (accSinceLastFinal.isEmpty) { + // no we don't, everything has been processed, stop here + Pull.done + } else { + // we do have unprocessed inputs, this is an error + Pull.raiseError(STTException("malformed input")) + } + } + } + + go(s, initial, List.empty, Env.create(variables), None, Chain.empty).stream + + } + +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/STTException.scala b/transducers/shared/src/main/scala/fs2/data/stt/STTException.scala new file mode 100644 index 000000000..ed4306e0c --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/STTException.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.stt + +case class STTException(msg: String, inner: Throwable = null) extends Exception(msg, inner) diff --git a/transducers/shared/src/main/scala/fs2/data/stt/expressions.scala b/transducers/shared/src/main/scala/fs2/data/stt/expressions.scala new file mode 100644 index 000000000..5002d7523 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/expressions.scala @@ -0,0 +1,101 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.stt + +import cats.Show +import cats.syntax.show._ + +sealed trait Type +object Type { + case object Type0 extends Type + case object Type1 extends Type +} + +sealed trait Expr[C] { + val tpe: Type +} +object Expr { + implicit def show[C: Show]: Show[Expr[C]] = Show.show { + case Expr0(e) => e.show + case Expr1(e) => e.show + } +} + +sealed trait Expr0[C] extends Expr[C] { + val tpe = Type.Type0 + def ~(that: Expr0[C]): Expr0[C] = Expr0.Concat(this, that) +} +object Expr0 { + case class Empty[C]() extends Expr0[C] + case class Var[C](x: Variable.Normal) extends Expr0[C] + case class Char[C](c: C) extends Expr0[C] + case class Subtree[C](open: C, sub: Expr0[C], close: C) extends Expr0[C] + case class Concat[C](left: Expr0[C], right: Expr0[C]) extends Expr0[C] + case class Subst[C](inner: Expr1[C], arg: Expr0[C]) extends Expr0[C] + + def unapply[C](e: Expr0[C]): Some[Expr0[C]] = + Some(e) + + implicit def show[C: Show]: Show[Expr0[C]] = Show.show { + case Empty() => "ε" + case Var(x) => x.show + case Char(c) => c.show + case Subtree(open, sub, close) => show"⧼$open $sub $close⧽" + case Concat(left, right) => show"$left $right" + case Subst(inner, arg) => show"$inner[$arg]" + } +} + +sealed trait Expr1[C] extends Expr[C] { + val tpe = Type.Type1 + def subst(e: Expr0[C]): Expr0[C] = + this match { + case Expr1.Hole() => e + case Expr1.Subtree(open, sub, close) => Expr0.Subtree(open, sub.subst(e), close) + case Expr1.Concat01(left, right) => Expr0.Concat(left, right.subst(e)) + case Expr1.Concat10(left, right) => 
Expr0.Concat(left.subst(e), right) + case Expr1.Subst(inner, arg) => inner.subst(arg).subst(e) + } + + def subst(e: Expr1[C]): Expr1[C] = + this match { + case Expr1.Hole() => e + case Expr1.Subtree(open, sub, close) => Expr1.Subtree(open, sub.subst(e), close) + case Expr1.Concat01(left, right) => Expr1.Concat01(left, right.subst(e)) + case Expr1.Concat10(left, right) => Expr1.Concat10(left.subst(e), right) + case Expr1.Subst(inner, arg) => inner.subst(arg).subst(e) + } + +} +object Expr1 { + case class Hole[C]() extends Expr1[C] + case class Subtree[C](open: C, sub: Expr1[C], close: C) extends Expr1[C] + case class Concat01[C](left: Expr0[C], right: Expr1[C]) extends Expr1[C] + case class Concat10[C](left: Expr1[C], right: Expr0[C]) extends Expr1[C] + case class Subst[C](inner: Expr1[C], arg: Expr1[C]) extends Expr1[C] + + def unapply[C](e: Expr1[C]): Some[Expr1[C]] = + Some(e) + + implicit def show[C: Show]: Show[Expr1[C]] = Show.show { + case Hole() => "?" + case Subtree(open, sub, close) => show"⧼$open $sub $close⧽" + case Concat01(left, right) => show"$left $right" + case Concat10(left, right) => show"$left $right" + case Subst(inner, arg) => show"$inner[$arg]" + } +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/tags.scala b/transducers/shared/src/main/scala/fs2/data/stt/tags.scala new file mode 100644 index 000000000..d3ff3c65e --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/tags.scala @@ -0,0 +1,43 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.stt + +import scala.annotation.implicitNotFound + +sealed trait Tag { + def unapply[C](c: C)(implicit C: HasTag[C]): Boolean = + C.tag(c) == this +} +object Tag { + case object Call extends Tag + case object Return extends Tag + case object Internal extends Tag +} + +/** Typeclass indicating that the characters of type `C` + * can be tagged as: + * - call (e.g. opening tag), + * - return (e.g. closing tag), + * - internal (non structuring character). + */ +@implicitNotFound( + "Cannot prove that type ${C} has tags. Make sure to provide an implicit instance of `fs2.data.stt.HasTag[${C}]` in scope") +trait HasTag[C] { + def tag(c: C): Tag + + def unapply(c: C): Some[Tag] = Some(tag(c)) +} diff --git a/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala new file mode 100644 index 000000000..2960b3b01 --- /dev/null +++ b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala @@ -0,0 +1,91 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2 +package data +package stt + +import cats.effect._ + +import weaver._ +import cats.Show + +sealed trait Tree +object Tree { + case class Open(name: String) extends Tree + case class Close(name: String) extends Tree + case class Leaf(value: Int) extends Tree + + implicit val hasTags: HasTag[Tree] = { + case Open(_) => Tag.Call + case Close(_) => Tag.Return + case Leaf(_) => Tag.Internal + } + + implicit val show: Show[Tree] = Show.show { + case Open(name) => s"<$name" + case Close(name) => s"$name>" + case Leaf(v) => v.toString + } + +} + +object STTSpec extends SimpleIOSuite { + + val leaf0: Tree = Tree.Leaf(0) + val leaf1: Tree = Tree.Leaf(1) + val openA: Tree = Tree.Open("a") + val closeA: Tree = Tree.Close("a") + val openB: Tree = Tree.Open("b") + val closeB: Tree = Tree.Close("b") + + val x = Variable.Normal("x") + val y = Variable.Normal("y") + val z = Variable.Normal("z") + val xp = Variable.Stack("x") + + test("reverse tree") { + + import Assignment._ + val internalTransition = Map( + (0, leaf0) -> InternalTransition(0, List(Char(y, leaf0), Prepend(x, y))), + (0, leaf1) -> InternalTransition(0, List(Char(y, leaf1), Prepend(x, y))) + ) + val callTransition = Map((0, openA) -> CallTransition[Tree, Tree](0, openA, Nil), + (0, openB) -> CallTransition[Tree, Tree](0, openB, Nil)) + val returnTransition = Map( + (0, openA, closeA) -> ReturnTransition(0, List(Subtree(z, openA, closeA), SubstInY(x, z), Append(x, xp))), + (0, openB, closeB) -> ReturnTransition(0, List(Subtree(z, openB, closeB), SubstInY(x, z), Append(x, xp))) + ) + val finalStates = Map(0 -> Expr0.Var[Tree](x)) + val reverse = + new STT[IO, Tree, Tree, Tree](0, + internalTransition, + callTransition, + returnTransition, + finalStates, + Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) + + Stream(openA, openB, leaf0, closeB, leaf1, closeA, openB, closeB) + .through(reverse) + .compile + .toList + .map(result => expect(result == List(openB, closeB, openA, leaf1, openB, 
leaf0, closeB, closeA))) + } + + test("") + +} From 494c3a9f166c32aed10caf5426472d233c75259c Mon Sep 17 00:00:00 2001 From: Lucas Satabin Date: Fri, 31 Dec 2021 16:14:49 +0100 Subject: [PATCH 2/6] Respect chunk structure as much as possible --- .../src/main/scala/fs2/data/stt/STT.scala | 160 ++++++++++-------- .../src/test/scala/fs2/data/stt/STTSpec.scala | 1 + 2 files changed, 88 insertions(+), 73 deletions(-) diff --git a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala index 2dc7e410a..213e3a76e 100644 --- a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala +++ b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala @@ -156,91 +156,105 @@ class STT[F[_], In, Out, StackElem](initial: Int, } def apply(s: Stream[F, In]): Stream[F, Out] = { - def go(s: Stream[F, In], + def go(chunk: Chunk[In], + idx: Int, + rest: Stream[F, In], state: Int, stack: List[(StackElem, Env[Variable.Stack, Out])], env: Env[Variable.Normal, Out], lastKnownFinal: Option[(Int, Env[Variable.Normal, Out])], - accSinceLastFinal: Chain[In]): Pull[F, Out, Unit] = - s.pull.peek1.flatMap { - case Some((in, s)) => - Pull - // try to step with the current input character - .eval(step(state, stack, env)(in).value) - .flatMap { - case Some((state, stack, env)) if isFinal(state) => - // we can step and the target state is final, - // register this as the last encountered final state, - // reinitialize the input buffer to empty, and proceed - // consume the input symbol - go(s.tail, state, stack, env, (state, env).some, Chain.empty) - case Some((state, stack, env)) => - // we can step and the target state is NOT final, - // add the just read input into the buffer of read - // inputs since last final state, and proceed - go(s.tail, state, stack, env, lastKnownFinal, accSinceLastFinal.append(in)) + accSinceLastFinal: Chain[In], + chunkAcc: Chain[Out]): Pull[F, Out, Unit] = + if (idx >= chunk.size) { + 
Pull.output(Chunk.chain(chunkAcc)) >> rest.pull.uncons.flatMap { + case Some((hd, tl)) => + go(hd, 0, tl, state, stack, env, lastKnownFinal, accSinceLastFinal, Chain.empty) + case None => + // we are at the end of the input + // did we reach a final state? + lastKnownFinal match { + case Some((state, env)) => + // we did reach a final state, emit the outputs from the last one reached + // and push back the input read since into the stream, then proceed + Pull + .eval(eval0(env, finalStates(state))) + .flatMap(outs => + go(Chunk.chain(accSinceLastFinal), + 0, + Stream.empty, + initial, + Nil, + Env.create(variables), + None, + Chain.empty, + chunkAcc ++ outs)) case None => - // we cannot step from this state, is it final? - finalStates.get(state) match { - case Some(finalExpr) if accSinceLastFinal.nonEmpty => - // it is a final state, emit the associated output, reset buffer, - // reset to initial state, and proceed without consuming the input - Pull - .eval(eval0(env, finalExpr)) - .flatMap(outs => - Pull.output(Chunk.chain(outs)) >> go(s, initial, Nil, Env.create(variables), None, Chain.empty)) - case _ => - // it is not a final state - lastKnownFinal match { - case Some((state, env)) => - // we reached a final state before, let's emit what should have been emitted - // there, and push the input buffer back to the stream - Pull - .eval(eval0(env, finalStates(state))) - .flatMap(outs => - Pull.output(Chunk.chain(outs)) >> go(Stream.chunk(Chunk.chain(accSinceLastFinal)) ++ s, - initial, - Nil, - Env.create(variables), - None, - Chain.empty)) - case None => - // there is no known final, we will emit nothing and just fail - Pull.raiseError( - STTException( - show"malformed input, prefix ${(accSinceLastFinal :+ in).mkString_(", ")} is not accepted")) - } + // we did not reach a final state, do we have leftover inputs? 
+ if (accSinceLastFinal.isEmpty) { + // no we don't, everything has been processed, stop here + Pull.done + } else { + // we do have unprocessed inputs, this is an error + Pull.raiseError(STTException("malformed input")) } } - case None => - // we are at the end of the input - // did we reach a final state? - lastKnownFinal match { - case Some((state, env)) => - // we did reach a final state, emit the outputs from the last one reached - // and push back the input read since into the stream, then proceed - Pull - .eval(eval0(env, finalStates(state))) - .flatMap(outs => - Pull.output(Chunk.chain(outs)) >> go(Stream.chunk(Chunk.chain(accSinceLastFinal)), - initial, - Nil, - Env.create(variables), - None, - Chain.empty)) + } + } else { + val in = chunk(idx) + Pull + // try to step with the current input character + .eval(step(state, stack, env)(in).value) + .flatMap { + case Some((state, stack, env)) if isFinal(state) => + // we can step and the target state is final, + // register this as the last encountered final state, + // reinitialize the input buffer to empty, and proceed + // consume the input symbol + go(chunk, idx + 1, rest, state, stack, env, (state, env).some, Chain.empty, chunkAcc) + case Some((state, stack, env)) => + // we can step and the target state is NOT final, + // add the just read input into the buffer of read + // inputs since last final state, and proceed + go(chunk, idx + 1, rest, state, stack, env, lastKnownFinal, accSinceLastFinal.append(in), chunkAcc) case None => - // we did not reach a final state, do we have leftover inputs? - if (accSinceLastFinal.isEmpty) { - // no we don't, everything has been processed, stop here - Pull.done - } else { - // we do have unprocessed inputs, this is an error - Pull.raiseError(STTException("malformed input")) + // we cannot step from this state, is it final? 
+ finalStates.get(state) match { + case Some(finalExpr) if accSinceLastFinal.nonEmpty => + // it is a final state, emit the associated output, reset buffer, + // reset to initial state, and proceed without consuming the input + Pull + .eval(eval0(env, finalExpr)) + .flatMap(outs => + go(chunk, idx, rest, initial, Nil, Env.create(variables), None, Chain.empty, chunkAcc ++ outs)) + case _ => + // it is not a final state + lastKnownFinal match { + case Some((state, env)) => + // we reached a final state before, let's emit what should have been emitted + // there, and push the input buffer back to the stream + Pull + .eval(eval0(env, finalStates(state))) + .flatMap(outs => + go(Chunk.chain(accSinceLastFinal), + 0, + rest, + initial, + Nil, + Env.create(variables), + None, + Chain.empty, + chunkAcc ++ outs)) + case None => + // there is no known final, we will emit nothing and just fail + Pull.output(Chunk.chain(chunkAcc)) >> Pull.raiseError( + STTException( + show"malformed input, prefix ${(accSinceLastFinal :+ in).mkString_(", ")} is not accepted")) + } } } } - go(s, initial, List.empty, Env.create(variables), None, Chain.empty).stream + go(Chunk.empty, 0, s, initial, List.empty, Env.create(variables), None, Chain.empty, Chain.empty).stream } diff --git a/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala index 2960b3b01..d849f20a6 100644 --- a/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala +++ b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala @@ -80,6 +80,7 @@ object STTSpec extends SimpleIOSuite { Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) Stream(openA, openB, leaf0, closeB, leaf1, closeA, openB, closeB) + .rechunkRandomly() .through(reverse) .compile .toList From e27d2ba348a3170f9259a546eb914aacb22f11d3 Mon Sep 17 00:00:00 2001 From: Lucas Satabin Date: Sun, 2 Jan 2022 19:24:26 +0100 Subject: [PATCH 3/6] Allow for symbolic transducers By 
abstracting the transition table type with a dedicated typeclass, one can create symbolic transducers, which can come in handy to handle infinite input alphabets. --- .../src/main/scala/fs2/data/stt/STT.scala | 13 +-- .../src/main/scala/fs2/data/stt/Table.scala | 38 +++++++++ .../src/main/scala/fs2/data/stt/package.scala | 10 +++ .../src/test/scala/fs2/data/stt/STTSpec.scala | 81 +++++++++++++------ 4 files changed, 112 insertions(+), 30 deletions(-) create mode 100644 transducers/shared/src/main/scala/fs2/data/stt/Table.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/stt/package.scala diff --git a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala index 213e3a76e..1c95bd183 100644 --- a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala +++ b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala @@ -31,13 +31,14 @@ case class CallTransition[Out, StackElem](target: Int, push: StackElem, update: case class ReturnTransition[Out](target: Int, update: List[Assignment[Variable, Out]]) /** A copyless streaming tree transducer implementation. 
*/ -class STT[F[_], In, Out, StackElem](initial: Int, - internalTransitions: Map[(Int, In), InternalTransition[Out]], - callTransitions: Map[(Int, In), CallTransition[Out, StackElem]], - returnTransitions: Map[(Int, StackElem, In), ReturnTransition[Out]], - finalStates: Map[Int, Expr0[Out]], - variables: Map[String, Type])(implicit +class STT[F[_], T[_, _], In, Out, StackElem](initial: Int, + internalTransitions: T[(Int, In), InternalTransition[Out]], + callTransitions: T[(Int, In), CallTransition[Out, StackElem]], + returnTransitions: T[(Int, StackElem, In), ReturnTransition[Out]], + finalStates: Map[Int, Expr0[Out]], + variables: Map[String, Type])(implicit F: MonadError[F, Throwable], + T: Table[T], In: HasTag[In], showIn: Show[In], showOut: Show[Out]) diff --git a/transducers/shared/src/main/scala/fs2/data/stt/Table.scala b/transducers/shared/src/main/scala/fs2/data/stt/Table.scala new file mode 100644 index 000000000..049943bab --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/Table.scala @@ -0,0 +1,38 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.stt + +import scala.annotation.implicitNotFound + +/** A typeclass indicating that some type `T` can be used as a lookup table. + */ +@implicitNotFound("Cannot prove that type ${T} can be used as a lookup table. 
Make sure to provide an implicit instance of `fs2.data.stt.Table[${T}]` in scope") +trait Table[T[_, _]] { + def get[From, To](m: T[From, To])(from: From): Option[To] +} + +object Table { + + implicit object PartialFunctionTable extends Table[PartialFunction] { + def get[From, To](m: PartialFunction[From, To])(from: From): Option[To] = + m.lift(from) + } + + implicit object MapTable extends Table[Map] { + def get[From, To](m: Map[From, To])(from: From): Option[To] = m.get(from) + } +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/package.scala b/transducers/shared/src/main/scala/fs2/data/stt/package.scala new file mode 100644 index 000000000..ff62ddd10 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/package.scala @@ -0,0 +1,10 @@ +package fs2.data + +package object stt { + + implicit class MappableOps[M[_, _], From, To](val m: M[From, To]) extends AnyVal { + def get(from: From)(implicit M: Table[M]): Option[To] = + M.get(m)(from) + } + +} diff --git a/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala index d849f20a6..96d0dc2cb 100644 --- a/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala +++ b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala @@ -18,8 +18,6 @@ package fs2 package data package stt -import cats.effect._ - import weaver._ import cats.Show @@ -29,18 +27,18 @@ object Tree { case class Close(name: String) extends Tree case class Leaf(value: Int) extends Tree - implicit val hasTags: HasTag[Tree] = { - case Open(_) => Tag.Call - case Close(_) => Tag.Return - case Leaf(_) => Tag.Internal - } - implicit val show: Show[Tree] = Show.show { case Open(name) => s"<$name" case Close(name) => s"$name>" case Leaf(v) => v.toString } + implicit val hasTags: HasTag[Tree] = { + case Open(_) => Tag.Call + case Close(_) => Tag.Return + case Leaf(_) => Tag.Internal + } + } object STTSpec extends SimpleIOSuite { @@ -57,27 +55,62 @@ object STTSpec 
extends SimpleIOSuite { val z = Variable.Normal("z") val xp = Variable.Stack("x") - test("reverse tree") { + test("reverse tree (map)") { import Assignment._ - val internalTransition = Map( - (0, leaf0) -> InternalTransition(0, List(Char(y, leaf0), Prepend(x, y))), - (0, leaf1) -> InternalTransition(0, List(Char(y, leaf1), Prepend(x, y))) - ) - val callTransition = Map((0, openA) -> CallTransition[Tree, Tree](0, openA, Nil), - (0, openB) -> CallTransition[Tree, Tree](0, openB, Nil)) - val returnTransition = Map( - (0, openA, closeA) -> ReturnTransition(0, List(Subtree(z, openA, closeA), SubstInY(x, z), Append(x, xp))), - (0, openB, closeB) -> ReturnTransition(0, List(Subtree(z, openB, closeB), SubstInY(x, z), Append(x, xp))) + val internalTransition = + Map[(Int, Tree), InternalTransition[Tree]]((0, leaf0) -> + InternalTransition(0, List(Char(y, leaf0), Prepend(x, y))), + (0, leaf1) -> + InternalTransition(0, List(Char(y, leaf1), Prepend(x, y)))) + val callTransition = Map[(Int, Tree), CallTransition[Tree, Tree]]((0, openA) -> + CallTransition(0, openA, Nil), + (0, openB) -> + CallTransition(0, openB, Nil)) + val returnTransition = Map[(Int, Tree, Tree), ReturnTransition[Tree]]( + (0, openA, closeA) -> + ReturnTransition(0, List(Subtree(z, openA, closeA), SubstInY(x, z), Append(x, xp))), + (0, openB, closeB) -> + ReturnTransition(0, List(Subtree(z, openB, closeB), SubstInY(x, z), Append(x, xp))) ) val finalStates = Map(0 -> Expr0.Var[Tree](x)) val reverse = - new STT[IO, Tree, Tree, Tree](0, - internalTransition, - callTransition, - returnTransition, - finalStates, - Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) + new STT(0, + internalTransition, + callTransition, + returnTransition, + finalStates, + Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) + + Stream(openA, openB, leaf0, closeB, leaf1, closeA, openB, closeB) + .rechunkRandomly() + .through(reverse) + .compile + .toList + .map(result => expect(result == List(openB, closeB, 
openA, leaf1, openB, leaf0, closeB, closeA))) + } + + test("reverse tree (symbolic)") { + + import Assignment._ + val internalTransition: PartialFunction[(Int, Tree), InternalTransition[Tree]] = { case (0, l @ Tree.Leaf(_)) => + InternalTransition(0, List(Char(y, l), Prepend(x, y))) + } + val callTransition: PartialFunction[(Int, Tree), CallTransition[Tree, Tree]] = { case (0, o @ Tree.Open(_)) => + CallTransition(0, o, Nil) + } + val returnTransition: PartialFunction[(Int, Tree, Tree), ReturnTransition[Tree]] = { + case (0, open @ Tree.Open(nopen), close @ Tree.Close(nclose)) if nopen == nclose => + ReturnTransition(0, List(Subtree(z, open, close), SubstInY(x, z), Append(x, xp))) + } + val finalStates = Map(0 -> Expr0.Var[Tree](x)) + val reverse = + new STT(0, + internalTransition, + callTransition, + returnTransition, + finalStates, + Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) Stream(openA, openB, leaf0, closeB, leaf1, closeA, openB, closeB) .rechunkRandomly() From dbbcb9008e19e948c7ee0433054a4e948aeb06af Mon Sep 17 00:00:00 2001 From: Lucas Satabin Date: Sun, 2 Jan 2022 19:56:21 +0100 Subject: [PATCH 4/6] Put unconsumed chunk back into stream --- .../src/main/scala/fs2/data/stt/STT.scala | 5 +- .../src/main/scala/fs2/data/stt/package.scala | 16 +++++ .../src/test/scala/fs2/data/stt/STTSpec.scala | 58 ++++++++++++++----- 3 files changed, 64 insertions(+), 15 deletions(-) diff --git a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala index 1c95bd183..74c35ad69 100644 --- a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala +++ b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala @@ -232,13 +232,14 @@ class STT[F[_], T[_, _], In, Out, StackElem](initial: Int, lastKnownFinal match { case Some((state, env)) => // we reached a final state before, let's emit what should have been emitted - // there, and push the input buffer back to the stream + // there, and push the input 
buffer back to the stream, + // as well as unconsumed current chunk Pull .eval(eval0(env, finalStates(state))) .flatMap(outs => go(Chunk.chain(accSinceLastFinal), 0, - rest, + Stream.chunk(chunk.drop(idx)) ++ rest, initial, Nil, Env.create(variables), diff --git a/transducers/shared/src/main/scala/fs2/data/stt/package.scala b/transducers/shared/src/main/scala/fs2/data/stt/package.scala index ff62ddd10..7d66de689 100644 --- a/transducers/shared/src/main/scala/fs2/data/stt/package.scala +++ b/transducers/shared/src/main/scala/fs2/data/stt/package.scala @@ -1,3 +1,19 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package fs2.data package object stt { diff --git a/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala index 96d0dc2cb..da4cda63f 100644 --- a/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala +++ b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala @@ -20,6 +20,7 @@ package stt import weaver._ import cats.Show +import cats.effect.IO sealed trait Tree object Tree { @@ -75,12 +76,12 @@ object STTSpec extends SimpleIOSuite { ) val finalStates = Map(0 -> Expr0.Var[Tree](x)) val reverse = - new STT(0, - internalTransition, - callTransition, - returnTransition, - finalStates, - Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) + new STT[IO, Map, Tree, Tree, Tree](0, + internalTransition, + callTransition, + returnTransition, + finalStates, + Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) Stream(openA, openB, leaf0, closeB, leaf1, closeA, openB, closeB) .rechunkRandomly() @@ -105,12 +106,12 @@ object STTSpec extends SimpleIOSuite { } val finalStates = Map(0 -> Expr0.Var[Tree](x)) val reverse = - new STT(0, - internalTransition, - callTransition, - returnTransition, - finalStates, - Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) + new STT[IO, PartialFunction, Tree, Tree, Tree](0, + internalTransition, + callTransition, + returnTransition, + finalStates, + Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) Stream(openA, openB, leaf0, closeB, leaf1, closeA, openB, closeB) .rechunkRandomly() @@ -120,6 +121,37 @@ object STTSpec extends SimpleIOSuite { .map(result => expect(result == List(openB, closeB, openA, leaf1, openB, leaf0, closeB, closeA))) } - test("") + test("emit until error") { + import Assignment._ + val internalTransition = + Map[(Int, Tree), InternalTransition[Tree]]() + val callTransition = Map[(Int, Tree), CallTransition[Tree, Unit]]( + (0, openA) -> + CallTransition(0, (), List(Char(y, openA), Append(x, y)))) 
+ val returnTransition = Map[(Int, Unit, Tree), ReturnTransition[Tree]]( + (0, (), closeA) -> + ReturnTransition(0, List(Char(y, closeA), Append(x, y), Prepend(x, xp))) + ) + val finalStates = Map(0 -> Expr0.Var[Tree](x)) + val onlyA = new STT[IO, Map, Tree, Tree, Unit](0, + internalTransition, + callTransition, + returnTransition, + finalStates, + Map("x" -> Type.Type0, "y" -> Type.Type0)) + Stream(openA, openA, closeA, closeA, openB, closeB) + .through(onlyA) + .attempt + .compile + .toList + .map { result => + expect( + result == List(Right(openA), + Right(openA), + Right(closeA), + Right(closeA), + Left(STTException("malformed input, prefix Date: Sat, 8 Jan 2022 19:53:41 +0100 Subject: [PATCH 5/6] Add implementation of FST The streaming FST are simulated and try to emit as early as possible when there is no ambiguity. --- build.sbt | 20 +- .../fs2/data/transducer/CharRangesSpec.scala | 96 +++++++++ .../main/scala/fs2/data/fst/CopyFunc.scala | 100 +++++++++ .../src/main/scala/fs2/data/fst/FST.scala | 135 ++++++++++++ .../src/main/scala/fs2/data/fst/FSTPipe.scala | 79 +++++++ .../scala/fs2/data/{stt => fst}/package.scala | 11 +- .../src/main/scala/fs2/data/stt/STT.scala | 2 + .../fs2/data/transducer/CharRanges.scala | 43 ++++ .../main/scala/fs2/data/transducer/Func.scala | 42 ++++ .../scala/fs2/data/transducer/RangeSet.scala | 195 ++++++++++++++++++ .../scala/fs2/data/transducer/SetLike.scala | 25 +++ .../fs2/data/{stt => transducer}/Table.scala | 16 +- .../scala/fs2/data/transducer/package.scala | 33 +++ 13 files changed, 779 insertions(+), 18 deletions(-) create mode 100644 transducers/jvm/src/test/scala/fs2/data/transducer/CharRangesSpec.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/fst/CopyFunc.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/fst/FST.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/fst/FSTPipe.scala rename transducers/shared/src/main/scala/fs2/data/{stt => fst}/package.scala (73%) 
create mode 100644 transducers/shared/src/main/scala/fs2/data/transducer/CharRanges.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/transducer/Func.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/transducer/RangeSet.scala create mode 100644 transducers/shared/src/main/scala/fs2/data/transducer/SetLike.scala rename transducers/shared/src/main/scala/fs2/data/{stt => transducer}/Table.scala (61%) create mode 100644 transducers/shared/src/main/scala/fs2/data/transducer/package.scala diff --git a/build.sbt b/build.sbt index 3518f2c6e..124987001 100644 --- a/build.sbt +++ b/build.sbt @@ -8,6 +8,7 @@ val shapeless2Version = "2.3.7" val shapeless3Version = "3.0.3" val scalaJavaTimeVersion = "2.3.0" val diffsonVersion = "4.1.1" +val weaverVersion = "0.7.9" val commonSettings = List( scalaVersion := scala213, @@ -56,13 +57,14 @@ val commonSettings = List( libraryDependencies ++= List( "co.fs2" %%% "fs2-core" % fs2Version, "org.scala-lang.modules" %%% "scala-collection-compat" % "2.6.0", - "io.circe" %%% "circe-parser" % circeVersion % "test", - "co.fs2" %% "fs2-io" % fs2Version % "test", - "com.disneystreaming" %%% "weaver-cats" % "0.7.9" % "test", - "com.disneystreaming" %%% "weaver-cats-core" % "0.7.9" % "test", - "com.disneystreaming" %%% "weaver-core" % "0.7.9" % "test", - "com.disneystreaming" %%% "weaver-framework" % "0.7.9" % "test", - "com.eed3si9n.expecty" %%% "expecty" % "0.15.4" % "test", + "io.circe" %%% "circe-parser" % circeVersion % Test, + "co.fs2" %% "fs2-io" % fs2Version % Test, + "com.disneystreaming" %%% "weaver-cats" % weaverVersion % Test, + "com.disneystreaming" %%% "weaver-cats-core" % weaverVersion % Test, + "com.disneystreaming" %%% "weaver-core" % weaverVersion % Test, + "com.disneystreaming" %%% "weaver-framework" % weaverVersion % Test, + "com.disneystreaming" %% "weaver-scalacheck" % weaverVersion % Test, + "com.eed3si9n.expecty" %%% "expecty" % "0.15.4" % Test, "org.portable-scala" %%% 
"portable-scala-reflect" % "1.1.1" cross CrossVersion.for3Use2_13 ) ++ PartialFunction .condOpt(CrossVersion.partialVersion(scalaVersion.value)) { case Some((2, _)) => @@ -228,7 +230,7 @@ lazy val jsonCirce = crossProject(JVMPlatform, JSPlatform) description := "Streaming JSON library with support for circe ASTs", libraryDependencies ++= List( "io.circe" %%% "circe-core" % circeVersion, - "org.gnieh" %%% "diffson-circe" % diffsonVersion % "test" + "org.gnieh" %%% "diffson-circe" % diffsonVersion % Test ) ) .dependsOn(json % "compile->compile;test->test", jsonDiffson % "test->test") @@ -244,7 +246,7 @@ lazy val jsonPlay = crossProject(JVMPlatform, JSPlatform) crossScalaVersions := Seq(scala212, scala213), libraryDependencies ++= List( "com.typesafe.play" %%% "play-json" % playVersion, - "org.gnieh" %%% "diffson-play-json" % diffsonVersion % "test" + "org.gnieh" %%% "diffson-play-json" % diffsonVersion % Test ) ) .dependsOn(json % "compile->compile;test->test", jsonDiffson % "test->test") diff --git a/transducers/jvm/src/test/scala/fs2/data/transducer/CharRangesSpec.scala b/transducers/jvm/src/test/scala/fs2/data/transducer/CharRangesSpec.scala new file mode 100644 index 000000000..8808b4dac --- /dev/null +++ b/transducers/jvm/src/test/scala/fs2/data/transducer/CharRangesSpec.scala @@ -0,0 +1,96 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.transducer + +import weaver._ +import weaver.scalacheck._ + +import org.scalacheck._ + +object CharRangesSpec extends SimpleIOSuite with Checkers { + + val aChar = Gen.choose(Char.MinValue, Char.MaxValue) + val aRange = + for { + c1 <- aChar + c2 <- aChar + } yield (c1, c2) + + val someRanges = Gen.nonEmptyListOf(aRange).map { + case Nil => CharRanges.empty + case r :: Nil => CharRanges.range(r) + case r1 :: r2 :: rs => CharRanges.ranges(r1, r2, rs: _*) + } + + implicit val aCharRanges: Arbitrary[CharRanges] = + Arbitrary(Gen.oneOf(Gen.const(CharRanges.all), Gen.const(CharRanges.empty), someRanges)) + + pureTest("merge adjacent") { + expect(CharRanges.ranges('a' -> 'd', 'e' -> 'z') == CharRanges.range('a', 'z')) + } + + pureTest("merge overlapping") { + expect(CharRanges.ranges('a' -> 'l', 'e' -> 'z') == CharRanges.range('a', 'z')) + } + + pureTest("simplify all") { + expect(CharRanges.range(Char.MinValue, Char.MaxValue) == CharRanges.all) + } + + pureTest("ranges inclusive") { + expect.all( + CharRanges.range('a' -> 'z').contains('a'), + CharRanges.range('a' -> 'z').contains('z'), + CharRanges.all.contains(Char.MinValue), + CharRanges.all.contains(Char.MaxValue), + CharRanges.char('a').contains('a') + ) + } + + pureTest("empty doesn't overlap all") { + expect.all( + !CharRanges.empty.overlap(CharRanges.all), + !CharRanges.empty.overlap(CharRanges.empty), + !CharRanges.all.overlap(CharRanges.empty) + ) + } + + test("empty overlaps nothing") { + forall { (ranges: CharRanges) => + expect(true) + } + } + + test("overlapping with all") { + forall { (ranges: CharRanges) => + expect(ranges.isEmpty) || expect.all(CharRanges.all.overlap(ranges), ranges.overlap(CharRanges.all)) + } + } + + test("invert and back") { + forall { (ranges: CharRanges) => + expect(ranges.invert.invert == ranges) + } + } + + test("invert inverts") { + forall { (ranges: CharRanges, c: Char) => + expect(ranges.invert.contains(c) == !ranges.contains(c)) + } + } + +} diff 
--git a/transducers/shared/src/main/scala/fs2/data/fst/CopyFunc.scala b/transducers/shared/src/main/scala/fs2/data/fst/CopyFunc.scala new file mode 100644 index 000000000..44badf368 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/fst/CopyFunc.scala @@ -0,0 +1,100 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data +package fst + +import transducer.Func +import cats.kernel.BoundedEnumerable +import cats.Id +import cats.Show +import cats.syntax.all._ + +/** Functions that inject their arguments into the range type + * or are constant functions (i.e. ignoring their argument). 
+ */ +sealed trait CopyFunc[+T, +C] +object CopyFunc { + case object CopyArg extends CopyFunc[Nothing, Nothing] + case class CopyConst[Out](out: Out) extends CopyFunc[Nothing, Out] + + implicit def show[T: Show, C: Show]: Show[CopyFunc[T, C]] = Show.show { + case CopyFunc.CopyArg => "" + case CopyFunc.CopyConst(out) => out.show + } + + implicit def CopyFuncChar[X]: Func.Aux[CopyFunc[Char, List[Either[String, X]]], Char, List[Either[String, X]]] = + new Func[CopyFunc[Char, List[Either[String, X]]]] { + type Dom = Char + type Rng = List[Either[String, X]] + def eval(f: CopyFunc[Char, List[Either[String, X]]])(arg: Dom): Rng = + f match { + case CopyFunc.CopyArg => Left(arg.toString) :: Nil + case CopyFunc.CopyConst(out) => out + } + + def isConst(f: CopyFunc[Char, List[Either[String, X]]]): Option[Rng] = + f match { + case CopyFunc.CopyArg => None + case CopyFunc.CopyConst(out) => Some(out) + } + + def inDom(t: Char)(f: CopyFunc[Char, List[Either[String, X]]]): Boolean = true + + def domain(f: CopyFunc[Char, List[Either[String, X]]]): LazyList[Char] = BoundedEnumerable[Char].membersAscending + + } + + implicit def CopyFuncEitherListFunc[A, X](implicit + A: BoundedEnumerable[A]): Func.Aux[CopyFunc[A, List[Either[A, X]]], A, List[Either[A, X]]] = + new Func[CopyFunc[A, List[Either[A, X]]]] { + type Dom = A + type Rng = List[Either[A, X]] + + def eval(f: CopyFunc[A, List[Either[A, X]]])(arg: Dom): Rng = + f match { + case CopyArg => Left(arg) :: Nil + case CopyConst(out) => out + } + def isConst(f: CopyFunc[A, List[Either[A, X]]]): Option[Rng] = + f match { + case CopyArg => None + case CopyConst(out) => Some(out) + } + def inDom(t: Dom)(f: CopyFunc[A, List[Either[A, X]]]): Boolean = true + def domain(f: CopyFunc[A, List[Either[A, X]]]): LazyList[Dom] = A.membersAscending + } + + implicit def CopyFuncIdentityFunc[A](implicit + A: BoundedEnumerable[A]): Func.Aux[CopyFunc[A, List[Id[A]]], A, List[Id[A]]] = + new Func[CopyFunc[A, List[Id[A]]]] { + type Dom = A + type Rng = 
List[Id[A]] + + def eval(f: CopyFunc[A, List[Id[A]]])(arg: Dom): Rng = + f match { + case CopyArg => List(arg) + case CopyConst(out) => out + } + def isConst(f: CopyFunc[A, List[Id[A]]]): Option[Rng] = + f match { + case CopyArg => None + case CopyConst(out) => Some(out) + } + def inDom(t: Dom)(f: CopyFunc[A, List[Id[A]]]): Boolean = true + def domain(f: CopyFunc[A, List[Id[A]]]): LazyList[Dom] = A.membersAscending + } +} diff --git a/transducers/shared/src/main/scala/fs2/data/fst/FST.scala b/transducers/shared/src/main/scala/fs2/data/fst/FST.scala new file mode 100644 index 000000000..f37706702 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/fst/FST.scala @@ -0,0 +1,135 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2 +package data +package fst + +import transducer._ + +import cats.syntax.all._ +import cats.{Monoid, Show} + +/** Non-deterministic finite state transducer. 
+ * To be well-formed, for any state, there is either + * - epsilon transitions, or + * - symbol transitions but + * - not both + */ +case class FST[Q, Pred, Fun, In, Out](initial: Q, + states: Set[Q], + edges: OrderedEdgeSet[Q, Pred, Fun, Out], + finals: Set[Q])(implicit Fun: Func.Aux[Fun, In, Out]) { + + def isChoiceState(q: Q): Boolean = + edges.forwardEpsilon.contains(q) + + def isSkipState(q: Q): Boolean = + edges.forwardEpsilon.get(q) match { + case Some(List(_)) => true + case _ => false + } + + def isJoinState(q: Q): Boolean = + edges.backward.getOrElse(q, Nil).size + edges.backwardEpsilon.getOrElse(q, Nil).size > 1 + + def enumerateStates: FST[Int, Pred, Fun, In, Out] = { + val q2int = states.toList.zipWithIndex.toMap + val intedges = OrderedEdgeSet.fromList[Int, Pred, Fun, Out](edges.toList.map { case (src, act, tgt) => + (q2int(src), act, q2int(tgt)) + }) + FST(q2int(initial), q2int.values.toSet, intedges, finals.map(q2int(_))) + } + + def evalEdges(q: Q, in: In)(implicit Pred: SetLike[Pred, In]): List[(Out, Q)] = + edges.forward.get(q) match { + case None => Nil + case Some(ts) => + ts.flatMap { case (pred, f, q) => + if (pred.contains(in)) + List((Fun.eval(f)(in), q)) + else + Nil + } + } + + def rightClosure(q: Q)(implicit Out: Monoid[Out]): List[(Out, Q)] = { + def go(q: Q, visited: Set[Q], out: Out): (Set[Q], List[(Out, Q)]) = + edges.forwardEpsilon.get(q) match { + case Some(Nil) | None => + (visited, List(out -> q)) + case Some(ts) => + ts.foldLeft((visited, List.empty[(Out, Q)])) { case ((visited, acc), (w, q1)) => + if (visited.contains(q1)) { + (visited, acc) + } else { + val (visited1, ys) = go(q1, visited + q1, out.combine(w)) + (visited1, acc ++ ys) + } + } + } + go(q, Set.empty, Out.empty)._2 + } + + def pipe[F[_]: RaiseThrowable]( + emitEarly: Boolean = true)(implicit Out: Monoid[Out], Pred: SetLike[Pred, In]): Pipe[F, In, Out] = + new FSTPipe(this, emitEarly) + +} + +object FST { + + implicit def show[Q: Show, pred: Show, F: Show, In, 
Out: Show]: Show[FST[Q, pred, F, In, Out]] = Show.show { fst => + val transitions = fst.states.toList.map { q => + if (fst.edges.forward.contains(q)) + fst.edges.forward(q).map { case (pred, f, tgt) => show"$q - $pred / $f -> $tgt" } + else + fst.edges.forwardEpsilon.getOrElse(q, Nil).map { case (out, tgt) => show"$q - / $out -> $tgt" } + } + show"""FST { + | initial = ${fst.initial} + | ${transitions.mkString_("\n ")} + | finals = ${fst.finals} + |}""".stripMargin + } + +} + +case class OrderedEdgeSet[Q, Pred, F, Out](forward: Map[Q, List[(Pred, F, Q)]], + backward: Map[Q, List[(Pred, F, Q)]], + forwardEpsilon: Map[Q, List[(Out, Q)]], + backwardEpsilon: Map[Q, List[(Out, Q)]])(implicit F: Func.Range[F, Out]) { + + def toList: List[Edge[Q, Pred, F, Out]] = { + val sym = forward.toList.flatMap { case (q, ts) => ts.map { case (pred, fun, tgt) => (q, Left((pred, fun)), tgt) } } + val eps = forwardEpsilon.toList.flatMap { case (q, ts) => ts.map { case (out, tgt) => (q, Right(out), tgt) } } + sym ++ eps + } + +} + +object OrderedEdgeSet { + + def fromList[Q, Pred, Fun, Out](edges: List[Edge[Q, Pred, Fun, Out]])(implicit + Fun: Func.Range[Fun, Out]): OrderedEdgeSet[Q, Pred, Fun, Out] = + OrderedEdgeSet( + forward = edges.collect { case (q, Left((a, b)), q1) => Map(q -> List((a, b, q1))) }.combineAll, + backward = edges.collect { case (q, Left((a, b)), q1) => Map(q1 -> List((a, b, q))) }.combineAll, + forwardEpsilon = edges.collect { case (q, Right(out), q1) => Map((q -> List((out, q1)))) }.combineAll, + backwardEpsilon = edges.collect { case (q, Right(out), q1) => Map((q1 -> List((out, q)))) }.combineAll + ) + +} diff --git a/transducers/shared/src/main/scala/fs2/data/fst/FSTPipe.scala b/transducers/shared/src/main/scala/fs2/data/fst/FSTPipe.scala new file mode 100644 index 000000000..4108d6190 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/fst/FSTPipe.scala @@ -0,0 +1,79 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2 +package data +package fst + +import transducer.SetLike + +import cats.Monoid + +case class FSTException(msg: String) extends Exception(msg) + +private class FSTPipe[F[_]: RaiseThrowable, In, Out: Monoid, Q, Pred, Fun](fst: FST[Q, Pred, Fun, In, Out], + emitEarly: Boolean)(implicit + Pred: SetLike[Pred, In]) + extends Pipe[F, In, Out] { + def apply(s: Stream[F, In]): Stream[F, Out] = { + def go(chunk: Chunk[In], idx: Int, rest: Stream[F, In], outs: List[(List[Out], Q)]): Pull[F, Out, Unit] = + if (idx >= chunk.size) { + rest.pull.uncons.flatMap { + case Some((hd, tl)) => go(hd, 0, tl, outs) + case None => + outs.filter(p => fst.finals.contains(p._2)) match { + case (outs, q) :: _ => + Pull.output(Chunk.seq(outs.reverse)) + case _ => + Pull.raiseError(FSTException("invalid input")) + } + } + } else { + val in = chunk(idx) + + def prune(visited: Set[Q], l: List[(List[Out], Q)]): List[(List[Out], Q)] = + l match { + case Nil => Nil + case (outs, q) :: rest => + if (visited.contains(q)) + prune(visited, rest) + else + (outs, q) :: prune(visited + q, rest) + } + + def close(l: List[(List[Out], Q)]) = + prune(Set.empty, + l.flatMap { case (outs, q) => fst.rightClosure(q).map { case (out, q) => (out :: outs, q) } }) + + def step(outs: List[(List[Out], Q)]) = + outs.flatMap { case (outs, q) => + fst.evalEdges(q, in).map { case (out, q) => (out :: outs, q) } + } + + val outs1 = close(step(outs)) + + outs1 match { + case List((outs, q)) if 
emitEarly && fst.finals.contains(q) => + Pull.output(Chunk.seq(outs.reverse)) >> go(chunk, idx + 1, rest, List((Nil, q))) + case _ => + go(chunk, idx + 1, rest, outs1) + } + } + + go(Chunk.empty, 0, s, fst.rightClosure(fst.initial).map { case (out, q) => (List(out), q) }).stream + } + +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/package.scala b/transducers/shared/src/main/scala/fs2/data/fst/package.scala similarity index 73% rename from transducers/shared/src/main/scala/fs2/data/stt/package.scala rename to transducers/shared/src/main/scala/fs2/data/fst/package.scala index 7d66de689..ac063ab28 100644 --- a/transducers/shared/src/main/scala/fs2/data/stt/package.scala +++ b/transducers/shared/src/main/scala/fs2/data/fst/package.scala @@ -16,11 +16,12 @@ package fs2.data -package object stt { +import transducer.RangeSet - implicit class MappableOps[M[_, _], From, To](val m: M[From, To]) extends AnyVal { - def get(from: From)(implicit M: Table[M]): Option[To] = - M.get(m)(from) - } +package object fst { + + type Edge[Q, Pred, F, Out] = (Q, Either[(Pred, F), Out], Q) + + type Transducer[Q, Sigma, Gamma] = FST[Q, RangeSet[Sigma], CopyFunc[Sigma, List[Gamma]], Sigma, List[Gamma]] } diff --git a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala index 74c35ad69..5922c3c86 100644 --- a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala +++ b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala @@ -18,6 +18,8 @@ package fs2 package data package stt +import transducer._ + import cats.syntax.all._ import cats.MonadError import cats.data.OptionT diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/CharRanges.scala b/transducers/shared/src/main/scala/fs2/data/transducer/CharRanges.scala new file mode 100644 index 000000000..92e1d3e95 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/transducer/CharRanges.scala @@ -0,0 +1,43 @@ +/* + * Copyright 2021 Lucas 
Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.transducer + +object CharRanges { + + /** The empty set of character ranges */ + val empty: CharRanges = RangeSet.empty + + /** The set that contains all characters */ + val all: CharRanges = RangeSet.all + + /** Creates a singleton set of a singleton range */ + def char(c: Char): CharRanges = + RangeSet.char(c) + + /** Creates a set of ranges based on the provided single characters */ + def chars(c1: Char, c2: Char, cs: Char*): CharRanges = + RangeSet.chars(c1, c2, cs: _*) + + /** Creates a singleton set of ranges */ + def range(r: (Char, Char)): CharRanges = + RangeSet.range(r) + + /** Creates a set of ranges */ + def ranges(r1: (Char, Char), r2: (Char, Char), rs: (Char, Char)*): CharRanges = + RangeSet.ranges(r1, r2, rs: _*) + +} diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/Func.scala b/transducers/shared/src/main/scala/fs2/data/transducer/Func.scala new file mode 100644 index 000000000..87a6958ef --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/transducer/Func.scala @@ -0,0 +1,42 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.transducer + +trait Func[F] { + type Dom + type Rng + + def eval(f: F)(arg: Dom): Rng + def isConst(f: F): Option[Rng] + def inDom(t: Dom)(f: F): Boolean + def domain(f: F): LazyList[Dom] +} + +object Func { + type Range[F, R] = Func[F] { + type Rng = R + } + + type Domain[F, D] = Func[F] { + type Dom = D + } + + type Aux[F, D, R] = Func[F] { + type Dom = D + type Rng = R + } +} diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/RangeSet.scala b/transducers/shared/src/main/scala/fs2/data/transducer/RangeSet.scala new file mode 100644 index 000000000..8b10faf8e --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/transducer/RangeSet.scala @@ -0,0 +1,195 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.transducer + +import cats.Order +import cats.syntax.all._ +import cats.Show +import cats.data.NonEmptyList +import cats.data.NonEmptyVector +import cats.kernel.BoundedEnumerable + +/** A set of ranges for some enumerable type. 
*/
+sealed trait RangeSet[T] {
+
+  /** Indicates whether this set of ranges contains the given character. */
+  def contains(c: T): Boolean
+
+  /** Inverts this set of character ranges.
+    * For all c, this.contains(c) == !this.invert.contains(c)
+    */
+  def invert: RangeSet[T]
+
+  /** Enumerates all characters in this set of ranges, in ascending order. */
+  def enumerate: LazyList[T]
+
+  /** Returns the minimal element in this set. */
+  def min: Option[T]
+
+  /** Returns the maximal element in this set. */
+  def max: Option[T]
+
+  /** Indicates whether this set contains no characters. */
+  def isEmpty: Boolean
+
+  /** Indicates whether `this` overlaps with `that`.
+    * Returns `true` iff there exists `t` such that
+    * `this.contains(t) && that.contains(t)`
+    */
+  def overlap(that: RangeSet[T]): Boolean
+
+}
+
+object RangeSet {
+
+  /** The empty set of character ranges */
+  def empty[T: BoundedEnumerable]: RangeSet[T] = Empty()
+
+  /** The set that contains all characters */
+  def all[T: BoundedEnumerable]: RangeSet[T] = All()
+
+  /** Creates a singleton set of a singleton range */
+  def char[T: BoundedEnumerable](c: T): RangeSet[T] =
+    Ranges(NonEmptyVector.one(Range(c, c)), false)
+
+  /** Creates a set of ranges based on the provided single characters */
+  def chars[T: BoundedEnumerable](c1: T, c2: T, cs: T*): RangeSet[T] =
+    ranges((c1, c1), (c2, c2), cs.map(c => (c, c)): _*)
+
+  /** Creates a singleton set of ranges */
+  def range[T](r: (T, T))(implicit T: BoundedEnumerable[T]): RangeSet[T] = {
+    implicit val order = T.order
+    val lower = r._1.min(r._2)
+    val upper = r._1.max(r._2)
+    if (lower == Char.MinValue && upper == Char.MaxValue)
+      All()
+    else
+      Ranges(NonEmptyVector.one(Range(lower, upper)), false)
+  }
+
+  /** Creates a set of ranges */
+  def ranges[T](r1: (T, T), r2: (T, T), rs: (T, T)*)(implicit T: BoundedEnumerable[T]): RangeSet[T] = {
+    implicit val order = T.order
+    val ranges =
+      NonEmptyList(r1, r2 :: rs.toList)
+        .map { case (c1, c2) =>
Range(c1.min(c2), c1.max(c2)) } + .sortBy(_.lower) + def merge(ranges: NonEmptyList[Range[T]]): NonEmptyList[Range[T]] = + ranges match { + case NonEmptyList(r1, r2 :: ranges) if r1.overlapsOrAdjacent(r2) => merge(NonEmptyList(r1.merge(r2), ranges)) + case NonEmptyList(r1, r2 :: ranges) => r1 :: merge(NonEmptyList(r2, ranges)) + case NonEmptyList(r1, Nil) => NonEmptyList.one(r1) + } + merge(ranges) match { + case NonEmptyList(Range(Char.MinValue, Char.MaxValue), Nil) => All() + case merged => Ranges(merged.toNev, false) + } + + } + + private case class Range[T](lower: T, upper: T)(implicit T: BoundedEnumerable[T]) { + implicit val order = T.order + def contains(c: T): Boolean = + lower <= c && upper >= c + def overlapsOrAdjacent(that: Range[T]): Boolean = + this.upper >= T.cyclePrevious(that.lower) && this.lower <= T.cycleNext(that.upper) + def merge(that: Range[T]): Range[T] = + Range(this.lower.min(that.lower), this.upper.max(that.upper)) + def enumerate: LazyList[T] = + LazyList.iterate(lower)(T.cycleNext(_)).takeWhile(_ <= upper) + } + + private object Range { + implicit def order[T](implicit T: Order[T]): Order[Range[T]] = Order.from { (x: Range[T], y: Range[T]) => + val mincmp = T.compare(x.lower, y.lower) + if (mincmp == 0) + T.compare(x.upper, y.upper) + else + mincmp + + } + implicit def show[T: Show]: Show[Range[T]] = Show.show { t => + if (t.lower == t.upper) + t.lower.show + else + show"${t.lower}-${t.upper}" + } + } + + private case class All[T]()(implicit T: BoundedEnumerable[T]) extends RangeSet[T] { + def contains(c: T): Boolean = true + def invert: RangeSet[T] = Empty() + def enumerate: LazyList[T] = T.membersAscending + def min: Option[T] = T.minBound.some + def max: Option[T] = T.maxBound.some + def overlap(that: RangeSet[T]): Boolean = that != Empty() + def isEmpty: Boolean = false + } + + private case class Empty[T: BoundedEnumerable]() extends RangeSet[T] { + def contains(c: T): Boolean = false + def invert: RangeSet[T] = All() + def 
enumerate: LazyList[T] = LazyList.empty + def min: Option[T] = None + def max: Option[T] = None + def overlap(that: RangeSet[T]): Boolean = false + def isEmpty: Boolean = true + } + + private case class Ranges[T](ranges: NonEmptyVector[Range[T]], inverted: Boolean) extends RangeSet[T] { + implicit val order = ranges.head.order + def contains(c: T): Boolean = { + def search(low: Int, high: Int): Boolean = + if (low > high) { + inverted + } else { + val mid = (low + high) / 2 + val range = ranges.getUnsafe(mid) + if (range.contains(c)) + !inverted + else if (c < range.lower) + search(low, mid - 1) + else + search(mid + 1, high) + } + search(0, ranges.length - 1) + } + def invert: RangeSet[T] = copy(inverted = !inverted) + def enumerate: LazyList[T] = LazyList.from(ranges.iterator).flatMap(_.enumerate) + def min: Option[T] = ranges.head.lower.some + def max: Option[T] = ranges.last.upper.some + def overlap(that: RangeSet[T]): Boolean = + that match { + case All() => true + case Empty() => false + case _ => enumerate.exists(that.contains(_)) + } + def isEmpty: Boolean = false + } + + implicit def RangeSetShow[T: Show]: Show[RangeSet[T]] = Show.show { + case Empty() => "ε" + case All() => "*" + case Ranges(ranges, false) => ranges.mkString_("[", "", "]") + case Ranges(ranges, true) => ranges.mkString_("[^", "", "]") + } + + implicit def RangeSetSetLike[T]: SetLike[RangeSet[T], T] = + new SetLike[RangeSet[T], T] { + def contains(s: RangeSet[T])(c: T): Boolean = s.contains(c) + } +} diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/SetLike.scala b/transducers/shared/src/main/scala/fs2/data/transducer/SetLike.scala new file mode 100644 index 000000000..6563adecd --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/transducer/SetLike.scala @@ -0,0 +1,25 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.transducer + +import scala.annotation.implicitNotFound + +@implicitNotFound( + "Could not prove that ${S} can be used as a set of ${C}. Please make sure you provide an implicit `fs2.data.transducer.SetLike[${S}, ${C}]` in scope.") +trait SetLike[S, C] { + def contains(s: S)(c: C): Boolean +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/Table.scala b/transducers/shared/src/main/scala/fs2/data/transducer/Table.scala similarity index 61% rename from transducers/shared/src/main/scala/fs2/data/stt/Table.scala rename to transducers/shared/src/main/scala/fs2/data/transducer/Table.scala index 049943bab..9571c41c1 100644 --- a/transducers/shared/src/main/scala/fs2/data/stt/Table.scala +++ b/transducers/shared/src/main/scala/fs2/data/transducer/Table.scala @@ -14,15 +14,17 @@ * limitations under the License. */ -package fs2.data.stt +package fs2.data.transducer import scala.annotation.implicitNotFound /** A typeclass indicating that some type `T` can be used as a lookup table. */ -@implicitNotFound("Cannot prove that type ${T} can be used as a lookup table. Make sure to provide an implicit instance of `fs2.data.stt.Table[${T}]` in scope") -trait Table[T[_, _]] { - def get[From, To](m: T[From, To])(from: From): Option[To] +@implicitNotFound( + "Cannot prove that type ${T} can be used as a lookup table. 
Make sure to provide an implicit instance of `fs2.data.transducer.Table[${T}]` in scope") +trait Table[T[_, _]] extends NTable[T] { + def get[From, To](t: T[From, To])(from: From): Option[To] + def getOrdered[From, To](t: T[From, To])(from: From): List[To] = get(t)(from).toList } object Table { @@ -36,3 +38,9 @@ object Table { def get[From, To](m: Map[From, To])(from: From): Option[To] = m.get(from) } } + +@implicitNotFound( + "Cannot prove that type ${T} can be used as a non deterministic lookup table. Make sure to provide an implicit instance of `fs2.data.transducer.NTable[${T}]` in scope") +trait NTable[T[_, _]] { + def getOrdered[From, To](t: T[From, To])(f: From): List[To] +} diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/package.scala b/transducers/shared/src/main/scala/fs2/data/transducer/package.scala new file mode 100644 index 000000000..382d0b2a9 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/transducer/package.scala @@ -0,0 +1,33 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data + +package object transducer { + + type CharRanges = RangeSet[Char] + + implicit class TableOps[T[_, _], From, To](val m: T[From, To]) extends AnyVal { + def get(from: From)(implicit M: Table[T]): Option[To] = + M.get(m)(from) + } + + implicit class SetLikeOps[S](val s: S) extends AnyVal { + def contains[C](c: C)(implicit S: SetLike[S, C]): Boolean = + S.contains(s)(c) + } + +} From f92e9bfe62a64d799af1aa089c8941c37c4be0eb Mon Sep 17 00:00:00 2001 From: Lucas Satabin Date: Sat, 8 Jan 2022 19:59:15 +0100 Subject: [PATCH 6/6] Add Kleenex implementation --- build.sbt | 19 +- .../main/scala/fs2/data/kleenex/Action.scala | 32 ++ .../main/scala/fs2/data/kleenex/Check.scala | 216 ++++++++++++ .../scala/fs2/data/kleenex/Environment.scala | 48 +++ .../scala/fs2/data/kleenex/Interpreter.scala | 56 ++++ .../fs2/data/kleenex/KleenexParser.scala | 312 ++++++++++++++++++ .../main/scala/fs2/data/kleenex/Regex.scala | 34 ++ .../src/main/scala/fs2/data/kleenex/ast.scala | 55 +++ .../fs2/data/kleenex/core/Compiler.scala | 302 +++++++++++++++++ .../scala/fs2/data/kleenex/core/Grammar.scala | 59 ++++ .../kleenex/core/TransducerCompiler.scala | 100 ++++++ .../test/resources/kleenex/highlighter.kex | 36 ++ .../src/test/resources/kleenex/ini2json.kex | 19 ++ .../src/test/resources/kleenex/logrewrite.kex | 23 ++ .../src/test/resources/kleenex/mitm.kex | 9 + .../src/test/resources/kleenex/recursive.kex | 3 + .../src/test/resources/kleenex/simple.kex | 1 + .../src/test/resources/kleenex/test.kex | 38 +++ 18 files changed, 1360 insertions(+), 2 deletions(-) create mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala create mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala create mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala create mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala create mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala create 
mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala create mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala create mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala create mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala create mode 100644 kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala create mode 100644 kleenex/shared/src/test/resources/kleenex/highlighter.kex create mode 100644 kleenex/shared/src/test/resources/kleenex/ini2json.kex create mode 100644 kleenex/shared/src/test/resources/kleenex/logrewrite.kex create mode 100644 kleenex/shared/src/test/resources/kleenex/mitm.kex create mode 100644 kleenex/shared/src/test/resources/kleenex/recursive.kex create mode 100644 kleenex/shared/src/test/resources/kleenex/simple.kex create mode 100644 kleenex/shared/src/test/resources/kleenex/test.kex diff --git a/build.sbt b/build.sbt index 124987001..782b19d80 100644 --- a/build.sbt +++ b/build.sbt @@ -119,7 +119,8 @@ val root = (project in file(".")) jsonPlay.js, text.js, xml.js, - transducers.js), + transducers.js, + kleenex.js), ScalaUnidoc / siteSubdirName := "api", addMappingsToSiteDir(ScalaUnidoc / packageDoc / mappings, ScalaUnidoc / siteSubdirName), Nanoc / sourceDirectory := file("site"), @@ -145,7 +146,9 @@ val root = (project in file(".")) cbor.jvm, cbor.js, transducers.jvm, - transducers.js + transducers.js, + kleenex.jvm, + kleenex.js ) lazy val text = crossProject(JVMPlatform, JSPlatform) @@ -319,6 +322,18 @@ lazy val transducers = crossProject(JVMPlatform, JSPlatform) description := "Streaming transducers library" ) +lazy val kleenex = crossProject(JVMPlatform, JSPlatform) + .crossType(CrossType.Full) + .in(file("kleenex")) + .settings(commonSettings) + .settings(publishSettings) + .settings( + name := "fs2-data-kleenex", + description := "Streaming text processing library", + libraryDependencies += "org.typelevel" %%% "cats-parse" % 
"0.3.6" + ) + .dependsOn(text, transducers) + lazy val documentation = project .in(file("documentation")) .enablePlugins(MdocPlugin) diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala new file mode 100644 index 000000000..057075a09 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala @@ -0,0 +1,32 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex + +import cats.Show + +sealed trait Action +object Action { + case object Push extends Action + case class Pop(reg: String) extends Action + case class Write(reg: String) extends Action + + implicit val show: Show[Action] = Show.show { + case Push => "push" + case Pop(r) => s"pop $$$r" + case Write(r) => s"write $$$r" + } +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala new file mode 100644 index 000000000..530a59371 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala @@ -0,0 +1,216 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex + +import cats.data.NonEmptyList +import cats.parse.Caret +import cats.syntax.all._ +import cats.data.StateT +import cats.MonadError +import fs2.data.kleenex.core.KleenexCompilerException +import scala.annotation.tailrec + +class Checker[F[_]](implicit F: MonadError[F, Throwable]) { + + def check(prog: Program): F[Unit] = { + val declMap = prog.productions.toList.map { case p @ Production(name, t) => (name, (p.pos, t)) }.toMap + scc(declMap).flatMap { components => + components.traverse_ { component => + val allStrictDeps = + component.flatMap(id => declMap.get(id).map { case (pos, t) => (id, pos, strictDependencies(t)) }) + val localStrictDeps = allStrictDeps.toList.mapFilter { case (id, pos, deps) => + // remove strict dependencies not in SCC + val deps1 = deps.view.filterKeys(component.contains(_)).toMap + if (deps1.nonEmpty) + (id, pos).some + else + None + } + if (localStrictDeps.nonEmpty) + F.raiseError[Unit](KleenexCompilerException(s"""Following productions contain non tail recursive calls: + |${localStrictDeps + .map { case (id, pos) => + s"$id (at line ${pos.line + 1})" + } + .mkString("\n")}""".stripMargin)) + else + F.unit + + } + } + } + + private def successors(id: String, term: Term): List[String] = { + def go(t: Term, acc: Set[String]): Set[String] = + t match { + case Term.Var(s) => acc + s + case Term.Concat(ts) => ts.foldLeft(acc)((acc, t) => go(t, acc)) + case Term.Alternative(ts) => ts.foldLeft(acc)((acc, t) => go(t, acc)) + case Term.Star(t) => go(t, acc) + case Term.Plus(t) => go(t, acc) + 
case Term.Question(t) => go(t, acc) + case Term.Range(t, _, _) => go(t, acc) + case Term.Suppress(t) => go(t, acc) + case Term.Capture(_, t) => go(t, acc) + case _ => acc + } + + go(term, Set.empty).toList + } + + private def termIdents(t: Term): Map[String, Set[Caret]] = + t match { + case Term.Var(name) => Map(name -> Set(t.pos)) + case Term.Concat(ts) => ts.toList.map(termIdents(_)).combineAll + case Term.Alternative(ts) => ts.toList.map(termIdents(_)).combineAll + case Term.Star(t) => termIdents(t) + case Term.Plus(t) => termIdents(t) + case Term.Question(t) => termIdents(t) + case Term.Suppress(t) => termIdents(t) + case Term.Capture(_, t) => termIdents(t) + case _ => Map.empty + } + + // strict dependencies are the variables occurring not in tail positions in sequences + def strictDependencies(t: Term): Map[String, Set[Caret]] = + t match { + case Term.Concat(NonEmptyList(t1, t2 :: ts)) => + strictDependencies(Term.Concat(NonEmptyList(t2, ts))).combine(termIdents(t1)) + case Term.Concat(NonEmptyList(t, Nil)) => strictDependencies(t) + case Term.Alternative(ts) => ts.toList.map(strictDependencies(_)).combineAll + case Term.Star(t) => strictDependencies(t) + case Term.Plus(t) => strictDependencies(t) + case Term.Question(t) => strictDependencies(t) + case Term.Suppress(t) => strictDependencies(t) + case Term.Capture(_, t) => strictDependencies(t) + case _ => Map.empty + } + + private type State[Res] = StateT[F, SCCState, Res] + + private def gets[Res](f: SCCState => Res): State[Res] = + StateT.inspect(f) + + private def getProps(id: String): State[Option[SCCProps]] = + StateT.inspect(_.props.get(id)) + + private def nop: State[Unit] = + StateT.empty + + private def modify(f: SCCState => SCCState): State[Unit] = + StateT.modify(f) + + private def update[Res](f: SCCState => (SCCState, Res)): State[Res] = + StateT.inspect(f).flatMap { case (st, res) => StateT.set(st).as(res) } + + private def raiseError[Res](t: Throwable): State[Res] = + nop.flatMapF(_ => 
t.raiseError) + + private def scc(declMap: Map[String, (Caret, Term)]): F[List[Set[String]]] = { + val state = SCCState(0, Nil, Map.empty, Nil) + + def process(v: String, t: Term): State[Unit] = + for { + // first push v on the stack and assign an index + vProps <- update { st => + val props = SCCProps(true, st.index, st.index) + (st.copy(index = st.index + 1, stack = v :: st.stack, props = st.props.updated(v, props)), props) + } + // then for each successor compute recursively + () <- successors(v, t).traverse_ { w => + getProps(w).flatMap { + case Some(wProps) => + // successor already processed + if (wProps.onStack) + // it is on stack, hence in the current SCC + modify(st => + st.copy(props = st.props.updated(v, vProps.copy(lowlink = vProps.lowlink.min(wProps.index))))) + else + // not on the stack, not in SCC + nop + case None => + // not processed yet, do it + declMap.get(w) match { + case Some((_, wt)) => + for { + () <- process(w, wt) + wProps <- gets(_.props(w)) + vProps <- gets(_.props(v)) + () <- modify(st => + st.copy(props = st.props.updated(v, vProps.copy(lowlink = vProps.lowlink.min(wProps.lowlink))))) + } yield () + case None => + raiseError[Unit]( + KleenexCompilerException(s"Unknown identifier $w in definition of $v at line ${t.pos.line + 1}")) + } + } + } + vProps <- gets(_.props(v)) + () <- + if (vProps.lowlink == vProps.index) + for { + stack <- gets(_.stack) + (component, stack1) = spanUntilIncluding(stack, v) + () <- modify { st => + st.copy( + // pop from stack + stack = stack1, + // update the components + components = component.toSet :: st.components, + // remove vertices in component from stack + props = component.foldLeft(st.props) { (props, w) => + props.updatedWith(w)(_.map(_.copy(onStack = false))) + } + ) + } + } yield () + else + nop + } yield () + + declMap.toList + // traverse each node (aka production identifier) + .traverse_ { case (id, (_, t)) => + getProps(id) + .flatMap { + case None => + // if no index has been assigned yet, 
process it + process(id, t) + case Some(_) => + // otherwise, just continue + nop + } + } + .runS(state) + .map(_.components) + } + + private def spanUntilIncluding(l: List[String], v: String): (List[String], List[String]) = { + @tailrec + def loop(l: List[String], acc: List[String]): (List[String], List[String]) = + l match { + case Nil => (l, Nil) + case `v` :: rest => ((v :: acc).reverse, rest) + case e :: rest => loop(rest, e :: acc) + } + loop(l, Nil) + } + +} + +case class SCCProps(onStack: Boolean, index: Int, lowlink: Int) +case class SCCState(index: Int, stack: List[String], props: Map[String, SCCProps], components: List[Set[String]]) diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala new file mode 100644 index 000000000..845754261 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala @@ -0,0 +1,48 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex + +case class Environment(stack: List[String], registers: Map[String, String]) { + + /** Appends the `s` on top of the stack. */ + def append(s: String): Option[Environment] = + stack match { + case r :: stack => Some(copy((r + s) :: stack)) + case Nil => None + } + + /** Pushes an empty value on top of the stack. 
*/ + def push: Environment = + copy(stack = "" :: stack) + + /** Pops the value on top of the stack and stores it in `reg`. */ + def pop(reg: String): Option[Environment] = + stack match { + case r :: stack => Some(copy(stack = stack, registers = registers.updated(reg, r))) + case Nil => None + } + + /** Appends the value in `reg` on top of the stack and empties the register. */ + def write(reg: String): Option[Environment] = + stack match { + case r :: stack => + val value = registers.getOrElse(reg, "") + Some(copy(stack = (r + value) :: stack, registers.updated(reg, ""))) + case Nil => None + } + +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala new file mode 100644 index 000000000..5a3db0d4e --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala @@ -0,0 +1,56 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2 +package data +package kleenex + +import cats.MonadError +import cats.syntax.all._ + +case class KleenexException(msg: String) extends Exception(msg) + +object Interpreter { + + def pipe[F[_]](implicit F: MonadError[F, Throwable]): Pipe[F, Either[String, Action], String] = { + (s: Stream[F, Either[String, Action]]) => + s + .evalScan(new Environment("" :: Nil, Map.empty)) { + case (env, Left(c)) => + env + .append(c) + .liftTo[F](KleenexException(s"cannot append on top of stack")) + case (env, Right(act)) => + act match { + case Action.Push => (env.push).pure[F] + case Action.Pop(reg) => + env + .pop(reg) + .liftTo[F](KleenexException(s"cannot pop to register $reg")) + case Action.Write(reg) => + env + .write(reg) + .liftTo[F](KleenexException(s"cannot write register $reg")) + } + } + .last + .evalMap { + case Some(Environment(s :: _, _)) => s.pure[F] + case _ => F.raiseError[String](KleenexException("cannot pop from empty stack")) + } + } + +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala new file mode 100644 index 000000000..eea1ecbb9 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala @@ -0,0 +1,312 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2 +package data +package kleenex + +import transducer.CharRanges + +import cats.ApplicativeError +import cats.data.NonEmptyList +import cats.parse.{Caret, LocationMap, Parser0, Parser => P} +import cats.syntax.all._ + +case class KleenexParseException(msg: String) extends Exception(msg) + +class KleenexParser[F[_]](implicit F: ApplicativeError[F, Throwable]) { + + def parse(content: String): F[Program] = + KleenexParser.program + .parseAll(content) + .leftMap { e => + val locations = LocationMap(content) + KleenexParseException(prettyprint(locations, e)) + } + .liftTo[F] + + private def description(x: P.Expectation): String = x match { + case P.Expectation.OneOfStr(_, List(str)) => + s"expected $str" + case P.Expectation.OneOfStr(_, strs) => + val strList = strs.map(x => s"'$x'").mkString(", ") + s"expected one of $strList" + case P.Expectation.InRange(_, lower, upper) => + if (lower == upper) s"expected '$lower'" + else s"expected '$lower' ~ '$upper'" + case P.Expectation.StartOfString(_) => + "expected beginning of file" + case P.Expectation.EndOfString(_, _) => + "expected end of file" + case P.Expectation.Length(_, expected, actual) => + s"unexpected eof; expected ${expected - actual} more characters" + case P.Expectation.ExpectedFailureAt(_, matched) => + s"unexpected '$matched'" + case P.Expectation.Fail(_) => + "failed to parse" + case P.Expectation.FailWith(_, message) => + message + case P.Expectation.WithContext(contextStr, _) => + s"expected $contextStr" + } + + private def prettyprint(locmap: LocationMap, x: P.Expectation): String = { + val (row, col) = locmap.toLineCol(x.offset).getOrElse((0, locmap.input.size)) + val (r, c) = (row + 1, col + 1) + val line = locmap.getLine(row).get + val offending = + s"${row.toString map { _ => ' ' }} | ${" " * col}^" + s""" + |$r:$c: error: ${description(x)} + |$r | $line + |$offending""".stripMargin + } + + private def prettyprint(locmap: LocationMap, x: P.Error): String = + 
x.expected.map(prettyprint(locmap, _)).toList.mkString("") +} + +object KleenexParser { + import P._ + + private[this] val whitespace: P[Unit] = oneOf(List(charIn(" \t\r\n"), string("//") ~ charsWhile(_ != '\n'))).void + private[this] val whitespaces0: Parser0[Unit] = whitespace.rep0.void + + private val regIdentStart = ('a' to 'z') + private val identStart = regIdentStart ++ ('A' to 'Z') + private val digit = ('0' to '9') + private val identChar = identStart ++ digit ++ List('-', '_') + private val hexDigit = digit ++ ('a' to 'f') ++ ('A' to 'F') + + private val ident: P[String] = + (peek(charIn(identStart)).with1 *> charsWhile(identChar.contains(_))) + .withContext("identifier") <* whitespaces0 + + private val regIdent: P[String] = + (peek(charIn(regIdentStart)).with1 *> charsWhile(identChar.contains(_))) + .withContext("register identifier (must start with lower case)") <* whitespaces0 + + private val str: P[String] = + oneOf( + List( + charsWhile(!"\\\"".contains(_)).string, + char('\\') *> oneOf(List( + char('"').as("\""), + char('\\').as("\\"), + char('r').as("\r"), + char('n').as("\n"), + char('t').as("\t"), + char('f').as("\f"), + char('x') *> charIn(hexDigit) + .rep(min = 2, max = 2) + .string + .map(codepoint => Character.toString(Integer.parseInt(codepoint, 16))), + char('u') *> charIn(hexDigit) + .rep(min = 4, max = 4) + .string + .map(codepoint => Character.toString(Integer.parseInt(codepoint, 16))) + )) + ) + ).rep0.map(_.combineAll).with1.surroundedBy(char('"')) <* whitespaces0 + + private val integer: P[Int] = + charIn(digit).rep.string.mapFilter(_.toIntOption).withContext("positive integer") + + def keyword(kw: String): P[Unit] = + string(kw) <* whitespaces0 + + private val range: P[(Int, Option[Int])] = + char('{') *> oneOf( + List( + char(',') *> integer.map(max => (0, Some(max))), + (integer ~ (char(',') *> integer.?).?).map { + case (min, None) => (min, Some(min)) + case (min, Some(None)) => (min, None) + case (min, Some(Some(max))) => (min, 
Some(max)) + } + )) <* char('}') + + val regex: P[Regex] = P.recursive[Regex] { regex => + val setChar = oneOf( + List( + charWhere(!"-]\\".contains(_)), + char('\\') *> oneOf(List( + char('\\').as('\\'), + char('/').as('/'), + char('-').as('-'), + char(']').as(']'), + char('[').as('['), + char('r').as('\r'), + char('n').as('\n'), + char('t').as('\t'), + char('f').as('\f') + )) + )) + val set = char('[') *> (char('^').as(false).?.map(_.getOrElse(true)) ~ oneOf(List( + char('-').as(('-', '-') :: Nil), + (setChar ~ (char('-') *> setChar.?).?).map { + case (fst, Some(Some(snd))) => (fst, snd) :: Nil + case (fst, Some(None)) => (fst, fst) :: ('-', '-') :: Nil + case (fst, None) => (fst, fst) :: Nil + } + )).rep0.map(_.flatten)).map { + case (false, Nil) => CharRanges.all + case (true, Nil) => CharRanges.empty + case (true, r :: Nil) => CharRanges.range(r) + case (false, r :: Nil) => CharRanges.range(r).invert + case (true, r1 :: r2 :: rs) => CharRanges.ranges(r1, r2, rs: _*) + case (false, r1 :: r2 :: rs) => CharRanges.ranges(r1, r2, rs: _*).invert + } <* char(']') + + val atom = + oneOf( + List( + char('.').as(Regex.Any), + set.map(Regex.Set(_)), + oneOf( + List( + charWhere(!"\\/?*+|{[().".contains(_)).string, + char('\\') *> oneOf(List( + char('/').as("/"), + char('\\').as("\\"), + char('r').as("\r"), + char('n').as("\n"), + char('t').as("\t"), + char('f').as("\f"), + char('?').as("?"), + char('*').as("*"), + char('+').as("+"), + char('|').as("|"), + char('{').as("{"), + char('[').as("["), + char('(').as("("), + char(')').as(")"), + char('.').as("."), + char('u') *> charIn(hexDigit) + .rep(min = 4, max = 4) + .string + .map(codepoint => Character.toString(Integer.parseInt(codepoint, 16))) + )) + ) + ).map(Regex.Str(_)), + regex.between(char('('), char(')')) + )) + + val greedy = char('?').?.map(_.isEmpty) + val suffixed = + (atom ~ oneOf( + List( + char('?') *> greedy.map(greedy => Regex.Question(_, greedy)), + char('+') *> greedy.map(greedy => Regex.Plus(_, 
greedy)), + char('*') *> greedy.map(greedy => Regex.Star(_, greedy)), + range.map { case (min, max) => Regex.Range(_, min, max) } + )).?) + .map { + case (atom, None) => atom + case (atom, Some(mod)) => mod(atom) + } + + def aggregateStr(seq: NonEmptyList[Regex]): NonEmptyList[Regex] = { + def loop(seq: NonEmptyList[Regex]): NonEmptyList[Regex] = + seq match { + case NonEmptyList(Regex.Str(s1), Regex.Str(s2) :: rest) => loop(NonEmptyList(Regex.Str(s1 + s2), rest)) + case NonEmptyList(re, r :: rest) => re :: loop(NonEmptyList(r, rest)) + case NonEmptyList(_, Nil) => seq + } + loop(seq) + } + + val seq = + suffixed.rep + .map(aggregateStr(_)) + .map { + case NonEmptyList(atom, Nil) => atom + case seq => Regex.Concat(seq) + } + + seq.repSep(keyword("|")).map { + case NonEmptyList(seq, Nil) => seq + case alts => Regex.Or(alts) + } + } + + private val registerUpdate: P[Term] = + (caret.with1 ~ regIdent ~ oneOf(List(keyword("<-").as(false), keyword("+=").as(true))) ~ oneOf( + List(str.map(RegOrStr.Str(_)), regIdent.map(RegOrStr.Reg(_)))).rep) + .map { case (((caret, reg), prepend), value) => + Term.UpdateReg(reg, if (prepend) RegOrStr.Reg(reg) :: value else value)(caret) + } + + val term: P[Term] = recursive[Term] { term => + val atom: P[Term] = + oneOf( + List( + caret.map(Term.One()).with1 <* keyword("1"), + (caret.with1 ~ str).map { case (pos, s) => Term.Str(s)(pos) }, + ((caret.with1 ~ ident).map { case (pos, v) => Term.Var(v)(pos) } <* !oneOf( + List(keyword(":="), keyword("@")))).backtrack, + (caret.with1 ~ (char('/') *> regex <* char('/'))).map { case (pos, re) => Term.RE(re)(pos) } <* whitespaces0, + (caret.with1 ~ (char('!') *> regIdent)).map { case (pos, reg) => Term.Output(reg)(pos) }, + registerUpdate.between(keyword("["), keyword("]")), + term.between(keyword("("), keyword(")")) + )) + + val suffixed: P[Term] = + (atom ~ oneOf[Term => Term](List( + (caret.with1 <* keyword("*")).map(pos => Term.Star(_)(pos)), + (caret.with1 <* keyword("+")).map(pos => 
Term.Plus(_)(pos)), + (caret.with1 <* keyword("?")).map(pos => Term.Question(_)(pos)), + (caret.with1 ~ range).map { case (pos, (min, max)) => Term.Range(_: Term, min, max)(pos) } <* whitespaces0 + )).?) + .map { + case (inner, None) => inner + case (inner, Some(mod)) => mod(inner) + } + + val prefixed: P[Term] = + oneOf( + List( + (caret.with1 ~ (keyword("~") *> suffixed)).map { case (pos, t) => Term.Suppress(t)(pos) }, + (caret.with1 ~ (regIdent <* keyword("@")).backtrack ~ suffixed).map { case ((pos, reg), t) => + Term.Capture(reg, t)(pos) + }, + suffixed + )) + + val seq: P[Term] = + prefixed.rep.map { + case NonEmptyList(atom, Nil) => atom + case seq => Term.Concat(seq) + } + + seq.repSep(keyword("|")).map { + case NonEmptyList(seq, Nil) => seq + case alts => Term.Alternative(alts) + } + } + + val production: P[Production] = + (caret.with1 ~ (ident <* keyword(":=")) ~ term).map { case ((pos, id), t) => Production(id, t)(pos) } + + private val pipeline: Parser0[(Caret, NonEmptyList[String])] = + caret ~ oneOf0(List(keyword("start:") *> ident.repSep(keyword(">>")), pure(NonEmptyList.one("main")))) + + val program: P[Program] = + (pipeline.with1 ~ production.rep) + .map { case ((pos, pipe), rules) => Program(pipe, rules)(pos) } + .surroundedBy(whitespaces0) + +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala new file mode 100644 index 000000000..dbf2cf7cf --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala @@ -0,0 +1,34 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex + +import fs2.data.transducer.CharRanges + +import cats.data.NonEmptyList + +sealed trait Regex +object Regex { + case object Any extends Regex + case class Str(s: String) extends Regex + case class Concat(subs: NonEmptyList[Regex]) extends Regex + case class Or(alts: NonEmptyList[Regex]) extends Regex + case class Star(inner: Regex, greedy: Boolean) extends Regex + case class Plus(inner: Regex, greedy: Boolean) extends Regex + case class Question(inner: Regex, greedy: Boolean) extends Regex + case class Range(inner: Regex, fst: Int, snd: Option[Int]) extends Regex + case class Set(ranges: CharRanges) extends Regex +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala new file mode 100644 index 000000000..336f912a3 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala @@ -0,0 +1,55 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.kleenex + +import cats.data.NonEmptyList + +import cats.parse.Caret + +case class Program(pipeline: NonEmptyList[String], productions: NonEmptyList[Production])(val pos: Caret) + +case class Production(name: String, term: Term)(val pos: Caret) + +sealed trait Term { + val pos: Caret +} +object Term { + case class One()(val pos: Caret) extends Term + case class Str(s: String)(val pos: Caret) extends Term + case class Var(name: String)(val pos: Caret) extends Term + case class Capture(reg: String, inner: Term)(val pos: Caret) extends Term + case class Output(reg: String)(val pos: Caret) extends Term + case class UpdateReg(reg: String, value: NonEmptyList[RegOrStr])(val pos: Caret) extends Term + case class Alternative(cases: NonEmptyList[Term]) extends Term { + val pos: Caret = cases.head.pos + } + case class Concat(terms: NonEmptyList[Term]) extends Term { + val pos: Caret = terms.head.pos + } + case class RE(re: Regex)(val pos: Caret) extends Term + case class Suppress(inner: Term)(val pos: Caret) extends Term + case class Star(inner: Term)(val pos: Caret) extends Term + case class Plus(inner: Term)(val pos: Caret) extends Term + case class Question(inner: Term)(val pos: Caret) extends Term + case class Range(inner: Term, min: Int, max: Option[Int])(val pos: Caret) extends Term +} + +sealed trait RegOrStr +object RegOrStr { + case class Reg(name: String) extends RegOrStr + case class Str(s: String) extends RegOrStr +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala new file mode 100644 index 000000000..bf896f50d --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala @@ -0,0 +1,302 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex.core + +import fs2.data.kleenex.{Action, Production, Program => KProgram, Regex, RegOrStr, Term => KTerm} +import fs2.data.transducer.CharRanges + +import cats.MonadError +import cats.data.StateT +import cats.syntax.all._ +import cats.data.NonEmptyList + +case class KleenexCompilerException(msg: String) extends Exception(msg) + +case class CompilerState(idents: Map[(String, Boolean), Int], + decls: Map[Int, Term], + revDecls: Map[Term, Int], + fresh: Int) + +class Compiler[F[_]](implicit F: MonadError[F, Throwable]) { + + private type State[Res] = StateT[F, CompilerState, Res] + + /** Compiles a kleenex program into the core language representation. 
*/ + def compile(prog: KProgram): F[Program] = { + // associate each production to 2 ids: + // - one when it outputs element + // - one when it outputs *no* elements + val idents = prog.productions + .flatMap { case Production(name, term) => + NonEmptyList.of((name, true), (name, false)) + } + .zipWithIndex + .toList + .toMap + val fresh = idents.size + val state = CompilerState(idents, Map.empty, Map.empty, fresh) + + val checkPipeline = + prog.pipeline.traverse(name => + idents.get(name -> true) match { + case Some(id) => id.pure[F] + case None => new KleenexCompilerException(s"Unknown production $name in pipeline").raiseError[F, Int] + }) + + val compiledProductions = + prog.productions + .traverse_ { case Production(name, term) => + for { + idout <- lookup(name, true) + idnoout <- lookup(name, false) + compiledout <- compile(true, term) + compilednoout <- compile(false, term) + _ <- insertDecl(idout, Term.Seq(List(compiledout))) + _ <- insertDecl(idnoout, Term.Seq(List(compilednoout))) + } yield () + } + + (checkPipeline, compiledProductions.runS(state)) + .mapN { (pipeline, state) => + val reached = reachable(pipeline.toList, state.decls) + compress(Program(pipeline, state.decls.view.filterKeys(reached.contains(_)).toMap)) + } + } + + def compile(re: Regex): F[Program] = { + compile(true, re) + .run(CompilerState(Map.empty, Map.empty, Map.empty, 0)) + .map { case (st, id) => Program(NonEmptyList.one(id), st.decls) } + } + + private def compile(output: Boolean, re: Regex): State[Int] = + re match { + case Regex.Any => + declare(Term.Read(CharRanges.all, output)) + case Regex.Str(str) => + str.toList + .traverse(c => declare(Term.Read(CharRanges.char(c), output))) + .flatMap(ids => declare(Term.Seq(ids))) + case Regex.Concat(res) => + res.traverse(compile(output, _)).flatMap(ids => declare(Term.Seq(ids.toList))) + case Regex.Or(alts) => + alts.traverse(compile(output, _)).flatMap(ids => declare(Term.Alternative(ids))) + case Regex.Plus(re, greedy) => + 
compile(output, re).flatMap(plus(_, greedy)) + case Regex.Star(re, greedy) => + compile(output, re).flatMap(star(_, greedy)) + case Regex.Question(re, greedy) => + compile(output, re).flatMap(question(_, greedy)) + case Regex.Set(chars) => + declare(Term.Read(chars, output)) + case Regex.Range(re, min, max) => + compile(output, re).flatMap(range(_, min, max)) + } + + private def compile(output: Boolean, term: KTerm): State[Int] = + term match { + case KTerm.One() => + declare(Term.epsilon) + case KTerm.Str(s) => + val toOuptut = if (output) s else "" + declare(Term.Const(Left(toOuptut))) + case KTerm.Var(v) => + lookup(v, output) + case KTerm.Capture(reg, t) => + for { + idt <- compile(output, t) + idpush <- declare(Term.Const(Right(Action.Push))) + idpop <- declare(Term.Const(Right(Action.Pop(reg)))) + id <- declare(Term.Seq(List(idpush, idt, idpop))) + } yield id + case KTerm.Output(reg) => + declare(Term.Const(Right(Action.Write(reg)))) + case KTerm.UpdateReg(reg, value) => + for { + idpush <- declare(Term.Const(Right(Action.Push))) + idsval <- value.map(updateSym(_)).traverse(c => declare(Term.Const(c))) + idpop <- declare(Term.Const(Right(Action.Pop(reg)))) + id <- declare(Term.Seq((idpush :: idsval).toList :+ idpop)) + } yield id + case KTerm.Alternative(cases) => + flattenAlternatives(cases) + .traverse(compile(output, _)) + .flatMap(ids => declare(Term.Alternative(ids))) + case KTerm.Concat(ts) => + flattenSequences(ts) + .traverse(compile(output, _)) + .flatMap(ids => declare(Term.Seq(ids))) + case KTerm.RE(re) => + compile(output, re) + case KTerm.Suppress(t) => + compile(false, t) + case KTerm.Star(t) => + compile(output, t).flatMap(star(_, true)) + case KTerm.Plus(t) => + compile(output, t).flatMap(plus(_, true)) + case KTerm.Question(t) => + compile(output, t).flatMap(question(_, true)) + case KTerm.Range(t, min, max) => + compile(output, t).flatMap(range(_, min, max)) + } + + // r* = r1 | 1 + // r1 = r r* + // r*? = 1 | r2 + // r2 = r r*? 
+ private def star(idt: Int, greedy: Boolean): State[Int] = + for { + ideps <- declare(Term.epsilon) + id <- freshId + idloop <- declare(Term.Seq(List(idt, id))) + id <- insertDecl(id, + Term.Alternative( + // favor more over less + if (greedy) NonEmptyList.of(idloop, ideps) + // favor less over more + else NonEmptyList.of(ideps, idloop) + )) + } yield id + + // r+ = r r* + // r+? = r r*? + private def plus(idt: Int, greedy: Boolean): State[Int] = + for { + idstar <- star(idt, greedy) + id <- declare(Term.Seq(List(idt, idstar))) + } yield id + + // r? = r | 1 + // r?? = 1 | r + private def question(idt: Int, greedy: Boolean): State[Int] = + for { + ideps <- declare(Term.epsilon) + id <- declare( + Term.Alternative( + // favor one over zero + if (greedy) NonEmptyList.of(idt, ideps) + // favor zero over one + else NonEmptyList.of(ideps, idt))) + } yield id + + private def range(idt: Int, min: Int, max: Option[Int]): State[Int] = + max match { + case Some(max) if min == max => + declare(Term.Seq(List.fill(min)(idt))) + case Some(max) => + question(idt, true).flatMap(idq => declare(Term.Seq(List.fill(min)(idt) ++ List.fill(max - min)(idq)))) + case None => + star(idt, true).flatMap(idstar => declare(Term.Seq(List.fill(min)(idt) ++ List(idstar)))) + } + + private def updateSym(sym: RegOrStr): Either[String, Action] = + sym match { + case RegOrStr.Reg(reg) => Right(Action.Write(reg)) + case RegOrStr.Str(s) => Left(s) + } + + private def flattenAlternatives(alts: NonEmptyList[KTerm]): NonEmptyList[KTerm] = + alts match { + case NonEmptyList(KTerm.Alternative(alts), a :: rest) => + flattenAlternatives(alts).concatNel(flattenAlternatives(NonEmptyList(a, rest))) + case NonEmptyList(KTerm.Alternative(alts), Nil) => + flattenAlternatives(alts) + case NonEmptyList(t, a :: rest) => + t :: flattenAlternatives(NonEmptyList(a, rest)) + case NonEmptyList(t, Nil) => + NonEmptyList.one(t) + } + + private def flattenSequences(ts: NonEmptyList[KTerm]): List[KTerm] = + ts match { + case 
NonEmptyList(KTerm.Concat(ts), t :: rest) => flattenSequences(ts) ++ flattenSequences(NonEmptyList(t, rest)) + case NonEmptyList(t, h :: rest) => t :: flattenSequences(NonEmptyList(h, rest)) + case NonEmptyList(KTerm.Concat(ts), Nil) => flattenSequences(ts) + case NonEmptyList(t, Nil) => List(t) + } + + private def get: State[CompilerState] = + StateT.get + + private def modify(f: CompilerState => CompilerState): State[Unit] = + StateT.modify(f) + + private def freshId: State[Int] = + get.map(_.fresh) <* modify(s => s.copy(fresh = s.fresh + 1)) + + private def insertDecl(id: Int, term: Term): State[Int] = + modify(st => st.copy(decls = st.decls.updated(id, term), revDecls = st.revDecls.updated(term, id))).as(id) + + private def lookup(id: String, output: Boolean): State[Int] = + get.map(_.idents.get((id, output))).flatMapF { + case Some(id) => id.pure[F] + case None => KleenexCompilerException(s"Unknown non terminal identifier $id").raiseError[F, Int] + } + + private def declare(term: Term): State[Int] = + get.map(_.revDecls.get(term)).flatMap { + case Some(id) => id.pure[State] + case None => freshId.flatMap(insertDecl(_, term)) + } + + private def reachable(from: List[Int], decls: Map[Int, Term]): Set[Int] = { + def referenced(t: Term): List[Int] = + t match { + case Term.Seq(ids) => ids + case Term.Alternative(ids) => ids.toList + case _ => Nil + } + + def loop(from: List[Int], acc: Set[Int]): Set[Int] = + from match { + case id :: from => + if (acc.contains(id)) + loop(from, acc) + else + loop(decls.get(id).map(referenced(_)).getOrElse(Nil) reverse_::: from, acc + id) + case Nil => + acc + } + loop(from, Set.empty) + } + + private def compress(prog: Program): Program = { + def alias(aliases: Map[Int, Int], id: Int): Map[Int, Int] = + prog.decls.get(id) match { + case Some(Term.Seq(List(idt))) => + val aliases1 = alias(aliases, idt) + aliases1.updated(id, aliases1.getOrElse(idt, idt)) + case _ => + aliases + } + val aliases = 
prog.decls.keys.foldLeft(Map.empty[Int, Int])(alias(_, _)) + + def replace(t: Term): Term = + t match { + case Term.Alternative(ts) => Term.Alternative(ts.map(id => aliases.getOrElse(id, id))) + case Term.Seq(ts) => Term.Seq(ts.map(id => aliases.getOrElse(id, id))) + case _ => t + } + + if (aliases.isEmpty) + prog + else + Program(prog.pipeline.map(id => aliases.getOrElse(id, id)), + prog.decls.view.filterKeys(!aliases.contains(_)).mapValues(replace(_)).toMap) + } + +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala new file mode 100644 index 000000000..34ef7df5b --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala @@ -0,0 +1,59 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data +package kleenex +package core + +import transducer.CharRanges + +import cats.Show +import cats.data.NonEmptyList +import cats.syntax.all._ + +case class Program(pipeline: NonEmptyList[Int], decls: Map[Int, Term]) +object Program { + implicit val show: Show[Program] = Show.show { case Program(pipeline, decls) => + s"""start: ${pipeline.mkString_(" >> ")} + | + |${decls.toList.sortBy(_._1).map { case (k, v) => show"$k -> $v" }.mkString_("\n")}""".stripMargin + } +} + +sealed trait Term +object Term { + case class Const(strOrReg: Either[String, Action]) extends Term + case class Read(ranges: CharRanges, output: Boolean) extends Term + case class Seq(idents: List[Int]) extends Term + case class Alternative(idents: NonEmptyList[Int]) extends Term + + def epsilon: Term = Seq(Nil) + + implicit val show: Show[Term] = Show.show { + case Const(Left(s)) => + s""""$s"""" + case Const(Right(a)) => + a.show + case Read(rs, true) => + rs.show + case Read(rs, false) => + show"~$rs" + case Seq(ids) => + ids.mkString_(" -> ") + case Alternative(alts) => + alts.mkString_(" | ") + } +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala new file mode 100644 index 000000000..0217e650d --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala @@ -0,0 +1,100 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data +package kleenex +package core + +import fst._ +import transducer.CharRanges + +import cats.data.NonEmptyList +import cats.MonadError +import cats.syntax.all._ + +class TransducerCompiler[F[_]](implicit F: MonadError[F, Throwable]) { + + private type Q = List[Int] + private type E = + Edge[List[Int], CharRanges, CopyFunc[Char, List[Either[String, Action]]], List[Either[String, Action]]] + + def build(prog: Program): F[NonEmptyList[Transducer[Int, Char, Either[String, Action]]]] = + prog.pipeline.traverse { ident => + construct(prog, ident).map(_.enumerateStates) + } + + private def construct(prog: Program, ident: Int): F[Transducer[Q, Char, Either[String, Action]]] = { + def decl(id: Int): F[Term] = + prog.decls.get(id) match { + case Some(t) => t.pure[F] + case None => F.raiseError(KleenexCompilerException(s"Unknown identifier $id")) + } + // Optimization: Reduce number of generated states by contracting + // non-deterministic edges with no output. This is done by "skipping" states + // whose head nonterminal is declared to be a Seq term, or an RSum with only + // one successor. 
+ def follow(qs: Q): F[Q] = + qs match { + case Nil => List.empty.pure[F] + case q :: qs1 => + decl(q).flatMap { + case Term.Seq(rs) => follow(rs ++ qs1) + case Term.Alternative(NonEmptyList(r, Nil)) => follow(r :: qs1) + case _ => qs.pure[F] + } + } + + def go(workingSet: List[Q], states: Set[Q], transitions: List[E]): F[(Set[Q], List[E])] = + workingSet match { + case Nil => + (states, transitions).pure[F] + case q :: rest if states.contains(q) => + go(rest, states, transitions) + case Nil :: rest => + go(rest, states + Nil, transitions) + case (h @ q :: qs) :: rest => + val states1 = states + h + decl(q).flatMap { + case Term.Const(out) => + follow(qs).flatMap { q1 => + go(q1 :: rest, states1, (h, Right(List(out)), q1) :: transitions) + } + case Term.Read(pred, false) => + follow(qs).flatMap { q1 => + go(q1 :: rest, states1, (h, Left((pred, CopyFunc.CopyConst(Nil))), q1) :: transitions) + } + case Term.Read(pred, true) => + follow(qs).flatMap { q1 => + go(q1 :: rest, states1, (h, Left((pred, CopyFunc.CopyArg)), q1) :: transitions) + } + case Term.Seq(rs) => + follow(rs ++ qs).flatMap { q1 => + go(q1 :: rest, states1, (h, Right(Nil), q1) :: transitions) + } + case Term.Alternative(rs) => + rs.toList.traverse(r => follow(r :: qs)).flatMap { qs1 => + val trans = qs1.map(q1 => (h, Right(Nil), q1)) + go(qs1 reverse_::: rest, states1, trans ++ transitions) + } + } + } + + go(List(List(ident)), Set.empty, Nil).map { case (states, transitions) => + new FST(List(ident), states, OrderedEdgeSet.fromList(transitions), Set(Nil)) + } + } + +} diff --git a/kleenex/shared/src/test/resources/kleenex/highlighter.kex b/kleenex/shared/src/test/resources/kleenex/highlighter.kex new file mode 100644 index 000000000..645c0b0c0 --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/highlighter.kex @@ -0,0 +1,36 @@ +main := ( escape | comment | term | symbol | ignored | ws * )* + +term := black /~/ (constant | match | ident) end + | (teal constant | yellow match | blue ident) end 
+ +ignored := /[\]()|{},:[]/ + +ident := (letter | /[0-9_]/)+ + +symbol := yellow /<-|\+=|:=|>>|\*|\?|\+/ end + +constant := /"/ ( /\\./ | /[^\\"]/ )* /"/ + +comment := black ( /\/\/[^\n]*\n/ | /\/\*[^*\/]*\*\// ) end + +match := /\// ( /[^\/\n]/ | /\\./ )+ /\// + +escape := /\\\\/ + | blue /\\x[0-9a-fA-F]{2}/ end + | /\\[tnr]/ + +sp := / /* + +letter := /[a-zA-Z]/ + +word := letter+ + +ws := /[\t\r\n ]/ + +red := "\x1b[31m" +green := "\x1b[32m" +yellow:= "\x1b[33m" +blue := "\x1b[34m" +end := "\x1b[39;49m" +black := "\x1b[30m" +teal := "\x1b[36m" diff --git a/kleenex/shared/src/test/resources/kleenex/ini2json.kex b/kleenex/shared/src/test/resources/kleenex/ini2json.kex new file mode 100644 index 000000000..01ccfe83f --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/ini2json.kex @@ -0,0 +1,19 @@ +start: stripini >> ini2json +// Strips the comments +stripini := (~comment | ~blank | /[^\n]*\n/)* +comment := ws /;[^\n]*/ +blank := ws /\n/ +// Convert the stripped file +ini2json := "{\n" sections "}\n" +sections := (section "," /\n/)* section /\n/ +section := + ind "\"" header "\": {\n" (~/\n/ keyvalues)? ind "}" +header := ~ws ~/\[/ /[^\n\]]*/ ~/]/ ~ws +keyvalue := ind ind key ": " ~/=/ value +keyvalues := (keyvalue "," /\n/)* keyvalue "\n" +key := ~ws "\"" /[^; \t=\[\n]*/ "\"" ~ws +value := ~ws /"[^\n]*"/ ~ws +| ~ws "\"" escapedValue "\"" ~ws +escapedValue := (~/\\/ "\\\\" | ~/"/ "\\\"" | /[^\n]/)* +ws := /[ \t]*/ +ind := " " diff --git a/kleenex/shared/src/test/resources/kleenex/logrewrite.kex b/kleenex/shared/src/test/resources/kleenex/logrewrite.kex new file mode 100644 index 000000000..c403d8287 --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/logrewrite.kex @@ -0,0 +1,23 @@ +main := "[" loglines? 
"]\n" + +loglines := (logline "," /\n/)* logline /\n/ +logline := "{" host ~sep ~userid ~sep ~authuser sep timestamp sep + request sep code sep bytes sep referer sep useragent "}" + +host := "\"host\":\"" ip "\"" +userid := "\"user\":\"" rfc1413 "\"" +authuser := "\"authuser\":\"" /[^ \n]+/ "\"" +timestamp := "\"date\":\"" ~/\[/ /[^\n\]]+/ ~/]/ "\"" +request := "\"request\":" quotedString +code := "\"status\":\"" integer "\"" +bytes := "\"size\":\"" (integer | /-/) "\"" +referer := "\"url\":" quotedString +useragent := "\"agent\":" quotedString + +ws := /[\t ]+/ +sep := "," ~ws + +quotedString := /"([^"\n]|\\")*"/ +integer := /[0-9]+/ +ip := integer (/\./ integer){3} +rfc1413 := /-/ diff --git a/kleenex/shared/src/test/resources/kleenex/mitm.kex b/kleenex/shared/src/test/resources/kleenex/mitm.kex new file mode 100644 index 000000000..2dfd4a46c --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/mitm.kex @@ -0,0 +1,9 @@ +main := /
/ main + | /./ main + | "" + +url := q? /[^"’ >]/* q? +q := ~/"|’/ +addq := "\"" +sp := / /* +evil := addq "http://evil.com/?url=" !orig addq diff --git a/kleenex/shared/src/test/resources/kleenex/recursive.kex b/kleenex/shared/src/test/resources/kleenex/recursive.kex new file mode 100644 index 000000000..b64673f0a --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/recursive.kex @@ -0,0 +1,3 @@ +main := as | bs +as := "a" bs | 1 +bs := "b" as | 1 diff --git a/kleenex/shared/src/test/resources/kleenex/simple.kex b/kleenex/shared/src/test/resources/kleenex/simple.kex new file mode 100644 index 000000000..9433451aa --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/simple.kex @@ -0,0 +1 @@ +main := id @ /[a-z][a-z0-9]*/ !id !id diff --git a/kleenex/shared/src/test/resources/kleenex/test.kex b/kleenex/shared/src/test/resources/kleenex/test.kex new file mode 100644 index 000000000..08ebba3eb --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/test.kex @@ -0,0 +1,38 @@ +// A Kleenex program starts with what we call a pipeline declaration. +// This one can be understood: First remove the comments, +// then gather the numbers at the bottom. +start: remComments >> gatherNumbers + +// If no pipeline is specified, "main" is picked +// as the starting point. +// The most basic Kleenex term is matching. It matches +// the input against a regular expression, outputting it directly. +line := /[^\n]*\n/ +// Often you don’t want all the input turned into output. +// The ~ operator lets suppress the output otherwise produced +// by a term, in this case removing lines that start with "#", +// and preserving ones that don’t. +// When there’s ambiguity, the leftmost choice is always chosen. +commentLine := ~(/#/ line) | line +// Recursion is allowed, but only in tail position. Here we +// terminate the recursion with "1", which consumes nothing and +// always succeeds. 
+remComments := commentLine remComments | 1 + +// We also allow regex operators like *, + and ? on terms: +thousandSepLines := (thousandSep /\n/ | line)* + +// It’s possible to output text without matching by using "...". +// In this case, we use it to insert thousands separators into a number. +thousandSep := digit{1,3} ("," digit{3})* /\n/ +digit := /[0-9]/ + +// We also allow for more complicated operations. We call these ’actions’. +// reg@term runs the term as normal, but all output it would produce is +// stored in the register named reg. +// [ ... += ... ] allows you to append things to a register, both contents +// of other registers, as well as string constants. +// !reg outputs the contents of a register. +gatherNumbers := + (num@thousandSep [ numbers += num ] | line)* + !numbers