diff --git a/build.sbt b/build.sbt index 5896a386f..782b19d80 100644 --- a/build.sbt +++ b/build.sbt @@ -8,6 +8,7 @@ val shapeless2Version = "2.3.7" val shapeless3Version = "3.0.3" val scalaJavaTimeVersion = "2.3.0" val diffsonVersion = "4.1.1" +val weaverVersion = "0.7.9" val commonSettings = List( scalaVersion := scala213, @@ -56,13 +57,14 @@ val commonSettings = List( libraryDependencies ++= List( "co.fs2" %%% "fs2-core" % fs2Version, "org.scala-lang.modules" %%% "scala-collection-compat" % "2.6.0", - "io.circe" %%% "circe-parser" % circeVersion % "test", - "co.fs2" %% "fs2-io" % fs2Version % "test", - "com.disneystreaming" %%% "weaver-cats" % "0.7.9" % "test", - "com.disneystreaming" %%% "weaver-cats-core" % "0.7.9" % "test", - "com.disneystreaming" %%% "weaver-core" % "0.7.9" % "test", - "com.disneystreaming" %%% "weaver-framework" % "0.7.9" % "test", - "com.eed3si9n.expecty" %%% "expecty" % "0.15.4" % "test", + "io.circe" %%% "circe-parser" % circeVersion % Test, + "co.fs2" %% "fs2-io" % fs2Version % Test, + "com.disneystreaming" %%% "weaver-cats" % weaverVersion % Test, + "com.disneystreaming" %%% "weaver-cats-core" % weaverVersion % Test, + "com.disneystreaming" %%% "weaver-core" % weaverVersion % Test, + "com.disneystreaming" %%% "weaver-framework" % weaverVersion % Test, + "com.disneystreaming" %% "weaver-scalacheck" % weaverVersion % Test, + "com.eed3si9n.expecty" %%% "expecty" % "0.15.4" % Test, "org.portable-scala" %%% "portable-scala-reflect" % "1.1.1" cross CrossVersion.for3Use2_13 ) ++ PartialFunction .condOpt(CrossVersion.partialVersion(scalaVersion.value)) { case Some((2, _)) => @@ -116,7 +118,9 @@ val root = (project in file(".")) jsonDiffson.js, jsonPlay.js, text.js, - xml.js), + xml.js, + transducers.js, + kleenex.js), ScalaUnidoc / siteSubdirName := "api", addMappingsToSiteDir(ScalaUnidoc / packageDoc / mappings, ScalaUnidoc / siteSubdirName), Nanoc / sourceDirectory := file("site"), @@ -140,7 +144,11 @@ val root = (project in file(".")) 
xml.jvm, xml.js, cbor.jvm, - cbor.js + cbor.js, + transducers.jvm, + transducers.js, + kleenex.jvm, + kleenex.js ) lazy val text = crossProject(JVMPlatform, JSPlatform) @@ -225,7 +233,7 @@ lazy val jsonCirce = crossProject(JVMPlatform, JSPlatform) description := "Streaming JSON library with support for circe ASTs", libraryDependencies ++= List( "io.circe" %%% "circe-core" % circeVersion, - "org.gnieh" %%% "diffson-circe" % diffsonVersion % "test" + "org.gnieh" %%% "diffson-circe" % diffsonVersion % Test ) ) .dependsOn(json % "compile->compile;test->test", jsonDiffson % "test->test") @@ -241,7 +249,7 @@ lazy val jsonPlay = crossProject(JVMPlatform, JSPlatform) crossScalaVersions := Seq(scala212, scala213), libraryDependencies ++= List( "com.typesafe.play" %%% "play-json" % playVersion, - "org.gnieh" %%% "diffson-play-json" % diffsonVersion % "test" + "org.gnieh" %%% "diffson-play-json" % diffsonVersion % Test ) ) .dependsOn(json % "compile->compile;test->test", jsonDiffson % "test->test") @@ -304,6 +312,28 @@ lazy val cbor = crossProject(JVMPlatform, JSPlatform) .flatten ) +lazy val transducers = crossProject(JVMPlatform, JSPlatform) + .crossType(CrossType.Full) + .in(file("transducers")) + .settings(commonSettings) + .settings(publishSettings) + .settings( + name := "fs2-data-transducers", + description := "Streaming transducers library" + ) + +lazy val kleenex = crossProject(JVMPlatform, JSPlatform) + .crossType(CrossType.Full) + .in(file("kleenex")) + .settings(commonSettings) + .settings(publishSettings) + .settings( + name := "fs2-data-kleenex", + description := "Streaming text processing library", + libraryDependencies += "org.typelevel" %%% "cats-parse" % "0.3.6" + ) + .dependsOn(text, transducers) + lazy val documentation = project .in(file("documentation")) .enablePlugins(MdocPlugin) diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala new file mode 100644 index 
000000000..057075a09 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala @@ -0,0 +1,32 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex + +import cats.Show + +sealed trait Action +object Action { + case object Push extends Action + case class Pop(reg: String) extends Action + case class Write(reg: String) extends Action + + implicit val show: Show[Action] = Show.show { + case Push => "push" + case Pop(r) => s"pop $$$r" + case Write(r) => s"write $$$r" + } +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala new file mode 100644 index 000000000..530a59371 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala @@ -0,0 +1,216 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.kleenex + +import cats.data.NonEmptyList +import cats.parse.Caret +import cats.syntax.all._ +import cats.data.StateT +import cats.MonadError +import fs2.data.kleenex.core.KleenexCompilerException +import scala.annotation.tailrec + +class Checker[F[_]](implicit F: MonadError[F, Throwable]) { + + def check(prog: Program): F[Unit] = { + val declMap = prog.productions.toList.map { case p @ Production(name, t) => (name, (p.pos, t)) }.toMap + scc(declMap).flatMap { components => + components.traverse_ { component => + val allStrictDeps = + component.flatMap(id => declMap.get(id).map { case (pos, t) => (id, pos, strictDependencies(t)) }) + val localStrictDeps = allStrictDeps.toList.mapFilter { case (id, pos, deps) => + // remove strict dependencies not in SCC + val deps1 = deps.view.filterKeys(component.contains(_)).toMap + if (deps1.nonEmpty) + (id, pos).some + else + None + } + if (localStrictDeps.nonEmpty) + F.raiseError[Unit](KleenexCompilerException(s"""Following productions contain non tail recursive calls: + |${localStrictDeps + .map { case (id, pos) => + s"$id (at line ${pos.line + 1})" + } + .mkString("\n")}""".stripMargin)) + else + F.unit + + } + } + } + + private def successors(id: String, term: Term): List[String] = { + def go(t: Term, acc: Set[String]): Set[String] = + t match { + case Term.Var(s) => acc + s + case Term.Concat(ts) => ts.foldLeft(acc)((acc, t) => go(t, acc)) + case Term.Alternative(ts) => ts.foldLeft(acc)((acc, t) => go(t, acc)) + case Term.Star(t) => go(t, acc) + case Term.Plus(t) => go(t, acc) + case Term.Question(t) => go(t, acc) + case Term.Range(t, _, _) => go(t, acc) + case Term.Suppress(t) => go(t, acc) + case Term.Capture(_, t) => go(t, acc) + case _ => acc + } + + go(term, Set.empty).toList + } + + private def termIdents(t: Term): Map[String, Set[Caret]] = + t match { + case Term.Var(name) => Map(name -> Set(t.pos)) + case Term.Concat(ts) => ts.toList.map(termIdents(_)).combineAll + case Term.Alternative(ts) 
=> ts.toList.map(termIdents(_)).combineAll + case Term.Star(t) => termIdents(t) + case Term.Plus(t) => termIdents(t) + case Term.Question(t) => termIdents(t) + // a bounded repetition mentions the same identifiers as its inner term (`successors` descends into it as well) + case Term.Range(t, _, _) => termIdents(t) + case Term.Suppress(t) => termIdents(t) + case Term.Capture(_, t) => termIdents(t) + case _ => Map.empty + } + + // strict dependencies are the variables occurring not in tail positions in sequences + // in a concatenation every identifier of a non-last element is strict, only the last element is inspected recursively + // the result maps each such identifier to the positions at which it occurs + def strictDependencies(t: Term): Map[String, Set[Caret]] = + t match { + case Term.Concat(NonEmptyList(t1, t2 :: ts)) => + strictDependencies(Term.Concat(NonEmptyList(t2, ts))).combine(termIdents(t1)) + case Term.Concat(NonEmptyList(t, Nil)) => strictDependencies(t) + case Term.Alternative(ts) => ts.toList.map(strictDependencies(_)).combineAll + case Term.Star(t) => strictDependencies(t) + case Term.Plus(t) => strictDependencies(t) + case Term.Question(t) => strictDependencies(t) + // bounded repetitions are treated like the other repetition operators instead of being silently ignored + case Term.Range(t, _, _) => strictDependencies(t) + case Term.Suppress(t) => strictDependencies(t) + case Term.Capture(_, t) => strictDependencies(t) + case _ => Map.empty + } + + private type State[Res] = StateT[F, SCCState, Res] + + private def gets[Res](f: SCCState => Res): State[Res] = + StateT.inspect(f) + + private def getProps(id: String): State[Option[SCCProps]] = + StateT.inspect(_.props.get(id)) + + private def nop: State[Unit] = + StateT.empty + + private def modify(f: SCCState => SCCState): State[Unit] = + StateT.modify(f) + + private def update[Res](f: SCCState => (SCCState, Res)): State[Res] = + StateT.inspect(f).flatMap { case (st, res) => StateT.set(st).as(res) } + + private def raiseError[Res](t: Throwable): State[Res] = + nop.flatMapF(_ => t.raiseError) + + private def scc(declMap: Map[String, (Caret, Term)]): F[List[Set[String]]] = { + val state = SCCState(0, Nil, Map.empty, Nil) + + def process(v: String, t: Term): State[Unit] = + for { + // first push v on the stack and assign an index + vProps <- update { st => + val props = SCCProps(true, st.index, st.index) + (st.copy(index = st.index + 1, stack = v :: st.stack, props = st.props.updated(v, props)), props) + } +
// then for each successor compute recursively + () <- successors(v, t).traverse_ { w => + getProps(w).flatMap { + case Some(wProps) => + // successor already processed + if (wProps.onStack) + // it is on stack, hence in the current SCC + // update from the properties currently in the state: the `vProps` bound when v was pushed + // is a stale snapshot and would overwrite a lowlink already lowered by an earlier successor + modify { st => + val current = st.props.getOrElse(v, vProps) + st.copy(props = st.props.updated(v, current.copy(lowlink = current.lowlink.min(wProps.index)))) + } + else + // not on the stack, not in SCC + nop + case None => + // not processed yet, do it + declMap.get(w) match { + case Some((_, wt)) => + for { + () <- process(w, wt) + wProps <- gets(_.props(w)) + vProps <- gets(_.props(v)) + () <- modify(st => + st.copy(props = st.props.updated(v, vProps.copy(lowlink = vProps.lowlink.min(wProps.lowlink))))) + } yield () + case None => + raiseError[Unit]( + KleenexCompilerException(s"Unknown identifier $w in definition of $v at line ${t.pos.line + 1}")) + } + } + } + vProps <- gets(_.props(v)) + () <- + if (vProps.lowlink == vProps.index) + for { + stack <- gets(_.stack) + (component, stack1) = spanUntilIncluding(stack, v) + () <- modify { st => + st.copy( + // pop from stack + stack = stack1, + // update the components + components = component.toSet :: st.components, + // remove vertices in component from stack + props = component.foldLeft(st.props) { (props, w) => + props.updatedWith(w)(_.map(_.copy(onStack = false))) + } + ) + } + } yield () + else + nop + } yield () + + declMap.toList + // traverse each node (aka production identifier) + .traverse_ { case (id, (_, t)) => + getProps(id) + .flatMap { + case None => + // if no index has been assigned yet, process it + process(id, t) + case Some(_) => + // otherwise, just continue + nop + } + } + .runS(state) + .map(_.components) + } + + private def spanUntilIncluding(l: List[String], v: String): (List[String], List[String]) = { + @tailrec + def loop(l: List[String], acc: List[String]): (List[String], List[String]) = + l match { + case Nil => (l, Nil) + case `v` :: rest => ((v :: acc).reverse, rest) + case e :: rest => loop(rest, e
:: acc) + } + loop(l, Nil) + } + +} + +case class SCCProps(onStack: Boolean, index: Int, lowlink: Int) +case class SCCState(index: Int, stack: List[String], props: Map[String, SCCProps], components: List[Set[String]]) diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala new file mode 100644 index 000000000..845754261 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala @@ -0,0 +1,48 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex + +case class Environment(stack: List[String], registers: Map[String, String]) { + + /** Appends the `s` on top of the stack. */ + def append(s: String): Option[Environment] = + stack match { + case r :: stack => Some(copy((r + s) :: stack)) + case Nil => None + } + + /** Pushes an empty value on top of the stack. */ + def push: Environment = + copy(stack = "" :: stack) + + /** Pops the value on top of the stack and stores it in `reg`. */ + def pop(reg: String): Option[Environment] = + stack match { + case r :: stack => Some(copy(stack = stack, registers = registers.updated(reg, r))) + case Nil => None + } + + /** Appends the value in `reg` on top of the stack and empties the register. 
*/ + def write(reg: String): Option[Environment] = + stack match { + case r :: stack => + val value = registers.getOrElse(reg, "") + Some(copy(stack = (r + value) :: stack, registers.updated(reg, ""))) + case Nil => None + } + +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala new file mode 100644 index 000000000..5a3db0d4e --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala @@ -0,0 +1,56 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2 +package data +package kleenex + +import cats.MonadError +import cats.syntax.all._ + +case class KleenexException(msg: String) extends Exception(msg) + +object Interpreter { + + def pipe[F[_]](implicit F: MonadError[F, Throwable]): Pipe[F, Either[String, Action], String] = { + (s: Stream[F, Either[String, Action]]) => + s + .evalScan(new Environment("" :: Nil, Map.empty)) { + case (env, Left(c)) => + env + .append(c) + .liftTo[F](KleenexException(s"cannot append on top of stack")) + case (env, Right(act)) => + act match { + case Action.Push => (env.push).pure[F] + case Action.Pop(reg) => + env + .pop(reg) + .liftTo[F](KleenexException(s"cannot pop to register $reg")) + case Action.Write(reg) => + env + .write(reg) + .liftTo[F](KleenexException(s"cannot write register $reg")) + } + } + .last + .evalMap { + case Some(Environment(s :: _, _)) => s.pure[F] + case _ => F.raiseError[String](KleenexException("cannot pop from empty stack")) + } + } + +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala new file mode 100644 index 000000000..eea1ecbb9 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala @@ -0,0 +1,312 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2 +package data +package kleenex + +import transducer.CharRanges + +import cats.ApplicativeError +import cats.data.NonEmptyList +import cats.parse.{Caret, LocationMap, Parser0, Parser => P} +import cats.syntax.all._ + +case class KleenexParseException(msg: String) extends Exception(msg) + +class KleenexParser[F[_]](implicit F: ApplicativeError[F, Throwable]) { + + def parse(content: String): F[Program] = + KleenexParser.program + .parseAll(content) + .leftMap { e => + val locations = LocationMap(content) + KleenexParseException(prettyprint(locations, e)) + } + .liftTo[F] + + private def description(x: P.Expectation): String = x match { + case P.Expectation.OneOfStr(_, List(str)) => + s"expected $str" + case P.Expectation.OneOfStr(_, strs) => + val strList = strs.map(x => s"'$x'").mkString(", ") + s"expected one of $strList" + case P.Expectation.InRange(_, lower, upper) => + if (lower == upper) s"expected '$lower'" + else s"expected '$lower' ~ '$upper'" + case P.Expectation.StartOfString(_) => + "expected beginning of file" + case P.Expectation.EndOfString(_, _) => + "expected end of file" + case P.Expectation.Length(_, expected, actual) => + s"unexpected eof; expected ${expected - actual} more characters" + case P.Expectation.ExpectedFailureAt(_, matched) => + s"unexpected '$matched'" + case P.Expectation.Fail(_) => + "failed to parse" + case P.Expectation.FailWith(_, message) => + message + case P.Expectation.WithContext(contextStr, _) => + s"expected $contextStr" + } + + // renders one expectation as `line:col: error: ...` followed by the offending source line and a caret marker below it + private def prettyprint(locmap: LocationMap, x: P.Expectation): String = { + val (row, col) = locmap.toLineCol(x.offset).getOrElse((0, locmap.input.size)) + // positions are reported 1-based + val (r, c) = (row + 1, col + 1) + val line = locmap.getLine(row).getOrElse("") + // pad the gutter to the width of the displayed (1-based) line number so that the caret stays aligned with the source line + val offending = + s"${" " * r.toString.length} | ${" " * col}^" + s""" + |$r:$c: error: ${description(x)} + |$r | $line + |$offending""".stripMargin + } + + private def prettyprint(locmap: LocationMap, x: P.Error): String = +
x.expected.map(prettyprint(locmap, _)).toList.mkString("") +} + +object KleenexParser { + import P._ + + // whitespace is either a blank character or a `//` line comment + // the comment text may be empty, hence `charsWhile0`: `charsWhile` requires at least one character and would make the + // parser fail after having consumed `//`, which `oneOf` does not recover from + private[this] val whitespace: P[Unit] = oneOf(List(charIn(" \t\r\n"), string("//") ~ charsWhile0(_ != '\n'))).void + private[this] val whitespaces0: Parser0[Unit] = whitespace.rep0.void + + private val regIdentStart = ('a' to 'z') + private val identStart = regIdentStart ++ ('A' to 'Z') + private val digit = ('0' to '9') + private val identChar = identStart ++ digit ++ List('-', '_') + private val hexDigit = digit ++ ('a' to 'f') ++ ('A' to 'F') + + private val ident: P[String] = + (peek(charIn(identStart)).with1 *> charsWhile(identChar.contains(_))) + .withContext("identifier") <* whitespaces0 + + private val regIdent: P[String] = + (peek(charIn(regIdentStart)).with1 *> charsWhile(identChar.contains(_))) + .withContext("register identifier (must start with lower case)") <* whitespaces0 + + private val str: P[String] = + oneOf( + List( + charsWhile(!"\\\"".contains(_)).string, + char('\\') *> oneOf(List( + char('"').as("\""), + char('\\').as("\\"), + char('r').as("\r"), + char('n').as("\n"), + char('t').as("\t"), + char('f').as("\f"), + char('x') *> charIn(hexDigit) + .rep(min = 2, max = 2) + .string + .map(codepoint => Character.toString(Integer.parseInt(codepoint, 16))), + char('u') *> charIn(hexDigit) + .rep(min = 4, max = 4) + .string + .map(codepoint => Character.toString(Integer.parseInt(codepoint, 16))) + )) + ) + ).rep0.map(_.combineAll).with1.surroundedBy(char('"')) <* whitespaces0 + + private val integer: P[Int] = + charIn(digit).rep.string.mapFilter(_.toIntOption).withContext("positive integer") + + def keyword(kw: String): P[Unit] = + string(kw) <* whitespaces0 + + private val range: P[(Int, Option[Int])] = + char('{') *> oneOf( + List( + char(',') *> integer.map(max => (0, Some(max))), + (integer ~ (char(',') *> integer.?).?).map { + case (min, None) => (min, Some(min)) + case (min, Some(None)) => (min, None) + case (min, Some(Some(max))) => (min,
Some(max)) + } + )) <* char('}') + + val regex: P[Regex] = P.recursive[Regex] { regex => + val setChar = oneOf( + List( + charWhere(!"-]\\".contains(_)), + char('\\') *> oneOf(List( + char('\\').as('\\'), + char('/').as('/'), + char('-').as('-'), + char(']').as(']'), + char('[').as('['), + char('r').as('\r'), + char('n').as('\n'), + char('t').as('\t'), + char('f').as('\f') + )) + )) + val set = char('[') *> (char('^').as(false).?.map(_.getOrElse(true)) ~ oneOf(List( + char('-').as(('-', '-') :: Nil), + (setChar ~ (char('-') *> setChar.?).?).map { + case (fst, Some(Some(snd))) => (fst, snd) :: Nil + case (fst, Some(None)) => (fst, fst) :: ('-', '-') :: Nil + case (fst, None) => (fst, fst) :: Nil + } + )).rep0.map(_.flatten)).map { + case (false, Nil) => CharRanges.all + case (true, Nil) => CharRanges.empty + case (true, r :: Nil) => CharRanges.range(r) + case (false, r :: Nil) => CharRanges.range(r).invert + case (true, r1 :: r2 :: rs) => CharRanges.ranges(r1, r2, rs: _*) + case (false, r1 :: r2 :: rs) => CharRanges.ranges(r1, r2, rs: _*).invert + } <* char(']') + + val atom = + oneOf( + List( + char('.').as(Regex.Any), + set.map(Regex.Set(_)), + oneOf( + List( + charWhere(!"\\/?*+|{[().".contains(_)).string, + char('\\') *> oneOf(List( + char('/').as("/"), + char('\\').as("\\"), + char('r').as("\r"), + char('n').as("\n"), + char('t').as("\t"), + char('f').as("\f"), + char('?').as("?"), + char('*').as("*"), + char('+').as("+"), + char('|').as("|"), + char('{').as("{"), + char('[').as("["), + char('(').as("("), + char(')').as(")"), + char('.').as("."), + char('u') *> charIn(hexDigit) + .rep(min = 4, max = 4) + .string + .map(codepoint => Character.toString(Integer.parseInt(codepoint, 16))) + )) + ) + ).map(Regex.Str(_)), + regex.between(char('('), char(')')) + )) + + val greedy = char('?').?.map(_.isEmpty) + val suffixed = + (atom ~ oneOf( + List( + char('?') *> greedy.map(greedy => Regex.Question(_, greedy)), + char('+') *> greedy.map(greedy => Regex.Plus(_, 
greedy)), + char('*') *> greedy.map(greedy => Regex.Star(_, greedy)), + range.map { case (min, max) => Regex.Range(_, min, max) } + )).?) + .map { + case (atom, None) => atom + case (atom, Some(mod)) => mod(atom) + } + + def aggregateStr(seq: NonEmptyList[Regex]): NonEmptyList[Regex] = { + def loop(seq: NonEmptyList[Regex]): NonEmptyList[Regex] = + seq match { + case NonEmptyList(Regex.Str(s1), Regex.Str(s2) :: rest) => loop(NonEmptyList(Regex.Str(s1 + s2), rest)) + case NonEmptyList(re, r :: rest) => re :: loop(NonEmptyList(r, rest)) + case NonEmptyList(_, Nil) => seq + } + loop(seq) + } + + val seq = + suffixed.rep + .map(aggregateStr(_)) + .map { + case NonEmptyList(atom, Nil) => atom + case seq => Regex.Concat(seq) + } + + seq.repSep(keyword("|")).map { + case NonEmptyList(seq, Nil) => seq + case alts => Regex.Or(alts) + } + } + + private val registerUpdate: P[Term] = + (caret.with1 ~ regIdent ~ oneOf(List(keyword("<-").as(false), keyword("+=").as(true))) ~ oneOf( + List(str.map(RegOrStr.Str(_)), regIdent.map(RegOrStr.Reg(_)))).rep) + .map { case (((caret, reg), prepend), value) => + Term.UpdateReg(reg, if (prepend) RegOrStr.Reg(reg) :: value else value)(caret) + } + + val term: P[Term] = recursive[Term] { term => + val atom: P[Term] = + oneOf( + List( + caret.map(Term.One()).with1 <* keyword("1"), + (caret.with1 ~ str).map { case (pos, s) => Term.Str(s)(pos) }, + ((caret.with1 ~ ident).map { case (pos, v) => Term.Var(v)(pos) } <* !oneOf( + List(keyword(":="), keyword("@")))).backtrack, + (caret.with1 ~ (char('/') *> regex <* char('/'))).map { case (pos, re) => Term.RE(re)(pos) } <* whitespaces0, + (caret.with1 ~ (char('!') *> regIdent)).map { case (pos, reg) => Term.Output(reg)(pos) }, + registerUpdate.between(keyword("["), keyword("]")), + term.between(keyword("("), keyword(")")) + )) + + val suffixed: P[Term] = + (atom ~ oneOf[Term => Term](List( + (caret.with1 <* keyword("*")).map(pos => Term.Star(_)(pos)), + (caret.with1 <* keyword("+")).map(pos => 
Term.Plus(_)(pos)), + (caret.with1 <* keyword("?")).map(pos => Term.Question(_)(pos)), + (caret.with1 ~ range).map { case (pos, (min, max)) => Term.Range(_: Term, min, max)(pos) } <* whitespaces0 + )).?) + .map { + case (inner, None) => inner + case (inner, Some(mod)) => mod(inner) + } + + val prefixed: P[Term] = + oneOf( + List( + (caret.with1 ~ (keyword("~") *> suffixed)).map { case (pos, t) => Term.Suppress(t)(pos) }, + (caret.with1 ~ (regIdent <* keyword("@")).backtrack ~ suffixed).map { case ((pos, reg), t) => + Term.Capture(reg, t)(pos) + }, + suffixed + )) + + val seq: P[Term] = + prefixed.rep.map { + case NonEmptyList(atom, Nil) => atom + case seq => Term.Concat(seq) + } + + seq.repSep(keyword("|")).map { + case NonEmptyList(seq, Nil) => seq + case alts => Term.Alternative(alts) + } + } + + val production: P[Production] = + (caret.with1 ~ (ident <* keyword(":=")) ~ term).map { case ((pos, id), t) => Production(id, t)(pos) } + + private val pipeline: Parser0[(Caret, NonEmptyList[String])] = + caret ~ oneOf0(List(keyword("start:") *> ident.repSep(keyword(">>")), pure(NonEmptyList.one("main")))) + + val program: P[Program] = + (pipeline.with1 ~ production.rep) + .map { case ((pos, pipe), rules) => Program(pipe, rules)(pos) } + .surroundedBy(whitespaces0) + +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala new file mode 100644 index 000000000..dbf2cf7cf --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala @@ -0,0 +1,34 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex + +import fs2.data.transducer.CharRanges + +import cats.data.NonEmptyList + +sealed trait Regex +object Regex { + case object Any extends Regex + case class Str(s: String) extends Regex + case class Concat(subs: NonEmptyList[Regex]) extends Regex + case class Or(alts: NonEmptyList[Regex]) extends Regex + case class Star(inner: Regex, greedy: Boolean) extends Regex + case class Plus(inner: Regex, greedy: Boolean) extends Regex + case class Question(inner: Regex, greedy: Boolean) extends Regex + case class Range(inner: Regex, fst: Int, snd: Option[Int]) extends Regex + case class Set(ranges: CharRanges) extends Regex +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala new file mode 100644 index 000000000..336f912a3 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala @@ -0,0 +1,55 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.kleenex + +import cats.data.NonEmptyList + +import cats.parse.Caret + +case class Program(pipeline: NonEmptyList[String], productions: NonEmptyList[Production])(val pos: Caret) + +case class Production(name: String, term: Term)(val pos: Caret) + +sealed trait Term { + val pos: Caret +} +object Term { + case class One()(val pos: Caret) extends Term + case class Str(s: String)(val pos: Caret) extends Term + case class Var(name: String)(val pos: Caret) extends Term + case class Capture(reg: String, inner: Term)(val pos: Caret) extends Term + case class Output(reg: String)(val pos: Caret) extends Term + case class UpdateReg(reg: String, value: NonEmptyList[RegOrStr])(val pos: Caret) extends Term + case class Alternative(cases: NonEmptyList[Term]) extends Term { + val pos: Caret = cases.head.pos + } + case class Concat(terms: NonEmptyList[Term]) extends Term { + val pos: Caret = terms.head.pos + } + case class RE(re: Regex)(val pos: Caret) extends Term + case class Suppress(inner: Term)(val pos: Caret) extends Term + case class Star(inner: Term)(val pos: Caret) extends Term + case class Plus(inner: Term)(val pos: Caret) extends Term + case class Question(inner: Term)(val pos: Caret) extends Term + case class Range(inner: Term, min: Int, max: Option[Int])(val pos: Caret) extends Term +} + +sealed trait RegOrStr +object RegOrStr { + case class Reg(name: String) extends RegOrStr + case class Str(s: String) extends RegOrStr +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala new file mode 100644 index 000000000..bf896f50d --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala @@ -0,0 +1,302 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex.core + +import fs2.data.kleenex.{Action, Production, Program => KProgram, Regex, RegOrStr, Term => KTerm} +import fs2.data.transducer.CharRanges + +import cats.MonadError +import cats.data.StateT +import cats.syntax.all._ +import cats.data.NonEmptyList + +case class KleenexCompilerException(msg: String) extends Exception(msg) + +case class CompilerState(idents: Map[(String, Boolean), Int], + decls: Map[Int, Term], + revDecls: Map[Term, Int], + fresh: Int) + +class Compiler[F[_]](implicit F: MonadError[F, Throwable]) { + + private type State[Res] = StateT[F, CompilerState, Res] + + /** Compiles a kleenex program into the core language representation. 
+ */ + def compile(prog: KProgram): F[Program] = { + // associate each production to 2 ids: + // - one when it outputs element + // - one when it outputs *no* elements + val idents = prog.productions + .flatMap { case Production(name, term) => + NonEmptyList.of((name, true), (name, false)) + } + .zipWithIndex + .toList + .toMap + // indices 0 until 2 * n have been handed out above, even when a production name occurs several times (the map then + // keeps fewer entries than indices), so the first unused id is derived from the production count and not from the map size + // NOTE(review): assumes `fresh` is the next id handed out for generated declarations - confirm against `freshId` + val fresh = 2 * prog.productions.size + val state = CompilerState(idents, Map.empty, Map.empty, fresh) + + // every name of the pipeline must refer to a known production + val checkPipeline = + prog.pipeline.traverse(name => + idents.get(name -> true) match { + case Some(id) => id.pure[F] + case None => new KleenexCompilerException(s"Unknown production $name in pipeline").raiseError[F, Int] + }) + + // each production is compiled twice, once with output enabled and once with output disabled + val compiledProductions = + prog.productions + .traverse_ { case Production(name, term) => + for { + idout <- lookup(name, true) + idnoout <- lookup(name, false) + compiledout <- compile(true, term) + compilednoout <- compile(false, term) + _ <- insertDecl(idout, Term.Seq(List(compiledout))) + _ <- insertDecl(idnoout, Term.Seq(List(compilednoout))) + } yield () + } + + (checkPipeline, compiledProductions.runS(state)) + .mapN { (pipeline, state) => + val reached = reachable(pipeline.toList, state.decls) + compress(Program(pipeline, state.decls.view.filterKeys(reached.contains(_)).toMap)) + } + } + + def compile(re: Regex): F[Program] = { + compile(true, re) + .run(CompilerState(Map.empty, Map.empty, Map.empty, 0)) + .map { case (st, id) => Program(NonEmptyList.one(id), st.decls) } + } + + private def compile(output: Boolean, re: Regex): State[Int] = + re match { + case Regex.Any => + declare(Term.Read(CharRanges.all, output)) + case Regex.Str(str) => + str.toList + .traverse(c => declare(Term.Read(CharRanges.char(c), output))) + .flatMap(ids => declare(Term.Seq(ids))) + case Regex.Concat(res) => + res.traverse(compile(output, _)).flatMap(ids => declare(Term.Seq(ids.toList))) + case Regex.Or(alts) => + alts.traverse(compile(output, _)).flatMap(ids => declare(Term.Alternative(ids))) + case Regex.Plus(re, greedy) => +
compile(output, re).flatMap(plus(_, greedy)) + case Regex.Star(re, greedy) => + compile(output, re).flatMap(star(_, greedy)) + case Regex.Question(re, greedy) => + compile(output, re).flatMap(question(_, greedy)) + case Regex.Set(chars) => + declare(Term.Read(chars, output)) + case Regex.Range(re, min, max) => + compile(output, re).flatMap(range(_, min, max)) + } + + private def compile(output: Boolean, term: KTerm): State[Int] = + term match { + case KTerm.One() => + declare(Term.epsilon) + case KTerm.Str(s) => + val toOuptut = if (output) s else "" + declare(Term.Const(Left(toOuptut))) + case KTerm.Var(v) => + lookup(v, output) + case KTerm.Capture(reg, t) => + for { + idt <- compile(output, t) + idpush <- declare(Term.Const(Right(Action.Push))) + idpop <- declare(Term.Const(Right(Action.Pop(reg)))) + id <- declare(Term.Seq(List(idpush, idt, idpop))) + } yield id + case KTerm.Output(reg) => + declare(Term.Const(Right(Action.Write(reg)))) + case KTerm.UpdateReg(reg, value) => + for { + idpush <- declare(Term.Const(Right(Action.Push))) + idsval <- value.map(updateSym(_)).traverse(c => declare(Term.Const(c))) + idpop <- declare(Term.Const(Right(Action.Pop(reg)))) + id <- declare(Term.Seq((idpush :: idsval).toList :+ idpop)) + } yield id + case KTerm.Alternative(cases) => + flattenAlternatives(cases) + .traverse(compile(output, _)) + .flatMap(ids => declare(Term.Alternative(ids))) + case KTerm.Concat(ts) => + flattenSequences(ts) + .traverse(compile(output, _)) + .flatMap(ids => declare(Term.Seq(ids))) + case KTerm.RE(re) => + compile(output, re) + case KTerm.Suppress(t) => + compile(false, t) + case KTerm.Star(t) => + compile(output, t).flatMap(star(_, true)) + case KTerm.Plus(t) => + compile(output, t).flatMap(plus(_, true)) + case KTerm.Question(t) => + compile(output, t).flatMap(question(_, true)) + case KTerm.Range(t, min, max) => + compile(output, t).flatMap(range(_, min, max)) + } + + // r* = r1 | 1 + // r1 = r r* + // r*? = 1 | r2 + // r2 = r r*? 
+ private def star(idt: Int, greedy: Boolean): State[Int] = + for { + ideps <- declare(Term.epsilon) + id <- freshId + idloop <- declare(Term.Seq(List(idt, id))) + id <- insertDecl(id, + Term.Alternative( + // favor more over less + if (greedy) NonEmptyList.of(idloop, ideps) + // favor less over more + else NonEmptyList.of(ideps, idloop) + )) + } yield id + + // r+ = r r* + // r+? = r r*? + private def plus(idt: Int, greedy: Boolean): State[Int] = + for { + idstar <- star(idt, greedy) + id <- declare(Term.Seq(List(idt, idstar))) + } yield id + + // r? = r | 1 + // r?? = 1 | r + private def question(idt: Int, greedy: Boolean): State[Int] = + for { + ideps <- declare(Term.epsilon) + id <- declare( + Term.Alternative( + // favor one over zero + if (greedy) NonEmptyList.of(idt, ideps) + // favor zero over one + else NonEmptyList.of(ideps, idt))) + } yield id + + private def range(idt: Int, min: Int, max: Option[Int]): State[Int] = + max match { + case Some(max) if min == max => + declare(Term.Seq(List.fill(min)(idt))) + case Some(max) => + question(idt, true).flatMap(idq => declare(Term.Seq(List.fill(min)(idt) ++ List.fill(max - min)(idq)))) + case None => + star(idt, true).flatMap(idstar => declare(Term.Seq(List.fill(min)(idt) ++ List(idstar)))) + } + + private def updateSym(sym: RegOrStr): Either[String, Action] = + sym match { + case RegOrStr.Reg(reg) => Right(Action.Write(reg)) + case RegOrStr.Str(s) => Left(s) + } + + private def flattenAlternatives(alts: NonEmptyList[KTerm]): NonEmptyList[KTerm] = + alts match { + case NonEmptyList(KTerm.Alternative(alts), a :: rest) => + flattenAlternatives(alts).concatNel(flattenAlternatives(NonEmptyList(a, rest))) + case NonEmptyList(KTerm.Alternative(alts), Nil) => + flattenAlternatives(alts) + case NonEmptyList(t, a :: rest) => + t :: flattenAlternatives(NonEmptyList(a, rest)) + case NonEmptyList(t, Nil) => + NonEmptyList.one(t) + } + + private def flattenSequences(ts: NonEmptyList[KTerm]): List[KTerm] = + ts match { + case 
NonEmptyList(KTerm.Concat(ts), t :: rest) => flattenSequences(ts) ++ flattenSequences(NonEmptyList(t, rest)) + case NonEmptyList(t, h :: rest) => t :: flattenSequences(NonEmptyList(h, rest)) + case NonEmptyList(KTerm.Concat(ts), Nil) => flattenSequences(ts) + case NonEmptyList(t, Nil) => List(t) + } + + private def get: State[CompilerState] = + StateT.get + + private def modify(f: CompilerState => CompilerState): State[Unit] = + StateT.modify(f) + + private def freshId: State[Int] = + get.map(_.fresh) <* modify(s => s.copy(fresh = s.fresh + 1)) + + private def insertDecl(id: Int, term: Term): State[Int] = + modify(st => st.copy(decls = st.decls.updated(id, term), revDecls = st.revDecls.updated(term, id))).as(id) + + private def lookup(id: String, output: Boolean): State[Int] = + get.map(_.idents.get((id, output))).flatMapF { + case Some(id) => id.pure[F] + case None => KleenexCompilerException(s"Unknown non terminal identifier $id").raiseError[F, Int] + } + + private def declare(term: Term): State[Int] = + get.map(_.revDecls.get(term)).flatMap { + case Some(id) => id.pure[State] + case None => freshId.flatMap(insertDecl(_, term)) + } + + private def reachable(from: List[Int], decls: Map[Int, Term]): Set[Int] = { + def referenced(t: Term): List[Int] = + t match { + case Term.Seq(ids) => ids + case Term.Alternative(ids) => ids.toList + case _ => Nil + } + + def loop(from: List[Int], acc: Set[Int]): Set[Int] = + from match { + case id :: from => + if (acc.contains(id)) + loop(from, acc) + else + loop(decls.get(id).map(referenced(_)).getOrElse(Nil) reverse_::: from, acc + id) + case Nil => + acc + } + loop(from, Set.empty) + } + + private def compress(prog: Program): Program = { + def alias(aliases: Map[Int, Int], id: Int): Map[Int, Int] = + prog.decls.get(id) match { + case Some(Term.Seq(List(idt))) => + val aliases1 = alias(aliases, idt) + aliases1.updated(id, aliases1.getOrElse(idt, idt)) + case _ => + aliases + } + val aliases = 
prog.decls.keys.foldLeft(Map.empty[Int, Int])(alias(_, _)) + + def replace(t: Term): Term = + t match { + case Term.Alternative(ts) => Term.Alternative(ts.map(id => aliases.getOrElse(id, id))) + case Term.Seq(ts) => Term.Seq(ts.map(id => aliases.getOrElse(id, id))) + case _ => t + } + + if (aliases.isEmpty) + prog + else + Program(prog.pipeline.map(id => aliases.getOrElse(id, id)), + prog.decls.view.filterKeys(!aliases.contains(_)).mapValues(replace(_)).toMap) + } + +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala new file mode 100644 index 000000000..34ef7df5b --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala @@ -0,0 +1,59 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data +package kleenex +package core + +import transducer.CharRanges + +import cats.Show +import cats.data.NonEmptyList +import cats.syntax.all._ + +case class Program(pipeline: NonEmptyList[Int], decls: Map[Int, Term]) +object Program { + implicit val show: Show[Program] = Show.show { case Program(pipeline, decls) => + s"""start: ${pipeline.mkString_(" >> ")} + | + |${decls.toList.sortBy(_._1).map { case (k, v) => show"$k -> $v" }.mkString_("\n")}""".stripMargin + } +} + +sealed trait Term +object Term { + case class Const(strOrReg: Either[String, Action]) extends Term + case class Read(ranges: CharRanges, output: Boolean) extends Term + case class Seq(idents: List[Int]) extends Term + case class Alternative(idents: NonEmptyList[Int]) extends Term + + def epsilon: Term = Seq(Nil) + + implicit val show: Show[Term] = Show.show { + case Const(Left(s)) => + s""""$s"""" + case Const(Right(a)) => + a.show + case Read(rs, true) => + rs.show + case Read(rs, false) => + show"~$rs" + case Seq(ids) => + ids.mkString_(" -> ") + case Alternative(alts) => + alts.mkString_(" | ") + } +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala new file mode 100644 index 000000000..0217e650d --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala @@ -0,0 +1,100 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data +package kleenex +package core + +import fst._ +import transducer.CharRanges + +import cats.data.NonEmptyList +import cats.MonadError +import cats.syntax.all._ + +class TransducerCompiler[F[_]](implicit F: MonadError[F, Throwable]) { + + private type Q = List[Int] + private type E = + Edge[List[Int], CharRanges, CopyFunc[Char, List[Either[String, Action]]], List[Either[String, Action]]] + + def build(prog: Program): F[NonEmptyList[Transducer[Int, Char, Either[String, Action]]]] = + prog.pipeline.traverse { ident => + construct(prog, ident).map(_.enumerateStates) + } + + private def construct(prog: Program, ident: Int): F[Transducer[Q, Char, Either[String, Action]]] = { + def decl(id: Int): F[Term] = + prog.decls.get(id) match { + case Some(t) => t.pure[F] + case None => F.raiseError(KleenexCompilerException(s"Unknown identifier $id")) + } + // Optimization: Reduce number of generated states by contracting + // non-deterministic edges with no output. This is done by "skipping" states + // whose head nonterminal is declared to be a Seq term, or an Alternative with only + // one successor.
+ def follow(qs: Q): F[Q] = + qs match { + case Nil => List.empty.pure[F] + case q :: qs1 => + decl(q).flatMap { + case Term.Seq(rs) => follow(rs ++ qs1) + case Term.Alternative(NonEmptyList(r, Nil)) => follow(r :: qs1) + case _ => qs.pure[F] + } + } + + def go(workingSet: List[Q], states: Set[Q], transitions: List[E]): F[(Set[Q], List[E])] = + workingSet match { + case Nil => + (states, transitions).pure[F] + case q :: rest if states.contains(q) => + go(rest, states, transitions) + case Nil :: rest => + go(rest, states + Nil, transitions) + case (h @ q :: qs) :: rest => + val states1 = states + h + decl(q).flatMap { + case Term.Const(out) => + follow(qs).flatMap { q1 => + go(q1 :: rest, states1, (h, Right(List(out)), q1) :: transitions) + } + case Term.Read(pred, false) => + follow(qs).flatMap { q1 => + go(q1 :: rest, states1, (h, Left((pred, CopyFunc.CopyConst(Nil))), q1) :: transitions) + } + case Term.Read(pred, true) => + follow(qs).flatMap { q1 => + go(q1 :: rest, states1, (h, Left((pred, CopyFunc.CopyArg)), q1) :: transitions) + } + case Term.Seq(rs) => + follow(rs ++ qs).flatMap { q1 => + go(q1 :: rest, states1, (h, Right(Nil), q1) :: transitions) + } + case Term.Alternative(rs) => + rs.toList.traverse(r => follow(r :: qs)).flatMap { qs1 => + val trans = qs1.map(q1 => (h, Right(Nil), q1)) + go(qs1 reverse_::: rest, states1, trans ++ transitions) + } + } + } + + go(List(List(ident)), Set.empty, Nil).map { case (states, transitions) => + new FST(List(ident), states, OrderedEdgeSet.fromList(transitions), Set(Nil)) + } + } + +} diff --git a/kleenex/shared/src/test/resources/kleenex/highlighter.kex b/kleenex/shared/src/test/resources/kleenex/highlighter.kex new file mode 100644 index 000000000..645c0b0c0 --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/highlighter.kex @@ -0,0 +1,36 @@ +main := ( escape | comment | term | symbol | ignored | ws * )* + +term := black /~/ (constant | match | ident) end + | (teal constant | yellow match | blue ident) end 
+ +ignored := /[\]()|{},:[]/ + +ident := (letter | /[0-9_]/)+ + +symbol := yellow /<-|\+=|:=|>>|\*|\?|\+/ end + +constant := /"/ ( /\\./ | /[^\\"]/ )* /"/ + +comment := black ( /\/\/[^\n]*\n/ | /\/\*[^*\/]*\*\// ) end + +match := /\// ( /[^\/\n]/ | /\\./ )+ /\// + +escape := /\\\\/ + | blue /\\x[0-9a-fA-F]{2}/ end + | /\\[tnr]/ + +sp := / /* + +letter := /[a-zA-Z]/ + +word := letter+ + +ws := /[\t\r\n ]/ + +red := "\x1b[31m" +green := "\x1b[32m" +yellow:= "\x1b[33m" +blue := "\x1b[34m" +end := "\x1b[39;49m" +black := "\x1b[30m" +teal := "\x1b[36m" diff --git a/kleenex/shared/src/test/resources/kleenex/ini2json.kex b/kleenex/shared/src/test/resources/kleenex/ini2json.kex new file mode 100644 index 000000000..01ccfe83f --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/ini2json.kex @@ -0,0 +1,19 @@ +start: stripini >> ini2json +// Strips the comments +stripini := (~comment | ~blank | /[^\n]*\n/)* +comment := ws /;[^\n]*/ +blank := ws /\n/ +// Convert the stripped file +ini2json := "{\n" sections "}\n" +sections := (section "," /\n/)* section /\n/ +section := + ind "\"" header "\": {\n" (~/\n/ keyvalues)? ind "}" +header := ~ws ~/\[/ /[^\n\]]*/ ~/]/ ~ws +keyvalue := ind ind key ": " ~/=/ value +keyvalues := (keyvalue "," /\n/)* keyvalue "\n" +key := ~ws "\"" /[^; \t=\[\n]*/ "\"" ~ws +value := ~ws /"[^\n]*"/ ~ws +| ~ws "\"" escapedValue "\"" ~ws +escapedValue := (~/\\/ "\\\\" | ~/"/ "\\\"" | /[^\n]/)* +ws := /[ \t]*/ +ind := " " diff --git a/kleenex/shared/src/test/resources/kleenex/logrewrite.kex b/kleenex/shared/src/test/resources/kleenex/logrewrite.kex new file mode 100644 index 000000000..c403d8287 --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/logrewrite.kex @@ -0,0 +1,23 @@ +main := "[" loglines? 
"]\n" + +loglines := (logline "," /\n/)* logline /\n/ +logline := "{" host ~sep ~userid ~sep ~authuser sep timestamp sep + request sep code sep bytes sep referer sep useragent "}" + +host := "\"host\":\"" ip "\"" +userid := "\"user\":\"" rfc1413 "\"" +authuser := "\"authuser\":\"" /[^ \n]+/ "\"" +timestamp := "\"date\":\"" ~/\[/ /[^\n\]]+/ ~/]/ "\"" +request := "\"request\":" quotedString +code := "\"status\":\"" integer "\"" +bytes := "\"size\":\"" (integer | /-/) "\"" +referer := "\"url\":" quotedString +useragent := "\"agent\":" quotedString + +ws := /[\t ]+/ +sep := "," ~ws + +quotedString := /"([^"\n]|\\")*"/ +integer := /[0-9]+/ +ip := integer (/\./ integer){3} +rfc1413 := /-/ diff --git a/kleenex/shared/src/test/resources/kleenex/mitm.kex b/kleenex/shared/src/test/resources/kleenex/mitm.kex new file mode 100644 index 000000000..2dfd4a46c --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/mitm.kex @@ -0,0 +1,9 @@ +main := /
/ main + | /./ main + | "" + +url := q? /[^"’ >]/* q? +q := ~/"|’/ +addq := "\"" +sp := / /* +evil := addq "http://evil.com/?url=" !orig addq diff --git a/kleenex/shared/src/test/resources/kleenex/recursive.kex b/kleenex/shared/src/test/resources/kleenex/recursive.kex new file mode 100644 index 000000000..b64673f0a --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/recursive.kex @@ -0,0 +1,3 @@ +main := as | bs +as := "a" bs | 1 +bs := "b" as | 1 diff --git a/kleenex/shared/src/test/resources/kleenex/simple.kex b/kleenex/shared/src/test/resources/kleenex/simple.kex new file mode 100644 index 000000000..9433451aa --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/simple.kex @@ -0,0 +1 @@ +main := id @ /[a-z][a-z0-9]*/ !id !id diff --git a/kleenex/shared/src/test/resources/kleenex/test.kex b/kleenex/shared/src/test/resources/kleenex/test.kex new file mode 100644 index 000000000..08ebba3eb --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/test.kex @@ -0,0 +1,38 @@ +// A Kleenex program starts with what we call a pipeline declaration. +// This one can be understood: First remove the comments, +// then gather the numbers at the bottom. +start: remComments >> gatherNumbers + +// If no pipeline is specified, "main" is picked +// as the starting point. +// The most basic Kleenex term is matching. It matches +// the input against a regular expression, outputting it directly. +line := /[^\n]*\n/ +// Often you don’t want all the input turned into output. +// The ~ operator lets you suppress the output otherwise produced +// by a term, in this case removing lines that start with "#", +// and preserving ones that don’t. +// When there’s ambiguity, the leftmost choice is always chosen. +commentLine := ~(/#/ line) | line +// Recursion is allowed, but only in tail position. Here we +// terminate the recursion with "1", which consumes nothing and +// always succeeds.
+remComments := commentLine remComments | 1 + +// We also allow regex operators like *, + and ? on terms: +thousandSepLines := (thousandSep /\n/ | line)* + +// It’s possible to output text without matching by using "...". +// In this case, we use it to insert thousands separators into a number. +thousandSep := digit{1,3} ("," digit{3})* /\n/ +digit := /[0-9]/ + +// We also allow for more complicated operations. We call these ’actions’. +// reg@term runs the term as normal, but all output it would produce is +// stored in the register named reg. +// [ ... += ... ] allows you to append things to a register, both contents +// of other registers, as well as string constants. +// !reg outputs the contents of a register. +gatherNumbers := + (num@thousandSep [ numbers += num ] | line)* + !numbers diff --git a/transducers/jvm/src/test/scala/fs2/data/transducer/CharRangesSpec.scala b/transducers/jvm/src/test/scala/fs2/data/transducer/CharRangesSpec.scala new file mode 100644 index 000000000..8808b4dac --- /dev/null +++ b/transducers/jvm/src/test/scala/fs2/data/transducer/CharRangesSpec.scala @@ -0,0 +1,96 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package fs2.data.transducer
+
+import weaver._
+import weaver.scalacheck._
+
+import org.scalacheck._
+
+/** Unit and property tests for [[CharRanges]]: merging of ranges, inclusive
+  * bounds, overlap with the empty and full sets, and inversion.
+  */
+object CharRangesSpec extends SimpleIOSuite with Checkers {
+
+  // any character of the full `Char` domain
+  val aChar = Gen.choose(Char.MinValue, Char.MaxValue)
+  // a pair of arbitrary bounds; the two characters are generated independently,
+  // so the first is not guaranteed to be the smaller one
+  // NOTE(review): presumably `CharRanges.range` copes with unordered bounds - confirm
+  val aRange =
+    for {
+      c1 <- aChar
+      c2 <- aChar
+    } yield (c1, c2)
+
+  // a set of ranges built from a non-empty list of bound pairs
+  val someRanges = Gen.nonEmptyListOf(aRange).map {
+    case Nil            => CharRanges.empty
+    case r :: Nil       => CharRanges.range(r)
+    case r1 :: r2 :: rs => CharRanges.ranges(r1, r2, rs: _*)
+  }
+
+  // generated values are the full set, the empty set, or an arbitrary set of ranges
+  implicit val aCharRanges: Arbitrary[CharRanges] =
+    Arbitrary(Gen.oneOf(Gen.const(CharRanges.all), Gen.const(CharRanges.empty), someRanges))
+
+  pureTest("merge adjacent") {
+    expect(CharRanges.ranges('a' -> 'd', 'e' -> 'z') == CharRanges.range('a', 'z'))
+  }
+
+  pureTest("merge overlapping") {
+    expect(CharRanges.ranges('a' -> 'l', 'e' -> 'z') == CharRanges.range('a', 'z'))
+  }
+
+  pureTest("simplify all") {
+    expect(CharRanges.range(Char.MinValue, Char.MaxValue) == CharRanges.all)
+  }
+
+  pureTest("ranges inclusive") {
+    expect.all(
+      CharRanges.range('a' -> 'z').contains('a'),
+      CharRanges.range('a' -> 'z').contains('z'),
+      CharRanges.all.contains(Char.MinValue),
+      CharRanges.all.contains(Char.MaxValue),
+      CharRanges.char('a').contains('a')
+    )
+  }
+
+  pureTest("empty doesn't overlap all") {
+    expect.all(
+      !CharRanges.empty.overlap(CharRanges.all),
+      !CharRanges.empty.overlap(CharRanges.empty),
+      !CharRanges.all.overlap(CharRanges.empty)
+    )
+  }
+
+  test("empty overlaps nothing") {
+    forall { (ranges: CharRanges) =>
+      // check both directions so the property actually exercises the generated value
+      expect.all(
+        !CharRanges.empty.overlap(ranges),
+        !ranges.overlap(CharRanges.empty)
+      )
+    }
+  }
+
+  test("overlapping with all") {
+    forall { (ranges: CharRanges) =>
+      expect(ranges.isEmpty) || expect.all(CharRanges.all.overlap(ranges), ranges.overlap(CharRanges.all))
+    }
+  }
+
+  test("invert and back") {
+    forall { (ranges: CharRanges) =>
+      expect(ranges.invert.invert == ranges)
+    }
+  }
+
+  test("invert inverts") {
+    forall { (ranges: CharRanges, c: Char) =>
+      expect(ranges.invert.contains(c) == !ranges.contains(c))
+    }
+  }
+
+}
diff
--git a/transducers/shared/src/main/scala/fs2/data/fst/CopyFunc.scala b/transducers/shared/src/main/scala/fs2/data/fst/CopyFunc.scala new file mode 100644 index 000000000..44badf368 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/fst/CopyFunc.scala @@ -0,0 +1,100 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data +package fst + +import transducer.Func +import cats.kernel.BoundedEnumerable +import cats.Id +import cats.Show +import cats.syntax.all._ + +/** Functions that inject their arguments into the range type + * or are constant functions (i.e. ignoring their argument). 
+ */ +sealed trait CopyFunc[+T, +C] +object CopyFunc { + case object CopyArg extends CopyFunc[Nothing, Nothing] + case class CopyConst[Out](out: Out) extends CopyFunc[Nothing, Out] + + implicit def show[T: Show, C: Show]: Show[CopyFunc[T, C]] = Show.show { + case CopyFunc.CopyArg => "" + case CopyFunc.CopyConst(out) => out.show + } + + implicit def CopyFuncChar[X]: Func.Aux[CopyFunc[Char, List[Either[String, X]]], Char, List[Either[String, X]]] = + new Func[CopyFunc[Char, List[Either[String, X]]]] { + type Dom = Char + type Rng = List[Either[String, X]] + def eval(f: CopyFunc[Char, List[Either[String, X]]])(arg: Dom): Rng = + f match { + case CopyFunc.CopyArg => Left(arg.toString) :: Nil + case CopyFunc.CopyConst(out) => out + } + + def isConst(f: CopyFunc[Char, List[Either[String, X]]]): Option[Rng] = + f match { + case CopyFunc.CopyArg => None + case CopyFunc.CopyConst(out) => Some(out) + } + + def inDom(t: Char)(f: CopyFunc[Char, List[Either[String, X]]]): Boolean = true + + def domain(f: CopyFunc[Char, List[Either[String, X]]]): LazyList[Char] = BoundedEnumerable[Char].membersAscending + + } + + implicit def CopyFuncEitherListFunc[A, X](implicit + A: BoundedEnumerable[A]): Func.Aux[CopyFunc[A, List[Either[A, X]]], A, List[Either[A, X]]] = + new Func[CopyFunc[A, List[Either[A, X]]]] { + type Dom = A + type Rng = List[Either[A, X]] + + def eval(f: CopyFunc[A, List[Either[A, X]]])(arg: Dom): Rng = + f match { + case CopyArg => Left(arg) :: Nil + case CopyConst(out) => out + } + def isConst(f: CopyFunc[A, List[Either[A, X]]]): Option[Rng] = + f match { + case CopyArg => None + case CopyConst(out) => Some(out) + } + def inDom(t: Dom)(f: CopyFunc[A, List[Either[A, X]]]): Boolean = true + def domain(f: CopyFunc[A, List[Either[A, X]]]): LazyList[Dom] = A.membersAscending + } + + implicit def CopyFuncIdentityFunc[A](implicit + A: BoundedEnumerable[A]): Func.Aux[CopyFunc[A, List[Id[A]]], A, List[Id[A]]] = + new Func[CopyFunc[A, List[Id[A]]]] { + type Dom = A + type Rng = 
List[Id[A]] + + def eval(f: CopyFunc[A, List[Id[A]]])(arg: Dom): Rng = + f match { + case CopyArg => List(arg) + case CopyConst(out) => out + } + def isConst(f: CopyFunc[A, List[Id[A]]]): Option[Rng] = + f match { + case CopyArg => None + case CopyConst(out) => Some(out) + } + def inDom(t: Dom)(f: CopyFunc[A, List[Id[A]]]): Boolean = true + def domain(f: CopyFunc[A, List[Id[A]]]): LazyList[Dom] = A.membersAscending + } +} diff --git a/transducers/shared/src/main/scala/fs2/data/fst/FST.scala b/transducers/shared/src/main/scala/fs2/data/fst/FST.scala new file mode 100644 index 000000000..f37706702 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/fst/FST.scala @@ -0,0 +1,135 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2 +package data +package fst + +import transducer._ + +import cats.syntax.all._ +import cats.{Monoid, Show} + +/** Non-deterministic finite state transducer.
+ * To be well-formed, for any state, there is either + * - epsilon transitions, or + * - symbol transitions but + * - not both + */ +case class FST[Q, Pred, Fun, In, Out](initial: Q, + states: Set[Q], + edges: OrderedEdgeSet[Q, Pred, Fun, Out], + finals: Set[Q])(implicit Fun: Func.Aux[Fun, In, Out]) { + + def isChoiceState(q: Q): Boolean = + edges.forwardEpsilon.contains(q) + + def isSkipState(q: Q): Boolean = + edges.forwardEpsilon.get(q) match { + case Some(List(_)) => true + case _ => false + } + + def isJoinState(q: Q): Boolean = + edges.backward.getOrElse(q, Nil).size + edges.backwardEpsilon.getOrElse(q, Nil).size > 1 + + def enumerateStates: FST[Int, Pred, Fun, In, Out] = { + val q2int = states.toList.zipWithIndex.toMap + val intedges = OrderedEdgeSet.fromList[Int, Pred, Fun, Out](edges.toList.map { case (src, act, tgt) => + (q2int(src), act, q2int(tgt)) + }) + FST(q2int(initial), q2int.values.toSet, intedges, finals.map(q2int(_))) + } + + def evalEdges(q: Q, in: In)(implicit Pred: SetLike[Pred, In]): List[(Out, Q)] = + edges.forward.get(q) match { + case None => Nil + case Some(ts) => + ts.flatMap { case (pred, f, q) => + if (pred.contains(in)) + List((Fun.eval(f)(in), q)) + else + Nil + } + } + + def rightClosure(q: Q)(implicit Out: Monoid[Out]): List[(Out, Q)] = { + def go(q: Q, visited: Set[Q], out: Out): (Set[Q], List[(Out, Q)]) = + edges.forwardEpsilon.get(q) match { + case Some(Nil) | None => + (visited, List(out -> q)) + case Some(ts) => + ts.foldLeft((visited, List.empty[(Out, Q)])) { case ((visited, acc), (w, q1)) => + if (visited.contains(q1)) { + (visited, acc) + } else { + val (visited1, ys) = go(q1, visited + q1, out.combine(w)) + (visited1, acc ++ ys) + } + } + } + go(q, Set.empty, Out.empty)._2 + } + + def pipe[F[_]: RaiseThrowable]( + emitEarly: Boolean = true)(implicit Out: Monoid[Out], Pred: SetLike[Pred, In]): Pipe[F, In, Out] = + new FSTPipe(this, emitEarly) + +} + +object FST { + + implicit def show[Q: Show, pred: Show, F: Show, In, 
Out: Show]: Show[FST[Q, pred, F, In, Out]] = Show.show { fst => + val transitions = fst.states.toList.map { q => + if (fst.edges.forward.contains(q)) + fst.edges.forward(q).map { case (pred, f, tgt) => show"$q - $pred / $f -> $tgt" } + else + fst.edges.forwardEpsilon.getOrElse(q, Nil).map { case (out, tgt) => show"$q - / $out -> $tgt" } + } + show"""FST { + | initial = ${fst.initial} + | ${transitions.mkString_("\n ")} + | finals = ${fst.finals} + |}""".stripMargin + } + +} + +case class OrderedEdgeSet[Q, Pred, F, Out](forward: Map[Q, List[(Pred, F, Q)]], + backward: Map[Q, List[(Pred, F, Q)]], + forwardEpsilon: Map[Q, List[(Out, Q)]], + backwardEpsilon: Map[Q, List[(Out, Q)]])(implicit F: Func.Range[F, Out]) { + + def toList: List[Edge[Q, Pred, F, Out]] = { + val sym = forward.toList.flatMap { case (q, ts) => ts.map { case (pred, fun, tgt) => (q, Left((pred, fun)), tgt) } } + val eps = forwardEpsilon.toList.flatMap { case (q, ts) => ts.map { case (out, tgt) => (q, Right(out), tgt) } } + sym ++ eps + } + +} + +object OrderedEdgeSet { + + def fromList[Q, Pred, Fun, Out](edges: List[Edge[Q, Pred, Fun, Out]])(implicit + Fun: Func.Range[Fun, Out]): OrderedEdgeSet[Q, Pred, Fun, Out] = + OrderedEdgeSet( + forward = edges.collect { case (q, Left((a, b)), q1) => Map(q -> List((a, b, q1))) }.combineAll, + backward = edges.collect { case (q, Left((a, b)), q1) => Map(q1 -> List((a, b, q))) }.combineAll, + forwardEpsilon = edges.collect { case (q, Right(out), q1) => Map((q -> List((out, q1)))) }.combineAll, + backwardEpsilon = edges.collect { case (q, Right(out), q1) => Map((q1 -> List((out, q)))) }.combineAll + ) + +} diff --git a/transducers/shared/src/main/scala/fs2/data/fst/FSTPipe.scala b/transducers/shared/src/main/scala/fs2/data/fst/FSTPipe.scala new file mode 100644 index 000000000..4108d6190 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/fst/FSTPipe.scala @@ -0,0 +1,79 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2 +package data +package fst + +import transducer.SetLike + +import cats.Monoid + +case class FSTException(msg: String) extends Exception(msg) + +private class FSTPipe[F[_]: RaiseThrowable, In, Out: Monoid, Q, Pred, Fun](fst: FST[Q, Pred, Fun, In, Out], + emitEarly: Boolean)(implicit + Pred: SetLike[Pred, In]) + extends Pipe[F, In, Out] { + def apply(s: Stream[F, In]): Stream[F, Out] = { + def go(chunk: Chunk[In], idx: Int, rest: Stream[F, In], outs: List[(List[Out], Q)]): Pull[F, Out, Unit] = + if (idx >= chunk.size) { + rest.pull.uncons.flatMap { + case Some((hd, tl)) => go(hd, 0, tl, outs) + case None => + outs.filter(p => fst.finals.contains(p._2)) match { + case (outs, q) :: _ => + Pull.output(Chunk.seq(outs.reverse)) + case _ => + Pull.raiseError(FSTException("invalid input")) + } + } + } else { + val in = chunk(idx) + + def prune(visited: Set[Q], l: List[(List[Out], Q)]): List[(List[Out], Q)] = + l match { + case Nil => Nil + case (outs, q) :: rest => + if (visited.contains(q)) + prune(visited, rest) + else + (outs, q) :: prune(visited + q, rest) + } + + def close(l: List[(List[Out], Q)]) = + prune(Set.empty, + l.flatMap { case (outs, q) => fst.rightClosure(q).map { case (out, q) => (out :: outs, q) } }) + + def step(outs: List[(List[Out], Q)]) = + outs.flatMap { case (outs, q) => + fst.evalEdges(q, in).map { case (out, q) => (out :: outs, q) } + } + + val outs1 = close(step(outs)) + + outs1 match { + case List((outs, q)) if 
emitEarly && fst.finals.contains(q) => + Pull.output(Chunk.seq(outs.reverse)) >> go(chunk, idx + 1, rest, List((Nil, q))) + case _ => + go(chunk, idx + 1, rest, outs1) + } + } + + go(Chunk.empty, 0, s, fst.rightClosure(fst.initial).map { case (out, q) => (List(out), q) }).stream + } + +} diff --git a/transducers/shared/src/main/scala/fs2/data/fst/package.scala b/transducers/shared/src/main/scala/fs2/data/fst/package.scala new file mode 100644 index 000000000..ac063ab28 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/fst/package.scala @@ -0,0 +1,27 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data + +import transducer.RangeSet + +package object fst { + + type Edge[Q, Pred, F, Out] = (Q, Either[(Pred, F), Out], Q) + + type Transducer[Q, Sigma, Gamma] = FST[Q, RangeSet[Sigma], CopyFunc[Sigma, List[Gamma]], Sigma, List[Gamma]] + +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/Assignment.scala b/transducers/shared/src/main/scala/fs2/data/stt/Assignment.scala new file mode 100644 index 000000000..f07e0dbbb --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/Assignment.scala @@ -0,0 +1,56 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.stt + +import cats.Show +import cats.syntax.show._ + +sealed trait Assignment[+V <: Variable, +C] +sealed trait Reset[+C] extends Assignment[Nothing, C] +object Reset { + def unapply[C](r: Reset[C]): Some[Reset[C]] = + Some(r) +} +object Assignment { + case class Empty(x: Variable.Normal) extends Reset[Nothing] + case class Hole(x: Variable.Normal) extends Reset[Nothing] + case class Char[C](x: Variable.Normal, c: C) extends Reset[C] + case class Subtree[C](x: Variable.Normal, open: C, close: C) extends Reset[C] + case class Append[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + case class Prepend[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + case class SubstInX[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + case class SubstInY[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + case class Swap[V <: Variable](x: Variable.Normal, y: V) extends Assignment[V, Nothing] + + implicit def show[V <: Variable, C: Show]: Show[Assignment[V, C]] = Show.show { + case Empty(x) => show"$x := ε;" + case Hole(x) => show"$x := ?;" + case Char(x, c) => show"$x := $c;" + case Subtree(x, open, close) => show"$x := ⧼$open ? 
$close⧽;" + case Append(x, y) => show"""$x := $x $y; + |$y := ε;""".stripMargin + case Prepend(x, y) => show"""$x := $y $x; + |$y := ε;""".stripMargin + case SubstInX(x, y) => show"""$x := $x[$y]; + |$y := ε;""".stripMargin + case SubstInY(x, y) => show"""$x := $y[$x]; + |$y := ε;""".stripMargin + case Swap(x, y) => show"""$x := $y; + |$y := $x;""".stripMargin + } + +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/Env.scala b/transducers/shared/src/main/scala/fs2/data/stt/Env.scala new file mode 100644 index 000000000..9ab714229 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/Env.scala @@ -0,0 +1,92 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.stt + +import cats.ApplicativeError +import cats.MonadError +import cats.Show +import cats.syntax.all._ + +import scala.collection.compat._ +import scala.reflect.ClassTag + +sealed trait Variable +object Variable { + case class Normal(name: String) extends Variable + object Normal { + implicit val show: Show[Normal] = _.name + } + case class Stack(name: String) extends Variable + + implicit val show: Show[Variable] = Show.show { + case Normal(name) => name + case Stack(name) => s"${name}ₚ" + } +} + +class Env[V <: Variable, C] private (private val vars: Map[V, Expr[C]]) { + + def widen: Env[Variable, C] = + new Env(Map.empty[Variable, Expr[C]] ++ vars) + + def stackify(implicit ev: V =:= Variable.Normal): Env[Variable.Stack, C] = + new Env(vars.collect { case (Variable.Normal(n), e) => + (Variable.Stack(n), e) + }) + + def destackify: Env[Variable.Normal, C] = + new Env(vars.collect { case (v: Variable.Normal, e) => + (v, e) + }) + + def merge(that: Env[Variable.Stack, C])(implicit ev: V =:= Variable.Normal): Env[Variable, C] = + new Env((this.vars ++ that.vars).toMap) + + def call: Env[V, C] = + new Env[V, C](vars.view.mapValues { + case Expr0(_) => Expr0.Empty[C]() + case Expr1(_) => Expr1.Hole[C]() + }.toMap) { + override def call = this + } + + def lookupExpr[F[_]](name: V)(implicit F: ApplicativeError[F, Throwable]): F[Expr[C]] = + vars.get(name).liftTo[F](STTException(show"unknown variable $name in environment")) + + def lookup[F[_], E <: Expr[C]](name: V)(implicit F: MonadError[F, Throwable], E: ClassTag[E]): F[E] = + lookupExpr(name).flatMap { + case E(e) => + F.pure(e) + case _ => + F.raiseError(STTException(show"variable $name is of wrong type")) + } + + def update(name: V, e: Expr[C]): Env[V, C] = + new Env(vars.updated(name, e)) + +} + +object Env { + def create[C](names: Map[String, Type]): Env[Variable.Normal, C] = + new Env(names.map { + case (name, 
Type.Type1) => (Variable.Normal(name), Expr1.Hole[C]()) + }) + + implicit def show[V <: Variable, C: Show]: Show[Env[V, C]] = + _.vars.map { case (v, e) => show"$v -> $e" }.mkString("\n") +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/STT.scala b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala new file mode 100644 index 000000000..5922c3c86 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/STT.scala @@ -0,0 +1,265 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2 +package data +package stt + +import transducer._ + +import cats.syntax.all._ +import cats.MonadError +import cats.data.OptionT +import cats.data.Chain +import cats.Show + +case class InternalTransition[Out](target: Int, update: List[Assignment[Variable.Normal, Out]]) + +case class CallTransition[Out, StackElem](target: Int, push: StackElem, update: List[Assignment[Variable.Normal, Out]]) + +case class ReturnTransition[Out](target: Int, update: List[Assignment[Variable, Out]]) + +/** A copyless streaming tree transducer implementation. 
*/ +class STT[F[_], T[_, _], In, Out, StackElem](initial: Int, + internalTransitions: T[(Int, In), InternalTransition[Out]], + callTransitions: T[(Int, In), CallTransition[Out, StackElem]], + returnTransitions: T[(Int, StackElem, In), ReturnTransition[Out]], + finalStates: Map[Int, Expr0[Out]], + variables: Map[String, Type])(implicit + F: MonadError[F, Throwable], + T: Table[T], + In: HasTag[In], + showIn: Show[In], + showOut: Show[Out]) + extends Pipe[F, In, Out] { + + def isFinal(state: Int): Boolean = finalStates.contains(state) + + def update[V <: Variable](env: Env[Variable, Out], + assignments: List[Assignment[V, Out]]): F[Env[Variable.Normal, Out]] = { + def loop(assignments: List[Assignment[V, Out]], env: Env[Variable, Out]): F[Env[Variable.Normal, Out]] = + assignments match { + case assignment :: rest => + assignment match { + case Assignment.Empty(x) => + loop(rest, env.update(x, Expr0.Empty())) + case Assignment.Hole(x) => + loop(rest, env.update(x, Expr1.Hole())) + case Assignment.Char(x, c) => + loop(rest, env.update(x, Expr0.Char(c))) + case Assignment.Subtree(x, open, close) => + loop(rest, env.update(x, Expr1.Subtree(open, Expr1.Hole(), close))) + case Assignment.Append(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { + case (Expr0(xe), Expr0(ye)) => + loop(rest, env.update(x, Expr0.Concat(xe, ye)).update(y, Expr0.Empty())) + case (Expr1(xe), Expr0(ye)) => + loop(rest, env.update(x, Expr1.Concat10(xe, ye)).update(y, Expr0.Empty())) + case (Expr0(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr1.Concat01(xe, ye)).update(y, Expr1.Hole())) + case (Expr1(xe), Expr1(ye)) => + F.raiseError(STTException("cannot append an expression of type 1 to another expression of type 1")) + } + case Assignment.Prepend(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { + case (Expr0(xe), Expr0(ye)) => + loop(rest, env.update(x, Expr0.Concat(ye, xe)).update(y, Expr0.Empty())) + case (Expr1(xe), Expr0(ye)) => + loop(rest, 
env.update(x, Expr1.Concat01(ye, xe)).update(y, Expr0.Empty())) + case (Expr0(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr1.Concat10(ye, xe)).update(y, Expr1.Hole())) + case (Expr1(xe), Expr1(ye)) => + F.raiseError(STTException("cannot prepend an expression of type 1 to another expression of type 1")) + } + case Assignment.SubstInX(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { + case (Expr1(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr1.Subst(xe, ye)).update(y, Expr1.Hole())) + case (Expr1(xe), Expr0(ye)) => + loop(rest, env.update(x, Expr0.Subst(xe, ye)).update(y, Expr0.Empty())) + case (Expr0(xe), ye) => + F.raiseError(STTException(show"cannot substitute in an expression of type 0: $xe")) + } + case Assignment.SubstInY(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { + case (Expr1(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr1.Subst(ye, xe)).update(y, Expr1.Hole())) + case (Expr0(xe), Expr1(ye)) => + loop(rest, env.update(x, Expr0.Subst(ye, xe)).update(y, Expr0.Empty())) + case (xe, Expr0(ye)) => + F.raiseError(STTException(show"cannot substitute in an expression of type 0: $ye")) + } + case Assignment.Swap(x, y) => + (env.lookupExpr[F](x), env.lookupExpr[F](y)).tupled.flatMap { case (xe, ye) => + loop(rest, env.update(x, ye).update(y, xe)) + } + } + case Nil => F.pure(env.destackify) + } + loop(assignments, env) + } + + private def step(q: Int, stack: List[(StackElem, Env[Variable.Stack, Out])], env: Env[Variable.Normal, Out])( + in: In): OptionT[F, (Int, List[(StackElem, Env[Variable.Stack, Out])], Env[Variable.Normal, Out])] = + in match { + case Tag.Internal() => + OptionT.fromOption(internalTransitions.get(q -> in)).semiflatMap { case InternalTransition(q1, upd) => + update(env.widen, upd).map((q1, stack, _)) + } + case Tag.Call() => + OptionT.fromOption(callTransitions.get(q -> in)).semiflatMap { case CallTransition(q1, p, upd) => + update(env.widen, upd).map(env => (q1, (p, env.stackify) :: 
stack, env.call)) + } + case Tag.Return() => + stack match { + case (p, env1) :: stack => + OptionT.fromOption(returnTransitions.get((q, p, in))).semiflatMap { case ReturnTransition(q1, upd) => + update(env.merge(env1), upd).map((q1, stack, _)) + } + case Nil => + OptionT.liftF( + F.raiseError(STTException("inconsistent stack state. Input is probably not a well-formed tree"))) + } + } + + def eval0(env: Env[Variable.Normal, Out], e: Expr0[Out]): F[Chain[Out]] = { + def loop(e: Expr0[Out], acc: Chain[Out]): F[Chain[Out]] = { + e match { + case Expr0.Empty() => + F.pure(acc) + case Expr0.Var(x) => + env.lookup[F, Expr0[Out]](x).flatMap(loop(_, acc)) + case Expr0.Char(c) => + F.pure(acc.append(c)) + case Expr0.Subtree(open, sub, close) => + loop(sub, acc.append(open)).map(_.append(close)) + case Expr0.Concat(left, right) => + for { + acc <- loop(left, acc) + acc <- loop(right, acc) + } yield acc + case Expr0.Subst(inner, arg) => + loop(inner.subst(arg), acc) + } + } + loop(e, Chain.empty) + } + + def apply(s: Stream[F, In]): Stream[F, Out] = { + def go(chunk: Chunk[In], + idx: Int, + rest: Stream[F, In], + state: Int, + stack: List[(StackElem, Env[Variable.Stack, Out])], + env: Env[Variable.Normal, Out], + lastKnownFinal: Option[(Int, Env[Variable.Normal, Out])], + accSinceLastFinal: Chain[In], + chunkAcc: Chain[Out]): Pull[F, Out, Unit] = + if (idx >= chunk.size) { + Pull.output(Chunk.chain(chunkAcc)) >> rest.pull.uncons.flatMap { + case Some((hd, tl)) => + go(hd, 0, tl, state, stack, env, lastKnownFinal, accSinceLastFinal, Chain.empty) + case None => + // we are at the end of the input + // did we reach a final state? 
+ lastKnownFinal match { + case Some((state, env)) => + // we did reach a final state, emit the outputs from the last one reached + // and push back the input read since into the stream, then proceed. + // chunkAcc has already been emitted right before uncons, so only the new + // outputs are carried over; passing chunkAcc again would output it twice + Pull + .eval(eval0(env, finalStates(state))) + .flatMap(outs => + go(Chunk.chain(accSinceLastFinal), + 0, + Stream.empty, + initial, + Nil, + Env.create(variables), + None, + Chain.empty, + outs)) + case None => + // we did not reach a final state, do we have leftover inputs? + if (accSinceLastFinal.isEmpty) { + // no we don't, everything has been processed, stop here + Pull.done + } else { + // we do have unprocessed inputs, this is an error + Pull.raiseError(STTException("malformed input")) + } + } + } + } else { + val in = chunk(idx) + Pull + // try to step with the current input character + .eval(step(state, stack, env)(in).value) + .flatMap { + case Some((state, stack, env)) if isFinal(state) => + // we can step and the target state is final, + // register this as the last encountered final state, + // reinitialize the input buffer to empty, and proceed + // consume the input symbol + go(chunk, idx + 1, rest, state, stack, env, (state, env).some, Chain.empty, chunkAcc) + case Some((state, stack, env)) => + // we can step and the target state is NOT final, + // add the just read input into the buffer of read + // inputs since last final state, and proceed + go(chunk, idx + 1, rest, state, stack, env, lastKnownFinal, accSinceLastFinal.append(in), chunkAcc) + case None => + // we cannot step from this state, is it final? 
+ finalStates.get(state) match { + case Some(finalExpr) if accSinceLastFinal.nonEmpty => + // it is a final state, emit the associated output, reset buffer, + // reset to initial state, and proceed without consuming the input + Pull + .eval(eval0(env, finalExpr)) + .flatMap(outs => + go(chunk, idx, rest, initial, Nil, Env.create(variables), None, Chain.empty, chunkAcc ++ outs)) + case _ => + // it is not a final state + lastKnownFinal match { + case Some((state, env)) => + // we reached a final state before, let's emit what should have been emitted + // there, and push the input buffer back to the stream, + // as well as unconsumed current chunk + Pull + .eval(eval0(env, finalStates(state))) + .flatMap(outs => + go(Chunk.chain(accSinceLastFinal), + 0, + Stream.chunk(chunk.drop(idx)) ++ rest, + initial, + Nil, + Env.create(variables), + None, + Chain.empty, + chunkAcc ++ outs)) + case None => + // there is no known final, we will emit nothing and just fail + Pull.output(Chunk.chain(chunkAcc)) >> Pull.raiseError( + STTException( + show"malformed input, prefix ${(accSinceLastFinal :+ in).mkString_(", ")} is not accepted")) + } + } + } + } + + go(Chunk.empty, 0, s, initial, List.empty, Env.create(variables), None, Chain.empty, Chain.empty).stream + + } + +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/STTException.scala b/transducers/shared/src/main/scala/fs2/data/stt/STTException.scala new file mode 100644 index 000000000..ed4306e0c --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/STTException.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.stt + +case class STTException(msg: String, inner: Throwable = null) extends Exception(msg, inner) diff --git a/transducers/shared/src/main/scala/fs2/data/stt/expressions.scala b/transducers/shared/src/main/scala/fs2/data/stt/expressions.scala new file mode 100644 index 000000000..5002d7523 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/expressions.scala @@ -0,0 +1,101 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.stt + +import cats.Show +import cats.syntax.show._ + +sealed trait Type +object Type { + case object Type0 extends Type + case object Type1 extends Type +} + +sealed trait Expr[C] { + val tpe: Type +} +object Expr { + implicit def show[C: Show]: Show[Expr[C]] = Show.show { + case Expr0(e) => e.show + case Expr1(e) => e.show + } +} + +sealed trait Expr0[C] extends Expr[C] { + val tpe = Type.Type0 + def ~(that: Expr0[C]): Expr0[C] = Expr0.Concat(this, that) +} +object Expr0 { + case class Empty[C]() extends Expr0[C] + case class Var[C](x: Variable.Normal) extends Expr0[C] + case class Char[C](c: C) extends Expr0[C] + case class Subtree[C](open: C, sub: Expr0[C], close: C) extends Expr0[C] + case class Concat[C](left: Expr0[C], right: Expr0[C]) extends Expr0[C] + case class Subst[C](inner: Expr1[C], arg: Expr0[C]) extends Expr0[C] + + def unapply[C](e: Expr0[C]): Some[Expr0[C]] = + Some(e) + + implicit def show[C: Show]: Show[Expr0[C]] = Show.show { + case Empty() => "ε" + case Var(x) => x.show + case Char(c) => c.show + case Subtree(open, sub, close) => show"⧼$open $sub $close⧽" + case Concat(left, right) => show"$left $right" + case Subst(inner, arg) => show"$inner[$arg]" + } +} + +sealed trait Expr1[C] extends Expr[C] { + val tpe = Type.Type1 + def subst(e: Expr0[C]): Expr0[C] = + this match { + case Expr1.Hole() => e + case Expr1.Subtree(open, sub, close) => Expr0.Subtree(open, sub.subst(e), close) + case Expr1.Concat01(left, right) => Expr0.Concat(left, right.subst(e)) + case Expr1.Concat10(left, right) => Expr0.Concat(left.subst(e), right) + case Expr1.Subst(inner, arg) => inner.subst(arg).subst(e) + } + + def subst(e: Expr1[C]): Expr1[C] = + this match { + case Expr1.Hole() => e + case Expr1.Subtree(open, sub, close) => Expr1.Subtree(open, sub.subst(e), close) + case Expr1.Concat01(left, right) => Expr1.Concat01(left, right.subst(e)) + case Expr1.Concat10(left, right) => Expr1.Concat10(left.subst(e), right) + case Expr1.Subst(inner, 
arg) => inner.subst(arg).subst(e) + } + +} +object Expr1 { + case class Hole[C]() extends Expr1[C] + case class Subtree[C](open: C, sub: Expr1[C], close: C) extends Expr1[C] + case class Concat01[C](left: Expr0[C], right: Expr1[C]) extends Expr1[C] + case class Concat10[C](left: Expr1[C], right: Expr0[C]) extends Expr1[C] + case class Subst[C](inner: Expr1[C], arg: Expr1[C]) extends Expr1[C] + + def unapply[C](e: Expr1[C]): Some[Expr1[C]] = + Some(e) + + implicit def show[C: Show]: Show[Expr1[C]] = Show.show { + case Hole() => "?" + case Subtree(open, sub, close) => show"⧼$open $sub $close⧽" + case Concat01(left, right) => show"$left $right" + case Concat10(left, right) => show"$left $right" + case Subst(inner, arg) => show"$inner[$arg]" + } +} diff --git a/transducers/shared/src/main/scala/fs2/data/stt/tags.scala b/transducers/shared/src/main/scala/fs2/data/stt/tags.scala new file mode 100644 index 000000000..d3ff3c65e --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/stt/tags.scala @@ -0,0 +1,43 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.stt + +import scala.annotation.implicitNotFound + +sealed trait Tag { + def unapply[C](c: C)(implicit C: HasTag[C]): Boolean = + C.tag(c) == this +} +object Tag { + case object Call extends Tag + case object Return extends Tag + case object Internal extends Tag +} + +/** Typeclass indicating that the characters of type `C` + * can be tagged as: + * - call (e.g. opening tag), + * - return (e.g. closing tag), + * - internal (non structuring character). + */ +@implicitNotFound( + "Cannot prove that type ${C} has tags. Make sure to provide an implicit instance of `fs2.data.stt.HasTag[${C}]` in scope") +trait HasTag[C] { + def tag(c: C): Tag + + def unapply(c: C): Some[Tag] = Some(tag(c)) +} diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/CharRanges.scala b/transducers/shared/src/main/scala/fs2/data/transducer/CharRanges.scala new file mode 100644 index 000000000..92e1d3e95 --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/transducer/CharRanges.scala @@ -0,0 +1,43 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.transducer + +object CharRanges { + + /** The empty set of character ranges */ + val empty: CharRanges = RangeSet.empty + + /** The set that contains all characters */ + val all: CharRanges = RangeSet.all + + /** Creates a singleton set of a singleton range */ + def char(c: Char): CharRanges = + RangeSet.char(c) + + /** Creates a set of ranges based on the provided single characters */ + def chars(c1: Char, c2: Char, cs: Char*): CharRanges = + RangeSet.chars(c1, c2, cs: _*) + + /** Creates a singleton set of ranges */ + def range(r: (Char, Char)): CharRanges = + RangeSet.range(r) + + /** Creates a set of ranges */ + def ranges(r1: (Char, Char), r2: (Char, Char), rs: (Char, Char)*): CharRanges = + RangeSet.ranges(r1, r2, rs: _*) + +} diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/Func.scala b/transducers/shared/src/main/scala/fs2/data/transducer/Func.scala new file mode 100644 index 000000000..87a6958ef --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/transducer/Func.scala @@ -0,0 +1,42 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package fs2.data.transducer + +trait Func[F] { + type Dom + type Rng + + def eval(f: F)(arg: Dom): Rng + def isConst(f: F): Option[Rng] + def inDom(t: Dom)(f: F): Boolean + def domain(f: F): LazyList[Dom] +} + +object Func { + type Range[F, R] = Func[F] { + type Rng = R + } + + type Domain[F, D] = Func[F] { + type Dom = D + } + + type Aux[F, D, R] = Func[F] { + type Dom = D + type Rng = R + } +} diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/RangeSet.scala b/transducers/shared/src/main/scala/fs2/data/transducer/RangeSet.scala new file mode 100644 index 000000000..8b10faf8e --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/transducer/RangeSet.scala @@ -0,0 +1,195 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.transducer + +import cats.Order +import cats.syntax.all._ +import cats.Show +import cats.data.NonEmptyList +import cats.data.NonEmptyVector +import cats.kernel.BoundedEnumerable + +/** A set of ranges for some enumerable type. */ +sealed trait RangeSet[T] { + + /** Indicates whether this set of ranges contains the given character. */ + def contains(c: T): Boolean + + /** Inverts this set of character ranges. + * Forall c, this.contains(c) == !this.invert.contains(c) + */ + def invert: RangeSet[T] + + /** Enumerates all characters in this set of ranges, in ascending order. 
*/ + def enumerate: LazyList[T] + + /** Returns the minimal element in this set. */ + def min: Option[T] + + /** Returns the maximal element in this set. */ + def max: Option[T] + + /** Indicates whether this set contains no characters. */ + def isEmpty: Boolean + + /** Indicates whether `this` overlaps with `that`. + * Returns `true` iff there exists `t`, such that + * `this.contains(t) && that.contains(t)` + */ + def overlap(that: RangeSet[T]): Boolean + +} + +object RangeSet { + + /** The empty set of character ranges */ + def empty[T: BoundedEnumerable]: RangeSet[T] = Empty() + + /** The set that contains all characters */ + def all[T: BoundedEnumerable]: RangeSet[T] = All() + + /** Creates a singleton set of a singleton range */ + def char[T: BoundedEnumerable](c: T): RangeSet[T] = + Ranges(NonEmptyVector.one(Range(c, c)), false) + + /** Creates a set of ranges based on the provided single characters */ + def chars[T: BoundedEnumerable](c1: T, c2: T, cs: T*): RangeSet[T] = + ranges((c1, c1), (c2, c2), cs.map(c => (c, c)): _*) + + /** Creates a singleton set of ranges; a range spanning the whole domain of `T` yields the `All` set */ + def range[T](r: (T, T))(implicit T: BoundedEnumerable[T]): RangeSet[T] = { + implicit val order = T.order + val lower = r._1.min(r._2) + val upper = r._1.max(r._2) + if (lower == T.minBound && upper == T.maxBound) + All() + else + Ranges(NonEmptyVector.one(Range(lower, upper)), false) + } + + /** Creates a set of ranges */ + def ranges[T](r1: (T, T), r2: (T, T), rs: (T, T)*)(implicit T: BoundedEnumerable[T]): RangeSet[T] = { + implicit val order = T.order + val ranges = + NonEmptyList(r1, r2 :: rs.toList) + .map { case (c1, c2) => Range(c1.min(c2), c1.max(c2)) } + .sortBy(_.lower) + def merge(ranges: NonEmptyList[Range[T]]): NonEmptyList[Range[T]] = + ranges match { + case NonEmptyList(r1, r2 :: ranges) if r1.overlapsOrAdjacent(r2) => merge(NonEmptyList(r1.merge(r2), ranges)) + case NonEmptyList(r1, r2 :: ranges) => r1 :: merge(NonEmptyList(r2, ranges)) + case NonEmptyList(r1, Nil) 
=> NonEmptyList.one(r1) + } + merge(ranges) match { + case NonEmptyList(Range(lower, upper), Nil) if lower == T.minBound && upper == T.maxBound => All() + case merged => Ranges(merged.toNev, false) + } + + } + + private case class Range[T](lower: T, upper: T)(implicit T: BoundedEnumerable[T]) { + implicit val order = T.order + def contains(c: T): Boolean = + lower <= c && upper >= c + def overlapsOrAdjacent(that: Range[T]): Boolean = /* no wrap-around: a missing neighbour at a bound makes that side of the check trivially true */ + T.partialPrevious(that.lower).forall(this.upper >= _) && T.partialNext(that.upper).forall(this.lower <= _) + def merge(that: Range[T]): Range[T] = + Range(this.lower.min(that.lower), this.upper.max(that.upper)) + def enumerate: LazyList[T] = /* stops once past `upper` or at the maximal bound instead of cycling back to the minimum */ + LazyList.iterate(lower.some)(_.flatMap(T.partialNext(_))).takeWhile(_.exists(_ <= upper)).collect { case Some(c) => c } + } + + private object Range { + implicit def order[T](implicit T: Order[T]): Order[Range[T]] = Order.from { (x: Range[T], y: Range[T]) => + val mincmp = T.compare(x.lower, y.lower) + if (mincmp == 0) + T.compare(x.upper, y.upper) + else + mincmp + + } + implicit def show[T: Show]: Show[Range[T]] = Show.show { t => + if (t.lower == t.upper) + t.lower.show + else + show"${t.lower}-${t.upper}" + } + } + + private case class All[T]()(implicit T: BoundedEnumerable[T]) extends RangeSet[T] { + def contains(c: T): Boolean = true + def invert: RangeSet[T] = Empty() + def enumerate: LazyList[T] = T.membersAscending + def min: Option[T] = T.minBound.some + def max: Option[T] = T.maxBound.some + def overlap(that: RangeSet[T]): Boolean = that != Empty() + def isEmpty: Boolean = false + } + + private case class Empty[T: BoundedEnumerable]() extends RangeSet[T] { + def contains(c: T): Boolean = false + def invert: RangeSet[T] = All() + def enumerate: LazyList[T] = LazyList.empty + def min: Option[T] = None + def max: Option[T] = None + def overlap(that: RangeSet[T]): Boolean = false + def isEmpty: Boolean = true + } + + private case class Ranges[T](ranges: NonEmptyVector[Range[T]], inverted: Boolean) extends RangeSet[T] { + implicit val order = ranges.head.order + def contains(c: T): Boolean = { + 
def search(low: Int, high: Int): Boolean = + if (low > high) { + inverted + } else { + val mid = (low + high) / 2 + val range = ranges.getUnsafe(mid) + if (range.contains(c)) + !inverted + else if (c < range.lower) + search(low, mid - 1) + else + search(mid + 1, high) + } + search(0, ranges.length - 1) + } + def invert: RangeSet[T] = copy(inverted = !inverted) + def enumerate: LazyList[T] = LazyList.from(ranges.iterator).flatMap(_.enumerate) + def min: Option[T] = ranges.head.lower.some + def max: Option[T] = ranges.last.upper.some + def overlap(that: RangeSet[T]): Boolean = + that match { + case All() => true + case Empty() => false + case _ => enumerate.exists(that.contains(_)) + } + def isEmpty: Boolean = false + } + + implicit def RangeSetShow[T: Show]: Show[RangeSet[T]] = Show.show { + case Empty() => "ε" + case All() => "*" + case Ranges(ranges, false) => ranges.mkString_("[", "", "]") + case Ranges(ranges, true) => ranges.mkString_("[^", "", "]") + } + + implicit def RangeSetSetLike[T]: SetLike[RangeSet[T], T] = + new SetLike[RangeSet[T], T] { + def contains(s: RangeSet[T])(c: T): Boolean = s.contains(c) + } +} diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/SetLike.scala b/transducers/shared/src/main/scala/fs2/data/transducer/SetLike.scala new file mode 100644 index 000000000..6563adecd --- /dev/null +++ b/transducers/shared/src/main/scala/fs2/data/transducer/SetLike.scala @@ -0,0 +1,25 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2.data.transducer
+
+import scala.annotation.implicitNotFound
+
+/** A typeclass indicating that a value of type `S` can be asked whether
+  * it contains a value of type `C`.
+  */
+@implicitNotFound(
+  "Could not prove that ${S} can be used as a set of ${C}. Please make sure you provide an implicit `fs2.data.transducer.SetLike[${S}, ${C}]` in scope.")
+trait SetLike[S, C] {
+
+  /** Indicates whether `s` contains `c`. */
+  def contains(s: S)(c: C): Boolean
+}
diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/Table.scala b/transducers/shared/src/main/scala/fs2/data/transducer/Table.scala
new file mode 100644
index 000000000..9571c41c1
--- /dev/null
+++ b/transducers/shared/src/main/scala/fs2/data/transducer/Table.scala
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2.data.transducer
+
+import scala.annotation.implicitNotFound
+
+/** A typeclass indicating that some type `T` can be used as a lookup table.
+  *
+  * A lookup yields at most one value; the ordered lookup inherited from
+  * [[NTable]] is derived from it and yields at most one element.
+  */
+@implicitNotFound(
+  "Cannot prove that type ${T} can be used as a lookup table. Make sure to provide an implicit instance of `fs2.data.transducer.Table[${T}]` in scope")
+trait Table[T[_, _]] extends NTable[T] {
+
+  /** Returns the value associated with `from` in `t`, if any. */
+  def get[From, To](t: T[From, To])(from: From): Option[To]
+
+  /** Returns the result of `get` as a list of zero or one element. */
+  def getOrdered[From, To](t: T[From, To])(from: From): List[To] =
+    get(t)(from).fold(List.empty[To])(_ :: Nil)
+}
+
+object Table {
+
+  /** Partial functions are tables defined exactly where the function is. */
+  implicit object PartialFunctionTable extends Table[PartialFunction] {
+    def get[From, To](m: PartialFunction[From, To])(from: From): Option[To] =
+      Some(from).collect(m)
+  }
+
+  /** Maps are tables that delegate the lookup to `Map.get`. */
+  implicit object MapTable extends Table[Map] {
+    def get[From, To](m: Map[From, To])(from: From): Option[To] = {
+      val found = m.get(from)
+      found
+    }
+  }
+}
+
+/** A typeclass indicating that some type `T` can be used as a lookup table
+  * that yields a list of values for a key.
+  */
+@implicitNotFound(
+  "Cannot prove that type ${T} can be used as a non deterministic lookup table. Make sure to provide an implicit instance of `fs2.data.transducer.NTable[${T}]` in scope")
+trait NTable[T[_, _]] {
+
+  /** Returns the values associated with `f` in `t`, as a list. */
+  def getOrdered[From, To](t: T[From, To])(f: From): List[To]
+}
diff --git a/transducers/shared/src/main/scala/fs2/data/transducer/package.scala b/transducers/shared/src/main/scala/fs2/data/transducer/package.scala
new file mode 100644
index 000000000..382d0b2a9
--- /dev/null
+++ b/transducers/shared/src/main/scala/fs2/data/transducer/package.scala
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2.data
+
+/** Type aliases and extension methods for the `fs2.data.transducer` package. */
+package object transducer {
+
+  /** A [[RangeSet]] whose members are `Char`s. */
+  type CharRanges = RangeSet[Char]
+
+  /** Adds a `get` method to values of a binary type constructor `T`;
+    * the call requires an implicit [[Table]] instance for `T`.
+    */
+  implicit class TableOps[T[_, _], From, To](val m: T[From, To]) extends AnyVal {
+    // delegates to the `Table` instance resolved at the call site
+    def get(from: From)(implicit M: Table[T]): Option[To] =
+      M.get(m)(from)
+  }
+
+  /** Adds a `contains` method to values of type `S`;
+    * the call requires an implicit [[SetLike]] instance for `S` and the element type.
+    */
+  implicit class SetLikeOps[S](val s: S) extends AnyVal {
+    // delegates to the `SetLike` instance resolved at the call site
+    def contains[C](c: C)(implicit S: SetLike[S, C]): Boolean =
+      S.contains(s)(c)
+  }
+
+}
diff --git a/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala
new file mode 100644
index 000000000..da4cda63f
--- /dev/null
+++ b/transducers/shared/src/test/scala/fs2/data/stt/STTSpec.scala
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2
+package data
+package stt
+
+import weaver._
+import cats.Show
+import cats.effect.IO
+
+/** Test fixture: the tokens of a tree, given as opening, closing and leaf elements. */
+sealed trait Tree
+object Tree {
+  case class Open(name: String) extends Tree
+  case class Close(name: String) extends Tree
+  case class Leaf(value: Int) extends Tree
+
+  // renders `Open` as `<name`, `Close` as `name>` and `Leaf` as its integer value
+  implicit val show: Show[Tree] = Show.show {
+    case Open(name)  => s"<$name"
+    case Close(name) => s"$name>"
+    case Leaf(v)     => v.toString
+  }
+
+  // tags `Open` as a call, `Close` as a return and `Leaf` as an internal symbol
+  implicit val hasTags: HasTag[Tree] = {
+    case Open(_)  => Tag.Call
+    case Close(_) => Tag.Return
+    case Leaf(_)  => Tag.Internal
+  }
+
+}
+
+object STTSpec extends SimpleIOSuite {
+
+  val leaf0: Tree = Tree.Leaf(0)
+  val leaf1: Tree = Tree.Leaf(1)
+  val openA: Tree = Tree.Open("a")
+  val closeA: Tree = Tree.Close("a")
+  val openB: Tree = Tree.Open("b")
+  val closeB: Tree = Tree.Close("b")
+
+  val x = Variable.Normal("x")
+  val y = Variable.Normal("y")
+  val z = Variable.Normal("z")
+  val xp = Variable.Stack("x")
+
+  test("reverse tree (map)") {
+
+    import Assignment._
+    val internalTransition =
+      Map[(Int, Tree), InternalTransition[Tree]]((0, leaf0) ->
+                                                   InternalTransition(0, List(Char(y, leaf0), Prepend(x, y))),
+                                                 (0, leaf1) ->
+                                                   InternalTransition(0, List(Char(y, leaf1), Prepend(x, y))))
+    val callTransition = Map[(Int, Tree), CallTransition[Tree, Tree]]((0, openA) ->
+                                                                        CallTransition(0, openA, Nil),
+                                                                      (0, openB) ->
+                                                                        CallTransition(0, openB, Nil))
+    val returnTransition = Map[(Int, Tree, Tree), ReturnTransition[Tree]](
+      (0, openA, closeA) ->
+        ReturnTransition(0, List(Subtree(z, openA, closeA), SubstInY(x, z), Append(x, xp))),
+      (0, openB, closeB) ->
+        ReturnTransition(0, List(Subtree(z, openB, closeB), SubstInY(x, z), Append(x, xp)))
+    )
+    val finalStates = Map(0 -> Expr0.Var[Tree](x))
+    val reverse =
+      new STT[IO, Map, Tree, Tree, Tree](0,
+                                         internalTransition,
+                                         callTransition,
+                                         returnTransition,
+                                         finalStates,
+                                         Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1))
+
+    Stream(openA, openB, leaf0, closeB, leaf1, closeA, openB, closeB)
+ .rechunkRandomly() + .through(reverse) + .compile + .toList + .map(result => expect(result == List(openB, closeB, openA, leaf1, openB, leaf0, closeB, closeA))) + } + + test("reverse tree (symbolic)") { + + import Assignment._ + val internalTransition: PartialFunction[(Int, Tree), InternalTransition[Tree]] = { case (0, l @ Tree.Leaf(_)) => + InternalTransition(0, List(Char(y, l), Prepend(x, y))) + } + val callTransition: PartialFunction[(Int, Tree), CallTransition[Tree, Tree]] = { case (0, o @ Tree.Open(_)) => + CallTransition(0, o, Nil) + } + val returnTransition: PartialFunction[(Int, Tree, Tree), ReturnTransition[Tree]] = { + case (0, open @ Tree.Open(nopen), close @ Tree.Close(nclose)) if nopen == nclose => + ReturnTransition(0, List(Subtree(z, open, close), SubstInY(x, z), Append(x, xp))) + } + val finalStates = Map(0 -> Expr0.Var[Tree](x)) + val reverse = + new STT[IO, PartialFunction, Tree, Tree, Tree](0, + internalTransition, + callTransition, + returnTransition, + finalStates, + Map("x" -> Type.Type0, "y" -> Type.Type0, "z" -> Type.Type1)) + + Stream(openA, openB, leaf0, closeB, leaf1, closeA, openB, closeB) + .rechunkRandomly() + .through(reverse) + .compile + .toList + .map(result => expect(result == List(openB, closeB, openA, leaf1, openB, leaf0, closeB, closeA))) + } + + test("emit until error") { + import Assignment._ + val internalTransition = + Map[(Int, Tree), InternalTransition[Tree]]() + val callTransition = Map[(Int, Tree), CallTransition[Tree, Unit]]( + (0, openA) -> + CallTransition(0, (), List(Char(y, openA), Append(x, y)))) + val returnTransition = Map[(Int, Unit, Tree), ReturnTransition[Tree]]( + (0, (), closeA) -> + ReturnTransition(0, List(Char(y, closeA), Append(x, y), Prepend(x, xp))) + ) + val finalStates = Map(0 -> Expr0.Var[Tree](x)) + val onlyA = new STT[IO, Map, Tree, Tree, Unit](0, + internalTransition, + callTransition, + returnTransition, + finalStates, + Map("x" -> Type.Type0, "y" -> Type.Type0)) + Stream(openA, openA, closeA, 
closeA, openB, closeB) + .through(onlyA) + .attempt + .compile + .toList + .map { result => + expect( + result == List(Right(openA), + Right(openA), + Right(closeA), + Right(closeA), + Left(STTException("malformed input, prefix