diff --git a/build.sbt b/build.sbt index 124987001..782b19d80 100644 --- a/build.sbt +++ b/build.sbt @@ -119,7 +119,8 @@ val root = (project in file(".")) jsonPlay.js, text.js, xml.js, - transducers.js), + transducers.js, + kleenex.js), ScalaUnidoc / siteSubdirName := "api", addMappingsToSiteDir(ScalaUnidoc / packageDoc / mappings, ScalaUnidoc / siteSubdirName), Nanoc / sourceDirectory := file("site"), @@ -145,7 +146,9 @@ val root = (project in file(".")) cbor.jvm, cbor.js, transducers.jvm, - transducers.js + transducers.js, + kleenex.jvm, + kleenex.js ) lazy val text = crossProject(JVMPlatform, JSPlatform) @@ -319,6 +322,18 @@ lazy val transducers = crossProject(JVMPlatform, JSPlatform) description := "Streaming transducers library" ) +lazy val kleenex = crossProject(JVMPlatform, JSPlatform) /* new kleenex module, cross-built for the JVM and JS platforms */ + .crossType(CrossType.Full) + .in(file("kleenex")) + .settings(commonSettings) + .settings(publishSettings) + .settings( + name := "fs2-data-kleenex", + description := "Streaming text processing library", + libraryDependencies += "org.typelevel" %%% "cats-parse" % "0.3.6" /* parser combinators used by the kleenex sources added in this patch */ + ) + .dependsOn(text, transducers) /* builds on the existing text and transducers modules */ + lazy val documentation = project .in(file("documentation")) .enablePlugins(MdocPlugin) diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala new file mode 100644 index 000000000..057075a09 --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Action.scala @@ -0,0 +1,32 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2.data.kleenex
+
+import cats.Show
+
+sealed trait Action
+object Action {
+  case object Push extends Action
+  case class Pop(reg: String) extends Action
+  case class Write(reg: String) extends Action
+
+  implicit val show: Show[Action] = Show.show {
+    case Push => "push"
+    case Pop(r) => s"pop $$$r"
+    case Write(r) => s"write $$$r"
+  }
+}
diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala
new file mode 100644
index 000000000..530a59371
--- /dev/null
+++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Check.scala
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2.data.kleenex
+
+import cats.data.NonEmptyList
+import cats.parse.Caret
+import cats.syntax.all._
+import cats.data.StateT
+import cats.MonadError
+import fs2.data.kleenex.core.KleenexCompilerException
+import scala.annotation.tailrec
+
+/** Static checks run on a kleenex [[Program]] before it is compiled.
+  *
+  * Productions are grouped into the strongly connected components of the
+  * reference graph; references within a component are only accepted in
+  * tail position.
+  */
+class Checker[F[_]](implicit F: MonadError[F, Throwable]) {
+
+  /** Fails with a `KleenexCompilerException` if a production references an unknown
+    * identifier or references a production of its own component in non tail position.
+    */
+  def check(prog: Program): F[Unit] = {
+    val declMap = prog.productions.toList.map { case p @ Production(name, t) => (name, (p.pos, t)) }.toMap
+    scc(declMap).flatMap { components =>
+      components.traverse_ { component =>
+        // productions having at least one strict dependency inside their own component
+        val offending = component.toList
+          .mapFilter { id =>
+            declMap.get(id).collect {
+              case (pos, t) if strictDependencies(t).keysIterator.exists(component.contains(_)) => (id, pos)
+            }
+          }
+          // report in source order so that the error message is deterministic
+          .sortBy { case (_, pos) => pos.offset }
+        if (offending.nonEmpty) {
+          val details = offending.map { case (id, pos) => s"$id (at line ${pos.line + 1})" }.mkString("\n")
+          F.raiseError[Unit](
+            KleenexCompilerException(s"Following productions contain non tail recursive calls:\n$details"))
+        } else
+          F.unit
+      }
+    }
+  }
+
+  // names of all the productions referenced anywhere in `term`
+  private def successors(term: Term): List[String] =
+    termIdents(term).keys.toList
+
+  // all referenced production names, with the positions at which they occur
+  private def termIdents(t: Term): Map[String, Set[Caret]] =
+    t match {
+      case Term.Var(name) => Map(name -> Set(t.pos))
+      case Term.Concat(ts) => ts.toList.map(termIdents(_)).combineAll
+      case Term.Alternative(ts) => ts.toList.map(termIdents(_)).combineAll
+      case Term.Star(t) => termIdents(t)
+      case Term.Plus(t) => termIdents(t)
+      case Term.Question(t) => termIdents(t)
+      // references made under a `{n,m}` range count too, like for the other repetition operators
+      case Term.Range(t, _, _) => termIdents(t)
+      case Term.Suppress(t) => termIdents(t)
+      case Term.Capture(_, t) => termIdents(t)
+      case _ => Map.empty
+    }
+
+  // strict dependencies are the variables occurring not in tail positions in sequences
+  def strictDependencies(t: Term): Map[String, Set[Caret]] =
+    t match {
+      case Term.Concat(NonEmptyList(t1, t2 :: ts)) =>
+        strictDependencies(Term.Concat(NonEmptyList(t2, ts))).combine(termIdents(t1))
+      case Term.Concat(NonEmptyList(t, Nil)) => strictDependencies(t)
+      case Term.Alternative(ts) => ts.toList.map(strictDependencies(_)).combineAll
+      case Term.Star(t) => strictDependencies(t)
+      case Term.Plus(t) => strictDependencies(t)
+      case Term.Question(t) => strictDependencies(t)
+      case Term.Range(t, _, _) => strictDependencies(t)
+      case Term.Suppress(t) => strictDependencies(t)
+      case Term.Capture(_, t) => strictDependencies(t)
+      case _ => Map.empty
+    }
+
+  private type State[Res] = StateT[F, SCCState, Res]
+
+  private def gets[Res](f: SCCState => Res): State[Res] =
+    StateT.inspect(f)
+
+  private def getProps(id: String): State[Option[SCCProps]] =
+    StateT.inspect(_.props.get(id))
+
+  private def nop: State[Unit] =
+    StateT.empty
+
+  private def modify(f: SCCState => SCCState): State[Unit] =
+    StateT.modify(f)
+
+  private def raiseError[Res](t: Throwable): State[Res] =
+    nop.flatMapF(_ => t.raiseError)
+
+  // lowers the lowlink of `v` to `bound` if smaller, always starting from the props currently in the state
+  private def lowerLowlink(v: String, bound: Int): State[Unit] =
+    modify(st => st.copy(props = st.props.updatedWith(v)(_.map(p => p.copy(lowlink = p.lowlink.min(bound))))))
+
+  // Tarjan's strongly connected components algorithm over the production reference graph
+  private def scc(declMap: Map[String, (Caret, Term)]): F[List[Set[String]]] = {
+    val state = SCCState(0, Nil, Map.empty, Nil)
+
+    def process(v: String, t: Term): State[Unit] =
+      for {
+        // first push v on the stack and assign an index
+        _ <- modify { st =>
+          val props = SCCProps(true, st.index, st.index)
+          st.copy(index = st.index + 1, stack = v :: st.stack, props = st.props.updated(v, props))
+        }
+        // then for each successor compute recursively
+        _ <- successors(t).traverse_ { w =>
+          getProps(w).flatMap {
+            case Some(wProps) =>
+              // successor already processed
+              if (wProps.onStack)
+                // it is on stack, hence in the current SCC
+                lowerLowlink(v, wProps.index)
+              else
+                // not on the stack, not in SCC
+                nop
+            case None =>
+              // not processed yet, do it
+              declMap.get(w) match {
+                case Some((_, wt)) =>
+                  for {
+                    _ <- process(w, wt)
+                    wProps <- gets(_.props(w))
+                    _ <- lowerLowlink(v, wProps.lowlink)
+                  } yield ()
+                case None =>
+                  raiseError[Unit](
+                    KleenexCompilerException(s"Unknown identifier $w in definition of $v at line ${t.pos.line + 1}"))
+              }
+          }
+        }
+        vProps <- gets(_.props(v))
+        _ <-
+          if (vProps.lowlink == vProps.index)
+            // v is the root of a component: pop it together with everything above it
+            modify { st =>
+              val (component, stack1) = spanUntilIncluding(st.stack, v)
+              st.copy(
+                // pop from stack
+                stack = stack1,
+                // update the components
+                components = component.toSet :: st.components,
+                // remove vertices in component from stack
+                props = component.foldLeft(st.props) { (props, w) =>
+                  props.updatedWith(w)(_.map(_.copy(onStack = false)))
+                }
+              )
+            }
+          else
+            nop
+      } yield ()
+
+    declMap.toList
+      // traverse each node (aka production identifier)
+      .traverse_ { case (id, (_, t)) =>
+        getProps(id)
+          .flatMap {
+            case None =>
+              // if no index has been assigned yet, process it
+              process(id, t)
+            case Some(_) =>
+              // otherwise, just continue
+              nop
+          }
+      }
+      .runS(state)
+      .map(_.components)
+  }
+
+  // splits `l` after the first occurrence of `v`: (elements up to and including `v`, remaining elements)
+  private def spanUntilIncluding(l: List[String], v: String): (List[String], List[String]) = {
+    @tailrec
+    def loop(rest: List[String], acc: List[String]): (List[String], List[String]) =
+      rest match {
+        // `v` does not occur: nothing to split off, keep the list untouched
+        case Nil => (Nil, l)
+        case `v` :: tail => ((v :: acc).reverse, tail)
+        case e :: tail => loop(tail, e :: acc)
+      }
+    loop(l, Nil)
+  }
+
+}
+
+/** Per-vertex bookkeeping of the SCC computation. */
+case class SCCProps(onStack: Boolean, index: Int, lowlink: Int)
+
+/** State threaded through the SCC computation. */
+case class SCCState(index: Int, stack: List[String], props: Map[String, SCCProps], components: List[Set[String]])
diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala
new file mode 100644
index 000000000..845754261
--- /dev/null
+++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Environment.scala
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2.data.kleenex
+
+/** State of the kleenex interpreter: a stack of strings under construction and named registers. */
+case class Environment(stack: List[String], registers: Map[String, String]) {
+
+  /** Appends `s` to the value on top of the stack; `None` if the stack is empty. */
+  def append(s: String): Option[Environment] =
+    stack match {
+      case top :: rest => Some(copy(stack = (top + s) :: rest))
+      case Nil => None
+    }
+
+  /** Pushes an empty value on top of the stack. */
+  def push: Environment =
+    copy(stack = "" :: stack)
+
+  /** Pops the value on top of the stack and stores it in `reg`; `None` if the stack is empty. */
+  def pop(reg: String): Option[Environment] =
+    stack match {
+      case top :: rest => Some(copy(stack = rest, registers = registers.updated(reg, top)))
+      case Nil => None
+    }
+
+  /** Appends the value in `reg` on top of the stack and empties the register; `None` if the stack is empty. */
+  def write(reg: String): Option[Environment] =
+    stack match {
+      case top :: rest =>
+        // an unset register is read as the empty string
+        val value = registers.getOrElse(reg, "")
+        Some(copy(stack = (top + value) :: rest, registers = registers.updated(reg, "")))
+      case Nil => None
+    }
+
+}
diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala
new file mode 100644
index 000000000..5a3db0d4e
--- /dev/null
+++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Interpreter.scala
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2
+package data
+package kleenex
+
+import cats.MonadError
+import cats.syntax.all._
+
+case class KleenexException(msg: String) extends Exception(msg)
+
+object Interpreter {
+
+  def pipe[F[_]](implicit F: MonadError[F, Throwable]): Pipe[F, Either[String, Action], String] = {
+    (s: Stream[F, Either[String, Action]]) =>
+      s
+        .evalScan(new Environment("" :: Nil, Map.empty)) {
+          case (env, Left(c)) =>
+            env
+              .append(c)
+              .liftTo[F](KleenexException("cannot append on top of stack"))
+          case (env, Right(act)) =>
+            act match {
+              case Action.Push => (env.push).pure[F]
+              case Action.Pop(reg) =>
+                env
+                  .pop(reg)
+                  .liftTo[F](KleenexException(s"cannot pop to register $reg"))
+              case Action.Write(reg) =>
+                env
+                  .write(reg)
+                  .liftTo[F](KleenexException(s"cannot write register $reg"))
+            }
+        }
+        .last
+        .evalMap {
+          case Some(Environment(s :: _, _)) => s.pure[F]
+          case _ => F.raiseError[String](KleenexException("cannot pop from empty stack"))
+        }
+  }
+
+}
diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala
new file mode 100644
index 000000000..eea1ecbb9
--- /dev/null
+++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/KleenexParser.scala
@@ -0,0 +1,317 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2
+package data
+package kleenex
+
+import transducer.CharRanges
+
+import cats.ApplicativeError
+import cats.data.NonEmptyList
+import cats.parse.{Caret, LocationMap, Parser0, Parser => P}
+import cats.syntax.all._
+
+case class KleenexParseException(msg: String) extends Exception(msg)
+
+/** Parses kleenex source text into a [[Program]], reporting failures in `F`. */
+class KleenexParser[F[_]](implicit F: ApplicativeError[F, Throwable]) {
+
+  /** Parses `content`, failing with a [[KleenexParseException]] that lists every unmet expectation. */
+  def parse(content: String): F[Program] =
+    KleenexParser.program
+      .parseAll(content)
+      .leftMap { e =>
+        val locations = LocationMap(content)
+        KleenexParseException(prettyprint(locations, e))
+      }
+      .liftTo[F]
+
+  private def description(x: P.Expectation): String = x match {
+    case P.Expectation.OneOfStr(_, List(str)) =>
+      s"expected $str"
+    case P.Expectation.OneOfStr(_, strs) =>
+      val strList = strs.map(x => s"'$x'").mkString(", ")
+      s"expected one of $strList"
+    case P.Expectation.InRange(_, lower, upper) =>
+      if (lower == upper) s"expected '$lower'"
+      else s"expected '$lower' ~ '$upper'"
+    case P.Expectation.StartOfString(_) =>
+      "expected beginning of file"
+    case P.Expectation.EndOfString(_, _) =>
+      "expected end of file"
+    case P.Expectation.Length(_, expected, actual) =>
+      s"unexpected eof; expected ${expected - actual} more characters"
+    case P.Expectation.ExpectedFailureAt(_, matched) =>
+      s"unexpected '$matched'"
+    case P.Expectation.Fail(_) =>
+      "failed to parse"
+    case P.Expectation.FailWith(_, message) =>
+      message
+    case P.Expectation.WithContext(contextStr, _) =>
+      s"expected $contextStr"
+  }
+
+  // renders one expectation as `line:col: error: ...` followed by the source line and a caret under the column
+  private def prettyprint(locmap: LocationMap, x: P.Expectation): String = {
+    val (row, col) = locmap.toLineCol(x.offset).getOrElse {
+      // no position for this offset (e.g. end of input): point right after the last character
+      val lines = locmap.input.split("\n", -1)
+      (lines.length - 1, lines.last.length)
+    }
+    val (r, c) = (row + 1, col + 1)
+    val line = locmap.getLine(row).getOrElse("")
+    // the gutter must be as wide as the displayed (1-based) line number to keep the caret aligned
+    val offending =
+      s"${" " * r.toString.length} | ${" " * col}^"
+    s"""
+      |$r:$c: error: ${description(x)}
+      |$r | $line
+      |$offending""".stripMargin
+  }
+
+  private def prettyprint(locmap: LocationMap, x: P.Error): String =
+    x.expected.map(prettyprint(locmap, _)).toList.mkString("")
+}
+
+object KleenexParser {
+  import P._
+
+  // `charsWhile0` so that an empty comment (`//` directly followed by a line break or end of input) is accepted
+  private[this] val whitespace: P[Unit] = oneOf(List(charIn(" \t\r\n"), string("//") ~ charsWhile0(_ != '\n'))).void
+  private[this] val whitespaces0: Parser0[Unit] = whitespace.rep0.void
+
+  private val regIdentStart = ('a' to 'z')
+  private val identStart = regIdentStart ++ ('A' to 'Z')
+  private val digit = ('0' to '9')
+  private val identChar = identStart ++ digit ++ List('-', '_')
+  private val hexDigit = digit ++ ('a' to 'f') ++ ('A' to 'F')
+
+  // exactly `digits` hexadecimal digits, yielding the character with that code
+  private def hexChar(digits: Int): P[String] =
+    charIn(hexDigit).rep(min = digits, max = digits).string.map(hex => Integer.parseInt(hex, 16).toChar.toString)
+
+  private val ident: P[String] =
+    (peek(charIn(identStart)).with1 *> charsWhile(identChar.contains(_)))
+      .withContext("identifier") <* whitespaces0
+
+  private val regIdent: P[String] =
+    (peek(charIn(regIdentStart)).with1 *> charsWhile(identChar.contains(_)))
+      .withContext("register identifier (must start with lower case)") <* whitespaces0
+
+  private val str: P[String] =
+    oneOf(
+      List(
+        charsWhile(!"\\\"".contains(_)).string,
+        char('\\') *> oneOf(List(
+          char('"').as("\""),
+          char('\\').as("\\"),
+          char('r').as("\r"),
+          char('n').as("\n"),
+          char('t').as("\t"),
+          char('f').as("\f"),
+          char('x') *> hexChar(2),
+          char('u') *> hexChar(4)
+        ))
+      )
+    ).rep0.map(_.combineAll).with1.surroundedBy(char('"')) <* whitespaces0
+
+  private val integer: P[Int] =
+    charIn(digit).rep.string.mapFilter(_.toIntOption).withContext("positive integer")
+
+  def keyword(kw: String): P[Unit] =
+    string(kw) <* whitespaces0
+
+  private val range: P[(Int, Option[Int])] =
+    char('{') *> oneOf(
+      List(
+        char(',') *> integer.map(max => (0, Some(max))),
+        (integer ~ (char(',') *> integer.?).?).map {
+          case (min, None) => (min, Some(min))
+          case (min, Some(None)) => (min, None)
+          case (min, Some(Some(max))) => (min, Some(max))
+        }
+      )) <* char('}')
+
+  val regex: P[Regex] = P.recursive[Regex] { regex =>
+    val setChar = oneOf(
+      List(
+        charWhere(!"-]\\".contains(_)),
+        char('\\') *> oneOf(List(
+          char('\\').as('\\'),
+          char('/').as('/'),
+          char('-').as('-'),
+          char(']').as(']'),
+          char('[').as('['),
+          char('r').as('\r'),
+          char('n').as('\n'),
+          char('t').as('\t'),
+          char('f').as('\f')
+        ))
+      ))
+    val set = char('[') *> (char('^').as(false).?.map(_.getOrElse(true)) ~ oneOf(List(
+      char('-').as(('-', '-') :: Nil),
+      (setChar ~ (char('-') *> setChar.?).?).map {
+        case (fst, Some(Some(snd))) => (fst, snd) :: Nil
+        case (fst, Some(None)) => (fst, fst) :: ('-', '-') :: Nil
+        case (fst, None) => (fst, fst) :: Nil
+      }
+    )).rep0.map(_.flatten)).map {
+      case (false, Nil) => CharRanges.all
+      case (true, Nil) => CharRanges.empty
+      case (true, r :: Nil) => CharRanges.range(r)
+      case (false, r :: Nil) => CharRanges.range(r).invert
+      case (true, r1 :: r2 :: rs) => CharRanges.ranges(r1, r2, rs: _*)
+      case (false, r1 :: r2 :: rs) => CharRanges.ranges(r1, r2, rs: _*).invert
+    } <* char(']')
+
+    val atom =
+      oneOf(
+        List(
+          char('.').as(Regex.Any),
+          set.map(Regex.Set(_)),
+          oneOf(
+            List(
+              charWhere(!"\\/?*+|{[().".contains(_)).string,
+              char('\\') *> oneOf(List(
+                char('/').as("/"),
+                char('\\').as("\\"),
+                char('r').as("\r"),
+                char('n').as("\n"),
+                char('t').as("\t"),
+                char('f').as("\f"),
+                char('?').as("?"),
+                char('*').as("*"),
+                char('+').as("+"),
+                char('|').as("|"),
+                char('{').as("{"),
+                char('[').as("["),
+                char('(').as("("),
+                char(')').as(")"),
+                char('.').as("."),
+                char('u') *> hexChar(4)
+              ))
+            )
+          ).map(Regex.Str(_)),
+          regex.between(char('('), char(')'))
+        ))
+
+    val greedy = char('?').?.map(_.isEmpty)
+    val suffixed =
+      (atom ~ oneOf(
+        List(
+          char('?') *> greedy.map(greedy => Regex.Question(_, greedy)),
+          char('+') *> greedy.map(greedy => Regex.Plus(_, greedy)),
+          char('*') *> greedy.map(greedy => Regex.Star(_, greedy)),
+          range.map { case (min, max) => Regex.Range(_, min, max) }
+        )).?)
+        .map {
+          case (atom, None) => atom
+          case (atom, Some(mod)) => mod(atom)
+        }
+
+    def aggregateStr(seq: NonEmptyList[Regex]): NonEmptyList[Regex] = {
+      def loop(seq: NonEmptyList[Regex]): NonEmptyList[Regex] =
+        seq match {
+          case NonEmptyList(Regex.Str(s1), Regex.Str(s2) :: rest) => loop(NonEmptyList(Regex.Str(s1 + s2), rest))
+          case NonEmptyList(re, r :: rest) => re :: loop(NonEmptyList(r, rest))
+          case NonEmptyList(_, Nil) => seq
+        }
+      loop(seq)
+    }
+
+    val seq =
+      suffixed.rep
+        .map(aggregateStr(_))
+        .map {
+          case NonEmptyList(atom, Nil) => atom
+          case seq => Regex.Concat(seq)
+        }
+
+    // plain `|` here: `keyword` would also skip blanks and `//` comments, which are significant inside a regex
+    seq.repSep(char('|')).map {
+      case NonEmptyList(seq, Nil) => seq
+      case alts => Regex.Or(alts)
+    }
+  }
+
+  private val registerUpdate: P[Term] =
+    (caret.with1 ~ regIdent ~ oneOf(List(keyword("<-").as(false), keyword("+=").as(true))) ~ oneOf(
+      List(str.map(RegOrStr.Str(_)), regIdent.map(RegOrStr.Reg(_)))).rep)
+      .map { case (((caret, reg), prepend), value) =>
+        Term.UpdateReg(reg, if (prepend) RegOrStr.Reg(reg) :: value else value)(caret)
+      }
+
+  val term: P[Term] = recursive[Term] { term =>
+    val atom: P[Term] =
+      oneOf(
+        List(
+          caret.map(Term.One()).with1 <* keyword("1"),
+          (caret.with1 ~ str).map { case (pos, s) => Term.Str(s)(pos) },
+          ((caret.with1 ~ ident).map { case (pos, v) => Term.Var(v)(pos) } <* !oneOf(
+            List(keyword(":="), keyword("@")))).backtrack,
+          (caret.with1 ~ (char('/') *> regex <* char('/'))).map { case (pos, re) => Term.RE(re)(pos) } <* whitespaces0,
+          (caret.with1 ~ (char('!') *> regIdent)).map { case (pos, reg) => Term.Output(reg)(pos) },
+          registerUpdate.between(keyword("["), keyword("]")),
+          term.between(keyword("("), keyword(")"))
+        ))
+
+    val suffixed: P[Term] =
+      (atom ~ oneOf[Term => Term](List(
+        (caret.with1 <* keyword("*")).map(pos => Term.Star(_)(pos)),
+        (caret.with1 <* keyword("+")).map(pos => Term.Plus(_)(pos)),
+        (caret.with1 <* keyword("?")).map(pos => Term.Question(_)(pos)),
+        (caret.with1 ~ range).map { case (pos, (min, max)) => Term.Range(_: Term, min, max)(pos) } <* whitespaces0
+      )).?)
+        .map {
+          case (inner, None) => inner
+          case (inner, Some(mod)) => mod(inner)
+        }
+
+    val prefixed: P[Term] =
+      oneOf(
+        List(
+          (caret.with1 ~ (keyword("~") *> suffixed)).map { case (pos, t) => Term.Suppress(t)(pos) },
+          (caret.with1 ~ (regIdent <* keyword("@")).backtrack ~ suffixed).map { case ((pos, reg), t) =>
+            Term.Capture(reg, t)(pos)
+          },
+          suffixed
+        ))
+
+    val seq: P[Term] =
+      prefixed.rep.map {
+        case NonEmptyList(atom, Nil) => atom
+        case seq => Term.Concat(seq)
+      }
+
+    seq.repSep(keyword("|")).map {
+      case NonEmptyList(seq, Nil) => seq
+      case alts => Term.Alternative(alts)
+    }
+  }
+
+  val production: P[Production] =
+    (caret.with1 ~ (ident <* keyword(":=")) ~ term).map { case ((pos, id), t) => Production(id, t)(pos) }
+
+  private val pipeline: Parser0[(Caret, NonEmptyList[String])] =
+    caret ~ oneOf0(List(keyword("start:") *> ident.repSep(keyword(">>")), pure(NonEmptyList.one("main"))))
+
+  val program: P[Program] =
+    (pipeline.with1 ~ production.rep)
+      .map { case ((pos, pipe), rules) => Program(pipe, rules)(pos) }
+      .surroundedBy(whitespaces0)
+
+}
diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala
new file mode 100644
index 000000000..dbf2cf7cf
--- /dev/null
+++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/Regex.scala
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2.data.kleenex
+
+import fs2.data.transducer.CharRanges
+
+import cats.data.NonEmptyList
+
+sealed trait Regex // syntax tree of a `/.../` regular expression literal
+object Regex {
+  case object Any extends Regex // `.`
+  case class Str(s: String) extends Regex // run of literal characters
+  case class Concat(subs: NonEmptyList[Regex]) extends Regex // sub-expressions in sequence
+  case class Or(alts: NonEmptyList[Regex]) extends Regex // alternatives separated by `|`
+  case class Star(inner: Regex, greedy: Boolean) extends Regex // `*`; the parser sets greedy = false for `*?`
+  case class Plus(inner: Regex, greedy: Boolean) extends Regex // `+`; the parser sets greedy = false for `+?`
+  case class Question(inner: Regex, greedy: Boolean) extends Regex // `?`; the parser sets greedy = false for `??`
+  case class Range(inner: Regex, fst: Int, snd: Option[Int]) extends Regex // `{n}`, `{n,}` (snd = None), `{n,m}`, `{,m}`
+  case class Set(ranges: CharRanges) extends Regex // character class `[...]`, already inverted for `[^...]`
+}
diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala
new file mode 100644
index 000000000..336f912a3
--- /dev/null
+++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/ast.scala
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2.data.kleenex
+
+import cats.data.NonEmptyList
+
+import cats.parse.Caret
+
+case class Program(pipeline: NonEmptyList[String], productions: NonEmptyList[Production])(val pos: Caret) // pipeline: names after `start:`, `main` when omitted
+
+case class Production(name: String, term: Term)(val pos: Caret) // `name := term`
+
+sealed trait Term { // every term carries the source position it was parsed at
+  val pos: Caret
+}
+object Term {
+  case class One()(val pos: Caret) extends Term // `1`
+  case class Str(s: String)(val pos: Caret) extends Term // string literal
+  case class Var(name: String)(val pos: Caret) extends Term // reference to a production
+  case class Capture(reg: String, inner: Term)(val pos: Caret) extends Term // `reg @ inner`
+  case class Output(reg: String)(val pos: Caret) extends Term // `!reg`
+  case class UpdateReg(reg: String, value: NonEmptyList[RegOrStr])(val pos: Caret) extends Term // `[ reg <- ... ]` or `[ reg += ... ]`
+  case class Alternative(cases: NonEmptyList[Term]) extends Term { // cases separated by `|`
+    val pos: Caret = cases.head.pos // positioned at its first case
+  }
+  case class Concat(terms: NonEmptyList[Term]) extends Term { // terms in sequence
+    val pos: Caret = terms.head.pos // positioned at its first term
+  }
+  case class RE(re: Regex)(val pos: Caret) extends Term // `/regex/`
+  case class Suppress(inner: Term)(val pos: Caret) extends Term // `~inner`
+  case class Star(inner: Term)(val pos: Caret) extends Term // `inner*`
+  case class Plus(inner: Term)(val pos: Caret) extends Term // `inner+`
+  case class Question(inner: Term)(val pos: Caret) extends Term // `inner?`
+  case class Range(inner: Term, min: Int, max: Option[Int])(val pos: Caret) extends Term // `inner{min,max}`; max = None for `{min,}`
+}
+
+sealed trait RegOrStr // one element of a register update's right-hand side
+object RegOrStr {
+  case class Reg(name: String) extends RegOrStr // register name
+  case class Str(s: String) extends RegOrStr // string literal
+}
diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala
new file mode 100644
index 000000000..bf896f50d
--- /dev/null
+++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Compiler.scala
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2.data.kleenex.core + +import fs2.data.kleenex.{Action, Production, Program => KProgram, Regex, RegOrStr, Term => KTerm} +import fs2.data.transducer.CharRanges + +import cats.MonadError +import cats.data.StateT +import cats.syntax.all._ +import cats.data.NonEmptyList + +case class KleenexCompilerException(msg: String) extends Exception(msg) + +case class CompilerState(idents: Map[(String, Boolean), Int], + decls: Map[Int, Term], + revDecls: Map[Term, Int], + fresh: Int) + +class Compiler[F[_]](implicit F: MonadError[F, Throwable]) { + + private type State[Res] = StateT[F, CompilerState, Res] + + /** Compiles a kleenex program into the core language representation. 
*/ + def compile(prog: KProgram): F[Program] = { + // associate each production to 2 ids: + // - one when it outputs element + // - one when it outputs *no* elements + val idents = prog.productions + .flatMap { case Production(name, term) => + NonEmptyList.of((name, true), (name, false)) + } + .zipWithIndex + .toList + .toMap + val fresh = idents.size + val state = CompilerState(idents, Map.empty, Map.empty, fresh) + + val checkPipeline = + prog.pipeline.traverse(name => + idents.get(name -> true) match { + case Some(id) => id.pure[F] + case None => new KleenexCompilerException(s"Unknown production $name in pipeline").raiseError[F, Int] + }) + + val compiledProductions = + prog.productions + .traverse_ { case Production(name, term) => + for { + idout <- lookup(name, true) + idnoout <- lookup(name, false) + compiledout <- compile(true, term) + compilednoout <- compile(false, term) + _ <- insertDecl(idout, Term.Seq(List(compiledout))) + _ <- insertDecl(idnoout, Term.Seq(List(compilednoout))) + } yield () + } + + (checkPipeline, compiledProductions.runS(state)) + .mapN { (pipeline, state) => + val reached = reachable(pipeline.toList, state.decls) + compress(Program(pipeline, state.decls.view.filterKeys(reached.contains(_)).toMap)) + } + } + + def compile(re: Regex): F[Program] = { + compile(true, re) + .run(CompilerState(Map.empty, Map.empty, Map.empty, 0)) + .map { case (st, id) => Program(NonEmptyList.one(id), st.decls) } + } + + private def compile(output: Boolean, re: Regex): State[Int] = + re match { + case Regex.Any => + declare(Term.Read(CharRanges.all, output)) + case Regex.Str(str) => + str.toList + .traverse(c => declare(Term.Read(CharRanges.char(c), output))) + .flatMap(ids => declare(Term.Seq(ids))) + case Regex.Concat(res) => + res.traverse(compile(output, _)).flatMap(ids => declare(Term.Seq(ids.toList))) + case Regex.Or(alts) => + alts.traverse(compile(output, _)).flatMap(ids => declare(Term.Alternative(ids))) + case Regex.Plus(re, greedy) => + 
compile(output, re).flatMap(plus(_, greedy)) + case Regex.Star(re, greedy) => + compile(output, re).flatMap(star(_, greedy)) + case Regex.Question(re, greedy) => + compile(output, re).flatMap(question(_, greedy)) + case Regex.Set(chars) => + declare(Term.Read(chars, output)) + case Regex.Range(re, min, max) => + compile(output, re).flatMap(range(_, min, max)) + } + + private def compile(output: Boolean, term: KTerm): State[Int] = + term match { + case KTerm.One() => + declare(Term.epsilon) + case KTerm.Str(s) => + val toOuptut = if (output) s else "" + declare(Term.Const(Left(toOuptut))) + case KTerm.Var(v) => + lookup(v, output) + case KTerm.Capture(reg, t) => + for { + idt <- compile(output, t) + idpush <- declare(Term.Const(Right(Action.Push))) + idpop <- declare(Term.Const(Right(Action.Pop(reg)))) + id <- declare(Term.Seq(List(idpush, idt, idpop))) + } yield id + case KTerm.Output(reg) => + declare(Term.Const(Right(Action.Write(reg)))) + case KTerm.UpdateReg(reg, value) => + for { + idpush <- declare(Term.Const(Right(Action.Push))) + idsval <- value.map(updateSym(_)).traverse(c => declare(Term.Const(c))) + idpop <- declare(Term.Const(Right(Action.Pop(reg)))) + id <- declare(Term.Seq((idpush :: idsval).toList :+ idpop)) + } yield id + case KTerm.Alternative(cases) => + flattenAlternatives(cases) + .traverse(compile(output, _)) + .flatMap(ids => declare(Term.Alternative(ids))) + case KTerm.Concat(ts) => + flattenSequences(ts) + .traverse(compile(output, _)) + .flatMap(ids => declare(Term.Seq(ids))) + case KTerm.RE(re) => + compile(output, re) + case KTerm.Suppress(t) => + compile(false, t) + case KTerm.Star(t) => + compile(output, t).flatMap(star(_, true)) + case KTerm.Plus(t) => + compile(output, t).flatMap(plus(_, true)) + case KTerm.Question(t) => + compile(output, t).flatMap(question(_, true)) + case KTerm.Range(t, min, max) => + compile(output, t).flatMap(range(_, min, max)) + } + + // r* = r1 | 1 + // r1 = r r* + // r*? = 1 | r2 + // r2 = r r*? 
+ private def star(idt: Int, greedy: Boolean): State[Int] = + for { + ideps <- declare(Term.epsilon) + id <- freshId + idloop <- declare(Term.Seq(List(idt, id))) + id <- insertDecl(id, + Term.Alternative( + // favor more over less + if (greedy) NonEmptyList.of(idloop, ideps) + // favor less over more + else NonEmptyList.of(ideps, idloop) + )) + } yield id + + // r+ = r r* + // r+? = r r*? + private def plus(idt: Int, greedy: Boolean): State[Int] = + for { + idstar <- star(idt, greedy) + id <- declare(Term.Seq(List(idt, idstar))) + } yield id + + // r? = r | 1 + // r?? = 1 | r + private def question(idt: Int, greedy: Boolean): State[Int] = + for { + ideps <- declare(Term.epsilon) + id <- declare( + Term.Alternative( + // favor one over zero + if (greedy) NonEmptyList.of(idt, ideps) + // favor zero over one + else NonEmptyList.of(ideps, idt))) + } yield id + + private def range(idt: Int, min: Int, max: Option[Int]): State[Int] = + max match { + case Some(max) if min == max => + declare(Term.Seq(List.fill(min)(idt))) + case Some(max) => + question(idt, true).flatMap(idq => declare(Term.Seq(List.fill(min)(idt) ++ List.fill(max - min)(idq)))) + case None => + star(idt, true).flatMap(idstar => declare(Term.Seq(List.fill(min)(idt) ++ List(idstar)))) + } + + private def updateSym(sym: RegOrStr): Either[String, Action] = + sym match { + case RegOrStr.Reg(reg) => Right(Action.Write(reg)) + case RegOrStr.Str(s) => Left(s) + } + + private def flattenAlternatives(alts: NonEmptyList[KTerm]): NonEmptyList[KTerm] = + alts match { + case NonEmptyList(KTerm.Alternative(alts), a :: rest) => + flattenAlternatives(alts).concatNel(flattenAlternatives(NonEmptyList(a, rest))) + case NonEmptyList(KTerm.Alternative(alts), Nil) => + flattenAlternatives(alts) + case NonEmptyList(t, a :: rest) => + t :: flattenAlternatives(NonEmptyList(a, rest)) + case NonEmptyList(t, Nil) => + NonEmptyList.one(t) + } + + private def flattenSequences(ts: NonEmptyList[KTerm]): List[KTerm] = + ts match { + case 
NonEmptyList(KTerm.Concat(ts), t :: rest) => flattenSequences(ts) ++ flattenSequences(NonEmptyList(t, rest)) + case NonEmptyList(t, h :: rest) => t :: flattenSequences(NonEmptyList(h, rest)) + case NonEmptyList(KTerm.Concat(ts), Nil) => flattenSequences(ts) + case NonEmptyList(t, Nil) => List(t) + } + + private def get: State[CompilerState] = + StateT.get + + private def modify(f: CompilerState => CompilerState): State[Unit] = + StateT.modify(f) + + private def freshId: State[Int] = + get.map(_.fresh) <* modify(s => s.copy(fresh = s.fresh + 1)) + + private def insertDecl(id: Int, term: Term): State[Int] = + modify(st => st.copy(decls = st.decls.updated(id, term), revDecls = st.revDecls.updated(term, id))).as(id) + + private def lookup(id: String, output: Boolean): State[Int] = + get.map(_.idents.get((id, output))).flatMapF { + case Some(id) => id.pure[F] + case None => KleenexCompilerException(s"Unknown non terminal identifier $id").raiseError[F, Int] + } + + private def declare(term: Term): State[Int] = + get.map(_.revDecls.get(term)).flatMap { + case Some(id) => id.pure[State] + case None => freshId.flatMap(insertDecl(_, term)) + } + + private def reachable(from: List[Int], decls: Map[Int, Term]): Set[Int] = { + def referenced(t: Term): List[Int] = + t match { + case Term.Seq(ids) => ids + case Term.Alternative(ids) => ids.toList + case _ => Nil + } + + def loop(from: List[Int], acc: Set[Int]): Set[Int] = + from match { + case id :: from => + if (acc.contains(id)) + loop(from, acc) + else + loop(decls.get(id).map(referenced(_)).getOrElse(Nil) reverse_::: from, acc + id) + case Nil => + acc + } + loop(from, Set.empty) + } + + private def compress(prog: Program): Program = { + def alias(aliases: Map[Int, Int], id: Int): Map[Int, Int] = + prog.decls.get(id) match { + case Some(Term.Seq(List(idt))) => + val aliases1 = alias(aliases, idt) + aliases1.updated(id, aliases1.getOrElse(idt, idt)) + case _ => + aliases + } + val aliases = 
prog.decls.keys.foldLeft(Map.empty[Int, Int])(alias(_, _)) + + def replace(t: Term): Term = + t match { + case Term.Alternative(ts) => Term.Alternative(ts.map(id => aliases.getOrElse(id, id))) + case Term.Seq(ts) => Term.Seq(ts.map(id => aliases.getOrElse(id, id))) + case _ => t + } + + if (aliases.isEmpty) + prog + else + Program(prog.pipeline.map(id => aliases.getOrElse(id, id)), + prog.decls.view.filterKeys(!aliases.contains(_)).mapValues(replace(_)).toMap) + } + +} diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala new file mode 100644 index 000000000..34ef7df5b --- /dev/null +++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/Grammar.scala @@ -0,0 +1,59 @@ +/* + * Copyright 2021 Lucas Satabin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package fs2.data
+package kleenex
+package core
+
+import transducer.CharRanges
+
+import cats.Show
+import cats.data.NonEmptyList
+import cats.syntax.all._
+
+/** A compiled program: the identifiers of the pipeline stages, in order, and the
+  * term declared for each identifier.
+  */
+case class Program(pipeline: NonEmptyList[Int], decls: Map[Int, Term])
+object Program {
+  /** Renders the pipeline on a first line, followed by an empty line and one
+    * `id -> term` line per declaration, sorted by identifier.
+    */
+  implicit val show: Show[Program] = Show.show { case Program(pipeline, decls) =>
+    val stages = pipeline.mkString_(" >> ")
+    val rules = decls.toList.sortBy(_._1).map { case (k, v) => show"$k -> $v" }.mkString_("\n")
+    // The text is assembled with explicit line breaks rather than `stripMargin`:
+    // `stripMargin` runs after interpolation and would also strip a leading `|`
+    // from any line coming from a rendered declaration (e.g. a string constant
+    // containing a line break followed by `|`).
+    s"start: $stages\n\n$rules"
+  }
+}
+
+sealed trait Term
+object Term {
+  /** A constant: either a string or a register [[Action]]. */
+  case class Const(strOrReg: Either[String, Action]) extends Term
+  /** Reads a character within `ranges`; `output` is rendered as `~` when `false`. */
+  case class Read(ranges: CharRanges, output: Boolean) extends Term
+  /** A sequence of identifiers. */
+  case class Seq(idents: List[Int]) extends Term
+  /** A non empty choice between identifiers. */
+  case class Alternative(idents: NonEmptyList[Int]) extends Term
+
+  /** The empty sequence. */
+  def epsilon: Term = Seq(Nil)
+
+  implicit val show: Show[Term] = Show.show {
+    case Const(Left(s)) =>
+      s""""$s""""
+    case Const(Right(a)) =>
+      a.show
+    case Read(rs, true) =>
+      rs.show
+    case Read(rs, false) =>
+      show"~$rs"
+    case Seq(ids) =>
+      ids.mkString_(" -> ")
+    case Alternative(alts) =>
+      alts.mkString_(" | ")
+  }
+}
diff --git a/kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala
new file mode 100644
index 000000000..0217e650d
--- /dev/null
+++ b/kleenex/shared/src/main/scala/fs2/data/kleenex/core/TransducerCompiler.scala
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2021 Lucas Satabin
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fs2.data
+package kleenex
+package core
+
+import fst._
+import transducer.CharRanges
+
+import cats.data.NonEmptyList
+import cats.MonadError
+import cats.syntax.all._
+
+/** Builds one transducer per pipeline stage of a compiled [[Program]]. */
+class TransducerCompiler[F[_]](implicit F: MonadError[F, Throwable]) {
+
+  // a state is the stack of identifiers that remain to be processed
+  private type Q = List[Int]
+  private type E =
+    Edge[List[Int], CharRanges, CopyFunc[Char, List[Either[String, Action]]], List[Either[String, Action]]]
+
+  /** Constructs the transducer of every pipeline identifier and enumerates its states. */
+  def build(prog: Program): F[NonEmptyList[Transducer[Int, Char, Either[String, Action]]]] =
+    prog.pipeline.traverse { start =>
+      for (transducer <- construct(prog, start)) yield transducer.enumerateStates
+    }
+
+  /** Explores the states reachable from `List(ident)` and collects the transitions
+    * between them. The only final state is the empty stack.
+    */
+  private def construct(prog: Program, ident: Int): F[Transducer[Q, Char, Either[String, Action]]] = {
+    // the term declared for `id`, or an error if the program does not declare it
+    def decl(id: Int): F[Term] =
+      prog.decls
+        .get(id)
+        .fold(F.raiseError[Term](KleenexCompilerException(s"Unknown identifier $id")))(_.pure[F])
+
+    // Optimization: reduce the number of generated states by contracting
+    // non-deterministic edges with no output. This is done by "skipping" states
+    // whose head identifier is declared to be a Seq term, or an Alternative with
+    // only one successor.
+    def follow(stack: Q): F[Q] =
+      stack match {
+        case head :: tail =>
+          decl(head).flatMap {
+            case Term.Seq(ids)                             => follow(ids ++ tail)
+            case Term.Alternative(NonEmptyList(only, Nil)) => follow(only :: tail)
+            case _                                         => stack.pure[F]
+          }
+        case Nil =>
+          (Nil: Q).pure[F]
+      }
+
+    def go(workingSet: List[Q], states: Set[Q], transitions: List[E]): F[(Set[Q], List[E])] =
+      workingSet match {
+        case Nil =>
+          // nothing left to explore
+          (states, transitions).pure[F]
+        case known :: pending if states.contains(known) =>
+          // already explored
+          go(pending, states, transitions)
+        case Nil :: pending =>
+          // the empty stack has no outgoing transition
+          go(pending, states + Nil, transitions)
+        case (state @ (top :: below)) :: pending =>
+          val visited = states + state
+
+          // adds the single edge going from `state` to the target, which is explored next
+          def step(target: F[Q])(edge: Q => E): F[(Set[Q], List[E])] =
+            target.flatMap(next => go(next :: pending, visited, edge(next) :: transitions))
+
+          decl(top).flatMap {
+            case Term.Const(out) =>
+              step(follow(below))(next => (state, Right(List(out)), next))
+            case Term.Read(pred, false) =>
+              step(follow(below))(next => (state, Left((pred, CopyFunc.CopyConst(Nil))), next))
+            case Term.Read(pred, true) =>
+              step(follow(below))(next => (state, Left((pred, CopyFunc.CopyArg)), next))
+            case Term.Seq(ids) =>
+              step(follow(ids ++ below))(next => (state, Right(Nil), next))
+            case Term.Alternative(alts) =>
+              alts.toList.traverse(alt => follow(alt :: below)).flatMap { targets =>
+                // one edge without output per alternative, in declaration order
+                val epsilons = targets.map(next => (state, Right(Nil), next))
+                go(targets reverse_::: pending, visited, epsilons ++ transitions)
+              }
+          }
+      }
+
+    val initial = List(ident)
+    go(List(initial), Set.empty, Nil).map { case (states, transitions) =>
+      new FST(initial, states, OrderedEdgeSet.fromList(transitions), Set(Nil))
+    }
+  }
+
+}
diff --git a/kleenex/shared/src/test/resources/kleenex/highlighter.kex b/kleenex/shared/src/test/resources/kleenex/highlighter.kex
new file mode 100644
index 000000000..645c0b0c0
--- /dev/null
+++ b/kleenex/shared/src/test/resources/kleenex/highlighter.kex
@@ -0,0 +1,36 @@
+main := ( escape | comment | term | symbol | ignored | ws * )*
+
+term := black /~/ (constant | match | ident) end
+      | (teal constant | yellow match | blue ident) end
+ +ignored := /[\]()|{},:[]/ + +ident := (letter | /[0-9_]/)+ + +symbol := yellow /<-|\+=|:=|>>|\*|\?|\+/ end + +constant := /"/ ( /\\./ | /[^\\"]/ )* /"/ + +comment := black ( /\/\/[^\n]*\n/ | /\/\*[^*\/]*\*\// ) end + +match := /\// ( /[^\/\n]/ | /\\./ )+ /\// + +escape := /\\\\/ + | blue /\\x[0-9a-fA-F]{2}/ end + | /\\[tnr]/ + +sp := / /* + +letter := /[a-zA-Z]/ + +word := letter+ + +ws := /[\t\r\n ]/ + +red := "\x1b[31m" +green := "\x1b[32m" +yellow:= "\x1b[33m" +blue := "\x1b[34m" +end := "\x1b[39;49m" +black := "\x1b[30m" +teal := "\x1b[36m" diff --git a/kleenex/shared/src/test/resources/kleenex/ini2json.kex b/kleenex/shared/src/test/resources/kleenex/ini2json.kex new file mode 100644 index 000000000..01ccfe83f --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/ini2json.kex @@ -0,0 +1,19 @@ +start: stripini >> ini2json +// Strips the comments +stripini := (~comment | ~blank | /[^\n]*\n/)* +comment := ws /;[^\n]*/ +blank := ws /\n/ +// Convert the stripped file +ini2json := "{\n" sections "}\n" +sections := (section "," /\n/)* section /\n/ +section := + ind "\"" header "\": {\n" (~/\n/ keyvalues)? ind "}" +header := ~ws ~/\[/ /[^\n\]]*/ ~/]/ ~ws +keyvalue := ind ind key ": " ~/=/ value +keyvalues := (keyvalue "," /\n/)* keyvalue "\n" +key := ~ws "\"" /[^; \t=\[\n]*/ "\"" ~ws +value := ~ws /"[^\n]*"/ ~ws +| ~ws "\"" escapedValue "\"" ~ws +escapedValue := (~/\\/ "\\\\" | ~/"/ "\\\"" | /[^\n]/)* +ws := /[ \t]*/ +ind := " " diff --git a/kleenex/shared/src/test/resources/kleenex/logrewrite.kex b/kleenex/shared/src/test/resources/kleenex/logrewrite.kex new file mode 100644 index 000000000..c403d8287 --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/logrewrite.kex @@ -0,0 +1,23 @@ +main := "[" loglines? 
"]\n" + +loglines := (logline "," /\n/)* logline /\n/ +logline := "{" host ~sep ~userid ~sep ~authuser sep timestamp sep + request sep code sep bytes sep referer sep useragent "}" + +host := "\"host\":\"" ip "\"" +userid := "\"user\":\"" rfc1413 "\"" +authuser := "\"authuser\":\"" /[^ \n]+/ "\"" +timestamp := "\"date\":\"" ~/\[/ /[^\n\]]+/ ~/]/ "\"" +request := "\"request\":" quotedString +code := "\"status\":\"" integer "\"" +bytes := "\"size\":\"" (integer | /-/) "\"" +referer := "\"url\":" quotedString +useragent := "\"agent\":" quotedString + +ws := /[\t ]+/ +sep := "," ~ws + +quotedString := /"([^"\n]|\\")*"/ +integer := /[0-9]+/ +ip := integer (/\./ integer){3} +rfc1413 := /-/ diff --git a/kleenex/shared/src/test/resources/kleenex/mitm.kex b/kleenex/shared/src/test/resources/kleenex/mitm.kex new file mode 100644 index 000000000..2dfd4a46c --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/mitm.kex @@ -0,0 +1,9 @@ +main := /
/ main + | /./ main + | "" + +url := q? /[^"’ >]/* q? +q := ~/"|’/ +addq := "\"" +sp := / /* +evil := addq "http://evil.com/?url=" !orig addq diff --git a/kleenex/shared/src/test/resources/kleenex/recursive.kex b/kleenex/shared/src/test/resources/kleenex/recursive.kex new file mode 100644 index 000000000..b64673f0a --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/recursive.kex @@ -0,0 +1,3 @@ +main := as | bs +as := "a" bs | 1 +bs := "b" as | 1 diff --git a/kleenex/shared/src/test/resources/kleenex/simple.kex b/kleenex/shared/src/test/resources/kleenex/simple.kex new file mode 100644 index 000000000..9433451aa --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/simple.kex @@ -0,0 +1 @@ +main := id @ /[a-z][a-z0-9]*/ !id !id diff --git a/kleenex/shared/src/test/resources/kleenex/test.kex b/kleenex/shared/src/test/resources/kleenex/test.kex new file mode 100644 index 000000000..08ebba3eb --- /dev/null +++ b/kleenex/shared/src/test/resources/kleenex/test.kex @@ -0,0 +1,38 @@ +// A Kleenex program starts with what we call a pipeline declaration. +// This one can be understood as: First remove the comments, +// then gather the numbers at the bottom. +start: remComments >> gatherNumbers + +// If no pipeline is specified, "main" is picked +// as the starting point. +// The most basic Kleenex term is matching. It matches +// the input against a regular expression, outputting it directly. +line := /[^\n]*\n/ +// Often you don’t want all the input turned into output. +// The ~ operator lets you suppress the output otherwise produced +// by a term, in this case removing lines that start with "#", +// and preserving ones that don’t. +// When there’s ambiguity, the leftmost choice is always chosen. +commentLine := ~(/#/ line) | line +// Recursion is allowed, but only in tail position. Here we +// terminate the recursion with "1", which consumes nothing and +// always succeeds.
+remComments := commentLine remComments | 1 + +// We also allow regex operators like *, + and ? on terms: +thousandSepLines := (thousandSep /\n/ | line)* + +// It’s possible to output text without matching by using "...". +// In this case, we use it to insert thousands separators into a number. +thousandSep := digit{1,3} ("," digit{3})* /\n/ +digit := /[0-9]/ + +// We also allow for more complicated operations. We call these ’actions’. +// reg@term runs the term as normal, but all output it would produce is +// stored in the register named reg. +// [ ... += ... ] allows you to append things to a register, both contents +// of other registers, as well as string constants. +// !reg outputs the contents of a register. +gatherNumbers := + (num@thousandSep [ numbers += num ] | line)* + !numbers