Skip to content

Commit

Permalink
feat: introduce FileSystem.unjarOnce (#1250)
Browse files Browse the repository at this point in the history
* feat: introduce `FileSystem.unjarOnce`

`specs2-html` currently copies all of its html resources (41 files) for each specification (executed with html output), reading and traversing the `specs2-html.jar` four times in the process and overwriting its own previously copied files over and over. This is inefficient and puts unnecessary strain on the disk. The newly introduced `unjarOnce` method is used by `specs2-html` to unjar its resources only once for each combination of target location and filter criteria.

* feature: extract a more general LruCache

---------

Co-authored-by: NTPape <10488949+NTPape@users.noreply.github.com>
  • Loading branch information
etorreborre and NTPape authored Jun 16, 2024
1 parent 026d0e2 commit 08e4c4d
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 1 deletion.
40 changes: 40 additions & 0 deletions common/shared/src/main/scala/org/specs2/data/LruCache.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package org.specs2.data

import org.specs2.fp.*
import org.specs2.control.*
import org.specs2.time.*

/** LRU (least recently used) cache keeping track of the items which have already been processed.
  *
  * Values are registered together with a timestamp and the cache is cleaned after each registration so that it
  * doesn't go above a given size. The elements with the oldest timestamps are removed first.
  *
  * Every access to the internal state (reads included) goes through the monitor of this instance.
  *
  * @param maxSize
  *   maximum number of elements kept in the cache
  * @param systemTime
  *   provider of the timestamps attached to the registered values
  */
class LruCache[A](maxSize: Int, systemTime: SystemTime = JavaSystemTime):
  private var values: Map[A, Long] = Map.empty

  /** Checks if a value has already been processed; if not, adds it to the cache. If it has been processed, its
    * timestamp is refreshed. The check and the update are performed inside the returned operation.
    * @param value
    *   the value to register
    * @return
    *   the processed status: `AlreadyProcessed` if the value was in the cache, `ToProcess` otherwise
    */
  def register(value: A): Operation[ProcessedStatus] =
    Operation.delayed {
      this.synchronized:
        val alreadyProcessed = values.contains(value)
        // refresh the timestamp even if the value was already registered
        values += value -> systemTime.nanoTime
        // evict the entries with the oldest timestamps until the cache fits its maximum size;
        // the nonEmpty guard keeps `minBy` from throwing on an empty map when maxSize is negative
        while values.size > maxSize && values.nonEmpty do values -= values.minBy(_._2)._1
        if alreadyProcessed then ProcessedStatus.AlreadyProcessed else ProcessedStatus.ToProcess
    }

  /** Return the number of elements in the cache */
  def size: Int =
    // read under the same lock as the writes in `register` so that the latest state is observed
    this.synchronized(values.size)

  /** Return the timestamp for the oldest element
    * @throws java.lang.UnsupportedOperationException
    *   if the cache is empty
    */
  def oldestTimestamp: Long =
    this.synchronized(values.minBy(_._2)._2)

/** Outcome of registering an item in a [[LruCache]]:
  *   - `AlreadyProcessed`: the item was present in the cache when it was registered
  *   - `ToProcess`: the item was not present in the cache when it was registered
  */
enum ProcessedStatus:
  case AlreadyProcessed, ToProcess
27 changes: 27 additions & 0 deletions common/shared/src/main/scala/org/specs2/io/FileSystem.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package io

import control.*
import fp.syntax.*
import data.LruCache
import data.ProcessedStatus.*
import java.io.*
import java.util.regex.Pattern.*
import java.util.regex.Matcher.*
Expand Down Expand Up @@ -58,6 +60,28 @@ case class FileSystem(logger: Logger) extends FilePathReader:
def mkdirs(path: FilePath): Operation[Unit] =
mkdirs(path.dir)

/** Remembers which (jar url, destination directory, filter) combinations were already passed to [[unjarOnce]], so
  * that the same extraction is not repeated over and over. At most 1000 combinations are tracked.
  */
private val UnjarLRUCache: LruCache[(URL, DirectoryPath, String)] =
  new LruCache(maxSize = 1000)

/** Unjar the jar (or zip file) specified by "jarUrl" to the "dest" directory, but only the first time it is
  * requested for a given combination of arguments. Later calls with the same arguments do nothing, unless the
  * combination has been evicted from the LRU cache in the meantime.
  *
  * NOTE(review): the arguments are registered in the cache before the extraction runs, so a failed extraction is
  * presumably not retried by a later call with the same arguments — confirm this is acceptable.
  *
  * @param jarUrl
  *   path of the jar file
  * @param dest
  *   destination directory path
  * @param regexFilter
  *   regular expression filtering files which shouldn't be extracted; the expression must capture the path of an
  *   entry as group 1 which will then be used relative to dest as target path for that entry
  * @see
  *   [[unjar]]
  */
def unjarOnce(jarUrl: URL, dest: DirectoryPath, regexFilter: String): Operation[Unit] =
  val cacheKey = (jarUrl, dest, regexFilter)
  UnjarLRUCache.register(cacheKey).flatMap { status =>
    // only a combination which was not in the cache triggers the extraction
    unjar(jarUrl, dest, regexFilter).when(status == ToProcess).map(_ => ())
  }

/** Unjar the jar (or zip file) specified by "path" to the "dest" directory. Filters files which shouldn't be
* extracted with a regular expression.
* @param jarUrl
Expand All @@ -67,6 +91,9 @@ case class FileSystem(logger: Logger) extends FilePathReader:
* @param regexFilter
* regular expression filtering files which shouldn't be extracted; the expression must capture the path of an
* entry as group 1 which will then be used relative to dirPath as target path for that entry
*
* @see
* [[unjarOnce]]
*/
def unjar(jarUrl: URL, dest: DirectoryPath, regexFilter: String): Operation[Unit] =
val regex = compile(regexFilter)
Expand Down
9 changes: 9 additions & 0 deletions common/shared/src/main/scala/org/specs2/time/SystemTime.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package org.specs2.time

/** Source of the current time, abstracted so that another implementation can be substituted for the default one */
trait SystemTime:

  /** @return
    *   the current time reading as a `Long`
    */
  def nanoTime: Long

/** [[SystemTime]] implementation delegating to `System.nanoTime()` */
object JavaSystemTime extends SystemTime:
  override def nanoTime: Long = System.nanoTime()
2 changes: 1 addition & 1 deletion html/src/main/scala/org/specs2/reporter/HtmlPrinter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ case class HtmlPrinter(env: Env, searchPage: SearchPage, logger: Logger = Consol
case Some(url) =>
val fs = env.fileSystem
if url.getProtocol.equalsIgnoreCase("jar") then
fs.unjar(jarOf(url), outputDir, s"^${quote(base.path)}(/${quote(src.path)}/.*)$$")
fs.unjarOnce(jarOf(url), outputDir, s"^${quote(base.path)}(/${quote(src.path)}/.*)$$")
else fs.copyDir(DirectoryPath.unsafe(url.toURI), outputDir / src)
case _ =>
val message = s"no resource found for path ${(base / src).path}"
Expand Down
56 changes: 56 additions & 0 deletions tests/shared/src/test/scala/org/specs2/data/LruCacheSpec.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package org.specs2
package data

import org.scalacheck.*
import org.scalacheck.Arbitrary.*
import org.specs2.time.*
import org.specs2.fp.syntax.*
import ProcessedStatus.*

class LruCacheSpec extends Specification with ScalaCheck:
/** Specification text: each example description ends with the interpolated method (e1, e2, e3) providing its body */
def is = s2"""

A LRU cache can be used to store elements and evict them when they have been unused for a long time
A status is returned to know if an element has already been seen before $e1
The cache can not contain more than a fixed number of elements $e2
The oldest elements are always evicted first $e3

"""

/** Registering a value which was registered before must be reported as already processed */
def e1 =
  val lru = LruCache[Int](maxSize = 3, systemTime = MockSystemTime())
  // 1 is registered twice: the status of the final registration is the one under test
  val lastStatus = (lru.register(1) >> lru.register(2) >> lru.register(1)).unsafeRun
  lastStatus === AlreadyProcessed

/** Whatever the number of registered values, the cache never holds more than its maximum size */
def e2 = prop { (n: SmallInt) =>
  val lru = LruCache[Int](maxSize = 3, systemTime = MockSystemTime())
  val registrations = List.range(1, n.value + 1).traverse(i => lru.register(i))
  registrations.void.unsafeRun
  lru.size must be_<=(3)
}.set(minTestsOk = 10)

/** After registering the values 1 to n, only the most recently registered ones must remain in the cache.
  *
  * The previous assertion (oldest timestamp < next clock tick) held for any eviction order, since every stored
  * timestamp is older than the next tick; the exact oldest timestamp is now checked instead.
  */
def e3 = prop { (n: SmallInt) =>
  val maxSize = 3
  val mockSystemTime = MockSystemTime()
  val cache = LruCache[Int](maxSize = maxSize, systemTime = mockSystemTime)
  val operations = (1 to n.value).toList.traverse(i => cache.register(i))
  operations.void.unsafeRun
  // the mock clock ticks 1, 2, 3, ... and each registration reads it once, so value i is stamped with time i;
  // if the oldest entries are evicted first, the survivors are the last `maxSize` values (or all of them when
  // n <= maxSize) and the oldest remaining timestamp is max(1, n - maxSize + 1)
  val expectedOldest = math.max(1, n.value - maxSize + 1).toLong
  cache.oldestTimestamp === expectedOldest
}.set(minTestsOk = 10)

/** HELPERS */
/** Deterministic clock for tests: successive calls to `nanoTime` return 1, 2, 3, ... */
class MockSystemTime() extends SystemTime:
  private val ticks: Iterator[Long] = Iterator.from(1).map(_.toLong)

  def nanoTime: Long =
    ticks.next()

/** Wrapper for the small positive integers generated for the properties of this specification */
case class SmallInt(value: Int)

object SmallInt:
  /** Generates values between 1 and 10 (inclusive), derived from arbitrary integers */
  given Arbitrary[SmallInt] =
    Arbitrary(arbitrary[Int].map(n => SmallInt((n % 10).abs + 1)))

0 comments on commit 08e4c4d

Please sign in to comment.