From 47c73b701d89e35326edda623a3f2965ed9df4ec Mon Sep 17 00:00:00 2001
From: Eugene Yokota
Date: Mon, 25 May 2026 15:28:50 -0400
Subject: [PATCH] Port imohash to Scala

---
 build.sbt                                     | 24 +++---
 .../sbt/internal/util/hashing/FileHash.scala  | 20 +++++
 .../util/hashing/FileSampleHash.scala         | 75 +++++++++++++++++++
 .../sbt/internal/util/hashing/Hashing.scala   |  4 +
 .../internal/util/FileSampleHashTest.scala    | 44 +++++++++++
 .../src/main/scala/sbt/util/Digest.scala      | 56 +++++++++++++-
 .../src/test/scala/sbt/util/DigestTest.scala  | 28 +++++++
 7 files changed, 237 insertions(+), 14 deletions(-)
 create mode 100644 internal/util-control/src/main/scala/sbt/internal/util/hashing/FileHash.scala
 create mode 100644 internal/util-control/src/main/scala/sbt/internal/util/hashing/FileSampleHash.scala
 create mode 100644 internal/util-control/src/test/scala/sbt/internal/util/FileSampleHashTest.scala

diff --git a/build.sbt b/build.sbt
index c78323fcf..8614c6dc9 100644
--- a/build.sbt
+++ b/build.sbt
@@ -292,16 +292,18 @@ lazy val utilInterface = (project in file("internal") / "util-interface").settin
   mimaSettings,
 )
 
-lazy val utilControl = (project in file("internal") / "util-control").settings(
-  utilCommonSettings,
-  name := "Util Control",
-  libraryDependencies ++= Seq(
-    scalacheck % Test,
-    scalaVerify % Test,
-    hedgehog % Test,
-  ),
-  mimaSettings,
-)
+lazy val utilControl = (project in file("internal") / "util-control")
+  .settings(
+    utilCommonSettings,
+    name := "Util Control",
+    libraryDependencies ++= Seq(
+      scalacheck % Test,
+      scalaVerify % Test,
+      hedgehog % Test,
+    ),
+    mimaSettings,
+  )
+  .configure(addSbtIOForTest) // NOTE(review): presumably adds sbt-io for tests only (FileSampleHashTest imports sbt.io.IO) -- confirm against addSbtIOForTest
 
 lazy val utilPosition = (project in file("internal") / "util-position")
   .settings(
@@ -374,7 +376,7 @@ lazy val utilCache = project
     // we generate JsonCodec only for actionresult.contra
     JsonCodecPlugin,
   )
-  .dependsOn(utilLogging)
+  .dependsOn(utilLogging, utilControl) // util-control hosts sbt.internal.util.hashing.Hashing, which Digest.scala imports
   .settings(
     testedBaseSettings,
     name := "Util Cache",
diff --git
a/internal/util-control/src/main/scala/sbt/internal/util/hashing/FileHash.scala b/internal/util-control/src/main/scala/sbt/internal/util/hashing/FileHash.scala
new file mode 100644
index 000000000..c72fecd99
--- /dev/null
+++ b/internal/util-control/src/main/scala/sbt/internal/util/hashing/FileHash.scala
@@ -0,0 +1,21 @@
+/*
+ * sbt
+ * Copyright 2023, Scala center
+ * Copyright 2011 - 2022, Lightbend, Inc.
+ * Copyright 2008 - 2010, Mark Harrah
+ * Licensed under Apache License 2.0 (see LICENSE)
+ *
+ */
+
+package sbt.internal.util.hashing
+
+import java.io.File
+import java.nio.file.{ Path as NioPath }
+
+/** Computes a `Long` hash value for a file. */
+trait FileHash:
+  def hash(file: File): Long
+  def hash(file: NioPath): Long
+  override def toString(): String =
+    getClass().getSimpleName()
+end FileHash
diff --git a/internal/util-control/src/main/scala/sbt/internal/util/hashing/FileSampleHash.scala b/internal/util-control/src/main/scala/sbt/internal/util/hashing/FileSampleHash.scala
new file mode 100644
index 000000000..46ccd23be
--- /dev/null
+++ b/internal/util-control/src/main/scala/sbt/internal/util/hashing/FileSampleHash.scala
@@ -0,0 +1,105 @@
+/*
+ * sbt
+ * Copyright 2023, Scala center
+ * Copyright 2011 - 2022, Lightbend, Inc.
+ * Copyright 2008 - 2010, Mark Harrah
+ * Licensed under Apache License 2.0 (see LICENSE)
+ *
+ */
+
+package sbt.internal.util.hashing
+
+import java.io.{ File, RandomAccessFile }
+import java.nio.ByteBuffer
+import java.nio.file.{ Path as NioPath }
+import scala.util.Using
+
+object FileSampleHash:
+  /** Number of bytes read for each sample. */
+  final val defaultSampleBytes = 16 * 1024
+
+  /** Files shorter than this many bytes are hashed in full instead of sampled. */
+  final val defaultThresholdBytes = 128L * 1024L
+
+  @deprecated("misspelled; use defaultThresholdBytes")
+  final val defaultThresoldBytes = defaultThresholdBytes
+
+  /** Creates a sampling hash that uses the default sample size and threshold. */
+  def apply(underlying: StreamingHashAlgo): FileSampleHash =
+    new FileSampleHash(defaultSampleBytes, defaultThresholdBytes, underlying)
+end FileSampleHash
+
+/**
+ * Based on Imohash https://github.com/kalafut/imohash/blob/master/algorithm.md
+ *
+ * Files shorter than `thresholdBytes`, or too short to hold the samples, are
+ * fed to `underlying` in full. Other files contribute three samples of
+ * `sampleBytes` bytes each, read from the start, the midpoint and the end of
+ * the file. For a non-empty file the file length is then added as 8 more
+ * bytes before the hash value is read.
+ *
+ * An instance reuses one read buffer and the stateful `underlying` hasher for
+ * every call, so it must not be used from several threads at the same time.
+ *
+ * @param sampleBytes size of each sample; a value below 1 disables sampling
+ * @param thresholdBytes files shorter than this are always hashed in full
+ * @param underlying streaming hash that receives the selected bytes
+ */
+class FileSampleHash(sampleBytes: Int, thresholdBytes: Long, underlying: StreamingHashAlgo)
+    extends FileHash:
+  require(sampleBytes >= 0)
+
+  val buffer: Array[Byte] = new Array[Byte](4096)
+
+  override def hash(file: NioPath): Long =
+    hash(file.toFile())
+
+  override def hash(file: File): Long =
+    Using.resource(new RandomAccessFile(file, "r")): raf =>
+      hash(raf, raf.length())
+
+  private def hash(input: RandomAccessFile, fileLength: Long): Long =
+    underlying.reset()
+    // The middle sample starts at fileLength / 2, so all three samples can only
+    // be read when the second half of the file holds at least sampleBytes bytes.
+    // Fall back to hashing the whole file otherwise, instead of seeking to a
+    // negative offset or failing with "unexpected EOF".
+    val samplesFit = fileLength - fileLength / 2 >= sampleBytes
+    if fileLength < thresholdBytes || sampleBytes < 1 || !samplesFit then
+      hashBytes(input, fileLength)
+    else
+      hashBytes(input, sampleBytes)
+      // skip to halfway point
+      input.seek(fileLength / 2)
+      hashBytes(input, sampleBytes)
+      input.seek(fileLength - sampleBytes)
+      hashBytes(input, sampleBytes)
+
+    // write file size
+    if fileLength > 0 then
+      val sizeBuf = ByteBuffer.allocate(java.lang.Long.BYTES)
+      sizeBuf.putLong(fileLength)
+      underlying.update(sizeBuf.array(), 0, sizeBuf.array().length)
+
+    underlying.getValue
+  end hash
+
+  /**
+   * Feeds exactly `toHash` bytes, starting at the current position of `input`,
+   * into `underlying`. Fails with "unexpected EOF" if the file ends earlier.
+   */
+  private def hashBytes(input: RandomAccessFile, toHash: Long): Unit =
+    var remaining: Long = toHash
+    var pos = 0
+    while remaining > 0 do
+      val toread = math.min(buffer.length - pos, remaining).toInt
+      val bytesRead = input.read(buffer, pos, toread)
+      if bytesRead < 0 then sys.error("unexpected EOF")
+      pos += bytesRead
+      remaining -= bytesRead
+      if pos >= buffer.length then
+        underlying.update(buffer, 0, buffer.length)
+        pos = 0
+    if pos > 0 then underlying.update(buffer, 0, pos)
+  end hashBytes
+end FileSampleHash
diff --git a/internal/util-control/src/main/scala/sbt/internal/util/hashing/Hashing.scala b/internal/util-control/src/main/scala/sbt/internal/util/hashing/Hashing.scala
index 2a7f929ca..781e21948 100644
--- a/internal/util-control/src/main/scala/sbt/internal/util/hashing/Hashing.scala
+++ b/internal/util-control/src/main/scala/sbt/internal/util/hashing/Hashing.scala
@@ -17,4 +17,8 @@ object Hashing:
     new StreamingXXHash64VarHandle(seed)
   def newStreamingWyHash64(seed: Long): StreamingHashAlgo =
     new StreamingWyHash64VarHandle(seed)
+  def samplingFileHashXXHash64(seed: Long): FileHash =
+    FileSampleHash(newStreamingXXHash64(seed))
+  def samplingFileHashWyHash64(seed: Long): FileHash =
+    FileSampleHash(newStreamingWyHash64(seed))
 end Hashing
diff --git a/internal/util-control/src/test/scala/sbt/internal/util/FileSampleHashTest.scala b/internal/util-control/src/test/scala/sbt/internal/util/FileSampleHashTest.scala
new file mode 100644
index 000000000..92fe09e0a
--- /dev/null
+++ b/internal/util-control/src/test/scala/sbt/internal/util/FileSampleHashTest.scala
@@ -0,0 +1,44 @@
+/*
+ * sbt
+ * Copyright 2023, Scala center
+ * Copyright 2011 - 2022, Lightbend, Inc.
+ * Copyright 2008 - 2010, Mark Harrah
+ * Licensed under Apache License 2.0 (see LICENSE)
+ *
+ */
+
+package sbt.internal.util.hashing
+
+import verify.BasicTestSuite
+import sbt.io.IO
+import sbt.io.syntax.*
+
+object FileSampleHashTest extends BasicTestSuite:
+  val emptyHash = -1205034819632174695L
+  val testHash = 2563739794714397383L
+
+  test("Hash empty file"):
+    val hash64 = Hashing.samplingFileHashXXHash64(0)
+    IO.withTemporaryDirectory: dir =>
+      val temp = dir / "test.txt"
+      IO.touch(temp)
+      val h = hash64.hash(temp)
+      assert(h == emptyHash)
+
+  test("Hash small file"):
+    val hash64 = Hashing.samplingFileHashXXHash64(0)
+    IO.withTemporaryDirectory: dir =>
+      val temp = dir / "test.txt"
+      IO.write(temp, "test")
+      val h = hash64.hash(temp)
+      assert(h == testHash)
+
+  test("Hash medium file (1MB)"):
+    val hash64 = Hashing.samplingFileHashXXHash64(0)
+    IO.withTemporaryDirectory: dir =>
+      val temp = dir / "test.txt"
+      val buf: Array[Byte] = Array.fill[Byte](1024)(0.toByte)
+      for i <- 0 until 1024 do IO.append(temp, buf)
+      val h = hash64.hash(temp)
+      assert(h == -5176567862428962592L)
+end FileSampleHashTest
diff --git a/util-cache/src/main/scala/sbt/util/Digest.scala b/util-cache/src/main/scala/sbt/util/Digest.scala
index 8e059b8ea..bac3fb08f 100644
--- a/util-cache/src/main/scala/sbt/util/Digest.scala
+++ b/util-cache/src/main/scala/sbt/util/Digest.scala
@@ -2,6 +2,7 @@ package sbt.util
 
 import sjsonnew.IsoString
 import sbt.io.Hash
+import sbt.internal.util.hashing.Hashing
 import xsbti.HashedVirtualFileRef
 import java.io.{ BufferedInputStream, InputStream }
 import java.nio.ByteBuffer
@@ -18,6 +19,10 @@ object Digest:
   private[sbt] val Sha256 = "sha256"
   private[sbt] val Sha384 = "sha384"
   private[sbt] val Sha512 = "sha512"
+  private[sbt] val Imoxx64 = "imoxx64"
+  private[sbt] val Imowy64 = "imowy64"
+  private[sbt] val Xx64 = "xx64"
+  private[sbt] val Wy64 = "wy64"
 
   extension (d: Digest)
     def contentHashStr: String =
@@ -44,9 +49,24 @@ object Digest:
     apply(ref.contentHashStr() + "/" + ref.sizeBytes.toString)
 
   def apply(algo: String, path: Path): Digest =
-    Using.resource(Files.newInputStream(path)) { input =>
-      apply(algo, hashBytes(algo, input), Files.size(path))
-    }
+    algo match
+      case Imoxx64 =>
+        val hash64 = Hashing.samplingFileHashXXHash64(0)
+        val h = hash64.hash(path)
+        apply(algo, longsToBytes(Array(h)), Files.size(path))
+      case Imowy64 =>
+        val hash64 = Hashing.samplingFileHashWyHash64(0)
+        val h = hash64.hash(path)
+        apply(algo, longsToBytes(Array(h)), Files.size(path))
+      case Xx64 | Wy64 =>
+        Using.resource(Files.newInputStream(path)) { input =>
+          val h = hashBytesInternal(algo, input)
+          apply(algo, longsToBytes(Array(h)), Files.size(path))
+        }
+      case _ =>
+        Using.resource(Files.newInputStream(path)) { input =>
+          apply(algo, hashBytes(algo, input), Files.size(path))
+        }
 
   // used to wrap a Long value as a fake Digest, which will
   // later be hashed using sha256 anyway.
@@ -67,6 +87,10 @@ object Digest:
   def sha256Hash(digests: Digest*): Digest =
     sha256Hash(digests.toSeq.map(_.toBytes).flatten.toArray[Byte])
 
+  def imoxx64Hash(path: Path): Digest = apply(Imoxx64, path)
+
+  def xx64Hash(path: Path): Digest = apply(Xx64, path)
+
   private[sbt] def md5Hash(bytes: Array[Byte]): Digest =
     apply(Md5, hashBytes(Md5, bytes), bytes.length)
 
@@ -93,6 +117,30 @@ object Digest:
       digest.digest
     }
 
+  /**
+   * Streams `input` to its end through one of the 64-bit hashes created by
+   * `Hashing` (seed 0) and returns the resulting value.
+   * The stream is closed when this method returns.
+   */
+  private def hashBytesInternal(algo: String, input: InputStream): Long =
+    val BufferSize = 8192
+    Using.resource(BufferedInputStream(input)) { bis =>
+      val digest = algo match
+        case Xx64 => Hashing.newStreamingXXHash64(0)
+        case Wy64 => Hashing.newStreamingWyHash64(0)
+        case _    => throw IllegalArgumentException(s"unsupported algorithm: $algo")
+      try
+        val buf = new Array[Byte](BufferSize)
+        while
+          // read through the buffered wrapper rather than the raw stream
+          val readBytes = bis.read(buf)
+          if readBytes >= 0 then digest.update(buf, 0, readBytes)
+          readBytes >= 0
+        do ()
+        digest.getValue
+      finally digest.close()
+    }
+
   private def validateString(s: String): Unit =
     parse(s)
     ()
@@ -103,6 +151,14 @@ object Digest:
       case head :: rest :: Nil =>
         val subtokens = head :: rest.split("/").toList
         subtokens match
+          case (a @ Xx64) :: value :: sizeBytes :: Nil =>
+            (a, value, sizeBytes.toLong, parseHex(value, 64))
+          case (a @ Wy64) :: value :: sizeBytes :: Nil =>
+            (a, value, sizeBytes.toLong, parseHex(value, 64))
+          case (a @ Imoxx64) :: value :: sizeBytes :: Nil =>
+            (a, value, sizeBytes.toLong, parseHex(value, 64))
+          case (a @ Imowy64) :: value :: sizeBytes :: Nil =>
+            (a, value, sizeBytes.toLong, parseHex(value, 64))
           case (a @ Murmur3) :: value :: sizeBytes :: Nil =>
             (a, value, sizeBytes.toLong, parseHex(value, 128))
           case (a @ Md5) :: value :: sizeBytes :: Nil =>
diff --git a/util-cache/src/test/scala/sbt/util/DigestTest.scala b/util-cache/src/test/scala/sbt/util/DigestTest.scala
index 15c5de7d2..57b9a2c7b 100644
--- a/util-cache/src/test/scala/sbt/util/DigestTest.scala
+++ b/util-cache/src/test/scala/sbt/util/DigestTest.scala
@@ -42,6 +42,34 @@ object DigestTest extends verify.BasicTestSuite:
     testEmptyFile("sha512", expected)
   }
 
+  test("imoxx64") {
+    val expected = Digest(
+      "imoxx64-ef46db3751d8e999/0"
+    )
+    testEmptyFile("imoxx64", expected)
+  }
+
+  test("imowy64") {
+    val expected = Digest(
+      "imowy64-0409638ee2bde459/0"
+    )
+    testEmptyFile("imowy64", expected)
+  }
+
+  test("xx64") {
+    val expected = Digest(
+      "xx64-ef46db3751d8e999/0"
+    )
+    testEmptyFile("xx64", expected)
+  }
+
+  test("wy64") {
+    val expected = Digest(
+      "wy64-0409638ee2bde459/0"
+    )
+    testEmptyFile("wy64", expected)
+  }
+
   test("digest composition") {
     val dummy1 = Digest.dummy(0L)
     val dummy2 = Digest.dummy(0L)