Port imohash to Scala

This commit is contained in:
Eugene Yokota 2026-05-25 15:28:50 -04:00
parent c40beae1ff
commit 47c73b701d
7 changed files with 237 additions and 14 deletions

View File

@ -292,16 +292,18 @@ lazy val utilInterface = (project in file("internal") / "util-interface").settin
mimaSettings,
)
// Util Control subproject, rooted at internal/util-control.
// NOTE(review): addSbtIOForTest presumably puts sbt's IO library on the Test
// classpath (the hashing tests import sbt.io) — confirm against its definition.
lazy val utilControl = (project in file("internal") / "util-control")
  .settings(
    utilCommonSettings,
    name := "Util Control",
    // test-only dependencies
    libraryDependencies ++= Seq(
      scalacheck % Test,
      scalaVerify % Test,
      hedgehog % Test,
    ),
    mimaSettings,
  )
  .configure(addSbtIOForTest)
lazy val utilPosition = (project in file("internal") / "util-position")
.settings(
@ -374,7 +376,7 @@ lazy val utilCache = project
// we generate JsonCodec only for actionresult.contra
JsonCodecPlugin,
)
.dependsOn(utilLogging)
.dependsOn(utilLogging, utilControl)
.settings(
testedBaseSettings,
name := "Util Cache",

View File

@ -0,0 +1,20 @@
/*
* sbt
* Copyright 2023, Scala center
* Copyright 2011 - 2022, Lightbend, Inc.
* Copyright 2008 - 2010, Mark Harrah
* Licensed under Apache License 2.0 (see LICENSE)
*
*/
package sbt.internal.util.hashing
import java.io.File
import java.nio.file.{ Path as NioPath }
/**
 * Computes a `Long` hash for a file, addressed either as a `java.io.File`
 * or as a `java.nio.file.Path`.
 */
trait FileHash:
  /** Hashes the file identified by the given NIO path. */
  def hash(file: NioPath): Long

  /** Hashes the given `java.io.File`. */
  def hash(file: File): Long

  /** Renders as the simple class name of the concrete implementation. */
  override def toString(): String = getClass.getSimpleName
end FileHash

View File

@ -0,0 +1,75 @@
/*
* sbt
* Copyright 2023, Scala center
* Copyright 2011 - 2022, Lightbend, Inc.
* Copyright 2008 - 2010, Mark Harrah
* Licensed under Apache License 2.0 (see LICENSE)
*
*/
package sbt.internal.util.hashing
import java.io.{ File, RandomAccessFile }
import java.nio.ByteBuffer
import java.nio.file.{ Path as NioPath }
import scala.util.Using
object FileSampleHash:
  /** Default length in bytes of each sampled window (16 KiB). */
  final val defaultSampleBytes = 16 * 1024

  /** Default file size in bytes below which a file is hashed in full (128 KiB). */
  final val defaultThresholdBytes = 128L * 1024L

  /**
   * Misspelled original name of [[defaultThresholdBytes]]; retained so that
   * existing references keep compiling.
   */
  final val defaultThresoldBytes = defaultThresholdBytes

  /** Creates a sampling hash with the default sample size and threshold on top of `underlying`. */
  def apply(underlying: StreamingHashAlgo): FileSampleHash =
    new FileSampleHash(defaultSampleBytes, defaultThresholdBytes, underlying)
end FileSampleHash
/**
 * Sampling file hash based on Imohash
 * https://github.com/kalafut/imohash/blob/master/algorithm.md
 *
 * A file shorter than `thresholdBytes` (or any file when `sampleBytes < 1`) is hashed
 * in full. Otherwise only three windows of `sampleBytes` bytes are hashed: the start
 * of the file, the window beginning at `fileLength / 2`, and the last `sampleBytes`
 * bytes. For a non-empty file the length is then fed to the hasher as 8 bytes
 * (`ByteBuffer` default byte order).
 *
 * NOTE(review): every call reuses the same `buffer` and resets the same `underlying`
 * hasher, so concurrent calls on one instance would interfere with each other.
 *
 * @param sampleBytes    length of each sampled window; must not be negative
 * @param thresholdBytes files shorter than this are hashed in full
 * @param underlying     streaming hasher that receives the selected bytes
 */
class FileSampleHash(sampleBytes: Int, thresholdBytes: Long, underlying: StreamingHashAlgo)
    extends FileHash:
  require(sampleBytes >= 0)

  // scratch space reused by every read
  val buffer: Array[Byte] = new Array[Byte](4096)

  override def hash(file: NioPath): Long =
    hash(file.toFile())

  override def hash(file: File): Long =
    Using.resource(new RandomAccessFile(file, "r")): raf =>
      hash(raf, raf.length())

  /** Hashes the opened file, whose length was captured by the caller as `fileLength`. */
  private def hash(input: RandomAccessFile, fileLength: Long): Long =
    underlying.reset()
    // A file shorter than a single sample cannot provide the three windows (the first
    // read would hit EOF and the tail seek would be negative), which can happen when
    // thresholdBytes is configured below sampleBytes. Hash such a file in full.
    val hashWholeFile =
      fileLength < thresholdBytes || sampleBytes < 1 || fileLength < sampleBytes
    if hashWholeFile then hashBytes(input, fileLength)
    else
      // leading window
      hashBytes(input, sampleBytes)
      // skip to halfway point
      input.seek(fileLength / 2)
      hashBytes(input, sampleBytes)
      // trailing window
      input.seek(fileLength - sampleBytes)
      hashBytes(input, sampleBytes)
    // write file size
    if fileLength > 0 then
      val sizeBuf = ByteBuffer.allocate(java.lang.Long.BYTES)
      sizeBuf.putLong(fileLength)
      val sizeBytes = sizeBuf.array()
      underlying.update(sizeBytes, 0, sizeBytes.length)
    underlying.getValue
  end hash

  /**
   * Feeds exactly `toHash` bytes from the current file position into `underlying`.
   * Bytes are accumulated in `buffer` and handed over whenever it fills up, with one
   * final partial update for the remainder. Fails with "unexpected EOF" when the file
   * ends before `toHash` bytes have been read.
   */
  private def hashBytes(input: RandomAccessFile, toHash: Long): Unit =
    var remaining: Long = toHash
    var pos = 0
    while remaining > 0 do
      val toread = math.min(buffer.length - pos, remaining).toInt
      val bytesRead = input.read(buffer, pos, toread)
      if bytesRead < 0 then sys.error("unexpected EOF")
      pos += bytesRead
      remaining -= bytesRead
      if pos >= buffer.length then
        underlying.update(buffer, 0, buffer.length)
        pos = 0
    // flush whatever did not fill a complete buffer
    if pos > 0 then underlying.update(buffer, 0, pos)
  end hashBytes
end FileSampleHash

View File

@ -17,4 +17,8 @@ object Hashing:
new StreamingXXHash64VarHandle(seed)
/** Creates a new `StreamingWyHash64VarHandle` constructed with `seed`, typed as a `StreamingHashAlgo`. */
def newStreamingWyHash64(seed: Long): StreamingHashAlgo =
new StreamingWyHash64VarHandle(seed)
/**
 * Builds a `FileHash` by handing the hasher from `newStreamingXXHash64(seed)`
 * to `FileSampleHash.apply`.
 */
def samplingFileHashXXHash64(seed: Long): FileHash =
FileSampleHash(newStreamingXXHash64(seed))
/**
 * Builds a `FileHash` by handing the hasher from `newStreamingWyHash64(seed)`
 * to `FileSampleHash.apply`.
 */
def samplingFileHashWyHash64(seed: Long): FileHash =
FileSampleHash(newStreamingWyHash64(seed))
end Hashing

View File

@ -0,0 +1,44 @@
/*
* sbt
* Copyright 2023, Scala center
* Copyright 2011 - 2022, Lightbend, Inc.
* Copyright 2008 - 2010, Mark Harrah
* Licensed under Apache License 2.0 (see LICENSE)
*
*/
package sbt.internal.util.hashing
import verify.BasicTestSuite
import sbt.io.IO
import sbt.io.syntax.*
object FileSampleHashTest extends BasicTestSuite:
  // Expected results of the XXHash64-backed sampling hash (seed 0)
  // for an empty file and for a file containing "test".
  val emptyHash = -1205034819632174695L
  val testHash = 2563739794714397383L

  test("Hash empty file"):
    IO.withTemporaryDirectory: dir =>
      val target = dir / "test.txt"
      IO.touch(target)
      val hasher = Hashing.samplingFileHashXXHash64(0)
      assert(hasher.hash(target) == emptyHash)

  test("Hash small file"):
    IO.withTemporaryDirectory: dir =>
      val target = dir / "test.txt"
      IO.write(target, "test")
      val hasher = Hashing.samplingFileHashXXHash64(0)
      assert(hasher.hash(target) == testHash)

  test("Hash medium file (1MB)"):
    IO.withTemporaryDirectory: dir =>
      val target = dir / "test.txt"
      // 1024 appends of a zero-filled 1024-byte block
      val zeroBlock = new Array[Byte](1024)
      (0 until 1024).foreach(_ => IO.append(target, zeroBlock))
      val hasher = Hashing.samplingFileHashXXHash64(0)
      assert(hasher.hash(target) == -5176567862428962592L)
end FileSampleHashTest

View File

@ -2,6 +2,7 @@ package sbt.util
import sjsonnew.IsoString
import sbt.io.Hash
import sbt.internal.util.hashing.Hashing
import xsbti.HashedVirtualFileRef
import java.io.{ BufferedInputStream, InputStream }
import java.nio.ByteBuffer
@ -18,6 +19,10 @@ object Digest:
private[sbt] val Sha256 = "sha256"
private[sbt] val Sha384 = "sha384"
private[sbt] val Sha512 = "sha512"
// Algorithm names for the 64-bit hashes backed by sbt.internal.util.hashing.Hashing.
// The "imo" names select Hashing's sampling file hash; "xx64" and "wy64" select the
// corresponding streaming hasher applied to the whole input stream.
private[sbt] val Imoxx64 = "imoxx64"
private[sbt] val Imowy64 = "imowy64"
private[sbt] val Xx64 = "xx64"
private[sbt] val Wy64 = "wy64"
extension (d: Digest)
def contentHashStr: String =
@ -44,9 +49,24 @@ object Digest:
apply(ref.contentHashStr() + "/" + ref.sizeBytes.toString)
/**
 * Computes the digest of the file at `path` with the algorithm named `algo`.
 *
 *  - `Imoxx64` / `Imowy64`: sampling file hash from `Hashing`, seed 0.
 *  - `Xx64` / `Wy64`: whole-stream hash via `hashBytesInternal`.
 *  - anything else: delegated to `hashBytes(algo, input)`.
 *
 * In every case the recorded size is `Files.size(path)`.
 */
def apply(algo: String, path: Path): Digest =
  // Wraps a single Long hash value as the digest of `path`.
  def fromLong(h: Long): Digest =
    apply(algo, longsToBytes(Array(h)), Files.size(path))

  algo match
    case Imoxx64 =>
      fromLong(Hashing.samplingFileHashXXHash64(0).hash(path))
    case Imowy64 =>
      fromLong(Hashing.samplingFileHashWyHash64(0).hash(path))
    case Xx64 | Wy64 =>
      Using.resource(Files.newInputStream(path)) { input =>
        fromLong(hashBytesInternal(algo, input))
      }
    case _ =>
      Using.resource(Files.newInputStream(path)) { input =>
        apply(algo, hashBytes(algo, input), Files.size(path))
      }
// used to wrap a Long value as a fake Digest, which will
// later be hashed using sha256 anyway.
@ -67,6 +87,10 @@ object Digest:
/** Hashes the concatenation of the byte forms of `digests`, taken in argument order. */
def sha256Hash(digests: Digest*): Digest =
  val perDigest = digests.toSeq.map(_.toBytes)
  sha256Hash(perDigest.flatten.toArray[Byte])
/** Digest of the file at `path` under the `Imoxx64` algorithm name; shorthand for `apply(Imoxx64, path)`. */
def imoxx64Hash(path: Path): Digest = apply(Imoxx64, path)
/** Digest of the file at `path` under the `Xx64` algorithm name; shorthand for `apply(Xx64, path)`. */
def xx64Hash(path: Path): Digest = apply(Xx64, path)
/** Digest of `bytes` under the `Md5` algorithm name, recording `bytes.length` as the size. */
private[sbt] def md5Hash(bytes: Array[Byte]): Digest =
apply(Md5, hashBytes(Md5, bytes), bytes.length)
@ -93,6 +117,24 @@ object Digest:
digest.digest
}
// using our own hashing algorithms
/**
 * Streams `input` to its end through one of the `Hashing` streaming hashers (seed 0)
 * and returns the resulting 64-bit value.
 *
 * @param algo  `Xx64` or `Wy64`; any other name is rejected with an
 *              `IllegalArgumentException`
 * @param input stream to consume; it is closed when this method returns
 */
private def hashBytesInternal(algo: String, input: InputStream): Long =
  val BufferSize = 8192
  Using.resource(BufferedInputStream(input)) { bis =>
    val digest = algo match
      case Xx64 => Hashing.newStreamingXXHash64(0)
      case Wy64 => Hashing.newStreamingWyHash64(0)
      case other =>
        throw new IllegalArgumentException(s"unsupported internal hash algorithm: $other")
    // release the hasher even when reading fails part-way through
    try
      val buf = new Array[Byte](BufferSize)
      // read through the buffered wrapper rather than the raw stream
      var readBytes = bis.read(buf)
      while readBytes >= 0 do
        digest.update(buf, 0, readBytes)
        readBytes = bis.read(buf)
      digest.getValue
    finally digest.close()
  }
// Runs `s` through `parse` and discards the parsed result, so the only observable
// effect is whatever `parse` does on bad input.
// NOTE(review): presumably `parse` throws on a malformed digest string — confirm.
private def validateString(s: String): Unit =
parse(s)
()
@ -103,6 +145,14 @@ object Digest:
case head :: rest :: Nil =>
val subtokens = head :: rest.split("/").toList
subtokens match
case (a @ Xx64) :: value :: sizeBytes :: Nil =>
(a, value, sizeBytes.toLong, parseHex(value, 64))
case (a @ Wy64) :: value :: sizeBytes :: Nil =>
(a, value, sizeBytes.toLong, parseHex(value, 64))
case (a @ Imoxx64) :: value :: sizeBytes :: Nil =>
(a, value, sizeBytes.toLong, parseHex(value, 64))
case (a @ Imowy64) :: value :: sizeBytes :: Nil =>
(a, value, sizeBytes.toLong, parseHex(value, 64))
case (a @ Murmur3) :: value :: sizeBytes :: Nil =>
(a, value, sizeBytes.toLong, parseHex(value, 128))
case (a @ Md5) :: value :: sizeBytes :: Nil =>

View File

@ -42,6 +42,34 @@ object DigestTest extends verify.BasicTestSuite:
testEmptyFile("sha512", expected)
}
// Empty-file expectations for the 64-bit algorithms. The hash part expected for
// "imoxx64" is the same as for "xx64", and likewise for "imowy64" and "wy64".
test("imoxx64") {
  testEmptyFile("imoxx64", Digest("imoxx64-ef46db3751d8e999/0"))
}
test("imowy64") {
  testEmptyFile("imowy64", Digest("imowy64-0409638ee2bde459/0"))
}
test("xx64") {
  testEmptyFile("xx64", Digest("xx64-ef46db3751d8e999/0"))
}
test("wy64") {
  testEmptyFile("wy64", Digest("wy64-0409638ee2bde459/0"))
}
test("digest composition") {
val dummy1 = Digest.dummy(0L)
val dummy2 = Digest.dummy(0L)