Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add CRC64-AVRO-LE fingerprint type #491

Merged
merged 2 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 68 additions & 11 deletions pkg/crc64/crc64.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@ func init() {
// Size is the size of a CRC-64 checksum in bytes.
const Size = 8

// ByteOrder denotes how integers are encoded into bytes. The ByteOrder
// interface in encoding/binary cancels some optimizations, so use a more
// direct implementation.
//
// The zero value is LittleEndian.
type ByteOrder int

// ByteOrder constants.
const (
// LittleEndian lays the checksum out least-significant byte first.
LittleEndian ByteOrder = iota
// BigEndian lays the checksum out most-significant byte first.
BigEndian
)

// Empty is the empty checksum. newDigest seeds a fresh digest's crc with it,
// so it is the digest state before any data has been written.
const Empty = 0xc15d213aa4d7a795

Expand All @@ -38,16 +49,28 @@ func buildTable() {
}

// digest carries the state of one Avro CRC-64 computation.
//
// NOTE(review): this span looks like diff-view residue. The first crc/tab
// pair appears to be the removed lines; the second pair plus byteOrder the
// added ones. Confirm against the real file.
type digest struct {
crc uint64
tab *Table
// crc holds the checksum state; newDigest initializes it to Empty.
crc uint64
// tab points at the Table in use; newDigest sets it to crc64Table.
tab *Table
// byteOrder selects the byte layout that sumBytes produces.
byteOrder ByteOrder
}

// New returns a hash.Hash64 that computes the Avro CRC-64 checksum.
// The bytes produced by its Sum method are in big-endian order.
func New() hash.Hash64 {
	var h hash.Hash64 = newDigest(BigEndian)
	return h
}

// NewWithByteOrder returns a hash.Hash64 that computes the Avro CRC-64
// checksum and whose Sum method emits the value in byteOrder.
//
// byteOrder is stored as given; a value other than LittleEndian or BigEndian
// makes Sum panic.
func NewWithByteOrder(byteOrder ByteOrder) hash.Hash64 {
	d := newDigest(byteOrder)
	return d
}

// newDigest returns a digest whose crc starts at Empty, whose table is
// crc64Table, and whose output layout is byteOrder.
//
// NOTE(review): diff-view residue. The first crc/tab pair looks like the
// removed lines and the three fields after it the added ones. Confirm.
func newDigest(byteOrder ByteOrder) *digest {
return &digest{
crc: Empty,
tab: crc64Table,
crc: Empty,
tab: crc64Table,
byteOrder: byteOrder,
}
}

Expand Down Expand Up @@ -82,16 +105,50 @@ func (d *digest) Sum64() uint64 {

// Sum appends the checksum, encoded in the digest's byte order, to in and
// returns the resulting slice.
func (d *digest) Sum(in []byte) []byte {
	sum := d.sumBytes()
	out := append(in, sum[:]...)
	return out
}

// sumBytes returns the checksum as a byte array in digest byte order.
// It panics if the digest's byteOrder is neither LittleEndian nor BigEndian.
func (d *digest) sumBytes() [Size]byte {
s := d.Sum64()
// NOTE(review): the next line references `in`, which this function does not
// declare. It looks like a removed line of the old Sum body left over from
// the diff view. Confirm against the real file.
return append(in, byte(s>>56), byte(s>>48), byte(s>>40), byte(s>>32), byte(s>>24), byte(s>>16), byte(s>>8), byte(s))

switch d.byteOrder {
case LittleEndian:
// Least-significant byte first.
return [Size]byte{
byte(s),
byte(s >> 8),
byte(s >> 16),
byte(s >> 24),
byte(s >> 32),
byte(s >> 40),
byte(s >> 48),
byte(s >> 56),
}
case BigEndian:
// Most-significant byte first.
return [Size]byte{
byte(s >> 56),
byte(s >> 48),
byte(s >> 40),
byte(s >> 32),
byte(s >> 24),
byte(s >> 16),
byte(s >> 8),
byte(s),
}
}
// Reached only when byteOrder matches neither case above.
panic("unknown byte order")
}

// Sum returns the MD5 checksum of the data.
// Sum returns the CRC64 checksum of the data, in big-endian byte order.
//
// NOTE(review): diff-view residue. The "MD5" sentence above and the two
// lines that build and reset d appear to be removed lines; the call to
// SumWithByteOrder is what produces the returned value. Confirm.
func Sum(data []byte) [Size]byte {
d := digest{crc: Empty, tab: crc64Table}
d.Reset()
return SumWithByteOrder(data, BigEndian)
}

// SumWithByteOrder returns the CRC64 checksum of the data, in specified byte
// order.
//
// NOTE(review): diff-view residue. The Sum64 call and the first return
// (which always emits big-endian bytes) appear to be removed lines; the
// final return via sumBytes is the one that honors byteOrder. Confirm.
func SumWithByteOrder(data []byte, byteOrder ByteOrder) [Size]byte {
d := newDigest(byteOrder)
// Both results of Write are deliberately discarded.
_, _ = d.Write(data)
s := d.Sum64()
//nolint:lll
return [Size]byte{byte(s >> 56), byte(s >> 48), byte(s >> 40), byte(s >> 32), byte(s >> 24), byte(s >> 16), byte(s >> 8), byte(s)}
return d.sumBytes()
}
58 changes: 58 additions & 0 deletions pkg/crc64/crc64_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,40 @@ func TestDigest_BlockSize(t *testing.T) {
assert.Equal(t, 1, hash.BlockSize())
}

// TestGoldenSumWithByteOrder checks SumWithByteOrder against known checksums
// for three schema strings, in both big-endian and little-endian layouts.
func TestGoldenSumWithByteOrder(t *testing.T) {
	// golden pairs an input with its expected checksum bytes per byte order.
	type golden struct {
		in string
		be []byte
		le []byte
	}

	cases := []golden{
		{
			`"null"`,
			[]byte{0x63, 0xdd, 0x24, 0xe7, 0xcc, 0x25, 0x8f, 0x8a},
			[]byte{0x8a, 0x8f, 0x25, 0xcc, 0xe7, 0x24, 0xdd, 0x63},
		},
		{
			`{"name":"foo","type":"fixed","size":15}`,
			[]byte{0x18, 0x60, 0x2e, 0xc3, 0xed, 0x31, 0xa5, 0x04},
			[]byte{0x04, 0xa5, 0x31, 0xed, 0xc3, 0x2e, 0x60, 0x18},
		},
		{
			`{"name":"foo","type":"record","fields":[{"name":"f1","type":"boolean"}]}`,
			[]byte{0x6c, 0xd8, 0xea, 0xf1, 0xc9, 0x68, 0xa3, 0x3b},
			[]byte{0x3b, 0xa3, 0x68, 0xc9, 0xf1, 0xea, 0xd8, 0x6c},
		},
	}

	for idx, tc := range cases {
		// Subtests are named by case index.
		t.Run(strconv.Itoa(idx), func(t *testing.T) {
			input := []byte(tc.in)

			bigEndian := SumWithByteOrder(input, BigEndian)
			assert.Equal(t, tc.be, bigEndian[:])

			littleEndian := SumWithByteOrder(input, LittleEndian)
			assert.Equal(t, tc.le, littleEndian[:])
		})
	}
}

func bench(b *testing.B, size int64) {
b.SetBytes(size)

Expand Down Expand Up @@ -115,3 +149,27 @@ func BenchmarkCrc64(b *testing.B) {
bench(b, 1<<10)
})
}

// BenchmarkSum measures SumWithByteOrder over a 4 KiB buffer, with one
// sub-benchmark per byte order.
func BenchmarkSum(b *testing.B) {
	const bufLen = 4 << 10

	// Fill the buffer with a repeating 0..255 pattern.
	payload := make([]byte, bufLen)
	for idx := range payload {
		payload[idx] = byte(idx)
	}
	size := int64(len(payload))

	b.Run("BigEndian", func(b *testing.B) {
		b.SetBytes(size)
		b.ReportAllocs()
		b.ResetTimer()
		for n := 0; n < b.N; n++ {
			_ = SumWithByteOrder(payload, BigEndian)
		}
	})

	b.Run("LittleEndian", func(b *testing.B) {
		b.SetBytes(size)
		b.ReportAllocs()
		b.ResetTimer()
		for n := 0; n < b.N; n++ {
			_ = SumWithByteOrder(payload, LittleEndian)
		}
	})
}
10 changes: 7 additions & 3 deletions schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,10 @@ type FingerprintType string

// Fingerprint type constants.
//
// NOTE(review): diff-view residue. The first three entries appear to be the
// removed lines and the four after them the added ones. Confirm.
const (
CRC64Avro FingerprintType = "CRC64-AVRO"
MD5 FingerprintType = "MD5"
SHA256 FingerprintType = "SHA256"
// CRC64Avro fingerprints with crc64.Sum.
CRC64Avro FingerprintType = "CRC64-AVRO"
// CRC64AvroLE fingerprints with crc64.SumWithByteOrder using
// crc64.LittleEndian.
CRC64AvroLE FingerprintType = "CRC64-AVRO-LE"
// MD5 fingerprints with md5.Sum.
MD5 FingerprintType = "MD5"
SHA256 FingerprintType = "SHA256"
)

// SchemaCache is a cache of schemas.
Expand Down Expand Up @@ -306,6 +307,9 @@ func (f *fingerprinter) FingerprintUsing(typ FingerprintType, stringer fmt.Strin
case CRC64Avro:
h := crc64.Sum(data)
fingerprint = h[:]
case CRC64AvroLE:
h := crc64.SumWithByteOrder(data, crc64.LittleEndian)
fingerprint = h[:]
case MD5:
h := md5.Sum(data)
fingerprint = h[:]
Expand Down
6 changes: 6 additions & 0 deletions schema_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1216,6 +1216,12 @@ func TestSchema_FingerprintUsing(t *testing.T) {
typ: avro.CRC64Avro,
want: []byte{0x63, 0xdd, 0x24, 0xe7, 0xcc, 0x25, 0x8f, 0x8a},
},
{
name: "Null CRC64-AVRO-LE",
schema: "null",
typ: avro.CRC64AvroLE,
want: []byte{0x8a, 0x8f, 0x25, 0xcc, 0xe7, 0x24, 0xdd, 0x63},
},
{
name: "Null MD5",
schema: "null",
Expand Down
Loading