Skip to content

Commit

Permalink
feat: add CRC64-AVRO-LE fingerprint type
Browse files Browse the repository at this point in the history
The Avro specification details a Single Object Encoding using a header
to associate a schema ID with an Avro payload. The ID is defined as the
CRC64 fingerprint in little-endian encoding.

The pkg/crc64 package only produces big-endian CRC64 sums, and the
CRC64-AVRO fingerprint type is built on it, so it is big-endian as well.
The specification does not state the endianness of the CRC64-AVRO
fingerprint itself (only that it is little-endian when embedded in an
SOE header).

To avoid breaking existing CRC64-AVRO fingerprints, add a new
fingerprint type CRC64-AVRO-LE, identical to CRC64-AVRO except
little-endian.

Generalize crc64.Sum() with a binary.ByteOrder parameter, so callers can
choose big-endian, little-endian, or native encoding as required.

Add tests and benchmarks for the Sum function.

Fixes #489.
  • Loading branch information
kimgr committed Jan 20, 2025
1 parent 68046a4 commit 00737c2
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 9 deletions.
12 changes: 7 additions & 5 deletions pkg/crc64/crc64.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package crc64

import (
"encoding/binary"
"hash"
)

Expand Down Expand Up @@ -86,12 +87,13 @@ func (d *digest) Sum(in []byte) []byte {
return append(in, byte(s>>56), byte(s>>48), byte(s>>40), byte(s>>32), byte(s>>24), byte(s>>16), byte(s>>8), byte(s))
}

// Sum returns the MD5 checksum of the data.
func Sum(data []byte) [Size]byte {
// Sum returns the CRC64 checksum of the data, in given byte order.
func Sum(data []byte, order binary.ByteOrder) [Size]byte {
d := digest{crc: Empty, tab: crc64Table}
d.Reset()
_, _ = d.Write(data)
s := d.Sum64()
//nolint:lll
return [Size]byte{byte(s >> 56), byte(s >> 48), byte(s >> 40), byte(s >> 32), byte(s >> 24), byte(s >> 16), byte(s >> 8), byte(s)}

var b [Size]byte
order.PutUint64(b[:], s)
return b
}
57 changes: 57 additions & 0 deletions pkg/crc64/crc64_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package crc64

import (
"encoding/binary"
"strconv"
"testing"

Expand Down Expand Up @@ -82,6 +83,40 @@ func TestDigest_BlockSize(t *testing.T) {
assert.Equal(t, 1, hash.BlockSize())
}

// TestGoldenEndianSum checks Sum against known-good checksums for a handful
// of schema strings, once per byte order.
func TestGoldenEndianSum(t *testing.T) {
	type golden struct {
		in string
		be []byte
		le []byte
	}

	cases := []golden{
		{
			in: `"null"`,
			be: []byte{0x63, 0xdd, 0x24, 0xe7, 0xcc, 0x25, 0x8f, 0x8a},
			le: []byte{0x8a, 0x8f, 0x25, 0xcc, 0xe7, 0x24, 0xdd, 0x63},
		},
		{
			in: `{"name":"foo","type":"fixed","size":15}`,
			be: []byte{0x18, 0x60, 0x2e, 0xc3, 0xed, 0x31, 0xa5, 0x04},
			le: []byte{0x04, 0xa5, 0x31, 0xed, 0xc3, 0x2e, 0x60, 0x18},
		},
		{
			in: `{"name":"foo","type":"record","fields":[{"name":"f1","type":"boolean"}]}`,
			be: []byte{0x6c, 0xd8, 0xea, 0xf1, 0xc9, 0x68, 0xa3, 0x3b},
			le: []byte{0x3b, 0xa3, 0x68, 0xc9, 0xf1, 0xea, 0xd8, 0x6c},
		},
	}

	for idx := range cases {
		tc := cases[idx]

		// Subtests are named by case index.
		t.Run(strconv.Itoa(idx), func(t *testing.T) {
			input := []byte(tc.in)

			bigEndian := Sum(input, binary.BigEndian)
			assert.Equal(t, tc.be, bigEndian[:])

			littleEndian := Sum(input, binary.LittleEndian)
			assert.Equal(t, tc.le, littleEndian[:])
		})
	}
}

func bench(b *testing.B, size int64) {
b.SetBytes(size)

Expand Down Expand Up @@ -115,3 +150,25 @@ func BenchmarkCrc64(b *testing.B) {
bench(b, 1<<10)
})
}

// BenchmarkSum measures Sum over a 4 KiB buffer, with one sub-benchmark per
// byte order. The byte order is spelled out literally at each call site
// rather than passed through a shared variable.
func BenchmarkSum(b *testing.B) {
	payload := make([]byte, 4<<10)
	for idx := range payload {
		payload[idx] = byte(idx)
	}

	benchBigEndian := func(b *testing.B) {
		b.ReportAllocs()
		b.ResetTimer()
		for n := b.N; n > 0; n-- {
			_ = Sum(payload, binary.BigEndian)
		}
	}

	benchLittleEndian := func(b *testing.B) {
		b.ReportAllocs()
		b.ResetTimer()
		for n := b.N; n > 0; n-- {
			_ = Sum(payload, binary.LittleEndian)
		}
	}

	b.Run("BigEndian", benchBigEndian)
	b.Run("LittleEndian", benchLittleEndian)
}
13 changes: 9 additions & 4 deletions schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bytes"
"crypto/md5"
"crypto/sha256"
"encoding/binary"
"errors"
"fmt"
"sort"
Expand Down Expand Up @@ -106,9 +107,10 @@ type FingerprintType string

// Fingerprint type constants.
const (
CRC64Avro FingerprintType = "CRC64-AVRO"
MD5 FingerprintType = "MD5"
SHA256 FingerprintType = "SHA256"
CRC64Avro FingerprintType = "CRC64-AVRO"
CRC64AvroLE FingerprintType = "CRC64-AVRO-LE"
MD5 FingerprintType = "MD5"
SHA256 FingerprintType = "SHA256"
)

// SchemaCache is a cache of schemas.
Expand Down Expand Up @@ -304,7 +306,10 @@ func (f *fingerprinter) FingerprintUsing(typ FingerprintType, stringer fmt.Strin
var fingerprint []byte
switch typ {
case CRC64Avro:
h := crc64.Sum(data)
h := crc64.Sum(data, binary.BigEndian)
fingerprint = h[:]
case CRC64AvroLE:
h := crc64.Sum(data, binary.LittleEndian)
fingerprint = h[:]
case MD5:
h := md5.Sum(data)
Expand Down
6 changes: 6 additions & 0 deletions schema_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1216,6 +1216,12 @@ func TestSchema_FingerprintUsing(t *testing.T) {
typ: avro.CRC64Avro,
want: []byte{0x63, 0xdd, 0x24, 0xe7, 0xcc, 0x25, 0x8f, 0x8a},
},
{
name: "Null CRC64LE",
schema: "null",
typ: avro.CRC64AvroLE,
want: []byte{0x8a, 0x8f, 0x25, 0xcc, 0xe7, 0x24, 0xdd, 0x63},
},
{
name: "Null MD5",
schema: "null",
Expand Down

0 comments on commit 00737c2

Please sign in to comment.