Skip to content

Commit

Permalink
feat: add CRC64-AVRO-LE fingerprint type
Browse files Browse the repository at this point in the history
The Avro specification details a Single Object Encoding using a header
to associate a schema ID with an Avro payload. The ID is defined as the
CRC64 fingerprint in little-endian encoding.

The pkg/crc64 package only produces big-endian CRC64 output, so the
existing CRC64-AVRO fingerprint type is big-endian as well. The
specification does not define the endianness of the CRC64-AVRO fingerprint
itself (only of the fingerprint embedded in an SOE header).

To avoid breaking existing CRC64-AVRO fingerprints, add a new
fingerprint type CRC64-AVRO-LE, identical to CRC64-AVRO except
little-endian.

Parameterize the crc64 package on byte order: add the NewWithByteOrder and
SumWithByteOrder top-level functions so users can configure the hasher
to use a specific byte order.

Add tests and benchmarks for the SumWithByteOrder function.

Fixes #489.
  • Loading branch information
kimgr committed Jan 23, 2025
1 parent 68046a4 commit c76586d
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 13 deletions.
44 changes: 34 additions & 10 deletions pkg/crc64/crc64.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package crc64

import (
"encoding/binary"
"hash"
)

Expand Down Expand Up @@ -38,16 +39,28 @@ func buildTable() {
}

// digest holds the state of an Avro CRC-64 computation together with the
// byte order used when the checksum is serialized by Sum.
type digest struct {
	crc       uint64           // running checksum; starts at Empty
	tab       *Table           // CRC table the digest was created with
	byteOrder binary.ByteOrder // layout of the checksum bytes produced by Sum
}

// New returns a hash.Hash64 that computes the Avro CRC-64 checksum.
// The bytes produced by its Sum method are laid out in big-endian order.
func New() hash.Hash64 {
	var h hash.Hash64 = newDigest(binary.BigEndian)
	return h
}

// NewWithByteOrder returns a hash.Hash64 that computes the Avro CRC-64
// checksum. The bytes produced by its Sum method are laid out in the
// byte order given by order.
func NewWithByteOrder(order binary.ByteOrder) hash.Hash64 {
	d := newDigest(order)
	return d
}

// newDigest returns a digest in its initial state: crc set to Empty, using
// the package-level crc64Table, and serializing its checksum in order.
func newDigest(order binary.ByteOrder) *digest {
	return &digest{
		crc:       Empty,
		tab:       crc64Table,
		byteOrder: order,
	}
}

Expand Down Expand Up @@ -82,16 +95,27 @@ func (d *digest) Sum64() uint64 {

// Sum appends the current checksum to in and returns the resulting slice.
// The 8 checksum bytes are laid out in the digest's configured byte order.
func (d *digest) Sum(in []byte) []byte {
	s := d.Sum64()

	// The byte orders provided by encoding/binary also implement
	// AppendByteOrder, so they can append the value directly.
	if appendOrder, ok := d.byteOrder.(binary.AppendByteOrder); ok {
		return appendOrder.AppendUint64(in, s)
	}

	// A caller-supplied ByteOrder is not required to implement
	// AppendByteOrder; encode through a scratch buffer instead of panicking
	// on a failed type assertion.
	var buf [8]byte // a uint64 checksum always occupies 8 bytes
	d.byteOrder.PutUint64(buf[:], s)
	return append(in, buf[:]...)
}

// Sum returns the CRC64 checksum of the data, in big-endian byte order.
// It is shorthand for SumWithByteOrder(data, binary.BigEndian).
func Sum(data []byte) [Size]byte {
	return SumWithByteOrder(data, binary.BigEndian)
}

// SumWithByteOrder returns the CRC64 checksum of the data, with the checksum
// bytes laid out in the specified byte order.
func SumWithByteOrder(data []byte, order binary.ByteOrder) [Size]byte {
	d := newDigest(order)
	// The digest is a hash.Hash64, and hash.Hash documents that Write never
	// returns an error, so both results are deliberately discarded.
	_, _ = d.Write(data)

	var buf [Size]byte
	order.PutUint64(buf[:], d.Sum64())
	return buf
}
57 changes: 57 additions & 0 deletions pkg/crc64/crc64_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package crc64

import (
"encoding/binary"
"strconv"
"testing"

Expand Down Expand Up @@ -82,6 +83,40 @@ func TestDigest_BlockSize(t *testing.T) {
assert.Equal(t, 1, hash.BlockSize())
}

// TestGoldenSumWithByteOrder checks SumWithByteOrder against known checksums
// for a few schema strings, in both big-endian and little-endian layouts.
func TestGoldenSumWithByteOrder(t *testing.T) {
	type golden struct {
		input        string
		bigEndian    []byte
		littleEndian []byte
	}

	cases := []golden{
		{
			input:        `"null"`,
			bigEndian:    []byte{0x63, 0xdd, 0x24, 0xe7, 0xcc, 0x25, 0x8f, 0x8a},
			littleEndian: []byte{0x8a, 0x8f, 0x25, 0xcc, 0xe7, 0x24, 0xdd, 0x63},
		},
		{
			input:        `{"name":"foo","type":"fixed","size":15}`,
			bigEndian:    []byte{0x18, 0x60, 0x2e, 0xc3, 0xed, 0x31, 0xa5, 0x04},
			littleEndian: []byte{0x04, 0xa5, 0x31, 0xed, 0xc3, 0x2e, 0x60, 0x18},
		},
		{
			input:        `{"name":"foo","type":"record","fields":[{"name":"f1","type":"boolean"}]}`,
			bigEndian:    []byte{0x6c, 0xd8, 0xea, 0xf1, 0xc9, 0x68, 0xa3, 0x3b},
			littleEndian: []byte{0x3b, 0xa3, 0x68, 0xc9, 0xf1, 0xea, 0xd8, 0x6c},
		},
	}

	for idx := range cases {
		tc := cases[idx]
		// Subtests are named after the case index.
		t.Run(strconv.Itoa(idx), func(t *testing.T) {
			payload := []byte(tc.input)

			sumBE := SumWithByteOrder(payload, binary.BigEndian)
			assert.Equal(t, tc.bigEndian, sumBE[:])

			sumLE := SumWithByteOrder(payload, binary.LittleEndian)
			assert.Equal(t, tc.littleEndian, sumLE[:])
		})
	}
}

func bench(b *testing.B, size int64) {
b.SetBytes(size)

Expand Down Expand Up @@ -115,3 +150,25 @@ func BenchmarkCrc64(b *testing.B) {
bench(b, 1<<10)
})
}

// BenchmarkSum measures SumWithByteOrder over a 4 KiB buffer, with one
// sub-benchmark per byte order.
func BenchmarkSum(b *testing.B) {
	const payloadSize = 4 << 10

	payload := make([]byte, payloadSize)
	for idx := 0; idx < len(payload); idx++ {
		payload[idx] = byte(idx)
	}

	// The byte order stays a literal argument at each call site so the
	// benchmarked call is the same one a typical caller would write.
	b.Run("BigEndian", func(sub *testing.B) {
		sub.ReportAllocs()
		sub.ResetTimer()
		for remaining := sub.N; remaining > 0; remaining-- {
			_ = SumWithByteOrder(payload, binary.BigEndian)
		}
	})

	b.Run("LittleEndian", func(sub *testing.B) {
		sub.ReportAllocs()
		sub.ResetTimer()
		for remaining := sub.N; remaining > 0; remaining-- {
			_ = SumWithByteOrder(payload, binary.LittleEndian)
		}
	})
}
11 changes: 8 additions & 3 deletions schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bytes"
"crypto/md5"
"crypto/sha256"
"encoding/binary"
"errors"
"fmt"
"sort"
Expand Down Expand Up @@ -106,9 +107,10 @@ type FingerprintType string

// Fingerprint type constants.
const (
CRC64Avro FingerprintType = "CRC64-AVRO"
MD5 FingerprintType = "MD5"
SHA256 FingerprintType = "SHA256"
CRC64Avro FingerprintType = "CRC64-AVRO"
CRC64AvroLE FingerprintType = "CRC64-AVRO-LE"
MD5 FingerprintType = "MD5"
SHA256 FingerprintType = "SHA256"
)

// SchemaCache is a cache of schemas.
Expand Down Expand Up @@ -306,6 +308,9 @@ func (f *fingerprinter) FingerprintUsing(typ FingerprintType, stringer fmt.Strin
case CRC64Avro:
h := crc64.Sum(data)
fingerprint = h[:]
case CRC64AvroLE:
h := crc64.SumWithByteOrder(data, binary.LittleEndian)
fingerprint = h[:]
case MD5:
h := md5.Sum(data)
fingerprint = h[:]
Expand Down
6 changes: 6 additions & 0 deletions schema_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1216,6 +1216,12 @@ func TestSchema_FingerprintUsing(t *testing.T) {
typ: avro.CRC64Avro,
want: []byte{0x63, 0xdd, 0x24, 0xe7, 0xcc, 0x25, 0x8f, 0x8a},
},
{
name: "Null CRC64-AVRO-LE",
schema: "null",
typ: avro.CRC64AvroLE,
want: []byte{0x8a, 0x8f, 0x25, 0xcc, 0xe7, 0x24, 0xdd, 0x63},
},
{
name: "Null MD5",
schema: "null",
Expand Down

0 comments on commit c76586d

Please sign in to comment.