Skip to content

Commit

Permalink
incorporate upstream fixes to crc32c.c assembly (#52326)
Browse files Browse the repository at this point in the history
These are a response to
[this comment](https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software/17646775#comment88832559_17646775)
on StackOverflow:

> Your asm constraints aren't strictly safe. A pointer in a `"r"`
constraint does not imply that the pointed-to memory is also an input.
You could just use memory operands (or the `_mm_crc32_u64` intrinsic),
but if you want to force the addressing mode you can use a dummy memory
operand like `asm("crc32 (%1),%0 " : "+r"(crc0) : "r"(next), "m"
(*(const char (*)[24]) pStr) )`;. Inlining or LTO could break this. See
[at&t asm inline c++
problem](https://stackoverflow.com/posts/comments/81680441) (which needs
an update: pointer-to-array is cleaner than a struct with a flexible
array member).

The existing test coverage wasn't enough to fully exercise Adler's code,
so I added another test.

Closes #52325
  • Loading branch information
stevengj committed Nov 29, 2023
1 parent 9bd2432 commit fd67cb2
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 16 deletions.
39 changes: 23 additions & 16 deletions src/crc32c.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* crc32c.c -- compute CRC-32C using software table or available hardware instructions
* Copyright (C) 2013 Mark Adler
* Version 1.1 1 Aug 2013 Mark Adler
* Copyright (C) 2013, 2021 Mark Adler
* Version 1.1 1 Aug 2013 Mark Adler, updates from Version 1.2 5 June 2021
*
* Code retrieved in August 2016 from August 2013 post by Mark Adler on
* http:https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software
Expand All @@ -10,6 +10,7 @@
* - architecture and compiler detection
* - precompute crc32c tables and store in a generated .c file
* - ARMv8 support
* Updated to incorporate upstream 2021 patch by Mark Adler to register constraints.
*/

/*
Expand Down Expand Up @@ -39,6 +40,8 @@
/* Version history:
1.0 10 Feb 2013 First version
1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
1.2 5 Jun 2021 Correct register constraints on assembly instructions
(+ other changes that were superfluous for us)
*/

#include "julia.h"
Expand Down Expand Up @@ -98,16 +101,14 @@ static uint32_t crc32c_sse42(uint32_t crc, const char *buf, size_t len)
to an eight-byte boundary */
while (len && ((uintptr_t)buf & 7) != 0) {
__asm__("crc32b\t" "(%1), %0"
: "=r"(crc0)
: "r"(buf), "0"(crc0));
: "+r"(crc0)
: "r"(buf), "m"(*buf));
buf++;
len--;
}

/* compute the crc on sets of LONG*3 bytes, executing three independent crc
instructions, each on LONG bytes -- this is optimized for the Nehalem,
Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
throughput of one crc per cycle, but a latency of three cycles */
/* compute the crc on sets of LONG*3 bytes,
making use of three ALUs in parallel on a single core. */
while (len >= LONG * 3) {
uintptr_t crc1 = 0;
uintptr_t crc2 = 0;
Expand All @@ -116,8 +117,11 @@ static uint32_t crc32c_sse42(uint32_t crc, const char *buf, size_t len)
__asm__(CRC32_PTR "\t" "(%3), %0\n\t"
CRC32_PTR "\t" LONGx1 "(%3), %1\n\t"
CRC32_PTR "\t" LONGx2 "(%3), %2"
: "=r"(crc0), "=r"(crc1), "=r"(crc2)
: "r"(buf), "0"(crc0), "1"(crc1), "2"(crc2));
: "+r"(crc0), "+r"(crc1), "+r"(crc2)
: "r"(buf),
"m"(* (const char (*)[sizeof(void*)]) &buf[0]),
"m"(* (const char (*)[sizeof(void*)]) &buf[LONG]),
"m"(* (const char (*)[sizeof(void*)]) &buf[LONG*2]));
buf += sizeof(void*);
} while (buf < end);
crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
Expand All @@ -136,8 +140,11 @@ static uint32_t crc32c_sse42(uint32_t crc, const char *buf, size_t len)
__asm__(CRC32_PTR "\t" "(%3), %0\n\t"
CRC32_PTR "\t" SHORTx1 "(%3), %1\n\t"
CRC32_PTR "\t" SHORTx2 "(%3), %2"
: "=r"(crc0), "=r"(crc1), "=r"(crc2)
: "r"(buf), "0"(crc0), "1"(crc1), "2"(crc2));
: "+r"(crc0), "+r"(crc1), "+r"(crc2)
: "r"(buf),
"m"(* (const char (*)[sizeof(void*)]) &buf[0]),
"m"(* (const char (*)[sizeof(void*)]) &buf[SHORT]),
"m"(* (const char (*)[sizeof(void*)]) &buf[SHORT*2]));
buf += sizeof(void*);
} while (buf < end);
crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
Expand All @@ -151,17 +158,17 @@ static uint32_t crc32c_sse42(uint32_t crc, const char *buf, size_t len)
const char *end = buf + (len - (len & 7));
while (buf < end) {
__asm__(CRC32_PTR "\t" "(%1), %0"
: "=r"(crc0)
: "r"(buf), "0"(crc0));
: "+r"(crc0)
: "r"(buf), "m"(* (const char (*)[sizeof(void*)]) buf));
buf += sizeof(void*);
}
len &= 7;

/* compute the crc for up to seven trailing bytes */
while (len) {
__asm__("crc32b\t" "(%1), %0"
: "=r"(crc0)
: "r"(buf), "0"(crc0));
: "+r"(crc0)
: "r"(buf), "m"(*buf));
buf++;
len--;
}
Expand Down
9 changes: 9 additions & 0 deletions stdlib/CRC32c/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ function test_crc32c(crc32c)
rm(f, force=true)
end
end

# test longer arrays to cover all the code paths in crc32c.c
LONG = 8192 # from crc32c.c
SHORT = 256 # from crc32c.c
n = LONG*3+SHORT*3+SHORT*2+64+7
big = vcat(reinterpret(UInt8, hton.(0x74d7f887 .^ (1:n÷4))), UInt8[1:n%4;])
for (offset,crc) in [(0, 0x13a5ecd5), (1, 0xecf34b7e), (2, 0xfa71b596), (3, 0xbfd24745), (4, 0xf0cb3370), (5, 0xb0ec88b5), (6, 0x258c20a8), (7, 0xa9bd638d)]
@test crc == crc32c(@view big[1+offset:end])
end
end
unsafe_crc32c_sw(a, n, crc) =
ccall(:jl_crc32c_sw, UInt32, (UInt32, Ptr{UInt8}, Csize_t), crc, a, n)
Expand Down

0 comments on commit fd67cb2

Please sign in to comment.