summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2015-04-14 23:58:16 +0300
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2015-04-14 23:58:16 +0300
commit: 936546dcbc24ad1f2b3d33e73aa5c5fde4d2be84 (patch)
tree: b6d29aa4308a156d363a08bf4f7ebf89f4047e85 /src
parent: b73e7a0716264e5159947b1a755b9ab864142489 (diff)
Optimize pg_comp_crc32c_sse42 routine slightly, and also use it on x86.
Eliminate the separate 'len' variable from the loops, and also use the 4-byte instruction. This shaves off a few more cycles. Even though this routine that uses the special SSE 4.2 instructions is much faster than a generic routine, it's still a hot spot, so let's make it as fast as possible.

Change the configure test to not test _mm_crc32_u64. That variant is only available in the 64-bit x86-64 architecture, not in 32-bit x86. Modify pg_comp_crc32c_sse42 so that it only uses _mm_crc32_u64 on x86-64. With these changes, the SSE-accelerated CRC-32C implementation can also be used on 32-bit x86 systems.

This also fixes the 32-bit MSVC build.
Diffstat (limited to 'src')
-rw-r--r-- src/port/pg_crc32c_sse42.c 41
1 files changed, 28 insertions, 13 deletions
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index b6107103bef..a22a9dd78bf 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -22,30 +22,45 @@ pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
const unsigned char *p = data;
- const uint64 *p8;
+ const unsigned char *pend = p + len;
/*
* Process eight bytes of data at a time.
*
- * NB: We do unaligned 8-byte accesses here. The Intel architecture
- * allows that, and performance testing didn't show any performance
- * gain from aligning the beginning address.
+ * NB: We do unaligned accesses here. The Intel architecture allows that,
+ * and performance testing didn't show any performance gain from aligning
+ * the begin address.
*/
- p8 = (const uint64 *) p;
- while (len >= 8)
+#ifdef __x86_64__
+ while (p + 8 <= pend)
{
- crc = (uint32) _mm_crc32_u64(crc, *p8++);
- len -= 8;
+ crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
+ p += 8;
}
+ /* Process remaining full four bytes if any */
+ if (p + 4 <= pend)
+ {
+ crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+ p += 4;
+ }
+#else
/*
- * Handle any remaining bytes one at a time.
+ * Process four bytes at a time. (The eight byte instruction is not
+ * available on the 32-bit x86 architecture).
*/
- p = (const unsigned char *) p8;
- while (len > 0)
+ while (p + 4 <= pend)
+ {
+ crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+ p += 4;
+ }
+#endif /* __x86_64__ */
+
+ /* Process any remaining bytes one at a time. */
+ while (p < pend)
{
- crc = _mm_crc32_u8(crc, *p++);
- len--;
+ crc = _mm_crc32_u8(crc, *p);
+ p++;
}
return crc;