static inline int
is_nonzero (const uint8x16_t v)
{
	uint64_t u64;
	const uint64x2_t v64 = vreinterpretq_u64_u8(v);

	// Saturating-narrow each 64-bit lane to 32 bits; any nonzero lane
	// clamps to a nonzero value, so the whole 128-bit vector is nonzero
	// exactly when the narrowed 64-bit result is nonzero:
	const uint32x2_t v32 = vqmovn_u64(v64);

	vst1_u64(&u64, vreinterpret_u64_u32(v32));
	return u64 != 0;
}

static inline uint8x16_t
delta_lookup (const uint8x16_t v)
{
	const uint8x8_t lut = {
		0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
	};

	return vcombine_u8(
		vtbl1_u8(lut, vget_low_u8(v)),
		vtbl1_u8(lut, vget_high_u8(v)));
}
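
// Note on the table above: the lookup index is the input byte's high
// nibble, decremented by one (via a wrapping 0xFF addition) when the byte
// is '/' (0x2F). Working through the base64 alphabet:
//
//	index 1:    '/' (0x2F)  -> +16  -> 63
//	index 2:    '+' (0x2B)  -> +19  -> 62
//	index 3:    '0'..'9'    -> +4   -> 52..61
//	index 4, 5: 'A'..'Z'    -> -65  -> 0..25
//	index 6, 7: 'a'..'z'    -> -71  -> 26..51
//
// Index 0 is only produced by bytes that the classification step already
// rejects, so its delta of zero is a don't-care.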

static inline uint8x16_t
dec_loop_neon32_lane (uint8x16_t *lane)
{
	// See the SSSE3 decoder for an explanation of the algorithm; a short
	// summary of the classification scheme follows this function.
	const uint8x16_t lut_lo = {
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
	};

	const uint8x16_t lut_hi = {
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
	};

	const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
	const uint8x16_t mask_2F = vdupq_n_u8(0x2F);

	const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
	const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
	const uint8x16_t eq_2F = vceqq_u8(*lane, mask_2F);

	// Classify each byte by looking up a bitmask for its high and low
	// nibble. (vqtbl1q_u8 is natively an AArch64 intrinsic; on ARMv7 it
	// is assumed to be supplied by a compatibility helper defined
	// elsewhere in this codec.)
	const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
	const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);

	// Now simply add the delta values to the input:
	*lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));

	// Return the validity mask; it is nonzero in every byte position
	// holding a character outside the base64 alphabet:
	return vandq_u8(lo, hi);
}
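
// A note on the classification tables above: each lut_hi entry is a bit
// flag selected by the high nibble, and each lut_lo entry is the set of
// flags that are illegal for that low nibble. A byte is a valid base64
// character exactly when the two lookups share no bits, i.e. when
// (lut_hi[hi_nibble] & lut_lo[lo_nibble]) == 0. For example:
//
//	'+' (0x2B): lut_hi[0x2] = 0x01, lut_lo[0xB] = 0x1A -> 0x00, valid
//	',' (0x2C): lut_hi[0x2] = 0x01, lut_lo[0xC] = 0x1B -> 0x01, invalid
//	'Z' (0x5A): lut_hi[0x5] = 0x08, lut_lo[0xA] = 0x13 -> 0x00, valid
//	'[' (0x5B): lut_hi[0x5] = 0x08, lut_lo[0xB] = 0x1A -> 0x08, invalid
//
// Bytes with the top bit set select the 0x10 rows of lut_hi, which collide
// with the 0x10 bit present in every lut_lo entry, so they are always
// flagged as invalid.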

static inline void
dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 64) {
		return;
	}

	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
	// extra trailing zero bytes are written, so it is not necessary to
	// reserve extra input bytes:
	size_t rounds = *slen / 64;

	// Assume all rounds will complete; the counts are corrected below if
	// the loop exits early on invalid input:
	*slen -= rounds * 64;	// 64 bytes consumed per round
	*olen += rounds * 48;	// 48 bytes produced per round

	do {
		uint8x16x3_t dec;

		// Load 64 bytes and deinterleave:
		uint8x16x4_t str = vld4q_u8(*s);

		// Decode each lane, collecting a combined mask of invalid bytes:
		const uint8x16_t classified
			= dec_loop_neon32_lane(&str.val[0])
			| dec_loop_neon32_lane(&str.val[1])
			| dec_loop_neon32_lane(&str.val[2])
			| dec_loop_neon32_lane(&str.val[3]);

		// Check for invalid input: if the mask is nonzero, at least one
		// byte was not a base64 character; fall back on bytewise code to
		// do error checking and reporting:
		if (is_nonzero(classified)) {
			break;
		}

		// Compress the four 6-bit fields into three bytes:
		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);

		// Interleave and store the decoded result:
		vst3q_u8(*o, dec);

		*s += 64;
		*o += 48;

	} while (--rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 64;
	*olen -= rounds * 48;
}
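
// Usage sketch (illustrative, not part of this file): the function takes
// cursor pointers and remaining lengths, consumes as many whole 64-byte
// blocks as it can, and leaves the tail (or the block containing the first
// invalid byte) for the scalar fallback decoder. The buffer names below
// are hypothetical:
//
//	const uint8_t *src    = input;         // assumed base64 input
//	size_t         srclen = input_length;
//	uint8_t       *out    = output;        // assumed to hold >= srclen / 4 * 3 bytes
//	size_t         outlen = 0;
//
//	dec_loop_neon32(&src, &srclen, &out, &outlen);
//	// 'outlen' bytes have been written; 'srclen' bytes starting at 'src'
//	// remain for the bytewise decoder.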