Skip to content

Commit df8eab2

Browse files
committed
Merge branch 'master' of jsoftware.com:jsource
# Conflicts:
#	jsrc/ct.c
2 parents c5e13d0 + a891653 commit df8eab2

40 files changed

Lines changed: 2049 additions & 331 deletions

base64/include/libbase64.h

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,41 @@
33

44
#include <stddef.h> /* size_t */
55

6+
7+
/* Symbol visibility / linkage macros.
 *
 * BASE64_SYMBOL_IMPORT/EXPORT/PRIVATE pick the platform-specific
 * attribute for importing from, exporting to, or hiding inside a
 * shared library: __declspec on Windows/Cygwin, ELF visibility
 * attributes on GCC >= 4 (Clang also defines __GNUC__), and no-ops
 * on everything else. */
#if defined(_WIN32) || defined(__CYGWIN__)
#define BASE64_SYMBOL_IMPORT __declspec(dllimport)
#define BASE64_SYMBOL_EXPORT __declspec(dllexport)
#define BASE64_SYMBOL_PRIVATE

#elif __GNUC__ >= 4
#define BASE64_SYMBOL_IMPORT __attribute__ ((visibility ("default")))
#define BASE64_SYMBOL_EXPORT __attribute__ ((visibility ("default")))
#define BASE64_SYMBOL_PRIVATE __attribute__ ((visibility ("hidden")))

#else
#define BASE64_SYMBOL_IMPORT
#define BASE64_SYMBOL_EXPORT
#define BASE64_SYMBOL_PRIVATE
#endif

/* NOTE(review): the unconditional #undef/#define below forces the
 * static-linkage configuration, so BASE64_EXPORT and BASE64_NO_EXPORT
 * always expand to nothing and the shared-library branch underneath is
 * dead code in this build. Presumably intentional (the library is
 * linked statically here) — confirm before reusing this header for a
 * shared build. */
#undef BASE64_STATIC_DEFINE
#define BASE64_STATIC_DEFINE

#if defined(BASE64_STATIC_DEFINE)
#define BASE64_EXPORT
#define BASE64_NO_EXPORT

#else
#if defined(BASE64_EXPORTS) // defined if we are building the shared library
#define BASE64_EXPORT BASE64_SYMBOL_EXPORT

#else
#define BASE64_EXPORT BASE64_SYMBOL_IMPORT
#endif

#define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
#endif
641
#ifdef __cplusplus
742
extern "C" {
843
#endif
@@ -32,7 +67,7 @@ struct base64_state {
3267
* to *out without trailing zero. Output length in bytes is written to *outlen.
3368
* The buffer in `out` has been allocated by the caller and is at least 4/3 the
3469
* size of the input. See above for `flags`; set to 0 for default operation: */
35-
void base64_encode
70+
void BASE64_EXPORT base64_encode
3671
( const char *src
3772
, size_t srclen
3873
, char *out
@@ -42,7 +77,7 @@ void base64_encode
4277

4378
/* Call this before calling base64_stream_encode() to init the state. See above
4479
* for `flags`; set to 0 for default operation: */
45-
void base64_stream_encode_init
80+
void BASE64_EXPORT base64_stream_encode_init
4681
( struct base64_state *state
4782
, int flags
4883
) ;
@@ -52,7 +87,7 @@ void base64_stream_encode_init
5287
* must be at least 4/3 the size of the in-buffer, but take some margin. Places
5388
* the number of new bytes written into `outlen` (which is set to zero when the
5489
* function starts). Does not zero-terminate or finalize the output. */
55-
void base64_stream_encode
90+
void BASE64_EXPORT base64_stream_encode
5691
( struct base64_state *state
5792
, const char *src
5893
, size_t srclen
@@ -64,7 +99,7 @@ void base64_stream_encode
6499
* Adds the required end-of-stream markers if appropriate. `outlen` is modified
65100
* and will contain the number of new bytes written at `out` (which will quite
66101
* often be zero). */
67-
void base64_stream_encode_final
102+
void BASE64_EXPORT base64_stream_encode_final
68103
( struct base64_state *state
69104
, char *out
70105
, size_t *outlen
@@ -74,7 +109,7 @@ void base64_stream_encode_final
74109
* to *out without trailing zero. Output length in bytes is written to *outlen.
75110
* The buffer in `out` has been allocated by the caller and is at least 3/4 the
76111
* size of the input. See above for `flags`, set to 0 for default operation: */
77-
int base64_decode
112+
int BASE64_EXPORT base64_decode
78113
( const char *src
79114
, size_t srclen
80115
, char *out
@@ -84,7 +119,7 @@ int base64_decode
84119

85120
/* Call this before calling base64_stream_decode() to init the state. See above
86121
* for `flags`; set to 0 for default operation: */
87-
void base64_stream_decode_init
122+
void BASE64_EXPORT base64_stream_decode_init
88123
( struct base64_state *state
89124
, int flags
90125
) ;
@@ -97,7 +132,7 @@ void base64_stream_decode_init
97132
* well, and 0 if a decoding error was found, such as an invalid character.
98133
* Returns -1 if the chosen codec is not included in the current build. Used by
99134
* the test harness to check whether a codec is available for testing. */
100-
int base64_stream_decode
135+
int BASE64_EXPORT base64_stream_decode
101136
( struct base64_state *state
102137
, const char *src
103138
, size_t srclen
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#include <stdint.h>
2+
#include <stddef.h>
3+
#include <string.h>
4+
5+
#include "../../../include/libbase64.h"
6+
#include "../../tables/tables.h"
7+
#include "../../codecs.h"
8+
#include "../../config.h"
9+
#include "../../env.h"
10+
11+
#ifdef __arm__
12+
# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
13+
# define BASE64_USE_NEON32
14+
# endif
15+
#endif
16+
17+
#ifdef BASE64_USE_NEON32
18+
#include <arm_neon.h>
19+
20+
// Only enable inline assembly on supported compilers.
21+
#if defined(__GNUC__) || defined(__clang__)
22+
#define BASE64_NEON32_USE_ASM
23+
#endif
24+
25+
// Emulate the AArch64 `vqtbl1q_u8` intrinsic: a 128-bit-wide table
// lookup. NEON32 can only index 64 bits of input at a time into a
// 128-bit table, so look up each half of `indices` separately with
// `vtbl2_u8` and stitch the halves back together.
static inline uint8x16_t
vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
{
	uint8x8x2_t table;

	table.val[0] = vget_low_u8(lut);
	table.val[1] = vget_high_u8(lut);

	const uint8x8_t lo_half = vtbl2_u8(table, vget_low_u8(indices));
	const uint8x8_t hi_half = vtbl2_u8(table, vget_high_u8(indices));

	return vcombine_u8(lo_half, hi_half);
}
41+
42+
#include "../generic/32/dec_loop.c"
43+
#include "../generic/32/enc_loop.c"
44+
#include "dec_loop.c"
45+
#include "enc_reshuffle.c"
46+
#include "enc_translate.c"
47+
#include "enc_loop.c"
48+
49+
#endif // BASE64_USE_NEON32
50+
51+
// Stride size is so large on these NEON 32-bit functions
52+
// (48 bytes encode, 32 bytes decode) that we inline the
53+
// uint32 codec to stay performant on smaller inputs.
54+
55+
// Encoder entry point for the NEON32 codec. The signature comes from
// the BASE64_ENC_FUNCTION macro (declared in ../../codecs.h); the
// included enc_head.c sets up the local cursors (s, slen, o, olen)
// used below, and enc_tail.c writes the results back — TODO confirm
// exact contract against those files.
BASE64_ENC_FUNCTION(neon32)
{
#ifdef BASE64_USE_NEON32
	#include "../generic/enc_head.c"
	// Bulk-encode with the wide NEON32 loop first, then let the inlined
	// generic 32-bit loop handle what remains (see the comment above:
	// the 48-byte stride is too coarse for small tails).
	enc_loop_neon32(&s, &slen, &o, &olen);
	enc_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	// NEON32 unavailable in this build: expand to the stub body.
	BASE64_ENC_STUB
#endif
}
66+
67+
// Decoder entry point for the NEON32 codec. Mirrors the encoder above:
// the signature comes from the BASE64_DEC_FUNCTION macro, dec_head.c
// sets up the local cursors (s, slen, o, olen), and dec_tail.c does the
// bytewise remainder and error reporting — TODO confirm exact contract
// against those files.
BASE64_DEC_FUNCTION(neon32)
{
#ifdef BASE64_USE_NEON32
	#include "../generic/dec_head.c"
	// Wide 64-bytes-per-round NEON32 loop first, then the inlined
	// generic 32-bit loop for the smaller remainder.
	dec_loop_neon32(&s, &slen, &o, &olen);
	dec_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
#else
	// NEON32 unavailable in this build: expand to the stub body.
	BASE64_DEC_STUB
#endif
}

base64/lib/arch/neon32/dec_loop.c

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// Returns non-zero iff any byte of `v` is non-zero. Saturating-narrow
// the two 64-bit halves to 32 bits each — this preserves the zero /
// non-zero distinction per half — then test the packed 64-bit result
// in one scalar comparison.
static inline int
is_nonzero (const uint8x16_t v)
{
	const uint32x2_t narrowed = vqmovn_u64(vreinterpretq_u64_u8(v));

	return vget_lane_u64(vreinterpret_u64_u32(narrowed), 0) != 0;
}
11+
12+
// Map each byte of `v` (an index in 0..7) to its decode delta.
// Negative deltas are written as their uint8_t wraparound values so
// the later vaddq_u8 effectively subtracts.
static inline uint8x16_t
delta_lookup (const uint8x16_t v)
{
	const uint8x8_t lut = {
		0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
	};

	// NEON32 table lookups are 64 bits wide: translate each half of
	// the 128-bit input separately, then recombine.
	const uint8x8_t lo_half = vtbl1_u8(lut, vget_low_u8(v));
	const uint8x8_t hi_half = vtbl1_u8(lut, vget_high_u8(v));

	return vcombine_u8(lo_half, hi_half);
}
23+
24+
// Decode one 16-byte lane of base64 characters in place and return a
// per-byte validity mask; any non-zero byte in the mask flags an
// invalid input character.
// See the SSSE3 decoder for an explanation of the algorithm.
static inline uint8x16_t
dec_loop_neon32_lane (uint8x16_t *lane)
{
	// Bitfield classification tables indexed by each byte's low and
	// high nibble; ANDing the two lookups yields non-zero exactly for
	// bytes outside the base64 alphabet.
	const uint8x16_t lut_lo = {
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
	};

	const uint8x16_t lut_hi = {
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
	};

	const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
	const uint8x16_t mask_2F = vdupq_n_u8(0x2F);

	const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
	const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);

	// '/' (0x2F) needs a different delta than the other bytes sharing
	// its high nibble; the all-ones compare result nudges its
	// delta-table index (0xFF acts as -1 in the uint8 add below).
	const uint8x16_t eq_2F = vceqq_u8(*lane, mask_2F);

	const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
	const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);

	// Now simply add the delta values to the input:
	*lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));

	// Return the validity mask:
	return vandq_u8(lo, hi);
}
54+
55+
// Bulk base64 decoder: consumes the input in 64-byte blocks, writing 48
// decoded bytes per block, and advances the caller's input/output
// cursors and lengths. Any remainder — and any block containing an
// invalid character — is left for the bytewise fallback code.
static inline void
dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 64) {
		return;
	}

	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
	// extra trailing zero bytes are written, so it is not necessary to
	// reserve extra input bytes:
	size_t rounds = *slen / 64;

	// Optimistically account for all rounds up front; corrected after
	// the loop if it exits early on invalid input.
	*slen -= rounds * 64;	// 64 bytes consumed per round
	*olen += rounds * 48;	// 48 bytes produced per round

	do {
		uint8x16x3_t dec;

		// Load 64 bytes and deinterleave:
		uint8x16x4_t str = vld4q_u8(*s);

		// Decode each lane, collect a mask of invalid inputs:
		const uint8x16_t classified
			= dec_loop_neon32_lane(&str.val[0])
			| dec_loop_neon32_lane(&str.val[1])
			| dec_loop_neon32_lane(&str.val[2])
			| dec_loop_neon32_lane(&str.val[3]);

		// Check for invalid input: if any classification bits are
		// set, fall back on bytewise code to do error checking and
		// reporting:
		if (is_nonzero(classified)) {
			break;
		}

		// Compress four bytes into three:
		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);

		// Interleave and store decoded result:
		vst3q_u8(*o, dec);

		*s += 64;
		*o += 48;

	} while (--rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 64;
	*olen -= rounds * 48;
}

0 commit comments

Comments
 (0)