OLD | NEW |
(Empty) | |
| 1 /* chunkcopy.h -- fast copies and sets |
| 2 * Copyright (C) 2017 ARM, Inc. |
| 3 * For conditions of distribution and use, see copyright notice in zlib.h |
| 4 */ |
| 5 |
| 6 #ifndef CHUNKCOPY_H |
| 7 #define CHUNKCOPY_H |
| 8 |
| 9 #include "zutil.h" |
| 10 #include <arm_neon.h> |
| 11 |
/* Z_RESTRICT expands to C99's `restrict` where available, and to nothing on
 * pre-C99 compilers.
 */
#if __STDC_VERSION__ >= 199901L
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif

/* The copy unit: one 16-byte NEON vector register.  All of the chunk helpers
 * below load and store this type, so CHUNKCOPY_CHUNK_SIZE is 16.
 */
typedef uint8x16_t chunkcopy_chunk_t;
#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t)
| 20 |
/*
   Load CHUNKCOPY_CHUNK_SIZE bytes from an arbitrarily-aligned address.

   The memcpy through a local is an idiom the compiler recognises and
   collapses into a single wide, unaligned load instruction appropriate for
   the chunkcopy_chunk_t type.
 */
static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR *s) {
  chunkcopy_chunk_t chunk;
  __builtin_memcpy(&chunk, s, sizeof(chunk));
  return chunk;
}
| 30 |
/*
   Store CHUNKCOPY_CHUNK_SIZE bytes to an arbitrarily-aligned address.

   As with loadchunk(), the memcpy idiom lets the compiler emit one wide,
   unaligned store instruction appropriate for the chunkcopy_chunk_t type.
 */
static inline void storechunk(unsigned char FAR *d, chunkcopy_chunk_t c) {
  __builtin_memcpy(d, &c, sizeof(c));
}
| 38 |
/*
   memcpy-style copy that requires len != 0 and permission to overwrite up to
   CHUNKCOPY_CHUNK_SIZE bytes of output even when len is smaller than that.

   If `out` and `from` are at least CHUNKCOPY_CHUNK_SIZE apart, the stores
   land in order and never read bytes written by the same call, a property
   that chunkcopy_relaxed() relies on.

   Copies of CHUNKCOPY_CHUNK_SIZE bytes or fewer complete in the first store
   and skip the loop entirely, which keeps the branch predictable as well as
   making good use of the memory bus.
 */
static inline unsigned char FAR *chunkcopy_core(unsigned char FAR *out,
                                                const unsigned char FAR *from,
                                                unsigned len) {
  /* The first store handles 1..CHUNKCOPY_CHUNK_SIZE bytes so that what
   * remains is an exact multiple of the chunk size.
   */
  unsigned advance = ((len - 1) % CHUNKCOPY_CHUNK_SIZE) + 1;
  storechunk(out, loadchunk(from));
  out += advance;
  from += advance;
  len -= advance;
  while (len > 0) {
    storechunk(out, loadchunk(from));
    out += CHUNKCOPY_CHUNK_SIZE;
    from += CHUNKCOPY_CHUNK_SIZE;
    len -= CHUNKCOPY_CHUNK_SIZE;
  }
  return out;
}
| 68 |
/*
   Like chunkcopy_core(), but avoid writing beyond the end of legal output.

   Accepts an additional pointer to the end of safe output.  A generic safe
   copy would use (out + len), but it's normally the case that the end of the
   output buffer is beyond the end of the current copy, and this can still be
   exploited.
 */
static inline unsigned char FAR *chunkcopy_core_safe(unsigned char FAR *out,
                                                     const unsigned char FAR *from,
                                                     unsigned len,
                                                     unsigned char FAR *limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  /* Cast the chunk size to ptrdiff_t so the comparison stays signed.
   * Previously (limit - out) was implicitly converted to an unsigned type,
   * which trips -Wsign-compare and silently relied on the Assert'd contract
   * (limit >= out) even in release builds.
   */
  if (limit - out < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
    const unsigned char FAR * Z_RESTRICT rfrom = from;
    /* Here len < CHUNKCOPY_CHUNK_SIZE (the caller guarantees
     * out + len <= limit), so testing the four low bits of len covers
     * every possible length without writing past `limit`.
     */
    if (len & 8) { __builtin_memcpy(out, rfrom, 8); out += 8; rfrom += 8; }
    if (len & 4) { __builtin_memcpy(out, rfrom, 4); out += 4; rfrom += 4; }
    if (len & 2) { __builtin_memcpy(out, rfrom, 2); out += 2; rfrom += 2; }
    if (len & 1) { *out++ = *rfrom++; }
    return out;
  }
  return chunkcopy_core(out, from, len);
}
| 92 |
/*
   Perform short copies until distance can be rewritten as being at least
   CHUNKCOPY_CHUNK_SIZE.

   This assumes that it's OK to overwrite at least the first
   2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than
   this.  This assumption holds within inflate_fast() which starts every
   iteration with at least 258 bytes of output space available (258 being the
   maximum length output from a single token; see inffast.c).

   On return, either *dist >= *len (the remaining copy no longer overlaps
   itself) or *dist >= CHUNKCOPY_CHUNK_SIZE (chunk-wide overlapped copies are
   now safe), so the caller can finish with chunkcopy_core().
 */
static inline unsigned char FAR *chunkunroll_relaxed(unsigned char FAR *out,
                                                     unsigned FAR *dist,
                                                     unsigned FAR *len) {
  const unsigned char FAR *from = out - *dist;
  while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
    /* `from` stays fixed at the start of the pattern: each store appends
     * more copies of the *dist-byte period behind the advancing `out`, so
     * the usable period can double on every iteration.
     */
    storechunk(out, loadchunk(from));
    out += *dist;    /* only the first *dist bytes of the chunk are new */
    *len -= *dist;
    *dist += *dist;  /* the replicated region is now twice as long */
  }
  return out;
}
| 115 |
| 116 |
/*
   Load 8 bytes from `from` (which may be unaligned) and broadcast them into
   both halves of a 128-bit vector, i.e. a 16-byte vector with an 8-byte
   repetition period.
 */
static inline uint8x16_t chunkset_vld1q_dup_u8x8(const unsigned char FAR * Z_RESTRICT from) {
#if defined(__clang__) || defined(__aarch64__)
  return vreinterpretq_u8_u64(vld1q_dup_u64((void *)from));
#else
  /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a
   * void pointer, so here's an alternate implementation.
   */
  uint8x8_t h = vld1_u8(from);
  return vcombine_u8(h, h);
#endif
}
| 128 |
/*
   Perform an overlapping copy which behaves as a memset() operation, but
   supporting periods other than one, and assume that length is non-zero and
   that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
   even if the length is shorter than this.

   The bytes at [out - period, out) hold the pattern to replicate.  Periods
   of 1, 2, 4 and 8 divide the 16-byte vector evenly and get dedicated
   broadcast paths; all other periods fall through to chunkunroll_relaxed()
   followed by a plain overlapped chunk copy.
 */
static inline unsigned char FAR *chunkset_core(unsigned char FAR *out,
                                               unsigned period,
                                               unsigned len) {
  uint8x16_t f;
  /* The first store covers 1..sizeof(f) bytes so that the remaining length
   * is an exact multiple of the vector size.
   */
  int bump = ((len - 1) % sizeof(f)) + 1;

  switch (period) {
    case 1:
      f = vld1q_dup_u8(out - 1);
      vst1q_u8(out, f);
      out += bump;
      len -= bump;
      /* A period of 1 has the same phase at every offset, so no reload is
       * needed after the bump.
       */
      while (len > 0) {
        vst1q_u8(out, f);
        out += sizeof(f);
        len -= sizeof(f);
      }
      return out;
    case 2:
      f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2)));
      vst1q_u8(out, f);
      out += bump;
      len -= bump;
      if (len > 0) {
        /* The bump need not be a multiple of the period, so reload from
         * just behind the new `out` to restore the vector's phase relative
         * to the next store position.
         */
        f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2)));
        do {
          vst1q_u8(out, f);
          out += sizeof(f);
          len -= sizeof(f);
        } while (len > 0);
      }
      return out;
    case 4:
      f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4)));
      vst1q_u8(out, f);
      out += bump;
      len -= bump;
      if (len > 0) {
        /* Reload to restore phase after the bump, as in case 2. */
        f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4)));
        do {
          vst1q_u8(out, f);
          out += sizeof(f);
          len -= sizeof(f);
        } while (len > 0);
      }
      return out;
    case 8:
      f = chunkset_vld1q_dup_u8x8(out - 8);
      vst1q_u8(out, f);
      out += bump;
      len -= bump;
      if (len > 0) {
        /* Reload to restore phase after the bump, as in case 2. */
        f = chunkset_vld1q_dup_u8x8(out - 8);
        do {
          vst1q_u8(out, f);
          out += sizeof(f);
          len -= sizeof(f);
        } while (len > 0);
      }
      return out;
  }
  /* Other periods: widen the pattern until chunk-sized overlapped copies
   * are safe, then finish with an ordinary chunk copy.
   */
  out = chunkunroll_relaxed(out, &period, &len);
  return chunkcopy_core(out, out - period, len);
}
| 199 |
/*
   memcpy-style copy for buffers that never alias.  Requires len != 0 and
   permission to write up to CHUNKCOPY_CHUNK_SIZE bytes of output even when
   len is smaller than that.

   The `restrict` qualifiers declare that the buffers do not overlap at any
   distance (contrast chunkcopy_core(), which tolerates sufficiently-distant
   overlap), freeing the compiler to reorder the loads and stores.
 */
static inline unsigned char FAR *chunkcopy_relaxed(
    unsigned char FAR * Z_RESTRICT out,
    const unsigned char FAR * Z_RESTRICT from,
    unsigned len) {
  return chunkcopy_core(out, from, len);
}
| 215 |
/*
   Bounds-aware variant of chunkcopy_relaxed(): never writes at or beyond
   `limit`.

   As with chunkcopy_relaxed(), overlapping buffers are not supported at any
   distance; the `restrict` qualifier on `from` lets the compiler reorder
   loads and stores.

   `limit` points one past the last writable output byte.  A generic safe
   copy would pass (out + len), but the end of the output buffer usually
   lies beyond the end of the current copy, and that slack is exploited for
   wide stores.
 */
static inline unsigned char FAR *chunkcopy_safe(
    unsigned char FAR *out,
    const unsigned char FAR * Z_RESTRICT from,
    unsigned len,
    unsigned char FAR *limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  return chunkcopy_core_safe(out, from, len, limit);
}
| 236 |
/*
   Chunky copy within a single buffer, where the source lies `dist` bytes
   behind the destination and the two regions may overlap.

   Requires len > 0 and permission to write at least CHUNKCOPY_CHUNK_SIZE*3
   bytes of output.
 */
static inline unsigned char FAR *chunkcopy_lapped_relaxed(unsigned char FAR *out,
                                                          unsigned dist,
                                                          unsigned len) {
  /* A distance shorter than both the length and the chunk size means a
   * chunk-wide copy would read bytes it has just written; send that case to
   * the pattern-replicating path instead.
   */
  if (dist >= CHUNKCOPY_CHUNK_SIZE || dist >= len) {
    return chunkcopy_core(out, out - dist, len);
  }
  return chunkset_core(out, dist, len);
}
| 252 |
/*
   Behave like chunkcopy_lapped_relaxed(), but avoid writing beyond the end
   of legal output.

   Accepts an additional pointer to the end of safe output.  A generic safe
   copy would use (out + len), but it's normally the case that the end of the
   output buffer is beyond the end of the current copy, and this can still be
   exploited.
 */
static inline unsigned char FAR *chunkcopy_lapped_safe(unsigned char FAR *out,
                                                       unsigned dist,
                                                       unsigned len,
                                                       unsigned char FAR *limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  /* Cast the threshold to ptrdiff_t so the comparison stays signed.
   * Previously (limit - out) was implicitly converted to an unsigned type,
   * which trips -Wsign-compare and silently relied on the Assert'd contract
   * (limit >= out) even in release builds.
   */
  if (limit - out < (ptrdiff_t)(CHUNKCOPY_CHUNK_SIZE * 3)) {
    /* Too close to the end for the wide path, which may scribble up to
     * CHUNKCOPY_CHUNK_SIZE*3 bytes; fall back to a byte-at-a-time copy.
     * TODO: try harder to optimise this
     */
    while (len-- > 0) {
      *out = *(out - dist);
      out++;
    }
    return out;
  }
  return chunkcopy_lapped_relaxed(out, dist, len);
}
| 276 |
| 277 #undef Z_RESTRICT |
| 278 |
| 279 #endif /* CHUNKCOPY_H */ |
OLD | NEW |