Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1599)

Side by Side Diff: third_party/zlib/contrib/arm/chunkcopy.h

Issue 2722063002: zlib: inflate using wider loads and stores
Patch Set: zlib: inflate using wider loads and stores Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/zlib/BUILD.gn ('k') | third_party/zlib/contrib/arm/inffast.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /* chunkcopy.h -- fast copies and sets
2 * Copyright (C) 2017 ARM, Inc.
3 * For conditions of distribution and use, see copyright notice in zlib.h
4 */
5
#ifndef CHUNKCOPY_H
#define CHUNKCOPY_H

#include "zutil.h"
#include <arm_neon.h>

/* C99 `restrict` promises the compiler that the qualified pointers do not
 * alias, enabling load/store reordering; expand to nothing pre-C99. */
#if __STDC_VERSION__ >= 199901L
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif

/* The copy unit: one 128-bit NEON register, i.e. 16 bytes per load/store. */
typedef uint8x16_t chunkcopy_chunk_t;
#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t)
20
21 /*
22 Ask the compiler to perform a wide, unaligned load with an machine
23 instruction appropriate for the chunkcopy_chunk_t type.
24 */
25 static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR *s) {
26 chunkcopy_chunk_t c;
27 __builtin_memcpy(&c, s, sizeof(c));
28 return c;
29 }
30
31 /*
32 Ask the compiler to perform a wide, unaligned store with an machine
33 instruction appropriate for the chunkcopy_chunk_t type.
34 */
35 static inline void storechunk(unsigned char FAR *d, chunkcopy_chunk_t c) {
36 __builtin_memcpy(d, &c, sizeof(c));
37 }
38
39 /*
40 Perform a memcpy-like operation, but assume that length is non-zero and that
41 it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
42 the length is shorter than this.
43
44 It also guarantees that it will properly unroll the data if the distance
45 between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
46 in chunkcopy_relaxed().
47
48 Aside from better memory bus utilisation, this means that short copies
49 (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
50 without iteration, which will hopefully make the branch prediction more
51 reliable.
52 */
53 static inline unsigned char FAR *chunkcopy_core(unsigned char FAR *out,
54 const unsigned char FAR *from,
55 unsigned len) {
56 int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
57 storechunk(out, loadchunk(from));
58 out += bump;
59 from += bump;
60 len /= CHUNKCOPY_CHUNK_SIZE;
61 while (len-- > 0) {
62 storechunk(out, loadchunk(from));
63 out += CHUNKCOPY_CHUNK_SIZE;
64 from += CHUNKCOPY_CHUNK_SIZE;
65 }
66 return out;
67 }
68
69 /*
70 Like chunkcopy_core, but avoid writing beyond of legal output.
71
72 Accepts an additional pointer to the end of safe output. A generic safe
73 copy would use (out + len), but it's normally the case that the end of the
74 output buffer is beyond the end of the current copy, and this can still be
75 exploited.
76 */
77 static inline unsigned char FAR *chunkcopy_core_safe(unsigned char FAR *out,
78 const unsigned char FAR * f rom,
79 unsigned len,
80 unsigned char FAR *limit) {
81 Assert(out + len <= limit, "chunk copy exceeds safety limit");
82 if (limit - out < CHUNKCOPY_CHUNK_SIZE) {
83 const unsigned char FAR * Z_RESTRICT rfrom = from;
84 if (len & 8) { __builtin_memcpy(out, rfrom, 8); out += 8; rfrom += 8; }
85 if (len & 4) { __builtin_memcpy(out, rfrom, 4); out += 4; rfrom += 4; }
86 if (len & 2) { __builtin_memcpy(out, rfrom, 2); out += 2; rfrom += 2; }
87 if (len & 1) { *out++ = *rfrom++; }
88 return out;
89 }
90 return chunkcopy_core(out, from, len);
91 }
92
/*
   Perform short copies until distance can be rewritten as being at least
   CHUNKCOPY_CHUNK_SIZE.

   This assumes that it's OK to overwrite at least the first
   2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than
   this.  This assumption holds within inflate_fast() which starts every
   iteration with at least 258 bytes of output space available (258 being the
   maximum length output from a single token; see inffast.c).
 */
static inline unsigned char FAR *chunkunroll_relaxed(unsigned char FAR *out,
                                                     unsigned FAR *dist,
                                                     unsigned FAR *len) {
  const unsigned char FAR *from = out - *dist;
  while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
    /* Store one chunk but advance out by only *dist bytes: the bytes just
     * written extend the source run, so the usable distance doubles each
     * iteration until it reaches a full chunk. */
    storechunk(out, loadchunk(from));
    out += *dist;
    *len -= *dist;
    *dist += *dist;
  }
  return out;
}
115
116
117 static inline uint8x16_t chunkset_vld1q_dup_u8x8(const unsigned char FAR * Z_RES TRICT from) {
118 #if defined(__clang__) || defined(__aarch64__)
119 return vreinterpretq_u8_u64(vld1q_dup_u64((void *)from));
120 #else
121 /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a
122 * void pointer, so here's an alternate implementation.
123 */
124 uint8x8_t h = vld1_u8(from);
125 return vcombine_u8(h, h);
126 #endif
127 }
128
/*
   Perform an overlapping copy which behaves as a memset() operation, but
   supporting periods other than one, and assume that length is non-zero and
   that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
   even if the length is shorter than this.
 */
static inline unsigned char FAR *chunkset_core(unsigned char FAR *out,
                                               unsigned period,
                                               unsigned len) {
  uint8x16_t f;
  /* First store advances by the tail remainder (1..sizeof(f) bytes) so the
   * loops below only ever deal with whole vector stores. */
  int bump = ((len - 1) % sizeof(f)) + 1;

  switch (period) {
  case 1:
    /* Broadcast the byte just before `out` across all 16 lanes. */
    f = vld1q_dup_u8(out - 1);
    vst1q_u8(out, f);
    out += bump;
    len -= bump;
    while (len > 0) {
      vst1q_u8(out, f);
      out += sizeof(f);
      len -= sizeof(f);
    }
    return out;
  case 2:
    /* Broadcast the 2-byte pattern ending at `out` across the vector. */
    f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2)));
    vst1q_u8(out, f);
    out += bump;
    len -= bump;
    if (len > 0) {
      /* Reload after the bump: `out` may have moved by an amount that is
       * not a multiple of the period, so re-capture the pattern at its
       * phase relative to the new output position. */
      f = vreinterpretq_u8_u16(vld1q_dup_u16((void *)(out - 2)));
      do {
        vst1q_u8(out, f);
        out += sizeof(f);
        len -= sizeof(f);
      } while (len > 0);
    }
    return out;
  case 4:
    /* Broadcast the 4-byte pattern ending at `out` across the vector. */
    f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4)));
    vst1q_u8(out, f);
    out += bump;
    len -= bump;
    if (len > 0) {
      /* Reload to re-phase the pattern after the bump (see case 2). */
      f = vreinterpretq_u8_u32(vld1q_dup_u32((void *)(out - 4)));
      do {
        vst1q_u8(out, f);
        out += sizeof(f);
        len -= sizeof(f);
      } while (len > 0);
    }
    return out;
  case 8:
    /* Broadcast the 8-byte pattern ending at `out` across the vector. */
    f = chunkset_vld1q_dup_u8x8(out - 8);
    vst1q_u8(out, f);
    out += bump;
    len -= bump;
    if (len > 0) {
      /* Reload to re-phase the pattern after the bump (see case 2). */
      f = chunkset_vld1q_dup_u8x8(out - 8);
      do {
        vst1q_u8(out, f);
        out += sizeof(f);
        len -= sizeof(f);
      } while (len > 0);
    }
    return out;
  }
  /* Other periods: widen the distance with short overlapped copies until it
   * reaches a full chunk, then finish with a plain chunked copy. */
  out = chunkunroll_relaxed(out, &period, &len);
  return chunkcopy_core(out, out - period, len);
}
199
200 /*
201 Perform a memcpy-like operation, but assume that length is non-zero and that
202 it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
203 the length is shorter than this.
204
205 Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
206 of overlapping buffers, regardless of the distance between the pointers.
207 This is reflected in the `restrict`-qualified pointers, allowing the
208 compiler to reorder loads and stores.
209 */
210 static inline unsigned char FAR *chunkcopy_relaxed(unsigned char FAR * Z_RESTRIC T out,
211 const unsigned char FAR * Z_R ESTRICT from,
212 unsigned len) {
213 return chunkcopy_core(out, from, len);
214 }
215
216 /*
217 Like chunkcopy_relaxed, but avoid writing beyond of legal output.
218
219 Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
220 behaviour of overlapping buffers, regardless of the distance between the
221 pointers. This is reflected in the `restrict`-qualified pointers, allowing
222 the compiler to reorder loads and stores.
223
224 Accepts an additional pointer to the end of safe output. A generic safe
225 copy would use (out + len), but it's normally the case that the end of the
226 output buffer is beyond the end of the current copy, and this can still be
227 exploited.
228 */
229 static inline unsigned char FAR *chunkcopy_safe(unsigned char FAR *out,
230 const unsigned char FAR * Z_REST RICT from,
231 unsigned len,
232 unsigned char FAR *limit) {
233 Assert(out + len <= limit, "chunk copy exceeds safety limit");
234 return chunkcopy_core_safe(out, from, len, limit);
235 }
236
237 /*
238 Perform chunky copy within the same buffer, where the source and destination
239 may potentially overlap.
240
241 Assumes that len > 0 on entry, and that it's safe to write at least
242 CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
243 */
244 static inline unsigned char FAR *chunkcopy_lapped_relaxed(unsigned char FAR *out ,
245 unsigned dist,
246 unsigned len) {
247 if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
248 return chunkset_core(out, dist, len);
249 }
250 return chunkcopy_core(out, out - dist, len);
251 }
252
253 /*
254 Behave like chunkcopy_lapped_relaxed, but avoid writing beyond of legal outpu t.
255
256 Accepts an additional pointer to the end of safe output. A generic safe
257 copy would use (out + len), but it's normally the case that the end of the
258 output buffer is beyond the end of the current copy, and this can still be
259 exploited.
260 */
261 static inline unsigned char FAR *chunkcopy_lapped_safe(unsigned char FAR *out,
262 unsigned dist,
263 unsigned len,
264 unsigned char FAR *limit) {
265 Assert(out + len <= limit, "chunk copy exceeds safety limit");
266 if (limit - out < CHUNKCOPY_CHUNK_SIZE * 3) {
267 /* TODO: try harder to optimise this */
268 while (len-- > 0) {
269 *out = *(out - dist);
270 out++;
271 }
272 return out;
273 }
274 return chunkcopy_lapped_relaxed(out, dist, len);
275 }
276
277 #undef Z_RESTRICT
278
279 #endif /* CHUNKCOPY_H */
OLDNEW
« no previous file with comments | « third_party/zlib/BUILD.gn ('k') | third_party/zlib/contrib/arm/inffast.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698