source src/hashsig.c
Line | Flow | Count | Block(s) | Source |
---|---|---|---|---|
1 | - | /* | ||
2 | - | * Copyright (C) the libgit2 contributors. All rights reserved. | ||
3 | - | * | ||
4 | - | * This file is part of libgit2, distributed under the GNU GPL v2 with | ||
5 | - | * a Linking Exception. For full terms see the included COPYING file. | ||
6 | - | */ | ||
7 | - | |||
8 | - | #include "common.h" | ||
9 | - | |||
10 | - | #include "git2/sys/hashsig.h" | ||
11 | - | #include "futils.h" | ||
12 | - | #include "util.h" | ||
13 | - | |||
14 | - | typedef uint32_t hashsig_t; | ||
15 | - | typedef uint64_t hashsig_state; | ||
16 | - | |||
17 | - | #define HASHSIG_SCALE 100 | ||
18 | - | |||
19 | - | #define HASHSIG_MAX_RUN 80 | ||
20 | - | #define HASHSIG_HASH_START 0x012345678ABCDEF0LL | ||
21 | - | #define HASHSIG_HASH_SHIFT 5 | ||
22 | - | |||
23 | - | #define HASHSIG_HASH_MIX(S,CH) \ | ||
24 | - | (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH) | ||
25 | - | |||
26 | - | #define HASHSIG_HEAP_SIZE ((1 << 7) - 1) | ||
27 | - | #define HASHSIG_HEAP_MIN_SIZE 4 | ||
28 | - | |||
29 | - | typedef int (*hashsig_cmp)(const void *a, const void *b, void *); | ||
30 | - | |||
31 | - | typedef struct { | ||
32 | - | int size, asize; | ||
33 | - | hashsig_cmp cmp; | ||
34 | - | hashsig_t values[HASHSIG_HEAP_SIZE]; | ||
35 | - | } hashsig_heap; | ||
36 | - | |||
37 | - | struct git_hashsig { | ||
38 | - | hashsig_heap mins; | ||
39 | - | hashsig_heap maxs; | ||
40 | - | size_t lines; | ||
41 | - | git_hashsig_option_t opt; | ||
42 | - | }; | ||
43 | - | |||
44 | - | #define HEAP_LCHILD_OF(I) (((I)<<1)+1) | ||
45 | - | #define HEAP_RCHILD_OF(I) (((I)<<1)+2) | ||
46 | - | #define HEAP_PARENT_OF(I) (((I)-1)>>1) | ||
47 | - | |||
48 | 7402 | 2 | static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp) | |
49 | - | { | ||
50 | 7402 | 2 | h->size = 0; | |
51 | 7402 | 2 | h->asize = HASHSIG_HEAP_SIZE; | |
52 | 7402 | 2 | h->cmp = cmp; | |
53 | 7402 | 2 | } | |
54 | - | |||
55 | 917679 | 2 | static int hashsig_cmp_max(const void *a, const void *b, void *payload) | |
56 | - | { | ||
57 | 917679 | 2 | hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; | |
58 | - | GIT_UNUSED(payload); | ||
59 | 917679 | 2 | return (av < bv) ? -1 : (av > bv) ? 1 : 0; | |
60 | - | } | ||
61 | - | |||
62 | 1885743 | 2 | static int hashsig_cmp_min(const void *a, const void *b, void *payload) | |
63 | - | { | ||
64 | 1885743 | 2 | hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; | |
65 | - | GIT_UNUSED(payload); | ||
66 | 1885743 | 2 | return (av > bv) ? -1 : (av < bv) ? 1 : 0; | |
67 | - | } | ||
68 | - | |||
69 | 145006 | 2 | static void hashsig_heap_up(hashsig_heap *h, int el) | |
70 | - | { | ||
71 | 145006 | 2 | int parent_el = HEAP_PARENT_OF(el); | |
72 | - | |||
73 | 375355 | 2,4-6 | while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) { | |
74 | 230349 | 3 | hashsig_t t = h->values[el]; | |
75 | 230349 | 3 | h->values[el] = h->values[parent_el]; | |
76 | 230349 | 3 | h->values[parent_el] = t; | |
77 | - | |||
78 | 230349 | 3 | el = parent_el; | |
79 | 230349 | 3 | parent_el = HEAP_PARENT_OF(el); | |
80 | - | } | ||
81 | 145006 | 7 | } | |
82 | - | |||
83 | 37128 | 2 | static void hashsig_heap_down(hashsig_heap *h, int el) | |
84 | - | { | ||
85 | - | hashsig_t v, lv, rv; | ||
86 | - | |||
87 | - | /* 'el < h->size / 2' tests if el is bottom row of heap */ | ||
88 | - | |||
89 | 252584 | 2,13 | while (el < h->size / 2) { | |
90 | 220862 | 3 | int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel; | |
91 | - | |||
92 | 220862 | 3 | v = h->values[el]; | |
93 | 220862 | 3 | lv = h->values[lel]; | |
94 | 220862 | 3 | rv = h->values[rel]; | |
95 | - | |||
96 | 220862 | 3-6 | if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0) | |
97 | 5406 | 7 | break; | |
98 | - | |||
99 | 215456 | 8-11 | swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel; | |
100 | - | |||
101 | 215456 | 12 | h->values[el] = h->values[swapel]; | |
102 | 215456 | 12 | h->values[swapel] = v; | |
103 | - | |||
104 | 215456 | 12 | el = swapel; | |
105 | - | } | ||
106 | 37128 | 14 | } | |
107 | - | |||
108 | 2978 | 2 | static void hashsig_heap_sort(hashsig_heap *h) | |
109 | - | { | ||
110 | - | /* only need to do this at the end for signature comparison */ | ||
111 | 2978 | 2 | git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL); | |
112 | 2978 | 3 | } | |
113 | - | |||
114 | 226010 | 2 | static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val) | |
115 | - | { | ||
116 | - | /* if heap is not full, insert new element */ | ||
117 | 226010 | 2 | if (h->size < h->asize) { | |
118 | 145006 | 3 | h->values[h->size++] = val; | |
119 | 145006 | 3 | hashsig_heap_up(h, h->size - 1); | |
120 | - | } | ||
121 | - | |||
122 | - | /* if heap is full, pop top if new element should replace it */ | ||
123 | 81004 | 4,5 | else if (h->cmp(&val, &h->values[0], NULL) > 0) { | |
124 | 37128 | 6 | h->size--; | |
125 | 37128 | 6 | h->values[0] = h->values[h->size]; | |
126 | 37128 | 6 | hashsig_heap_down(h, 0); | |
127 | - | } | ||
128 | - | |||
129 | 226010 | 7 | } | |
130 | - | |||
131 | - | typedef struct { | ||
132 | - | int use_ignores; | ||
133 | - | uint8_t ignore_ch[256]; | ||
134 | - | } hashsig_in_progress; | ||
135 | - | |||
136 | 3701 | 2 | static void hashsig_in_progress_init( | |
137 | - | hashsig_in_progress *prog, git_hashsig *sig) | ||
138 | - | { | ||
139 | - | int i; | ||
140 | - | |||
141 | - | /* no more than one can be set */ | ||
142 | 3701 | 2-4 | assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) || | |
143 | - | !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE)); | ||
144 | - | |||
145 | 3701 | 5 | if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) { | |
146 | 9766 | 6,8,9 | for (i = 0; i < 256; ++i) | |
147 | 9728 | 7 | prog->ignore_ch[i] = git__isspace_nonlf(i); | |
148 | 38 | 10 | prog->use_ignores = 1; | |
149 | 3663 | 11 | } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) { | |
150 | 928798 | 12,14,15 | for (i = 0; i < 256; ++i) | |
151 | 925184 | 13 | prog->ignore_ch[i] = git__isspace(i); | |
152 | 3614 | 16 | prog->use_ignores = 1; | |
153 | - | } else { | ||
154 | 49 | 17 | memset(prog, 0, sizeof(*prog)); | |
155 | - | } | ||
156 | 3701 | 18 | } | |
157 | - | |||
158 | 3698 | 2 | static int hashsig_add_hashes( | |
159 | - | git_hashsig *sig, | ||
160 | - | const uint8_t *data, | ||
161 | - | size_t size, | ||
162 | - | hashsig_in_progress *prog) | ||
163 | - | { | ||
164 | 3698 | 2 | const uint8_t *scan = data, *end = data + size; | |
165 | 3698 | 2 | hashsig_state state = HASHSIG_HASH_START; | |
166 | 3698 | 2 | int use_ignores = prog->use_ignores, len; | |
167 | - | uint8_t ch; | ||
168 | - | |||
169 | 104974882 | 2,33 | while (scan < end) { | |
170 | 104971184 | 3 | state = HASHSIG_HASH_START; | |
171 | - | |||
172 | 108720355 | 3,23,24 | for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) { | |
173 | 108703950 | 4 | ch = *scan; | |
174 | - | |||
175 | 108703950 | 4 | if (use_ignores) | |
176 | 143326 | 5-9 | for (; scan < end && git__isspace_nonlf(ch); ch = *scan) | |
177 | 64644 | 6 | ++scan; | |
178 | 108625268 | 10 | else if (sig->opt & | |
179 | - | (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE)) | ||
180 | 108607760 | 11-14 | for (; scan < end && ch == '\r'; ch = *scan) | |
181 | 7897 | 12 | ++scan; | |
182 | - | |||
183 | - | /* peek at next character to decide what to do next */ | ||
184 | 108703950 | 15 | if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) | |
185 | 108666709 | 16 | use_ignores = (ch == '\n'); | |
186 | - | |||
187 | 108703950 | 17 | if (scan >= end) | |
188 | 7 | 18 | break; | |
189 | 108703943 | 19 | ++scan; | |
190 | - | |||
191 | - | /* check run terminator */ | ||
192 | 108703943 | 19,20 | if (ch == '\n' || ch == '\0') { | |
193 | 104954772 | 21 | sig->lines++; | |
194 | 104954772 | 21 | break; | |
195 | - | } | ||
196 | - | |||
197 | 3749171 | 22 | ++len; | |
198 | 3749171 | 22 | HASHSIG_HASH_MIX(state, ch); | |
199 | - | } | ||
200 | - | |||
201 | 104971184 | 25 | if (len > 0) { | |
202 | 113005 | 26 | hashsig_heap_insert(&sig->mins, (hashsig_t)state); | |
203 | 113005 | 27 | hashsig_heap_insert(&sig->maxs, (hashsig_t)state); | |
204 | - | |||
205 | 142830 | 28,30-32 | while (scan < end && (*scan == '\n' || !*scan)) | |
206 | 29825 | 29 | ++scan; | |
207 | - | } | ||
208 | - | } | ||
209 | - | |||
210 | 3698 | 34 | prog->use_ignores = use_ignores; | |
211 | - | |||
212 | 3698 | 34 | return 0; | |
213 | - | } | ||
214 | - | |||
215 | 3701 | 2 | static int hashsig_finalize_hashes(git_hashsig *sig) | |
216 | - | { | ||
217 | 3701 | 2,3 | if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE && | |
218 | 2635 | 3 | !(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) { | |
219 | 2212 | 4 | git_error_set(GIT_ERROR_INVALID, | |
220 | - | "file too small for similarity signature calculation"); | ||
221 | 2212 | 5 | return GIT_EBUFS; | |
222 | - | } | ||
223 | - | |||
224 | 1489 | 6 | hashsig_heap_sort(&sig->mins); | |
225 | 1489 | 7 | hashsig_heap_sort(&sig->maxs); | |
226 | - | |||
227 | 1489 | 8 | return 0; | |
228 | - | } | ||
229 | - | |||
230 | 3701 | 2 | static git_hashsig *hashsig_alloc(git_hashsig_option_t opts) | |
231 | - | { | ||
232 | 3701 | 2 | git_hashsig *sig = git__calloc(1, sizeof(git_hashsig)); | |
233 | 3701 | 3 | if (!sig) | |
234 | ##### | 4 | return NULL; | |
235 | - | |||
236 | 3701 | 5 | hashsig_heap_init(&sig->mins, hashsig_cmp_min); | |
237 | 3701 | 6 | hashsig_heap_init(&sig->maxs, hashsig_cmp_max); | |
238 | 3701 | 7 | sig->opt = opts; | |
239 | - | |||
240 | 3701 | 7 | return sig; | |
241 | - | } | ||
242 | - | |||
243 | 3654 | 2 | int git_hashsig_create( | |
244 | - | git_hashsig **out, | ||
245 | - | const char *buf, | ||
246 | - | size_t buflen, | ||
247 | - | git_hashsig_option_t opts) | ||
248 | - | { | ||
249 | - | int error; | ||
250 | - | hashsig_in_progress prog; | ||
251 | 3654 | 2 | git_hashsig *sig = hashsig_alloc(opts); | |
252 | 3654 | 3,4 | GIT_ERROR_CHECK_ALLOC(sig); | |
253 | - | |||
254 | 3654 | 5 | hashsig_in_progress_init(&prog, sig); | |
255 | - | |||
256 | 3654 | 6 | error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog); | |
257 | - | |||
258 | 3654 | 7 | if (!error) | |
259 | 3654 | 8 | error = hashsig_finalize_hashes(sig); | |
260 | - | |||
261 | 3654 | 9 | if (!error) | |
262 | 1442 | 10 | *out = sig; | |
263 | - | else | ||
264 | 2212 | 11 | git_hashsig_free(sig); | |
265 | - | |||
266 | 3654 | 12 | return error; | |
267 | - | } | ||
268 | - | |||
269 | 47 | 2 | int git_hashsig_create_fromfile( | |
270 | - | git_hashsig **out, | ||
271 | - | const char *path, | ||
272 | - | git_hashsig_option_t opts) | ||
273 | - | { | ||
274 | - | uint8_t buf[0x1000]; | ||
275 | 47 | 2 | ssize_t buflen = 0; | |
276 | 47 | 2 | int error = 0, fd; | |
277 | - | hashsig_in_progress prog; | ||
278 | 47 | 2 | git_hashsig *sig = hashsig_alloc(opts); | |
279 | 47 | 3,4 | GIT_ERROR_CHECK_ALLOC(sig); | |
280 | - | |||
281 | 47 | 5,6 | if ((fd = git_futils_open_ro(path)) < 0) { | |
282 | ##### | 7 | git__free(sig); | |
283 | ##### | 8 | return fd; | |
284 | - | } | ||
285 | - | |||
286 | 47 | 9 | hashsig_in_progress_init(&prog, sig); | |
287 | - | |||
288 | 91 | 10,17 | while (!error) { | |
289 | 91 | 11,12 | if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) { | |
290 | 47 | 13 | if ((error = (int)buflen) < 0) | |
291 | ##### | 14 | git_error_set(GIT_ERROR_OS, | |
292 | - | "read error on '%s' calculating similarity hashes", path); | ||
293 | 47 | 15 | break; | |
294 | - | } | ||
295 | - | |||
296 | 44 | 16 | error = hashsig_add_hashes(sig, buf, buflen, &prog); | |
297 | - | } | ||
298 | - | |||
299 | 47 | 18 | p_close(fd); | |
300 | - | |||
301 | 47 | 19 | if (!error) | |
302 | 47 | 20 | error = hashsig_finalize_hashes(sig); | |
303 | - | |||
304 | 47 | 21 | if (!error) | |
305 | 47 | 22 | *out = sig; | |
306 | - | else | ||
307 | ##### | 23 | git_hashsig_free(sig); | |
308 | - | |||
309 | 47 | 24 | return error; | |
310 | - | } | ||
311 | - | |||
312 | 3701 | 2 | void git_hashsig_free(git_hashsig *sig) | |
313 | - | { | ||
314 | 3701 | 2 | git__free(sig); | |
315 | 3701 | 3 | } | |
316 | - | |||
317 | 40158 | 2 | static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b) | |
318 | - | { | ||
319 | 40158 | 2 | int matches = 0, i, j, cmp; | |
320 | - | |||
321 | 40158 | 2,3 | assert(a->cmp == b->cmp); | |
322 | - | |||
323 | - | /* hash heaps are sorted - just look for overlap vs total */ | ||
324 | - | |||
325 | 1475166 | 4,11,12 | for (i = 0, j = 0; i < a->size && j < b->size; ) { | |
326 | 1435008 | 5 | cmp = a->cmp(&a->values[i], &b->values[j], NULL); | |
327 | - | |||
328 | 1435008 | 6 | if (cmp < 0) | |
329 | 724434 | 7 | ++i; | |
330 | 710574 | 8 | else if (cmp > 0) | |
331 | 654721 | 9 | ++j; | |
332 | - | else { | ||
333 | 55853 | 10 | ++i; ++j; ++matches; | |
334 | - | } | ||
335 | - | } | ||
336 | - | |||
337 | 40158 | 13 | return HASHSIG_SCALE * (matches * 2) / (a->size + b->size); | |
338 | - | } | ||
339 | - | |||
340 | 37382 | 2 | int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b) | |
341 | - | { | ||
342 | - | /* if we have no elements in either file then each file is either | ||
343 | - | * empty or blank. if we're ignoring whitespace then the files are | ||
344 | - | * similar, otherwise they're dissimilar. | ||
345 | - | */ | ||
346 | 37382 | 2,3 | if (a->mins.size == 0 && b->mins.size == 0) { | |
347 | 8 | 4-6 | if ((!a->lines && !b->lines) || | |
348 | 6 | 6 | (a->opt & GIT_HASHSIG_IGNORE_WHITESPACE)) | |
349 | 7 | 7 | return HASHSIG_SCALE; | |
350 | - | else | ||
351 | 1 | 8 | return 0; | |
352 | - | } | ||
353 | - | |||
354 | - | /* if we have fewer than the maximum number of elements, then just use | ||
355 | - | * one array since the two arrays will be the same | ||
356 | - | */ | ||
357 | 37374 | 9 | if (a->mins.size < HASHSIG_HEAP_SIZE) | |
358 | 34590 | 10 | return hashsig_heap_compare(&a->mins, &b->mins); | |
359 | - | else | ||
360 | 2784 | 11 | return (hashsig_heap_compare(&a->mins, &b->mins) + | |
361 | 2784 | 12 | hashsig_heap_compare(&a->maxs, &b->maxs)) / 2; | |
362 | - | } |