Line Flow Count Block(s) Source
1 - /*
2 - * Copyright (C) the libgit2 contributors. All rights reserved.
3 - *
4 - * This file is part of libgit2, distributed under the GNU GPL v2 with
5 - * a Linking Exception. For full terms see the included COPYING file.
6 - */
7 -
8 - #include "common.h"
9 -
10 - #include "git2/sys/hashsig.h"
11 - #include "futils.h"
12 - #include "util.h"
13 -
14 - typedef uint32_t hashsig_t;
15 - typedef uint64_t hashsig_state;
16 -
17 - #define HASHSIG_SCALE 100
18 -
19 - #define HASHSIG_MAX_RUN 80
20 - #define HASHSIG_HASH_START 0x012345678ABCDEF0LL
21 - #define HASHSIG_HASH_SHIFT 5
22 -
23 - #define HASHSIG_HASH_MIX(S,CH) \
24 - (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
25 -
26 - #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
27 - #define HASHSIG_HEAP_MIN_SIZE 4
28 -
29 - typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
30 -
31 - typedef struct {
32 - int size, asize;
33 - hashsig_cmp cmp;
34 - hashsig_t values[HASHSIG_HEAP_SIZE];
35 - } hashsig_heap;
36 -
37 - struct git_hashsig {
38 - hashsig_heap mins;
39 - hashsig_heap maxs;
40 - size_t lines;
41 - git_hashsig_option_t opt;
42 - };
43 -
44 - #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
45 - #define HEAP_RCHILD_OF(I) (((I)<<1)+2)
46 - #define HEAP_PARENT_OF(I) (((I)-1)>>1)
47 -
48 7402 2 static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
49 - {
50 7402 2 h->size = 0;
51 7402 2 h->asize = HASHSIG_HEAP_SIZE;
52 7402 2 h->cmp = cmp;
53 7402 2 }
54 -
55 917679 2 static int hashsig_cmp_max(const void *a, const void *b, void *payload)
56 - {
57 917679 2 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
58 - GIT_UNUSED(payload);
59 917679 2 return (av < bv) ? -1 : (av > bv) ? 1 : 0;
60 - }
61 -
62 1885743 2 static int hashsig_cmp_min(const void *a, const void *b, void *payload)
63 - {
64 1885743 2 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
65 - GIT_UNUSED(payload);
66 1885743 2 return (av > bv) ? -1 : (av < bv) ? 1 : 0;
67 - }
68 -
69 145006 2 static void hashsig_heap_up(hashsig_heap *h, int el)
70 - {
71 145006 2 int parent_el = HEAP_PARENT_OF(el);
72 -
73 375355 2,4-6 while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
74 230349 3 hashsig_t t = h->values[el];
75 230349 3 h->values[el] = h->values[parent_el];
76 230349 3 h->values[parent_el] = t;
77 -
78 230349 3 el = parent_el;
79 230349 3 parent_el = HEAP_PARENT_OF(el);
80 - }
81 145006 7 }
82 -
83 37128 2 static void hashsig_heap_down(hashsig_heap *h, int el)
84 - {
85 - hashsig_t v, lv, rv;
86 -
87 - /* 'el < h->size / 2' tests if el is bottom row of heap */
88 -
89 252584 2,13 while (el < h->size / 2) {
90 220862 3 int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
91 -
92 220862 3 v = h->values[el];
93 220862 3 lv = h->values[lel];
94 220862 3 rv = h->values[rel];
95 -
96 220862 3-6 if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
97 5406 7 break;
98 -
99 215456 8-11 swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
100 -
101 215456 12 h->values[el] = h->values[swapel];
102 215456 12 h->values[swapel] = v;
103 -
104 215456 12 el = swapel;
105 - }
106 37128 14 }
107 -
108 2978 2 static void hashsig_heap_sort(hashsig_heap *h)
109 - {
110 - /* only need to do this at the end for signature comparison */
111 2978 2 git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
112 2978 3 }
113 -
114 226010 2 static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
115 - {
116 - /* if heap is not full, insert new element */
117 226010 2 if (h->size < h->asize) {
118 145006 3 h->values[h->size++] = val;
119 145006 3 hashsig_heap_up(h, h->size - 1);
120 - }
121 -
122 - /* if heap is full, pop top if new element should replace it */
123 81004 4,5 else if (h->cmp(&val, &h->values[0], NULL) > 0) {
124 37128 6 h->size--;
125 37128 6 h->values[0] = h->values[h->size];
126 37128 6 hashsig_heap_down(h, 0);
127 - }
128 -
129 226010 7 }
130 -
131 - typedef struct {
132 - int use_ignores;
133 - uint8_t ignore_ch[256];
134 - } hashsig_in_progress;
135 -
136 3701 2 static void hashsig_in_progress_init(
137 - hashsig_in_progress *prog, git_hashsig *sig)
138 - {
139 - int i;
140 -
141 - /* no more than one can be set */
142 3701 2-4 assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
143 - !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
144 -
145 3701 5 if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
146 9766 6,8,9 for (i = 0; i < 256; ++i)
147 9728 7 prog->ignore_ch[i] = git__isspace_nonlf(i);
148 38 10 prog->use_ignores = 1;
149 3663 11 } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
150 928798 12,14,15 for (i = 0; i < 256; ++i)
151 925184 13 prog->ignore_ch[i] = git__isspace(i);
152 3614 16 prog->use_ignores = 1;
153 - } else {
154 49 17 memset(prog, 0, sizeof(*prog));
155 - }
156 3701 18 }
157 -
158 3698 2 static int hashsig_add_hashes(
159 - git_hashsig *sig,
160 - const uint8_t *data,
161 - size_t size,
162 - hashsig_in_progress *prog)
163 - {
164 3698 2 const uint8_t *scan = data, *end = data + size;
165 3698 2 hashsig_state state = HASHSIG_HASH_START;
166 3698 2 int use_ignores = prog->use_ignores, len;
167 - uint8_t ch;
168 -
169 104974882 2,33 while (scan < end) {
170 104971184 3 state = HASHSIG_HASH_START;
171 -
172 108720355 3,23,24 for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
173 108703950 4 ch = *scan;
174 -
175 108703950 4 if (use_ignores)
176 143326 5-9 for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
177 64644 6 ++scan;
178 108625268 10 else if (sig->opt &
179 - (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
180 108607760 11-14 for (; scan < end && ch == '\r'; ch = *scan)
181 7897 12 ++scan;
182 -
183 - /* peek at next character to decide what to do next */
184 108703950 15 if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
185 108666709 16 use_ignores = (ch == '\n');
186 -
187 108703950 17 if (scan >= end)
188 7 18 break;
189 108703943 19 ++scan;
190 -
191 - /* check run terminator */
192 108703943 19,20 if (ch == '\n' || ch == '\0') {
193 104954772 21 sig->lines++;
194 104954772 21 break;
195 - }
196 -
197 3749171 22 ++len;
198 3749171 22 HASHSIG_HASH_MIX(state, ch);
199 - }
200 -
201 104971184 25 if (len > 0) {
202 113005 26 hashsig_heap_insert(&sig->mins, (hashsig_t)state);
203 113005 27 hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
204 -
205 142830 28,30-32 while (scan < end && (*scan == '\n' || !*scan))
206 29825 29 ++scan;
207 - }
208 - }
209 -
210 3698 34 prog->use_ignores = use_ignores;
211 -
212 3698 34 return 0;
213 - }
214 -
215 3701 2 static int hashsig_finalize_hashes(git_hashsig *sig)
216 - {
217 3701 2,3 if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
218 2635 3 !(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
219 2212 4 git_error_set(GIT_ERROR_INVALID,
220 - "file too small for similarity signature calculation");
221 2212 5 return GIT_EBUFS;
222 - }
223 -
224 1489 6 hashsig_heap_sort(&sig->mins);
225 1489 7 hashsig_heap_sort(&sig->maxs);
226 -
227 1489 8 return 0;
228 - }
229 -
230 3701 2 static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
231 - {
232 3701 2 git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
233 3701 3 if (!sig)
234 ##### 4 return NULL;
235 -
236 3701 5 hashsig_heap_init(&sig->mins, hashsig_cmp_min);
237 3701 6 hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
238 3701 7 sig->opt = opts;
239 -
240 3701 7 return sig;
241 - }
242 -
243 3654 2 int git_hashsig_create(
244 - git_hashsig **out,
245 - const char *buf,
246 - size_t buflen,
247 - git_hashsig_option_t opts)
248 - {
249 - int error;
250 - hashsig_in_progress prog;
251 3654 2 git_hashsig *sig = hashsig_alloc(opts);
252 3654 3,4 GIT_ERROR_CHECK_ALLOC(sig);
253 -
254 3654 5 hashsig_in_progress_init(&prog, sig);
255 -
256 3654 6 error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog);
257 -
258 3654 7 if (!error)
259 3654 8 error = hashsig_finalize_hashes(sig);
260 -
261 3654 9 if (!error)
262 1442 10 *out = sig;
263 - else
264 2212 11 git_hashsig_free(sig);
265 -
266 3654 12 return error;
267 - }
268 -
269 47 2 int git_hashsig_create_fromfile(
270 - git_hashsig **out,
271 - const char *path,
272 - git_hashsig_option_t opts)
273 - {
274 - uint8_t buf[0x1000];
275 47 2 ssize_t buflen = 0;
276 47 2 int error = 0, fd;
277 - hashsig_in_progress prog;
278 47 2 git_hashsig *sig = hashsig_alloc(opts);
279 47 3,4 GIT_ERROR_CHECK_ALLOC(sig);
280 -
281 47 5,6 if ((fd = git_futils_open_ro(path)) < 0) {
282 ##### 7 git__free(sig);
283 ##### 8 return fd;
284 - }
285 -
286 47 9 hashsig_in_progress_init(&prog, sig);
287 -
288 91 10,17 while (!error) {
289 91 11,12 if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
290 47 13 if ((error = (int)buflen) < 0)
291 ##### 14 git_error_set(GIT_ERROR_OS,
292 - "read error on '%s' calculating similarity hashes", path);
293 47 15 break;
294 - }
295 -
296 44 16 error = hashsig_add_hashes(sig, buf, buflen, &prog);
297 - }
298 -
299 47 18 p_close(fd);
300 -
301 47 19 if (!error)
302 47 20 error = hashsig_finalize_hashes(sig);
303 -
304 47 21 if (!error)
305 47 22 *out = sig;
306 - else
307 ##### 23 git_hashsig_free(sig);
308 -
309 47 24 return error;
310 - }
311 -
312 3701 2 void git_hashsig_free(git_hashsig *sig)
313 - {
314 3701 2 git__free(sig);
315 3701 3 }
316 -
317 40158 2 static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
318 - {
319 40158 2 int matches = 0, i, j, cmp;
320 -
321 40158 2,3 assert(a->cmp == b->cmp);
322 -
323 - /* hash heaps are sorted - just look for overlap vs total */
324 -
325 1475166 4,11,12 for (i = 0, j = 0; i < a->size && j < b->size; ) {
326 1435008 5 cmp = a->cmp(&a->values[i], &b->values[j], NULL);
327 -
328 1435008 6 if (cmp < 0)
329 724434 7 ++i;
330 710574 8 else if (cmp > 0)
331 654721 9 ++j;
332 - else {
333 55853 10 ++i; ++j; ++matches;
334 - }
335 - }
336 -
337 40158 13 return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
338 - }
339 -
340 37382 2 int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
341 - {
342 - /* if we have no elements in either file then each file is either
343 - * empty or blank. if we're ignoring whitespace then the files are
344 - * similar, otherwise they're dissimilar.
345 - */
346 37382 2,3 if (a->mins.size == 0 && b->mins.size == 0) {
347 8 4-6 if ((!a->lines && !b->lines) ||
348 6 6 (a->opt & GIT_HASHSIG_IGNORE_WHITESPACE))
349 7 7 return HASHSIG_SCALE;
350 - else
351 1 8 return 0;
352 - }
353 -
354 - /* if we have fewer than the maximum number of elements, then just use
355 - * one array since the two arrays will be the same
356 - */
357 37374 9 if (a->mins.size < HASHSIG_HEAP_SIZE)
358 34590 10 return hashsig_heap_compare(&a->mins, &b->mins);
359 - else
360 2784 11 return (hashsig_heap_compare(&a->mins, &b->mins) +
361 2784 12 hashsig_heap_compare(&a->maxs, &b->maxs)) / 2;
362 - }