source src/hashsig.c
| Line | Flow | Count | Block(s) | Source |
|---|---|---|---|---|
| 1 | - | /* | ||
| 2 | - | * Copyright (C) the libgit2 contributors. All rights reserved. | ||
| 3 | - | * | ||
| 4 | - | * This file is part of libgit2, distributed under the GNU GPL v2 with | ||
| 5 | - | * a Linking Exception. For full terms see the included COPYING file. | ||
| 6 | - | */ | ||
| 7 | - | |||
| 8 | - | #include "common.h" | ||
| 9 | - | |||
| 10 | - | #include "git2/sys/hashsig.h" | ||
| 11 | - | #include "futils.h" | ||
| 12 | - | #include "util.h" | ||
| 13 | - | |||
| 14 | - | typedef uint32_t hashsig_t; | ||
| 15 | - | typedef uint64_t hashsig_state; | ||
| 16 | - | |||
| 17 | - | #define HASHSIG_SCALE 100 | ||
| 18 | - | |||
| 19 | - | #define HASHSIG_MAX_RUN 80 | ||
| 20 | - | #define HASHSIG_HASH_START 0x012345678ABCDEF0LL | ||
| 21 | - | #define HASHSIG_HASH_SHIFT 5 | ||
| 22 | - | |||
| 23 | - | #define HASHSIG_HASH_MIX(S,CH) \ | ||
| 24 | - | (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH) | ||
| 25 | - | |||
| 26 | - | #define HASHSIG_HEAP_SIZE ((1 << 7) - 1) | ||
| 27 | - | #define HASHSIG_HEAP_MIN_SIZE 4 | ||
| 28 | - | |||
| 29 | - | typedef int (*hashsig_cmp)(const void *a, const void *b, void *); | ||
| 30 | - | |||
| 31 | - | typedef struct { | ||
| 32 | - | int size, asize; | ||
| 33 | - | hashsig_cmp cmp; | ||
| 34 | - | hashsig_t values[HASHSIG_HEAP_SIZE]; | ||
| 35 | - | } hashsig_heap; | ||
| 36 | - | |||
| 37 | - | struct git_hashsig { | ||
| 38 | - | hashsig_heap mins; | ||
| 39 | - | hashsig_heap maxs; | ||
| 40 | - | size_t lines; | ||
| 41 | - | git_hashsig_option_t opt; | ||
| 42 | - | }; | ||
| 43 | - | |||
| 44 | - | #define HEAP_LCHILD_OF(I) (((I)<<1)+1) | ||
| 45 | - | #define HEAP_RCHILD_OF(I) (((I)<<1)+2) | ||
| 46 | - | #define HEAP_PARENT_OF(I) (((I)-1)>>1) | ||
| 47 | - | |||
| 48 | 7402 | 2 | static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp) | |
| 49 | - | { | ||
| 50 | 7402 | 2 | h->size = 0; | |
| 51 | 7402 | 2 | h->asize = HASHSIG_HEAP_SIZE; | |
| 52 | 7402 | 2 | h->cmp = cmp; | |
| 53 | 7402 | 2 | } | |
| 54 | - | |||
| 55 | 917679 | 2 | static int hashsig_cmp_max(const void *a, const void *b, void *payload) | |
| 56 | - | { | ||
| 57 | 917679 | 2 | hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; | |
| 58 | - | GIT_UNUSED(payload); | ||
| 59 | 917679 | 2 | return (av < bv) ? -1 : (av > bv) ? 1 : 0; | |
| 60 | - | } | ||
| 61 | - | |||
| 62 | 1885743 | 2 | static int hashsig_cmp_min(const void *a, const void *b, void *payload) | |
| 63 | - | { | ||
| 64 | 1885743 | 2 | hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; | |
| 65 | - | GIT_UNUSED(payload); | ||
| 66 | 1885743 | 2 | return (av > bv) ? -1 : (av < bv) ? 1 : 0; | |
| 67 | - | } | ||
| 68 | - | |||
| 69 | ![]() |
145006 | 2 | static void hashsig_heap_up(hashsig_heap *h, int el) |
| 70 | - | { | ||
| 71 | 145006 | 2 | int parent_el = HEAP_PARENT_OF(el); | |
| 72 | - | |||
| 73 | 375355 | 2,4-6 | while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) { | |
| 74 | 230349 | 3 | hashsig_t t = h->values[el]; | |
| 75 | 230349 | 3 | h->values[el] = h->values[parent_el]; | |
| 76 | 230349 | 3 | h->values[parent_el] = t; | |
| 77 | - | |||
| 78 | 230349 | 3 | el = parent_el; | |
| 79 | 230349 | 3 | parent_el = HEAP_PARENT_OF(el); | |
| 80 | - | } | ||
| 81 | 145006 | 7 | } | |
| 82 | - | |||
| 83 | ![]() |
37128 | 2 | static void hashsig_heap_down(hashsig_heap *h, int el) |
| 84 | - | { | ||
| 85 | - | hashsig_t v, lv, rv; | ||
| 86 | - | |||
| 87 | - | /* 'el < h->size / 2' tests if el is bottom row of heap */ | ||
| 88 | - | |||
| 89 | 252584 | 2,13 | while (el < h->size / 2) { | |
| 90 | 220862 | 3 | int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel; | |
| 91 | - | |||
| 92 | 220862 | 3 | v = h->values[el]; | |
| 93 | 220862 | 3 | lv = h->values[lel]; | |
| 94 | 220862 | 3 | rv = h->values[rel]; | |
| 95 | - | |||
| 96 | 220862 | 3-6 | if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0) | |
| 97 | 5406 | 7 | break; | |
| 98 | - | |||
| 99 | 215456 | 8-11 | swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel; | |
| 100 | - | |||
| 101 | 215456 | 12 | h->values[el] = h->values[swapel]; | |
| 102 | 215456 | 12 | h->values[swapel] = v; | |
| 103 | - | |||
| 104 | 215456 | 12 | el = swapel; | |
| 105 | - | } | ||
| 106 | 37128 | 14 | } | |
| 107 | - | |||
| 108 | 2978 | 2 | static void hashsig_heap_sort(hashsig_heap *h) | |
| 109 | - | { | ||
| 110 | - | /* only need to do this at the end for signature comparison */ | ||
| 111 | 2978 | 2 | git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL); | |
| 112 | 2978 | 3 | } | |
| 113 | - | |||
| 114 | 226010 | 2 | static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val) | |
| 115 | - | { | ||
| 116 | - | /* if heap is not full, insert new element */ | ||
| 117 | 226010 | 2 | if (h->size < h->asize) { | |
| 118 | 145006 | 3 | h->values[h->size++] = val; | |
| 119 | 145006 | 3 | hashsig_heap_up(h, h->size - 1); | |
| 120 | - | } | ||
| 121 | - | |||
| 122 | - | /* if heap is full, pop top if new element should replace it */ | ||
| 123 | 81004 | 4,5 | else if (h->cmp(&val, &h->values[0], NULL) > 0) { | |
| 124 | 37128 | 6 | h->size--; | |
| 125 | 37128 | 6 | h->values[0] = h->values[h->size]; | |
| 126 | 37128 | 6 | hashsig_heap_down(h, 0); | |
| 127 | - | } | ||
| 128 | - | |||
| 129 | 226010 | 7 | } | |
| 130 | - | |||
| 131 | - | typedef struct { | ||
| 132 | - | int use_ignores; | ||
| 133 | - | uint8_t ignore_ch[256]; | ||
| 134 | - | } hashsig_in_progress; | ||
| 135 | - | |||
| 136 | ![]() |
3701 | 2 | static void hashsig_in_progress_init( |
| 137 | - | hashsig_in_progress *prog, git_hashsig *sig) | ||
| 138 | - | { | ||
| 139 | - | int i; | ||
| 140 | - | |||
| 141 | - | /* no more than one can be set */ | ||
| 142 | 3701 | 2-4 | assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) || | |
| 143 | - | !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE)); | ||
| 144 | - | |||
| 145 | 3701 | 5 | if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) { | |
| 146 | 9766 | 6,8,9 | for (i = 0; i < 256; ++i) | |
| 147 | 9728 | 7 | prog->ignore_ch[i] = git__isspace_nonlf(i); | |
| 148 | 38 | 10 | prog->use_ignores = 1; | |
| 149 | 3663 | 11 | } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) { | |
| 150 | 928798 | 12,14,15 | for (i = 0; i < 256; ++i) | |
| 151 | 925184 | 13 | prog->ignore_ch[i] = git__isspace(i); | |
| 152 | 3614 | 16 | prog->use_ignores = 1; | |
| 153 | - | } else { | ||
| 154 | 49 | 17 | memset(prog, 0, sizeof(*prog)); | |
| 155 | - | } | ||
| 156 | 3701 | 18 | } | |
| 157 | - | |||
| 158 | ![]() |
3698 | 2 | static int hashsig_add_hashes( |
| 159 | - | git_hashsig *sig, | ||
| 160 | - | const uint8_t *data, | ||
| 161 | - | size_t size, | ||
| 162 | - | hashsig_in_progress *prog) | ||
| 163 | - | { | ||
| 164 | 3698 | 2 | const uint8_t *scan = data, *end = data + size; | |
| 165 | 3698 | 2 | hashsig_state state = HASHSIG_HASH_START; | |
| 166 | 3698 | 2 | int use_ignores = prog->use_ignores, len; | |
| 167 | - | uint8_t ch; | ||
| 168 | - | |||
| 169 | 104974882 | 2,33 | while (scan < end) { | |
| 170 | 104971184 | 3 | state = HASHSIG_HASH_START; | |
| 171 | - | |||
| 172 | 108720355 | 3,23,24 | for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) { | |
| 173 | 108703950 | 4 | ch = *scan; | |
| 174 | - | |||
| 175 | 108703950 | 4 | if (use_ignores) | |
| 176 | 143326 | 5-9 | for (; scan < end && git__isspace_nonlf(ch); ch = *scan) | |
| 177 | 64644 | 6 | ++scan; | |
| 178 | 108625268 | 10 | else if (sig->opt & | |
| 179 | - | (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE)) | ||
| 180 | 108607760 | 11-14 | for (; scan < end && ch == '\r'; ch = *scan) | |
| 181 | 7897 | 12 | ++scan; | |
| 182 | - | |||
| 183 | - | /* peek at next character to decide what to do next */ | ||
| 184 | 108703950 | 15 | if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) | |
| 185 | 108666709 | 16 | use_ignores = (ch == '\n'); | |
| 186 | - | |||
| 187 | 108703950 | 17 | if (scan >= end) | |
| 188 | 7 | 18 | break; | |
| 189 | 108703943 | 19 | ++scan; | |
| 190 | - | |||
| 191 | - | /* check run terminator */ | ||
| 192 | 108703943 | 19,20 | if (ch == '\n' || ch == '\0') { | |
| 193 | 104954772 | 21 | sig->lines++; | |
| 194 | 104954772 | 21 | break; | |
| 195 | - | } | ||
| 196 | - | |||
| 197 | 3749171 | 22 | ++len; | |
| 198 | 3749171 | 22 | HASHSIG_HASH_MIX(state, ch); | |
| 199 | - | } | ||
| 200 | - | |||
| 201 | 104971184 | 25 | if (len > 0) { | |
| 202 | 113005 | 26 | hashsig_heap_insert(&sig->mins, (hashsig_t)state); | |
| 203 | 113005 | 27 | hashsig_heap_insert(&sig->maxs, (hashsig_t)state); | |
| 204 | - | |||
| 205 | 142830 | 28,30-32 | while (scan < end && (*scan == '\n' || !*scan)) | |
| 206 | 29825 | 29 | ++scan; | |
| 207 | - | } | ||
| 208 | - | } | ||
| 209 | - | |||
| 210 | 3698 | 34 | prog->use_ignores = use_ignores; | |
| 211 | - | |||
| 212 | 3698 | 34 | return 0; | |
| 213 | - | } | ||
| 214 | - | |||
| 215 | ![]() |
3701 | 2 | static int hashsig_finalize_hashes(git_hashsig *sig) |
| 216 | - | { | ||
| 217 | 3701 | 2,3 | if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE && | |
| 218 | 2635 | 3 | !(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) { | |
| 219 | 2212 | 4 | git_error_set(GIT_ERROR_INVALID, | |
| 220 | - | "file too small for similarity signature calculation"); | ||
| 221 | 2212 | 5 | return GIT_EBUFS; | |
| 222 | - | } | ||
| 223 | - | |||
| 224 | 1489 | 6 | hashsig_heap_sort(&sig->mins); | |
| 225 | 1489 | 7 | hashsig_heap_sort(&sig->maxs); | |
| 226 | - | |||
| 227 | 1489 | 8 | return 0; | |
| 228 | - | } | ||
| 229 | - | |||
| 230 | 3701 | 2 | static git_hashsig *hashsig_alloc(git_hashsig_option_t opts) | |
| 231 | - | { | ||
| 232 | 3701 | 2 | git_hashsig *sig = git__calloc(1, sizeof(git_hashsig)); | |
| 233 | 3701 | 3 | if (!sig) | |
| 234 | ##### | 4 | return NULL; | |
| 235 | - | |||
| 236 | 3701 | 5 | hashsig_heap_init(&sig->mins, hashsig_cmp_min); | |
| 237 | 3701 | 6 | hashsig_heap_init(&sig->maxs, hashsig_cmp_max); | |
| 238 | 3701 | 7 | sig->opt = opts; | |
| 239 | - | |||
| 240 | 3701 | 7 | return sig; | |
| 241 | - | } | ||
| 242 | - | |||
| 243 | 3654 | 2 | int git_hashsig_create( | |
| 244 | - | git_hashsig **out, | ||
| 245 | - | const char *buf, | ||
| 246 | - | size_t buflen, | ||
| 247 | - | git_hashsig_option_t opts) | ||
| 248 | - | { | ||
| 249 | - | int error; | ||
| 250 | - | hashsig_in_progress prog; | ||
| 251 | 3654 | 2 | git_hashsig *sig = hashsig_alloc(opts); | |
| 252 | 3654 | 3,4 | GIT_ERROR_CHECK_ALLOC(sig); | |
| 253 | - | |||
| 254 | 3654 | 5 | hashsig_in_progress_init(&prog, sig); | |
| 255 | - | |||
| 256 | 3654 | 6 | error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog); | |
| 257 | - | |||
| 258 | 3654 | 7 | if (!error) | |
| 259 | 3654 | 8 | error = hashsig_finalize_hashes(sig); | |
| 260 | - | |||
| 261 | 3654 | 9 | if (!error) | |
| 262 | 1442 | 10 | *out = sig; | |
| 263 | - | else | ||
| 264 | 2212 | 11 | git_hashsig_free(sig); | |
| 265 | - | |||
| 266 | 3654 | 12 | return error; | |
| 267 | - | } | ||
| 268 | - | |||
| 269 | ![]() |
47 | 2 | int git_hashsig_create_fromfile( |
| 270 | - | git_hashsig **out, | ||
| 271 | - | const char *path, | ||
| 272 | - | git_hashsig_option_t opts) | ||
| 273 | - | { | ||
| 274 | - | uint8_t buf[0x1000]; | ||
| 275 | 47 | 2 | ssize_t buflen = 0; | |
| 276 | 47 | 2 | int error = 0, fd; | |
| 277 | - | hashsig_in_progress prog; | ||
| 278 | 47 | 2 | git_hashsig *sig = hashsig_alloc(opts); | |
| 279 | 47 | 3,4 | GIT_ERROR_CHECK_ALLOC(sig); | |
| 280 | - | |||
| 281 | 47 | 5,6 | if ((fd = git_futils_open_ro(path)) < 0) { | |
| 282 | ##### | 7 | git__free(sig); | |
| 283 | ##### | 8 | return fd; | |
| 284 | - | } | ||
| 285 | - | |||
| 286 | 47 | 9 | hashsig_in_progress_init(&prog, sig); | |
| 287 | - | |||
| 288 | 91 | 10,17 | while (!error) { | |
| 289 | 91 | 11,12 | if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) { | |
| 290 | 47 | 13 | if ((error = (int)buflen) < 0) | |
| 291 | ##### | 14 | git_error_set(GIT_ERROR_OS, | |
| 292 | - | "read error on '%s' calculating similarity hashes", path); | ||
| 293 | 47 | 15 | break; | |
| 294 | - | } | ||
| 295 | - | |||
| 296 | 44 | 16 | error = hashsig_add_hashes(sig, buf, buflen, &prog); | |
| 297 | - | } | ||
| 298 | - | |||
| 299 | 47 | 18 | p_close(fd); | |
| 300 | - | |||
| 301 | 47 | 19 | if (!error) | |
| 302 | 47 | 20 | error = hashsig_finalize_hashes(sig); | |
| 303 | - | |||
| 304 | 47 | 21 | if (!error) | |
| 305 | 47 | 22 | *out = sig; | |
| 306 | - | else | ||
| 307 | ##### | 23 | git_hashsig_free(sig); | |
| 308 | - | |||
| 309 | 47 | 24 | return error; | |
| 310 | - | } | ||
| 311 | - | |||
| 312 | 3701 | 2 | void git_hashsig_free(git_hashsig *sig) | |
| 313 | - | { | ||
| 314 | 3701 | 2 | git__free(sig); | |
| 315 | 3701 | 3 | } | |
| 316 | - | |||
| 317 | ![]() |
40158 | 2 | static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b) |
| 318 | - | { | ||
| 319 | 40158 | 2 | int matches = 0, i, j, cmp; | |
| 320 | - | |||
| 321 | 40158 | 2,3 | assert(a->cmp == b->cmp); | |
| 322 | - | |||
| 323 | - | /* hash heaps are sorted - just look for overlap vs total */ | ||
| 324 | - | |||
| 325 | 1475166 | 4,11,12 | for (i = 0, j = 0; i < a->size && j < b->size; ) { | |
| 326 | 1435008 | 5 | cmp = a->cmp(&a->values[i], &b->values[j], NULL); | |
| 327 | - | |||
| 328 | 1435008 | 6 | if (cmp < 0) | |
| 329 | 724434 | 7 | ++i; | |
| 330 | 710574 | 8 | else if (cmp > 0) | |
| 331 | 654721 | 9 | ++j; | |
| 332 | - | else { | ||
| 333 | 55853 | 10 | ++i; ++j; ++matches; | |
| 334 | - | } | ||
| 335 | - | } | ||
| 336 | - | |||
| 337 | 40158 | 13 | return HASHSIG_SCALE * (matches * 2) / (a->size + b->size); | |
| 338 | - | } | ||
| 339 | - | |||
| 340 | ![]() |
37382 | 2 | int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b) |
| 341 | - | { | ||
| 342 | - | /* if we have no elements in either file then each file is either | ||
| 343 | - | * empty or blank. if we're ignoring whitespace then the files are | ||
| 344 | - | * similar, otherwise they're dissimilar. | ||
| 345 | - | */ | ||
| 346 | 37382 | 2,3 | if (a->mins.size == 0 && b->mins.size == 0) { | |
| 347 | 8 | 4-6 | if ((!a->lines && !b->lines) || | |
| 348 | 6 | 6 | (a->opt & GIT_HASHSIG_IGNORE_WHITESPACE)) | |
| 349 | 7 | 7 | return HASHSIG_SCALE; | |
| 350 | - | else | ||
| 351 | 1 | 8 | return 0; | |
| 352 | - | } | ||
| 353 | - | |||
| 354 | - | /* if we have fewer than the maximum number of elements, then just use | ||
| 355 | - | * one array since the two arrays will be the same | ||
| 356 | - | */ | ||
| 357 | 37374 | 9 | if (a->mins.size < HASHSIG_HEAP_SIZE) | |
| 358 | 34590 | 10 | return hashsig_heap_compare(&a->mins, &b->mins); | |
| 359 | - | else | ||
| 360 | 2784 | 11 | return (hashsig_heap_compare(&a->mins, &b->mins) + | |
| 361 | 2784 | 12 | hashsig_heap_compare(&a->maxs, &b->maxs)) / 2; | |
| 362 | - | } |