Line Flow Count Block(s) Source
1 - /*
2 - * LibXDiff by Davide Libenzi ( File Differential Library )
3 - * Copyright (C) 2003 Davide Libenzi
4 - *
5 - * This library is free software; you can redistribute it and/or
6 - * modify it under the terms of the GNU Lesser General Public
7 - * License as published by the Free Software Foundation; either
8 - * version 2.1 of the License, or (at your option) any later version.
9 - *
10 - * This library is distributed in the hope that it will be useful,
11 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 - * Lesser General Public License for more details.
14 - *
15 - * You should have received a copy of the GNU Lesser General Public
16 - * License along with this library; if not, see
17 - * <http://www.gnu.org/licenses/>.
18 - *
19 - * Davide Libenzi <davidel@xmailserver.org>
20 - *
21 - */
22 -
23 - #include "xinclude.h"
24 -
25 -
26 - #define XDL_KPDIS_RUN 4
27 - #define XDL_MAX_EQLIMIT 1024
28 - #define XDL_SIMSCAN_WINDOW 100
29 - #define XDL_GUESS_NLINES1 256
30 - #define XDL_GUESS_NLINES2 20
31 -
32 -
33 - typedef struct s_xdlclass {
34 - struct s_xdlclass *next;
35 - unsigned long ha;
36 - char const *line;
37 - long size;
38 - long idx;
39 - long len1, len2;
40 - } xdlclass_t;
41 -
42 - typedef struct s_xdlclassifier {
43 - unsigned int hbits;
44 - long hsize;
45 - xdlclass_t **rchash;
46 - chastore_t ncha;
47 - xdlclass_t **rcrecs;
48 - long alloc;
49 - long count;
50 - long flags;
51 - } xdlclassifier_t;
52 -
53 -
54 -
55 -
56 - static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags);
57 - static void xdl_free_classifier(xdlclassifier_t *cf);
58 - static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash,
59 - unsigned int hbits, xrecord_t *rec);
60 - static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
61 - xdlclassifier_t *cf, xdfile_t *xdf);
62 - static void xdl_free_ctx(xdfile_t *xdf);
63 - static int xdl_clean_mmatch(char const *dis, long i, long s, long e);
64 - static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
65 - static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2);
66 - static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
67 -
68 -
69 -
70 -
71 2123 2 static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
72 2123 2 cf->flags = flags;
73 -
74 2123 2 cf->hbits = xdl_hashbits((unsigned int) size);
75 2123 3 cf->hsize = 1 << cf->hbits;
76 -
77 2123 3,4 if (xdl_cha_init(&cf->ncha, sizeof(xdlclass_t), size / 4 + 1) < 0) {
78 -
79 ##### 5 return -1;
80 - }
81 2123 6,7 if (!(cf->rchash = (xdlclass_t **) xdl_malloc(cf->hsize * sizeof(xdlclass_t *)))) {
82 -
83 ##### 8 xdl_cha_free(&cf->ncha);
84 ##### 9 return -1;
85 - }
86 2123 10 memset(cf->rchash, 0, cf->hsize * sizeof(xdlclass_t *));
87 -
88 2123 10 cf->alloc = size;
89 2123 10,11 if (!(cf->rcrecs = (xdlclass_t **) xdl_malloc(cf->alloc * sizeof(xdlclass_t *)))) {
90 -
91 ##### 12 xdl_free(cf->rchash);
92 ##### 13 xdl_cha_free(&cf->ncha);
93 ##### 14 return -1;
94 - }
95 -
96 2123 15 cf->count = 0;
97 -
98 2123 15 return 0;
99 - }
100 -
101 -
102 2123 2 static void xdl_free_classifier(xdlclassifier_t *cf) {
103 -
104 2123 2 xdl_free(cf->rcrecs);
105 2123 3 xdl_free(cf->rchash);
106 2123 4 xdl_cha_free(&cf->ncha);
107 2123 5 }
108 -
109 -
110 172286 2 static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash,
111 - unsigned int hbits, xrecord_t *rec) {
112 - long hi;
113 - char const *line;
114 - xdlclass_t *rcrec;
115 - xdlclass_t **rcrecs;
116 -
117 172286 2 line = rec->ptr;
118 172286 2 hi = (long) XDL_HASHLONG(rec->ha, cf->hbits);
119 177476 2,7,8 for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next)
120 159548 3,5 if (rcrec->ha == rec->ha &&
121 154395 4 xdl_recmatch(rcrec->line, rcrec->size,
122 - rec->ptr, rec->size, cf->flags))
123 154358 6 break;
124 -
125 172286 9 if (!rcrec) {
126 17928 10,11 if (!(rcrec = xdl_cha_alloc(&cf->ncha))) {
127 -
128 ##### 12 return -1;
129 - }
130 17928 13 rcrec->idx = cf->count++;
131 17928 13 if (cf->count > cf->alloc) {
132 ##### 14 cf->alloc *= 2;
133 ##### 14,15 if (!(rcrecs = (xdlclass_t **) xdl_realloc(cf->rcrecs, cf->alloc * sizeof(xdlclass_t *)))) {
134 -
135 ##### 16 return -1;
136 - }
137 ##### 17 cf->rcrecs = rcrecs;
138 - }
139 17928 18 cf->rcrecs[rcrec->idx] = rcrec;
140 17928 18 rcrec->line = line;
141 17928 18 rcrec->size = rec->size;
142 17928 18 rcrec->ha = rec->ha;
143 17928 18 rcrec->len1 = rcrec->len2 = 0;
144 17928 18 rcrec->next = cf->rchash[hi];
145 17928 18 cf->rchash[hi] = rcrec;
146 - }
147 -
148 172286 19-21 (pass == 1) ? rcrec->len1++ : rcrec->len2++;
149 -
150 172286 22 rec->ha = (unsigned long) rcrec->idx;
151 -
152 172286 22 hi = (long) XDL_HASHLONG(rec->ha, hbits);
153 172286 22 rec->next = rhash[hi];
154 172286 22 rhash[hi] = rec;
155 -
156 172286 22 return 0;
157 - }
158 -
159 -
160 4246 2 static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
161 - xdlclassifier_t *cf, xdfile_t *xdf) {
162 - unsigned int hbits;
163 - long nrec, hsize, bsize;
164 - unsigned long hav;
165 - char const *blk, *cur, *top, *prev;
166 - xrecord_t *crec;
167 - xrecord_t **recs, **rrecs;
168 - xrecord_t **rhash;
169 - unsigned long *ha;
170 - char *rchg;
171 - long *rindex;
172 -
173 4246 2 ha = NULL;
174 4246 2 rindex = NULL;
175 4246 2 rchg = NULL;
176 4246 2 rhash = NULL;
177 4246 2 recs = NULL;
178 -
179 4246 2,3 if (xdl_cha_init(&xdf->rcha, sizeof(xrecord_t), narec / 4 + 1) < 0)
180 ##### 4 goto abort;
181 4246 5,6 if (!(recs = (xrecord_t **) xdl_malloc(narec * sizeof(xrecord_t *))))
182 ##### 7 goto abort;
183 -
184 4246 8 if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
185 ##### 9 hbits = hsize = 0;
186 - else {
187 4246 10 hbits = xdl_hashbits((unsigned int) narec);
188 4246 11 hsize = 1 << hbits;
189 4246 11,12 if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *))))
190 ##### 13 goto abort;
191 4246 14 memset(rhash, 0, hsize * sizeof(xrecord_t *));
192 - }
193 -
194 4246 15 nrec = 0;
195 4246 15,16 if ((cur = blk = xdl_mmfile_first(mf, &bsize)) != NULL) {
196 176471 17,31 for (top = blk + bsize; cur < top; ) {
197 172286 18 prev = cur;
198 172286 18 hav = xdl_hash_record(&cur, top, xpp->flags);
199 172286 19 if (nrec >= narec) {
200 ##### 20 narec *= 2;
201 ##### 20,21 if (!(rrecs = (xrecord_t **) xdl_realloc(recs, narec * sizeof(xrecord_t *))))
202 ##### 22 goto abort;
203 ##### 23 recs = rrecs;
204 - }
205 172286 24,25 if (!(crec = xdl_cha_alloc(&xdf->rcha)))
206 ##### 26 goto abort;
207 172286 27 crec->ptr = prev;
208 172286 27 crec->size = (long) (cur - prev);
209 172286 27 crec->ha = hav;
210 172286 27 recs[nrec++] = crec;
211 -
212 172286 27,29 if ((XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
213 172286 28 xdl_classify_record(pass, cf, rhash, hbits, crec) < 0)
214 ##### 30 goto abort;
215 - }
216 - }
217 -
218 4246 32,33 if (!(rchg = (char *) xdl_malloc((nrec + 2) * sizeof(char))))
219 ##### 34 goto abort;
220 4246 35 memset(rchg, 0, (nrec + 2) * sizeof(char));
221 -
222 4246 35,36 if (!(rindex = (long *) xdl_malloc((nrec + 1) * sizeof(long))))
223 ##### 37 goto abort;
224 4246 38,39 if (!(ha = (unsigned long *) xdl_malloc((nrec + 1) * sizeof(unsigned long))))
225 ##### 40 goto abort;
226 -
227 4246 41 xdf->nrec = nrec;
228 4246 41 xdf->recs = recs;
229 4246 41 xdf->hbits = hbits;
230 4246 41 xdf->rhash = rhash;
231 4246 41 xdf->rchg = rchg + 1;
232 4246 41 xdf->rindex = rindex;
233 4246 41 xdf->nreff = 0;
234 4246 41 xdf->ha = ha;
235 4246 41 xdf->dstart = 0;
236 4246 41 xdf->dend = nrec - 1;
237 -
238 4246 41 return 0;
239 -
240 - abort:
241 ##### 42 xdl_free(ha);
242 ##### 43 xdl_free(rindex);
243 ##### 44 xdl_free(rchg);
244 ##### 45 xdl_free(rhash);
245 ##### 46 xdl_free(recs);
246 ##### 47 xdl_cha_free(&xdf->rcha);
247 ##### 48 return -1;
248 - }
249 -
250 -
251 4246 2 static void xdl_free_ctx(xdfile_t *xdf) {
252 -
253 4246 2 xdl_free(xdf->rhash);
254 4246 3 xdl_free(xdf->rindex);
255 4246 4 xdl_free(xdf->rchg - 1);
256 4246 5 xdl_free(xdf->ha);
257 4246 6 xdl_free(xdf->recs);
258 4246 7 xdl_cha_free(&xdf->rcha);
259 4246 8 }
260 -
261 -
262 2123 2 int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
263 - xdfenv_t *xe) {
264 - long enl1, enl2, sample;
265 - xdlclassifier_t cf;
266 -
267 2123 2 memset(&cf, 0, sizeof(cf));
268 -
269 - /*
270 - * For histogram diff, we can afford a smaller sample size and
271 - * thus a poorer estimate of the number of lines, as the hash
272 - * table (rhash) won't be filled up/grown. The number of lines
273 - * (nrecs) will be updated correctly anyway by
274 - * xdl_prepare_ctx().
275 - */
276 2123 2-4 sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF
277 - ? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1);
278 -
279 2123 5 enl1 = xdl_guess_lines(mf1, sample) + 1;
280 2123 6 enl2 = xdl_guess_lines(mf2, sample) + 1;
281 -
282 2123 7,9 if (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF &&
283 2123 8 xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
284 ##### 10 return -1;
285 -
286 2123 11,12 if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
287 -
288 ##### 13 xdl_free_classifier(&cf);
289 ##### 14 return -1;
290 - }
291 2123 15,16 if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
292 -
293 ##### 17 xdl_free_ctx(&xe->xdf1);
294 ##### 18 xdl_free_classifier(&cf);
295 ##### 19 return -1;
296 - }
297 -
298 2123 20,21 if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
299 2122 21,23 (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
300 2122 22 xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
301 -
302 ##### 24 xdl_free_ctx(&xe->xdf2);
303 ##### 25 xdl_free_ctx(&xe->xdf1);
304 ##### 26 xdl_free_classifier(&cf);
305 ##### 27 return -1;
306 - }
307 -
308 2123 28 if (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)
309 2123 29 xdl_free_classifier(&cf);
310 -
311 2123 30 return 0;
312 - }
313 -
314 -
315 2123 2 void xdl_free_env(xdfenv_t *xe) {
316 -
317 2123 2 xdl_free_ctx(&xe->xdf2);
318 2123 3 xdl_free_ctx(&xe->xdf1);
319 2123 4 }
320 -
321 -
322 1806 2 static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
323 - long r, rdis0, rpdis0, rdis1, rpdis1;
324 -
325 - /*
326 - * Limits the window the is examined during the similar-lines
327 - * scan. The loops below stops when dis[i - r] == 1 (line that
328 - * has no match), but there are corner cases where the loop
329 - * proceed all the way to the extremities by causing huge
330 - * performance penalties in case of big files.
331 - */
332 1806 2 if (i - s > XDL_SIMSCAN_WINDOW)
333 2 3 s = i - XDL_SIMSCAN_WINDOW;
334 1806 4 if (e - i > XDL_SIMSCAN_WINDOW)
335 ##### 5 e = i + XDL_SIMSCAN_WINDOW;
336 -
337 - /*
338 - * Scans the lines before 'i' to find a run of lines that either
339 - * have no match (dis[j] == 0) or have multiple matches (dis[j] > 1).
340 - * Note that we always call this function with dis[i] > 1, so the
341 - * current line (i) is already a multimatch line.
342 - */
343 8179 6,12,13 for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
344 6910 7 if (!dis[i - r])
345 1289 8 rdis0++;
346 5621 9 else if (dis[i - r] == 2)
347 5084 10 rpdis0++;
348 - else
349 537 11 break;
350 - }
351 - /*
352 - * If the run before the line 'i' found only multimatch lines, we
353 - * return 0 and hence we don't make the current line (i) discarded.
354 - * We want to discard multimatch lines only when they appear in the
355 - * middle of runs with nomatch lines (dis[j] == 0).
356 - */
357 1806 14 if (rdis0 == 0)
358 1290 15 return 0;
359 3931 16,22,23 for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
360 3465 17 if (!dis[i + r])
361 1374 18 rdis1++;
362 2091 19 else if (dis[i + r] == 2)
363 2041 20 rpdis1++;
364 - else
365 50 21 break;
366 - }
367 - /*
368 - * If the run after the line 'i' found only multimatch lines, we
369 - * return 0 and hence we don't make the current line (i) discarded.
370 - */
371 516 24 if (rdis1 == 0)
372 109 25 return 0;
373 407 26 rdis1 += rdis0;
374 407 26 rpdis1 += rpdis0;
375 -
376 407 26 return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
377 - }
378 -
379 -
380 - /*
381 - * Try to reduce the problem complexity, discard records that have no
382 - * matches on the other file. Also, lines that have multiple matches
383 - * might be potentially discarded if they happear in a run of discardable.
384 - */
385 2122 2 static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
386 - long i, nm, nreff, mlim;
387 - xrecord_t **recs;
388 - xdlclass_t *rcrec;
389 - char *dis, *dis1, *dis2;
390 -
391 2122 2,3 if (!(dis = (char *) xdl_malloc(xdf1->nrec + xdf2->nrec + 2))) {
392 -
393 ##### 4 return -1;
394 - }
395 2122 5 memset(dis, 0, xdf1->nrec + xdf2->nrec + 2);
396 2122 5 dis1 = dis;
397 2122 5 dis2 = dis1 + xdf1->nrec + 1;
398 -
399 2122 5,6 if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT)
400 ##### 7 mlim = XDL_MAX_EQLIMIT;
401 9950 8,18,19 for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
402 7828 9 rcrec = cf->rcrecs[(*recs)->ha];
403 7828 9-11 nm = rcrec ? rcrec->len2 : 0;
404 7828 12-17 dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
405 - }
406 -
407 2122 20,21 if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
408 ##### 22 mlim = XDL_MAX_EQLIMIT;
409 10799 23,33,34 for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
410 8677 24 rcrec = cf->rcrecs[(*recs)->ha];
411 8677 24-26 nm = rcrec ? rcrec->len1 : 0;
412 8677 27-32 dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
413 - }
414 -
415 9950 35,43 for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
416 7828 42 i <= xdf1->dend; i++, recs++) {
417 7828 36,37 if (dis1[i] == 1 ||
418 5687 37-39 (dis1[i] == 2 && !xdl_clean_mmatch(dis1, i, xdf1->dstart, xdf1->dend))) {
419 3129 40 xdf1->rindex[nreff] = i;
420 3129 40 xdf1->ha[nreff] = (*recs)->ha;
421 3129 40 nreff++;
422 - } else
423 4699 41 xdf1->rchg[i] = 1;
424 - }
425 2122 44 xdf1->nreff = nreff;
426 -
427 10799 44,52 for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart];
428 8677 51 i <= xdf2->dend; i++, recs++) {
429 8677 45,46 if (dis2[i] == 1 ||
430 6391 46-48 (dis2[i] == 2 && !xdl_clean_mmatch(dis2, i, xdf2->dstart, xdf2->dend))) {
431 3067 49 xdf2->rindex[nreff] = i;
432 3067 49 xdf2->ha[nreff] = (*recs)->ha;
433 3067 49 nreff++;
434 - } else
435 5610 50 xdf2->rchg[i] = 1;
436 - }
437 2122 53 xdf2->nreff = nreff;
438 -
439 2122 53 xdl_free(dis);
440 -
441 2122 54 return 0;
442 - }
443 -
444 -
445 - /*
446 - * Early trim initial and terminal matching records.
447 - */
448 2122 2 static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
449 - long i, lim;
450 - xrecord_t **recs1, **recs2;
451 -
452 2122 2 recs1 = xdf1->recs;
453 2122 2 recs2 = xdf2->recs;
454 75707 2,6 for (i = 0, lim = XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim;
455 73585 5 i++, recs1++, recs2++)
456 75000 3 if ((*recs1)->ha != (*recs2)->ha)
457 1415 4 break;
458 -
459 2122 7 xdf1->dstart = xdf2->dstart = i;
460 -
461 2122 7 recs1 = xdf1->recs + xdf1->nrec - 1;
462 2122 7 recs2 = xdf2->recs + xdf2->nrec - 1;
463 6419 7,10,11 for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--)
464 5582 8 if ((*recs1)->ha != (*recs2)->ha)
465 1285 9 break;
466 -
467 2122 12 xdf1->dend = xdf1->nrec - i - 1;
468 2122 12 xdf2->dend = xdf2->nrec - i - 1;
469 -
470 2122 12 return 0;
471 - }
472 -
473 -
474 2122 2 static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
475 -
476 2122 2,3,5 if (xdl_trim_ends(xdf1, xdf2) < 0 ||
477 2122 4 xdl_cleanup_records(cf, xdf1, xdf2) < 0) {
478 -
479 ##### 6 return -1;
480 - }
481 -
482 2122 7 return 0;
483 - }